
    UhOZ                    p   d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z(  e#jR                  e*      Z+dPdejX                  dejZ                  dee.   fdZ/	 dQdej`                  dejZ                  dejb                  de.fdZ2dQdZ3e G d de              Z4e G d de              Z5 G d dejl                        Z7	 dRdejl                  d ejX                  d!ejX                  d"ejX                  d#eejX                     d$e8d%e8fd&Z9 G d' d(ejl                        Z: G d) d*ejl                        Z; G d+ d,ejl                        Z< G d- d.ejl                        Z= G d/ d0ejl                        Z> G d1 d2ejl                        Z? G d3 d4ejl                        Z@ G d5 d6ejl                        ZA G d7 d8ejl                        ZB G d9 d:ejl                        ZCe! G d; d<e             ZD G d= d>eD      ZE G d? d@eD      ZF G dA dBee      ZG e!dCD       G dE dFeDe             ZH G dG dHejl                        ZI e!dID       G dJ dKeD             ZJ e!dLD       G dM dNeDe             ZKg dOZLy)SzPyTorch KOSMOS-2 model.    N)	dataclass)AnyCallableListOptionalTupleUnion)nn   )ACT2FN)GenerationMixin)FlashAttentionKwargs)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
LossKwargsModelOutputauto_docstringcan_return_tuplelogging	torch_int   )Kosmos2ConfigKosmos2TextConfigKosmos2VisionConfigmaskdtypetgt_lenc                 2   | j                         \  }}||n|}| ddddddf   j                  |d||      j                  |      }d|z
  }|j                  |j                  t        j
                        t	        j                  |      j                        S )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    Nr         ?)sizeexpandtomasked_filltorchboolfinfomin)r    r!   r"   bszsrc_lenexpanded_maskinverted_masks          ~/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/kosmos2/modeling_kosmos2.py_expand_maskr2   +   s     99;LC ,g'GD$)*11#q'7KNNuUM-'M$$]%5%5ejj%A5;;uCUCYCYZZ    input_ids_shapedevicepast_key_values_lengthc                    | \  }}t        j                  ||ft        j                  |      j                  |      }t        j                  |j                  d      |      }|j                  ||dz   j                  |j                  d      d      k  d       |j                  |      }|dkD  r0t        j                  t        j                  ||||      |gd      }|ddddddf   j                  |d|||z         S )zB
    Make causal mask used for bi-directional self-attention.
    )r5   r   r   r!   r5   dimN)r)   fullr+   r,   aranger%   masked_fill_viewr'   catzerosr&   )r4   r!   r5   r6   r-   r"   r    	mask_conds           r1   _make_causal_maskrC   9   s     #LC::w(%++e*<*@*@PDTYYr]6:Ii9q="6"6tyy}a"HH!L775>D!yy%++g/EU[abdhioqrdAq !((a'DZ:Z[[r3   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r:   )neintr)   cumsumtype_aslong)	input_idspadding_idxr6   r    incremental_indicess        r1   "create_position_ids_from_input_idsrM   K   sW     <<$((*D <<!4<<TBE[[_cc##%33r3   c                   @   e Zd ZU dZdZeej                     ed<   dZ	ee
e
ej                           ed<   dZee
ej                        ed<   dZee
ej                        ed<   dZeej                     ed<   dZee
ej                        ed<   dZeed	<   d
e
e   fdZy)Kosmos2ModelOutputa
  
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
            the weighted average in the self-attention heads.
        vision_model_output(`BaseModelOutputWithPooling`, *optional*):
            The output of the [`Kosmos2VisionModel`].
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_embedsprojection_attentionsvision_model_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw)text_model_outputrV   Ngetattrto_tuple.0kselfs     r1   	<genexpr>z.Kosmos2ModelOutput.to_tuple.<locals>.<genexpr>   =      
  LLDGRYZ^`aRbRkRkRmm
   -0tuplekeysrb   s   `r1   r^   zKosmos2ModelOutput.to_tuple   #     
YY[
 
 	
r3   )__name__
__module____qualname____doc__rP   r   r)   FloatTensor__annotations__rQ   r   rR   rS   rT   rU   rV   r   r   r^    r3   r1   rO   rO   [   s    $L 6:x 1 129AEOXeE%*;*;$<=>E8<M8E%"3"345<59Ju0012904L(5,,-4@D8E%*;*;$<=D6:3:
%* 
r3   rO   c                   h   e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeeej                           ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   dZeeej                        ed	<   dZeed
<   dee   fdZy)*Kosmos2ForConditionalGenerationModelOutputaR  
    Model output class for `Kosmos2ForConditionalGeneration`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
            the weighted average in the self-attention heads.
        vision_model_output(`BaseModelOutputWithPooling`, *optional*):
            The output of the [`Kosmos2VisionModel`].
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
    NlosslogitsrQ   rR   rS   rT   rU   rV   rW   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) ywrZ   r\   r_   s     r1   rc   zFKosmos2ForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>   rd   re   rf   ri   s   `r1   r^   z3Kosmos2ForConditionalGenerationModelOutput.to_tuple   rj   r3   )rk   rl   rm   rn   rt   r   r)   ro   rp   ru   rQ   r   rR   rS   rT   rU   rV   r   r   r^   rq   r3   r1   rs   rs      s    &P )-D(5$$
%,*.FHU&&'.AEOXeE%*;*;$<=>E8<M8E%"3"345<59Ju0012904L(5,,-4@D8E%*;*;$<=D6:3:
%* 
r3   rs   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )Kosmos2VisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebias   r   position_ids)r   r8   
persistent)super__init__rz   hidden_size	embed_dim
image_size
patch_sizer
   	Parameterr)   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr=   r&   rb   rz   	__class__s     r1   r   z Kosmos2VisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr3   
embeddingsheightwidthrW   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr8   g      ?r   r   bicubicF)r%   modealign_cornersr:   )shaper   weight	unsqueezer)   jit
is_tracingr   r   r   reshapepermuter
   
functionalinterpolater?   r@   )rb   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr;   
new_height	new_widthsqrt_num_positionss                r1   interpolate_pos_encodingz0Kosmos2VisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr3   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model ().r!   r   r   r8   r:   )r   r   
ValueErrorr   r   r!   r'   flatten	transposer   r&   r)   r@   r   r   r   )rb   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   s              r1   forwardzKosmos2VisionEmbeddings.forward  s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr3   F)rk   rl   rm   r   r   r)   TensorrF   r   ro   r   __classcell__r   s   @r1   ry   ry      se    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r3   ry   modulequerykeyvalueattention_maskscalingdropoutc                 p   t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |d      }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr8   r:   ptrainingr   r   )	r)   matmulr   r
   r   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r1   eager_attention_forwardr   !  s     <<s}}R'<=GL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r3   c                        e Zd ZdZ fdZ	 	 	 d	dej                  deej                     deej                     dee   de	ej                  eej                     f   f
dZ
 xZS )
Kosmos2VisionAttention=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: r         F)r   r   rz   r   r   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr
   Lineark_projv_projq_projout_projr   s     r1   r   zKosmos2VisionAttention.__init__:  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar3   rR   r   causal_attention_maskoutput_attentionsrW   c           
         |j                   \  }}}| j                  |      }| j                  |      }	| j                  |      }
|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	|
j	                  ||| j
                  | j                        j                  dd      }
| j                  j                  dk7  r||||z   }n||}n	|du| _
        t        }| j                  j                  dk7  rN| j                  j                  dk(  r|rt        j                  d       nt        | j                  j                     } || ||	|
|| j                  | j                  | j                   sdn| j"                  	      \  }}|j%                  |||      j'                         }| j)                  |      }|sd}||fS )
#Input shape: Batch x Time x Channelr   r   flash_attention_2Neagersdpa`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   r   )r   r   r   r   r?   r   r   r   rz   _attn_implementationr   r   loggerwarning_oncer   r   r   r   r   r   r   )rb   rR   r   r   r   r   
seq_lengthr   queriesrh   valuesattention_interfacer   r   s                 r1   r   zKosmos2VisionAttention.forwardN  s    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc ;;++/BB).C.O!/2G!G&2!62$>DN(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0 LL((r3   )NNF)rk   rl   rm   rn   r   r)   r   r   r*   r   r   r   r   s   @r1   r   r   7  s}    GB. 268<,15)||5) !.5)  (5	5)
 $D>5) 
u||Xell33	45)r3   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Kosmos2VisionMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y N)r   r   rz   r   
hidden_actactivation_fnr
   r   r   intermediate_sizefc1fc2r   s     r1   r   zKosmos2VisionMLP.__init__  sd    #F$5$5699V//1I1IJ99V55v7I7IJr3   rR   rW   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   rb   rR   s     r1   r   zKosmos2VisionMLP.forward  s4    /**=9/r3   )rk   rl   rm   r   r)   r   r   r   r   s   @r1   r   r     s$    KU\\ ell r3   r   c                        e Zd Zdef fdZ	 d	dej                  dej                  dej                  dee   de	ej                     f
dZ xZS )
Kosmos2VisionEncoderLayerrz   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y Neps)r   r   r   r   r   	self_attnr
   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   s     r1   r   z"Kosmos2VisionEncoderLayer.__init__  sm    ++/7<<F<Q<QR#F+<<F<Q<QRr3   rR   r   r   r   rW   c                     |}| j                  |      }| j                  ||||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rR   r   r   r   )r	  r  r  r
  )rb   rR   r   r   r   residualr   outputss           r1   r   z!Kosmos2VisionEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
#| !=0 ((7/ =0 "&Gr3   r   )rk   rl   rm   r   r   r)   r   r   r*   r   ro   r   r   r   s   @r1   r  r    sg    S2 S -2&||& &  %||	&
 $D>& 
u  	!&r3   r  c                        e Zd ZdZdef fdZ	 	 	 	 	 ddeej                     deej                     dee	   dee	   dee	   d	e
eef   fd
Z xZS )Kosmos2VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Kosmos2VisionEncoderLayer`].

    Args:
        config: Kosmos2VisionConfig
    rz   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r   r   rz   r
   
ModuleListrangenum_hidden_layersr  layersgradient_checkpointingrb   rz   r   r   s      r1   r   zKosmos2VisionEncoder.__init__  sQ    mmPUV\VnVnPo$p1%>v%F$pq&+# %qs   A#r   r   r   output_hidden_statesreturn_dictrW   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]b  \  }
}|r||	fz   }| j                  r,| j                  r | j                  |j                  |	|||      }n ||	|||      }|d   }	|sZ||d   fz   }d |r||	fz   }|st        d |	||fD              S t        |	||      S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nrq   )r   r   r   c              3   &   K   | ]	  }||  y wr   rq   )r`   vs     r1   rc   z/Kosmos2VisionEncoder.forward.<locals>.<genexpr>%  s     eqWXWdes   )rP   rR   rS   )rz   r   r  use_return_dict	enumerater  r  r   _gradient_checkpointing_func__call__rg   r   )rb   inputs_embedsr   r   r   r  r  encoder_statesall_attentionsrR   idxencoder_layerlayer_outputss                r1   r   zKosmos2VisionEncoder.forward  sH   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8 	FC#!/=2B!B**t}} $ A A!**!")%! !.!")&7	! *!,M !/=3C2E!E-	F0  +}.>>Ne]NN$Seee+>Vd
 	
r3   NNNNN)rk   rl   rm   rn   r   r   r   r)   r   r*   r	   r   r   r   r   r   s   @r1   r  r    s    ,2 , 268<,0/3&*O
 !.O
  (5	O

 $D>O
 'tnO
 d^O
 
uo%	&O
r3   r  c                        e Zd Zdef fdZ	 	 	 	 	 d
deej                     dee   dee   dedee   de	e
ef   fd	Z xZS )Kosmos2VisionTransformerrz   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r  )r   r   rz   r   ry   r   r
   r  r  pre_layrnormr  encoderpost_layernorm)rb   rz   r   r   s      r1   r   z!Kosmos2VisionTransformer.__init__.  sj    &&	1&9LL8M8MN+F3 ll9&:O:OPr3   r   r   r  r   r  rW   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  ||      }| j                  |      }| j                  ||||      }|d   }|d d dd d f   }	| j                  |	      }	|s
||	f|dd  z   S t        ||	|j                  |j                        S )Nz You have to specify pixel_values)r   )r!  r   r  r  r   r   )rP   pooler_outputrR   rS   )rz   r   r  r  r   r   r+  r,  r-  r   rR   rS   )
rb   r   r   r  r   r  rR   encoder_outputsrP   pooled_outputs
             r1   r   z Kosmos2VisionTransformer.forward8  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@Ogh))-8,,'/!5#	 ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r3   NNNFN)rk   rl   rm   r   r   r   r)   ro   r*   r	   r   r   r   r   r   s   @r1   r)  r)  ,  s    Q2 Q 59,0/3).&*'
u001'
 $D>'
 'tn	'

 #''
 d^'
 
u00	1'
r3   r)  c                       e Zd ZdZddededee   f fdZddededee   fdZeddededee   fd       Z	 e
j                         	 	 	 	 dd	ee
j                     d
ee
j                     dedee
j                     fd       Zd Z xZS )(Kosmos2TextSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.r   embedding_dimrK   c                     t         |           d| _        || _        || _        | j                  || j                  z   ||       y )Nr   )r   r   offsetr5  rK   make_weights)rb   r   r5  rK   r   s       r1   r   z1Kosmos2TextSinusoidalPositionalEmbedding.__init__g  s@    *&-$++5}kRr3   num_embeddingsc                     | j                  |||      }t        | d      r;|j                  | j                  j                  | j                  j
                        }| j                  d|d       y )Nweightsr9   Fr   )get_embeddinghasattrr'   r;  r!   r5   r   )rb   r9  r5  rK   emb_weightss        r1   r8  z5Kosmos2TextSinusoidalPositionalEmbedding.make_weightso  s[    ((T4#%..t||/A/A$,,J]J].^KYFr3   c                    |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        j                         | z        }t        j                  | t        j
                        j                         j                  d      |j                  d      z  }t        j                  t        j                  |      t        j                  |      gd      j                  | d      }|dz  dk(  r-t        j                  |t        j                  | d      gd      }|	d||ddf<   |j                  t        j                               S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        r   i'  r   r   r   r:   r8   N)mathlogr)   expr=   int64floatr   r@   sincosr?   rA   r'   get_default_dtype)r9  r5  rK   half_dimembs        r1   r<  z6Kosmos2TextSinusoidalPositionalEmbedding.get_embeddingw  s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r3   rJ   r!  r6   r   c                 v   |F|j                         \  }}|[t        || j                  |      j                  |j                        }n*|j                         d d \  }}|| j                  ||      }| j                  dz   |z   |z   }|| j                  j                  d      kD  r4| j                  || j                  z   | j                  | j                         | j                  j                  d|j                  d            j                  ||| j                  j                  d         j                         S )Nr8   r   r   )r%   rM   rK   r'   r5   &create_position_ids_from_inputs_embedsr;  r8  r7  r5  index_selectr?   r   detach)rb   rJ   r!  r6   r   r-   seq_lenmax_poss           r1   r   z0Kosmos2TextSinusoidalPositionalEmbedding.forward  s'     $>>+LC#At//1G "Y%%&  )--/4LC##JJ=Zpq ""Q&03IIT\\&&q))g3T5G5GIYIYZ||((L,=,=b,ABGGWVZVbVbVhVhikVlmttvvr3   c                 0   |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      j                         |z   S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr8   r   r9   r   )	r%   r)   r=   rK   rI   r5   r   r&   r   )rb   r!  r6   input_shapesequence_lengthr   s         r1   rK  zOKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_inputs_embeds  s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<GGILbbbr3   r   )NNr   N)rk   rl   rm   rn   rF   r   r   r8  staticmethodr<  r)   no_gradr   r   rK  r   r   s   @r1   r4  r4  c  s    NSc S# SHUXM SG3 Gs GQYZ]Q^ G 1c 1# 1HUXM 1 1( U]]_ -104&'/3wELL)w  -w !$	w
 u||,w w6cr3   r4  c                       e Zd ZdZ	 	 	 	 ddedededededef fdZd	ej                  d
ej                  fdZ
	 	 	 	 	 ddej                  deej                     deeej                        deej                     deej                     ded
eej                  eej                     eeej                        f   fdZ xZS )KosmosTextAttentionr   r   r   r   
is_decoderadd_inner_attn_layernormr   c                 \   t         |           || _        || _        || _        || _        ||z  | _        | j                  |z  | j                  k7  rt        d| j                   d| d      | j                  dz  | _        || _	        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        d | _        |r't        j"                  ||j$                        | _        y y )Nr   r   r   r   )r   r  )r   r   rz   r   r   r   r   r   r   rW  r
   r   r   r   r   r   inner_attn_lnr  r  )	rb   rz   r   r   r   rW  rX  r   r   s	           r1   r   zKosmosTextAttention.__init__  s    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$ii	94@ii	94@ii	94@		)YTB "#!#iV=R=R!SD $r3   
projectionrW   c                     |j                         d d | j                  | j                  fz   }|j                  |      j	                  dddd      }|S )Nr8   r   r   r   r   )r%   r   r   r?   r   )rb   r[  new_projection_shapenew_projections       r1   _shapezKosmosTextAttention._shape  sO    )0"58WW#)=>FFq!QPQRr3   rR   encoder_hidden_statespast_key_valuer   layer_head_maskr   c                    |du}|j                   dd \  }	}
||n|}|r/|r-|d   j                   d   |j                   d   k(  r|d   }|d   }n|| j                  | j                  |            }| j                  | j                  |            }|:|s8t	        j
                  |d   |gd      }t	        j
                  |d   |gd      }| j                  | j                  |            }| j                  r||f}t        }| j                  j                  dk7  rN| j                  j                  dk(  r|rt        j                  d       nt        | j                  j                     } || ||||f| j                  sd	n| j                  | j                   d
|\  }}|j#                  |	|
d      j%                         }| j&                  | j'                  |      }| j)                  |      }|||fS )r   Nr   r   r   r:   r   r   r   r   )r   r   r8   )r   r_  r   r   r)   r@   r   rW  r   rz   r   r   r   r   r   r   r   r   r   rZ  r   )rb   rR   r`  ra  r   rb  r   r   is_cross_attentionr   r   current_states
key_statesvalue_statesquery_statesr   r   r   s                     r1   r   zKosmosTextAttention.forward  s    3$>!.!4!4Ra!8
J 3H2S.Yf .^A5F5L5LQ5OSaSgSghiSj5j'*J)!,LT[[%@AJ;;t{{>'BCL)2D"YYq(9:'FAN
$yy.*;\)JPQR{{4;;}#=>?? ),7N(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7	%
  $}}C$,,LL	%
 	%
!\ "))*j"EPPR),,[9KmmK0L.88r3   )r   FFT)NNNNF)rk   rl   rm   rn   rF   rD  r*   r   r)   r   r_  r   r   r   r   r   s   @r1   rV  rV    s.   G  ).!T !T 	!T
 !T !T #'!T !TF %,,  9=8<1526"'I9||I9  (5I9 !u||!45	I9
 !.I9 "%,,/I9  I9 
u||Xell3XeELL>Q5RR	SI9r3   rV  c                   *     e Zd Zdef fdZd Z xZS )Kosmos2TextFFNrz   c                    t         |           |j                  | _        t        |j                     | _        |j                  | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y r  )r   r   r   r   activation_functionr   activation_dropoutr
   r   r   ffn_dimr   r   r  r  ffn_layernormr   s     r1   r   zKosmos2TextFFN.__init__5  s    ~~#F$>$>?"(";";99V--v~~>99V^^V-=-=>\\&..f>S>STr3   c                 b   | j                  | j                  |            }t        j                  j	                  || j
                  | j                        }| j                  |      }| j                  |      }t        j                  j	                  || j                  | j                        }|S )Nr   )	r   r   r
   r   r   rm  r   ro  r   r   s     r1   r   zKosmos2TextFFN.forwardA  s    **488M+BC--mt?V?Vaeanan-o**=9/--mt||VZVcVc-dr3   )rk   rl   rm   r   r   r   r   r   s   @r1   rj  rj  4  s    
U0 
Ur3   rj  c                   ~    e Zd Zdef fdZ	 	 	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     deej                     d	eeej                        d
ee	   dee	   deej                  eeej                  ej                  f      f   fdZ xZS )Kosmos2TextBlockrz   c                    t         |           |j                  | _        t        || j                  |j                  |j
                  dd      | _        |j                  | _        t        j                  | j                  |j                        | _        |j                  rdt        || j                  |j                  |j
                  dd      | _        t        j                  | j                  |j                        | _        t        |      | _        t        j                  | j                  |j                        | _        y )NT)r   r   r   rW  rX  r  F)r   r   r   rV  attention_headsr   r  r   r
   r  r  self_attn_layer_normadd_cross_attentionencoder_attnencoder_attn_layer_normrj  ffnfinal_layer_normr   s     r1   r   zKosmos2TextBlock.__init__L  s    )),nn,,,,%)
 ~~$&LLVEZEZ$[!%% 3.. 0000).!D ,.<<FLaLa+bD(!&) "T^^AVAV Wr3   rR   r   r`  encoder_attention_maskrb  cross_attn_layer_head_maskra  r   	use_cacherW   c
           
         |}||d d nd }| j                  |      } | j                  d	|||||d|
\  }}}t        j                  j	                  || j                  | j
                        }||z   }d }d }|t        | d      st        d|  d      |}| j                  |      }||dd  nd } | j                  d	||||||d|
\  }}}t        j                  j	                  || j                  | j
                        }||z   }||z   }|}| j                  |      }| j                  |      }||z   }|f}|r|||fz  }|	r||fz  }|S )
Nr   )rR   ra  r   rb  r   r   rw  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )rR   r`  r   rb  ra  r   rq   )ru  r  r
   r   r   r   r=  r   rx  rw  rz  ry  )rb   rR   r   r`  r{  rb  r|  ra  r   r}  r   r  self_attn_past_key_valueself_attn_weightspresent_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_valuer  s                      r1   r   zKosmos2TextBlock.forwardi  s    ! :H9S>"1#5Y] 11-@ ?Mdnn ?
'3)+/?
 ?
;(*; --mt||VZVcVc-d =0 (,$! ,40 =dV DD D 
 %H 88GM @N?Yrs(;_c%N_dN_N_ O+&;5 :8"3O OKM-/K MM11-4<<Z^ZgZg1hM$}4M !24P P !--m< / =0 ")+=>>G)++Gr3   )NNNNNNFT)rk   rl   rm   r   r   r)   r   r   r   r*   ro   r   r   r   s   @r1   rr  rr  K  s   X0 X@ 268<9=26=A8<,1$(Q||Q !.Q  (5	Q
 !) 6Q "%,,/Q %-U\\$:Q !u||!45Q $D>Q D>Q 
u  (51B1BEDUDU1U+V"WW	XQr3   rr  c            &           e Zd ZdZdef fdZd Z	 	 	 	 	 ddeej                     deej                     deej                     de
d	eej                     f
d
Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     deej                     deeej                        deej                     d	eej                     dee   dee   dee   dee   dee   deeef   f"d       Z xZS )Kosmos2TextTransformerz
    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].

    Args:
        config: Kosmos2TextConfig
    rz   c                    t         |           || _        |j                  | _        |j                  | _        |j
                  rt        j                  |j                        nd| _	        t        j                  |j                  |j                  |j                        | _        t        |j                   |j                  |j                        | _        t        j$                  t'        |j(                        D cg c]  }t+        |       c}      | _        t        j,                  |j                  |j.                        | _        d| _        y c c}w )Nr$   )rK   )r   r5  rK   F)r   r   rz   r   	layerdropscale_embeddingr@  sqrtr   embed_scaler
   r   
vocab_sizepad_token_idembed_tokensr4  max_position_embeddingsembed_positionsr  r  r  rr  r  r  
layer_normr  r  s      r1   r   zKosmos2TextTransformer.__init__  s    ~~)):@:P:P499V%5%56VYLL):):F<L<LZ`ZmZmnG 88 **++ 
 mmuV]]G[$\!%5f%=$\],,v'7'79N9NO&+# %]s   =Ec                     d }|d   dkD  r#t        ||j                  |j                  |      }|=t        ||j                  |d         j	                  |j                        }||n||z   }|S )Nr8   r   )r5   r6   r"   )rC   r!   r5   r2   r'   )rb   r   rQ  r!  r6   combined_attention_maskexpanded_attn_masks          r1   _prepare_decoder_attention_maskz6Kosmos2TextTransformer._prepare_decoder_attention_mask  s     #'r?Q&7##$++'=	'# %!-nm>Q>Q[fgi[j!k!n!n$$" '>&E"K]`wKw $ '&r3   r!  rT   img_input_maskr6   r   c                    || j                  |      }|[|j                  |j                        j                  d|j	                  d            ||j                  t
        j                        <   || j                  z  }| j                  ||||      }|j                  |j                        }||z   }t        j                  j                  || j                  | j                        }|S )Nr8   r   )rJ   r!  r6   r   r   )r  r'   r5   r?   r%   r)   r*   r  r  r
   r   r   r   )	rb   rJ   r!  rT   r  r6   r   	positionsrR   s	            r1   forward_embeddingz(Kosmos2TextTransformer.forward_embedding  s       --i8M#AMQ^QeQeAfAkAkL%%b)BM.++%**+=> &(8(88 (('#9%	 ) 
	 LL!5!56	%	1--mt||VZVcVc-dr3   rJ   r   image_embeds_position_maskr`  r{  	head_maskcross_attn_head_maskrQ   r}  r   r  r  r   rW   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||
t        d      |"|j                  }|j                  d|d         }n!|
|
j                         d d }nt        d      |	|	d   d   j                  d   nd}|dkD  rd }d }| j                  ||
||||      }| j                  ||||      }||t        ||
j                  |d         }t        j                  j                  || j                  | j                         }| j"                  r%| j                   r|rt$        j'                  d	       d
}|rdnd }|rdnd }|r|dnd }|rdnd }t)        ||gddg      D ]j  \  }}|	|j                         d   t+        | j,                        k7  s3t        d| dt+        | j,                         d|j                         d    d       t/        | j,                        D ]  \  }}|r||fz  }| j                   r%t1        j2                  g       }|| j4                  k  r?|	|	|   nd }| j"                  r?| j                   r3| j7                  |j8                  |||||||   nd |||   nd d ||
      }n  ||f||||||   nd |||   nd |||d|}|d   }|r|||rdnd   fz  }|s||d   fz  }|||d   fz  } | j;                  |      }|r||fz  }t=        |||||      S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer8   z5You have to specify either input_ids or inputs_embedsr   r   )rJ   r!  rT   r  r6   r   r  r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Frq   r  r  zThe `z` should be specified for z layers, but it is for .)r   r`  r{  rb  r|  ra  r   r}  r   r   )rP   rQ   rR   rS   cross_attentions)rz   r   r  r}  r  r   r   r?   r%   r  r  r2   r!   r
   r   r   r   r  r   r   ziplenr  r  r)   randr  r  r   r  r   )rb   rJ   r   rT   r  r`  r{  r  r  rQ   r!  r   r}  r   r  r  r   rQ  r6   rR   all_hidden_statesall_self_attnsall_cross_attentionspresent_key_value_states	attn_mask	mask_namer$  decoder_layerdropout_probabilityra  r&  s                                  r1   r   zKosmos2TextTransformer.forward  s.   ( 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"#//K!r;r?;I&',,.s3KTUU DSC^!3A!6!<!<Q!?de "A%L)-&..'%5#9% / 
 ==K8N

 !,1G1S%12H-J]J]grsugv%w"--mt||VZVcVc-d&&4==##p "	 #7BD0d&7<Q<]rdh)22  %(4H(IKYoKp$q 	 Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03 	 #,DKK"8 0	@C#!m%55!}}&+jjn#&75D5P_S1VZN**t}} $ A A!**!")*&/&;IcN1E1Q(-W[%! !.!!#1*?+A7@7LYs^RV5I5U,S1[_#1&7'! ! *!,M(]@Q1WX-Y,[[( =#3"55(4(]1-=,??(a0	@f 6  -!118+4+%1
 	
r3   )NNNr   NNNNNNNNNNNNNNNN)rk   rl   rm   rn   r   r   r  r   r)   r   rF   r  r   r   ro   r*   r   r   r	   r   r   r   r   r   s   @r1   r  r    s   ,0 ,('4 15/315&'/3!  -! u||,	!
 !.! !$! u||,!F  -115/3=A8<9=,07;=A04/3$(,0/3&*!U
ELL)U
 !.U
 u||,	U

 %-U\\$:U
  (5U
 !) 6U
 ELL)U
 'u||4U
 "$u'8'8"9:U
  -U
 u||,U
 D>U
 $D>U
 'tnU
  d^!U
" -.#U
$ 
u??	@%U
 U
r3   r  c                   .    e Zd ZeZdZddgZdZdZdZ	d Z
y)Kosmos2PreTrainedModelTr  rr  c                    t        | t              r| j                  j                  }n6t        | t        t
        f      r | j                  j                  j                  }t        | t        t        f      r| j                  j                  }n6t        | t        t
        f      r | j                  j                  j                  }t        |t              rt        j                  j                  |j                  d|j                   dz  z         t        j                  j                  |j"                  j$                  |j                  j&                  |z         t        j                  j                  |j(                  j$                  |j                  j&                  |z         yt        |t*              r'|j                   dz  d|j                  j,                  z  dz  z  z  }|j                   dz  |z  }t        j                  j                  |j.                  j$                  |       t        j                  j                  |j0                  j$                  |       t        j                  j                  |j2                  j$                  |       t        j                  j                  |j4                  j$                  |       |j.                  j6                  .|j.                  j6                  j8                  j;                          |j0                  j6                  .|j0                  j6                  j8                  j;                          |j2                  j6                  .|j2                  j6                  j8                  j;                          |j4                  j6                  /|j4                  j6                  j8                  j;                          yyt        |t<              rL|j                  j>                  dz  d|j                  j,                  z  dz  z  z  }d|j                  j>                  z  dz  |z  }t        j                  j                  |j@                  j$                  |       t        j                  j                  |jB                  j$                  |       |j@                  j6                  .|j@                  j6                  j8                  j;                          |jB                  j6                  /|jB                  j6                  j8                  j;                          yyt        |tD              r|jF                  j6                  j8                  j;                          |jF                  j$                  j8                  jI                  d       |jJ                  j6                  j8                  j;                          |jJ                  j$                  j8                  jI                  d       yt        |tL              r|jN                  j6                  j8                  j;                          |jN                  j$                  j8                  jI                  d       |jP                  j6                  j8                  j;                          |jP                  j$                  j8                  jI                  d       yt        |tR              rt        j                  j                  |j.                  j$                         t        j                  j                  |j0                  j$                  |       t        j                  j                  |j2                  j$                  |       t        j                  j                  |j4                  j$                  |       |j.                  j6                  .|j.                  j6                  j8                  j;                          |j0                  j6                  .|j0                  j6                  j8                  j;                          |j2                  j6                  .|j2                  j6                  j8                  j;                          |j4                  j6                  /|j4                  j6                  j8                  j;                          yyt        |tT              rt        j                  j                  |j@                  j$                         t        j                  j                  |jB                  j$                  |       |j@                  j6                  .|j@                  j6                  j8                  j;                          |jB                  j6                  /|jB                  j6                  j8                  j;                          yyt        |t              r{t        j                  j                  |jV                  j$                         |jV                  j6                  /|jV                  j6                  j8                  j;                          yyt        |tX              r{t        j                  j                  |jZ                  j$                         |jZ                  j6                  /|jZ                  j6                  j8                  j;                          yyt        |t\              r|j^                  j$                  j8                  j                  d       |j^                  j`                  F|j^                  j$                  j8                  |j^                  j`                     j;                          yyy)zInitialize the weightsr   r   )meanstd)r  r   Nr$   )1
isinstanceKosmos2VisionModelrz   initializer_factorKosmos2ModelKosmos2ForConditionalGenerationvision_configKosmos2TextModelKosmos2TextForCausalLMinit_stdtext_configry   r
   initnormal_r   r   r   r   initializer_ranger   r   r  r   r   r   r   r   datazero_r   r   r   r   r  r	  fill_r  r)  r+  r-  rV  rj  lm_headKosmos2ImageToTextProjectiondenser  r  rK   )rb   r   factorr  in_proj_stdout_proj_stdfc_stds          r1   _init_weightsz$Kosmos2PreTrainedModel._init_weights  s   d./[[33F|-LMN[[..AAFd-/EFG++&&C|-LMN++))22Cf56GGOOF22&BRBRTXBX[aBaObGGOOF2299v}}?^?^ag?gOhGGOOF55<<&--BaBadjBjOk 67!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE}}!!-""''--/}}!!-""''--/}}!!-""''--/##/$$))//1 0 01!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O?zz*

$$**,zz*

$$**, + 9:##((..0%%**005##((..0%%**005 89$$))//1&&++11#6!!&&++113!!((--33C8 34GGOOFMM00cO:GGOOFMM00cO:GGOOFMM00cO:GGOOFOO22O<}}!!-""''--/}}!!-""''--/}}!!-""''--/##/$$))//1 0/GGOOFJJ--3O7GGOOFJJ--3O7zz*

$$**,zz*

$$**, + 67GGOOFNN11sO;~~"".##((..0 / <=GGOOFLL//SO9||  ,!!&&,,. - 67&&++33#3F""..:##**//0C0C0O0OPVVX ; 8r3   N)rk   rl   rm   r   config_classsupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_2_supports_sdpar  rq   r3   r1   r  r    s1     L&*#46HI"&!NQYr3   r  c                        e Zd ZeZdZdef fdZdej                  fdZ	e
	 	 	 	 	 ddeej                     dee   dee   ded	ee   deeef   fd
       Z xZS )r  r   rz   c                 d    t         |   |       t        |      | _        | j	                          y r   )r   r   r)  model	post_initr   s     r1   r   zKosmos2VisionModel.__init__  s&     -f5
r3   rW   c                 B    | j                   j                  j                  S r   )r  r   r   ri   s    r1   get_input_embeddingsz'Kosmos2VisionModel.get_input_embeddings  s    zz$$444r3   r   r  r   r  c                 .    | j                  |||||      S )N)r   r   r  r   r  r  )rb   r   r   r  r   r  s         r1   r   zKosmos2VisionModel.forward  s)     zz%/!5%=#  
 	
r3   r2  )rk   rl   rm   r   r  main_input_namer   r
   Moduler  r   r   r)   ro   r*   r	   r   r   r   r   r   s   @r1   r  r  	  s    &L$O2 5bii 5  59,0/3).&*
u001
 $D>
 'tn	

 #'
 d^
 
u00	1
 
r3   r  c            '           e Zd ZeZdef fdZdej                  fdZd Z	e
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     d	eej                     d
eej                     deej                     deej                     deej                     deeej                         deej                     deej                     dee   dee   dee   dee   dee   deeef   f"d              Z xZS )r  rz   c                 d    t         |   |       t        |      | _        | j	                          y r   )r   r   r  r  r  r   s     r1   r   zKosmos2TextModel.__init__-  s&     +F3
r3   rW   c                 .    | j                   j                  S r   r  r  ri   s    r1   r  z%Kosmos2TextModel.get_input_embeddings3      zz&&&r3   c                 &    || j                   _        y r   r  rb   r   s     r1   set_input_embeddingsz%Kosmos2TextModel.set_input_embeddings6      "'

r3   rJ   r   rT   r  r`  r{  r  r  rQ   r!  r   r}  r   r  r  r   c                 F     | j                   d|||||||||	|
|||||d|S )a  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        rJ   r   rT   r  r`  r{  r  r  rQ   r!  r   r}  r   r  r  rq   r  )rb   rJ   r   rT   r  r`  r{  r  r  rQ   r!  r   r}  r   r  r  r   s                    r1   r   zKosmos2TextModel.forward9  sT    H tzz 
)%'A"7#9!5+'%/!5#
  !
 	
r3   r  )rk   rl   rm   r   r  r   r
   r  r  r  r   r   r   r)   r   r   ro   r*   r   r   r	   r   r   r   r   r   s   @r1   r  r  *  s   $L0 'bii '(  -115/3=A8<9=,07;=A04/3$(,0/3&*!3
ELL)3
 !.3
 u||,	3

 %-U\\$:3
  (53
 !) 63
 ELL)3
 'u||43
 "$u'8'8"9:3
  -3
 u||,3
 D>3
 $D>3
 'tn3
  d^!3
" -.#3
$ 
u??	@%3
  3
r3   r  c                       e Zd Zy)KwargsForCausalLMN)rk   rl   rm   rq   r3   r1   r  r  q  s    r3   r  z
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc            )           e Zd ZeZdgZdef fdZdej                  fdZ	d Z
dej                  fdZd Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd	eej"                     d
eej"                     deej"                     deej"                     deej"                     deej"                     deej"                     deej"                     deeej&                        deej"                     deej"                     deej(                     dee   dee   dee   dee   dee   deeef   f$d              Z	 	 	 	 	 	 d fd	Zed        Z xZS )r  zlm_head.weightrz   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NF)in_featuresout_featuresr   )
r   r   r  r  r
   r   r   r  r  r  r   s     r1   r   zKosmos2TextForCausalLM.__init__~  sI     +F3
yyV-=-=FL]L]dij 	r3   rW   c                 .    | j                   j                  S r   r  ri   s    r1   r  z+Kosmos2TextForCausalLM.get_input_embeddings  r  r3   c                 &    || j                   _        y r   r  r  s     r1   r  z+Kosmos2TextForCausalLM.set_input_embeddings  r  r3   c                     | j                   S r   r  ri   s    r1   get_output_embeddingsz,Kosmos2TextForCausalLM.get_output_embeddings  s    ||r3   c                     || _         y r   r  rb   new_embeddingss     r1   set_output_embeddingsz,Kosmos2TextForCausalLM.set_output_embeddings  s	    %r3   rJ   r   rT   r  r`  r{  r  r  rQ   r!  r   labelsr}  r   r  r  r   c                    ||n| j                   j                  }||rt        j                  d       d} | j                  d	|||||||||	|
||||dd|}| j                  |d         }d}|* | j                  d	||| j                   j                  d|}t        |||j                  |j                  |j                  |j                        S )
aK  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.FTr  r   )ru   r  r  )rt   ru   rQ   rR   rS   r  rq   )rz   r  r   warningr  r  loss_functionr  r   rQ   rR   rS   r  )rb   rJ   r   rT   r  r`  r{  r  r  rQ   r!  r   r  r}  r   r  r  r   r  	lm_logitsrt   s                        r1   r   zKosmos2TextForCausalLM.forward  s   R &1%<k$++B]B]klI$** 
)%'A"7#9!5+'%/!5
  !
$ LL,	%4%%sYvRVR]R]RhRhslrsD0#33!//))$55
 	
r3   c                 r   t        || j                  j                  d      }	|d }d }nt|r|j                         \  }
}|j                         d   }t	        j
                  |t	        j                  |
||z
  ft        j                  |j                        fd      }t        | (  |f||||||	|d|}|S )Nr   )rK   r6   r8   )r%   r!   r5   r   r:   )rQ   r   rT   r  r}  r   cache_position)rM   rz   r  r%   r)   r@   rA   r*   r5   r   prepare_inputs_for_generation)rb   rJ   rT   r  rQ   r   r}  r  model_kwargsr   r   rN  mask_lenmodel_inputsr   s                 r1   r  z4Kosmos2TextForCausalLM.prepare_inputs_for_generation  s     :00#$
 &L)-&'3"+.."2J1668<H)..KKj'H2D%EUZZ`i`p`pq *& w<

+)%'A%)

 

 r3   c                 J    d}| D ]  }|t        fd|D              fz  } |S )Nrq   c              3   t   K   | ]/  }|j                  d j                  |j                               1 yw)r   N)rL  r'   r5   )r`   
past_statebeam_idxs     r1   rc   z8Kosmos2TextForCausalLM._reorder_cache.<locals>.<genexpr>  s.     nU_j--aZ=N=N1OPns   58)rg   )rQ   r  reordered_past
layer_pasts    `  r1   _reorder_cachez%Kosmos2TextForCausalLM._reorder_cache  s?     ) 	Jncmnn N	 r3   )NNNNNNNNNNNNNNNN)NNNNNN) rk   rl   rm   r   r  _tied_weights_keysr   r
   r  r  r  r  r  r   r   r   r)   r   r   ro   
LongTensorr*   r   r  r	   r   r   r   r  rS  r  r   r   s   @r1   r  r  t  s#    %L*+0 'bii '(ryy &  -115/3=A8<9=,07;=A04/3-1$(,0/3&*#M
ELL)M
 !.M
 u||,	M

 %-U\\$:M
  (5M
 !) 6M
 ELL)M
 'u||4M
 "$u'8'8"9:M
  -M
 u||,M
 ))*M
 D>M
 $D>M
  'tn!M
" d^#M
$ *+%M
& 
u77	8'M
  M
d #'/b  r3   r  c                   .     e Zd ZdZdef fdZd Z xZS )r  zmThe layer that transforms the image model's output to part of the text model's input (namely, image features)rz   c                    t         |           t        j                  |j                  j
                  |j                  j                        | _        t        j                  t        j                  |j                  |j                  j                              | _        t        |j                  |j                  j                  |j                  j                  |j                  j                   dd      | _        y )NF)r   rW  rX  )r   r   r
   r   r  r   r  r   r  r   r)   r   latent_query_numlatent_queryrV  rt  r   x_attnr   s     r1   r   z%Kosmos2ImageToTextProjection.__init__#  s    YYv33??ASASA]A]^
LLV5L5LfN`N`NjNj)kl)((..&&88%*
r3   c                    | j                  |      }| j                  j                  d      j                  |j	                  d      dd      }t        j                  ||gd      }| j                  ||d d d       \  }}}||fS )Nr   r8   r   r:   )rR   r`  ra  r   r   )r  r  r   r&   r%   r)   r@   r  )rb   featuresrR   r  key_value_statesr   r   s          r1   r   z$Kosmos2ImageToTextProjection.forward1  s    

8, ((2215<<]=O=OPQ=RTVXZ[ 99m\%BJ)-&"2" *5 *
&|Q l**r3   )rk   rl   rm   rn   r   r   r   r   r   s   @r1   r  r     s    w
} 
+r3   r  z}
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    c            %           e Zd ZeZdZdef fdZdej                  fdZ	d Z
	 	 ddej                  dee   dee   fd	Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej$                     d
eej$                     deej$                     deej$                     deej$                     deeej                        deej$                     deej$                     deej$                     dee   dee   dee   dedee   dee   deeef   f d              Z xZS )r  r   rz   c                     t         |   |       t        |j                        | _        t        |j                        | _        t        |      | _	        | j                          y r   )r   r   r  r  
text_modelr  r  vision_modelr  image_to_text_projectionr  r   s     r1   r   zKosmos2Model.__init__L  sN     *6+=+=>.v/C/CD(DV(L% 	r3   rW   c                 B    | j                   j                  j                  S r   r  r  r  ri   s    r1   r  z!Kosmos2Model.get_input_embeddingsV      $$111r3   c                 :    || j                   j                  _        y r   r  r  s     r1   r  z!Kosmos2Model.set_input_embeddingsY      -2*r3   return_attentionsr   c                     | j                  ||      }| j                   j                  j                  |d         }t        j                  j                  |d      }| j                  |      \  }}|r||fS |S )aD  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            return_attentions (`bool`, *optional*, defaults to `False`):
                Whether to return `projection_attentions` or not.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate positional embeddings or not.
        )r   r   r   r8   r:   )r  r  r-  r
   r   	normalizer  )rb   r   r  r   rV   rT   rU   s          r1   get_image_featureszKosmos2Model.get_image_features\  s    " #//%%= 0 

 ((..==>QRS>TU}}..|.D.2.K.KL.Y++!666r3   rJ   r  r   r  rQ   rT   r!  r   r}  r   r  r  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }d}d}|$|t	        d      | j                  |d|      \  }} | j                  d||||||||	|
||dd|}t        |j                  |j                  |j                  |j                  |||      S )aE  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2Model

        >>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = (
        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
        ...     "</object>"
        ... )

        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

        >>> last_hidden_state = model(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ... ).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 91, 2048]
        ```N<You have to specify either `pixel_values` or `image_embeds`.T)r  r   )rJ   r   rT   r  r  rQ   r!  r   r}  r   r  r  )rP   rQ   rR   rS   rT   rU   rV   rq   )rz   r   r  r  r   r  r  rO   rP   rQ   rR   rS   )rb   r   rJ   r  r   r  rQ   rT   r!  r   r}  r   r  r   r  r   rV   rU   r  s                      r1   r   zKosmos2Model.forward{  s   x 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]" $# !_``262I2IOg 3J 3/L/ "$// 
)%'A+'%/!5
 
  "%77#33!//))%"7 3
 	
r3   )FF)NNNNNNNNNNNNFN)rk   rl   rm   r   r  r  r   r
   r  r  r  r)   ro   r   r*   r  r   r   r   r   r   r   r	   r   rO   r   r   r   s   @r1   r  r  C  s    !L$O} 2bii 23 -238	'' $D> #+4.	>  04,0=A15,0=A/304/3$(,0/3).&*a
u||,a
 ELL)a
 %-U\\$:	a

 !.a
 ELL)a
 "$u'8'8"9:a
 u||,a
  -a
 u||,a
 D>a
 $D>a
 'tna
 #'a
 d^a
  -.!a
" 
u((	)#a
  a
r3   r  z
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    c            %           e Zd ZeZdZdgZdef fdZdej                  fdZ
d Zdej                  fdZd	 Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej$                     d
eej$                     deej$                     deej$                     deej$                     deeej(                        deej$                     deej$                     deej$                     deej*                     dee   dee   dee   dee   dee   deeef   f d              Z	 	 	 	 	 ddeej$                     deej$                     d
eej$                     deej$                     deej$                     f
dZ xZS )r  r   ztext_model.lm_head.weightrz   c                     t         |   |       t        |j                        | _        t        |j                        | _        t        |      | _	        | j                          y r   )r   r   r  r  r  r  r  r  r  r  r  r   s     r1   r   z(Kosmos2ForConditionalGeneration.__init__  sN     01C1CD.v/C/CD(DV(L% 	r3   rW   c                 B    | j                   j                  j                  S r   r  ri   s    r1   r  z4Kosmos2ForConditionalGeneration.get_input_embeddings  r  r3   c                 :    || j                   j                  _        y r   r  r  s     r1   r  z4Kosmos2ForConditionalGeneration.set_input_embeddings  r  r3   c                 6    | j                   j                         S r   )r  r  ri   s    r1   r  z5Kosmos2ForConditionalGeneration.get_output_embeddings  s    4466r3   c                 :    | j                   j                  |       y r   )r  r  r  s     r1   r  z5Kosmos2ForConditionalGeneration.set_output_embeddings   s    --n=r3   rJ   r  r   r  rQ   rT   r!  r   r  r}  r   r  r  r   c                 r   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }d}d}||t	        d      | j                  ||||      }| j
                  j                  j                  |d         }t        j                  j                  |d      }| j                  |      \  }} | j                  d
||||||||	|
|||dd|}t        |j                  |j                  |j                   |j"                  |j$                  |||	      S )a5  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> prompt = "<grounding> An image of"

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> generated_ids = model.generate(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds=None,
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ...     use_cache=True,
        ...     max_new_tokens=64,
        ... )
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
        >>> processed_text
        '<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

        >>> caption, entities = processor.post_process_generation(generated_text)
        >>> caption
        'An image of a snowman warming himself by a fire.'

        >>> entities
        [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
        ```Nr   )r   r   r  r  r   r8   r:   T)rJ   r   rT   r  r  rQ   r!  r   r  r}  r   r  r  )rt   ru   rQ   rR   rS   rT   rU   rV   rq   )rz   r   r  r  r   r  r  r-  r
   r   r  r  r  rs   rt   ru   rQ   rR   rS   )rb   r   rJ   r  r   r  rQ   rT   r!  r   r  r}  r   r  r  r   rV   rU   
lm_outputss                      r1   r   z'Kosmos2ForConditionalGeneration.forward  sw   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]" $# !_``"&"3"3)"3%9'	 #4 #  ,,22AABUVWBXYL==22<R2HL262O2OP\2]/L/$T__ 
)%'A+'%/!5
 

" :$$&66$22!,,%"7 3	
 		
r3   c                 ~   |j                  dd       }||t        d| d      |||}|n| j                  |      }| j                  j                  j	                  |d         }t
        j                  j                  |d      }| j                  |      \  }}	 | j                  j                  d||||d|}
|
S )	Ninputsz
`inputs`: zp were passed alongside `pixel_values` which is not allowed.Make sure to either pass `inputs` or pixel_values=...r   r8   r:   )rJ   r   rT   r  rq   )popr   r  r  r-  r
   r   r  r  r  generate)rb   r   r  rJ   r   rT   r   r*  rV   rU   outputs              r1   r,  z(Kosmos2ForConditionalGeneration.generate  s     Hd+#(:VH %H I  F$6!L"&"3"3L"A,,22AABUVWBXYL==22<R2HL262O2OP\2]/L/))) 
)%'A	

 
 r3   )NNNNNNNNNNNNNNr'  )rk   rl   rm   r   r  r  r  r   r
   r  r  r  r  r  r   r   r   r)   r   r   ro   r  r*   r   r  r	   r   rs   r   r,  r   r   s   @r1   r  r    s>    !L$O56	} 	2bii 237ryy 7>  04,0=A15,0=A/304/3-1$(,0/3&*x
u||,x
 ELL)x
 %-U\\$:	x

 !.x
 ELL)x
 "$u'8'8"9:x
 u||,x
  -x
 u||,x
 ))*x
 D>x
 $D>x
 'tnx
 d^x
  *+!x
" 
u@@	A#x
  x
x 04=A,015/3#u||,# %-U\\$:# ELL)	#
 !.# u||,#r3   r  )r  r  r  r   )r   )r   )Mrn   r@  dataclassesr   typingr   r   r   r   r   r	   r)   torch.utils.checkpointr
   activationsr   
generationr   modeling_flash_attention_utilsr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   configuration_kosmos2r   r   r   
get_loggerrk   r   r   r!   rF   r2   Sizer5   rC   rM   rO   rs   r  ry   rD  r   r   r   r  r  r)  r4  rV  rj  rr  r  r  r  r  r  r  r  r  r  __all__rq   r3   r1   <module>r<     s     ! > >    ! ) B  G & b b X X 
		H	%[u|| [EKK [(3- [ jk\ZZ\(-\=B\\\cf\$4  3
 3
 3
l 6
 6
 6
tPbii Pv %II%<<% 
% <<	%
 U\\*% % %,L)RYY L)`ryy  /		 /f^
299 ^
D3
ryy 3
nUcryy Ucpv9")) v9rRYY .oryy odl
RYY l
^ YY_ YY YYx
/ 
BD
- D
N ?,j > c3_ ccL +299  +F 
V
) V

V
r {&<o {{| Xr3   