
    UhN(                    f   d Z ddlmZ ddlmZmZmZmZmZm	Z	m
Z
 ddlZddlmc mZ ddlZddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2  e*       rddl3m4Z4 ddl5m6Z6  e+jn                  e8      Z9e G d de             Z:e G d de             Z;	 	 	 	 d@dZ<g fdZ= G d dej|                        Z? G d d ej                        ZA G d! d"ej                        ZC e%j                  eC        G d# d$ej                  j                        ZEd% ZFdAd&ZG G d' d(ej                        ZH	 dBd)ej                  d*ej                  d+ej                  d,ej                  d-eej                     d.eJd/eJfd0ZK G d1 d2ej                        ZL G d3 d4ej                        ZM G d5 d6ej                        ZNe( G d7 d8e!             ZO G d9 d:ee'      ZPe( G d; d<eO             ZQ G d= d>eOe      ZRg d?ZSy)CzPyTorch Idefics model.    )	dataclass)AnyCallableDictListOptionalTupleUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)ModelOutput)ALL_ATTENTION_FUNCTIONSPretrainedConfigPreTrainedModel)Unpack)ALL_LAYERNORM_LAYERS)
LossKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )IdeficsConfig)IdeficsPerceiverResampler)IdeficsVisionEmbeddingsIdeficsVisionTransformer)	BlockMask)make_flex_block_causal_maskc                       e Zd ZU dZdZeej                     ed<   dZ	ee
e
ej                           ed<   dZee
ej                        ed<   dZee
ej                        ed<   dZee
ej                        ed<   y)IdeficsBaseModelOutputWithPasta	  
    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.

            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_hidden_states)__name__
__module____qualname____doc__r'   r   torchFloatTensor__annotations__r(   r	   r)   r*   r+        ~/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/idefics/modeling_idefics.pyr&   r&   6   s    "H 6:x 1 129AEOXeE%*;*;$<=>E8<M8E%"3"345<59Ju00129>B%(9(9":;Br4   r&   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   y)	IdeficsCausalLMOutputWithPasta  
    Base class for Idefics causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.

            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlosslogitsr(   r)   r*   r+   )r,   r-   r.   r/   r8   r   r0   r1   r2   r9   r(   r   r)   r	   r*   r+   r3   r4   r5   r7   r7   c   s    @ )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju00129>B%(9(9":;Br4   r7   c                    t        j                  | j                  d         j                  dd      j	                  d|      j                  d      j                  | j                        }| j                  d|      } |j                  dd       |d<   |j                  dd       |d<   |j                  dd       |d<   |j                  dd       |d<   d|v r|d   }|j                  d|      |d<   ||j                  d|      |d	<   |d   |d   j                  d|      |d<   |d   |d   j                  d|      |d<   | |fS |d   |d   j                  d|      |d<   | |fS |d   |d   j                  d|      |d<   | |fS )
Nr   r   pixel_valuesimage_encoder_embeddingsperceiver_embeddingsimage_attention_masktoken_type_idsattention_mask)	r0   arangeshapeviewrepeattodeviceindex_selectget)	input_idsexpand_sizeis_encoder_decoderrA   encoder_outputsmodel_kwargsexpanded_return_idxr@   s           r5   expand_inputs_for_generationrP      s    	Y__Q'(--b!4;;A{KPPQSTWWXaXhXhi  &&q*=>I#/#3#3ND#IL /;/?/?@Z\`/aL+,+7+;+;<RTX+YL'(+7+;+;<RTX+YL'(<'%&67)7)D)DQH[)\%&!)7)D)DQH[)\%&*+7/;<R/S/`/`"0
+, N#/'3N'C'P'PQRTg'h^$ l"" 
0	1	=3?@Z3[3h3h"4
/0 l"" 
,	-	9/;<R/S/`/`"0
+, l""r4   c                 2   t         j                  t         j                  t         j                  d}|D cg c]  }||   	 }}| j	                         D ];  |r&t        fd|D              rj                  d       +j                  d       = | S c c}w )N)	LayerNormLinear	Embeddingc              3   6   K   | ]  }t        |        y wN)
isinstance).0tmodules     r5   	<genexpr>zfreeze_model.<locals>.<genexpr>   s     $]qZ%:$]s   TF)r   rR   rS   rT   modulesanyrequires_grad_)modelmodule_exceptionsmappingmmodule_exceptions_mappedrZ   s        @r5   freeze_modelrd      s    \\))\\G
 5FFq
FF--/ )$]D\$]!]!!$'!!%(	)
 L  Gs   Bc                   N     e Zd ZdZ	 	 	 	 ddee   ddf fdZd ZdefdZ	 xZ
S )	IdeficsDecoupledEmbeddinga  
    Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
    then it will create `num_additional_embeddings` additional parameters that are always trained. If
    `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
    Npartially_freezereturnc           	      B   |||kD  rt        d| d|       t        	|   d|||||d| || _        || _        || _        || _        |r| j                  j                  d       | j
                  dkD  r)t        j                  | j
                  |||      | _        yy)	a)  
        Args:
            num_embeddings (`int`):
                Size of the dictionary of embeddings
            num_additional_embeddings (`int`):
                Number of additional embeddings. Only useful when you `partially_freeze=True`.
            embedding_dim (`int`):
                The size of each embedding vector
            partially_freeze: (`bool`, *optional*, defaults to `False`):
                If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
            padding_idx (`int`, *optional*):
                The padding index (needs to be less than num_embeddings)

        Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
        `max_norm` or `norm_type`. We are not supporting these.
        Nz/padding_idx must be within num_embeddings. Got z and )num_embeddingsembedding_dimrG   dtypepadding_idxFr   )rj   rk   rG   rl   r3   )
ValueErrorsuper__init__rj   rm   num_additional_embeddingsrg   weightr^   r   rT   additional_embedding)
selfrj   rq   rk   rg   rG   rl   rm   kwargs	__class__s
            r5   rp   z"IdeficsDecoupledEmbedding.__init__   s    6 "{^'CN{m[`ao`pqrr 	
)'#	
 	
 -&)B& 0KK&&u-))A-(*#==+	)D% .r4   c                 b   | j                   dk(  r t        j                  || j                        S |j	                         }t        j                  || j                  k\        }||   }| j                  || j                  z
        }d||<   t        j                  || j                        }|||<   |S )a  
        we have 2 embeddings, with different indices - one pretrained self.weight and another
        self.additional_embedding.weight that is being trained.

        in order to make a lookup of the input ids, we:
        1. find out the indices of the entries belonging to the 2nd embedding
        2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
           embedding starts from 0 and not num_embeddings
        3. perform the 2nd embedding lookup
        4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
        5. perform the 1st embedding lookup
        6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup

        note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
        then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
        i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
        usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
        measure.

        r   )	rq   F	embeddingrr   cloner0   whererj   rs   )rt   rJ   additional_vocab_indicesinput_ids_additional_vocabadditional_embeddingsfull_vectors         r5   forwardz!IdeficsDecoupledEmbedding.forward  s    * ))Q.;;y$++66 OO%	#(;;yD<O<O/O#P %./G%H" $ 9 9:TW[WjWj:j k /0	*+kk)T[[9 1F,-r4   c                 z    dj                  | j                  | j                  | j                  | j                        S )NzVnum_embeddings={}, num_additional_embeddings={}, embedding_dim={}, partially_freeze={})formatrj   rq   rk   rg   rt   s    r5   
extra_reprz$IdeficsDecoupledEmbedding.extra_repr.  s9    gnn**!!	
 	
r4   )FNNN)r,   r-   r.   r/   r   boolrp   r   strr   __classcell__rv   s   @r5   rf   rf      sF     ,13
 #4.3 
3j%N
C 
r4   rf   c                        e Zd ZdZ	 	 	 	 	 ddedededededdf fd	Zd
ej                  dej                  fdZ	de
fdZ xZS )IdeficsDecoupledLineara  
    Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
    then it will create `out_additional_features * in_features` additional parameters that are always trained. If
    `out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
    Nin_featuresout_featuresout_additional_featuresbiasrg   rh   c                 "   t         |   |||||       || _        || _        || _        || _        |r8| j                  j                  d       |r| j                  j                  d       |dkD  r t        j                  |||||      | _        yy)aG  
        out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
        `partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
        parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
        Fr   )r   r   r   rG   rl   N)ro   rp   r   rg   r   r   rr   r^   r   r   rS   additional_fc)	rt   r   r   r   r   rg   rG   rl   rv   s	           r5   rp   zIdeficsDecoupledLinear.__init__@  s     	lD&%H'>$ 0&(KK&&u-		((/"Q&!#'4"D 'r4   inputc                     t        j                  || j                  | j                        }| j                  dkD  r)| j                  |      }t        j                  ||fd      }|S )Nr   r;   )rx   linearrr   r   r   r   r0   cat)rt   r   outputadditional_featuress       r5   r   zIdeficsDecoupledLinear.forwardd  sV    %dii8''!+"&"4"4U";YY(;<bAFr4   c                     dj                  | j                  | j                  | j                  | j                  du| j
                        S )z=Overwriting `nn.Linear.extra_repr` to include new parameters.zYin_features={}, out_features={}, out_additional_features={}, bias={}, partially_freeze={}N)r   r   r   r   r   rg   r   s    r5   r   z!IdeficsDecoupledLinear.extra_reprm  sE    jqq((IIT!!!
 	
r4   )r   TTNN)r,   r-   r.   r/   intr   rp   r0   Tensorr   r   r   r   r   s   @r5   r   r   7  s     ()!%"" " "%	"
 " " 
"HU\\ ell 
C 
r4   r   c                   ,     e Zd Zd fd	Zd Zd Z xZS )IdeficsRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z=
        IdeficsRMSNorm is equivalent to T5LayerNorm
        N)ro   rp   r   	Parameterr0   onesrr   variance_epsilon)rt   hidden_sizeepsrv   s      r5   rp   zIdeficsRMSNorm.__init__z  s1     	ll5::k#:; #r4   c                    |j                  t        j                        j                  d      j	                  dd      }|t        j
                  || j                  z         z  }| j                  j                  t        j                  t        j                  fv r%|j                  | j                  j                        }| j                  |z  S )N   r;   T)keepdim)rF   r0   float32powmeanrsqrtr   rr   rl   float16bfloat16)rt   r)   variances      r5   r   zIdeficsRMSNorm.forward  s     ##EMM266q9>>r4>P%Ht?T?T4T(UU ;; ??),,T[[->->?M{{]**r4   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tuplerr   rC   r   r   s    r5   r   zIdeficsRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr4   )gư>)r,   r-   r.   rp   r   r   r   r   s   @r5   r   r   y  s    $+Jr4   r   c                   .     e Zd Zd fd	Zd ZddZ xZS )IdeficsEmbeddingc                    t         |           || _        || _        || _        d| j                  t        j                  d| j                  dt
        j                        j                  |t
        j                        | j                  z  z  z  }| j                  d|d       | j                  || j                  j                  t        j                         	       y )
N      ?r   r   rl   rG   rl   inv_freqF
persistentseq_lenrG   rl   )ro   rp   dimmax_position_embeddingsbaser0   rB   int64rF   floatregister_buffer_set_cos_sin_cacher   rG   get_default_dtype)rt   r   r   r   rG   r   rv   s         r5   rp   zIdeficsEmbedding.__init__  s    '>$	IIQ!5;;?BB&X]XcXcBdgkgogooq
 	ZeD 	+DMM4H4HPUPgPgPi 	  	
r4   c                    || _         t        j                  | j                   |t        j                        j	                  | j
                        }t        j                  d|| j
                        }t        j                  ||fd      }| j                  d|j                         j                  |      d       | j                  d|j                         j                  |      d       y )	Nr   zi,j->ijr;   r   
cos_cachedFr   
sin_cached)max_seq_len_cachedr0   rB   r   type_asr   einsumr   r   cosrF   sin)rt   r   rG   rl   rY   freqsembs          r5   r   z#IdeficsEmbedding._set_cos_sin_cache  s    ")LL00u{{S[[\`\i\ijY4==9iiB/\3779<<+>5Q\3779<<+>5Qr4   c                    || j                   kD  r(| j                  ||j                  |j                         | j                  d | j                  |j                        | j                  d | j                  |j                        fS )Nr   r   )r   r   rG   rl   r   rF   r   )rt   xr   s      r5   r   zIdeficsEmbedding.forward  sy    T,,,##GAHHAGG#T OOHW%((qww(7OOHW%((qww(7
 	
r4   )i   i'  NrV   )r,   r-   r.   rp   r   r   r   r   s   @r5   r   r     s    
"R
r4   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr;   r   r   )rC   r0   r   )r   x1x2s      r5   rotate_halfr     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r4   c                     ||   j                  |      }||   j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )an  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   position_idsunsqueeze_dimq_embedk_embeds           r5   apply_rotary_pos_embr     sg    * l

%
%m
4C
l

%
%m
4C3w;q>C/0G3w;q>C/0GGr4   c                   2     e Zd Zdededef fdZd Z xZS )
IdeficsMLPr   intermediate_size
hidden_actc                     t         |           t        j                  ||d      | _        t        j                  ||d      | _        t        j                  ||d      | _        t        |   | _        y )NFr   )	ro   rp   r   rS   	gate_proj	down_projup_projr   act_fn)rt   r   r   r   rv   s       r5   rp   zIdeficsMLP.__init__  s[     	;0AN#4kNyy.?eLZ(r4   c                     | j                  | j                  | j                  |            | j                  |      z        S rV   )r   r   r   r   )rt   r   s     r5   r   zIdeficsMLP.forward  s0    ~~dkk$..*;<t||ANOOr4   )r,   r-   r.   r   r   rp   r   r   r   s   @r5   r   r     s*    
)
) 
) 	
)Pr4   r   rZ   querykeyvaluerA   scalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr;   )r   rl   ptrainingr   r   )r0   matmul	transposer   
functionalsoftmaxr   rF   rl   r   r   
contiguous)
rZ   r   r   r   rA   r   r   ru   attn_weightsattn_outputs
             r5   eager_attention_forwardr     s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r4   c                       e Zd ZdZ	 	 	 	 	 ddededededededee   f fd	Z	d
e
j                  dedefdZ	 	 	 	 	 	 	 dde
j                  dee
j                     dee
j                     dee
j                     deee
j                        dededee
j                     dee
j                  ee
j                     eee
j                        f   fdZ xZS )IdeficsAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   	num_headsr   is_cross_attentionconfigqk_layer_norms	layer_idxc                    t         	|           || _        || _        || _        ||z  | _        || _        d| _        | j
                  dz  | _        || _	        |-t        j                  d| j                  j                   d       | j
                  |z  | j                  k7  rt        d| j                   d| d      || _        t!        t"        j$                  d      st        d	      | j                  rt!        |j&                  d
      s| j                  n|j&                  j(                  }t#        j*                  | j                  || j
                  z  d      | _        t#        j*                  ||| j
                  z  d      | _        t#        j*                  ||| j
                  z  d      | _        nt#        j*                  | j                  || j
                  z  d      | _        t#        j*                  | j                  || j
                  z  d      | _        t#        j*                  | j                  || j
                  z  d      | _        t#        j*                  || j
                  z  |d      | _        t5        | j
                        | _        || _        | j8                  rMt;        | j
                  |j<                        | _        t;        | j
                  |j<                        | _         y y )NTg      zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).scaled_dot_product_attentionz)this model requires pytorch 2.0 or higher	embed_dimFr   r   )!ro   rp   r   r   r   head_dimr   	is_causalr   r   loggerwarning_oncerv   r,   rn   r   hasattrr   r   vision_configr  rS   q_projk_projv_projo_projr   
rotary_embr   r   rms_norm_epsq_layer_normk_layer_norm)
rt   r   r   r   r   r   r   r   kv_input_dimrv   s
            r5   rp   zIdeficsAttention.__init__  s    	&"#y0}}d*" !8!8 9 :, , MMI%$*:*::QRVRbRbQc$YKr3 
 #5r}}&DEHII""(/0D0Dk(R  X^XlXlXvXv  ))  DMM)DK
 ))L)dmm2KRWXDK))DMM)DK ))  DMM)DK
 ))  DMM)DK
 ))  DMM)DK
 ii%

 +4==9, .t}}&BUBU VD .t}}&BUBU VD r4   tensorr   bszc                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr   r   )rD   r   r  r   r   )rt   r  r   r  s       r5   _shapezIdeficsAttention._shape]  s7    {{3GQQRSUVWbbddr4   r)   key_value_statesrA   r   past_key_valueoutput_attentions	use_cachecache_positionrh   c	                 @   | j                   xs |d u}
|j                         \  }}}| j                  |      j                  ||| j                  | j
                        j                  dd      }|
s| j                  |      j                  ||| j                  | j
                        j                  dd      }| j                  |      j                  ||| j                  | j
                        j                  dd      }n|j                         \  }}}| j                  |      j                  ||| j                  | j
                        j                  dd      }| j                  |      j                  ||| j                  | j
                        j                  dd      }|j                  d   }|||d   z  }|
s2| j                  |t        ||            \  }}t        |||||      \  }}|%d|i}|j                  ||| j                  |      \  }}| j                  r"| j!                  |      }| j#                  |      }t$        }| j&                  j(                  dk7  rN| j&                  j(                  dk(  r|rt*        j-                  d	       nt.        | j&                  j(                     } || ||||f| j0                  sd
n| j2                  | j4                  d|	\  }}|j7                  ||d      j9                         }| j;                  |      }|rd }|||fS )Nr   r   r   r   )r   r  eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   r;   )r   sizer	  rD   r   r  r   r
  r  rC   r  maxr   updater   r   r  r  r   r   _attn_implementationr  r  r   r   r   r   reshaper   r  )rt   r)   r  rA   r   r  r  r  r  ru   r   r  q_len_query_states
key_statesvalue_stateskv_len
kv_seq_lenr   r   cache_kwargsattention_interfacer   r   s                            r5   r   zIdeficsAttention.forward`  s    "44T8HPT8T%**,UA{{=166sE4>>SWS`S`akklmopq!]388eT^^UYUbUbcmmnoqrsJ;;}5::3t~~W[WdWdeoopqstuL+002LAvq%56;;CY]YfYfgqqrsuvwJ,-223PTP]P]^hhijlmn   %%b)
%.++J!|SU=STHC';L*VY[^`l'm$L* %,n=L'5'<'<ZW[WeWegs't$J,,\:L**:6J(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7	%
  $}}C$,,LL	%
 	%
!\ "))#ub9DDFkk+.LL.88r4   )r  FNFNNNNNFFN)r,   r-   r.   r/   r   r   r   r   r   rp   r0   r   r  
LongTensorr	   r   r   r   s   @r5   r   r   	  sv   G #(#'$#'OWOW OW 	OW
 !OW !OW OW C=OWbeU\\ eC ec e 4815378<"'59J9||J9 #5<<0J9 !.	J9
 u//0J9 !u||!45J9  J9 J9 !!1!12J9 
u||Xell3XeELL>Q5RR	SJ9r4   r   c                   J    e Zd Zddedee   f fdZ	 	 	 	 	 	 ddej                  deej                     deej                     dee
ej                        dee   d	ee   d
eej                     de
ej                  ee
ej                  ej                  f      f   fdZ xZS )IdeficsDecoderLayerr   r   c                    t         |           |j                  | _        t        | j                  |j                  |j
                  ||      | _        t        | j                  |j                  |j                        | _
        t        |j                  |j                        | _        t        |j                  |j                        | _        |j
                  | _        y )N)r   r   r   r   r   r   r   r   r  )ro   rp   r   r   num_attention_headsr   	self_attnr   r   r   mlpr   r  input_layernormpost_attention_layernormrt   r   r   rv   s      r5   rp   zIdeficsDecoderLayer.__init__  s    !--)((00NN
 (($66((

  .f.@.@fFYFYZ(6v7I7IvObOb(c%~~r4   r)   rA   r   r  r  r  r  rh   c                    |}	| j                  |      } | j                  d|||||||d|\  }}
}t        j                  j	                  || j                  | j
                        }|	|z   }|}	| j                  |      }| j                  |      }t        j                  j	                  || j                  | j
                        }|	|z   }|f}|r||
fz  }|r||fz  }|S )a]  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        )r)   rA   r   r  r  r  r  r   r3   )r6  r4  r   r   r   r   r7  r5  )rt   r)   rA   r   r  r  r  r  ru   residualself_attn_weightspresent_key_valueoutputss                r5   r   zIdeficsDecoderLayer.forward  s   2 !,,]; ?Mdnn 	?
')%)/)	?
 	?
;(*; --mt||VZVcVc-d =0 !55mD/--mt||VZVcVc-d =0 ")++G)++Gr4   rV   )NNNFFN)r,   r-   r.   r   r   r   rp   r0   r   r.  r	   r   r1   r   r   r   s   @r5   r0  r0    s    &} &# &, 26378<,1$)59:||: !.: u//0	:
 !u||!45: $D>: D>: !!1!12: 
u  (51B1BEDUDU1U+V"WW	X:r4   r0  c                   j    e Zd Zddedee   f fdZ	 	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee
ej                        de
ej                  ee
ej                  ej                  f      f   fdZ xZS )IdeficsGatedCrossAttentionLayerr   r   c           	      	   t         |           |j                  | _        t        | j                  |j                  d|j
                  ||j                  |      | _        t        | j                  |j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        |j
                  | _        t#        j$                         | _        t#        j$                         | _        |j*                  dk(  r|j,                  dk(  rtt#        j.                  t1        j2                  dd| j                              | _        t#        j.                  t1        j2                  dd| j                              | _        n|j,                  dk(  r\t#        j.                  t1        j2                  d            | _        t#        j.                  t1        j2                  d            | _        nt9        d	|j,                   d
      |j*                  dk(  r|j,                  dk(  rtt#        j.                  t1        j:                  dd| j                              | _        t#        j.                  t1        j:                  dd| j                              | _        n|j,                  dk(  r\t#        j.                  t1        j:                  d            | _        t#        j.                  t1        j:                  d            | _        n|t9        d	|j,                   d
      |j*                  dv r;|j,                  dk(  rt#        j.                  t1        j<                  d|j>                  dd| j                  f            | _        t#        j.                  t1        j<                  d|j>                  dd| j                  f            | _        n|j,                  dk(  rut#        j.                  t1        j<                  d|j>                  d            | _        t#        j.                  t1        j<                  d|j>                  d            | _        n2t9        d	|j,                   d
      tA        d|j*                   d      tC        | d      rtC        | d      st9        d      y )NT)r   r   r   r   r   r   r   r2  r  zerosvectorr   r   z Unknown value for `alpha_type` ()r   >   normalrandomgaussianr  )r   stdr  zAlpha initialization scheme z not yet implemented!alpha_cross_attnalpha_densez+Alpha parameters not initialized correctly!)"ro   rp   r   r   r3  r   r   
cross_attnr   r   r   r5  r   r  r6  r7  r   r   Tanhact_cross_attn	act_densealpha_initializer
alpha_typer   r0   rA  rH  rI  rn   r   rD  alphas_initializer_rangeNotImplementedErrorr  r8  s      r5   rp   z(IdeficsGatedCrossAttentionLayer.__init__   s   !--*((00#NN!00
 (($66((

  .f.@.@fFYFYZ(6v7I7IvObOb(c%nn ggi##w.  H,(*U[[AtGWGW5X(Y%#%<<Aq$BRBR0S#T ""g-(*U[[^(D%#%<<A#?  #CFDUDUCVVW!XYY%%/  H,(*UZZ1dFVFV5W(X%#%<<

1aAQAQ0R#S ""g-(*UZZ](C%#%<<

1#>  #CFDUDUCVVW!XYY%%)II  H,(*LLcv/N/NVWYZ\`\l\lUmn)% $&<<LLcv/N/NVWYZ\`\l\lUmn$  ""g-(*LLcv/N/NVWY)% $&<<#6KjKjrs0u#v  #CFDUDUCVVW!XYY &(DVE]E]D^^s&tuu01gdM6RJKK 7Sr4   r)   rA   r+   r?   cross_attention_gater  r  r  rh   c	                    |t        d      |t        d      |t        d      |}
| j                  |      } | j                  d	||||d|	\  }}}t        j
                  j                  || j                  | j                        }|j                  |dk(  dddddf   d      }|
| j                  | j                        |z  z   }|}
| j                  |      }| j                  |      }t        j
                  j                  || j                  | j                        }|
| j                  | j                        |z  z   }|f}|r||fz  }|r||fz  }|S )
a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            image_attention_mask (`torch.FloatTensor`, *optional*): image attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            cross_attention_gate (`torch.FloatTensor`, *optional*):
                gate of size `(batch, seq_len)` used to zero-out cross-attention output for tokens attending no images.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        Nzt`image_hidden_states` is required for Idefics cross attention module which are visual features to be conditioned on.z`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention hidden_states attending to no images.zMPast key value states are not implemented for Idefics cross attention module.)r)   r  rA   r  r   r   r  r3   )rn   rQ  r6  rJ  r   r   r   r   r   masked_fillrL  rH  r7  r5  rM  rI  )rt   r)   rA   r+   r?   rR  r  r  r  ru   r:  r;  r<  r=  s                 r5   r   z'IdeficsGatedCrossAttentionLayer.forwardB  s   : &# 
  ' ^  %%&uvv ,,]; ?Ndoo ?
'0//	?

 ?
;(*; --mt{{UYUbUb-c%113G13LaQRTXj2Y[^_ 4#6#6t7L7L#MP]#]] !55mD/--mt{{UYUbUb-c 4>>$2B2B#Cm#SS ")++G)++Gr4   rV   r-  )r,   r-   r.   r   r   r   rp   r0   r   r   r	   r1   r   r   r   s   @r5   r?  r?    s   @L} @L# @LJ 266:7;7;,1$)8<K||K !.K &ell3	K
 'u||4K 'u||4K $D>K D>K !u||!45K 
u  (51B1BEDUDU1U+V"WW	XKr4   r?  c                   :    e Zd ZeZdZdZddgZdZdZ	dZ
dZdZd Zy)IdeficsPreTrainedModelr_   Tr0  r?  Fc                    | j                   j                  }t        |t        j                  t        j
                  f      rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j                  j                  j                  d|       |j                  2|j                  j                  |j                     j                          y y t        |t        j                        rJ|j                  j                  j                  d       |j                  j                  j                          y t        |t              r&|j                  j                  j                  d       y t        |t               r%|j"                  j                  j                          y t        |t$              rV| j                   j&                  dk(  rI|j(                  j                  j                          |j*                  j                  j                          y | j                   j&                  dk(  rK|j(                  j                  j                  d       |j*                  j                  j                  d       y | j                   j&                  dv rw|j(                  j                  j                  d| j                   j,                         |j*                  j                  j                  d| j                   j,                         y y t        |t.              r%|j0                  j                  j                          y y )Nr  )r   rG  r   rA  r   >   rD  rE  rF  )r   initializer_rangerW   r   rS   Conv2drr   datanormal_r   zero_rT   rm   rR   fill_r   r!   class_embeddingr?  rN  rH  rI  rP  r    latents)rt   rZ   rG  s      r5   _init_weightsz$IdeficsPreTrainedModel._init_weights  sp    kk++fryy"))45MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> .-MM$$S)KK""$/MM$$S) 78""''//1 ?@{{,,7'',,224""''--/..&8'',,2237""''--c2..2RR'',,44#4;;CgCg4h""''//Sdkk>b>b/c S  9:NN'') ;r4   N)r,   r-   r.   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_cache_class_supports_flash_attn_2_supports_static_cache_supports_attention_backendr`  r3   r4   r5   rV  rV    sA     L&*#.0QRN !""&*r4   rV  c                       e Zd Zy)KwargsForCausalLMN)r,   r-   r.   r3   r4   r5   rk  rk    s    r4   rk  c            '           e Zd ZdZdef fdZd#dZg fdZg fdZd Z	d Z
ee	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d$d	eej                     d
eej                      deej                     deeej$                        deej$                     deej$                     deej$                     deej$                     deej                      dee   dee   dee   dee   dee   deej                     dee   deeef   f"d              Z	 d%d
eej                   df   dej                   dej                   dedef
dZed
ej                   deded ej<                  dej                   d!efd"       Z xZ S )&IdeficsModelz
    Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]

    Args:
        config: IdeficsConfig
    r   c           	      j   t         |   |       || _        |j                  | _        |j
                  | _        t        |j
                  |j                  |j                  |j                  | j                        | _
        |j                  j                  | _        |j                  | _        t        |j                        | _        |j                  r]|j                   }t#        ||j                  j$                  |j&                  |j(                  |j*                  |j,                        | _        t1        j2                  t5        |j6                        D cg c]  }t9        ||       c}      | _        |j<                  | _        |j6                  | j<                  z  }t1        j2                  t5        |      D cg c]  }t?        ||       c}      | _         d| _!        tE        |j                  |jF                        | _$        | jK                          | jM                  |       y c c}w c c}w )N)rj   rq   rk   rg   rm   )r   Fr  )'ro   rp   r   pad_token_idrm   
vocab_sizerf   additional_vocab_sizer   freeze_text_layersembed_tokensr  
image_sizer"   vision_modeluse_resamplerperceiver_configr    r  resampler_depthresampler_n_headsresampler_head_dimresampler_n_latentsperceiver_resamplerr   
ModuleListrangenum_hidden_layersr0  layerscross_layer_intervalr?  gated_cross_attn_layersgradient_checkpointingr   r  norm	post_initfreeze_relevant_params)rt   r   rw  inum_cross_layersrv   s        r5   rp   zIdeficsModel.__init__  s    !.. ++5!,,&,&B&B ,,#66((
 !..99#114V5I5IJ %66'@$$.. 00 22 33 44(D$ mm?DVE]E]?^_! 15_
 %+$?$?!!33t7P7PP')}}KPQaKbca,VqAc(
$ ',#"6#5#56;N;NO	 	##F+ ` ds   H+<H0c                     || j                   }|j                  r| j                  |j                         |j                  r"t	        | j
                  |j                         y y N)r`   )r   rr  freeze_text_module_exceptionsfreeze_vision_layersrd   ru  freeze_vision_module_exceptions)rt   r   s     r5   r  z#IdeficsModel.freeze_relevant_params  sQ    >[[F$$##F$H$HI&&**f>d>de 'r4   c                 X    | j                   | j                  fD ]  }t        ||        y r  )r  r  rd   )rt   r`   rZ   s      r5   rr  zIdeficsModel.freeze_text_layers  s+    {{DII. 	FF3DE	Fr4   c                 2    t        | j                  |       y r  )rd   ru  )rt   r`   s     r5   r  z!IdeficsModel.freeze_vision_layers  s    T&&:KLr4   c                     | j                   S rV   rs  r   s    r5   get_input_embeddingsz!IdeficsModel.get_input_embeddings  s       r4   c                     || _         y rV   r  rt   r   s     r5   set_input_embeddingsz!IdeficsModel.set_input_embeddings  s
    !r4   rJ   rA   r   r(   inputs_embedsr<   r=   r>   r?   r  r  output_hidden_statesinterpolate_pos_encodingreturn_dictr  ru   rh   c                    ||j                   n|j                   }||n| j                  j                  }||n| j                  j                  }|
|
n| j                  j                  }
||n| j                  j
                  }|du |duz  rt        d      | j                  r%| j                  r|
rt        j                  d       d}
|| j                  |      }d}|
rIt        |t              s9d}|t               }n*t        j                  |      }t        j                  d       |j                   \  }}}||j#                         nd}||z   }|2t%        j&                  |||j                   d   z   |j                   	      }|F|D|j)                         j+                  d
      dz
  }|j-                  |dk(  d       |dd| df   }n||j/                  d      }t1        |||fD cg c]  }|du  c}      dk7  rt        d      |~|j3                  | j4                  |      }|j                   dd \  }} |j7                         j8                  ||z  g|j                   dd  }| j;                  ||      j<                  }nJ|H|j?                         \  }}}}|j3                  | j4                  |      }|j9                  ||z  ||      }| j                  j@                  rN|4| jC                        }|j?                  d      |j?                  d      }}n|j?                         \  }}}}|}n0|#j?                  d      |j?                  d      }}nt        d      |j9                  ||z  |      }|	j?                  d      }|	j/                  d
      }	|	jE                  ddd|      }	|	j9                  ||||z        }	|C|j?                         \  }}}||f} |	t%        jF                  | |	      }	| jI                  |	      }	nd}	|	dk(  jK                  d
      j3                  | j4                        jM                  d      j3                  |      }!|2t%        jF                  ||ft$        jN                  |j                         }| jQ                  |||||      }|}"|rdnd}#|rdnd}$d}%tS        | jT                        D ]  \  }&}'|r|#|"fz  }#fd}(| j                  r[| j                  rOd}|
rt        j                  d       d}
| jW                  |(|'|"|||||	|!||
|&| jX                  | jZ                  |      })n+ |(|'|"f|||||	|!||
|&| jX                  | jZ                  |d})|)d   }"|
r	|)|rdnd   }%|s|$|)d   fz  }$ | j]                  |"      }"|r|#|"fz  }#|
r|%nd}*|r|*j_                         }*|j9                  ||||      }ta        |"|*|#|$|      S c c}w )ab  
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.
        Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FTzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)r   r   rG   r;   r   z_Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None.)rl   rG   )r<   r  zBIf `perceiver_embeddings` are passed, use_resampler should be Truer  r   r   r3   c                 t    |
|z  dk(  r||
|z     } ||f||||||	d d}|d   } | |f|||||	|d}|S )Nr   )rA   r+   r?   rR  r  r  r  )rA   r   r  r  r  r  r3   )
main_blockr)   rA   r   r  r+   r?   rR  r  r  r   r  r  r  xblockr=  layer_outputsru   s                    r5   vblockz$IdeficsModel.forward.<locals>.vblock  s    " 33q84YBV5VWF$%
'5,?-A-A*;"+'+
 !
G %,AJM *!	!#1!-#1&7'#1	! 	! %$r4   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...)rA   r   r  r+   r?   rR  r  r  r   r  r  r  )r'   r(   r)   r*   r+   )1rG   r   r  r  r  use_return_dictrn   r  r   r  r  rs  rW   r   r   from_legacy_cacherC   get_seq_lengthr0   rB   longcumsummasked_fill_r   sumrF   rl   r   rD   ru  r'   r  rv  r|  rE   r   invert_attention_maskr]   squeezer   _update_causal_mask	enumerater  _gradient_checkpointing_funcr  r  r  to_legacy_cacher&   )+rt   rJ   rA   r   r(   r  r<   r=   r>   r?   r  r  r  r  r  r  ru   rG   return_legacy_cache
batch_size
seq_lengthr%  past_key_values_lengthseq_length_with_pastr   
num_imagesr+   image_seq_lenimage_hidden_sizetext_seq_lenimage_batch_sizeimage_sequence_lengthimage_hidden_shaperR  r)   all_hidden_statesall_self_attnsnext_decoder_cacheidxdecoder_layerr  r  
next_caches+                   `                          r5   r   zIdeficsModel.forward  s   : &/%:!!@T@T1B1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==Yj I  --i8M $Z?"&&".."."@"@"Q##^ %2$7$7!
JETE`!?!?!Afg),BB!"\\&(>ATATUVAW(W`m`t`tN %,*>)..077;a?L%%n&91='J;<8L!)33A6LL2JL`#abaT	bcghhq  %'??F?KL%1%7%7%;"J
9<22499*z:QkT`TfTfghgiTjkL #'"3"3)D\ #4 #   &1G_GdGdGfDJ
M3D":"="=DJJW]"="^"5":"::
;RTact"u;;$$#+'+'?'?@S'T$3G3L3LQ3OQeQjQjklQm0K_KdKdKfH
J7H"6!)/B/G/G/JL_LdLdefLg,Mabb166z:P]C]_pq ,0033==bA3::1aMR388\S]`mSmn*9L9Q9Q9S63Q"24I!J#+',zz2DV'T$#'#=#=>R#S #'  $83#>"C"C"C"K!O!OVZV`V`!O!a j jop j quu 

 !"ZZ12%**]MaMaN 11M>?L]
 & #7BD0d!"+DKK"8 c	6C#!m%55!+%Z **t}}"&''t !&I $ A A!!" #'((%--00"!$ !'!!! $2!-#2(;)=)=&7'!)-)B)B,0,H,H#1! !$ *!,M%28I1q%Q" =#3"55Gc	6J 		-0  -!11+4'$
#335J166z:}^op-+&+% 3
 	
y cs   -W(r#   input_tensorc           	         | j                   j                  dk(  r||dk(  j                         r|S y | j                   j                  dk(  r't        |t        j
                        rt        |      }|S ||j                         nd}||j                  nd}| j                   j                  dk(  r(|s&|s$t        j                  |||| j                        ry |j                  }|j                  d   }	|r|j                         }
n1t        |t        j
                        r|j                  d	   n||	z   dz   }
| j                  ||	|
|||j                  d   
      }| j                   j                  dk(  rQ|O|j                   j"                  dv r7|s5t	        j$                  |      j&                  }t        j(                  ||      }|S )Nflash_attention_2r  flex_attentionr   Fr  )r  r  is_trainingr   r;   )sequence_lengthtarget_lengthrl   r  r  )cudaxpunpu)r   r"  r]   rW   r0   r   r$   r  is_compileabler   _ignore_causal_mask_sdpar   rl   rC   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionrG   typefinfomin_unmask_unattended)rt   rA   r  r  r(   r  past_seen_tokensusing_compilable_cacherl   r  r  causal_mask	min_dtypes                r5   r  z IdeficsModel._update_causal_mask(  s    ;;++/BB)~/D.I.I.K%%;;++/??.%,,7!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr4   r  r  rl   r  c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuerl   rG   r   )diagonalr  r;   r   )r   r0   r  r  fullrG   triurB   r#  expandrz   rC   rF   rT  )rA   r  r  rl   r  r  ru   r  r  mask_lengthpadding_masks              r5   r  zBIdeficsModel._prepare_4d_causal_attention_mask_with_cache_positionl  s   > %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r4   rV   )NNNNNNNNNNNNFNNF)!r,   r-   r.   r/   r   rp   r  rr  r  r  r  r   r   r   r0   r.  r   r   r1   r   r   r   r
   r	   r&   r   r   r  staticmethodr   rl   r  r   r   s   @r5   rm  rm    s   .,} .,`f 46 F 68 M!"  151537=A5948@D<@7;$(,0/338&*59!R
E,,-R
 !.R
 u//0	R

 "$u'8'8"9:R
   1 12R
 u001R
 #+5+<+<"=R
 'u'8'89R
 'u||4R
 D>R
 $D>R
 'tnR
 #+4.R
 d^R
  !!1!12!R
" -.#R
$ 
u44	5%R
  R
v #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r4   rm  c            )           e Zd ZddgZd$ fd	Zd Zd Zd Zd Zd Z	d	 Z
d
 Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d%deej                      deej"                     deej                      deeej&                        deej&                     deej&                     deej&                     deej&                     deej"                     deej                      dee   dee   dee   dee   dee   deej                      dee   deeef   f$d              Z	 	 	 	 	 	 	 	 	 d& fd	Z	 d'ded eeef   d!edeeef   f fd"Z e!d#        Z" xZ#S )(IdeficsForVisionText2Textzmodel.embed_tokens.weightzlm_head.weightc                     t         |   |       t        |      | _        t	        |j
                  |j                  |j                  d|j                        | _	        | j                          y )NF)r   r   r   r   rg   )ro   rp   rm  r_   r   r   rp  rq  freeze_lm_headlm_headr  )rt   r   ru  rv   s      r5   rp   z"IdeficsForVisionText2Text.__init__  s[     !&)
-****$*$@$@#22
 	r4   c                 .    | j                   j                  S rV   r_   rs  r   s    r5   r  z.IdeficsForVisionText2Text.get_input_embeddings  s    zz&&&r4   c                 &    || j                   _        y rV   r  r  s     r5   r  z.IdeficsForVisionText2Text.set_input_embeddings  s    "'

r4   c                     | j                   S rV   r  r   s    r5   get_output_embeddingsz/IdeficsForVisionText2Text.get_output_embeddings  s    ||r4   c                     || _         y rV   r  )rt   new_embeddingss     r5   set_output_embeddingsz/IdeficsForVisionText2Text.set_output_embeddings  s	    %r4   c                     || _         y rV   r_   )rt   decoders     r5   set_decoderz%IdeficsForVisionText2Text.set_decoder  s	    
r4   c                     | j                   S rV   r  r   s    r5   get_decoderz%IdeficsForVisionText2Text.get_decoder  s    zzr4   c                    | j                         }| j                         }t        | j                  dd      r`|j                  |_        |j
                  dkD  r@|j                  |j
                  k(  sJ |j                  j                  |j                  _        t        |d      rJt        |d      r=|j                  |_        t        |d      rt        |d      r|j
                  |_        yyyyy)	z
        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
        IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
        tie_word_embeddingsTr   r   rj   r   rq   N)r  r  getattrr   rr   rq   r   rs   r   r  rj   r   )rt   output_embeddingsinput_embeddingss      r5   tie_weightsz%IdeficsForVisionText2Text.tie_weights  s    
 !6684464;; 5t<'7'>'>$99A=(@@DTDnDnnnn9I9^9^9e9e!//6$n5'BRTd:e-=-L-L*(*CD "=J =M<f<f!9JD ;f5r4   rJ   rA   r   r(   r  r<   r=   r>   r?   labelsr  r  r  r  r  r  ru   rh   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  } | j                  d|||||||||	||||d|d|}|d   }| j                  |      }d}|
* | j                  d||
| j                   j                  d|}t        |||j                  |j                  |j                  |j                        S )aC  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.

        Example:

        ```python
        >>> from transformers import AutoProcessor, IdeficsForVisionText2Text

        >>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b")
        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

        >>> dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
        >>> dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg"

        >>> prompts = [
        ...     [
        ...         "User:",
        ...         dogs_image_url_1,
        ...         "Describe this image.\nAssistant: An image of two dogs.\n",
        ...         "User:",
        ...         dogs_image_url_2,
        ...         "Describe this image.\nAssistant:",
        ...     ]
        ... ]
        >>> inputs = processor(prompts, return_tensors="pt")
        >>> generate_ids = model.generate(**inputs, max_new_tokens=6)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)
        ```NT)rJ   rA   r   r(   r  r<   r=   r>   r?   r  r  r  r  r  r  r   )r9   r  rp  )r8   r9   r(   r)   r*   r+   r3   )r   r  r  r  r_   r  loss_functionrp  r7   r(   r)   r*   r+   )rt   rJ   rA   r   r(   r  r<   r=   r>   r?   r  r  r  r  r  r  r  ru   r=  r)   r9   r8   s                         r5   r   z!IdeficsForVisionText2Text.forward  s%   x 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] $** 
)%+'%%=!5!5/!5%=)
  !
&  
m,%4%%pVFt{{OeOepiopD,#33!//)) ' ; ;
 	
r4   c                    i }|"| j                   j                  r||d<   n||d<   n||d<   |j                  dd      |d<   t        |   |f||||||
|	d||}|	#|!|d   j
                  d   }|	d d | d f   |d	<   |S )
Nr>   r=   r<   r  F)r(   rA   r  r  r   r  r?   rJ   r   r?   )r   rv  popro   prepare_inputs_for_generationrC   )rt   rJ   rA   r   r  r(   r  r<   r+   r?   r  ru   images_kwargsmodel_inputsr  rv   s                  r5   r   z7IdeficsForVisionText2Text.prepare_inputs_for_generationD  s      *{{((8K45<O89,8M.)4:JJ?Y[`4a01w<
+)')%!5
 
 
  +0E%k288;J3GJ;<3XL/0r4   r=  rN   rL   c                     t        |   |||fi |}d|v rT|d   }|d d dd d f   j                  d      }|j                  dd      r||d<   nt	        j
                  ||gd      |d<   |j                  |d<   |S )Nr?   r;   r   r  Tr   r+   )ro   #_update_model_kwargs_for_generationr   rI   r0   r   r+   )rt   r=  rN   rL   ru   r?   	last_maskrv   s          r5   r  z=IdeficsForVisionText2Text._update_model_kwargs_for_generationq  s     wB
 	
 "\1#/0F#G ,QAX6@@CIT27@347<yyBVXaAbhi7j34 /6.I.I*+r4   c                 J    d}| D ]  }|t        fd|D              fz  } |S )Nr3   c              3   B   K   | ]  }|j                  d         yw)r   N)rH   )rX   
past_statebeam_idxs     r5   r[   z;IdeficsForVisionText2Text._reorder_cache.<locals>.<genexpr>  s     $gjZ%<%<Q%I$gs   )r   )pastr	  reordered_past
layer_pasts    `  r5   _reorder_cachez(IdeficsForVisionText2Text._reorder_cache  s8     	jJu$g\f$ggiiN	jr4   rV   )NNNNNNNNNNNNNFNN)	NNNNNNNNNr  )$r,   r-   r.   _tied_weights_keysrp   r  r  r  r  r  r  r  r   r   r   r0   r.  r   r   r1   r   r   rk  r
   r	   r7   r   r   r   r   r   r   r  r  r  r   r   s   @r5   r  r    sc   57GH'(&g*  151537=A5948@D<@7;-1$(,0/338&*59#b
E,,-b
 !.b
 u//0	b

 "$u'8'8"9:b
   1 12b
 u001b
 #+5+<+<"=b
 'u'8'89b
 'u||4b
 ))*b
 D>b
 $D>b
 'tnb
 #+4.b
  d^!b
" !!1!12#b
$ *+%b
& 
u33	4'b
  b
N  !+b $)	 38n !	 
c3h4  r4   r  )r  rm  rV  )r   FNN)r   )r  )Tr/   dataclassesr   typingr   r   r   r   r   r	   r
   r0   torch.nn.functionalr   r   rx   torch.utils.checkpointactivationsr   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   r   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   configuration_ideficsr   	perceiverr    visionr!   r"   !torch.nn.attention.flex_attentionr#   integrations.flex_attentionr$   
get_loggerr,   r  r&   r7   rP   rd   rT   rf   rS   r   Moduler   appendr   r   r   r   r   r   r   r   r0  r?  rV  rk  rm  r  __all__r3   r4   r5   <module>r&     sK  (  ! D D D      ! . ) > B + X X & 1 h h 0 0 E  !;J 
		H	% )C[ )C )CX &CK &C &CV *#Z +- k
 k
\>
RYY >
DJRYY J.    N +$
uxx $
N(:P P2 %II%<<% 
% <<	%
 U\\*% % %0a9ryy a9JN")) NbNbii Nb **_ ** **Z ?,j > `) ` `Fk 6 k\ Rr4   