
    Uh'T                        d Z ddlmZ ddlmZmZmZ ddlZddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ  ej:                  e      Zd Z  G d de	jB                        Z"d Z#d,dZ$	 d-de	jB                  dejJ                  dejJ                  dejJ                  deejJ                     de&de&fdZ' G d de	jB                        Z( G d d e	jB                        Z) G d! d"e	jB                        Z* G d# d$e	jB                        Z+ G d% d&e	jB                        Z,e G d' d(e             Z-d) Z.e G d* d+e-             Z/d+d(gZ0y).zPyTorch Pixtral model.    )Callable)OptionalTupleUnionN)nn   )ACT2FN)FlashAttentionKwargs)BaseModelOutput)dynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging   )PixtralVisionConfigc                    g }| D ]  }|j                   dd  \  }}t        j                  t        j                  |      t        j                  |      d      }t        j                  |d      j                  dd      j                  dd      \  }}||z  |z   }	|j                  |	d d df           t        j                  |      S )Nij)indexingdim   r   )	shapetorchmeshgridarangestackreshapechunkappendcat)
patch_embeds_list	max_width	positionspatchheightwidthmeshh_gridv_grididss
             ~/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/pixtral/modeling_pixtral.pyposition_ids_in_meshgridr1   %   s    I" $BC(~~ell62ELL4GRVWTr2::2qAGG2Ny 6)QT#$ 99Y    c                   \     e Zd ZdZd fd	Z ej                         ed               Z xZ	S )PixtralRotaryEmbeddinga  
    The key with pixtral embedding is just that you have a frequency for each pixel positions.
    If you have height x width pixels (or embedding pixels), then the frequency used for ROPE
    is given by indexing the pre_computed frequency on the width and height.

    What you output is of dimension (batch, height * width, dim) with dim the embed dim.

    This simply means that for each image hidden state, you are going to add
    a corresponding positional embedding, based on its index in the grid.
    c                    t         
|           d| _        |j                  | _        |j
                  | _        |j                  |j                  z  }d| j                  t        j                  d| j                  d      j                         | j                  z  z  z  }t        j                  ||j                        }t        j                  ||j                        }t        j                  ||d d d         j                         }t        j                  ||dd d         j                         }t        j                  |d d d d d f   j                  d|d      |d d d d d f   j                  |dd      gd      j!                  d| j                  dz        }	| j#                  d	t        j                  |	|	fd      d
       y )Ndefault      ?r   r   )devicer   r   r   inv_freqF)
persistent)super__init__	rope_typehead_dimr   
rope_thetabase
image_size
patch_sizer   r    floatr8   outerr%   repeatr"   register_buffer)selfconfigr8   max_patches_per_sidefreqshwfreqs_hfreqs_wr9   	__class__s             r0   r<   zPixtralRotaryEmbedding.__init__<   s}   "??%%	%00F4E4EEtyyU\\!TXXq%A%G%G%IDHH%TUVLL-ellCLL-ellC++ass,224++aqt!t-335994
#**1.BAFa
#**+?AF 
 '"dhh!m
$ 	 	ZHh3GR)P]bcr2   c                    | j                   |   }t        |j                  j                  t              r/|j                  j                  dk7  r|j                  j                  nd}t        j                  |d      5  |}|j                         }|j                         }d d d        j                  |j                        j                  |j                        fS # 1 sw Y   AxY w)NmpscpuF)device_typeenabled)dtype)r9   
isinstancer8   typestrr   autocastcossintorU   )rG   xposition_idsrJ   rS   embrZ   r[   s           r0   forwardzPixtralRotaryEmbedding.forwardU   s     l+'1!((--'E!((--[`J`ahhmmfk^^UC 	C'')C'')C	
 vvAGGv$cff177f&;;;	 	s   <#CC(N)
__name__
__module____qualname____doc__r<   r   no_gradr   r`   __classcell__rO   s   @r0   r4   r4   0   s2    	d2 U]]_	<  	<r2   r4   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr   r   r   )r   r   r%   )r]   x1x2s      r0   rotate_halfrl   d   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r2   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezerl   )qkrZ   r[   r^   unsqueeze_dimq_embedk_embeds           r0   apply_rotary_pos_embrt   k   sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr2   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr   r   )r   rU   )ptrainingr   r   )r   matmul	transposer   
functionalsoftmaxfloat32r\   rU   r{   r~   
contiguous)
ru   rv   rw   rx   ry   rz   r{   kwargsattn_weightsattn_outputs
             r0   eager_attention_forwardr      s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r2   c                        e Zd ZdZ fdZ	 	 	 d
dej                  deej                     deeej                  ej                  f      dee	   de
e   deej                  eej                     f   fd	Z xZS )PixtralAttentionzI
    Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
    c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        d| _        | j                  dz  | _	        d| _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        y )NFg      ࿩bias)r;   r<   rH   hidden_size	embed_dimnum_attention_heads	num_headsr>   	is_causalrz   attention_dropoutr{   r   Lineark_projv_projq_projo_projrG   rH   rO   s     r0   r<   zPixtralAttention.__init__   s    ++33$..8}}d*//iiUKiiUKiiUKiiUKr2   hidden_statesry   position_embeddingsoutput_attentionsr   returnc                 D   |j                         \  }}}| j                  |      }	| j                  |      }
| j                  |      }|	j	                  ||| j
                  | j                        j                  dd      }	|
j	                  ||| j
                  | j                        j                  dd      }
|j	                  ||| j
                  | j                        j                  dd      }|\  }}t        |	|
||d      \  }	}
t        }| j                  j                  dk7  rN| j                  j                  dk(  r|rt        j                  d       nt        | j                  j                     }| j                  j                  dk(  r%|d	   j                  |j                   d
      |d	<   d} || |	|
||f| j"                  sdn| j$                  | j&                  d|\  }}|j)                  ||d      j+                         }| j-                  |      }|sd}||fS )z#Input shape: Batch x Time x Channelr   r   r   )rq   eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.flash_attention_2r^   T)non_blockingN        )r{   rz   r   )sizer   r   r   viewr   r>   r   rt   r   rH   _attn_implementationloggerwarning_oncer   r\   r8   r~   r{   rz   r"   r   r   )rG   r   ry   r   r   r   
batch_sizepatches_query_states
key_statesvalue_statesrZ   r[   attention_interfacer   r   s                    r0   r`   zPixtralAttention.forward   s    "/!3!3!5
GQ{{=1[[/
{{=1#((Wdnndmm\ffghjkl__Z$..$--Xbbcdfgh
#((Wdnndmm\ffghjkl&S#7jRUWZjk#l j(?;;++w6{{//69>O##L
 '>dkk>^>^&_# ;;++/BB%+N%;%>%>}?S?Sbf%>%gF>"!N$7	%
  $}}C$,,LL	%
 	%
!\ "))*grBMMOkk+. LL((r2   )NNF)rb   rc   rd   re   r<   r   Tensorr   r   boolr   r
   r`   rg   rh   s   @r0   r   r      s    L* 26KO,16)||6) !.6) &eELL%,,,F&GH	6)
 $D>6) -.6) 
u||Xell33	46)r2   r   c                   $     e Zd Z fdZd Z xZS )
PixtralMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y )NFr   )r;   r<   rH   r   intermediate_sizer   r   	gate_projup_proj	down_projr	   
hidden_actact_fnr   s     r0   r<   zPixtralMLP.__init__   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r2   c                     | j                  | j                  | j                  |            | j                  |      z        }|S ra   )r   r   r   r   )rG   r]   r   s      r0   r`   zPixtralMLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r2   )rb   rc   rd   r<   r`   rg   rh   s   @r0   r   r      s    0r2   r   c                   ,     e Zd Zd fd	Zd Zd Z xZS )PixtralRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z=
        PixtralRMSNorm is equivalent to T5LayerNorm
        N)r;   r<   r   	Parameterr   onesweightvariance_epsilon)rG   r   epsrO   s      r0   r<   zPixtralRMSNorm.__init__  s1     	ll5::k#:; #r2   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr   r   T)keepdim)	rU   r\   r   r   powmeanrsqrtr   r   )rG   r   input_dtypevariances       r0   r`   zPixtralRMSNorm.forward	  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r2   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler   r   r   rG   s    r0   
extra_reprzPixtralRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr2   )gư>)rb   rc   rd   r<   r`   r   rg   rh   s   @r0   r   r      s    $;Jr2   r   c                        e Zd Z fdZ	 	 d	dej
                  dej
                  deeej
                  ej
                  f      dee   de	e
   deej                     fdZ xZS )
PixtralAttentionLayerc                     t         |           t        |j                  d      | _        t        |      | _        t        |      | _        t        |j                  d      | _	        y )Nh㈵>r   )
r;   r<   r   r   attention_normr   feed_forwardr   	attentionffn_normr   s     r0   r<   zPixtralAttentionLayer.__init__  sP    ,V-?-?TJ&v.)&1&v'9'9tDr2   r   ry   r   r   r   r   c                     |}| j                  |      } | j                  d||||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )a=  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   ry   r   r    )r   r   r   r   )	rG   r   ry   r   r   r   residualr   outputss	            r0   r`   zPixtralAttentionLayer.forward  s    $ !++M:&4dnn '
') 3/	'

 '
#| !=0 m4))-8 =0 "&Gr2   )NN)rb   rc   rd   r<   r   r   r   r   r   r   r
   FloatTensorr`   rg   rh   s   @r0   r   r     s    E LP,0'||' ' &eELL%,,,F&GH	'
 $D>' -.' 
u  	!'r2   r   c                        e Zd Z fdZ	 	 	 	 	 d
deej                     deeej                  ej                  f      dee   dee   dee   de	e
   deeef   fd	Z xZS )PixtralTransformerc                     t         |           || _        t        j                  j                         | _        t        |j                        D ]&  }| j                  j                  t        |             ( d| _        y )NF)r;   r<   rH   r   r   
ModuleListlayersrangenum_hidden_layersr$   r   gradient_checkpointing)rG   rH   r   rO   s      r0   r<   zPixtralTransformer.__init__G  sc    hh))+v//0 	>AKK4V<=	>&+#r2   ry   r   r   output_hidden_statesreturn_dictr   r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}	|}
| j                  D ]`  }|r||
fz   }| j
                  r,| j                  r | j                  |j                  |
|||      }n ||
|f||d|}|d   }
|sX|	|d   fz   }	b |r||
fz   }|st        d |
||	fD              S t        |
||	      S )av  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embeddings which serve as input to the Transformer.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr   )r   r   r   r   c              3   &   K   | ]	  }||  y wra   r   ).0vs     r0   	<genexpr>z-PixtralTransformer.forward.<locals>.<genexpr>  s     eqWXWdes   )last_hidden_stater   
attentions)rH   r   r   use_return_dictr   r   r~   _gradient_checkpointing_func__call__r   r   )rG   inputs_embedsry   r   r   r   r   r   encoder_statesall_attentionsr   encoder_layerlayer_outputss                r0   r`   zPixtralTransformer.forwardO  sN   < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%![[ 	FM#!/=2B!B**t}} $ A A!**!"'%! !.!"! )<&7	!
 ! *!,M !/=3C2E!E/	F2  +}.>>Ne]NN$Seee+>Vd
 	
r2   )NNNNN)rb   rc   rd   r<   r   r   r   r   r   r   r
   r   r   r`   rg   rh   s   @r0   r   r   F  s    , 26KO,0/3&*H
 !.H
 &eELL%,,,F&GH	H

 $D>H
 'tnH
 d^H
 -.H
 
uo%	&H
r2   r   c                   H    e Zd ZeZdZdZdZdZdZ	dZ
dZdgZdZ	dZ
dZdZd Zy)PixtralPreTrainedModelmodelpixel_valuesTr   c                    | j                   j                  }t        |t        j                  t        j
                  f      rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t              r&|j                  j                  j                  d       y y )Nr   )r   stdr7   )rH   initializer_rangerV   r   r   Conv2dr   datanormal_r   zero_r   fill_)rG   ru   r   s      r0   _init_weightsz$PixtralPreTrainedModel._init_weights  s    kk++fryy"))45MM&&CS&9{{&  &&( '/MM$$S) 0r2   N)rb   rc   rd   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_2_supports_sdpa_supports_flex_attn_no_split_modulesr  r   r2   r0   r   r     sT    &L$O&*#"&!N01!N"&*r2   r   c                    |j                   }|j                  }|j                  d   }t        j                  |      j
                  }t        j                  ||f|||      }t        j                  |       j                  d      }t        j                  dg| d d z         j                  d      }t        ||      D ]  \  }	}
d||	|
|	|
f<    |d d d d d d f   j                  |j                  d   ddd      }|S )Nr   )
fill_valuerU   r8   r   r   )rU   r8   r   r   finfominfulltensorcumsumzipexpand)r&   r  rU   r8   seq_lend_mincausal_maskblock_end_idxblock_start_idxstartends              r0   generate_block_attention_maskr    s    LLE]]Fll1oGKK""E**gw/EW]^KLL!23::2>MllA3):3B)?#?@GGKO/=9 .
s,-E#IuSy(). dD!Q./66v||A2rRKr2   c                        e Zd ZdZ fdZd Zee	 	 	 	 ddej                  de
ej                     de
e   de
e   de
e   d	ee   d
eeef   fd              Z xZS )PixtralVisionModelvision_encoderc                 z   t         |   |       || _        t        j                  |j
                  |j                  |j                  |j                  d      | _        |j                  | _        t        |j                  d      | _
        t        |      | _        t        |      | _        | j                          y )NF)in_channelsout_channelskernel_sizestrider   r   r   )r;   r<   rH   r   r   num_channelsr   rB   
patch_convr   ln_prer   transformerr4   patch_positional_embedding	post_initr   s     r0   r<   zPixtralVisionModel.__init__  s     ))++++))$$
 !++$V%7%7TB-f5*@*H'r2   c                     | j                   S ra   )r&  r   s    r0   get_input_embeddingsz'PixtralVisionModel.get_input_embeddings  s    r2   r   image_sizesr   r   r   r   r   c           
         ||j                   \  }}	}
}|
|fg|z  }| j                  |      }t        ||      D cg c]1  \  }}|dd |d   | j                  z  d |d   | j                  z  f   3 }}}t	        j
                  |D cg c]  }|j                  d      j                   c}d      j                  d      }| j                  |      }t        || j                  j                  | j                  j                  z        }||d<   | j                  ||      }t        |D cg c]!  }|j                   d   |j                   d   z  # c}|      } | j                  |f||||d	d
|S c c}}w c c}w c c}w )N.r   r   r   )r'   r^   r   r   T)ry   r   r   r   r   )r   r&  r  rB   r   r%   flattenTrn   r'  r1   rH   rA   r)  r  r(  )rG   r   r-  r   r   r   argsr   r   r   r*   r+   patch_embedsembedr   r&   r}   r^   r   ry   s                       r0   r`   zPixtralVisionModel.forward  s    +7+=+=(J65"E?+j8K |4  #<=
t #5$q'T__457U$q'T__:T7UUV
 
 yy:K!LQ!))A,..!LRST^^_`a{{<0 0)?)?4;;CYCY)Y
 ".~"==lLY60AB1QWWR[1772;&BL
  t
) 3!5/
 
 	
+
 "M Cs   6E2"E8+&E=)NNNN)rb   rc   rd   r  r<   r,  r   r   r   r   r   r   r   r
   r   r   r   r`   rg   rh   s   @r0   r  r    s    ("  /3/3,0&*-
ll-
 ell+-
 'tn	-

 $D>-
 d^-
 -.-
 
uo%	&-
  -
r2   r  )Nr   )r   )1re   collections.abcr   typingr   r   r   r   torch.utils.checkpointr   activationsr	   modeling_flash_attention_utilsr
   modeling_outputsr   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_pixtralr   
get_loggerrb   r   r1   Moduler4   rl   rt   r   rC   r   r   r   r   r   r   r   r  r  __all__r   r2   r0   <module>rB     sw    $ ) )    ! B / 6 F & > > 6 
		H	% 0<RYY 0<h(F %II%<<% 
% <<	%
 U\\*% % %.M)ryy M)b "JRYY J(/BII /dQ
 Q
h *_ * *2  F
/ F
 F
R  !9
:r2   