
    UhY5                       d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z	d dl
mZ d dlmc mZ d dlmZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3m4Z4  e0       rd dl5m6Z6 ddl7m8Z8  e1jr                  e:      Z; G d dejx                        Z= G d dejx                        Z> G d de	j                  jx                        Z? G d dejx                        Z@ ed       G d dejx                               ZA G d d ejx                        ZBd!e	j                  d"e	j                  d#e	j                  d$ee	j                  e	j                  f   fd%ZDd&e	j                  d'eEd$e	j                  fd(ZF	 d[d)ejx                  d*e	j                  d+e	j                  d,e	j                  d-ee	j                     d.eGd/eGfd0ZH	 d[d)ejx                  d*e	j                  d+e	j                  d,e	j                  d-ee	j                     d.eGd/eGfd1ZI G d2 d3ejx                        ZJ G d4 d5ejx                        ZKe. G d6 d7e)             ZLe. G d8 d9eL             ZM G d: d;ee-      ZN G d< d=eLe      ZOe G d> d?e#             ZP G d@ dAe	j                  jx                        ZQ G dB dCejx                        ZRdD ZS G dE dFejx                        ZTdGe	j                  d*e	j                  fdHZUd*e	j                  d+e	j                  dGe	j                  d$ee	j                  e	j                  f   fdIZV G dJ dKejx                        ZW G dL dMejx                        ZX G dN dOejx                        ZY G dP dQejx                        ZZ G dR dSejx                        Z[ G dT dUejx                        Z\ G dV dWeL      Z] G dX dYeLe      Z^g dZZ_y)\    N)	dataclass)CallableListOptionalTupleUnion)Llama4VisionConfig   )ACT2FN)CacheDynamicCacheHybridChunkedCache)GenerationMixin)use_kernel_forward_from_hub)AttentionMaskConverter)FlashAttentionKwargs)BaseModelOutputBaseModelOutputWithPastCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
LossKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )Llama4ConfigLlama4TextConfig)	BlockMask)make_flex_block_causal_maskc                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Llama4TextExpertsconfigc                    t         |           |j                  | _        |j                  | _        |j
                  | _        | j                  | _        t        j                  t        j                  | j                  | j
                  d| j                  z              | _        t        j                  t        j                  | j                  | j                  | j
                  f            | _        t        |j                     | _        y N   )super__init__num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimnn	Parametertorchemptygate_up_proj	down_projr   
hidden_actact_fnselfr(   	__class__s     |/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/llama4/modeling_llama4.pyr-   zLlama4TextExperts.__init__1   s    !33!'!9!9!--00LLT5E5EtGWGWYZ]a]l]lYl)mnekk43C3CT__VZVfVf2g&hiV../    hidden_statesreturnc                 \   |j                  | j                  d| j                        }t        j                  || j
                        }|j                  dd      \  }}t        j                  || j                  |      z  | j                        }|j                  d| j                        }|S )a2  
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
            selected_experts (torch.Tensor): (batch_size * token_num, top_k)
            routing_weights (torch.Tensor): (batch_size * token_num, top_k)
        Returns:
            torch.Tensor
        r+   dim)	viewr/   r1   r5   bmmr7   chunkr:   r8   )r<   r@   gate_upgateupnext_statess         r>   forwardzLlama4TextExperts.forward;   s     &**4+;+;RAQAQR))M4+<+<====+biidkk$&7!7$..I!&&r4+;+;<r?   )	__name__
__module____qualname__r#   r-   r5   TensorrM   __classcell__r=   s   @r>   r'   r'   0   s+    0/ 0U\\ ell r?   r'   c                   &     e Zd Zd fd	Zd Z xZS )Llama4TextMLPc                 f   t         |           ||j                  }|| _        t	        j
                  |j                  |d      | _        t	        j
                  |j                  |d      | _        t	        j
                  ||j                  d      | _	        t        |j                     | _        y NFbias)r,   r-   r0   r(   r3   Linearr1   	gate_projup_projr8   r   r9   activation_fn)r<   r(   r0   r=   s      r>   r-   zLlama4TextMLP.__init__R   s    $ & 8 86#5#57HuUyy!3!35FUS#4f6H6HuU#F$5$56r?   c                     | j                  | j                  |            | j                  |      z  }| j                  |      S N)r]   r[   r\   r8   )r<   xr8   s      r>   rM   zLlama4TextMLP.forward^   s7    &&t~~a'89DLLOK	~~i((r?   r_   rN   rO   rP   r-   rM   rR   rS   s   @r>   rU   rU   Q   s    
7)r?   rU   c                   8     e Zd Zddef fdZd Zd Zd Z xZS )Llama4TextL2Normepsc                 0    t         |           || _        y r_   )r,   r-   rd   )r<   rd   r=   s     r>   r-   zLlama4TextL2Norm.__init__d   s    r?   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S Nr+   rC   T)keepdimr5   rsqrtpowmeanrd   r<   r`   s     r>   _normzLlama4TextL2Norm._normh   4    5;;quuQx}}R}>IJJJr?   c                 ^    | j                  |j                               j                  |      S r_   )rn   floattype_asrm   s     r>   rM   zLlama4TextL2Norm.forwardk   s"    zz!'')$,,Q//r?   c                      d| j                    S )Nzeps=rd   r<   s    r>   
extra_reprzLlama4TextL2Norm.extra_reprn   s    dhhZ  r?   )gư>)	rN   rO   rP   rq   r-   rn   rM   rv   rR   rS   s   @r>   rc   rc   c   s    E K0!r?   rc   c                   2     e Zd Zd fd	Zd Zd Zd Z xZS )Llama4TextRMSNormc                     t         |           || _        t        j                  t        j                  |            | _        y)z<
        Llama4RMSNorm is equivalent to T5LayerNorm
        N)r,   r-   rd   r3   r4   r5   onesweight)r<   r1   rd   r=   s      r>   r-   zLlama4TextRMSNorm.__init__s   s0     	ll5::k#:;r?   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S rg   ri   rm   s     r>   rn   zLlama4TextRMSNorm._norm{   ro   r?   c                 |    | j                  |j                               j                  |      }|| j                  z  S r_   )rn   rq   rr   r{   )r<   r`   outputs      r>   rM   zLlama4TextRMSNorm.forward~   s0    AGGI&..q1##r?   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler{   shaperd   ru   s    r>   rv   zLlama4TextRMSNorm.extra_repr   s'    ))*+6$((<<r?   )gh㈵>)rN   rO   rP   r-   rn   rM   rv   rR   rS   s   @r>   rx   rx   r   s    <K$=r?   rx   Llama4TextMoec                   $     e Zd Z fdZd Z xZS )r   c                 *   t         |           |j                  | _        |j                  | _        |j                  | _        t        |      | _	        t        j                  |j                  |j                  d      | _        t        |      | _        y rW   )r,   r-   num_experts_per_toktop_kr1   
hidden_dimr.   r/   r'   expertsr3   rZ   routerrU   shared_expertr;   s     r>   r-   zLlama4TextMoe.__init__   sp    //
 ,,!33(0ii 2 2F4L4LSXY*62r?   c                    |j                   \  }}}|j                  d| j                        }| j                  |      }||z  }t	        j
                  || j                  d      \  }}t	        j                  |t        d            j                  d||      j                  dd      }	t	        j                  ||j                        j                  dd      j                  |	j                  d      d      }t	        j                   |	j                               j#                  |j$                        }	|j                  dd      j                  d|      }t	        j&                  |d|      j#                  |j                        }
|
|	j                  dd      z  }
| j)                  |
      }| j+                  |      }|j-                  d||j                  d|             ||	fS )	NrC   r!   rD   z-infr   device)inputrE   index)rE   r   src)r   reshaper   r   r5   topkr   	full_likerq   scatter_	transposearanger   rF   expandsizesigmoidtodtypegatherr   r   scatter_add_)r<   r@   batchseq_lenr   router_logitstokens_per_expertrouter_top_valuerouter_indicesrouter_scores	routed_in
routed_outouts                r>   rM   zLlama4TextMoe.forward   s   %2%8%8"w
%--b$//BM2!GO+0::mTZZUV+W(.OOM5=9BB1nVfgqqrsuvw 	 LL*=3G3GHMMaQST[[\i\n\nop\qsuv 	 m&9&9&;<??@S@ST'//A6==b*MLL 
 "]!!
"	 	  5 5b! <<	\\),
  / 	Qn*//"j:YZM!!r?   ra   rS   s   @r>   r   r      s    3"r?   c                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )Llama4TextRotaryEmbeddingr(   c                 `   t         |           |j                  dnd| _        |j                  | _        |j                  | _        || _        t        | j                     | _	        | j                  | j                  |      \  }| _
        | j                  d|d       | j                  | _        y )Nllama3defaultinv_freqF)
persistent)r,   r-   rope_scaling	rope_typemax_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr(   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r<   r(   r   r   r=   s       r>   r-   z"Llama4TextRotaryEmbedding.__init__   s    %+%8%8%D)"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r?   c                    | j                   d d d d f   j                         j                  |j                  d   dd      }|d d d d d f   j                         }t	        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                  |j
                        |z  j                  dd      }t        j                  t        j                  |      |      }|| j                  z  }d d d        |S # 1 sw Y   S xY w)	Nr   rC   r!   mpscpuF)device_typeenabledr+   )r   rq   r   r   
isinstancer   typestrr5   autocastr   r   polar	ones_liker   )r<   r`   position_idsinv_freq_expandedposition_ids_expandedr   freqs	freqs_ciss           r>   rM   z!Llama4TextRotaryEmbedding.forward   s
    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	;&))!((36KKVVWXZ[\EEOOE$:EBI!D$:$::I	;
 	;
 s   A'D88Er_   )
rN   rO   rP   r#   r-   r5   no_gradr   rM   rR   rS   s   @r>   r   r      s4    // / U]]_
  
r?   r   xqxkr   rA   c           	      &   t        j                   | j                         j                  g | j                  d d dd       }t        j                   |j                         j                  g |j                  d d dd       }t        j
                  ||d d d d d d d f   z        j                  d      }t        j
                  ||d d d d d d d f   z        j                  d      }|j                  |       |j                  |      fS )NrC   r+   r
   )r5   view_as_complexrq   r   r   view_as_realflattenrr   )r   r   r   xq_xk_xq_outxk_outs          r>   apply_rotary_embr      s    
 

 2
 2 2 IBHHSbM I2 Iq I
JC


 2
 2 2 IBHHSbM I2 Iq I
JCi1dA&> >?GGJFi1dA&> >?GGJF>>"v~~b111r?   r@   n_repc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)r   r   r   )r@   r   r   num_key_value_headsslenhead_dims         r>   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr?   modulequerykeyvalueattention_maskscalingdropoutc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
d      }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr+   r
   rC   rD   ptrainingr!   )r   num_key_value_groupsr5   matmulr   r   r3   
functionalsoftmaxr   r   
contiguousr   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r>   eager_attention_forwardr      s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2(>L==((6??([L,,|\:K''1-88:K$$r?   c                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            | j
                  dz  z  }
|#|d d d d d d d |j                  d   f   }|
|z   }
t        j                  j                  |
d      }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )	Nr+   r
         r   rC   rD   r   r!   )r   r   r5   r   r   r   r   r3   r   r   r   r   r   r   s                r>   vision_eager_attention_forwardr     s     3 ; ;<JUF$?$?@L<<z';';Aq'ABV__VZEZZL!$Q1.D
0@0@0D.D%DE#k1==((2(>L==((6??([L,,|\:K''1-88:K$$r?   c                   2    e Zd ZdZdef fdZ	 	 ddej                  deej                  ej                  f   de	ej                     de	e
   de	ej                     d	ee   d
eej                  e	ej                     e	eej                        f   fdZ xZS )Llama4TextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr(   c                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  | _        |j                  |j                  z  | _	        |j                  | _        | j                  dz  | _
        |j                  | _        |j                  | _        |j                  | _        |j                  | _        d| _        |j                   |   | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j                  | j                  z  |j
                  |j(                        | _        | j                  j2                  r(| j"                  rt5        |j6                        | _        y y y )Nr   r   TrX   )r,   r-   r(   	layer_idxgetattrr1   num_attention_headsr   r   r   r   
attn_scalefloor_scaleattn_temperature_tuningattention_dropout	is_causalno_rope_layersuse_roper3   rZ   attention_biasq_projk_projv_projo_projuse_qk_normrc   rms_norm_epsqk_normr<   r(   r   r=   s      r>   r-   zLlama4TextAttention.__init__#  s   "
F4F4F&JdJd4de#)#=#= $*$>$>&B\B\$\!#)#=#= }}d* ++!--'-'E'E$!'!9!9--i8ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 ;;""t}}+F,?,?@DL (5"r?   r@   position_embeddingsr   past_key_valuecache_positionr   rA   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      }	 | j	                  |      j                  g |d| j                   }
| j                  |      j                  |      j                  dd      }| j                  r)t        |	|
|j                  |	j                              \  }	}
t        | d      r"| j                  |	      }	| j                  |
      }
| j                  r| j                  st        j                  t        j                   |j#                         dz   | j$                  z        dz         | j&                  z  dz   }|j                  d|d   ddf      j)                  g |dd      }|	|z  j                  |	j*                        }	|	j                  dd      }	|
j                  dd      }
|%d|i}|j-                  |
|| j.                  |      \  }
}t0        }| j2                  j4                  dk7  r^| j2                  j4                  dk(  r(|j7                  d	d
      rt8        j;                  d       nt<        | j2                  j4                     } || |	|
||f| j>                  sdn| j@                  | jB                  d|\  }} |jD                  g |d jG                         }| jI                  |      }||fS )NrC   r!   r+   r
        ?r  eagersdpaoutput_attentionsF`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   )%r   r   r  rF   r  r  r   r  r   r   r   hasattrr
  r   r5   logfloorrq   r   r   r   r   updater   r   r(   _attn_implementationgetloggerwarning_oncer   r   r   r   r   r   r  )r<   r@   r  r   r  r  r   input_shapehidden_shapequery_statesr   r   attn_scalescache_kwargsattention_interfacer   r   s                    r>   rM   zLlama4TextAttention.forwardA  s    $))#2.88b8$--8{{=166|D4T[[/44UkU2Ut}}U
{{=166|DNNqRST=='7j*=*@*@ATAT*U($L* 4#<<5Lj1J ''		%++~';';'='CtGWGW&WX[^^_bfbqbqqtww  &**A{21+EFMMNbP[Nb]^Nb`aNbcK(;6::<;M;MNL#--a3))!Q/
%,n=L'5'<'<ZW[WeWegs't$J(?;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r?   NN)rN   rO   rP   __doc__r#   r-   r5   rQ   r   r   r   
LongTensorr   r   rM   rR   rS   s   @r>   r   r      s    GA/ AF +/59?)||?) #5<<#=>?) !.	?)
 !?) !!1!12?) -.?) 
u||Xell3XeELL>Q5RR	S?)r?   r   c                       e Zd Z fdZ	 	 	 	 	 	 	 	 	 ddej
                  deej
                     deej
                     deej                     deeej
                        dee	   dee	   d	ee	   d
eej                     deeej
                  ej
                  f      de
e   deej                  eeej                  ej                  f      f   fdZ xZS )Llama4TextDecoderLayerc                    t         |           |j                  | _        t        ||      | _        |j
                  d uxr t        |j                  |         | _        ||j                  v | _
        | j                  rt        |      | _        nt        ||j                        | _        t        |j                  |j                         | _        t        |j                  |j                         | _        || _        y )N)r0   rt   )r,   r-   r1   r   	self_attnattention_chunk_sizeboolr  use_chunked_attention
moe_layersis_moe_layerr   feed_forwardrU   intermediate_size_mlprx   r	  input_layernormpost_attention_layernormr   r  s      r>   r-   zLlama4TextDecoderLayer.__init__  s    !--,VY?%+%@%@%L%wQUV\VkVkluVvQw"%):):: -f 5D -fHdHd eD01C1CI\I\](9&:L:LRXReRe(f%"r?   r@   r   chunk_causal_maskr   r  r  output_router_logits	use_cacher  r  r   rA   c                 b   |}| j                  |      }| j                  r||} | j                  d||
|||||	d|\  }}||z   }|}| j                  |      }| j	                  |      }| j
                  r|\  }}nd }||j                  |j                        z   }|f}|r||fz  }|r||fz  }|S )N)r@   r  r   r  r  r6  r   )r2  r-  r*  r3  r0  r/  rF   r   )r<   r@   r   r4  r   r  r  r5  r6  r  r  r   residualattention_statesself_attn_weightsr   outputss                    r>   rM   zLlama4TextDecoderLayer.forward  s     !,,]; %%*;*G.N /=dnn 	/
' 3))/)	/
 	/
++ !#33 !55mD))-8+8(M= M =#5#5hnn#EE ")++G''Gr?   )	NNNNFFFNN)rN   rO   rP   r-   r5   rQ   r   r&  r   r,  r   r   FloatTensorrM   rR   rS   s   @r>   r(  r(    s.   #& 2648378<,1/4$)59KO5||5 !.5 $ELL1	5
 u//05 !u||!455 $D>5 'tn5 D>5 !!1!125 &eELL%,,,F&GH5 -.5 
u  (51B1BEDUDU1U+V"WW	X5r?   r(  c                   <    e Zd ZeZdZdgZdZdZdZ	dZ
dZdZdZd Zy)Llama4PreTrainedModelTpast_key_valuesFc                 V   t        | j                  d      r| j                  j                  n| j                  j                  j                  }t	        |t
        j                        rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t	        |t
        j                        rf|j                  j                  j                  d|       |j                  2|j                  j                  |j                     j                          y y t	        |t
        j                        rJ|j                  j                  j                  d       |j                  j                  j                          y t	        |t               r&|j                  j                  j                  d       y t	        |t"              rO|j$                  j                  j                  d|       |j&                  j                  j                  d|       y t	        |t(              ra|j*                  j                  j                  |j,                         |j.                  j                  j                  |j,                         y y )Ninitializer_ranger  )rl   stdr  )rC  )r  r(   rB  text_configr   r3   rZ   r{   datanormal_rY   zero_	Embeddingpadding_idx	LayerNormfill_rx   r'   r7   r8   Llama4VisionModelclass_embeddingscalepositional_embedding_vlm)r<   r   rC  s      r>   _init_weightsz#Llama4PreTrainedModel._init_weights  s    t{{$78 KK))((:: 	
 fbii(MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> .-MM$$S)KK""$ 12MM$$S) 12$$,,#3,?!!))s)< 12""''//FLL/A++0088V\\8J 3r?   N)rN   rO   rP   r"   config_classsupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_quantized_cache_supports_static_cache_supports_attention_backendrP  r8  r?   r>   r?  r?    sE    L&*##4"5"N  $!"&Kr?   r?  c                       e Zd ZdgZdZeZdef fdZd Zd Z	e
e	 	 	 	 	 	 	 	 	 	 d"dej                  deej                     d	eej                     d
ee   deej"                     dee   dee   dee   dee   deej                     dee   deeef   fd              Zej2                  j5                  d      	 	 	 d#dej                  dej                  dej                  d
edef
d       Zdedededej:                  dej                  f
dZedej                  dededej@                  dej                  d efd!       Z! xZ"S )$Llama4TextModelr(  modelr(   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nrt   )r(   F)r,   r-   pad_token_idrI  
vocab_sizer3   rH  r1   embed_tokens
ModuleListrangenum_hidden_layersr(  layersrx   r	  normr   
rotary_embgradient_checkpointing	post_initr  s      r>   r-   zLlama4TextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammHMfNfNfHgh9#FI6h
 &f&8&8f>Q>QR	36B&+# 	 is   Dc                     | j                   S r_   ra  ru   s    r>   get_input_embeddingsz$Llama4TextModel.get_input_embeddings
  s       r?   c                     || _         y r_   rk  r<   r   s     r>   set_input_embeddingsz$Llama4TextModel.set_input_embeddings  s
    !r?   	input_idsr   r   r@  inputs_embedsr6  r  output_hidden_statesreturn_dictr  flash_attn_kwargsrA   c                 H   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|d u |d uz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|>| j                  |j                  | j                  j                  j                              }|rb|`| j                   j                         j                  2t!        | j                   |j"                  d   |j"                  d         }n
t%               }|
F||j'                         nd}t)        j*                  |||j"                  d   z   |j                        }
||
j-                  d      }| j/                  |||
|||      \  }}|}| j1                  ||      }|rdnd }|rdnd }| j2                  d | j                   j4                   D ]k  }|r||fz  }| j                  r2| j                  r&| j7                  |j8                  ||||||d||
|      }n ||f|||||||
|d	|}|d   }|sc||d   fz  }m | j;                  |      }|r||fz  }t=        ||r|nd ||
      S )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r!   r   )r6  r8  )r   r4  r   r  r  r6  r  r  )last_hidden_stater@  r@   
attentions)r(   r  rr  r6  use_return_dict
ValueErrorrh  r   r  r  ra  r   r{   r   get_text_configr+  r   r   r   get_seq_lengthr5   r   	unsqueeze_update_causal_maskrg  re  rd  _gradient_checkpointing_func__call__rf  r   )r<   rp  r   r   r@  rq  r6  r  rr  rs  r  rt  past_seen_tokensr   r4  r@   freq_cisall_hidden_statesall_self_attnsdecoder_layerlayer_outputss                        r>   rM   zLlama4TextModel.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==Yj I  --ill4;L;L;S;S;Z;Z.[\M0{{**,AAM"4T[[-BUBUVWBXZgZmZmnoZp"q"..!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L)-)A)AM>?L]ir *B *
&& & ??=,? #7BD0d![[)H4;;+H+HI #	6M#!m%55!**t}} $ A A!**!% #%"! !.!!#.&7!-#2&7'#1(0! (! *!,M =#3"55G#	6J 		-0  -!11&+/8Od+%	
 	
r?   F)	recursiveinput_tensorc           	      &   | j                   j                  dk(  r||dk(  j                         r||fS y| j                   j                  dvry|j                  d   }|j	                  | j
                        }| j                   j                  }	|	d u}
|d   }||j                         xs |}n||j                  d   n|}|
rL||	k\  }||	k  ||z   |	kD  z  }|r5t        j                  ||	|z   dz
  t        j                  |||z   |	            n|}| j                   j                  dk(  rit        |t        j                        r;|
r$|t        ||	z
  dz   d      f}t        ||	||	      }t        ||||df
      }||fS t        |t              r||fS |j                  |j
                  }}|
rt        ||	      n|}| j!                  ||||||j                  d         }|
rE||	kD  r?t        ||	z
  dz   d      }|z   }| j#                  | j                   j                  |||      }|d d ||f   }|j                  d   |	k  }|r2t$        j&                  j)                  |d|	|j                  d   z
  f      }|s|d d | d d d f   }n|d d |d d f   }|j+                  |j                  d   ddd      }||d d d d d d f   z  }| j                   j                  dk(  rHt        j,                  |      j.                  }t        j                  |dk(  |d      j	                  |      }| j                   j                  dk(  r`|^|j
                  j0                  dv rF|j2                  dk(  r7|s5t        j,                  |      j.                  }t5        j6                  ||      }| j                   j                  dk(  rY|W|j9                         }|t        j,                  |      j.                  k7  }t5        j:                  |||| j<                        rd }||fS )Nflash_attention_2r  r$  )r  flex_attentionr  r!   r   rC   r  )offsets)query_length
key_lengthr  )sequence_lengthtarget_lengthr   r  
batch_size)startendr   r  r  )cudaxpunpu   )rq  past_key_values_lengthis_training)r(   r  anyr   r   r   r+  get_max_cache_shaper5   wherer   rQ   maxr%   r$   r   5_prepare_4d_causal_attention_mask_with_cache_positioncreate_chunked_attention_maskr3   r   padr   finfominr   ndimr   _unmask_unattendedr,  _ignore_causal_mask_sdpar   )r<   r   r  r  r@  r  chunked_attention_maskr6  r  r+  using_chunked_attentionfirst_cache_positionfull_cache_lengthcond1cond2r  r  r   r   r  r   	start_idxend_idxlocal_attention_maskrequires_padding	min_dtypes                             r>   r~  z#Llama4TextModel._update_causal_mask  s    ;;++/BB)~/D.I.I.K%~55;;++3VV&,,Q/'**4;;7#{{??"6d"B-a0& / C C E X<J<V 4 4R 8\k"(,@@E),@@$69MME  (?:Q>KK';o'MOcd '  ;;++/??.%,,7*3S9MPd9dgh9hjk5lmG-H&(<ozcj.* "="!00115	" &'===.)4%'=== %**L,?,?vH_-/CDevPP+')#))!, Q 
 #'8;O'O03GG!KQOI*,G%)%G%G00	 &H &" $2!Yw5F2F#G 399"=@TT')}}'8'8(1.BEYE_E_`bEc.c*d($ $)?d_L\L]_`@`)a&)?dN\]@])^&%;%B%B<CUCUVWCXZ\^`bd%e"%;>RSTVZ\`bcSc>d%d"{{//7:!KK.22	).5Kq5PR[]`)a)d)dej)k& KK,,6*%%**.DD##q(%
 E*..I0CCKQZ[K ;;++v5:P:\%;%@%@%B"%U);)?)??K%>>*'; MM	 #222r?   r+  r  r  r   c                 (   t        j                  |||      }t        j                  |j                  d      |z  |j                  d      |z  z
        }|j                  d      |j                  d      z
  }|dk(  |dk  z  }|j	                  |      S )u  
        Generate the following:

        'What'      :  0 ■ ⬚ ⬚ ⬚ ⬚ ⬚    |
        '▁is'       :  1 ■ ■ ⬚ ⬚ ⬚ ⬚     |
        '▁ch'       :  2 ■ ■ ■ ⬚ ⬚ ⬚     |
        'unked'     :  3 ⬚ ⬚ ⬚ ■ ⬚ ⬚    |
        '▁attention':  4 ⬚ ⬚ ⬚ ■ ■ ⬚    |
        '?'         :  5 ⬚ ⬚ ⬚ ■ ■ ■     |

        If the chunk size is 3.
        This can just be applied over the already created attention mask
        r   r   r!   )r5   r   absr}  r   )	r<   r+  r  r  r   arange_vector	block_pos	token_posmasks	            r>   r  z-Llama4TextModel.create_chunked_attention_mask  s      UC?II##A&*>>AXAXYZA[_sAss
	 "++A.1H1H1KK	Q9>2wwvr?   r  r  r   r  c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	a  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr  
fill_valuer   r   r!   diagonalr   rC   r   rE   r5   r  r  fullr   triur   r   r   cloner   r   masked_fillr   r  r  r   r  r  r   r   r  mask_lengthpadding_masks              r>   r  zELlama4TextModel._prepare_4d_causal_attention_mask_with_cache_position  s   @ %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg"))E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r?   )
NNNNNNNNNN)FNT)#rN   rO   rP   _no_split_modulesbase_model_prefixr#   rQ  r-   rl  ro  r   r   r5   r&  r   rQ   r   r=  r,  r   r   r   r   r   rM   compilerdisabler~  intr   r  staticmethodr   r  rR   rS   s   @r>   r\  r\    sF   12#L/  !"  '+1537+/59$(,0/3&*59m
##m
 !.m
 u//0	m

 "%m
   1 12m
 D>m
 $D>m
 'tnm
 d^m
 !!1!12m
 $$89m
 
u--	.m
  m
^ ^^e, #(#~3~3 ll~3 	~3
 ~3  ~3 -~3@$'03:=GL||	0 666 6 {{	6
 6 6 6r?   r\  c                       e Zd Zy)KwargsForCausalLMN)rN   rO   rP   r8  r?   r>   r  r  T  s    r?   r  c            !           e Zd ZdgZdZdgZddiZeZdef fdZ	d Z
d	 Zd
 Zd Zd Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 ddej&                  deej*                     deej&                     deeeeej2                     f      deej2                     deej&                     dee   dee   dee   dee   deej&                     deeej*                  f   dee   deeef   fd              Z  xZ!S )Llama4ForCausalLMr(  language_modelzlm_head.weightlm_headcolwise_repr(   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y rW   )
r,   r-   r\  r]  r`  r3   rZ   r1   r  ri  r;   s     r>   r-   zLlama4ForCausalLM.__init__^  sU     $V,
 ++yy!3!3V5F5FUS 	r?   c                 .    | j                   j                  S r_   r]  ra  ru   s    r>   rl  z&Llama4ForCausalLM.get_input_embeddingsg  s    zz&&&r?   c                 &    || j                   _        y r_   r  rn  s     r>   ro  z&Llama4ForCausalLM.set_input_embeddingsj  s    "'

r?   c                     | j                   S r_   r  ru   s    r>   get_output_embeddingsz'Llama4ForCausalLM.get_output_embeddingsm  s    ||r?   c                     || _         y r_   r  r<   new_embeddingss     r>   set_output_embeddingsz'Llama4ForCausalLM.set_output_embeddingsp  s	    %r?   c                     || _         y r_   r]  r<   decoders     r>   set_decoderzLlama4ForCausalLM.set_decoders  s	    
r?   c                     | j                   S r_   r  ru   s    r>   get_decoderzLlama4ForCausalLM.get_decoderv  s    zzr?   rp  r   r   r@  rq  labelsr6  r  rr  rs  r  logits_to_keepr   rA   c                    ||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
 | j                  d||||||||	d|d
|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|* | j                  d||| j                   j                  d|}t        |||j                  |j                  |j                        S )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Llama4ForCausalLM

        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```NT)
rp  r   r   r@  rq  r6  r  rr  rs  r  r   )logitsr  r`  )lossr  r@  r@   rx  r8  )r(   r  rr  ry  r]  r   r  slicer  loss_functionr`  r   r@  r@   rx  )r<   rp  r   r   r@  rq  r  r6  r  rr  rs  r  r  r   r<  r@   slice_indicesr  r  s                      r>   rM   zLlama4ForCausalLM.forwardy  s4   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] $** 
)%+'/!5)
 
  
8B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
r?   )NNNNNNNNNNNr   )"rN   rO   rP   r  r  _tied_weights_keys_tp_planr#   rQ  r-   rl  ro  r  r  r  r  r   r   r5   r&  r   rQ   r   r   r   r=  r,  r  r   r  r   r   rM   rR   rS   s   @r>   r  r  W  s   12(*+=)H#L/ '(&  '+1537KO59-1$(,0/3&*5934I
##I
 !.I
 u//0	I

 "%tE4E4E/F(F"GHI
   1 12I
 ))*I
 D>I
 $D>I
 'tnI
 d^I
 !!1!12I
 c5<</0I
 *+I
 
u,,	-I
  I
r?   r  c                      e Zd ZU dZdZeej                     ed<   dZ	ej                  ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   y)	Llama4CausalLMOutputWithPasta  
    Base class for Llava causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr  r  r@  r@   rx  image_hidden_states)rN   rO   rP   r%  r  r   r5   r=  __annotations__r  r@  r   r@   r   rx  r  r8  r?   r>   r  r    s    < )-D(5$$
%, $FE$9=OXd5#4#456=8<M8E%"3"345<59Ju001297;%"3"34;r?   r  c                   $     e Zd Z fdZd Z xZS )Llama4VisionMLP2c                 ~   t         |           |j                  | _        |j                  | _        t	        j
                  | j                  |j                  d      | _        t	        j
                  |j                  |j                  d      | _	        t	        j                         | _        |j                  | _        y rW   )r,   r-   r1   r0   r3   rZ   projector_input_dimfc1projector_output_dimfc2GELUr]   projector_dropoutr   r;   s     r>   r-   zLlama4VisionMLP2.__init__  s    !--!'!9!999T33V5O5OV[\99V88&:U:U\abWWY//r?   c                     | j                  |      }| j                  |      }t        j                  || j                  | j                        }| j                  | j                  |            S )Nr   )r  r]   Fr   r   r  r<   r@   s     r>   rM   zLlama4VisionMLP2.forward  sT    /**=9		-4<<$--X!!$((="9::r?   ra   rS   s   @r>   r  r    s    0;r?   r  c                   $     e Zd Z fdZd Z xZS )Llama4MultiModalProjectorc                     t         |           t        j                  |j                  j
                  |j                  j                  d      | _        y rW   )	r,   r-   r3   rZ   vision_configvision_output_dimrD  r1   linear_1r;   s     r>   r-   z"Llama4MultiModalProjector.__init__  s?    		  22**
r?   c                 (    | j                  |      }|S r_   )r  )r<   image_featuresr@   s      r>   rM   z!Llama4MultiModalProjector.forward	  s    n5r?   ra   rS   s   @r>   r  r     s    
r?   r  c           
      J   | j                   \  }}}t        t        j                  |            }| j	                  |||d      } | j                         \  }}}}| j	                  ||t        ||z        t        ||z              }|j                  dddd      j                         }|j	                  |t        ||z        t        ||z        t        ||dz  z              }|j                  dddd      j                         }|j	                  |d|j                   d         }	|	S )NrC   r   r+   r!   r
   )r   r  mathsqrtrF   r   permuter   )
r  shuffle_ratior  num_patcheschannels
patch_sizeheightwidthreshaped_tensoroutput_tensors
             r>   pixel_shuffler    s%   (4(:(:%JXTYY{+,J$$ZZLL*6*;*;*='Jx"''
FC@U<VX[\dgt\tXuvO%--aAq9DDFO%**C./U]5J1KSQY]jlm]mQnMoO &--aAq9DDFO#((R9N9Nr9RSMr?   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4VisionPixelShuffleMLPc                     t         |           |j                  | _        t        |j                  | j                  dz  z        | _        |j                  | _        t        |      | _	        y r*   )
r,   r-   pixel_shuffle_ratior  r  	inner_dimr  
output_dimr  mlpr;   s     r>   r-   z$Llama4VisionPixelShuffleMLP.__init__#  sX    #)#=#= V77D<T<TVW<WXY 55#F+r?   encoded_patchesrA   c                 P    t        || j                        }| j                  |      S r_   )r  r  r  )r<   r  s     r>   rM   z#Llama4VisionPixelShuffleMLP.forward*  s#    '9Q9QRxx((r?   rN   rO   rP   r-   r5   rQ   rM   rR   rS   s   @r>   r  r  "  s#    ,)u|| ) )r?   r  freqs_cic                     |j                   }t        |j                        D cg c]  \  }}|dk(  s||dz
  k(  r|nd }}} | j                  | S c c}}w )Nr!   )r  	enumerater   rF   )r  r   r  idr   s         r>   reshape_for_broadcastr!  0  sW    ::D=Fu{{=STTQ!q&AMQq0TET8==%   Us   Ac                 B   t        j                   | j                         j                  g | j                  d d dd       }t        j                   |j                         j                  g |j                  d d dd       }t        ||      }|j                  |j                        }t        j                  ||z        j                  d      }t        j                  ||z        j                  d      }|j                  |       |j                  |      fS )NrC   r+   )r  r   r
   )r5   r   rq   r   r   r!  r   r   r   r   rr   )r   r   r  query_key_	query_outkey_outs          r>   vision_apply_rotary_embr'  6  s    
 ""#85;;=#8#8#R%++cr:J#RB#RPQ#RSF  !4!4!4!Lciin!Lb!L!!LMD$hfEH{{6==)H""6H#45==a@I  199!<GU#W__S%999r?   c                        e Zd Zdef fdZ	 	 d
dej                  dej                  deej                     dee   de	e
   deej                  eej                     eeej                        f   fd	Z xZS )Llama4VisionAttentionr(   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  |j
                  z  | _        d| _        |j                  | _	        | j                  dz  | _
        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  z  | j                  d      | _        y )Nr!   r   TrX   )r,   r-   r(   r1   	embed_dimr   	num_headsr   r   r   r   r3   rZ   r  r  r  r  r;   s     r>   r-   zLlama4VisionAttention.__init__E  s   ++33**f.H.HH$%!!'!9!9}}d*ii0NUYZii0NUYZii0NUYZii >UYZr?   r@   r  r   r  r   rA   c                 R   |j                   d d }g |d| j                  }| j                  |      j                  |      }| j	                  |      j                  |      }	| j                  |      j                  |      }
t        ||	|      \  }}	|j                  dd      }|	j                  dd      }	|
j                  dd      }
t        }| j                  j                  dvr^| j                  j                  dk(  r(|j                  dd      rt        j                  d	       nt        | j                  j                     } || ||	|
d f| j                  sd
n| j                   d dd|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )NrC   )r  r!   r+   )r  r  r  r  Fr  r  )r   r   r   )r   r   r  rF   r  r  r'  r   r   r(   r  r  r  r  r   r   r   r   r   r  )r<   r@   r  r   r  r   r  r  r   r   r   r#  r   r   s                 r>   rM   zLlama4VisionAttention.forwardT  s    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|D#:<^f#g j#--a3))!Q/
#--a3(F;;++3NN{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7
%
  $}}C$2H2H
%
 
%
!\ *k));;;;FFHkk+.L((r?   r$  )rN   rO   rP   r	   r-   r5   rQ   r   r   r   r   r   rM   rR   rS   s   @r>   r)  r)  D  s    [1 [& 26*..)||.) ,,.) !.	.)
 !.) -..) 
u||Xell3XeELL>Q5RR	S.)r?   r)  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4VisionMLPc                 &   t         |           || _        t        j                         | _        t        j                  |j                  |j                  d      | _	        t        j                  |j                  |j                  d      | _
        y )NTrX   )r,   r-   r(   r3   r  r]   rZ   r1   r0   r  r  r;   s     r>   r-   zLlama4VisionMLP.__init__  se    WWY99V//1I1IPTU99V55v7I7IPTUr?   r@   rA   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r_   )r  r]   r  r  s     r>   rM   zLlama4VisionMLP.forward  s4    /**=9/r?   r  rS   s   @r>   r/  r/    s$    VU\\ ell r?   r/  c            
            e Zd Zdef fdZ	 	 ddej                  dej                  deej                     dee   fdZ	 xZ
S )	Llama4VisionEncoderLayerr(   c                    t         |           |j                  | _        t        |      | _        t        |      | _        t        j                  |j                        | _	        t        j                  |j                        | _
        y r_   )r,   r-   r1   r)  r*  r/  r  r3   rJ  r2  r3  r;   s     r>   r-   z!Llama4VisionEncoderLayer.__init__  sb    !--.v6"6*!||F,>,>?(*V5G5G(H%r?   hidden_stater  r   r  c                     |}| j                  |      }| j                  |||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )N)r  r   )r2  r*  r3  r  )r<   r5  r  r   r  r9  r   r<  s           r>   rM   z Llama4VisionEncoderLayer.forward  s      ++L9%)^^) &4 &
"l
  ,.  44\Bxx-,./&Gr?   r$  )rN   rO   rP   r	   r-   r5   rQ   r   r,  rM   rR   rS   s   @r>   r3  r3    sZ    I1 I 26,0ll ,, !.	
 $D>r?   r3  c                        e Zd ZdZdef fdZ	 	 	 	 ddej                  dej                  deej                     dee	   dee	   d	ee	   d
e
eef   fdZ xZS )Llama4VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Llama4VisionEncoderLayer`].

    Args:
        config: Llama4VisionConfig
    r(   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        || _        y c c}w )NF)
r,   r-   r(   r3   rb  rc  rd  r3  re  rh  )r<   r(   _r=   s      r>   r-   zLlama4VisionEncoder.__init__  sW    mmuU[UmUmOn$o!%=f%E$op&+# %ps   A*r@   r  r   r  rr  rs  rA   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}| j                  D ]^  }	|r||fz   }| j
                  r,| j                  r | j                  |	j                  ||||      }
n |	||||      }
|r	||
d   fz   }|
d   }` |r||fz   }|st        d |||fD              S t        |||      S )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr8  )r5  r   r  r  r!   r   c              3   &   K   | ]	  }||  y wr_   r8  .0vs     r>   	<genexpr>z.Llama4VisionEncoder.forward.<locals>.<genexpr>  s     eqWXWde   rw  r@   rx  )r(   r  rr  ry  re  rh  r   r  r  r   r   )r<   r@   r  r   r  rr  rs  encoder_statesall_attentionsencoder_layerr  s              r>   rM   zLlama4VisionEncoder.forward  s6   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d![[ 	-M#!/=2B!B**t}} $ A A!**!"%! !.!.#1&7%	! !!/=3C2E!E)!,M-	-0  +}.>>Ne]NN$Seee+>Vd
 	
r?   NNNN)rN   rO   rP   r%  r	   r-   r5   rQ   r   r,  r   r   r   rM   rR   rS   s   @r>   r8  r8    s    1  26,0/3&*G
||G
 ,,G
 !.	G

 $D>G
 'tnG
 d^G
 
uo%	&G
r?   r8  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4UnfoldConvolutionc                 <   t         |           |j                  }t        |t              r||f}t
        j                  j                  ||j                        | _        t        j                  |j                  |d   z  |d   z  |j                  d      | _        y )N)kernel_sizestrider   r!   FrX   )r,   r-   r  r   r  r5   r3   UnfoldunfoldrZ   num_channelsr1   linear)r<   r(   rJ  r=   s      r>   r-   z Llama4UnfoldConvolution.__init__  s    ''k3'&4Khhoo+fFWFWoXii+a.0;q>A
r?   r@   rA   c                 p    | j                  |      }|j                  ddd      }| j                  |      }|S )Nr   r+   r!   )rM  r  rO  r  s     r>   rM   zLlama4UnfoldConvolution.forward'  s8    M2%--aA6M2r?   r  rS   s   @r>   rH  rH    s#    

U\\ ell r?   rH  c                   $     e Zd Z fdZd Z xZS )Llama4VisionRotaryEmbeddingc                    t         |           |j                  |j                  z  }t	        j
                  |dz  t        j                        j                  |dz  d      }t	        j                  ||d d gd      }d|d<   ||z  }||z  }|j                  |j                  z  dz  }d|j                  t	        j
                  d|d      d |dz   j                         |z  z  z  }|dz   d	   |d d d d f   z  j                  dd
      }|dz   d	   |d d d d f   z  j                  dd
      }	t	        j                  ||	gd
      j                         j                         dd d df   }
|
j                  |j                  d
dd      dk  d      }
t	        j                   t	        j"                  t	        j$                  |
      t	        j&                  |
      gd
            }|| _        y )Nr+   )r   r!   r   rD   r   )rC   rC   r  ).NrC   .)r,   r-   
image_sizer  r5   r   int32r   catr1   r   
rope_thetarq   repeat_interleaver   r  r   stackcossinr  )r<   r(   idximg_idxfrequencies_xfrequencies_yfreq_dim	rope_freqfreqs_xfreqs_yr   r  r=   s               r>   r-   z$Llama4VisionRotaryEmbedding.__init__/  s   6#4#44,,sAvU[[9AA#q&!L))Wgbqk2:#3%%)C)CCqH6,,a11MN_QY]^Q^1`1f1f1hks1stu	!A%y1IdD!m4LL__`agi_j!A%y1IdD!m4LL__`agi_j		7G,"5;;=HHJ3PSRSPS8T!!'//"a";a"?C((eii6F		RWHX5Y_a)bc r?   c                 L    | j                   j                  |j                        S r_   )r  r   r   r  s     r>   rM   z#Llama4VisionRotaryEmbedding.forward@  s    }} 4 455r?   ra   rS   s   @r>   rR  rR  .  s    !"6r?   rR  c                        e Zd ZdZdgZeZdef fdZd Z	 	 	 	 dde	j                  dee	j                     dee   d	ee   d
ee   deeee	j                  df   f   fdZ xZS )rL  vision_modelr3  r(   c                 r   t         |   |       |j                  | _        |j                  | _        |j                  | _        |j
                  | _        | j                  | j                  z  dz  dz   | _        |j                  dz  | _        t        |      | _	        t        j                  | j                  t        j                  | j                        z        | _        t        j                  | j                  t        j                  | j                  | j                        z        | _        t!        |      | _        t        j$                  | j                        | _        t        j$                  | j                        | _        t+        |      | _        t/        |      | _        | j3                          y )Nr+   r!   r   )r,   r-   rT  r  r1   rN  r
  rN  rH  patch_embeddingr3   r4   r5   randnrM  rO  rR  rotary_embeddingrJ  layernorm_prelayernorm_postr8  r]  r  vision_adapterri  r;   s     r>   r-   zLlama4VisionModel.__init__I  sA     ++ ++!--"// OOt>1DqH''-
6v>!||DJJTEUEU9V,VW(*TZZ%++dN^N^`d`p`pBq5q(r% ;F C  \\$*:*:; ll4+;+;< )0
9&Ar?   c                     | j                   S )zg
        This function is used to fetch the first embedding layer to activate grads on inputs.
        )rh  ru   s    r>   rl  z&Llama4VisionModel.get_input_embeddingsb  s     ###r?   pixel_valuesr   r  rr  rs  rA   .c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|j                  \  }}}}	d}
d}| j                  |      }|j                  \  }}}|j                  ||
z  |z  ||      }| j                  j                  |j                  d   d|j                  d         }t        j                  ||gd      }|dz  }|j                  ||
z  |||      }| j                  j                  |j                  |j                        }||z   }| j                  |      }|j!                  |d|      }| j#                  |      }| j%                  |d|||      }|j&                  }| j)                  |      }|ddddddf   }| j+                  |      }|r|j,                  nd}|r|d   }nd}|st/        d	 |||fD              S t1        |||
      S )a  

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, MllamaVisionModel

        >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        >>> model = MllamaVisionModel.from_pretrained(checkpoint)
        >>> processor = AutoProcessor.from_pretrained(checkpoint)

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> output = model(**inputs)

        >>> print(output.last_hidden_state.shape)
        torch.Size([1, 1, 4, 1025, 7680])
        ```
        Nr!   r   rC   rD   )r   r   )r   rr  r  r  r+   c              3   &   K   | ]	  }||  y wr_   r8  r=  s     r>   r@  z,Llama4VisionModel.forward.<locals>.<genexpr>  s     _qQRQ^_rA  rB  )r(   r  rr  ry  r   rh  r   rM  r   r5   rV  rO  r   r   r   rk  rF   rj  r]  rw  rl  rm  r@   r   r   )r<   ro  r   r  rr  rs  batch_size_times_num_tilesrN  r  r  num_concurrent_media
num_chunksr5  r:  r
  r   rM  positional_embeddingr  r~   r@   rx  s                         r>   rM   zLlama4VisionModel.forwardh  sS   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] COBTBT?"L&% 
++L9%1%7%7";
 $++&)==
JKYc
 ..55l6H6H6KQP\PbPbcePfgyy,!@aHq $++&)==z;Xb
  $<<??lFXFXamatat?u#&::)),7#(()CRT((6!5/  
 //**<8#AssAI. **<80D,,$JJ_\=*$M___*'!
 	
r?   rF  )rN   rO   rP   r  r  r	   rQ  r-   rl  r5   rQ   r   r,  r   r   r   rM   rR   rS   s   @r>   rL  rL  D  s    &34%L1 2$ 26,0/3&*_
ll_
 !._
 $D>	_

 'tn_
 d^_
 
ellC&7 88	9_
r?   rL  c            (           e Zd ZddgZi ZdZeZdef fdZd Z	d Z
d Zd	 Zd
 Zd Zdej                   deeee   f   defdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d&dej.                  dej                   deej2                     deej.                     deeej                         deej                      deeeee   f      dee   deej.                     dee   dee   dee   dee   deej.                     deeej2                  f   dej2                  dee   deeef   f$d       Z	 	 	 	 	 	 d'd Z e!dej2                  d!ed"ed#ejD                  dej2                  d$efd%       Z# xZ$S )(Llama4ForConditionalGenerationr(  r3   r(   c                 h   t         |   |       t        |j                        | _        t        |      | _        t        |j                        | _	        |j                  j                  | _
        | j                  j                  | j                  j                  nd| _        | j                          y )NrC   )r,   r-   rL  r   rf  r  multi_modal_projectorr  rD  r  r`  r(   r_  ri  r;   s     r>   r-   z'Llama4ForConditionalGeneration.__init__  s     -f.B.BC%>v%F"/0B0BC ,,778<8P8P8\DKK44bdr?   c                 6    | j                   j                         S r_   )r  rl  ru   s    r>   rl  z3Llama4ForConditionalGeneration.get_input_embeddings  s    ""7799r?   c                 :    | j                   j                  |       y r_   )r  ro  rn  s     r>   ro  z3Llama4ForConditionalGeneration.set_input_embeddings  s    007r?   c                 6    | j                   j                         S r_   )r  r  ru   s    r>   r  z4Llama4ForConditionalGeneration.get_output_embeddings  s    ""88::r?   c                 :    | j                   j                  |       y r_   )r  r  r  s     r>   r  z4Llama4ForConditionalGeneration.set_output_embeddings  s    11.Ar?   c                 :    | j                   j                  |       y r_   )r  r  r  s     r>   r  z*Llama4ForConditionalGeneration.set_decoder  s    ''0r?   c                 6    | j                   j                         S r_   )r  r  ru   s    r>   r  z*Llama4ForConditionalGeneration.get_decoder  s    ""..00r?   ro  vision_feature_layervision_feature_select_strategyc                     |dvrt        d| j                         |j                         D ci c]  \  }}|	|| }}} | j                  |fddi|}|j                  }|S c c}}w )a  
        Obtains image last hidden states from the vision tower and apply al projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, List[int]]`):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            vision_feature_select_strategy (`str`):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Can be one of `"default"` or `"full"`
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )r   r  z$Unexpected select feature strategy: rr  F)rz  r  itemsrf  rw  )	r<   ro  r  r  r   kr?  image_outputsr5  s	            r>   get_image_featuresz1Llama4ForConditionalGeneration.get_image_features  s    . *1DDCDDgDgChijj#)<<>C41aQ]!Q$CC))),]U]V\]$66 Ds
   
A&A&rp  r   r   r@  rq  r  r6  r  rr  rs  r  r  image_sizesr   rA   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  j
                  }||n| j                   j                  j                  }|du |duz  rt        d      ||t        d      | | j                         |      }|b| j                  ||||      }|j                  }|j                  d|j                  d            }| j                  |      }|| j                   j                  k(  j                  d      }|j!                  |j"                        }|j                  d|j                  d            }|d   j%                  d      }|j'                         }||j                  d      k7  r t        d| d	|j                  d             |j                  d      j)                  d|j                  d            }|j+                  ||      }|j                  |      } | j,                  d|||||
|||||d

|}|d   }d}|	<||dd|j                  d   dz
   df   j!                  |j"                        }|dddddf   |j!                  |j"                        dk7     j/                         }|	dddf   |j!                  |	j"                        dk7     j/                         } n1|dddddf   j/                         }|	dddf   j/                         } t1        j2                         }! |!|j                  d|j                  d            | j                  d      j!                  |j"                              }|s|f|dd z   }"||f|"z   S |"S t5        |||j6                  |j8                  |j:                  |      S d      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```Nrv  zdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one)ro  r  r  r  rC   ).r   r   zMismatch: final_mask wants z0 embeddings, but multi_modal_projector returned )
r   r   r@  rq  r6  r  rr  rs  r  r  r!   .)r  r  r@  r@   rx  r  r8  )r(   r  rr  ry  r   r  r  rz  rl  r  r   rF   r   rz  image_token_idr}  r   r   r   sumr   masked_scatterr  r   r3   CrossEntropyLossr  r@  r@   rx  )#r<   rp  ro  r   r   r@  rq  r  r  r  r6  r  rr  rs  r  r  r  r   r  original_inputs_embeds_shapevision_flatprojected_vision_flatspecial_image_mask
final_maskfinal_mask_1dnum_tokens_to_fillexpanded_maskr<  r  r  shift_attention_maskshift_logitsshift_labelsloss_fctr~   s#                                      r>   rM   z&Llama4ForConditionalGeneration.forward  sa   b 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] $/ !**?? 	 .9 +**II 	' -t";<YZZ#(Av   7D557	BM#!44)%9/M'	 5 N ,9+>+>((--b.2E2Eb2IJK$($>$>{$K!"+t{{/I/I"I!T!TUW!X+..}/C/CDJ)..r=3E3Eb3IJM&v.66r:M!.!2!2!4!%:%?%?%BB 12D1E F::O:T:TUV:W9XZ 
 *33B7>>r=CUCUVXCYZM)88H]^M)../KLM%$%% 
)%+'/!5#))
 
 ) (6a6<<?Q;N9O9Q6Q'R'U'UV\VcVc'd$%c3B3k23G3J3J6==3Y]^3^_jjl%c12g/C/F/Fv}}/UYZ/Z[ffh%c3B3k2==?%c12g99;**,H!!"l&7&7&;<l>O>OPR>S>V>VWcWjWj>kD Y,F'+'7D7V#CVC+#33!//))2>2J
 	
 QU
 	
r?   c           	      f     | j                   j                  |f|||||d|}	|d   dk(  r||	d<   |	S )N)r@  rq  r   r  r  r   ro  )r  prepare_inputs_for_generation)
r<   rp  r@  rq  ro  r   r  r  r   model_inputss
             r>   r  z<Llama4ForConditionalGeneration.prepare_inputs_for_generation  s_     It**HH
+')))
 
 !! ,8L(r?   r  r  r   r  c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr  r  r!   r  r   rC   r   r  r  s              r>   r  zTLlama4ForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position  s   < %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r?   )NNNNNNNNNNNNNNr   N)NNNNNN)%rN   rO   rP   r  r  r  r"   rQ  r-   rl  ro  r  r  r  r  r5   r=  r   r  r   r   r  r   r&  r   rQ   r,  r   r  r   r  rM   r  r  r   r  rR   rS   s   @r>   rw  rw    s   13MNHL	| 	:8;B11'' $CcN3 ),	<  '+*.1537=A59@D8<-1$(,0/3&*5934$(#U
##U
 ''U
 !.	U

 u//0U
 "$u'8'8"9:U
   1 12U
 'uS$s)^'<=U
 )1U
 ))*U
 D>U
 $D>U
 'tnU
 d^U
 !!1!12U
  c5<</0!U
" \\#U
$ *+%U
& 
u22	3'U
 U
t < 444 4 {{	4
 4 4 4r?   rw  )r?  r\  rL  r  rw  )r  )`r  dataclassesr   typingr   r   r   r   r   r5   torch.nnr3   torch.nn.functionalr   r  /transformers.models.llama4.configuration_llama4r	   activationsr   cache_utilsr   r   r   
generationr   integrations.hub_kernelsr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r    configuration_llama4r"   r#   !torch.nn.attention.flex_attentionr$   integrations.flex_attentionr%   
get_loggerrN   r  Moduler'   rU   rc   rx   r   r   rQ   r   r  r   rq   r   r   r   r(  r?  r\  r  r  r  r  r  r  r  r!  r'  r)  r/  r3  r8  rH  rR  rL  rw  __all__r8  r?   r>   <module>r     s     ! 9 9     N ! B B ) C > B m m K F & h h @  !;J			H	%		 B)BII )$!uxx !=		 =( _-)"BII )" .)"X		 >	2	2	2 ||	2 5<<%&		2	UU\\ 	U# 	U%,, 	U( %II%<<% 
% <<	%
 U\\*% % %D %II%<<% 
% <<	%
 U\\*% % %4`)")) `)FFRYY FR $KO $K $KN \+ \ \~
 ?,j >m
- m
` $<; $< $<N;uxx ;"		 (
)")) 
)!ELL ! !:<<:	: ll: 5<<%&	:>)BII >)Bbii )ryy )XW
")) W
tbii (6")) 6,C
- C
Ll%:O l^	r?   