
    Uh                     v   d dl Z d dlmZ d dlmZmZmZmZ d dlZd dl	m
Z
 d dlm
c mZ d dlZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,  ejZ                  e.      Z/ G d de$      Z0 G d de
jb                        Z2 G d de!      Z3 G d de
jb                        Z4 G d de
jb                        Z5 G d de
jb                        Z6 G d de
jb                        Z7 G d  d!e
jb                        Z8 G d" d#e
jb                        Z9 G d$ d%e
jb                        Z: G d& d'e(      Z; G d( d)e
jx                        Z= G d* d+e
jb                        Z> G d, d-e
jb                        Z? G d. d/e
jb                        Z@ G d0 d1e
jb                        ZA G d2 d3e
jb                        ZB ed45       G d6 d7e             ZC G d8 d9      ZD G d: d;e eC      ZE G d< d=e&eE      ZF G d> d?e%eEe      ZG G d@ dAeE      ZH G dB dCeEe      ZIg dDZJy)E    N)cached_property)ListOptionalTupleUnion   )Cache)GenerationMixin)FlashAttentionKwargs)CausalLMOutputWithPast)PreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging   )ChameleonPreTrainedModel#ChameleonVQVAEEncoderConvDownsample)KwargsForCausalLMLlamaDecoderLayerLlamaForCausalLM
LlamaModel)SiglipAttention   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfigc                   f    e Zd Zdedef fdZ	 	 	 	 	 	 	 ddej                  deej                     deej                     dee
   dee   d	ee   d
eej                     deeej                  ej                  f      deej                  eeej                  ej                  f      f   fdZ xZS )Emu3DecoderLayerconfig	layer_idxc                 n    t         |   ||       t        j                  |j                        | _        y N)super__init__nnDropoutattention_dropoutdropoutselfr    r!   	__class__s      w/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/emu3/modular_emu3.pyr%   zEmu3DecoderLayer.__init__,   s(    +zz&":":;    hidden_statesattention_maskposition_idspast_key_valueoutput_attentions	use_cachecache_positionposition_embeddingsreturnc	                    |}
| j                  |      } | j                  d||||||||d|	\  }}|
| j                  |      z   }|}
| j                  |      }| j	                  |      }|
| j                  |      z   }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r/   r0   r1   r2   r3   r4   r5   r6    )input_layernorm	self_attnr)   post_attention_layernormmlp)r+   r/   r0   r1   r2   r3   r4   r5   r6   kwargsresidualself_attn_weightsoutputss                r-   forwardzEmu3DecoderLayer.forward0   s    > !,,]; ,:4>> 
,
')%)/) 3
,
 
,
(( !4<<#>> !55mD/ 4<<#>> ")++Gr.   )NNNFFNN)__name__
__module____qualname__r   intr%   torchTensorr   
LongTensorr	   boolr   FloatTensorrB   __classcell__r,   s   @r-   r   r   +   s    <z <c < 2637*.,1$)59KO<||< !.< u//0	<
 !< $D>< D>< !!1!12< &eELL%,,,F&GH< 
u  (51B1BEDUDU1U+V"WW	X<r.   r   c                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )Emu3VQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    r    c                    t         |           t        j                  |j                  |j
                        | _        | j                  j                  j                  j                  d|j                  z  d|j                  z         y )Ng            ?)
r$   r%   r&   	Embeddingcodebook_size	embed_dim	embeddingweightdatauniform_r+   r    r,   s     r-   r%   z!Emu3VQVAEVectorQuantizer.__init__z   sb    f&:&:F<L<LM""++D63G3G,GvOcOcIcdr.   hidden_statec                    |j                   \  }}}}}|j                  ddddd      j                         }|j                  d|      }t	        j
                  |dz  dd      }t	        j
                  | j                  j                  dz  d	      }	dt	        j                  || j                  j                  j                  dd            z  }
||	z   |
z
  }
t	        j                  |
d	      }|j                  ||||      }|S )
Nr   r   r      r   T)dimkeepdimr^   )shapepermute
contiguousviewrG   sumrU   rV   matmul	transposeargmin)r+   rZ   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicess               r-   rB   z Emu3VQVAEVectorQuantizer.forward   s    8D8J8J5
Hh#++Aq!Q:EEG!-!2!22x!@ !99%;Q%>AtT		$.."7"7":B %;T^^=R=R=\=\]^`a=bcc	$}4y@	$||I1=388XvW\]##r.   )
rC   rD   rE   __doc__r   r%   rG   rH   rB   rL   rM   s   @r-   rO   rO   o   s&    e e
$ELL $r.   rO   c                       e Zd Zy)Emu3VQVAEEncoderConvDownsampleN)rC   rD   rE   r9   r.   r-   ru   ru      s    r.   ru   c                   $     e Zd Z fdZd Z xZS )Emu3VQVAEEncoderConvUpsamplec                 `    t         |           t        j                  ||ddd      | _        y )Nr   r   kernel_sizestridepadding)r$   r%   r&   Conv2dconv)r+   in_channelsr,   s     r-   r%   z%Emu3VQVAEEncoderConvUpsample.__init__   s'    IIk;AaYZ[	r.   c                 X    t        j                  |dd      }| j                  |      }|S )N       @nearestscale_factormode)Finterpolater~   r+   r/   s     r-   rB   z$Emu3VQVAEEncoderConvUpsample.forward   s(    m#IV		-0r.   rC   rD   rE   r%   rB   rL   rM   s   @r-   rw   rw      s    \r.   rw   c            	       \     e Zd Zdededee   dee   f fdZdej                  fdZ xZ	S )Emu3VQVAEConv3d
in_channelout_channelrz   r{   c                 P   t         	|           t        |dd  |dd        D cg c]
  \  }}||z
   }}}d| _        |d d d   D ]%  }| xj                  |dz  |dz  z   |dz  fz  c_        ' | xj                  dz  c_        t	        j
                  ||||      | _        y c c}}w )Nr   r9   r]   r   )r   r   )r{   )r$   r%   zipr|   r&   Conv3dr~   )
r+   r   r   rz   r{   
one_kernel
one_stridepadding_sizespad_sizer,   s
            r-   r%   zEmu3VQVAEConv3d.__init__   s     	ORS^_`_aSbdjklkmdnOop5KZj0pp%dd+ 	JHLLX]X\98q=IIL	JII	
	 qs   B"r/   c                 h    t        j                  || j                        }| j                  |      }|S r#   )r   padr|   r~   r   s     r-   rB   zEmu3VQVAEConv3d.forward   s*    mT\\:		-0r.   )
rC   rD   rE   rF   r   r%   rG   rH   rB   rL   rM   s   @r-   r   r      sF    

 
 3Z	

 c

,U\\ r.   r   c                   `     e Zd Zdedef fdZdej                  dej                  fdZ xZS )Emu3VQVAESpatialNormr   out_channelsc                     t         |           t        j                  |ddd      | _        t        j
                  ||ddd      | _        t        j
                  ||ddd      | _        y )N    ư>Tnum_channels
num_groupsepsaffiner   r   ry   )r$   r%   r&   	GroupNorm
norm_layerr}   conv_yconv_br+   r   r   r,   s      r-   r%   zEmu3VQVAESpatialNorm.__init__   sn    
 	,,%	
 ii
 ii
r.   r/   quant_statesc                     t        j                  ||j                  dd  d      }| j                  |      }|| j	                  |      z  | j                  |      z   }|S )Nr   )sizer   )r   r   ra   r   r   r   )r+   r/   r   s      r-   rB   zEmu3VQVAESpatialNorm.forward   sX    }}\8K8KBC8PW`a6%L(AADKKP\D]]r.   	rC   rD   rE   rF   r%   rG   rH   rB   rL   rM   s   @r-   r   r      s5    

 
8U\\  r.   r   c                   H     e Zd Zdedef fdZdej                  fdZ xZS )Emu3VQVAETemporalUpsampler   r   c                 J    t         |           t        ||dd      | _        y )Nr   r   r   r   r   r   rz   r{   r$   r%   r   r~   r+   r   r   r,   s      r-   r%   z"Emu3VQVAETemporalUpsample.__init__   (    
 	#!	
	r.   r/   c                 P   |j                   \  }}}}}|j                  ddddd      j                         j                  |d|      }t	        j
                  |dd	      }|j                  ||||d      j                  ddddd      j                         }| j                  |      }|S )
Nr   r   r   r\   r   r]   r   r   r   )ra   rb   rc   rd   r   r   r~   )r+   r/   ri   rk   rj   rl   rm   s          r-   rB   z!Emu3VQVAETemporalUpsample.forward   s    8E8K8K5
Hh%--aAq!<GGINNz[]_ghm#IV%**:xPRS[[\]_`bcefhijuuw		-0r.   r   rM   s   @r-   r   r      s*    

 
U\\ r.   r   c                   H     e Zd Zdedef fdZdej                  fdZ xZS )Emu3VQVAETemporalDownsampler   r   c                 J    t         |           t        ||dd      | _        y )N)r\   r   r   )r   r   r   r   r   r   s      r-   r%   z$Emu3VQVAETemporalDownsample.__init__   r   r.   r/   c                 (    | j                  |      }|S r#   )r~   r   s     r-   rB   z#Emu3VQVAETemporalDownsample.forward  s    		-0r.   r   rM   s   @r-   r   r      s*    

 
U\\ r.   r   c                   (     e Zd Z	 d fd	Zd Z xZS )Emu3VQVAETemporalResnetBlockc                 p   t         |           || _        ||n|| _        t	        j
                  |      | _        t        ||dd      | _        t	        j
                  |      | _	        t        ||dd      | _
        | j                  | j                  k7  r t	        j                  ||ddd      | _        y y )Nr   r   r   r   r   ry   )r$   r%   r   r   r&   BatchNorm3dnorm1r   conv1norm2conv2r   nin_shortcutr   s      r-   r%   z%Emu3VQVAETemporalResnetBlock.__init__  s    
 	&+7+?K\^^K0
$!	

 ^^L1
$!	

 t000 "		!D 1r.   c                 L   |}| j                  |      }|t        j                  |      z  }| j                  |      }| j	                  |      }|t        j                  |      z  }| j                  |      }| j                  | j                  k7  r| j                  |      }||z   S r#   )	r   rG   sigmoidr   r   r   r   r   r   )r+   r/   r?   s      r-   rB   z$Emu3VQVAETemporalResnetBlock.forward,  s     

=1}55

=1

=1}55

=1t000((2H-''r.   r#   r   rM   s   @r-   r   r     s     @(r.   r   c                   ~     e Zd Z	 	 ddedee   dee   f fdZddej                  deej                     fdZ xZ	S )	Emu3VQVAEResnetBlockr   r   quant_channelsc                    t         |           || _        ||n|}|| _        || _        |=t        j                  |ddd      | _        t        j                  |ddd      | _        n"t        ||      | _        t        ||      | _        t        j                  ||ddd      | _        t        j                  ||ddd      | _        | j                  | j                  k7  r t        j                  ||ddd      | _        y y )	Nr   r   Tr   r   r   ry   r   )r$   r%   r   r   r   r&   r   r   r   r   r}   r   r   r   )r+   r   r   r   r,   s       r-   r%   zEmu3VQVAEResnetBlock.__init__=  s    	&&2&:{(,!;2SW`deDJ<BTXaefDJ-nkJDJ-nlKDJYY

 YY

 t000 "		!D 1r.   r/   c                 v   | j                   dn|f}|} | j                  |g| }|t        j                  |      z  }| j	                  |      } | j
                  |g| }|t        j                  |      z  }| j                  |      }| j                  | j                  k7  r| j                  |      }||z   S Nr9   )
r   r   rG   r   r   r   r   r   r   r   )r+   r/   r   	norm_argsr?   s        r-   rB   zEmu3VQVAEResnetBlock.forwardi  s    --5BN;L	 "

==9=}55

=1"

==9=}55

=1t000((2H-''r.   )NNr#   )
rC   rD   rE   rF   r   r%   rG   rH   rB   rL   rM   s   @r-   r   r   <  sU     '+(,	** sm* !	*X(U\\ (8ELLCY (r.   r   c                   $     e Zd Zdef fdZ xZS )Emu3VQVAEAttentionBlockr    c                 2    t         |   |       d| _        y )Nr   )r$   r%   num_key_value_groupsrY   s     r-   r%   z Emu3VQVAEAttentionBlock.__init__|  s      %&!r.   )rC   rD   rE   r   r%   rL   rM   s   @r-   r   r   {  s    & & &r.   r   c                   *     e Zd ZdZ fdZddZ xZS )Emu3VQVAEGroupNormz
    Same as the torch GroupNorm with the only difference that this ones accepts
    an optional kwarg `quant_states` which is not used. This class makes it easier to
    use SpatialNorm or GroupNorm without conditionals
    c                 $    t        |   di | y r   )r$   r%   )r+   r>   r,   s     r-   r%   zEmu3VQVAEGroupNorm.__init__  s    "6"r.   c                     t        j                  || j                  | j                  | j                  | j
                        S r#   )r   
group_normr   rV   biasr   )r+   inputr   s      r-   rB   zEmu3VQVAEGroupNorm.forward  s)    ||E4??DKKDHHUUr.   r#   )rC   rD   rE   rs   r%   rB   rL   rM   s   @r-   r   r     s    #Vr.   r   c                   `     e Zd Zd fd	Zddej
                  deej
                     fdZ xZS )Emu3VQVAEMiddleBlockc                     t         |           t        |||      | _        t	        |      | _        |t        |ddd      | _        nt        ||      | _        t        |||      | _	        y )Nr   r   r   r   r   Tr   )
r$   r%   r   block_1r   attn_1r   	attn_normr   block_2)r+   r    r   r   r,   s       r-   r%   zEmu3VQVAEMiddleBlock.__init__  so    +#$)

 .f5!/[UW]ajnoDN1.+NDN+#$)
r.   r/   r   c                 b   | j                  ||      }|}| j                  ||      }|j                  \  }}}}|j                  ||||z        j	                  dd      }| j                  |      d   }|j                  ||||      j                  dddd      }||z   }| j                  ||      }|S )Nr   r   r   r   )	r   r   ra   rd   rg   r   reshaperb   r   )r+   r/   r   r?   ri   rk   rl   rm   s           r-   rB   zEmu3VQVAEMiddleBlock.forward  s    ]LA }lC.;.A.A+
Hfe%**:x%PZZ[\^_`M215%--j&%RZZ[\^_abdef =0]LAr.   r#   )	rC   rD   rE   r%   rG   rK   r   rB   rL   rM   s   @r-   r   r     s,    
(
U%6%6 
huO`O`Fa 
r.   r   c                   >     e Zd Z fdZdej
                  fdZ xZS )Emu3VQVAEDownBlockc           
         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }dt        |      z   }|| _        t        j                         | _        t        | j                        D ]K  }t        j                         }t        j                         }t        j                         }|||   z  }	|||   z  }
t        | j
                        D ]~  }|j                  t        |	|
             |
}	|j                  .||j                  v s=|j                  t!        |             |j                  t        j"                  |	ddd              t        j$                         }||_        ||_        ||_        || j                  dz
  k7  rt-        |	      |_        | j                  j                  |       N y )N)r   r   r   r   r   Tr   r   )r$   r%   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelstuplein_channel_multiplierr&   
ModuleListdownrangeappendr   attn_resolutionsr   r   Moduleblockattn
attn_normsru   
downsample)r+   r    r   r   r   i_levelr   r   r   block_in	block_outi_blockr   r,   s                r-   r%   zEmu3VQVAEDownBlock.__init__  s   "6#<#<=$33,,#66 $u-?'@ @%:"MMO	T112 	#GMMOE==?DJ$'<W'EEH%(:7(CCI !4!45 
q($,%. %**67fF]F];]KK 7 ?@%%bllUW]ajn&op
q 99;DDJDI(DO$..22"@"JIIT"1	#r.   r/   c                 >   t        | j                        D ]  \  }}t        | j                        D ]  } |j                  |   |      }t        |j                        dkD  s1|} |j                  |   |      }|j                  \  }}}}	|j                  ||||	z        j                  dd      } |j                  |   |      d   }|j                  |||	|      j                  dddd      }||z   } || j                  dz
  k7  s|j                  |      } |S )Nr   r   r   r   )	enumerater   r   r   r   r   r   r   ra   rd   rg   r   rb   r   r   )
r+   r/   r   blocksr   r?   ri   rk   rl   rm   s
             r-   rB   zEmu3VQVAEDownBlock.forward  s5   (3 	AOGV !4!45 = 5W 5m Dv{{#a',H$>F$5$5g$>}$MM:G:M:M7J&%$1$6$6z8VV[^$\$f$fghjk$lM$8FKK$8$G$JM$1$9$9*feU]$^$f$fghjkmnpq$rM$,}$<M= $..22 & 1 1- @	A" r.   rC   rD   rE   r%   rG   rK   rB   rL   rM   s   @r-   r   r     s    ##JU%6%6 r.   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Emu3VQVAEUpBlockc           	         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  |j                  d   z  }t        j                         | _
        t        t        | j                              D ]5  }t        j                         }t        j                         }t        j                         }|j                  |j                  |   z  }t        | j
                  dz         D ]e  }	|j                  t        |||             |}||j                  v s1|j                  t!        |             |j                  t#        ||             g t        j$                         }
||
_        ||
_        ||
_        |dk7  rt-        |      |
_        | j                  j1                  d|
       8 y )Nr]   r   r   r   )r$   r%   r   r   r   r   rT   r   r&   r   upreversedr   r   r   r   r   r   r   r   r   r   rw   upsampleinsert)r+   r    r   r   r   r   r   r   r   r   r  r,   s              r-   r%   zEmu3VQVAEUpBlock.__init__  s   "6#<#<=$33))''&*C*CB*GG--/d&:&: ;< 	"GMMOE==?DJ,,v/H/H/QQI !4!4q!89 V($,%.'5 %f555KK 7 ?@%%&:>8&TUV BBHBG&BM!|:8DGGNN1b!3	"r.   r/   r   c                 h   t        | j                  d d d         D ]  \  }}t        | j                  dz         D ]  } |j                  |   ||      }t        |j                        dkD  s2|} |j                  |   ||      }|j                  \  }}}	}
|j                  |||	|
z        j                  dd      } |j                  |   |      d   }|j                  ||	|
|      j                  dddd      }||z   } |t        | j                        dz
  k7  s|j                  |      } |S )Nr]   r   r   r   r   )r   r  r   r   r   r   r   r   ra   rd   rg   r   rb   r  )r+   r/   r   r   r   r   r?   ri   rk   rl   rm   s              r-   rB   zEmu3VQVAEUpBlock.forward  sD   (27 	?OGV !4!4q!89 = 5W 5m\ Rv{{#a',H$>F$5$5g$>}l$[M:G:M:M7J&%$1$6$6z8VV[^$\$f$fghjk$lM$8FKK$8$G$JM$1$9$9*feU]$^$f$fghjkmnpq$rM$,}$<M= #dgg,** & >	?  r.   r   rM   s   @r-   r   r     s(    #"JU%6%6 eFWFW r.   r   c                   >     e Zd Z fdZdej
                  fdZ xZS )Emu3VQVAEEncoderc                    t         |           |j                  }|j                  }|j                  }|j
                  }|j                  }|rd|z  n|}||d   z  }t        j                  j                  ||ddd      | _
        t        |      | _        t        ||      | _        t        j                  j                  d|dd	      | _        t        j                  j                  ||ddd      | _        t%        t'        j(                  |j*                              }	t        j,                         | _        t        j,                         | _        t3        |	      D ])  }
t5        ||      }| j.                  j7                  |       + t3        |j8                        D ]*  }t;        ||
      }| j0                  j7                  |       , y )Nr   r]   r   r   ry   r   r   T)r   r   r   r   r   )r$   r%   r   r   double_latentlatent_channelsr   rG   r&   r}   conv_inr   
down_blockr   middle_blockr   norm_outconv_outrF   mathlog2temporal_downsample_factorr   	time_convtime_res_stackr   r   r   r   r   )r+   r    r   r   r	  r
  r   r   r   temporal_down_blocksir~   _time_res_convr,   s                 r-   r%   zEmu3VQVAEEncoder.__init__)  s   ,,((,, 00#66.;q?* #5b#99xx{MqYZdef,V40B**bxUYbf*g ( 
  #499V-N-N#OP mmo+, 	(A.|\JDNN!!$'	( v,,- 	6A8()M &&}5	6r.   pixel_valuesc                 h   |j                   d   } |j                  dg|j                   dd   }| j                  |      }| j                  |      }| j	                  |      }| j                  |      }|t        j                  |      z  }| j                  |      } |j                  d|g|j                   dd   }|j                  ddddd      }| j                  D ]"  } ||      }|t        j                  |      z  }$ | j                  D ]
  } ||      } |j                  ddddd      }|S )Nr   r]   r   r   r   r\   )ra   r   r  r  r  r  rG   r   r  rb   r  r  )r+   r  temporal_dimr/   r~   layers         r-   rB   zEmu3VQVAEEncoder.forwardP  sH   #))!,+|++BH1C1CAB1GH \26))-8 m4}55m4---b,YATATUVUWAXY%--aAq!< NN 	:D /MU]]=99M	: (( 	1E!-0M	1 &--aAq!<r.   )rC   rD   rE   r%   rG   rI   rB   rL   rM   s   @r-   r  r  (  s    %6NE$4$4 r.   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Emu3VQVAEDecoderr    c                    t         	|           |j                  }|j                  |j                  d   z  }t        j                         | _        t        |j                        D ]>  }t        |j                  |j                        }| j                  j                  |       @ t        t        j                  |j                               }t        j                         | _        t        |      D ]=  }t%        |j                  |j                        }| j"                  j                  |       ? t        j&                  |j                  |ddd      | _        t+        |||      | _        t/        |      | _        |j                  |j                  d   z  }t3        ||      | _        t        j&                  ||j6                  ddd      | _        y )Nr]   r   r   r   ry   )r   r   )r$   r%   rT   r   r   r&   r   r  r   r   r   r
  r   rF   r  r  r  r  r   r}   r  r   r  r   up_blockr   r  r   r  )
r+   r    r   r   r  r  temp_upsample_block_numr  r~   r,   s
            r-   r%   zEmu3VQVAEDecoder.__init__o  s   ))''&*C*CB*GG mmov,,- 	6A8"22AWAWM &&}5		6 #&dii0Q0Q&R"S./ 	(A,V-C-CVE[E[\DNN!!$'	( yy""
 1R`a(0''&*C*CA*FF,^XF		
r.   r/   r   c                    t        j                  ||fd      }|j                  ddddd      }| j                  D ]
  } ||      } | j                  D ]"  } ||      }|t        j
                  |      z  }$ |j                  ddddd      }t        j                  |dd      \  }} |j                  dg|j                  dd   } |j                  dg|j                  dd   }| j                  |      }| j                  ||      }| j                  ||      }| j                  ||      }|t        j
                  |      z  }| j                  |      }|S )Nr   r`   r   r   r   r\   r]   )rG   catrb   r  r  r   chunkr   ra   r  r  r   r  r  )r+   r/   r   hidden_quant_statesr  s        r-   rB   zEmu3VQVAEDecoder.forward  sp   #ii(E1M199!Q1aH (( 	=E"'(;"<	= ^^ 	FE"'(;"<5==1D#EE	F 299!Q1aH&+kk2Eqa&P#|---bK=3F3Fqr3JK+|++BH1C1CAB1GH]3 ))-Fm\Bm\B}55m4r.   )	rC   rD   rE   r   r%   rG   rH   rB   rL   rM   s   @r-   r  r  n  s+    %
 %
NU\\  r.   r  aF  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv Taigman](https://arxiv.org/abs/2203.13131).
    )custom_introc                        e Zd ZeZdZdZdZdZdZ	dZ
g dZd Zdef fdZdej                  dej                  fd	Zd
ej                  fdZ xZS )	Emu3VQVAE
emuvideovqr  T)r   r   r   rO   c                 |   t        |t        j                  t        j                  f      rt        j                  j                  |j                  dd       |j                  qt        j                  j                  |j                        \  }}dt        j                  |      z  }t        j                  j                  |j                  | |       y y t        |t        j                        rt        j                  j                  |j                  t        j                  d             |j                  xt        j                  j                  |j                        \  }}|dkD  rdt        j                  |      z  nd}t        j                  j                  |j                  | |       y y t        |t        j                  t        j                  t        j                   f      rUt        j                  j#                  |j                  d       t        j                  j#                  |j                  d	       y t        |t        j$                        rc|j                  j&                  j)                          |j*                  2|j                  j&                  |j*                     j-                          y y y )
Nfan_outrelu)r   nonlinearityr      )ar   rQ           )
isinstancer&   r}   r   initkaiming_normal_rV   r   _calculate_fan_in_and_fan_outr  sqrtrX   Linearkaiming_uniform_BatchNorm2dr   r   	constant_rR   rW   normal_padding_idxzero_)r+   modulefan_inr  bounds        r-   _init_weightszEmu3VQVAE._init_weights  s   fryy"))45GG##FMM	PV#W{{&GGAA&--P	DIIf--  ufe< ' 		*GG$$V]]diil$C{{&GGAA&--P	17!DIIf--  ufe< '  NOGGfmmS1GGfkk3/-MM&&(!!-""6#5#56<<> . .r.   r    c                    t         |   |       || _        t        |      | _        t        |      | _        t        |      | _        dt        |j                        dz
  z  | _        t        |j                  |j                  dd      | _        t        |j                  |j                  dd      | _        dt        |j                        dz
  z  | _        | j%                          | j'                          y )Nr   r   )r   r   r   r   r   )r$   r%   r    r  encoderr  decoderrO   quantizer   r   vision_spatial_factorr   r
  rT   
quant_convpost_quant_convspatial_scale_factoreval	post_initrY   s     r-   r%   zEmu3VQVAE.__init__  s     '/'/08%&3v/H/H+IA+M%N")""F$4$4)T]
  /f44)T] 
 %&#f.G.G*H1*L$M!		r.   image_sizesc                    |j                   dk(  }|rL| j                  j                  }|j                  \  }}}}|j	                  d      j                  d|ddd      }n|j                  \  }}}}}| j                  |      }	|	j                  ddddd      }	| j                  |	      }	|	j                  ddddd      }	| j                  |	      }
|r|
j                  d      n|
}t        ||      D cg c]B  \  }}|d t        |d   | j                  z        d t        |d   | j                  z        f   D }}}|S c c}}w )Nr\   r   r   r   r   )ndimr    r  ra   	unsqueezerepeatrB  rb   rF  rD  squeezer   rF   rE  )r+   r  rK  is_imagerj   ri   rk   rl   rm   r/   codesimage_tokenssingle_imager   s                 r-   encodezEmu3VQVAE.encode  sX   $$){{==H2>2D2D/J&%'11!4;;AxAqQL<H<N<N9J(FE\2 &--aAq!<6 &--aAq!<m,+3u}}Q' '*,&D
"d D3tAw)C)CCDDFqDQRGVZVpVpLpHqFqqr
 

 
s   1AD=r/   c                    |j                   dk(  }|r|j                  d      }|j                  \  }}}}| j                  j	                  |j                               }|j                  d   }|j                  |||||      j                  ddddd      j                         }| j                  |      }	|j                  ddddd      }|	j                  ddddd      }	| j                  |	|      }
|
j                  ||| j                  j                  z  | j                  j                  || j                  z  || j                  z        }
|r	|
d d df   S |
S )Nr   r   r]   r   r\   r   )rM  rN  ra   rD  rU   flattenrd   rb   rc   rG  rC  r   r    r  r   rH  )r+   r/   rQ  ri   rj   rl   rm   quantrk   
post_quantvideos              r-   decodezEmu3VQVAE.decode  sK    %%*)33A6M.;.A.A+
Hfe''(=(=(?@;;r?

:xIQQRSUVXY[\^_`kkm))%0
aAq!,''1aA6
Z/t{{===KK$$T...D---
 'uQT{1E1r.   )rC   rD   rE   r   config_classbase_model_prefixmain_input_name_supports_sdpa_supports_flash_attn_2_supports_flex_attn_supports_attention_backend_no_split_modulesr@  r%   rG   rH   rU  r[  rL   rM   s   @r-   r(  r(    sp     #L$$ON!"&?* *5<< ell 82ELL 2r.   r(  c                       e Zd ZdZd Zed        Zed        Zed        Zed        Z	ed        Z
ed        Zd	eej                     d
ej                  fdZd	ej                  d
ej                  fdZy)Emu3ImageVocabularyMappingzM
    A class for mapping discrete image tokens from VQGAN to BPE tokens.
    c                 j    || _         |j                  d      | _        |j                  d      | _        y )Nz<|extra_200|>z<image>)	vocab_mapgeteol_token_idimage_token_id)r+   rg  s     r-   r%   z#Emu3ImageVocabularyMapping.__init__/  s+    "%MM/:'mmI6r.   c           	          t        | j                  j                         D cg c]  \  }}|j                  d      s| c}}      S c c}}w Nz<|visual tokensortedrg  items
startswithr+   namevals      r-   rS  z'Emu3ImageVocabularyMapping.image_tokens4  s8    DNN,@,@,BhytSdooVfFgshiih
   A	
A	
c           	          t        | j                  j                         D cg c]  \  }}|j                  d      s| c}}      S c c}}w rl  rm  rq  s      r-   image_tokens_strz+Emu3ImageVocabularyMapping.image_tokens_str8  s8    T^^-A-A-Ci	ctWgGhtijjirt  c                 t    | j                   D ci c]  }t        |dd       | j                  |     c}S c c}w )Nir   )rv  rF   rg  )r+   tokens     r-   img2bpez"Emu3ImageVocabularyMapping.img2bpe<  s5    FJF[F[\UE"RL!4>>%#88\\\s   #5c                 j    | j                   j                         D ci c]  \  }}||
 c}}S c c}}w r#   )ry  ro  )r+   kvs      r-   bpe2imgz"Emu3ImageVocabularyMapping.bpe2img@  s+    !%!3!3!56A1666s   /c                     t        j                  t        | j                  j	                               dz   t         j
                        }| j                  j                         D ]
  \  }}|||<    |S Nr   dtype)rG   zerosmaxr}  keysrF   ro  r+   mappingr{  r|  s       r-   bpe2img_mapping_tensorz1Emu3ImageVocabularyMapping.bpe2img_mapping_tensorD  [    ++c$,,"3"3"56:%))LLL&&( 	DAqGAJ	r.   c                     t        j                  t        | j                  j	                               dz   t         j
                        }| j                  j                         D ]
  \  }}|||<    |S r  )rG   r  r  ry  r  rF   ro  r  s       r-   img2bpe_mapping_tensorz1Emu3ImageVocabularyMapping.img2bpe_mapping_tensorK  r  r.   	img_batchr7   c                 ,   |j                   }t        j                  |j                  d   dft        j                        | j
                  z  }| j                  |j                  d         }t        j                  ||gd      }|j                  |      S )Nr   r   r  cpur]   r`   )	devicerG   onesra   rF   ri  r  tor#  )r+   r  r  eol_row
img_tokenss        r-   convert_img2bpez*Emu3ImageVocabularyMapping.convert_img2bpeR  sw    !!**iooa0!4EIIFIZIZZ00e1DE
YY
G4"=
}}V$$r.   c                     |j                   }|dd df   }| j                  |j                  d         }|j                  |      S )N.r]   r  )r  r  r  )r+   r  r  r  s       r-   convert_bpe2imgz*Emu3ImageVocabularyMapping.convert_bpe2imgY  sG    !!c3B3h'	00e1DE
}}V$$r.   N)rC   rD   rE   rs   r%   r   rS  rv  ry  r}  r  r  r   rG   rH   r  r  r9   r.   r-   re  re  *  s    7
 j j k k ] ] 7 7    %ell); % %% %%,, %r.   re  c                        e Zd ZdgZdZdZd Zy)Emu3PreTrainedModelr   Tc                    | j                   j                         j                  }t        |t        j
                  t        j                  f      rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j                  j                  j                  d|       |j                  2|j                  j                  |j                     j                          y y t        |t              r&|j                  j                  j                  d       y y )Nr0  )meanstdrQ   )r    get_text_configinitializer_ranger1  r&   r6  r}   rV   rW   r:  r   r<  rR   r;  Emu3RMSNormfill_)r+   r=  r  s      r-   r@  z!Emu3PreTrainedModel._init_weightsg  s    kk))+==fryy"))45MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> .,MM$$S) -r.   N)rC   rD   rE   rc  ra  rb  r@  r9   r.   r-   r  r  `  s      "&*r.   r  c                   $     e Zd Zdef fdZ xZS )Emu3TextModelr    c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        y c c}w r#   )r$   r%   r&   r   r   num_hidden_layersr   layersr*   s      r-   r%   zEmu3TextModel.__init__v  sD     mmBGH`H`BabYfi0b
bs   A)rC   rD   rE   r   r%   rL   rM   s   @r-   r  r  u  s    
z 
 
r.   r  c                   ,     e Zd ZeZ fdZ fdZ xZS )Emu3ForCausalLMc                 D    t         |   |       t        |      | _        y r#   )r$   r%   r  modelrY   s     r-   r%   zEmu3ForCausalLM.__init__  s     "6*
r.   c                  6    t               j                          y)a  
        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```N)r$   rB   )super_kwargsr,   s    r-   rB   zEmu3ForCausalLM.forward  s    $ 	r.   )rC   rD   rE   r   r\  r%   rB   rL   rM   s   @r-   r  r  }  s    !L+ r.   r  c            !       ,    e Zd ZddiZdZ fdZd Zd Zdej                  dej                  fd	Zdej                  dej                  fd
Zej                  dej                  dedefd       Zee	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dej                  dej$                  deej$                     deej                     dee   deej                     dee   dee   dee   dee   deej                     dee   deeef   fd              Z xZS )	Emu3Modelztext_model.model
text_modelFc                    t         |   |       t        j                  |j                        | _        | j
                  j                  ,| j
                  j                  D cg c]  }d| 	 c}| _        t        |j                        | _	        t        |j                        | _        | j                          y c c}w )Nztext_model.)r$   r%   r  _from_configtext_configr  _tied_weights_keysr(  	vq_configvqmodelre  vocabulary_mapvocabulary_mappingrJ  )r+   r    r{  r,   s      r-   r%   zEmu3Model.__init__  s     '44V5G5GH??--9BF//BdBd&eQQC'8&eD# !1!12"<V=R=R"S 	 'fs   #B<c                 6    | j                   j                         S r#   )r  get_input_embeddingsr+   s    r-   r  zEmu3Model.get_input_embeddings  s    3355r.   c                 :    | j                   j                  |       y r#   )r  set_input_embeddingsr+   values     r-   r  zEmu3Model.set_input_embeddings  s    ,,U3r.   r  rK  c                 N    t         j                  d       | j                  |      S )Nz`model.get_image_tokens()` is deprecated and will be removed in v4.58. To obtain discrete token use `model.get_image_features()`)loggerwarningget_image_featues)r+   r  rK  s      r-   get_image_tokenszEmu3Model.get_image_tokens  s'     O	
 %%l33r.   c                     | j                   j                  ||      }|D cg c]+  }| j                  j                  |      j	                         - }}t        j                  |      }|S c c}w )a  
        Tokenizes images into discrete tokens with VQGAN module. Converts
        obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
        special tokens.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
                The sizes of the images in the batch, being (height, width) for each image.
        )r  rU  r  r  rW  rG   r#  )r+   r  rK  image_tokens_listtokensbpe_tokens_list
bpe_tokenss          r-   get_image_featureszEmu3Model.get_image_features  sc     !LL//kJctuY_422BB6JRRTuuYY/
 vs   0A*rS  rl   rm   c                     |ddddf   j                  d||dz         }| j                  j                  |      }| j                  j	                  |      }|S )a  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.

        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
            height (`int`):
                Height of the generated image before upsampling.
            width (`int`):
                Width of the generated image before upsampling.
        Nr]   r   )rd   r  r  r  r[  )r+   rS  rl   rm   	sequencesimages         r-   decode_image_tokenszEmu3Model.decode_image_tokens  sX     !CRC(--b&%!)D	..>>yI##L1r.   	input_idsr0   r1   past_key_valuesinputs_embedsr4   r3   output_hidden_statesreturn_dictr5   r>   r7   c                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }|du |duz  rt	        d      ||t	        d      |c| j                  ||      }|| j                  j                  k(  }|j                  |j                  |j                        }|j                  ||      } | j                  d|||||||	|
d|d
|}|S )ap  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either oneT
r  r0   r1   r  r  r4   r3   r  r  r5   r9   )r    r3   r  use_return_dict
ValueErrorr  r  rj  r  r  r  masked_scatterr  )r+   r  r  rK  r0   r1   r  r  r4   r3   r  r  r5   r>   rS  special_image_maskrA   s                    r-   rB   zEmu3Model.forward  s5   0 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]-t";<s  #(Av  #22<ML!*d.E.E.T.T!T'??9+;+;Y__ML!001C\RI "$// 
)%+'/!5)
 
 r.   )NNNNNNNNNNNN)rC   rD   rE   _checkpoint_conversion_mapping_supports_static_cacher%   r  r  rG   rK   rI   r  r  no_gradrF   r  r   r   rH   r   r	   rJ   r   r   r   r   r   rB   rL   rM   s   @r-   r  r    s   &8,%G""
644U->-> 4UM]M] 4u/@/@ uO_O_ " ]]0@0@ # VY  $  '+*.$(1537+/59$(,0/3&*59;##; ''; \\	;
 !.; u//0; "%;   1 12; D>; $D>; 'tn; d^; !!1!12; -.; 
u,,	-;  ;r.   r  c            %       b    e Zd ZdZddddZdZ fdZd Zd	 Ze	d
        Z
e	d        Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#dej                  dej                   dej"                  deej"                     deej                     dee   deej                      dee   dee   dee   dee   deej                     deej                     deeej"                  f   dee   deeef   f d              Z	 	 	 	 	 	 	 d$ fd	Zedej"                  deded ej<                  dej"                  d!efd"       Z xZ S )%Emu3ForConditionalGeneration zmodel.text_modelzmodel.vqmodellm_head)z^text_model.modelz^vqmodelz^text_model.lm_headFc                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NF)r   )r$   r%   r  r  r&   r6  r  hidden_size
vocab_sizer  rJ  rY   s     r-   r%   z%Emu3ForConditionalGeneration.__init__"  sS     v&
yy!3!3!?!?ASASA^A^ejkr.   c                 6    | j                   j                         S r#   )r  r  r  s    r-   r  z1Emu3ForConditionalGeneration.get_input_embeddings)  s    zz..00r.   c                 :    | j                   j                  |       y r#   )r  r  r  s     r-   r  z1Emu3ForConditionalGeneration.set_input_embeddings,  s    

''.r.   c                 .    | j                   j                  S r#   )r  r  r  s    r-   r  z'Emu3ForConditionalGeneration.text_model0  s    zz$$$r.   c                 .    | j                   j                  S r#   )r  r  r  s    r-   r  z$Emu3ForConditionalGeneration.vqmodel4  s    zz!!!r.   r  r  rK  r0   r1   r  r  r4   r3   r  r  r5   labelslogits_to_keepr>   r7   c                 "   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  } | j                  d|||||||	|
d|d
|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|4 | j                  d||| j                   j                  j                  d|}t        |||j                  |j                  |j                        S )at  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> conversation = [
        ...     {
        ...     "role": "system",
        ...     "content": [
        ...         {"type": "text", "text": "You are a helpful assistant."},
        ...         ],
        ...     },
        ...     {
        ...     "role": "user",
        ...     "content": [
        ...         {"type": "image"},
        ...         {"type": "text", "text": "Please describe the image."},
        ...         ],
        ...     },
        ... ]

        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        >>> image = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        >>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```NTr  r   )logitsr  r  )lossr  r  r/   
attentionsr9   )r    r3   r  r  r  r1  rF   slicer  loss_functionr  r  r   r  r/   r  )r+   r  r  rK  r0   r1   r  r  r4   r3   r  r  r5   r  r  r>   rA   r/   slice_indicesr  r  s                        r-   rB   z$Emu3ForConditionalGeneration.forward8  sA   B 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$** 
)%+'/!5)
 
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD &#33!//))
 	
r.   c	                 R    t        |   |f|||||||d|	}
|d   dk7  rd |
d<   |
S )N)r  r0   r  r5   r1   r  r4   r   r  )r$   prepare_inputs_for_generation)r+   r  r  r0   r  r5   r1   r4   r  r>   model_inputsr,   s              r-   r  z:Emu3ForConditionalGeneration.prepare_inputs_for_generation  sZ     w<

+)')%%

 

 !!+/L(r.   sequence_lengthtarget_lengthr  ri   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr\   )
fill_valuer  r  r   )diagonal)r  r]   r   )r^   rG   finfominfullr  triuaranger   expandclonera   r  masked_fill)r0   r  r  r  r5   ri   r>   causal_mask	min_dtypemask_lengthpadding_masks              r-   5_prepare_4d_causal_attention_mask_with_cache_positionzREmu3ForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position  s   > %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r.   )NNNNNNNNNNNNNr   )NNNNNTN)!rC   rD   rE   r]  r  r  r%   r  r  propertyr  r  r   r   rG   rI   rK   rH   r   r	   rJ   r   rF   r   r   r   r   rB   r  staticmethodr  r  rL   rM   s   @r-   r  r    s>   /#(&"
 #1/ % % " "  '+*.$(1537+/59$(,0/3&*59-134d
##d
 ''d
 \\	d

 !.d
 u//0d
 "%d
   1 12d
 D>d
 $D>d
 'tnd
 d^d
 !!1!12d
 ))*d
 c5<</0d
  *+!d
" 
u,,	-#d
  d
R > 444 4 {{	4
 4 4 4r.   r  )r  r  r  r  r(  r  )Kr  	functoolsr   typingr   r   r   r   rG   torch.nnr&   torch.nn.functional
functionalr   torch.utils.checkpointcache_utilsr	   
generationr
   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   r   chameleon.modeling_chameleonr   r   llama.modeling_llamar   r   r   r   siglip.modeling_siglipr   configuration_emu3r   r   r   
get_loggerrC   r  r   r   rO   ru   rw   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r(  re  r  r  r  r  r  __all__r9   r.   r-   <module>r     s  "  % / /        ) B 6 - & > > h e e 4 K K 
		H	%A( AH$ryy $D	%H 	299 bii :!299 !H		 .")) &.(299 .(b<(299 <(~&o &V V299 D8 8v7ryy 7tCryy CLCryy CL l2 l2l2^3% 3%l*2I **
J 3 
&(;_ 8}# }@\#6 \~r.   