
    Uh                     (   d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlZddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z;  e&jx                  e=      Z> G d de*      Z? G d de      Z@e G d de9             ZAe G d de6             ZB G d dej                        ZD G d d e.      ZE G d! d"e1      ZF G d# d$e2      ZG G d% d&e,      ZH G d' d(ej                        ZJdZK G d) d*e0      ZL G d+ d,e/      ZM G d- d.e-      ZN G d/ d0ej                        ZO G d1 d2e8      ZP G d3 d4e7      ZQg d5ZRy)6    N)Callable)	dataclass)partial)AnyDictListOptionalTupleUnion   )CacheHybridCacheStaticCache)PretrainedConfig)FlashAttentionKwargs)BaseModelOutputWithPast)rope_config_validation)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringcan_return_tupleis_torchdynamo_compilinglogging)deprecate_kwarg   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)PaligemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPast)SiglipVisionConfigc                   8     e Zd ZdZdZ	 	 	 	 	 	 	 	 d fd	Z xZS )Gemma3TextConfiga!  
    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma3Text-7B.
    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma3TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096): in Gemma3Text, every other layer uses sliding window attention. This is the
            size of the sliding window.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        rope_local_base_freq (float, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings for local attention.
        sliding_window_pattern (`int`, *optional*, defaults to 6):
            Pattern for the sliding window attention.

    ```python
    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
    >>> configuration = Gemma3TextConfig()
    >>> # Initializing a model from the gemma3_text-7b style configuration
    >>> model = Gemma3TextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
        rope_local_base_freq (float, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings for local attention.
        sliding_window_pattern (`int`, *optional*, defaults to 6):
            Pattern for the sliding window attention.
    gemma3_textc	                 f    t        
|   | fi |	 || _        || _        || _        t        |        y N)super__init__rope_local_base_freqsliding_window_patternrope_scalingr   )self
vocab_size
rope_thetar4   r2   r3   max_position_embeddingsfinal_logit_softcappingattn_logit_softcappingsuper_kwargs	__class__s             {/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/gemma3/modular_gemma3.pyr1   zGemma3TextConfig.__init__   s7     	..$8!&<#(t$    )i@  g    .ANg     @   i   NN)__name__
__module____qualname____doc__
model_typer1   __classcell__r<   s   @r=   r,   r,   ;   s5    wr J %  ' $#% %r>   r,   c                        e Zd ZdZdZddddZeedZ	 	 	 	 	 	 	 dde	e
eeeef   f      d	e	e
eeeef   f      d
ededededef fdZ xZS )Gemma3Configa  
    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the PaliGemma-2B.

    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
            The config object of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom vision config or dict.
        mm_tokens_per_image (`int`, *optional*, defaults to 256):
            The number of tokens per image embedding.
        boi_token_index (`int`, *optional*, defaults to 255999):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 256000):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 262144):
            The image token index to encode the image prompt.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.


    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(vision_config, text_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3TextConfig(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemma3image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configvision_configrP   rQ   mm_tokens_per_imageinitializer_rangec                 z   | t               }t        j                  d       nt        |t              rt        di |}t        |t              rt        di |}n!|t               }t        j                  d       || _        || _        || _        || _	        || _
        || _        || _        t        	| 8  di | y )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config. )r,   loggerinfo
isinstancedictr*   rP   rQ   rR   rK   rL   rJ   rS   r0   r1   )
r5   rP   rQ   rR   rK   rL   rJ   rS   kwargsr<   s
            r=   r1   zGemma3Config.__init__  s     *,KKKZ[T**9[9KmT*.??M".0MKK`a&*#6 ..!2!2"6"r>   )NN   i i  i   g{Gz?)r@   rA   rB   rC   rD   attribute_mapr,   r*   sub_configsr	   r   r   strr   intfloatr1   rE   rF   s   @r=   rH   rH      s    .` J-))M (+K JNMQ#&&&!(#'#e$4d38n$DEF#  &8$sCx.&H IJ# !	#
 # # # !# #r>   rH   c                       e Zd Zy)Gemma3ModelOutputWithPastNr@   rA   rB   rU   r>   r=   rb   rb   *      r>   rb   c                       e Zd Zy)Gemma3CausalLMOutputWithPastNrc   rU   r>   r=   rf   rf   /  rd   r>   rf   c            	       Z     e Zd ZdZd	dedededef fdZdej                  f fdZ	 xZ
S )
Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scalec                 v    t         |   |||       | j                  dt        j                  |      d       y )Nrl   F)
persistent)r0   r1   register_buffertorchtensor)r5   ri   rj   rk   rl   r<   s        r=   r1   z&Gemma3TextScaledWordEmbedding.__init__9  s3    D]ELL,ERWXr>   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  S r/   )r0   forwardrl   toweightdtype)r5   rr   r<   s     r=   rt   z%Gemma3TextScaledWordEmbedding.forward=  s2    wy)D,<,<,?,?@Q@Q,RRRr>   )      ?)r@   rA   rB   rC   r_   r`   r1   rp   Tensorrt   rE   rF   s   @r=   rh   rh   4  sG    Ys Y3 YS Y_d YS S Sr>   rh   c                   $     e Zd Zdef fdZ xZS )	Gemma3MLPconfigc                 $    t         |   |       y r/   r0   r1   r5   r|   r<   s     r=   r1   zGemma3MLP.__init__B       r>   r@   rA   rB   r,   r1   rE   rF   s   @r=   r{   r{   A  s    !/ ! !r>   r{   c                   *     e Zd Zddedef fdZ xZS )Gemma3RMSNormdimepsc                 "    t         |           y r/   r~   )r5   r   r   r<   s      r=   r1   zGemma3RMSNorm.__init__G  s    r>   )gư>)r@   rA   rB   r_   r`   r1   rE   rF   s   @r=   r   r   F  s    C e  r>   r   c                   &     e Zd Zddef fdZ xZS )Gemma3RotaryEmbeddingr|   c                 $    t         |   |       y r/   r~   )r5   r|   devicer<   s      r=   r1   zGemma3RotaryEmbedding.__init__L  r   r>   r/   r   rF   s   @r=   r   r   K  s    !/ ! !r>   r   c                       e Zd Zdedef fdZ	 	 ddej                  dej                  deej                     dee	   deej                     d	ee   d
eej                  eej                     eeej                        f   fdZ xZS )Gemma3Attentionr|   	layer_idxc                 8   t        |dz   |j                  z        | _        t        |           | j                  r|j
                  nd | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y )N   )r   r   )boolr3   
is_slidingr0   r1   sliding_windowr   head_dimrms_norm_epsq_normk_normr5   r|   r   r<   s      r=   r1   zGemma3Attention.__init__R  so    	A1N1NNO7;f33D#V=P=PQ#V=P=PQr>   hidden_statesposition_embeddingsattention_maskpast_key_valuecache_positionrZ   returnc                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }| j                  |	      }	| j                  |
      }
|\  }}t        |	|
||      \  }	}
|~|||| j                  d}|j                  |
|| j                  |      \  }
}|J| j                  j                  dk(  r1|j                   d   }|
d d d d d |d d f   |d d d d d |d d f   }}
t        }| j                  j                  dk7  r^| j                  j                  dk(  r(|j!                  dd	      rt"        j%                  d
       nt&        | j                  j                     }||j)                  |	      } || |	|
||f| j*                  r| j,                  nd| j.                  | j                  d|\  }} |j0                  g |d j3                         }| j5                  |      }||fS )Nr   r   )sincosr   r   flash_attention_2eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )dropoutscalingr   )shaper   q_projview	transposek_projv_projr   r   r$   r   updater   r|   _attn_implementationr%   getrV   warning_oncer   ru   trainingattention_dropoutr   reshape
contiguouso_proj)r5   r   r   r   r   r   rZ   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsseq_lenattention_interfaceattn_outputattn_weightss                      r=   rt   zGemma3Attention.forward[  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j% "0"&"5"5	L (6'<'<ZW[WeWegs't$J )dkk.N.NRe.e(..r2+5aHWHa6G+H,WXZ[]e^e]eghWhJiL
(?;;++w6{{//69fjjI\^c>d##L '>dkk>^>^&_#%+..|<N$7
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((r>   )NN)r@   rA   rB   r,   r_   r1   rp   ry   r	   r   
LongTensorr   r   tuplert   rE   rF   s   @r=   r   r   Q  s    R/ RC R +/59@)||@) #\\@) !.	@)
 !@) !!1!12@) -.@) 
u||Xell3XeELL>Q5RR	S@)r>   r   c                   r    e Zd Zdedef fdZ edd      	 	 	 	 	 	 ddej                  dej                  d	ej                  d
e	ej                     de	ej                     de	e   de	e   de	e   de	ej                     deej                  e	eej                  ej                  f      f   fd       Z xZS )Gemma3DecoderLayerr|   r   c                 ,   t         |           || _        |j                  | _        || _        t        ||      | _        t        |      | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        | j                  j                  | _        |j                   | _        y )N)r|   r   r   )r0   r1   r|   hidden_sizer   r   	self_attnr{   mlpr   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   r   r   s      r=   r1   zGemma3DecoderLayer.__init__  s    !--"()LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'..33$33r>   last_cache_positionz4.53.0)versionr   position_embeddings_globalposition_embeddings_localr   position_idsr   r   	use_cacher   r   c
                    | j                   r?|<t        |	j                  d   | j                        }| j                  j
                  dk(  r|d d | d f   }nt        j                  |j                        j                  }t        j                  t        j                  |t        j                        | j                         }t        j                  |||      }|	d   |z
  dz   }t        j                  |d      }t        j                  t        ||j                  d         |j                         }||z  }|d d d d d d |f   }|}| j#                  |      }| j$                  j                   r|}n|} | j$                  d
||||||||	d	|
\  }}| j'                  |      }||z   }|}| j)                  |      }| j+                  |      }| j-                  |      }||z   }|f}|r||fz  }|S )Nr   r   rw   diagonalr   r   )minr   )r   r   r   r   r   r   r   r   rU   )r   maxr   r   r|   r   rp   finforw   r   tril	ones_liker   whereclamparanger   r   r   r   r   r   r   )r5   r   r   r   r   r   r   r   r   r   rZ   effective_seq_len	min_dtypesliding_window_maskoffsetmask_indexesresidualr   self_attn_weightsoutputss                       r=   rt   zGemma3DecoderLayer.forward  s    ??~9 #N$8$8$;T=P=P Q {{//3FF!/4E3E3F0F!G "KK(<(<=AA	&+jjOON%**EQUQdQdPd'# "'-@)^!\'+.??!CV3  %||)>+?+?+CD^MbMb  &!/1a0E!F ,,]; >>$$";"<+94>> 
,
' 3)%)/)
,
 
,
(( 55mD =0 66}E/77F =0 ")++Gr>   )NNNFFN)r@   rA   rB   r,   r_   r1   r   rp   ry   r	   r   r   r   r   FloatTensorrt   rE   rF   s   @r=   r   r     s   4/ 4C 4 *H= 2637*.,1$)59K||K %*LLK $)<<	K
 !.K u//0K !K $D>K D>K !!1!12K 
u  (51B1BEDUDU1U+V"WW	XK >Kr>   r   c                       e Zd ZdZg dZd Zy)Gemma3PreTrainedModel )r   SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadc                    | j                   j                  }t        |t        j                  t        j
                  f      rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j                  j                  j                  d|       |j                  2|j                  j                  |j                     j                          y y t        |t              r&|j                  j                  j                  d       y t        |t              r%|j                   j                  j                          y y )Nr   )meanstdrx   )r|   rS   rX   nnLinearConv2drv   datanormal_biaszero_	Embeddingrk   r   fill_Gemma3MultiModalProjectormm_input_projection_weight)r5   moduler   s      r=   _init_weightsz#Gemma3PreTrainedModel._init_weights  s   kk++fryy"))45MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> ..MM$$S) 9:--2288: ;r>   N)r@   rA   rB   base_model_prefix_no_split_modulesr   rU   r>   r=   r   r     s    ;r>   r   c                       e Zd ZeZdef fdZ	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee
   deej                     dee   d	ee   d
ee   deej                     dee   defdZ xZS )Gemma3TextModelr|   c                 6   t         |   |       t        |j                  |j                  | j
                  | j                  j                  dz        | _        t        j                  |      }|j                  |_        ddi|_        t        |      | _        y )N      ?)rl   	rope_typedefault)r|   )r0   r1   rh   r6   r   rk   r|   embed_tokenscopydeepcopyr2   r7   r4   r   rotary_emb_localr   s     r=   r1   zGemma3TextModel.__init__  s      :v1143C3CQUQ\Q\QhQhjmQm
 v&"77*I6 5V Dr>   rr   r   r   past_key_valuesinputs_embedsr   r   output_hidden_statesr   flash_attn_kwargsr   c
                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|rA|?| j                  s3|j                  \  }}}t        | j                   |||j                        }|	F||j                         nd}t        j                  |||j                  d   z   |j                         }	||	j#                  d      }| j%                  |||	||      }|}| j'                  ||      }| j)                  ||      }|rdnd }|rdnd }| j*                  d | j                   j,                   D ]t  }|r||fz  }| j
                  r;| j                  r/| j/                  t1        |j2                  fi |
|||||||||	
      }n ||f||||||||	d	|
}|d   }|sl||d   fz  }v | j5                  |      }|r||fz  }t7        ||||
      S )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)max_batch_sizemax_cache_lenrw   r   r   r   rU   )r   r   r   r   r   r   r   r   )last_hidden_stater  r   
attentions)r|   r   r  r   
ValueErrorgradient_checkpointingr   rV   r   r	  r   r   rw   get_seq_lengthrp   r   r   	unsqueeze_update_causal_mask
rotary_embr  layersnum_hidden_layers_gradient_checkpointing_funcr   __call__normr   )r5   rr   r   r   r  r  r   r   r  r   r  
batch_sizer   _past_seen_tokenscausal_maskr   r   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                          r=   rt   zGemma3TextModel.forward+  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0%2%8%8"J))%#))	O !CRC^==?de"\\  =#6#6q#99$++N )33A6L..
 & &*__]L%Q"$($9$9-$V! #7BD0d![[)H4;;+H+HI "	6M#!m%55!**t}} $ A AM22H6GH!.- #%"! !.!!/I.G#.!-#2&7'#1! (! *!,M =#3"55E"	6H 		-0-!11&+++%	
 	
r>   )	NNNNNNNNN)r@   rA   rB   r,   config_classr1   r	   rp   r   ry   r   r   r   r   r   r   rt   rE   rF   s   @r=   r  r    s    #LE/ E" 1515371559$(,0/359t
E,,-t
 !.t
 u//0	t

 "+.t
   1 12t
 D>t
 $D>t
 'tnt
 !!1!12t
 $$89t
 
!t
r>   r  c                   ,     e Zd ZeZdZdef fdZ xZS )Gemma3ForCausalLMlanguage_modelr|   c                 D    t         |   |       t        |      | _        y r/   )r0   r1   r  modelr   s     r=   r1   zGemma3ForCausalLM.__init__  s     $V,
r>   )r@   rA   rB   r,   r*  r  r1   rE   rF   s   @r=   r,  r,    s     #L(-/ - -r>   r,  c                   D     e Zd Zdef fdZdej                  fdZ xZS )r   r|   c                    t         |           t        j                  t	        j
                  |j                  j                  |j                  j                              | _	        t        |j                  j                  |j                  j                        | _        t        |j                  j                  |j                  j                  z        | _        t        |j"                  dz        | _        | j                   | j$                  z  | _        t        j(                  | j&                  | j&                        | _        y )Nr   r  )kernel_sizestride)r0   r1   r   	Parameterrp   zerosrQ   r   rP   r   r   layer_norm_epsmm_soft_emb_normr_   
image_size
patch_sizepatches_per_imagerR   tokens_per_sider2  	AvgPool2davg_poolr   s     r=   r1   z"Gemma3MultiModalProjector.__init__  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[r>   vision_outputsc                    |j                   \  }}}|j                  dd      }|j                  ||| j                  | j                        }|j	                         }| j                  |      }|j                  d      }|j                  dd      }| j                  |      }t        j                  || j                        }|j                  |      S )Nr   r   )r   r   r   r:  r   r=  flattenr7  rp   matmulr   type_as)	r5   r>  r"  r#  
seq_lengthreshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            r=   rt   z!Gemma3MultiModalProjector.forward  s    $2$8$8!
Az"0":":1a"@"9"A"A
D$:$:D<R<R#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EtGfGf#g '//??r>   )	r@   rA   rB   rH   r1   rp   ry   rt   rE   rF   s   @r=   r   r     s#    \| \ @ell @r>   r   c            !          e Zd Zdej                  dej                  fdZ	 ddefdZee		 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dej                  deej                     d	eej                     d
eeeej                     ef      deej                     deej                     deej                     deej                     dee   dee   dee   dee   deeef   fd              Zy)Gemma3Modelpixel_valuesr   c                 `    | j                  |      j                  }| j                  |      }|S )a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )rJ  )vision_towerr  multi_modal_projector)r5   rJ  r>  image_featuress       r=   get_image_featureszGemma3Model.get_image_features  s3     ***EWW33NCr>   is_trainingc                 6   | j                   j                  j                  dk(  r|S ||j                         dk(  r|S t	        |t
              }t        j                  | j                        j                  }|j                  d d \  }	}
|r|j                         }nUt	        |t              r|j                         }n4t	        |t        j                        r|j                  d   n
|d   |
z   dz   }||j                         dk(  r|S t        j                  |
|f|| j                  |j                        }|
dk7  rt        j                   |d      }|t        j"                  ||j                  	      |j%                  dd      kD  z  }|d d d d d d f   j'                  |	ddd      }|`|
dk7  rZ|j)                  d      |j)                  d      k(  }d
||dk(  <   |dk(  }|t*        j,                  j/                  |dd      d d d df    z  }t        j0                  |j3                         d      dz
  }t        j4                  ||t        j6                  |d            }|j)                  d      |j)                  d      k(  }d
||dk(  <   ||z  j)                  d      j9                  |j                  t        j:                        }|j=                         }|d d d d d d d |
f   j?                  |d      |d d d d d d d |
f<   ||j=                         }|j                  d   }|d d d d d d d |f   |d d d d d d f   j9                  |j                        z   }|dk(  }|d d d d d d d |f   j?                  ||      |d d d d d d d |f<   |S )Nr      r   r   r   r   )
fill_valuerw   r   r   r   F)r   r   )valuer   r   r   ) r|   rP   r   r   rX   r   rp   r   rw   r   r   get_max_cache_shaper   ry   fullr   triur   r   expandr  r   
functionalpadcumsumr_   r   	full_likeru   r   clonemasked_fill)r5   r   token_type_idsr  r   input_tensorrP  using_static_cacher   inputs_lead_dimsequence_lengthtarget_lengthr%  token_type_maskis_imagenew_image_startimage_group_idssame_image_mask
image_maskmask_lengthpadding_masks                        r=   r  zGemma3Model._update_causal_mask  s    ;;""77;NN!!%.*<*<*>!*C "!'EKK

+//	+7+=+=bq+A(+??AM5+??AM nell; $$R(#A&81<  %.*<*<*>!*C!!jjm,$**]k]r]r

 a**[1=Ku||M.:O:OPSaSiSijlnoSppp!$a"23::?ArSUV %/Q*>,66q9^=U=UVW=XXO38ONa/0 &*H&"--*;*;HfTU*;*VWXZ][]Z]W]*^)^^O#ll?+>+>+@aH1LO#kk(OU__UcegEhiO-77:o>W>WXY>ZZO5:OOr12)O;FFqILL[M_M_glgqgqLrJ%++-K5@AqJZ?JZAZ5[5g5gC6K1a!1/!112 %%++-K(..r2K 'q!Q'<=qRVX\^_O_@`@c@cdodvdv@wwL'1,L1<Q1l{l=R1S1_1_i2K1a+-. r>   Nrr   r   r   r  r`  r   r  labelsr   r   r  return_dictc                 0   |d u |d uz  rt        d      ||n| j                  j                  }||n| j                  j                  }||n| j                  j                  }|d uxr |	d u}|R| j                  j
                  | j                  k\  r/|| j                  j
                  k(  }|j                         }d||<   n|}| | j                         |      }|F||j                         nd}t        j                  |||j                  d   z   |j                        }|{| j                  |      }|\| | j                         t        j                  | j                  j
                  t        j                   |j                              k(  }nR|| j                  j
                  k(  j#                  d      }|j%                  |      j'                  |j                        }t)               sx||   j+                         |j+                         k7  rT|j-                  d      j-                  d      d   }t        d| d	|j                  d   |j                  d   z   d
      |j'                  |j                  |j.                        }|j1                  ||      }| j3                  ||||||      } | j4                  d|||||
||d|d	|}t7        |j8                  |
r|j:                  nd |j<                  |j>                  |      S d       S )Nr  r   r   r   )rw   r   r   rU  zVNumber of images does not match number of special image tokens in the input text. Got z image tokens in the text but z tokens from image embeddings.T)	r   r   r  r  r   r   r  ro  r   )r  r  r   r  image_hidden_statesrU   ) r  r|   r   r  use_return_dictrM   r6   r^  get_input_embeddingsr  rp   r   r   r   rO  rq   longr  	expand_asru   r   numelsumrw   masked_scatterr  r-  rb   r  r  r   r  )r5   rr   rJ  r   r   r  r`  r   r  rn  r   r   r  ro  	lm_kwargsrP  special_image_maskllm_input_idsr$  rN  image_tokens_in_textr%  r   s                          r=   rt   zGemma3Model.forward.  s_   & -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$D0GV45G  T[[%?%?4??%R!*dkk.H.H!H%OO-M01M,-%M 7D557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 #!44\BN %26Qd6O6O6QLL!;!;5::VcVjVjk7 &" '04;;3M3M&M%X%XY[%\"%7%A%A-%P%S%STaThTh%i"+--@R2S2Y2Y2[_m_s_s_u2u(:'?'?A'?'F'J'Jq'J'QRS'T$ /00N~OcOcdeOfiwi}i}~  jA  PA  OB B44 
 ,..}/C/C]EXEXYN)889K^\M..NO^]\g
 &$%% 
&%+'/!5)
 
 )%777@G33d!//))2>2J
 	

 QU
 	
r>   )F)NNNNNNNNNNNNN)r@   rA   rB   rp   ry   rO  r   r  r   r   r   r   r	   r   r   r   r
   rb   rt   rU   r>   r=   rI  rI    s   u||  * "N N`  '+*.1537KO595959-1$(,0/3&*Y
##Y
 ''Y
 !.	Y

 u//0Y
 "%U->->(?(F"GHY
 !!1!12Y
 !!1!12Y
   1 12Y
 ))*Y
 D>Y
 $D>Y
 'tnY
 d^Y
  
u//	0!Y
  Y
r>   rI  c            "           e Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej
                  dej                  deej                     deej
                     dee	e
ej                     ef      deej
                     deej
                     deej                     d	eej
                     d
ee   dee   dee   dee   de	eej                  f   de	eef   fd       Z	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Gemma3ForConditionalGenerationrr   rJ  r   r   r  r`  r   r  rn  r   r   r  ro  logits_to_keepr   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  } | j                  d||||||||
|	||||d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|	O|j                         }|dddddf   }|	dddf   }||dd|j                  d    df   j                  |j                        }||j                  |j                        dk7     j                         }||j                  |j                        dk7     j                         }n |j                         }|j                         }t        j                         }|j!                  d| j                   j"                  j$                        }|j!                  d      j                  |j                        } |||      }|s|f|dd z   }||f|z   S |S t'        |||j(                  |j*                  |j,                  |j.                        S )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenizer=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        N)rr   rJ  r`  r   r   r  r  r   rn  r   r  ro  r   r   .r   r   )losslogitsr  r   r  rq  rU   )r|   r   r  rr  r/  rX   r_   slicelm_headr`   r   ru   r   r   r   CrossEntropyLossr   rP   r6   rf   r  r   r  rq  )r5   rr   rJ  r   r   r  r`  r   r  rn  r   r   r  ro  r  ry  r   r   slice_indicesr  r  shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsoutputs                               r=   rt   z&Gemma3ForConditionalGeneration.forward  s}   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$** 
%))%+'/!5#)
 
"  
8B>SV8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5DY,F'+'7D7V#CVC+#33!//)) ' ; ;
 	
r>   c                     t        |   |f||||||	|
|d|}|d   dk(  r||d<   |d uxr |d u}|d   dk(  r;t        |t              r+||n|}| j                  j                  ||||||      }||d<   |S )N)r  r  r   r   r   r   r  r`  r   rJ  r   )r0   prepare_inputs_for_generationrX   r   r/  r  )r5   rr   r  r  r   r   rJ  r   r`  r   r  rn  rZ   model_inputsrP  ra  r%  r<   s                    r=   r  z<Gemma3ForConditionalGeneration.prepare_inputs_for_generation  s      w<
+')%)))
 
 !!+7L($D0GV45G!!j+&N,9,E=9L**88Q]_jK .9L)*r>   )NNNNNNNNNNNNNr   )
NNNNNNNTNN)r@   rA   rB   r   rp   r   r   r	   ry   r   r   r   r   r_   r
   rf   rt   r  rE   rF   s   @r=   r~  r~    s    '+*.1537KO595959-1$(,0/3&*34|
##|
 ''|
 !.	|

 u//0|
 "%U->->(?(F"GH|
 !!1!12|
 !!1!12|
   1 12|
 ))*|
 D>|
 $D>|
 'tn|
 d^|
 c5<</0|
" 
u22	3#|
 |
B ) )r>   r~  )rH   r,   r   r  r,  r~  rI  )Sr
  collections.abcr   dataclassesr   	functoolsr   typingr   r   r   r	   r
   r   rp   torch.nnr   torch.utils.checkpointcache_utilsr   r   r   configuration_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.deprecationr   gemma2.configuration_gemma2r   gemma2.modeling_gemma2r   r   r   r    r!   r"   r#   r$   r%   paligemma.modeling_paligemmar&   r'   r(   r)   siglipr*   
get_loggerr@   rV   r,   rH   rb   rf   r   rh   r{   r   r   r   Moduler   GEMMA3_START_DOCSTRINGr   r  r,  r   rI  r~  __all__rU   r>   r=   <module>r     s     $ !  : :    : : 3 B 7 9 5 & X X 0 6
 
 
  ( 
		H	%N%| N%b[## [#| 	 < 	 	 	#B 	 	
SBLL 
S!	 !
M 
!1 !J)o J)Z[ [|  ;1 ;4F
k F
R-) -!@		 !@Hz
. z
zi%F iXr>   