
    Uh                       d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlZd dlmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0m1Z1  e(       rd dl2m3Z3 ddl4m5Z5  e*jl                  e7      Z8e G d de             Z9e G d de%             Z: G d dejv                        Z< G d dejz                        Z> G d d ejz                        Z? G d! d"ejz                        Z@d# ZAdFd$ZBd%ej                  d&eDd'ej                  fd(ZE	 	 	 dGd)ejz                  d*ej                  d+ej                  d,ej                  d-e	ej                     d.eFd/e	eF   d0e	eF   d'e
ej                  ej                  f   fd1ZG G d2 d3ejz                        ZH G d4 d5ejz                        ZIe& G d6 d7e!             ZJe& G d8 d9eJ             ZKe& G d: d;eJe             ZL G d< d=ejz                        ZM e&d>?       G d@ dAeJ             ZN e&dB?       G dC dDeJe             ZOg dEZPy)H    N)Callable)	dataclass)partial)ListOptionalTupleUnion   )ACT2FN)CacheHybridCacheStaticCache)GenerationMixin)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputauto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilinglogging)deprecate_kwarg   )	AutoModel   )Gemma3ConfigGemma3TextConfig)	BlockMask)make_flex_block_causal_maskc                   :    e Zd ZU dZdZeej                     ed<   y)Gemma3ModelOutputWithPasta  
    Base class for Gemma3 outputs, with hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r(   r   torchFloatTensor__annotations__     |/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/gemma3/modeling_gemma3.pyr'   r'   =   s    8 8<%"3"34;r1   r'   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeeej                     ef      ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   y)	Gemma3CausalLMOutputWithPasta  
    Base class for Gemma3 causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr(   )r)   r*   r+   r,   r5   r   r-   r.   r/   r6   r7   r	   r   r   r8   r   r9   r(   r0   r1   r2   r4   r4   ^   s    < )-D(5$$
%,*.FHU&&'.GKOXeD):):$;U$BCDK8<M8E%"3"345<59Ju001297;%"3"34;r1   r4   c            	       Z     e Zd ZdZd	dedededef fdZdej                  f fdZ	 xZ
S )
Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scalec                 v    t         |   |||       | j                  dt        j                  |      d       y )Nr?   F
persistent)super__init__register_bufferr-   tensor)selfr<   r=   r>   r?   	__class__s        r2   rD   z&Gemma3TextScaledWordEmbedding.__init__   s3    D]ELL,ERWXr1   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  S N)rC   forwardr?   toweightdtype)rG   rI   rH   s     r2   rL   z%Gemma3TextScaledWordEmbedding.forward   s2    wy)D,<,<,?,?@Q@Q,RRRr1   )      ?)r)   r*   r+   r,   intfloatrD   r-   TensorrL   __classcell__rH   s   @r2   r;   r;      sG    Ys Y3 YS Y_d YS S Sr1   r;   c                   *     e Zd Zdef fdZd Z xZS )	Gemma3MLPconfigc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y NFbias)rC   rD   rX   hidden_sizeintermediate_sizennLinear	gate_projup_proj	down_projr   hidden_activationact_fnrG   rX   rH   s     r2   rD   zGemma3MLP.__init__   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556r1   c                     | j                  | j                  | j                  |            | j                  |      z        }|S rK   )rc   re   ra   rb   )rG   xrc   s      r2   rL   zGemma3MLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r1   )r)   r*   r+   r#   rD   rL   rT   rU   s   @r2   rW   rW      s    7/ 7r1   rW   c                   <     e Zd Zddedef fdZd Zd Zd Z xZ	S )Gemma3RMSNormdimepsc                     t         |           || _        t        j                  t        j                  |            | _        y rK   )rC   rD   rl   r_   	Parameterr-   zerosrN   )rG   rk   rl   rH   s      r2   rD   zGemma3RMSNorm.__init__   s.    ll5;;s#34r1   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S )Nr   T)keepdim)r-   rsqrtpowmeanrl   )rG   rh   s     r2   _normzGemma3RMSNorm._norm   s4    5;;quuQx}}R}>IJJJr1   c                     | j                  |j                               }|d| j                  j                         z   z  }|j                  |      S )NrP   )rv   rR   rN   type_as)rG   rh   outputs      r2   rL   zGemma3RMSNorm.forward   sC    AGGI& 3!2!2!445~~a  r1   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tuplerN   shaperl   rG   s    r2   
extra_reprzGemma3RMSNorm.extra_repr   s'    ))*+6$((<<r1   )gư>)
r)   r*   r+   rQ   rR   rD   rv   rL   r~   rT   rU   s   @r2   rj   rj      s&    5C 5e 5
K!=r1   rj   c                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )Gemma3RotaryEmbeddingrX   c                    t         |           t        |d      rG|j                  ;|j                  j	                  d|j                  j	                  d            | _        nd| _        |j                  | _        |j                  | _        || _	        t        | j
                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                  | _        y )Nrope_scaling	rope_typetypedefaultinv_freqFrA   )rC   rD   hasattrr   getr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrX   r   rope_init_fnattention_scalingrE   r   original_inv_freq)rG   rX   devicer   rH   s       r2   rD   zGemma3RotaryEmbedding.__init__   s    6>*v/B/B/N#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r1   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rq   r!   mpscpuF)device_typeenabledr   rk   rO   )r   rR   expandr|   rM   r   
isinstancer   strr-   autocast	transposecatcosr   sinrO   )
rG   rh   position_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r2   rL   zGemma3RotaryEmbedding.forward   sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.rK   )
r)   r*   r+   r#   rD   r-   no_gradr   rL   rT   rU   s   @r2   r   r      s4    // /" U]]_<  <r1   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nrq   r   r   )r|   r-   r   )rh   x1x2s      r2   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r1   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r2   apply_rotary_pos_embr      sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr1   r8   n_repreturnc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)r|   r   reshape)r8   r   batchnum_key_value_headsslenhead_dims         r2   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr1   modulequerykeyvalueattention_maskdropoutscalingsoftcapc                    || j                   dz  }t        || j                        }	t        || j                        }
t        j                  ||	j                  dd            |z  }|||z  }t        j                  |      }||z  }|#|d d d d d d d |	j                  d   f   }||z   }t        j                  j                  |dt        j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||
      }|j                  dd      j!                         }||fS )	N      r   r
   rq   )rk   rO   )ptrainingr!   )r   r   num_key_value_groupsr-   matmulr   tanhr|   r_   
functionalsoftmaxfloat32rM   rO   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                 r2   eager_attention_forwardr     sA    //4'3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL#g-zz,/#g-!$Q1.D
0@0@0D.D%DE#k1 ==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r1   c                       e Zd ZdZdedef fdZ	 	 ddej                  dej                  de	ej                     de	e
   d	e	ej                     d
ee   deej                  e	ej                     e	eej                        f   fdZ xZS )Gemma3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrX   	layer_idxc                    t         |           t        |dz   |j                  z        | _        || _        || _        t        |d|j                  |j                  z        | _
        |j                  |j                  z  | _        |j                  dz  | _        | j
                  j                  | _        d| _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  | j                  z  |j                  |j&                        | _        | j
                  j0                  | _        | j                  r|j2                  nd | _        t5        |j                  |j6                        | _        t5        |j                  |j6                        | _        y )Nr!   r   r   Tr[   )rk   rl   )rC   rD   boolsliding_window_pattern
is_slidingrX   r   getattrr]   num_attention_headsr   r   r   query_pre_attn_scalarr   attention_dropout	is_causalr_   r`   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_windowrj   rms_norm_epsq_normk_normrG   rX   r   rH   s      r2   rD   zGemma3Attention.__init__-  s   	A1N1NNO"
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7;f33D#V=P=PQ#V=P=PQr1   r8   position_embeddingsr   past_key_valuecache_positionr   r   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }| j                  |	      }	| j                  |
      }
|\  }}t        |	|
||      \  }	}
|~|||| j                  d}|j                  |
|| j                  |      \  }
}|J| j                  j                  dk(  r1|j                   d   }|
d d d d d |d d f   |d d d d d |d d f   }}
t        }| j                  j                  dk7  r^| j                  j                  dk(  r(|j!                  dd	      rt"        j%                  d
       nt&        | j                  j                     }||j)                  |	      } || |	|
||f| j*                  r| j,                  nd| j.                  | j                  d|\  }} |j0                  g |d j3                         }| j5                  |      }||fS )Nrq   r!   r   )r   r   r   r   flash_attention_2eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   r   )r|   r   r   viewr   r   r   r   r   r   r   updater   rX   _attn_implementationr   r   loggerwarning_oncer   rM   r   r   r   r   r   r   )rG   r8   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsseq_lenattention_interfacer   r   s                      r2   rL   zGemma3Attention.forwardJ  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j% "0"&"5"5	L (6'<'<ZW[WeWegs't$J )dkk.N.NRe.e(..r2+5aHWHa6G+H,WXZ[]e^e]eghWhJiL
(?;;++w6{{//69fjjI\^c>d##L '>dkk>^>^&_#%+..|<N$7
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((r1   )NN)r)   r*   r+   r,   r#   rQ   rD   r-   rS   r   r   
LongTensorr   r   r{   rL   rT   rU   s   @r2   r   r   *  s    GR/ RC RD +/59@)||@) #\\@) !.	@)
 !@) !!1!12@) -.@) 
u||Xell3XeELL>Q5RR	S@)r1   r   c                   r    e Zd Zdedef fdZ edd      	 	 	 	 	 	 ddej                  dej                  d	ej                  d
e	ej                     de	ej                     de	e   de	e   de	e   de	ej                     deej                  e	eej                  ej                  f      f   fd       Z xZS )Gemma3DecoderLayerrX   r   c                 ,   t         |           || _        |j                  | _        || _        t        ||      | _        t        |      | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        | j                  j                  | _        |j                   | _        y )N)rX   r   rl   )rC   rD   rX   r]   r   r   	self_attnrW   mlprj   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   r   r   s      r2   rD   zGemma3DecoderLayer.__init__  s    !--"()LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'..33$33r1   last_cache_positionz4.53.0)versionr8   position_embeddings_globalposition_embeddings_localr   r   r   r   	use_cacher   r   c
                    | j                   r?|<t        |	j                  d   | j                        }| j                  j
                  dk(  r|d d | d f   }nt        j                  |j                        j                  }t        j                  t        j                  |t        j                        | j                         }t        j                  |||      }|	d   |z
  dz   }t        j                  |d      }t        j                  t        ||j                  d         |j                         }||z  }|d d d d d d |f   }|}| j#                  |      }| j$                  j                   r|}n|} | j$                  d
||||||||	d	|
\  }}| j'                  |      }||z   }|}| j)                  |      }| j+                  |      }| j-                  |      }||z   }|f}|r||fz  }|S )Nr   r   r   diagonalrq   r!   )minr   )r8   r   r   r   r   r   r  r   r0   )r   maxr|   r   rX   r   r-   finforO   r  tril	ones_liker   whereclamparanger   r  r  r  r	  r  r
  )rG   r8   r  r  r   r   r   r   r  r   r   effective_seq_len	min_dtypesliding_window_maskoffsetmask_indexesresidualr   self_attn_weightsoutputss                       r2   rL   zGemma3DecoderLayer.forward  s    ??~9 #N$8$8$;T=P=P Q {{//3FF!/4E3E3F0F!G "KK(<(<=AA	&+jjOON%**EQUQdQdPd'# "'-@)^!\'+.??!CV3  %||)>+?+?+CD^MbMb  &!/1a0E!F ,,]; >>$$";"<+94>> 
,
' 3)%)/)
,
 
,
(( 55mD =0 66}E/77F =0 ")++Gr1   )NNNFFN)r)   r*   r+   r#   rQ   rD   r   r-   rS   r   r   r   r   r{   r.   rL   rT   rU   s   @r2   r  r    s   4/ 4C 4 *H= 2637*.,1$)59K||K %*LLK $)<<	K
 !.K u//0K !K $D>K D>K !!1!12K 
u  (51B1BEDUDU1U+V"WW	XK >Kr1   r  c                   H    e Zd ZeZdZdZg dZdgZdZ	dZ
dZdZdZdZdZd Zy)Gemma3PreTrainedModel T)r  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadr7   c                    | j                   j                  }t        |t        j                  t        j
                  f      rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j                  j                  j                  d|       |j                  2|j                  j                  |j                     j                          y y t        |t              r&|j                  j                  j                  d       y t        |t              r%|j                   j                  j                          y y )Nr   )ru   stdrP   )rX   initializer_ranger   r_   r`   Conv2drN   datanormal_r\   zero_	Embeddingr>   rj   fill_Gemma3MultiModalProjectormm_input_projection_weight)rG   r   r+  s      r2   _init_weightsz#Gemma3PreTrainedModel._init_weights  s   kk++fryy"))45MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> ..MM$$S) 9:--2288: ;r1   N)r)   r*   r+   r"   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_quantized_cache_supports_static_cache_supports_attention_backendr5  r0   r1   r2   r%  r%    sT    L&*# $5"5!N  $!"&;r1   r%  c                   $    e Zd ZeZdef fdZd Zd Zee		 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     de
e   d	e
ej                     d
e
e   de
e   de
e   de
ej                     dee   defd              Z ej*                         	 ddeej                  df   dej                  dej                  dedef
d       Zedej                  dededej4                  dej                  defd       Z xZS )Gemma3TextModelrX   c           	         t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  | j                  | j                  j                  dz        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                         | _        t%        |      | _        d| _        t+        j,                  |      }|j.                  |_        ddi|_        t%        |      | _        | j7                          y c c}w )N      ?)r?   r  rX   Fr   r   )rC   rD   pad_token_idr>   
vocab_sizer;   r]   rX   embed_tokensr_   
ModuleListrangenum_hidden_layersr  layersrj   r   normr   
rotary_embgradient_checkpointingcopydeepcopyrope_local_base_freq
rope_thetar   rotary_emb_local	post_initr   s      r2   rD   zGemma3TextModel.__init__  s    !.. ++ :v1143C3CQUQ\Q\QhQhjmQm
 mmDI&JbJbDcdy	2d
 "&"4"4&:M:MN	/v>&+# v&"77*I6 5V D 	 es   "Ec                     | j                   S rK   rI  r}   s    r2   get_input_embeddingsz$Gemma3TextModel.get_input_embeddings.  s       r1   c                     || _         y rK   rX  rG   r   s     r2   set_input_embeddingsz$Gemma3TextModel.set_input_embeddings1  s
    !r1   rI   r   r   r7   inputs_embedsr  r   output_hidden_statesr   flash_attn_kwargsr   c
                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|rA|?| j                  s3|j                  \  }}}t        | j                   |||j                        }|	F||j                         nd}t        j                  |||j                  d   z   |j                         }	||	j#                  d      }| j%                  |||	||      }|}| j'                  ||      }| j)                  ||      }|rdnd }|rdnd }| j*                  d | j                   j,                   D ]t  }|r||fz  }| j
                  r;| j                  r/| j/                  t1        |j2                  fi |
|||||||||	
      }n ||f||||||||	d	|
}|d   }|sl||d   fz  }v | j5                  |      }|r||fz  }t7        ||||
      S )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)max_batch_sizemax_cache_lenrO   r   r!   r  r0   )r  r  r   r   r   r   r  r   )last_hidden_stater7   r8   r9   )rX   r   r^  r  
ValueErrorrP  r   r   r   rI  r|   r   rO   get_seq_lengthr-   r  r   r   _update_causal_maskrO  rU  rM  rL  _gradient_checkpointing_funcr   __call__rN  r   )rG   rI   r   r   r7   r]  r  r   r^  r   r_  
batch_sizer   _past_seen_tokensr   r8   r  r  all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                          r2   rL   zGemma3TextModel.forward4  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0%2%8%8"J))%#))	O !CRC^==?de"\\  =#6#6q#99$++N )33A6L..
 & &*__]L%Q"$($9$9-$V! #7BD0d![[)H4;;+H+HI "	6M#!m%55!**t}} $ A AM22H6GH!.- #%"! !.!!/I.G#.!-#2&7'#1! (! *!,M =#3"55E"	6H 		-0-!11&+++%	
 	
r1   r$   input_tensorc           
         | j                   j                  dk(  r|S | j                   j                  dk(  r't        |t        j                        rt        |      }|S |j                  |j                  }}|j                  d   }t        |t        t        f      r|j                         }	n ||j                  d   n|j                  d   }	| j                  |||	||||j                  d         }
|
S )Nr   flex_attentionr!   rq   r   sequence_lengthtarget_lengthrO   r   r   rj  )rX   r   r   r-   rS   r%   rO   r   r|   r   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_position)rG   r   rq  r   r7   r   rO   r   ru  rv  r   s              r2   rg  z#Gemma3TextModel._update_causal_mask  s     ;;++/BB!!;;++/??.%,,7!<^!L!!$**L,?,?v&,,Q/o['AB+??AM8F8RN004XdXjXjklXmM PP+')#))!, Q 
 r1   ru  rv  rO   rj  c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S 	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   
fill_valuerO   r   r!   r  r  rq   r   rk   r-   r  r  fullr   triur  r   r   cloner|   rM   masked_fillr   ru  rv  rO   r   rj  r   r   r  mask_lengthpadding_masks              r2   rx  zEGemma3TextModel._prepare_4d_causal_attention_mask_with_cache_position     < %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r1   )	NNNNNNNNNF)r)   r*   r+   r#   r6  rD   rY  r\  r   r   r   r-   r   rS   r   r.   r   r   r   r   rL   r   r	   rg  staticmethodrQ   rO   rx  rT   rU   s   @r2   rC  rC    s   #L/ 4!"  1515371559$(,0/359t
E,,-t
 !.t
 u//0	t

 "+.t
   1 12t
 D>t
 $D>t
 'tnt
 !!1!12t
 $$89t
 
!t
  t
l U]]_ #($ellK78$ ll$ 	$
 %$  $ $L 444 4 {{	4
 4 4 4r1   rC  c                       e Zd ZdgZddiZddgdgfiZeZdZdef fdZ	d	 Z
d
 Zd Zd Zd Zd Zee	 	 	 	 	 	 	 	 	 	 	 ddeej(                     deej*                     deej(                     dee   deej.                     deej(                     dee   dee   dee   deej(                     deeej*                  f   defd              Z	 	 	 	 	 	 	 d fd	Z xZS )Gemma3ForCausalLMlm_head.weightlm_headcolwise_repr8   r6   language_modelrX   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y rZ   )
rC   rD   rC  modelrH  r_   r`   r]   r  rV  rf   s     r2   rD   zGemma3ForCausalLM.__init__  sU     $V,
 ++yy!3!3V5F5FUS 	r1   c                 .    | j                   j                  S rK   r  rI  r}   s    r2   rY  z&Gemma3ForCausalLM.get_input_embeddings  s    zz&&&r1   c                 &    || j                   _        y rK   r  r[  s     r2   r\  z&Gemma3ForCausalLM.set_input_embeddings  s    "'

r1   c                     | j                   S rK   r  r}   s    r2   get_output_embeddingsz'Gemma3ForCausalLM.get_output_embeddings"      ||r1   c                     || _         y rK   r  rG   new_embeddingss     r2   set_output_embeddingsz'Gemma3ForCausalLM.set_output_embeddings%  	    %r1   c                     || _         y rK   r  )rG   decoders     r2   set_decoderzGemma3ForCausalLM.set_decoder(  s	    
r1   c                     | j                   S rK   r  r}   s    r2   get_decoderzGemma3ForCausalLM.get_decoder+  s    zzr1   rI   r   r   r7   r]  labelsr  r   r^  r   logits_to_keepr   c                 .   | j                   rF| j                  j                  dk7  r-t        j	                  d| j                  j                   d       ||n| j                  j
                  }|	|	n| j                  j                  }	 | j                  d||||||||	|
d	|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }| j                  j                  G|| j                  j                  z  }t        j                  |      }|| j                  j                  z  }d}| | j                   ||| j"                  fi |}t%        |||j&                  |j(                  |j*                        S )a'  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3ForCausalLM

        >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```r   zhIt is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `zp`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.N)	rI   r   r   r7   r]  r  r   r^  r   )r5   r6   r7   r8   r9   r0   )r   rX   r   r   r   r   r^  r  rd  r   rQ   slicer  final_logit_softcappingr-   r   loss_functionrH  r   r7   r8   r9   )rG   rI   r   r   r7   r]  r  r  r   r^  r   r  loss_kwargsr#  r8   slice_indicesr6   r5   s                     r2   rL   zGemma3ForCausalLM.forward.  s   P ==T[[==H#{{??@  Aqr 2C1N-TXT_T_TqTq$8$D $++JjJj 	 ,64:: ,
)%+'/!5),
 ,
  118B>SV8W~ot4]kmA}a,?@A;;..:dkkAAAFZZ'FdkkAAAF%4%%ffdooUUD%#33!//))
 	
r1   c	                    t        |   |f|||||||d|	}
||
j                  dd       }t        |t              r|j
                  dk(  r| j                  j                  dk(  s|
d   #|
d   j                  \  }}}|
d   j                  }n!|
d   j                  \  }}|
d   j                  }| j                  j                  |||j                         | j                  j                  j                  |||      }||
d<   |
S )	N)r7   r   r]  r   r   r  r  r  r   r   r]  rI   rt  r   )rC   prepare_inputs_for_generationpopr   r   ndimrX   r   r|   r   r  rx  rw  r  rN   rO   )rG   rI   r7   r   r]  r   r   r  r  r   model_inputsrk  rj  ru  r   rH   s                  r2   r  z/Gemma3ForCausalLM.prepare_inputs_for_generation  s5    w<

+)')%)

 

 !  !148A 4##q(KK448KKO,81=o1N1T1T.
OQ%o6==.:;.G.M.M+
O%k299!ZZ]] /-AACll))//-% ^ N .<L)*r1   )NNNNNNNNNNr   )NNNNNTN)r)   r*   r+   _tied_weights_keys_tp_plan_pp_planr#   r6  r7  rD   rY  r\  r  r  r  r  r   r   r   r-   r   rS   r   r.   r   r	   rQ   r   rL   r  rT   rU   s   @r2   r  r    s   *+=)H_-z:;H#L(/ '(&  1515371559-1$(,0/35934P
E,,-P
 !.P
 u//0	P

 "+.P
   1 12P
 ))*P
 D>P
 $D>P
 'tnP
 !!1!12P
 c5<</0P
 
 P
  P
j 4 4r1   r  c                   D     e Zd Zdef fdZdej                  fdZ xZS )r3  rX   c                    t         |           t        j                  t	        j
                  |j                  j                  |j                  j                              | _	        t        |j                  j                  |j                  j                        | _        t        |j                  j                  |j                  j                  z        | _        t        |j"                  dz        | _        | j                   | j$                  z  | _        t        j(                  | j&                  | j&                        | _        y )Nr  rE  )kernel_sizestride)rC   rD   r_   rn   r-   ro   vision_configr]   text_configr4  rj   layer_norm_epsmm_soft_emb_normrQ   
image_size
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_sider  	AvgPool2davg_poolrf   s     r2   rD   z"Gemma3MultiModalProjector.__init__  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[r1   vision_outputsc                    |j                   \  }}}|j                  dd      }|j                  ||| j                  | j                        }|j	                         }| j                  |      }|j                  d      }|j                  dd      }| j                  |      }t        j                  || j                        }|j                  |      S )Nr!   r   )r|   r   r   r  r   r  flattenr  r-   r   r4  rx   )	rG   r  rj  rk  
seq_lengthreshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            r2   rL   z!Gemma3MultiModalProjector.forward  s    $2$8$8!
Az"0":":1a"@"9"A"A
D$:$:D<R<R#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EtGfGf#g '//??r1   )	r)   r*   r+   r"   rD   r-   rS   rL   rT   rU   s   @r2   r3  r3    s#    \| \ @ell @r1   r3  zx
    The Base Gemma3 model which consists of a vision backbone and a language model withou language modeling head.,
    )custom_introc            !            e Zd ZddiZdef fdZd Zd Z	 ddefdZ	d	e
j                  d
e
j                  fdZee	 	 	 	 	 	 	 	 	 	 	 	 	 dde
j                  d	e
j                   dee
j                     dee
j                     deeee
j                      ef      dee
j                     dee
j                     dee
j                      dee
j                     dee   dee   dee   dee   d
eeef   fd              Z xZS )Gemma3Modelzlanguage_model.modelr  rX   c                    t         |   |       t        j                  |j                        | _        t        |      | _        |j                  j                  | _	        t        j                  |j                        }|| _
        | j                  j                  | j                  j                  nd| _        | j                          y )NrF  rq   )rC   rD   r    from_configr  vision_towerr3  multi_modal_projectorr  rH  r  rX   rG  rV  )rG   rX   r  rH   s      r2   rD   zGemma3Model.__init__  s     %119M9MN%>v%F" ,,77"..f6H6HI,8<8P8P8\DKK44bdr1   c                 6    | j                   j                         S rK   )r  rY  r}   s    r2   rY  z Gemma3Model.get_input_embeddings  s    ""7799r1   c                 :    | j                   j                  |       y rK   )r  r\  r[  s     r2   r\  z Gemma3Model.set_input_embeddings  s    007r1   is_trainingc                 6   | j                   j                  j                  dk(  r|S ||j                         dk(  r|S t	        |t
              }t        j                  | j                        j                  }|j                  d d \  }	}
|r|j                         }nUt	        |t              r|j                         }n4t	        |t        j                        r|j                  d   n
|d   |
z   dz   }||j                         dk(  r|S t        j                  |
|f|| j                  |j                        }|
dk7  rt        j                   |d      }|t        j"                  ||j                  	      |j%                  dd      kD  z  }|d d d d d d f   j'                  |	ddd      }|`|
dk7  rZ|j)                  d      |j)                  d      k(  }d
||dk(  <   |dk(  }|t*        j,                  j/                  |dd      d d d df    z  }t        j0                  |j3                         d      dz
  }t        j4                  ||t        j6                  |d            }|j)                  d      |j)                  d      k(  }d
||dk(  <   ||z  j)                  d      j9                  |j                  t        j:                        }|j=                         }|d d d d d d d |
f   j?                  |d      |d d d d d d d |
f<   ||j=                         }|j                  d   }|d d d d d d d |f   |d d d d d d f   j9                  |j                        z   }|dk(  }|d d d d d d d |f   j?                  ||      |d d d d d d d |f<   |S )Nr   r{  r   rq   r   r!   r|  r  r  F)r!   r   )r   r   r   r   ) rX   r  r   rk   r   r   r-   r  rO   r  r|   rw  r   rS   r  r   r  r  r   r   r   r_   r   padcumsumrQ   r  	full_likerM   r   r  r  )rG   r   token_type_idsr7   r   rq  r  using_static_cacher  inputs_lead_dimru  rv  r   token_type_maskis_imagenew_image_startimage_group_idssame_image_mask
image_maskr  r  s                        r2   rg  zGemma3Model._update_causal_mask  s    ;;""77;NN!!%.*<*<*>!*C "!'EKK

+//	+7+=+=bq+A(+??AM5+??AM nell; $$R(#A&81<  %.*<*<*>!*C!!jjm,$**]k]r]r

 a**[1=Ku||M.:O:OPSaSiSijlnoSppp!$a"23::?ArSUV %/Q*>,66q9^=U=UVW=XXO38ONa/0 &*H&"--*;*;HfTU*;*VWXZ][]Z]W]*^)^^O#ll?+>+>+@aH1LO#kk(OU__UcegEhiO-77:o>W>WXY>ZZO5:OOr12)O;FFqILL[M_M_glgqgqLrJ%++-K5@AqJZ?JZAZ5[5g5gC6K1a!1/!112 %%++-K(..r2K 'q!Q'<=qRVX\^_O_@`@c@cdodvdv@wwL'1,L1<Q1l{l=R1S1_1_i2K1a+-. r1   pixel_valuesr   c                 `    | j                  |      j                  }| j                  |      }|S )a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )r  )r  rd  r  )rG   r  r  image_featuress       r2   get_image_featureszGemma3Model.get_image_featuresG  s3     ***EWW33NCr1   rI   r   r   r7   r  r   r]  r  r  r   r^  return_dictc                 0   |du |duz  rt        d      ||n| j                  j                  }||n| j                  j                  }||n| j                  j                  }|duxr |	du}|R| j                  j
                  | j                  k\  r/|| j                  j
                  k(  }|j                         }d||<   n|}| | j                         |      }|F||j                         nd}t        j                  |||j                  d   z   |j                        }|{| j                  |      }|\| | j                         t        j                  | j                  j
                  t        j                   |j                              k(  }nR|| j                  j
                  k(  j#                  d      }|j%                  |      j'                  |j                        }t)               sx||   j+                         |j+                         k7  rT|j-                  d      j-                  d      d   }t        d	| d
|j                  d   |j                  d   z   d      |j'                  |j                  |j.                        }|j1                  ||      }| j3                  ||||||      } | j4                  d|||||
||d|d	|}t7        |j8                  |
r|j:                  nd|j<                  |j>                  |      S d      S )a]  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma32-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma32-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nra  r   r!   r  )rO   r   rq   r   zVNumber of images does not match number of special image tokens in the input text. Got z image tokens in the text but z tokens from image embeddings.T)	r   r   r7   r]  r  r   r^  r  r   )rd  r7   r8   r9   r(   r0   ) re  rX   r   r^  use_return_dictimage_token_idrH  r  rY  rf  r-   r  r|   r   r  rF   longr   	expand_asrM   r   numelsumrO   masked_scatterrg  r  r'   rd  r7   r8   r9   )rG   rI   r  r   r   r7   r  r   r]  r  r  r   r^  r  	lm_kwargsr  special_image_maskllm_input_idsrl  r  image_tokens_in_textr   r#  s                          r2   rL   zGemma3Model.forwardU  s`   \ -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$D0GV45G  T[[%?%?4??%R!*dkk.H.H!H%OO-M01M,-%M 7D557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 #!44\BN %26Qd6O6O6QLL!;!;5::VcVjVjk7 &" '04;;3M3M&M%X%XY[%\"%7%A%A-%P%S%STaThTh%i"+--@R2S2Y2Y2[_m_s_s_u2u(:'?'?A'?'F'J'Jq'J'QRS'T$ /00N~OcOcdeOfiwi}i}~  jA  PA  OB B44 
 ,..}/C/C]EXEXYN)889K^\M..NO^]\g
 &$%% 
&%+'/!5)
 
 )%777@G33d!//))2>2J
 	

 QU
 	
r1   r  )NNNNNNNNNNNNN)r)   r*   r+   _checkpoint_conversion_mappingr"   rD   rY  r\  r   rg  r-   rS   r  r   r   r   r.   r   r	   r   r   r   r'   rL   rT   rU   s   @r2   r  r    s    '=>N%O"
| 
:8 "N N`u||    '+*.1537KO595959-1$(,0/3&*t
##t
 ''t
 !.	t

 u//0t
 "%U->->(?(F"GHt
 !!1!12t
 !!1!12t
   1 12t
 ))*t
 D>t
 $D>t
 'tnt
 d^t
  
u//	0!t
  t
r1   r  zy
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.,
    c            "           e Zd ZdddddZdgZdef fdZd	 Zd
 Zd Z	d Z
ed        Zed        Zed        Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 d&dej"                  dej$                  deej(                     deej"                     deeeej$                     ef      deej"                     deej"                     deej$                     deej"                     dee   dee   dee   dee   deeej(                  f   deeef   fd       Z	 	 	 	 	 	 	 	 	 	 d' fd 	Zedej(                  d!ed"ed#ej>                  dej(                  d$efd%       Z  xZ!S )(Gemma3ForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorr  )z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headr  rX   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y rZ   )rC   rD   r  r  r_   r`   r  r]   rH  r  rV  rf   s     r2   rD   z'Gemma3ForConditionalGeneration.__init__  sS      (
yy!3!3!?!?ASASA^A^ejkr1   c                 6    | j                   j                         S rK   )r  rY  r}   s    r2   rY  z3Gemma3ForConditionalGeneration.get_input_embeddings  s    zz..00r1   c                 :    | j                   j                  |       y rK   )r  r\  r[  s     r2   r\  z3Gemma3ForConditionalGeneration.set_input_embeddings  s    

''.r1   c                     | j                   S rK   r  r}   s    r2   r  z4Gemma3ForConditionalGeneration.get_output_embeddings  r  r1   c                     || _         y rK   r  r  s     r2   r  z4Gemma3ForConditionalGeneration.set_output_embeddings  r  r1   c                 .    | j                   j                  S rK   )r  r  r}   s    r2   r  z-Gemma3ForConditionalGeneration.language_model  s    zz(((r1   c                 .    | j                   j                  S rK   )r  r  r}   s    r2   r  z+Gemma3ForConditionalGeneration.vision_tower  s    zz&&&r1   c                 .    | j                   j                  S rK   )r  r  r}   s    r2   r  z4Gemma3ForConditionalGeneration.multi_modal_projector  s    zz///r1   rI   r  r   r   r7   r  r   r]  r  r  r   r^  r  r  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  } | j                  d||||||||
|	||||d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|	O|j                         }|dddddf   }|	dddf   }||dd|j                  d    df   j                  |j                        }||j                  |j                        dk7     j                         }||j                  |j                        dk7     j                         }n |j                         }|j                         }t        j                         }|j!                  d| j                   j"                  j$                        }|j!                  d      j                  |j                        } |||      }|s|f|dd z   }||f|z   S |S t'        |||j(                  |j*                  |j,                  |j.                        S )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenizer=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        N)rI   r  r  r   r   r7   r]  r  r  r   r^  r  r   r   .rq   r!   )r5   r6   r7   r8   r9   r(   r0   )rX   r   r^  r  r  r   rQ   r  r  rR   r|   rM   r   r   r_   CrossEntropyLossr   r  rH  r4   r7   r8   r9   r(   )rG   rI   r  r   r   r7   r  r   r]  r  r  r   r^  r  r  r  r#  r8   r  r6   r5   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsry   s                               r2   rL   z&Gemma3ForConditionalGeneration.forward  s}   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$** 
%))%+'/!5#)
 
"  
8B>SV8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5DY,F'+'7D7V#CVC+#33!//)) ' ; ;
 	
r1   c                     t        |   |f||||||	|
|d|}|d   dk(  r||d<   |d uxr |d u}|d   dk(  r;t        |t              r+||n|}| j                  j                  ||||||      }||d<   |S )N)r7   r]  r   r   r   r  r  r  r   r  r   )rC   r  r   r   r  rg  )rG   rI   r7   r]  r   r   r  r   r  r  r  r  r   r  r  rq  r   rH   s                    r2   r  z<Gemma3ForConditionalGeneration.prepare_inputs_for_generationz  s      w<
+')%)))
 
 !!+7L($D0GV45G!!j+&N,9,E=9L**88Q]_jK .9L)*r1   ru  rv  rO   rj  c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S rz  r~  r  s              r2   rx  zTGemma3ForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position  r  r1   )NNNNNNNNNNNNNr   )
NNNNNNNTNN)"r)   r*   r+   r  r  r"   rD   rY  r\  r  r  propertyr  r  r  r   r-   r   r.   r   rS   r	   r   r   r   rQ   r   r4   rL   r  r  rO   rx  rT   rU   s   @r2   r  r    ss    "8-"?#,	&" ++| 1/& ) ) ' ' 0 0  '+*.1537KO595959-1$(,0/3&*34|
##|
 ''|
 !.	|

 u//0|
 "%U->->(?(F"GH|
 !!1!12|
 !!1!12|
   1 12|
 ))*|
 D>|
 $D>|
 'tn|
 d^|
 c5<</0|
" 
u22	3#|
 |
B )V 444 4 {{	4
 4 4 4r1   r  )r%  rC  r  r  r  )Nr!   )r   NN)QrQ  collections.abcr   dataclassesr   	functoolsr   typingr   r   r   r	   r-   torch.nnr_   activationsr   cache_utilsr   r   r   
generationr   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.deprecationr   autor    configuration_gemma3r"   r#   !torch.nn.attention.flex_attentionr$   integrations.flex_attentionr%   
get_loggerr)   r   r'   r4   r1  r;   ModulerW   rj   r   r   r   rS   rQ   r   rR   r   r   r  r%  rC  r  r3  r  r  __all__r0   r1   r2   <module>r     s  ,  $ !  / /   ! : : ) B O K F &  1  @  !;J 
		H	% < 7 < <@ $<; $< $<N
SBLL 
S		  =BII =(<BII <D(6	UU\\ 	U# 	U%,, 	U$ ## %II %<< % 
 % <<	 %
 U\\* %  % e_ % e_ % 5<<%& %F`)bii `)F[ [| !;O !; !;H w+ w wt j- j jZ!@		 !@H 
i
' i

i
X 
G%:O G
GTr1   