
    Uh/}                        d dl mZ d dlmZmZmZmZ d dlZd dlm	Z	 d dl
ZddlmZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*  e       rd dl+m,Z, ddl-m.Z.  ej^                  e0      Z1 G d de      Z2 G d de(      Z3 G d de&      Z4	 	 	 d/de	jj                  dejl                  dejl                  dejl                  deejl                     de7dee7   dee7   d eejl                  ejl                  f   fd!Z8 G d" d#e"      Z9 G d$ d%e	jj                        Z: G d& d'e'      Z; G d( d)e#      Z< G d* d+e$      Z= G d, d-e%      Z>g d.Z?y)0    )partial)CallableOptionalTupleUnionN   )ACT2FN)CacheHybridCacheStaticCache)PretrainedConfig)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)is_torch_flex_attn_availablelogging)deprecate_kwarg   )	GemmaAttentionGemmaForCausalLMGemmaForSequenceClassificationGemmaForTokenClassificationGemmaMLP
GemmaModelGemmaRMSNormapply_rotary_pos_emb	repeat_kv)	BlockMask)make_flex_block_causal_maskc                        e Zd ZdZdZdgZddddddddZdgdgfd	d
gd	gfd	gd	gfdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZ	S )Gemma2Configa  
    This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate an Gemma2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma2-7B.
    e.g. [google/gemma2-7b](https://huggingface.co/google/gemma2-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma2Model`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256): scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096): in Gemma2, every other layer uses sliding window attention. This is the
            size of the sliding window.
        final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*, defaults to 50.0): scaling factor when applying tanh softcapping on the attention scores.
        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.

    ```python
    >>> from transformers import Gemma2Model, Gemma2Config
    >>> # Initializing a Gemma2 gemma2-7b style configuration
    >>> configuration = Gemma2Config()
    >>> # Initializing a model from the gemma2-7b style configuration
    >>> model = Gemma2Model(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemma2past_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormc                 F   t        |   d||||d| || _        |	| _        || _        || _        || _        || _        || _        || _	        |
| _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        y )N)pad_token_idbos_token_ideos_token_idtie_word_embeddings )super__init__
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headshead_dimnum_key_value_headsinitializer_rangerms_norm_eps	use_cache
rope_thetaattention_biasattention_dropouthidden_activationquery_pre_attn_scalarsliding_windowfinal_logit_softcappingattn_logit_softcappingcache_implementation)selfr7   r9   r:   r;   r<   r>   r=   rE   r8   r?   r@   rA   r0   r2   r1   r3   rB   rC   rD   rF   rG   rH   rI   rJ   kwargs	__class__s                             {/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/gemma2/modular_gemma2.pyr6   zGemma2Config.__init__   s    8 	 	
%%% 3		

 	
 %'>$&!2!2#6  #6 !2("$,!2!2%:",'>$&<#$8!    )i  i 	  i $              gelu_pytorch_tanhi    g{Gz?gư>Tr      r   Tg     @F        rS   i   g      >@g      I@hybrid)
__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr6   __classcell__rM   s   @rN   r#   r#   6   s    FP J#4"5%.%.%.%."+ )"+ &(9:#%568IJ!"_$56 - $ ! $#%369 69rO   r#   c                       e Zd Zy)Gemma2RMSNormN)rX   rY   rZ   r4   rO   rN   rc   rc      s    rO   rc   c                        e Zd Z fdZ xZS )	Gemma2MLPc                 R    t         |           t        |j                     | _        y N)r5   r6   r	   rE   act_fnrK   configrM   s     rN   r6   zGemma2MLP.__init__   s     V556rO   rX   rY   rZ   r6   r`   ra   s   @rN   re   re      s    7 7rO   re   modulequerykeyvaluer+   dropoutscalingsoftcapreturnc                    || j                   dz  }t        || j                        }	t        || j                        }
t        j                  ||	j                  dd            |z  }|||z  }t        j                  |      }||z  }|#|d d d d d d d |	j                  d   f   }||z   }t        j                  j                  |dt        j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||
      }|j                  dd      j!                         }||fS )	N      r   r   )dimdtype)ptrainingrU   )r=   r   num_key_value_groupstorchmatmul	transposetanhshapenn
functionalsoftmaxfloat32tory   rp   r{   
contiguous)rl   rm   rn   ro   r+   rp   rq   rr   rL   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                 rN   eager_attention_forwardr      sA    //4'3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL#g-zz,/#g-!$Q1.D
0@0@0D.D%DE#k1 ==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$rO   c                   2    e Zd Zdedef fdZ	 	 ddej                  deej                  ej                  f   de	ej                     de	e
   de	ej                     d	ee   d
eej                  e	ej                     e	eej                        f   fdZ xZS )Gemma2Attentionrj   	layer_idxc                    t         |   ||       | j                  j                  | _        | j                  j                  | _        d| _        |j                  dz  | _        t        |dz        s|j                  | _	        y d | _	        y )NTru   r   )
r5   r6   rj   rI   rD   	is_causalrF   rq   boolrG   rK   rj   r   rM   s      rN   r6   zGemma2Attention.__init__   sp    +&*kk&H&H#!%!>!>33T9;?	A;Nf33TXrO   r*   position_embeddingsr+   past_key_valuecache_positionrL   rs   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|~|||| j                  d}|j                  |
|| j                  |      \  }
}|J| j                  j                  dk(  r1|j                   d   }|
d d d d d |d d f   |d d d d d |d d f   }}
t        }| j                  j                  dk7  r^| j                  j                  dk(  r(|j                  dd	      rt        j!                  d
       nt"        | j                  j                     } || |	|
||f| j$                  r| j&                  nd| j(                  | j                  | j*                  d|\  }} |j,                  g |d j/                         }| j1                  |      }||fS )Nrw   rU   r   )sincosr   rG   flash_attention_2eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rV   )rp   rq   rG   rr   )r   r=   q_projviewr   k_projv_projr   rG   updater   rj   _attn_implementationr   getloggerwarning_oncer   r{   rD   rq   rI   reshaper   o_proj)rK   r*   r   r+   r   r   rL   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsseq_lenattention_interfacer   r   s                      rN   forwardzGemma2Attention.forward   sa    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j% "0"&"5"5	L (6'<'<ZW[WeWegs't$J )dkk.N.NRe.e(..r2+5aHWHa6G+H,WXZ[]e^e]eghWhJiL
(?;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7%
 /3mmD**LL..//%
 %
!\ *k));;;;FFHkk+.L((rO   )NN)rX   rY   rZ   r#   intr6   r}   Tensorr   r   r
   
LongTensorr   r   r   r`   ra   s   @rN   r   r      s    Y| Y Y +/59;)||;) #5<<#=>;) !.	;)
 !;) !!1!12;) -.;) 
u||Xell3XeELL>Q5RR	S;)rO   r   c                   x    e Zd Zdedef fdZ edd      	 	 	 	 	 	 ddej                  de	ej                  ej                  f   d	e
ej                     d
e
ej                     de
e   de
e   de
e   de
ej                     de	ej                  e
e	ej                  ej                  f      f   fd       Z xZS )Gemma2DecoderLayerrj   r   c                    t         |           |j                  | _        || _        t	        |dz         | _        t        ||      | _        t        |      | _	        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        |j                   | _        y )Nr   )rj   r   )eps)r5   r6   r9   rj   r   
is_slidingr   	self_attnre   mlprc   r@   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormrG   r   s      rN   r6   zGemma2DecoderLayer.__init__>  s    !--"9q=11()LV$,V-?-?VEXEXY(5f6H6HfNaNa(b%)6v7I7IvObOb)c&*78J8JPVPcPc*d'$33rO   last_cache_positionz4.53.0)versionr*   r   r+   position_idsr   r   rA   r   rs   c	                    | j                   r?|<t        |j                  d   | j                        }
| j                  j
                  dk(  r|d d |
 d f   }nt        j                  |j                        j                  }t        j                  t        j                  |t        j                        | j                         }t        j                  |||      }|d   |
z
  dz   }t        j                  |d      }t        j                  t        |
|j                  d         |j                         }||z  }|d d d d d d |f   }|}| j#                  |      } | j$                  d
||||||||d	|	\  }}| j'                  |      }||z   }|}| j)                  |      }| j+                  |      }| j-                  |      }||z   }|f}|r||fz  }|S )Nr   r   ry   )diagonalrw   rU   )mindevice)r*   r   r+   r   r   r   rA   r   r4   )r   maxr   rG   rj   r   r}   finfory   r   tril	ones_liker   whereclamparanger   r   r   r   r   r   r   )rK   r*   r   r+   r   r   r   rA   r   rL   effective_seq_len	min_dtypesliding_window_maskoffsetmask_indexesresidualself_attn_weightsoutputss                     rN   r   zGemma2DecoderLayer.forwardL  s    ??~9 #N$8$8$;T=P=P Q {{//3FF!/4E3E3F0F!G "KK(<(<=AA	&+jjOON%**EQUQdQdPd'# "'-@)^!\'+.??!CV3  %||)>+?+?+CD^MbMb  &!/1a0E!F ,,]; ,:4>> 
,
' 3)%)/)
,
 
,
(( 55mD =0 66}E/77F =0 ")++GrO   )NNNFFN)rX   rY   rZ   r#   r   r6   r   r}   r   r   r   r   r
   r   FloatTensorr   r`   ra   s   @rN   r   r   =  s   4| 4 4 *H=
 2637*.,1$)59E||E #5<<#=>E !.	E
 u//0E !E $D>E D>E !!1!12E 
u  (51B1BEDUDU1U+V"WW	XE >ErO   r   c                       e Zd Zdef fdZ	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee	   deej                     dee   d	ee   d
ee   deej                     dee   defdZ ej                          	 ddeej                  df   dej                  dej                  de	d	ef
d       Z xZS )Gemma2Modelrj   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        y c c}w rg   )r5   r6   r   
ModuleListranger;   r   r-   r   s      rN   r6   zGemma2Model.__init__  sD     mmDI&JbJbDcdy	2d
ds   Ar(   r+   r   r%   r)   rA   r   output_hidden_statesr   flash_attn_kwargsrs   c
                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|rL|J| j                  s>|j                  \  }}}t        | j                   |||j                  | j                        }|	F||j                         nd}t        j                   |||j                  d   z   |j                        }	||	j#                  d      }| j%                  |||	||      }|}| j'                  ||      }t        j(                  | j                   j*                  dz  |j                  	      }||z  }|rd
nd }|rd
nd }| j,                  d | j                   j.                   D ]r  }|r||fz  }| j
                  r:| j                  r.| j1                  t3        |j4                  fi |
||||||||		      }n ||f|||||||	d|
}|d   }|sj||d   fz  }t | j7                  |      }|r||fz  }t9        ||||      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)max_batch_sizemax_cache_lenry   r   r   rU   r   g      ?r   r4   )r   r+   r   r   r   rA   r   )last_hidden_stater%   r*   
attentions)rj   r   r   rA   
ValueErrorgradient_checkpointingr{   r   r   r,   r   r   ry   r   get_seq_lengthr}   r   	unsqueeze_update_causal_mask
rotary_embtensorr9   r-   r;   _gradient_checkpointing_funcr   __call__r.   r   )rK   r(   r+   r   r%   r)   rA   r   r   r   r   
batch_sizer   _past_seen_tokensr   r*   r   
normalizerall_hidden_statesall_self_attnsdecoder_layerlayer_outputss                          rN   r   zGemma2Model.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0%2%8%8"J))%#)){{O !CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L..M>?L]

 & #oom\J
 \\$++"9"93">mFYFYZ
%
2 #7BD0d![[)H4;;+H+HI  	6M#!m%55!**t}} $ A AM22H6GH!' #%"
! !.!
!(;#.!-#2&7'#1
! (
! *!,M =#3"55A 	6D 		-0-!11&+++%	
 	
rO   r    input_tensorc           
         | j                   j                  dk(  r|S | j                   j                  dk(  r't        |t        j                        rt        |      }|S |j                  |j                  }}|j                  d   }t        |t        t        f      r|j                         }	n ||j                  d   n|j                  d   }	| j                  |||	||||j                  d         }
|
S )Nr   flex_attentionrU   rw   r   sequence_lengthtarget_lengthry   r   r   r   )rj   r   
isinstancer}   r   r!   ry   r   r   r   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_position)rK   r+   r   r   r%   r   ry   r   r   r   r   s              rN   r   zGemma2Model._update_causal_mask  s     ;;++/BB!!;;++/??.%,,7!<^!L!!$**L,?,?v&,,Q/o['AB+??AM8F8RN004XdXjXjklXmM PP+')#))!, Q 
 rO   )	NNNNNNNNN)F)rX   rY   rZ   r#   r6   r   r}   r   r   r   r   r   r   r   r   r   no_gradr   r   r`   ra   s   @rN   r   r     s]   
| 
 1515371559$(,0/359s
E,,-s
 !.s
 u//0	s

 "+.s
   1 12s
 D>s
 $D>s
 'tns
 !!1!12s
 $$89s
 
!s
j U]]_ #($ellK78$ ll$ 	$
 %$  $ $rO   r   c                   Z    e Zd Z fdZ	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee   deej                     deej                     dee
   d	ee
   d
ee
   deej                     deeej                  f   defdZ	 	 	 	 	 	 	 d fd	Z xZS )Gemma2ForCausalLMc                 d    t         |   |       t        |      | _        | j	                          y rg   r5   r6   r   model	post_initri   s     rN   r6   zGemma2ForCausalLM.__init__:  &      (
rO   r(   r+   r   r%   r)   labelsrA   r   r   r   logits_to_keeprs   c                 .   | j                   rF| j                  j                  dk7  r-t        j	                  d| j                  j                   d       ||n| j                  j
                  }|	|	n| j                  j                  }	 | j                  d||||||||	|
d	|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }| j                  j                  G|| j                  j                  z  }t        j                  |      }|| j                  j                  z  }d}| | j                   ||| j"                  fi |}t%        |||j&                  |j(                  |j*                        S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM

        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```r   zhIt is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `zp`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.N)	r(   r+   r   r%   r)   rA   r   r   r   )losslogitsr%   r*   r   r4   )r{   rj   r   r   r   r   r   r  r   r   r   slicelm_headrH   r}   r   loss_functionr7   r   r%   r*   r   )rK   r(   r+   r   r%   r)   r  rA   r   r   r   r  loss_kwargsr   r*   slice_indicesr
  r	  s                     rN   r   zGemma2ForCausalLM.forward?  s   B ==T[[==H#{{??@  Aqr 2C1N-TXT_T_TqTq$8$D $++JjJj 	 ,64:: ,
)%+'/!5),
 ,
  118B>SV8W~ot4]kmA}a,?@A;;..:dkkAAAFZZ'FdkkAAAF%4%%ffdooUUD%#33!//))
 	
rO   c	                    t        |   |f|||||||d|	}
||
j                  dd       }t        |t              r|j
                  dk(  r| j                  j                  dk(  s|
d   #|
d   j                  \  }}}|
d   j                  }n!|
d   j                  \  }}|
d   j                  }| j                  j                  |||j                         | j                  j                  j                  |||      }||
d<   |
S )	N)r%   r+   r)   r   r   rA   r  r  r   r   r)   r(   r   r+   )r5   prepare_inputs_for_generationpopr   r   ndimrj   r   r   r   r  r   r   r  weightry   )rK   r(   r%   r+   r)   r   r   rA   r  rL   model_inputsr   r   r   r   rM   s                  rN   r  z/Gemma2ForCausalLM.prepare_inputs_for_generation  s5    w<

+)')%)

 

 !  !148A 4##q(KK448KKO,81=o1N1T1T.
OQ%o6==.:;.G.M.M+
O%k299!ZZ]] /-AACll))//-% ^ N .<L)*rO   )NNNNNNNNNNr   )NNNNNTN)rX   rY   rZ   r6   r   r}   r   r   r   r   r   r   r   r   r   r  r`   ra   s   @rN   r   r   9  s4    1515371559-1$(,0/35934K
E,,-K
 !.K
 u//0	K

 "+.K
   1 12K
 ))*K
 D>K
 $D>K
 'tnK
 !!1!12K
 c5<</0K
 
 K
` 4 4rO   r   c                        e Zd Z fdZ xZS )Gemma2ForSequenceClassificationc                 d    t         |   |       t        |      | _        | j	                          y rg   r  ri   s     rN   r6   z(Gemma2ForSequenceClassification.__init__  r  rO   rk   ra   s   @rN   r  r         rO   r  c                        e Zd Z fdZ xZS )Gemma2ForTokenClassificationc                 d    t         |   |       t        |      | _        | j	                          y rg   r  ri   s     rN   r6   z%Gemma2ForTokenClassification.__init__  r  rO   rk   ra   s   @rN   r  r    r  rO   r  )r#   r   r   Gemma2PreTrainedModelr  r  )rV   NN)@	functoolsr   typingr   r   r   r   r}   torch.nnr   torch.utils.checkpointactivationsr	   cache_utilsr
   r   r   configuration_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   utils.deprecationr   gemma.modeling_gemmar   r   r   r   r   r   r   r   r   !torch.nn.attention.flex_attentionr    integrations.flex_attentionr!   
get_loggerrX   r   r#   rc   re   Moduler   floatr   r   r   r   r   r  r  __all__r4   rO   rN   <module>r2     s     3 3    ! : : 3 B O 5 & : 0
 
 
  !;J 
		H	%P9# P9f	L 	7 7 ## %II %<< % 
 % <<	 %
 U\\* %  % e_ % e_ % 5<<%& %FD)n D)NU Upa* aHG( GT&D #> rO   