
    UhA                        d dl mZmZmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
mZ ddlmZmZmZ dd	lmZ dd
lmZmZmZ ddlmZmZmZmZmZmZ ddlmZ  ej>                  e       Z! G d de      Z" G d de      Z# G d de      Z$ G d de      Z% G d de      Z& G d de      Z' G d de      Z( G d de      Z)g dZ*y)    )ListOptionalTupleUnionN)nn   )Cache)BaseModelOutputWithPastMoeModelOutputWithPast)auto_docstringcan_return_tuplelogging   )BambaConfig)
BambaMixerBambaRMSNormGated HybridMambaAttentionDynamicCache)GraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedPreTrainedModel   )GraniteMoeHybridConfigc                   (     e Zd Zdedef fdZ xZS )GraniteMoeHybridAttentionconfig	layer_idxc                 &    t         |   ||       y Nsuper__init__selfr   r   	__class__s      /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr$   z"GraniteMoeHybridAttention.__init__)   s    +    __name__
__module____qualname__r   intr$   __classcell__r'   s   @r(   r   r   (   s    ,5 ,# , ,r)   r   c                   (     e Zd Zdedef fdZ xZS )GraniteMoeHybridMambaLayerr   r   c                 8    t         |   t        |      |       y r!   )r#   r$   r   r%   s      r(   r$   z#GraniteMoeHybridMambaLayer.__init__.   s    V,i8r)   r*   r0   s   @r(   r2   r2   -   s    95 9# 9 9r)   r2   c                         e Zd Zd fd	Z xZS )GraniteMoeHybridRMSNormGatedc                 &    t         |   ||       y r!   r"   )r&   hidden_sizeepsr'   s      r(   r$   z%GraniteMoeHybridRMSNormGated.__init__3   s    c*r)   )gư>)r+   r,   r-   r$   r/   r0   s   @r(   r5   r5   2   s    + +r)   r5   c                   $     e Zd Zdef fdZ xZS )GraniteMoeHybridMLPr   c                 $    t         |   |       y r!   r"   r&   r   r'   s     r(   r$   zGraniteMoeHybridMLP.__init__8   s     r)   )r+   r,   r-   r   r$   r/   r0   s   @r(   r:   r:   7   s    !5 ! !r)   r:   c                   R    e Zd Zdedef fdZ	 	 	 	 	 	 	 ddej                  deej                     dee	   dee
   dee
   d	eej                     d
ee
   deeej                  ej                  f      deej                  eeej                  ej                  f      f   fdZ xZS )GraniteMoeHybridDecoderLayerr   r   c                     t         |   ||       t        |      | _        d | _        d | _        |j                  |   dk(  rt        ||      | _        nt        ||      | _        |j                  |   | _	        y )Nmamba)
r#   r$   r:   
shared_mlp	self_attnr@   layers_block_typer2   r   
layer_typer%   s      r(   r$   z%GraniteMoeHybridDecoderLayer.__init__=   sm    +-f5
##I.'93FIFDJ6vyIDN 229=r)   hidden_statesattention_maskpast_key_valueoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingsreturnc	                    |}
| j                  |      }| j                  | j                  ||||      }d}n | j                  d|||||||d|	\  }}}|
|| j                  z  z   }|}
| j	                  |      }| j                  |      \  }}|| j                  |      z   }|
|| j                  z  z   }|f}|r||fz  }|r||fz  }|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        N)rE   rJ   cache_paramsrF   )rE   rF   rG   rH   rI   rJ   rL    )input_layernormr@   rB   residual_multiplierpost_attention_layernormblock_sparse_moerA   )r&   rE   rF   rG   rH   rI   rJ   rK   rL   kwargsresidualself_attn_weights_moe_hidden_statesrouter_logitsoutputss                   r(   forwardz$GraniteMoeHybridDecoderLayer.forwardJ   s1   J !,,];::! JJ+-+-	 ' M !%2@$.. 	3+--"3#-$7	3 	3/M,a !=43K3K#KK !55mD+/+@+@+O(=)DOOM,JJ =43K3K#KK ")++G((G''Gr)   )NNFFNFN)r+   r,   r-   r   r.   r$   torchTensorr   r	   bool
LongTensorr   FloatTensorr\   r/   r0   s   @r(   r>   r>   <   s    >5 ># >  26*.,1$)59/4KOR||R !.R !	R
 $D>R D>R !!1!12R 'tnR &eELL%,,,F&GHR 
u  (51B1BEDUDU1U+V"WW	XRr)   r>   c                   ,     e Zd ZeZdgZdZ fdZ xZS )GraniteMoeHybridPreTrainedModelr>   Tc                    t         |           t        |t        j                        rm|j
                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          y y t        |t              r|j                  j                  j                  d       t        j                   t        j"                  d|j$                  dz               |j&                  _        |j(                  j                  j                  d       y t        |t*              r&|j
                  j                  j                  d       y y )Ng        )meanstdg      ?r   )r#   _init_weights
isinstancer   Conv1dweightdatanormal_r   initializer_rangebiaszero_r2   dt_biasfill_r]   logarange	num_headsA_logDr5   )r&   moduler'   s     r(   rg   z-GraniteMoeHybridPreTrainedModel._init_weights   s    fryy*MM&&CT[[5R5R&S{{&  &&( ' :;NN%%c* %		%,,q&:J:JQ:N*O PFLLHHMM$ <=MM$$S) >r)   )	r+   r,   r-   r   config_class_no_split_modules_is_statefulrg   r/   r0   s   @r(   rc   rc      s     )L78L* *r)   rc   c                   \    e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 	 	 ddej                  de	ej                     de	ej                     de	eeeej                     f      de	ej                     de	e   d	e	e   d
e	e   de	e   de	e   de	ej                     deeef   fd              Zd Z xZS )GraniteMoeHybridModelr   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        y c c}w r!   )r#   r$   r   
ModuleListrangenum_hidden_layersr>   layersr%   s      r(   r$   zGraniteMoeHybridModel.__init__   sE     mmNSTZTlTlNmn)&)<n
ns   A	input_idsrF   position_idspast_key_valuesinputs_embedsrI   rH   output_hidden_statesrK   return_dictrJ   rM   c                 |   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|
|
n| j                   j                  }
|d u |d uz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|| j                  z  }|r|t        j                  d       |F||j                         nd}t        j                  |||j                  d   z   |j                         }||j#                  d      }| j%                  |||||      }| j'                  ||      }|}d }| j(                  | j)                  ||      }|rdnd }|rdnd }|	rdnd }d }| j*                  D ]_  }|j,                  d	k(  r|n|}|r||fz  } ||||||||	|
      }|d   }|r	||rdnd   }|r|d   	||d   fz  }|	sQ|d   W||d   fz  }a | j/                  |      }|r||fz  }|r|nd }t1        |||||      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzGraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. Because one was not provided, no cache will be returned.r   r   devicerP   r@   )rF   rG   rH   rI   rJ   rK   rL   r   )last_hidden_stater   rE   
attentionsrZ   )r   rH   r   rI   use_return_dict
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensembedding_multiplierget_seq_lengthr]   rs   shaper   	unsqueeze_update_causal_mask_update_mamba_mask
rotary_embr   rD   normr   )r&   r   rF   r   r   r   rI   rH   r   rK   r   rJ   past_seen_tokenscausal_mask
mamba_maskrE   rL   all_hidden_statesall_self_attnsall_router_logitsnext_decoder_cachedecoder_layer
layer_masklayer_outputs
next_caches                            r(   r\   zGraniteMoeHybridModel.forward   s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==Yj I  --i8M%(A(AA 0K
 !CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L..M>?L]
 ,,^^L
 &"??&"&//-"N #7BD0d"6BD!![[ 	>M'4'?'?7'JP[J#!m%55!))."3#-%9$7	M *!,M%28I1q%Q"  #/"}Q'7&99N# $0%-*;)==%?	>B 		-0  -!11+4'$
%+&+%+
 	
r)   c                 R    |}|d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r]   all)r&   rF   rJ   r   s       r(   r   z(GraniteMoeHybridModel._update_mamba_mask2  s7     $
!q ^%?EIIn`aNaDbJr)   )NNNNNNNNNNN)r+   r,   r-   r   r$   r   r   r]   r`   r   r^   r   r	   r   ra   r_   r   r
   r\   r   r/   r0   s   @r(   r|   r|      sB   
5 
  '+1537KO59$(,0/3/3&*59t
##t
 !.t
 u//0	t

 "%tE4E4E/F(F"GHt
   1 12t
 D>t
 $D>t
 'tnt
 'tnt
 d^t
 !!1!12t
 
u--	.t
  t
l	r)   r|   c                   J     e Zd ZdgZdef fdZ	 	 	 	 	 	 ddZdefdZ xZ	S )GraniteMoeHybridForCausalLMzlm_head.weightr   c                 d    t         |   |       t        |      | _        | j	                          y r!   )r#   r$   r|   model	post_initr<   s     r(   r$   z$GraniteMoeHybridForCausalLM.__init__A  s&     *62
r)   c                 J   |d u }	|	sZ||d   |j                   d   k\  r|d d |j                   d    d f   }nc|j                   d   |j                   d   k7  rD|d d |f   }n:t        | j                  |j                   d   | j                  | j                        }|T|R|j                         j                  d      dz
  }|j                  |dk(  d       |	s|d d |j                   d    d f   }||	rd|i}
nd|j                         i}
|
j                  |||||d       |
S )Nr   r   r   r   r   r   )r   r   rI   rF   rJ   )
r   r   r   dtyper   longcumsummasked_fill_
contiguousupdate)r&   r   r   rF   r   rJ   r   rI   rU   empty_past_kvmodel_inputss              r(   prepare_inputs_for_generationz9GraniteMoeHybridForCausalLM.prepare_inputs_for_generationG  sT    (4/ )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	>Y__Q/DKKO %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0"0	
 r)   rM   c                      y)aG  
        Function overwritten as this class uses its own `HybridMambaAttentionDynamicCache`
        and do not need to initialize the Cache in advance in order to save memory
        (because no back and forth `to_legacy_cache` and `from_legacy_cache` will be performed
        for `HybridMambaAttentionDynamicCache`).
        FrP   )r&   s    r(   _supports_default_dynamic_cachez;GraniteMoeHybridForCausalLM._supports_default_dynamic_cache  s     r)   )NNNNNT)
r+   r,   r-   _tied_weights_keysr   r$   r   r_   r   r/   r0   s   @r(   r   r   >  s?    *+5  7r r)   r   )r   r|   rc   )+typingr   r   r   r   r]   r   cache_utilsr	   modeling_outputsr
   r   utilsr   r   r   bamba.configuration_bambar   bamba.modeling_bambar   r   r   *granitemoeshared.modeling_granitemoesharedr   r   r   r   r   r   configuration_granitemoehybridr   
get_loggerr+   r   r   r2   r5   r:   r>   rc   r|   r   __all__rP   r)   r(   <module>r      s     0 /     O > > 3 b b  C 
		H	%, 9 ,
9 9
+#4 +
!- !
`#? `F*&E *(H1 HVI"= IX fr)   