
    Uh                        d dl mZmZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
 dd	lmZmZmZmZ d
dlmZ  e
j$                  e      Z G d dej*                        Z G d de      Z G d de      Z G d de      Z G d de      Zg dZy)    )OptionalTupleN)nn   )ACT2FN)Cache)logging   )GraniteMoeDecoderLayerGraniteMoeForCausalLMGraniteMoeModelGraniteMoePreTrainedModel   )GraniteMoeSharedConfigc                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )GraniteMoeSharedMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                 h   t         t        |           |j                  | _        |j
                  | _        t        |j                     | _        t        j                  | j                  | j                  dz  d      | _        t        j                  | j                  | j                  d      | _        y )Nr
   F)bias)superr   __init__hidden_size
input_sizeshared_intermediate_sizer   
hidden_act
activationr   Linearinput_linearoutput_linearselfr   	__class__s     /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.pyr   zGraniteMoeSharedMLP.__init__-   s    !413 ,,!:: !2!23IIdoot7G7G!7KRWXYYt'7'7uU    hidden_statesreturnc                     | j                  |      }|j                  dd      }| j                  |d         |d   z  }| j                  |      }|S )Nr
   )dimr   r   )r   chunkr   r   )r!   r%   chunked_hidden_statess      r#   forwardzGraniteMoeSharedMLP.forward6   s^    ))-8 - 3 3A2 3 >(=a(@ADYZ[D\\**=9r$   )
__name__
__module____qualname____doc__r   r   torchTensorr,   __classcell__r"   s   @r#   r   r   $   s2    V5 VU\\ ell r$   r   c                   r    e Zd Zdedef fdZ	 	 	 	 	 	 	 	 ddej                  deej                     deej                     dee
   dee   d	ee   d
eej                     dee   deeej                  ej                  f      deej                  eeej                  ej                  f      f   fdZ xZS )GraniteMoeSharedDecoderLayerr   	layer_idxc                 t    t         |   ||       |j                  dk(  rd | _        y t        |      | _        y )Nr   )r   r   r   r   
shared_mlpr!   r   r7   r"   s      r#   r   z%GraniteMoeSharedDecoderLayer.__init__?   s3    +"("A"AQ"F$L_`fLgr$   r%   attention_maskposition_idspast_key_valueoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingsr&   c
                 ~   |}| j                  |      } | j                  d||||||||	d|
\  }}}||| j                  z  z   }|}| j                  |      }| j	                  |      \  }}| j
                  |}n|| j                  |      z   }~||| j                  z  z   }|f}|r||fz  }|r||fz  }|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r%   r;   r<   r=   r>   r?   r@   rB    )input_layernorm	self_attnresidual_multiplierpost_attention_layernormblock_sparse_moer9   )r!   r%   r;   r<   r=   r>   r?   r@   rA   rB   kwargsresidualself_attn_weightspresent_key_valuemoe_hidden_statesrouter_logitsoutputss                    r#   r,   z$GraniteMoeSharedDecoderLayer.forwardC   s   L !,,]; ?Mdnn 
?
')%)/) 3
?
 
?
;(*; !=43K3K#KK !55mD+/+@+@+O(=??"-M-0NNM =43K3K#KK ")++G)++G''Gr$   )NNNFFNFN)r-   r.   r/   r   intr   r1   r2   r   
LongTensorr   boolr   FloatTensorr,   r3   r4   s   @r#   r6   r6   >   s   h5 h# h 2637*.,1$)59/4KOR||R !.R u//0	R
 !R $D>R D>R !!1!12R 'tnR &eELL%,,,F&GHR 
u  (51B1BEDUDU1U+V"WW	XRr$   r6   c                       e Zd ZeZdgZy)GraniteMoeSharedPreTrainedModelr6   N)r-   r.   r/   r   config_class_no_split_modulesrD   r$   r#   rV   rV      s    )L78r$   rV   c                   $     e Zd Zdef fdZ xZS )GraniteMoeSharedModelr   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        y c c}w N)r   r   r   
ModuleListrangenum_hidden_layersr6   layersr:   s      r#   r   zGraniteMoeSharedModel.__init__   sE     mmNSTZTlTlNmn)&)<n
ns   A)r-   r.   r/   r   r   r3   r4   s   @r#   rZ   rZ      s    
5 
 
r$   rZ   c                   *     e Zd ZdgZdef fdZ xZS )GraniteMoeSharedForCausalLMzlm_head.weightr   c                 d    t         |   |       t        |      | _        | j	                          y r\   )r   r   rZ   model	post_initr    s     r#   r   z$GraniteMoeSharedForCausalLM.__init__   s&     *62
r$   )r-   r.   r/   _tied_weights_keysr   r   r3   r4   s   @r#   rb   rb      s    *+5  r$   rb   )rb   rZ   rV   )typingr   r   r1   r   activationsr   cache_utilsr   utilsr	   granitemoe.modeling_granitemoer   r   r   r   configuration_granitemoesharedr   
get_loggerr-   loggerModuler   r6   rV   rZ   rb   __all__rD   r$   r#   <module>rq      s     #   !     C 
		H	%")) 4W#9 Wt9&? 9

O 
"7  fr$   