
from typing import Callable, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint

from ...cache_utils import Cache
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...utils import logging
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaMLP,
    LlamaModel,
    LlamaPreTrainedModel,
    LlamaRotaryEmbedding,
    eager_attention_forward,
    rotate_half,
)
from .configuration_olmo import OlmoConfig


logger = logging.get_logger(__name__)
class OlmoLayerNorm(nn.Module):
    """LayerNorm but with no learnable weight or bias."""

    def __init__(self, hidden_size: int) -> None:
        super().__init__()
        self.normalized_shape = (hidden_size,)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        orig_dtype = hidden_states.dtype
        return F.layer_norm(hidden_states.to(dtype=torch.float32), self.normalized_shape, None, None, eps=1e-5).to(
            orig_dtype
        )


class OlmoMLP(LlamaMLP):
    def __init__(self, config):
        super().__init__(config)
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    q_type, k_type = q.dtype, k.dtype
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed.to(q_type), k_embed.to(k_type)
class OlmoAttention(LlamaAttention):
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        if self.config.clip_qkv is not None:
            query_states.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
            key_states.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
            value_states.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)

        query_states = query_states.view(hidden_shape).transpose(1, 2)
        key_states = key_states.view(hidden_shape).transpose(1, 2)
        value_states = value_states.view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class OlmoDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: OlmoConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.input_layernorm = OlmoLayerNorm(config.hidden_size)
        self.post_attention_layernorm = OlmoLayerNorm(config.hidden_size)
        self.self_attn = OlmoAttention(config=config, layer_idx=layer_idx)


class OlmoRotaryEmbedding(LlamaRotaryEmbedding):
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force full-precision RoPE tables
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos, sin


class OlmoPreTrainedModel(LlamaPreTrainedModel):
    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
class OlmoModel(LlamaModel):
    def __init__(self, config: OlmoConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [OlmoDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = OlmoLayerNorm(config.hidden_size)


class OlmoForCausalLM(LlamaForCausalLM):
    pass


__all__ = ["OlmoForCausalLM", "OlmoModel", "OlmoPreTrainedModel"]