
    Uh2                        d dl mZ d dlmZmZmZ d dlZd dlmZ ddl	m
Z
mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZm Z m!Z! ddl"m#Z#  ejH                  e%      Z&dZ'dZ( G d de      Z) G d de      Z* G d dejV                        Z, G d de      Z- G d de      Z. G d de      Z/ G d de      Z0 G d  d!e      Z1 G d" d#e      Z2g d$Z3y)%    )partial)CallableOptionalTupleN   )CacheDynamicCache)FlashAttentionKwargs)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)logging   )CLIPMLP)	LlamaAttentionLlamaForCausalLMLlamaForSequenceClassificationLlamaForTokenClassification
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward   )	PhiConfigzmicrosoft/phi-1r   c                   (    e Zd Zdedef fdZ	 	 ddej                  deej                  ej                  f   de	ej                     de	e
   de	ej                     d	eej                  e	ej                     e	eej                        f   fd
Z xZS )PhiAttentionconfig	layer_idxc                    t         |   ||       t        j                  |j                  |j
                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _	        t        j                  |j                  |j                  | j                  z  d      | _
        t        j                  |j
                  | j                  z  |j                  d      | _        | `t        | j                  |j                  z        | _        |j                   | _        | j                   r}t        j"                  |j                  |j
                  z  |j$                  d      | _        t        j"                  |j                  |j
                  z  |j$                  d      | _        y y )NTbias)epselementwise_affine)super__init__nnLinearhidden_sizenum_attention_headshead_dimq_projnum_key_value_headsk_projv_projdenseo_projintpartial_rotary_factorrotary_ndimsqk_layernorm	LayerNormlayer_norm_epsq_layernormk_layernormselfr   r   	__class__s      u/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/phi/modular_phi.pyr&   zPhiAttention.__init__%   sb   +ii 2 2F4N4NQUQ^Q^4^eijii 2 2F4N4NQUQ^Q^4^eijii 2 2F4N4NQUQ^Q^4^eijYYv99DMMI6K]K]dhi
K0L0L LM"//!||""f&@&@@fF[F[pt D  "||""f&@&@@fF[F[pt D	     hidden_statesposition_embeddingsattention_maskpast_key_valuecache_positionreturnc                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }| j                  r"| j                  |	      }	| j                  |
      }
|\  }}|	dd | j                  f   |	d| j                  d f   }}|
dd | j                  f   |
d| j                  d f   }}t        ||||      \  }}t        j                  ||fd      }	t        j                  ||fd      }
|'|||d}|j                  |
|| j                  |      \  }
}t         }| j"                  j$                  dk7  r^| j"                  j$                  dk(  r(|j'                  d	d
      rt(        j+                  d       nt,        | j"                  j$                     } || |	|
||f| j.                  sdn| j0                  | j2                  d|\  }} |j4                  g |d j7                         }| j9                  |      }||fS )Nr   r   .)dim)sincosrC   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )dropoutscaling)shaper+   r,   view	transposer.   r/   r5   r8   r9   r4   r   torchcatupdater   r   r   _attn_implementationgetloggerwarning_oncer   trainingattention_dropoutrO   reshape
contiguousr0   )r;   r?   r@   rA   rB   rC   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesrI   rH   	query_rot
query_passkey_rotkey_passcache_kwargsattention_interfaceattn_outputattn_weightss                         r=   forwardzPhiAttention.forward6   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST++L9L))*5J&S 1 1 1112d//112 	
 s/d////0sD--//0 
 2)Wc3O	7 yy)Z!8bAYY2;
%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHjj-L((r>   )NN)__name__
__module____qualname__r   r2   r&   rS   Tensorr   r   r   
LongTensorrl   __classcell__r<   s   @r=   r   r   $   s    y S , +/59A)||A) #5<<#=>A) !.	A)
 !A) !!1!12A) 
u||Xell3XeELL>Q5RR	SA)r>   r   c                       e Zd Zy)PhiMLPNrm   rn   ro    r>   r=   ru   ru   z       r>   ru   c                       e Zd Zdedef fdZ	 	 	 	 	 	 	 ddej                  deej                     deej                     dee
ej                        dee   d	ee   d
eej                     dee
ej                  ej                  f      de
ej                  ee
ej                  ej                  f      f   fdZ xZS )PhiDecoderLayerr   r   c                    t         |           t        ||      | _        t	        |      | _        t        j                  |j                  |j                        | _
        t        j                  |j                        | _        y )N)r   r#   )r%   r&   r   	self_attnru   mlpr'   r6   r)   r7   input_layernormDropoutresid_pdropresid_dropoutr:   s      r=   r&   zPhiDecoderLayer.__init__   s]    %f	B&>!||F,>,>FDYDYZZZ(:(:;r>   r?   rA   position_idsrB   rL   	use_cacherC   r@   rD   c	                     |}
| j                  |      } | j                  d||||||||d|	\  }}| j                  |      }| j                  | j                  |            }||z   |
z   }|f}|r||fz  }|S )N)r?   rA   r   rB   rL   r   rC   r@   rw   )r   r}   r   r~   )r;   r?   rA   r   rB   rL   r   rC   r@   r^   residualattn_outputsself_attn_weightsfeed_forward_hidden_statesoutputss                  r=   rl   zPhiDecoderLayer.forward   s     !,,]; +9$.. 
+
')%)/) 3
+
 
+
'' )),7%)%7%78O%P"$'AAHL ")++Gr>   )NNNFFNN)rm   rn   ro   r   r2   r&   rS   rp   r   rq   r   boolFloatTensorrl   rr   rs   s   @r=   rz   rz   ~   s   <y <S < 26378<,1$)59KO%||% !.% u//0	%
 !u||!45% $D>% D>% !!1!12% &eELL%,,,F&GH% 
u  (51B1BEDUDU1U+V"WW	X%r>   rz   c                       e Zd Zy)PhiRotaryEmbeddingNrv   rw   r>   r=   r   r      rx   r>   r   c                       e Zd Zd Zy)PhiPreTrainedModelc                    | j                   j                  }t        |t        j                        rY|j
                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j
                  j                  j                  d|       |j                  2|j
                  j                  |j                     j                          y y t        |t        j                        rJ|j
                  j                  j                  d       |j                  j                  j                          y y )NrM   )meanstdg      ?)r   initializer_range
isinstancer'   r(   weightdatanormal_r"   zero_	Embeddingpadding_idxr6   fill_)r;   moduler   s      r=   _init_weightsz PhiPreTrainedModel._init_weights   s    kk++fbii(MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> .-MM$$S)KK""$ .r>   N)rm   rn   ro   r   rw   r>   r=   r   r      s    %r>   r   c                       e Zd Zdef fdZ	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee	   deej                     dee   d	ee   d
ee   deej                     dee   defdZ xZS )PhiModelr   c           	      d   t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        t        j                  |j                        | _
        t        j                  |j                  |j                        | _        | `y c c}w )Nr|   )r%   r&   r'   
ModuleListrangenum_hidden_layersrz   layersr   
embd_pdropembed_dropoutr6   r)   r7   final_layernormnormr:   s      r=   r&   zPhiModel.__init__   s     mmAFvG_G_A`aI_VY/a
  ZZ(9(9:!||F,>,>FDYDYZI	 bs   B-	input_idsrA   r   past_key_valuesinputs_embedsr   rL   output_hidden_statesrC   flash_attn_kwargsrD   c
                 4   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|r|
t               }|	F||j                         nd}t        j                  |||j                  d   z   |j                        }	||	j!                  d      }| j#                  |||	||      }| j%                  |      }|}| j'                  ||      }|rdnd }|rdnd }| j(                  d | j                   j*                   D ]r  }|r||fz  }| j
                  r:| j                  r.| j-                  t/        |j0                  fi |
|||||||	|	      }n ||f||||||	|d|
}|d   }|sj||d   fz  }t | j3                  |      }|r||fz  }t5        ||r|nd ||	      S )
Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   )devicerw   )rA   r   rB   rL   r   rC   r@   )last_hidden_stater   r?   
attentions)r   rL   r   r   
ValueErrorgradient_checkpointingrZ   rX   rY   embed_tokensr	   get_seq_lengthrS   arangerP   r   	unsqueeze_update_causal_maskr   
rotary_embr   r   _gradient_checkpointing_funcr   __call__r   r   )r;   r   rA   r   r   r   r   rL   r   rC   r   past_seen_tokenscausal_maskr?   r@   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r=   rl   zPhiModel.forward   s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0*nO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L..M>?L]
 **=9% #oom\J #7BD0d![[)H4;;+H+HI  	6M#!m%55!**t}} $ A AM22H6GH! #%"'
! !.!
!#.!-#2&7'#1(;
! (
! *!,M =#3"55A 	6D ,,];  -!11&+/8Od+%	
 	
r>   )	NNNNNNNNN)rm   rn   ro   r   r&   r   rS   rq   rp   r   r   r   r   r
   r   rl   rr   rs   s   @r=   r   r      s    y  151537+/59$(,0/359f
E,,-f
 !.f
 u//0	f

 "%f
   1 12f
 D>f
 $D>f
 'tnf
 !!1!12f
 $$89f
 
!f
r>   r   c                        e Zd Z fdZ xZS )PhiForCausalLMc                     t         |   |       t        j                  |j                  |j
                  d      | _        y )NTr!   )r%   r&   r'   r(   r)   
vocab_sizelm_head)r;   r   r<   s     r=   r&   zPhiForCausalLM.__init__6  s0     yy!3!3V5F5FTRr>   )rm   rn   ro   r&   rr   rs   s   @r=   r   r   5  s    S Sr>   r   c                       e Zd Zy)PhiForSequenceClassificationNrv   rw   r>   r=   r   r   ;  rx   r>   r   c                       e Zd Zy)PhiForTokenClassificationNrv   rw   r>   r=   r   r   ?  rx   r>   r   )r   r   r   r   r   )4	functoolsr   typingr   r   r   rS   torch.nnr'   cache_utilsr   r	   modeling_flash_attention_utilsr
   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   clip.modeling_clipr   llama.modeling_llamar   r   r   r   r   r   r   r   r   configuration_phir   
get_loggerrm   rX   _CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   ru   Modulerz   r   r   r   r   r   r   __all__rw   r>   r=   <module>r      s     , ,   . B 6 &  (
 
 
 ) 
		H	%' S)> S)l	W 	-bii -`	- 	%- % p
z p
fS% S	#A 		 ; 	r>   