
    Uh                        d Z ddlZddlmZmZmZmZ ddlZddlZddlm	Z	 ddl
mZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZ  ej6                  e      Z G d de	j<                        Z G d de	j@                        Z! G d de	j@                        Z" G d de	j@                        Z#e G d de             Z$e G d de$             Z% ed       G d de$e             Z&g dZ'y)zPyTorch XGLM model.    N)ListOptionalTupleUnion)nn   )ACT2FN)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)auto_docstringlogging   )
XGLMConfigc            
       `     e Zd ZdZd	dedededee   f fdZdej                  f fdZ
 xZS )
XGLMScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scalec                 6    t         |   |||       || _        y N)super__init__r   )selfr   r   r   r   	__class__s        x/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/xglm/modeling_xglm.pyr   z XGLMScaledWordEmbedding.__init__*   s    D&    	input_idsc                 <    t         |   |      | j                  z  S r   )r   forwardr   )r   r"   r   s     r    r$   zXGLMScaledWordEmbedding.forward.   s    wy)D,<,<<<r!   )      ?)__name__
__module____qualname____doc__intr   floatr   torchTensorr$   __classcell__r   s   @r    r   r   %   sE    's '3 'S '_ghm_n '= = =r!   r   c            	            e Zd ZdZddededee   f fdZddededee   fdZeddededee   fd       Z	 e
j                         dd	ee
j                     d
efd       Z xZS )!XGLMSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.num_positionsr   r   c                     t         |           d| _        || _        || _        | j                  || j                  z   ||       y )N   )r   r   offsetr   r   make_weights)r   r2   r   r   r   s       r    r   z*XGLMSinusoidalPositionalEmbedding.__init__5   s@    *&-$++5}kRr!   r   c                     | j                  |||      }t        | d      r;|j                  | j                  j                  | j                  j
                        }| j                  d|d       y )NweightsdtypedeviceF)
persistent)get_embeddinghasattrtor8   r:   r;   register_buffer)r   r   r   r   emb_weightss        r    r6   z.XGLMSinusoidalPositionalEmbedding.make_weights<   s[    ((T4#%..t||/A/A$,,J]J].^KYFr!   c                    |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        j                         | z        }t        j                  | t        j
                        j                         j                  d      |j                  d      z  }t        j                  t        j                  |      t        j                  |      gd      j                  | d      }|dz  dk(  r-t        j                  |t        j                  | d      gd      }|	d||ddf<   |j                  t        j                               S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        r4   i'  r   )r:   r   dimN)mathlogr,   exparangeint64r+   	unsqueezecatsincosviewzerosr?   get_default_dtype)r   r   r   half_dimembs        r    r=   z/XGLMSinusoidalPositionalEmbedding.get_embeddingD   s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r!   position_idspast_key_values_lengthc                    |j                         \  }}|| j                  z  }d|z   |z   }|| j                  j                  d      kD  r'| j                  || j                  | j
                         | j                  j                  d|j                  d            j                  ||| j                  j                  d         j                         S )Nr4   r   rE   )
sizer5   r8   r6   r   r   index_selectrO   shapedetach)r   rT   rU   bszseq_lenmax_poss         r    r$   z)XGLMSinusoidalPositionalEmbedding.forwardY   s    #((*W# g+ 66T\\&&q))gt'9'94;K;KL||((L,=,=b,ABGGWVZVbVbVhVhikVlmttvvr!   r   )Nr   )r&   r'   r(   r)   r*   r   r   r6   staticmethodr=   r,   no_gradr-   r$   r.   r/   s   @r    r1   r1   2   s    NSc S# SHUXM SG3 Gs GQYZ]Q^ G 1c 1# 1HUXM 1 1( U]]_	wHU\\$: 	w[^ 	w 	wr!   r1   c                   t    e Zd ZdZ	 	 	 ddededededef
 fdZdej                  d	ed
efdZ
	 	 	 	 	 ddej                  deej                     deeej                        deej                     deej                     dedeej                  eej                     eeej                        f   fdZ xZS )XGLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsdropout
is_decoderbiasc                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rf   )r   r   rb   rc   rd   head_dim
ValueErrorscalingre   r   Lineark_projv_projq_projout_proj)r   rb   rc   rd   re   rf   r   s         r    r   zXGLMAttention.__init__i   s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$ii	94@ii	94@ii	94@		)YTBr!   tensorr\   r[   c                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr   r4   )rO   rc   ri   	transpose
contiguous)r   rq   r\   r[   s       r    _shapezXGLMAttention._shape   s7    {{3GQQRSUVWbbddr!   hidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskoutput_attentionsreturnc                 L   |du}|j                         \  }}	}
| j                  |      | j                  z  }|r||d   }|d   }n
|rE| j                  | j	                  |      d|      }| j                  | j                  |      d|      }n|}| j                  | j	                  |      d|      }| j                  | j                  |      d|      }t        j                  |d   |gd      }t        j                  |d   |gd      }nD| j                  | j	                  |      d|      }| j                  | j                  |      d|      }| j                  r||f}|| j                  z  d| j                  f} | j                  ||	|      j                  | } |j                  | } |j                  | }|j                  d      }t        j                  ||j                  dd            }|j                         || j                  z  |	|fk7  r/t        d|| j                  z  |	|f d|j                                ||j                         |d|	|fk7  r#t        d	|d|	|f d|j                                |j                  || j                  |	|      |z   }t        j                  |t        j                   t        j"                  |j$                        j&                  |j(                  
            }|j                  || j                  z  |	|      }|j$                  t        j*                  k(  rNt,        j.                  j1                  |dt        j2                        j5                  t        j*                        }n!t,        j.                  j1                  |d      }||j                         | j                  fk7  r*t        d| j                  f d|j                                |j                  dddd      |j                  || j                  |	|      z  }|j                  || j                  z  |	|      }|r?|j                  || j                  |	|      }|j                  || j                  z  |	|      }nd}t,        j.                  j7                  || j6                  | j8                        }t        j                  ||      }|j                         || j                  z  |	| j                  fk7  r7t        d|| j                  |	| j                  f d|j                                |j                  || j                  |	| j                        }|j                  dd      }|j;                  ||	| j<                        }| j?                  |      }|||fS )z#Input shape: Batch x Time x ChannelNr   r   rE   r4   rC   z$Attention weights should be of size z	, but is z!Attention mask should be of size )r;   )rD   r:   z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size ) rW   ro   rk   ru   rm   rn   r,   rL   re   rc   ri   rO   bmmrs   rj   maxrq   finfor:   minr;   float16r   
functionalsoftmaxfloat32r?   rd   r   reshaperb   rp   )r   rv   rw   rx   ry   rz   r{   is_cross_attentionr[   tgt_len_query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                       r    r$   zXGLMAttention.forward   sV    .T9',,.Wa {{=1DLL@."<'*J)!,LT[[1A%BBLJ;;t{{3C'Db#NL'T[[%?SIJ;;t{{='A2sKLN1$5z#BJJ 99nQ&7%FANL T[[%?SIJ;;t{{='A2sKL?? ),7NDNN*B>
Ct{{<#>CCZP$Z__j1
(|((*5//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL 99ell5;;|7I7I+J+N+NWcWjWjkL (,,S4>>-A7GTL .==002U]]0[^^_d_l_lmL==0020FL&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVZVdVdfmov?wwL',,S4>>-A7GTL
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1 "))#wGmmK01>AAr!   )        FT)NNNNF)r&   r'   r(   r)   r*   r+   boolr   r,   r-   ru   r   r   r$   r.   r/   s   @r    ra   ra   f   s,   G  CC C 	C
 C C6eU\\ eC ec e 488<1526"'vB||vB #5<<0vB !u||!45	vB
 !.vB "%,,/vB  vB 
u||Xell3XeELL>Q5RR	SvBr!   ra   c                   <    e Zd Zdef fdZ	 	 	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     deej                     d	eeej                        d
ee	   dee	   dej                  fdZ
 xZS )XGLMDecoderLayerconfigc                 ,   t         |           |j                  | _        t	        | j                  |j
                  |j                  d      | _        |j                  | _        t        |j                     | _        |j                  | _        |j                  rVt	        | j                  |j
                  |j                  d      | _        t        j                   | j                        | _        t        j                   | j                        | _        t        j&                  | j                  |j(                        | _        t        j&                  |j(                  | j                        | _        t        j                   | j                        | _        y )NT)rb   rc   rd   re   )r   r   d_modelrb   ra   attention_headsattention_dropout	self_attnrd   r	   activation_functionactivation_fnactivation_dropoutadd_cross_attentionencoder_attnr   	LayerNormencoder_attn_layer_normself_attn_layer_normrl   ffn_dimfc1fc2final_layer_normr   r   r   s     r    r   zXGLMDecoderLayer.__init__  s   &nn,,,,	
 ~~#F$>$>?"(";";%% -.. 0000	!D ,.<<+GD($&LL$@!99T^^V^^<99V^^T^^< "T^^ <r!   rv   ry   encoder_hidden_statesencoder_attention_maskrz   cross_attn_layer_head_maskrx   r{   	use_cacher|   c
                 t   |}
| j                  |      }||dd nd}| j                  |||||      \  }}}t        j                  j	                  || j                  | j
                        }|
|z   }d}d}|w|}
| j                  |      }||dd nd}| j                  ||||||      \  }}}t        j                  j	                  || j                  | j
                        }|
|z   }||z   }|}
| j                  |      }| j                  | j                  |            }t        j                  j	                  || j                  | j
                        }| j                  |      }t        j                  j	                  || j                  | j
                        }|
|z   }|f}|r|||fz  }|	r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        Nr4   )rv   rx   ry   rz   r{   r~   )rv   rw   ry   rz   rx   r{   )r   r   r   r   rd   r   r   r   r   r   r   r   r   )r   rv   ry   r   r   rz   r   rx   r{   r   residualself_attn_past_key_valueself_attn_weightspresent_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_valueoutputss                     r    r$   zXGLMDecoderLayer.forward  s   < !11-@ :H9S>"1#5Y] >Bnn'3)+/ ?M ?
;(*; --mt||VZVcVc-d =0 (,$! ,$H 88GM @N?Yrs(;_c%NRN_N_+!65 :8"3 O` OKM-/K MM11-4<<Z^ZgZg1hM$}4M !24P P !--m<**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0 ")+=>>G)++Gr!   )NNNNNNFT)r&   r'   r(   r   r   r,   r-   r   r   r   r$   r.   r/   s   @r    r   r      s    =z =@ 268<9=26=A8<,1$(W||W !.W  (5	W
 !) 6W "%,,/W %-U\\$:W !u||!45W $D>W D>W 
Wr!   r   c                   $    e Zd ZeZdZdZdgZd Zy)XGLMPreTrainedModelmodelTr   c                    | j                   j                  }t        |t        j                        rY|j
                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j
                  j                  j                  d|       |j                  2|j
                  j                  |j                     j                          y y y )Nr   )meanstd)r   init_std
isinstancer   rl   weightdatanormal_rf   zero_	Embeddingr   )r   moduler   s      r    _init_weightsz!XGLMPreTrainedModel._init_weights  s    kk""fbii(MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> . .r!   N)	r&   r'   r(   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modulesr    r!   r    r   r   x  s!    L&*#+,	?r!   r   c                        e Zd Zddedeej                     f fdZd Zd Z	e
	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     d	eej                     d
eej                     deej                     deej                     deeej                        deej                     dee   dee   dee   dee   deeej                     ef   fd       Z xZS )	XGLMModelr   embed_tokensc                    t         |   |       |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  rt        j                  |j                        nd}||| _        n2t        |j                  |j                  | j
                  |      | _        t        |j                  |j                  |j                        | _        t#        j$                  t'        |j(                        D cg c]  }t+        |       c}      | _        t#        j.                  |j                        | _        d| _        | j5                          yc c}w )zZ
        embed_tokens (`nn.Embedding`, *optional*):
            output embeddings
        r%   N)r   F)r   r   rd   	layerdroppad_token_idr   max_position_embeddingsmax_target_positionsscale_embeddingrF   sqrtr   r   r   
vocab_sizer1   embed_positionsr   
ModuleListrange
num_layersr   layersr   
layer_normgradient_checkpointing	post_init)r   r   r   r   r   r   s        r    r   zXGLMModel.__init__  s   
 	 ~~))!..$*$B$B!393I3Idii/s# ,D 7!!6>>43C3CQ\!D  A**NN 

 mmuVM^M^G_$`!%5f%=$`a,,v~~6&+# %as   E.c                     | j                   S r   r   r   s    r    get_input_embeddingszXGLMModel.get_input_embeddings  s       r!   c                     || _         y r   r   r   values     r    set_input_embeddingszXGLMModel.set_input_embeddings  s
    !r!   r"   ry   rT   r   r   	head_maskcross_attn_head_maskpast_key_valuesinputs_embedsr   r{   output_hidden_statesreturn_dictr|   c                    ||n| j                   j                  }||n| j                   j                  }|
|
n| j                   j                  }
||n| j                   j                  }||	t        d      |8| j                  ||       |j                         }|j                  d|d         }n!|	|	j                         dd }nt        d      ||d   d   j                  d   nd}|Vt        j                  ||d   |z   t        j                  ||j                  n|	j                        }|j                  d      }|	| j                  |      }	t!        |||	|      }||t#        ||	j$                  |d         }|	| j'                  ||      j)                  |	j                        z   }t*        j,                  j/                  |t1        | j.                        | j2                  	      }| j4                  r%| j2                  r|
rt6        j9                  d
       d}
|rdnd}|rdnd}|r|dnd}|
rdnd}t;        ||gddg      D ]j  \  }}|	|j                         d   t=        | j>                        k7  s3t        d| dt=        | j>                         d|j                         d    d       tA        | j>                        D ]  \  }}|r||fz  }| j2                  r%t        jB                  g       }|| jD                  k  r?|||   nd}| j4                  r?| j2                  r3| jG                  |jH                  |||||||   nd|||   ndd||

      }n ||||||||   nd|||   nd|||
	      }|d   }|
r|||rdnd   fz  }|s||d   fz  }|||d   fz  } | jK                  |      }|r||fz  }|
r|nd}|stM        d |||||fD              S tO        |||||      S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timerE   z5You have to specify either input_ids or inputs_embedsr   r4   r9   )r   r~   z_`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`...Fr   r   r   zThe `z` should be specified for z layers, but it is for .)ry   r   r   rz   r   rx   r{   r   r   r   c              3   $   K   | ]  }|| 
 y wr   r   ).0vs     r    	<genexpr>z$XGLMModel.forward.<locals>.<genexpr>T  s      = s   )last_hidden_stater   rv   
attentionscross_attentions)(r   r{   r   r   use_return_dictrj   %warn_if_padding_and_no_attention_maskrW   rO   rY   r,   rI   longr;   rK   r   r   r   r:   r   r?   r   r   rd   r+   r   r   loggerwarning_onceziplenr   	enumeraterandr   _gradient_checkpointing_func__call__r   tupler   )r   r"   ry   rT   r   r   r   r   r   r   r   r{   r   r   input_shaperU   rv   all_hidden_statesall_self_attnsall_cross_attentionsnext_decoder_cache	attn_mask	mask_nameidxdecoder_layerdropout_probabilityrx   layer_outputs
next_caches                                r    r$   zXGLMModel.forward  s   F 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]  ]%>cdd"66y.Q#..*K!r;r?;I&',,.s3KTUUCRC^!3A!6!<!<Q!?de <<&B"88jj+4+@y''mFZFZ	L (11!4L  --i8M:K8N

 !,1G1S%?&(;(;[QS_&" &(<(<\Ka(b(e(e  )
 
 --muT\\?R]a]j]j-k&&4==##u "	 #7BD0d&7<Q<]rdh#,R$ %(4H(IKYoKp$q 	 Iy$>>#A&#dkk*::$	{*DSEUDV W%NN,Q/03 	 #,DKK"8 /	@C#!m%55!}}&+jjn#&75D5P_S1VZN**t}} $ A A!**!")*&/&;IcN1E1Q(-W[%! !.!#1*?+A7@7LYs^RV5I5U,S1[_#1&7'! *!,M"}:KQQR'S&UU" =#3"55(4(]1-=,??(_/	@b 6  -!11+4'$
 '5FXlm  
 9+&+%1
 	
r!   r   )NNNNNNNNNNNNN)r&   r'   r(   r   r   r   r   r   r   r   r   r,   r-   r   FloatTensorr   r   r   r   r$   r.   r/   s   @r    r   r     s|   z ",,9O >!"  -115/38<9=,07;=A04$(,0/3&*l
ELL)l
 !.l
 u||,	l

  (5l
 !) 6l
 ELL)l
 'u||4l
 "$u'8'8"9:l
  -l
 D>l
 $D>l
 'tnl
 d^l
 
uU\\"$MM	Nl
 l
r!   r   z
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc            "           e Zd ZdZdgZ fdZd Zd Zd Zd Z	e
	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     deej                     deej                     deej                     deej                     deeej                        deej                     deej                     dee   dee   dee   dee   deeej                     ef   fd       Zed        Z xZS )XGLMForCausalLMr   zlm_head.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFrh   )
r   r   r   r   r   rl   hidden_sizer   lm_headr   r   s     r    r   zXGLMForCausalLM.__init__l  sH     v&
yy!3!3V5F5FUS 	r!   c                 .    | j                   j                  S r   r   r   r   s    r    r   z$XGLMForCausalLM.get_input_embeddingst  s    zz&&&r!   c                 &    || j                   _        y r   r  r   s     r    r   z$XGLMForCausalLM.set_input_embeddingsw  s    "'

r!   c                     | j                   S r   r  r   s    r    get_output_embeddingsz%XGLMForCausalLM.get_output_embeddingsz  s    ||r!   c                     || _         y r   r  )r   new_embeddingss     r    set_output_embeddingsz%XGLMForCausalLM.set_output_embeddings}  s	    %r!   r"   ry   rT   r   r   r   r   r   r   labelsr   r{   r   r   r|   c                 $   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |||||||||	||||      }| j                  |d         }d}|
? | j                  ||
f| j                   j                  | j                   j                  d|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)r"   ry   rT   r   r   r   r   r   r   r   r{   r   r   r   )r   r   r   )losslogitsr   rv   r   r   )r   r{   r   r   r   r  loss_functionr   r   r   r   rv   r   r   )r   r"   ry   rT   r   r   r   r   r   r   r!  r   r{   r   r   kwargsr   r$  r#  outputs                       r    r$   zXGLMForCausalLM.forward  sV   T 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] **)%"7#9!5+'/!5#  
  gaj)%4%%  ;;11![[55	
 D Y,F'+'7D7V#CVC0#33!//))$55
 	
r!   c                 J    d}| D ]  }|t        fd|D              fz  } |S )Nr   c              3   t   K   | ]/  }|j                  d j                  |j                               1 yw)r   N)rX   r?   r;   )r   
past_statebeam_idxs     r    r   z1XGLMForCausalLM._reorder_cache.<locals>.<genexpr>  s.     nU_j--aZ=N=N1OPns   58)r  )r   r+  reordered_past
layer_pasts    `  r    _reorder_cachezXGLMForCausalLM._reorder_cache  s=    ) 	Jncmnn N	 r!   )NNNNNNNNNNNNNN)r&   r'   r(   r   _tied_weights_keysr   r   r   r  r   r   r   r,   r-   r   r  r   r   r   r   r$   r^   r.  r.   r/   s   @r    r  r  b  s     *+'(&  -115/38<9=,07;=A04)-$(,0/3&*W
ELL)W
 !.W
 u||,	W

  (5W
 !) 6W
 ELL)W
 'u||4W
 "$u'8'8"9:W
  -W
 &W
 D>W
 $D>W
 'tnW
 d^W
" 
uU\\"$EE	F#W
 W
r  r!   r  )r  r   r   )(r)   rF   typingr   r   r   r   r,   torch.utils.checkpointr   activationsr	   
generationr
   modeling_attn_mask_utilsr   r   modeling_outputsr   r   modeling_utilsr   utilsr   r   configuration_xglmr   
get_loggerr&   r   r   r   Moduler1   ra   r   r   r   r  __all__r   r!   r    <module>r<     s      / /    ! ) e l - , * 
		H	%
=bll 
=1w		 1whWBBII WBturyy up ?/ ? ?$ S
# S
 S
l y)? yyx Br!   