
    Uh                    0   d Z ddlZddlmZmZmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZ ddlmZmZ ddlmZ dd	lmZmZmZmZmZmZmZmZ dd
lmZ ddlmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&  e$jN                  e(      Z) G d de	jT                        Z+ G d de	jT                        Z, G d de	jT                        Z- G d de	jT                        Z. G d de	jT                        Z/ G d de	jT                        Z0 G d de	jT                        Z1 G d de	jT                        Z2 G d d e	jT                        Z3 G d! d"e	jT                        Z4e# G d# d$e             Z5 e#d%&       G d' d(e5             Z6 e#d)&       G d* d+e5e             Z7e# G d, d-e5             Z8 G d. d/e	jT                        Z9 e#d0&       G d1 d2e5             Z:e# G d3 d4e5             Z;e# G d5 d6e5             Z< G d7 d8e	jT                        Z=e# G d9 d:e5             Z>d=d;Z?g d<Z@y)>zPyTorch X-MOD model.    N)ListOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNgelu)GenerationMixin))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )
XmodConfigc                   2     e Zd ZdZ fdZ	 ddZd Z xZS )XmodEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j'                  dt)        j*                  |j                        j-                  d      d       | j'                  d	t)        j.                  | j0                  j3                         t(        j4                  
      d       |j                  | _        t        j                  |j                  |j
                  | j6                        | _	        y )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr%   register_buffertorcharangeexpandzerosr'   sizelongr"   selfconfig	__class__s     x/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/xmod/modeling_xmod.pyr.   zXmodEmbeddings.__init__5   si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	

 "..#%<<**F,>,>DL\L\$
     c                    |+|t        || j                  |      }n| j                  |      }||j                         }n|j                         d d }|d   }|st	        | d      r-| j
                  d d d |f   }|j                  |d   |      }	|	}n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  dk(  r| j                  |      }||z  }| j!                  |      }| j#                  |      }|S )Nr(   r   r*   r   r,   devicer&   )"create_position_ids_from_input_idsr"   &create_position_ids_from_inputs_embedsrC   hasattrr*   rA   r?   rB   rD   r'   rM   r3   r7   r%   r5   r8   r<   )rF   	input_idsr*   r'   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr7   
embeddingsr5   s                rI   forwardzXmodEmbeddings.forwardN   sR    $A)TM]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
rJ   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr(   r   rL   r   )rC   r?   r@   r"   rD   rM   	unsqueezerA   )rF   rR   rT   sequence_lengthr'   s        rI   rO   z5XmodEmbeddings.create_position_ids_from_inputs_embedsv   s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<rJ   )NNNNr   )__name__
__module____qualname____doc__r.   rY   rO   __classcell__rH   s   @rI   r    r    /   s    

4 rs&P=rJ   r    c                   P    e Zd Zd fd	Zdej
                  dej
                  fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     d	eej                     d
ee	e	ej                           dee
   de	ej
                     fdZ xZS )XmodSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |xs t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rF|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        |j,                  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r%   r&   relative_keyrelative_key_query   r   )r-   r.   r1   num_attention_headsrP   
ValueErrorintattention_head_sizeall_head_sizer   Linearquerykeyvaluer:   attention_probs_dropout_probr<   r=   r%   r4   r/   distance_embedding
is_decoderrF   rG   r%   rH   s      rI   r.   zXmodSelfAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++rJ   xreturnc                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S )Nr(   r   rj   r   r   )rC   rk   rn   viewpermute)rF   rx   new_x_shapes      rI   transpose_for_scoresz&XmodSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$rJ   hidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 $   | j                  |      }|d u}	|	r||d   }
|d   }|}n |	rC| j                  | j                  |            }
| j                  | j                  |            }|}n|y| j                  | j                  |            }
| j                  | j                  |            }t	        j
                  |d   |
gd      }
t	        j
                  |d   |gd      }n@| j                  | j                  |            }
| j                  | j                  |            }| j                  |      }|d u}| j                  r|
|f}t	        j                  ||
j                  dd            }| j                  dk(  s| j                  dk(  r|j                  d   |
j                  d   }}|rDt	        j                  |dz
  t        j                  |j                  	      j                  dd      }n@t	        j                  |t        j                  |j                  	      j                  dd      }t	        j                  |t        j                  |j                  	      j                  dd      }||z
  }| j!                  || j"                  z   dz
        }|j%                  |j&                  
      }| j                  dk(  rt	        j(                  d||      }||z   }nE| j                  dk(  r6t	        j(                  d||      }t	        j(                  d|
|      }||z   |z   }|t+        j,                  | j.                        z  }|||z   }t0        j2                  j5                  |d      }| j7                  |      }|||z  }t	        j                  ||      }|j9                  dddd      j;                         }|j=                         d d | j>                  fz   }|j                  |      }|r||fn|f}| j                  r||fz   }|S )Nr   r   rj   dimr(   rh   ri   rL   r+   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   ) rq   r~   rr   rs   r?   catrv   matmul	transposer%   shapetensorrD   rM   r{   r@   ru   r4   tor,   einsummathsqrtrn   r   
functionalsoftmaxr<   r|   
contiguousrC   ro   )rF   r   r   r   r   r   r   r   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               rI   rY   zXmodSelfAttention.forward   s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11GrJ   NNNNNNF)r]   r^   r_   r.   r?   Tensorr~   r   FloatTensorr   boolrY   ra   rb   s   @rI   rd   rd      s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	crJ   rd   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )XmodSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr#   )r-   r.   r   rp   r1   denser8   r9   r:   r;   r<   rE   s     rI   r.   zXmodSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rJ   r   input_tensorry   c                 T    | j                  |      }| j                  |      }||z   }|S r   )r   r<   )rF   r   r   s      rI   rY   zXmodSelfOutput.forward  s.    

=1]3%4rJ   r]   r^   r_   r.   r?   r   rY   ra   rb   s   @rI   r   r     s1    >U\\  RWR^R^ rJ   r   c                       e Zd Zd fd	Zd Z	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dee	e	ej                           d	ee
   d
e	ej                     fdZ xZS )XmodAttentionc                     t         |           t        ||      | _        t	        |      | _        t               | _        |j                  | _        y )Nr%   )	r-   r.   rd   rF   r   outputsetpruned_headspre_normrw   s      rI   r.   zXmodAttention.__init__  s>    %fF]^	$V,ErJ   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )lenr   rF   rk   rn   r   r   rq   rr   rs   r   r   ro   union)rF   headsindexs      rI   prune_headszXmodAttention.prune_heads'  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:rJ   r   r   r   r   r   r   r   ry   c           	         |}| j                   r| j                  j                  |      }| j                  |||||||      }	| j                  |	d   |      }
| j                   s| j                  j                  |
      }
|
f|	dd  z   }|S )Nr   r   )r   r   r8   rF   )rF   r   r   r   r   r   r   r   residualself_outputsattention_outputr   s               rI   rY   zXmodAttention.forward9  s     !== KK11-@Myy!"
  ;;|AA}}#{{445EF#%QR(88rJ   r   r   )r]   r^   r_   r.   r   r?   r   r   r   r   r   rY   ra   rb   s   @rI   r   r     s    (;* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	rJ   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )XmodIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r-   r.   r   rp   r1   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnrE   s     rI   r.   zXmodIntermediate.__init__X  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$rJ   r   ry   c                 J    | j                  |      }| j                  |      }|S r   )r   r   rF   r   s     rI   rY   zXmodIntermediate.forward`  s&    

=100?rJ   r   rb   s   @rI   r   r   W  s#    9U\\ ell rJ   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )XmodAdapterc                    t         |           |j                  |j                  z  | _        t        j                  |j                  | j                        | _        t        j                  | j                  |j                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r-   r.   r1   adapter_reduction_factorbottleneck_sizer   rp   dense1dense2r   r   r   r   adapter_act_fnrE   s     rI   r.   zXmodAdapter.__init__g  s    %11V5T5TTii 2 2D4H4HIii 4 4f6H6HIf''-"():):";D"("3"3DrJ   r   ry   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r   s     rI   rY   zXmodAdapter.forwardq  s4    M2++M:M2rJ   r   rb   s   @rI   r   r   f  s#    4U\\ ell rJ   r   c                        e Zd Z fdZdej
                  dej
                  dej
                  dej
                  fdZdej
                  dej
                  fdZ xZS )
XmodOutputc                    t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        |j                  | _	        t        j                  |j                        | _        |j                  r1t        j                  |j
                  |j                        | _        nd | _        |j                  | _        t        j                  i       | _        |j"                  D ]#  }t%        |      | j                   t'        |      <   % y r   )r-   r.   r   rp   r   r1   r   r8   r9   ln_before_adapterr:   r;   r<   adapter_layer_normadapter_reuse_layer_norm
ModuleDictadapter_modules	languagesr   r   )rF   rG   languagerH   s      rI   r.   zXmodOutput.__init__y  s    YYv779K9KL
f&8&8f>S>ST!'!9!9zz&"<"<=$$&(ll63E3E6K`K`&aD#&*D#(.(G(G%!}}R0(( 	FH2=f2ED  X/	FrJ   r   r   lang_idsry   c                 x    | j                  |      }| j                  |      }||z   }| j                  ||      }|S r   )r   r<   lang_adapter)rF   r   r   r   s       rI   rY   zXmodOutput.forward  s@    

=1]3%4))(MBrJ   c                    t        j                  |d      \  }}| j                  s|}| j                  | j                  |      }n| j                  r| j                  |      }| j                  r|}t        j                  ||j                         d      }g }t        t        ||            D ]i  \  }\  }}	t        | j                  j                               t        |j                                  }
|j                   | j                  |
   |	             k t        j                   |d      }| j#                  |      }|z  }|S )NT)return_countsr   )r?   unique_consecutiver   r   r   r8   splittolist	enumerateziplistr   keysrm   itemappendr   r<   )rF   r   r   lang_lengthsr   split_hidden_stateslang_wise_outputsilang_idsplit_hidden_statelangs              rI   r   zXmodOutput.lang_adapter  s1   !&!9!9(RV!W,%%$H"". 33MBM** NN=9M!!$H#kk-9L9L9NPQR09#hH[:\0] 	U,A,+,,1134S5HID$$%?T%9%9$%?@R%ST	U 		"3Q7]3!rJ   )	r]   r^   r_   r.   r?   r   rY   r   ra   rb   s   @rI   r   r   x  s[    FU\\  Y^YeYe jojvjv U\\ %,, rJ   r   c                   4    e Zd Z fdZ	 	 	 	 	 	 ddej
                  dej
                  deej                     deej                     deej                     deej                     deeeej                           d	ee	   d
eej
                     fdZ
d Z xZS )	XmodLayerc                    t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        |d      | _	        t        |      | _        t        |      | _        |j                  | _        y )Nr   z> should be used as a decoder model if cross attention is addedr&   r   )r-   r.   chunk_size_feed_forwardseq_len_dimr   	attentionrv   add_cross_attentionrl   crossattentionr   intermediater   r   r   rE   s     rI   r.   zXmodLayer.__init__  s    '-'E'E$&v. ++#)#=#= ##?? D6)g!hii"/PZ"[D,V4 (rJ   r   r   r   r   r   r   r   r   ry   c	           	         ||d d nd }	| j                  |||||	      }
|
d   }| j                  r|
dd }|
d   }n|
dd  }d }| j                  rT|Rt        | d      st        d|  d      ||d	d  nd }| j	                  |||||||      }|d   }||dd z   }|d   }|z   }|}| j
                  r| j                  j                  |      }t        | j                  | j                  | j                  |      }| j                  |||      }| j
                  s| j                  j                  |      }|f|z   }| j                  r|fz   }|S )
Nrj   )r   r   r   r   r(   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )r   rv   rP   rl   r  r   r   r8   r   feed_forward_chunkr   r   )rF   r   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsr   r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputsr   intermediate_outputlayer_outputs                       rI   rY   zXmodLayer.forward  s    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!12 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9 7" ==G ,C2+F( 14P P#==#{{445EF7##((	
 {{#6(K}};;00>L/G+ ??!2 44GrJ   c                 $    | j                  |      S r   )r  )rF   r   s     rI   r  zXmodLayer.feed_forward_chunk  s      !122rJ   r   )r]   r^   r_   r.   r?   r   r   r   r   r   rY   r  ra   rb   s   @rI   r   r     s    (& 7;15=A>BDH,1I||I ,,I !!2!23	I
 E--.I  ((9(9:I !)):): ;I !uU->->'?!@AI $D>I 
u||	IV3rJ   r   c                   \    e Zd Z fdZ	 	 	 	 	 	 	 	 	 ddej
                  dej
                  deej                     deej                     deej                     deej                     deeeej                           d	ee	   d
ee	   dee	   dee	   de
eej
                     ef   fdZ xZS )XmodEncoderc                 j   t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        |j                  | _
        | j                  r0t        j                  |j                  |j                        | _        d| _        y c c}w )Nr#   F)r-   r.   rG   r   
ModuleListrangenum_hidden_layersr   layerr   is_pre_normr8   r1   r9   gradient_checkpointing)rF   rG   _rH   s      rI   r.   zXmodEncoder.__init__
  s    ]]uVE]E]?^#_!If$5#_`
!??\\&*<*<&BWBWXDN&+#	 $`s   B0r   r   r   r   r   r   past_key_valuesr   r   output_hidden_statesreturn_dictry   c                    | j                   r%| j                  r|rt        j                  d       d}|
rdnd }|	rdnd }|	r| j                  j
                  rdnd }|rdnd }t        | j                        D ]  \  }}|
r||fz   }|||   nd }|||   nd }| j                   r0| j                  r$| j                  |j                  ||||||||		      }n |||||||||	      }|d   }|r	||d   fz  }|	s~||d   fz   }| j                  j
                  s||d   fz   } | j                  r| j                  |      }|
r||fz   }|st        d |||||fD              S t        |||||	      S )
NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F r   r(   r   rj   c              3   $   K   | ]  }|| 
 y wr   r  ).0vs     rI   	<genexpr>z&XmodEncoder.forward.<locals>.<genexpr>Z  s      
 = 
s   )last_hidden_stater  r   
attentionscross_attentions)r  trainingloggerwarning_oncerG   r  r   r  _gradient_checkpointing_func__call__r  r8   tupler   )rF   r   r   r   r   r   r   r  r   r   r  r  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacher   layer_modulelayer_head_maskr   layer_outputss                        rI   rY   zXmodEncoder.forward  s    &&4==##p "	"6BD$5b4%64;;;Z;Zr`d#,R$(4 %	VOA|#$58H$H!.7.CilO3B3N_Q/TXN**t}} $ A A ))!"#)*"%
! !-!"#)*"%	! *!,M"}R'8&::" &9]1=M<O&O#;;22+?=QRCSBU+U(K%	VN  NN=9M 1]4D D 
 "&%'(
 
 
 9+.+*1
 	
rJ   )	NNNNNNFFT)r]   r^   r_   r.   r?   r   r   r   r   r   r   r   rY   ra   rb   s   @rI   r  r  	  s   , 7;15=A>BEI$(,1/4&*X
||X
 ,,X
 !!2!23	X

 E--.X
  ((9(9:X
 !)):): ;X
 "%e.?.?(@"ABX
 D>X
 $D>X
 'tnX
 d^X
 
uU\\"$MM	NX
rJ   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
XmodPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r-   r.   r   rp   r1   r   Tanh
activationrE   s     rI   r.   zXmodPooler.__init__p  s9    YYv1163E3EF
'')rJ   r   ry   c                 \    |d d df   }| j                  |      }| j                  |      }|S Nr   )r   r6  )rF   r   first_token_tensorpooled_outputs       rI   rY   zXmodPooler.forwardu  s6     +1a40

#566rJ   r   rb   s   @rI   r3  r3  o  s#    $
U\\ ell rJ   r3  c                   0    e Zd ZeZdZdZd ZdefdZ	d Z
y)XmodPreTrainedModelrobertaTc                 l   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yt        |t              r%|j                  j                  j                          yy)zInitialize the weightsg        )meanstdNg      ?)r   r   rp   weightdatanormal_rG   initializer_rangebiaszero_r/   r"   r8   fill_
XmodLMHead)rF   modules     rI   _init_weightsz!XmodPreTrainedModel._init_weights  s&   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S)
+KK""$ ,rJ   r   c           	          || j                   j                  vr0t        |  d| dt        | j                   j                               || j                   _        y)z
        Set the default language code for the model. This is used when the language is not specified in the input.

        Args:
            language (`str`): The language code, such as `"en_XX"` or `"de_DE"`.
        z does not have an adapter for z. Supported languages: N)rG   r   rl   r   default_language)rF   r   s     rI   set_default_languagez(XmodPreTrainedModel.set_default_language  s[     4;;000&6xj@WX\]a]h]h]r]rXsWtu  (0$rJ   c                    t         j                  d       | j                  j                  j	                         D ]	  }d|_         t         j                  d       | j                  j                  j                  D ]x  }|j                  j                  0|j                  j                  j	                         D ]	  }d|_         |j                  j                  j	                         D ]	  }d|_         z y)z
        Freeze the embeddings and language adapters of the model. Usually, this is applied before the model is
        fine-tuned on a downstream task.
        zFreezing embeddingsFzFreezing adaptersN)r&  infor=  rX   
parametersrequires_gradencoderr  r   r   r   )rF   	parameterr  s      rI   'freeze_embeddings_and_language_adaptersz;XmodPreTrainedModel.freeze_embeddings_and_language_adapters  s    
 	)*00;;= 	,I&+I#	,'(\\))// 	0E||..:!&!@!@!K!K!M 4I.3I+4"\\99DDF 0	*/	'0		0rJ   N)r]   r^   r_   r   config_classbase_model_prefixsupports_gradient_checkpointingrJ  r   rM  rT  r  rJ   rI   r<  r<  ~  s)    L!&*#%$0S 00rJ   r<  a(  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    )custom_introc            "           e Zd Zd fd	Zd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee	j                     deee	j                        dee   dee   dee   dee   deee	j                     ef   fd       Z xZS )	XmodModelc                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
r-   r.   rG   r    rX   r  rR  r3  pooler	post_init)rF   rG   add_pooling_layerrH   s      rI   r.   zXmodModel.__init__  sM    
 	 (0"6*,=j(4 	rJ   c                 .    | j                   j                  S r   rX   r3   rF   s    rI   get_input_embeddingszXmodModel.get_input_embeddings  s    ...rJ   c                 &    || j                   _        y r   r`  )rF   rs   s     rI   set_input_embeddingszXmodModel.set_input_embeddings  s    */'rJ   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrR  r  r   r   )rF   heads_to_pruner  r   s       rI   _prune_headszXmodModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	CrJ   rQ   r   r   r*   r'   r   rR   r   r   r  r   r   r  r  ry   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                   j                  r||n| j                   j
                  }nd}||t        d      |#| j                  ||       |j                         }n!||j                         dd }nt        d      |\  }}||j                  n|j                  }|
|
d   d   j                  d   nd}|| j                   j                  t        d      t        | j                  j                  d   j                  j                   j#                               }|j%                  | j                   j                        }|t'        j(                  ||	      z  }|t'        j(                  |||z   f|	      }|pt+        | j,                  d
      r4| j,                  j.                  ddd|f   }|j1                  ||      }|}n&t'        j2                  |t&        j4                  |      }| j7                  ||      }| j                   j                  rE|C|j                         \  }}}||f}|	t'        j(                  ||	      }	| j9                  |	      }nd}| j;                  || j                   j<                        }| j-                  |||||      }| j                  |||||||
||||      }|d   } | j>                  | j?                  |       nd}!|s
| |!f|dd z   S tA        | |!|jB                  |jD                  |jF                  |jH                        S )  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        NFzDYou cannot specify both input_ids and inputs_embeds at the same timer(   z5You have to specify either input_ids or inputs_embedsr   rj   zPInput language unknown. Please call `XmodPreTrainedModel.set_default_language()`)rM   r*   rL   )rQ   r'   r*   rR   rS   )
r   r   r   r   r   r  r   r   r  r  r   )r"  pooler_outputr  r   r#  r$  )%rG   r   r  use_return_dictrv   r   rl   %warn_if_padding_and_no_attention_maskrC   rM   r   rL  r   rR  r  r   r   r   r   r?   onesrP   rX   r*   rA   rB   rD   get_extended_attention_maskinvert_attention_maskget_head_maskr  r\  r   r  r   r#  r$  )"rF   rQ   r   r   r*   r'   r   rR   r   r   r  r   r   r  r  rT   
batch_sizerU   rM   rS   adapter_languagesdefault_lang_idrV   rW   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr  encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputssequence_outputr:  s"                                     rI   rY   zXmodModel.forward  s   . 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de{{++3 !stt $T\\%7%7%:%A%A%Q%Q%V%V%X Y/55dkk6R6RSO&Jv)NNH!"ZZ*jCY6Y)ZdjkN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+ &&y$++2O2OP	??%)'#9 + 
 ,,2"7#B+/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
rJ   )T)NNNNNNNNNNNNNN)r]   r^   r_   r.   rb  rd  rh  r   r   r?   r   
LongTensorr   r   r   r   r   r   rY   ra   rb   s   @rI   rZ  rZ    s    "/0C  -1/31515/3,0048<9==A$(,0/3&*z
ELL)z
 5++,z
 !.	z

 !.z
 u||,z
 ELL)z
  -z
  (5z
 !) 6z
 "$u'8'8"9:z
 D>z
 $D>z
 'tnz
 d^z
  
uU\\"$PP	Q!z
 z
rJ   rZ  zQ
    X-MOD Model with a `language modeling` head on top for CLM fine-tuning.
    c            $           e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee	j                     deeee	j                           dee   dee   dee   dee   deee	j                     ef   f d       Zd Z xZS )XmodForCausalLMlm_head.decoder.weightlm_head.decoder.biasc                     t         |   |       |j                  st        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzLIf you want to use `XmodLMHeadModel` as a standalone, add `is_decoder=True.`Fr^  
r-   r.   rv   r&  warningrZ  r=  rH  lm_headr]  rE   s     rI   r.   zXmodForCausalLM.__init__m  sL       NNij 5A!&) 	rJ   c                 .    | j                   j                  S r   r  decoderra  s    rI   get_output_embeddingsz%XmodForCausalLM.get_output_embeddingsz      ||###rJ   c                 &    || j                   _        y r   r  rF   new_embeddingss     rI   set_output_embeddingsz%XmodForCausalLM.set_output_embeddings~      -rJ   rQ   r   r   r*   r'   r   rR   r   r   labelsr  r   r   r  r  ry   c                    ||n| j                   j                  }|
d}| j                  |||||||||	|||||      }|d   }| j                  |      }d}|
* | j                  ||
fd| j                   j
                  i|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )aS  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, XmodForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
        >>> config = AutoConfig.from_pretrained("facebook/xmod-base")
        >>> config.is_decoder = True
        >>> model = XmodForCausalLM.from_pretrained("facebook/xmod-base", config=config)
        >>> model.set_default_language("en_XX")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NF)r   r   r*   r'   r   rR   r   r   r  r   r   r  r  r   r0   rj   )losslogitsr  r   r#  r$  )rG   rl  r=  r  loss_functionr0   r   r  r   r#  r$  )rF   rQ   r   r   r*   r'   r   rR   r   r   r  r  r   r   r  r  kwargsr   r|  prediction_scoreslm_lossr   s                         rI   rY   zXmodForCausalLM.forward  s$   \ &1%<k$++B]B]I,,))%'"7#9+/!5#  
" "!* LL9(d((!  ;;11 	G ')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
rJ   c                 J    d}|D ]  }|t        fd|D              fz  } |S )Nr  c              3   t   K   | ]/  }|j                  d j                  |j                               1 yw)r   N)index_selectr   rM   )r  
past_statebeam_idxs     rI   r!  z1XmodForCausalLM._reorder_cache.<locals>.<genexpr>  s.     nU_j--aZ=N=N1OPns   58)r*  )rF   r  r  reordered_past
layer_pasts     `  rI   _reorder_cachezXmodForCausalLM._reorder_cache  s=    ) 	Jncmnn N	 rJ   )NNNNNNNNNNNNNNN)r]   r^   r_   _tied_weights_keysr.   r  r  r   r   r?   r}  r   r   r   r   r   r   rY   r  ra   rb   s   @rI   r  r  d  s    34JK
$.  15/36:59371559=A>B-1EI$(,0/3&*!Y
E,,-Y
 5++,Y
 !!2!23	Y

 !!1!12Y
 u//0Y
 E--.Y
   1 12Y
  ((9(9:Y
 !)):): ;Y
 ))*Y
 "%e.?.?(@"ABY
 D>Y
 $D>Y
 'tnY
  d^!Y
$ 
uU\\"$EE	F%Y
 Y
xrJ   r  c                        e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee   dee   dee   deee	j                     ef   fd       Z xZS )XmodForMaskedLMr  r  c                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzkIf you want to use `XmodForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr  r  rE   s     rI   r.   zXmodForMaskedLM.__init__  sR     NN1
 !5A!&) 	rJ   c                 .    | j                   j                  S r   r  ra  s    rI   r  z%XmodForMaskedLM.get_output_embeddings  r  rJ   c                 &    || j                   _        y r   r  r  s     rI   r  z%XmodForMaskedLM.set_output_embeddings   r  rJ   rQ   r   r   r*   r'   r   rR   r   r   r  r   r  r  ry   c                    ||n| j                   j                  }| j                  |||||||||	|||      }|d   }| j                  |      }d}|
Ft	               } ||j                  d| j                   j                        |
j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        N)r   r   r*   r'   r   rR   r   r   r   r  r  r   r(   rj   r  r  r   r#  )
rG   rl  r=  r  r	   r{   r0   r   r   r#  )rF   rQ   r   r   r*   r'   r   rR   r   r   r  r   r  r  r   r|  r  masked_lm_lossloss_fctr   s                       rI   rY   zXmodForMaskedLM.forward  s   4 &1%<k$++B]B],,))%'"7#9/!5#  
 "!* LL9')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
rJ   )NNNNNNNNNNNNN)r]   r^   r_   r  r.   r  r  r   r   r?   r}  r   r   r   r   r   r   rY   ra   rb   s   @rI   r  r    sr   24JK $.  15/36:59371559=A>B-1,0/3&*:
E,,-:
 5++,:
 !!2!23	:

 !!1!12:
 u//0:
 E--.:
   1 12:
  ((9(9::
 !)):): ;:
 ))*:
 $D>:
 'tn:
 d^:
 
uU\\"N2	3:
 :
rJ   r  c                   .     e Zd ZdZ fdZd Zd Z xZS )rH  z*Roberta Head for masked language modeling.c                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _
        t        j                  t        j                  |j                              | _        | j                  | j                  _        y r   )r-   r.   r   rp   r1   r   r8   r9   
layer_normr0   r  	Parameterr?   rB   rE  rE   s     rI   r.   zXmodLMHead.__init__E  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	 IIrJ   c                     | j                  |      }t        |      }| j                  |      }| j                  |      }|S r   )r   r   r  r  rF   featuresr  rx   s       rI   rY   zXmodLMHead.forwardN  s;    JJx GOOA LLOrJ   c                     | j                   j                  j                  j                  dk(  r| j                  | j                   _        y | j                   j                  | _        y )Nmeta)r  rE  rM   typera  s    rI   _tie_weightszXmodLMHead._tie_weightsX  sC     <<##((F2 $		DLL))DIrJ   )r]   r^   r_   r`   r.   rY   r  ra   rb   s   @rI   rH  rH  B  s    4&*rJ   rH  z
    X-MOD Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   ~    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee	   dee	   dee	   de
eej                     ef   fd       Z xZS )XmodForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |d      | _        t        |      | _        | j                          y NFr  )	r-   r.   
num_labelsrG   rZ  r=  XmodClassificationHead
classifierr]  rE   s     rI   r.   z&XmodForSequenceClassification.__init__i  sJ      ++ 5A08 	rJ   rQ   r   r   r*   r'   r   rR   r  r   r  r  ry   c                     ||n| j                   j                  }| j                  ||||||||	|
|
      }|d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|s|f|d	d z   }||f|z   S |S t        |||j                   |j"                  
      S )a  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N	r   r   r*   r'   r   rR   r   r  r  r   r   
regressionsingle_label_classificationmulti_label_classificationr(   rj   r  )rG   rl  r=  r  problem_typer  r,   r?   rD   rm   r
   squeezer	   r{   r   r   r   r#  rF   rQ   r   r   r*   r'   r   rR   r  r   r  r  r   r|  r  r  r  r   s                     rI   rY   z%XmodForSequenceClassification.forwardt  s   0 &1%<k$++B]B],,))%'/!5#  
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rJ   NNNNNNNNNNN)r]   r^   r_   r.   r   r   r?   r}  r   r   r   r   r   r   rY   ra   rb   s   @rI   r  r  a  s;   	  15/36:59371559-1,0/3&*H
E,,-H
 5++,H
 !!2!23	H

 !!1!12H
 u//0H
 E--.H
   1 12H
 ))*H
 $D>H
 'tnH
 d^H
 
uU\\"$<<	=H
 H
rJ   r  c                   ~    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee	   dee	   dee	   de
eej                     ef   fd       Z xZS )XmodForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y )Nr   )r-   r.   rZ  r=  r   r:   r;   r<   rp   r1   r  r]  rE   s     rI   r.   zXmodForMultipleChoice.__init__  sV      (zz&"<"<=))F$6$6: 	rJ   rQ   r   r*   r   r  r'   r   rR   r   r  r  ry   c                    ||n| j                   j                  }||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|2|j                  |j	                  d      |j	                  d      z        nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	|
|
      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|t               } |||      }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )	a|  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        lang_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr   r(   r   r   )	r   r'   r*   r   r   rR   r   r  r  rj   r  )rG   rl  r   r{   rC   repeatr=  r<   r  r	   r   r   r#  )rF   rQ   r   r*   r   r  r'   r   rR   r   r  r  num_choicesflat_input_idsflat_lang_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   r:  r  reshaped_logitsr  r  r   s                             rI   rY   zXmodForMultipleChoice.forward  s   ` &1%<k$++B]B],5,Aiooa(}GZGZ[\G]CLCXINN2,>?^bRZRf	q(9INN1<M(MNlpLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 ,,"*..,/!5#  
  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rJ   r  )r]   r^   r_   r.   r   r   r?   r}  r   r   r   r   r   r   rY   ra   rb   s   @rI   r  r    s;     15/3596:-1371559,0/3&*]
E,,-]
 5++,]
 !!1!12	]

 !!2!23]
 ))*]
 u//0]
 E--.]
   1 12]
 $D>]
 'tn]
 d^]
 
uU\\"$==	>]
 ]
rJ   r  c                   ~    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee	   dee	   dee	   de
eej                     ef   fd       Z xZS )XmodForTokenClassificationc                 d   t         |   |       |j                  | _        t        |d      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y r  )r-   r.   r  rZ  r=  classifier_dropoutr;   r   r:   r<   rp   r1   r  r]  rF   rG   r  rH   s      rI   r.   z#XmodForTokenClassification.__init__1  s      ++ 5A)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rJ   rQ   r   r   r*   r'   r   rR   r  r   r  r  ry   c                    ||n| j                   j                  }| j                  ||||||||	|
|
      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r(   rj   r  )rG   rl  r=  r<   r  r	   r{   r  r   r   r#  r  s                     rI   rY   z"XmodForTokenClassification.forward?  s    , &1%<k$++B]B],,))%'/!5#  
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rJ   r  )r]   r^   r_   r.   r   r   r?   r}  r   r   r   r   r   r   rY   ra   rb   s   @rI   r  r  .  s-     15/36:59371559-1,0/3&*7
E,,-7
 5++,7
 !!2!23	7

 !!1!127
 u//07
 E--.7
   1 127
 ))*7
 $D>7
 'tn7
 d^7
 
uU\\"$99	:7
 7
rJ   r  c                   (     e Zd ZdZ fdZd Z xZS )r  z-Head for sentence-level classification tasks.c                 Z   t         |           t        j                  |j                  |j                        | _        |j                  |j                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        y r   )r-   r.   r   rp   r1   r   r  r;   r:   r<   r  out_projr  s      rI   r.   zXmodClassificationHead.__init__~  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrJ   c                     |d d dd d f   }| j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r8  )r<   r   r?   tanhr  r  s       rI   rY   zXmodClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!rJ   )r]   r^   r_   r`   r.   rY   ra   rb   s   @rI   r  r  {  s    7IrJ   r  c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee	   dee	   dee	   de
eej                     ef   fd       Z xZS )XmodForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r  )
r-   r.   r  rZ  r=  r   rp   r1   
qa_outputsr]  rE   s     rI   r.   z!XmodForQuestionAnswering.__init__  sU      ++ 5A))F$6$68I8IJ 	rJ   rQ   r   r   r*   r'   r   rR   start_positionsend_positionsr   r  r  ry   c                 *   ||n| j                   j                  }| j                  ||||||||
||
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||	t        |j                               dkD  r|j                  d      }t        |	j                               dkD  r|	j                  d      }	|j                  d      }|j                  d|      }|	j                  d|      }	t        |      } |||      } |||	      }||z   dz  }|s||f|dd z   }||f|z   S |S t        ||||j                  |j                  	      S )
rj  Nr  r   r   r(   r   )ignore_indexrj   )r  start_logits
end_logitsr   r#  )rG   rl  r=  r  r   r  r   r   rC   clampr	   r   r   r#  )rF   rQ   r   r   r*   r'   r   rR   r  r  r   r  r  r   r|  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                           rI   rY   z XmodForQuestionAnswering.forward  s   * &1%<k$++B]B],,))%'/!5#  
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rJ   )NNNNNNNNNNNN)r]   r^   r_   r.   r   r   r?   r}  r   r   r   r   r   r   rY   ra   rb   s   @rI   r  r    sT     15/36:593715596:48,0/3&*E
E,,-E
 5++,E
 !!2!23	E

 !!1!12E
 u//0E
 E--.E
   1 12E
 "%"2"23E
   0 01E
 $D>E
 'tnE
 d^E
 
uU\\"$@@	AE
 E
rJ   r  c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r   )nerm   r?   cumsumtype_asrD   )rQ   r"   rS   maskincremental_indicess        rI   rN   rN     sW     <<$((*D <<!4<<TBE[[_cc##%33rJ   )r  r  r  r  r  r  rZ  r<  )r   )Ar`   r   typingr   r   r   r   r?   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   r   
generationr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_xmodr   
get_loggerr]   r&  Moduler    rd   r   r   r   r   r   r   r  r3  r<  rZ  r  r  rH  r  r  r  r  r  rN   __all__r  rJ   rI   <module>r      sR     / /    A A ' )	 	 	 . l l , * 
		H	%V=RYY V=tC		 CLRYY 5BII 5rryy ")) $/ /d\3		 \3~b
")) b
L  30/ 30 30l ^
# ^
^
B 
{)? {
{| V
) V
 V
t* *> V
$7 V
V
r j
/ j
 j
Z H
!4 H
 H
XRYY , R
2 R
 R
l4 	rJ   