
    UhW                       d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZmZmZ ddlmZmZ dd	lmZ dd
lmZmZmZmZmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+  e)jX                  e-      Z.dEdZ/ G d dej`                        Z1 G d dej`                        Z2 G d dej`                        Z3de2iZ4 G d dej`                        Z5 G d dej`                        Z6 G d dej`                        Z7 G d dej`                        Z8 G d  d!ej`                        Z9 G d" d#ej`                        Z: G d$ d%ej`                        Z;e( G d& d'e!             Z<e G d( d)e'             Z=e( G d* d+e<             Z> G d, d-ej`                        Z? G d. d/ej`                        Z@ e(d01       G d2 d3e<             ZA e(d41       G d5 d6e<             ZB e(d71       G d8 d9e<             ZC e(d:1       G d; d<e<             ZDe( G d= d>e<             ZEe( G d? d@e<             ZF e(dA1       G dB dCe<e             ZGg dDZHy)FzPyTorch ELECTRA model.    N)	dataclass)CallableListOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNget_activation)GenerationMixin)"BaseModelOutputWithCrossAttentions)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging   )ElectraConfigc                    	 ddl }ddl}ddl}t        j                  j                  |      }t        j                  d|        |j                  j                  |      }g }	g }
|D ]^  \  }}t        j                  d| d|        |j                  j                  ||      }|	j                  |       |
j                  |       ` t        |	|
      D ]  \  }}|}	 t        | t               r|j#                  dd      }|d	k(  r$|j#                  d
d      }|j#                  dd
      }|j#                  dd      }|j#                  dd      }|j%                  d      }t'        d |D              rt        j                  d|        | }|D ]  }|j)                  d|      r|j%                  d|      }n|g}|d   dk(  s|d   dk(  rt+        |d      }nV|d   dk(  s|d   dk(  rt+        |d      }n9|d   dk(  rt+        |d      }n$|d   dk(  rt+        |d      }nt+        ||d         }t-        |      dk\  st/        |d          }||   } j1                  d!      rt+        |d      }n|dk(  r|j3                  |      }	 |j4                  |j4                  k7  r&t7        d"|j4                   d#|j4                   d$      	 t;        d%| |       t=        j>                  |      |_          | S # t        $ r t        j                  d        w xY w# t6        $ r1}|xj8                  |j4                  |j4                  fz  c_         d}~ww xY w# tB        $ r}t;        d| ||       Y d}~d}~ww xY w)&z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape zelectra/embeddings/zgenerator/embeddings/	generatorzelectra/zdiscriminator/z
generator/dense_1dense_predictionz!generator_predictions/output_biaszgenerator_lm_head/bias/c              3   $   K   | ]  }|d v  
 yw))global_steptemperatureN ).0ns     ~/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/electra/modeling_electra.py	<genexpr>z-load_tf_weights_in_electra.<locals>.<genexpr>^   s     E1166E   z	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weightssquad
classifier   r    _embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight )"renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzip
isinstanceElectraForMaskedLMreplacesplitany	fullmatchgetattrlenintendswith	transposeshape
ValueErrorargsprinttorch
from_numpydataAttributeError)modelconfigtf_checkpoint_pathdiscriminator_or_generatorr;   nptftf_path	init_varsnamesarraysnamerU   arrayoriginal_namepointerm_namescope_namesnumes                       r-   load_tf_weights_in_electraro   4   sz   
 ggoo01G
KK8	BC''0IEF  e(l5'BC&&w5Te	
 5&) 6e!3	%!34||$9;RS)[8||J0@A||L*=<<	+=>D<< CE]^D::c?D EEEi78G +<< 0&9"$((9f"=K#)(Kq>X-Q71J%gx8G ^}4A&8P%gv6G ^'77%gx8G ^w.%g|<G%g{1~>G{#q(k!n-C%clG#+$ }-!'848#U+==EKK/$~gmm_DUV[VaVaUbbm%noo 0
 .tf5}E ++E2GLg6n LQ  Q	
 	@  7==%++66
  	Im_-tQ7	sV   K1 B*MB0M9A	M?L*M1 L	M,M		MM	M5M00M5c                        e Zd ZdZ fdZ	 	 	 	 	 d
deej                     deej                     deej                     deej                     de	dej                  fd	Z xZS )ElectraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 >   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       t+        |dd      | _        | j#                  d	t%        j.                  | j0                  j3                         t$        j4                  
      d       y )N)padding_idxepsposition_ids)r    F)
persistentposition_embedding_typeabsolutetoken_type_idsdtype)super__init__r	   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrY   arangeexpandrP   ry   zerosrv   sizelongselfr^   	__class__s     r-   r   zElectraEmbeddings.__init__   s1   !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`" f&;&;AVAVWzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 (/v7PR\']$ekk$*;*;*@*@*B%**Ubg 	 	
    	input_idsr{   rv   inputs_embedspast_key_values_lengthreturnc                 Z   ||j                         }n|j                         d d }|d   }|| j                  d d |||z   f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }	|	}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  dk(  r| j                  |      }||z  }| j                  |      }| j                  |      }|S )Nrw   r    r{   r   r}   devicerz   )r   rv   hasattrr{   r   rY   r   r   r   r   r   ry   r   r   r   )r   r   r{   rv   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s                r-   forwardzElectraEmbeddings.forward   sH     #..*K',,.s3K ^
,,Q0FVlIl0l-lmL
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r   )NNNNr   )__name__
__module____qualname____doc__r   r   rY   
LongTensorFloatTensorrR   Tensorr   __classcell__r   s   @r-   rq   rq      s    Q
. 15593759&''E,,-' !!1!12' u//0	'
   1 12' !$' 
'r   rq   c                   P    e Zd Zd fd	Zdej
                  dej
                  fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     d	eej                     d
ee	e	ej                           dee
   de	ej
                     fdZ xZS )ElectraSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |xs t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rF|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        |j,                  | _        y )Nr   r   zThe hidden size (z6) is not a multiple of the number of attention heads ()ry   rz   relative_keyrelative_key_queryr9   r    )r~   r   hidden_sizenum_attention_headsr   rV   rR   attention_head_sizeall_head_sizer	   Linearquerykeyvaluer   attention_probs_dropout_probr   rP   ry   r   r   distance_embedding
is_decoderr   r^   ry   r   s      r-   r   zElectraSelfAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++r   xr   c                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S )Nrw   r   r9   r    r   )r   r   r   viewpermute)r   r   new_x_shapes      r-   transpose_for_scoresz)ElectraSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r   hidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 $   | j                  |      }|d u}	|	r||d   }
|d   }|}n |	rC| j                  | j                  |            }
| j                  | j                  |            }|}n|y| j                  | j                  |            }
| j                  | j                  |            }t	        j
                  |d   |
gd      }
t	        j
                  |d   |gd      }n@| j                  | j                  |            }
| j                  | j                  |            }| j                  |      }|d u}| j                  r|
|f}t	        j                  ||
j                  dd            }| j                  dk(  s| j                  dk(  r|j                  d   |
j                  d   }}|rDt	        j                  |dz
  t        j                  |j                  	      j                  dd      }n@t	        j                  |t        j                  |j                  	      j                  dd      }t	        j                  |t        j                  |j                  	      j                  dd      }||z
  }| j!                  || j"                  z   dz
        }|j%                  |j&                  
      }| j                  dk(  rt	        j(                  d||      }||z   }nE| j                  dk(  r6t	        j(                  d||      }t	        j(                  d|
|      }||z   |z   }|t+        j,                  | j.                        z  }|||z   }t0        j2                  j5                  |d      }| j7                  |      }|||z  }t	        j                  ||      }|j9                  dddd      j;                         }|j=                         d d | j>                  fz   }|j                  |      }|r||fn|f}| j                  r||fz   }|S )Nr   r    r9   dimrw   r   r   r   r|   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   ) r   r   r   r   rY   catr   matmulrT   ry   rU   tensorr   r   r   r   r   r   tor}   einsummathsqrtr   r	   
functionalsoftmaxr   r   
contiguousr   r   )r   r   r   r   r   r   r   r   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r-   r   zElectraSelfAttention.forward   s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11Gr   NNNNNNF)r   r   r   r   rY   r   r   r   r   r   boolr   r   r   s   @r-   r   r      s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	cr   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ElectraSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nrt   )r~   r   r	   r   r   denser   r   r   r   r   r   s     r-   r   zElectraSelfOutput.__init__Q  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r   r   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   r   r   r   r   s      r-   r   zElectraSelfOutput.forwardW  7    

=1]3}|'CDr   r   r   r   r   rY   r   r   r   r   s   @r-   r   r   P  1    >U\\  RWR^R^ r   r   eagerc                       e Zd Zd fd	Zd Z	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dee	e	ej                           d	ee
   d
e	ej                     fdZ xZS )ElectraAttentionc                     t         |           t        |j                     ||      | _        t        |      | _        t               | _        y )Nry   )	r~   r   ELECTRA_SELF_ATTENTION_CLASSES_attn_implementationr   r   outputsetpruned_headsr   s      r-   r   zElectraAttention.__init__e  sC    263N3NO,C
	 (/Er   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r    r   )rQ   r   r   r   r   r  r   r   r   r   r  r   r   union)r   headsindexs      r-   prune_headszElectraAttention.prune_headsm  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r   r   r   r   r   r   r   r   r   c           	      p    | j                  |||||||      }| j                  |d   |      }	|	f|dd  z   }
|
S )Nr   r    )r   r  )r   r   r   r   r   r   r   r   self_outputsattention_outputr   s              r-   r   zElectraAttention.forward  sW     yy!"
  ;;|AF#%QR(88r   r   r   )r   r   r   r   r  rY   r   r   r   r   r   r   r   r   s   @r-   r  r  d  s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	r   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ElectraIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r~   r   r	   r   r   intermediate_sizer   rJ   
hidden_actstrr   intermediate_act_fnr   s     r-   r   zElectraIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r   r   r   c                 J    | j                  |      }| j                  |      }|S r   )r   r  )r   r   s     r-   r   zElectraIntermediate.forward  s&    

=100?r   r   r   s   @r-   r  r    s#    9U\\ ell r   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ElectraOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r~   r   r	   r   r  r   r   r   r   r   r   r   r   s     r-   r   zElectraOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r   r   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r-   r   zElectraOutput.forward  r   r   r   r   s   @r-   r  r    r   r   r  c                       e Zd Z fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	eej
                     fd
Z
d Z xZS )ElectraLayerc                 f   t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        |d      | _	        t        |      | _        t        |      | _        y )Nr    z> should be used as a decoder model if cross attention is addedrz   r  )r~   r   chunk_size_feed_forwardseq_len_dimr  	attentionr   add_cross_attentionrV   crossattentionr  intermediater  r  r   s     r-   r   zElectraLayer.__init__  s    '-'E'E$)&1 ++#)#=#= ##?? D6)g!hii"26S]"^D/7#F+r   r   r   r   r   r   r   r   r   c           	         ||d d nd }| j                  |||||      }	|	d   }
| j                  r|	dd }|	d   }n|	dd  }d }| j                  rT|Rt        | d      st        d|  d      ||d	d  nd }| j	                  |
||||||      }|d   }
||dd z   }|d   }|z   }t        | j                  | j                  | j                  |
      }|f|z   }| j                  r|fz   }|S )
Nr9   )r   r   r   r    rw   r$  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r"  r   r   rV   r$  r   feed_forward_chunkr   r!  )r   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsr  r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    r-   r   zElectraLayer.forward  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!12 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9 7" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr   c                 L    | j                  |      }| j                  ||      }|S r   )r%  r  )r   r  intermediate_outputr.  s       r-   r'  zElectraLayer.feed_forward_chunk  s,    "//0@A{{#68HIr   r   )r   r   r   r   rY   r   r   r   r   r   r   r'  r   r   s   @r-   r  r    s    ," 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?Br   r  c                   D    e Zd Z fdZ	 	 	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	ee	   d
ee	   dee	   de
eej
                     ef   fdZ xZS )ElectraEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r~   r   r^   r	   
ModuleListrangenum_hidden_layersr  layergradient_checkpointing)r   r^   _r   s      r-   r   zElectraEncoder.__init__  sN    ]]%H`H`Ba#bQL$8#bc
&+# $cs   A#r   r   r   r   r   past_key_valuesr   r   output_hidden_statesreturn_dictr   c                    |	rdnd }|rdnd }|r| j                   j                  rdnd }| j                  r%| j                  r|rt        j                  d       d}|rdnd }t        | j                        D ]  \  }}|	r||fz   }|||   nd }|||   nd }| j                  r/| j                  r#| j                  |j                  |||||||      }n ||||||||      }|d   }|r	||d   fz  }|s|||d   fz   }| j                   j                  s||d   fz   } |	r||fz   }|
st        d |||||fD              S t        |||||	      S )
Nr*   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   rw   r    r9   c              3   $   K   | ]  }|| 
 y wr   r*   )r+   vs     r-   r.   z)ElectraEncoder.forward.<locals>.<genexpr>W  s      
 = 
r/   )last_hidden_stater:  r   
attentionscross_attentions)r^   r#  r8  trainingr?   warning_once	enumerater7  _gradient_checkpointing_func__call__tupler   )r   r   r   r   r   r   r:  r   r   r;  r<  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr   layer_outputss                       r-   r   zElectraEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4 #	VOA|#$58H$H!.7.CilO3B3N_Q/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::" &9]1=M<O&O#;;22+?=QRCSBU+U(G#	VJ   1]4D D 
 "&%'(
 
 
 9+.+*1
 	
r   )	NNNNNNFFT)r   r   r   r   rY   r   r   r   r   r   r   r   r   r   r   s   @r-   r2  r2    s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
r   r2  c                   (     e Zd ZdZ fdZd Z xZS )ElectraDiscriminatorPredictionszEPrediction module for the discriminator, made up of two dense layers.c                    t         |           t        j                  |j                  |j                        | _        t        |j                        | _        t        j                  |j                  d      | _	        || _
        y Nr    )r~   r   r	   r   r   r   r   r  
activationr%   r^   r   s     r-   r   z(ElectraDiscriminatorPredictions.__init__n  s^    YYv1163E3EF
():):; "		&*<*<a @r   c                     | j                  |      }| j                  |      }| j                  |      j                  d      }|S )Nrw   )r   rU  r%   squeeze)r   discriminator_hidden_statesr   logitss       r-   r   z'ElectraDiscriminatorPredictions.forwardv  s?    

#>?6&&}5==bAr   r   r   r   r   r   r   r   r   s   @r-   rR  rR  k  s    Or   rR  c                   (     e Zd ZdZ fdZd Z xZS )ElectraGeneratorPredictionszAPrediction module for the generator, made up of two dense layers.c                     t         |           t        d      | _        t	        j
                  |j                  |j                        | _        t	        j                  |j                  |j                        | _
        y )Ngelurt   )r~   r   r   rU  r	   r   r   r   r   r   r   r   s     r-   r   z$ElectraGeneratorPredictions.__init__  sV    (0f&;&;AVAVWYYv1163H3HI
r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   rU  r   )r   generator_hidden_statesr   s      r-   r   z#ElectraGeneratorPredictions.forward  s3    

#:;6}5r   rZ  r   s   @r-   r\  r\  ~  s    KJr   r\  c                   "    e Zd ZeZeZdZdZd Z	y)ElectraPreTrainedModelelectraTc                    t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yy)zInitialize the weightsg        )meanstdNg      ?)rJ   r	   r   r2   r[   normal_r^   initializer_ranger5   zero_r   rs   r   fill_)r   modules     r-   _init_weightsz$ElectraPreTrainedModel._init_weights  s   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .r   N)
r   r   r   r!   config_classro   load_tf_weightsbase_model_prefixsupports_gradient_checkpointingrl  r*   r   r-   rb  rb    s     L0O!&*#*r   rb  c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   y)ElectraForPreTrainingOutputa  
    Output type of [`ElectraForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss of the ELECTRA objective.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Prediction scores of the head (scores for each token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    NlossrY  r   rA  )r   r   r   r   rs  r   rY   r   __annotations__rY  r   r   rA  r*   r   r-   rr  rr    sg    * )-D(5$$
%,*.FHU&&'.8<M8E%"3"345<59Ju00129r   rr  c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     deee	j                        dee   dee   dee   dee   deee	j                     ef   fd       Z xZS )ElectraModelc                 "   t         |   |       t        |      | _        |j                  |j
                  k7  r/t        j                  |j                  |j
                        | _        t        |      | _
        || _        | j                          y r   )r~   r   rq   r   r   r   r	   r   embeddings_projectr2  encoderr^   	post_initr   s     r-   r   zElectraModel.__init__  sl     +F3  F$6$66&(ii0E0EvGYGY&ZD#%f-r   c                 .    | j                   j                  S r   r   r   r   s    r-   get_input_embeddingsz!ElectraModel.get_input_embeddings  s    ...r   c                 &    || j                   _        y r   r|  )r   r   s     r-   set_input_embeddingsz!ElectraModel.set_input_embeddings  s    */'r   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsry  r7  r"  r  )r   heads_to_pruner7  r  s       r-   _prune_headszElectraModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr   r   r   r{   rv   r   r   r   r   r:  r   r   r;  r<  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }n!||j                         d d }nt	        d      |\  }}||j                  n|j                  }|	|	d   d   j                  d   nd}|t        j                  ||      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  ||      }|}n&t        j                  |t        j                   |      }| j#                  ||      }| j                   j$                  rE|C|j                         \  }}}||f}|t        j                  ||      }| j'                  |      }nd }| j)                  || j                   j*                        }| j                  |||||	      }t        | d
      r| j-                  |      }| j/                  ||||||	|
|||
      }|S )NzDYou cannot specify both input_ids and inputs_embeds at the same timerw   z5You have to specify either input_ids or inputs_embedsr   r9   )r   r{   r   )r   rv   r{   r   r   rx  )	r   r   r   r   r:  r   r   r;  r<  )r^   r   r;  use_return_dictrV   %warn_if_padding_and_no_attention_maskr   r   rU   rY   onesr   r   r{   r   r   r   get_extended_attention_maskr   invert_attention_maskget_head_maskr6  rx  ry  )r   r   r   r{   rv   r   r   r   r   r:  r   r   r;  r<  r   
batch_sizer   r   r   r   r   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr9  encoder_hidden_shapeencoder_extended_attention_maskr   s                               r-   r   zElectraModel.forward  s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!"ZZFCN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z"&"B"B>S^"_ ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+&&y$++2O2OP	%)'#9 ( 
 4-. 33MBM2"7#B+/!5# % 
 r   )NNNNNNNNNNNNN)r   r   r   r   r~  r  r  r   r   rY   r   r   r   r   r   r   r   r   r   r   s   @r-   rv  rv    sk   
/0C  -11515/3,0048<9==A$(,0/3&*WELL)W !.W !.	W
 u||,W ELL)W  -W  (5W !) 6W "$u'8'8"9:W D>W $D>W 'tnW d^W 
uU\\"$FF	GW Wr   rv  c                   (     e Zd ZdZ fdZd Z xZS )ElectraClassificationHeadz-Head for sentence-level classification tasks.c                 z   t         |           t        j                  |j                  |j                        | _        |j                  |j                  n|j                  }t        d      | _	        t        j                  |      | _        t        j                  |j                  |j                        | _        y )Nr^  )r~   r   r	   r   r   r   classifier_dropoutr   r   rU  r   r   
num_labelsout_projr   r^   r  r   s      r-   r   z"ElectraClassificationHead.__init__?  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 )0zz"45		&"4"4f6G6GHr   c                     |d d dd d f   }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }|S )Nr   )r   r   rU  r  )r   featureskwargsr   s       r-   r   z!ElectraClassificationHead.forwardI  sZ    Q1WLLOJJqMOOALLOMM!r   rZ  r   s   @r-   r  r  <  s    7Ir   r  c                        e Zd ZdZdef fdZ	 ddej                  deej                     dej                  fdZ
 xZS )	ElectraSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`ElectraConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    r^   c                 f   t         |           t        |dd      | _        | j                  dk(  rt        t        j                         | _        t        |d      rq|j                  ret        |d      r(|j                  r|j                  dkD  r|j                  }n|j                  }t        j                  |j                  |      | _        t        |dd       }|rt        |      nt        j                         | _        t        j                         | _        t        |d      r3|j"                  dkD  r$t        j$                  |j"                        | _        t        j                         | _        t        |d	      r5|j(                  dkD  r%t        j$                  |j(                        | _        y y y )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)r~   r   rP   r  NotImplementedErrorr	   Identitysummaryr   r  r  r  r   r   r   rU  first_dropoutr  r   last_dropoutr  )r   r^   num_classesactivation_stringr   s       r-   r   zElectraSequenceSummary.__init__n  sU   #FNFC& &%{{}6-.63J3Jv78V=Z=Z_e_p_pst_t$//$0099V%7%7EDL#F,@$GIZN3D$E`b`k`k`m[[]6238T8TWX8X!#F,H,H!IDKKM612v7R7RUV7V "

6+F+F GD 8W2r   r   	cls_indexr   c                    | j                   dk(  r|dddf   }n| j                   dk(  r|dddf   }n| j                   dk(  r|j                  d      }n| j                   d	k(  r|At        j                  |d
ddddf   |j                  d   dz
  t        j
                        }nX|j                  d      j                  d      }|j                  d|j                         dz
  z  |j                  d      fz         }|j                  d|      j                  d      }n| j                   dk(  rt        | j                        }| j                  |      }| j                  |      }| j!                  |      }|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        r  Nrw   firstr   re  r    r   r  .r   r|   )rw   r  )r  re  rY   	full_likerU   r   	unsqueezer   r   r   gatherrW  r  r  r  rU  r  )r   r   r  r  s       r-   r   zElectraSequenceSummary.forward  sn    &"1b5)F')"1a4(F&("''A'.F+- !OO!#rr1*-!''+a/**	 &//3==bA	%,,Uimmo6I-JmN`N`acNdMf-fg	"))"i8@@DF&(%%##F+f%(""6*r   r   )r   r   r   r   r!   r   rY   r   r   r   r   r   r   s   @r-   r  r  T  sQ    2H} H< Y])"..);CEDTDT;U)			)r   r  z
    ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   dee   de	e
ej                     ef   fd       Z xZS ) ElectraForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |      | _        t        |      | _        | j                          y r   )	r~   r   r  r^   rv  rc  r  r8   rz  r   s     r-   r   z)ElectraForSequenceClassification.__init__  sH      ++#F+3F; 	r   r   r   r{   rv   r   r   labelsr   r;  r<  r   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                   |j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r{   rv   r   r   r   r;  r<  r   r    
regressionsingle_label_classificationmulti_label_classificationrw   rs  rY  r   rA  )r^   r  rc  r8   problem_typer  r}   rY   r   rR   r   rW  r   r   r
   r   r   rA  )r   r   r   r{   rv   r   r   r  r   r;  r<  rX  sequence_outputrY  rs  loss_fctr  s                    r-   r   z(ElectraForSequenceClassification.forward  s   ( &1%<k$++B]B]&*ll))%'/!5# '3 
'
# 6a81{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y!<QR!@@F)-)9TGf$EvE'5CC2==	
 	
r   
NNNNNNNNNN)r   r   r   r   r   r   rY   r   r   r   r   r   r   r   r   s   @r-   r  r    s     -11515/3,004)-,0/3&*D
ELL)D
 !.D
 !.	D

 u||,D
 ELL)D
  -D
 &D
 $D>D
 'tnD
 d^D
 
uU\\"$<<	=D
 D
r   r  z
    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

    It is recommended to load the discriminator checkpoint into that model.
    c                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   dee   de	e
ej                     ef   fd       Z xZS )ElectraForPreTrainingc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )r~   r   rv  rc  rR  discriminator_predictionsrz  r   s     r-   r   zElectraForPreTraining.__init__  s3     #F+)H)P&r   r   r   r{   rv   r   r   r  r   r;  r<  r   c                 d   |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }d}|t	        j
                         }|a|j                  d|j                  d         dk(  }|j                  d|j                  d         |   }||   } |||j                               }n4 ||j                  d|j                  d         |j                               }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )am  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see `input_ids` docstring)
            Indices should be in `[0, 1]`:

            - 0 indicates the token is an original token,
            - 1 indicates the token was replaced.

        Examples:

        ```python
        >>> from transformers import ElectraForPreTraining, AutoTokenizer
        >>> import torch

        >>> discriminator = ElectraForPreTraining.from_pretrained("google/electra-base-discriminator")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")

        >>> sentence = "The quick brown fox jumps over the lazy dog"
        >>> fake_sentence = "The quick brown fox fake over the lazy dog"

        >>> fake_tokens = tokenizer.tokenize(fake_sentence, add_special_tokens=True)
        >>> fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt")
        >>> discriminator_outputs = discriminator(fake_inputs)
        >>> predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)

        >>> fake_tokens
        ['[CLS]', 'the', 'quick', 'brown', 'fox', 'fake', 'over', 'the', 'lazy', 'dog', '[SEP]']

        >>> predictions.squeeze().tolist()
        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        ```Nr  r   rw   r    r  )r^   r  rc  r  r	   r
   r   rU   floatrr  r   rA  )r   r   r   r{   rv   r   r   r  r   r;  r<  rX  discriminator_sequence_outputrY  rs  r  active_lossactive_logitsactive_labelsr  s                       r-   r   zElectraForPreTraining.forward   sp   Z &1%<k$++B]B]&*ll))%'/!5# '3 
'
# )DA(F%//0MN++-H),11"6S6Y6YZ[6\]abb &B0M0S0STU0V WXc d &{ 3}/B/B/DEB0M0S0STU0V WY_YeYeYghY!<QR!@@F)-)9TGf$EvE*5CC2==	
 	
r   r  )r   r   r   r   r   r   rY   r   r   r   r   rr  r   r   r   s   @r-   r  r    s     -11515/3,004)-,0/3&*Q
ELL)Q
 !.Q
 !.	Q

 u||,Q
 ELL)Q
  -Q
 &Q
 $D>Q
 'tnQ
 d^Q
 
uU\\"$??	@Q
 Q
r   r  z
    Electra model with a language modeling head on top.

    Even though both the discriminator and generator may be loaded into this model, the generator is the only model of
    the two to have been trained for the masked language modeling task.
    c                   p    e Zd ZdgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee   dee   dee   deee	j                     ef   fd       Z xZS )rK   generator_lm_head.weightc                     t         |   |       t        |      | _        t	        |      | _        t        j                  |j                  |j                        | _
        | j                          y r   )r~   r   rv  rc  r\  generator_predictionsr	   r   r   r   generator_lm_headrz  r   s     r-   r   zElectraForMaskedLM.__init__  sR     #F+%@%H"!#6+@+@&BSBS!Tr   c                     | j                   S r   r  r}  s    r-   get_output_embeddingsz(ElectraForMaskedLM.get_output_embeddings      %%%r   c                     || _         y r   r  )r   r   s     r-   set_output_embeddingsz(ElectraForMaskedLM.set_output_embeddings  s
    !0r   r   r   r{   rv   r   r   r  r   r;  r<  r   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|Pt        j                         } ||j                  d| j                   j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr  r   rw   r    r  )r^   r  rc  r  r  r	   r   r   r   r   r   rA  )r   r   r   r{   rv   r   r   r  r   r;  r<  r`  generator_sequence_outputprediction_scoresrs  r  r  s                    r-   r   zElectraForMaskedLM.forward  s   ( &1%<k$++B]B]"&,,))%'/!5# #/ 
#
 %<A$>! 667PQ 223DE**,H-222t{{7M7MNPVP[P[\^P_`D'),CAB,GGF)-)9TGf$EvE$1??.99	
 	
r   r  )r   r   r   _tied_weights_keysr   r  r  r   r   rY   r   r   r   r   r   r   r   r   s   @r-   rK   rK   u  s    55&1  -11515/3,004)-,0/3&*4
ELL)4
 !.4
 !.	4

 u||,4
 ELL)4
  -4
 &4
 $D>4
 'tn4
 d^4
 
uU\\"N2	34
 4
r   rK   z
    Electra model with a token classification head on top.

    Both the discriminator and generator may be loaded into this model.
    c                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   dee   de	e
ej                     ef   fd       Z xZS )ElectraForTokenClassificationc                 `   t         |   |       |j                  | _        t        |      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y r   )r~   r   r  rv  rc  r  r   r	   r   r   r   r   r8   rz  r  s      r-   r   z&ElectraForTokenClassification.__init__  s      ++#F+)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJr   r   r   r{   rv   r   r   r  r   r;  r<  r   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   rw   r    r  )r^   r  rc  r   r8   r   r   r  r   r   rA  )r   r   r   r{   rv   r   r   r  r   r;  r<  rX  r  rY  rs  r  r  s                    r-   r   z%ElectraForTokenClassification.forward  s    $ &1%<k$++B]B]&*ll))%'/!5# '3 
'
# )DA(F%(,5R(S%!>?')HFKKDOO<fkk"oNDY!<QR!@@F)-)9TGf$EvE$5CC2==	
 	
r   r  )r   r   r   r   r   r   rY   r   r   r   r   r   r   r   r   s   @r-   r  r    s     -11515/3,004)-,0/3&*1
ELL)1
 !.1
 !.	1

 u||,1
 ELL)1
  -1
 &1
 $D>1
 'tn1
 d^1
 
uU\\"$99	:1
 1
r   r  c                       e Zd ZeZdZ fdZe	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee   dee   dee   deee	j                     ef   fd       Z xZS )ElectraForQuestionAnsweringrc  c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r   )
r~   r   r  rv  rc  r	   r   r   
qa_outputsrz  r   s     r-   r   z$ElectraForQuestionAnswering.__init__  sS      ++#F+))F$6$68I8IJ 	r   r   r   r{   rv   r   r   start_positionsend_positionsr   r;  r<  r   c           
      &   ||n| j                   j                  }| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	N)r   r{   rv   r   r   r   r;  r   r    rw   r   )ignore_indexr9   )rs  start_logits
end_logitsr   rA  )r^   r  rc  r  rM   rW  r   rQ   r   clampr   r   r   rA  )r   r   r   r{   rv   r   r   r  r  r   r;  r<  rX  r  rY  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                          r-   r   z#ElectraForQuestionAnswering.forward!  s    &1%<k$++B]B]&*ll))%'/!5 '3 	'
# 6a81#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J ,AB/0F 0:/EZMF*Q6Q+%!5CC2==
 	
r   )NNNNNNNNNNN)r   r   r   r!   rm  ro  r   r   r   rY   r   r   r   r   r   r   r   r   s   @r-   r  r    s6    L!  -11515/3,0042604,0/3&*@
ELL)@
 !.@
 !.	@

 u||,@
 ELL)@
  -@
 "%,,/@
  -@
 $D>@
 'tn@
 d^@
 
uU\\"$@@	A@
 @
r   r  c                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   dee   de	e
ej                     ef   fd       Z xZS )ElectraForMultipleChoicec                     t         |   |       t        |      | _        t	        |      | _        t        j                  |j                  d      | _	        | j                          y rT  )r~   r   rv  rc  r  sequence_summaryr	   r   r   r8   rz  r   s     r-   r   z!ElectraForMultipleChoice.__init__g  sM     #F+ 6v >))F$6$6: 	r   r   r   r{   rv   r   r   r  r   r;  r<  r   c                 L   |
|
n| j                   j                  }
||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	|
	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|t               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr    rw   r   r  r   r  )r^   r  rU   r   r   rc  r  r8   r   r   r   rA  )r   r   r   r{   rv   r   r   r  r   r;  r<  num_choicesrX  r  pooled_outputrY  reshaped_logitsrs  r  r  s                       r-   r   z ElectraForMultipleChoice.forwardq  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 '+ll))%'/!5# '3 
'
# 6a8--o>/ ++b+6')HOV4D%'*Eab*IIF)-)9TGf$EvE("5CC2==	
 	
r   r  )r   r   r   r   r   r   rY   r   r   r   r   r   r   r   r   s   @r-   r  r  e  s     -11515/3,004)-,0/3&*X
ELL)X
 !.X
 !.	X

 u||,X
 ELL)X
  -X
 &X
 $D>X
 'tnX
 d^X
 
uU\\"$==	>X
 X
r   r  zS
    ELECTRA Model with a `language modeling` head on top for CLM fine-tuning.
    c            "           e Zd ZdgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee	j                     deee	j                        dee   dee   dee   dee   deee	j                     ef   fd       Zd Z xZS )ElectraForCausalLMr  c                 $   t         |   |       |j                  st        j	                  d       t        |      | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y )NzOIf you want to use `ElectraForCausalLM` as a standalone, add `is_decoder=True.`)r~   r   r   r?   warningrv  rc  r\  r  r	   r   r   r   r  init_weightsr   s     r-   r   zElectraForCausalLM.__init__  sj       NNlm#F+%@%H"!#6+@+@&BSBS!Tr   c                     | j                   S r   r  r}  s    r-   r  z(ElectraForCausalLM.get_output_embeddings  r  r   c                     || _         y r   r  )r   new_embeddingss     r-   r  z(ElectraForCausalLM.set_output_embeddings  s
    !/r   r   r   r{   rv   r   r   r   r   r  r:  r   r   r;  r<  r   c                    ||n| j                   j                  }|	d}| j                  |||||||||
||||      }|d   }| j                  | j	                  |            }d}|	* | j
                  ||	fd| j                   j                  i|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )a3  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ElectraForCausalLM, ElectraConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-generator")
        >>> config = ElectraConfig.from_pretrained("google/electra-base-generator")
        >>> config.is_decoder = True
        >>> model = ElectraForCausalLM.from_pretrained("google/electra-base-generator", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NF)r   r{   rv   r   r   r   r   r:  r   r   r;  r<  r   r   r    )rs  rY  r:  r   rA  rB  )r^   r  rc  r  r  loss_functionr   r   r:  r   rA  rB  )r   r   r   r{   rv   r   r   r   r   r  r:  r   r   r;  r<  r  r   r  r  lm_lossr  s                        r-   r   zElectraForCausalLM.forward  s.   R &1%<k$++B]B]I,,))%'"7#9+/!5#  
  "!* 2243M3Mo3^_(d((!  ;;11 	G ')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
r   c                 J    d}|D ]  }|t        fd|D              fz  } |S )Nr*   c              3   t   K   | ]/  }|j                  d j                  |j                               1 yw)r   N)index_selectr   r   )r+   
past_statebeam_idxs     r-   r.   z4ElectraForCausalLM._reorder_cache.<locals>.<genexpr>B  s.     nU_j--aZ=N=N1OPns   58)rH  )r   r:  r  reordered_past
layer_pasts     `  r-   _reorder_cachez!ElectraForCausalLM._reorder_cache>  s=    ) 	Jncmnn N	 r   )NNNNNNNNNNNNNN)r   r   r   r  r   r  r  r   r   rY   r   r   r   r   r   r   r   r	  r   r   s   @r-   r  r    s    55
&0  -11515/3,0048<9=)-8<$(,0/3&*S
ELL)S
 !.S
 !.	S

 u||,S
 ELL)S
  -S
  (5S
 !) 6S
 &S
 "$u||"45S
 D>S
 $D>S
 'tnS
 d^S
" 
uU\\"$EE	F#S
 S
lr   r  )
r  rK   r  r  r  r  r  rv  rb  ro   )discriminator)Ir   r   rA   dataclassesr   typingr   r   r   r   r   rY   torch.utils.checkpointr	   torch.nnr
   r   r   activationsr   r   
generationr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   configuration_electrar!   
get_loggerr   r?   ro   Modulerq   r   r   r  r  r  r  r  r2  rR  r\  rb  rr  rv  r  r  r  r  rK   r  r  r  r  __all__r*   r   r-   <module>r     s     	 ! 9 9    A A 1 )	 	 	 . l l 
 1 
		H	%Od?		 ?FC299 CN		  !" 0ryy 0h"))  BII S299 SnZ
RYY Z
zbii &")) $ *_ * *. :+ : :8 s) s sl		 0`RYY `F P
'= P
P
f [
2 [
[
| H
/ H
H
V @
$: @
@
F O
"8 O
 O
d d
5 d
 d
N 
r/ r
rjr   