
    Uh@                     .   d Z ddlZddlZddlmZmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZmZmZ dd
lmZ ddlmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%  e#jL                  e'      Z(d Z) G d de	jT                        Z+ G d de	jT                        Z, G d de	jT                        Z- G d de	jT                        Z. G d de	jT                        Z/ G d de	jT                        Z0 G d de	jT                        Z1 G d de	jT                        Z2 G d  d!e	jT                        Z3 G d" d#e	jT                        Z4 G d$ d%e	jT                        Z5 G d& d'e	jT                        Z6e" G d( d)e             Z7 e"d*+       G d, d-e7             Z8e" G d. d/e7             Z9 e"d0+       G d1 d2e7e             Z: e"d3+       G d4 d5e7             Z;e" G d6 d7e7             Z<e" G d8 d9e7             Z=e" G d: d;e7             Z>g d<Z?y)=zPyTorch RemBERT model.    N)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GenerationMixin))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )RemBertConfigc           
         	 ddl }ddl}ddl}t        j                  j                  |      }t        j                  d|        |j                  j                  |      }g }g }	|D ]s  \  }
t        fddD              rt        j                  d d|
        |j                  j                  |      }|j                         |	j                  |       u t        ||	      D ]  \  }j!                  d	d
      j#                  d      t        d D              r(t        j                  ddj%                                d| }D ]  }|j'                  d|      r|j#                  d|      }n|g}|d   dk(  s|d   dk(  rt)        |d      }nW|d   dk(  s|d   dk(  rt)        |d      }n:|d   dk(  rt)        |d      }n%|d   dk(  rt)        |d      }n	 t)        ||d         }t/        |      dk\  st1        |d         }||   } dd dk(  rt)        |d      }n|dk(  r|j3                  |      }	 |j4                  |j4                  k7  r&t7        d|j4                   d|j4                   d       	 t        j                  d!        t=        j>                  |      |_          | S # t        $ r t        j                  d        w xY w# t*        $ r7 t        j                  dj-                  dj%                                     Y w xY w# t8        $ r1}|xj:                  |j4                  |j4                  fz  c_         d}~ww xY w)"z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from c              3   &   K   | ]  }|v  
 y wN ).0denynames     ~/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/rembert/modeling_rembert.py	<genexpr>z-load_tf_weights_in_rembert.<locals>.<genexpr>E   s     Xtt|Xs   )adam_vadam_moutput_embeddingclszLoading TF weight z with shape zbert/zrembert//c              3   $   K   | ]  }|d v  
 yw))r&   r'   AdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepNr    )r!   ns     r$   r%   z-load_tf_weights_in_rembert.<locals>.<genexpr>V   s      
 nn
   z	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weightssquad
classifierzSkipping {}   r   i_embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight )!renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesanyload_variableappendzipreplacesplitjoin	fullmatchgetattrAttributeErrorformatlenint	transposeshape
ValueErrorAssertionErrorargstorch
from_numpydata)modelconfigtf_checkpoint_pathr<   nptftf_path	init_varsnamesarraysrV   arraypointerm_namescope_namesnumer#   s                    @r$   load_tf_weights_in_rembertrl   /   sV   
 ggoo01G
KK8	BC''0IEF  	e X(WXX(l5'BC&&w5Te	 5&) 1/e||GZ0 zz#  

 
 KK)CHHTN#345 	'F||,f5 hhy&9%h1~)[^w-F!'84Q=0KNf4L!'62Q#33!'84Q7*!'<8%g{1~>G ;1$+a.)!#,+	', #$<=(gx0GxLL'E	}}+ >'--@QRWR]R]Q^^i!jkk ,
 	078''.c1/d LS  Q	
 	n & KK 4 4SXXd^ DE  	FFw}}ekk22F	s5   J7 !K ?L7 K<LL	M&,MMc                        e Zd ZdZ fdZ	 	 	 	 	 d
deej                     deej                     deej                     deej                     de	dej                  fd	Z xZS )RemBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 |   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       y )N)padding_idxepsposition_ids)r   F)
persistent)super__init__r   	Embedding
vocab_sizeinput_embedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrZ   arangeexpandselfr^   	__class__s     r$   rw   zRemBertEmbeddings.__init__   s    !||v::H[H[ 
 $&<<0N0NPVPkPk#l %'\\&2H2H&JeJe%f" f&A&AvG\G\]zz&"<"<= 	ELL)G)GHOOPWXej 	 	
    	input_idstoken_type_idsrs   inputs_embedspast_key_values_lengthreturnc                    ||j                         }n|j                         d d }|d   }|| j                  d d |||z   f   }|:t        j                  |t        j                  | j                  j
                        }|| j                  |      }| j                  |      }||z   }	| j                  |      }
|	|
z  }	| j                  |	      }	| j                  |	      }	|	S )Nrt   r   dtypedevice)sizers   rZ   zeroslongr   r|   r   r~   r   r   )r   r   r   rs   r   r   input_shape
seq_lengthr   
embeddingsr~   s              r$   forwardzRemBertEmbeddings.forward   s      #..*K',,.s3K ^
,,Q0FVlIl0l-lmL!"[[EJJtO`O`OgOghN  00;M $ : :> J"%::
"66|D))
^^J/
\\*-
r   )NNNNr   )__name__
__module____qualname____doc__rw   r   rZ   
LongTensorFloatTensorrT   Tensorr   __classcell__r   s   @r$   rn   rn      s    Q
( 15593759&'E,,- !!1!12 u//0	
   1 12 !$ 
r   rn   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )RemBertPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )rv   rw   r   Linearhidden_sizedenseTanh
activationr   s     r$   rw   zRemBertPooler.__init__   s9    YYv1163E3EF
'')r   hidden_statesr   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r   )r   r   first_token_tensorpooled_outputs       r$   r   zRemBertPooler.forward   s6     +1a40

#566r   r   r   r   rw   rZ   r   r   r   r   s   @r$   r   r      s#    $
U\\ ell r   r   c                        e Zd Z fdZd Z	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dee	e	ej                           d	e
d
e	fdZ xZS )RemBertSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |j"                  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ())rv   rw   r   num_attention_headshasattrrW   rT   attention_head_sizeall_head_sizer   r   querykeyvaluer   attention_probs_dropout_probr   
is_decoderr   s     r$   rw   zRemBertSelfAttention.__init__   s)    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++r   c                     |j                         d d | j                  | j                  fz   } |j                  | }|j	                  dddd      S )Nrt   r   r:   r   r
   )r   r   r   viewpermute)r   xnew_x_shapes      r$   transpose_for_scoresz)RemBertSelfAttention.transpose_for_scores   sN    ffhsmt'?'?AYAY&ZZAFFK yyAq!$$r   r   attention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsr   c                    | j                  |      }|d u}	|	r||d   }
|d   }|}n |	rC| j                  | j                  |            }
| j                  | j                  |            }|}n|y| j                  | j                  |            }
| j                  | j                  |            }t	        j
                  |d   |
gd      }
t	        j
                  |d   |gd      }n@| j                  | j                  |            }
| j                  | j                  |            }| j                  |      }| j                  r|
|f}t	        j                  ||
j                  dd            }|t        j                  | j                        z  }|||z   }t        j                  j                  |d      }| j                  |      }|||z  }t	        j                  ||      }|j!                  dddd      j#                         }|j%                         d d | j&                  fz   } |j(                  | }|r||fn|f}| j                  r||fz   }|S )Nr   r   r:   dimrt   r
   )r   r   r   r   rZ   catr   matmulrU   mathsqrtr   r   
functionalsoftmaxr   r   
contiguousr   r   r   )r   r   r   r   r   r   r   r   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                     r$   r   zRemBertSelfAttention.forward   sX    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB?? (5N !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD6G=/2mM]?? 11Gr   NNNNNF)r   r   r   rw   r   rZ   r   r   r   r   boolr   r   r   s   @r$   r   r      s    ,(% 7;15=A>BDH"'L||L !!2!23L E--.	L
  ((9(9:L !)):): ;L !uU->->'?!@AL  L 
Lr   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )RemBertSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nrq   )rv   rw   r   r   r   r   r   r   r   r   r   r   s     r$   rw   zRemBertSelfOutput.__init__3  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r   r   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   r   r   r   r   s      r$   r   zRemBertSelfOutput.forward9  7    

=1]3}|'CDr   r   r   s   @r$   r   r   2  1    >U\\  RWR^R^ r   r   c                       e Zd Z fdZd Z	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dee	e	ej                           d	ee
   d
e	ej                     fdZ xZS )RemBertAttentionc                     t         |           t        |      | _        t	        |      | _        t               | _        y r   )rv   rw   r   r   r   outputsetpruned_headsr   s     r$   rw   zRemBertAttention.__init__A  s0    (0	'/Er   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )rS   r   r   r   r   r   r   r   r   r   r   r   r   union)r   headsindexs      r$   prune_headszRemBertAttention.prune_headsH  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r   r   r   r   r   r   r   r   r   c           	      p    | j                  |||||||      }| j                  |d   |      }	|	f|dd  z   }
|
S )Nr   r   )r   r   )r   r   r   r   r   r   r   r   self_outputsattention_outputr   s              r$   r   zRemBertAttention.forward[  sW     yy!"
  ;;|AF#%QR(88r   r   )r   r   r   rw   r   rZ   r   r   r   r   r   r   r   r   s   @r$   r   r   @  s    ";, 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	r   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )RemBertIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )rv   rw   r   r   r   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   s     r$   rw   zRemBertIntermediate.__init__u  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r   r   r   c                 J    | j                  |      }| j                  |      }|S r   )r   r  r   r   s     r$   r   zRemBertIntermediate.forward}  s&    

=100?r   r   r   s   @r$   r   r   t  s#    9U\\ ell r   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )RemBertOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )rv   rw   r   r   r   r   r   r   r   r   r   r   r   s     r$   rw   zRemBertOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r   r   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r$   r   zRemBertOutput.forward  r   r   r   r   s   @r$   r  r    r   r   r  c                       e Zd Z fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	eej
                     fd
Z
d Z xZS )RemBertLayerc                 b   t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r*| j                  st        |  d      t	        |      | _	        t        |      | _        t        |      | _        y )Nr   z> should be used as a decoder model if cross attention is added)rv   rw   chunk_size_feed_forwardseq_len_dimr   	attentionr   add_cross_attentionrW   crossattentionr   intermediater  r   r   s     r$   rw   zRemBertLayer.__init__  s    '-'E'E$)&1 ++#)#=#= ##?? D6)g!hii"26":D/7#F+r   r   r   r   r   r   r   r   r   c           	         ||d d nd }| j                  |||||      }	|	d   }
| j                  r|	dd }|	d   }n|	dd  }d }| j                  rT|Rt        | d      st        d|  d      ||d	d  nd }| j	                  |
||||||      }|d   }
||dd z   }|d   }|z   }t        | j                  | j                  | j                  |
      }|f|z   }| j                  r|fz   }|S )
Nr:   )r   r   r   r   rt   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r  r   r   rW   r  r   feed_forward_chunkr  r  )r   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsr   r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    r$   r   zRemBertLayer.forward  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!12 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9 7" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr   c                 L    | j                  |      }| j                  ||      }|S r   )r  r   )r   r   intermediate_outputr  s       r$   r  zRemBertLayer.feed_forward_chunk  s,    "//0@A{{#68HIr   r   )r   r   r   rw   rZ   r   r   r   r   r   r   r  r   r   s   @r$   r  r    s    ,$ 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?Dr   r  c                       e Zd Z fdZ	 	 	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	e	d
e	de	de
eef   fdZ xZS )RemBertEncoderc                 .   t         |           || _        t        j                  |j
                  |j                        | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _        y c c}w )NF)rv   rw   r^   r   r   rz   r   embedding_hidden_mapping_in
ModuleListrangenum_hidden_layersr  layergradient_checkpointing)r   r^   _r   s      r$   rw   zRemBertEncoder.__init__  sn    +-99V5P5PRXRdRd+e(]]%H`H`Ba#bQL$8#bc
&+# $cs   ,Br   r   r   r   r   past_key_values	use_cacher   output_hidden_statesreturn_dictr   c                    | j                   r%| j                  r|rt        j                  d       d}| j	                  |      }|	rdnd }|rdnd }|r| j
                  j                  rdnd }|rdnd }t        | j                        D ]  \  }}|	r||fz   }|||   nd }|||   nd }| j                   r/| j                  r#| j                  |j                  |||||||      }n ||||||||      }|d   }|r	||d   fz  }|s|||d   fz   }| j
                  j                  s||d   fz   } |	r||fz   }|
st        d |||||fD              S t        |||||	      S )
NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr    r   rt   r   r:   c              3   $   K   | ]  }|| 
 y wr   r    )r!   vs     r$   r%   z)RemBertEncoder.forward.<locals>.<genexpr>5  s      
 = 
r0   )last_hidden_stater(  r   
attentionscross_attentions)r&  trainingr@   warning_oncer!  r^   r  	enumerater%  _gradient_checkpointing_func__call__tupler   )r   r   r   r   r   r   r(  r)  r   r*  r+  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr   layer_outputss                       r$   r   zRemBertEncoder.forward  s    &&4==##p "	88G"6BD$5b4%64;;;Z;Zr`d#,R$(4 #	VOA|#$58H$H!.7.CilO3B3N_Q/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::" &9]1=M<O&O#;;22+?=QRCSBU+U(G#	VJ   1]4D D 
 "&%'(
 
 
 9+.+*1
 	
r   )	NNNNNNFFT)r   r   r   rw   rZ   r   r   r   r   r   r   r   r   r   r   s   @r$   r  r    s    , 7;15=A>BEI$("'%* S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
  S
 #S
 S
 
u??	@S
r   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )RemBertPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )rv   rw   r   r   r   r   r   r  r  r   transform_act_fnr   r   r   s     r$   rw   z'RemBertPredictionHeadTransform.__init__K  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   rC  r   r  s     r$   r   z&RemBertPredictionHeadTransform.forwardT  s4    

=1--m<}5r   r   r   s   @r$   rA  rA  J  s$    UU\\ ell r   rA  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )RemBertLMPredictionHeadc                 n   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        |j                     | _        t        j                  |j
                  |j                        | _        y r   )rv   rw   r   r   r   output_embedding_sizer   ry   decoderr   r  r   r   r   r   s     r$   rw   z RemBertLMPredictionHead.__init__\  sz    YYv1163O3OP
yy!=!=v?P?PQ !2!23f&B&BH]H]^r   r   r   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   rI  r  s     r$   r   zRemBertLMPredictionHead.forwardc  s@    

=16}5]3r   r   r   s   @r$   rF  rF  [  s$    _U\\ ell r   rF  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )RemBertOnlyMLMHeadc                 B    t         |           t        |      | _        y r   )rv   rw   rF  predictionsr   s     r$   rw   zRemBertOnlyMLMHead.__init__m  s    26:r   sequence_outputr   c                 (    | j                  |      }|S r   )rN  )r   rO  prediction_scoress      r$   r   zRemBertOnlyMLMHead.forwardq  s     ,,_=  r   r   r   s   @r$   rL  rL  l  s#    ;!u|| ! !r   rL  c                   "    e Zd ZeZeZdZdZd Z	y)RemBertPreTrainedModelrembertTc                    t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yy)zInitialize the weightsg        )meanstdNg      ?)r   r   r   r3   r\   normal_r^   initializer_ranger6   zero_rx   rp   r   fill_)r   modules     r$   _init_weightsz$RemBertPreTrainedModel._init_weights}  s   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .r   N)
r   r   r   r   config_classrl   load_tf_weightsbase_model_prefixsupports_gradient_checkpointingr]  r    r   r$   rS  rS  v  s     L0O!&*#*r   rS  a  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    )custom_introc                        e Zd Zd fd	Zd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     deeee	j                           dee   dee   dee   dee   deeef   fd       Z xZS )RemBertModelc                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rv   rw   r^   rn   r   r  encoderr   pooler	post_init)r   r^   add_pooling_layerr   s      r$   rw   zRemBertModel.__init__  sM    
 	 +F3%f-/@mF+d 	r   c                 .    | j                   j                  S r   r   r|   r   s    r$   get_input_embeddingsz!RemBertModel.get_input_embeddings  s    ...r   c                 &    || j                   _        y r   rk  )r   r   s     r$   set_input_embeddingsz!RemBertModel.set_input_embeddings  s    */'r   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrf  r%  r  r   )r   heads_to_pruner%  r   s       r$   _prune_headszRemBertModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr   r   r   r   rs   r   r   r   r   r(  r)  r   r*  r+  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                   j                  r|
|
n| j                   j
                  }
nd}
||t        d      |#| j                  ||       |j                         }n!||j                         d d }nt        d      |\  }}||j                  n|j                  }|	|	d   d   j                  d   nd}|t        j                  |||z   f|      }|&t        j                  |t        j                  |      }| j                  ||      }| j                   j                  rE|C|j                         \  }}}||f}|t        j                  ||      }| j!                  |      }nd }| j#                  || j                   j$                        }| j'                  |||||	      }| j)                  ||||||	|
|||

      }|d   }| j*                  | j+                  |      nd }|s
||f|dd  z   S t-        |||j.                  |j0                  |j2                  |j4                        S )NFzDYou cannot specify both input_ids and inputs_embeds at the same timert   z5You have to specify either input_ids or inputs_embedsr   r:   )r   r   )r   rs   r   r   r   )	r   r   r   r   r(  r)  r   r*  r+  r   )r/  pooler_outputr(  r   r0  r1  )r^   r   r*  use_return_dictr   r)  rW   %warn_if_padding_and_no_attention_maskr   r   rV   rZ   onesr   r   get_extended_attention_maskinvert_attention_maskget_head_maskr$  r   rf  rg  r   r(  r   r0  r1  )r   r   r   r   rs   r   r   r   r   r(  r)  r   r*  r+  r   
batch_sizer   r   r   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr'  encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputsrO  r   s                                r$   r   zRemBertModel.forward  s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!"ZZ*jCY6Y)ZdjkN!"[[EJJvVN 150P0PQ_al0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+ &&y$++2O2OP	??%)'#9 + 
 ,,2"7#B+/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
r   )T)NNNNNNNNNNNNN)r   r   r   rw   rm  ro  rs  r   r   rZ   r   r   r   r   r   r   r   r   r   s   @r$   rd  rd    sw    /0C  155959371559=A>BEI$(,0/3&*f
E,,-f
 !!1!12f
 !!1!12	f

 u//0f
 E--.f
   1 12f
  ((9(9:f
 !)):): ;f
 "%e.?.?(@"ABf
 D>f
 $D>f
 'tnf
 d^f
 
uBB	Cf
 f
r   rd  c                       e Zd ZdgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee	j                     dee   dee   dee   deeef   fd       ZddZedefd       Z xZS )RemBertForMaskedLMcls.predictions.decoder.weightc                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NznIf you want to use `RemBertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fri  
rv   rw   r   r@   warningrd  rT  rL  r)   rh  r   s     r$   rw   zRemBertForMaskedLM.__init__'  sR     NN1
 $FeD%f- 	r   c                 B    | j                   j                  j                  S r   r)   rN  rI  rl  s    r$   get_output_embeddingsz(RemBertForMaskedLM.get_output_embeddings6      xx##+++r   c                 :    || j                   j                  _        y r   r  r   new_embeddingss     r$   set_output_embeddingsz(RemBertForMaskedLM.set_output_embeddings9      '5$r   r   r   r   rs   r   r   r   r   labelsr   r*  r+  r   c                    ||n| j                   j                  }| j                  |||||||||
||      }|d   }| j                  |      }d}|	Ft	               } ||j                  d| j                   j                        |	j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)
r   r   rs   r   r   r   r   r   r*  r+  r   rt   r:   losslogitsr   r0  )
r^   rv  rT  r)   r   r   ry   r   r   r0  )r   r   r   r   rs   r   r   r   r   r  r   r*  r+  r   rO  rQ  masked_lm_lossloss_fctr   s                      r$   r   zRemBertForMaskedLM.forward<  s    , &1%<k$++B]B],,))%'"7#9/!5#  
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r   c                    |j                   }|d   }| j                  j                  J d       t        j                  ||j                  |j                   d   df      gd      }t        j                  |df| j                  j                  t        j                  |j                        }t        j                  ||gd      }||dS )Nr   z.The PAD token should be defined for generationr   rt   r   r   )r   r   )	rV   r^   r{   rZ   r   	new_zerosfullr   r   )r   r   r   model_kwargsr   effective_batch_sizedummy_tokens          r$   prepare_inputs_for_generationz0RemBertForMaskedLM.prepare_inputs_for_generationu  s    oo*1~ {{''3e5ee3NN4L4LnNbNbcdNeghMi4j#kqstjj!1%t{{'?'?uzzZcZjZj
 IIy+6A>	&.IIr   c                      y)z
        Legacy correction: RemBertForMaskedLM can't call `generate()` from `GenerationMixin`, even though it has a
        `prepare_inputs_for_generation` method.
        Fr    )r)   s    r$   can_generatezRemBertForMaskedLM.can_generate  s     r   )NNNNNNNNNNNNr   )r   r   r   _tied_weights_keysrw   r  r  r   r   rZ   r   r   r   r   r   r   r   r  classmethodr  r   r   s   @r$   r  r  #  sp   :;,6  155959371559=A>B-1,0/3&*6
E,,-6
 !!1!126
 !!1!12	6

 u//06
 E--.6
   1 126
  ((9(9:6
 !)):): ;6
 ))*6
 $D>6
 'tn6
 d^6
 
un$	%6
 6
pJ T  r   r  zS
    RemBERT Model with a `language modeling` head on top for CLM fine-tuning.
    c            "           e Zd ZdgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     deeee	j                           dee	j                     dee   dee   dee   dee   deeef   fd       Zd Z xZS )RemBertForCausalLMr  c                     t         |   |       |j                  st        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzOIf you want to use `RemBertForCausalLM` as a standalone, add `is_decoder=True.`Fr  r  r   s     r$   rw   zRemBertForCausalLM.__init__  sL       NNlm#FeD%f- 	r   c                 B    | j                   j                  j                  S r   r  rl  s    r$   r  z(RemBertForCausalLM.get_output_embeddings  r  r   c                 :    || j                   j                  _        y r   r  r  s     r$   r  z(RemBertForCausalLM.set_output_embeddings  r  r   r   r   r   rs   r   r   r   r   r(  r  r)  r   r*  r+  r   c                    ||n| j                   j                  }| j                  |||||||||	||||      }|d   }| j                  |      }d}|
* | j                  ||
fd| j                   j
                  i|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RemBertForCausalLM, RemBertConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/rembert")
        >>> config = RemBertConfig.from_pretrained("google/rembert")
        >>> config.is_decoder = True
        >>> model = RemBertForCausalLM.from_pretrained("google/rembert", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```N)r   r   rs   r   r   r   r   r(  r)  r   r*  r+  r   ry   r:   )r  r  r(  r   r0  r1  )r^   rv  rT  r)   loss_functionry   r   r(  r   r0  r1  )r   r   r   r   rs   r   r   r   r   r(  r  r)  r   r*  r+  kwargsr   rO  rQ  lm_lossr   s                        r$   r   zRemBertForCausalLM.forward  s   R &1%<k$++B]B],,))%'"7#9+/!5#  
  "!* HH_5(d((!  ;;11 	G ')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
r   c                 \    d}|D ]#  }|t        fd|d d D              |dd  z   fz  }% |S )Nr    c              3   t   K   | ]/  }|j                  d j                  |j                               1 yw)r   N)index_selecttor   )r!   
past_statebeam_idxs     r$   r%   z4RemBertForCausalLM._reorder_cache.<locals>.<genexpr>  s.     rU_j--aZ=N=N1OPrs   58r:   )r7  )r   r(  r  reordered_past
layer_pasts     `  r$   _reorder_cachez!RemBertForCausalLM._reorder_cache  sT    ) 	JrcmnpopcqrrQR.! N	
 r   )NNNNNNNNNNNNNN)r   r   r   r  rw   r  r  r   r   rZ   r   r   r   r   r   r   r   r  r   r   s   @r$   r  r    s    ;;
,6  155959371559=A>BEI-1$(,0/3&*Q
E,,-Q
 !!1!12Q
 !!1!12	Q

 u//0Q
 E--.Q
   1 12Q
  ((9(9:Q
 !)):): ;Q
 "%e.?.?(@"ABQ
 ))*Q
 D>Q
 $D>Q
 'tnQ
 d^Q
" 
u77	8#Q
 Q
fr   r  z
    RemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eef   fd       Z xZS ) RemBertForSequenceClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   rv   rw   
num_labelsrd  rT  r   r   classifier_dropout_probr   r   r   r9   rh  r   s     r$   rw   z)RemBertForSequenceClassification.__init__  si      ++#F+zz&"@"@A))F$6$68I8IJ 	r   r   r   r   rs   r   r   r  r   r*  r+  r   c                 @   |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|| j                   j
                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j
                  dk(  rIt               }| j                  dk(  r& ||j                         |j                               }n |||      }n| j                   j
                  dk(  r=t               } ||j                  d| j                        |j                  d            }n,| j                   j
                  dk(  rt               } |||      }|
s|f|dd z   }||f|z   S |S t!        |||j"                  |j$                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   rs   r   r   r   r*  r+  r   
regressionsingle_label_classificationmulti_label_classificationrt   r:   r  )r^   rv  rT  r   r9   problem_typer  r   rZ   r   rT   r	   squeezer   r   r   r   r   r0  )r   r   r   r   rs   r   r   r  r   r*  r+  r   r   r  r  r  r   s                    r$   r   z(RemBertForSequenceClassification.forward  s   ( &1%<k$++B]B],,))%'/!5#  

  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r   
NNNNNNNNNN)r   r   r   rw   r   r   rZ   r   r   r   r   r   r   r   r   r   s   @r$   r  r    s     266:59481559-1,0/3&*E
E--.E
 !!2!23E
 !!1!12	E

 u001E
 E--.E
   1 12E
 ))*E
 $D>E
 'tnE
 d^E
 
u..	/E
 E
r   r  c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eef   fd       Z xZS )RemBertForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y )Nr   )rv   rw   rd  rT  r   r   r  r   r   r   r9   rh  r   s     r$   rw   z!RemBertForMultipleChoice.__init__`  sV     #F+zz&"@"@A))F$6$6: 	r   r   r   r   rs   r   r   r  r   r*  r+  r   c                 L   |
|
n| j                   j                  }
||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	|
	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|t               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   rt   r   r  r:   r  )r^   rv  rV   r   r   rT  r   r9   r   r   r   r0  )r   r   r   r   rs   r   r   r  r   r*  r+  num_choicesr   r   r  reshaped_logitsr  r  r   s                      r$   r   z RemBertForMultipleChoice.forwardj  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ,,))%'/!5#  

  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r   r  )r   r   r   rw   r   r   rZ   r   r   r   r   r   r   r   r   r   s   @r$   r  r  ^  s     266:59481559-1,0/3&*X
E--.X
 !!2!23X
 !!1!12	X

 u001X
 E--.X
   1 12X
 ))*X
 $D>X
 'tnX
 d^X
 
u//	0X
 X
r   r  c                   D    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eef   fd       Z xZS )RemBertForTokenClassificationc                 0   t         |   |       |j                  | _        t        |d      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y NFr  r  r   s     r$   rw   z&RemBertForTokenClassification.__init__  sk      ++#FeDzz&"@"@A))F$6$68I8IJ 	r   r   r   r   rs   r   r   r  r   r*  r+  r   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   rt   r:   r  )r^   rv  rT  r   r9   r   r   r  r   r   r0  )r   r   r   r   rs   r   r   r  r   r*  r+  r   rO  r  r  r  r   s                    r$   r   z%RemBertForTokenClassification.forward  s    $ &1%<k$++B]B],,))%'/!5#  

 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r   r  )r   r   r   rw   r   r   rZ   r   r   r   r   r   r   r   r   r   s   @r$   r  r    s   	  266:59481559-1,0/3&*2
E--.2
 !!2!232
 !!1!12	2

 u0012
 E--.2
   1 122
 ))*2
 $D>2
 'tn2
 d^2
 
u++	,2
 2
r   r  c                   d    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee	   dee	   dee	   de
eef   fd       Z xZS )RemBertForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r  )
rv   rw   r  rd  rT  r   r   r   
qa_outputsrh  r   s     r$   rw   z$RemBertForQuestionAnswering.__init__  sU      ++#FeD))F$6$68I8IJ 	r   r   r   r   rs   r   r   start_positionsend_positionsr   r*  r+  r   c                    ||n| j                   j                  }| j                  |||||||	|
|	      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      }|j                  d      }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|       |j                  d|       t        |      } |||      } |||      }||z   dz  }|s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr  r   r   rt   r   )ignore_indexr:   )r  start_logits
end_logitsr   r0  )r^   rv  rT  r  rM   r  rS   r   clamp_r   r   r   r0  )r   r   r   r   rs   r   r   r  r  r   r*  r+  r   rO  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          r$   r   z#RemBertForQuestionAnswering.forward  s    &1%<k$++B]B],,))%'/!5#  

 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r   )NNNNNNNNNNN)r   r   r   rw   r   r   rZ   r   r   r   r   r   r   r   r   r   s   @r$   r  r  	  s$   	  266:594815596:48,0/3&*>
E--.>
 !!2!23>
 !!1!12	>

 u001>
 E--.>
   1 12>
 "%"2"23>
   0 01>
 $D>>
 'tn>
 d^>
 
u22	3>
 >
r   r  )
r  r  r  r  r  r  r  rd  rS  rl   )@r   r   rB   typingr   r   r   rZ   torch.utils.checkpointr   torch.nnr   r   r	   activationsr   
generationr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_rembertr   
get_loggerr   r@   rl   Modulern   r   r   r   r   r   r  r  r  rA  rF  rL  rS  rd  r  r  r  r  r  r  __all__r    r   r$   <module>r     sP     	 ) )    A A ! )	 	 	 . l l , 0 
		H	%Pf3		 3nBII f299 fT		 0ryy 0h"))  BII U299 Up\
RYY \
@RYY "bii "! ! *_ * *. 	F
) F
F
R e/ e eP 
p/ p
pf Q
'= Q
Q
h d
5 d
 d
N ?
$: ?
 ?
D K
"8 K
 K
\r   