
    Uh7                    ^   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1  e/jd                  e3      Z4d Z5 G d dejl                        Z7 G d dejl                        Z8 G d de8      Z9 G d dejl                        Z:e8e9dZ; G d dejl                        Z< G d dejl                        Z= G d  d!ejl                        Z> G d" d#ejl                        Z? G d$ d%ejl                        Z@ G d& d'ejl                        ZA G d( d)ejl                        ZB G d* d+ejl                        ZC G d, d-ejl                        ZD G d. d/ejl                        ZE G d0 d1ejl                        ZFe- G d2 d3e&             ZGe G d4 d5e,             ZH e-d67       G d8 d9eG             ZI e-d:7       G d; d<eG             ZJ e-d=7       G d> d?eGe             ZKe- G d@ dAeG             ZL e-dB7       G dC dDeG             ZM e-dE7       G dF dGeG             ZNe- G dH dIeG             ZOe- G dJ dKeG             ZPe- G dL dMeG             ZQg dNZRy)OzPyTorch BERT model.    N)	dataclass)ListOptionalTupleUnion)version)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GenerationMixin)#_prepare_4d_attention_mask_for_sdpa*_prepare_4d_causal_attention_mask_for_sdpa)	)BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringget_torch_versionlogging   )
BertConfigc           	      L   	 ddl }ddl}ddl}t        j                  j                  |      }t        j                  d|        |j                  j                  |      }g }g }	|D ]^  \  }
}t        j                  d|
 d|        |j                  j                  ||
      }|j                  |
       |	j                  |       ` t        ||	      D ]  \  }
}|
j                  d      }
t!        d |
D              r(t        j                  d	dj#                  |
              R| }|
D ]  }|j%                  d
|      r|j                  d|      }n|g}|d   dk(  s|d   dk(  rt'        |d      }nW|d   dk(  s|d   dk(  rt'        |d      }n:|d   dk(  rt'        |d      }n%|d   dk(  rt'        |d      }n	 t'        ||d         }t+        |      dk\  st-        |d         }||   } dd dk(  rt'        |d      }n|dk(  r|j/                  |      }	 |j0                  |j0                  k7  r&t3        d|j0                   d|j0                   d      	 t        j                  d|
        t7        j8                  |      |_         | S # t        $ r t        j                  d        w xY w# t(        $ r+ t        j                  d	dj#                  |
              Y w xY w# t2        $ r1}|xj4                  |j0                  |j0                  fz  c_         d}~ww xY w)z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape /c              3   $   K   | ]  }|d v  
 yw))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepN ).0ns     x/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py	<genexpr>z*load_tf_weights_in_bert.<locals>.<genexpr>R   s      
 nn
   z	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weightssquad
classifier   r#   i_embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight )renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzipsplitanyjoin	fullmatchgetattrAttributeErrorlenint	transposeshape
ValueErrorargstorch
from_numpydata)modelconfigtf_checkpoint_pathr>   nptftf_path	init_varsnamesarraysnamerV   arraypointerm_namescope_namesnumes                     r0   load_tf_weights_in_bertrl   5   s,   
 ggoo01G
KK8	BC''0IEF  e(l5'BC&&w5Te	 5&) ,/ezz#  

 
 KK)CHHTN#345 	'F||,f5 hhy&9%h1~)[^w-F!'84Q=0KNf4L!'62Q#33!'84Q7*!'<8%g{1~>G ;1$+a.)!#,+	', #$<=(gx0GxLL'E	}}+ >'--@QRWR]R]Q^^i!jkk ,
 	078''.Y,/Z L  Q	
 	Z & KK)CHHTN+; <=  	FFw}}ekk22F	s5   J 9J2?K) J/20K&%K&)	L#2,LL#c                        e Zd ZdZ fdZ	 	 	 	 	 d
deej                     deej                     deej                     deej                     de	dej                  fd	Z xZS )BertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 >   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j'                  dt)        j*                  |j                        j-                  d      d       | j'                  d	t)        j.                  | j0                  j3                         t(        j4                  
      d       y )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r#   F)
persistenttoken_type_idsdtype)super__init__r	   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutrQ   rs   register_bufferrY   arangeexpandzerosru   sizelongselfr]   	__class__s     r0   r|   zBertEmbeddings.__init__   s/   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    	input_idsrx   ru   inputs_embedspast_key_values_lengthreturnc                 Z   ||j                         }n|j                         d d }|d   }|| j                  d d |||z   f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }	|	}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  dk(  r| j                  |      }||z  }| j                  |      }| j                  |      }|S )Nrv   r#   rx   r   rz   devicert   )r   ru   hasattrrx   r   rY   r   r   r   r   r   rs   r   r   r   )r   r   rx   ru   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s                r0   forwardzBertEmbeddings.forward   sH     #..*K',,.s3K ^
,,Q0FVlIl0l-lmL
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r   )NNNNr   )__name__
__module____qualname____doc__r|   r   rY   
LongTensorFloatTensorrT   Tensorr   __classcell__r   s   @r0   rn   rn   ~   s    Q
* 15593759&''E,,-' !!1!12' u//0	'
   1 12' !$' 
'r   rn   c                   P    e Zd Zd fd	Zdej
                  dej
                  fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     d	eej                     d
ee	e	ej                           dee
   de	ej
                     fdZ xZS )BertSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |xs t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rF|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        |j,                  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()rs   rt   relative_keyrelative_key_queryr<   r#   )r{   r|   r   num_attention_headsr   rW   rT   attention_head_sizeall_head_sizer	   Linearquerykeyvaluer   attention_probs_dropout_probr   rQ   rs   r   r}   distance_embedding
is_decoderr   r]   rs   r   s      r0   r|   zBertSelfAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++r   xr   c                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S )Nrv   r   r<   r#   r   )r   r   r   viewpermute)r   r   new_x_shapes      r0   transpose_for_scoresz&BertSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r   hidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 $   | j                  |      }|d u}	|	r||d   }
|d   }|}n |	rC| j                  | j                  |            }
| j                  | j                  |            }|}n|y| j                  | j                  |            }
| j                  | j                  |            }t	        j
                  |d   |
gd      }
t	        j
                  |d   |gd      }n@| j                  | j                  |            }
| j                  | j                  |            }| j                  |      }|d u}| j                  r|
|f}t	        j                  ||
j                  dd            }| j                  dk(  s| j                  dk(  r|j                  d   |
j                  d   }}|rDt	        j                  |dz
  t        j                  |j                  	      j                  dd      }n@t	        j                  |t        j                  |j                  	      j                  dd      }t	        j                  |t        j                  |j                  	      j                  dd      }||z
  }| j!                  || j"                  z   dz
        }|j%                  |j&                  
      }| j                  dk(  rt	        j(                  d||      }||z   }nE| j                  dk(  r6t	        j(                  d||      }t	        j(                  d|
|      }||z   |z   }|t+        j,                  | j.                        z  }|||z   }t0        j2                  j5                  |d      }| j7                  |      }|||z  }t	        j                  ||      }|j9                  dddd      j;                         }|j=                         d d | j>                  fz   }|j                  |      }|r||fn|f}| j                  r||fz   }|S )Nr   r#   r<   dimrv   r   r   r   ry   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   ) r   r   r   r   rY   catr   matmulrU   rs   rV   tensorr   r   r   r   r   r   torz   einsummathsqrtr   r	   
functionalsoftmaxr   r   
contiguousr   r   )r   r   r   r   r   r   r   r   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r0   r   zBertSelfAttention.forward   s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11Gr   NNNNNNF)r   r   r   r|   rY   r   r   r   r   r   boolr   r   r   s   @r0   r   r      s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	cr   r   c                       e Zd Zd fd	Z	 	 	 	 	 	 ddej
                  deej
                     deej                     deej                     deej                     deeeej                           dee	   d	eej
                     f fd
Z
 xZS )BertSdpaSelfAttentionc                     t         |   ||       |j                  | _        t	        j
                  t                     t	        j
                  d      k  | _        y )Nrs   z2.2.0)r{   r|   r   dropout_probr   parser!   require_contiguous_qkvr   s      r0   r|   zBertSdpaSelfAttention.__init__E  sH    9PQ"??&-mm4E4G&H7==Y`Ka&a#r   r   r   r   r   r   r   r   r   c           	      p   | j                   dk7  s|s|*t        j                  d       t        |   |||||||      S |j                         \  }}	}
| j                  | j                  |            }|d u}|r|n|}|r|n|}|r*|r(|d   j                  d   |j                  d   k(  r|\  }}n|| j                  | j                  |            }| j                  | j                  |            }|:|s8t        j                  |d   |gd      }t        j                  |d   |gd      }| j                  r||f}| j                  rK|j                  j                   dk(  r2|0|j#                         }|j#                         }|j#                         }| j                  r|s	||	dkD  rdnd	}t        j$                  j&                  j)                  ||||| j*                  r| j,                  nd
|      }|j/                  dd      }|j1                  ||	| j2                        }|f}| j                  r||fz   }|S )Nrt   a  BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   r<   r#   r   cudaTF        )	attn_mask	dropout_p	is_causal)rs   rB   warning_oncer{   r   r   r   r   rV   r   r   rY   r   r   r   r   typer   r	   r   scaled_dot_product_attentiontrainingr   rU   reshaper   )r   r   r   r   r   r   r   r   bsztgt_len_r   r   current_statesr   r   r   attn_outputr   r   s                      r0   r   zBertSdpaSelfAttention.forwardK  s`    '':59JiNcH 7?%&!  (,,.Wa//

=0IJ 3$>2D.-3E/> .^A5F5L5LQ5OSaSgSghiSj5j%3"I{11$((>2JKI33DJJ~4NOK)2D!II~a'8)&D!L	#ii):K(HaP?? (5N
 &&;+=+=+B+Bf+LQ_Qk%002K!,,.I%002K OO,>>CY^ehi^iDot 	 hh))FF$+/==d''c G 
 "++Aq1!))#w8J8JK.?? 11Gr   r   r   )r   r   r   r|   rY   r   r   r   r   r   r   r   r   s   @r0   r   r   D  s    b 2615=A>BDH,1[||[ !.[ E--.	[
  ((9(9:[ !)):): ;[ !uU->->'?!@A[ $D>[ 
u||	[ [r   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )BertSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nrq   )r{   r|   r	   r   r   denser   r   r   r   r   r   s     r0   r|   zBertSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r   r   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r  r   r   r   r   r  s      r0   r   zBertSelfOutput.forward  7    

=1]3}|'CDr   r   r   r   r|   rY   r   r   r   r   s   @r0   r  r    1    >U\\  RWR^R^ r   r  )eagersdpac                       e Zd Zd fd	Zd Z	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dee	e	ej                           d	ee
   d
e	ej                     fdZ xZS )BertAttentionc                     t         |           t        |j                     ||      | _        t        |      | _        t               | _        y )Nr   )	r{   r|   BERT_SELF_ATTENTION_CLASSES_attn_implementationr   r  outputsetpruned_headsr   s      r0   r|   zBertAttention.__init__  sC    /0K0KL,C
	 %V,Er   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r#   r   )rS   r   r   r   r   r  r   r   r   r   r  r  r   union)r   headsindexs      r0   prune_headszBertAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r   r   r   r   r   r   r   r   r   c           	      p    | j                  |||||||      }| j                  |d   |      }	|	f|dd  z   }
|
S )Nr   r#   )r   r  )r   r   r   r   r   r   r   r   self_outputsattention_outputr   s              r0   r   zBertAttention.forward  sW     yy!"
  ;;|AF#%QR(88r   r   r   )r   r   r   r|   r!  rY   r   r   r   r   r   r   r   r   s   @r0   r  r    s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	r   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )BertIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r{   r|   r	   r   r   intermediate_sizer  
isinstance
hidden_actstrr   intermediate_act_fnr   s     r0   r|   zBertIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r   r   r   c                 J    | j                  |      }| j                  |      }|S r   )r  r,  r   r   s     r0   r   zBertIntermediate.forward  s&    

=100?r   r  r   s   @r0   r&  r&    s#    9U\\ ell r   r&  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )
BertOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r
  )r{   r|   r	   r   r(  r   r  r   r   r   r   r   r   s     r0   r|   zBertOutput.__init__   s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r   r   r  r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r  r  s      r0   r   zBertOutput.forward  r  r   r  r   s   @r0   r0  r0    r  r   r0  c                       e Zd Z fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	eej
                     fd
Z
d Z xZS )	BertLayerc                 f   t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        |d      | _	        t        |      | _        t        |      | _        y )Nr#   z> should be used as a decoder model if cross attention is addedrt   r   )r{   r|   chunk_size_feed_forwardseq_len_dimr  	attentionr   add_cross_attentionrW   crossattentionr&  intermediater0  r  r   s     r0   r|   zBertLayer.__init__  s    '-'E'E$&v. ++#)#=#= ##?? D6)g!hii"/PZ"[D,V4 (r   r   r   r   r   r   r   r   r   c           	         ||d d nd }| j                  |||||      }	|	d   }
| j                  r|	dd }|	d   }n|	dd  }d }| j                  rT|Rt        | d      st        d|  d      ||d	d  nd }| j	                  |
||||||      }|d   }
||dd z   }|d   }|z   }t        | j                  | j                  | j                  |
      }|f|z   }| j                  r|fz   }|S )
Nr<   )r   r   r   r#   rv   r:  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r8  r   r   rW   r:  r   feed_forward_chunkr6  r7  )r   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsr$  r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    r0   r   zBertLayer.forward  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!12 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9 7" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr   c                 L    | j                  |      }| j                  ||      }|S r   )r;  r  )r   r$  intermediate_outputrD  s       r0   r=  zBertLayer.feed_forward_chunk]  s,    "//0@A{{#68HIr   r   )r   r   r   r|   rY   r   r   r   r   r   r   r=  r   r   s   @r0   r4  r4    s    )" 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?Br   r4  c                   D    e Zd Z fdZ	 	 	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	ee	   d
ee	   dee	   de
eej
                     ef   fdZ xZS )BertEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r{   r|   r]   r	   
ModuleListrangenum_hidden_layersr4  layergradient_checkpointing)r   r]   r  r   s      r0   r|   zBertEncoder.__init__d  sN    ]]uVE]E]?^#_!If$5#_`
&+# $`s   A#r   r   r   r   r   past_key_valuesr   r   output_hidden_statesreturn_dictr   c                    |	rdnd }|rdnd }|r| j                   j                  rdnd }| j                  r%| j                  r|rt        j                  d       d}|rdnd }t        | j                        D ]  \  }}|	r||fz   }|||   nd }|||   nd }| j                  r/| j                  r#| j                  |j                  |||||||      }n ||||||||      }|d   }|r	||d   fz  }|s|||d   fz   }| j                   j                  s||d   fz   } |	r||fz   }|
st        d |||||fD              S t        |||||	      S )
Nr-   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   rv   r#   r<   c              3   $   K   | ]  }|| 
 y wr   r-   )r.   vs     r0   r1   z&BertEncoder.forward.<locals>.<genexpr>  s      
 = 
r2   )last_hidden_staterO  r   
attentionscross_attentions)r]   r9  rN  r   rB   r   	enumeraterM  _gradient_checkpointing_func__call__tupler   )r   r   r   r   r   r   rO  r   r   rP  rQ  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr   layer_outputss                       r0   r   zBertEncoder.forwardj  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4 #	VOA|#$58H$H!.7.CilO3B3N_Q/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::" &9]1=M<O&O#;;22+?=QRCSBU+U(G#	VJ   1]4D D 
 "&%'(
 
 
 9+.+*1
 	
r   )	NNNNNNFFT)r   r   r   r|   rY   r   r   r   r   r   r   r   r   r   r   s   @r0   rH  rH  c  s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
r   rH  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
BertPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r{   r|   r	   r   r   r  Tanh
activationr   s     r0   r|   zBertPooler.__init__  s9    YYv1163E3EF
'')r   r   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r  rh  )r   r   first_token_tensorpooled_outputs       r0   r   zBertPooler.forward  s6     +1a40

#566r   r  r   s   @r0   re  re    s#    $
U\\ ell r   re  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )BertPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r
  )r{   r|   r	   r   r   r  r)  r*  r+  r   transform_act_fnr   r   r   s     r0   r|   z$BertPredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r  ro  r   r.  s     r0   r   z#BertPredictionHeadTransform.forward  s4    

=1--m<}5r   r  r   s   @r0   rm  rm    s$    UU\\ ell r   rm  c                   *     e Zd Z fdZd Zd Z xZS )BertLMPredictionHeadc                 H   t         |           t        |      | _        t	        j
                  |j                  |j                  d      | _        t	        j                  t        j                  |j                              | _        | j                  | j                  _        y )NF)r8   )r{   r|   rm  	transformr	   r   r   r~   decoder	ParameterrY   r   r8   r   s     r0   r|   zBertLMPredictionHead.__init__  sm    4V< yy!3!3V5F5FUSLLV->->!?@	 !IIr   c                 :    | j                   | j                  _         y r   )r8   ru  r   s    r0   _tie_weightsz!BertLMPredictionHead._tie_weights  s     IIr   c                 J    | j                  |      }| j                  |      }|S r   )rt  ru  r.  s     r0   r   zBertLMPredictionHead.forward  s$    }5]3r   )r   r   r   r|   ry  r   r   r   s   @r0   rr  rr    s    &&r   rr  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )BertOnlyMLMHeadc                 B    t         |           t        |      | _        y r   )r{   r|   rr  predictionsr   s     r0   r|   zBertOnlyMLMHead.__init__  s    /7r   sequence_outputr   c                 (    | j                  |      }|S r   )r~  )r   r  prediction_scoress      r0   r   zBertOnlyMLMHead.forward  s     ,,_=  r   r  r   s   @r0   r|  r|    s#    8!u|| ! !r   r|  c                   $     e Zd Z fdZd Z xZS )BertOnlyNSPHeadc                 l    t         |           t        j                  |j                  d      | _        y Nr<   )r{   r|   r	   r   r   seq_relationshipr   s     r0   r|   zBertOnlyNSPHead.__init__  s'     "		&*<*<a @r   c                 (    | j                  |      }|S r   )r  )r   rk  seq_relationship_scores      r0   r   zBertOnlyNSPHead.forward  s    !%!6!6}!E%%r   r   r   r   r|   r   r   r   s   @r0   r  r    s    A&r   r  c                   $     e Zd Z fdZd Z xZS )BertPreTrainingHeadsc                     t         |           t        |      | _        t	        j
                  |j                  d      | _        y r  )r{   r|   rr  r~  r	   r   r   r  r   s     r0   r|   zBertPreTrainingHeads.__init__  s4    /7 "		&*<*<a @r   c                 N    | j                  |      }| j                  |      }||fS r   )r~  r  )r   r  rk  r  r  s        r0   r   zBertPreTrainingHeads.forward  s0     ,,_=!%!6!6}!E "888r   r  r   s   @r0   r  r    s    A
9r   r  c                   &    e Zd ZeZeZdZdZdZ	d Z
y)BertPreTrainedModelbertTc                 l   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yt        |t              r%|j                  j                  j                          yy)zInitialize the weightsr   )meanstdNg      ?)r)  r	   r   r5   r[   normal_r]   initializer_ranger8   zero_r}   rp   r   fill_rr  )r   modules     r0   _init_weightsz!BertPreTrainedModel._init_weights  s'   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) 45KK""$ 6r   N)r   r   r   r$   config_classrl   load_tf_weightsbase_model_prefixsupports_gradient_checkpointing_supports_sdpar  r-   r   r0   r  r    s"    L-O&*#N%r   r  c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeeej                        ed<   dZeeej                        ed<   y)BertForPreTrainingOutputa\  
    Output type of [`BertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossprediction_logitsseq_relationship_logitsr   rV  )r   r   r   r   r  r   rY   r   __annotations__r  r  r   r   rV  r-   r   r0   r  r  2  s~    2 )-D(5$$
%,59x 1 129;?Xe&7&78?8<M8E%"3"345<59Ju00129r   r  a  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    )custom_introc                        e Zd ZddgZd fd	Zd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     d	e	e
j                     d
e	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e
j                     de	ee
j                        de	e   de	e   de	e   de	e   deee
j                     ef   fd       Z xZS )	BertModelrn   r4  c                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        |j                  | _
        |j                  | _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r{   r|   r]   rn   r   rH  encoderre  poolerr  attn_implementationrs   	post_init)r   r]   add_pooling_layerr   s      r0   r|   zBertModel.__init__c  si    
 	 (0"6*,=j(4#)#>#> '-'E'E$ 	r   c                 .    | j                   j                  S r   r   r   rx  s    r0   get_input_embeddingszBertModel.get_input_embeddingsv  s    ...r   c                 &    || j                   _        y r   r  )r   r   s     r0   set_input_embeddingszBertModel.set_input_embeddingsy  s    */'r   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  rM  r8  r!  )r   heads_to_prunerM  r  s       r0   _prune_headszBertModel._prune_heads|  sE    
 +002 	CLE5LLu%//;;EB	Cr   r   r   rx   ru   r   r   r   r   rO  r   r   rP  rQ  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                   j                  r|
|
n| j                   j
                  }
nd}
||t        d      |#| j                  ||       |j                         }n!||j                         d d }nt        d      |\  }}||j                  n|j                  }|	|	d   d   j                  d   nd}|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  ||      }|}n&t        j                   |t        j"                  |      }| j                  |||||	      }|t        j$                  |||z   f|
      }| j&                  dk(  xr | j(                  dk(  xr	 |d u xr | }|rQ|j+                         dk(  r>| j                   j                  rt-        ||||      }n+t/        ||j0                  |      }n| j3                  ||      }| j                   j                  rs|q|j                         \  }}}||f}|t        j$                  ||
      }|r,|j+                         dk(  rt/        ||j0                  |      }n| j5                  |      }nd }| j7                  || j                   j8                        }| j;                  ||||||	|
|||
      }|d   }| j<                  | j=                  |      nd }|s
||f|dd  z   S t?        |||j@                  |jB                  |jD                  |jF                        S )NFzDYou cannot specify both input_ids and inputs_embeds at the same timerv   z5You have to specify either input_ids or inputs_embedsr   r<   rx   r   )r   ru   rx   r   r   )r   r  rt   )r  )	r   r   r   r   rO  r   r   rP  rQ  r#   )rU  pooler_outputrO  r   rV  rW  )$r]   r   rP  use_return_dictr   r   rW   %warn_if_padding_and_no_attention_maskr   r   rV   r   r   rx   r   rY   r   r   onesr  rs   r   r   r   rz   get_extended_attention_maskinvert_attention_maskget_head_maskrL  r  r  r   rO  r   rV  rW  ) r   r   r   rx   ru   r   r   r   r   rO  r   r   rP  rQ  r   
batch_sizer   r   r   r   r   embedding_outputuse_sdpa_attention_masksextended_attention_maskencoder_batch_sizeencoder_sequence_lengthr  encoder_hidden_shapeencoder_extended_attention_maskencoder_outputsr  rk  s                                    r0   r   zBertModel.forward  s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z??%)'#9 + 
 !"ZZZBX5X(YbhiN $$. &,,
:&T!& &%	 	! $(:(:(<(A {{%%*T"$*	+' +N"$4$:$:J+' '+&F&F~Wb&c# ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&',B,F,F,HA,M 3V*,<,B,BJ3/ 372L2LMc2d/.2+ &&y$++2O2OP	,,2"7#B+/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
r   )T)NNNNNNNNNNNNN)r   r   r   _no_split_modulesr|   r  r  r  r    r   rY   r   r   r   r   r   r   r   r   r   r   s   @r0   r  r  T  sx    *;7&/0C  -11515/3,0048<9==A$(,0/3&*L
ELL)L
 !.L
 !.	L

 u||,L
 ELL)L
  -L
  (5L
 !) 6L
 "$u'8'8"9:L
 D>L
 $D>L
 'tnL
 d^L
 
uU\\"$PP	QL
 L
r   r  z
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    c                       e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee	j                     dee   dee   dee   deee	j                     ef   fd       Z xZS )BertForPreTrainingpredictions.decoder.biascls.predictions.decoder.weightc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )r{   r|   r  r  r  clsr  r   s     r0   r|   zBertForPreTraining.__init__  s4     f%	'/ 	r   c                 B    | j                   j                  j                  S r   r  r~  ru  rx  s    r0   get_output_embeddingsz(BertForPreTraining.get_output_embeddings&      xx##+++r   c                     || j                   j                  _        |j                  | j                   j                  _        y r   r  r~  ru  r8   r   new_embeddingss     r0   set_output_embeddingsz(BertForPreTraining.set_output_embeddings)  ,    '5$$2$7$7!r   r   r   rx   ru   r   r   labelsnext_sentence_labelr   rP  rQ  r   c                 
   ||n| j                   j                  }| j                  |||||||	|
|	      }|dd \  }}| j                  ||      \  }}d}|u|st	               } ||j                  d| j                   j                        |j                  d            } ||j                  dd      |j                  d            }||z   }|s||f|dd z   }||f|z   S |S t        ||||j                  |j                        S )am  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
            pair (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        Nr   rx   ru   r   r   r   rP  rQ  r<   rv   )r  r  r  r   rV  )
r]   r  r  r  r   r   r~   r  r   rV  )r   r   r   rx   ru   r   r   r  r  r   rP  rQ  r   r  rk  r  r  
total_lossloss_fctmasked_lm_lossnext_sentence_lossr  s                         r0   r   zBertForPreTraining.forward-  sG   V &1%<k$++B]B]))))%'/!5#  

 *1!&48HH_m4\11
"5"A')H%&7&<&<RAWAW&XZ`ZeZefhZijN!)*@*E*Eb!*LNaNfNfgiNj!k'*<<J')?@712;NF/9/EZMF*Q6Q'/$:!//))
 	
r   NNNNNNNNNNN)r   r   r   _tied_weights_keysr|   r  r  r    r   rY   r   r   r   r   r  r   r   r   s   @r0   r  r    sC    56VW,8  -11515/3,004)-6:,0/3&*L
ELL)L
 !.L
 !.	L

 u||,L
 ELL)L
  -L
 &L
 &ell3L
 $D>L
 'tnL
 d^L
 
uU\\"$<<	=L
 L
r   r  zP
    Bert Model with a `language modeling` head on top for CLM fine-tuning.
    c            "           e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee	j                     dee	j                     deee	j                        dee   dee   dee   dee   deee	j                     ef   fd       Zd Z xZS )BertLMHeadModelzcls.predictions.decoder.biasr  c                     t         |   |       |j                  st        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzLIf you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`Fr  
r{   r|   r   rB   warningr  r  r|  r  r  r   s     r0   r|   zBertLMHeadModel.__init__  sL       NNijf>	"6* 	r   c                 B    | j                   j                  j                  S r   r  rx  s    r0   r  z%BertLMHeadModel.get_output_embeddings  r  r   c                     || j                   j                  _        |j                  | j                   j                  _        y r   r  r  s     r0   r  z%BertLMHeadModel.set_output_embeddings  r  r   r   r   rx   ru   r   r   r   r   r  rO  r   r   rP  rQ  r   c                    ||n| j                   j                  }|	d}| j                  |||||||||
||||      }|d   }| j                  |      }d}|	) | j                  ||	| j                   j
                  fi |}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
        NF)r   rx   ru   r   r   r   r   rO  r   r   rP  rQ  r   r<   )r  logitsrO  r   rV  rW  )r]   r  r  r  loss_functionr~   r   rO  r   rV  rW  )r   r   r   rx   ru   r   r   r   r   r  rO  r   r   rP  rQ  loss_kwargsr   r  r  lm_lossr  s                        r0   r   zBertLMHeadModel.forward  s   2 &1%<k$++B]B]I))))%'"7#9+/!5#  
  "!* HH_5(d(():FDKKDZDZj^ijG')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
r   c                 J    d}|D ]  }|t        fd|D              fz  } |S )Nr-   c              3   t   K   | ]/  }|j                  d j                  |j                               1 yw)r   N)index_selectr   r   )r.   
past_statebeam_idxs     r0   r1   z1BertLMHeadModel._reorder_cache.<locals>.<genexpr>  s.     nU_j--aZ=N=N1OPns   58)r[  )r   rO  r  reordered_past
layer_pasts     `  r0   _reorder_cachezBertLMHeadModel._reorder_cache  s=    ) 	Jncmnn N	 r   )NNNNNNNNNNNNNN)r   r   r   r  r|   r  r  r    r   rY   r   r   r   r   r   r   r   r  r   r   s   @r0   r  r  }  s}    9:Z[
,8  -11515/3,0048<9=)-8<$(,0/3&*>
ELL)>
 !.>
 !.	>

 u||,>
 ELL)>
  ->
  (5>
 !) 6>
 &>
 "$u||"45>
 D>>
 $D>>
 'tn>
 d^>
" 
uU\\"$EE	F#>
 >
@r   r  c                       e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee   dee   dee   deee	j                     ef   fd       ZddZedefd       Z xZS )BertForMaskedLMr  r  c                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzkIf you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr  r  r   s     r0   r|   zBertForMaskedLM.__init__  sR     NN1
 f>	"6* 	r   c                 B    | j                   j                  j                  S r   r  rx  s    r0   r  z%BertForMaskedLM.get_output_embeddings  r  r   c                     || j                   j                  _        |j                  | j                   j                  _        y r   r  r  s     r0   r  z%BertForMaskedLM.set_output_embeddings  r  r   r   r   rx   ru   r   r   r   r   r  r   rP  rQ  r   c                    ||n| j                   j                  }| j                  |||||||||
||      }|d   }| j                  |      }d}|	Ft	               } ||j                  d| j                   j                        |	j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        N)
r   rx   ru   r   r   r   r   r   rP  rQ  r   rv   r<   r  r  r   rV  )
r]   r  r  r  r   r   r~   r   r   rV  )r   r   r   rx   ru   r   r   r   r   r  r   rP  rQ  r   r  r  r  r  r  s                      r0   r   zBertForMaskedLM.forward  s    . &1%<k$++B]B]))))%'"7#9/!5#  
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r   c                    |j                   }|d   }| j                  j                  t        d      t	        j
                  ||j                  |j                   d   df      gd      }t	        j                  |df| j                  j                  t        j                  |j                        }t	        j
                  ||gd      }||dS )Nr   z.The PAD token should be defined for generationr#   rv   r   r   )r   r   )
rV   r]   r   rW   rY   r   	new_zerosfullr   r   )r   r   r   model_kwargsr   effective_batch_sizedummy_tokens          r0   prepare_inputs_for_generationz-BertForMaskedLM.prepare_inputs_for_generation6  s    oo*1~ ;;##+MNNNN4L4LnNbNbcdNeghMi4j#kqstjj!1%t{{'?'?uzzZcZjZj
 IIy+6A>	&.IIr   c                      y)z
        Legacy correction: BertForMaskedLM can't call `generate()` from `GenerationMixin`, even though it has a
        `prepare_inputs_for_generation` method.
        Fr-   )r  s    r0   can_generatezBertForMaskedLM.can_generateF  s     r   )NNNNNNNNNNNNr   )r   r   r   r  r|   r  r  r    r   rY   r   r   r   r   r   r   r  classmethodr  r   r   s   @r0   r  r    sj   46VW,8  -11515/3,0048<9=)-,0/3&*7
ELL)7
 !.7
 !.	7

 u||,7
 ELL)7
  -7
  (57
 !) 67
 &7
 $D>7
 'tn7
 d^7
 
uU\\"N2	37
 7
rJ  T  r   r  zT
    Bert Model with a `next sentence prediction (classification)` head on top.
    c                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   dee   de	e
ej                     ef   fd       Z xZS )BertForNextSentencePredictionc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )r{   r|   r  r  r  r  r  r   s     r0   r|   z&BertForNextSentencePrediction.__init__U  s4     f%	"6* 	r   r   r   rx   ru   r   r   r  r   rP  rQ  r   c                    d|v r+t        j                  dt               |j                  d      }|
|
n| j                  j
                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }d}|2t               } ||j                  dd      |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )	a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```
        r  zoThe `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.Nr  r#   rv   r<   r  )warningswarnFutureWarningpopr]   r  r  r  r   r   r   r   rV  )r   r   r   rx   ru   r   r   r  r   rP  rQ  kwargsr   rk  seq_relationship_scoresr  r  r  s                     r0   r   z%BertForNextSentencePrediction.forward^  s   T !F*MM%
 ZZ 56F%0%<k$++B]B]))))%'/!5#  

  
"&((="9!')H!)*A*F*Fr1*Mv{{[]!_-/'!"+=F7I7U')F2a[aa*#*!//))	
 	
r   
NNNNNNNNNN)r   r   r   r|   r    r   rY   r   r   r   r   r   r   r   r   s   @r0   r
  r
  O  s     -11515/3,004)-,0/3&*Q
ELL)Q
 !.Q
 !.	Q

 u||,Q
 ELL)Q
  -Q
 &Q
 $D>Q
 'tnQ
 d^Q
 
uU\\"$??	@Q
 Q
r   r
  z
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   dee   de	e
ej                     ef   fd       Z xZS )BertForSequenceClassificationc                 n   t         |   |       |j                  | _        || _        t	        |      | _        |j                  |j                  n|j                  }t        j                  |      | _
        t        j                  |j                  |j                        | _        | j                          y r   )r{   r|   
num_labelsr]   r  r  classifier_dropoutr   r	   r   r   r   r   r;   r  r   r]   r  r   s      r0   r|   z&BertForSequenceClassification.__init__  s      ++f%	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	r   r   r   rx   ru   r   r   r  r   rP  rQ  r   c                 @   |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|| j                   j
                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j
                  dk(  rIt               }| j                  dk(  r& ||j                         |j                               }n |||      }n| j                   j
                  dk(  r=t               } ||j                  d| j                        |j                  d            }n,| j                   j
                  dk(  rt               } |||      }|
s|f|dd z   }||f|z   S |S t!        |||j"                  |j$                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r#   
regressionsingle_label_classificationmulti_label_classificationrv   r<   r  )r]   r  r  r   r;   problem_typer  rz   rY   r   rT   r   squeezer   r   r
   r   r   rV  )r   r   r   rx   ru   r   r   r  r   rP  rQ  r   rk  r  r  r  r  s                    r0   r   z%BertForSequenceClassification.forward  s   ( &1%<k$++B]B]))))%'/!5#  

  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r   r  )r   r   r   r|   r    r   rY   r   r   r   r   r   r   r   r   s   @r0   r  r    s     -11515/3,004)-,0/3&*E
ELL)E
 !.E
 !.	E

 u||,E
 ELL)E
  -E
 &E
 $D>E
 'tnE
 d^E
 
uU\\"$<<	=E
 E
r   r  c                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   dee   de	e
ej                     ef   fd       Z xZS )BertForMultipleChoicec                 *   t         |   |       t        |      | _        |j                  |j                  n|j
                  }t        j                  |      | _        t        j                  |j                  d      | _        | j                          y )Nr#   )r{   r|   r  r  r  r   r	   r   r   r   r   r;   r  r  s      r0   r|   zBertForMultipleChoice.__init__  su     f%	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$6: 	r   r   r   rx   ru   r   r   r  r   rP  rQ  r   c                 L   |
|
n| j                   j                  }
||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	|
	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|t               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr#   rv   r   r  r<   r  )r]   r  rV   r   r   r  r   r;   r   r   r   rV  )r   r   r   rx   ru   r   r   r  r   rP  rQ  num_choicesr   rk  r  reshaped_logitsr  r  r  s                      r0   r   zBertForMultipleChoice.forward!  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ))))%'/!5#  

  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r   r  )r   r   r   r|   r    r   rY   r   r   r   r   r   r   r   r   s   @r0   r!  r!    s     -11515/3,004)-,0/3&*X
ELL)X
 !.X
 !.	X

 u||,X
 ELL)X
  -X
 &X
 $D>X
 'tnX
 d^X
 
uU\\"$==	>X
 X
r   r!  c                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   dee   de	e
ej                     ef   fd       Z xZS )BertForTokenClassificationc                 d   t         |   |       |j                  | _        t        |d      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y NFr  )r{   r|   r  r  r  r  r   r	   r   r   r   r   r;   r  r  s      r0   r|   z#BertForTokenClassification.__init__  s      ++f>	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	r   r   r   rx   ru   r   r   r  r   rP  rQ  r   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   rv   r<   r  )r]   r  r  r   r;   r   r   r  r   r   rV  )r   r   r   rx   ru   r   r   r  r   rP  rQ  r   r  r  r  r  r  s                    r0   r   z"BertForTokenClassification.forward  s    $ &1%<k$++B]B]))))%'/!5#  

 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r   r  )r   r   r   r|   r    r   rY   r   r   r   r   r   r   r   r   s   @r0   r'  r'  }  s     -11515/3,004)-,0/3&*2
ELL)2
 !.2
 !.	2

 u||,2
 ELL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
uU\\"$99	:2
 2
r   r'  c                   ~    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee   dee   dee   de	e
ej                     ef   fd       Z xZS )BertForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r)  )
r{   r|   r  r  r  r	   r   r   
qa_outputsr  r   s     r0   r|   z!BertForQuestionAnswering.__init__  sU      ++f>	))F$6$68I8IJ 	r   r   r   rx   ru   r   r   start_positionsend_positionsr   rP  rQ  r   c                 (   ||n| j                   j                  }| j                  |||||||	|
|	      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr  r   r#   rv   r   )ignore_indexr<   )r  start_logits
end_logitsr   rV  )r]   r  r  r.  rM   r  r   rS   r   clampr   r   r   rV  )r   r   r   rx   ru   r   r   r/  r0  r   rP  rQ  r   r  r  r3  r4  r  ignored_indexr  
start_lossend_lossr  s                          r0   r   z BertForQuestionAnswering.forward  s    &1%<k$++B]B]))))%'/!5#  

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r   r  )r   r   r   r|   r    r   rY   r   r   r   r   r   r   r   r   s   @r0   r,  r,    s     -11515/3,0042604,0/3&*>
ELL)>
 !.>
 !.	>

 u||,>
 ELL)>
  ->
 "%,,/>
  ->
 $D>>
 'tn>
 d^>
 
uU\\"$@@	A>
 >
r   r,  )r  r!  r
  r  r,  r  r'  r4  r  r  r  rl   )Sr   r   rD   r  dataclassesr   typingr   r   r   r   rY   torch.utils.checkpoint	packagingr   r	   torch.nnr
   r   r   activationsr   
generationr   modeling_attn_mask_utilsr   r   modeling_outputsr   r   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r    r!   r"   configuration_bertr$   
get_loggerr   rB   rl   Modulern   r   r   r  r  r  r&  r0  r4  rH  re  rm  rr  r|  r  r  r  r  r  r  r  r  r
  r  r!  r'  r,  __all__r-   r   r0   <module>rI     s      	  ! / /     A A ! ) w
 
 
 . l l L L * 
		H	%FR=RYY =@C		 CLb- bJRYY  ! 0BII 0fryy  S		 SlZ
")) Z
z ")) "299 .!bii !&bii &	9299 	9 %/ % %4 :{ : :B 	q
# q
q
h `
, `
`
F 
])? ]
]@ i) i iX 
\
$7 \

\
~ V
$7 V
V
r g
/ g
 g
T B
!4 B
 B
J J
2 J
 J
Zr   