
    Uh                        d Z ddlZddlmZmZmZ ddlZddlZddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'  ejP                  e)      Z* G d dejV                        Z, G d dejV                        Z- G d dejV                        Z. G d dejV                        Z/ G d dejV                        Z0 G d dejV                        Z1 G d dejV                        Z2 G d dejV                        Z3 G d d ejV                        Z4e G d! d"e             Z5e G d# d$e5             Z6e G d% d&e5             Z7 G d' d(ejV                        Z8 ed)*       G d+ d,e5             Z9e G d- d.e5             Z:e G d/ d0e5             Z; G d1 d2ejV                        Z<e G d3 d4e5             Z=d7d5Z>g d6Z?y)8zPyTorch I-BERT model.    N)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )gelu))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )IBertConfig)IntGELUIntLayerNorm
IntSoftmaxQuantActQuantEmbeddingQuantLinearc                   2     e Zd ZdZ fdZ	 ddZd Z xZS )IBertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           |j                  | _        d| _        d| _        d| _        d| _        d| _        t        |j                  |j                  |j                  | j                  | j                        | _        t        |j                  |j                  | j                  | j                        | _        | j                  dt!        j"                  |j$                        j'                  d      d	
       t)        |dd      | _        |j                  | _        t        |j$                  |j                  | j,                  | j                  | j                        | _        t1        | j                  | j                        | _        t1        | j                  | j                        | _        t7        |j                  |j8                  | j                  | j                  |j:                        | _        t1        | j
                  | j                        | _        tA        jB                  |jD                        | _#        y )N             )padding_idx
weight_bit
quant_mode)r(   r)   position_ids)r   F)
persistentposition_embedding_typeabsoluter)   eps
output_bitr)   force_dequant)$super__init__r)   embedding_bitembedding_act_bitact_bitln_input_bitln_output_bitr   
vocab_sizehidden_sizepad_token_idword_embeddingstype_vocab_sizetoken_type_embeddingsregister_buffertorcharangemax_position_embeddingsexpandgetattrr-   r'   position_embeddingsr   embeddings_act1embeddings_act2r   layer_norm_epsr3   	LayerNormoutput_activationr   Dropouthidden_dropout_probdropoutselfconfig	__class__s     z/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/ibert/modeling_ibert.pyr5   zIBertEmbeddings.__init__5   s    ++!#-++)) 
 &4""F$6$64CUCUbfbqbq&
"
 	ELL)G)GHOOPWXej 	 	
 (/v7PR\']$ "..#1**(())$
   ((>(>4??['(>(>4??[ &%%)) ..
 "*$,,4??!Szz&"<"<=    c                    |D|1t        || j                  |      j                  |j                        }n| j	                  |      }||j                         }n|j                         d d }|:t        j                  |t        j                  | j                  j                        }|| j                  |      \  }}nd }| j                  |      \  }}	| j                  ||||	      \  }
}| j                  dk(  r,| j                  |      \  }}| j                  |
|||      \  }
}| j                  |
|      \  }
}| j!                  |
      }
| j#                  |
|      \  }
}|
|fS )Nr+   dtypedeviceidentityidentity_scaling_factorr.   )"create_position_ids_from_input_idsr'   torY   &create_position_ids_from_inputs_embedssizerB   zeroslongr*   r>   r@   rH   r-   rG   rK   rO   rL   )rQ   	input_idstoken_type_idsr*   inputs_embedspast_key_values_lengthinput_shapeinputs_embeds_scaling_factorr@   $token_type_embeddings_scaling_factor
embeddingsembeddings_scaling_factorrG   "position_embeddings_scaling_factors                 rT   forwardzIBertEmbeddings.forwardi   s    $At//1G "Y%%&   $JJ=Y #..*K',,.s3K!"[[EJJtO`O`OgOghN :>:N:Ny:Y7M7+/(FJF`F`aoFpCC040D0D(*$H	 1E 1
-
- '':5FJF^F^_kFlC!C484H4H),(J	 5I 51J1 15zKd0e-
-\\*-
040F0FzSl0m-
-444rU   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr+   r   rW   r   )r`   rB   rC   r'   rb   rY   	unsqueezerE   )rQ   re   rg   sequence_lengthr*   s        rT   r_   z6IBertEmbeddings.create_position_ids_from_inputs_embeds   s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<rU   )NNNNr   )__name__
__module____qualname____doc__r5   rm   r_   __classcell__rS   s   @rT   r!   r!   0   s     2>j rs-5^=rU   r!   c                   2     e Zd Z fdZd Z	 	 	 ddZ xZS )IBertSelfAttentionc           	         t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        d| _        d| _        d| _	        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        |j                  | j                  d| j                  | j                  | j                  d	      | _        t        |j                  | j                  d| j                  | j                  | j                  d	      | _        t        |j                  | j                  d| j                  | j                  | j                  d	      | _        t#        | j                  | j                  
      | _        t#        | j                  | j                  
      | _        t#        | j                  | j                  
      | _        t#        | j                  | j                  
      | _        t-        j.                  |j0                        | _        t5        |dd      | _        | j6                  dk7  rt        d      t9        | j                  | j                  |j:                        | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r#   r&   Tbiasr(   bias_bitr)   per_channelr/   r-   r.   zDI-BERT only supports 'absolute' for `config.position_embedding_type`r)   r3   )r4   r5   r<   num_attention_headshasattr
ValueErrorr)   r(   r~   r8   intattention_head_sizeall_head_sizer   querykeyvaluer   query_activationkey_activationvalue_activationrL   r   rM   attention_probs_dropout_probrO   rF   r-   r   r3   softmaxrP   s     rT   r5   zIBertSelfAttention.__init__   s?    : ::a?PVXhHi#F$6$6#7 8 445Q8  !++#)#=#= #&v'9'9F<V<V'V#W !558P8PP !]]

 ]]
 !]]

 !)$// R&t||P ($// R!)$,,4??!Szz&"E"EF'.v7PR\']$'':5cdd!$,,4??Z`ZnZnorU   c                     |j                         d d | j                  | j                  fz   } |j                  | }|j	                  dddd      S )Nr+   r      r   r
   )r`   r   r   viewpermute)rQ   xnew_x_shapes      rT   transpose_for_scoresz'IBertSelfAttention.transpose_for_scores   sN    ffhsmt'?'?AYAY&ZZAFFK yyAq!$$rU   c                    | j                  ||      \  }}| j                  ||      \  }}	| j                  ||      \  }
}| j                  ||      \  }}| j	                  ||	      \  }}| j                  |
|      \  }}| j                  |      }| j                  |      }| j                  |      }t        j                  ||j                  dd            }t        j                  | j                        }||z  }| j                  r	||z  |z  }nd }|||z   }| j                  ||      \  }}| j                  |      }|||z  }t        j                  ||      }|||z  }nd }|j!                  dddd      j#                         }|j%                         d d | j&                  fz   } |j(                  | }| j+                  ||      \  }}|r||fn|f}|r||fn|f}||fS )Nr+   r   r   r   r
   )r   r   r   r   r   r   r   rB   matmul	transposemathsqrtr   r)   r   rO   r   
contiguousr`   r   r   rL   )rQ   hidden_stateshidden_states_scaling_factorattention_mask	head_maskoutput_attentionsmixed_query_layer mixed_query_layer_scaling_factormixed_key_layermixed_key_layer_scaling_factormixed_value_layer mixed_value_layer_scaling_factorquery_layerquery_layer_scaling_factor	key_layerkey_layer_scaling_factorvalue_layervalue_layer_scaling_factorattention_scoresscaleattention_scores_scaling_factorattention_probsattention_probs_scaling_factorcontext_layercontext_layer_scaling_factornew_context_layer_shapeoutputsoutput_scaling_factors                               rT   rm   zIBertSelfAttention.forward   sY    ?CjjXt>u;;:>((=Rn:o77>BjjXt>u;; 372G2G?3
// /3.A.A/Sq.r+	+262G2G?3
//
 //<--i8	//< !<<Y5H5HR5PQ		$223+e3??.HKc.cfk.k+.2+%/.@ ;?,,=;
77 ,,7  -	9O_kB)5+ILf+f(+/(%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD 7;6L6L77
33 7H=/2mM] ! *+IJ.0 	 ---rU   NNF)rq   rr   rs   r5   r   rm   ru   rv   s   @rT   rx   rx      s     8pt% K.rU   rx   c                   $     e Zd Z fdZd Z xZS )IBertSelfOutputc           	         t         |           |j                  | _        d| _        d| _        d| _        d| _        d| _        t        |j                  |j                  d| j                  | j
                  | j                  d      | _
        t        | j                  | j                        | _        t        |j                  |j                  | j                  | j                  |j                        | _        t        | j                  | j                        | _        t%        j&                  |j(                        | _        y Nr#   r&   r%   Tr|   r/   r0   )r4   r5   r)   r8   r(   r~   r9   r:   r   r<   denser   ln_input_actr   rJ   r3   rK   rL   r   rM   rN   rO   rP   s     rT   r5   zIBertSelfOutput.__init__9  s     ++ ]]

 %T%6%64??S%%%)) ..
 "*$,,4??!Szz&"<"<=rU   c                     | j                  ||      \  }}| j                  |      }| j                  ||||      \  }}| j                  ||      \  }}| j	                  ||      \  }}||fS NrZ   r   rO   r   rK   rL   rQ   r   r   input_tensorinput_tensor_scaling_factors        rT   rm   zIBertSelfOutput.forwardV      6:jjPl6m33]36:6G6G(!$?	 7H 7
33 7;nn]Tp6q336:6L6L77
33 :::rU   rq   rr   rs   r5   rm   ru   rv   s   @rT   r   r   8      >:;rU   r   c                   2     e Zd Z fdZd Z	 	 	 ddZ xZS )IBertAttentionc                     t         |           |j                  | _        t        |      | _        t        |      | _        t               | _        y N)	r4   r5   r)   rx   rQ   r   outputsetpruned_headsrP   s     rT   r5   zIBertAttention.__init__h  s=     ++&v.	%f-ErU   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   dim)lenr   rQ   r   r   r   r   r   r   r   r   r   r   union)rQ   headsindexs      rT   prune_headszIBertAttention.prune_headso  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:rU   c                     | j                  |||||      \  }}| j                  |d   |d   ||      \  }}	|f|dd  z   }
|	f|dd  z   }|
|fS )Nr   r   )rQ   r   )rQ   r   r   r   r   r   self_outputsself_outputs_scaling_factorattention_outputattention_output_scaling_factorr   outputs_scaling_factors               rT   rm   zIBertAttention.forward  s     59II(5
11 =AKKO8;]Lh=
99 $%QR(88"A!CFabcbdFe!e...rU   r   )rq   rr   rs   r5   r   rm   ru   rv   s   @rT   r   r   g  s    ";, /rU   r   c                   $     e Zd Z fdZd Z xZS )IBertIntermediatec           	         t         |           |j                  | _        d| _        d| _        d| _        t        |j                  |j                  d| j                  | j
                  | j                  d      | _	        |j                  dk7  rt        d      t        | j                  |j                        | _        t        | j                  | j                        | _        y )	Nr#   r&   Tr|   r   z3I-BERT only supports 'gelu' for `config.hidden_act`r   r/   )r4   r5   r)   r8   r(   r~   r   r<   intermediate_sizer   
hidden_actr   r   r3   intermediate_act_fnr   rL   rP   s     rT   r5   zIBertIntermediate.__init__  s     ++ $$]]

 &RSS#*dooU[UiUi#j !)$,,4??!SrU   c                     | j                  ||      \  }}| j                  ||      \  }}| j                  ||      \  }}||fS r   )r   r   rL   )rQ   r   r   s      rT   rm   zIBertIntermediate.forward  sa    6:jjPl6m336:6N6N77
33
 7;6L6L77
33 :::rU   r   rv   s   @rT   r   r     s    T(
;rU   r   c                   $     e Zd Z fdZd Z xZS )IBertOutputc           	         t         |           |j                  | _        d| _        d| _        d| _        d| _        d| _        t        |j                  |j                  d| j                  | j
                  | j                  d      | _        t        | j                  | j                        | _        t        |j                  |j                  | j                  | j                  |j                         | _        t        | j                  | j                        | _        t'        j(                  |j*                        | _        y r   )r4   r5   r)   r8   r(   r~   r9   r:   r   r   r<   r   r   r   r   rJ   r3   rK   rL   r   rM   rN   rO   rP   s     rT   r5   zIBertOutput.__init__  s     ++ $$]]

 %T%6%64??S%%%)) ..
 "*$,,4??!Szz&"<"<=rU   c                     | j                  ||      \  }}| j                  |      }| j                  ||||      \  }}| j                  ||      \  }}| j	                  ||      \  }}||fS r   r   r   s        rT   rm   zIBertOutput.forward  r   rU   r   rv   s   @rT   r   r     r   rU   r   c                   2     e Zd Z fdZ	 	 	 ddZd Z xZS )
IBertLayerc                 X   t         |           |j                  | _        d| _        d| _        t        |      | _        t        |      | _        t        |      | _
        t        | j                  | j                        | _        t        | j                  | j                        | _        y )Nr#   r   r/   )r4   r5   r)   r8   seq_len_dimr   	attentionr   intermediater   r   r   pre_intermediate_actpre_output_actrP   s     rT   r5   zIBertLayer.__init__  s}     ++'/-f5!&)$,T\\doo$V!&t||PrU   c                     | j                  |||||      \  }}|d   }|d   }	|dd  }
| j                  ||	      \  }}|f|
z   }
|
S )N)r   r   r   )r   feed_forward_chunk)rQ   r   r   r   r   r   self_attention_outputs%self_attention_outputs_scaling_factorr   r   r   layer_outputlayer_output_scaling_factors                rT   rm   zIBertLayer.forward  s     IM(/ IW I
E E 2!4*OPQ*R'(,484K4K=5
11  /G+rU   c                     | j                  ||      \  }}| j                  ||      \  }}| j                  ||      \  }}| j                  ||||      \  }}||fS r   )r   r   r   r   )rQ   r   r   intermediate_output"intermediate_output_scaling_factorr   r   s          rT   r   zIBertLayer.feed_forward_chunk  s    <@<U<U==
99 CGBSBS=C
?? CGBUBU!CC
?? 59KK!CEUWv5
11 888rU   r   )rq   rr   rs   r5   rm   r   ru   rv   s   @rT   r   r     s    Q" 69rU   r   c                   0     e Zd Z fdZ	 	 	 	 	 ddZ xZS )IBertEncoderc                     t         |           || _        |j                  | _        t	        j
                  t        |j                        D cg c]  }t        |       c}      | _	        y c c}w r   )
r4   r5   rR   r)   r   
ModuleListrangenum_hidden_layersr   layer)rQ   rR   _rS   s      rT   r5   zIBertEncoder.__init__$  sP     ++]]fF^F^@_#`1Jv$6#`a
#`s   A-c                    |rdnd }|rdnd }	d }
d }t        | j                        D ]3  \  }}|r||fz   }|||   nd } ||||||      }|d   }|s+|	|d   fz   }	5 |r||fz   }|st        d ||||	|
fD              S t        ||||	|
      S )N r   r   c              3   $   K   | ]  }|| 
 y wr   r  ).0vs     rT   	<genexpr>z'IBertEncoder.forward.<locals>.<genexpr>O  s      
 = 
s   )last_hidden_statepast_key_valuesr   
attentionscross_attentions)	enumerater   tupler   )rQ   r   r   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_masklayer_outputss                   rT   rm   zIBertEncoder.forward*  s     #7BD$5b4#!(4 	POA|#$58H$H!.7.CilO(,!M *!,M &9]1=M<O&O#!	P$   1]4D D 
 "&%'(
 
 
 9+.+*1
 	
rU   )NNFFTr   rv   s   @rT   r   r   #  s    b "6
rU   r   c                   $     e Zd Z fdZd Z xZS )IBertPoolerc                     t         |           |j                  | _        t        j                  |j
                  |j
                        | _        t        j                         | _        y r   )	r4   r5   r)   r   Linearr<   r   Tanh
activationrP   s     rT   r5   zIBertPooler.__init__d  sF     ++YYv1163E3EF
'')rU   c                 \    |d d df   }| j                  |      }| j                  |      }|S Nr   )r   r  )rQ   r   first_token_tensorpooled_outputs       rT   rm   zIBertPooler.forwardj  s6     +1a40

#566rU   r   rv   s   @rT   r  r  c  s    $rU   r  c                   "    e Zd ZeZdZd ZddZy)IBertPreTrainedModelibertc                    t        |t        t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        t        j                  f      rz|j                  j
                  j                  d| j                  j                         |j                  2|j                  j
                  |j                     j                          yyt        |t        t        j                  f      rJ|j                  j
                  j                          |j                  j
                  j!                  d       yt        |t"              r%|j                  j
                  j                          yy)zInitialize the weightsg        )meanstdNg      ?)
isinstancer   r   r  weightdatanormal_rR   initializer_ranger}   zero_r   	Embeddingr'   r   rK   fill_IBertLMHead)rQ   modules     rT   _init_weightsz"IBertPreTrainedModel._init_weightsx  s5   f{BII67 MM&&CT[[5R5R&S{{&  &&( ' >?MM&&CT[[5R5R&S!!-""6#5#56<<> .r|| <=KK""$MM$$S),KK""$ -rU   Nc                     t        d      )Nz6`resize_token_embeddings` is not supported for I-BERT.)NotImplementedError)rQ   new_num_tokenss     rT   resize_token_embeddingsz,IBertPreTrainedModel.resize_token_embeddings  s    !"Z[[rU   r   )rq   rr   rs   r   config_classbase_model_prefixr1  r5  r  rU   rT   r"  r"  s  s    L%$\rU   r"  c                   V    e Zd ZdZd fd	Zd Zd Zd Ze	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     d	e	e
j                     d
e	e
j                     de	e
j                     de	e   de	e   de	e   deeee
j                     f   fd       Z xZS )
IBertModela  

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    c                     t         |   |       || _        |j                  | _        t	        |      | _        t        |      | _        |rt        |      nd| _	        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r4   r5   rR   r)   r!   rj   r   encoderr  pooler	post_init)rQ   rR   add_pooling_layerrS   s      rT   r5   zIBertModel.__init__  sZ    
 	  ++)&1#F+->k&)D 	rU   c                 .    | j                   j                  S r   rj   r>   rQ   s    rT   get_input_embeddingszIBertModel.get_input_embeddings  s    ...rU   c                 &    || j                   _        y r   r@  )rQ   r   s     rT   set_input_embeddingszIBertModel.set_input_embeddings  s    */'rU   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr;  r   r   r   )rQ   heads_to_pruner   r   s       rT   _prune_headszIBertModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	CrU   rc   r   rd   r*   r   re   r   r  r  returnc
           	         ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  ||f|      }|&t        j                  |
t        j                  |      }| j                  ||
      }| j                  || j                   j                        }| j                  ||||      \  }}| j!                  |||||||	      }|d   }| j"                  | j#                  |      nd }|	s
||f|d	d  z   S t%        |||j&                  |j(                  |j*                  |j,                  
      S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer+   z5You have to specify either input_ids or inputs_embeds)rY   rW   )rc   r*   rd   re   )r   r   r   r  r  r   r   )r  pooler_outputr  r   r	  r
  )rR   r   r  use_return_dictr   %warn_if_padding_and_no_attention_maskr`   rY   rB   onesra   rb   get_extended_attention_maskget_head_maskr   rj   r;  r<  r   r  r   r	  r
  )rQ   rc   r   rd   r*   r   re   r   r  r  rg   
batch_size
seq_lengthrY   extended_attention_maskembedding_outputembedding_output_scaling_factorencoder_outputssequence_outputr   s                       rT   rm   zIBertModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!"[[EJJvVN 150P0PQ_al0m &&y$++2O2OP	<@OO%)'	 =L =
99 ,,+2/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
rU   )T)	NNNNNNNNN)rq   rr   rs   rt   r5   rB  rD  rH  r   r   rB   
LongTensorFloatTensorboolr   r   r   rm   ru   rv   s   @rT   r9  r9    s    "/0C  156:59371559,0/3&*K
E,,-K
 !!2!23K
 !!1!12	K

 u//0K
 E--.K
   1 12K
 $D>K
 'tnK
 d^K
 
;U5CTCT=UU	VK
 K
rU   r9  c                   r    e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee   dee   dee   deeee	j                     f   fd       Z xZS )IBertForMaskedLMzlm_head.decoder.biaszlm_head.decoder.weightc                     t         |   |       t        |d      | _        t	        |      | _        | j                          y NF)r>  )r4   r5   r9  r#  r/  lm_headr=  rP   s     rT   r5   zIBertForMaskedLM.__init__  s6     %@
"6* 	rU   c                 .    | j                   j                  S r   )r_  decoderrA  s    rT   get_output_embeddingsz&IBertForMaskedLM.get_output_embeddings  s    ||###rU   c                 \    || j                   _        |j                  | j                   _        y r   )r_  ra  r}   )rQ   new_embeddingss     rT   set_output_embeddingsz&IBertForMaskedLM.set_output_embeddings  s     -*//rU   rc   r   rd   r*   r   re   labelsr   r  r  rI  c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }d}|Ft	               } ||j                  d| j                   j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   rd   r*   r   re   r   r  r  r   r+   r   losslogitsr   r	  )
rR   rL  r#  r_  r   r   r;   r   r   r	  )rQ   rc   r   rd   r*   r   re   rf  r   r  r  r   rW  prediction_scoresmasked_lm_lossloss_fctr   s                    rT   rm   zIBertForMaskedLM.forward  s    ( &1%<k$++B]B]**))%'/!5#  

 "!* LL9')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
rU   
NNNNNNNNNN)rq   rr   rs   _tied_weights_keysr5   rb  re  r   r   rB   rX  rY  rZ  r   r   r   rm   ru   rv   s   @rT   r\  r\    s,   02JK$0  156:59371559-1,0/3&*1
E,,-1
 !!2!231
 !!1!12	1

 u//01
 E--.1
   1 121
 ))*1
 $D>1
 'tn1
 d^1
 
~uU%6%677	81
 1
rU   r\  c                   0     e Zd ZdZ fdZd ZddZ xZS )r/  z)I-BERT Head for masked language modeling.c                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _
        t        j                  t        j                  |j                              | _        | j                  | j                  _        y )N)r1   )r4   r5   r   r  r<   r   rK   rJ   
layer_normr;   ra  	ParameterrB   ra   r}   rP   s     rT   r5   zIBertLMHead.__init__S  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	 IIrU   c                     | j                  |      }t        |      }| j                  |      }| j                  |      }|S r   )r   r   rs  ra  )rQ   featureskwargsr   s       rT   rm   zIBertLMHead.forward\  s;    JJx GOOA LLOrU   c                     | j                   j                  j                  j                  dk(  r| j                  | j                   _        y | j                   j                  | _        y )Nmeta)ra  r}   rY   typerA  s    rT   _tie_weightszIBertLMHead._tie_weightsf  sC    <<##((F2 $		DLL ))DIrU   )rI  N)rq   rr   rs   rt   r5   rm   r{  ru   rv   s   @rT   r/  r/  P  s    3&*rU   r/  z
    I-BERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eeej                     f   fd       Z xZS )IBertForSequenceClassificationc                     t         |   |       |j                  | _        t        |d      | _        t        |      | _        | j                          y r^  )r4   r5   
num_labelsr9  r#  IBertClassificationHead
classifierr=  rP   s     rT   r5   z'IBertForSequenceClassification.__init__v  sC      ++%@
1&9 	rU   rc   r   rd   r*   r   re   rf  r   r  r  rI  c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|
s|f|d	d z   }||f|z   S |S t        |||j                   |j"                  
      S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nrh  r   r   
regressionsingle_label_classificationmulti_label_classificationr+   r   ri  )rR   rL  r#  r  problem_typer  rX   rB   rb   r   r	   squeezer   r   r   r   r   r	  rQ   rc   r   rd   r*   r   re   rf  r   r  r  r   rW  rk  rj  rn  r   s                    rT   rm   z&IBertForSequenceClassification.forward  s   ( &1%<k$++B]B]**))%'/!5#  

 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rU   ro  )rq   rr   rs   r5   r   r   rB   rX  rY  rZ  r   r   r   rm   ru   rv   s   @rT   r~  r~  o  s$     156:59371559-1,0/3&*B
E,,-B
 !!2!23B
 !!1!12	B

 u//0B
 E--.B
   1 12B
 ))*B
 $D>B
 'tnB
 d^B
 
'u/@/@)AA	BB
 B
rU   r~  c                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eeej                     f   fd       Z xZS )IBertForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y )Nr   )r4   r5   r9  r#  r   rM   rN   rO   r  r<   r  r=  rP   s     rT   r5   zIBertForMultipleChoice.__init__  sV     '
zz&"<"<=))F$6$6: 	rU   rc   rd   r   rf  r*   r   re   r   r  r  rI  c                 L   |
|
n| j                   j                  }
||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	|
	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|t               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr   r+   r   )r*   rd   r   r   re   r   r  r  r   ri  )rR   rL  shaper   r`   r#  rO   r  r   r   r   r	  )rQ   rc   rd   r   rf  r*   r   re   r   r  r  num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   r   rk  reshaped_logitsrj  rn  r   s                           rT   rm   zIBertForMultipleChoice.forward  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]CLCXINN2,>?^bLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 ***..,/!5#  

  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rU   ro  )rq   rr   rs   r5   r   r   rB   rX  rY  rZ  r   r   r   rm   ru   rv   s   @rT   r  r    s$     15596:-1371559,0/3&*W
E,,-W
 !!1!12W
 !!2!23	W

 ))*W
 u//0W
 E--.W
   1 12W
 $D>W
 'tnW
 d^W
 
(%0A0A*BB	CW
 W
rU   r  c                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eeej                     f   fd       Z xZS )IBertForTokenClassificationc                 0   t         |   |       |j                  | _        t        |d      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r^  )r4   r5   r  r9  r#  r   rM   rN   rO   r  r<   r  r=  rP   s     rT   r5   z$IBertForTokenClassification.__init__/  sk      ++%@
zz&"<"<=))F$6$68I8IJ 	rU   rc   r   rd   r*   r   re   rf  r   r  r  rI  c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nrh  r   r+   r   ri  )rR   rL  r#  rO   r  r   r   r  r   r   r	  r  s                    rT   rm   z#IBertForTokenClassification.forward:  s    $ &1%<k$++B]B]**))%'/!5#  

 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rU   ro  )rq   rr   rs   r5   r   r   rB   rX  rY  rZ  r   r   r   rm   ru   rv   s   @rT   r  r  -  s   	  156:59371559-1,0/3&*2
E,,-2
 !!2!232
 !!1!12	2

 u//02
 E--.2
   1 122
 ))*2
 $D>2
 'tn2
 d^2
 
$eE,=,=&>>	?2
 2
rU   r  c                   (     e Zd ZdZ fdZd Z xZS )r  z-Head for sentence-level classification tasks.c                 &   t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        y r   )r4   r5   r   r  r<   r   rM   rN   rO   r  out_projrP   s     rT   r5   z IBertClassificationHead.__init__s  s`    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHrU   c                     |d d dd d f   }| j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r  )rO   r   rB   tanhr  )rQ   rv  rw  r   s       rT   rm   zIBertClassificationHead.forwardy  s^     Aq)]3

=1

=1]3m4rU   )rq   rr   rs   rt   r5   rm   ru   rv   s   @rT   r  r  p  s    7IrU   r  c                   ~    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee	   dee	   dee	   de
eeej                     f   fd       Z xZS )IBertForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r^  )
r4   r5   r  r9  r#  r   r  r<   
qa_outputsr=  rP   s     rT   r5   z"IBertForQuestionAnswering.__init__  sU      ++%@
))F$6$68I8IJ 	rU   rc   r   rd   r*   r   re   start_positionsend_positionsr   r  r  rI  c                 (   ||n| j                   j                  }| j                  |||||||	|
|	      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nrh  r   r   r+   r   )ignore_indexr   )rj  start_logits
end_logitsr   r	  )rR   rL  r#  r  splitr  r   r   r`   clampr   r   r   r	  )rQ   rc   r   rd   r*   r   re   r  r  r   r  r  r   rW  rk  r  r  
total_lossignored_indexrn  
start_lossend_lossr   s                          rT   rm   z!IBertForQuestionAnswering.forward  s    &1%<k$++B]B]**))%'/!5#  

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rU   )NNNNNNNNNNN)rq   rr   rs   r5   r   r   rB   rX  rY  rZ  r   r   r   rm   ru   rv   s   @rT   r  r    s/     156:593715596:48,0/3&*>
E,,->
 !!2!23>
 !!1!12	>

 u//0>
 E--.>
   1 12>
 "%"2"23>
   0 01>
 $D>>
 'tn>
 d^>
 
+U53D3D-EE	F>
 >
rU   r  c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )aM  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's *utils.make_positions*.

    Args:
    input_ids (`torch.LongTensor`):
           Indices of input sequence tokens in the vocabulary.

    Returns: torch.Tensor
    r   r   )ner   rB   cumsumtype_asrb   )rc   r'   rf   maskincremental_indicess        rT   r]   r]     sW     <<$((*D <<!4<<TBE[[_cc##%33rU   )r\  r  r  r~  r  r9  r"  )r   )@rt   r   typingr   r   r   rB   torch.utils.checkpointr   torch.nnr   r   r	   activationsr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   configuration_ibertr   quant_modulesr   r   r   r   r   r   
get_loggerrq   loggerModuler!   rx   r   r   r   r   r   r   r  r"  r9  r\  r/  r~  r  r  r  r  r]   __all__r  rU   rT   <module>r     s	  $   ) )    A A    . Q , , c c 
		H	%w=bii w=tK. K.\,;bii ,;^./RYY ./b;		 ;D,;")) ,;^79 79t=
299 =
@"))   \? \ \4 u
% u
 u
p E
+ E
 E
P*")) *> N
%9 N
N
b c
1 c
 c
L ?
"6 ?
 ?
Dbii & J
 4 J
 J
Z4"rU   