
    Uh                    8   d Z ddlZddlmZmZmZmZ ddlZddlZddl	m
Z
 ddlmZ ddlmZmZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZmZmZmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z* ddl+m,Z,  e*jZ                  e.      Z/ G d dej`                        Z1 G d dej`                        Z2 G d de2      Z3 G d dej`                        Z4e2e3dZ5 G d dej`                        Z6 G d dej`                        Z7 G d dej`                        Z8 G d  d!ej`                        Z9 G d" d#ej`                        Z: G d$ d%ej`                        Z;e( G d& d'e"             Z< G d( d)ej`                        Z= G d* d+ej`                        Z>e( G d, d-e<             Z?e( G d. d/e<             Z@ e(d01       G d2 d3e<             ZAe( G d4 d5e<             ZBe( G d6 d7e<             ZCe( G d8 d9e<             ZD e(d:1       G d; d<e<e             ZEd?d=ZFg d>ZGy)@zPyTorch CamemBERT model.    N)ListOptionalTupleUnion)version)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNgelu)GenerationMixin)#_prepare_4d_attention_mask_for_sdpa*_prepare_4d_causal_attention_mask_for_sdpa))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringget_torch_versionlogging   )CamembertConfigc                   2     e Zd ZdZ fdZ	 ddZd Z xZS )CamembertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j'                  dt)        j*                  |j                        j-                  d      d       | j'                  d	t)        j.                  | j0                  j3                         t(        j4                  
      d       |j                  | _        t        j                  |j                  |j
                  | j6                        | _	        y )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r!   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr)   register_buffertorcharangeexpandzerosr+   sizelongr&   selfconfig	__class__s     /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/camembert/modeling_camembert.pyr2   zCamembertEmbeddings.__init__8   si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	

 "..#%<<**F,>,>DL\L\$
     c                    |+|t        || j                  |      }n| j                  |      }||j                         }n|j                         d d }|d   }|st	        | d      r-| j
                  d d d |f   }|j                  |d   |      }	|	}n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  dk(  r| j                  |      }||z  }| j!                  |      }| j#                  |      }|S )Nr,   r!   r.   r   r0   devicer*   )"create_position_ids_from_input_idsr&   &create_position_ids_from_inputs_embedsrG   hasattrr.   rE   rC   rF   rH   r+   rQ   r7   r;   r)   r9   r<   r@   )rJ   	input_idsr.   r+   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr;   
embeddingsr9   s                rM   forwardzCamembertEmbeddings.forwardQ   sR    $A)TM]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
rN   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr,   r!   rP   r   )rG   rC   rD   r&   rH   rQ   	unsqueezerE   )rJ   rV   rX   sequence_lengthr+   s        rM   rS   z:CamembertEmbeddings.create_position_ids_from_inputs_embedsy   s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<rN   )NNNNr   )__name__
__module____qualname____doc__r2   r]   rS   __classcell__rL   s   @rM   r$   r$   2   s    

4 rs&P=rN   r$   c                   P    e Zd Zd fd	Zdej
                  dej
                  fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     d	eej                     d
ee	e	ej                           dee
   de	ej
                     fdZ xZS )CamembertSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |xs t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rF|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        |j,                  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r)   r*   relative_keyrelative_key_query   r!   )r1   r2   r5   num_attention_headsrT   
ValueErrorintattention_head_sizeall_head_sizer   Linearquerykeyvaluer>   attention_probs_dropout_probr@   rA   r)   r8   r3   distance_embedding
is_decoderrJ   rK   r)   rL   s      rM   r2   zCamembertSelfAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++rN   xreturnc                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S )Nr,   r   rn   r!   r   )rG   ro   rr   viewpermute)rJ   r|   new_x_shapes      rM   transpose_for_scoresz+CamembertSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$rN   hidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 $   | j                  |      }|d u}	|	r||d   }
|d   }|}n |	rC| j                  | j                  |            }
| j                  | j                  |            }|}n|y| j                  | j                  |            }
| j                  | j                  |            }t	        j
                  |d   |
gd      }
t	        j
                  |d   |gd      }n@| j                  | j                  |            }
| j                  | j                  |            }| j                  |      }|d u}| j                  r|
|f}t	        j                  ||
j                  dd            }| j                  dk(  s| j                  dk(  r|j                  d   |
j                  d   }}|rDt	        j                  |dz
  t        j                  |j                  	      j                  dd      }n@t	        j                  |t        j                  |j                  	      j                  dd      }t	        j                  |t        j                  |j                  	      j                  dd      }||z
  }| j!                  || j"                  z   dz
        }|j%                  |j&                  
      }| j                  dk(  rt	        j(                  d||      }||z   }nE| j                  dk(  r6t	        j(                  d||      }t	        j(                  d|
|      }||z   |z   }|t+        j,                  | j.                        z  }|||z   }t0        j2                  j5                  |d      }| j7                  |      }|||z  }t	        j                  ||      }|j9                  dddd      j;                         }|j=                         d d | j>                  fz   }|j                  |      }|r||fn|f}| j                  r||fz   }|S )Nr   r!   rn   dimr,   rl   rm   rP   r/   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   ) ru   r   rv   rw   rC   catrz   matmul	transposer)   shapetensorrH   rQ   r   rD   ry   r8   tor0   einsummathsqrtrr   r   
functionalsoftmaxr@   r   
contiguousrG   rs   )rJ   r   r   r   r   r   r   r   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               rM   r]   zCamembertSelfAttention.forward   s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11GrN   NNNNNNF)ra   rb   rc   r2   rC   Tensorr   r   FloatTensorr   boolr]   re   rf   s   @rM   rh   rh      s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	crN   rh   c                       e Zd Zd fd	Z	 	 	 	 	 	 ddej
                  deej
                     deej                     deej                     deej                     deeeej                           dee	   d	eej
                     f fd
Z
 xZS )CamembertSdpaSelfAttentionc                     t         |   ||       |j                  | _        t	        j
                  t                     t	        j
                  d      k  | _        y )Nr)   z2.2.0)r1   r2   rx   dropout_probr   parser   require_contiguous_qkvr{   s      rM   r2   z#CamembertSdpaSelfAttention.__init__  sH    9PQ"??&-mm4E4G&H7==Y`Ka&a#rN   r   r   r   r   r   r   r   r}   c           	      p   | j                   dk7  s|s|*t        j                  d       t        |   |||||||      S |j                         \  }}	}
| j                  | j                  |            }|d u}|r|n|}|r|n|}|r*|r(|d   j                  d   |j                  d   k(  r|\  }}n|| j                  | j                  |            }| j                  | j                  |            }|:|s8t        j                  |d   |gd      }t        j                  |d   |gd      }| j                  r||f}| j                  rK|j                  j                   dk(  r2|0|j#                         }|j#                         }|j#                         }| j                  r|s	||	dkD  rdnd	}t        j$                  j&                  j)                  ||||| j*                  r| j,                  nd
|      }|j/                  dd      }|j1                  ||	| j2                        }|f}| j                  r||fz   }|S )Nr*   a  CamembertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   rn   r!   r   cudaTF        )	attn_mask	dropout_p	is_causal)r)   loggerwarning_oncer1   r]   rG   r   ru   r   rv   rw   rC   r   rz   r   rQ   typer   r   r   scaled_dot_product_attentiontrainingr   r   reshapers   )rJ   r   r   r   r   r   r   r   bsztgt_len_r   r   current_statesr   r   r   attn_outputr   rL   s                      rM   r]   z"CamembertSdpaSelfAttention.forward  s`    '':59JiNcH 7?%&!  (,,.Wa//

=0IJ 3$>2D.-3E/> .^A5F5L5LQ5OSaSgSghiSj5j%3"I{11$((>2JKI33DJJ~4NOK)2D!II~a'8)&D!L	#ii):K(HaP?? (5N
 &&;+=+=+B+Bf+LQ_Qk%002K!,,.I%002K OO,>>CY^ehi^iDot 	 hh))FF$+/==d''c G 
 "++Aq1!))#w8J8JK.?? 11GrN   r   r   )ra   rb   rc   r2   rC   r   r   r   r   r   r]   re   rf   s   @rM   r   r     s    b 2615=A>BDH,1[||[ !.[ E--.	[
  ((9(9:[ !)):): ;[ !uU->->'?!@A[ $D>[ 
u||	[ [rN   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )CamembertSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr'   )r1   r2   r   rt   r5   denser<   r=   r>   r?   r@   rI   s     rM   r2   zCamembertSelfOutput.__init__z  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rN   r   input_tensorr}   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r@   r<   rJ   r   r   s      rM   r]   zCamembertSelfOutput.forward  7    

=1]3}|'CDrN   ra   rb   rc   r2   rC   r   r]   re   rf   s   @rM   r   r   y  1    >U\\  RWR^R^ rN   r   )eagersdpac                       e Zd Zd fd	Zd Z	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dee	e	ej                           d	ee
   d
e	ej                     fdZ xZS )CamembertAttentionc                     t         |           t        |j                     ||      | _        t        |      | _        t               | _        y )Nr   )	r1   r2    CAMEMBERT_SELF_ATTENTION_CLASSES_attn_implementationrJ   r   outputsetpruned_headsr{   s      rM   r2   zCamembertAttention.__init__  sC    4V5P5PQ,C
	 *&1ErN   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r!   r   )lenr   rJ   ro   rr   r   r   ru   rv   rw   r   r   rs   union)rJ   headsindexs      rM   prune_headszCamembertAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:rN   r   r   r   r   r   r   r   r}   c           	      p    | j                  |||||||      }| j                  |d   |      }	|	f|dd  z   }
|
S )Nr   r!   )rJ   r   )rJ   r   r   r   r   r   r   r   self_outputsattention_outputr   s              rM   r]   zCamembertAttention.forward  sW     yy!"
  ;;|AF#%QR(88rN   r   r   )ra   rb   rc   r2   r   rC   r   r   r   r   r   r]   re   rf   s   @rM   r   r     s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	rN   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )CamembertIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r1   r2   r   rt   r5   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnrI   s     rM   r2   zCamembertIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$rN   r   r}   c                 J    | j                  |      }| j                  |      }|S r   )r   r   )rJ   r   s     rM   r]   zCamembertIntermediate.forward  s&    

=100?rN   r   rf   s   @rM   r   r     s#    9U\\ ell rN   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )CamembertOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r1   r2   r   rt   r   r5   r   r<   r=   r>   r?   r@   rI   s     rM   r2   zCamembertOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rN   r   r   r}   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      rM   r]   zCamembertOutput.forward  r   rN   r   rf   s   @rM   r   r     r   rN   r   c                       e Zd Z fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	eej
                     fd
Z
d Z xZS )CamembertLayerc                 f   t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        |d      | _	        t        |      | _        t        |      | _        y )Nr!   z> should be used as a decoder model if cross attention is addedr*   r   )r1   r2   chunk_size_feed_forwardseq_len_dimr   	attentionrz   add_cross_attentionrp   crossattentionr   intermediater   r   rI   s     rM   r2   zCamembertLayer.__init__  s    '-'E'E$+F3 ++#)#=#= ##?? D6)g!hii"4VU_"`D1&9%f-rN   r   r   r   r   r   r   r   r}   c           	         ||d d nd }| j                  |||||      }	|	d   }
| j                  r|	dd }|	d   }n|	dd  }d }| j                  rT|Rt        | d      st        d|  d      ||d	d  nd }| j	                  |
||||||      }|d   }
||dd z   }|d   }|z   }t        | j                  | j                  | j                  |
      }|f|z   }| j                  r|fz   }|S )
Nrn   )r   r   r   r!   r,   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r   rz   rT   rp   r   r   feed_forward_chunkr   r   )rJ   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsr   r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    rM   r]   zCamembertLayer.forward  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!12 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9 7" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44GrN   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )rJ   r   intermediate_outputr  s       rM   r  z!CamembertLayer.feed_forward_chunk1  s,    "//0@A{{#68HIrN   r   )ra   rb   rc   r2   rC   r   r   r   r   r   r]   r  re   rf   s   @rM   r   r     s    ." 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?BrN   r   c                   D    e Zd Z fdZ	 	 	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	ee	   d
ee	   dee	   de
eej
                     ef   fdZ xZS )CamembertEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r1   r2   rK   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)rJ   rK   r   rL   s      rM   r2   zCamembertEncoder.__init__9  sN    ]]E&JbJbDc#dqN6$:#de
&+# $es   A#r   r   r   r   r   past_key_valuesr   r   output_hidden_statesreturn_dictr}   c                    |	rdnd }|rdnd }|r| j                   j                  rdnd }| j                  r%| j                  r|rt        j                  d       d}|rdnd }t        | j                        D ]  \  }}|	r||fz   }|||   nd }|||   nd }| j                  r/| j                  r#| j                  |j                  |||||||      }n ||||||||      }|d   }|r	||d   fz  }|s|||d   fz   }| j                   j                  s||d   fz   } |	r||fz   }|
st        d |||||fD              S t        |||||	      S )
N zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r,   r!   rn   c              3   $   K   | ]  }|| 
 y wr   r  ).0vs     rM   	<genexpr>z+CamembertEncoder.forward.<locals>.<genexpr>  s      
 = 
s   )last_hidden_stater  r   
attentionscross_attentions)rK   r   r  r   r   r   	enumerater  _gradient_checkpointing_func__call__tupler   )rJ   r   r   r   r   r   r  r   r   r  r  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr   layer_outputss                       rM   r]   zCamembertEncoder.forward?  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4 #	VOA|#$58H$H!.7.CilO3B3N_Q/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::" &9]1=M<O&O#;;22+?=QRCSBU+U(G#	VJ   1]4D D 
 "&%'(
 
 
 9+.+*1
 	
rN   )	NNNNNNFFT)ra   rb   rc   r2   rC   r   r   r   r   r   r   r   r]   re   rf   s   @rM   r  r  8  s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
rN   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )CamembertPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r1   r2   r   rt   r5   r   Tanh
activationrI   s     rM   r2   zCamembertPooler.__init__  s9    YYv1163E3EF
'')rN   r   r}   c                 \    |d d df   }| j                  |      }| j                  |      }|S Nr   )r   r/  )rJ   r   first_token_tensorpooled_outputs       rM   r]   zCamembertPooler.forward  s6     +1a40

#566rN   r   rf   s   @rM   r,  r,    s#    $
U\\ ell rN   r,  c                   "    e Zd ZeZdZdZdZd Zy)CamembertPreTrainedModelrobertaTc                 l   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yt        |t              r%|j                  j                  j                          yy)zInitialize the weightsr   )meanstdNg      ?)r   r   rt   weightdatanormal_rK   initializer_rangebiaszero_r3   r&   r<   fill_CamembertLMHead)rJ   modules     rM   _init_weightsz&CamembertPreTrainedModel._init_weights  s&   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S)0KK""$ 1rN   N)	ra   rb   rc   r"   config_classbase_model_prefixsupports_gradient_checkpointing_supports_sdparC  r  rN   rM   r5  r5    s    "L!&*#N%rN   r5  c                   (     e Zd ZdZ fdZd Z xZS )CamembertClassificationHeadz-Head for sentence-level classification tasks.c                 Z   t         |           t        j                  |j                  |j                        | _        |j                  |j                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        y r   )r1   r2   r   rt   r5   r   classifier_dropoutr?   r>   r@   
num_labelsout_projrJ   rK   rK  rL   s      rM   r2   z$CamembertClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrN   c                     |d d dd d f   }| j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r1  )r@   r   rC   tanhrM  rJ   featureskwargsr|   s       rM   r]   z#CamembertClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!rN   )ra   rb   rc   rd   r2   r]   re   rf   s   @rM   rI  rI    s    7IrN   rI  c                   .     e Zd ZdZ fdZd Zd Z xZS )rA  z,Camembert Head for masked language modeling.c                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _
        t        j                  t        j                  |j                              | _        | j                  | j                  _        y r   )r1   r2   r   rt   r5   r   r<   r=   
layer_normr4   decoder	ParameterrC   rF   r>  rI   s     rM   r2   zCamembertLMHead.__init__  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	 IIrN   c                     | j                  |      }t        |      }| j                  |      }| j                  |      }|S r   )r   r   rV  rW  rQ  s       rM   r]   zCamembertLMHead.forward  s;    JJx GOOA LLOrN   c                     | j                   j                  j                  j                  dk(  r| j                  | j                   _        y | j                   j                  | _        y )Nmeta)rW  r>  rQ   r   rJ   s    rM   _tie_weightszCamembertLMHead._tie_weights  sC     <<##((F2 $		DLL))DIrN   )ra   rb   rc   rd   r2   r]   r]  re   rf   s   @rM   rA  rA    s    6&*rN   rA  c                        e Zd ZdZg Zd fd	Zd Zd Zd Ze		 	 	 	 	 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     d	e
ej                     d
e
ej                     de
ej                     de
ej                     de
ej                     de
eej                        de
e   de
e   de
e   de
e   deeej                     ef   fd       Z xZS )CamembertModela)  

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to
    `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762

    c                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        |j                  | _
        |j                  | _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r1   r2   rK   r$   r\   r  encoderr,  poolerr   attn_implementationr)   	post_init)rJ   rK   add_pooling_layerrL   s      rM   r2   zCamembertModel.__init__  si    
 	 -f5'/1Bof-#)#>#> '-'E'E$ 	rN   c                 .    | j                   j                  S r   r\   r7   r\  s    rM   get_input_embeddingsz#CamembertModel.get_input_embeddings  s    ...rN   c                 &    || j                   _        y r   rg  )rJ   rw   s     rM   set_input_embeddingsz#CamembertModel.set_input_embeddings!  s    */'rN   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsra  r  r   r   )rJ   heads_to_pruner  r   s       rM   _prune_headszCamembertModel._prune_heads$  sE    
 +002 	CLE5LLu%//;;EB	CrN   rU   r   r.   r+   r   rV   r   r   r  r   r   r  r  r}   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                   j                  r|
|
n| j                   j
                  }
nd}
||t        d      |#| j                  ||       |j                         }n!||j                         d d }nt        d      |\  }}||j                  n|j                  }|	|	d   d   j                  d   nd}|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  ||      }|}n&t        j                   |t        j"                  |      }| j                  |||||	      }|t        j$                  |||z   f|
      }| j&                  dk(  xr | j(                  dk(  xr	 |d u xr | }|rQ|j+                         dk(  r>| j                   j                  rt-        ||||      }n+t/        ||j0                  |      }n| j3                  ||      }| j                   j                  rs|q|j                         \  }}}||f}|t        j$                  ||
      }|r,|j+                         dk(  rt/        ||j0                  |      }n| j5                  |      }nd }| j7                  || j                   j8                        }| j;                  ||||||	|
|||
      }|d   }| j<                  | j=                  |      nd }|s
||f|dd  z   S t?        |||j@                  |jB                  |jD                  |jF                        S )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer,   z5You have to specify either input_ids or inputs_embedsr   rn   r.   rP   )rU   r+   r.   rV   rW   )rQ   r   r*   )r   )	r   r   r   r   r  r   r   r  r  r!   )r  pooler_outputr  r   r  r  )$rK   r   r  use_return_dictrz   r   rp   %warn_if_padding_and_no_attention_maskrG   rQ   r   rT   r\   r.   rE   rC   rF   rH   onesrc  r)   r   r   r   r0   get_extended_attention_maskinvert_attention_maskget_head_maskr  ra  rb  r   r  r   r  r  ) rJ   rU   r   r.   r+   r   rV   r   r   r  r   r   r  r  rX   
batch_sizerY   rQ   rW   rZ   r[   embedding_outputuse_sdpa_attention_masksextended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskencoder_outputssequence_outputr3  s                                    rM   r]   zCamembertModel.forward,  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z??%)'#9 + 
 !"ZZZBX5X(YbhiN $$. &,,
:&T!& &%	 	! $(:(:(<(A {{%%*T"$*	+' +N"$4$:$:J+' '+&F&F~Wb&c# ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&',B,F,F,HA,M 3V*,<,B,BJ3/ 372L2LMc2d/.2+ &&y$++2O2OP	,,2"7#B+/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
rN   )T)NNNNNNNNNNNNN)ra   rb   rc   rd   _no_split_modulesr2   rh  rj  rn  r   r   rC   r   r   r   r   r   r   r   r]   re   rf   s   @rM   r_  r_    sx    &/0C  -11515/3,0048<9==A$(,0/3&*L
ELL)L
 !.L
 !.	L

 u||,L
 ELL)L
  -L
  (5L
 !) 6L
 "$u'8'8"9:L
 D>L
 $D>L
 'tnL
 d^L
 
uU\\"$PP	QL
 L
rN   r_  c                       e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee   dee   dee   deee	j                     ef   fd       Z xZS )CamembertForMaskedLMlm_head.decoder.weightlm_head.decoder.biasc                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzpIf you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fre  
r1   r2   rz   r   warningr_  r6  rA  lm_headrd  rI   s     rM   r2   zCamembertForMaskedLM.__init__  sR     NN1
 &fF&v. 	rN   c                 .    | j                   j                  S r   r  rW  r\  s    rM   get_output_embeddingsz*CamembertForMaskedLM.get_output_embeddings      ||###rN   c                 &    || j                   _        y r   r  rJ   new_embeddingss     rM   set_output_embeddingsz*CamembertForMaskedLM.set_output_embeddings      -rN   rU   r   r.   r+   r   rV   r   r   labelsr   r  r  r}   c                    ||n| j                   j                  }| j                  |||||||||
||      }|d   }| j                  |      }d}|	a|	j	                  |j
                        }	t               } ||j                  d| j                   j                        |	j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        N)
r   r.   r+   r   rV   r   r   r   r  r  r   r,   rn   losslogitsr   r  )rK   rq  r6  r  r   rQ   r
   r   r4   r   r   r  )rJ   rU   r   r.   r+   r   rV   r   r   r  r   r  r  r   r  prediction_scoresmasked_lm_lossloss_fctr   s                      rM   r]   zCamembertForMaskedLM.forward  s   > &1%<k$++B]B],,))%'"7#9/!5#  
 "!* LL9YY0778F')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
rN   )NNNNNNNNNNNN)ra   rb   rc   _tied_weights_keysr2   r  r  r   r   rC   
LongTensorr   r   r   r   r   r   r]   re   rf   s   @rM   r  r    sk    34JK$.  156:59371559=A>B-1,0/3&*@
E,,-@
 !!2!23@
 !!1!12	@

 u//0@
 E--.@
   1 12@
  ((9(9:@
 !)):): ;@
 ))*@
 $D>@
 'tn@
 d^@
 
uU\\"N2	3@
 @
rN   r  z
    CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eej                     ef   fd       Z xZS )"CamembertForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |d      | _        t        |      | _        | j                          y NFr  )	r1   r2   rL  rK   r_  r6  rI  
classifierrd  rI   s     rM   r2   z+CamembertForSequenceClassification.__init__#  sJ      ++%fF5f= 	rN   rU   r   r.   r+   r   rV   r  r   r  r  r}   c                 T   |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }d}||j	                  |j
                        }| j                   j                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j                        |j                  d            }n,| j                   j                  dk(  rt!               } |||      }|
s|f|d	d z   }||f|z   S |S t#        |||j$                  |j&                  
      S )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r.   r+   r   rV   r   r  r  r   r!   
regressionsingle_label_classificationmulti_label_classificationr,   rn   r  )rK   rq  r6  r  r   rQ   problem_typerL  r0   rC   rH   rq   r   squeezer
   r   r	   r   r   r  rJ   rU   r   r.   r+   r   rV   r  r   r  r  r   r  r  r  r  r   s                    rM   r]   z*CamembertForSequenceClassification.forward.  s   : &1%<k$++B]B],,))%'/!5#  

 "!*1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rN   
NNNNNNNNNN)ra   rb   rc   r2   r   r   rC   r  r   r   r   r   r   r   r]   re   rf   s   @rM   r  r    s"   	  156:59371559-1,0/3&*N
E,,-N
 !!2!23N
 !!1!12	N

 u//0N
 E--.N
   1 12N
 ))*N
 $D>N
 'tnN
 d^N
 
uU\\"$<<	=N
 N
rN   r  c                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eej                     ef   fd       Z xZS )CamembertForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y )Nr!   )r1   r2   r_  r6  r   r>   r?   r@   rt   r5   r  rd  rI   s     rM   r2   z#CamembertForMultipleChoice.__init__  sV     %f-zz&"<"<=))F$6$6: 	rN   rU   r.   r   r  r+   r   rV   r   r  r  r}   c                    |
|
n| j                   j                  }
||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	|
	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|.|j                  |j                        }t               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr!   r,   r   )r+   r.   r   r   rV   r   r  r  rn   r  )rK   rq  r   r   rG   r6  r@   r  r   rQ   r
   r   r   r  )rJ   rU   r.   r   r  r+   r   rV   r   r  r  num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   r3  r  reshaped_logitsr  r  r   s                           rM   r]   z"CamembertForMultipleChoice.forward  s   Z &1%<k$++B]B],5,Aiooa(}GZGZ[\G]CLCXINN2,>?^bLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 ,,*..,/!5#  

  
]3/ ++b+6YY556F')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rN   r  )ra   rb   rc   r2   r   r   rC   r  r   r   r   r   r   r   r]   re   rf   s   @rM   r  r    s"     15596:-1371559,0/3&*Z
E,,-Z
 !!1!12Z
 !!2!23	Z

 ))*Z
 u//0Z
 E--.Z
   1 12Z
 $D>Z
 'tnZ
 d^Z
 
uU\\"$==	>Z
 Z
rN   r  c                   ^    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   de
eej                     ef   fd       Z xZS )CamembertForTokenClassificationc                 d   t         |   |       |j                  | _        t        |d      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y r  )r1   r2   rL  r_  r6  rK  r?   r   r>   r@   rt   r5   r  rd  rN  s      rM   r2   z(CamembertForTokenClassification.__init__  s      ++%fF)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rN   rU   r   r.   r+   r   rV   r  r   r  r  r}   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|W|j                  |j                        }t               } ||j                  d| j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a-  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r,   rn   r  )rK   rq  r6  r@   r  r   rQ   r
   r   rL  r   r   r  r  s                    rM   r]   z'CamembertForTokenClassification.forward  s   6 &1%<k$++B]B],,))%'/!5#  

 "!*,,71YYv}}-F')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rN   r  )ra   rb   rc   r2   r   r   rC   r  r   r   r   r   r   r   r]   re   rf   s   @rM   r  r    s     156:59371559-1,0/3&*=
E,,-=
 !!2!23=
 !!1!12	=

 u//0=
 E--.=
   1 12=
 ))*=
 $D>=
 'tn=
 d^=
 
uU\\"$99	:=
 =
rN   r  c                   ~    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee	   dee	   dee	   de
eej                     ef   fd       Z xZS )CamembertForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r  )
r1   r2   rL  r_  r6  r   rt   r5   
qa_outputsrd  rI   s     rM   r2   z&CamembertForQuestionAnswering.__init__@  sU      ++%fF))F$6$68I8IJ 	rN   rU   r   r.   r+   r   rV   start_positionsend_positionsr   r  r  r}   c                 (   ||n| j                   j                  }| j                  |||||||	|
|	      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|dd z   }||f|z   S |S t        ||||j                  |j                  	      S )
a[  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        Nr  r   r!   r,   r   )ignore_indexrn   )r  start_logits
end_logitsr   r  )rK   rq  r6  r  splitr  r   r   rG   clampr
   r   r   r  )rJ   rU   r   r.   r+   r   rV   r  r  r   r  r  r   r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          rM   r]   z%CamembertForQuestionAnswering.forwardJ  s   4 &1%<k$++B]B],,))%'/!5#  

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rN   )NNNNNNNNNNN)ra   rb   rc   r2   r   r   rC   r  r   r   r   r   r   r   r]   re   rf   s   @rM   r  r  =  s;     156:593715596:48,0/3&*I
E,,-I
 !!2!23I
 !!1!12	I

 u//0I
 E--.I
   1 12I
 "%"2"23I
   0 01I
 $D>I
 'tnI
 d^I
 
uU\\"$@@	AI
 I
rN   r  zU
    CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.
    c            "           e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee	j                     dee	j                     deeee	j                           dee   dee   dee   dee   deee	j                     ef   fd       Zd Z xZS )CamembertForCausalLMr  r  c                     t         |   |       |j                  st        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzQIf you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`Fr  r  rI   s     rM   r2   zCamembertForCausalLM.__init__  sL       NNno%fF&v. 	rN   c                 .    | j                   j                  S r   r  r\  s    rM   r  z*CamembertForCausalLM.get_output_embeddings  r  rN   c                 &    || j                   _        y r   r  r  s     rM   r  z*CamembertForCausalLM.set_output_embeddings  r  rN   rU   r   r.   r+   r   rV   r   r   r  r  r   r   r  r  r}   c                    ||n| j                   j                  }|	d}| j                  |||||||||
||||      }|d   }| j                  |      }d}|	E|	j	                  |j
                        }	 | j                  ||	fd| j                   j                  i|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )aq  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CamembertForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
        >>> config = AutoConfig.from_pretrained("almanach/camembert-base")
        >>> config.is_decoder = True
        >>> model = CamembertForCausalLM.from_pretrained("almanach/camembert-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NF)r   r.   r+   r   rV   r   r   r  r   r   r  r  r   r4   rn   )r  r  r  r   r  r  )rK   rq  r6  r  r   rQ   loss_functionr4   r   r  r   r  r  )rJ   rU   r   r.   r+   r   rV   r   r   r  r  r   r   r  r  rS  r   r  r  lm_lossr   s                        rM   r]   zCamembertForCausalLM.forward  s6   d &1%<k$++B]B]I,,))%'"7#9+/!5#  
  "!* LL9YY0778F(d((!  ;;11 	G ')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
rN   c                 J    d}|D ]  }|t        fd|D              fz  } |S )Nr  c              3   t   K   | ]/  }|j                  d j                  |j                               1 yw)r   N)index_selectr   rQ   )r  
past_statebeam_idxs     rM   r  z6CamembertForCausalLM._reorder_cache.<locals>.<genexpr>  s.     nU_j--aZ=N=N1OPns   58)r"  )rJ   r  r  reordered_past
layer_pasts     `  rM   _reorder_cachez#CamembertForCausalLM._reorder_cache  s=    ) 	Jncmnn N	 rN   )NNNNNNNNNNNNNN)ra   rb   rc   r  r2   r  r  r   r   rC   r  r   r   r   r   r   r   r]   r  re   rf   s   @rM   r  r    s    34JK
$.  156:59371559=A>B-1EI$(,0/3&*^
E,,-^
 !!2!23^
 !!1!12	^

 u//0^
 E--.^
   1 12^
  ((9(9:^
 !)):): ;^
 ))*^
 "%e.?.?(@"AB^
 D>^
 $D>^
 'tn^
 d^^
" 
uU\\"$EE	F#^
 ^
@rN   r  c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r!   r   )nerq   rC   cumsumtype_asrH   )rU   r&   rW   maskincremental_indicess        rM   rR   rR     sW     <<$((*D <<!4<<TBE[[_cc##%33rN   )r  r  r  r  r  r  r_  r5  )r   )Hrd   r   typingr   r   r   r   rC   torch.utils.checkpoint	packagingr   r   torch.nnr	   r
   r   activationsr   r   
generationr   modeling_attn_mask_utilsr   r   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r    configuration_camembertr"   
get_loggerra   r   Moduler$   rh   r   r   r   r   r   r   r   r  r,  r5  rI  rA  r_  r  r  r  r  r  r  rR   __all__r  rN   rM   <module>r     s_      / /     A A ' ) w	 	 	 . l l ? ? 4 
		H	%V=")) V=tCRYY CNb!7 bL"))  $&$  0 0hBII  bii SRYY SnZ
ryy Z
|bii  % % %6")) .*bii *> B
- B
 B
J Y
3 Y
 Y
x [
)A [
[
| f
!9 f
 f
R M
&> M
 M
` U
$< U
 U
p |3_ ||@4 	rN   