Ë
    ´ãUhJ ã                   ó<  — d Z ddlZddlZddlmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$  e"jJ                  e&«      Z'dejP                  de)de)fd„Z* G d„ de
jV                  «      Z, G d„ de
jZ                  «      Z. G d„ de
jZ                  «      Z/ G d„ de
jZ                  «      Z0 G d„ de
jZ                  «      Z1 G d„ de
jZ                  «      Z2e! G d„ d e«      «       Z3 G d!„ d"e3«      Z4 G d#„ d$e3«      Z5e! G d%„ d&e3«      «       Z6 e!d'¬(«       G d)„ d*e3e«      «       Z7 e!d+¬(«       G d,„ d-e3«      «       Z8e! G d.„ d/e3«      «       Z9 G d0„ d1e3«      Z: G d2„ d3e3e«      Z;g d4¢Z<y)5zPyTorch MVP model.é    N)ÚListÚOptionalÚTupleÚUnion)Únn)ÚBCEWithLogitsLossÚCrossEntropyLossÚMSELossé   )ÚACT2FN)ÚGenerationMixin)Ú_prepare_4d_attention_maskÚ!_prepare_4d_causal_attention_mask)ÚBaseModelOutputÚ)BaseModelOutputWithPastAndCrossAttentionsÚ!CausalLMOutputWithCrossAttentionsÚSeq2SeqLMOutputÚSeq2SeqModelOutputÚ#Seq2SeqQuestionAnsweringModelOutputÚSeq2SeqSequenceClassifierOutput)ÚPreTrainedModel)Úauto_docstringÚloggingé   )Ú	MvpConfigÚ	input_idsÚpad_token_idÚdecoder_start_token_idc                 óÖ   — | j                  | j                  «      }| dd…dd…f   j                  «       |dd…dd…f<   ||dd…df<   |€t        d«      ‚|j	                  |dk(  |«       |S )z1
    Shift input ids one token to the right.
    Néÿÿÿÿr   r   z1self.model.config.pad_token_id has to be defined.iœÿÿÿ)Ú	new_zerosÚshapeÚcloneÚ
ValueErrorÚmasked_fill_)r   r   r   Úshifted_input_idss       úv/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/mvp/modeling_mvp.pyÚshift_tokens_rightr(   2   s}   € ð "×+Ñ+¨I¯O©OÓ<ÐØ(ª¨C¨R¨C¨Ñ0×6Ñ6Ó8Ð’a˜™eÑØ4Ð’a˜dÑàÐÜÐLÓMÐMà×"Ñ"Ð#4¸Ñ#<¸lÔKàÐó    c                   ón   ‡ — e Zd ZdZdedefˆ fd„Zd	dej                  dedej                  fˆ fd„Zˆ xZ	S )
ÚMvpLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    Únum_embeddingsÚembedding_dimc                 óN   •— d| _         t        ‰| 	  || j                   z   |«       y ©Né   )ÚoffsetÚsuperÚ__init__)Úselfr,   r-   Ú	__class__s      €r'   r3   z&MvpLearnedPositionalEmbedding.__init__H   s$   ø€ ð ˆŒÜ‰Ñ˜¨$¯+©+Ñ5°}ÕEr)   r   Úpast_key_values_lengthÚposition_idsc                 ó$  •— |€a|j                   dd \  }}t        j                  |||z   t        j                  | j                  j
                  ¬«      j                  |d«      }n|j                  d«      }t        ‰| %  || j                  z   «      S )z3`input_ids' shape is expected to be [bsz x seqlen].Nr0   )ÚdtypeÚdevicer    r   )r"   ÚtorchÚarangeÚlongÚweightr:   ÚexpandÚ	unsqueezer2   Úforwardr1   )r4   r   r6   r7   ÚbszÚseq_lenr5   s         €r'   rA   z%MvpLearnedPositionalEmbedding.forwardN   sˆ   ø€ ð ÐØ$Ÿ?™?¨2¨AÐ.‰LˆCÜ Ÿ<™<Ø&Ð(>ÀÑ(HÔPU×PZÑPZÐcg×cnÑcn×cuÑcuôç‰fS˜"‹oñ ð (×1Ñ1°!Ó4ˆLä‰w‰˜|¨d¯k©kÑ9Ó:Ð:r)   ©r   N)
Ú__name__Ú
__module__Ú__qualname__Ú__doc__Úintr3   r;   ÚTensorrA   Ú__classcell__©r5   s   @r'   r+   r+   C   sH   ø„ ñðF sð F¸3õ Fñ; §¡ð ;Àsð ;Ð^c×^jÑ^j÷ ;ñ ;r)   r+   c                   ó”  ‡ — e Zd ZdZ	 	 	 ddededededef
ˆ fd„Zdej                  d	ed
efd„Z
	 	 	 	 	 	 ddej                  deej                     deeej                        deej                     deej                     deej                     dedeej                  eej                     eeej                        f   fd„Zˆ xZS )ÚMvpAttentionz=Multi-headed attention from 'Attention Is All You Need' paperÚ	embed_dimÚ	num_headsÚdropoutÚ
is_decoderÚbiasc                 óî  •— t         ‰|   «        || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                  › d|› d«      ‚| j
                  dz  | _        || _        t        j                  |||¬«      | _        t        j                  |||¬«      | _        t        j                  |||¬«      | _        t        j                  |||¬«      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      à¿©rS   )r2   r3   rO   rP   rQ   Úhead_dimr$   ÚscalingrR   r   ÚLinearÚk_projÚv_projÚq_projÚout_proj)r4   rO   rP   rQ   rR   rS   r5   s         €r'   r3   zMvpAttention.__init___   sÖ   ø€ ô 	‰ÑÔØ"ˆŒØ"ˆŒØˆŒØ! YÑ.ˆŒàM‰M˜IÑ%¨$¯.©.Ò8ÜØMÈdÏnÉnÐM]Ø$ Y K¨rð3óð ð —}‘} dÑ*ˆŒØ$ˆŒä—i‘i 	¨9¸4Ô@ˆŒÜ—i‘i 	¨9¸4Ô@ˆŒÜ—i‘i 	¨9¸4Ô@ˆŒÜŸ	™	 )¨Y¸TÔBˆr)   ÚtensorrC   rB   c                 óŽ   — |j                  ||| j                  | j                  «      j                  dd«      j	                  «       S )Nr   r0   )ÚviewrP   rV   Ú	transposeÚ
contiguous)r4   r]   rC   rB   s       r'   Ú_shapezMvpAttention._shapez   s7   € Ø{‰{˜3 ¨¯©¸¿¹ÓG×QÑQÐRSÐUVÓW×bÑbÓdÐdr)   Úhidden_statesÚkey_value_statesÚpast_key_valueÚattention_maskÚlayer_head_maskÚattn_promptÚoutput_attentionsÚreturnc                 ó6  — |du}|j                  «       \  }	}
}| j                  |«      | j                  z  }|r||d   }|d   }n
|rE| j                  | j	                  |«      d|	«      }| j                  | j                  |«      d|	«      }nÃ|}| j                  | j	                  |«      d|	«      }| j                  | j                  |«      d|	«      }t        j                  |d   |gd¬«      }t        j                  |d   |gd¬«      }nD| j                  | j	                  |«      d|	«      }| j                  | j                  |«      d|	«      }| j                  r||f}|ºt        j                  |d   j                  |	ddd«      |gd¬«      }t        j                  |d   j                  |	ddd«      |gd¬«      }|\t        j                  |	d|
|d   j                  d«      «      j                  |j                  «      }t        j                  ||gd¬«      }|	| j                  z  d| j                  f} | j                  ||
|	«      j                  |Ž } |j                  |Ž } |j                  |Ž }|j                  d«      }t        j                   ||j#                  dd«      «      }|j                  «       |	| j                  z  |
|fk7  r/t%        d|	| j                  z  |
|f› d|j                  «       › «      ‚|{|j                  «       |	d|
|fk7  r#t%        d	|	d|
|f› d|j                  «       › «      ‚|j                  |	| j                  |
|«      |z   }|j                  |	| j                  z  |
|«      }t&        j(                  j+                  |d¬«      }|›|j                  «       | j                  fk7  r*t%        d
| j                  f› d|j                  «       › «      ‚|j                  dddd«      |j                  |	| j                  |
|«      z  }|j                  |	| j                  z  |
|«      }|r?|j                  |	| j                  |
|«      }|j                  |	| j                  z  |
|«      }nd}t&        j(                  j-                  || j,                  | j.                  ¬«      }t        j                   ||«      }|j                  «       |	| j                  z  |
| j                  fk7  r7t%        d|	| j                  |
| j                  f› d|j                  «       › «      ‚|j                  |	| j                  |
| j                  «      }|j#                  dd«      }|j1                  |	|
| j2                  «      }| j5                  |«      }|||fS )z#Input shape: Batch x Time x ChannelNr   r   r    r0   ©Údimz$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size ©ÚpÚtrainingz `attn_output` should be of size )Úsizer[   rW   rb   rY   rZ   r;   ÚcatrR   r?   ÚzerosÚtor:   rP   rV   r_   Úbmmr`   r$   r   Ú
functionalÚsoftmaxrQ   rp   ÚreshaperO   r\   )r4   rc   rd   re   rf   rg   rh   ri   Úis_cross_attentionrB   Útgt_lenÚ_Úquery_statesÚ
key_statesÚvalue_statesÚprompt_maskÚ
proj_shapeÚsrc_lenÚattn_weightsÚattn_weights_reshapedÚ
attn_probsÚattn_outputs                         r'   rA   zMvpAttention.forward}   s  € ð .°TÐ9Ðà'×,Ñ,Ó.‰ˆˆWað —{‘{ =Ó1°D·L±LÑ@ˆá .Ð"<à'¨Ñ*ˆJØ)¨!Ñ,ŠLÙàŸ™ T§[¡[Ð1AÓ%BÀBÈÓLˆJØŸ;™; t§{¡{Ð3CÓ'DÀbÈ#ÓN‰LØÐ'àŸ™ T§[¡[°Ó%?ÀÀSÓIˆJØŸ;™; t§{¡{°=Ó'AÀ2ÀsÓKˆLÜŸ™ N°1Ñ$5°zÐ#BÈÔJˆJÜ Ÿ9™9 n°QÑ&7¸Ð%FÈAÔN‰Lð Ÿ™ T§[¡[°Ó%?ÀÀSÓIˆJØŸ;™; t§{¡{°=Ó'AÀ2ÀsÓKˆLà?Š?ð )¨,Ð7ˆNàÐ"ÜŸ™ K°¡N×$9Ñ$9¸#¸rÀ2ÀrÓ$JÈJÐ#WÐ]^Ô_ˆJÜ Ÿ9™9 k°!¡n×&;Ñ&;¸CÀÀRÈÓ&LÈlÐ%[ÐabÔcˆLØÐ)Ü#Ÿk™k¨#¨q°'¸;Àq¹>×;NÑ;NÈqÓ;QÓR×UÑUÐVd×VkÑVkÓlÜ!&§¡¨K¸Ð+HÈrÔ!Sà˜DŸN™NÑ*¨B°·±Ð>ˆ
ØCt—{‘{ <°¸#Ó>×CÑCÀZÐPˆØ$Z—_‘_ jÐ1ˆ
Ø(|×(Ñ(¨*Ð5ˆà—/‘/ !Ó$ˆÜ—y‘y ¨z×/CÑ/CÀAÀqÓ/IÓJˆà×ÑÓ 3¨¯©Ñ#7¸À'Ð"JÒJÜØ6¸¸d¿n¹nÑ8LÈgÐW^Ð7_Ð6`ð aØ ×%Ñ%Ó'Ð(ð*óð ð
 Ð%Ø×"Ñ"Ó$¨¨a°¸'Ð(BÒBÜ Ø7¸¸aÀÈ'Ð8RÐ7SÐS\Ð]k×]pÑ]pÓ]rÐ\sÐtóð ð (×,Ñ,¨S°$·.±.À'È7ÓSÐVdÑdˆLØ'×,Ñ,¨S°4·>±>Ñ-AÀ7ÈGÓTˆLä—}‘}×,Ñ,¨\¸rÐ,ÓBˆàÐ&Ø×#Ñ#Ó%¨$¯.©.Ð):Ò:Ü ØEÀtÇ~Á~ÐFWÐEXð YØ'×,Ñ,Ó.Ð/ð1óð ð +×/Ñ/°°2°q¸!Ó<¸|×?PÑ?PÐQTÐVZ×VdÑVdÐfmÐovÓ?wÑwˆLØ'×,Ñ,¨S°4·>±>Ñ-AÀ7ÈGÓTˆLáð
 %1×$5Ñ$5°c¸4¿>¹>È7ÐT[Ó$\Ð!Ø0×5Ñ5°c¸D¿N¹NÑ6JÈGÐU\Ó]‰Là$(Ð!ä—]‘]×*Ñ*¨<¸4¿<¹<ÐRV×R_ÑR_Ð*Ó`ˆ
ä—i‘i 
¨LÓ9ˆà×ÑÓ #¨¯©Ñ"6¸ÀÇÁÐ!OÒOÜØ2°C¸¿¹ÈÐRV×R_ÑR_Ð3`Ð2að bØ×$Ñ$Ó&Ð'ð)óð ð
 "×&Ñ& s¨D¯N©N¸GÀTÇ]Á]ÓSˆØ!×+Ñ+¨A¨qÓ1ˆð "×)Ñ)¨#¨w¸¿¹ÓGˆà—m‘m KÓ0ˆàÐ1°>ÐAÐAr)   )ç        FT)NNNNNF)rE   rF   rG   rH   rI   ÚfloatÚboolr3   r;   rJ   rb   r   r   rA   rK   rL   s   @r'   rN   rN   \   sD  ø„ ÙGð Ø ØñCàðCð ðCð ð	Cð
 ðCð õCð6e˜UŸ\™\ð e°Cð e¸có eð 48Ø8<Ø15Ø26Ø.2Ø"'ñwBà—|‘|ðwBð # 5§<¡<Ñ0ðwBð !  u§|¡|Ñ!4Ñ5ð	wBð
 ! §¡Ñ.ðwBð " %§,¡,Ñ/ðwBð ˜eŸl™lÑ+ðwBð  ðwBð 
ˆu|‰|˜X e§l¡lÑ3°X¸eÀEÇLÁLÑ>QÑ5RÐRÑ	S÷wBr)   rN   c                   óÖ   ‡ — e Zd Zdefˆ fd„Z	 d
dej                  dej                  dej                  dej                  dee   de	ej                  eej                     f   fd	„Z
ˆ xZS )ÚMvpEncoderLayerÚconfigc                 óf  •— t         ‰|   «        |j                  | _        t	        | j                  |j
                  |j                  ¬«      | _        t        j                  | j                  «      | _
        |j                  | _        t        |j                     | _        |j                  | _        t        j                   | j                  |j"                  «      | _        t        j                   |j"                  | j                  «      | _        t        j                  | j                  «      | _        y )N)rO   rP   rQ   )r2   r3   Úd_modelrO   rN   Úencoder_attention_headsÚattention_dropoutÚ	self_attnr   Ú	LayerNormÚself_attn_layer_normrQ   r   Úactivation_functionÚactivation_fnÚactivation_dropoutrX   Úencoder_ffn_dimÚfc1Úfc2Úfinal_layer_norm©r4   r‹   r5   s     €r'   r3   zMvpEncoderLayer.__init__ø   sÎ   ø€ Ü‰ÑÔØŸ™ˆŒÜ%Ø—n‘nØ×4Ñ4Ø×,Ñ,ô
ˆŒô
 %'§L¡L°·±Ó$@ˆÔ!Ø—~‘~ˆŒÜ# F×$>Ñ$>Ñ?ˆÔØ"(×";Ñ";ˆÔÜ—9‘9˜TŸ^™^¨V×-CÑ-CÓDˆŒÜ—9‘9˜V×3Ñ3°T·^±^ÓDˆŒÜ "§¡¨T¯^©^Ó <ˆÕr)   rc   rf   rg   Úself_attn_promptri   rj   c                 ó¦  — |}| j                  |||||¬«      \  }}}t        j                  j                  || j                  | j                  ¬«      }||z   }| j                  |«      }|}| j                  | j                  |«      «      }t        j                  j                  || j                  | j                  ¬«      }| j                  |«      }t        j                  j                  || j                  | j                  ¬«      }||z   }| j                  |«      }|j                  t        j                  k(  r‹t        j                  |«      j                  «       s#t        j                   |«      j                  «       rEt        j"                  |j                  «      j$                  dz
  }	t        j&                  ||	 |	¬«      }|f}
|r|
|fz  }
|
S )a@  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, encoder_attention_heads, pro_len, head_dim)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rc   rf   rg   rh   ri   rn   iè  )ÚminÚmax)r   r   rv   rQ   rp   r’   r”   r—   r•   r˜   r™   r9   r;   Úfloat16ÚisinfÚanyÚisnanÚfinforž   Úclamp)r4   rc   rf   rg   r›   ri   Úresidualr‚   r{   Úclamp_valueÚoutputss              r'   rA   zMvpEncoderLayer.forward  s’  € ð* !ˆØ)-¯©Ø'Ø)Ø+Ø(Ø/ð *8ó *
Ñ&ˆ| Qô Ÿ™×-Ñ-¨m¸t¿|¹|ÐVZ×VcÑVcÐ-ÓdˆØ  =Ñ0ˆØ×1Ñ1°-Ó@ˆà ˆØ×*Ñ*¨4¯8©8°MÓ+BÓCˆÜŸ™×-Ñ-¨m¸t×?VÑ?VÐae×anÑanÐ-ÓoˆØŸ™ Ó/ˆÜŸ™×-Ñ-¨m¸t¿|¹|ÐVZ×VcÑVcÐ-ÓdˆØ  =Ñ0ˆØ×-Ñ-¨mÓ<ˆà×Ñ¤%§-¡-Ò/ÜK‰K˜Ó&×*Ñ*Ô,´·±¸MÓ0J×0NÑ0NÔ0PäŸ+™+ m×&9Ñ&9Ó:×>Ñ>ÀÑEˆKÜ!ŸK™K¨¸K¸<È[ÔYˆMà Ð"ˆáØ˜Ñ&ˆGàˆr)   )F)rE   rF   rG   r   r3   r;   ÚFloatTensorr   rˆ   r   rA   rK   rL   s   @r'   rŠ   rŠ   ÷   sŠ   ø„ ð=˜yõ =ð, -2ñ4à×(Ñ(ð4ð ×)Ñ)ð4ð ×*Ñ*ð	4ð
  ×+Ñ+ð4ð $ D™>ð4ð 
ˆu× Ñ  (¨5×+<Ñ+<Ñ"=Ð=Ñ	>÷4r)   rŠ   c                   ó¾  ‡ — e Zd Zdefˆ fd„Z	 	 	 	 	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deeej                        dee	   dee	   deej                  eeej                  ej                  f      f   fd„Zˆ xZS )ÚMvpDecoderLayerr‹   c                 ó  •— t         ‰|   «        |j                  | _        t	        | j                  |j
                  |j                  d¬«      | _        |j                  | _        t        |j                     | _        |j                  | _        t        j                  | j                  «      | _        t	        | j                  |j
                  |j                  d¬«      | _        t        j                  | j                  «      | _        t        j$                  | j                  |j&                  «      | _        t        j$                  |j&                  | j                  «      | _        t        j                  | j                  «      | _        y )NT)rO   rP   rQ   rR   )rQ   rR   )r2   r3   r   rO   rN   Údecoder_attention_headsr   r   rQ   r   r“   r”   r•   r   r‘   r’   Úencoder_attnÚencoder_attn_layer_normrX   Údecoder_ffn_dimr—   r˜   r™   rš   s     €r'   r3   zMvpDecoderLayer.__init__@  s  ø€ Ü‰ÑÔØŸ™ˆŒä%Ø—n‘nØ×4Ñ4Ø×,Ñ,Øô	
ˆŒð —~‘~ˆŒÜ# F×$>Ñ$>Ñ?ˆÔØ"(×";Ñ";ˆÔä$&§L¡L°·±Ó$@ˆÔ!Ü(ØN‰NØ×*Ñ*Ø×,Ñ,Øô	
ˆÔô (*§|¡|°D·N±NÓ'CˆÔ$Ü—9‘9˜TŸ^™^¨V×-CÑ-CÓDˆŒÜ—9‘9˜V×3Ñ3°T·^±^ÓDˆŒÜ "§¡¨T¯^©^Ó <ˆÕr)   rc   rf   Úencoder_hidden_statesÚencoder_attention_maskrg   Úcross_attn_layer_head_maskr›   Úcross_attn_promptre   ri   Ú	use_cacherj   c           	      óx  — |}|	|	dd nd}| j                  ||||||
¬«      \  }}}t        j                  j                  || j                  | j                  ¬«      }||z   }| j                  |«      }d}d}|x|}|	|	dd nd}| j                  |||||||
¬«      \  }}}t        j                  j                  || j                  | j                  ¬«      }||z   }| j                  |«      }||z   }|}| j                  | j                  |«      «      }t        j                  j                  || j                  | j                  ¬«      }| j                  |«      }t        j                  j                  || j                  | j                  ¬«      }||z   }| j                  |«      }|f}|
r|||fz  }|r||fz  }|S )aC  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            cross_attn_prompt (`torch.FloatTensor`): prompt of cross attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        Nr0   )rc   re   rf   rg   rh   ri   rn   éþÿÿÿ)rc   rd   rf   rg   rh   re   ri   )r   r   rv   rQ   rp   r’   r­   r®   r”   r—   r•   r˜   r™   )r4   rc   rf   r°   r±   rg   r²   r›   r³   re   ri   r´   r¥   Úself_attn_past_key_valueÚself_attn_weightsÚpresent_key_valueÚcross_attn_present_key_valueÚcross_attn_weightsÚcross_attn_past_key_valuer§   s                       r'   rA   zMvpDecoderLayer.forwardZ  s  € ðH !ˆð :HÐ9S >°"°1Ñ#5ÐY]Ð à>B¿n¹nØ'Ø3Ø)Ø+Ø(Ø/ð ?Mó ?
Ñ;ˆÐ(Ð*;ô Ÿ™×-Ñ-¨m¸t¿|¹|ÐVZ×VcÑVcÐ-ÓdˆØ  =Ñ0ˆØ×1Ñ1°-Ó@ˆð (,Ð$Ø!ÐØ Ð,Ø$ˆHð @NÐ?Y¨°r°sÑ(;Ð_cÐ%ØNR×N_ÑN_Ø+Ø!6Ø5Ø :Ø-Ø8Ø"3ð O`ó OÑKˆMÐ-Ð/Kô ŸM™M×1Ñ1°-À4Ç<Á<ÐZ^×ZgÑZgÐ1ÓhˆMØ$ }Ñ4ˆMØ ×8Ñ8¸ÓGˆMð !2Ð4PÑ PÐð !ˆØ×*Ñ*¨4¯8©8°MÓ+BÓCˆÜŸ™×-Ñ-¨m¸t×?VÑ?VÐae×anÑanÐ-ÓoˆØŸ™ Ó/ˆÜŸ™×-Ñ-¨m¸t¿|¹|ÐVZ×VcÑVcÐ-ÓdˆØ  =Ñ0ˆØ×-Ñ-¨mÓ<ˆà Ð"ˆáØÐ)Ð+=Ð>Ñ>ˆGáØÐ)Ð+Ñ+ˆGàˆr)   )
NNNNNNNNFT)rE   rF   rG   r   r3   r;   rJ   r   r   rˆ   r¨   rA   rK   rL   s   @r'   rª   rª   ?  s;  ø„ ð=˜yõ =ð: 26Ø8<Ø9=Ø26Ø=AØ37Ø48Ø8<Ø,1Ø$(ñ_à—|‘|ð_ð ! §¡Ñ.ð_ð  (¨¯©Ñ5ð	_ð
 !)¨¯©Ñ 6ð_ð " %§,¡,Ñ/ð_ð %-¨U¯\©\Ñ$:ð_ð # 5§<¡<Ñ0ð_ð $ E§L¡LÑ1ð_ð !  u§|¡|Ñ!4Ñ5ð_ð $ D™>ð_ð ˜D‘>ð_ð 
ˆu× Ñ  (¨5°×1BÑ1BÀE×DUÑDUÐ1UÑ+VÑ"WÐWÑ	X÷_r)   rª   c                   ól   ‡ — e Zd ZdZdedededefˆ fd„Zdej                  dej                  fd	„Z	ˆ xZ
S )
ÚMvpClassificationHeadz-Head for sentence-level classification tasks.Ú	input_dimÚ	inner_dimÚnum_classesÚpooler_dropoutc                 óÄ   •— t         ‰|   «        t        j                  ||«      | _        t        j
                  |¬«      | _        t        j                  ||«      | _        y )N©ro   )r2   r3   r   rX   ÚdenseÚDropoutrQ   r\   )r4   r¿   rÀ   rÁ   rÂ   r5   s        €r'   r3   zMvpClassificationHead.__init__À  sD   ø€ ô 	‰ÑÔÜ—Y‘Y˜y¨)Ó4ˆŒ
Ü—z‘z NÔ3ˆŒÜŸ	™	 )¨[Ó9ˆr)   rc   rj   c                 ó¸   — | j                  |«      }| j                  |«      }t        j                  |«      }| j                  |«      }| j	                  |«      }|S ©N)rQ   rÅ   r;   Útanhr\   )r4   rc   s     r'   rA   zMvpClassificationHead.forwardÌ  sN   € ØŸ™ ]Ó3ˆØŸ
™
 =Ó1ˆÜŸ
™
 =Ó1ˆØŸ™ ]Ó3ˆØŸ™ mÓ4ˆØÐr)   )rE   rF   rG   rH   rI   r‡   r3   r;   rJ   rA   rK   rL   s   @r'   r¾   r¾   ½  sL   ø„ Ù7ð
:àð
:ð ð
:ð ð	
:ð
 õ
:ð U§\¡\ð °e·l±l÷ r)   r¾   c                   ó`   ‡ — e Zd ZdZˆ fd„Zdej                  deej                     fd„Zˆ xZ	S )Ú	MvpPromptz)Layer-wise prompt for encoder or decoder.c           	      ó8  •— t         ‰|   «        |j                  | _        || _        || _        |j
                  |z  | _        t        j                  |j                  ¬«      | _	        t        j                  |j                  |j
                  «      | _        t        j                  t        j                  |j
                  |j                  «      t        j                  «       t        j                  |j                  |dz  |j
                  z  «      «      | _        y )NrÄ   r0   )r2   r3   Úprompt_lengthÚ
num_layersrP   r   rV   r   rÆ   rQ   Ú	EmbeddingÚprompt_embeddingÚ
SequentialrX   Úprompt_mid_dimÚGELUÚprompt_trans)r4   r‹   rÎ   rP   r5   s       €r'   r3   zMvpPrompt.__init__Ø  s¾   ø€ Ü‰ÑÔØ#×1Ñ1ˆÔØ$ˆŒØ"ˆŒØŸ™¨)Ñ3ˆŒÜ—z‘z F§N¡NÔ3ˆŒÜ "§¡¨V×-AÑ-AÀ6Ç>Á>Ó RˆÔÜŸM™MÜI‰If—n‘n f×&;Ñ&;Ó<ÜG‰G‹IÜI‰If×+Ñ+¨Z¸!©^¸f¿n¹nÑ-LÓMó
ˆÕr)   Ú
prompt_idsrj   c                 ó*  — | j                  | j                  |«      «      }|j                  | j                  | j                  dz  | j
                  | j                  «      }| j                  |«      }|j                  g d¢«      j                  d«      }|S )Nr0   )r   r0   r   r   )
rÔ   rÐ   r_   rÍ   rÎ   rP   rV   rQ   ÚpermuteÚsplit)r4   rÕ   Úprompts      r'   rA   zMvpPrompt.forwardæ  sw   € Ø×"Ñ" 4×#8Ñ#8¸Ó#DÓEˆØ—‘˜T×/Ñ/°·±À1Ñ1DÀdÇnÁnÐVZ×VcÑVcÓdˆØ—‘˜fÓ%ˆØ—‘¢Ó-×3Ñ3°AÓ6ˆØˆr)   )
rE   rF   rG   rH   r3   r;   rJ   r   rA   rK   rL   s   @r'   rË   rË   Õ  s+   ø„ Ù3ô
ð %§,¡,ð °5¸¿¹Ñ3F÷ r)   rË   c                   ó.   — e Zd ZeZdZdZd„ Zed„ «       Z	y)ÚMvpPreTrainedModelÚmodelTc                 ó  — | j                   j                  }t        |t        j                  «      rY|j
                  j                  j                  d|¬«       |j                  %|j                  j                  j                  «        y y t        |t        j                  «      rf|j
                  j                  j                  d|¬«       |j                  2|j
                  j                  |j                     j                  «        y y y )Nr†   )ÚmeanÚstd)r‹   Úinit_stdÚ
isinstancer   rX   r>   ÚdataÚnormal_rS   Úzero_rÏ   Úpadding_idx)r4   Úmodulerß   s      r'   Ú_init_weightsz MvpPreTrainedModel._init_weightsô  sÃ   € Øk‰k×"Ñ"ˆÜfœbŸi™iÔ(ØM‰M×Ñ×&Ñ&¨C°SÐ&Ô9Ø{‰{Ð&Ø—‘× Ñ ×&Ñ&Õ(ð 'ä˜¤§¡Ô-ØM‰M×Ñ×&Ñ&¨C°SÐ&Ô9Ø×!Ñ!Ð-Ø—‘×"Ñ" 6×#5Ñ#5Ñ6×<Ñ<Õ>ð .ð .r)   c                 ó®   — | j                   j                  }t        j                  g d¢dddd|gg| j                  ¬«      }|j                  |«      |dœ}|S )N)r   é   é
   é   r0   r   é   é   r0   ©r:   )rf   r   )r‹   r   r;   r]   r:   Úne)r4   Ú	pad_tokenr   Údummy_inputss       r'   rñ   zMvpPreTrainedModel.dummy_inputsÿ  sW   € à—K‘K×,Ñ,ˆ	Ü—L‘LÒ"2°Q¸¸2¸qÀ)Ð4LÐ!MÐVZ×VaÑVaÔbˆ	à'Ÿl™l¨9Ó5Ø"ñ
ˆð Ðr)   N)
rE   rF   rG   r   Úconfig_classÚbase_model_prefixÚsupports_gradient_checkpointingrç   Úpropertyrñ   © r)   r'   rÛ   rÛ   î  s,   „ à€LØÐØ&*Ð#ò	?ð ñó ñr)   rÛ   c                   ó  ‡ — e Zd ZdZ	 ddedeej                     dee   fˆ fd„Z	d„ Z
d„ Z	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     deej                     dee   dee   dee   deeef   fd„Zˆ xZS )Ú
MvpEncodera  
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`MvpEncoderLayer`].

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    r‹   Úembed_tokensÚ
use_promptc                 ó2  •— t         ‰|   |«       |j                  | _        |j                  | _        |j
                  }|j                  | _        |j                  | _	        |j                  rt        j                  |«      nd| _        ||| _        n0t        j                   |j"                  || j                  «      | _        t%        |j                  |«      | _        t        j(                  t+        |j,                  «      D cg c]  }t/        |«      ‘Œ c}«      | _        t        j2                  |«      | _        || _        |r7|j8                  | _        t;        ||j,                  |j<                  «      | _        d| _         | jC                  «        y c c}w ©Ng      ð?F)"r2   r3   rQ   Úencoder_layerdropÚ	layerdropr   r   rå   Úmax_position_embeddingsÚmax_source_positionsÚscale_embeddingÚmathÚsqrtÚembed_scalerù   r   rÏ   Ú
vocab_sizer+   Úembed_positionsÚ
ModuleListÚrangeÚencoder_layersrŠ   Úlayersr‘   Úlayernorm_embeddingrú   rÍ   rË   rŽ   r›   Úgradient_checkpointingÚ	post_init)r4   r‹   rù   rú   rO   r{   r5   s         €r'   r3   zMvpEncoder.__init__  sD  ø€ ô 	‰Ñ˜Ô à—~‘~ˆŒØ×1Ñ1ˆŒà—N‘Nˆ	Ø!×.Ñ.ˆÔØ$*×$BÑ$BˆÔ!Ø39×3IÒ3Iœ4Ÿ9™9 YÔ/ÈsˆÔàÐ#Ø ,ˆDÕä "§¡¨V×->Ñ->À	È4×K[ÑK[Ó \ˆDÔä<Ø×*Ñ*Øó 
ˆÔô —m‘mÄeÈF×LaÑLaÓFbÖ$cÀ¤_°VÕ%<Ò$cÓdˆŒÜ#%§<¡<°	Ó#:ˆÔ à$ˆŒÙØ!'×!5Ñ!5ˆDÔÜ$-ØØ×%Ñ%Ø×.Ñ.ó%ˆDÔ!ð ',ˆÔ#à‰Õùò %ds   ÄFc                 ó   — | j                   S rÈ   ©rù   ©r4   s    r'   Úget_input_embeddingszMvpEncoder.get_input_embeddings;  ó   € Ø× Ñ Ð r)   c                 ó   — || _         y rÈ   r  ©r4   Úvalues     r'   Úset_input_embeddingszMvpEncoder.set_input_embeddings>  ó
   € Ø!ˆÕr)   r   rf   Ú	head_maskÚinputs_embedsri   Úoutput_hidden_statesÚreturn_dictrj   c           	      ó4  — ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d«      ‚|$|}|j
                  }	|j                  d|	d   «      }n-| |j                  «       dd }	|dd…dd…df   }nt	        d«      ‚|€| j                  |«      | j                  z  }| j                  |«      }
||
z   }| j                  |«      }t        j                  j                  || j                  | j                  ¬«      }| j                   rIt#        j$                  | j&                  «      j)                  | j*                  «      }| j-                  |«      }|t/        ||j0                  «      }|rdnd}|rdnd}|_|j                  «       d   t3        | j4                  «      k7  r6t	        dt3        | j4                  «      › d	|j                  «       d   › d
«      ‚t7        | j4                  «      D ]Í  \  }}|r||fz   }d}| j                  r&t#        j8                  g «      }|| j:                  k  rd}|rd}n{| j<                  rE| j                  r9| j?                  |j@                  |||||   nd| j                   r|   nd|«      }n% ||||||   nd| j                   r|   nd|¬«      }|d   }|sŒÅ||d   fz   }ŒÏ |r||fz   }|stC        d„ |||fD «       «      S tE        |||¬«      S )a~  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer    z5You have to specify either input_ids or inputs_embedsrn   rö   r   z&The head_mask should be specified for ú layers, but it is for ú.FT)NN)rg   r›   ri   r   c              3   ó&   K  — | ]	  }|€Œ|–— Œ y ­wrÈ   rö   ©Ú.0Úvs     r'   ú	<genexpr>z%MvpEncoder.forward.<locals>.<genexpr>È  s   è ø€ Òe˜qÐWXÑWdœÑeùs   ‚Š©Úlast_hidden_staterc   Ú
attentions)#r‹   ri   r  Úuse_return_dictr$   r"   r_   rq   rù   r  r  r  r   rv   rQ   rp   rú   r;   r<   rÍ   rt   r:   r›   r   r9   Úlenr
  Ú	enumerateÚrandrþ   r  Ú_gradient_checkpointing_funcÚ__call__Útupler   )r4   r   rf   r  r  ri   r  r  ÚinputÚinput_shapeÚ	embed_posrc   rÕ   r›   Úencoder_statesÚall_attentionsÚidxÚencoder_layerÚto_dropÚdropout_probabilityÚlayer_outputss                        r'   rA   zMvpEncoder.forwardA  sV  € ð\ 2CÐ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ð$DÑ È$Ï+É+×JjÑJjð 	ð &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆð Ð  ]Ð%>ÜÐcÓdÐdØÐ"ØˆEØŸ+™+ˆKØ!Ÿ™ r¨;°r©?Ó;‰IØÐ&Ø'×,Ñ,Ó.¨s°Ð3ˆKØ!¢!¢Q¨ (Ñ+‰EäÐTÓUÐUàÐ Ø ×-Ñ-¨iÓ8¸4×;KÑ;KÑKˆMà×(Ñ(¨Ó/ˆ	à%¨	Ñ1ˆØ×0Ñ0°Ó?ˆÜŸ™×-Ñ-¨m¸t¿|¹|ÐVZ×VcÑVcÐ-Ódˆð ?Š?ÜŸ™ d×&8Ñ&8Ó9×<Ñ<¸T¿[¹[ÓIˆJØ#×4Ñ4°ZÓ@Ðð Ð%ä7¸È×H[ÑH[Ó\ˆNá3™¸ˆÙ0™°dˆð Ð Ø~‰~Ó Ñ"¤s¨4¯;©;Ó'7Ò8Ü Ø<¼SÀÇÁÓ=MÐ<Nð OØ!Ÿ™Ó(¨Ñ+Ð,¨Að/óð ô
 #,¨D¯K©KÓ"8ò "	FÑˆCÙ#Ø!/°=Ð2BÑ!BàˆGØ}Š}Ü&+§j¡j°£nÐ#Ø&¨¯©Ò7Ø"GáØ ,‘à×.Ò.°4·=²=Ø$(×$EÑ$EØ%×.Ñ.Ø%Ø&Ø+4Ð+@˜ 3šÀdØ26·/²/Ð)¨#Ò.ÀtØ)ó%‘Mñ %2Ø%Ø&Ø;DÐ;P¨°3ªÐVZØCGÇ?Â?Ð*:¸3Ò*?ÐX\Ø*;ô%Mð !.¨aÑ 0â Ø!/°=ÀÑ3CÐ2EÑ!E‘ðE"	FñH  Ø+¨}Ð.>Ñ>ˆNáÜÑe ]°NÀNÐ$SÔeÓeÐeÜØ+¸>ÐVdô
ð 	
r)   ©NF)NNNNNNN)rE   rF   rG   rH   r   r   r   rÏ   rˆ   r3   r  r  r;   Ú
LongTensorrJ   r¨   r   r   r   rA   rK   rL   s   @r'   rø   rø   
  s÷   ø„ ñð lqñ$Øð$Ø/7¸¿¹Ñ/Eð$ØZbÐcgÑZhõ$òL!ò"ð
 15Ø15Ø,0Ø59Ø,0Ø/3Ø&*ñJ
à˜E×,Ñ,Ñ-ðJ
ð ! §¡Ñ.ðJ
ð ˜EŸL™LÑ)ð	J
ð
   × 1Ñ 1Ñ2ðJ
ð $ D™>ðJ
ð ' t™nðJ
ð ˜d‘^ðJ
ð 
ˆuoÐ%Ñ	&÷J
r)   rø   c                   ó®  ‡ — e Zd ZdZ	 ddedeej                     dee   fˆ fd„Z	d„ Z
d„ Z	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     deej                     deej                     deej                     deeej                        deej                     dee   dee   dee   dee   deeef   fd„Zˆ xZS )Ú
MvpDecoderzû
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`]

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    r‹   rù   rú   c                 ó¶  •— t         ‰|   |«       |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  rt        j                  |j                  «      nd| _        ||| _        n:t        j                   |j"                  |j                  | j                  «      | _        t%        |j                  |j                  «      | _        t        j(                  t+        |j,                  «      D cg c]  }t/        |«      ‘Œ c}«      | _        t        j2                  |j                  «      | _        || _        |r]|j8                  | _        t;        ||j,                  |j<                  «      | _        t;        ||j,                  |j<                  «      | _         d| _!        | jE                  «        y c c}w rü   )#r2   r3   rQ   Údecoder_layerdroprþ   r   rå   rÿ   Úmax_target_positionsr  r  r  r   r  rù   r   rÏ   r  r+   r  r  r  Údecoder_layersrª   r
  r‘   r  rú   rÍ   rË   r¬   r›   r³   r  r  )r4   r‹   rù   rú   r{   r5   s        €r'   r3   zMvpDecoder.__init__Ø  sq  ø€ ô 	‰Ñ˜Ô Ø—~‘~ˆŒØ×1Ñ1ˆŒØ!×.Ñ.ˆÔØ$*×$BÑ$BˆÔ!Ø8>×8NÒ8Nœ4Ÿ9™9 V§^¡^Ô4ÐTWˆÔàÐ#Ø ,ˆDÕä "§¡¨V×->Ñ->ÀÇÁÐPT×P`ÑP`Ó aˆDÔä<Ø×*Ñ*ØN‰Nó 
ˆÔô —m‘mÄeÈF×LaÑLaÓFbÖ$cÀ¤_°VÕ%<Ò$cÓdˆŒÜ#%§<¡<°·±Ó#?ˆÔ à$ˆŒÙØ!'×!5Ñ!5ˆDÔÜ$-ØØ×%Ñ%Ø×.Ñ.ó%ˆDÔ!ô
 &/ØØ×%Ñ%Ø×.Ñ.ó&ˆDÔ"ð ',ˆÔ#à‰Õùò' %ds   ÄGc                 ó   — | j                   S rÈ   r  r  s    r'   r  zMvpDecoder.get_input_embeddings   r  r)   c                 ó   — || _         y rÈ   r  r  s     r'   r  zMvpDecoder.set_input_embeddings  r  r)   r   rf   r°   r±   r  Úcross_attn_head_maskÚpast_key_valuesr  r´   ri   r  r  rj   c                 ój  — |
|
n| j                   j                  }
||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }||t        d«      ‚|$|}|j                  }|j                  d|d   «      }n-| |j                  «       dd }|dd…dd…df   }nt        d«      ‚||d   d   j                  d   nd}|€| j                  |«      | j                  z  }t        ||||«      }||t        ||j                  |d   ¬«      }| j                  ||«      }||z   }| j                  |«      }t         j"                  j%                  || j$                  | j&                  ¬«      }| j(                  rZt+        j,                  | j.                  «      j1                  | j2                  «      }| j5                  |«      }| j7                  |«      }| j8                  r%| j&                  r|	rt:        j=                  d	«       d
}	|rdnd}|
rdnd}|
r|dnd}|	rdnd}t?        ||gddg«      D ]j  \  }}|€Œ	|j                  «       d   tA        | jB                  «      k7  sŒ3t        d|› dtA        | jB                  «      › d|j                  «       d   › d«      ‚ tE        | jB                  «      D ](  \  }}|r||fz  }| j&                  r%t+        jF                  g «      }|| jH                  k  rŒ@|||   nd}| j8                  rc| j&                  rW| jK                  |jL                  |||||||   nd|||   nd| j(                  r|   nd| j(                  r|   ndd|
|	«      }nC ||||||||   nd|||   nd| j(                  r|   nd| j(                  r|   nd||
|	¬«      }|d   }|	r|||
rdnd   fz  }|
sŒ||d   fz  }|€Œ ||d   fz  }Œ+ |r||fz  }|	r|nd} |stO        d„ || |||fD «       «      S tQ        || |||¬«      S )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer    zEYou have to specify either decoder_input_ids or decoder_inputs_embedsr   r0   )rz   rn   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Frö   r  rB  zThe `z` should be specified for r  r  )
rf   r°   r±   rg   r²   r›   r³   re   ri   r´   r   r   c              3   ó$   K  — | ]  }||–— Œ
 y ­wrÈ   rö   r   s     r'   r#  z%MvpDecoder.forward.<locals>.<genexpr>Ú  s   è ø€ ò àØ=ô ñùs   ‚)r%  rC  rc   r&  Úcross_attentions))r‹   ri   r  r´   r'  r$   r"   r_   rq   rù   r  r   r   r9   r  r  r   rv   rQ   rp   rú   r;   r<   rÍ   rt   r:   r›   r³   r  ÚloggerÚwarning_onceÚzipr(  r
  r)  r*  rþ   r+  r,  r-  r   )!r4   r   rf   r°   r±   r  rB  rC  r  r´   ri   r  r  r.  r/  r6   Ú	positionsrc   rÕ   r›   r³   Úall_hidden_statesÚall_self_attnsÚall_cross_attentionsÚnext_decoder_cacheÚ	attn_maskÚ	mask_namer3  Údecoder_layerr6  re   r7  Ú
next_caches!                                    r'   rA   zMvpDecoder.forward  sý  € ð` 2CÐ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ð$DÑ È$Ï+É+×JjÑJjð 	ð "+Ð!6‘I¸D¿K¹K×<QÑ<Qˆ	Ø%0Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆð Ð  ]Ð%>ÜÐsÓtÐtØÐ"ØˆEØ#Ÿ/™/ˆKØ!Ÿ™ r¨;°r©?Ó;‰IØÐ&Ø'×,Ñ,Ó.¨s°Ð3ˆKØ!¢!¢Q¨ (Ñ+‰EäÐdÓeÐeð DSÐC^ °Ñ!3°AÑ!6×!<Ñ!<¸QÒ!?ÐdeÐàÐ Ø ×-Ñ-¨iÓ8¸4×;KÑ;KÑKˆMä:Ø˜K¨Ð8Nó
ˆð
 !Ð,Ð1GÐ1Sä%?Ø&¨×(;Ñ(;À[ÐQSÁ_ô&Ð"ð
 ×(Ñ(¨Ð0FÓGˆ	à%¨	Ñ1ˆØ×0Ñ0°Ó?ˆäŸ™×-Ñ-¨m¸t¿|¹|ÐVZ×VcÑVcÐ-Ódˆð ?Š?ÜŸ™ d×&8Ñ&8Ó9×<Ñ<¸T¿[¹[ÓIˆJØ#×4Ñ4°ZÓ@ÐØ $× 6Ñ 6°zÓ BÐà×&Ò&¨4¯=ª=ÙÜ×#Ñ#Øpôð "	ñ #7™B¸DÐÙ0™°dˆÙ&7Ð<QÐ<]™rÐdhÐÙ#,™R°$Ðô %(¨Ð4HÐ(IÈKÐYoÐKpÓ$qò 	Ñ ˆIyØÑ$Ø—>‘>Ó# AÑ&¬3¨t¯{©{Ó+;Ó<Ü$Ø 	˜{Ð*DÄSÈÏÉÓEUÐDVð WØ%ŸN™NÓ,¨QÑ/Ð0°ð3óð ð	ô #,¨D¯K©KÓ"8ó 3	@ÑˆCá#Ø! mÐ%5Ñ5Ð!Ø}Š}Ü&+§j¡j°£nÐ#Ø&¨¯©Ò7Øà5DÐ5P˜_¨SÒ1ÐVZˆNà×*Ò*¨t¯}ª}Ø $× AÑ AØ!×*Ñ*Ø!Ø"Ø)Ø*Ø&/Ð&;I˜c’NÀØ1EÐ1QÐ(¨Ò-ÐW[Ø-1¯_ª_Ð$ SÒ)À$Ø.2¯oªoÐ% cÒ*À4ØØ%Øó!‘ñ !.Ø!Ø#1Ø*?Ø+AØ7@Ð7L Y¨s¢^ÐRVà5IÐ5UÐ,¨SÒ1Ð[_à?C¿ºÐ&6°sÒ&;ÐTXØAEÇÂÐ'8¸Ò'=ÐVZØ#1Ø&7Ø'ô!ð *¨!Ñ,ˆMáØ" }Ñ:K±QÐQRÑ'SÐ&UÑUÐ"ã Ø =°Ñ#3Ð"5Ñ5à(Ò4Ø(¨]¸1Ñ-=Ð,?Ñ?Ò(ðg3	@ñl  Ø -Ð!1Ñ1Ðá+4Ñ'¸$ˆ
ÙÜñ à'¨Ð5FÈÐXlÐmôó ð ô
 9Ø+Ø&Ø+Ø%Ø1ô
ð 	
r)   r8  )NNNNNNNNNNNN)rE   rF   rG   rH   r   r   r   rÏ   rˆ   r3   r  r  r;   r9  rJ   r¨   r   r   r   r   rA   rK   rL   s   @r'   r;  r;  Î  sq  ø„ ñð lqñ&Øð&Ø/7¸¿¹Ñ/Eð&ØZbÐcgÑZhõ&òP!ò"ð
 15Ø15Ø=AØ=AØ,0Ø7;Ø=AØ59Ø$(Ø,0Ø/3Ø&*ñ_
à˜E×,Ñ,Ñ-ð_
ð ! §¡Ñ.ð_
ð  (¨×(9Ñ(9Ñ:ð	_
ð
 !)¨×)9Ñ)9Ñ :ð_
ð ˜EŸL™LÑ)ð_
ð ' u§|¡|Ñ4ð_
ð " $ u×'8Ñ'8Ñ"9Ñ:ð_
ð   × 1Ñ 1Ñ2ð_
ð ˜D‘>ð_
ð $ D™>ð_
ð ' t™nð_
ð ˜d‘^ð_
ð 
ˆuÐ?Ð?Ñ	@÷_
r)   r;  c            $       ó  ‡ — e Zd ZdgZddgZdefˆ fd„Zd„ Zd„ Zd„ Z	d	„ Z
d
„ Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                      deej                     deej                     deej                      deej                      deej                      deeej$                        deeej$                        deej$                     deej$                     dee   dee   dee   dee   deeef   f d„«       Zˆ xZS )ÚMvpModelÚfinal_logits_biasúencoder.embed_tokens.weightúdecoder.embed_tokens.weightr‹   c                 óz  •— t         ‰|   |«       |j                  |j                  }}|j                  | _        t        j                  ||j                  |«      | _        t        || j                  |j                  «      | _
        t        || j                  |j                  «      | _        | j                  «        y rÈ   )r2   r3   r   r  rú   r   rÏ   r   Úsharedrø   Úencoderr;  Údecoderr  )r4   r‹   rå   r  r5   s       €r'   r3   zMvpModel.__init__í  sŠ   ø€ Ü‰Ñ˜Ô à"(×"5Ñ"5°v×7HÑ7HZˆØ ×+Ñ+ˆŒÜ—l‘l :¨v¯~©~¸{ÓKˆŒä! &¨$¯+©+°v×7HÑ7HÓIˆŒÜ! &¨$¯+©+°v×7HÑ7HÓIˆŒð 	‰Õr)   c                 ó   — | j                   S rÈ   )rY  r  s    r'   r  zMvpModel.get_input_embeddingsú  s   € Ø{‰{Ðr)   c                 ó~   — || _         | j                   | j                  _        | j                   | j                  _        y rÈ   )rY  rZ  rù   r[  r  s     r'   r  zMvpModel.set_input_embeddingsý  s)   € ØˆŒØ$(§K¡Kˆ‰Ô!Ø$(§K¡Kˆ‰Õ!r)   c                 ó   — | j                   S rÈ   )rZ  r  s    r'   Úget_encoderzMvpModel.get_encoder  ó   € Ø|‰|Ðr)   c                 ó   — | j                   S rÈ   ©r[  r  s    r'   Úget_decoderzMvpModel.get_decoder  r`  r)   c                 ó*  — | j                   sJ d«       ‚| j                  d«       | j                  j                  j                  d«       | j                  j                  j                  d«       | j                  j
                  j                  d«       y )NzHIf you want to use lightweight tuning, make sure that `use_prompt=True`.FT)rú   Úrequires_grad_rZ  r›   r[  r³   r  s    r'   Úset_lightweight_tuningzMvpModel.set_lightweight_tuning  sj   € ØŠÐjÐ jÓjˆà×Ñ˜EÔ"Ø‰×%Ñ%×4Ñ4°TÔ:Ø‰×%Ñ%×4Ñ4°TÔ:Ø‰×&Ñ&×5Ñ5°dÕ;r)   r   rf   Údecoder_input_idsÚdecoder_attention_maskr  Údecoder_head_maskrB  Úencoder_outputsrC  r  Údecoder_inputs_embedsr´   ri   r  r  rj   c                 ó:  — |€D|€B|€t        d«      ‚t        || j                  j                  | j                  j                  «      }||n| j                  j
                  }||n| j                  j                  }||n| j                  j                  }||n| j                  j                  }|€| j                  ||||
|||¬«      }nI|rGt        |t        «      s7t        |d   t        |«      dkD  r|d   ndt        |«      dkD  r|d   nd¬«      }| j                  |||d   ||||	|||||¬«      }|s||z   S t        |j                  |j                   |j"                  |j$                  |j&                  |j                  |j"                  |j$                  ¬	«      S )
a"  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        Nz°If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   rf   r  r  ri   r  r  r   r   r0   r$  ©r   rf   r°   r±   r  rB  rC  r  r´   ri   r  r  )r%  rC  Údecoder_hidden_statesÚdecoder_attentionsrF  Úencoder_last_hidden_stater°   Úencoder_attentions)r$   r(   r‹   r   r   ri   r  r´   r'  rZ  rá   r   r(  r[  r   r%  rC  rc   r&  rF  )r4   r   rf   rg  rh  r  ri  rB  rj  rC  r  rk  r´   ri   r  r  Údecoder_outputss                    r'   rA   zMvpModel.forward  sÖ  € ðd Ð$Ð)>Ð)FØÐ Ü ðUóð ô !3Ø˜4Ÿ;™;×3Ñ3°T·[±[×5WÑ5Wó!Ðð 2CÐ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ð$DÑ È$Ï+É+×JjÑJjð 	ð "+Ð!6‘I¸D¿K¹K×<QÑ<Qˆ	Ø%0Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆàÐ"Ø"Ÿl™lØ#Ø-Ø#Ø+Ø"3Ø%9Ø'ð +ó ‰Oñ ¤¨O¼_Ô!MÜ-Ø"1°!Ñ"4Ü47¸Ó4HÈ1Ò4L˜o¨aÒ0ÐRVÜ14°_Ó1EÈÒ1I˜?¨1Ò-ÈtôˆOð Ÿ,™,Ø'Ø1Ø"1°!Ñ"4Ø#1Ø'Ø!5Ø+Ø/ØØ/Ø!5Ø#ð 'ó 
ˆñ Ø" _Ñ4Ð4ä!Ø-×?Ñ?Ø+×;Ñ;Ø"1×"?Ñ"?Ø.×9Ñ9Ø,×=Ñ=Ø&5×&GÑ&GØ"1×"?Ñ"?Ø.×9Ñ9ô	
ð 		
r)   ©NNNNNNNNNNNNNNN)rE   rF   rG   Ú"_keys_to_ignore_on_load_unexpectedÚ_tied_weights_keysr   r3   r  r  r_  rc  rf  r   r   r;   r9  rJ   r   r¨   rˆ   r   r   r   rA   rK   rL   s   @r'   rT  rT  è  sÆ  ø„ à*=Ð)>Ð&Ø7Ð9VÐWÐð˜yõ òò0ò
òò<ð ð 15Ø15Ø8<Ø=AØ,0Ø48Ø7;Ø=AØ=AØ59Ø=AØ$(Ø,0Ø/3Ø&*ñ!r
à˜E×,Ñ,Ñ-ðr
ð ! §¡Ñ.ðr
ð $ E×$4Ñ$4Ñ5ð	r
ð
 !)¨×)9Ñ)9Ñ :ðr
ð ˜EŸL™LÑ)ðr
ð $ E§L¡LÑ1ðr
ð ' u§|¡|Ñ4ðr
ð " $ u×'8Ñ'8Ñ"9Ñ:ðr
ð " $ u×'8Ñ'8Ñ"9Ñ:ðr
ð   × 1Ñ 1Ñ2ðr
ð  (¨×(9Ñ(9Ñ:ðr
ð ˜D‘>ðr
ð $ D™>ðr
ð ' t™nðr
ð  ˜d‘^ð!r
ð" 
ˆuÐ(Ð(Ñ	)ò#r
ó ôr
r)   rT  ze
    The MVP Model with a language modeling head. Can be used for various text generation tasks.
    )Úcustom_introc            &       ó¢  ‡ — e Zd Zg d¢Zdefˆ fd„Zd„ Zd„ Z	 d#dede	e   d	e
d
ej                  fˆ fd„Zded
dfd„Zd„ Zd„ Zd„ Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d$de	ej(                     de	ej*                     de	ej(                     de	ej(                     de	ej*                     de	ej*                     de	ej*                     de	eej.                        de	eej.                        de	ej.                     de	ej.                     de	ej(                     de	e
   de	e
   de	e
   de	e
   d
eeef   f"d „«       Zdej*                  fd!„Zed"„ «       Zˆ xZS )%ÚMvpForConditionalGeneration)rV  rW  úlm_head.weightr‹   c                 óx  •— t         ‰|   |«       t        |«      | _        | j	                  dt        j                  d| j                  j                  j                  f«      «       t        j                  |j                  | j                  j                  j                  d¬«      | _        | j                  «        y )NrU  r   FrU   )r2   r3   rT  rÜ   Úregister_bufferr;   rs   rY  r,   r   rX   r   Úlm_headr  rš   s     €r'   r3   z$MvpForConditionalGeneration.__init__Ž  s€   ø€ Ü‰Ñ˜Ô Ü˜fÓ%ˆŒ
Ø×ÑÐ0´%·+±+¸qÀ$Ç*Á*×BSÑBS×BbÑBbÐ>cÓ2dÔeÜ—y‘y §¡°·±×1BÑ1B×1QÑ1QÐX]Ô^ˆŒð 	‰Õr)   c                 ó6   — | j                   j                  «       S rÈ   )rÜ   r_  r  s    r'   r_  z'MvpForConditionalGeneration.get_encoder—  ó   € Øz‰z×%Ñ%Ó'Ð'r)   c                 ó6   — | j                   j                  «       S rÈ   )rÜ   rc  r  s    r'   rc  z'MvpForConditionalGeneration.get_decoderš  r~  r)   NÚnew_num_tokensÚpad_to_multiple_ofÚmean_resizingrj   c                 óL   •— t         ‰|   |||«      }| j                  |«       |S rÈ   )r2   Úresize_token_embeddingsÚ_resize_final_logits_bias)r4   r€  r  r‚  Únew_embeddingsr5   s        €r'   r„  z3MvpForConditionalGeneration.resize_token_embeddings  s.   ø€ ô ™Ñ8¸ÐI[Ð]jÓkˆØ×&Ñ& ~Ô6ØÐr)   c                 ó6  — | j                   j                  d   }||k  r| j                   d d …d |…f   }nSt        j                  d||z
  f| j                   j                  ¬«      }t        j
                  | j                   |gd¬«      }| j                  d|«       y )Nr    r   rî   rl   rU  )rU  r"   r;   rs   r:   rr   r{  )r4   r€  Úold_num_tokensÚnew_biasÚ
extra_biass        r'   r…  z5MvpForConditionalGeneration._resize_final_logits_bias¤  sŒ   € Ø×/Ñ/×5Ñ5°bÑ9ˆØ˜^Ò+Ø×-Ñ-ªa°°.°Ð.@ÑA‰HäŸ™ a¨¸.Ñ)HÐ%IÐRV×RhÑRh×RoÑRoÔpˆJÜ—y‘y $×"8Ñ"8¸*Ð!EÈ1ÔMˆHØ×ÑÐ0°(Õ;r)   c                 ó   — | j                   S rÈ   ©r|  r  s    r'   Úget_output_embeddingsz1MvpForConditionalGeneration.get_output_embeddings­  r`  r)   c                 ó   — || _         y rÈ   rŒ  ©r4   r†  s     r'   Úset_output_embeddingsz1MvpForConditionalGeneration.set_output_embeddings°  ó	   € Ø%ˆr)   c                 ón   — | j                   j                  «        | j                  j                  d«       y r8  ©rÜ   rf  r|  re  r  s    r'   rf  z2MvpForConditionalGeneration.set_lightweight_tuning³  ó$   € Ø
‰
×)Ñ)Ô+Ø‰×#Ñ# EÕ*r)   r   rf   rg  rh  r  ri  rB  rj  rC  r  rk  Úlabelsr´   ri   r  r  c                 óÒ  — ||n| j                   j                  }|R|rt        j                  d«       d}|€7|€5t	        || j                   j
                  | j                   j                  «      }| j                  |||||||||	|
|||||¬«      }| j                  |d   «      | j                  z   }d}|Ft        «       } ||j                  d| j                   j                  «      |j                  d«      «      }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                   |j"                  |j$                  |j&                  |j(                  ¬«	      S )	aµ  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example of summarization:

        Fine-tuning a model
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
        ...     return_tensors="pt",
        ... )
        >>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     generated_ids = model.generate(**inputs)

        >>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        ```
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)rf   rg  rj  rh  r  ri  rB  rC  r  rk  r´   ri   r  r  r   r    r   ©	ÚlossÚlogitsrC  rn  ro  rF  rp  r°   rq  )r‹   r'  rG  Úwarningr(   r   r   rÜ   r|  rU  r	   r_   r  r   rC  rn  ro  rF  rp  r°   rq  )r4   r   rf   rg  rh  r  ri  rB  rj  rC  r  rk  r•  r´   ri   r  r  r§   Ú	lm_logitsÚmasked_lm_lossÚloss_fctÚoutputs                         r'   rA   z#MvpForConditionalGeneration.forward·  sŠ  € ðb &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆàÐÙÜ—‘ÐkÔlØˆIØ Ð(Ð-BÐ-JÜ$6Ø˜DŸK™K×4Ñ4°d·k±k×6XÑ6Xó%Ð!ð —*‘*ØØ)Ø/Ø+Ø#9ØØ/Ø!5Ø+Ø'Ø"7ØØ/Ø!5Ø#ð ó 
ˆð" —L‘L ¨¡Ó,¨t×/EÑ/EÑEˆ	àˆØÐÜ'Ó)ˆHÙ% i§n¡n°R¸¿¹×9OÑ9OÓ&PÐRX×R]ÑR]Ð^`ÓRaÓbˆNáØ\ G¨A¨B KÑ/ˆFØ3AÐ3M^Ð%¨Ñ.ÐYÐSYÐYäØØØ#×3Ñ3Ø")×"?Ñ"?Ø&×9Ñ9Ø$×5Ñ5Ø&-×&GÑ&GØ")×"?Ñ"?Ø&×9Ñ9ô

ð 
	
r)   c                 ól   — t        || j                  j                  | j                  j                  «      S rÈ   )r(   r‹   r   r   )r4   r•  s     r'   Ú%prepare_decoder_input_ids_from_labelszAMvpForConditionalGeneration.prepare_decoder_input_ids_from_labels;  s%   € Ü! &¨$¯+©+×*BÑ*BÀDÇKÁK×DfÑDfÓgÐgr)   c                 ó\   ‡— d}| D ]#  }|t        ˆfd„|d d D «       «      |dd  z   fz  }Œ% |S )Nrö   c              3   ót   •K  — | ]/  }|j                  d ‰j                  |j                  «      «      –— Œ1 y­wrD   ©Úindex_selectrt   r:   ©r!  Ú
past_stateÚbeam_idxs     €r'   r#  z=MvpForConditionalGeneration._reorder_cache.<locals>.<genexpr>D  s.   øè ø€ ÒrÐU_j×-Ñ-¨a°·±¸Z×=NÑ=NÓ1O×PÑrùó   ƒ58r0   ©r-  ©rC  r§  Úreordered_pastÚ
layer_pasts    `  r'   Ú_reorder_cachez*MvpForConditionalGeneration._reorder_cache>  sT   ø€ àˆØ)ò 	ˆJàÜÓrÐcmÐnpÐopÐcqÔrÓrØ˜Q˜R.ñ!ðñ ‰Nð	ð Ðr)   )NT©NNNNNNNNNNNNNNNN) rE   rF   rG   ru  r   r3   r_  rc  rI   r   rˆ   r   rÏ   r„  r…  r  r  rf  r   r;   r9  rJ   r   r¨   r   r   r   rA   r   Ústaticmethodr­  rK   rL   s   @r'   rx  rx  †  s@  ø„ ò jÐð˜yõ ò(ò(ð dhñØ!ðØ7?À±}ðØ\`ðà	‰õð<¸ð <Àó <òò&ò+ð ð 15Ø15Ø8<Ø=AØ,0Ø48Ø7;Ø=AØ=AØ59Ø=AØ-1Ø$(Ø,0Ø/3Ø&*ñ#A
à˜E×,Ñ,Ñ-ðA
ð ! §¡Ñ.ðA
ð $ E×$4Ñ$4Ñ5ð	A
ð
 !)¨×)9Ñ)9Ñ :ðA
ð ˜EŸL™LÑ)ðA
ð $ E§L¡LÑ1ðA
ð ' u§|¡|Ñ4ðA
ð " $ u×'8Ñ'8Ñ"9Ñ:ðA
ð " $ u×'8Ñ'8Ñ"9Ñ:ðA
ð   × 1Ñ 1Ñ2ðA
ð  (¨×(9Ñ(9Ñ:ðA
ð ˜×)Ñ)Ñ*ðA
ð ˜D‘>ðA
ð $ D™>ðA
ð  ' t™nð!A
ð" ˜d‘^ð#A
ð$ 
ˆuoÐ%Ñ	&ò%A
ó ðA
ðFh¸E¿L¹Ló hð ñó ôr)   rx  z„
    Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c            $       óê  ‡ — e Zd ZddgZdefˆ fd„Zd„ Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     deee	j                        dee	j                     dee	j                     dee	j                     dee   dee   dee   dee   deeef   f d„«       Zˆ xZS )ÚMvpForSequenceClassificationrV  rW  r‹   c                 óÜ   •— t        ‰|   |fi |¤Ž t        |«      | _        t	        |j
                  |j
                  |j                  |j                  «      | _        | j                  «        y rÈ   )
r2   r3   rT  rÜ   r¾   r   Ú
num_labelsÚclassifier_dropoutÚclassification_headr  )r4   r‹   Úkwargsr5   s      €r'   r3   z%MvpForSequenceClassification.__init__S  sZ   ø€ Ü‰Ñ˜Ñ* 6Ò*Ü˜fÓ%ˆŒ
Ü#8ØN‰NØN‰NØ×ÑØ×%Ñ%ó	$
ˆÔ ð 	‰Õr)   c                 ón   — | j                   j                  «        | j                  j                  d«       y r8  )rÜ   rf  rµ  re  r  s    r'   rf  z3MvpForSequenceClassification.set_lightweight_tuning`  s&   € Ø
‰
×)Ñ)Ô+Ø× Ñ ×/Ñ/°Õ6r)   r   rf   rg  rh  r  ri  rB  rj  r  rk  r•  r´   ri   r  r  rj   c                 ó°  — ||n| j                   j                  }|d}|€$|	"t        d| j                  j                  › «      ‚| j                  |||||||||	|
||||¬«      }|d   }|j                  | j                   j                  «      j                  |j                  «      }t        t        j                  |j                  d«      «      «      dkD  rt        d«      ‚||dd…f   j                  |j!                  d«      d|j!                  d«      «      dd…ddd…f   }| j#                  |«      }d}|¯| j                   j$                  €¡| j                   j&                  dk(  rd	| j                   _        nv| j                   j&                  dkD  rL|j(                  t        j*                  k(  s|j(                  t        j,                  k(  rd
| j                   _        nd| j                   _        | j                   j$                  d	k(  rSt/        «       }| j                   j&                  dk(  r& ||j1                  «       |j1                  «       «      }n– |||«      }nŒ| j                   j$                  d
k(  rGt3        «       } ||j                  d| j                   j&                  «      |j                  d«      «      }n,| j                   j$                  dk(  rt5        «       } |||«      }|s|f|dd z   }||f|z   S |S t7        |||j8                  |j:                  |j<                  |j>                  |j@                  |jB                  |jD                  ¬«	      S )a÷  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        Fine-tuning a model on `num_labels` classes
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForSequenceClassification

        >>> num_labels = 2  # for example, this is a binary classification task
        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)

        >>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
        >>> labels = torch.tensor(1)  # the real label for inputs

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax()
        ```
        NFz8Passing input embeddings is currently not supported for ©rf   rg  rh  r  ri  rB  rj  r  rk  r´   ri   r  r  r   r   z7All examples must have the same number of <eos> tokens.r    Ú
regressionÚsingle_label_classificationÚmulti_label_classificationr—  )#r‹   r'  ÚNotImplementedErrorr5   rE   rÜ   ÚeqÚeos_token_idrt   r:   r(  r;   Úunique_consecutiveÚsumr$   r_   rq   rµ  Úproblem_typer³  r9   r=   rI   r
   Úsqueezer	   r   r   rC  rn  ro  rF  rp  r°   rq  )r4   r   rf   rg  rh  r  ri  rB  rj  r  rk  r•  r´   ri   r  r  r§   rc   Úeos_maskÚsentence_representationr™  r˜  r  rž  s                           r'   rA   z$MvpForSequenceClassification.forwardd  s  € ðZ &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆØÐØˆIàÐ Ð!:Ü%ØJÈ4Ï>É>×KbÑKbÐJcÐdóð ð —*‘*ØØ)Ø/Ø#9ØØ/Ø!5Ø+Ø'Ø"7ØØ/Ø!5Ø#ð ó 
ˆð    ™
ˆà—<‘< §¡× 8Ñ 8Ó9×<Ñ<¸]×=QÑ=QÓRˆäŒu×'Ñ'¨¯©°Q«Ó8Ó9¸AÒ=ÜÐVÓWÐWØ"/°º!°Ñ"<×"AÑ"AÀ-×BTÑBTÐUVÓBWÐY[Ð]j×]oÑ]oÐprÓ]sÓ"tÚˆr’1ˆHñ#
Ðð ×)Ñ)Ð*AÓBˆàˆØÑØ{‰{×'Ñ'Ð/Ø—;‘;×)Ñ)¨QÒ.Ø/;D—K‘KÕ,Ø—[‘[×+Ñ+¨aÒ/°V·\±\ÄUÇZÁZÒ5OÐSY×S_ÑS_Ôch×clÑclÒSlØ/LD—K‘KÕ,à/KD—K‘KÔ,à{‰{×'Ñ'¨<Ò7Ü"›9Ø—;‘;×)Ñ)¨QÒ.Ù# F§N¡NÓ$4°f·n±nÓ6FÓG‘Dá# F¨FÓ3‘DØ—‘×)Ñ)Ð-JÒJÜ+Ó-Ù §¡¨B°·±×0FÑ0FÓ GÈÏÉÐUWËÓY‘Ø—‘×)Ñ)Ð-IÒIÜ,Ó.Ù ¨Ó/ÙØY ¨¨ Ñ,ˆFØ)-Ð)9TG˜fÑ$ÐE¸vÐEä.ØØØ#×3Ñ3Ø")×"?Ñ"?Ø&×9Ñ9Ø$×5Ñ5Ø&-×&GÑ&GØ")×"?Ñ"?Ø&×9Ñ9ô

ð 
	
r)   rs  )rE   rF   rG   ru  r   r3   rf  r   r   r;   r9  rJ   r   r¨   rˆ   r   r   r   rA   rK   rL   s   @r'   r±  r±  J  s¦  ø„ ð 8Ð9VÐWÐð˜yõ ò7ð ð 15Ø15Ø8<Ø=AØ,0Ø48Ø7;Ø=AØ59Ø=AØ-1Ø$(Ø,0Ø/3Ø&*ñ!T
à˜E×,Ñ,Ñ-ðT
ð ! §¡Ñ.ðT
ð $ E×$4Ñ$4Ñ5ð	T
ð
 !)¨×)9Ñ)9Ñ :ðT
ð ˜EŸL™LÑ)ðT
ð $ E§L¡LÑ1ðT
ð ' u§|¡|Ñ4ðT
ð " $ u×'8Ñ'8Ñ"9Ñ:ðT
ð   × 1Ñ 1Ñ2ðT
ð  (¨×(9Ñ(9Ñ:ðT
ð ˜×)Ñ)Ñ*ðT
ð ˜D‘>ðT
ð $ D™>ðT
ð ' t™nðT
ð  ˜d‘^ð!T
ð" 
ˆuÐ5Ð5Ñ	6ò#T
ó ôT
r)   r±  c            &       ó  ‡ — e Zd ZddgZˆ fd„Zd„ Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     deeej                        deej                     deej                     deej                     deej                     dee   dee   dee   dee   deeef   f"d„«       Zˆ xZS )ÚMvpForQuestionAnsweringrV  rW  c                 óò   •— t         ‰|   |«       d|_        |j                  | _        t        |«      | _        t        j                  |j                  |j                  «      | _        | j                  «        y r/   )
r2   r3   r³  rT  rÜ   r   rX   Úhidden_sizeÚ
qa_outputsr  rš   s     €r'   r3   z MvpForQuestionAnswering.__init__   s[   ø€ Ü‰Ñ˜Ô àˆÔØ ×+Ñ+ˆŒä˜fÓ%ˆŒ
ÜŸ)™) F×$6Ñ$6¸×8IÑ8IÓJˆŒð 	‰Õr)   c                 ón   — | j                   j                  «        | j                  j                  d«       y r8  )rÜ   rf  rÊ  re  r  s    r'   rf  z.MvpForQuestionAnswering.set_lightweight_tuning  s$   € Ø
‰
×)Ñ)Ô+Ø‰×&Ñ& uÕ-r)   r   rf   rg  rh  r  ri  rB  rj  Ústart_positionsÚend_positionsr  rk  r´   ri   r  r  rj   c                 ó¬  — ||n| j                   j                  }|	|
d}| j                  ||||||||||||||¬«      }|d   }| j                  |«      }|j	                  dd¬«      \  }}|j                  d«      j                  «       }|j                  d«      j                  «       }d}|	·|
µt        |	j                  «       «      dkD  r|	j                  d«      }	t        |
j                  «       «      dkD  r|
j                  d«      }
|j                  d«      }|	j                  d|«      }	|
j                  d|«      }
t        |¬«      } |||	«      } |||
«      }||z   d	z  }|s||f|dd z   }||f|z   S |S t        ||||j                  |j                  |j                  |j                  |j                   |j"                  |j$                  ¬
«
      S )aX  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        Fine-tuning a model for extrative question answering, and our model also supports generative question answering
        using `BartForConditionalGeneration`
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
        ...     return_tensors="pt",
        ... )
        >>> target_start_index = torch.tensor([18])
        >>> target_end_index = torch.tensor([19])

        >>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        >>> predict_answer = tokenizer.decode(predict_answer_tokens)
        ```
        NFr¹  r   r   r    rl   )Úignore_indexr0   )
r˜  Ústart_logitsÚ
end_logitsrC  rn  ro  rF  rp  r°   rq  )r‹   r'  rÜ   rÊ  rØ   rÃ  ra   r(  rq   r¤   r	   r   rC  rn  ro  rF  rp  r°   rq  )r4   r   rf   rg  rh  r  ri  rB  rj  rÌ  rÍ  r  rk  r´   ri   r  r  r§   Úsequence_outputr™  rÐ  rÑ  Ú
total_lossÚignored_indexr  Ú
start_lossÚend_lossrž  s                               r'   rA   zMvpForQuestionAnswering.forward  s  € ðf &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆØÐ&¨=Ð+DØˆIà—*‘*ØØ)Ø/Ø#9ØØ/Ø!5Ø+Ø'Ø"7ØØ/Ø!5Ø#ð ó 
ˆð" " !™*ˆà—‘ Ó1ˆØ#)§<¡<°°r <Ó#:Ñ ˆjØ#×+Ñ+¨BÓ/×:Ñ:Ó<ˆØ×'Ñ'¨Ó+×6Ñ6Ó8ˆ
àˆ
ØÐ&¨=Ð+Dä?×'Ñ'Ó)Ó*¨QÒ.Ø"1×"9Ñ"9¸"Ó"=Ü=×%Ñ%Ó'Ó(¨1Ò,Ø -× 5Ñ 5°bÓ 9à(×-Ñ-¨aÓ0ˆMØ-×3Ñ3°A°}ÓEˆOØ)×/Ñ/°°=ÓAˆMä'°]ÔCˆHÙ! ,°Ó@ˆJÙ 
¨MÓ:ˆHØ$ xÑ/°1Ñ4ˆJáàØðð ˜˜ñˆFð 0:Ð/EZM FÑ*ÐQÈ6ÐQä2ØØ%Ø!Ø#×3Ñ3Ø")×"?Ñ"?Ø&×9Ñ9Ø$×5Ñ5Ø&-×&GÑ&GØ")×"?Ñ"?Ø&×9Ñ9ô
ð 	
r)   r®  )rE   rF   rG   ru  r3   rf  r   r   r;   rJ   r9  r   r¨   rˆ   r   r   r   rA   rK   rL   s   @r'   rÇ  rÇ  ü  s´  ø„ à7Ð9VÐWÐô
ò.ð ð -1Ø15Ø8<Ø=AØ,0Ø48Ø7;Ø=AØ6:Ø48Ø59Ø=AØ$(Ø,0Ø/3Ø&*ñ#Q
à˜EŸL™LÑ)ðQ
ð ! §¡Ñ.ðQ
ð $ E×$4Ñ$4Ñ5ð	Q
ð
 !)¨×)9Ñ)9Ñ :ðQ
ð ˜EŸL™LÑ)ðQ
ð $ E§L¡LÑ1ðQ
ð ' u§|¡|Ñ4ðQ
ð " $ u×'8Ñ'8Ñ"9Ñ:ðQ
ð " %×"2Ñ"2Ñ3ðQ
ð   × 0Ñ 0Ñ1ðQ
ð   × 1Ñ 1Ñ2ðQ
ð  (¨×(9Ñ(9Ñ:ðQ
ð ˜D‘>ðQ
ð $ D™>ðQ
ð  ' t™nð!Q
ð" ˜d‘^ð#Q
ð$ 
ˆuÐ9Ð9Ñ	:ò%Q
ó ôQ
r)   rÇ  c                   ó(   ‡ — e Zd ZdZˆ fd„Zd„ Zˆ xZS )ÚMvpDecoderWrapperz½
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    c                 óD   •— t         ‰|   |«       t        |«      | _        y rÈ   )r2   r3   r;  r[  rš   s     €r'   r3   zMvpDecoderWrapper.__init__¬  s   ø€ Ü‰Ñ˜Ô Ü! &Ó)ˆr)   c                 ó&   —  | j                   |i |¤ŽS rÈ   rb  )r4   Úargsr¶  s      r'   rA   zMvpDecoderWrapper.forward°  s   € Øˆt|‰|˜TÐ, VÑ,Ð,r)   )rE   rF   rG   rH   r3   rA   rK   rL   s   @r'   rØ  rØ  ¦  s   ø„ ñô
*ö-r)   rØ  c                    óÖ  ‡ — e Zd ZdgZˆ fd„Zd„ Zd„ Zd„ Zd„ Zd„ Z	d„ Z
d	„ Ze	 	 	 	 	 	 	 	 	 	 	 	 	 dd
eej                     deej                      deej"                     deej"                     deej                      deej                      deeej"                        deej"                     deej                     dee   dee   dee   dee   deeef   fd„«       Zed„ «       Zˆ xZS )ÚMvpForCausalLMry  c                 ó  •— t        j                  |«      }d|_        d|_        t        ‰|   |«       t        |«      | _        t        j                  |j                  |j                  d¬«      | _        | j                  «        y )NTFrU   )ÚcopyÚdeepcopyrR   Úis_encoder_decoderr2   r3   rØ  rÜ   r   rX   rÉ  r  r|  r  rš   s     €r'   r3   zMvpForCausalLM.__init__·  sf   ø€ Ü—‘˜vÓ&ˆØ ˆÔØ$)ˆÔ!Ü‰Ñ˜Ô Ü& vÓ.ˆŒ
ä—y‘y ×!3Ñ!3°V×5FÑ5FÈUÔSˆŒð 	‰Õr)   c                 óB   — | j                   j                  j                  S rÈ   ©rÜ   r[  rù   r  s    r'   r  z#MvpForCausalLM.get_input_embeddingsÃ  s   € Øz‰z×!Ñ!×.Ñ.Ð.r)   c                 ó:   — || j                   j                  _        y rÈ   rã  r  s     r'   r  z#MvpForCausalLM.set_input_embeddingsÆ  s   € Ø*/ˆ
‰
×ÑÕ'r)   c                 ó   — | j                   S rÈ   rŒ  r  s    r'   r  z$MvpForCausalLM.get_output_embeddingsÉ  r`  r)   c                 ó   — || _         y rÈ   rŒ  r  s     r'   r  z$MvpForCausalLM.set_output_embeddingsÌ  r‘  r)   c                 ó&   — || j                   _        y rÈ   ©rÜ   r[  )r4   r[  s     r'   Úset_decoderzMvpForCausalLM.set_decoderÏ  s   € Ø$ˆ
‰
Õr)   c                 ó.   — | j                   j                  S rÈ   rè  r  s    r'   rc  zMvpForCausalLM.get_decoderÒ  s   € Øz‰z×!Ñ!Ð!r)   c                 ón   — | j                   j                  «        | j                  j                  d«       y r8  r“  r  s    r'   rf  z%MvpForCausalLM.set_lightweight_tuningÕ  r”  r)   r   rf   r°   r±   r  rB  rC  r  r•  r´   ri   r  r  rj   c                 óD  — ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                  j                  |||||||||
|||¬«      }| j                  |d   «      }d}|	Ft        «       } ||j                  d| j                   j                  «      |	j                  d«      «      }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                  ¬«      S )aÑ  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MvpForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp", add_cross_attention=False)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 8, 50267]
        ```Nrm  r   r    r   )r˜  r™  rC  rc   r&  rF  )r‹   ri   r  r'  rÜ   r[  r|  r	   r_   r  r   rC  rc   r&  rF  )r4   r   rf   r°   r±   r  rB  rC  r  r•  r´   ri   r  r  r§   r™  r˜  r  rž  s                      r'   rA   zMvpForCausalLM.forwardÙ  sF  € ðX 2CÐ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ð$DÑ È$Ï+É+×JjÑJjð 	ð &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆð —*‘*×$Ñ$ØØ)Ø"7Ø#9ØØ!5Ø+Ø'ØØ/Ø!5Ø#ð %ó 
ˆð —‘˜g a™jÓ)ˆàˆØÐÜ'Ó)ˆHÙ˜FŸK™K¨¨D¯K©K×,BÑ,BÓCÀVÇ[Á[ÐQSÃ_ÓUˆDáØY ¨¨ Ñ,ˆFØ'+Ð'7D7˜VÑ#ÐC¸VÐCä0ØØØ#×3Ñ3Ø!×/Ñ/Ø×)Ñ)Ø$×5Ñ5ô
ð 	
r)   c                 óJ   ‡— d}| D ]  }|t        ˆfd„|D «       «      fz  }Œ |S )Nrö   c              3   ót   •K  — | ]/  }|j                  d ‰j                  |j                  «      «      –— Œ1 y­wrD   r£  r¥  s     €r'   r#  z0MvpForCausalLM._reorder_cache.<locals>.<genexpr>4  s.   øè ø€ ÒnÐU_j×-Ñ-¨a°·±¸Z×=NÑ=NÓ1O×PÑnùr¨  r©  rª  s    `  r'   r­  zMvpForCausalLM._reorder_cache/  s=   ø€ àˆØ)ò 	ˆJØÜÓnÐcmÔnÓnðñ ‰Nð	ð Ðr)   )NNNNNNNNNNNNN)rE   rF   rG   ru  r3   r  r  r  r  ré  rc  rf  r   r   r;   r9  rJ   r¨   r   rˆ   r   r   r   rA   r¯  r­  rK   rL   s   @r'   rÝ  rÝ  ´  s  ø„ Ø*Ð+Ðô
ò/ò0òò&ò%ò"ò+ð ð 15Ø15Ø=AØ>BØ,0Ø7;Ø=AØ59Ø-1Ø$(Ø,0Ø/3Ø&*ñS
à˜E×,Ñ,Ñ-ðS
ð ! §¡Ñ.ðS
ð  (¨×(9Ñ(9Ñ:ð	S
ð
 !)¨×):Ñ):Ñ ;ðS
ð ˜EŸL™LÑ)ðS
ð ' u§|¡|Ñ4ðS
ð " $ u×'8Ñ'8Ñ"9Ñ:ðS
ð   × 1Ñ 1Ñ2ðS
ð ˜×)Ñ)Ñ*ðS
ð ˜D‘>ðS
ð $ D™>ðS
ð ' t™nðS
ð ˜d‘^ðS
ð 
ˆuÐ7Ð7Ñ	8òS
ó ðS
ðj ñó ôr)   rÝ  )rÝ  rx  rÇ  r±  rT  rÛ   )=rH   rß  r  Útypingr   r   r   r   r;   Útorch.utils.checkpointr   Útorch.nnr   r	   r
   Úactivationsr   Ú
generationr   Úmodeling_attn_mask_utilsr   r   Úmodeling_outputsr   r   r   r   r   r   r   Úmodeling_utilsr   Úutilsr   r   Úconfiguration_mvpr   Ú
get_loggerrE   rG  rJ   rI   r(   rÏ   r+   ÚModulerN   rŠ   rª   r¾   rË   rÛ   rø   r;  rT  rx  r±  rÇ  rØ  rÝ  Ú__all__rö   r)   r'   ú<module>rü     sß  ðñ ã Û ß /Ó /ã Û Ý ß AÑ Aå !Ý )÷÷÷ ñ õ .ß ,Ý (ð 
ˆ×	Ñ	˜HÓ	%€ð %§,¡,ð ¸cð Ð[^ó ô"; B§L¡Lô ;ô2XB2—9‘9ô XBôvEb—i‘iô EôPzb—i‘iô zô|˜BŸI™Iô ô0—	‘	ô ð2 ô˜ó ó ðô6A
Ð#ô A
ôHW
Ð#ô W
ðt ôZ
Ð!ó Z
ó ðZ
ñz ðôô
|Ð"4°oó |óð
|ñ~ ðôôi
Ð#5ó i
óði
ðX ôe
Ð0ó e
ó ðe
ôR-Ð*ô -ôBÐ'¨ô BòJr)   