
    UhJ                    <   d Z ddlZddlZddlmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$  e"jJ                  e&      Z'dejP                  de)de)fdZ* G d de
jV                        Z, G d de
jZ                        Z. G d de
jZ                        Z/ G d de
jZ                        Z0 G d de
jZ                        Z1 G d de
jZ                        Z2e! G d d e             Z3 G d! d"e3      Z4 G d# d$e3      Z5e! G d% d&e3             Z6 e!d'(       G d) d*e3e             Z7 e!d+(       G d, d-e3             Z8e! G d. d/e3             Z9 G d0 d1e3      Z: G d2 d3e3e      Z;g d4Z<y)5zPyTorch MVP model.    N)ListOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutput)PreTrainedModel)auto_docstringlogging   )	MvpConfig	input_idspad_token_iddecoder_start_token_idc                     | j                  | j                        }| ddddf   j                         |ddddf<   ||dddf<   |t        d      |j	                  |dk(  |       |S )z1
    Shift input ids one token to the right.
    Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r   r   r   shifted_input_idss       v/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/mvp/modeling_mvp.pyshift_tokens_rightr(   2   s}     "++IOO<(CRC0668ae4adLMM""#4#<lK    c                   n     e Zd ZdZdedef fdZd	dej                  dedej                  f fdZ xZ	S )
MvpLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 N    d| _         t        | 	  || j                   z   |       y N   )offsetsuper__init__)selfr,   r-   	__class__s      r'   r3   z&MvpLearnedPositionalEmbedding.__init__H   s$     $++5}Er)   r   past_key_values_lengthposition_idsc                 $   |a|j                   dd \  }}t        j                  |||z   t        j                  | j                  j
                        j                  |d      }n|j                  d      }t        | %  || j                  z         S )z3`input_ids' shape is expected to be [bsz x seqlen].Nr0   )dtypedevicer    r   )r"   torcharangelongweightr:   expand	unsqueezer2   forwardr1   )r4   r   r6   r7   bszseq_lenr5   s         r'   rA   z%MvpLearnedPositionalEmbedding.forwardN   s     $??2A.LC <<&(>(HPUPZPZcgcncncucufS"o  (11!4Lw|dkk9::r)   r   N)
__name__
__module____qualname____doc__intr3   r;   TensorrA   __classcell__r5   s   @r'   r+   r+   C   sH    Fs F3 F; ;s ;^c^j^j ; ;r)   r+   c                       e Zd ZdZ	 	 	 ddededededef
 fdZdej                  d	ed
efdZ
	 	 	 	 	 	 ddej                  deej                     deeej                        deej                     deej                     deej                     dedeej                  eej                     eeej                        f   fdZ xZS )MvpAttentionz=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsdropout
is_decoderbiasc                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rS   )r2   r3   rO   rP   rQ   head_dimr$   scalingrR   r   Lineark_projv_projq_projout_proj)r4   rO   rP   rQ   rR   rS   r5   s         r'   r3   zMvpAttention.__init___   s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$ii	94@ii	94@ii	94@		)YTBr)   tensorrC   rB   c                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr   r0   )viewrP   rV   	transpose
contiguous)r4   r]   rC   rB   s       r'   _shapezMvpAttention._shapez   s7    {{3GQQRSUVWbbddr)   hidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskattn_promptoutput_attentionsreturnc                 6   |du}|j                         \  }	}
}| j                  |      | j                  z  }|r||d   }|d   }n
|rE| j                  | j	                  |      d|	      }| j                  | j                  |      d|	      }n|}| j                  | j	                  |      d|	      }| j                  | j                  |      d|	      }t        j                  |d   |gd      }t        j                  |d   |gd      }nD| j                  | j	                  |      d|	      }| j                  | j                  |      d|	      }| j                  r||f}|t        j                  |d   j                  |	ddd      |gd      }t        j                  |d   j                  |	ddd      |gd      }|\t        j                  |	d|
|d   j                  d            j                  |j                        }t        j                  ||gd      }|	| j                  z  d| j                  f} | j                  ||
|	      j                  | } |j                  | } |j                  | }|j                  d      }t        j                   ||j#                  dd            }|j                         |	| j                  z  |
|fk7  r/t%        d|	| j                  z  |
|f d|j                                |{|j                         |	d|
|fk7  r#t%        d	|	d|
|f d|j                                |j                  |	| j                  |
|      |z   }|j                  |	| j                  z  |
|      }t&        j(                  j+                  |d      }||j                         | j                  fk7  r*t%        d
| j                  f d|j                                |j                  dddd      |j                  |	| j                  |
|      z  }|j                  |	| j                  z  |
|      }|r?|j                  |	| j                  |
|      }|j                  |	| j                  z  |
|      }nd}t&        j(                  j-                  || j,                  | j.                        }t        j                   ||      }|j                         |	| j                  z  |
| j                  fk7  r7t%        d|	| j                  |
| j                  f d|j                                |j                  |	| j                  |
| j                        }|j#                  dd      }|j1                  |	|
| j2                        }| j5                  |      }|||fS )z#Input shape: Batch x Time x ChannelNr   r   r    r0   dimz$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )sizer[   rW   rb   rY   rZ   r;   catrR   r?   zerostor:   rP   rV   r_   bmmr`   r$   r   
functionalsoftmaxrQ   rp   reshaperO   r\   )r4   rc   rd   re   rf   rg   rh   ri   is_cross_attentionrB   tgt_len_query_states
key_statesvalue_statesprompt_mask
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                         r'   rA   zMvpAttention.forward}   s    .T9',,.Wa {{=1DLL@."<'*J)!,LT[[1A%BBLJ;;t{{3C'Db#NL'T[[%?SIJ;;t{{='A2sKLN1$5z#BJJ 99nQ&7%FANL T[[%?SIJ;;t{{='A2sKL?? ),7N"KN$9$9#r2r$JJ#W]^_J 99k!n&;&;CR&Ll%[abcL)#kk#q';q>;N;Nq;QRUUVdVkVkl!&K+Hr!SDNN*B>
Ct{{<#>CCZP$Z__j1
(|((*5//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVZVdVdfmov?wwL',,S4>>-A7GTL
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1 "))#wGmmK01>AAr)   )        FT)NNNNNF)rE   rF   rG   rH   rI   floatboolr3   r;   rJ   rb   r   r   rA   rK   rL   s   @r'   rN   rN   \   sD   G  CC C 	C
 C C6eU\\ eC ec e 488<1526.2"'wB||wB #5<<0wB !u||!45	wB
 !.wB "%,,/wB ell+wB  wB 
u||Xell3XeELL>Q5RR	SwBr)   rN   c                        e Zd Zdef fdZ	 d
dej                  dej                  dej                  dej                  dee   de	ej                  eej                     f   fd	Z
 xZS )MvpEncoderLayerconfigc                 f   t         |           |j                  | _        t	        | j                  |j
                  |j                        | _        t        j                  | j                        | _
        |j                  | _        t        |j                     | _        |j                  | _        t        j                   | j                  |j"                        | _        t        j                   |j"                  | j                        | _        t        j                  | j                        | _        y )N)rO   rP   rQ   )r2   r3   d_modelrO   rN   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrQ   r   activation_functionactivation_fnactivation_dropoutrX   encoder_ffn_dimfc1fc2final_layer_normr4   r   r5   s     r'   r3   zMvpEncoderLayer.__init__   s    %nn44,,

 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r)   rc   rf   rg   self_attn_promptri   rj   c                    |}| j                  |||||      \  }}}t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|j                  t        j                  k(  rt        j                  |      j                         s#t        j                   |      j                         rEt        j"                  |j                        j$                  dz
  }	t        j&                  ||	 |	      }|f}
|r|
|fz  }
|
S )a@  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, encoder_attention_heads, pro_len, head_dim)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rc   rf   rg   rh   ri   rn   i  )minmax)r   r   rv   rQ   rp   r   r   r   r   r   r   r9   r;   float16isinfanyisnanfinfor   clamp)r4   rc   rf   rg   r   ri   residualr   r{   clamp_valueoutputss              r'   rA   zMvpEncoderLayer.forward  s   * !)-')+(/ *8 *
&|Q --mt||VZVcVc-d =011-@ **488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m<%--/KK&**,M0J0N0N0P++m&9&9:>>EK!KKK<[YM "&Gr)   )F)rE   rF   rG   r   r3   r;   FloatTensorr   r   r   rA   rK   rL   s   @r'   r   r      s    =y =, -24((4 ))4 **	4
  ++4 $D>4 
u  (5+<+<"==	>4r)   r   c                       e Zd Zdef fdZ	 	 	 	 	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deeej                        dee	   dee	   deej                  eeej                  ej                  f      f   fdZ xZS )MvpDecoderLayerr   c                    t         |           |j                  | _        t	        | j                  |j
                  |j                  d      | _        |j                  | _        t        |j                     | _        |j                  | _        t        j                  | j                        | _        t	        | j                  |j
                  |j                  d      | _        t        j                  | j                        | _        t        j$                  | j                  |j&                        | _        t        j$                  |j&                  | j                        | _        t        j                  | j                        | _        y )NT)rO   rP   rQ   rR   )rQ   rR   )r2   r3   r   rO   rN   decoder_attention_headsr   r   rQ   r   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normrX   decoder_ffn_dimr   r   r   r   s     r'   r3   zMvpDecoderLayer.__init__@  s   %nn44,,	
 ~~#F$>$>?"(";";$&LL$@!(NN**,,	
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r)   rc   rf   encoder_hidden_statesencoder_attention_maskrg   cross_attn_layer_head_maskr   cross_attn_promptre   ri   	use_cacherj   c           	      x   |}|	|	dd nd}| j                  ||||||
      \  }}}t        j                  j                  || j                  | j                        }||z   }| j                  |      }d}d}|x|}|	|	dd nd}| j                  |||||||
      \  }}}t        j                  j                  || j                  | j                        }||z   }| j                  |      }||z   }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|f}|
r|||fz  }|r||fz  }|S )aC  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            cross_attn_prompt (`torch.FloatTensor`): prompt of cross attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        Nr0   )rc   re   rf   rg   rh   ri   rn   )rc   rd   rf   rg   rh   re   ri   )r   r   rv   rQ   rp   r   r   r   r   r   r   r   r   )r4   rc   rf   r   r   rg   r   r   r   re   ri   r   r   self_attn_past_key_valueself_attn_weightspresent_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_valuer   s                       r'   rA   zMvpDecoderLayer.forwardZ  s   H ! :H9S>"1#5Y] >Bnn'3)+(/ ?M ?
;(*; --mt||VZVcVc-d =011-@ (,$! ,$H @N?Yrs(;_c%NRN_N_+!65 :-8"3 O` OKM-/K MM11-4<<Z^ZgZg1hM$}4M 88GM !24P P !**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m< ")+=>>G)++Gr)   )
NNNNNNNNFT)rE   rF   rG   r   r3   r;   rJ   r   r   r   r   rA   rK   rL   s   @r'   r   r   ?  s;   =y =: 268<9=26=A37488<,1$(_||_ !._  (5	_
 !) 6_ "%,,/_ %-U\\$:_ #5<<0_ $ELL1_ !u||!45_ $D>_ D>_ 
u  (51B1BEDUDU1U+V"WW	X_r)   r   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z	 xZ
S )
MvpClassificationHeadz-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                     t         |           t        j                  ||      | _        t        j
                  |      | _        t        j                  ||      | _        y )Nro   )r2   r3   r   rX   denseDropoutrQ   r\   )r4   r   r   r   r   r5   s        r'   r3   zMvpClassificationHead.__init__  sD     	YYy)4
zzN3		)[9r)   rc   rj   c                     | j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S N)rQ   r   r;   tanhr\   )r4   rc   s     r'   rA   zMvpClassificationHead.forward  sN    ]3

=1

=1]3m4r)   )rE   rF   rG   rH   rI   r   r3   r;   rJ   rA   rK   rL   s   @r'   r   r     sL    7
:
: 
: 	
:
 
:U\\ ell r)   r   c                   `     e Zd ZdZ fdZdej                  deej                     fdZ xZ	S )	MvpPromptz)Layer-wise prompt for encoder or decoder.c           	      8   t         |           |j                  | _        || _        || _        |j
                  |z  | _        t        j                  |j                        | _	        t        j                  |j                  |j
                        | _        t        j                  t        j                  |j
                  |j                        t        j                         t        j                  |j                  |dz  |j
                  z              | _        y )Nr   r0   )r2   r3   prompt_length
num_layersrP   r   rV   r   r   rQ   	Embeddingprompt_embedding
SequentialrX   prompt_mid_dimGELUprompt_trans)r4   r   r   rP   r5   s       r'   r3   zMvpPrompt.__init__  s    #11$")3zzFNN3 "V-A-A6>> RMMIIfnnf&;&;<GGIIIf++Z!^fnn-LM
r)   
prompt_idsrj   c                 *   | j                  | j                  |            }|j                  | j                  | j                  dz  | j
                  | j                        }| j                  |      }|j                  g d      j                  d      }|S )Nr0   )r   r0   r   r   )
r   r   r_   r   r   rP   rV   rQ   permutesplit)r4   r   prompts      r'   rA   zMvpPrompt.forward  sw    ""4#8#8#DET//11DdnnVZVcVcdf%-33A6r)   )
rE   rF   rG   rH   r3   r;   rJ   r   rA   rK   rL   s   @r'   r   r     s+    3
%,, 53F r)   r   c                   .    e Zd ZeZdZdZd Zed        Z	y)MvpPreTrainedModelmodelTc                    | j                   j                  }t        |t        j                        rY|j
                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j
                  j                  j                  d|       |j                  2|j
                  j                  |j                     j                          y y y )Nr   )meanstd)r   init_std
isinstancer   rX   r>   datanormal_rS   zero_r   padding_idx)r4   moduler   s      r'   _init_weightsz MvpPreTrainedModel._init_weights  s    kk""fbii(MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> . .r)   c                     | j                   j                  }t        j                  g ddddd|gg| j                        }|j                  |      |d}|S )N)r      
      r0   r         r0   r:   )rf   r   )r   r   r;   r]   r:   ne)r4   	pad_tokenr   dummy_inputss       r'   r   zMvpPreTrainedModel.dummy_inputs  sW    KK,,	LL"2Q2q)4L!MVZVaVab	'll95"
 r)   N)
rE   rF   rG   r   config_classbase_model_prefixsupports_gradient_checkpointingr   propertyr    r)   r'   r   r     s,    L&*#	?  r)   r   c                       e Zd ZdZ	 ddedeej                     dee   f fdZ	d Z
d Z	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     deej                     dee   dee   dee   deeef   fdZ xZS )
MvpEncodera  
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`MvpEncoderLayer`].

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    r   embed_tokens
use_promptc                 2   t         |   |       |j                  | _        |j                  | _        |j
                  }|j                  | _        |j                  | _	        |j                  rt        j                  |      nd| _        ||| _        n0t        j                   |j"                  || j                        | _        t%        |j                  |      | _        t        j(                  t+        |j,                        D cg c]  }t/        |       c}      | _        t        j2                  |      | _        || _        |r7|j8                  | _        t;        ||j,                  |j<                        | _        d| _         | jC                          y c c}w Ng      ?F)"r2   r3   rQ   encoder_layerdrop	layerdropr   r   r   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtembed_scaler   r   r   
vocab_sizer+   embed_positions
ModuleListrangeencoder_layersr   layersr   layernorm_embeddingr   r   r   r   r   gradient_checkpointing	post_init)r4   r   r   r   rO   r{   r5   s         r'   r3   zMvpEncoder.__init__  sD    	 ~~11NN	!..$*$B$B!393I3I499Y/s# ,D "V->->	4K[K[ \D<** 
 mmeFLaLaFb$c_V%<$cd#%<<	#: $!'!5!5D$-%%..%D! ',# %ds   Fc                     | j                   S r   r   r4   s    r'   get_input_embeddingszMvpEncoder.get_input_embeddings;         r)   c                     || _         y r   r  r4   values     r'   set_input_embeddingszMvpEncoder.set_input_embeddings>  
    !r)   r   rf   	head_maskinputs_embedsri   output_hidden_statesreturn_dictrj   c           	      4   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |$|}|j
                  }	|j                  d|	d         }n-| |j                         dd }	|dddddf   }nt	        d      || j                  |      | j                  z  }| j                  |      }
||
z   }| j                  |      }t        j                  j                  || j                  | j                        }| j                   rIt#        j$                  | j&                        j)                  | j*                        }| j-                  |      }|t/        ||j0                        }|rdnd}|rdnd}|_|j                         d   t3        | j4                        k7  r6t	        dt3        | j4                         d	|j                         d    d
      t7        | j4                        D ]  \  }}|r||fz   }d}| j                  r&t#        j8                  g       }|| j:                  k  rd}|rd}n{| j<                  rE| j                  r9| j?                  |j@                  |||||   nd| j                   r|   nd|      }n% ||||||   nd| j                   r|   nd|      }|d   }|s||d   fz   } |r||fz   }|stC        d |||fD              S tE        |||      S )a~  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer    z5You have to specify either input_ids or inputs_embedsrn   r   r   z&The head_mask should be specified for  layers, but it is for .FT)NN)rg   r   ri   r   c              3   &   K   | ]	  }||  y wr   r   .0vs     r'   	<genexpr>z%MvpEncoder.forward.<locals>.<genexpr>  s     eqWXWdes   last_hidden_staterc   
attentions)#r   ri   r  use_return_dictr$   r"   r_   rq   r   r  r  r  r   rv   rQ   rp   r   r;   r<   r   rt   r:   r   r   r9   lenr
  	enumeraterandr   r  _gradient_checkpointing_func__call__tupler   )r4   r   rf   r  r  ri   r  r  inputinput_shape	embed_posrc   r   r   encoder_statesall_attentionsidxencoder_layerto_dropdropout_probabilitylayer_outputss                        r'   rA   zMvpEncoder.forwardA  sV   \ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]  ]%>cdd"E++K!r;r?;I&',,.s3K!!Q(+ETUU  --i84;K;KKM((/	%	100?--mt||VZVcVc-d ??d&8&89<<T[[IJ#44Z@ %7H[H[\N30d  ~~"s4;;'78 <S=M<N O!(+,A/ 
 #,DKK"8 "	FC#!/=2B!BG}}&+jjn#&7"G ,..4==$($E$E%..%&+4+@3d26//)#.t)%M %2%&;D;P3VZCG??*:3*?X\*;%M !.a 0 !/=3C2E!EE"	FH  +}.>>Ne]NN$Seee+>Vd
 	
r)   NF)NNNNNNN)rE   rF   rG   rH   r   r   r   r   r   r3   r  r  r;   
LongTensorrJ   r   r   r   r   rA   rK   rL   s   @r'   r   r   
  s     lq$$/7/E$ZbcgZh$L!"
 1515,059,0/3&*J
E,,-J
 !.J
 ELL)	J

   1 12J
 $D>J
 'tnJ
 d^J
 
uo%	&J
r)   r   c                       e Zd ZdZ	 ddedeej                     dee   f fdZ	d Z
d Z	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     deej                     deej                     deej                     deeej                        deej                     dee   dee   dee   dee   deeef   fdZ xZS )
MvpDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`]

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    r   r   r   c                    t         |   |       |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  rt        j                  |j                        nd| _        ||| _        n:t        j                   |j"                  |j                  | j                        | _        t%        |j                  |j                        | _        t        j(                  t+        |j,                        D cg c]  }t/        |       c}      | _        t        j2                  |j                        | _        || _        |r]|j8                  | _        t;        ||j,                  |j<                        | _        t;        ||j,                  |j<                        | _         d| _!        | jE                          y c c}w r   )#r2   r3   rQ   decoder_layerdropr   r   r   r   max_target_positionsr  r  r  r   r  r   r   r   r  r+   r  r  r  decoder_layersr   r
  r   r  r   r   r   r   r   r   r  r  )r4   r   r   r   r{   r5   s        r'   r3   zMvpDecoder.__init__  sq    	 ~~11!..$*$B$B!8>8N8N499V^^4TW# ,D "V->->PTP`P` aD<**NN 
 mmeFLaLaFb$c_V%<$cd#%<<#? $!'!5!5D$-%%..%D!
 &/%%..&D" ',#' %ds   Gc                     | j                   S r   r  r  s    r'   r  zMvpDecoder.get_input_embeddings   r  r)   c                     || _         y r   r  r  s     r'   r  zMvpDecoder.set_input_embeddings  r  r)   r   rf   r   r   r  cross_attn_head_maskpast_key_valuesr  r   ri   r  r  rj   c                 j   |
|
n| j                   j                  }
||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }||t        d      |$|}|j                  }|j                  d|d         }n-| |j                         dd }|dddddf   }nt        d      ||d   d   j                  d   nd}|| j                  |      | j                  z  }t        ||||      }||t        ||j                  |d         }| j                  ||      }||z   }| j                  |      }t         j"                  j%                  || j$                  | j&                        }| j(                  rZt+        j,                  | j.                        j1                  | j2                        }| j5                  |      }| j7                  |      }| j8                  r%| j&                  r|	rt:        j=                  d	       d
}	|rdnd}|
rdnd}|
r|dnd}|	rdnd}t?        ||gddg      D ]j  \  }}|	|j                         d   tA        | jB                        k7  s3t        d| dtA        | jB                         d|j                         d    d       tE        | jB                        D ](  \  }}|r||fz  }| j&                  r%t+        jF                  g       }|| jH                  k  r@|||   nd}| j8                  rc| j&                  rW| jK                  |jL                  |||||||   nd|||   nd| j(                  r|   nd| j(                  r|   ndd|
|	      }nC ||||||||   nd|||   nd| j(                  r|   nd| j(                  r|   nd||
|	      }|d   }|	r|||
rdnd   fz  }|
s||d   fz  }| ||d   fz  }+ |r||fz  }|	r|nd} |stO        d || |||fD              S tQ        || |||      S )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer    zEYou have to specify either decoder_input_ids or decoder_inputs_embedsr   r0   )rz   rn   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r  rB  zThe `z` should be specified for r  r  )
rf   r   r   rg   r   r   r   re   ri   r   r   r   c              3   $   K   | ]  }|| 
 y wr   r   r   s     r'   r#  z%MvpDecoder.forward.<locals>.<genexpr>  s      = s   )r%  rC  rc   r&  cross_attentions))r   ri   r  r   r'  r$   r"   r_   rq   r   r  r   r   r9   r  r  r   rv   rQ   rp   r   r;   r<   r   rt   r:   r   r   r  loggerwarning_oncezipr(  r
  r)  r*  r   r+  r,  r-  r   )!r4   r   rf   r   r   r  rB  rC  r  r   ri   r  r  r.  r/  r6   	positionsrc   r   r   r   all_hidden_statesall_self_attnsall_cross_attentionsnext_decoder_cache	attn_mask	mask_namer3  decoder_layerr6  re   r7  
next_caches!                                    r'   rA   zMvpDecoder.forward  s   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]  ]%>stt"E#//K!r;r?;I&',,.s3K!!Q(+Edee DSC^!3A!6!<!<Q!?de  --i84;K;KKM:K8N

 !,1G1S%?&(;(;[QS_&"
 ((0FG	%	100?--mt||VZVcVc-d ??d&8&89<<T[[IJ#44Z@ $ 6 6z B&&4==##p "	 #7BD0d&7<Q<]rdh#,R$ %(4H(IKYoKp$q 	 Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03 	 #,DKK"8 3	@C#!m%55!}}&+jjn#&75D5P_S1VZN**t}} $ A A!**!")*&/&;IcN1E1Q(-W[-1__$S)$.2oo%c*4%! !.!#1*?+A7@7LYs^RV5I5U,S1[_?C&6s&;TXAE'8'=VZ#1&7'! *!,M"}:KQQR'S&UU" =#3"55(4(]1-=,??(g3	@l  -!11+4'$
 '5FXlm  
 9+&+%1
 	
r)   r8  )NNNNNNNNNNNN)rE   rF   rG   rH   r   r   r   r   r   r3   r  r  r;   r9  rJ   r   r   r   r   r   rA   rK   rL   s   @r'   r;  r;    sq    lq&&/7/E&ZbcgZh&P!"
 1515=A=A,07;=A59$(,0/3&*_
E,,-_
 !._
  ((9(9:	_

 !))9)9 :_
 ELL)_
 'u||4_
 "$u'8'8"9:_
   1 12_
 D>_
 $D>_
 'tn_
 d^_
 
u??	@_
r)   r;  c            $           e Zd ZdgZddgZdef fdZd Zd Zd Z	d	 Z
d
 Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                      deej                     deej                     deej                      deej                      deej                      deeej$                        deeej$                        deej$                     deej$                     dee   dee   dee   dee   deeef   f d       Z xZS )MvpModelfinal_logits_biasencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                 z   t         |   |       |j                  |j                  }}|j                  | _        t        j                  ||j                  |      | _        t        || j                  |j                        | _
        t        || j                  |j                        | _        | j                          y r   )r2   r3   r   r  r   r   r   r   sharedr   encoderr;  decoderr  )r4   r   r   r  r5   s       r'   r3   zMvpModel.__init__  s     "("5"5v7H7HZ ++ll:v~~{K!&$++v7H7HI!&$++v7H7HI 	r)   c                     | j                   S r   )rY  r  s    r'   r  zMvpModel.get_input_embeddings  s    {{r)   c                 ~    || _         | j                   | j                  _        | j                   | j                  _        y r   )rY  rZ  r   r[  r  s     r'   r  zMvpModel.set_input_embeddings  s)    $(KK!$(KK!r)   c                     | j                   S r   )rZ  r  s    r'   get_encoderzMvpModel.get_encoder      ||r)   c                     | j                   S r   r[  r  s    r'   get_decoderzMvpModel.get_decoder  r`  r)   c                 *   | j                   sJ d       | j                  d       | j                  j                  j                  d       | j                  j                  j                  d       | j                  j
                  j                  d       y )NzHIf you want to use lightweight tuning, make sure that `use_prompt=True`.FT)r   requires_grad_rZ  r   r[  r   r  s    r'   set_lightweight_tuningzMvpModel.set_lightweight_tuning  sj    j jjE"%%44T:%%44T:&&55d;r)   r   rf   decoder_input_idsdecoder_attention_maskr  decoder_head_maskrB  encoder_outputsrC  r  decoder_inputs_embedsr   ri   r  r  rj   c                 :   |D|B|t        d      t        || j                  j                  | j                  j                        }||n| j                  j
                  }||n| j                  j                  }||n| j                  j                  }||n| j                  j                  }|| j                  ||||
|||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }| j                  |||d   ||||	|||||      }|s||z   S t        |j                  |j                   |j"                  |j$                  |j&                  |j                  |j"                  |j$                  	      S )
a"  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        NzIf no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   rf   r  r  ri   r  r  r   r   r0   r$  r   rf   r   r   r  rB  rC  r  r   ri   r  r  )r%  rC  decoder_hidden_statesdecoder_attentionsrF  encoder_last_hidden_stater   encoder_attentions)r$   r(   r   r   r   ri   r  r   r'  rZ  r   r   r(  r[  r   r%  rC  rc   r&  rF  )r4   r   rf   rg  rh  r  ri  rB  rj  rC  r  rk  r   ri   r  r  decoder_outputss                    r'   rA   zMvpModel.forward  s   d $)>)F  U  !34;;33T[[5W5W! 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]""ll#-#+"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO ,,'1"1!"4#1'!5+//!5# ' 
 "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r)   NNNNNNNNNNNNNNN)rE   rF   rG   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r3   r  r  r_  rc  rf  r   r   r;   r9  rJ   r   r   r   r   r   r   rA   rK   rL   s   @r'   rT  rT    s   *=)>&79VWy 0
<  15158<=A,0487;=A=A59=A$(,0/3&*!r
E,,-r
 !.r
 $E$4$45	r

 !))9)9 :r
 ELL)r
 $ELL1r
 'u||4r
 "$u'8'8"9:r
 "$u'8'8"9:r
   1 12r
  ((9(9:r
 D>r
 $D>r
 'tnr
  d^!r
" 
u((	)#r
 r
r)   rT  ze
    The MVP Model with a language modeling head. Can be used for various text generation tasks.
    )custom_introc            &           e Zd Zg dZdef fdZd Zd Z	 d#dede	e   d	e
d
ej                  f fdZded
dfdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d$de	ej(                     de	ej*                     de	ej(                     de	ej(                     de	ej*                     de	ej*                     de	ej*                     de	eej.                        de	eej.                        de	ej.                     de	ej.                     de	ej(                     de	e
   de	e
   de	e
   de	e
   d
eeef   f"d        Zdej*                  fd!Zed"        Z xZS )%MvpForConditionalGeneration)rV  rW  lm_head.weightr   c                 x   t         |   |       t        |      | _        | j	                  dt        j                  d| j                  j                  j                  f             t        j                  |j                  | j                  j                  j                  d      | _        | j                          y )NrU  r   FrU   )r2   r3   rT  r   register_bufferr;   rs   rY  r,   r   rX   r   lm_headr  r   s     r'   r3   z$MvpForConditionalGeneration.__init__  s     f%
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^ 	r)   c                 6    | j                   j                         S r   )r   r_  r  s    r'   r_  z'MvpForConditionalGeneration.get_encoder      zz%%''r)   c                 6    | j                   j                         S r   )r   rc  r  s    r'   rc  z'MvpForConditionalGeneration.get_decoder  r~  r)   Nnew_num_tokenspad_to_multiple_ofmean_resizingrj   c                 L    t         |   |||      }| j                  |       |S r   )r2   resize_token_embeddings_resize_final_logits_bias)r4   r  r  r  new_embeddingsr5   s        r'   r  z3MvpForConditionalGeneration.resize_token_embeddings  s.     8I[]jk&&~6r)   c                 6   | j                   j                  d   }||k  r| j                   d d d |f   }nSt        j                  d||z
  f| j                   j                        }t        j
                  | j                   |gd      }| j                  d|       y )Nr    r   r   rl   rU  )rU  r"   r;   rs   r:   rr   r{  )r4   r  old_num_tokensnew_bias
extra_biass        r'   r  z5MvpForConditionalGeneration._resize_final_logits_bias  s    //55b9^+--a..@AHa.)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r)   c                     | j                   S r   r|  r  s    r'   get_output_embeddingsz1MvpForConditionalGeneration.get_output_embeddings  r`  r)   c                     || _         y r   r  r4   r  s     r'   set_output_embeddingsz1MvpForConditionalGeneration.set_output_embeddings  	    %r)   c                 n    | j                   j                          | j                  j                  d       y r8  r   rf  r|  re  r  s    r'   rf  z2MvpForConditionalGeneration.set_lightweight_tuning  $    

))+##E*r)   r   rf   rg  rh  r  ri  rB  rj  rC  r  rk  labelsr   ri   r  r  c                    ||n| j                   j                  }|R|rt        j                  d       d}|7|5t	        || j                   j
                  | j                   j                        }| j                  |||||||||	|
|||||      }| j                  |d         | j                  z   }d}|Ft               } ||j                  d| j                   j                        |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                   |j"                  |j$                  |j&                  |j(                  	      S )	a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example of summarization:

        Fine-tuning a model
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
        ...     return_tensors="pt",
        ... )
        >>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     generated_ids = model.generate(**inputs)

        >>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        ```
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)rf   rg  rj  rh  r  ri  rB  rC  r  rk  r   ri   r  r  r   r    r   	losslogitsrC  rn  ro  rF  rp  r   rq  )r   r'  rG  warningr(   r   r   r   r|  rU  r	   r_   r  r   rC  rn  ro  rF  rp  r   rq  )r4   r   rf   rg  rh  r  ri  rB  rj  rC  r  rk  r  r   ri   r  r  r   	lm_logitsmasked_lm_lossloss_fctoutputs                         r'   rA   z#MvpForConditionalGeneration.forward  s   b &1%<k$++B]B]klI (-B-J$6DKK44dkk6X6X%! **)/+#9/!5+'"7/!5#  
" LL,t/E/EE	')H%innR9O9O&PRXR]R]^`RabN\GABK/F3A3M^%.YSYY#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r)   c                 l    t        || j                  j                  | j                  j                        S r   )r(   r   r   r   )r4   r  s     r'   %prepare_decoder_input_ids_from_labelszAMvpForConditionalGeneration.prepare_decoder_input_ids_from_labels;  s%    !&$++*B*BDKKDfDfggr)   c                 \    d}| D ]#  }|t        fd|d d D              |dd  z   fz  }% |S )Nr   c              3   t   K   | ]/  }|j                  d j                  |j                               1 ywrD   index_selectrt   r:   r!  
past_statebeam_idxs     r'   r#  z=MvpForConditionalGeneration._reorder_cache.<locals>.<genexpr>D  s.     rU_j--aZ=N=N1OPr   58r0   r-  rC  r  reordered_past
layer_pasts    `  r'   _reorder_cachez*MvpForConditionalGeneration._reorder_cache>  sT    ) 	JrcmnpopcqrrQR.! N	 r)   )NTNNNNNNNNNNNNNNNN) rE   rF   rG   ru  r   r3   r_  rc  rI   r   r   r   r   r  r  r  r  rf  r   r;   r9  rJ   r   r   r   r   r   rA   r  staticmethodr  rK   rL   s   @r'   rx  rx    s@    jy (( dh!7?}\`	< < <&+  15158<=A,0487;=A=A59=A-1$(,0/3&*#A
E,,-A
 !.A
 $E$4$45	A

 !))9)9 :A
 ELL)A
 $ELL1A
 'u||4A
 "$u'8'8"9:A
 "$u'8'8"9:A
   1 12A
  ((9(9:A
 ))*A
 D>A
 $D>A
  'tn!A
" d^#A
$ 
uo%	&%A
 A
FhELL h  r)   rx  z
    Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c            $           e Zd ZddgZdef fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     deee	j                        dee	j                     dee	j                     dee	j                     dee   dee   dee   dee   deeef   f d       Z xZS )MvpForSequenceClassificationrV  rW  r   c                     t        |   |fi | t        |      | _        t	        |j
                  |j
                  |j                  |j                        | _        | j                          y r   )
r2   r3   rT  r   r   r   
num_labelsclassifier_dropoutclassification_headr  )r4   r   kwargsr5   s      r'   r3   z%MvpForSequenceClassification.__init__S  sZ    *6*f%
#8NNNN%%	$
  	r)   c                 n    | j                   j                          | j                  j                  d       y r8  )r   rf  r  re  r  s    r'   rf  z3MvpForSequenceClassification.set_lightweight_tuning`  s&    

))+  //6r)   r   rf   rg  rh  r  ri  rB  rj  r  rk  r  r   ri   r  r  rj   c                    ||n| j                   j                  }|d}|$|	"t        d| j                  j                         | j                  |||||||||	|
||||      }|d   }|j                  | j                   j                        j                  |j                        }t        t        j                  |j                  d                  dkD  rt        d      ||ddf   j                  |j!                  d      d|j!                  d            dddddf   }| j#                  |      }d}|| j                   j$                  | j                   j&                  dk(  rd	| j                   _        nv| j                   j&                  dkD  rL|j(                  t        j*                  k(  s|j(                  t        j,                  k(  rd
| j                   _        nd| j                   _        | j                   j$                  d	k(  rSt/               }| j                   j&                  dk(  r& ||j1                         |j1                               }n |||      }n| j                   j$                  d
k(  rGt3               } ||j                  d| j                   j&                        |j                  d            }n,| j                   j$                  dk(  rt5               } |||      }|s|f|dd z   }||f|z   S |S t7        |||j8                  |j:                  |j<                  |j>                  |j@                  |jB                  |jD                  	      S )a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        Fine-tuning a model on `num_labels` classes
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForSequenceClassification

        >>> num_labels = 2  # for example, this is a binary classification task
        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)

        >>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
        >>> labels = torch.tensor(1)  # the real label for inputs

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax()
        ```
        NFz8Passing input embeddings is currently not supported for rf   rg  rh  r  ri  rB  rj  r  rk  r   ri   r  r  r   r   z7All examples must have the same number of <eos> tokens.r    
regressionsingle_label_classificationmulti_label_classificationr  )#r   r'  NotImplementedErrorr5   rE   r   eqeos_token_idrt   r:   r(  r;   unique_consecutivesumr$   r_   rq   r  problem_typer  r9   r=   rI   r
   squeezer	   r   r   rC  rn  ro  rF  rp  r   rq  )r4   r   rf   rg  rh  r  ri  rB  rj  r  rk  r  r   ri   r  r  r   rc   eos_masksentence_representationr  r  r  r  s                           r'   rA   z$MvpForSequenceClassification.forwardd  s   Z &1%<k$++B]B]I!:%J4>>KbKbJcd  **)/#9/!5+'"7/!5#  
   
<< 8 89<<]=Q=QRu''Q89A=VWW"/!"<"A"A-BTBTUVBWY[]j]o]opr]s"tr1H#
 ))*AB{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#FF3D))-JJ+-B0F0F GUWY))-II,./Y,F)-)9TGf$EvE.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r)   rs  )rE   rF   rG   ru  r   r3   rf  r   r   r;   r9  rJ   r   r   r   r   r   r   rA   rK   rL   s   @r'   r  r  J  s    89VWy 7  15158<=A,0487;=A59=A-1$(,0/3&*!T
E,,-T
 !.T
 $E$4$45	T

 !))9)9 :T
 ELL)T
 $ELL1T
 'u||4T
 "$u'8'8"9:T
   1 12T
  ((9(9:T
 ))*T
 D>T
 $D>T
 'tnT
  d^!T
" 
u55	6#T
 T
r)   r  c            &           e Zd ZddgZ fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     deeej                        deej                     deej                     deej                     deej                     dee   dee   dee   dee   deeef   f"d       Z xZS )MvpForQuestionAnsweringrV  rW  c                     t         |   |       d|_        |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r/   )
r2   r3   r  rT  r   r   rX   hidden_size
qa_outputsr  r   s     r'   r3   z MvpForQuestionAnswering.__init__   s[      ++f%
))F$6$68I8IJ 	r)   c                 n    | j                   j                          | j                  j                  d       y r8  )r   rf  r  re  r  s    r'   rf  z.MvpForQuestionAnswering.set_lightweight_tuning  s$    

))+&&u-r)   r   rf   rg  rh  r  ri  rB  rj  start_positionsend_positionsr  rk  r   ri   r  r  rj   c                    ||n| j                   j                  }|	|
d}| j                  ||||||||||||||      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}|	|
t        |	j                               dkD  r|	j                  d      }	t        |
j                               dkD  r|
j                  d      }
|j                  d      }|	j                  d|      }	|
j                  d|      }
t        |      } |||	      } |||
      }||z   d	z  }|s||f|dd z   }||f|z   S |S t        ||||j                  |j                  |j                  |j                  |j                   |j"                  |j$                  

      S )aX  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        Fine-tuning a model for extrative question answering, and our model also supports generative question answering
        using `BartForConditionalGeneration`
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
        ...     return_tensors="pt",
        ... )
        >>> target_start_index = torch.tensor([18])
        >>> target_end_index = torch.tensor([19])

        >>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        >>> predict_answer = tokenizer.decode(predict_answer_tokens)
        ```
        NFr  r   r   r    rl   )ignore_indexr0   )
r  start_logits
end_logitsrC  rn  ro  rF  rp  r   rq  )r   r'  r   r  r   r  ra   r(  rq   r   r	   r   rC  rn  ro  rF  rp  r   rq  )r4   r   rf   rg  rh  r  ri  rB  rj  r  r  r  rk  r   ri   r  r  r   sequence_outputr  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                               r'   rA   zMvpForQuestionAnswering.forward  s   f &1%<k$++B]B]&=+DI**)/#9/!5+'"7/!5#  
" "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J F 0:/EZMF*Q6Q2%!#33")"?"?&99$55&-&G&G")"?"?&99
 	
r)   r  )rE   rF   rG   ru  r3   rf  r   r   r;   rJ   r9  r   r   r   r   r   r   rA   rK   rL   s   @r'   r  r    s   79VW
.  -1158<=A,0487;=A6:4859=A$(,0/3&*#Q
ELL)Q
 !.Q
 $E$4$45	Q

 !))9)9 :Q
 ELL)Q
 $ELL1Q
 'u||4Q
 "$u'8'8"9:Q
 "%"2"23Q
   0 01Q
   1 12Q
  ((9(9:Q
 D>Q
 $D>Q
  'tn!Q
" d^#Q
$ 
u99	:%Q
 Q
r)   r  c                   (     e Zd ZdZ fdZd Z xZS )MvpDecoderWrapperz
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    c                 D    t         |   |       t        |      | _        y r   )r2   r3   r;  r[  r   s     r'   r3   zMvpDecoderWrapper.__init__  s     !&)r)   c                 &     | j                   |i |S r   rb  )r4   argsr  s      r'   rA   zMvpDecoderWrapper.forward  s    t||T,V,,r)   )rE   rF   rG   rH   r3   rA   rK   rL   s   @r'   r  r    s    
*-r)   r  c                        e Zd ZdgZ fdZd Zd Zd Zd Zd Z	d Z
d	 Ze	 	 	 	 	 	 	 	 	 	 	 	 	 dd
eej                     deej                      deej"                     deej"                     deej                      deej                      deeej"                        deej"                     deej                     dee   dee   dee   dee   deeef   fd       Zed        Z xZS )MvpForCausalLMry  c                    t        j                  |      }d|_        d|_        t        |   |       t        |      | _        t        j                  |j                  |j                  d      | _        | j                          y )NTFrU   )copydeepcopyrR   is_encoder_decoderr2   r3   r  r   r   rX   r  r  r|  r  r   s     r'   r3   zMvpForCausalLM.__init__  sf    v& $)! &v.
yy!3!3V5F5FUS 	r)   c                 B    | j                   j                  j                  S r   r   r[  r   r  s    r'   r  z#MvpForCausalLM.get_input_embeddings  s    zz!!...r)   c                 :    || j                   j                  _        y r   r  r  s     r'   r  z#MvpForCausalLM.set_input_embeddings  s    */

'r)   c                     | j                   S r   r  r  s    r'   r  z$MvpForCausalLM.get_output_embeddings  r`  r)   c                     || _         y r   r  r  s     r'   r  z$MvpForCausalLM.set_output_embeddings  r  r)   c                 &    || j                   _        y r   r   r[  )r4   r[  s     r'   set_decoderzMvpForCausalLM.set_decoder  s    $

r)   c                 .    | j                   j                  S r   r  r  s    r'   rc  zMvpForCausalLM.get_decoder  s    zz!!!r)   c                 n    | j                   j                          | j                  j                  d       y r8  r  r  s    r'   rf  z%MvpForCausalLM.set_lightweight_tuning  r  r)   r   rf   r   r   r  rB  rC  r  r  r   ri   r  r  rj   c                 D   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                  j                  |||||||||
|||      }| j                  |d         }d}|	Ft               } ||j                  d| j                   j                        |	j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )a  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MvpForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp", add_cross_attention=False)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 8, 50267]
        ```Nrm  r   r    r   )r  r  rC  rc   r&  rF  )r   ri   r  r'  r   r[  r|  r	   r_   r  r   rC  rc   r&  rF  )r4   r   rf   r   r   r  rB  rC  r  r  r   ri   r  r  r   r  r  r  r  s                      r'   rA   zMvpForCausalLM.forward  sF   X 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] **$$)"7#9!5+'/!5# % 
 gaj)')HFKKDKK,B,BCV[[QS_UDY,F'+'7D7V#CVC0#33!//))$55
 	
r)   c                 J    d}| D ]  }|t        fd|D              fz  } |S )Nr   c              3   t   K   | ]/  }|j                  d j                  |j                               1 ywrD   r  r  s     r'   r#  z0MvpForCausalLM._reorder_cache.<locals>.<genexpr>4  s.     nU_j--aZ=N=N1OPnr  r  r  s    `  r'   r  zMvpForCausalLM._reorder_cache/  s=    ) 	Jncmnn N	 r)   )NNNNNNNNNNNNN)rE   rF   rG   ru  r3   r  r  r  r  r  rc  rf  r   r   r;   r9  rJ   r   r   r   r   r   r   rA   r  r  rK   rL   s   @r'   r  r    s   *+
/0&%"+  1515=A>B,07;=A59-1$(,0/3&*S
E,,-S
 !.S
  ((9(9:	S

 !)):): ;S
 ELL)S
 'u||4S
 "$u'8'8"9:S
   1 12S
 ))*S
 D>S
 $D>S
 'tnS
 d^S
 
u77	8S
 S
j  r)   r  )r  rx  r  r  rT  r   )=rH   r  r  typingr   r   r   r   r;   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   
generationr   modeling_attn_mask_utilsr   r   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   configuration_mvpr   
get_loggerrE   rG  rJ   rI   r(   r   r+   ModulerN   r   r   r   r   r   r   r;  rT  rx  r  r  r  r  __all__r   r)   r'   <module>r     s      / /    A A ! )   . , ( 
		H	%%,, c [^ ";BLL ;2XB299 XBvEbii EPzbii z|BII 0		 2   6A
# A
HW
# W
t Z
! Z
 Z
z 
|"4o |
|~ i
#5 i
i
X e
0 e
 e
R-* -B' BJr)   