
    Uh                       d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	 ddl
Z
ddlZ
ddl
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ  ej<                  e      Z d;dZ!d Z"d;dZ#d Z$e G d de             Z%e G d de             Z&e G d de             Z'e G d de             Z(e G d de             Z) G d dejT                        Z+ G d d ejX                        Z- G d! d"ejX                        Z. G d# d$ejX                        Z/ G d% d&ejX                        Z0 G d' d(ejX                        Z1 ed)*       G d+ d,e)             Z2 ed-*       G d. d/e)             Z3e G d0 d1e)             Z4 ed2*       G d3 d4e)e             Z5 ed5*       G d6 d7e)e             Z6 G d8 d9e)      Z7g d:Z8y)<zRPyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version).    N)	dataclass)OptionalTupleUnion)Tensornn)	LayerNorm   )ACT2FN)GenerationMixin)BaseModelOutput)PreTrainedModel)ModelOutputauto_docstringlogging   )ProphetNetConfigc                     |r/t         j                  j                  | j                         |      S t         j                  j                  | |t        j
                        S )Ndimr   dtype)r   
functionalsoftmaxfloattorchfloat32)hidden_stater   
onnx_traces      /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/prophetnet/modeling_prophetnet.pyr   r   '   sH    }}$$\%7%7%9s$CC}}$$\s%--$PP    c                 z   t        j                  || | f||      t        j                  |      j                  z  }|j	                         j                         }t        |      D ]0  }||   j                  dd       ||   j                  | dz          2 d|dddddf<   t        j                  ||gd      S )	z@
    This function computes the bias for the predict stream
    )devicer   r   F)wrapr   N   r   )
r   onesfinfomindetachclonerangefill_diagonal_triu_cat)sequence_lengthngramr#   r   
left_blockright_block
stream_idxs          r    ngram_attention_biasr4   .   s    
 	

E?O<VSXY\a\g\ghm\n\r\rr  ##%++-KEl 6
J..qu.=:$$j[1_56 Jq!Qw99j+.A66r!   c                    | }d}|rX| dz  } |t        j                  |t        j                  |            j                         | z  z   }t        j                  |      }n)t        j
                  |t        j                  |            }| dz  }t        j                  ||      }|t        j                  |j                         |z        t        j                  ||z        z  | |z
  z  z   }t        j                  |t        j                  |      | dz
  z        j                         }|t        j                  ||j                         |      z   }|S )zo
    This function computes individual parts of the relative position buckets. For more detail, see paper.
    r   r%   r   )r   lt
zeros_likeintabsmaxlogr   mathr(   	ones_likewhere)	num_bucketsmax_distancerelative_positionsis_bidirectionalinv_relative_positionsrel_positions_bucket	max_exactis_smallval_if_larges	            r    compute_relative_bucketsrH   ?   sG    10!Q& hh-u/?/?@V/WX\\^allm 	 "'+A!B!&+A5CSCSTjCk!lq Ixx.	:Huyy)?)E)E)G))STW[W_W_y X  	y	  " "L 99\5??<+HKZ[O+\]aacL/%++hH^HbHbHdfr2ssr!   c                    |j                  d      j                  d|j                  d      d      }||j                  d      z
  }t        j                  |dz
  |fd      j                  d      }|j                  d|j                  d      d      }||j                  d      z
  }t        | ||d      }t        | ||d      }||fS )zm
    This function computes both main and predict relative position buckets. For more detail, see paper.
    r   r   F)rB   )	unsqueezerepeatsizer   r.   rH   )r?   r@   position_idsmain_stream_relative_positions$predicting_stream_relative_positionsmain_relative_position_buckets!predict_relative_position_bucketss          r    #compute_all_stream_relative_bucketsrS   Z   s    
 &2%;%;A%>%E%EaIZIZ[]I^`a%b"%ClF\F\]_F`%`" ,199lQ6F5U[]+^+h+hij+k(+O+V+VWXZfZkZklnZoqr+s(+OR^RhRhikRl+l( &>\#ATY&" )A\#GZ_)% *+LLLr!   c                   2   e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed	<   dZeeej                        ed
<   dZeeej                        ed<   dZeej                     ed<   dZeeej                        ed<   dZeeej                        ed<   ed        Zy)ProphetNetSeq2SeqLMOutputa  
    Base class for sequence-to-sequence language models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the main stream language modeling head (scores for each vocabulary token before
            SoftMax).
        logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
            SoftMax).
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, encoder_sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, encoder_sequence_length)`. Attentions weights of the encoder, after the attention
            softmax, used to compute the weighted average in the self-attention heads.
    Nlosslogitslogits_ngrampast_key_valuesdecoder_hidden_statesdecoder_ngram_hidden_statesdecoder_attentionsdecoder_ngram_attentionscross_attentionsencoder_last_hidden_stateencoder_hidden_statesencoder_attentionsc                 N    t        j                  dt               | j                  S Nzi`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions` instead.warningswarnFutureWarningr^   selfs    r    decoder_cross_attentionsz2ProphetNetSeq2SeqLMOutput.decoder_cross_attentions   $    	

 $$$r!   )__name__
__module____qualname____doc__rV   r   r   FloatTensor__annotations__rW   rX   rY   r   rZ   r[   r\   r]   r^   r_   r`   ra   propertyrj    r!   r    rU   rU   q   sH   :x )-D(5$$
%,*.FHU&&'.04L(5,,-4:>OXeE$5$567>@D8E%*;*;$<=DFJ%0A0A*B!CJ=Au'8'8!9:ACGhuU->->'?@G;?huU%6%678?=Ax(9(9:A@D8E%*;*;$<=D=Au'8'8!9:A% %r!   rU   c                       e Zd ZU dZej
                  ed<   dZeej
                     ed<   dZ	ee
ej
                        ed<   dZee
ej
                        ed<   dZee
ej
                        ed<   dZee
ej
                        ed<   dZee
ej
                        ed	<   dZee
ej
                        ed
<   dZeej
                     ed<   dZee
ej
                        ed<   dZee
ej
                        ed<   ed        Zy)ProphetNetSeq2SeqModelOutputa2  
    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
    decoding.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
            Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size,ngram * decoder_sequence_length, config.vocab_size)`, *optional*):
            Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, encoder_sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, encoder_sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    last_hidden_stateNlast_hidden_state_ngramrY   rZ   r[   r\   r]   r^   r_   r`   ra   c                 N    t        j                  dt               | j                  S rc   rd   rh   s    r    rj   z5ProphetNetSeq2SeqModelOutput.decoder_cross_attentions  rk   r!   )rl   rm   rn   ro   r   rp   rq   rw   r   rY   r   rZ   r[   r\   r]   r^   r_   r`   ra   rr   rj   rs   r!   r    ru   ru      s+   <| (((;?Xe&7&78?:>OXeE$5$567>@D8E%*;*;$<=DFJ%0A0A*B!CJ=Au'8'8!9:ACGhuU->->'?@G;?huU%6%678?=Ax(9(9:A@D8E%*;*;$<=D=Au'8'8!9:A% %r!   ru   c                   l   e Zd ZU dZej
                  ed<   dZeej
                     ed<   dZ	ee
ej
                        ed<   dZee
ej
                        ed<   dZee
ej
                        ed<   dZee
ej
                        ed<   dZee
ej
                        ed	<   dZee
ej
                        ed
<   y)ProphetNetDecoderModelOutputaZ  
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
            Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
            Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the
    rv   Nrw   rY   hidden_stateshidden_states_ngram
attentionsngram_attentionsr^   )rl   rm   rn   ro   r   rp   rq   rw   r   rY   r   r{   r|   r}   r~   r^   rs   r!   r    rz   rz     s    .` (((;?Xe&7&78?:>OXeE$5$567>8<M8E%"3"345<>B%(9(9":;B59Ju00129;?huU%6%678?;?huU%6%678?r!   rz   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed	<   dZeeej                        ed
<   dZeeej                        ed<   y)ProphetNetDecoderLMOutputam  
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the main stream language modeling head (scores for each vocabulary token before
            SoftMax).
        logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
            SoftMax).
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the
    NrV   rW   rX   rY   r{   r|   r}   r~   r^   )rl   rm   rn   ro   rV   r   r   rp   rq   rW   rX   rY   r   r{   r|   r}   r~   r^   rs   r!   r    r   r   X  s    /b )-D(5$$
%,*.FHU&&'.04L(5,,-4:>OXeE$5$567>8<M8E%"3"345<>B%(9(9":;B59Ju00129;?huU%6%678?;?huU%6%678?r!   r   c                   $    e Zd ZeZdZdZd Zd Zy)ProphetNetPreTrainedModel
prophetnetTc                 :   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          y y t        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          y y y )N        )meanstd)
isinstancer   Linearweightdatanormal_configinit_stdbiaszero_	Embeddingpadding_idx)ri   modules     r    _init_weightsz'ProphetNetPreTrainedModel._init_weights  s    fbii(MM&&CT[[5I5I&J{{&  &&( '-MM&&CT[[5I5I&J!!-""6#5#56<<> . .r!   c                    | j                   j                  }| j                   j                  }|J d       |j                  |j                        }|dd df   j                         |ddd f<   ||d<   |J d       |j                  |dk(  |       t        j                  |dk\        j                         sJ d	       |S )
Nzself.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the pad_token_id. See ProphetNet docs for more information.rJ   r   ).r   z1self.model.config.pad_token_id has to be defined.r   z8Verify that `shifted_input_ids` has only positive values)
r   decoder_start_token_idpad_token_id	new_zerosshaper*   masked_fill_r   allitem)ri   	input_idsr   r   shifted_input_idss        r    _shift_rightz&ProphetNetPreTrainedModel._shift_right  s    !%!C!C{{//%1 	
F	
1 &//	@%.sCRCx%8%>%>%@#qr'"$:&!'\)\\'&&'8D'@,Oyy*a/0557s9ss7  r!   N)	rl   rm   rn   r   config_classbase_model_prefixsupports_gradient_checkpointingr   r   rs   r!   r    r   r     s    #L$&*#?!r!   r   c                   B     e Zd ZdZdeddf fdZd fd	Z fdZ xZS )	ProphetNetPositionalEmbeddingsa  
    This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
    based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
    the forward function.
    r   returnNc                     |j                   | _        t        |   |j                   |j                  |j
                         y N)max_position_embeddings
max_lengthsuper__init__hidden_sizer   ri   r   	__class__s     r    r   z'ProphetNetPositionalEmbeddings.__init__  s3     88779K9KVM`M`ar!   c                 (   || j                   J d       ||]|d   d   j                  d   }|d   |z   }t        j                  dt        j                  |      t        | j                   |z         z  }n|&t        j                  |t        j                  |      }t        j                  |d      j                  |      |z  j	                         | j                   z   }|j                  d| j                  dz
        }t        | -  |      |fS )NzCIf position_ids is pre-computed then padding_idx should not be set.r   r%   r   )r   r   r   r#   r   )r   r   r   r&   longr8   cumsumtype_asclampr   r   forward)	ri   inputs_shaper#   attention_maskrY   rN   prev_num_input_idsnum_input_idsr   s	           r    r   z&ProphetNetPositionalEmbeddings.forward  s   $$*:*:*B 	
Q	
C * &5Q%7%:%@%@%C" ,Q2D D$zz&

6R((=89  ")%*ZZEJJW]%^N LLQ7??OR``$&4++ ,
  ,11!T__q5HIw|,l::r!   c                 "    t         |   |      S r   )r   r   )ri   rN   r   s     r    _forwardz'ProphetNetPositionalEmbeddings._forward  s    w|,,r!   )NNN)	rl   rm   rn   ro   r   r   r   r   __classcell__r   s   @r    r   r     s.    b/ bD b;8- -r!   r   c                        e Zd ZdZdedef fdZdej                  dedefdZ		 	 	 	 	 dd	e
e   d
e
e   de
e   de
ee      dedeee
e   f   fdZ xZS )ProphetNetAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   num_attn_headsc                    t         |           |j                  }|j                  | _        |j                  | _        || _        ||z  | _        | j                  |z  |k(  sJ d       t        j                  ||      | _	        t        j                  ||      | _
        t        j                  ||      | _        t        j                  ||      | _        y )Nzw`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and `config.num_decoder_attention_heads`)r   r   r   attention_dropoutdropoutr   head_dimr   r   key_proj
value_proj
query_projout_proj)ri   r   r   r   r   s       r    r   zProphetNetAttention.__init__  s    
 	((!'!9!9~~,#~5}}~-< 	
4	
<
 		+{;))K=))K=		+{;r!   tensorseq_lenbszc                     |j                  ||| j                  | j                        j                  dd      j	                         S Nr   r%   viewr   r   	transpose
contiguous)ri   r   r   r   s       r    _shapezProphetNetAttention._shape  s9    {{3)<)<dmmLVVWXZ[\ggiir!   key_value_statesr   layer_head_maskpast_key_valueoutput_attentionsr   c                    |j                         \  }}}	|d u}
t        |j                               |||	gk(  sJ d|||	f d|j                                 | j                  |      | j                  dz  z  }|
r||d   }|d   }n|
rE| j	                  | j                  |      d|      }| j	                  | j                  |      d|      }nD| j	                  | j                  |      d|      }| j	                  | j                  |      d|      }|
r||f}|| j                  d| j                  f} | j	                  |||      j                  | } |j                  | } |j                  | }|j                  d      }t        j                  d||j                  dd	            }|| j                  ||f}|j                         |k7  rt        d
| d|j                                ||j                         dk(  rd }|| j                  d|f}|2|j                         |k7  rt        d| d|j                                |||z   }|r|}nd }t        j                  j!                  |d      }||j                         | j                  fk(  s&J d| j                  f d|j                                 |j                  dddd      |j                  || j                  ||      z  }|j                  dddd      |z  }t        j                  j#                  || j$                  | j&                        }t        j                  d||      }|| j                  || j                  f}|j                         |k7  rt        d| d|j                                |j                  dd      j)                  |||	      }| j+                  |      }t        j                  j#                  || j"                  | j&                        }|||fS )Nz Size of hidden states should be 	, but is       ?r   r   rJ   r%   zbsij,bsjk->bsikr
   z#Attention weights should have size z Attention mask should have size r   /Head mask for a single layer should be of size ptrainingz `attn_output` should have shape , but is of shape )rM   listr   r   r   r   r   r   r   r   einsumr   
ValueErrorr   r   r   r   r   r   r   reshaper   )ri   r{   r   r   r   r   r   
batch_sizetgt_lenr   is_cross_attentionquery_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsexpected_shapeattn_weights_reshaped
attn_probsattn_outputs                        r    r   zProphetNetAttention.forward  sZ    ,9+=+=+?(
G[ .T9M&&().
 
 	p .j';.N-OyYfYkYkYmXno		p 
 }59KL."<'*J)!,LT]]3C%Db*UJ;;t7G'H"jYL T]]=%A2zRJ;;t}'Er:VL
 ),7N !$"5"5r4==I
Jt{{<*EJJJW$Z__j1
(|((*5//!$||$5|ZEYEYZ[]^E_`$d&9&97GL.0B>BRR[\h\m\m\o[pqrr %.*<*<*>!*C!N$d&9&91gF%.*=*=*?>*Q??OyYgYlYlYnXopqq%'.8L$0!$(!}},,\r,B&"'')d.A.A-CC A4CVCVBXAY Z#((*+-C +//2q!<|?P?PD//'@ L
 %4$8$8B1$EH]$]!]]**$$]] + 


 ll#4j,O$d&9&97DMMR/??OOabmbrbrbtauvww!++Aq199*g{[mmK0mm++K4<<RVR_R_+`1>AAr!   )NNNNF)rl   rm   rn   ro   r   r8   r   r   r   r   r   r   boolr   r   r   s   @r    r   r     s    G< < <0jU\\ jC jc j .2+/,026"'`B #6*`B !(	`B
 "&)`B !v/`B  `B 
vx''	(`Br!   r   c                   2     e Zd ZdZdedef fdZd Z xZS )ProphetNetFeedForwardzm
    This is the residual two feed-forward layer block based on the original Transformer implementation.
    r   ffn_dimc                 *   t         |           t        |j                     | _        t        j                  |j                  |      | _        t        j                  ||j                        | _	        |j                  | _
        |j                  | _        y r   )r   r   r   activation_functionactivation_fnr   r   r   intermediateoutputactivation_dropoutr   )ri   r   r   r   s      r    r   zProphetNetFeedForward.__init__n  sk    #F$>$>?IIf&8&8'Bii););<"(";";~~r!   c                 D   | j                  |      }| j                  |      }t        j                  j	                  || j
                  | j                        }| j                  |      }t        j                  j	                  || j                  | j                        }|S )Nr   )r   r   r   r   r   r   r   r   )ri   r{   s     r    r   zProphetNetFeedForward.forwardv  s    ))-8**=9--mt?V?Vaeanan-oM2--mt||VZVcVc-dr!   )	rl   rm   rn   ro   r   r8   r   r   r   r   s   @r    r   r   i  s!    &/ &# &r!   r   c                   d     e Zd Zdef fdZd Zd Z	 	 	 	 	 	 	 d	deee	      fdZ
d Zd Z xZS )
ProphetNetNgramSelfAttentionr   c                    t         |           |j                  | _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | j                  z  | _	        |j                  | _
        | j                  | j                  z  |j                  k(  sJ d       t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  | j                  | j                  z        | _        d| _        y )Nz6config.hidden_size must be divisible by num_attn_headsF)r   r   r   r?   relative_max_distancenum_decoder_attention_headsr   r   r   r   r0   r   r   r   r   r   r   relative_pos_embeddingsr   r   s     r    r   z%ProphetNetNgramSelfAttention.__init__  sa   !--!--%+%A%A"$@@~~!'!9!9**d.A.AA\\
}}t222f6H6HH 	
D	
H 		&"4"4f6H6HI))F$6$68J8JK))F$6$68J8JK 		&"4"4f6H6HI (*yy1C1CTEUEUX\XkXkEk'l$  r!   c                     |j                  ||| j                  | j                        j                  dd      j	                         S r   r   )ri   r   r   r   s       r    r   z#ProphetNetNgramSelfAttention._shape  s9    {{:w0C0CT]]S]]^_abcnnppr!   c                     d| _         y )NT)r   rh   s    r    prepare_for_onnx_export_z5ProphetNetNgramSelfAttention.prepare_for_onnx_export_  s	    r!   r   c	           	         |j                         \  }	}
}t        |j                               |	|
|gk(  sJ d|	|
|f d|j                          | j                  |      }| j	                  |      }| j                  |      }|| j                  dz  z  }| j                  ||
|	      }| j                  |d|	      }| j                  |d|	      }|	| j                  d| j                  f} |j                  | } |j                  | } |j                  | }|j                  d| j                  z   d      }|j                  d| j                  z   d      }|j                  d| j                  z   d      }|j                  d| j                  z   d      }|d   |dd  }}|d   |dd  }}|d   |dd  }}|d   |dd  }}|<|d   }t        j                  ||fd      }|d   }t        j                  ||fd      }||f}|
d| j                  z   z  }t        j                  d	||j                  dd
            }| j!                  ||||      } || z   }|||z   }t#        |d| j$                        j'                  |      }!|w|j                         | j                  fk(  s&J d| j                  f d|j                                 |j                  dddd      |!j                  |	| j                  d|      z  }!t(        j*                  j-                  |!| j.                  | j0                        }!t        j                  d	|!|      }"|"j                  dd      j3                  |	d||      }"| j5                  |"      }"t        j6                  |d      j                  |	| j                  | j                  || j                        }#t        j6                  |D $cg c]  }$t        j                  ||$gd       c}$d      }%t        j6                  |d      }&t        j                  |D 'cg c])  }'t        j                  ||'gd      j9                  d      + c}'d      }(t        j                  d|#|%f      })| j;                  |&|)||      }*|)|*z   })|5|j=                  dddd
d      }|j?                  |)j@                        }|)|z   })t#        |)d| j$                        j'                  |)      }+|\|j                         | j                  fk(  s&J d| j                  f d|j                                 |j                  ddddd      |+z  }+t(        j*                  j-                  |+| j.                  | j0                        }+t        j                  d|+|(j                  dd      f      },|,j                  dd
      },|,j3                  |	| j                  ||      },| j5                  |,      },t        j                  |"|,gd      j                  |	d|      }-|!j                  |	| j                  |d      }!t(        j*                  j-                  |-| j,                  | j0                        }-|-|!|+|fS c c}$w c c}'w )Nz#`hidden_states` should be of shape r   r   rJ   r   r   r%   r   zbntc,bncs->bntsr
   )r   r   r   r   r   zbnhtc,bnhsc->bnhts   zbnhts,bnhsc->bnhtc)!rM   r   r   r   r   r   r   r   r   r   chunkr0   r   r.   r   r    get_main_relative_pos_embeddingsr   r   r   r   r   r   r   r   r   r   stackrK   #get_predict_relative_pos_embeddingspermutetor   ).ri   r{   r   r   r   extended_predict_attention_maskrQ   rR   rN   r   ngram_sequence_lengthr   r   r   r   r   hidden_states_listquery_states_listkey_states_listvalue_states_listmain_hidden_stateshidden_states_predict_listmain_query_statespredict_query_states_listmain_key_statespredict_key_states_listmain_value_statespredict_value_states_listprev_main_key_statesprev_main_value_statesr/   main_attn_weightsmain_relative_pos_embeddingsmain_attn_probsmain_attn_outputpredict_query_stateskeypredict_key_statespredict_hidden_statesv_ppredict_value_statespredict_attn_weightspredict_relative_pos_embeddingspredict_attn_probspredict_attn_outputr   s.                                                 r    r   z$ProphetNetNgramSelfAttention.forward  s*    :G9K9K9M6
);M&&()j:OQ\-]] 	
1*>SU`2`1a b##$&	
] }5]]=1
}5 $t}}c'9: {{<1F
S[[R<
{{<Z@ $"5"5r4==I
(|((*5$Z__j1
(|((*5 +00TZZQ0G(..q4::~1.E$**1tzz>q*A(..q4::~1.E9KA9NPbcdcePf67H7KM^_`_aMb43B13EWXWYGZ07H7KM^_`_aMb4 %#1!#4 #ii)=(OUVWO%3A%6" %		+ACT*U[\ ] *+<= 0A

NC "LL):<MOhOhijlmOno (,'L'L 1<A_(
$ .0LL% 1N B!
 '#
$	 	 &"'')d.A.A-CC A4CVCVBXAY Z#((*+-C .221b!Q?/BVBVD//_C O --//4CYCYdhdqdq/r
 !<<(9?L]^+55a;CCJPQSbdop==)9:  %{{+DaHMM

D$7$7$-- 

 #[[Zq)rSV%))_c4JA*N)rtuv !&,FA N  %yyLefSUYY)3/3==a@fhi 
  %||,@CWYkBlm +/*R*R!#7Gh+
'
  46UU*6.M.U.UVWYZ\]_`bc.d+.M.P.PQeQkQk.l+#7:Y#Y $ 
 '&
'	 	 &"'')d.A.A-CC A4CVCVBXAY Z#((*+-C "1!5!5aB1!EHZ!Z]]22$"8"84== 3 
 $ll #57K7U7UVWYZ7["\
 2;;AqA199*djjRacno"mm,?@ ii!13F GKPPQ[]_alm)..z4;N;NP_acdmm++K4<<RVR_R_+`O-?OOI *s gs   Y(#.Y-c                    |j                   \  }}}}|j                  ||||      }||j                   d d \  }}	t        j                  d|j                   d   dz         j	                  d      j	                  d      j                  ||	d      j                  |j                        }
|
|j	                  d      j                  ||	d      z
  }
t        | j                  | j                  |
d      }| j                  |      }|j                  |j                   d d | j                  | j                  fz         }|j                  dddd      }|j                  |j                   d d dz         }|j                  d| j                  d      }|j                  d|j                   d         }|j                         }|j                  d|j!                  d            }t        j"                  |d|      }|j                  |||d      }|S )	Nr%   r   rJ   r   Fr
   )rJ   r   index)r   r   r   arangerK   rL   r  r#   rH   r?   r   r  r   r  r   r   rM   gather)ri   r{   r   rN   rQ   r   r   r   r   r/   rA   rel_pos_embeddingsr  s                r    r  z=ProphetNetNgramSelfAttention.get_main_relative_pos_embeddingsV  s    8D7I7I4
NGW#((^WgV)1*7*=*=bq*A'JQ 2 22 6 :;11
OQ7L''(  "4l6L6LQ6O6V6VWacrtu6v!v-E  $"<"<>PRW.*
 "99-H/44$$Ra(D,<,<d>Q>Q+RR
 0771aC/778J8J2A8NQV8VW)G)N)NqRVReRegh)i&)G)L)L.44R8*
& *H)L)L)N&/77<N<S<STV<WX',||4FAUs't$'C'H'HUcelnp'q$++r!   c                 (   |j                   dd \  }}||j                   d   }|d   d   |dz
  k(  sJ d       t        j                  d|      j                  d      j                  d      j	                  ||d      j                  |j                        }||j                  d      j	                  ||d      z
  }t        | j                  | j                  |d      }|j                  dd      }| j                  |      }	|	j                  |j                   d d | j                  | j                  fz         }	|	j                  ddddd      }	|	j                  d| j                        }	|j                  d      }|j	                  | j                   d| j                  d      }|j                  d|j#                  d            j%                         }t        j&                  |	d|	      }
|
j                  || j                   | j                  |d      }
|
S )
Nr   r%   rJ   r   zb`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)Fr  r
   r,  )r   r   r.  rK   rL   r  r#   rH   r?   r   r   r  r   r   r  r   r0   rM   r   r/  )ri   r{   r   rN   rR   r   r/   key_sequence_lengthrA   r0  r(  s              r    r
  z@ProphetNetNgramSelfAttention.get_predict_relative_pos_embeddings  s+    '4&9&9!A&>#
O,4"."4"4R"8?1%)<q)@@ t@ Q 3411
OQ7L''(  "4l6L6LQ6O6V6VWacrtu6v!v0H  $"<"<>PRW1-
 &//15!99-H 044$(8(8$:M:M'NN
 0771aAF/77D<L<LM,M,W,WXY,Z),M,T,TJJ4..-
) -N,R,R166r:-

$& 	* +0,,A-N+
'
 +J*N*N

D$7$7"+
' /.r!   NNNNNNN)rl   rm   rn   r   r   r   r  r   r   r   r   r  r
  r   r   s   @r    r   r     sZ     /  :q 37(,'+*.pP !v/pPd+,Z9/r!   r   c                   8     e Zd ZdZdef fdZ	 ddefdZ xZS )ProphetNetEncoderLayerz&
    Encoder block for Prophetnet
    r   c                     t         |           t        ||j                        | _        t        |j                        | _        t        ||j                        | _
        t        |j                        | _        y r   )r   r   r   num_encoder_attention_heads	self_attnr	   r   self_attn_layer_normr   encoder_ffn_dimfeed_forwardfeed_forward_layer_normr   s     r    r   zProphetNetEncoderLayer.__init__  s_    ,VV5W5WX$-f.@.@$A! 2&&:P:PQ'01C1C'D$r!   r   c                     | j                  ||||      \  }}}| j                  ||z         }| j                  |      }| j                  ||z         }|f}	|r|	|fz  }	|	S )N)r{   r   r   r   )r8  r9  r;  r<  )
ri   r{   r   r   r   attention_outputr   _feed_forward_outputoutputss
             r    r   zProphetNetEncoderLayer.forward  s     -1NN')+/	 -; -
), 112B]2RS #//>445H=5XY "&Gr!   F	rl   rm   rn   ro   r   r   r   r   r   r   s   @r    r5  r5    s+    E/ E #(
  r!   r5  c                   R     e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 	 	 	 ddedefdZ xZS )ProphetNetDecoderLayerz&
    Decoder block for Prophetnet
    r   c                 b   t         |           t        |      | _        t	        |j
                        | _        |j                  r5t        ||j                        | _
        t	        |j
                        | _        t        ||j                        | _        t	        |j
                        | _        y r   )r   r   r   r8  r	   r   r9  add_cross_attentionr   r   
cross_attncross_attn_layer_normr   decoder_ffn_dimr;  r<  r   s     r    r   zProphetNetDecoderLayer.__init__  s    5f=$-f.@.@$A! %%1&&:\:\]DO)263E3E)FD& 2&&:P:PQ'01C1C'D$r!   	use_cacher   c           
      r   ||d d nd }| j                  |||||||	|
      \  }}}}| j                  ||z         }||dd  nd }d }|4| j                  ||||||      \  }}}| j                  ||z         }||z   }| j	                  |      }| j                  ||z         }|f}|r||||fz  }|r||fz  }|S )Nr%   )r{   r   r   r   r  rQ   rR   rN   )r{   r   r   r   r   r   )r8  r9  rH  rI  r;  r<  )ri   r{   r   r`   encoder_attn_maskr   cross_attn_layer_head_maskr  rQ   rR   rN   r   rK  r   self_attn_past_key_valuengram_attention_outputself_attn_weightsself_attn_weights_ngrampresent_key_valuecross_attn_past_key_valuecross_attn_weightsr>  cross_attn_present_key_valuer@  rA  s                            r    r   zProphetNetDecoderLayer.forward  sK   $ :H9S>"1#5Y] `d`n`n'3)+,K+I.O% ao 	a
] 13JL] 11-BX2XY <J;UN23$7[_!! ,QUQ`Q`+!60 :8"3 Ra RN02N !667G-7WXM !24P P #//>445H=5XY ")+BDVWWG)++Gr!   )NNNNNNNNNNTFrC  r   s   @r    rE  rE    sV    E/ E$ "#'(,'+*."'= =  =r!   rE  z=
    The standalone encoder part of the ProphetNetModel.
    )custom_introc                       e Zd Zddedej
                  f fdZd Zd Ze		 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     d	e
ej                     d
e
e   de
e   de
e   deeef   fd       Z xZS )ProphetNetEncoderr   word_embeddingsc                    t         |   |       ||n5t        j                  |j                  |j
                  |j                        | _        t        |      | _	        t        |j
                        | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _        | j%                          yc c}w a7  
        word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
            The word embedding parameters. This can be used to initialize [`ProphetNetEncoder`] with pre-defined word
            embeddings instead of randomly initialized word embeddings.
        Nr   F)r   r   r   r   
vocab_sizer   r   r[  r   position_embeddingsr	   embeddings_layer_norm
ModuleListr+   num_encoder_layersr5  layersgradient_checkpointing	post_initri   r   r[  r?  r   s       r    r   zProphetNetEncoder.__init__D  s     	  * f//1C1CQWQdQde 	
 $B&#I %.v/A/A%B"mmUSYSlSlMm$n%;F%C$no&+#	 %os    Cc                     | j                   S r   r[  rh   s    r    get_input_embeddingsz&ProphetNetEncoder.get_input_embeddingsZ      ###r!   c                     || _         y r   ri  ri   values     r    set_input_embeddingsz&ProphetNetEncoder.set_input_embeddings]  
    $r!   r   r   	head_maskinputs_embedsr   output_hidden_statesreturn_dictr   c                 D   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      ||t	        d      ||| j                  |      }||d|ddddddf   j                  d| j                   j                  dd      z
  t        j                  | j                        j                  z  }|j                  |j                        }nd}| j                  |j                  dd |j                        \  }	}
||	z   }| j!                  |      }t"        j$                  j'                  || j                   j&                  | j(                        }|rdnd}|rdnd}|[|j+                         d	   t-        | j.                        k(  s2J d
t-        | j.                         d|j+                         d	    d       t1        | j.                        D ]p  \  }}|r||fz   }| j2                  r3| j(                  r'| j5                  |j6                  |||||   nd|      }n ||||||   nd|      }|d	   }|sh||d   fz   }r |r||fz   }|st9        d |||fD              S t;        |||      S )a	  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetEncoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```Nz3Either input_ids or inputs_embeds has to be passed.z2Make sure to only pass input_ids or inputs_embeds.      ?r   r%   r   rs   r   z&The head_mask should be specified for  layers, but it is for .)r   r   r   c              3   &   K   | ]	  }||  y wr   rs   .0vs     r    	<genexpr>z,ProphetNetEncoder.forward.<locals>.<genexpr>  s     lq^_^kl   )rv   r{   r}   )r   r   rs  use_return_dictr   r[  rL   r7  r   r'   r   r(   r  r`  r   r#   ra  r   r   r   r   rM   lenrd  	enumeratere  _gradient_checkpointing_func__call__tupler   )ri   r   r   rq  rr  r   rs  rt  extended_attention_maskr`  rN   r{   r`   all_attentionsidxencoder_layerlayer_outputss                    r    r   zProphetNetEncoder.forward`  s   4 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]!6RSS"}'@QRR"}'< 00;M %nQdA%56==aAhAhjkmnooDJJ'++',# '>&@&@ATAT&U#&*#,0,D,D]EXEXY[Z[E\^k^r^r,s)\%(;;22=A--mt{{?R?R]a]j]j-k&:0d  >>#A&3t{{+;< 8T[[9I8JJabkbpbpbrstbuavvwx< #,DKK"8 	FC#(=@P(P%**t}} $ A A!**!+'0'<Ys^$%! !.!#:7@7LYs^RV&7	! *!,M !/=3C2E!E/	F2  $9]<L$L!l]4I>$Zlll+;P]k
 	
r!   r   r3  )rl   rm   rn   r   r   r   r   rj  ro  r   r   r   r   r   r   r   r   r   r   r   s   @r    rZ  rZ  >  s    / ",, ,$%  -115,004,0/3&*]
ELL)]
 !.]
 ELL)	]

  -]
 $D>]
 'tn]
 d^]
 
uo%	&]
 ]
r!   rZ  z=
    The standalone decoder part of the ProphetNetModel.
    c                       e Zd Zddedeej                     f fdZd Zd Z	e
	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     d	eej                     d
eej                     deej                     deeeej                           deej                     dee   dee   dee   dee   deeef   fd       Zd Zd Zd Z xZS )ProphetNetDecoderr   r[  c                    t         |   |       |j                  | _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        ||n5t        j                  |j                  |j                  |j                        | _        t        |      | _        t        j                  | j                  |j                  d      | _        t        j"                  t%        |j&                        D cg c]  }t)        |       c}      | _        t-        |j                        | _        d| _        | j3                          yc c}w r]  )r   r   r0   r?   r   r   r   max_target_positionsr   r   r_  r   r   r[  r   r`  ngram_embeddingsrb  r+   num_decoder_layersrE  rd  r	   ra  re  rf  rg  s       r    r   zProphetNetDecoder.__init__  s    	 \\
!--%+%A%A"~~$*$B$B! * f//1C1CQWQdQde 	
 $B&#I  "TZZ9K9KT RmmUSYSlSlMm$n%;F%C$no%.v/A/A%B"&+# %os   Ec                     | j                   S r   ri  rh   s    r    rj  z&ProphetNetDecoder.get_input_embeddings  rk  r!   c                     || _         y r   ri  rm  s     r    ro  z&ProphetNetDecoder.set_input_embeddings  rp  r!   r   r   r`   encoder_attention_maskrq  cross_attn_head_maskrY   rr  rK  r   rs  rt  r   c                 
   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||t        d      ||t        d      ||| j                  |      }|j                  dd \  }}| j                  ||f|j                  |      \  }}|d\  }}n| j                  |      \  }}| j                  j                  |dz         }||z   }| j                  j                  }|\|j                  d      dk(  sJ d       t        | j                         D cg c]  }||dz
     |z   j#                  |dd        }}d}d}nOt        | j                         D cg c]  }||dz
     |z    }}| j%                  ||      }| j'                  ||      }||d	|ddddddf   j#                  d| j                   j(                  dd      z
  t+        j,                  | j.                        j0                  z  }|j3                  |j.                        }nd}t+        j4                  |g|z   d      }| j6                  r| j7                  |      }t8        j:                  j=                  || j<                  | j>                  
      }|rdnd}|r| j                   j                   dkD  rdnd}|
rdnd}|
rdnd}|
r| j                   j@                  rdnd}| jB                  r%| j>                  r|	rtD        jG                  d       d}	|	rdnd} tI        ||gddg      D ]f  \  }!}"|!	|!j                         d   tK        | jL                        k(  r3J d|" dtK        | jL                         d|j                         d    d        tO        | jL                        D ]  \  }#}$|r7||ddd|f   fz  }| j                   j                   dkD  r||dd|df   fz  }|||#   nd}%| jB                  rC| j>                  r7| jQ                  |$jR                  |||||||#   nd|||#   nd||||d|	|
      }&n# |$|||||||#   nd|||#   nd|||||%|	|
      }&|&d   }|	r| |&|
rdnd   fz  } |
s||&d   fz  }||&d   fz  }| j                   j@                  s||&d   fz  } |r7||ddd|f   fz  }| j                   j                   dkD  r||dd|df   fz  }|ddd|f   }'| j                   j                   dkD  r|dd|df   nd}(|stU        d |'|(| |||||fD              S tW        |'|(| |||||      S c c}w c c}w )aY  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetDecoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetDecoder.from_pretrained("microsoft/prophetnet-large-uncased", add_cross_attention=False)
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```NzGEither `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.zFMake sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.r%   )r#   rY   )NNr   zOAt the moment `use_cache` is only supported for `decoder_input_ids` of length 1rv  r   rs   r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Frq  r  zThe `z` should be specified for rw  rx  )r   r`   rN  r   rO  r  rQ   rR   rN   r   rK  r   r  r
   c              3   $   K   | ]  }|| 
 y wr   rs   rz  s     r    r}  z,ProphetNetDecoder.forward.<locals>.<genexpr>  s       = s   )rv   rw   rY   r{   r|   r}   r~   r^   ),r   rK  r   rs  r  r   r[  r   r`  r#   !compute_buffered_relative_bucketsr   r  r   rM   r+   r0   rL   prepare_attention_maskprepare_predict_attention_maskr   r   r'   r   r(   r  r.   ra  r   r   r   r   rG  re  loggerwarning_oncezipr  rd  r  r  r  r  rz   ))ri   r   r   r`   r  rq  r  rY   rr  rK  r   rs  rt  r   r/   main_stream_pos_embedrN   rQ   rR   predicting_stream_pos_embedr{   r  r0   ngram_hidden_statesr  r  extended_encoder_attention_maskall_main_stream_hidden_statesall_ngram_stream_hidden_statesall_main_stream_attnsall_ngram_stream_attnsall_cross_attnspresent_key_values	attn_mask	mask_namer  decoder_layerr   r  rv   rw   s)                                            r    r   zProphetNetDecoder.forward  s   H "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]!6fgg"}'@eff"}'< 00;M&3&9&9"1&=#
O.2.F.F) ''+ /G /
+| &PZM*,M
 66|D.1&*&>&>&G&GWXHX&Y# &(==0077 & %%a(A- a- #4::.# "%!),/JJRRS]_`bcd# # '+#.2+ Z__c_i_iYj#PU!%!),/JJ# # '+&A&A-Q_&`#.2.Q.QR_ao.p+ "-,QdA-=>EEaIpIprsuvwwDJJ'++/,+ /N.P.PQ^QdQd.e+.2+		=/4G"GK%% 66}EM--mt||VZVcVc-d /C%/CHYHY\]H]cg&&7T'8d 1dkk6U6U"[_&&4==##p "	#,R$ %(4H(IKYoKp$q 	 Iy$ ~~'*s4;;/?@ I;&@T[[AQ@R S!(+,A/@	 #,DKK"8 7	;C#--CSOCS@S2T1VV-;;$$q(2}QHXEX7Y6[[25D5P_S1VZN**t}} $ A A!**!+)3'0'<Ys^$2F2R)#.X\325 %!" !.!#:*?&E7@7LYs^RV5I5U,S1[_4S3Q6W!-#1'&7!$ *!,M"}:KQQR'S&UU" %-*:)<<%&=+;*==&;;22#a(8'::Oo7	;r  )mA?O?O<O.P-RR){{  1$.=ODTAT3U2WW. *!-=o-=*=>HLHYHY\]H]-?3C0C"Dcg  &+&12)*#	   ,/$;.7 >,3,	
 		
M##s   #U2U7c           	         |j                   \  }}t        j                  d| j                        j	                  |j
                        j                  dd      }t        | j                  | j                  |      \  }}|d d d |d |f   j                  |dd      }t        j                  |d d d |d |f   |d d d || j                  | j                  |z   f   gd      j                  |dd      }||fS r   )r   r   r.  r  r  r#   rL   rS   r?   r   r.   )ri   rN   r   r/   main_relative_bucketspredict_relative_bucketss         r    r  z3ProphetNetDecoder.compute_buffered_relative_buckets  s!   &2&8&8#
O||At'@'@ADD\EXEXY``abdef:]d88,;
77
 !6a9I/9IK[OK[6[ \ c cdnpqst u#(99(,<_,<>N>N)NO('')B)BTE^E^apEp)pp $
 &Q
" 	! %&>>>r!   c                 L   |j                   d d \  }}t        j                  ||ft        j                  |j                        j
                  |j                  |j                        }t        j                  |d      }|d |d |f   d d d d d d f   j                  || j                  j                  f|j                   z         }|@d|d d d d d d f   z
  t        j                  | j                        j
                  z  }||z   }n|}|j                  |j                        S )Nr%   r   r   rv  )r   r   fullr'   r   r(   r#   triuexpandr   r   r  )ri   r{   r   r   
seq_lengthcausal_maskextended_causal_maskr  s           r    r  z(ProphetNetDecoder.prepare_attention_mask  s%   !.!4!4Ra!8
J jj$KK++,00%% ''	
 jja0*;J;+CDT4QRTUEUV]]@@AKDUDUU 

 %'*^AtT1<L-M'MQVQ\Q\]a]g]gQhQlQl&l#&:=T&T#&:#&))-*=*=>>r!   c           	      &   |j                   d d \  }}t        | j                  | j                  |j                  |j
                        }t        j                  |d d d |d |f   |d d d || j                  | j                  |z   f   gd      }|d d d d d d d d f   j                  || j                  j                  f|j                   z         }|d|d d d d d d d f   z
  t        j                  | j
                        j                  z  }|j                  || j                  j                  | j                  ||f      }t        j                  |t        j                  |      gd      }||z   }n|}|j                  |j
                        S )Nr%   rJ   r   rv  )r   r4   r  r0   r#   r   r   r.   r  r   r   r'   r(   r7   r  )	ri   r{   r   r   r  predict_causal_maskextended_predict_causal_maskr  r  s	            r    r  z0ProphetNetDecoder.prepare_predict_attention_mask  s   !.!4!4Ra!8
J 3%%tzz=3G3GI\I\
 $ii#A{
{KZK$?@#{
{D$=$=@Y@Y\f@f$ff 
 (;4q!Q;N'O'V'V@@ADWD]D]](
$
 %'*^AtT4QR<R-S'SW\WbWbcgcmcmWnWrWr&r#&=&D&DT[[DDdjjR\^hi'# ',ii(%*:*:;R*STZ\'# /KMd.d+.J+.11-2E2EFFr!   r   )NNNNNNNNNNNN)rl   rm   rn   r   r   r   r   r   rj  ro  r   r   r   r   r   r   rz   r   r  r  r  r   r   s   @r    r  r    sq   / (2<<BX :$%  -1158<9=,07;@D04$(,0/3&*\
ELL)\
 !.\
  (5	\

 !) 6\
 ELL)\
 'u||4\
 "%ell(;"<=\
  -\
 D>\
 $D>\
 'tn\
 d^\
 
u22	3\
 \
|?,?0!Gr!   r  c            $           e Zd ZddgZdef fdZd Zd Zd Zd Z	d	 Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
eej                     deej                     deej                     deej                     deej                     deej                     deej                     dee   deeeej                           deej                     deej                     dee   dee   dee   dee   deeef   f d       Z xZS )ProphetNetModelencoder.word_embeddings.weightdecoder.word_embeddings.weightr   c                    t         |   |       t        j                  |j                  |j
                  |j                        | _        t        j                  |      }d|_
        d|_        t        || j                        | _        t        j                  |      }d|_        d|_
        t        || j                        | _        | j#                          y )Nr^  FT)r   r   r   r   r_  r   r   r[  copydeepcopyis_encoder_decoderrK  rZ  encoder
is_decoderr  decoderrf  )ri   r   encoder_configdecoder_configr   s       r    r   zProphetNetModel.__init__  s     !||F,=,=v?Q?Q_e_r_rsv.,1)#( (9M9MNv.$(!,1)(9M9MN 	r!   c                     | j                   S r   ri  rh   s    r    rj  z$ProphetNetModel.get_input_embeddings0  rk  r!   c                 ~    || _         | j                   | j                  _         | j                   | j                  _         y r   )r[  r  r  rm  s     r    ro  z$ProphetNetModel.set_input_embeddings3  s.    $'+';';$'+';';$r!   c                     | j                   j                  ra| j                  | j                  j                  | j                         | j                  | j
                  j                  | j                         y y r   )r   tie_word_embeddings_tie_or_clone_weightsr  r[  r  rh   s    r    _tie_weightszProphetNetModel._tie_weights8  sT    ;;**&&t||'C'CTEYEYZ&&t||'C'CTEYEYZ +r!   c                     | j                   S r   )r  rh   s    r    get_encoderzProphetNetModel.get_encoder=      ||r!   c                     | j                   S r   r  rh   s    r    get_decoderzProphetNetModel.get_decoder@  r  r!   r   r   decoder_input_idsdecoder_attention_maskrq  decoder_head_maskr  encoder_outputsrY   rr  decoder_inputs_embedsrK  r   rs  rt  r   c                 \   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|| j                  ||||
|||      }| j                  |||d   ||||	|||||      }|s||z   S t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                        S )a7  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetModel

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetModel.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
        >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
        ```)r   r   rq  rr  r   rs  rt  r   )r   r   r`   r  rq  r  rY   rr  r   rs  rK  rt  )rv   rw   rY   rZ   r[   r\   r]   r^   r_   r`   ra   )r   rK  r   rs  r  r  r  ru   rv   rw   rY   r{   r|   r}   r~   r^   )ri   r   r   r  r  rq  r  r  r  rY   rr  r  rK  r   rs  rt  decoder_outputss                    r    r   zProphetNetModel.forwardC  sT   r "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]""ll#-#+"3%9' + O ,,'1"1!"4#1'!5+//!5# ' 
 "_44+-??$3$K$K+;;"1"?"?(7(K(K.99%4%E%E,==&5&G&G"1"?"?.99
 	
r!   )NNNNNNNNNNNNNNN)rl   rm   rn   _tied_weights_keysr   r   rj  ro  r  r  r  r   r   r   r   
BoolTensorr   r   r   ru   r   r   r   s   @r    r  r    s   :<\]/ "$<
[
  -11548=A,0487;+/@D048<$(,0/3&*!h
ELL)h
 !.h
 $ELL1	h

 !))9)9 :h
 ELL)h
 $ELL1h
 'u||4h
 "%h
 "%ell(;"<=h
  -h
  (5h
 D>h
 $D>h
 'tnh
  d^!h
" 
u22	3#h
 h
r!   r  zh
    The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.
    c            &       f    e Zd Zg dZdef fdZd Zd Zd Zd Z	e
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     deej                     deej                     deej                     deej                     deej                     deeeej                           deej                     deej                     deej                     dee   dee   dee   dee   deeef   f"d       Zd dZdej                  fdZed        Zd Zd Z xZS )!"ProphetNetForConditionalGeneration)r  r  lm_head.weightr   c                 
   t         |   |       t        |      | _        |j                  | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y )NFr   )r   r   r  r   r   r   disable_ngram_lossr   r   r   r_  lm_headrf  r   s     r    r   z+ProphetNetForConditionalGeneration.__init__  sd     )&1!.."(";";yy!3!3V5F5FUS 	r!   c                     | j                   S r   r  rh   s    r    get_output_embeddingsz8ProphetNetForConditionalGeneration.get_output_embeddings  r  r!   c                     || _         y r   r  ri   new_embeddingss     r    set_output_embeddingsz8ProphetNetForConditionalGeneration.set_output_embeddings  	    %r!   c                     | j                   j                  r1| j                  | j                  j                  | j
                         y y r   )r   r  r  r   r[  r  rh   s    r    r  z/ProphetNetForConditionalGeneration._tie_weights  s2    ;;**&&t'F'FU +r!   c                 .    | j                   j                  S r   )r   r[  rh   s    r    rj  z7ProphetNetForConditionalGeneration.get_input_embeddings  s    ...r!   r   r   r  r  rq  r  r  r  rY   rr  r  labelsrK  r   rs  rt  r   c                 `   ||n| j                   j                  }|||| j                  |      }| j                  |||||||||	|
|||||      }||j                  n|j                  dd \  }}|d   j                  || j                   j                  |d      }| j                  |      }|dddf   }| j                   j                  dkD  r|ddddf   nd}|j                         s|j                         }d}|| j                  ||      }|s*t        d ||fD              }||f|z   |dd z   S ||dd z   S t        ||||j                  |j                  |j                  |j                   |j"                  |j$                  |j&                  |j(                  |j*                        S )	a	  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> logits_next_token = outputs.logits  # logits to predict next token as usual
        >>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
        ```N)r   r   r  r  rq  r  r  r  rY   rr  r  rK  r   rs  rt  r%   r   rJ   r   c              3   &   K   | ]	  }||  y wr   rs   rz  s     r    r}  z=ProphetNetForConditionalGeneration.forward.<locals>.<genexpr>7       RQAMqRr~  )rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   )r   r  r   r   r   r   r0   r  is_contiguousr   _compute_lossr  rU   rY   rZ   r[   r\   r]   r^   r_   r`   ra   )ri   r   r   r  r  rq  r  r  r  rY   rr  r  r  rK  r   rs  rt  rA  r   r/   predicting_streamspredict_logitsrW   rX   rV   
all_logitss                             r    r   z*ProphetNetForConditionalGeneration.forward  s   | &1%<k$++B]B]"3";@U@] $ 1 1& 9//)/#9/!5++'"7/!5# " 
$ (9'D##J_JeJefhghJi 	$
O %QZ__Z9J9JO]_`&891%040A0AA0E~ae,4 ##%&&(F%%nf=DR6<*@RRJ9=9ID7Z''!"+5gz\cdedf\gOgg,) ' 7 7&-&C&C,3,O,O#*#=#=)0)I)I!(!9!9*1*K*K&-&C&C#*#=#= r!   c                    |j                  | j                  j                  |j                  d      |j                  d            j	                  |      }t        | j                  j                        D ]!  }|dkD  r| j                  r n|||d d d d f<   # |j                  dd      j                         }t        j                  j                  |j                  d|j                  d            dt        j                        }t        j                  j                  ||j                  d      d      }| j                  j                   dkD  r|j#                  dd	       }|j%                  |      j                  d      }	||	   }|j'                         }| j                  j                   |j                  d      z  }
d
| j                  j                   z
  |z  |
|z  z   }|S Nr   r   rJ   r   r   )	reductionr   T)r   keepdimrv  r   r   r0   rM   fill_r+   r  r   r   r   r   log_softmaxr   r   r   nll_lossepssumner   ri   rW   r  ignore_indexexpend_targetsilprobsrV   smooth_lossnon_masked_tokenseps_is              r    r  z0ProphetNetForConditionalGeneration._compute_lossI     ))$++*;*;V[[^V[[YZ^\bbcopt{{(() 	-A1u00&,N1a7#	-
 !!!Q'224**KKFKKO,-- + 
 }}%%fn.A.A".EQW%X;;??S !::"d:;;K . 1 1, ? D DR H%&78K%**,KKKOOfkk"o5E$++//)T1EK4GGDr!   c                 $    | j                  |      S r   )r   )ri   r  s     r    %prepare_decoder_input_ids_from_labelszHProphetNetForConditionalGeneration.prepare_decoder_input_ids_from_labelse  s      ((r!   c                 \    d}| D ]#  }|t        fd|d d D              |dd  z   fz  }% |S )Nrs   c              3   t   K   | ]/  }|j                  d j                  |j                               1 ywr   Nindex_selectr  r#   r{  
past_statebeam_idxs     r    r}  zDProphetNetForConditionalGeneration._reorder_cache.<locals>.<genexpr>o  s.     rU_j--aZ=N=N1OPr   58r%   r  rY   r  reordered_past
layer_pasts    `  r    _reorder_cachez1ProphetNetForConditionalGeneration._reorder_cacheh  sV     ) 	JrcmnpopcqrrQR.! N	 r!   c                 .    | j                   j                  S r   )r   r  rh   s    r    r  z.ProphetNetForConditionalGeneration.get_encodert      &&&r!   c                 .    | j                   j                  S r   r   r  rh   s    r    r  z.ProphetNetForConditionalGeneration.get_decoderw  r  r!   )NNNNNNNNNNNNNNNNr   )rl   rm   rn   r  r   r   r  r  r  rj  r   r   r   r   r  r   r   r   rU   r   r  r  staticmethodr  r  r  r   r   s   @r    r  r    s    p	/ 	&V/  -11548=A,0487;26@D048<)-$(,0/3&*#wELL)w !.w $ELL1	w
 !))9)9 :w ELL)w $ELL1w 'u||4w "%,,/w "%ell(;"<=w  -w  (5w &w D>w $D>w  'tn!w" d^#w$ 
u//	0%w wr8)ELL )  ''r!   r  zt
    The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal
    c                        e Zd Zg dZdef fdZd Zd Zd Zd Z	d Z
d	 Zd
 Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                      deej                      deej                      deej                      deej                      deej                      deeeej                            deej                      deej                      dee   dee   dee   dee   deeef   fd       ZddZ	 	 	 	 ddZed        Z xZS ) ProphetNetForCausalLM)z!prophetnet.word_embeddings.weightz)prophetnet.decoder.word_embeddings.weightr  r   c                 P   t        j                  |      }d|_        d|_        t        |   |       t        |      | _        |j                  | _	        |j                  | _
        t        j                  |j                  |j                  d      | _        | j!                          y )NTFr  )r  r  r  r  r   r   ProphetNetDecoderWrapperr   r   r   r  r   r   r   r_  r  rf  r   s     r    r   zProphetNetForCausalLM.__init__  s    v& $)! 26:!.."(";";yy!3!3V5F5FUS 	r!   c                 B    | j                   j                  j                  S r   r   r  r[  rh   s    r    rj  z*ProphetNetForCausalLM.get_input_embeddings  s    &&666r!   c                 :    || j                   j                  _        y r   r!  rm  s     r    ro  z*ProphetNetForCausalLM.set_input_embeddings  s    27/r!   c                     | j                   S r   r  rh   s    r    r  z+ProphetNetForCausalLM.get_output_embeddings  r  r!   c                     || _         y r   r  r  s     r    r  z+ProphetNetForCausalLM.set_output_embeddings  r  r!   c                     | j                   j                  r;| j                  | j                  j                  j
                  | j                         y y r   )r   r  r  r   r  r[  r  rh   s    r    r  z"ProphetNetForCausalLM._tie_weights  s;    ;;**&&t'>'>'N'NPTP\P\] +r!   c                 &    || j                   _        y r   r  )ri   r  s     r    set_decoderz!ProphetNetForCausalLM.set_decoder  s    ")r!   c                 .    | j                   j                  S r   r  rh   s    r    r  z!ProphetNetForCausalLM.get_decoder  r  r!   r   r   r`   r  rq  r  rY   rr  r  rK  r   rs  rt  r   c                    ||n| j                   j                  }| j                  j                  |||||||||
|||      }||j                  n|j                  dd \  }}|d   j                  || j                   j                  |d      }| j                  |      }|dddf   }| j                   j                  dkD  r|ddddf   nd}d}|	| j                  ||	      }|s*t        d ||fD              }||f|z   |dd z   S ||dd z   S t        ||||j                  |j                  |j                  |j                  |j                  |j                   	      S )	a	  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForCausalLM
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForCausalLM.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits

        >>> # Model can also be used with EncoderDecoder framework
        >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
        >>> import torch

        >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
        >>> tokenizer_dec = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        ...     "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased"
        ... )

        >>> ARTICLE = (
        ...     "the us state department said wednesday it had received no "
        ...     "formal word from bolivia that it was expelling the us ambassador there "
        ...     "but said the charges made against him are `` baseless ."
        ... )
        >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
        >>> labels = tokenizer_dec(
        ...     "us rejects charges against its ambassador in bolivia", return_tensors="pt"
        ... ).input_ids
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])

        >>> loss = outputs.loss
        ```N)r   r   r`   r  rq  r  rY   rr  rK  r   rs  rt  r%   r   rJ   r   c              3   &   K   | ]	  }||  y wr   rs   rz  s     r    r}  z0ProphetNetForCausalLM.forward.<locals>.<genexpr>  r  r~  )	rV   rW   rX   rY   r{   r|   r}   r~   r^   )r   r  r   r  r   r   r0   r  r  r  r   rY   r{   r|   r}   r~   r^   )ri   r   r   r`   r  rq  r  rY   rr  r  rK  r   rs  rt  rA  r   r/   r  r  rW   rX   rV   r  s                          r    r   zProphetNetForCausalLM.forward  s   B &1%<k$++B]B] //)))"7#9!5+'/!5# * 
 :C9NiooTaTgTghjijTk#
O$QZ__Z9J9JO]_`&891%040A0AA0E~ae,4%%nf=DR6<*@RRJ9=9ID7Z''!"+5gz\cdedf\gOgg,) ' 7 7%33$+$?$?"--!(!9!9!(!9!9
 
r!   c                    |j                  | j                  j                  |j                  d      |j                  d            j	                  |      }t        | j                  j                        D ]!  }|dkD  r| j                  r n|||d d d d f<   # |j                  dd      j                         }t        j                  j                  |j                  d|j                  d            dt        j                        }t        j                  j                  ||j                  d      d      }| j                  j                   dkD  r|j#                  dd	       }|j%                  |      j                  d      }	||	   }|j'                         }| j                  j                   |j                  d      z  }
d
| j                  j                   z
  |z  |
|z  z   }|S r  r  r  s              r    r  z#ProphetNetForCausalLM._compute_loss  r  r!   c                 f    ||j                  |j                        }|r|d d dd f   }|||||dS )NrJ   )r   r   rq  rY   rK  )new_onesr   )ri   r   rY   r   rq  rK  kwargss          r    prepare_inputs_for_generationz3ProphetNetForCausalLM.prepare_inputs_for_generation8  sL     !&//	@N!!RS&)I #,"."
 	
r!   c                 J    d}| D ]  }|t        fd|D              fz  } |S )Nrs   c              3   t   K   | ]/  }|j                  d j                  |j                               1 ywr
  r  r  s     r    r}  z7ProphetNetForCausalLM._reorder_cache.<locals>.<genexpr>X  s.     nU_j--aZ=N=N1OPnr  r  r  s    `  r    r  z$ProphetNetForCausalLM._reorder_cacheR  s?     ) 	Jncmnn N	 r!   )NNNNNNNNNNNNNr  )NNNN)rl   rm   rn   r  r   r   rj  ro  r  r  r  r'  r  r   r   r   r   r   r   r   r   r   r  r/  r  r  r   r   s   @r    r  r  {  s   /  78&^*'  -1158<9=,07;@D04)-$(,0/3&*lELL)l !.l  (5	l
 !) 6l ELL)l 'u||4l "%ell(;"<=l  -l &l D>l $D>l 'tnl d^l 
u//	0l l\> 
4  r!   r  c                   4     e Zd ZdZdef fdZd Zd Z xZS )r  z
    This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from pretrained prophetnet
    classes.
    r   c                     t         |   |       t        j                  |j                  |j
                  |j                        | _        t        || j                        | _	        | j                          y )Nr^  ri  )r   r   r   r   r_  r   r   r[  r  r  rf  r   s     r    r   z!ProphetNetDecoderWrapper.__init__c  sX     !||F,=,=v?Q?Q_e_r_rs(AUAUV 	r!   c                 l    | j                  | j                  | j                  j                                y r   )r  r[  r  rj  rh   s    r    r  z%ProphetNetDecoderWrapper._tie_weightsl  s%    ""4#7#79Z9Z9\]r!   c                 &     | j                   |i |S r   r  )ri   argsr.  s      r    r   z ProphetNetDecoderWrapper.forwardo  s    t||T,V,,r!   )	rl   rm   rn   ro   r   r   r  r   r   r   s   @r    r  r  ]  s     
/ ^-r!   r  )r  rZ  r  r  r  r   rB  )9ro   r  r<   re   dataclassesr   typingr   r   r   r   torch.utils.checkpointr   r   torch.nnr	   activationsr   
generationr   modeling_outputsr   modeling_utilsr   utilsr   r   r   configuration_prophetnetr   
get_loggerrl   r  r   r4   rH   rS   rU   ru   rz   r   r   r   r   Moduler   r   r   r5  rE  rZ  r  r  r  r  r  __all__rs   r!   r    <module>rD     sP   Y    ! ) )     ! ) / - 9 9 6 
		H	%Q7" 6M. Q% Q% Q%h R%; R% R%j 8@; 8@ 8@v :@ :@ :@z #! #! #!L(-R\\ (-V~B")) ~BBBII .|/299 |/~	(RYY (VQRYY Qh 
{
1 {

{
| 
RG1 RG
RGj
 P
/ P
 P
f 
D')BO D'
D'N 
Z5 Z
Zz-8 -,r!   