Ë
    ´ãUhœ ã                   ó’  — d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	 ddl
Z
ddlZ
ddl
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ  ej<                  e«      Z d;d„Z!d„ Z"d;d„Z#d„ Z$e G d„ de«      «       Z%e G d„ de«      «       Z&e G d„ de«      «       Z'e G d„ de«      «       Z(e G d„ de«      «       Z) G d„ dejT                  «      Z+ G d„ d ejX                  «      Z- G d!„ d"ejX                  «      Z. G d#„ d$ejX                  «      Z/ G d%„ d&ejX                  «      Z0 G d'„ d(ejX                  «      Z1 ed)¬*«       G d+„ d,e)«      «       Z2 ed-¬*«       G d.„ d/e)«      «       Z3e G d0„ d1e)«      «       Z4 ed2¬*«       G d3„ d4e)e«      «       Z5 ed5¬*«       G d6„ d7e)e«      «       Z6 G d8„ d9e)«      Z7g d:¢Z8y)<zRPyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version).é    N)Ú	dataclass)ÚOptionalÚTupleÚUnion)ÚTensorÚnn)Ú	LayerNormé   )ÚACT2FN)ÚGenerationMixin)ÚBaseModelOutput)ÚPreTrainedModel)ÚModelOutputÚauto_docstringÚloggingé   )ÚProphetNetConfigc                 óÄ   — |r/t         j                  j                  | j                  «       |¬«      S t         j                  j                  | |t        j
                  ¬«      S )N©Údim©r   Údtype)r   Ú
functionalÚsoftmaxÚfloatÚtorchÚfloat32)Úhidden_stater   Ú
onnx_traces      ú„/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/prophetnet/modeling_prophetnet.pyr   r   '   sH   € ÙÜ}‰}×$Ñ$ \×%7Ñ%7Ó%9¸sÐ$ÓCÐCä}‰}×$Ñ$ \°sÄ%Ç-Á-Ð$ÓPÐPó    c                 óz  — t        j                  || | f||¬«      t        j                  |«      j                  z  }|j	                  «       j                  «       }t        |«      D ]0  }||   j                  dd¬«       ||   j                  | dz   «       Œ2 d|dd…dd…df<   t        j                  ||gd¬«      S )	z@
    This function computes the bias for the predict stream
    )Údevicer   r   F)Úwrapr   Né   r   )
r   ÚonesÚfinfoÚminÚdetachÚcloneÚrangeÚfill_diagonal_Útriu_Úcat)Úsequence_lengthÚngramr#   r   Ú
left_blockÚright_blockÚ
stream_idxs          r    Úngram_attention_biasr4   .   s¾   € ô
 	
‰
E˜?¨OÐ<ÀVÐSXÔYÔ\a×\gÑ\gÐhmÓ\n×\rÑ\rÑrð ð ×#Ñ#Ó%×+Ñ+Ó-€Kä˜E“lò 6ˆ
ØJÑ×.Ñ.¨q°uÐ.Ô=Ø:Ñ×$Ñ$ j [°1¡_Õ5ð6ð €JŠq’!QˆwÑÜ9‰9j +Ð.°AÔ6Ð6r!   c                 ó¦  — | }d}|rX| dz  } |t        j                  |t        j                  |«      «      j                  «       | z  z   }t        j                  |«      }n)t        j
                  |t        j                  |«      «      }| dz  }t        j                  ||«      }|t        j                  |j                  «       |z  «      t        j                  ||z  «      z  | |z
  z  z   }t        j                  |t        j                  |«      | dz
  z  «      j                  «       }|t        j                  ||j                  «       |«      z   }|S )zo
    This function computes individual parts of the relative position buckets. For more detail, see paper.
    r   r%   r   )r   ÚltÚ
zeros_likeÚintÚabsÚmaxÚlogr   Úmathr(   Ú	ones_likeÚwhere)	Únum_bucketsÚmax_distanceÚrelative_positionsÚis_bidirectionalÚinv_relative_positionsÚrel_positions_bucketÚ	max_exactÚis_smallÚval_if_larges	            r    Úcompute_relative_bucketsrH   ?   sG  € ð 1Ð0ÐØÐáØ! QÑ&ˆà Üh‰hÐ-¬u×/?Ñ/?Ð@VÓ/WÓX×\Ñ\Ó^ÐalÑlñmð 	ô "'§¡Ð+AÓ!BÑä!&§¡Ð+AÄ5×CSÑCSÐTjÓCkÓ!lÐà˜qÑ €IÜx‰xÐ.°	Ó:€HØœuŸy™yÐ)?×)EÑ)EÓ)GÈ)Ñ)SÓTÔW[×W_ÑW_ØyÑ óXñ  à	yÑ	 ñ "ñ "€Lô —9‘9˜\¬5¯?©?¸<Ó+HÈKÐZ[ÉOÑ+\Ó]×aÑaÓc€LØ/´%·+±+¸hÐH^×HbÑHbÓHdÐfrÓ2sÑsÐØÐr!   c                 ó’  — |j                  d«      j                  d|j                  d«      d«      }||j                  d«      z
  }t        j                  |dz
  |fd¬«      j                  d«      }|j                  d|j                  d«      d«      }||j                  d«      z
  }t        | ||d¬«      }t        | ||d¬«      }||fS )zm
    This function computes both main and predict relative position buckets. For more detail, see paper.
    r   éÿÿÿÿr   F)rB   )Ú	unsqueezeÚrepeatÚsizer   r.   rH   )r?   r@   Úposition_idsÚmain_stream_relative_positionsÚ$predicting_stream_relative_positionsÚmain_relative_position_bucketsÚ!predict_relative_position_bucketss          r    Ú#compute_all_stream_relative_bucketsrS   Z   sí   € ð
 &2×%;Ñ%;¸AÓ%>×%EÑ%EÀaÈ×IZÑIZÐ[]ÓI^Ð`aÓ%bÐ"Ø%CÀl×F\ÑF\Ð]_ÓF`Ñ%`Ð"ô ,1¯9©9°lÀQÑ6FÈÐ5UÐ[]Ô+^×+hÑ+hÐijÓ+kÐ(Ø+O×+VÑ+VÐWXÐZf×ZkÑZkÐlnÓZoÐqrÓ+sÐ(Ø+OÐR^×RhÑRhÐikÓRlÑ+lÐ(ô &>Ø\Ð#AÐTYô&Ð"ô )AØ\Ð#GÐZ_ô)Ð%ð *Ð+LÐLÐLr!   c                   ó2  — e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed	<   dZeeej                        ed
<   dZeeej                        ed<   dZeej                     ed<   dZeeej                        ed<   dZeeej                        ed<   ed„ «       Zy)ÚProphetNetSeq2SeqLMOutputaþ  
    Base class for sequence-to-sequence language models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the main stream language modeling head (scores for each vocabulary token before
            SoftMax).
        logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
            SoftMax).
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, encoder_sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, encoder_sequence_length)`. Attentions weights of the encoder, after the attention
            softmax, used to compute the weighted average in the self-attention heads.
    NÚlossÚlogitsÚlogits_ngramÚpast_key_valuesÚdecoder_hidden_statesÚdecoder_ngram_hidden_statesÚdecoder_attentionsÚdecoder_ngram_attentionsÚcross_attentionsÚencoder_last_hidden_stateÚencoder_hidden_statesÚencoder_attentionsc                 óN   — t        j                  dt        «       | j                  S ©Nzi`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions` instead.©ÚwarningsÚwarnÚFutureWarningr^   ©Úselfs    r    Údecoder_cross_attentionsz2ProphetNetSeq2SeqLMOutput.decoder_cross_attentions¼   ó$   € ä‰ðäô	
ð
 ×$Ñ$Ð$r!   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__rV   r   r   ÚFloatTensorÚ__annotations__rW   rX   rY   r   rZ   r[   r\   r]   r^   r_   r`   ra   Úpropertyrj   © r!   r    rU   rU   q   sH  … ñ:ðx )-€Dˆ(5×$Ñ$Ñ
%Ó,Ø*.€FˆHU×&Ñ&Ñ'Ó.Ø04€L(˜5×,Ñ,Ñ-Ó4Ø:>€OX˜e E×$5Ñ$5Ñ6Ñ7Ó>Ø@DÐ˜8 E¨%×*;Ñ*;Ñ$<Ñ=ÓDØFJÐ ¨%°×0AÑ0AÑ*BÑ!CÓJØ=AÐ˜  u×'8Ñ'8Ñ!9Ñ:ÓAØCGÐ˜h u¨U×->Ñ->Ñ'?Ñ@ÓGØ;?Ðh˜u U×%6Ñ%6Ñ7Ñ8Ó?Ø=AÐ˜x¨×(9Ñ(9Ñ:ÓAØ@DÐ˜8 E¨%×*;Ñ*;Ñ$<Ñ=ÓDØ=AÐ˜  u×'8Ñ'8Ñ!9Ñ:ÓAàñ%ó ñ%r!   rU   c                   ó   — e Zd ZU dZej
                  ed<   dZeej
                     ed<   dZ	ee
ej
                        ed<   dZee
ej
                        ed<   dZee
ej
                        ed<   dZee
ej
                        ed<   dZee
ej
                        ed	<   dZee
ej
                        ed
<   dZeej
                     ed<   dZee
ej
                        ed<   dZee
ej
                        ed<   ed„ «       Zy)ÚProphetNetSeq2SeqModelOutputa2  
    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
    decoding.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
            Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size,ngram * decoder_sequence_length, config.vocab_size)`, *optional*):
            Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, encoder_sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, encoder_sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    Úlast_hidden_stateNÚlast_hidden_state_ngramrY   rZ   r[   r\   r]   r^   r_   r`   ra   c                 óN   — t        j                  dt        «       | j                  S rc   rd   rh   s    r    rj   z5ProphetNetSeq2SeqModelOutput.decoder_cross_attentions  rk   r!   )rl   rm   rn   ro   r   rp   rq   rw   r   rY   r   rZ   r[   r\   r]   r^   r_   r`   ra   rr   rj   rs   r!   r    ru   ru   Æ   s+  … ñ<ð| ×(Ñ(Ó(Ø;?Ð˜X e×&7Ñ&7Ñ8Ó?Ø:>€OX˜e E×$5Ñ$5Ñ6Ñ7Ó>Ø@DÐ˜8 E¨%×*;Ñ*;Ñ$<Ñ=ÓDØFJÐ ¨%°×0AÑ0AÑ*BÑ!CÓJØ=AÐ˜  u×'8Ñ'8Ñ!9Ñ:ÓAØCGÐ˜h u¨U×->Ñ->Ñ'?Ñ@ÓGØ;?Ðh˜u U×%6Ñ%6Ñ7Ñ8Ó?Ø=AÐ˜x¨×(9Ñ(9Ñ:ÓAØ@DÐ˜8 E¨%×*;Ñ*;Ñ$<Ñ=ÓDØ=AÐ˜  u×'8Ñ'8Ñ!9Ñ:ÓAàñ%ó ñ%r!   ru   c                   ól  — e Zd ZU dZej
                  ed<   dZeej
                     ed<   dZ	ee
ej
                        ed<   dZee
ej
                        ed<   dZee
ej
                        ed<   dZee
ej
                        ed<   dZee
ej
                        ed	<   dZee
ej
                        ed
<   y)ÚProphetNetDecoderModelOutputaZ  
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
            Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
            Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the
    rv   Nrw   rY   Úhidden_statesÚhidden_states_ngramÚ
attentionsÚngram_attentionsr^   )rl   rm   rn   ro   r   rp   rq   rw   r   rY   r   r{   r|   r}   r~   r^   rs   r!   r    rz   rz     sË   … ñ.ð` ×(Ñ(Ó(Ø;?Ð˜X e×&7Ñ&7Ñ8Ó?Ø:>€OX˜e E×$5Ñ$5Ñ6Ñ7Ó>Ø8<€M8˜E %×"3Ñ"3Ñ4Ñ5Ó<Ø>BÐ˜ %¨×(9Ñ(9Ñ":Ñ;ÓBØ59€J˜˜u×0Ñ0Ñ1Ñ2Ó9Ø;?Ðh˜u U×%6Ñ%6Ñ7Ñ8Ó?Ø;?Ðh˜u U×%6Ñ%6Ñ7Ñ8Ô?r!   rz   c                   óž  — e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed	<   dZeeej                        ed
<   dZeeej                        ed<   y)ÚProphetNetDecoderLMOutputam  
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the main stream language modeling head (scores for each vocabulary token before
            SoftMax).
        logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
            SoftMax).
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the
    NrV   rW   rX   rY   r{   r|   r}   r~   r^   )rl   rm   rn   ro   rV   r   r   rp   rq   rW   rX   rY   r   r{   r|   r}   r~   r^   rs   r!   r    r€   r€   X  sè   … ñ/ðb )-€Dˆ(5×$Ñ$Ñ
%Ó,Ø*.€FˆHU×&Ñ&Ñ'Ó.Ø04€L(˜5×,Ñ,Ñ-Ó4Ø:>€OX˜e E×$5Ñ$5Ñ6Ñ7Ó>Ø8<€M8˜E %×"3Ñ"3Ñ4Ñ5Ó<Ø>BÐ˜ %¨×(9Ñ(9Ñ":Ñ;ÓBØ59€J˜˜u×0Ñ0Ñ1Ñ2Ó9Ø;?Ðh˜u U×%6Ñ%6Ñ7Ñ8Ó?Ø;?Ðh˜u U×%6Ñ%6Ñ7Ñ8Ô?r!   r€   c                   ó$   — e Zd ZeZdZdZd„ Zd„ Zy)ÚProphetNetPreTrainedModelÚ
prophetnetTc                 ó:  — t        |t        j                  «      rm|j                  j                  j                  d| j                  j                  ¬«       |j                  %|j                  j                  j                  «        y y t        |t        j                  «      rz|j                  j                  j                  d| j                  j                  ¬«       |j                  2|j                  j                  |j                     j                  «        y y y )Nç        )ÚmeanÚstd)Ú
isinstancer   ÚLinearÚweightÚdataÚnormal_ÚconfigÚinit_stdÚbiasÚzero_Ú	EmbeddingÚpadding_idx)ri   Úmodules     r    Ú_init_weightsz'ProphetNetPreTrainedModel._init_weightsœ  sÈ   € ÜfœbŸi™iÔ(ØM‰M×Ñ×&Ñ&¨C°T·[±[×5IÑ5IÐ&ÔJØ{‰{Ð&Ø—‘× Ñ ×&Ñ&Õ(ð 'ä˜¤§¡Ô-ØM‰M×Ñ×&Ñ&¨C°T·[±[×5IÑ5IÐ&ÔJØ×!Ñ!Ð-Ø—‘×"Ñ" 6×#5Ñ#5Ñ6×<Ñ<Õ>ð .ð .r!   c                 ó‚  — | j                   j                  }| j                   j                  }|€J d«       ‚|j                  |j                  «      }|dd d…f   j                  «       |ddd …f<   ||d<   |€J d«       ‚|j                  |dk(  |«       t        j                  |dk\  «      j                  «       sJ d	«       ‚|S )
Nz™self.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the pad_token_id. See ProphetNet docs for more information.rJ   r   ).r   z1self.model.config.pad_token_id has to be defined.éœÿÿÿr   z8Verify that `shifted_input_ids` has only positive values)
r   Údecoder_start_token_idÚpad_token_idÚ	new_zerosÚshaper*   Úmasked_fill_r   ÚallÚitem)ri   Ú	input_idsr—   r˜   Úshifted_input_idss        r    Ú_shift_rightz&ProphetNetPreTrainedModel._shift_right¦  sØ   € Ø!%§¡×!CÑ!CÐØ—{‘{×/Ñ/ˆà%Ð1ð 	
ðFó	
Ð1ð &×/Ñ/°	·±Ó@ÐØ%.¨s°C°R°C¨xÑ%8×%>Ñ%>Ó%@Ð˜#˜q™r˜'Ñ"Ø$:Ð˜&Ñ!àÐ'Ð\Ð)\Ó\Ð'à×&Ñ&Ð'8¸DÑ'@À,ÔOäy‰yÐ*¨aÑ/Ó0×5Ñ5Ô7ÐsÐ9sÓsÐ7à Ð r!   N)	rl   rm   rn   r   Úconfig_classÚbase_model_prefixÚsupports_gradient_checkpointingr”   r    rs   r!   r    r‚   r‚   –  s   „ à#€LØ$ÐØ&*Ð#ò?ó!r!   r‚   c                   óB   ‡ — e Zd ZdZdeddfˆ fd„Zdˆ fd„	Zˆ fd„Zˆ xZS )	ÚProphetNetPositionalEmbeddingsa  
    This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
    based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
    the forward function.
    r   ÚreturnNc                 ó†   •— |j                   | _        t        ‰|   |j                   |j                  |j
                  «       y ©N)Úmax_position_embeddingsÚ
max_lengthÚsuperÚ__init__Úhidden_sizer˜   ©ri   r   Ú	__class__s     €r    r¬   z'ProphetNetPositionalEmbeddings.__init__Ä  s3   ø€ Ø ×8Ñ8ˆŒÜ‰Ñ˜×7Ñ7¸×9KÑ9KÈV×M`ÑM`Õar!   c                 ó(  •— || j                   J d«       ‚|€ê|]|d   d   j                  d   }|d   |z   }t        j                  dt        j                  |¬«      t        | j                   |z   «      z  }n‹|€&t        j                  |t        j                  |¬«      }t        j                  |d¬«      j                  |«      |z  j	                  «       | j                   z   }|j                  d| j                  dz
  «      }t        ‰| -  |«      |fS )NzCIf position_ids is pre-computed then padding_idx should not be set.r   r%   r   )r   r   ©r   r#   r   )r’   rš   r   r&   Úlongr8   ÚcumsumÚtype_asÚclamprª   r«   Úforward)	ri   Úinputs_shaper#   Úattention_maskrY   rN   Úprev_num_input_idsÚnum_input_idsr¯   s	           €r    r¶   z&ProphetNetPositionalEmbeddings.forwardÈ  s  ø€ ØÐ$¨$×*:Ñ*:Ð*Bð 	
ØQó	
ÐCð ÐØÐ*ð &5°QÑ%7¸Ñ%:×%@Ñ%@ÀÑ%CÐ"Ø ,¨Q¡Ð2DÑ DÜ$Ÿz™z¨&¼¿
¹
È6ÔRÜ˜×(Ñ(¨=Ñ8Ó9ñ ‘ð "Ð)Ü%*§Z¡Z°ÄEÇJÁJÐW]Ô%^Nô —L‘L °QÔ7×?Ñ?ÀÓOÐR`Ñ`ß‘$“&˜4×+Ñ+ñ ,ð
  ,×1Ñ1°!°T·_±_ÀqÑ5HÓIä‰w‰˜|Ó,¨lÐ:Ð:r!   c                 ó"   •— t         ‰|   |«      S r¨   )r«   r¶   )ri   rN   r¯   s     €r    Ú_forwardz'ProphetNetPositionalEmbeddings._forwardä  s   ø€ Ü‰w‰˜|Ó,Ð,r!   )NNN)	rl   rm   rn   ro   r   r¬   r¶   r¼   Ú__classcell__©r¯   s   @r    r¥   r¥   ½  s.   ø„ ñðbÐ/ð b°Dõ bõ;÷8-ð -r!   r¥   c                   ó®   ‡ — e Zd ZdZdedefˆ fd„Zdej                  dedefd„Z		 	 	 	 	 dd	e
e   d
e
e   de
e   de
ee      dedeee
e   f   fd„Zˆ xZS )ÚProphetNetAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   Únum_attn_headsc                 óª  •— t         ‰|   «        |j                  }|j                  | _        |j                  | _        || _        ||z  | _        | j                  |z  |k(  sJ d«       ‚t        j                  ||«      | _	        t        j                  ||«      | _
        t        j                  ||«      | _        t        j                  ||«      | _        y )Nzw`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and `config.num_decoder_attention_heads`)r«   r¬   r­   Úattention_dropoutÚdropoutrÁ   Úhead_dimr   r‰   Úkey_projÚ
value_projÚ
query_projÚout_proj)ri   r   rÁ   r­   r¯   s       €r    r¬   zProphetNetAttention.__init__ë  sµ   ø€ ô
 	‰ÑÔØ×(Ñ(ˆà!'×!9Ñ!9ˆÔØ—~‘~ˆŒØ,ˆÔØ# ~Ñ5ˆŒà}‰}˜~Ñ-°Ò<ð 	
ð4ó	
Ð<ô
 Ÿ	™	 +¨{Ó;ˆŒÜŸ)™) K°Ó=ˆŒÜŸ)™) K°Ó=ˆŒäŸ	™	 +¨{Ó;ˆr!   ÚtensorÚseq_lenÚbszc                 óŽ   — |j                  ||| j                  | j                  «      j                  dd«      j	                  «       S ©Nr   r%   ©ÚviewrÁ   rÅ   Ú	transposeÚ
contiguous)ri   rÊ   rË   rÌ   s       r    Ú_shapezProphetNetAttention._shape  s9   € Ø{‰{˜3 ¨×)<Ñ)<¸d¿m¹mÓL×VÑVÐWXÐZ[Ó\×gÑgÓiÐir!   Úkey_value_statesr¸   Úlayer_head_maskÚpast_key_valueÚoutput_attentionsr¦   c                 ó  — |j                  «       \  }}}	|d u}
t        |j                  «       «      |||	gk(  sJ d|||	f› d|j                  «       › «       ‚| j                  |«      | j                  dz  z  }|
r||d   }|d   }n‹|
rE| j	                  | j                  |«      d|«      }| j	                  | j                  |«      d|«      }nD| j	                  | j                  |«      d|«      }| j	                  | j                  |«      d|«      }|
r||f}|| j                  d| j                  f} | j	                  |||«      j                  |Ž } |j                  |Ž } |j                  |Ž }|j                  d«      }t        j                  d||j                  dd	«      «      }|| j                  ||f}|j                  «       |k7  rt        d
|› d|j                  «       › «      ‚||j                  «       dk(  rd }|| j                  d|f}|2|j                  «       |k7  rt        d|› d|j                  «       › «      ‚|||z   }|r|}nd }t        j                  j!                  |d¬«      }|Ž|j                  «       | j                  fk(  s&J d| j                  f› d|j                  «       › «       ‚|j                  dddd«      |j                  || j                  ||«      z  }|j                  dddd«      |z  }t        j                  j#                  || j$                  | j&                  ¬«      }t        j                  d||«      }|| j                  || j                  f}|j                  «       |k7  rt        d|› d|j                  «       › «      ‚|j                  dd«      j)                  |||	«      }| j+                  |«      }t        j                  j#                  || j"                  | j&                  ¬«      }|||fS )Nz Size of hidden states should be ú	, but is ç      à?r   r   rJ   r%   zbsij,bsjk->bsikr
   z#Attention weights should have size z Attention mask should have size r   ú/Head mask for a single layer should be of size ©ÚpÚtrainingz `attn_output` should have shape ú, but is of shape )rM   ÚlistrÈ   rÅ   rÓ   rÆ   rÇ   rÁ   rÐ   r   ÚeinsumrÑ   Ú
ValueErrorr   r   r   r   rÄ   rÃ   rÞ   ÚreshaperÉ   )ri   r{   rÔ   r¸   rÕ   rÖ   r×   Ú
batch_sizeÚtgt_lenr­   Úis_cross_attentionÚquery_statesÚ
key_statesÚvalue_statesÚ
proj_shapeÚsrc_lenÚattn_weightsÚexpected_shapeÚattn_weights_reshapedÚ
attn_probsÚattn_outputs                        r    r¶   zProphetNetAttention.forward  sZ  € ð ,9×+=Ñ+=Ó+?Ñ(ˆ
G˜[ð .°TÐ9ÐÜM×&Ñ&Ó(Ó)ØØØð.
ò 
ð 	pð .¨j¸'À;Ð.NÐ-OÈyÐYf×YkÑYkÓYmÐXnÐoó		pð 
ð —‘ }Ó5¸¿¹ÈÑ9KÑLˆá .Ð"<à'¨Ñ*ˆJØ)¨!Ñ,‰LÙàŸ™ T§]¡]Ð3CÓ%DÀbÈ*ÓUˆJØŸ;™; t§¡Ð7GÓ'HÈ"ÈjÓY‰Lð Ÿ™ T§]¡]°=Ó%AÀ2ÀzÓRˆJØŸ;™; t§¡°}Ó'EÀrÈ:ÓVˆLáð
 )¨,Ð7ˆNð ! $×"5Ñ"5°r¸4¿=¹=ÐIˆ
ØJt—{‘{ <°¸*ÓE×JÑJÈJÐWˆØ$Z—_‘_ jÐ1ˆ
Ø(|×(Ñ(¨*Ð5ˆØ—/‘/ !Ó$ˆÜ—|‘|Ð$5°|ÀZ×EYÑEYÐZ[Ð]^ÓE_Ó`ˆØ$ d×&9Ñ&9¸7ÀGÐLˆØ×ÑÓ .Ò0ÜÐBÀ>ÐBRÐR[Ð\h×\mÑ\mÓ\oÐ[pÐqÓrÐrð Ð%¨.×*<Ñ*<Ó*>À!Ò*CØ!ˆNà$ d×&9Ñ&9¸1¸gÐFˆØÐ%¨.×*=Ñ*=Ó*?À>Ò*QÜÐ?ÀÐ?OÈyÐYg×YlÑYlÓYnÐXoÐpÓqÐqØÐ%Ø'¨.Ñ8ˆLÙØ$0Ñ!à$(Ð!ä—}‘}×,Ñ,¨\¸rÐ,ÓBˆàÐ&Ø"×'Ñ'Ó)¨d×.AÑ.AÐ-CÒCð ØAÀ4×CVÑCVÐBXÐAYð ZØ#×(Ñ(Ó*Ð+ð-óÐCð +×/Ñ/°°2°q¸!Ó<¸|×?PÑ?PØ˜D×/Ñ/°¸'ó@ñ ˆLð
 %4×$8Ñ$8¸¸BÀÀ1Ó$EÐH]Ñ$]Ð!ä—]‘]×*Ñ*ØØ×$Ñ$Ø—]‘]ð +ó 
ˆ
ô
 —l‘lÐ#4°jÀ,ÓOˆØ$ d×&9Ñ&9¸7ÀDÇMÁMÐRˆØ×ÑÓ Ò/ÜÐ?ÀÐ?OÐOaÐbm×brÑbrÓbtÐauÐvÓwÐwà!×+Ñ+¨A¨qÓ1×9Ñ9¸*ÀgÈ{Ó[ˆØ—m‘m KÓ0ˆä—m‘m×+Ñ+¨K¸4¿<¹<ÐRV×R_ÑR_Ð+Ó`ˆØÐ1°>ÐAÐAr!   )NNNNF)rl   rm   rn   ro   r   r8   r¬   r   r   rÓ   r   r   Úboolr¶   r½   r¾   s   @r    rÀ   rÀ   è  sÆ   ø„ ÙGð<à ð<ð õ<ð0j˜UŸ\™\ð j°Cð j¸có jð .2Ø+/Ø,0Ø26Ø"'ñ`Bð # 6Ñ*ð`Bð ! Ñ(ð	`Bð
 " &Ñ)ð`Bð !  v¡Ñ/ð`Bð  ð`Bð 
ˆvx Ñ'Ð'Ñ	(÷`Br!   rÀ   c                   ó2   ‡ — e Zd ZdZdedefˆ fd„Zd„ Zˆ xZS )ÚProphetNetFeedForwardzm
    This is the residual two feed-forward layer block based on the original Transformer implementation.
    r   Úffn_dimc                 ó*  •— t         ‰|   «        t        |j                     | _        t        j                  |j                  |«      | _        t        j                  ||j                  «      | _	        |j                  | _
        |j                  | _        y r¨   )r«   r¬   r   Úactivation_functionÚactivation_fnr   r‰   r­   ÚintermediateÚoutputÚactivation_dropoutrÄ   )ri   r   rô   r¯   s      €r    r¬   zProphetNetFeedForward.__init__n  sk   ø€ Ü‰ÑÔÜ# F×$>Ñ$>Ñ?ˆÔÜŸI™I f×&8Ñ&8¸'ÓBˆÔÜ—i‘i ¨×);Ñ);Ó<ˆŒØ"(×";Ñ";ˆÔØ—~‘~ˆr!   c                 óD  — | j                  |«      }| j                  |«      }t        j                  j	                  || j
                  | j                  ¬«      }| j                  |«      }t        j                  j	                  || j                  | j                  ¬«      }|S )NrÜ   )rø   r÷   r   r   rÄ   rú   rÞ   rù   )ri   r{   s     r    r¶   zProphetNetFeedForward.forwardv  s„   € Ø×)Ñ)¨-Ó8ˆØ×*Ñ*¨=Ó9ˆäŸ™×-Ñ-¨m¸t×?VÑ?VÐae×anÑanÐ-ÓoˆØŸ™ MÓ2ˆÜŸ™×-Ñ-¨m¸t¿|¹|ÐVZ×VcÑVcÐ-ÓdˆØÐr!   )	rl   rm   rn   ro   r   r8   r¬   r¶   r½   r¾   s   @r    ró   ró   i  s!   ø„ ñð&Ð/ð &¸#õ &ör!   ró   c                   ód   ‡ — e Zd Zdefˆ fd„Zd„ Zd„ Z	 	 	 	 	 	 	 d	deee	      fd„Z
d„ Zd„ Zˆ xZS )
ÚProphetNetNgramSelfAttentionr   c                 ó¤  •— t         ‰|   «        |j                  | _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | j                  z  | _	        |j                  | _
        | j                  | j                  z  |j                  k(  sJ d«       ‚t        j                  |j                  |j                  «      | _        t        j                  |j                  |j                  «      | _        t        j                  |j                  |j                  «      | _        t        j                  |j                  |j                  «      | _        t        j                  |j                  | j                  | j                  z  «      | _        d| _        y )Nz6config.hidden_size must be divisible by num_attn_headsF)r«   r¬   r­   r?   Úrelative_max_distanceÚnum_decoder_attention_headsrÁ   rÄ   rÃ   rÅ   r0   r   r‰   rÆ   rÇ   rÈ   rÉ   Úrelative_pos_embeddingsr   r®   s     €r    r¬   z%ProphetNetNgramSelfAttention.__init__  sa  ø€ Ü‰ÑÔØ!×-Ñ-ˆÔà!×-Ñ-ˆÔØ%+×%AÑ%AˆÔ"Ø$×@Ñ@ˆÔØ—~‘~ˆŒØ!'×!9Ñ!9ˆÔØ×*Ñ*¨d×.AÑ.AÑAˆŒØ—\‘\ˆŒ
à}‰}˜t×2Ñ2Ñ2°f×6HÑ6HÒHð 	
ØDó	
ÐHô Ÿ	™	 &×"4Ñ"4°f×6HÑ6HÓIˆŒÜŸ)™) F×$6Ñ$6¸×8JÑ8JÓKˆŒÜŸ)™) F×$6Ñ$6¸×8JÑ8JÓKˆŒô Ÿ	™	 &×"4Ñ"4°f×6HÑ6HÓIˆŒô (*§y¡y°×1CÑ1CÀT×EUÑEUÐX\×XkÑXkÑEkÓ'lˆÔ$ð  ˆr!   c                 óŽ   — |j                  ||| j                  | j                  «      j                  dd«      j	                  «       S rÎ   rÏ   )ri   rÊ   rË   rä   s       r    rÓ   z#ProphetNetNgramSelfAttention._shapež  s9   € Ø{‰{˜: w°×0CÑ0CÀTÇ]Á]ÓS×]Ñ]Ð^_ÐabÓc×nÑnÓpÐpr!   c                 ó   — d| _         y )NT)r   rh   s    r    Úprepare_for_onnx_export_z5ProphetNetNgramSelfAttention.prepare_for_onnx_export_¡  s	   € Øˆr!   rÖ   c	           	      óä  — |j                  «       \  }	}
}t        |j                  «       «      |	|
|gk(  sJ d|	|
|f› d|j                  › «       ‚| j                  |«      }| j	                  |«      }| j                  |«      }|| j                  dz  z  }| j                  ||
|	«      }| j                  |d|	«      }| j                  |d|	«      }|	| j                  d| j                  f} |j                  |Ž } |j                  |Ž } |j                  |Ž }|j                  d| j                  z   d¬«      }|j                  d| j                  z   d¬«      }|j                  d| j                  z   d¬«      }|j                  d| j                  z   d¬«      }|d   |dd  }}|d   |dd  }}|d   |dd  }}|d   |dd  }}|<|d   }t        j                  ||fd¬«      }|d   }t        j                  ||fd¬«      }||f}|
d| j                  z   z  }t        j                  d	||j                  dd
«      «      }| j!                  ||||«      } || z   }|||z   }t#        |d| j$                  ¬«      j'                  |«      }!|w|j                  «       | j                  fk(  s&J d| j                  f› d|j                  «       › «       ‚|j                  dddd«      |!j                  |	| j                  d|«      z  }!t(        j*                  j-                  |!| j.                  | j0                  ¬«      }!t        j                  d	|!|«      }"|"j                  dd«      j3                  |	d||«      }"| j5                  |"«      }"t        j6                  |d«      j                  |	| j                  | j                  || j                  «      }#t        j6                  |D $cg c]  }$t        j                  ||$gd«      ‘Œ c}$d«      }%t        j6                  |d¬«      }&t        j                  |D 'cg c])  }'t        j                  ||'gd«      j9                  d«      ‘Œ+ c}'d«      }(t        j                  d|#|%f«      })| j;                  |&|)||«      }*|)|*z   })|5|j=                  dddd
d«      }|j?                  |)j@                  «      }|)|z   })t#        |)d| j$                  ¬«      j'                  |)«      }+|\|j                  «       | j                  fk(  s&J d| j                  f› d|j                  «       › «       ‚|j                  ddddd«      |+z  }+t(        j*                  j-                  |+| j.                  | j0                  ¬«      }+t        j                  d|+|(j                  dd«      f«      },|,j                  dd
«      },|,j3                  |	| j                  ||«      },| j5                  |,«      },t        j                  |"|,gd«      j                  |	d|«      }-|!j                  |	| j                  |d«      }!t(        j*                  j-                  |-| j,                  | j0                  ¬«      }-|-|!|+|fS c c}$w c c}'w )Nz#`hidden_states` should be of shape rß   rÚ   rJ   r   r   r%   r   zbntc,bncs->bntsr
   )r   r   rÛ   rÙ   rÜ   zbnhtc,bnhsc->bnhtsé   zbnhts,bnhsc->bnhtc)!rM   rà   rš   rÈ   rÆ   rÇ   rÅ   rÓ   rÁ   rÐ   Úchunkr0   r   r.   rá   rÑ   Ú get_main_relative_pos_embeddingsr   r   r´   r   r   rÄ   rÃ   rÞ   rã   rÉ   ÚstackrK   Ú#get_predict_relative_pos_embeddingsÚpermuteÚtor   ).ri   r{   rÖ   r¸   rÕ   Úextended_predict_attention_maskrQ   rR   rN   rä   Úngram_sequence_lengthr­   rç   rè   ré   rê   Úhidden_states_listÚquery_states_listÚkey_states_listÚvalue_states_listÚmain_hidden_statesÚhidden_states_predict_listÚmain_query_statesÚpredict_query_states_listÚmain_key_statesÚpredict_key_states_listÚmain_value_statesÚpredict_value_states_listÚprev_main_key_statesÚprev_main_value_statesr/   Úmain_attn_weightsÚmain_relative_pos_embeddingsÚmain_attn_probsÚmain_attn_outputÚpredict_query_statesÚkeyÚpredict_key_statesÚpredict_hidden_statesÚv_pÚpredict_value_statesÚpredict_attn_weightsÚpredict_relative_pos_embeddingsÚpredict_attn_probsÚpredict_attn_outputrð   s.                                                 r    r¶   z$ProphetNetNgramSelfAttention.forward¤  s*  € ð :G×9KÑ9KÓ9MÑ6ˆ
Ð)¨;ÜM×&Ñ&Ó(Ó)¨jÐ:OÐQ\Ð-]Ò]ð 	
Ø1°*Ð>SÐU`Ð2`Ð1að bØ×#Ñ#Ð$ð&ó	
Ð]ð —‘ }Ó5ˆØ—]‘] =Ó1ˆ
Ø—‘ }Ó5ˆð $ t§}¡}°cÑ'9Ñ:ˆð —{‘{ <Ð1FÈ
ÓSˆØ—[‘[ ¨R°Ó<ˆ
Ø—{‘{ <°°ZÓ@ˆØ  $×"5Ñ"5°r¸4¿=¹=ÐIˆ
à(|×(Ñ(¨*Ð5ˆØ$Z—_‘_ jÐ1ˆ
Ø(|×(Ñ(¨*Ð5ˆð +×0Ñ0°°T·Z±Z±ÀQÐ0ÓGÐØ(×.Ñ.¨q°4·:±:©~À1Ð.ÓEÐØ$×*Ñ*¨1¨t¯z©z©>¸qÐ*ÓAˆØ(×.Ñ.¨q°4·:±:©~À1Ð.ÓEÐà9KÈAÑ9NÐPbÐcdÐceÐPfÐ6ÐØ7HÈÑ7KÐM^Ð_`Ð_aÐMbÐ4ÐØ3BÀ1Ñ3EÀÐWXÐWYÐGZÐ0ˆØ7HÈÑ7KÐM^Ð_`Ð_aÐMbÐ4Ðð Ð%Ø#1°!Ñ#4Ð Ü#Ÿi™iÐ)=¸Ð(OÐUVÔWˆOØ%3°AÑ%6Ð"Ü %§	¡	Ð+AÐCTÐ*UÐ[\Ô ]Ðð *Ð+<Ð=ˆð 0°A¸¿
¹
±NÑCˆô "ŸL™LÐ):Ð<MÈ×OhÑOhÐijÐlmÓOnÓoÐð (,×'LÑ'LØÐ 1°<ÐA_ó(
Ð$ð .Ð0LÑLÐàÐ%Ø 1°NÑ BÐä!ØØØ—‘ô
÷ ‰'Ð#Ó
$ð	 	ð Ð&Ø"×'Ñ'Ó)¨d×.AÑ.AÐ-CÒCð ØAÀ4×CVÑCVÐBXÐAYð ZØ#×(Ñ(Ó*Ð+ð-óÐCð .×2Ñ2°1°b¸!¸QÓ?À/×BVÑBVØ˜D×/Ñ/°°_óCñ ˆOô Ÿ-™-×/Ñ/°À4×CYÑCYÐdh×dqÑdqÐ/Órˆô
 !Ÿ<™<Ð(9¸?ÐL]Ó^Ðà+×5Ñ5°a¸Ó;×CÑCÀJÐPQÐSbÐdoÓpÐØŸ=™=Ð)9Ó:Ðô  %Ÿ{™{Ð+DÀaÓH×MÑMØ˜Ÿ
™
 D×$7Ñ$7¸È$Ï-É-ó 
Ðô
 #Ÿ[™[ÐZqÖ)rÐSV¬%¯)©)°_ÀcÐ4JÈAÕ*NÒ)rÐtuÓvÐô !&§¡Ð,FÈAÔ NÐô  %Ÿy™yØLeÖfÀSŒUY‰YÐ)¨3Ð/°Ó3×=Ñ=¸aÕ@ÒfÐhió 
Ðô  %Ÿ|™|Ð,@ÐCWÐYkÐBlÓmÐð +/×*RÑ*RØ!Ð#7¸ÐGhó+
Ð'ð
  4Ð6UÑUÐà*Ð6à.M×.UÑ.UÐVWÐYZÐ\]Ð_`ÐbcÓ.dÐ+Ø.M×.PÑ.PÐQe×QkÑQkÓ.lÐ+Ø#7Ð:YÑ#YÐ ä$Ø ØØ—‘ô
÷ ‰'Ð&Ó
'ð	 	ð Ð&Ø"×'Ñ'Ó)¨d×.AÑ.AÐ-CÒCð ØAÀ4×CVÑCVÐBXÐAYð ZØ#×(Ñ(Ó*Ð+ð-óÐCð "1×!5Ñ!5°a¸¸BÀÀ1Ó!EÐHZÑ!ZÐäŸ]™]×2Ñ2Ø $×"8Ñ"8À4Ç=Á=ð 3ó 
Ðô $Ÿl™lØ Ð#5Ð7K×7UÑ7UÐVWÐYZÓ7[Ð"\ó
Ðð 2×;Ñ;¸A¸qÓAÐØ1×9Ñ9¸*ÀdÇjÁjÐRaÐcnÓoÐØ"Ÿm™mÐ,?Ó@Ðô —i‘iÐ!1Ð3FÐ GÈÓK×PÑPÐQ[Ð]_ÐalÓmˆà)×.Ñ.¨z¸4×;NÑ;NÐP_ÐacÓdˆä—m‘m×+Ñ+¨K¸4¿<¹<ÐRV×R_ÑR_Ð+Ó`ˆà˜OÐ-?ÀÐOÐOùòI *sùò gs   ÏY(Ð#.Y-c                 ó  — |j                   \  }}}}|j                  ||||«      }|€Ç|j                   d d \  }}	t        j                  d|j                   d   dz   «      j	                  d«      j	                  d«      j                  ||	d«      j                  |j                  «      }
|
|j	                  d«      j                  ||	d«      z
  }
t        | j                  | j                  |
d«      }| j                  |«      }|j                  |j                   d d | j                  | j                  fz   «      }|j                  dddd«      }|j                  |j                   d d dz   «      }|j                  d| j                  d«      }|j                  d|j                   d   «      }|j                  «       }|j                  d|j!                  d«      «      }t        j"                  |d|¬«      }|j                  |||d«      }|S )	Nr%   r   rJ   r   Fr
   )rJ   ©r   Úindex)rš   rÐ   r   ÚarangerK   rL   r  r#   rH   r?   rÿ   r  rÁ   r  rã   r²   rM   Úgather)ri   r{   rì   rN   rQ   rä   rÁ   rå   rë   r/   rA   Úrel_pos_embeddingsr  s                r    r  z=ProphetNetNgramSelfAttention.get_main_relative_pos_embeddingsV  s  € ð 8D×7IÑ7IÑ4ˆ
N G¨WØ#×(Ñ(¨°^ÀWÈgÓVˆØ)Ð1Ø*7×*=Ñ*=¸b¸qÐ*AÑ'ˆJ˜ä—‘˜Q × 2Ñ 2°2Ñ 6¸Ñ :Ó;ß‘˜1“ß‘˜1“ß‘˜
 O°QÓ7ß‘L×'Ñ'Ó(ð ð "4°l×6LÑ6LÈQÓ6O×6VÑ6VÐWaÐcrÐtuÓ6vÑ!vÐÜ-EØ× Ñ  $×"<Ñ"<Ð>PÐRWó.Ð*ð
 "×9Ñ9¸-ÓHÐØ/×4Ñ4Ø×$Ñ$ R aÐ(¨D×,<Ñ,<¸d×>QÑ>QÐ+RÑRó
Ðð 0×7Ñ7¸¸1¸aÀÓCÐà/×7Ñ7¸×8JÑ8JÈ2ÈAÐ8NÐQVÑ8VÓWÐà)G×)NÑ)NÈqÐRV×ReÑReÐghÓ)iÐ&à)G×)LÑ)LØÐ.×4Ñ4°RÑ8ó*
Ð&ð *H×)LÑ)LÓ)NÐ&à/×7Ñ7¸Ð<N×<SÑ<SÐTVÓ<WÓXÐä',§|¡|Ð4FÈAÐUsÔ'tÐ$Ø'C×'HÑ'HÈÐUcÐelÐnpÓ'qÐ$Ø+Ð+r!   c                 ó(  — |j                   dd \  }}|€É|j                   d   }|d   d   |dz
  k(  sJ d«       ‚t        j                  d|«      j                  d«      j                  d«      j	                  ||d«      j                  |j                  «      }||j                  d«      j	                  ||d«      z
  }t        | j                  | j                  |d«      }|j                  dd«      }| j                  |«      }	|	j                  |j                   d d | j                  | j                  fz   «      }	|	j                  ddddd«      }	|	j                  d| j                  «      }	|j                  d«      }|j	                  | j                   d| j                  d«      }|j                  d|j#                  d«      «      j%                  «       }t        j&                  |	d|¬	«      }
|
j                  || j                   | j                  |d«      }
|
S )
Nr   r%   rJ   r   zb`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)Fr  r
   r,  )rš   r   r.  rK   rL   r  r#   rH   r?   rÿ   rÑ   r  rÐ   rÁ   r  rã   r0   rM   r²   r/  )ri   r{   rì   rN   rR   rä   r/   Úkey_sequence_lengthrA   r0  r(  s              r    r
  z@ProphetNetNgramSelfAttention.get_predict_relative_pos_embeddingsƒ  s+  € ð '4×&9Ñ&9¸!¸AÐ&>Ñ#ˆ
Oà,Ð4Ø".×"4Ñ"4°RÑ"8ÐØ ‘? 1Ñ%Ð)<¸qÑ)@Ò@ð ØtóÐ@ô —‘˜QÐ 3Ó4ß‘˜1“ß‘˜1“ß‘˜
 O°QÓ7ß‘L×'Ñ'Ó(ð ð "4°l×6LÑ6LÈQÓ6O×6VÑ6VÐWaÐcrÐtuÓ6vÑ!vÐÜ0HØ× Ñ  $×"<Ñ"<Ð>PÐRWó1Ð-ð
 &×/Ñ/°°1Ó5ˆØ!×9Ñ9¸-ÓHÐð 0×4Ñ4Ø×Ñ  Ð$¨×(8Ñ(8¸$×:MÑ:MÐ'NÑNó
Ðð 0×7Ñ7¸¸1¸aÀÀAÓFÐà/×7Ñ7¸¸D×<LÑ<LÓMÐà,M×,WÑ,WÐXYÓ,ZÐ)Ø,M×,TÑ,TØJ‰J˜˜4×.Ñ.°ó-
Ð)ð -N×,RÑ,RØÐ1×6Ñ6°rÓ:ó-
ç
‰$‹&ð 	*ô +0¯,©,Ø AÐ-Nô+
Ð'ð
 +J×*NÑ*NØ˜Ÿ
™
 D×$7Ñ$7¸È"ó+
Ð'ð /Ð.r!   ©NNNNNNN)rl   rm   rn   r   r¬   rÓ   r  r   r   r   r¶   r  r
  r½   r¾   s   @r    rý   rý   €  sZ   ø„ ð Ð/õ  ò:qòð 37ØØØ(,Ø'+Ø*.ØñpPð !  v¡Ñ/ópPòd+,öZ9/r!   rý   c                   ó8   ‡ — e Zd ZdZdefˆ fd„Z	 ddefd„Zˆ xZS )ÚProphetNetEncoderLayerz&
    Encoder block for Prophetnet
    r   c                 óö   •— t         ‰|   «        t        ||j                  «      | _        t        |j                  «      | _        t        ||j                  «      | _
        t        |j                  «      | _        y r¨   )r«   r¬   rÀ   Únum_encoder_attention_headsÚ	self_attnr	   r­   Úself_attn_layer_normró   Úencoder_ffn_dimÚfeed_forwardÚfeed_forward_layer_normr®   s     €r    r¬   zProphetNetEncoderLayer.__init__Ä  s_   ø€ Ü‰ÑÔä,¨V°V×5WÑ5WÓXˆŒÜ$-¨f×.@Ñ.@Ó$AˆÔ!ô 2°&¸&×:PÑ:PÓQˆÔÜ'0°×1CÑ1CÓ'DˆÕ$r!   r×   c                 óÀ   — | j                  ||||¬«      \  }}}| j                  ||z   «      }| j                  |«      }| j                  ||z   «      }|f}	|r|	|fz  }	|	S )N)r{   r¸   rÕ   r×   )r8  r9  r;  r<  )
ri   r{   r¸   rÕ   r×   Úattention_outputrì   Ú_Úfeed_forward_outputÚoutputss
             r    r¶   zProphetNetEncoderLayer.forwardÎ  sˆ   € ð -1¯N©NØ'Ø)Ø+Ø/ð	 -;ó -
Ñ)Ð˜,¨ð ×1Ñ1Ð2BÀ]Ñ2RÓSˆð #×/Ñ/°Ó>ÐØ×4Ñ4Ð5HÈ=Ñ5XÓYˆà Ð"ˆáØ˜Ñ&ˆGàˆr!   ©F©	rl   rm   rn   ro   r   r¬   rñ   r¶   r½   r¾   s   @r    r5  r5  ¿  s+   ø„ ñðEÐ/õ Eð #(ñð
  ÷r!   r5  c                   óR   ‡ — e Zd ZdZdefˆ fd„Z	 	 	 	 	 	 	 	 	 	 	 	 ddedefd„Zˆ xZS )ÚProphetNetDecoderLayerz&
    Decoder block for Prophetnet
    r   c                 ób  •— t         ‰|   «        t        |«      | _        t	        |j
                  «      | _        |j                  r5t        ||j                  «      | _
        t	        |j
                  «      | _        t        ||j                  «      | _        t	        |j
                  «      | _        y r¨   )r«   r¬   rý   r8  r	   r­   r9  Úadd_cross_attentionrÀ   r   Ú
cross_attnÚcross_attn_layer_normró   Údecoder_ffn_dimr;  r<  r®   s     €r    r¬   zProphetNetDecoderLayer.__init__ï  s‰   ø€ Ü‰ÑÔä5°fÓ=ˆŒÜ$-¨f×.@Ñ.@Ó$AˆÔ!ð ×%Ò%Ü1°&¸&×:\Ñ:\Ó]ˆDŒOÜ)2°6×3EÑ3EÓ)FˆDÔ&ô 2°&¸&×:PÑ:PÓQˆÔÜ'0°×1CÑ1CÓ'DˆÕ$r!   Ú	use_cacher×   c           
      ór  — ||d d nd }| j                  |||||||	|
¬«      \  }}}}| j                  ||z   «      }||dd  nd }d }|4| j                  ||||||¬«      \  }}}| j                  ||z   «      }||z   }| j	                  |«      }| j                  ||z   «      }|f}|r||||fz  }|r||fz  }|S )Nr%   )r{   rÖ   r¸   rÕ   r  rQ   rR   rN   éþÿÿÿ)r{   rÔ   r¸   rÕ   rÖ   r×   )r8  r9  rH  rI  r;  r<  )ri   r{   r¸   r`   Úencoder_attn_maskrÕ   Úcross_attn_layer_head_maskr  rQ   rR   rN   rÖ   rK  r×   Úself_attn_past_key_valueÚngram_attention_outputÚself_attn_weightsÚself_attn_weights_ngramÚpresent_key_valueÚcross_attn_past_key_valueÚcross_attn_weightsr>  Úcross_attn_present_key_valuer@  rA  s                            r    r¶   zProphetNetDecoderLayer.forwardþ  sK  € ð$ :HÐ9S >°"°1Ñ#5ÐY]Ð Ø`d×`nÑ`nØ'Ø3Ø)Ø+Ø,KØ+IØ.OØ%ð aoó 	a
Ñ]ÐÐ 1Ð3JÐL]ð ×1Ñ1°-ÐBXÑ2XÓYˆð <JÐ;U N°2°3Ñ$7Ð[_Ð!Ø!ÐØ Ð,àQU×Q`ÑQ`Ø+Ø!6Ø0Ø :Ø8Ø"3ð Raó RÑNÐÐ0Ð2Nð !×6Ñ6Ð7GÈ-Ñ7WÓXˆMð !2Ð4PÑ PÐð #×/Ñ/°Ó>ÐØ×4Ñ4Ð5HÈ=Ñ5XÓYˆà Ð"ˆáØÐ)Ð+BÐDVÐWÑWˆGáØÐ)Ð+Ñ+ˆGàˆr!   )NNNNNNNNNNTFrC  r¾   s   @r    rE  rE  ê  sV   ø„ ñðEÐ/õ Eð$ Ø"ØØØ#'Ø(,Ø'+Ø*.ØØØØ"'ñ=ð ð=ð  ÷=r!   rE  z=
    The standalone encoder part of the ProphetNetModel.
    )Úcustom_introc                   ó  ‡ — e Zd Zddedej
                  fˆ fd„Zd„ Zd„ Ze		 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     d	e
ej                     d
e
e   de
e   de
e   deeef   fd„«       Zˆ xZS )ÚProphetNetEncoderr   Úword_embeddingsc                 ó¶  •— t         ‰|   |«       ||n5t        j                  |j                  |j
                  |j                  ¬«      | _        t        |«      | _	        t        |j
                  «      | _        t        j                  t        |j                  «      D cg c]  }t        |«      ‘Œ c}«      | _        d| _        | j%                  «        yc c}w ©a7  
        word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
            The word embedding parameters. This can be used to initialize [`ProphetNetEncoder`] with pre-defined word
            embeddings instead of randomly initialized word embeddings.
        N©r’   F)r«   r¬   r   r‘   Ú
vocab_sizer­   r˜   r[  r¥   Úposition_embeddingsr	   Úembeddings_layer_normÚ
ModuleListr+   Únum_encoder_layersr5  ÚlayersÚgradient_checkpointingÚ	post_init©ri   r   r[  r?  r¯   s       €r    r¬   zProphetNetEncoder.__init__D  s²   ø€ ô 	‰Ñ˜Ô ð Ð*ñ ä—‘˜f×/Ñ/°×1CÑ1CÐQW×QdÑQdÔeð 	Ôô
 $BÀ&Ó#IˆÔ Ü%.¨v×/AÑ/AÓ%BˆÔ"ä—m‘mÌUÐSY×SlÑSlÓMmÖ$nÈÔ%;¸FÕ%CÒ$nÓoˆŒà&+ˆÔ#à‰Õùò	 %os   Â Cc                 ó   — | j                   S r¨   ©r[  rh   s    r    Úget_input_embeddingsz&ProphetNetEncoder.get_input_embeddingsZ  ó   € Ø×#Ñ#Ð#r!   c                 ó   — || _         y r¨   ri  ©ri   Úvalues     r    Úset_input_embeddingsz&ProphetNetEncoder.set_input_embeddings]  ó
   € Ø$ˆÕr!   rž   r¸   Ú	head_maskÚinputs_embedsr×   Úoutput_hidden_statesÚreturn_dictr¦   c                 óD  — ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|€|€t	        d«      ‚||t	        d«      ‚||€| j                  |«      }||d|dd…dddd…f   j                  d| j                   j                  dd«      z
  t        j                  | j                  «      j                  z  }|j                  |j                  «      }nd}| j                  |j                  dd |j                  «      \  }	}
||	z   }| j!                  |«      }t"        j$                  j'                  || j                   j&                  | j(                  ¬«      }|rdnd}|rdnd}|[|j+                  «       d	   t-        | j.                  «      k(  s2J d
t-        | j.                  «      › d|j+                  «       d	   › d«       ‚t1        | j.                  «      D ]p  \  }}|r||fz   }| j2                  r3| j(                  r'| j5                  |j6                  |||||   nd|«      }n ||||||   nd|¬«      }|d	   }|sŒh||d   fz   }Œr |r||fz   }|st9        d„ |||fD «       «      S t;        |||¬«      S )a	  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetEncoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```Nz3Either input_ids or inputs_embeds has to be passed.z2Make sure to only pass input_ids or inputs_embeds.ç      ð?r   r%   rÜ   rs   r   z&The head_mask should be specified for ú layers, but it is for ú.)r¸   rÕ   r×   c              3   ó&   K  — | ]	  }|€Œ|–— Œ y ­wr¨   rs   ©Ú.0Úvs     r    ú	<genexpr>z,ProphetNetEncoder.forward.<locals>.<genexpr>»  s   è ø€ Òl˜qÐ^_Ñ^kœÑlùó   ‚Š)rv   r{   r}   )r   r×   rs  Úuse_return_dictrâ   r[  rL   r7  r   r'   r   r(   r  r`  rš   r#   ra  r   r   rÄ   rÞ   rM   Úlenrd  Ú	enumeratere  Ú_gradient_checkpointing_funcÚ__call__Útupler   )ri   rž   r¸   rq  rr  r×   rs  rt  Úextended_attention_maskr`  rN   r{   r`   Úall_attentionsÚidxÚencoder_layerÚlayer_outputss                    r    r¶   zProphetNetEncoder.forward`  s÷  € ð4 2CÐ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ð$DÑ È$Ï+É+×JjÑJjð 	ð &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆàÐ Ð!6ÜÐRÓSÐSØÐ" }Ð'@ÜÐQÓRÐRØÐ" }Ð'<Ø ×0Ñ0°Ó;ˆMð Ð%àn¢Q¨¨d²AÐ%5Ñ6×=Ñ=¸aÀÇÁ×AhÑAhÐjkÐmnÓoÑoÜ—‘˜DŸJ™JÓ'×+Ñ+ñ',Ð#ð '>×&@Ñ&@À×ATÑATÓ&UÑ#à&*Ð#à,0×,DÑ,DÀ]×EXÑEXÐY[ÐZ[ÐE\Ð^k×^rÑ^rÓ,sÑ)Ð˜\à%Ð(;Ñ;ˆØ×2Ñ2°=ÓAˆÜŸ™×-Ñ-¨m¸t¿{¹{×?RÑ?RÐ]a×]jÑ]jÐ-Ókˆá&:¡ÀÐÙ0™°dˆð Ð Ø—>‘>Ó# AÑ&¬3¨t¯{©{Ó+;Ò<ð Ø8¼¸T¿[¹[Ó9IÐ8JÐJaÐbk×bpÑbpÓbrÐstÑbuÐavÐvwÐxóÐ<ô #,¨D¯K©KÓ"8ò 	FÑˆCÙ#Ø(=ÀÐ@PÑ(PÐ%à×*Ò*¨t¯}ª}Ø $× AÑ AØ!×*Ñ*Ø!Ø+Ø'0Ð'<Y˜s’^À$Ø%ó!‘ñ !.Ø!Ø#:Ø7@Ð7L Y¨s¢^ÐRVØ&7ô	!ð *¨!Ñ,ˆMâ Ø!/°=ÀÑ3CÐ2EÑ!E‘ð/	Fñ2  Ø$9¸]Ð<LÑ$LÐ!áÜÑl ]Ð4IÈ>Ð$ZÔlÓlÐlÜØ+Ð;PÐ]kô
ð 	
r!   r¨   r3  )rl   rm   rn   r   r   r‘   r¬   rj  ro  r   r   r   r   rñ   r   r   r   r¶   r½   r¾   s   @r    rZ  rZ  >  sâ   ø„ ñÐ/ð À"Ç,Á,õ ò,$ò%ð ð -1Ø15Ø,0Ø04Ø,0Ø/3Ø&*ñ]
à˜EŸL™LÑ)ð]
ð ! §¡Ñ.ð]
ð ˜EŸL™LÑ)ð	]
ð
   §¡Ñ-ð]
ð $ D™>ð]
ð ' t™nð]
ð ˜d‘^ð]
ð 
ˆuoÐ%Ñ	&ò]
ó ô]
r!   rZ  z=
    The standalone decoder part of the ProphetNetModel.
    c                   óÀ  ‡ — e Zd Zddedeej                     fˆ fd„Zd„ Zd„ Z	e
	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     d	eej                     d
eej                     deej                     deeeej                           deej                     dee   dee   dee   dee   deeef   fd„«       Zd„ Zd„ Zd„ Zˆ xZS )ÚProphetNetDecoderr   r[  c                 óÀ  •— t         ‰|   |«       |j                  | _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        ||n5t        j                  |j                  |j                  |j                  ¬«      | _        t        |«      | _        t        j                  | j                  |j                  d«      | _        t        j"                  t%        |j&                  «      D cg c]  }t)        |«      ‘Œ c}«      | _        t-        |j                  «      | _        d| _        | j3                  «        yc c}w r]  )r«   r¬   r0   r?   rÿ   rÄ   r©   Úmax_target_positionsr   r‘   r_  r­   r˜   r[  r¥   r`  Úngram_embeddingsrb  r+   Únum_decoder_layersrE  rd  r	   ra  re  rf  rg  s       €r    r¬   zProphetNetDecoder.__init__Ç  s  ø€ ô 	‰Ñ˜Ô à—\‘\ˆŒ
Ø!×-Ñ-ˆÔØ%+×%AÑ%AˆÔ"Ø—~‘~ˆŒØ$*×$BÑ$BˆÔ!ð Ð*ñ ä—‘˜f×/Ñ/°×1CÑ1CÐQW×QdÑQdÔeð 	Ôô
 $BÀ&Ó#IˆÔ ä "§¡¨T¯Z©Z¸×9KÑ9KÈTÓ RˆÔÜ—m‘mÌUÐSY×SlÑSlÓMmÖ$nÈÔ%;¸FÕ%CÒ$nÓoˆŒÜ%.¨v×/AÑ/AÓ%BˆÔ"à&+ˆÔ#à‰Õùò %os   ÄEc                 ó   — | j                   S r¨   ri  rh   s    r    rj  z&ProphetNetDecoder.get_input_embeddingsä  rk  r!   c                 ó   — || _         y r¨   ri  rm  s     r    ro  z&ProphetNetDecoder.set_input_embeddingsç  rp  r!   rž   r¸   r`   Úencoder_attention_maskrq  Úcross_attn_head_maskrY   rr  rK  r×   rs  rt  r¦   c                 óø
  — |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|€|€t        d«      ‚||t        d«      ‚||€| j                  |«      }|j                  dd \  }}| j                  ||f|j                  |¬«      \  }}|d\  }}n| j                  |«      \  }}| j                  j                  |dz   «      }||z   }| j                  j                  }|\|j                  d«      dk(  sJ d«       ‚t        | j                   «      D cg c]  }||dz
     |z   j#                  |dd«      ‘Œ  }}d}d}nOt        | j                   «      D cg c]  }||dz
     |z   ‘Œ }}| j%                  ||«      }| j'                  ||«      }||d	|dd…dddd…f   j#                  d| j                   j(                  dd«      z
  t+        j,                  | j.                  «      j0                  z  }|j3                  |j.                  «      }nd}t+        j4                  |g|z   d«      }| j6                  r| j7                  |«      }t8        j:                  j=                  || j<                  | j>                  ¬
«      }|rdnd}|r| j                   j                   dkD  rdnd}|
rdnd}|
rdnd}|
r| j                   j@                  rdnd}| jB                  r%| j>                  r|	rtD        jG                  d«       d}	|	rdnd} tI        ||gddg«      D ]f  \  }!}"|!€Œ	|!j                  «       d   tK        | jL                  «      k(  rŒ3J d|"› dtK        | jL                  «      › d|j                  «       d   › d«       ‚ tO        | jL                  «      D ]  \  }#}$|r7||dd…d|…f   fz  }| j                   j                   dkD  r||dd…|d…f   fz  }|||#   nd}%| jB                  rC| j>                  r7| jQ                  |$jR                  |||||||#   nd|||#   nd||||d|	|
«      }&n# |$|||||||#   nd|||#   nd|||||%|	|
¬«      }&|&d   }|	r| |&|
rdnd   fz  } |
sŒÒ||&d   fz  }||&d   fz  }| j                   j@                  sŒû||&d   fz  }Œ |r7||dd…d|…f   fz  }| j                   j                   dkD  r||dd…|d…f   fz  }|dd…d|…f   }'| j                   j                   dkD  r|dd…|d…f   nd}(|stU        d„ |'|(| |||||fD «       «      S tW        |'|(| |||||¬«      S c c}w c c}w )aY  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetDecoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetDecoder.from_pretrained("microsoft/prophetnet-large-uncased", add_cross_attention=False)
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```NzGEither `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.zFMake sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.r%   )r#   rY   )NNr   zOAt the moment `use_cache` is only supported for `decoder_input_ids` of length 1rv  rÜ   rs   r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Frq  r“  zThe `z` should be specified for rw  rx  )r¸   r`   rN  rÕ   rO  r  rQ   rR   rN   rÖ   rK  r×   r  r
   c              3   ó$   K  — | ]  }||–— Œ
 y ­wr¨   rs   rz  s     r    r}  z,ProphetNetDecoder.forward.<locals>.<genexpr>°  s   è ø€ ò àð =ô ñùs   ‚)rv   rw   rY   r{   r|   r}   r~   r^   ),r   rK  r×   rs  r  râ   r[  rš   r`  r#   Ú!compute_buffered_relative_bucketsr¼   rŽ  rŠ   rM   r+   r0   rL   Úprepare_attention_maskÚprepare_predict_attention_maskr   r   r'   r   r(   r  r.   ra  r   r   rÄ   rÞ   rG  re  ÚloggerÚwarning_onceÚzipr€  rd  r  r‚  rƒ  r„  rz   ))ri   rž   r¸   r`   r’  rq  r“  rY   rr  rK  r×   rs  rt  rä   r/   Úmain_stream_pos_embedrN   rQ   rR   Úpredicting_stream_pos_embedr{   rŽ  r0   Úngram_hidden_statesr…  r  Úextended_encoder_attention_maskÚall_main_stream_hidden_statesÚall_ngram_stream_hidden_statesÚall_main_stream_attnsÚall_ngram_stream_attnsÚall_cross_attnsÚpresent_key_valuesÚ	attn_maskÚ	mask_namer‡  Údecoder_layerrÖ   r‰  rv   rw   s)                                            r    r¶   zProphetNetDecoder.forwardê  s×  € ðH "+Ð!6‘I¸D¿K¹K×<QÑ<Qˆ	Ø1BÐ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ð$DÑ È$Ï+É+×JjÑJjð 	ð &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆàÐ Ð!6ÜÐfÓgÐgØÐ" }Ð'@ÜÐeÓfÐfØÐ" }Ð'<Ø ×0Ñ0°Ó;ˆMà&3×&9Ñ&9¸"¸1Ð&=Ñ#ˆ
Oà.2×.FÑ.FØ˜Ð)Ø ×'Ñ'Ø+ð /Gó /
Ñ+Ð˜|ð Ð&ØPZÑMÐ*Ñ,Mð
 ×6Ñ6°|ÓDñØ.Ø1à&*×&>Ñ&>×&GÑ&GÈÐWXÑHXÓ&YÐ#ð &Ð(=Ñ=ˆà×0Ñ0×7Ñ7Ðð Ð&Ø ×%Ñ% aÓ(¨AÒ-ð ØaóÐ-ô # 4§:¡:Ó.ö#àð " %¨!¡)Ñ,Ð/JÑJ×RÑRÐS]Ð_`ÐbcÕdð#Ðð #ð '+Ð#Ø.2Ñ+ô Z_Ð_c×_iÑ_iÓYjö#ØPUÐ! %¨!¡)Ñ,Ð/JÓJð#Ðð #ð '+×&AÑ&AÀ-ÐQ_Ó&`Ð#Ø.2×.QÑ.QÐR_ÐaoÓ.pÐ+ð "Ð-àÐ,ªQ°°dºAÐ-=Ñ>×EÑEÀaÈÏÉ×IpÑIpÐrsÐuvÓwÑwÜ—‘˜DŸJ™JÓ'×+Ñ+ñ/,Ð+ð /N×.PÑ.PÐQ^×QdÑQdÓ.eÑ+à.2Ð+äŸ	™	 = /Ð4GÑ"GÈÓKˆà×%Ò%Ø ×6Ñ6°}ÓEˆMäŸ™×-Ñ-¨m¸t¿|¹|ÐVZ×VcÑVcÐ-Ódˆñ /C©ÈÐ%Ù/CÈÏÉ×HYÑHYÐ\]ÒH]©ÐcgÐ&á&7¡¸TÐÙ'8¡¸dÐÙ 1°d·k±k×6UÒ6U™"Ð[_ˆà×&Ò&¨4¯=ª=ÙÜ×#Ñ#Øpôð "	á#,™R°$Ðô %(¨Ð4HÐ(IÈKÐYoÐKpÓ$qò 	Ñ ˆIyØÑ$Ø —~‘~Ó'¨Ñ*¬s°4·;±;Ó/?Ó@ð Ø˜I˜;Ð&@ÄÀTÇ[Á[ÓAQÐ@Rð SØ!Ÿ™Ó(¨Ñ+Ð,¨Að/óÐ@ð	ô #,¨D¯K©KÓ"8ó 7	;ÑˆCÙ#à-°-ÂÐCSÀOÐCSÐ@SÑ2TÐ1VÑVÐ-Ø—;‘;×$Ñ$ qÒ(Ø2°}ÂQÈÑHXÐEXÑ7YÐ6[Ñ[Ð2à5DÐ5P˜_¨SÒ1ÐVZˆNà×*Ò*¨t¯}ª}Ø $× AÑ AØ!×*Ñ*Ø!Ø+Ø)Ø3Ø'0Ð'<Y˜s’^À$Ø2FÐ2RÐ)¨#Ò.ÐX\Ø3Ø2Ø5Ø ØØØ%ó!‘ñ" !.Ø!Ø#:Ø*?Ø&EØ7@Ð7L Y¨s¢^ÐRVà5IÐ5UÐ,¨SÒ1Ð[_à4SØ3QØ6WØ!-Ø#1Ø'Ø&7ô!ð$ *¨!Ñ,ˆMáØ" }Ñ:K±QÐQRÑ'SÐ&UÑUÐ"â Ø%¨-¸Ñ*:Ð)<Ñ<Ð%Ø&¨=¸Ñ+;Ð*=Ñ=Ð&à—;‘;×2Ó2Ø#¨°aÑ(8Ð':Ñ:’Oðo7	;ñr  Ø)¨mºAÐ?OÀÐ?OÐ<OÑ.PÐ-RÑRÐ)Ø{‰{× Ñ  1Ò$Ø.°=ÂÀOÑDTÐATÑ3UÐ2WÑWÐ.ð *ª!Ð-=¨oÐ-=Ð*=Ñ>ÐØHLÏÉ×HYÑHYÐ\]ÒH] -²°?Ñ3CÐ0CÒ"DÐcgÐáÜñ ð &Ø+Ø&Ø1Ø2Ø)Ø*Ø#ð	ôó ð ô ,Ø/Ø$;Ø.Ø7Ø >Ø,Ø3Ø,ô	
ð 		
ùòM#ùò#s   Å#U2ÆU7c           	      óð  — |j                   \  }}t        j                  d| j                  «      j	                  |j
                  «      j                  dd«      }t        | j                  | j                  |«      \  }}|d d …d |…d |…f   j                  |dd«      }t        j                  |d d …d |…d |…f   |d d …d |…| j                  | j                  |z   …f   gd«      j                  |dd«      }||fS rÎ   )rš   r   r.  r  r  r#   rL   rS   r?   rÿ   r.   )ri   rN   rä   r/   Úmain_relative_bucketsÚpredict_relative_bucketss         r    r–  z3ProphetNetDecoder.compute_buffered_relative_bucketsÉ  s!  € Ø&2×&8Ñ&8Ñ#ˆ
Oä—|‘| A t×'@Ñ'@ÓA×DÑDÀ\×EXÑEXÓY×`Ñ`ÐabÐdeÓfˆÜ:]Ø×Ñ˜d×8Ñ8¸,ó;
Ñ7ÐÐ7ð
 !6²aÐ9I¸/Ð9IÐK[ÈOÐK[Ð6[Ñ \× cÑ cÐdnÐpqÐstÓ uÐÜ#(§9¡9à(ªÐ,<¨_Ð,<Ð>N¸Ð>NÐ)NÑOØ(ÚÐ'˜Ð'¨×)BÑ)BÀT×E^ÑE^ÐapÑEpÐ)pÐpñðð ó$
÷ ‰&˜Q Ó
"ð 	!ð %Ð&>Ð>Ð>r!   c                 óL  — |j                   d d \  }}t        j                  ||ft        j                  |j                  «      j
                  |j                  |j                  ¬«      }t        j                  |d«      }|d |…d |…f   d d d d …d d …f   j                  || j                  j                  f|j                   z   «      }|@d|d d …d d d d …f   z
  t        j                  | j                  «      j
                  z  }||z   }n|}|j                  |j                  «      S )Nr%   r±   r   rv  )rš   r   Úfullr'   r   r(   r#   ÚtriuÚexpandr   r   r  )ri   r{   r¸   rä   Ú
seq_lengthÚcausal_maskÚextended_causal_maskr…  s           r    r—  z(ProphetNetDecoder.prepare_attention_maskß  s%  € Ø!.×!4Ñ!4°R°aÐ!8Ñˆ
Jô —j‘jØ˜Ð$ÜK‰K˜×+Ñ+Ó,×0Ñ0Ø×%Ñ%Ø ×'Ñ'ô	
ˆô —j‘j ¨aÓ0ˆà*¨;¨J¨;¸¸¸Ð+CÑDÀTÈ4ÒQRÒTUÐEUÑV×]Ñ]Ø˜Ÿ™×@Ñ@ÐAÀK×DUÑDUÑUó 
Ðð
 Ð%Ø'*¨^ºA¸tÀTÊ1Ð<LÑ-MÑ'MÔQV×Q\ÑQ\Ð]a×]gÑ]gÓQh×QlÑQlÑ&lÐ#Ø&:Ð=TÑ&TÑ#à&:Ð#Ø&×)Ñ)¨-×*=Ñ*=Ó>Ð>r!   c           	      ó&  — |j                   d d \  }}t        | j                  | j                  |j                  |j
                  «      }t        j                  |d d …d |…d |…f   |d d …d |…| j                  | j                  |z   …f   gd¬«      }|d d d d …d d …d d …f   j                  || j                  j                  f|j                   z   «      }|¡d|d d …d d d d d …f   z
  t        j                  | j
                  «      j                  z  }|j                  || j                  j                  | j                  ||f«      }t        j                  |t        j                  |«      gd¬«      }||z   }n|}|j                  |j
                  «      S )Nr%   rJ   r   rv  )rš   r4   r  r0   r#   r   r   r.   r¯  r   r   r'   r(   r7   r  )	ri   r{   r¸   rä   r°  Úpredict_causal_maskÚextended_predict_causal_maskr…  r  s	            r    r˜  z0ProphetNetDecoder.prepare_predict_attention_mask÷  sª  € Ø!.×!4Ñ!4°R°aÐ!8Ñˆ
Jô 3Ø×%Ñ% t§z¡z°=×3GÑ3GÈ×I\ÑI\ó
Ðô $Ÿi™ià#¢A {¨
 {°K°Z°KÐ$?Ñ@Ø#Ú{˜
{ D×$=Ñ$=À×@YÑ@YÐ\fÑ@fÐ$fÐfñðð ô
Ðð (;¸4ÀÂqÊ!ÊQÐ;NÑ'O×'VÑ'VØ˜Ÿ™×@Ñ@ÐAÐDW×D]ÑD]Ñ]ó(
Ð$ð
 Ð%Ø'*¨^ºA¸tÀTÈ4ÒQRÐ<RÑ-SÑ'SÔW\×WbÑWbÐcg×cmÑcmÓWn×WrÑWrÑ&rÐ#Ø&=×&DÑ&DØ˜TŸ[™[×DÑDÀdÇjÁjÐR\Ð^hÐió'Ð#ô ',§i¡iØ(¬%×*:Ñ*:Ð;RÓ*SÐTÐZ\ô'Ð#ð /KÐMdÑ.dÑ+à.JÐ+Ø.×1Ñ1°-×2EÑ2EÓFÐFr!   r¨   )NNNNNNNNNNNN)rl   rm   rn   r   r   r   r‘   r¬   rj  ro  r   r   r   r   rñ   r   rz   r¶   r–  r—  r˜  r½   r¾   s   @r    r‹  r‹  Á  sq  ø„ ñÐ/ð À(È2Ï<É<ÑBXõ ò:$ò%ð ð -1Ø15Ø8<Ø9=Ø,0Ø7;Ø@DØ04Ø$(Ø,0Ø/3Ø&*ñ\
à˜EŸL™LÑ)ð\
ð ! §¡Ñ.ð\
ð  (¨¯©Ñ5ð	\
ð
 !)¨¯©Ñ 6ð\
ð ˜EŸL™LÑ)ð\
ð ' u§|¡|Ñ4ð\
ð " %¨¨e¯l©lÑ(;Ñ"<Ñ=ð\
ð   §¡Ñ-ð\
ð ˜D‘>ð\
ð $ D™>ð\
ð ' t™nð\
ð ˜d‘^ð\
ð 
ˆuÐ2Ð2Ñ	3ò\
ó ð\
ò|?ò,?ö0!Gr!   r‹  c            $       óô  ‡ — e Zd ZddgZdefˆ fd„Zd„ Zd„ Zd„ Zd„ Z	d	„ Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
eej                     deej                     deej                     deej                     deej                     deej                     deej                     dee   deeeej                           deej                     deej                     dee   dee   dee   dee   deeef   f d„«       Zˆ xZS )ÚProphetNetModelúencoder.word_embeddings.weightúdecoder.word_embeddings.weightr   c                 ó²  •— t         ‰|   |«       t        j                  |j                  |j
                  |j                  ¬«      | _        t        j                  |«      }d|_
        d|_        t        || j                  «      | _        t        j                  |«      }d|_        d|_
        t        || j                  «      | _        | j#                  «        y )Nr^  FT)r«   r¬   r   r‘   r_  r­   r˜   r[  ÚcopyÚdeepcopyÚis_encoder_decoderrK  rZ  ÚencoderÚ
is_decoderr‹  Údecoderrf  )ri   r   Úencoder_configÚdecoder_configr¯   s       €r    r¬   zProphetNetModel.__init__  s¨   ø€ Ü‰Ñ˜Ô Ü!Ÿ|™|¨F×,=Ñ,=¸v×?QÑ?QÐ_e×_rÑ_rÔsˆÔäŸ™ vÓ.ˆØ,1ˆÔ)Ø#(ˆÔ Ü(¨¸×9MÑ9MÓNˆŒäŸ™ vÓ.ˆØ$(ˆÔ!Ø,1ˆÔ)Ü(¨¸×9MÑ9MÓNˆŒð 	‰Õr!   c                 ó   — | j                   S r¨   ri  rh   s    r    rj  z$ProphetNetModel.get_input_embeddings0  rk  r!   c                 ó~   — || _         | j                   | j                  _         | j                   | j                  _         y r¨   )r[  r¾  rÀ  rm  s     r    ro  z$ProphetNetModel.set_input_embeddings3  s.   € Ø$ˆÔØ'+×';Ñ';ˆ‰Ô$Ø'+×';Ñ';ˆ‰Õ$r!   c                 óò   — | j                   j                  ra| j                  | j                  j                  | j                  «       | j                  | j
                  j                  | j                  «       y y r¨   )r   Útie_word_embeddingsÚ_tie_or_clone_weightsr¾  r[  rÀ  rh   s    r    Ú_tie_weightszProphetNetModel._tie_weights8  sT   € Ø;‰;×*Ò*Ø×&Ñ& t§|¡|×'CÑ'CÀT×EYÑEYÔZØ×&Ñ& t§|¡|×'CÑ'CÀT×EYÑEYÕZð +r!   c                 ó   — | j                   S r¨   )r¾  rh   s    r    Úget_encoderzProphetNetModel.get_encoder=  ó   € Ø|‰|Ðr!   c                 ó   — | j                   S r¨   ©rÀ  rh   s    r    Úget_decoderzProphetNetModel.get_decoder@  rË  r!   rž   r¸   Údecoder_input_idsÚdecoder_attention_maskrq  Údecoder_head_maskr“  Úencoder_outputsrY   rr  Údecoder_inputs_embedsrK  r×   rs  rt  r¦   c                 ó\  — ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|€| j                  ||||
|||¬«      }| j                  |||d   ||||	|||||¬«      }|s||z   S t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  ¬«      S )a7  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetModel

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetModel.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
        >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
        ```)rž   r¸   rq  rr  r×   rs  rt  r   )rž   r¸   r`   r’  rq  r“  rY   rr  r×   rs  rK  rt  )rv   rw   rY   rZ   r[   r\   r]   r^   r_   r`   ra   )r   rK  r×   rs  r  r¾  rÀ  ru   rv   rw   rY   r{   r|   r}   r~   r^   )ri   rž   r¸   rÏ  rÐ  rq  rÑ  r“  rÒ  rY   rr  rÓ  rK  r×   rs  rt  Údecoder_outputss                    r    r¶   zProphetNetModel.forwardC  sT  € ðr "+Ð!6‘I¸D¿K¹K×<QÑ<Qˆ	Ø1BÐ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ð$DÑ È$Ï+É+×JjÑJjð 	ð &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆàÐ"Ø"Ÿl™lØ#Ø-Ø#Ø+Ø"3Ø%9Ø'ð +ó ˆOð Ÿ,™,Ø'Ø1Ø"1°!Ñ"4Ø#1Ø'Ø!5Ø+Ø/Ø/Ø!5ØØ#ð 'ó 
ˆñ Ø" _Ñ4Ð4Ü+Ø-×?Ñ?Ø$3×$KÑ$KØ+×;Ñ;Ø"1×"?Ñ"?Ø(7×(KÑ(KØ.×9Ñ9Ø%4×%EÑ%EØ,×=Ñ=Ø&5×&GÑ&GØ"1×"?Ñ"?Ø.×9Ñ9ô
ð 	
r!   )NNNNNNNNNNNNNNN)rl   rm   rn   Ú_tied_weights_keysr   r¬   rj  ro  rÈ  rÊ  rÎ  r   r   r   r   Ú
BoolTensorr   rñ   r   ru   r¶   r½   r¾   s   @r    r·  r·    s®  ø„ à:Ð<\Ð]ÐðÐ/õ ò"$ò<ò
[ò
òð ð -1Ø15Ø48Ø=AØ,0Ø48Ø7;Ø+/Ø@DØ04Ø8<Ø$(Ø,0Ø/3Ø&*ñ!h
à˜EŸL™LÑ)ðh
ð ! §¡Ñ.ðh
ð $ E§L¡LÑ1ð	h
ð
 !)¨×)9Ñ)9Ñ :ðh
ð ˜EŸL™LÑ)ðh
ð $ E§L¡LÑ1ðh
ð ' u§|¡|Ñ4ðh
ð " %™ðh
ð " %¨¨e¯l©lÑ(;Ñ"<Ñ=ðh
ð   §¡Ñ-ðh
ð  (¨¯©Ñ5ðh
ð ˜D‘>ðh
ð $ D™>ðh
ð ' t™nðh
ð  ˜d‘^ð!h
ð" 
ˆuÐ2Ð2Ñ	3ò#h
ó ôh
r!   r·  zh
    The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.
    c            &       óf  ‡ — e Zd Zg d¢Zdefˆ fd„Zd„ Zd„ Zd„ Zd„ Z	e
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     deej                     deej                     deej                     deej                     deej                     deeeej                           deej                     deej                     deej                     dee   dee   dee   dee   deeef   f"d„«       Zd d„Zdej                  fd„Zed„ «       Zd„ Zd„ Zˆ xZS )!Ú"ProphetNetForConditionalGeneration)r¸  r¹  úlm_head.weightr   c                 ó
  •— t         ‰|   |«       t        |«      | _        |j                  | _        |j                  | _        t        j                  |j                  |j                  d¬«      | _        | j                  «        y )NF©r   )r«   r¬   r·  rƒ   r˜   r’   Údisable_ngram_lossr   r‰   r­   r_  Úlm_headrf  r®   s     €r    r¬   z+ProphetNetForConditionalGeneration.__init__·  sd   ø€ Ü‰Ñ˜Ô Ü)¨&Ó1ˆŒØ!×.Ñ.ˆÔØ"(×";Ñ";ˆÔä—y‘y ×!3Ñ!3°V×5FÑ5FÈUÔSˆŒð 	‰Õr!   c                 ó   — | j                   S r¨   ©rÞ  rh   s    r    Úget_output_embeddingsz8ProphetNetForConditionalGeneration.get_output_embeddingsÂ  rË  r!   c                 ó   — || _         y r¨   rà  ©ri   Únew_embeddingss     r    Úset_output_embeddingsz8ProphetNetForConditionalGeneration.set_output_embeddingsÅ  ó	   € Ø%ˆr!   c                 ó’   — | j                   j                  r1| j                  | j                  j                  | j
                  «       y y r¨   )r   rÆ  rÇ  rƒ   r[  rÞ  rh   s    r    rÈ  z/ProphetNetForConditionalGeneration._tie_weightsÈ  s2   € Ø;‰;×*Ò*Ø×&Ñ& t§¡×'FÑ'FÈÏÉÕUð +r!   c                 ó.   — | j                   j                  S r¨   )rƒ   r[  rh   s    r    rj  z7ProphetNetForConditionalGeneration.get_input_embeddingsÌ  s   € Ø‰×.Ñ.Ð.r!   rž   r¸   rÏ  rÐ  rq  rÑ  r“  rÒ  rY   rr  rÓ  ÚlabelsrK  r×   rs  rt  r¦   c                 ó`  — ||n| j                   j                  }||€|€| j                  |«      }| j                  |||||||||	|
|||||¬«      }||j                  n|j                  dd \  }}|d   j                  || j                   j                  |d«      }| j                  |«      }|dd…df   }| j                   j                  dkD  r|dd…dd…f   nd}|j                  «       s|j                  «       }d}|| j                  ||«      }|s*t        d„ ||fD «       «      }||f|z   |dd z   S ||dd z   S t        ||||j                  |j                  |j                  |j                   |j"                  |j$                  |j&                  |j(                  |j*                  ¬«      S )	aË	  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> logits_next_token = outputs.logits  # logits to predict next token as usual
        >>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
        ```N)rž   r¸   rÏ  rÐ  rq  rÑ  r“  rÒ  rY   rr  rÓ  rK  r×   rs  rt  r%   r   rJ   r   c              3   ó&   K  — | ]	  }|€Œ|–— Œ y ­wr¨   rs   rz  s     r    r}  z=ProphetNetForConditionalGeneration.forward.<locals>.<genexpr>7  ó   è ø€ ÒR QÀAÁMœqÑRùr~  )rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   )r   r  r    rƒ   rš   rÐ   r0   rÞ  Úis_contiguousrÒ   Ú_compute_lossr„  rU   rY   rZ   r[   r\   r]   r^   r_   r`   ra   )ri   rž   r¸   rÏ  rÐ  rq  rÑ  r“  rÒ  rY   rr  rÓ  ré  rK  r×   rs  rt  rA  rä   r/   Úpredicting_streamsÚpredict_logitsrW   rX   rV   Ú
all_logitss                             r    r¶   z*ProphetNetForConditionalGeneration.forwardÏ  sù  € ð| &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆàÐÐ"3Ð";Ð@UÐ@]à $× 1Ñ 1°&Ó 9Ðà—/‘/ØØ)Ø/Ø#9ØØ/Ø!5Ø+Ø+Ø'Ø"7ØØ/Ø!5Ø#ð "ó 
ˆð$ (9Ð'DÐ×#Ò#ÐJ_×JeÑJeÐfhÐghÐJiñ 	$ˆ
Oð % Q™ZŸ_™_¨Z¸¿¹×9JÑ9JÈOÐ]_Ó`ÐØŸ™Ð&8Ó9ˆà¢ 1 Ñ%ˆØ04·±×0AÑ0AÀAÒ0E~¢a¨© eÒ,È4ˆð ×#Ñ#Ô%Ø×&Ñ&Ó(ˆFàˆØÐØ×%Ñ% n°fÓ=ˆDáÜÑR¨6°<Ð*@ÔRÓRˆJØ9=Ð9ID7˜ZÑ'¨'°!°"¨+Ñ5ÐgÈzÐ\cÐdeÐdfÐ\gÑOgÐgä,ØØØ)Ø '× 7Ñ 7Ø&-×&CÑ&CØ,3×,OÑ,OØ#*×#=Ñ#=Ø)0×)IÑ)IØ!(×!9Ñ!9Ø*1×*KÑ*KØ&-×&CÑ&CØ#*×#=Ñ#=ôð r!   c                 óÌ  — |j                  | j                  j                  |j                  d«      |j                  d«      «      j	                  |«      }t        | j                  j                  «      D ]!  }|dkD  r| j                  r n|||d d …d d …f<   Œ# |j                  dd«      j                  «       }t        j                  j                  |j                  d|j                  d«      «      dt        j                  ¬«      }t        j                  j                  ||j                  d«      d¬«      }| j                  j                   dkD  r“|j#                  dd¬	«       }|j%                  |«      j                  d«      }	||	   }|j'                  «       }| j                  j                   |j                  d«      z  }
d
| j                  j                   z
  |z  |
|z  z   }|S ©Nr   r   rJ   r   r†   )Ú	reductionr…   T)r   Úkeepdimrv  ©r™   r   r0   rM   Úfill_r+   rÝ  rÑ   rÒ   r   r   Úlog_softmaxrÐ   r   r   Únll_lossÚepsÚsumÚner†   ©ri   rW   ré  Úignore_indexÚexpend_targetsÚiÚlprobsrV   Úsmooth_lossÚnon_masked_tokensÚeps_is              r    rî  z0ProphetNetForConditionalGeneration._compute_lossI  ó–  € Ø×)Ñ)¨$¯+©+×*;Ñ*;¸V¿[¹[È»^ÈVÏ[É[ÐYZË^Ó\×bÑbÐcoÓpˆät—{‘{×(Ñ(Ó)ò 	-ˆAØ1Šu˜×0Ò0ÙØ&,ˆN˜1ša¢˜7Ò#ð	-ð
 ×!Ñ! ! QÓ'×2Ñ2Ó4ˆÜ—‘×*Ñ*ØK‰K˜˜FŸK™K¨›OÓ,ØÜ—-‘-ð +ó 
ˆô }‰}×%Ñ% f¨n×.AÑ.AÀ"Ó.EÐQWÐ%ÓXˆà;‰;?‰?˜SÒ Ø!Ÿ:™:¨"°d˜:Ó;Ð;ˆKØ .× 1Ñ 1°,Ó ?× DÑ DÀRÓ HÐØ%Ð&7Ñ8ˆKØ%×*Ñ*Ó,ˆKà—K‘K—O‘O f§k¡k°"£oÑ5ˆEØ˜$Ÿ+™+Ÿ/™/Ñ)¨TÑ1°E¸KÑ4GÑGˆDàˆr!   c                 ó$   — | j                  |«      S r¨   )r    )ri   ré  s     r    Ú%prepare_decoder_input_ids_from_labelszHProphetNetForConditionalGeneration.prepare_decoder_input_ids_from_labelse  s   € Ø× Ñ  Ó(Ð(r!   c                 ó\   ‡— d}| D ]#  }|t        ˆfd„|d d D «       «      |dd  z   fz  }Œ% |S )Nrs   c              3   ót   •K  — | ]/  }|j                  d ‰j                  |j                  «      «      –— Œ1 y­w©r   N©Úindex_selectr  r#   ©r{  Ú
past_stateÚbeam_idxs     €r    r}  zDProphetNetForConditionalGeneration._reorder_cache.<locals>.<genexpr>o  s.   øè ø€ ÒrÐU_j×-Ñ-¨a°·±¸Z×=NÑ=NÓ1O×PÑrùó   ƒ58r%   ©r„  ©rY   r  Úreordered_pastÚ
layer_pasts    `  r    Ú_reorder_cachez1ProphetNetForConditionalGeneration._reorder_cacheh  sV   ø€ ð ˆØ)ò 	ˆJàÜÓrÐcmÐnpÐopÐcqÔrÓrØ˜Q˜R.ñ!ðñ ‰Nð	ð Ðr!   c                 ó.   — | j                   j                  S r¨   )rƒ   r¾  rh   s    r    rÊ  z.ProphetNetForConditionalGeneration.get_encodert  ó   € Ø‰×&Ñ&Ð&r!   c                 ó.   — | j                   j                  S r¨   ©rƒ   rÀ  rh   s    r    rÎ  z.ProphetNetForConditionalGeneration.get_decoderw  r  r!   )NNNNNNNNNNNNNNNN©r–   )rl   rm   rn   rÖ  r   r¬   rá  rå  rÈ  rj  r   r   r   r   r×  r   rñ   r   rU   r¶   rî  r  Ústaticmethodr  rÊ  rÎ  r½   r¾   s   @r    rÙ  rÙ  ¯  sö  ø„ ò pÐð	Ð/õ 	òò&òVò/ð ð -1Ø15Ø48Ø=AØ,0Ø48Ø7;Ø26Ø@DØ04Ø8<Ø)-Ø$(Ø,0Ø/3Ø&*ñ#wà˜EŸL™LÑ)ðwð ! §¡Ñ.ðwð $ E§L¡LÑ1ð	wð
 !)¨×)9Ñ)9Ñ :ðwð ˜EŸL™LÑ)ðwð $ E§L¡LÑ1ðwð ' u§|¡|Ñ4ðwð " %§,¡,Ñ/ðwð " %¨¨e¯l©lÑ(;Ñ"<Ñ=ðwð   §¡Ñ-ðwð  (¨¯©Ñ5ðwð ˜Ÿ™Ñ&ðwð ˜D‘>ðwð $ D™>ðwð  ' t™nð!wð" ˜d‘^ð#wð$ 
ˆuÐ/Ð/Ñ	0ò%wó ðwórð8)¸E¿L¹Ló )ð ñó ðò'ö'r!   rÙ  zt
    The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal
    c                    óü  ‡ — e Zd Zg d¢Zdefˆ fd„Zd„ Zd„ Zd„ Zd„ Z	d„ Z
d	„ Zd
„ Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                      deej                      deej                      deej                      deej                      deej                      deeeej                            deej                      deej                      dee   dee   dee   dee   deeef   fd„«       Zdd„Z	 	 	 	 dd„Zed„ «       Zˆ xZS ) ÚProphetNetForCausalLM)z!prophetnet.word_embeddings.weightz)prophetnet.decoder.word_embeddings.weightrÚ  r   c                 óP  •— t        j                  |«      }d|_        d|_        t        ‰|   |«       t        |«      | _        |j                  | _	        |j                  | _
        t        j                  |j                  |j                  d¬«      | _        | j!                  «        y )NTFrÜ  )r»  r¼  r¿  r½  r«   r¬   ÚProphetNetDecoderWrapperrƒ   r˜   r’   rÝ  r   r‰   r­   r_  rÞ  rf  r®   s     €r    r¬   zProphetNetForCausalLM.__init__‡  s‚   ø€ ä—‘˜vÓ&ˆØ ˆÔØ$)ˆÔ!Ü‰Ñ˜Ô Ü2°6Ó:ˆŒà!×.Ñ.ˆÔØ"(×";Ñ";ˆÔä—y‘y ×!3Ñ!3°V×5FÑ5FÈUÔSˆŒð 	‰Õr!   c                 óB   — | j                   j                  j                  S r¨   ©rƒ   rÀ  r[  rh   s    r    rj  z*ProphetNetForCausalLM.get_input_embeddings—  s   € Ø‰×&Ñ&×6Ñ6Ð6r!   c                 ó:   — || j                   j                  _        y r¨   r!  rm  s     r    ro  z*ProphetNetForCausalLM.set_input_embeddingsš  s   € Ø27ˆ‰×ÑÕ/r!   c                 ó   — | j                   S r¨   rà  rh   s    r    rá  z+ProphetNetForCausalLM.get_output_embeddings  rË  r!   c                 ó   — || _         y r¨   rà  rã  s     r    rå  z+ProphetNetForCausalLM.set_output_embeddings   ræ  r!   c                 ó¦   — | j                   j                  r;| j                  | j                  j                  j
                  | j                  «       y y r¨   )r   rÆ  rÇ  rƒ   rÀ  r[  rÞ  rh   s    r    rÈ  z"ProphetNetForCausalLM._tie_weights£  s;   € Ø;‰;×*Ò*Ø×&Ñ& t§¡×'>Ñ'>×'NÑ'NÐPT×P\ÑP\Õ]ð +r!   c                 ó&   — || j                   _        y r¨   r  )ri   rÀ  s     r    Úset_decoderz!ProphetNetForCausalLM.set_decoder§  s   € Ø")ˆ‰Õr!   c                 ó.   — | j                   j                  S r¨   r  rh   s    r    rÎ  z!ProphetNetForCausalLM.get_decoderª  r  r!   rž   r¸   r`   r’  rq  r“  rY   rr  ré  rK  r×   rs  rt  r¦   c                 ó¾  — ||n| j                   j                  }| j                  j                  |||||||||
|||¬«      }||j                  n|j                  dd \  }}|d   j                  || j                   j                  |d«      }| j                  |«      }|dd…df   }| j                   j                  dkD  r|dd…dd…f   nd}d}|	| j                  ||	«      }|s*t        d„ ||fD «       «      }||f|z   |dd z   S ||dd z   S t        ||||j                  |j                  |j                  |j                  |j                  |j                   ¬«	      S )	að	  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForCausalLM
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForCausalLM.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits

        >>> # Model can also be used with EncoderDecoder framework
        >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
        >>> import torch

        >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
        >>> tokenizer_dec = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        ...     "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased"
        ... )

        >>> ARTICLE = (
        ...     "the us state department said wednesday it had received no "
        ...     "formal word from bolivia that it was expelling the us ambassador there "
        ...     "but said the charges made against him are `` baseless ."
        ... )
        >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
        >>> labels = tokenizer_dec(
        ...     "us rejects charges against its ambassador in bolivia", return_tensors="pt"
        ... ).input_ids
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])

        >>> loss = outputs.loss
        ```N)rž   r¸   r`   r’  rq  r“  rY   rr  rK  r×   rs  rt  r%   r   rJ   r   c              3   ó&   K  — | ]	  }|€Œ|–— Œ y ­wr¨   rs   rz  s     r    r}  z0ProphetNetForCausalLM.forward.<locals>.<genexpr>  rì  r~  )	rV   rW   rX   rY   r{   r|   r}   r~   r^   )r   r  rƒ   rÀ  rš   rÐ   r0   rÞ  rî  r„  r€   rY   r{   r|   r}   r~   r^   )ri   rž   r¸   r`   r’  rq  r“  rY   rr  ré  rK  r×   rs  rt  rA  rä   r/   rï  rð  rW   rX   rV   rñ  s                          r    r¶   zProphetNetForCausalLM.forward­  s˜  € ðB &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆð —/‘/×)Ñ)ØØ)Ø"7Ø#9ØØ!5Ø+Ø'ØØ/Ø!5Ø#ð *ó 
ˆð :CÐ9N i§o¢oÐTa×TgÑTgÐhjÐijÐTkÑ#ˆ
Oà$ Q™ZŸ_™_¨Z¸¿¹×9JÑ9JÈOÐ]_Ó`ÐØŸ™Ð&8Ó9ˆà¢ 1 Ñ%ˆØ04·±×0AÑ0AÀAÒ0E~¢a¨© eÒ,È4ˆàˆØÐØ×%Ñ% n°fÓ=ˆDáÜÑR¨6°<Ð*@ÔRÓRˆJØ9=Ð9ID7˜ZÑ'¨'°!°"¨+Ñ5ÐgÈzÐ\cÐdeÐdfÐ\gÑOgÐgä,ØØØ)Ø '× 7Ñ 7Ø%×3Ñ3Ø$+×$?Ñ$?Ø"×-Ñ-Ø!(×!9Ñ!9Ø!(×!9Ñ!9ô
ð 
r!   c                 óÌ  — |j                  | j                  j                  |j                  d«      |j                  d«      «      j	                  |«      }t        | j                  j                  «      D ]!  }|dkD  r| j                  r n|||d d …d d …f<   Œ# |j                  dd«      j                  «       }t        j                  j                  |j                  d|j                  d«      «      dt        j                  ¬«      }t        j                  j                  ||j                  d«      d¬«      }| j                  j                   dkD  r“|j#                  dd¬	«       }|j%                  |«      j                  d«      }	||	   }|j'                  «       }| j                  j                   |j                  d«      z  }
d
| j                  j                   z
  |z  |
|z  z   }|S ró  rö  rý  s              r    rî  z#ProphetNetForCausalLM._compute_loss  r  r!   c                 óf   — |€|j                  |j                  «      }|r|d d …dd …f   }|||||dœS )NrJ   )rž   r¸   rq  rY   rK  )Únew_onesrš   )ri   rž   rY   r¸   rq  rK  Úkwargss          r    Úprepare_inputs_for_generationz3ProphetNetForCausalLM.prepare_inputs_for_generation8  sL   € ð Ð!Ø&×/Ñ/°	·±Ó@ˆNáØ!¢! R¡S &Ñ)ˆIð #Ø,Ø"Ø.Ø"ñ
ð 	
r!   c                 óJ   ‡— d}| D ]  }|t        ˆfd„|D «       «      fz  }Œ |S )Nrs   c              3   ót   •K  — | ]/  }|j                  d ‰j                  |j                  «      «      –— Œ1 y­wr
  r  r  s     €r    r}  z7ProphetNetForCausalLM._reorder_cache.<locals>.<genexpr>X  s.   øè ø€ ÒnÐU_j×-Ñ-¨a°·±¸Z×=NÑ=NÓ1O×PÑnùr  r  r  s    `  r    r  z$ProphetNetForCausalLM._reorder_cacheR  s?   ø€ ð ˆØ)ò 	ˆJØÜÓnÐcmÔnÓnðñ ‰Nð	ð Ðr!   )NNNNNNNNNNNNNr  )NNNN)rl   rm   rn   rÖ  r   r¬   rj  ro  rá  rå  rÈ  r'  rÎ  r   r   r   r   r   rñ   r   r€   r¶   rî  r/  r  r  r½   r¾   s   @r    r  r  {  s¶  ø„ òÐðÐ/õ ò 7ò8òò&ò^ò*ò'ð ð -1Ø15Ø8<Ø9=Ø,0Ø7;Ø@DØ04Ø)-Ø$(Ø,0Ø/3Ø&*ñlà˜EŸL™LÑ)ðlð ! §¡Ñ.ðlð  (¨¯©Ñ5ð	lð
 !)¨¯©Ñ 6ðlð ˜EŸL™LÑ)ðlð ' u§|¡|Ñ4ðlð " %¨¨e¯l©lÑ(;Ñ"<Ñ=ðlð   §¡Ñ-ðlð ˜Ÿ™Ñ&ðlð ˜D‘>ðlð $ D™>ðlð ' t™nðlð ˜d‘^ðlð 
ˆuÐ/Ð/Ñ	0òló ðló\ð> ØØØó
ð4 ñó ôr!   r  c                   ó4   ‡ — e Zd ZdZdefˆ fd„Zd„ Zd„ Zˆ xZS )r  z„
    This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from pretrained prophetnet
    classes.
    r   c                 óò   •— t         ‰|   |«       t        j                  |j                  |j
                  |j                  ¬«      | _        t        || j                  ¬«      | _	        | j                  «        y )Nr^  ri  )r«   r¬   r   r‘   r_  r­   r˜   r[  r‹  rÀ  rf  r®   s     €r    r¬   z!ProphetNetDecoderWrapper.__init__c  sX   ø€ Ü‰Ñ˜Ô ä!Ÿ|™|¨F×,=Ñ,=¸v×?QÑ?QÐ_e×_rÑ_rÔsˆÔÜ(¨À×AUÑAUÔVˆŒð 	‰Õr!   c                 ól   — | j                  | j                  | j                  j                  «       «       y r¨   )rÇ  r[  rÀ  rj  rh   s    r    rÈ  z%ProphetNetDecoderWrapper._tie_weightsl  s%   € Ø×"Ñ" 4×#7Ñ#7¸¿¹×9ZÑ9ZÓ9\Õ]r!   c                 ó&   —  | j                   |i |¤ŽS r¨   rÍ  )ri   Úargsr.  s      r    r¶   z ProphetNetDecoderWrapper.forwardo  s   € Øˆt|‰|˜TÐ, VÑ,Ð,r!   )	rl   rm   rn   ro   r   r¬   rÈ  r¶   r½   r¾   s   @r    r  r  ]  s    ø„ ñð
Ð/õ ò^ö-r!   r  )r‹  rZ  r  rÙ  r·  r‚   rB  )9ro   r»  r<   re   Údataclassesr   Útypingr   r   r   r   Útorch.utils.checkpointr   r   Útorch.nnr	   Úactivationsr   Ú
generationr   Úmodeling_outputsr   Úmodeling_utilsr   Úutilsr   r   r   Úconfiguration_prophetnetr   Ú
get_loggerrl   r™  r   r4   rH   rS   rU   ru   rz   r€   r‚   r‘   r¥   ÚModulerÀ   ró   rý   r5  rE  rZ  r‹  r·  rÙ  r  r  Ú__all__rs   r!   r    ú<module>rD     sP  ðñ Yã Û Û Ý !ß )Ñ )ã Û ß Ý å !Ý )Ý /Ý -ß 9Ñ 9Ý 6ð 
ˆ×	Ñ	˜HÓ	%€óQò7ó" ò6Mð. ôQ% ó Q%ó ðQ%ðh ôR% ;ó R%ó ðR%ðj ô8@ ;ó 8@ó ð8@ðv ô:@ ó :@ó ð:@ðz ô#! ó #!ó ð#!ôL(- R§\¡\ô (-ôV~B˜"Ÿ)™)ô ~BôB˜BŸI™Iô ô.|/ 2§9¡9ô |/ô~	(˜RŸY™Yô (ôVQ˜RŸY™Yô Qñh ðôô
{
Ð1ó {
óð
{
ñ| ðôô
RGÐ1ó RGóð
RGðj
 ôP
Ð/ó P
ó ðP
ñf ðôô
D'Ð)BÀOó D'óð
D'ñN ðôô
ZÐ5°ó Zóð
Zôz-Ð8ô -ò,r!   