
    Uh`                       d Z ddlZddlmZ ddlmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ  ej>                  e       Z!e G d de             Z"e G d de             Z#e G d de             Z$e G d de             Z%e G d de             Z&e G d de             Z'e G d de             Z(e G d de             Z)e G d d e             Z*e G d! d"e             Z+ G d# d$e
jX                        Z- G d% d&e
jX                        Z. G d' d(e
jX                        Z/ G d) d*e
jX                        Z0 G d+ d,e
jX                        Z1 G d- d.e
jX                        Z2 G d/ d0e
jX                        Z3 G d1 d2e
jX                        Z4 G d3 d4e
jX                        Z5 G d5 d6e
jX                        Z6 G d7 d8e
jX                        Z7 G d9 d:e
jX                        Z8e G d; d<e             Z9 ed=>       G d? d@e9             Z:dA Z; G dB dCe
jX                        Z< edD>       G dE dFe9             Z= edG>       G dH dIe9             Z> edJ>       G dK dLe9             Z? edM>       G dN dOe9             Z@ edP>       G dQ dRe9             ZA edS>       G dT dUe9             ZBe G dV dWe9             ZCe G dX dYe9             ZDg dZZEy)[zPyTorch LUKE model.    N)	dataclass)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNgelu)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)apply_chunking_to_forward)ModelOutputauto_docstringlogging   )
LukeConfigc                   l    e Zd ZU dZdZeej                     ed<   dZ	ee
ej                  df      ed<   y)BaseLukeModelOutputWithPoolinga  
    Base class for outputs of the LUKE model.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
            Sequence of entity hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token) further processed by a
            Linear layer and a Tanh activation function.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length +
            entity_length, sequence_length + entity_length)`. Attentions weights after the attention softmax, used to
            compute the weighted average in the self-attention heads.
    Nentity_last_hidden_state.entity_hidden_states__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r        x/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/luke/modeling_luke.pyr   r   %   @    2 =Ahu'8'89@DH(5):):C)?#@AHr$   r   c                   l    e Zd ZU dZdZeej                     ed<   dZ	ee
ej                  df      ed<   y)BaseLukeModelOutputa#  
    Base class for model's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
            Sequence of entity hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nr   .r   r   r#   r$   r%   r(   r(   D   r&   r$   r(   c                   l   e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeeej                        ed<   dZeeej                  d	f      ed
<   dZeeej                  d	f      ed<   y)LukeMaskedLMOutputa>	  
    Base class for model's outputs, with potential hidden states and attentions.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            The sum of masked language modeling (MLM) loss and entity prediction loss.
        mlm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        mep_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Masked entity prediction (MEP) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        entity_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the entity prediction head (scores for each entity vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossmlm_lossmep_losslogitsentity_logitshidden_states.r   
attentions)r   r   r   r   r+   r   r    r!   r"   r,   r-   r.   r/   r0   r   r   r1   r#   r$   r%   r*   r*   c   s    > )-D(5$$
%,,0Hhu(()0,0Hhu(()0*.FHU&&'.15M8E--.58<M8E%"3"345<DH(5):):C)?#@AH:>Ju00#567>r$   r*   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)	EntityClassificationOutputay  
    Outputs of entity classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    Nr+   r.   .r0   r   r1   r   r   r   r   r+   r   r    r!   r"   r.   r0   r   r   r1   r#   r$   r%   r3   r3          , )-D(5$$
%,*.FHU&&'.=AM8E%"3"3S"89:ADH(5):):C)?#@AH:>Ju00#567>r$   r3   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)	EntityPairClassificationOutputa~  
    Outputs of entity pair classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    Nr+   r.   .r0   r   r1   r4   r#   r$   r%   r7   r7      r5   r$   r7   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)	EntitySpanClassificationOutputa  
    Outputs of entity span classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, entity_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    Nr+   r.   .r0   r   r1   r4   r#   r$   r%   r9   r9      r5   r$   r9   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)	LukeSequenceClassifierOutputa  
    Outputs of sentence classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nr+   r.   .r0   r   r1   r4   r#   r$   r%   r;   r;          2 )-D(5$$
%,*.FHU&&'.=AM8E%"3"3S"89:ADH(5):):C)?#@AH:>Ju00#567>r$   r;   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)	LukeTokenClassifierOutputa  
    Base class for outputs of token classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided) :
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nr+   r.   .r0   r   r1   r4   r#   r$   r%   r>   r>     r<   r$   r>   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   dZeeej                  df      ed	<   y)
 LukeQuestionAnsweringModelOutputay  
    Outputs of question answering models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nr+   start_logits
end_logits.r0   r   r1   )r   r   r   r   r+   r   r    r!   r"   rA   rB   r0   r   r   r1   r#   r$   r%   r@   r@   /  s    6 )-D(5$$
%,04L(5,,-4.2J**+2=AM8E%"3"3S"89:ADH(5):):C)?#@AH:>Ju00#567>r$   r@   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)	LukeMultipleChoiceModelOutputa  
    Outputs of multiple choice models.

    Args:
        loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
            *num_choices* is the second dimension of the input tensors. (see *input_ids* above).

            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
            layer plus the initial entity embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nr+   r.   .r0   r   r1   r4   r#   r$   r%   rD   rD   T  s    6 )-D(5$$
%,*.FHU&&'.=AM8E%"3"3S"89:ADH(5):):C)?#@AH:>Ju00#567>r$   rD   c                   8     e Zd ZdZ fdZ	 	 	 	 ddZd Z xZS )LukeEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        |j                  | _        t        j                  |j                  |j
                  | j"                        | _	        y )Npadding_idxeps)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutrI   selfconfig	__class__s     r%   rM   zLukeEmbeddings.__init__}  s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<= "..#%<<**F,>,>DL\L\$
 r$   c                    |C|0t        || j                        j                  |j                        }n| j	                  |      }||j                         }n|j                         d d }|:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }| j                  |      }||z   |z   }| j                  |      }| j                  |      }|S )Ndtypedevice)"create_position_ids_from_input_idsrI   tord   &create_position_ids_from_inputs_embedssizer    zeroslongposition_idsrR   rT   rV   rW   r[   )	r]   	input_idstoken_type_idsrk   inputs_embedsinput_shaperT   rV   
embeddingss	            r%   forwardzLukeEmbeddings.forward  s     $A)TM]M]^aabkbrbrs#JJ=Y #..*K',,.s3K!"[[EJJtO`O`OgOghN  00;M"66|D $ : :> J"%88;PP
^^J/
\\*-
r$   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nra   r   rb   r   )rh   r    arangerI   rj   rd   	unsqueezeexpand)r]   rn   ro   sequence_lengthrk   s        r%   rg   z5LukeEmbeddings.create_position_ids_from_inputs_embeds  s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r$   )NNNN)r   r   r   r   rM   rq   rg   __classcell__r_   s   @r%   rF   rF   x  s&    
& B=r$   rF   c                   ~     e Zd Zdef fdZ	 ddej                  dej                  deej                     fdZ xZ	S )LukeEntityEmbeddingsr^   c                    t         |           || _        t        j                  |j
                  |j                  d      | _        |j                  |j                  k7  r1t        j                  |j                  |j                  d      | _
        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                         | _        t        j"                  |j$                        | _        y )Nr   rH   FbiasrJ   )rL   rM   r^   r   rN   entity_vocab_sizeentity_emb_sizeentity_embeddingsrP   Linearentity_embedding_denserS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   s     r%   rM   zLukeEntityEmbeddings.__init__  s    !#f.F.FH^H^lm!n!!V%7%77*,))F4J4JFL^L^ej*kD'#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=r$   
entity_idsrk   rm   c                 R   |t        j                  |      }| j                  |      }| j                  j                  | j                  j
                  k7  r| j                  |      }| j                  |j                  d            }|dk7  j                  |      j                  d      }||z  }t        j                  |d      }||j                  d      j                  d      z  }| j                  |      }||z   |z   }| j                  |      }| j                  |      }|S )Nr   )minra   dimgHz>)r    
zeros_liker   r^   r   rP   r   rT   clamptype_asrt   sumrV   rW   r[   )	r]   r   rk   rm   r   rT   position_embedding_maskrV   rp   s	            r%   rq   zLukeEntityEmbeddings.forward  s'    !"--j9N 22:>;;&&$++*A*AA $ ; ;<M N"66|7I7Ia7I7PQ#/2#5">">?R"S"]"]^`"a14KK#ii(;D14K4O4OTV4O4W4]4]bf4]4gg $ : :> J&)<<?TT
^^J/
\\*-
r$   N)
r   r   r   r   rM   r    
LongTensorr   rq   rw   rx   s   @r%   rz   rz     sL    >z >$ 6:	$$ && !!1!12	r$   rz   c                   2     e Zd Z fdZd Z	 	 	 ddZ xZS )LukeSelfAttentionc                     t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        |j                  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        | j                  rt        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j$                  |j&                        | _        y )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .)rL   rM   rP   num_attention_headshasattr
ValueErrorintattention_head_sizeall_head_sizeuse_entity_aware_attentionr   r   querykeyvalue	w2e_query	e2w_query	e2e_queryrY   attention_probs_dropout_probr[   r\   s     r%   rM   zLukeSelfAttention.__init__  s    : ::a?PVXhHi"6#5#5"6 7334A7 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PP*0*K*K'YYv1143E3EF
99V//1C1CDYYv1143E3EF
**YYv'9'94;M;MNDNYYv'9'94;M;MNDNYYv'9'94;M;MNDNzz&"E"EFr$   c                     |j                         d d | j                  | j                  fz   } |j                  | }|j	                  dddd      S )Nra   r      r   r   )rh   r   r   viewpermute)r]   xnew_x_shapes      r%   transpose_for_scoresz&LukeSelfAttention.transpose_for_scores  sN    ffhsmt'?'?AYAY&ZZAFFK yyAq!$$r$   c                    |j                  d      }||}nt        j                  ||gd      }| j                  | j	                  |            }| j                  | j                  |            }	| j                  r|| j                  | j                  |            }
| j                  | j                  |            }| j                  | j                  |            }| j                  | j                  |            }|d d d d d |d d f   }|d d d d d |d d f   }|d d d d |d d d f   }|d d d d |d d d f   }t        j                  |
|j                  dd            }t        j                  ||j                  dd            }t        j                  ||j                  dd            }t        j                  ||j                  dd            }t        j                  ||gd      }t        j                  ||gd      }t        j                  ||gd      }nF| j                  | j                  |            }t        j                  ||j                  dd            }|t        j                  | j                        z  }|||z   }t         j"                  j%                  |d      }| j'                  |      }|||z  }t        j                  ||	      }|j)                  dddd      j+                         }|j                         d d | j,                  fz   } |j.                  | }|d d d |d d f   }|d }n|d d |d d d f   }|r|||f}|S ||f}|S )Nr   r   ra   r   r   r   r   )rh   r    catr   r   r   r   r   r   r   r   matmul	transposemathsqrtr   r   
functionalsoftmaxr[   r   
contiguousr   r   ) r]   word_hidden_statesr   attention_mask	head_maskoutput_attentions	word_sizeconcat_hidden_states	key_layervalue_layerw2w_query_layerw2e_query_layere2w_query_layere2e_query_layerw2w_key_layere2w_key_layerw2e_key_layere2e_key_layerw2w_attention_scoresw2e_attention_scorese2w_attention_scorese2e_attention_scoresword_attention_scoresentity_attention_scoresattention_scoresquery_layerattention_probscontext_layernew_context_layer_shapeoutput_word_hidden_statesoutput_entity_hidden_statesoutputss                                    r%   rq   zLukeSelfAttention.forward
  sx    '++A.	'#5 #(99.@BV-W]^#_ --dhh7K.LM	//

;O0PQ**/C/O #77

CU8VWO"77GY8Z[O"77G[8\]O"77G[8\]O &aJYJ&9:M%aJYJ&9:M%aIJ&9:M%aIJ&9:M $)<<AXAXY[]_A`#a #(<<AXAXY[]_A`#a #(<<AXAXY[]_A`#a #(<<AXAXY[]_A`#a  %*II/CEY.Z`a$b!&+ii1EG[0\bc&d#$yy*?AX)Y_`a 33DJJ?S4TUK$||K9L9LRQS9TU+dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD$1!ZiZ2B$C!'*.'*79:q8H*I'02M_G  12MNGr$   NNF)r   r   r   rM   r   rq   rw   rx   s   @r%   r   r     s    G0% Pr$   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )LukeSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y NrJ   )rL   rM   r   r   rP   denserW   rX   rY   rZ   r[   r\   s     r%   rM   zLukeSelfOutput.__init___  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r$   r0   input_tensorreturnc                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r[   rW   r]   r0   r   s      r%   rq   zLukeSelfOutput.forwarde  7    

=1]3}|'CDr$   r   r   r   rM   r    Tensorrq   rw   rx   s   @r%   r   r   ^  1    >U\\  RWR^R^ r$   r   c                   2     e Zd Z fdZd Z	 	 	 ddZ xZS )LukeAttentionc                     t         |           t        |      | _        t	        |      | _        t               | _        y r   )rL   rM   r   r]   r   outputsetpruned_headsr\   s     r%   rM   zLukeAttention.__init__m  s0    %f-	$V,Er$   c                     t        d      Nz4LUKE does not support the pruning of attention headsNotImplementedError)r]   headss     r%   prune_headszLukeAttention.prune_headss      !"XYYr$   c                 F   |j                  d      }| j                  |||||      }||d   }|}	n3t        j                  |d d d      }t        j                  ||gd      }	| j	                  ||	      }
|
d d d |d d f   }|d }n|
d d |d d d f   }||f|dd  z   }|S )Nr   r   r   r   )rh   r]   r    r   r   )r]   r   r   r   r   r   r   self_outputsconcat_self_outputsr   attention_outputword_attention_outputentity_attention_outputr   s                 r%   rq   zLukeAttention.forwardv  s     '++A.	yy 
  '".q/#5 "'))L!,<!"D#(99.@BV-W]^#_ ;;':<PQ 0JYJ1A B'&*#&6q)*a7G&H# )*AB\RSRTEUUr$   r   )r   r   r   rM   r   rq   rw   rx   s   @r%   r   r   l  s    "Z "r$   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )LukeIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )rL   rM   r   r   rP   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr\   s     r%   rM   zLukeIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r$   r0   r   c                 J    | j                  |      }| j                  |      }|S r   )r   r   r]   r0   s     r%   rq   zLukeIntermediate.forward  s&    

=100?r$   r   rx   s   @r%   r   r     s#    9U\\ ell r$   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )
LukeOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )rL   rM   r   r   r   rP   r   rW   rX   rY   rZ   r[   r\   s     r%   rM   zLukeOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r$   r0   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r%   rq   zLukeOutput.forward  r   r$   r   rx   s   @r%   r   r     r   r$   r   c                   2     e Zd Z fdZ	 	 	 ddZd Z xZS )	LukeLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y Nr   )
rL   rM   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   r\   s     r%   rM   zLukeLayer.__init__  sI    '-'E'E$&v.,V4 (r$   c                 J   |j                  d      }| j                  |||||      }||d   }nt        j                  |d d d      }|dd  }	t	        | j
                  | j                  | j                  |      }
|
d d d |d d f   }|d }n|
d d |d d d f   }||f|	z   }	|	S )Nr   )r   r   r   r   )rh   r  r    r   r   feed_forward_chunkr  r  )r]   r   r   r   r   r   r   self_attention_outputsconcat_attention_outputr   layer_outputword_layer_outputentity_layer_outputs                r%   rq   zLukeLayer.forward  s     '++A.	!% / "0 "
  '&<Q&?#&+ii0Fr0JPQ&R#(,0##T%A%A4CSCSUl
 )JYJ)9:'"&".q)*a/?"@$&9:WDr$   c                 L    | j                  |      }| j                  ||      }|S r   )r  r   )r]   r   intermediate_outputr  s       r%   r
  zLukeLayer.feed_forward_chunk  s,    "//0@A{{#68HIr$   r   )r   r   r   rM   rq   r
  rw   rx   s   @r%   r  r    s    ) #Jr$   r  c                   0     e Zd Z fdZ	 	 	 	 	 ddZ xZS )LukeEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
rL   rM   r^   r   
ModuleListrangenum_hidden_layersr  layergradient_checkpointing)r]   r^   _r_   s      r%   rM   zLukeEncoder.__init__  sN    ]]uVE]E]?^#_!If$5#_`
&+# $`s   A#c           	         |rdnd }|rdnd }	|rdnd }
t        | j                        D ]y  \  }}|r||fz   }|	|fz   }	|||   nd }| j                  r-| j                  r!| j	                  |j
                  |||||      }n ||||||      }|d   }||d   }|sq|
|d   fz   }
{ |r||fz   }|	|fz   }	|st        d |||
||	fD              S t        |||
||	      S )Nr#   r   r   r   c              3   $   K   | ]  }|| 
 y wr   r#   .0vs     r%   	<genexpr>z&LukeEncoder.forward.<locals>.<genexpr>(        
 = 
   )last_hidden_stater0   r1   r   r   )	enumerater  r  training_gradient_checkpointing_func__call__tupler(   )r]   r   r   r   r   r   output_hidden_statesreturn_dictall_word_hidden_statesall_entity_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputss                  r%   rq   zLukeEncoder.forward  si    (<)=24 $5b4(4 	POA|#)?CUBW)W&+CG[F]+](.7.CilO**t}} $ A A ))&("#%! !-&("#%! "/q!1#/'4Q'7$ &9]1=M<O&O#=	P@  %;?Q>S%S"'?CWBY'Y$ 
 '*'(,
 
 
 #00*%9!9
 	
r$   )NNFFTr   r   r   rM   rq   rw   rx   s   @r%   r  r    s    , "D
r$   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
LukePoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )rL   rM   r   r   rP   r   Tanh
activationr\   s     r%   rM   zLukePooler.__init__>  s9    YYv1163E3EF
'')r$   r0   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r7  )r]   r0   first_token_tensorpooled_outputs       r%   rq   zLukePooler.forwardC  s6     +1a40

#566r$   r   rx   s   @r%   r4  r4  =  s#    $
U\\ ell r$   r4  c                   $     e Zd Z fdZd Z xZS )EntityPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  |j
                  |j                        | _        y r   )rL   rM   r   r   rP   r   r   r   r   r   r   transform_act_fnrW   rX   r\   s     r%   rM   z&EntityPredictionHeadTransform.__init__M  s{    YYv1163I3IJ
f''-$*6+<+<$=D!$*$5$5D!f&<&<&BWBWXr$   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r>  rW   r   s     r%   rq   z%EntityPredictionHeadTransform.forwardV  s4    

=1--m<}5r$   r2  rx   s   @r%   r<  r<  L  s    Yr$   r<  c                   $     e Zd Z fdZd Z xZS )EntityPredictionHeadc                     t         |           || _        t        |      | _        t        j                  |j                  |j                  d      | _	        t        j                  t        j                  |j                              | _        y )NFr|   )rL   rM   r^   r<  	transformr   r   r   r~   decoder	Parameterr    ri   r}   r\   s     r%   rM   zEntityPredictionHead.__init__^  sa    6v>yy!7!79Q9QX]^LLV-E-E!FG	r$   c                 d    | j                  |      }| j                  |      | j                  z   }|S r   )rC  rD  r}   r   s     r%   rq   zEntityPredictionHead.forwarde  s-    }5]3dii?r$   r2  rx   s   @r%   rA  rA  ]  s    Hr$   rA  c                   @    e Zd ZeZdZdZddgZdej                  fdZ
y)LukePreTrainedModellukeTr   rz   modulec                 j   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        r|j                  dk(  r%|j                  j                  j                          n;|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yy)zInitialize the weightsg        )meanstdNr         ?)r   r   r   weightdatanormal_r^   initializer_ranger}   zero_rN   embedding_dimrI   rW   fill_)r]   rJ  s     r%   _init_weightsz!LukePreTrainedModel._init_weightss  s(   fbii(MM&&CT[[5R5R&S{{&  &&( '-##q(""((*""**9V9V*W!!-""6#5#56<<> .-KK""$MM$$S) .r$   N)r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modulesr   ModulerV  r#   r$   r%   rH  rH  l  s/    L&*#(*@A*BII *r$   rH  zt
    The bare LUKE model transformer outputting raw hidden-states for both word tokens and entities without any
    )custom_introc                        e Zd Zddedef fdZd Zd Zd Zd Z	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 dd	eej                     d
eej                     deej                     deej                     deej                     deej                     deej                     deej                     deej                     deej                     dee   dee   dee   deeef   fd       Zdej                  deej                     fdZ xZS )	LukeModelr^   add_pooling_layerc                     t         |   |       || _        t        |      | _        t        |      | _        t        |      | _        |rt        |      nd| _
        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)rL   rM   r^   rF   rp   rz   r   r  encoderr4  pooler	post_init)r]   r^   r_  r_   s      r%   rM   zLukeModel.__init__  sZ    
 	 (0!5f!="6*,=j(4 	r$   c                 .    | j                   j                  S r   rp   rR   r]   s    r%   get_input_embeddingszLukeModel.get_input_embeddings  s    ...r$   c                 &    || j                   _        y r   re  r]   r   s     r%   set_input_embeddingszLukeModel.set_input_embeddings  s    */'r$   c                 .    | j                   j                   S r   r   rf  s    r%   get_entity_embeddingszLukeModel.get_entity_embeddings  s    %%777r$   c                 &    || j                   _         y r   rl  ri  s     r%   set_entity_embeddingszLukeModel.set_entity_embeddings  s    380r$   c                     t        d      r   r   )r]   heads_to_prunes     r%   _prune_headszLukeModel._prune_heads  r   r$   rl   r   rm   rk   r   entity_attention_maskentity_token_type_idsentity_position_idsr   rn   r   r)  r*  r   c           	         ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||
t	        d      |#| j                  ||       |j                         }n!|
|
j                         dd }nt	        d      |\  }}||j                  n|
j                  }|t        j                  ||f|      }|&t        j                  |t        j                  |      }|V|j                  d      }|t        j                  ||f|      }|(t        j                  ||ft        j                  |      }| j                  |	| j                   j                        }	| j                  ||||
      }| j                  ||      }|d}n| j!                  |||      }| j#                  ||||	|||	      }|d
   }| j$                  | j%                  |      nd}|s
||f|dd z   S t'        |||j(                  |j*                  |j,                  |j.                        S )uz  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeModel

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-base")
        >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
        # Compute the contextualized entity representation corresponding to the entity mention "Beyoncé"

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"

        >>> encoding = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
        >>> outputs = model(**encoding)
        >>> word_last_hidden_state = outputs.last_hidden_state
        >>> entity_last_hidden_state = outputs.entity_last_hidden_state
        # Input Wikipedia entities to obtain enriched contextualized representations of word tokens

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entities = [
        ...     "Beyoncé",
        ...     "Los Angeles",
        ... ]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
        >>> entity_spans = [
        ...     (0, 7),
        ...     (17, 28),
        ... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"

        >>> encoding = tokenizer(
        ...     text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt"
        ... )
        >>> outputs = model(**encoding)
        >>> word_last_hidden_state = outputs.last_hidden_state
        >>> entity_last_hidden_state = outputs.entity_last_hidden_state
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timera   z5You have to specify either input_ids or inputs_embeds)rd   rb   r   )rl   rk   rm   rn   )r   r   r   r)  r*  r   )r#  pooler_outputr0   r1   r   r   )r^   r   r)  use_return_dictr   %warn_if_padding_and_no_attention_maskrh   rd   r    onesri   rj   get_head_maskr  rp   get_extended_attention_maskr   ra  rb  r   r0   r1   r   r   )r]   rl   r   rm   rk   r   rs  rt  ru  r   rn   r   r)  r*  ro   
batch_size
seq_lengthrd   entity_seq_lengthword_embedding_outputextended_attention_maskentity_embedding_outputencoder_outputssequence_outputr:  s                            r%   rq   zLukeModel.forward  s   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZZ(@PN!"[[EJJvVN! * 2$,(-

J@Q3R[a(b%$,(-ZAR4S[`[e[ent(u% &&y$++2O2OP	 !%%)'	 !0 !
 #'"B"B>Sh"i &*#&*&<&<ZI\^s&t# ,,!#2/!5# ' 
 *!, 9=8OO4UY#]3oab6III--')77&11%4%M%M!0!E!E
 	
r$   word_attention_maskc                    |}|t        j                  ||gd      }|j                         dk(  r|dddddddf   }n:|j                         dk(  r|ddddddf   }nt        d|j                   d      |j                  | j                        }d	|z
  t        j                  | j                        j                  z  }|S )
ac  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            word_attention_mask (`torch.LongTensor`):
                Attention mask for word tokens with ones indicating tokens to attend to, zeros for tokens to ignore.
            entity_attention_mask (`torch.LongTensor`, *optional*):
                Attention mask for entity tokens with ones indicating tokens to attend to, zeros for tokens to ignore.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        Nra   r   r   r   z&Wrong shape for attention_mask (shape ))rc   rN  )	r    r   r   r   shaperf   rc   finfor   )r]   r  rs  r   r  s        r%   r|  z%LukeModel.get_extended_attention_maskG  s     - ,"YY8M'NTVWN1$&4Qa]&C#!Q&&4QdA5E&F#EnFZFZE[[\]^^"9"<"<4::"<"N#&)@#@EKKPTPZPZD[D_D_"_&&r$   )T)NNNNNNNNNNNNN)r   r   r   r   boolrM   rg  rj  rm  ro  rr  r   r   r    r   r!   r   r   r   rq   r|  rw   rx   s   @r%   r^  r^    s   z d "/089Z  156:593715=A<@:>1559,0/3&*Y
E,,-Y
 !!2!23Y
 !!1!12	Y

 u//0Y
 U--.Y
  ((9(9:Y
  ((8(89Y
 &e&6&67Y
 E--.Y
   1 12Y
 $D>Y
 'tnY
 d^Y
 
u44	5Y
 Y
v'#(#3#3'LTUZUeUeLf'r$   r^  c                     | j                  |      j                         }t        j                  |d      j	                  |      |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r   )ner   r    cumsumr   rj   )rl   rI   maskincremental_indicess       r%   re   re   f  sP     <<$((*D <<!4<<TBdJ##%33r$   c                   .     e Zd ZdZ fdZd Zd Z xZS )
LukeLMHeadz*Roberta Head for masked language modeling.c                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _
        t        j                  t        j                  |j                              | _        | j                  | j                  _        y r   )rL   rM   r   r   rP   r   rW   rX   
layer_normrO   rD  rE  r    ri   r}   r\   s     r%   rM   zLukeLMHead.__init__z  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	 IIr$   c                     | j                  |      }t        |      }| j                  |      }| j                  |      }|S r   )r   r   r  rD  )r]   featureskwargsr   s       r%   rq   zLukeLMHead.forward  s;    JJx GOOA LLOr$   c                     | j                   j                  j                  j                  dk(  r| j                  | j                   _        y | j                   j                  | _        y )Nmeta)rD  r}   rd   typerf  s    r%   _tie_weightszLukeLMHead._tie_weights  sC     <<##((F2 $		DLL))DIr$   )r   r   r   r   rM   rq   r  rw   rx   s   @r%   r  r  w  s    4&*r$   r  z
    The LUKE model with a language modeling head and entity prediction head on top for masked language modeling and
    masked entity prediction.
    c            $           e Zd Zg dZ fdZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     d	e	e
j                     d
e	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e   de	e   de	e   deeef   f d       Z xZS )LukeForMaskedLM)zlm_head.decoder.weightzlm_head.decoder.biasz!entity_predictions.decoder.weightc                     t         |   |       t        |      | _        t	        |      | _        t        |      | _        t        j                         | _
        | j                          y r   )rL   rM   r^  rI  r  lm_headrA  entity_predictionsr   r	   loss_fnrc  r\   s     r%   rM   zLukeForMaskedLM.__init__  sQ     f%	!&)"6v">**, 	r$   c                     t         |           | j                  | j                  j                  | j
                  j                  j                         y r   )rL   tie_weights_tie_or_clone_weightsr  rD  rI  r   )r]   r_   s    r%   r  zLukeForMaskedLM.tie_weights  s:    ""4#:#:#B#BDIID_D_DqDqrr$   c                 .    | j                   j                  S r   r  rD  rf  s    r%   get_output_embeddingsz%LukeForMaskedLM.get_output_embeddings  s    ||###r$   c                 &    || j                   _        y r   r  )r]   new_embeddingss     r%   set_output_embeddingsz%LukeForMaskedLM.set_output_embeddings  s    -r$   rl   r   rm   rk   r   rs  rt  ru  labelsentity_labelsr   rn   r   r)  r*  r   c                 J   ||n| j                   j                  }| j                  ||||||||||||d      }d}d}| j                  |j                        }|	d|	j                  |j                        }	| j                  |j                  d| j                   j                        |	j                  d            }||}d}d}|j                  l| j                  |j                        }|
O| j                  |j                  d| j                   j                        |
j                  d            }||}n||z   }|s8t        d ||||||j                  |j                  |j                   fD              S t#        ||||||j                  |j                  |j                         S )aC  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        entity_labels (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        NTrl   r   rm   rk   r   rs  rt  ru  r   rn   r   r)  r*  ra   c              3   $   K   | ]  }|| 
 y wr   r#   r  s     r%   r   z*LukeForMaskedLM.forward.<locals>.<genexpr>  s       = r"  )r+   r,   r-   r.   r/   r0   r   r1   )r^   rx  rI  r  r#  rf   rd   r  r   rO   r   r  r~   r(  r0   r   r1   r*   )r]   rl   r   rm   rk   r   rs  rt  ru  r  r  r   rn   r   r)  r*  r   r+   r,   r.   r-   r/   s                         r%   rq   zLukeForMaskedLM.forward  s   b &1%<k$++B]B]))))%!"7"7 3'/!5  
  g778YYv}}-F||FKKDKK4J4J$KV[[Y[_]H|++7 33G4T4TUM(<<(:(:2t{{?\?\(]_l_q_qrt_uv<#D(?D  !))00&&	   "'!//!(!=!=))	
 		
r$   NNNNNNNNNNNNNNN)r   r   r   _tied_weights_keysrM   r  r  r  r   r   r    r   r!   r  r   r   r*   rq   rw   rx   s   @r%   r  r    s    qs$.  156:593715<@<@:>-1481559,0/3&*!q
E,,-q
 !!2!23q
 !!1!12	q

 u//0q
 U--.q
  ((8(89q
  ((8(89q
 &e&6&67q
 ))*q
   0 01q
 E--.q
   1 12q
 $D>q
 'tnq
  d^!q
" 
u((	)#q
 q
r$   r  z
    The LUKE model with a classification head on top (a linear layer on top of the hidden state of the first entity
    token) for entity classification tasks, such as Open Entity.
    c            "           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     deej                     dee	   dee	   dee	   de
eef   fd       Z xZS )LukeForEntityClassificationc                 ,   t         |   |       t        |      | _        |j                  | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   rL   rM   r^  rI  
num_labelsr   rY   rZ   r[   r   rP   
classifierrc  r\   s     r%   rM   z$LukeForEntityClassification.__init__2  si     f%	 ++zz&"<"<=))F$6$68I8IJ 	r$   rl   r   rm   rk   r   rs  rt  ru  r   rn   r  r   r)  r*  r   c                    ||n| j                   j                  }| j                  |||||||||	|
||d      }|j                  dddddf   }| j	                  |      }| j                  |      }d}||j                  |j                        }|j                  dk(  r!t        j                  j                  ||      }nMt        j                  j                  |j                  d      |j                  d      j                  |            }|s5t        d |||j                   |j"                  |j$                  fD              S t'        |||j                   |j"                  |j$                        S )	u
  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
            Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss is
            used for the single-label classification. In this case, labels should contain the indices that should be in
            `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary cross entropy
            loss is used for the multi-label classification. In this case, labels should only contain `[0, 1]`, where 0
            and 1 indicate false and true, respectively.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeForEntityClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")
        >>> model = LukeForEntityClassification.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"
        >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: person
        ```NTr  r   r   ra   c              3   $   K   | ]  }|| 
 y wr   r#   r  s     r%   r   z6LukeForEntityClassification.forward.<locals>.<genexpr>        = r"  r+   r.   r0   r   r1   )r^   rx  rI  r   r[   r  rf   rd   ndimr   r   cross_entropy binary_cross_entropy_with_logitsr   r   r(  r0   r   r1   r3   r]   rl   r   rm   rk   r   rs  rt  ru  r   rn   r  r   r)  r*  r   feature_vectorr.   r+   s                      r%   rq   z#LukeForEntityClassification.forward>  sm   | &1%<k$++B]B]))))%!"7"7 3'/!5  
  !99!Q'Bn50 YYv}}-F{{a}}2266B}}EEfkkRToW]WbWbceWfWnWnouWvw (=(=w?[?[]d]o]op   *!//!(!=!=))
 	
r$   NNNNNNNNNNNNNN)r   r   r   rM   r   r   r    r   r!   r  r   r   r3   rq   rw   rx   s   @r%   r  r  +  s}   
  156:593715=A<@:>1559.2,0/3&*k
E,,-k
 !!2!23k
 !!1!12	k

 u//0k
 U--.k
  ((9(9:k
  ((8(89k
 &e&6&67k
 E--.k
   1 12k
 **+k
 $D>k
 'tnk
 d^k
  
u00	1!k
 k
r$   r  z
    The LUKE model with a classification head on top (a linear layer on top of the hidden states of the two entity
    tokens) for entity pair classification tasks, such as TACRED.
    c            "           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     deej                     dee	   dee	   dee	   de
eef   fd       Z xZS )LukeForEntityPairClassificationc                 4   t         |   |       t        |      | _        |j                  | _        t        j                  |j                        | _        t        j                  |j                  dz  |j                  d      | _        | j                          y )Nr   Fr  r\   s     r%   rM   z(LukeForEntityPairClassification.__init__  sp     f%	 ++zz&"<"<=))F$6$6$:F<M<MuU 	r$   rl   r   rm   rk   r   rs  rt  ru  r   rn   r  r   r)  r*  r   c                 :   ||n| j                   j                  }| j                  |||||||||	|
||d      }t        j                  |j
                  dddddf   |j
                  dddddf   gd      }| j                  |      }| j                  |      }d}||j                  |j                        }|j                  dk(  r!t        j                  j                  ||      }nMt        j                  j                  |j                  d      |j                  d      j!                  |            }|s5t#        d |||j$                  |j&                  |j(                  fD              S t+        |||j$                  |j&                  |j(                  	      S )
u  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
            Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss is
            used for the single-label classification. In this case, labels should contain the indices that should be in
            `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary cross entropy
            loss is used for the multi-label classification. In this case, labels should only contain `[0, 1]`, where 0
            and 1 indicate false and true, respectively.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeForEntityPairClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
        >>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entity_spans = [
        ...     (0, 7),
        ...     (17, 28),
        ... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
        >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: per:cities_of_residence
        ```NTr  r   r   r   ra   c              3   $   K   | ]  }|| 
 y wr   r#   r  s     r%   r   z:LukeForEntityPairClassification.forward.<locals>.<genexpr>%  r  r"  r  )r^   rx  rI  r    r   r   r[   r  rf   rd   r  r   r   r  r  r   r   r(  r0   r   r1   r7   r  s                      r%   rq   z'LukeForEntityPairClassification.forward  s   B &1%<k$++B]B]))))%!"7"7 3'/!5  
  --aAg68X8XYZ\]_`Y`8abhi
 n50 YYv}}-F{{a}}2266B}}EEfkkRToW]WbWbceWfWnWnouWvw (=(=w?[?[]d]o]op   .!//!(!=!=))
 	
r$   r  )r   r   r   rM   r   r   r    r   r!   r  r   r   r7   rq   rw   rx   s   @r%   r  r    s}   
  156:593715=A<@:>1559-1,0/3&*p
E,,-p
 !!2!23p
 !!1!12	p

 u//0p
 U--.p
  ((9(9:p
  ((8(89p
 &e&6&67p
 E--.p
   1 12p
 ))*p
 $D>p
 'tnp
 d^p
  
u44	5!p
 p
r$   r  z
    The LUKE model with a span classification head on top (a linear layer on top of the hidden states output) for tasks
    such as named entity recognition.
    c            &           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     deej                     deej                     deej                     dee	   dee	   dee	   de
eef   f"d       Z xZS )LukeForEntitySpanClassificationc                 2   t         |   |       t        |      | _        |j                  | _        t        j                  |j                        | _        t        j                  |j                  dz  |j                        | _        | j                          y )Nr   r  r\   s     r%   rM   z(LukeForEntitySpanClassification.__init__;  sn     f%	 ++zz&"<"<=))F$6$6$:F<M<MN 	r$   rl   r   rm   rk   r   rs  rt  ru  entity_start_positionsentity_end_positionsr   rn   r  r   r)  r*  r   c                    ||n| j                   j                  }| j                  ||||||||||||d      }|j                  j	                  d      }|	j                  d      j                  dd|      }	|	j                  |j                  j                  k7  r%|	j                  |j                  j                        }	t        j                  |j                  d|	      }|
j                  d      j                  dd|      }
|
j                  |j                  j                  k7  r%|
j                  |j                  j                        }
t        j                  |j                  d|
      }t        j                  |||j                  gd      }| j                  |      }| j                  |      }d}||j                  |j                        }|j                  dk(  rJt         j"                  j%                  |j'                  d| j(                        |j'                  d            }nMt         j"                  j+                  |j'                  d      |j'                  d      j-                  |            }|s5t/        d |||j0                  |j2                  |j4                  fD              S t7        |||j0                  |j2                  |j4                  	      S )
u  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        entity_start_positions (`torch.LongTensor`):
            The start positions of entities in the word token sequence.
        entity_end_positions (`torch.LongTensor`):
            The end positions of entities in the word token sequence.
        labels (`torch.LongTensor` of shape `(batch_size, entity_length)` or `(batch_size, entity_length, num_labels)`, *optional*):
            Labels for computing the classification loss. If the shape is `(batch_size, entity_length)`, the cross
            entropy loss is used for the single-label classification. In this case, labels should contain the indices
            that should be in `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, entity_length,
            num_labels)`, the binary cross entropy loss is used for the multi-label classification. In this case,
            labels should only contain `[0, 1]`, where 0 and 1 indicate false and true, respectively.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeForEntitySpanClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
        >>> model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

        >>> text = "Beyoncé lives in Los Angeles"
        # List all possible entity spans in the text

        >>> word_start_positions = [0, 8, 14, 17, 21]  # character-based start positions of word tokens
        >>> word_end_positions = [7, 13, 16, 20, 28]  # character-based end positions of word tokens
        >>> entity_spans = []
        >>> for i, start_pos in enumerate(word_start_positions):
        ...     for end_pos in word_end_positions[i:]:
        ...         entity_spans.append((start_pos, end_pos))

        >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> predicted_class_indices = logits.argmax(-1).squeeze().tolist()
        >>> for span, predicted_class_idx in zip(entity_spans, predicted_class_indices):
        ...     if predicted_class_idx != 0:
        ...         print(text[span[0] : span[1]], model.config.id2label[predicted_class_idx])
        Beyoncé PER
        Los Angeles LOC
        ```NTr  ra   r   r   r   c              3   $   K   | ]  }|| 
 y wr   r#   r  s     r%   r   z:LukeForEntitySpanClassification.forward.<locals>.<genexpr>  r  r"  r  )r^   rx  rI  r#  rh   rt   ru   rd   rf   r    gatherr   r   r[   r  r  r   r   r  r   r  r  r   r(  r0   r   r1   r9   )r]   rl   r   rm   rk   r   rs  rt  ru  r  r  r   rn   r  r   r)  r*  r   rP   start_states
end_statesr  r.   r+   s                           r%   rq   z'LukeForEntitySpanClassification.forwardG  s   ^ &1%<k$++B]B]))))%!"7"7 3'/!5  
 //44R8!7!A!A"!E!L!LRQSU`!a!((G,E,E,L,LL%;%>%>w?X?X?_?_%`"||G$=$=rCYZ3==bAHHRQ\]&&'*C*C*J*JJ#7#:#:7;T;T;[;[#\ \\'";";RAUV
L*g>^>^#_efgn50YYv}}-F {{a}}226;;r4??3SU[U`U`acUde}}EEfkkRToW]WbWbceWfWnWnouWvw (=(=w?[?[]d]o]op   .!//!(!=!=))
 	
r$   )NNNNNNNNNNNNNNNN)r   r   r   rM   r   r   r    r   r!   r  r   r   r9   rq   rw   rx   s   @r%   r  r  4  s   
  156:593715<@<@:>=A;?1559-1,0/3&*#H
E,,-H
 !!2!23H
 !!1!12	H

 u//0H
 U--.H
  ((8(89H
  ((8(89H
 &e&6&67H
 !))9)9 :H
 'u'7'78H
 E--.H
   1 12H
 ))*H
 $D>H
  'tn!H
" d^#H
$ 
u44	5%H
 H
r$   r  z
    The LUKE Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c            "           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     deej                     dee	   dee	   dee	   de
eef   fd       Z xZS )LukeForSequenceClassificationc                 \   t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                  n|j                        | _	        t        j                  |j                  |j                        | _        | j                          y r   rL   rM   r  r^  rI  r   rY   classifier_dropoutrZ   r[   r   rP   r  rc  r\   s     r%   rM   z&LukeForSequenceClassification.__init__  s      ++f%	zz)/)B)B)NF%%TZTnTn
 ))F$6$68I8IJ 	r$   rl   r   rm   rk   r   rs  rt  ru  r   rn   r  r   r)  r*  r   c                    ||n| j                   j                  }| j                  |||||||||	|
||d      }|j                  }| j	                  |      }| j                  |      }d}||j                  |j                        }| j                   j                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t!               } ||j#                  d| j                        |j#                  d            }n,| j                   j                  dk(  rt%               } |||      }|s5t'        d	 |||j(                  |j*                  |j,                  fD              S t/        |||j(                  |j*                  |j,                  
      S )a  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   
regressionsingle_label_classificationmulti_label_classificationra   c              3   $   K   | ]  }|| 
 y wr   r#   r  s     r%   r   z8LukeForSequenceClassification.forward.<locals>.<genexpr>B  r  r"  r  )r^   rx  rI  rw  r[   r  rf   rd   problem_typer  rc   r    rj   r   r
   squeezer	   r   r   r(  r0   r   r1   r;   )r]   rl   r   rm   rk   r   rs  rt  ru  r   rn   r  r   r)  r*  r   r:  r.   r+   loss_fcts                       r%   rq   z%LukeForSequenceClassification.forward  s#   V &1%<k$++B]B]))))%!"7"7 3'/!5  
   --]3/YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./ (=(=w?[?[]d]o]op   ,!//!(!=!=))
 	
r$   r  )r   r   r   rM   r   r   r    r   r!   r  r   r   r;   rq   rw   rx   s   @r%   r  r    s}   
  156:593715=A<@:>1559.2,0/3&*g
E,,-g
 !!2!23g
 !!1!12	g

 u//0g
 U--.g
  ((9(9:g
  ((8(89g
 &e&6&67g
 E--.g
   1 12g
 **+g
 $D>g
 'tng
 d^g
  
u22	3!g
 g
r$   r  z
    The LUKE Model with a token classification head on top (a linear layer on top of the hidden-states output). To
    solve Named-Entity Recognition (NER) task using LUKE, `LukeForEntitySpanClassification` is more suitable than this
    class.
    c            "           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     deej                     dee	   dee	   dee	   de
eef   fd       Z xZS )LukeForTokenClassificationc                 `   t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                  n|j                        | _	        t        j                  |j                  |j                        | _        | j                          y NF)r_  r  r\   s     r%   rM   z#LukeForTokenClassification.__init__Y  s      ++f>	zz)/)B)B)NF%%TZTnTn
 ))F$6$68I8IJ 	r$   rl   r   rm   rk   r   rs  rt  ru  r   rn   r  r   r)  r*  r   c                 N   ||n| j                   j                  }| j                  |||||||||	|
||d      }|j                  }| j	                  |      }| j                  |      }d}|W|j                  |j                        }t               } ||j                  d| j                        |j                  d            }|s5t        d |||j                  |j                  |j                  fD              S t        |||j                  |j                  |j                        S )aM  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        NTr  ra   c              3   $   K   | ]  }|| 
 y wr   r#   r  s     r%   r   z5LukeForTokenClassification.forward.<locals>.<genexpr>  r  r"  r  )r^   rx  rI  r#  r[   r  rf   rd   r	   r   r  r(  r0   r   r1   r>   )r]   rl   r   rm   rk   r   rs  rt  ru  r   rn   r  r   r)  r*  r   r  r.   r+   r  s                       r%   rq   z"LukeForTokenClassification.forwardf  s1   V &1%<k$++B]B]))))%!"7"7 3'/!5  
  "33,,71YYv}}-F')HFKKDOO<fkk"oND (=(=w?[?[]d]o]op   )!//!(!=!=))
 	
r$   r  )r   r   r   rM   r   r   r    r   r!   r  r   r   r>   rq   rw   rx   s   @r%   r  r  Q  s}     156:593715=A<@:>1559.2,0/3&*U
E,,-U
 !!2!23U
 !!1!12	U

 u//0U
 U--.U
  ((9(9:U
  ((8(89U
 &e&6&67U
 E--.U
   1 12U
 **+U
 $D>U
 'tnU
 d^U
  
u//	0!U
 U
r$   r  c            $           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     deej                     deej                     dee	   dee	   dee	   de
eef   f d       Z xZS )LukeForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r  )
rL   rM   r  r^  rI  r   r   rP   
qa_outputsrc  r\   s     r%   rM   z!LukeForQuestionAnswering.__init__  sU      ++f>	))F$6$68I8IJ 	r$   rl   r   rm   rk   r   rs  rt  ru  r   rn   start_positionsend_positionsr   r)  r*  r   c                 `   ||n| j                   j                  }| j                  |||||||||	|
||d      }|j                  }| j	                  |      }|j                  dd      \  }}|j                  d      }|j                  d      }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|       |j                  d|       t        |      } |||      } |||      }||z   d	z  }|s6t        d
 ||||j                  |j                  |j                  fD              S t        ||||j                  |j                  |j                        S )a  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        NTr  r   ra   r   r   )ignore_indexr   c              3   $   K   | ]  }|| 
 y wr   r#   r  s     r%   r   z3LukeForQuestionAnswering.forward.<locals>.<genexpr>  s       = r"  )r+   rA   rB   r0   r   r1   )r^   rx  rI  r#  r  splitr  lenrh   clamp_r	   r(  r0   r   r1   r@   )r]   rl   r   rm   rk   r   rs  rt  ru  r   rn   r  r  r   r)  r*  r   r  r.   rA   rB   
total_lossignored_indexr  
start_lossend_losss                             r%   rq   z LukeForQuestionAnswering.forward  s   P &1%<k$++B]B]))))%!"7"7 3'/!5  
  "331#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J   ))00&&   0%!!//!(!=!=))
 	
r$   r  )r   r   r   rM   r   r   r    r   r!   r  r   r   r@   rq   rw   rx   s   @r%   r  r    s   	  156:594815=A<@:>15596:48,0/3&*!f
E,,-f
 !!2!23f
 !!1!12	f

 u001f
 U--.f
  ((9(9:f
  ((8(89f
 &e&6&67f
 E--.f
   1 12f
 "%"2"23f
   0 01f
 $D>f
 'tnf
  d^!f
" 
u66	7#f
 f
r$   r  c            "           e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     deej                     dee	   dee	   dee	   de
eef   fd       Z xZS )LukeForMultipleChoicec                 &   t         |   |       t        |      | _        t	        j
                  |j                  |j                  n|j                        | _        t	        j                  |j                  d      | _        | j                          y r  )rL   rM   r^  rI  r   rY   r  rZ   r[   r   rP   r  rc  r\   s     r%   rM   zLukeForMultipleChoice.__init__8  so     f%	zz)/)B)B)NF%%TZTnTn
 ))F$6$6: 	r$   rl   r   rm   rk   r   rs  rt  ru  r   rn   r  r   r)  r*  r   c                 :   ||n| j                   j                  }||j                  d   n|
j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|
1|
j                  d|
j	                  d      |
j	                  d            nd}
|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  |||||||||	|
||d      }|j                  }| j                  |      }| j                  |      }|j                  d|      }d}|.|j                  |j                        }t               } |||      }|s5t        d |||j                  |j                  |j                  fD              S t!        |||j                  |j                  |j                        S )	a^  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   ra   r   Tr  c              3   $   K   | ]  }|| 
 y wr   r#   r  s     r%   r   z0LukeForMultipleChoice.forward.<locals>.<genexpr>  r!  r"  r  )r^   rx  r  r   rh   rI  rw  r[   r  rf   rd   r	   r(  r0   r   r1   rD   )r]   rl   r   rm   rk   r   rs  rt  ru  r   rn   r  r   r)  r*  num_choicesr   r:  r.   reshaped_logitsr+   r  s                         r%   rq   zLukeForMultipleChoice.forwardD  s   F &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 BLAWZ__R)<=]a
 %0 "&&r+@+E+Eb+IJ 	 %0 "&&r+@+E+Eb+IJ 	 #.  $$R)<)A)A")EGZG_G_`bGcd 	 ))))%!"7"7 3'/!5  
   --]3/ ++b+6YY556F')HOV4D 
 #))00&&
 
 
 -"!//!(!=!=))
 	
r$   r  )r   r   r   rM   r   r   r    r   r!   r  r   r   rD   rq   rw   rx   s   @r%   r  r  6  s}   
  156:593715=A<@:>1559.2,0/3&*P
E,,-P
 !!2!23P
 !!1!12	P

 u//0P
 U--.P
  ((9(9:P
  ((8(89P
 &e&6&67P
 E--.P
   1 12P
 **+P
 $D>P
 'tnP
 d^P
  
u33	4!P
 P
r$   r  )
r  r  r  r  r  r  r  r  r^  rH  )Fr   r   dataclassesr   typingr   r   r   r    torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   r   modeling_outputsr   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   configuration_luker   
get_loggerr   loggerr   r(   r*   r3   r7   r9   r;   r>   r@   rD   r[  rF   rz   r   r   r   r   r   r  r  r4  r<  rA  rH  r^  re   r  r  r  r  r  r  r  r  r  __all__r#   r$   r%   <module>r     s     ! ) )    A A ' K - 6 9 9 * 
		H	% I%? I I< I/ I I< '? '? '?T ? ? ?< ?[ ? ?< ?[ ? ?< ?; ? ?B ? ? ?B !?{ !? !?H  ?K  ?  ?FF=RYY F=R(299 (Vn		 ndRYY ,BII ,`ryy   1		 1hK
")) K
^ BII "299  */ * *0 
Y'# Y'
Y'x4"* *> L
) L
L
^ y
"5 y
y
x ~
&9 ~
~
B V
&9 V
V
r u
$7 u
u
p d
!4 d
d
N s
2 s
 s
l ^
/ ^
 ^
Br$   