
    Uh                     :   d Z ddlmZmZmZmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+  e(       rddl,m-Z- ddl.m/Z/  e       r	  e)j`                  e1      Z2 G d de	jf                        Z4	 d3de	jj                  dejl                  dejl                  dejl                  deejl                     de7de7fdZ8 G d de	jj                        Z9 G d  d!e	jj                        Z:e& G d" d#e!             Z; G d$ d%e;      Z<e& G d& d'e;             Z= G d( d)ee%      Z> G d* d+e;e      Z? e&d,-       G d. d/e;             Z@e& G d0 d1e;             ZAg d2ZBy)4zPyTorch OPT model.    )CallableListOptionalTupleUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargsis_flash_attn_available)BaseModelOutputWithPastCausalLMOutputWithPastQuestionAnsweringModelOutput SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
LossKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )	OPTConfig)	BlockMask)make_flex_block_causal_maskc                   x     e Zd ZdZdedef fdZ	 	 d	dej                  dedeej                     f fdZ	 xZ
S )
OPTLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 N    d| _         t        | 	  || j                   z   |       y N   )offsetsuper__init__)selfr&   r'   	__class__s      v/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/opt/modeling_opt.pyr-   z&OPTLearnedPositionalEmbedding.__init__;   s$     $++5}E    attention_maskpast_key_values_lengthposition_idsc                     |8t        j                  |d      }||z  dz
  j                         }|dd|df   }t        |   || j
                  z         S )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr    dim)torchcumsumlongr,   forwardr+   )r.   r2   r3   r4   r/   s       r0   r;   z%OPTLearnedPositionalEmbedding.forwardA   s^      <<A>L(>9A=CCEL'+A+B(BCLw|dkk9::r1   r   N)__name__
__module____qualname____doc__intr-   r8   
LongTensorr   r;   __classcell__r/   s   @r0   r%   r%   6   s]    Fs F3 F '(37	;((; !$; u//0	; ;r1   r%   modulequerykeyvaluer2   scalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )N)r7   dtypeptrainingr    r*   )r8   matmul	transposer   
functionalsoftmaxfloat32torN   rJ   rQ   
contiguous)
rE   rF   rG   rH   r2   rI   rJ   kwargsattn_weightsattn_outputs
             r0   eager_attention_forwardr\   S   s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r1   c                   (    e Zd ZdZ	 ddedee   f fdZ	 	 	 	 	 ddej                  dee
ej                        deej                     deej                     d	ed
eej                     de
ej                  eej                     ee   f   fdZ xZS )OPTAttentionz=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _	        || _
        |-t        j                  d| j                  j                   d       | j                  | j                  z  | _        d| _        | j                  | j                  z  | j                  k7  r&t#        d| j                   d| j                   d      | j                  dz  | _        t'        j(                  | j                  | j                  | j                        | _        t'        j(                  | j                  | j                  | j                        | _        t'        j(                  | j                  | j                  | j                        | _        t'        j(                  | j                  | j                  | j                        | _        y )	NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩bias)r,   r-   r_   hidden_size	embed_dimnum_attention_heads	num_headsattention_dropoutrJ   enable_biasr`   loggerwarning_oncer/   r=   head_dim	is_causal
ValueErrorrI   r   Lineark_projv_projq_projout_proj)r.   r_   r`   rY   r/   s       r0   r-   zOPTAttention.__init__m   s    	++33//!--" !8!8 9 :, , $..8MMDNN*t~~=MdnnM]$T^^$4B8  }}d*iiTEUEUViiTEUEUViiTEUEUV		$..$..tGWGWXr1   hidden_statespast_key_valuer2   layer_head_maskoutput_attentionscache_positionreturnc                    |j                         \  }}	}
| j                  |      | j                  z  }|j                  |d| j                  | j
                        j                  dd      }| j                  |      }| j                  |      }|j                  |d| j                  | j
                        j                  dd      }|j                  |d| j                  | j
                        j                  dd      }|#|j                  ||| j                  d|i      \  }}t        }| j                  j                  dk7  rN| j                  j                  dk(  r|rt        j                  d       nt         | j                  j                     } || ||||f| j"                  sd	n| j$                  d
d|\  }}|j'                  ||	d      j)                         }| j+                  |      }|sd}|||fS )z#Input shape: Batch x Time x ChannelrL   r    r*   Nrx   eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.              ?)rJ   rI   )sizerr   rI   viewrg   rl   rS   rp   rq   updater`   r\   r_   _attn_implementationrj   rk   r   rQ   rJ   reshaperX   rs   )r.   rt   ru   r2   rv   rw   rx   rY   bsztgt_len_query_states
key_statesvalue_statesattention_interfacer[   rZ   s                    r0   r;   zOPTAttention.forward   s    (,,.Wa {{=1DLL@#((b$..$--PZZ[\^_`[[/
{{=1__S"dnndmmLVVWXZ[\
#((b$..$--PZZ[\^_`%'5'<'<L$..;K^:\($J )@;;++w6{{//69>O##L
 '>dkk>^>^&_#$7	%
  $}}C$,,	%
 	%
!\ "))#w;FFHmmK0 LL.88r1   N)NNNFN)r=   r>   r?   r@   r!   r   rA   r-   r8   Tensorr   boolr   r;   rC   rD   s   @r0   r^   r^   j   s    G
 $(!Y!Y C=!YL 9=1526"'15<9||<9 !u||!45<9 !.	<9
 "%,,/<9  <9 !.<9 
u||Xell3Xe_D	E<9r1   r^   c                   t    e Zd Zddedee   f fdZ	 	 	 	 	 	 	 ddej                  deej                     deej                     dee	ej                        dee
   d	ee
   d
eej                     deej                     dee   de	ej                  ee	ej                  ej                  f      f   fdZ xZS )OPTDecoderLayerr_   r`   c                    t         |           |j                  | _        t	        ||      | _        |j                  | _        |j                  | _        t        |j                     | _
        t        j                  | j                  |j                        | _        t        j                  | j                  |j                   |j"                        | _        t        j                  |j                   | j                  |j"                        | _        t        j                  | j                  |j                        | _        y )N)r_   r`   elementwise_affinerb   )r,   r-   rd   re   r^   	self_attndo_layer_norm_beforerJ   r   activation_functionactivation_fnr   	LayerNormlayer_norm_elementwise_affineself_attn_layer_normro   ffn_dimri   fc1fc2final_layer_norm)r.   r_   r`   r/   s      r0   r-   zOPTDecoderLayer.__init__   s    ++%VyI$*$?$?!~~#F$>$>?$&LLNNv/S/S%
! 99T^^V^^&BTBTU99V^^T^^&BTBTU "T^^PVPtPt ur1   rt   r2   rv   ru   rw   	use_cacher4   rx   rY   ry   c	                 "   |}
| j                   r| j                  |      } | j                  d|||||||d|	\  }}}t        j                  j                  || j
                  | j                        }|
|z   }| j                   s| j                  |      }|j                  }|j                  d|j                  d            }|}
| j                   r| j                  |      }| j                  |      }| j                  |      }| j                  |      }t        j                  j                  || j
                  | j                        }|
|z   j                  |      }| j                   s| j                  |      }|f}|r||fz  }|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence..
        )rt   ru   r4   r2   rv   rw   rx   rO   rL    )r   r   r   r   rT   rJ   rQ   shaper   r   r   r   r   r   r   )r.   rt   r2   rv   ru   rw   r   r4   rx   rY   residualself_attn_weightspresent_key_valuehidden_states_shapeoutputss                  r0   r;   zOPTDecoderLayer.forward   s   < ! $$ 55mDM ?Mdnn 	?
')%)+/)	?
 	?
;(*; --mt||VZVcVc-d =0 (( 55mDM ,11%--b-2D2DR2HI  $$ 11-@M/**=9/--mt||VZVcVc-d!M1778KL (( 11-@M ")++G)++Gr1   r   )NNNFFNN)r=   r>   r?   r!   r   rA   r-   r8   r   r   r   rB   r   r   FloatTensorr;   rC   rD   s   @r0   r   r      s   vy vXc] v( 26268<,1$)3715S||S !.S "%,,/	S
 !u||!45S $D>S D>S u//0S !.S -.S 
u  (51B1BEDUDU1U+V"WW	XSr1   r   c                   @    e Zd ZeZdZdZdgZdZdZ	dZ
dZdZdZdZd Zy)OPTPreTrainedModelmodelTr   c                    | j                   j                  }t        |t        j                        rY|j
                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j
                  j                  j                  d|       |j                  2|j
                  j                  |j                     j                          y y t        |t        j                        rJ|j
                  j                  j                  d       |j                  j                  j                          y y )Nr}   )meanstdr~   )r_   init_std
isinstancer   ro   weightdatanormal_rc   zero_	Embeddingpadding_idxr   fill_)r.   rE   r   s      r0   _init_weightsz OPTPreTrainedModel._init_weightsE  s    kk""fbii(MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> .-MM$$S)KK""$ .r1   N)r=   r>   r?   r!   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_quantized_cache_supports_static_cacher   r   r1   r0   r   r   7  sJ    L&*#*+"&!N  $!%r1   r   c                   B    e Zd ZdZdef fdZd Zd Z	 ddee	j                  df   de	j                  d	e	j                  d
edef
dZede	j                  dedede	j                   d	e	j                  defd       Ze	 	 	 	 	 	 	 	 	 	 	 ddee	j(                     dee	j                     dee	j                     d
eee	j,                        dee	j,                     dee   dee   dee   dee   dee	j(                     d	ee	j                     dee   deeef   fd       Z xZS )
OPTDecoderz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]

    Args:
        config: OPTConfig
    r_   c           	      8   t         |   |       |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        j                  |j                  |j                  | j
                        | _        t        |j                  |j                        | _        |j                  |j                  k7  r2t        j                   |j                  |j                  d      | _        nd | _        |j                  |j                  k7  r2t        j                   |j                  |j                  d      | _        nd | _        |j&                  r=|j(                  s1t        j*                  |j                  |j,                        | _        nd | _        t        j0                  t3        |j4                        D cg c]  }t7        ||       c}      | _        d| _        | j=                          y c c}w )NFrb   r   )r`   )r,   r-   rJ   	layerdroppad_token_idr   max_position_embeddingsmax_target_positions
vocab_sizer   r   word_embed_proj_dimembed_tokensr%   rd   embed_positionsro   project_out
project_inr   _remove_final_layer_normr   r   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing	post_init)r.   r_   ir/   s      r0   r-   zOPTDecoder.__init__\  s    ~~))!..$*$B$B! ++LL):):F<V<VX\XhXhi<V=[=[]c]o]op%%););;!yy););V=W=W^cdD#D%%););; ii(B(BFDVDV]bcDO"DO
 &&v/N/N$&LL""v7[7[%D! %)D!mmSXY_YqYqSr$sa_Vq%I$st&+#	 %ts   Hc                     | j                   S r   r   r.   s    r0   get_input_embeddingszOPTDecoder.get_input_embeddings  s       r1   c                     || _         y r   r   r.   rH   s     r0   set_input_embeddingszOPTDecoder.set_input_embeddings  s
    !r1   r2   r"   input_tensorrx   past_key_valuesrw   c           	         | j                   j                  dk(  r||dk(  j                         r|S y | j                   j                  dk(  r't        |t        j
                        rt        |      }|S ||j                         nd}||j                  nd}| j                   j                  dk(  r(|s&|s$t        j                  |||| j                        ry |j                  }|j                  d   }	|r|j                         }
n1t        |t        j
                        r|j                  d	   n||	z   dz   }
| j                  ||	|
|||j                  d   
      }| j                   j                  dk(  rQ|O|j                   j"                  dv r7|s5t	        j$                  |      j&                  }t        j(                  ||      }|S )Nflash_attention_2r}   flex_attentionr   Fr|   )inputs_embedsr3   is_trainingr    rL   )sequence_lengthtarget_lengthrN   rx   
batch_size)cudaxpunpu)r_   r   anyr   r8   r   r#   get_seq_lengthis_compileabler   _ignore_causal_mask_sdparQ   rN   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positiondevicetypefinfomin_unmask_unattended)r.   r2   r   rx   r   rw   past_seen_tokensusing_compilable_cacherN   r   r   causal_mask	min_dtypes                r0   _update_causal_maskzOPTDecoder._update_causal_mask  s    ;;++/BB)~/D.I.I.K%%;;++/??.%,,7!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr1   r   r   rN   r   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuerN   r   r    )diagonalr   rL   r   )r7   r8   r   r   fullr   triuaranger   expandcloner   rW   masked_fill)r2   r   r   rN   rx   r   rY   r   r   mask_lengthpadding_masks              r0   r   z@OPTDecoder._prepare_4d_causal_attention_mask_with_cache_position  s   > %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r1   	input_ids	head_maskr   r   output_hidden_statesreturn_dictr4   rY   ry   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|du |duz  rt        d      | j                  r%| j                  r|rt        j                  d       d}||j                  d|j                  d         }|| j                  |      }d}|r>t        |t              s.d}t        j                   |      }|t        j                  d       ||j#                         nd}|2t%        j&                  |||j                  d	   z   |j(                  
      }|A||j                  d	   z   }t%        j*                  |j                  d   ||j(                  
      }| j-                  |||||      }|
8t%        j.                  |d	      }
|
|z  d	z
  j1                         }
|
dd|df   }
| j3                  |||
      }| j4                  | j5                  |      }||j7                  |j(                        z   }|rdnd}|rdnd}d}t9        |gdg      D ]j  \  }}|	|j;                         d   t=        | j>                        k7  s3t        d| dt=        | j>                         d|j;                         d    d       tA        | j>                        D ]  \  }}|r||fz  }| j                  r%t%        jB                  g       }|| jD                  k  r?| j                  r7| j                  r+| jG                  |jH                  |||||   ndd|||
|	      }n ||f||
|||   nd||||d|}|d   }|r	||rdnd	   }|s||d	   fz  } | jJ                  | jK                  |      }| jL                  | jM                  |      }|r||fz  }|r|nd}|r|jO                         }tQ        ||||      S )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
                config.n_positions - 1]`. for padding use -1.

                [What are position IDs?](../glossary#position-ids)
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
                this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
                the complete sequence length.
        Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FrL   TzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.53.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.r   r    r   r6   )r4   r   r
  zThe `z` should be specified for z layers, but it is for .)r2   r4   rv   ru   rw   r   rx   r*   last_hidden_stater   rt   
attentions))r_   rw   r  r   use_return_dictrn   r   rQ   rj   rk   r   r   r   r   r   r   from_legacy_cacher   r8   r  r   onesr   r9   r:   r   r   rW   zipr   lenr   	enumeraterandr   _gradient_checkpointing_func__call__r   r   to_legacy_cacher   )r.   r	  r2   r
  r   r   r   rw   r  r  r4   rx   rY   return_legacy_cacher   
seq_lengthr   
pos_embedsrt   all_hidden_statesall_self_attnsnext_decoder_cache	attn_mask	mask_nameidxdecoder_layerdropout_probabilitylayer_outputs
next_caches                                r0   r;   zOPTDecoder.forward  s   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==Yj I !r9??2+>?I  --i8M#Z?"&*<<_MO&##Y @O?Z?99;`a!"\\ "2]5H5H5K"KTaThThN !)M,?,?,BBJ"ZZ(;(;A(>
S`SgSghN..M>?L]

  <<A>L(>9A=CCEL'+;+<(<=L)).:JYe)f
??& OOM:M%
m6J6J(KK #7BD0d! %(k]$C 	 Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03 	 #,DKK"8 )	6C#!m%55!}}&+jjn#&7**t}} $ A A!**!&/&;IcN% "
! !.!
!#.!-7@7LYs^RV#2&7'#1
! 
! *!,M%28I1q%Q" =#3"55S)	6V   , 11-@M' ,,];M  -!11+4'$
#335J&+&+%	
 	
r1   )FNNNNNNNNNNN)r=   r>   r?   r@   r!   r-   r   r   r   r8   r   r   r   r   staticmethodrA   rN   r   r   r   rB   r   r   r   r   r   r   r;   rC   rD   s   @r0   r   r   T  s   #y #J!" #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4l  1515,0=A59$(,0/3&*3715R
E,,-R
 !.R
 ELL)	R

 "$u'8'8"9:R
   1 12R
 D>R
 $D>R
 'tnR
 d^R
 u//0R
 !.R
 -.R
 
u--	.R
 R
r1   r   c                       e Zd Zdef fdZd Zd Zd Zee		 	 	 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     d	e
eeej                      ef      d
e
ej                      de
e   de
e   de
e   de
e   de
ej                     de
ej                     dee   deeef   fd              Z xZS )OPTModelr_   c                 d    t         |   |       t        |      | _        | j	                          y r   )r,   r-   r   decoderr   r.   r_   r/   s     r0   r-   zOPTModel.__init__  s&     !&)r1   c                 .    | j                   j                  S r   r.  r   r   s    r0   r   zOPTModel.get_input_embeddings  s    ||(((r1   c                 &    || j                   _        y r   r1  r   s     r0   r   zOPTModel.set_input_embeddings  s    $)!r1   c                     | j                   S r   )r.  r   s    r0   get_decoderzOPTModel.get_decoder      ||r1   r	  r2   r
  r   r   r   rw   r  r  r4   rx   rY   ry   c                 |   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	 | j
                  d|||
||||||d|d|}t        |j                  |j                  |j                  |j                        S )NTr	  r2   r4   r
  r   r   r   rw   r  r  rx   r  r   )r_   rw   r  r   r  r.  r   r  r   rt   r  )r.   r	  r2   r
  r   r   r   rw   r  r  r4   rx   rY   decoder_outputss                 r0   r;   zOPTModel.forward  s    " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] '$,, 
)%+'/!5)
 
 '-??+;;)77&11	
 	
r1   r)  )r=   r>   r?   r!   r-   r   r   r4  r   r   r   r8   rB   r   r   r   r   r   r   r   r   r   r   r;   rC   rD   s   @r0   r,  r,    sT   y )*  1515,0KO59$(,0/3&*3715+
E,,-+
 !.+
 ELL)	+

 "%U->->(?(F"GH+
   1 12+
 D>+
 $D>+
 'tn+
 d^+
 u//0+
 !.+
 -.+
 
u--	.+
  +
r1   r,  c                       e Zd Zy)KwargsForCausalLMN)r=   r>   r?   r   r1   r0   r:  r:    s    r1   r:  c            !           e Zd ZdgZ fdZd Zd Zd Zd Zd Z	d Z
ee	 	 	 	 	 	 	 	 	 	 	 	 dd	eej                     d
eej                      deej                      deeeej&                     ef      deej&                     deej                     dee   dee   dee   dee   deej                     deej                      dee   deeef   fd              Zed        Z xZS )OPTForCausalLMzlm_head.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y NFrb   )
r,   r-   r,  r   r   ro   r   r   lm_headr   r/  s     r0   r-   zOPTForCausalLM.__init__!  sK     f%
 yy!;!;V=N=NUZ[ 	r1   c                 B    | j                   j                  j                  S r   r   r.  r   r   s    r0   r   z#OPTForCausalLM.get_input_embeddings+      zz!!...r1   c                 :    || j                   j                  _        y r   rA  r   s     r0   r   z#OPTForCausalLM.set_input_embeddings.      */

'r1   c                     | j                   S r   r?  r   s    r0   get_output_embeddingsz$OPTForCausalLM.get_output_embeddings1  r5  r1   c                     || _         y r   rF  )r.   new_embeddingss     r0   set_output_embeddingsz$OPTForCausalLM.set_output_embeddings4  s	    %r1   c                 &    || j                   _        y r   r   r.  )r.   r.  s     r0   set_decoderzOPTForCausalLM.set_decoder7  s    $

r1   c                 .    | j                   j                  S r   rL  r   s    r0   r4  zOPTForCausalLM.get_decoder:  s    zz!!!r1   r	  r2   r
  r   r   labelsr   rw   r  r  r4   rx   rY   ry   c                     ||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
 | j                  j
                  d|||||||||	d|d|}| j                  |d         j                         }d}|E|j                  |j                        } | j                  ||fd| j                   j                  i|}t        |||j                  |j                  |j                        S )an  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForCausalLM

        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious. I'm just a little bit of a weirdo."
        ```NTr7  r   r   losslogitsr   rt   r  r   )r_   rw   r  r  r   r.  r?  rX   rW   r   loss_functionr   r   r   rt   r  )r.   r	  r2   r
  r   r   rO  r   rw   r  r  r4   rx   rY   r   rS  rR  s                    r0   r;   zOPTForCausalLM.forward=  s;   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] %$**$$ 
)%+'/!5)
 
 gaj)446YYv}}-F%4%%  ;;11 	D &#33!//))
 	
r1   c                 J    d}| D ]  }|t        fd|D              fz  } |S )Nr   c              3   t   K   | ]/  }|j                  d j                  |j                               1 ywr<   )index_selectrW   r   ).0
past_statebeam_idxs     r0   	<genexpr>z0OPTForCausalLM._reorder_cache.<locals>.<genexpr>  s.     nU_j--aZ=N=N1OPns   58)tuple)r   rZ  reordered_past
layer_pasts    `  r0   _reorder_cachezOPTForCausalLM._reorder_cache  s=    ) 	Jncmnn N	 r1   NNNNNNNNNNNN)r=   r>   r?   _tied_weights_keysr-   r   r   rG  rJ  rM  r4  r   r   r   r8   rB   r   r   r   r   r   r   r   r:  r   r   r;   r*  r_  rC   rD   s   @r0   r<  r<    s   *+/0&%"  1515,0KO59-1$(,0/3&*3715P
E,,-P
 !.P
 ELL)	P

 "%U->->(?(F"GHP
   1 12P
 ))*P
 D>P
 $D>P
 'tnP
 d^P
 u//0P
 !.P
 *+P
 
u,,	-P
  P
d  r1   r<  a  
    The OPT Model transformer with a sequence classification head on top (linear layer).

    [`OPTForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                   r    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee
eej                     ef      deej                     deej                     d	ee   d
ee   dee   dee   deej                     de
eef   fd       Zd Zd Z xZS )OPTForSequenceClassificationr_   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y r>  )
r,   r-   
num_labelsr,  r   r   ro   r   scorer   r/  s     r0   r-   z%OPTForSequenceClassification.__init__  sT      ++f%
YYv994??QVW
 	r1   r	  r2   r
  r   r   rO  r   rw   r  r  r4   ry   c                    |
|
n| j                   j                  }
| j                  |||||||||	|

      }|d   }| j                  |      }||j                  dd \  }}n|j                  dd \  }}| j                   j
                  |dk7  rt        d      | j                   j
                  d}n||| j                   j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d	       |t        j                  ||j                  
      |f   }d}|| j                   j"                  | j$                  dk(  rd| j                   _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                   _        nd| j                   _        | j                   j"                  dk(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }n |||      }n| j                   j"                  dk(  r=t1               } ||j3                  d| j$                        |j3                  d            }n,| j                   j"                  dk(  rt5               } |||      }|
s|f|dd z   }||f|z   S |S t7        |||j8                  |j:                  |j<                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N	r   r2   r4   r
  r   r   rw   r  r  r   r*   r    z=Cannot handle batch sizes > 1 if no padding token is defined.rL   )r   rN   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationrQ  )r_   r  r   rg  r   r   rn   rW   r   r8   int32r  argmaxrj   rk   r/   r=   problem_typerf  rN   r:   rA   r   squeezer
   r   r	   r   r   rt   r  )r.   r	  r2   r
  r   r   rO  r   rw   r  r  r4   transformer_outputsrt   rS  r   r   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsrR  loss_fctoutputs                           r0   r;   z$OPTForSequenceClassification.forward  s   * &1%<k$++B]B]"jj+)%'/!5# ) 
 ,A.M* *3//"1*='J*7*=*=bq*A'J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r1   c                 B    | j                   j                  j                  S r   rA  r   s    r0   r   z1OPTForSequenceClassification.get_input_embeddings  rB  r1   c                 :    || j                   j                  _        y r   rA  r   s     r0   r   z1OPTForSequenceClassification.set_input_embeddings  rD  r1   r)  )r=   r>   r?   r!   r-   r   r   r8   rB   r   r   r   r   r   r   r   r;   r   r   rC   rD   s   @r0   rd  rd    sL   y   156:15KO59-1$(,0/3&*37\
E,,-\
 !!2!23\
 E--.	\

 "%U->->(?(F"GH\
   1 12\
 ))*\
 D>\
 $D>\
 'tn\
 d^\
 u//0\
 
u66	7\
 \
|/0r1   rd  c                       e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee
eej                     ef      deej                     deej                     d	eej                     d
ee   dee   dee   dee   deej                     de
eef   fd       Zd Zd Z xZS )OPTForQuestionAnsweringr_   c                     t         |   |       t        |      | _        t	        j
                  |j                  d      | _        | j                          y r)   )	r,   r-   r,  r   r   ro   r   
qa_outputsr   r/  s     r0   r-   z OPTForQuestionAnswering.__init__  s@     f%
))F$>$>B 	r1   r	  r2   r
  r   r   start_positionsend_positionsr   rw   r  r  r4   ry   c                    ||n| j                   j                  }| j                  ||||||||	|
|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      j                  |j                        }|j                  d|      j                  |j                        }t        |      } |||      } |||      }||z   dz  }|s||f|dd z   }||f|z   S |S t        ||||j                  |j                  	      S )
a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForQuestionAnswering
        >>> import torch

        >>> torch.manual_seed(4)  # doctest: +IGNORE_RESULT
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> # note: we are loading a OPTForQuestionAnswering from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> model = OPTForQuestionAnswering.from_pretrained("facebook/opt-350m")

        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

        >>> inputs = tokenizer(question, text, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> answer_offset = len(tokenizer(question)[0])

        >>> predict_answer_tokens = inputs.input_ids[
        ...     0, answer_offset + answer_start_index : answer_offset + answer_end_index + 1
        ... ]
        >>> predicted = tokenizer.decode(predict_answer_tokens)
        >>> predicted
        ' a nice puppet'
        ```Nri  r   r    rL   r6   )ignore_indexr*   )rR  start_logits
end_logitsrt   r  )r_   r  r   r}  splitrp  rX   r  r   clamprW   r   r
   r   rt   r  )r.   r	  r2   r
  r   r   r~  r  r   rw   r  r  r4   rq  rt   rS  r  r  
total_lossignored_indexrv  
start_lossend_lossrw  s                           r0   r;   zOPTForQuestionAnswering.forward#  s   ` &1%<k$++B]B]"jj+)%'/!5# ) 
 ,A./#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EHHWO)//=ADDV]]SM']CH!,@J
M:H$x/14J"J/2Eab2IIF/9/EZMF*Q6Q+%!-;;*55
 	
r1   c                 B    | j                   j                  j                  S r   rA  r   s    r0   r   z,OPTForQuestionAnswering.get_input_embeddings  rB  r1   c                 :    || j                   j                  _        y r   rA  r   s     r0   r   z,OPTForQuestionAnswering.set_input_embeddings  rD  r1   r`  )r=   r>   r?   r!   r-   r   r   r8   rB   r   r   r   r   r   r   r   r;   r   r   rC   rD   s   @r0   r{  r{    se   y   156:15KO596:48$(,0/3&*37_
E,,-_
 !!2!23_
 E--.	_

 "%U->->(?(F"GH_
   1 12_
 "%"2"23_
   0 01_
 D>_
 $D>_
 'tn_
 d^_
 u//0_
 
u22	3_
 _
B/0r1   r{  )r<  r,  r   rd  r{  )r}   )Cr@   typingr   r   r   r   r   r8   torch.utils.checkpointr   torch.nnr	   r
   r   activationsr   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   r   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   configuration_optr!   !torch.nn.attention.flex_attentionr"   integrations.flex_attentionr#   
get_loggerr=   rj   r   r%   Moduler   floatr\   r^   r   r   r   r,  r:  r<  rd  r{  __all__r   r1   r0   <module>r     s    9 9    A A ! . ) > [  G & h h (  !;J  
		H	%;BLL ;H %II%<<% 
% <<	%
 U\\*% % %.b9299 b9Jebii eP % % %8C
# C
L =
! =
 =
@ ?,j >z' zz m0#5 m0m0` o00 o0 o0dr1   