"""PyTorch BioGPT model."""

import math
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_torch_flex_attn_available, is_torchdynamo_compiling, logging
from .configuration_biogpt import BioGptConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class BioGptLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # BioGPT offsets the embedding ids by 2 to reserve slots for padding handling,
        # so the table is allocated with `num_embeddings + offset` rows.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(
        self,
        attention_mask: torch.LongTensor,
        past_key_values_length: int = 0,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        if position_ids is None:
            attention_mask = attention_mask.long()

            # create positions depending on attention_mask
            position_ids = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1

            # cut positions if `past_key_values_length` is > 0
            position_ids = position_ids[:, past_key_values_length:]

        return super().forward(position_ids + self.offset)


class BioGptScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale
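

# Usage sketch with toy sizes (hypothetical values, for illustration only):
#
#   >>> emb = BioGptScaledWordEmbedding(10, 16, padding_idx=0, embed_scale=math.sqrt(16))
#   >>> emb(torch.tensor([[1, 2, 3]])).shape   # ordinary nn.Embedding lookup, scaled by 4.0
#   torch.Size([1, 3, 16])
#
# `BioGptModel` below passes `embed_scale=math.sqrt(config.hidden_size)` when
# `config.scale_embedding` is set, and 1.0 otherwise.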


class BioGptAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[BioGptConfig] = None,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
        query_states = query_states * self.scaling

        if past_key_value is not None:
            if isinstance(past_key_value, EncoderDecoderCache):
                is_updated = past_key_value.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_states from cache
                    curr_past_key_value = past_key_value.cross_attention_cache
                else:
                    curr_past_key_value = past_key_value.self_attention_cache
            else:
                curr_past_key_value = past_key_value

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k,v, cross_attentions
            key_states = curr_past_key_value.key_cache[self.layer_idx]
            value_states = curr_past_key_value.value_cache[self.layer_idx]
        else:
            key_states = self.k_proj(current_states)
            value_states = self.v_proj(current_states)
            key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = query_states.reshape(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # keep a reshaped copy so the returned attention weights retain their gradient
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value
dej                  deej                     dee   deej                     deej                     dedeej                     deej                  eej                     eeej                        f   f fd	Z	 xZ
S )BioGptSdpaAttentionr`   ra   rb   r*   rc   rd   re   rf   c                 .   |r*t         j                  d       t        |   ||||||      S |du}|j	                         \  }	}
}| j                  |      j                  |	d| j                  | j                        j                  dd      }|St        |t              rA|j                  j                  | j                        }|r|j                  }n|j                   }n|}|r|n|}|r7|5r3j"                  | j                     }|j$                  | j                     }n| j'                  |      }| j)                  |      }|j                  |	d| j                  | j                        j                  dd      }|j                  |	d| j                  | j                        j                  dd      }|D|s|nd}j+                  ||| j                  d|i      \  }}|rd|j                  | j                  <   d}||ddddddd|j,                  d	   f   }|j.                  j0                  d
k(  r2|0|j3                         }|j3                         }|j3                         }| j4                  r	||
dkD  rdnd}t6        j8                  j:                  j=                  ||||| j>                  r| j@                  nd|      }|j                  dd      j3                         }|j                  |	|
| jB                        }| jE                  |      }|d|fS )rh   a  BioGptModel is using BioGptSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` . Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)ra   rb   r*   rd   re   Nri   r   r"   re   Trj   cudaFr   )	attn_mask	dropout_prQ   )#rY   rZ   r$   r4   rn   r^   ro   rM   rV   rp   rq   r   rr   rs   rS   rt   ru   rv   rw   r\   r]   rx   r{   devicetype
contiguousrQ   r1   r   r|   scaled_dot_product_attentionrm   rN   rL   r_   )r&   r`   ra   rb   r*   rc   rd   re   r~   r   r   r   r   rr   r   r   r   r   causal_maskrQ   r   r'   s                        r(   r4   zBioGptSdpaAttention.forward  s    l 7?!1--"3- #   .T9',,.Wa {{=166sBPTP]P]^hhijlmn%.*=>+66::4>>J
%*8*N*N'*8*M*M'&4#-?)]."<,66t~~FJ.::4>>JL^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn=%(Aq2HJ4D4DR4H2H)HIK ##v-+2I'224L#..0J'224L
 !NN{/BwQR{DX]	 hh))FF!&*mmdll G 
 "++Aq1<<> "&&sGT^^DmmK0D.00r)   r   )r7   r8   r9   r1   r=   r   r   r   r   r4   r>   r?   s   @r(   r   r     s     48*.1526"'15e1||e1 #5<<0e1 !	e1
 !.e1 "%,,/e1  e1 !.e1 
u||Xell3XeELL>Q5RR	Se1 e1r)   r   )eagersdpac                   0    e Zd Zddedee   f fdZ	 	 	 	 	 	 ddej                  deej                     deej                     dee	   dee
   d	ee
   d
eej                     deej                  eeej                  ej                  f      f   fdZ xZS )BioGptDecoderLayerrR   rS   c                    t         |           |j                  | _        t	        |j
                     | j                  |j                  |j                  dd|      | _        |j                  | _
        t        |j                     | _        |j                  | _        t        j                   | j                        | _        t        j$                  | j                  |j&                        | _        t        j$                  |j&                  | j                        | _        t        j                   | j                        | _        y )NT)rL   rM   rN   rO   rQ   rS   )r$   r%   hidden_sizerL   BIOGPT_ATTENTION_CLASSES_attn_implementationnum_attention_headsattention_probs_dropout_prob	self_attnhidden_dropout_probrN   r   
hidden_actactivation_fnactivation_dropoutr   	LayerNormself_attn_layer_normr[   intermediate_sizefc1fc2final_layer_norm)r&   rR   rS   r'   s      r(   r%   zBioGptDecoderLayer.__init__u  s    ++1&2M2MNnn0077
 11#F$5$56"(";";$&LL$@!99T^^V-E-EF99V55t~~F "T^^ <r)   r`   r*   rc   rb   rd   	use_cachere   rf   c                 l   |}| j                  |      }| j                  ||||||      \  }}	}t        j                  j	                  || j                  | j
                        }||z   }|}| j                  |      }| j                  |      }| j                  |      }t        j                  j	                  || j                  | j
                        }| j                  |      }t        j                  j	                  || j                  | j
                        }||z   }|f}
|r|
|	fz  }
|r|
|fz  }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
                cache in the correct position and to infer the complete sequence length.
        )r`   rb   r*   rc   rd   re   rk   )r   r   r   r|   rN   rm   r   r   r   r   r   )r&   r`   r*   rc   rb   rd   r   re   residualself_attn_weightsoutputss              r(   r4   zBioGptDecoderLayer.forward  sE   8 !11-@ <@>>'))+/) <J <
8(. --mt||VZVcVc-d =0 !--m</**=9--mt?V?Vaeanan-o/--mt||VZVcVc-d =0 ")++G((Gr)   rE   )NNNFTN)r7   r8   r9   r   r   r;   r%   r1   r=   r   r   r   FloatTensorr4   r>   r?   s   @r(   r   r   t  s    =| = =2 2626*.,1$(15>||> !.> "%,,/	>
 !> $D>> D>> !.> 
u  (51B1BEDUDU1U+V"WW	X>r)   r   c                       e Zd ZeZdZdZdZdZdZ	d Z
	 ddeej                  df   dej                  dej                  ded	ef
d
Zedej                  dededej&                  dej                  defd       Zy)BioGptPreTrainedModelbiogptTc                    t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yy)zInitialize the weightsr   )meanstdNrH   )rq   r   r[   weightdatanormal_rR   initializer_rangerP   zero_	EmbeddingrB   r   fill_)r&   modules     r(   _init_weightsz#BioGptPreTrainedModel._init_weights  s   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .r)   r*   r   input_tensorre   past_key_valuesrd   c           	         | j                   j                  dk(  r||dk(  j                         r|S y | j                   j                  dk(  r't        |t        j
                        rt        |      }|S ||j                         nd}||j                  nd}| j                   j                  dk(  r(|s&|s$t        j                  |||| j                        ry |j                  }|j                  d   }	|r|j                         }
n1t        |t        j
                        r|j                  d	   n||	z   dz   }
| j                  ||	|
|||j                  d   
      }| j                   j                  dk(  rQ|O|j                   j"                  dv r7|s5t	        j$                  |      j&                  }t        j(                  ||      }|S )Nflash_attention_2r   flex_attentionr   Fr   )inputs_embedsr+   is_trainingr   ri   )sequence_lengthtarget_lengthdtypere   
batch_size)r   xpunpu)rR   r   anyrq   r1   r=   r   get_seq_lengthis_compileabler   _ignore_causal_mask_sdparm   r   r{   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   r   finfomin_unmask_unattended)r&   r*   r   re   r   rd   past_seen_tokensusing_compilable_cacher   r   r   r   	min_dtypes                r(   _update_causal_maskz)BioGptPreTrainedModel._update_causal_mask  s    ;;++/BB)~/D.I.I.K%%;;++/??.%,,7!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr)   r   r   r   r   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@auto_docstring
class BioGptModel(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.config = config
        self.layerdrop = config.layerdrop
        self.dropout = config.hidden_dropout_prob
        self.embed_dim = config.hidden_size
        self.padding_idx = config.pad_token_id
        embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = BioGptScaledWordEmbedding(
            config.vocab_size, self.embed_dim, self.padding_idx, embed_scale=embed_scale
        )
        self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim)

        self.layers = nn.ModuleList([BioGptDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.layer_norm = nn.LayerNorm(self.embed_dim)

        self.gradient_checkpointing = False
        self._use_sdpa = config._attn_implementation == "sdpa"

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")

        if input_ids is not None:
            input_ids = input_ids.view(-1, input_ids.shape[-1])

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # initialize `past_key_values`
        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        batch_size, seq_length = inputs_embeds.size()[:-1]
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None and not is_torchdynamo_compiling():
            # required mask seq length can be calculated via length of past cache
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        self_attn_cache = (
            past_key_values.self_attention_cache
            if isinstance(past_key_values, EncoderDecoderCache)
            else past_key_values
        )

        causal_mask = self._update_causal_mask(
            attention_mask,
            inputs_embeds,
            cache_position,
            self_attn_cache,
            output_attentions,
        )

        # embed positions
        positions = self.embed_positions(attention_mask, past_key_values_length, position_ids=position_ids)

        hidden_states = inputs_embeds + positions

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = None
        next_decoder_cache = None

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    head_mask[idx] if head_mask is not None else None,
                    None,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        hidden_states = self.layer_norm(hidden_states)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    """
)
class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["output_projection.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.biogpt = BioGptModel(config)
        self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output_projection

    def set_output_embeddings(self, new_embeddings):
        self.output_projection = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        sequence_output = outputs[0]
        prediction_scores = self.output_projection(sequence_output)

        lm_loss = None
        if labels is not None:
            lm_loss = self.loss_function(
                prediction_scores,
                labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
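

# Generation sketch: the class above mixes in `GenerationMixin`, so the standard
# `generate` API applies (checkpoint name assumed to be the released one):
#
#   from transformers import AutoTokenizer, BioGptForCausalLM
#   tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
#   model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
#   inputs = tokenizer("COVID-19 is", return_tensors="pt")
#   out = model.generate(**inputs, max_new_tokens=20)
#   print(tokenizer.decode(out[0], skip_special_tokens=True))
#
# Passing `labels=input_ids` instead computes the shifted next-token loss shown above.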
ee   dee   dee   dee	ef   fd       Z xZS )BioGptForTokenClassificationc                 z   t         |   |       |j                  | _        t        |      | _        t        |d      r|j                  |j                  }n|j                  }t        j                  |      | _
        t        j                  |j                  |j                        | _        | j                          y )Nclassifier_dropout)r$   r%   
num_labelsr   r   hasattrrX  r   r   DropoutrN   r[   r   
classifierr	  )r&   rR   rX  r'   s      r(   r%   z%BioGptForTokenClassification.__init__~  s      ++!&)6/0V5N5N5Z!'!:!:!'!;!;zz"45))F$6$68I8IJr)   rF   token_type_idsr*   r  r   r   rC  r   rd   r  r  rf   c                    ||n| j                   j                  }| j                  |||||||	|
|	      }|d   }| j                  |      }| j	                  |      }d}|t               }||j                  d      dk(  }|j                  d| j                        }t        j                  ||j                  d      t        j                  |j                        j                  |            } |||      }n2 ||j                  d| j                        |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class BioGptForSequenceClassification(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.biogpt = BioGptModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # index of the last non-padding token per row; e.g. with pad_token_id = 1,
                # torch.ne([[5, 6, 1, 1]], 1).sum(-1) - 1 == tensor([1])
                sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
            else:
                sequence_lengths = -1
                logger.warning_once(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.biogpt.embed_tokens

    def set_input_embeddings(self, value):
        self.biogpt.embed_tokens = value


__all__ = [
    "BioGptForCausalLM",
    "BioGptForTokenClassification",
    "BioGptForSequenceClassification",
    "BioGptModel",
    "BioGptPreTrainedModel",
]