
"""PyTorch ESM model."""

import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging
from .configuration_esm import EsmConfig


if is_flash_attn_2_available():
    from ...modeling_flash_attention_utils import _flash_attention_forward

logger = logging.get_logger(__name__)


def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(x, cos, sin):
    cos = cos[:, :, : x.shape[-2], :]
    sin = sin[:, :, : x.shape[-2], :]

    return (x * cos) + (rotate_half(x) * sin)


def gelu(x):
    """
    This is the gelu implementation from the original ESM repo. Using F.gelu yields subtly wrong results.
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def symmetrize(x):
    """Make layer symmetric in final two dimensions, used for contact prediction."""
    return x + x.transpose(-1, -2)


def average_product_correct(x):
    """Perform average product correct, used for contact prediction."""
    a1 = x.sum(-1, keepdims=True)
    a2 = x.sum(-2, keepdims=True)
    a12 = x.sum((-1, -2), keepdims=True)

    avg = a1 * a2
    avg.div_(a12)  # in-place to reduce memory
    normalized = x - avg
    return normalized


class RotaryEmbedding(torch.nn.Module):
    """
    Rotary position embeddings based on those in
    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
    matrices which depend on their relative positions.
    """

    def __init__(self, dim: int):
        super().__init__()
        # Generate and save the inverse frequency buffer (non-trainable)
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

        self._seq_len_cached = None
        self._cos_cached = None
        self._sin_cached = None

    def _update_cos_sin_tables(self, x, seq_dimension=2):
        seq_len = x.shape[seq_dimension]

        # Reset the tables if the sequence length has changed, or if we're on a new device
        # (possibly due to tracing, for instance).
        if seq_len != self._seq_len_cached or self._cos_cached.device != x.device:
            self._seq_len_cached = seq_len
            t = torch.arange(x.shape[seq_dimension], device=x.device).type_as(self.inv_freq)
            freqs = torch.outer(t, self.inv_freq)
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)

            self._cos_cached = emb.cos()[None, None, :, :]
            self._sin_cached = emb.sin()[None, None, :, :]

        return self._cos_cached, self._sin_cached

    def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        self._cos_cached, self._sin_cached = self._update_cos_sin_tables(k, seq_dimension=-2)

        return (
            apply_rotary_pos_emb(q, self._cos_cached, self._sin_cached).to(q.dtype),
            apply_rotary_pos_emb(k, self._cos_cached, self._sin_cached).to(q.dtype),
        )

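
# Illustrative sketch (not part of the original module): exercising `RotaryEmbedding` on toy
# query/key tensors of shape [batch, heads, seq_len, head_dim]. The rotation acts on feature
# pairs with a position-dependent angle, so per-position vector norms are preserved and only
# the relative angle between positions changes the q·k dot products.
def _rotary_embedding_demo():
    rope = RotaryEmbedding(dim=16)
    q = torch.randn(1, 2, 8, 16)  # [batch, heads, seq_len, head_dim]
    k = torch.randn(1, 2, 8, 16)
    q_rot, k_rot = rope(q, k)
    assert q_rot.shape == q.shape and k_rot.shape == k.shape
    # Each (x_i, x_{i + dim/2}) feature pair is rotated by the same angle, so norms are unchanged.
    assert torch.allclose(q_rot.norm(dim=-1), q.norm(dim=-1), atol=1e-5)
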

class EsmContactPredictionHead(nn.Module):
    """Performs symmetrization, apc, and computes a logistic regression on the output features"""

    def __init__(
        self,
        in_features: int,
        bias=True,
        eos_idx: int = 2,
    ):
        super().__init__()
        self.in_features = in_features
        self.eos_idx = eos_idx
        self.regression = nn.Linear(in_features, 1, bias)
        self.activation = nn.Sigmoid()

    def forward(self, tokens, attentions):
        # remove eos token attentions
        eos_mask = tokens.ne(self.eos_idx).to(attentions)
        eos_mask = eos_mask.unsqueeze(1) * eos_mask.unsqueeze(2)
        attentions = attentions * eos_mask[:, None, None, :, :]
        attentions = attentions[..., :-1, :-1]
        # remove cls token attentions
        attentions = attentions[..., 1:, 1:]
        batch_size, layers, heads, seqlen, _ = attentions.size()
        attentions = attentions.view(batch_size, layers * heads, seqlen, seqlen)

        # features: batch x channels x tokens x tokens (symmetric)
        attentions = attentions.to(self.regression.weight.device)
        attentions = average_product_correct(symmetrize(attentions))
        attentions = attentions.permute(0, 2, 3, 1)
        return self.activation(self.regression(attentions).squeeze(3))


class EsmEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)

        if config.emb_layer_norm_before:
            self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        else:
            self.layer_norm = None
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

        self.padding_idx = config.pad_token_id
        if self.position_embedding_type == "absolute":
            self.position_embeddings = nn.Embedding(
                config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
            )
        self.token_dropout = config.token_dropout
        self.mask_token_id = config.mask_token_id

    def forward(
        self, input_ids=None, attention_mask=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds

        # ESM can optionally handle MLM masking unusually: if `token_dropout` is set, masked tokens
        # are zeroed out as if selected for input dropout, and the surviving embeddings are rescaled
        # by (fraction unmasked in training) / (fraction unmasked in this sample), analogous to how
        # dropout layers rescale their un-dropped outputs.
        if self.token_dropout:
            embeddings = embeddings.masked_fill((input_ids == self.mask_token_id).unsqueeze(-1), 0.0)
            mask_ratio_train = 0.15 * 0.8  # Hardcoded as the ratio used in all ESM model training runs
            src_lengths = attention_mask.sum(-1)
            mask_ratio_observed = (input_ids == self.mask_token_id).sum(-1).float() / src_lengths
            embeddings = (embeddings * (1 - mask_ratio_train) / (1 - mask_ratio_observed)[:, None, None]).to(
                embeddings.dtype
            )

        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings = embeddings + position_embeddings

        if self.layer_norm is not None:
            embeddings = self.layer_norm(embeddings)
        if attention_mask is not None:
            embeddings = (embeddings * attention_mask.unsqueeze(-1)).to(embeddings.dtype)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)

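
# Worked example (not part of the original module) for the `token_dropout` rescaling above:
# masked positions are zeroed out, and the remaining embeddings are scaled by
# (1 - mask_ratio_train) / (1 - mask_ratio_observed), where mask_ratio_train = 0.15 * 0.8 = 0.12
# is the masking rate used during ESM pretraining. With 2 of 8 tokens masked in a sample:
#
#     mask_ratio_observed = 2 / 8 = 0.25
#     scale = (1 - 0.12) / (1 - 0.25) = 0.88 / 0.75 ≈ 1.173
#
# so the surviving embeddings are scaled up, analogous to inverted dropout at training time.
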

class EsmSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        self.rotary_embeddings = None
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
        elif self.position_embedding_type == "rotary":
            self.rotary_embeddings = RotaryEmbedding(dim=self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys and values come from an
        # encoder; the attention mask needs to be such that the encoder's padding tokens are not
        # attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # reuse k, v from cross-attention cache
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        # ESM scales the query (rather than dividing the attention scores by sqrt(head_dim)),
        # and does so *before* the rotary embeddings are applied, matching the original ESM code.
        query_layer = query_layer * self.attention_head_size**-0.5

        if self.is_decoder:
            past_key_value = (key_layer, value_layer)

        if self.position_embedding_type == "rotary":
            query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in EsmModel forward())
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might seem a bit
        # unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs.to(value_layer.dtype), value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs


class EsmSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        return hidden_states


class EsmFlashAttention2(EsmSelfAttention):
    """
    ESM flash attention module. This module inherits from `EsmSelfAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, config, position_embedding_type=None):
        super().__init__(config, position_embedding_type=position_embedding_type)

        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is
        # bottom-right alignment.
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
        self.dropout_prob = config.attention_probs_dropout_prob

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        if output_attentions or head_mask is not None or encoder_hidden_states is not None:
            logger.warning_once(
                "EsmFlashAttention2 does not support output_attentions, head_mask, or cross_attention. "
                "Falling back to the manual attention implementation. This warning can be removed using the argument "
                '`attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                past_key_value,
                output_attentions,
            )

        bsz, q_len, _ = hidden_states.size()
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        if past_key_value is not None:
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)

        # Layer norms may have been upcast to float32 (e.g. for training stability), silently
        # casting the hidden states to float32 as well; cast back to the expected dtype.
        input_dtype = query_layer.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.query.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to the fact"
                f" you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_layer = query_layer.to(target_dtype)
            key_layer = key_layer.to(target_dtype)
            value_layer = value_layer.to(target_dtype)

        # ESM scales the query before applying rotary embeddings, matching the eager path.
        query_layer = query_layer * self.attention_head_size**-0.5

        if self.position_embedding_type == "rotary":
            query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)
        elif self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            raise ValueError(f"ESM flash attention does not support {self.position_embedding_type} embeddings")

        attn_output = _flash_attention_forward(
            query_layer.permute(0, 2, 1, 3),
            key_layer.permute(0, 2, 1, 3),
            value_layer.permute(0, 2, 1, 3),
            attention_mask,
            query_length=q_len,
            is_causal=self.is_decoder,
            softmax_scale=1.0,  # the query was already pre-scaled above
            dropout=self.dropout_prob if self.training else 0.0,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        attn_output = attn_output.reshape(bsz, q_len, -1)
        outputs = (attn_output, None)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs

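
# Usage sketch (not part of the original module): the attention implementation is chosen via
# `config._attn_implementation` through `ESM_ATTENTION_CLASSES` below. A hedged example, assuming
# the public "facebook/esm2_t6_8M_UR50D" checkpoint and a flash-attn-capable GPU:
#
#     model = EsmModel.from_pretrained(
#         "facebook/esm2_t6_8M_UR50D",
#         attn_implementation="flash_attention_2",
#         torch_dtype=torch.float16,
#     )
#
# Passing `attn_implementation="eager"` selects `EsmSelfAttention` instead, which supports
# `output_attentions`, head masks, and cross-attention; the flash path falls back to it for those.
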

ESM_ATTENTION_CLASSES = {
    "eager": EsmSelfAttention,
    "flash_attention_2": EsmFlashAttention2,
}


class EsmAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self = ESM_ATTENTION_CLASSES[config._attn_implementation](config)
        self.output = EsmSelfOutput(config)
        self.pruned_heads = set()
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        # ESM applies a pre-attention LayerNorm
        hidden_states_ln = self.LayerNorm(hidden_states)
        self_outputs = self.self(
            hidden_states_ln,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class EsmIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = gelu(hidden_states)
        return hidden_states


class EsmOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        return hidden_states


class EsmLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = EsmAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise RuntimeError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = EsmAttention(config)
        self.intermediate = EsmIntermediate(config)
        self.output = EsmOutput(config)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        # decoder uni-directional self-attention cached key/values tuple is at positions 1, 2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]

        # if decoder, the last output is a tuple of self-attn cache
        if self.is_decoder:
            outputs = self_attention_outputs[1:-1]
            present_key_value = self_attention_outputs[-1]
        else:
            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        cross_attn_present_key_value = None
        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise AttributeError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated"
                    " with cross-attention layers by setting `config.add_cross_attention=True`"
                )

            # cross-attn cached key/values tuple is at positions 3, 4 of the past_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights

            # add cross-attn cache to positions 3, 4 of the present_key_value tuple
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        layer_output = self.feed_forward_chunk(attention_output)

        outputs = (layer_output,) + outputs

        # if decoder, return the attn key/values as the last output
        if self.is_decoder:
            outputs = outputs + (present_key_value,)
        return outputs

    def feed_forward_chunk(self, attention_output):
        attention_output_ln = self.LayerNorm(attention_output)
        intermediate_output = self.intermediate(attention_output_ln)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class EsmEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([EsmLayer(config) for _ in range(config.num_hidden_layers)])
        self.emb_layer_norm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
                    "`use_cache=False`..."
                )
                use_cache = False
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache = next_decoder_cache + (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if self.emb_layer_norm_after:
            hidden_states = self.emb_layer_norm_after(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class EsmPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class EsmPreTrainedModel(PreTrainedModel):
    config_class = EsmConfig
    base_model_prefix = "esm"
    supports_gradient_checkpointing = True
    _no_split_modules = ["EsmLayer", "EsmFoldTriangularSelfAttentionBlock", "EsmEmbeddings"]
    _supports_flash_attn_2 = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, EsmLMHead):
            module.bias.data.zero_()


@auto_docstring
class EsmModel(EsmPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    """

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = EsmEmbeddings(config)
        self.encoder = EsmEncoder(config)

        self.pooler = EsmPooler(config) if add_pooling_layer else None

        self.contact_head = EsmContactPredictionHead(
            in_features=config.num_hidden_layers * config.num_attention_heads, bias=True
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        if self.config._attn_implementation == "flash_attention_2":
            extended_attention_mask = attention_mask
        else:
            # Make the 2D (or provided 3D) self-attention mask broadcastable to all heads.
            extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention, make it broadcastable
        # to [batch_size, num_heads, seq_length, seq_length].
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head. The input
        # head_mask has shape [num_heads] or [num_hidden_layers x num_heads] and is converted to
        # shape [num_hidden_layers x batch x num_heads x seq_length x seq_length].
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )

    def predict_contacts(self, tokens, attention_mask):
        attns = self(tokens, attention_mask=attention_mask, return_dict=True, output_attentions=True).attentions
        attns = torch.stack(attns, dim=1)  # Matches the original model layout
        # In the original model, attentions for padded tokens have all been set to 0. Most of the
        # time this doesn't matter, because the other tokens won't attend to them — but it does
        # matter for contact prediction, which takes the attentions as input, so we mimic it here.
        attns *= attention_mask.unsqueeze(1).unsqueeze(2).unsqueeze(3)
        attns *= attention_mask.unsqueeze(1).unsqueeze(2).unsqueeze(4)
        return self.contact_head(tokens, attns)

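
# Usage sketch (not part of the original module), assuming the public "facebook/esm2_t6_8M_UR50D"
# ESM-2 checkpoint:
#
#     from transformers import AutoTokenizer, EsmModel
#
#     tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
#     model = EsmModel.from_pretrained("facebook/esm2_t6_8M_UR50D")
#     inputs = tokenizer("MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVAT", return_tensors="pt")
#     outputs = model(**inputs)
#     outputs.last_hidden_state.shape  # [1, seq_len + 2, hidden_size]; cls/eos added by the tokenizer
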

@auto_docstring
class EsmForMaskedLM(EsmPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `EsmForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.esm = EsmModel(config, add_pooling_layer=False)
        self.lm_head = EsmLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.esm(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()

            labels = labels.to(prediction_scores.device)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def predict_contacts(self, tokens, attention_mask):
        return self.esm.predict_contacts(tokens, attention_mask=attention_mask)

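
# Usage sketch (not part of the original module): unsupervised contact prediction feeds the
# attention maps of every layer and head through `EsmContactPredictionHead`. A hedged outline,
# assuming the "facebook/esm2_t6_8M_UR50D" checkpoint:
#
#     inputs = tokenizer(sequence, return_tensors="pt")
#     contacts = model.predict_contacts(inputs["input_ids"], inputs["attention_mask"])
#     # contacts: [batch, seq_len, seq_len] residue-residue contact probabilities; the head
#     # strips the cls/eos positions before the logistic regression.
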

class EsmLMHead(nn.Module):
    """ESM Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to the size of the vocabulary with bias
        x = self.decoder(x) + self.bias
        return x


@auto_docstring(
    custom_intro="""
    ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """
)
class EsmForSequenceClassification(EsmPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.esm = EsmModel(config, add_pooling_layer=False)
        self.classifier = EsmClassificationHead(config)

        self.init_weights()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.esm(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)

            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

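
# Usage sketch (not part of the original module): fine-tuning on sequence-level labels, assuming
# a hypothetical two-class task and the "facebook/esm2_t6_8M_UR50D" checkpoint:
#
#     model = EsmForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", num_labels=2)
#     batch = tokenizer(["MKTVRQERLK", "GLSDGEWQQV"], padding=True, return_tensors="pt")
#     out = model(**batch, labels=torch.tensor([0, 1]))
#     out.loss.backward()
#
# With `num_labels=1` the head computes an MSE regression loss, and float multi-hot labels select
# the BCE-with-logits path (see the `problem_type` dispatch above).
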

@auto_docstring
class EsmForTokenClassification(EsmPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.esm = EsmModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.esm(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()

            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

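
# Usage sketch (not part of the original module): per-residue prediction (e.g. secondary
# structure) assigns one label per token; positions of special tokens are conventionally set to
# -100 so that `CrossEntropyLoss` ignores them:
#
#     model = EsmForTokenClassification.from_pretrained("facebook/esm2_t6_8M_UR50D", num_labels=3)
#     labels = torch.tensor([[-100, 0, 1, 2, 1, -100]])  # cls, four residues, eos
#     out = model(**tokenizer("MKTV", return_tensors="pt"), labels=labels)
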

class EsmClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take the <s> token (equivalent to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor

    Returns: torch.Tensor
    """
    # The series of casts and type conversions here are carefully balanced to work with both ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx

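
# Worked example (not part of the original module) for `create_position_ids_from_input_ids`,
# with padding_idx = 1:
#
#     input_ids             = [[0, 5, 6, 1, 1]]   # 0 = cls, 5/6 = residues, 1 = padding
#     mask                  = [[1, 1, 1, 0, 0]]
#     cumsum(mask) * mask   = [[1, 2, 3, 0, 0]]
#     result + padding_idx  = [[2, 3, 4, 1, 1]]   # padding positions stay at padding_idx
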

__all__ = [
    "EsmForMaskedLM",
    "EsmForSequenceClassification",
    "EsmForTokenClassification",
    "EsmModel",
    "EsmPreTrainedModel",
]