
    UhA&                        d Z ddlmZmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ  e	       rddlmZmZ dd	lmZ  G d
 d      Zeej$                  ef   Z	 	 	 	 d!dej$                  dee   deeeef      ddfdZej,                  j/                  d      	 d"dej$                  dej$                  dej$                  dej$                  fd       Zdej$                  dedej$                  fdZ	 	 	 d#dej4                  j6                  dej$                  dej$                  dej$                  deej$                  df   dee   dee   deej$                     deej$                  ej$                  f   fd Zy)$a7  
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https//github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
    )OptionalTupleUnionN)version   )is_torch_flex_attn_available)_torch_version)	BlockMaskflex_attention)create_block_maskc                   x     e Zd ZdZdZdZdZ fdZej                  j                  d      d        Zd Z xZS )WrappedFlexAttentionzh
    We are doing a singleton class so that flex attention is compiled once when it's first called.
    NFc                 \    | j                   t        | 	  |       | _         | j                   S N)	_instancesuper__new__)clsargskwargs	__class__s      z/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/integrations/flex_attention.pyr   zWrappedFlexAttention.__new__6   s'    == !GOC0CM}}    	recursivec                 (   | j                   r|| j                  k7  rw|| _        t        j                  t              j
                  dk(  r$|r"t        j                  t        dd      | _	        nt        j                  t              | _	        d| _         yy)z>
        Initialize or update the singleton instance.
        z2.6.0Fzmax-autotune-no-cudagraphs)dynamicmodeTN)
_is_flex_compiledtrainingr   parser	   base_versiontorchcompiler   _compiled_flex_attention)selfr    s     r   __init__zWrappedFlexAttention.__init__<   st    
 %%T]])B %DM}}^,99WD05"E8T1- 16n0M-%)D" *Cr   c                     | j                   S r   )r%   )r&   s    r   __call__zWrappedFlexAttention.__call__N   s    ,,,r   )__name__
__module____qualname____doc__r   r   r%   r   r#   compilerdisabler'   r)   __classcell__)r   s   @r   r   r   -   sK     I# ^^e,* -*"-r   r   attention_mask_2dattention_chunk_sizeoffsetsreturnr
   c           	         
  j                   \  }}|s|}|s|}t        j                  j                  j	                   dd|f        j
                  } j                         |4j                         j                  d      j                  d      dz
  |z   fd

fd}|
n|||d   |d   fd}	n}	t        |	|d|||d	
      S )a  
    Create a block causal document mask for a batch of sequences, both packed and unpacked.
    Create Block causal logic and passing it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
    The resultant BlockMask is a compressed representation of the full block causal
    mask. BlockMask is essential for performant computation of flex attention.
    See: https://pytorch.org/blog/flexattention/

    Args:
        attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
        of shape (batch_size, total_seq_len). e.g.

        For unpacked sequence:
        [[1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0]]

        For packed sequence:
        [[1, 1, 1, 2, 2, 2, 0],
         [1, 1, 2, 2, 2, 3, 3]]

    Returns:
        BlockMask
    r   )valuepadN   c                 T    ||k\  }	| |f   	| |f   k(  }| |f   dkD  }||z  |z  }|S )z
        Defines the logic of a block causal mask by combining both a standard causal mask
        and a block diagonal document mask.

        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
        for an illustration.
        r    )
	batch_idxhead_idxq_idxkv_idxcausal_maskdocument_maskpadding_mask
final_maskr1   document_idss
           r   causal_mask_modz4make_flex_block_causal_mask.<locals>.causal_mask_mod   sV     vo$Y%56,yRXGX:YY(E)9:Q> </-?
r   c                 B    | |f   | |f   k(  } | |||      }||z  S )zU
        Combines the chunk mask with the causal mask for chunked attention.
        r;   )r<   r=   r>   r?   
chunk_maskcausal_doc_maskrE   
chunk_idxss         r   chunk_causal_mask_modz:make_flex_block_causal_mask.<locals>.chunk_causal_mask_mod   s>      	5 01Z	6@Q5RR
))XufMO++r   c                 .    |z   }|z   } | |||      S r   r;   )	r<   r=   r>   r?   offset_q	offset_kv	kv_offsetmask_mod_maybe_combinedq_offsets	         r   mask_modz-make_flex_block_causal_mask.<locals>.mask_mod   s(    x'H*I*9h)TTr   T)rQ   BHQ_LENKV_LENdevice_compile)
shaper#   nn
functionalr7   rV   clonefill_cumsumcreate_block_causal_mask_flex)r1   r2   query_length
key_lengthr3   
batch_sizetotal_seq_lenrV   rJ   rQ   rE   rI   rD   rN   rO   rP   s   `         @@@@@@r   make_flex_block_causal_maskrc   U   s    : !2 7 7J"
$++//0AQRT^P_/`%%F$**,L'"((*003::2>BH\]
, 2F1MoSh1:AJ		U
 +(

 r   Fr   querykeyr6   c                 8     t        |             } || ||fi |S r   )r   )rd   re   r6   r    r   flex_attention_compileds         r   compile_friendly_flex_attentionrh      s5     =28<>" 	 r   hidden_statesn_repc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r8   N)rX   expandreshape)ri   rj   batchnum_key_value_headsslenhead_dims         r   	repeat_kvrr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr   moduleattention_maskscalingsoftcap	head_maskc                 N   d }	d t        |t              r|}	n|d d d d d d d |j                  d   f   fd}
d}|j                  d   }||dz
  z  dk(  sTt        ||j                  d   |j                  d   z        }t        ||j                  d   |j                  d   z        }d}|j	                  dd       }t        ||||
|	|||d| j                  
      \  }}|j                  |j                        }|j                  dd	      j                         }||fS )
Nc                     t        j                  | z        z  } | |   d   |   |   z   } | |   |   d   d   z   } | S )Nr   )r#   tanh)scorer<   r=   r>   r?   r@   rw   rv   s        r   	score_modz)flex_attention_forward.<locals>.score_mod   sm    ejj99E"K	215e<VDDE Ii0:1=a@@Er   Tr8   r   Fkernel_options)r}   
block_mask
enable_gqascaler~   
return_lser    r   )
isinstancer
   rX   rr   getrh   r    todtype	transpose
contiguous)rs   rd   re   r6   rt   ru   rv   rw   r   r   r}   r   num_local_query_headsr~   attn_outputattention_weightsr@   s         ``        @r   flex_attention_forwardr      sA    JK.),#
$!!Q?SYYr]?":; J!KKN #&;a&?@QFU[[^syy|;<%Q5;;q>!AB
ZZ 0$7N%D% &"K" *,,U[[9''1-88:K)))r   )NNNN)F)NNN)r-   typingr   r   r   r#   	packagingr   utilsr   utils.import_utilsr	   !torch.nn.attention.flex_attentionr
   r   r   r^   r   TensorintOffsetrc   r.   r/   rh   rr   rY   Modulefloatr   r;   r   r   <module>r      s  8 * )   0 /  !K
"- "-J 
u||S 	!
 +//3Y||Y"3-Y
 eFFN+,Y Yx %(
 	<<	 << \\ )"	UU\\ 	U# 	U%,, 	U$  $#(,:*HHOO:*<<:* 
:* <<	:*
 %,,34:* e_:* e_:* %:* 5<<%&:*r   