
    Uh$                        d dl Z d dlZd dlmc mZ ddlmZ  e       rd dlZd dl	Z	d dl
mZmZ dZdZ e e j                   de            Zeeefvr ed      d	 Z G d
 dej(                  j*                        Zej.                  Z G d dej(                  j*                        Zej.                  Zd ZddZ	 	 	 ddZ	 	 	 	 	 ddZy)    N   )is_torch_npu_available)	rearrangerepeat   NPU_FA2_SPARSE_MODE)defaultzEnvironment variable `NPU_FA2_SPARSE_MODE` can only be set as 2 (top-left aligned causal mask) or 3 (down-right aligned causal mask).c                  4    t               rt        t        k(  S dS )NF)r   SPARSE_MODE!TOP_LEFT_ALIGNED_CAUSAL_MASK_MODE     /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/integrations/npu_flash_attention.py'is_npu_fa2_top_left_aligned_causal_maskr   )   s    ?U?W;;;b]bbr   c                   ,    e Zd Zed        Zed        Zy)IndexFirstAxisc           
      *   | j                  |       |j                  dk\  sJ |j                  d   |j                  dd  c| _        }|j	                         } t        j                  t        |d      dt        |d|            j                  dg| S )Nr   r      b ... -> b (...)z -> z dd)
save_for_backwardndimshapefirst_axis_dimnumeltorchgatherr   r   reshape)ctxinputindicesother_shape
second_dims        r   forwardzIndexFirstAxis.forward/   s    g&zzQ*/++a.%++ab/'K &&(
u||e/0!VGZS]5^

'"$"$ 	$r   c           	         | j                   \  }|j                  dk\  sJ |j                  dd  }t        |d      }t	        j
                  | j                  |j                  d   g|j                  |j                        }|j                  dt        |d|j                  d         |        |j                  | j                  g| d fS )Nr   r   r   devicedtyper   r   r   )saved_tensorsr   r   r   r   zerosr   r*   r+   scatter_r   r!   )r"   grad_outputr$   r%   
grad_inputs        r   backwardzIndexFirstAxis.backward;   s    &&
1$$$!''+-?@[[!2!21!56%%##

 	Avgz[=N=Nq=QRT_`!z!!#"4"4C{CTIIr   N__name__
__module____qualname__staticmethodr'   r1   r   r   r   r   r   .   s*    	$ 	$ J Jr   r   c                   ,    e Zd Zed        Zed        Zy)IndexPutFirstAxisc                     | j                  |       |j                  dk(  sJ |j                  dk\  sJ t        j                  |g|j                  dd  |j
                  |j                  d}|||<   |S )Nr   r   r)   )r   r   r   r-   r   r*   r+   )r"   valuesr$   r   outputs        r   r'   zIndexPutFirstAxis.forwardQ   sr    g&||q   {{a^ifll12.>iv}}\b\h\hi wr   c                 2    | j                   \  }||   }|d d fS N)r,   )r"   r/   r$   grad_valuess       r   r1   zIndexPutFirstAxis.backward\   s&    &&
!'*D$&&r   Nr2   r   r   r   r8   r8   P   s(      ' 'r   r8   c                 >    t        | |||z        }t        |d|      S )a  
    Arguments:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
        batch: int, batch size for the padded sequence.
        seqlen: int, maximum sequence length for the padded sequence.
    Return:
        hidden_states: (batch, seqlen, ...)
    z(b s) ... -> b s ...)b)index_put_first_axisr   )hidden_statesr$   batchseqlenr;   s        r   	pad_inputrE   i   s&     "-%&.IFV3u==r   c                    |||z   n|}|j                  dt        j                        }|j                  dt        j                        }t        j                  |j	                         d      j	                         }|j                         j                         }t        j                  t        j                  |dt        j                        d      }t        t        | d      |      ||||fS )a  
    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.
    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
        indices: (total_nnz), the indices of masked tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
        seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask.
    r   )dimr+   F)as_tupler   )r   r   zb s ... -> (b s) ...)sumr   int32nonzeroflattenmaxitemFpadcumsumindex_first_axisr   )	rB   attention_maskunused_mask	all_masksseqlens_in_batchused_seqlens_in_batchr$   max_seqlen_in_batch
cu_seqlenss	            r   unpad_inputrZ   {   s     3>2I+-~I }}5;;}?*..2U[[.ImmI--/%@HHJG*..0557u||$4!5;;OQWXJ 	=2HI7S r   c                    d|z
  }|%dt        j                  | j                  d         z  }|s0| j                  d   }t        j                  | |||d||      d   }	|	S t        j                  t        j                  ddg| j                        d	
      j                         }
| j                  d   }t        j                  | |||d|||
t        	      d   }	|	S )N      ?r   r   BSND)	keep_probscaler      r*   r   diagonal)r^   r_   
atten_masksparse_mode)mathsqrtr   	torch_npunpu_fusion_attentionr   triuonesr*   boolr   )qkv	dropout_psoftmax_scalecausalkwargsr^   head_numr;   attn_mask_npus              r   npu_flash_attn_funcrv      s     iIdii44771://1a6U^fstuvw  M 

5::tTl188#LWXY^^`771://$#

 
 Mr   c
                 v   d|z
  }|%dt        j                  | j                  d         z  }|	s| j                  d   }t        j                  | |||d d ||dt        |dd  j                         j                         j                               t        |dd  j                         j                         j                                     d   }|S t        j                  t        j                  ddg| j                        d	      j                         }| j                  d   }t        j                  | |||d d |||dt        |dd  j                         j                         j                               t        |dd  j                         j                         j                               t        
      d   }|S )Nr\   r   r   TND)pserd   r_   r^   input_layoutactual_seq_qlenactual_seq_kvlenr   r`   ra   rb   )	ry   padding_maskrd   r_   r^   rz   r{   r|   re   )rf   rg   r   rh   ri   tuplecpunumpytolistr   rj   rk   r*   rl   r   )rm   rn   ro   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_krp   rq   rr   rs   r^   rt   r;   ru   s                  r   npu_flash_attn_varlen_funcr      s    iIdii44771://!,qr"2"6"6"8">">"@"G"G"IJ"<#3#7#7#9#?#?#A#H#H#JK
 @ M% 

5::tTl188#LWXY^^`771://$!,qr"2"6"6"8">">"@"G"G"IJ"<#3#7#7#9#?#?#A#H#H#JK#
   Mr   r=   )        NF)NNr   NF)osr   torch.nn.functionalnn
functionalrO   utils.import_utilsr   rf   rh   einopsr   r   r   #DOWN_RIGHT_ALIGNED_CAUSAL_MASK_MODEintgetenvr   
ValueErrorr   autogradFunctionr   applyrR   r8   rA   rE   rZ   rv   r   r   r   r   <module>r      s    
    7 (
 %& !&' #)"))1;^_`8:]^^
	1 c
JU^^,, J< "'' '// '* ).. >$J  R 4r   