
    Uh                       d dl mZ d dlmZmZmZmZmZ d dlZd dlm	Z	 d dl
mc mc mZ d dlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-  e+       rd dl.m/Z/ d dl0m1Z1m2Z2 ndZ/ e*       r	d dl3m4Z4m5Z5 nd\  Z5Z4 e(jl                  e7      Z8 G d ded      Z9 G d dejt                        Z: G d de	jv                        Z<d  Z=d!ej|                  d"e?d#ej|                  fd$Z@	 dHd%e	jv                  d&ej|                  d'ej|                  d(ej|                  d)eej|                     d*eAd+eAfd,ZBdId-ZC G d. d/e	jv                        ZD G d0 d1ej                  jv                        ZEd2ej|                  d3e?fd4ZFd5 ZGd6 ZH eIe/e4e5f      ZJd7 ZK G d8 d9e	jv                        ZL G d: d;e	jv                        ZM ed<       G d= d>e	jv                               ZN G d? d@e	jv                        ZOe& G dA dBe"             ZPe& G dC dDeP             ZQe& G dE dFePe             ZRg dGZSy)J    )partial)CallableOptionalTuple	TypedDictUnionN)nn)ACT2FN   )Cache)GenerationMixin)use_kernel_forward_from_hub)AttentionMaskConverter)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging)is_causal_conv1d_availableis_mamba_2_ssm_available   )BambaConfig)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combined)causal_conv1d_fncausal_conv1d_updateNNc                       e Zd ZU dZej
                  ed<   ej
                  ed<   eed<   eed<   ej                  ed<   y)BambaFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor):
            Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor     z/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/bamba/modeling_bamba.pyr&   r&   A   s7    " ######__r6   r&   F)totalc                   B     e Zd ZdZej
                  dfdef fdZ xZS ) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    Nconfigc                 R   t         	|   ||||       |j                  | _        d| _        |j                  }|j
                  }g | _        g | _        g | _        t        |j                        D ]*  }| j                  |   dk(  r| xj                  t        j                  ||j                  |j                  z  d|j                  z  |z  z   |||      gz  c_        | xj                  t        j                  ||j                   |j"                  |||      gz  c_        | xj                  t        j$                  g g|z  |      gz  c_        | xj                  t        j$                  g g|z  |      gz  c_        | j                  j'                  |       - t        |j                        D cg c]  }t        j$                  g g|z  |       c}| _        t        |j                        D cg c]  }t        j$                  g g|z  |       c}| _        y c c}w c c}w )NFmamba   devicedtyper@   )super__init__layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr0   zerosmamba_expandhidden_sizemamba_n_groupsmamba_n_headsmamba_d_headtensorappend	key_cachevalue_cache)
selfr;   
batch_sizerA   r@   conv_kernel_sizessm_state_sizei_	__class__s
            r7   rD   z)HybridMambaAttentionDynamicCache.__init__i   s   UF;!'!9!9"'!..--"$v//0 	2A%%a(G3  KK",,v/A/AAAH]H]D]`nDnn(%#%   KK",,++&%#	$ 	   U\\2$2CF%S$TT ELL"
1B6$R#SS''..q11	24 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts   3"H4"H$)	r,   r-   r.   r/   r0   float16r   rD   __classcell__r^   s   @r7   r:   r:   [   s)     ?DmmTX %u{ %u %ur6   r:   c                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )BambaRotaryEmbeddingr;   c                    t         |           t        |d      rG|j                  ;|j                  j	                  d|j                  j	                  d            | _        nd| _        |j                  | _        |j                  | _        || _	        t        | j
                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                  | _        y )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)rC   rD   hasattrre   getrf   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr;   r   rope_init_fnattention_scalingregister_bufferri   original_inv_freq)rX   r;   r@   ri   r^   s       r7   rD   zBambaRotaryEmbedding.__init__   s    6>*v/B/B/N#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r6   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r   mpscpuF)device_typeenabledr>   dimrA   )ri   floatexpandshapetor@   
isinstancerg   strr0   autocast	transposecatcosrq   sinrA   )
rX   xposition_idsinv_freq_expandedposition_ids_expandedrx   freqsembr   r   s
             r7   forwardzBambaRotaryEmbedding.forward   sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.N)
r,   r-   r.   r   rD   r0   no_gradr   r   r`   ra   s   @r7   rc   rc      s3    /{ /" U]]_<  <r6   rc   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nru   r>   rz   )r   r0   r   )r   x1x2s      r7   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r6   hidden_statesn_repreturnc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r   r~   reshape)r   r   batchnum_key_value_headsslenhead_dims         r7   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr6   modulequerykeyvalueattention_maskscalingdropoutc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr>   r   ru   )r{   rA   )ptrainingr   )r   num_key_value_groupsr0   matmulr   r   r	   
functionalsoftmaxfloat32r   rA   r   r   
contiguous)r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r7   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r6   c                 h   |j                  |      }|j                  |      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }
}	||z  t        |      |z  z   }|	|z  t        |	      |z  z   }t        j                  ||gd      }t        j                  ||
gd      }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    ru   .Nrz   )	unsqueezer   r   r0   r   )qkr   r   r   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r7   apply_rotary_pos_embr      s    , --
&C
--
&C 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr6   c                   6    e Zd ZdZdedef fdZ	 	 ddej                  de	ej                  ej                  f   de
ej                     de
e   d	e
ej                     d
ee   de	ej                  e
ej                     e
e	ej                        f   fdZ xZS )BambaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr;   	layer_idxc                 d   t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j                  | j                  z  |j
                  |j                        | _        y )Nr   g      Tbias)rC   rD   r;   r   getattrrP   num_attention_headsr   r   r   r   attention_dropout	is_causalr	   Linearattention_biasq_projk_projv_projo_proj)rX   r;   r   r^   s      r7   rD   zBambaAttention.__init__  sM   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r6   r   position_embeddingsr   past_key_valuecache_positionr   r   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  r^| j                  j                  dk(  r(|j                  dd      rt        j                  d	       nt         | j                  j                     } || |	|
||f| j"                  sd
n| j$                  | j&                  d|\  }} |j(                  g |d j+                         }| j-                  |      }||fS )Nru   r   r>   )r   r   r   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   )r   r   r   viewr   r   r   r   updater   r   r;   _attn_implementationrl   loggerwarning_oncer   r   r   r   r   r   r   )rX   r   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r7   r   zBambaAttention.forward#  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r6   r$   )r,   r-   r.   r/   r   r3   rD   r0   Tensorr   r   r   r1   r   r   r   r`   ra   s   @r7   r   r   	  s    G
{ 
s 
8 +/590)||0) #5<<#=>0) !.	0)
 !0) !!1!120) -.0) 
u||Xell3XeELL>Q5RR	S0)r6   r   c                   (     e Zd Zd fd	ZddZ xZS )BambaRMSNormGatedc                     t         |           t        j                  t	        j
                  |            | _        || _        y r   rC   rD   r	   	Parameterr0   onesweightvariance_epsilonrX   rP   epsr^   s      r7   rD   zBambaRMSNormGated.__init__W  s/    ll5::k#:; #r6   c                    |j                   }|j                  t        j                        }|?|t        j
                  j                  |j                  t        j                              z  }|j                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S Nr>   ru   T)keepdim)rA   r   r0   r   r	   r   silupowmeanrsqrtr   r   )rX   r   gateinput_dtypevariances        r7   r   zBambaRMSNormGated.forward\  s    #))%((7)BMM,>,>twwu}}?U,VVM $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r6   gư>r   r,   r-   r.   rD   r   r`   ra   s   @r7   r   r   V  s    $
	;r6   r   input_tensorpad_sizec                     t        | j                        dk(  r
ddddd|ddfnddd|ddf}t        j                  j                  j                  | |dd      S )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)moder   )lenr   r0   r	   r   pad)r   r   	pad_shapes      r7   pad_tensor_by_sizer  k  sf     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UUr6   c                    t        | |      } t        | j                        dk(  r.| j                  | j                  d   d|| j                  d         S | j                  | j                  d   d|| j                  d   | j                  d         S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r   r   ru   r>   )r  r  r   r   )r   r   
chunk_sizes      r7   reshape_into_chunksr	  v  s     &lH=L
<!###L$6$6q$92z<K]K]^_K`aa ##q!2z<3E3Ea3H,J\J\]^J_
 	
r6   c                 "   | j                  d      } | d   j                  g | j                         | } t        j                  t        j                  ||| j
                  t        j                        d      }| j                  | d      } t        j                  | d      }t        j                  t        j                  ||| j
                  t        j                        d      }|j                  | t        j                         }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    ru   .Nr?   diagonalr   r   rz   )
sizer~   r0   trilr   r@   boolmasked_fillcumsuminf)r   r  masktensor_segsums       r7   segment_sumr    s     ""2&J 2<	*11S<3D3D3FS
SL::ejjZ@S@S[`[e[efqstD++TE15LLL26M ::ejjZ@S@S[`[e[efqrsD!--teeiiZ@Mr6   c                     |N|j                   d   dkD  r<|j                   d   dkD  r*| j                  }| |dddddf   z  j                  |      } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )r   rA   r   )r   r   rA   s      r7   apply_mask_to_padding_statesr    sa     !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr6   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  de	e
   de	ej                     de	ej                     d	e	ej                     f
d
Z	 	 	 dde	e
   de	ej                     de	ej                     fdZ	 	 	 	 dde	e
   de	ej                     de	ej                     d	e	ej                     fdZ xZS )
BambaMixeruO  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the HybridCache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    r;   r   c           	         t         |           |j                  | _        |j                  | _        |j
                  | _        |j                  | _        t        |j                  | j                  z        | _        || _        |j                  | _        |j                  | _        t"        |j                     | _        |j&                  | _        |j*                  | _        |j.                  | _        |j2                  | _        |j6                  | _        dt;        d      f| _        d| _        d| _         | j                  d| j0                  z  | j                  z  z   | _!        tE        jF                  | jB                  | jB                  |j                  | j                  | jB                  | j                  dz
        | _$        | j                  | jB                  z   | j                  z   }tE        jJ                  | j                  || j(                        | _&        tE        jN                  tQ        jR                  | j                              | _*        tQ        jV                  d| j                  dz         }tE        jN                  tQ        jX                  |            | _-        d	| jZ                  _.        t_        | j                  | j,                  
      | _0        tE        jN                  tQ        jR                  | j                              | _1        d	| jb                  _.        tE        jJ                  | j                  | j                  | j(                        | _2        tf        sth        jk                  d       y th        jk                  d       y )Nr   r  gMbP?g?r>   r   )in_channelsout_channelsr   kernel_sizegroupspaddingr   Tr   a  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)6rC   rD   rR   	num_headsrP   rH   r[   rG   rZ   r3   rO   intermediate_sizer   mamba_conv_biasuse_conv_bias
hidden_act
activationr
   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonrQ   n_groupsrS   r   mamba_chunk_sizer  r}   time_step_limittime_step_mintime_step_maxconv_dimr	   Conv1dconv1dr   in_projr   r0   r   dt_biasarangelogA_log_no_weight_decayr   normDout_projis_fast_path_availabler   r   )rX   r;   r   projection_sizeAr^   s        r7   rD   zBambaMixer.__init__  s   --!--$22 & 3 3!$V%8%84;K;K%K!L"#33 ++&++,.."("5"5--++ 11 !$U5\2" ..T]]1BTEXEX1XXii''--==))A-
 004==@4>>Qyy
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&*

#%d&<&<$BYBYZ	ejj89"&		$"8"8$:J:JQUQ^Q^_%>  fgr6   r   cache_paramsr   r   r+   c                 P   t        ||      }| j                  |      }|j                  \  }}}	| j                  | j                  z  }
|d uxr} |j
                  xro |dk(  xrh |j                  | j                     j                  d   |j                  | j                     j                  d   cxk(  xr |k(  nc xr |d uxr |d   dkD  }|r|j                  d      j                  | j                  | j                  | j                  gd      \  }}}t        ||j                  | j                     | j                  j                   j                  d      | j                  j"                  | j$                        }t'        j                  || j                  |
|
gd      \  }}}t'        j(                  | j*                  j-                                }|d d d df   d d d d d f   j/                  d| j0                  | j                        j3                  t&        j4                        }|d d d d d f   j/                  dd| j0                        }| j6                  d d d df   j/                  d| j0                        }| j8                  d d d df   j/                  d| j0                        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  | j0                        }t=        |j                  | j                     ||||||d |d
      }|j;                  || j                  | j0                  z        }| j?                  ||      }| jA                  |      d d d df   }|S t'        j(                  | j*                  j-                                }| jB                  d	t-        d
      fk(  ri nd| jB                  i}| jD                  r|tG        || j                  j                   j                  d      | j                  j"                  | j6                  |f| j8                  | jH                  || j$                  | j>                  j                   | j>                  jJ                  | j@                  j                   | j@                  j"                  | j0                  | j                  ddd|}|S |j                  | j                  | j                  | j                  gd      \  }}}|v|jM                  dd      }tN        jP                  jS                  || jT                  |j                  d   z
  df      }|j                  | j                     jW                  |       | j$                  dvrH| jY                  | j                  |jM                  dd            dd |f   jM                  dd            }nqt[        |jM                  dd      | j                  j                   j                  d      | j                  j"                  | j$                  |      jM                  dd      }t        ||      }t'        j                  || j                  |
|
gd      \  }}}t]        |j;                  ||d| j0                        |||j;                  ||| j                  d      |j;                  ||| j                  d      f| jH                  | j8                  d |d| j6                  dd|\  }}|*|(|j                  | j                     jW                  |       |j;                  ||d      }| j?                  ||      }| jA                  |      }|S )Nr   r   ru   rz   .r|   T)zr6  dt_softplusr   r  dt_limitF)r<  r  r+   r'  rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr>   )r   swish)r   r   r   r'  r+   )r  r<  rC  r+   rM  r6  rD  )/r  r5  r   r-  r[   rF   rI   r   rJ   squeezesplitr#  r2  r"  r#   r4  r   r   r'  r0   expr9  r}   r~   r   r   r   r6  r<  r   r   r;  r=  r/  r   r!   r  r   r   r	   r   r  rZ   copy_r(  r"   r    )rX   r   rA  r   r   r+   projected_statesrY   seq_lenr]   groups_time_state_sizeuse_precomputed_statesr   hidden_states_B_CdtBCr@  r6  r<  hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrI   scan_output	ssm_states                              r7   cuda_kernels_forwardzBambaMixer.cuda_kernels_forward  s    5]NS<<6 "/!4!4
GQ!%1D1D!D $ &//&1& ((8>>qA&&t~~6<<Q?& d*& q!A% 	 "*:*B*B1*E*K*K''GR +L +'D#R
 !5!((8""**1-  ! #(++!'')?AWX#M1a 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az4==!''!*2MNAz4==!''!*2MNA%2%7%7
DNNTXTaTa%b"2''7& M *..z4>>DMM;YZM IImT:M --.q$|<C| 
w 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff####'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%l 
A /?.D.D++T]]DNNKQS /E /+'  + 4E3N3NqRS3T0"$--"3"34..1M1S1STV1WWYZ[#K !,,T^^<BB;O??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'?? ')  i1o & %AARTb$c!&+kk%++-CE[\'#q! *C!&&z7BNFF:wrBFF:wrB*  $ff#(, LL $* &*&Y" (\-E ++DNN;AA)L)..z7BG"iiT: mmK0
r6   c                 *   |j                   \  }}}|j                  }t        ||      }| j                  |      }	|	j	                  | j
                  | j                  | j                  gd      \  }
}}|d uxr} |j                  xro |dk(  xrh |j                  | j                     j                   d   |j                  | j                     j                   d   cxk(  xr |k(  nc xr |d uxr |d   dkD  }|rY|j                  | j                     j                  dd      |j                  | j                  <   |d d dd d f   j                  |j                  | j                     j                        |j                  | j                     d d d d df<   |j                  | j                     j                  | j                  j                   j                        }t#        j$                  || j                  j                   j'                  d      z  d      }| j(                  r|| j                  j*                  z   }| j-                  |      }n|v|j/                  dd      }t0        j2                  j5                  || j6                  |j                   d   z
  df      }|j                  | j                     j9                  |       | j-                  | j                  |j/                  dd            dd |f   j/                  dd            }t        ||      }t#        j                  || j
                  | j:                  | j<                  z  | j:                  | j<                  z  gd      \  }}}t#        j>                  | j@                  jC                                }|r|j                  | j                     j                  }|d d dd d f   d d d df   }|j/                  dd      jE                  ||j                   d   | jF                        }| jH                  d	   jE                  | jH                  j                   d   | jF                        }t"        j0                  j2                  jK                  ||j                  |j                        z         }t#        jL                  || jN                  d   | jN                  d         }|d
   jE                  | j                  | jF                  | j<                        j                  t"        jP                        }t#        j>                  |d	   |z        j                  |      }|jS                  || j:                  d      dd d d f   }|jE                  || j:                  | j                  | j:                  z  |j                   d         jU                         }|jS                  |d|j                   d         }|d	   |dd d d f   z  }|jS                  |d| jF                        }||d	   z  j                  |      }|j                  | j                     j9                  |j                  | j                     |z  |z          |jS                  || j:                  d      dd d d f   }|jE                  || j:                  | j                  | j:                  z  |j                   d         jU                         }|jS                  |d|j                   d         }|j                  | j                     j                  |j                  |j                        }|jW                  || j                  z  | jF                  | j<                        }|jW                  || j                  z  | j<                  d      }t#        jX                  ||      }|jW                  || j                  | jF                        }| jZ                  d	   jE                  | jZ                  j                   d   | jF                        }|||z  z   j                  |j                        }|jS                  |d      d d d df   }n
t0        j2                  jK                  || jH                  z         }t#        jL                  || jN                  d   | jN                  d         }|jS                  ||d| jF                        jC                         }|jS                  ||d| j<                        jC                         }|jS                  ||d| j<                        jC                         }|j]                  | j                  | j:                  z  d| j                        }|j]                  | j                  | j:                  z  d| j                        }| j^                  || j^                  z  z
  | j^                  z  }| jZ                  d	   ta        ||      z  }||d	   z  }|j                  |j                        |z  }||||fD  cg c]  } tc        | || j^                         c} \  }}}}|je                  dddd      }t#        jf                  |d      }!t#        j>                  ti        |            }"|d d d d d d d d d d d f   |d d d d d d d d d d d f   z  }#|#j%                  d      }$|$d	   |"je                  ddddd      d	   z  }%|%j%                  d      }&|&d	   |d d d d d f   z  j%                  d      }'t#        j>                  |!d d d d d d dd f   |!z
        }(||(je                  dddd      d	   z  })|)dd d d f   |d	   z  j%                  d      }*|r<|j                  | j                     d d d df   j                  |*j                        }+nt#        jj                  |*d d d df         }+t#        jl                  |+|*gd      }*t#        j>                  ti        t0        j2                  j5                  |!d d d d d d df   d                  },|,j/                  dd      },|,d
   |*d d d d d df   z  j%                  d      }-|-d d d df   |-d d df   }.}*t#        j>                  |!      }/|dd d d f   |*d d d d d df   z  }0|/je                  dddd      }1|0j%                  d      |1d	   z  }2|'|2z   }|jS                  |d| j                  | jF                        }||z   }|dkD  r|d d d |d d d d f   }|jS                  ||d      }|.1|/|j                  | j                     j9                  |.       d|_        | jo                  ||
      }3| jq                  |3j                  |            }4|4S c c} w )Nru   rz   r   r   )shiftsdimsrB   r>   .r  ).NNr|   r?   )r{   output_sizer   r   r   )r   r   T)9r   rA   r  r5  rP  r#  r2  r"  rF   rI   r   rJ   rollr   r@   r4  r   r0   sumrO  r%  r   r(  r   r	   r   r  rZ   rR  r-  r[   rQ  r9  r}   r~   r   r6  softplusclampr/  r   r   r   r   bmmr<  repeat_interleaver  r  r	  permuter  r  
zeros_liker   r;  r=  )5rX   input_statesrA  r   r   rY   rT  r]   rA   rS  r   rW  rX  rV  rI   r^  r   rY  rZ  r@  cache_devicer6  dAdBdBxrJ   ssm_states_reshaped
C_reshapedyr<  r   
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr`  state_decay_outC_times_statesstate_decay_out_permutedY_offr_  contextualized_statess5                                                        r7   torch_forwardzBambaMixer.torch_forward  s]    ".!3!3
GQ"" 4L.Q<<5&6&<&<''GR '= '
#
 $ &//&1& ((8>>qA&&t~~6<<Q?& d*& q!A% 	 "7C7O7OPTP^P^7_7d7dlnuw7d7xL$$T^^4ARSTVWYZSZA[A^A^_k_w_wx|  yG  yG  `H  `O  `O  BPL$$T^^4Q2X> '224>>BEET[[M_M_MfMfEgK %		dkk0088;;! !!$58H8H$H! $): ; '/@/J/J1a/P, mm//043H3HKgKmKmnpKq3qst2u ((8>>{K $5F5P5PQRTU5V)WX[]e^e]eXe)f)p)pqrtu)v w89JN[#kk##T]]T5H5H%H$--Z^ZmZmJmn
q! YYtzz'')**!'224>>BIIL Aq!GQc\*Ba#**:rxx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC ##DNN399''7"<sB 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CC188[\[b[bCcJ",//*t~~2Mt}}^b^q^q"r
T^^ ;T=P=PRSTJ		-z:Az4>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''T\\(9:BR!5!5a!8$:N:Nq:QRB)11*gr4==Y__aM		*gr43F3FGMMOA		*gr43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'DOO*CCtVH	*-?x-XXJ *ByM9M](()B.A cpqrtuwxay%z\]&9!Xt&W%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99hq!Q|&<x&GIL,..q"b!<YGGGc4l+mI.FFKKPQKRF &"."9"9$.."I!TSV,"W"Z"Zbhbobo"Z"p"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*gr2A $)A''7==iH26/ii4(
 !%knnU.C D$$I &{s   vc                 r   t         rAd| j                  j                  j                  j                  v r| j                  |||||      S |t        d      |j                  }|B|j                  d   dkD  r0|j                  d   dkD  r||d d d d d f   z  j                  |      }| j                  ||||      S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r   r   )r>  r5  r   r@   rg   ra  NotImplementedErrorrA   r   r   r  )rX   r   rA  r   r   r+   r   rA   s           r7   r   zBambaMixer.forwardx  s     "f0C0C0J0J0O0O&O,,]L.Zhjqrr%n  ##%.*>*>q*AA*E.J^J^_`JadeJe*^Aq$J-GGKKERM!!-~~^^r6   )NNNN)NNN)r,   r-   r.   r/   r   r3   rD   r0   r   r   r:   r1   r4   ra  r  r   r`   ra   s   @r7   r  r    sI   Ah{ Ahs AhL DH5915-1g||g ?@g !!1!12	g
 !.g %//*gZ DH5915M% ?@M% !!1!12	M%
 !.M%f DH5915-1_ ?@_ !!1!12	_
 !._ %//*_r6   r  c                   $     e Zd Z fdZd Z xZS )BambaMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y )Nr   )rC   rD   r;   rP   r#  r	   r   mlp_bias	gate_projup_proj	down_projr
   r&  act_fnrX   r;   r^   s     r7   rD   zBambaMLP.__init__  s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r6   c                     | j                  | j                  | j                  |            | j                  |      z        }|S r   )r  r  r  r  )rX   r   r  s      r7   r   zBambaMLP.forward  s6    NN4;;t~~a/@#ADLLQRO#ST	r6   r   ra   s   @r7   r  r    s    0r6   r  RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )BambaRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        BambaRMSNorm is equivalent to T5LayerNorm
        Nr   r   s      r7   rD   zBambaRMSNorm.__init__  s1     	ll5::k#:; #r6   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S r   )	rA   r   r0   r   r   r   r   r   r   )rX   r   r   r   s       r7   r   zBambaRMSNorm.forward  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r6   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler   r   r   rX   s    r7   
extra_reprzBambaRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr6   r   )r,   r-   r.   rD   r   r  r`   ra   s   @r7   r  r    s    $;Jr6   r  c                   v    e Zd Zddededef fdZ	 	 	 	 	 	 	 ddej                  de	ej                     de	ej                     de	e   d	e	e   d
e	e   de	ej                     de	eej                  ej                  f      dee   deej                   e	eej                   ej                   f      f   fdZ xZS )BambaDecoderLayerr;   r   
layer_typec                 r   t         |           d}|dk(  rt        nd } ||      | _        t	        |j
                  |j                        | _        t	        |j
                  |j                        | _        || _	        |dk(  rt        ||      | _        y |dk(  rt        ||      | _        y t        d      )Nr   r!  r=   )r;   r   	attentionzInvalid layer_type)rC   rD   r  feed_forwardr  rP   r+  input_layernormpre_ff_layernormr  r  r=   r   	self_attn
ValueError)rX   r;   r   r  num_expertsffn_layer_classr^   s         r7   rD   zBambaDecoderLayer.__init__  s    &1Q&6(D+F3+F,>,>FDWDWX ,V-?-?VEXEX Y$ #6YGDJ;&+FI>DN122r6   r   r   r   r   r   	use_cacher   r   r   r   c	                 J   |}
| j                  |      }| j                  dk(  r | j                  d||||d|	}d}n-| j                  dk(  r | j                  d||||||||d|	\  }}|
|z   }|}
| j	                  |      }| j                  |      }|
|z   }|f}|r|fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        r=   )r   rA  r   r   Nr  )r   r   r   r   r   r  r   r   r5   )r  r  r=   r  r  r  )rX   r   r   r   r   r   r  r   r   r   residualself_attn_weightsoutputss                r7   r   zBambaDecoderLayer.forward  s    D !,,]; ??g%&DJJ ++--	
 M !%__+/=t~~ 
0+-)-"3#-$7
0 
0,M, !=0 !--m<))-8 =0 ")++Gr6   )r=   )NNNFFNN)r,   r-   r.   r   r3   r   rD   r0   r   r   r1   r:   r  r   r   r&   FloatTensorr   r`   ra   s   @r7   r  r    s   3{ 3s 3 3( 2637EI,1$)59KOK||K !.K u//0	K
 !!ABK $D>K D>K !!1!12K &eELL%,,,F&GHK 23K 
u  (51B1BEDUDU1U+V"WW	XKr6   r  c                   8    e Zd ZeZdZdZdgZdZdZ	dZ
dZdZd Zy)BambaPreTrainedModelmodelTr  past_key_valuesc                    | j                   j                  }t        |t        j                  t        j
                  f      rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        t        f      r&|j                  j                  j                  d       y t        |t        j                        rf|j                  j                  j                  d|       |j                  2|j                  j                  |j                     j                          y y t        |t               r|j"                  j                  j                  d       t%        j&                  t%        j(                  d|j*                  dz               |j,                  _        |j.                  j                  j                  d       y y )Nr   )r   stdg      ?r   )r;   initializer_ranger   r	   r   r3  r   datanormal_r   zero_r   r  fill_	Embeddingpadding_idxr  r6  r0   r8  r7  r"  r9  r<  )rX   r   r  s      r7   _init_weightsz"BambaPreTrainedModel._init_weights   sW   kk++fryy"))45MM&&CS&9{{&  &&( '!2L ABMM$$S)-MM&&CS&9!!-""6#5#56<<> .
+NN%%c* %		%,,q&:J:JQ:N*O PFLLHHMM$ ,r6   N)r,   r-   r.   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_cache_class_is_statefulr  r5   r6   r7   r  r    s=    L&*#,-"3!N L%r6   r  c                       e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     de	e   d	e	e
j                     d
e	e   de	e   de	e   de	e
j                     dee   defd              Zde
j                  de
j                  de
j                  dedef
dZede
j                  dedede
j.                  de
j                  defd       Zd Z xZS )
BambaModelr;   c           	      Z   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        g }t        |j                        D ],  }|j                  t        |||j                  |                . t        j                  |      | _        |j                   | _        t#        |j                  |j$                        | _        t)        |      | _        d| _        | j/                          y )N)r   r  r!  )r;   F)rC   rD   pad_token_idr  
vocab_sizer	   r  rP   embed_tokensrL   rM   rU   r  rE   
ModuleListlayersr   r  r+  final_layernormrc   
rotary_embgradient_checkpointing	post_init)rX   r;   decoder_layersr\   r^   s       r7   rD   zBambaModel.__init__4  s     !.. ++LL):):F<N<NPTP`P`av//0 	rA!!"3FaTZTlTlmnTo"pq	rmmN3$*$?$?!+F,>,>FDWDWX.f=&+#r6   c                     | j                   S r   r  r  s    r7   get_input_embeddingszBambaModel.get_input_embeddingsG  s       r6   c                     || _         y r   r  rX   r   s     r7   set_input_embeddingszBambaModel.set_input_embeddingsJ  s
    !r6   	input_idsr   r   r  inputs_embedsr  r   output_hidden_statesr   r   r   c
                 N   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|}|r|t        j                  d       |	.t        j                  |j                  d   |j                        }	||	j                  d      }| j                  |||	||      }| j!                  ||	      }| j#                  ||      }|rdnd }|rdnd }| j$                  D ]  }|j&                  d	k(  r|n|}|r||fz  }| j
                  r:| j                  r.| j)                  t+        |j,                  fi |
|||||||	|	      }n ||f||||||	|d
|
}|d   }|s}|d   ||d   fz  } | j/                  |      }|r||fz  }|r|j0                  sd|_        |sd n|}t3        ||||      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r   rB   r   r5   r=   )r   r   r   r   r  r   r   T)last_hidden_stater  r   
attentions)r;   r   r  r  r  r  r   r   r   r  r0   r7  r   r@   r   _update_causal_mask_update_mamba_maskr  r  r  _gradient_checkpointing_funcr   __call__r  rF   r   )rX   r  r   r   r  r  r  r   r  r   r   r   r   
mamba_maskr   all_hidden_statesall_self_attnsdecoder_layer
layer_masklayer_outputs
next_caches                        r7   r   zBambaModel.forwardM  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%0:
 !"\\-*=*=a*@I]I]^N)33A6L..M>?L]
 ,,^^L
 #oom\J"6BD0d![[ %	:M'4'?'?7'JP[J#!m%55!**t}} $ A AM22=f=! #%"'
! !.!
!#-!-#2&7'#1(;
! 
! *!,M  #/"}Q'7&99NK%	:N ,,];  -!11?#E#E15O.!*T
&+&+%	
 	
r6   r   c           	         | j                   j                  dk(  r	|d|v r|S y ||j                         nd}| j                   j                  dk(  r&|s$t        j                  |||| j
                        ry |j                  }|j                  d   }t        |t        j                        r|j                  d   n||z   dz   }	| j                  |||	|||j                  d         }
| j                   j                  dk(  rQ|O|j                  j                  d	v r7|s5t        j                  |      j                  }t        j                   |
|      }
|
S )
Nflash_attention_2r   r   r   )r  past_key_values_lengthis_trainingr   ru   )sequence_lengthtarget_lengthrA   r   rY   )r  xpunpu)r;   r   get_seq_lengthr   _ignore_causal_mask_sdpar   rA   r   r   r0   r   5_prepare_4d_causal_attention_mask_with_cache_positionr@   rg   finfomin_unmask_unattended)rX   r   r   r   r  r   past_seen_tokensrA   r  r  r   	min_dtypes               r7   r  zBambaModel._update_causal_mask  se    ;;++/BB)c^.C%%
 @O?Z?99;`a ;;++v5>O%>>*'7 MM	 ""&,,Q/ .%,,7   $!O3a7 	 PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr6   r  r  rA   rY   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	| ddddddf   | ddddddf   k(  dddd| dddf   j                  |      }
|ddddddd|	f   |
z   }|dk(  }|ddddddd|	f   j                  ||      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr   )
fill_valuerA   r@   r   r  rB   ru   r   )r{   r0   r  r  fullr@   triur7  r   r~   cloner   r   r  )r   r  r  rA   r   rY   r   r   r  mask_lengthpadding_attention_maskpadding_masks               r7   r  z@BambaModel._prepare_4d_causal_attention_mask_with_cache_position  s   < %.*<*<*>!*C(K, ) E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*8D$9I*Jn]^`dfgim]mNn*nq?*+Q.*"U) '  +1aL[L+@ADZZ+q05@Aq,;,AV5W5c5c )6Aq!\k\12 r6   c                 R    |}|d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r0   all)rX   r   r   r  s       r7   r  zBambaModel._update_mamba_mask2  s7     $
!q ^%?EIIn`aNaDbJr6   )	NNNNNNNNN)r,   r-   r.   r   rD   r  r  r   r   r   r0   r1   r   r:   r  r  r   r&   r   r   r  staticmethodr3   rA   r  r  r`   ra   s   @r7   r  r  2  s   { &!"  151537FJ59$(,0/359m
E,,-m
 !.m
 u//0	m

 ""BCm
   1 12m
 D>m
 $D>m
 'tnm
 !!1!12m
 23m
 
!m
  m
^:: ll: 	:
 ::  :x 555 5 {{	5
 5 5 5n	r6   r  c                       e Zd ZdgZddiZddgdgfiZ fdZd Zd Zd	 Z	d
 Z
d Zd Zee	 	 	 	 	 	 	 	 	 	 	 ddeej"                     deej$                     deej"                     dee   deej(                     deej"                     dee   dee   dee   deej"                     deeej$                  f   defd              Z	 	 	 	 	 	 ddZ xZS )BambaForCausalLMzlm_head.weightlm_headcolwise_repr   logitsc                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y )NFr   )
rC   rD   r  r  r  r	   r   rP   r  r  r  s     r7   rD   zBambaForCausalLM.__init__D  sU     '
 ++yy!3!3V5F5FUS 	r6   c                 .    | j                   j                  S r   r  r  r  s    r7   r  z%BambaForCausalLM.get_input_embeddingsM  s    zz&&&r6   c                 &    || j                   _        y r   r  r  s     r7   r  z%BambaForCausalLM.set_input_embeddingsP  s    "'

r6   c                     | j                   S r   r  r  s    r7   get_output_embeddingsz&BambaForCausalLM.get_output_embeddingsS  s    ||r6   c                     || _         y r   r  )rX   new_embeddingss     r7   set_output_embeddingsz&BambaForCausalLM.set_output_embeddingsV  s	    %r6   c                     || _         y r   r  )rX   decoders     r7   set_decoderzBambaForCausalLM.set_decoderY  s	    
r6   c                     | j                   S r   r  r  s    r7   get_decoderzBambaForCausalLM.get_decoder\  s    zzr6   r  r   r   r  r  labelsr  r   r  r   logits_to_keepr   c                    ||n| j                   j                  }|	|	n| j                   j                  }	 | j                  d||||||||	|
d	|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|* | j                  d||| j                   j                  d|}t        |||j                  |j                  |j                        S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BambaForCausalLM

        >>> model = BambaForCausalLM.from_pretrained("...")
        >>> tokenizer = AutoTokenizer.from_pretrained("...")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r  r   r   r  r  r  r   r  r   )r  r  r  )lossr  r  r   r  r5   )r;   r   r  r  r  r   r3   slicer  loss_functionr  r   r  r   r  )rX   r  r   r   r  r  r  r  r   r  r   r  r   r  r   slice_indicesr  r!  s                     r7   r   zBambaForCausalLM.forward_  s   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,64:: ,
)%+'/!5),
 ,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
r6   c           	      t   |d u }	|	sZ||d   |j                   d   k\  r|d d |j                   d    d f   }nc|j                   d   |j                   d   k7  rD|d d |f   }n:t        | j                  |j                   d   | j                  | j                        }|T|R|j                         j                  d      dz
  }|j                  |dk(  d       |	s|d d |j                   d    d f   }||	rd|i}
nd|j                         i}
|
j                  ||||| j                  j                  |d       |
S )Nru   r   r   rB   r  r  )r   r  r  r   r  r   )r   r:   r;   rA   r@   longr  masked_fill_r   r   num_logits_to_keep)rX   r  r  r   r  r   r   r  r   empty_past_kvmodel_inputss              r7   prepare_inputs_for_generationz.BambaForCausalLM.prepare_inputs_for_generation  sa    (4/ )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	>Y__Q/DKKO %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0"&++"@"@"0		
 r6   )NNNNNNNNNNr   )NNNNNT)r,   r-   r.   _tied_weights_keys_tp_plan_pp_planrD   r  r  r  r  r  r  r   r   r   r0   r1   r   r:   r  r  r   r3   r   r   r+  r`   ra   s   @r7   r
  r
  >  s   *+=)H_-z:;H'(&  151537FJ59-1$(,0/35934G
E,,-G
 !.G
 u//0	G

 ""BCG
   1 12G
 ))*G
 D>G
 $D>G
 'tnG
 !!1!12G
 c5<</0G
 
 G
  G
X 8r6   r
  )r  r
  r  )r   )Nr   )T	functoolsr   typingr   r   r   r   r   r0   r	   (transformers.models.jamba.modeling_jambamodelsjambamodeling_jambatransformers.activationsr
   cache_utilsr   
generationr   integrationsr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.import_utilsr   r   configuration_bambar   +mamba_ssm.ops.triton.selective_state_updater   !mamba_ssm.ops.triton.ssd_combinedr    r!   causal_conv1dr"   r#   
get_loggerr,   r   r&   r:   Modulerc   r   r   r3   r   r}   r   r   r   r   r  r	  r  r  r>  r  r  r  r  r  r  r  r
  __all__r5   r6   r7   <module>rH     s~  6  > >   A A +   ) 7 > B O K F & > > V , Rmm!DD-7** 
		H	%	 43u~'V'V 3ul<299 <D(	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % %6%PJ)RYY J)Z; ;*VU\\ VS V
(( 46FH\]^ __ __Dryy   Y'J299 J (J(]		 ]@ %? % %: H% H HV c+_ c cL Er6   