
    Uh                        d Z ddlmZ ddlmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmc mc mZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ dd	lmZmZmZmZ d
dlmZ d
dl m!Z!m"Z" d
dl#m$Z$ d
dl%m&Z& d
dl'm(Z(m)Z)m*Z* d
dl+m,Z,m-Z-m.Z. ddl/m0Z0  e-       r	  e.       rddl1m2Z2 ddl3m4Z4m5Z5 ndZ2 e,       r	ddl6m7Z7m8Z8 nd\  Z8Z7 e9e2e7e8f      Z: e*jv                  e<      Z= G d ded      Z> G d dej~                        Z? G d de      Z@d4dZA G d  d!e      ZB G d" d#e      ZCd$ ZD G d% d&e
j                        ZF G d' d(e      ZG G d) d*e      ZH G d+ d,e      ZIe( G d- d.e$             ZJe( G d/ d0eJ             ZK G d1 d2e      ZLg d3ZMy)5zPyTorch Bamba model.    )partial)OptionalTuple	TypedDictUnionN)nn)ACT2FN)JambaAttentionDecoderLayer)LlamaAttentionLlamaForCausalLMLlamaMLPLlamaRMSNormLlamaRotaryEmbeddingrotate_half)MambaRMSNormGatedpad_tensor_by_sizereshape_into_chunkssegment_sum   )AttentionMaskConverter)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging)is_causal_conv1d_availableis_flash_attn_2_availableis_mamba_2_ssm_available   )BambaConfig)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combined)causal_conv1d_fncausal_conv1d_update)NNc                       e Zd ZU dZej
                  ed<   ej
                  ed<   eed<   eed<   ej                  ed<   y)BambaFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor):
            Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor     y/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/bamba/modular_bamba.pyr)   r)   P   s7    " ######__r9   r)   F)totalc                   B     e Zd ZdZej
                  dfdef fdZ xZS ) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    Nconfigc                 R   t         	|   ||||       |j                  | _        d| _        |j                  }|j
                  }g | _        g | _        g | _        t        |j                        D ]*  }| j                  |   dk(  r| xj                  t        j                  ||j                  |j                  z  d|j                  z  |z  z   |||      gz  c_        | xj                  t        j                  ||j                   |j"                  |||      gz  c_        | xj                  t        j$                  g g|z  |      gz  c_        | xj                  t        j$                  g g|z  |      gz  c_        | j                  j'                  |       - t        |j                        D cg c]  }t        j$                  g g|z  |       c}| _        t        |j                        D cg c]  }t        j$                  g g|z  |       c}| _        y c c}w c c}w )NFmamba   devicedtyperC   )super__init__layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr3   zerosmamba_expandhidden_sizemamba_n_groupsmamba_n_headsmamba_d_headtensorappend	key_cachevalue_cache)
selfr>   
batch_sizerD   rC   conv_kernel_sizessm_state_sizei_	__class__s
            r:   rG   z)HybridMambaAttentionDynamicCache.__init__x   s   UF;!'!9!9"'!..--"$v//0 	2A%%a(G3  KK",,v/A/AAAH]H]D]`nDnn(%#%   KK",,++&%#	$ 	   U\\2$2CF%S$TT ELL"
1B6$R#SS''..q11	24 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts   3"H4"H$)	r/   r0   r1   r2   r3   float16r"   rG   __classcell__ra   s   @r:   r=   r=   j   s)     ?DmmTX %u{ %u %ur9   r=   c                       e Zd Zy)BambaRotaryEmbeddingNr/   r0   r1   r8   r9   r:   rf   rf          r9   rf   c                 h   |j                  |      }|j                  |      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }
}	||z  t        |      |z  z   }|	|z  t        |	      |z  z   }t        j                  ||gd      }t        j                  ||
gd      }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Ndim)	unsqueezeshaper   r3   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r:   apply_rotary_pos_embr}      s    , --
&C
--
&C 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr9   c                       e Zd Zy)BambaAttentionNrg   r8   r9   r:   r   r      rh   r9   r   c                       e Zd Zy)BambaRMSNormGatedNrg   r8   r9   r:   r   r      rh   r9   r   c                     |N|j                   d   dkD  r<|j                   d   dkD  r*| j                  }| |dddddf   z  j                  |      } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr!   r   )rn   rD   to)hidden_statesattention_maskrD   s      r:   apply_mask_to_padding_statesr      sa     !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr9   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  de	e
   de	ej                     de	ej                     d	e	ej                     f
d
Z	 	 	 dde	e
   de	ej                     de	ej                     fdZ	 	 	 	 dde	e
   de	ej                     de	ej                     d	e	ej                     fdZ xZS )
BambaMixeruO  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the HybridCache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    r>   	layer_idxc           	         t         |           |j                  | _        |j                  | _        |j
                  | _        |j                  | _        t        |j                  | j                  z        | _        || _        |j                  | _        |j                  | _        t"        |j                     | _        |j&                  | _        |j*                  | _        |j.                  | _        |j2                  | _        |j6                  | _        dt;        d      f| _        d| _        d| _         | j                  d| j0                  z  | j                  z  z   | _!        tE        jF                  | jB                  | jB                  |j                  | j                  | jB                  | j                  dz
        | _$        | j                  | jB                  z   | j                  z   }tE        jJ                  | j                  || j(                        | _&        tE        jN                  tQ        jR                  | j                              | _*        tQ        jV                  d| j                  dz         }tE        jN                  tQ        jX                  |            | _-        d	| jZ                  _.        t_        | j                  | j,                  
      | _0        tE        jN                  tQ        jR                  | j                              | _1        d	| jb                  _.        tE        jJ                  | j                  | j                  | j(                        | _2        tf        sth        jk                  d       y th        jk                  d       y )N        infgMbP?g?rA   r!   )in_channelsout_channelsbiaskernel_sizegroupspadding)r   Tepsa  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)6rF   rG   rU   	num_headsrS   rK   r^   rJ   r]   r6   rR   intermediate_sizer   mamba_conv_biasuse_conv_bias
hidden_act
activationr	   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonrT   n_groupsrV   head_dimmamba_chunk_size
chunk_sizefloattime_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dLinearin_proj	Parameterr3   onesdt_biasarangelogA_log_no_weight_decayr   normDout_projis_fast_path_availableloggerwarning_once)r[   r>   r   projection_sizeAra   s        r:   rG   zBambaMixer.__init__   s   --!--$22 & 3 3!$V%8%84;K;K%K!L"#33 ++&++,.."("5"5--++ 11 !$U5\2" ..T]]1BTEXEX1XXii''--==))A-
 004==@4>>Qyy
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&*

#%d&<&<$BYBYZ	ejj89"&		$"8"8$:J:JQUQ^Q^_%>  fgr9   r   cache_paramscache_positionr   r.   c                 P   t        ||      }| j                  |      }|j                  \  }}}	| j                  | j                  z  }
|d uxr} |j
                  xro |dk(  xrh |j                  | j                     j                  d   |j                  | j                     j                  d   cxk(  xr |k(  nc xr |d uxr |d   dkD  }|r|j                  d      j                  | j                  | j                  | j                  gd      \  }}}t        ||j                  | j                     | j                  j                   j                  d      | j                  j"                  | j$                        }t'        j                  || j                  |
|
gd      \  }}}t'        j(                  | j*                  j-                                }|d d d df   d d d d d f   j/                  d| j0                  | j                        j3                  t&        j4                        }|d d d d d f   j/                  dd| j0                        }| j6                  d d d df   j/                  d| j0                        }| j8                  d d d df   j/                  d| j0                        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  | j0                        }t=        |j                  | j                     ||||||d |d
      }|j;                  || j                  | j0                  z        }| j?                  ||      }| jA                  |      d d d df   }|S t'        j(                  | j*                  j-                                }| jB                  d	t-        d
      fk(  ri nd| jB                  i}| jD                  r|tG        || j                  j                   j                  d      | j                  j"                  | j6                  |f| j8                  | jH                  || j$                  | j>                  j                   | j>                  jJ                  | j@                  j                   | j@                  j"                  | j0                  | j                  ddd|}|S |j                  | j                  | j                  | j                  gd      \  }}}|v|jM                  dd      }tN        jP                  jS                  || jT                  |j                  d   z
  df      }|j                  | j                     jW                  |       | j$                  dvrH| jY                  | j                  |jM                  dd            dd |f   jM                  dd            }nqt[        |jM                  dd      | j                  j                   j                  d      | j                  j"                  | j$                  |      jM                  dd      }t        ||      }t'        j                  || j                  |
|
gd      \  }}}t]        |j;                  ||d| j0                        |||j;                  ||| j                  d      |j;                  ||| j                  d      f| jH                  | j8                  d |d| j6                  dd|\  }}|*|(|j                  | j                     jW                  |       |j;                  ||d      }| j?                  ||      }| jA                  |      }|S )Nr!   r   rj   rk   .rD   T)zr   dt_softplusr   r   dt_limitF)r   r   r.   r   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesrA   )siluswish)xweightr   r   r.   )r   r   r   r.   r   r   r   )/r   r   rn   r   r^   rI   rL   r   rM   squeezesplitr   r   r   r'   r   r   r   r   r3   expr   r   expandr   r   float32r   r   viewr#   r   r   r   trainingr%   r   variance_epsilon	transposer   
functionalpadr]   copy_r   r&   r$   )r[   r   r   r   r   r.   projected_statesr\   seq_lenr`   groups_time_state_sizeuse_precomputed_statesgatehidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrL   scan_output	ssm_states                              r:   cuda_kernels_forwardzBambaMixer.cuda_kernels_forward2  s    5]NS<<6 "/!4!4
GQ!%1D1D!D $ &//&1& ((8>>qA&&t~~6<<Q?& d*& q!A% 	 "*:*B*B1*E*K*K''GR +L +'D#R
 !5!((8""**1-  ! #(++!'')?AWX#M1a 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az4==!''!*2MNAz4==!''!*2MNA%2%7%7
DNNTXTaTa%b"2''7& M *..z4>>DMM;YZM IImT:M --.q$|<C| 
w 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff####'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%l 
A /?.D.D++T]]DNNKQS /E /+'  + 4E3N3NqRS3T0"$--"3"34..1M1S1STV1WWYZ[#K !,,T^^<BB;O??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'?? ')  i1o & %AARTb$c!&+kk%++-CE[\'#q! *C!&&z7BNFF:wrBFF:wrB*  $ff#(, LL $* &*&Y" (\-E ++DNN;AA)L)..z7BG"iiT: mmK0
r9   c                 *   |j                   \  }}}|j                  }t        ||      }| j                  |      }	|	j	                  | j
                  | j                  | j                  gd      \  }
}}|d uxr} |j                  xro |dk(  xrh |j                  | j                     j                   d   |j                  | j                     j                   d   cxk(  xr |k(  nc xr |d uxr |d   dkD  }|rY|j                  | j                     j                  dd      |j                  | j                  <   |d d dd d f   j                  |j                  | j                     j                        |j                  | j                     d d d d df<   |j                  | j                     j                  | j                  j                   j                        }t#        j$                  || j                  j                   j'                  d      z  d      }| j(                  r|| j                  j*                  z   }| j-                  |      }n|v|j/                  dd      }t0        j2                  j5                  || j6                  |j                   d   z
  df      }|j                  | j                     j9                  |       | j-                  | j                  |j/                  dd            dd |f   j/                  dd            }t        ||      }t#        j                  || j
                  | j:                  | j<                  z  | j:                  | j<                  z  gd      \  }}}t#        j>                  | j@                  jC                                }|r|j                  | j                     j                  }|d d dd d f   d d d df   }|j/                  dd      jE                  ||j                   d   | jF                        }| jH                  d	   jE                  | jH                  j                   d   | jF                        }t"        j0                  j2                  jK                  ||j                  |j                        z         }t#        jL                  || jN                  d   | jN                  d         }|d
   jE                  | j                  | jF                  | j<                        j                  t"        jP                        }t#        j>                  |d	   |z        j                  |      }|jS                  || j:                  d      dd d d f   }|jE                  || j:                  | j                  | j:                  z  |j                   d         jU                         }|jS                  |d|j                   d         }|d	   |dd d d f   z  }|jS                  |d| jF                        }||d	   z  j                  |      }|j                  | j                     j9                  |j                  | j                     |z  |z          |jS                  || j:                  d      dd d d f   }|jE                  || j:                  | j                  | j:                  z  |j                   d         jU                         }|jS                  |d|j                   d         }|j                  | j                     j                  |j                  |j                        }|jW                  || j                  z  | jF                  | j<                        }|jW                  || j                  z  | j<                  d      }t#        jX                  ||      }|jW                  || j                  | jF                        }| jZ                  d	   jE                  | jZ                  j                   d   | jF                        }|||z  z   j                  |j                        }|jS                  |d      d d d df   }n
t0        j2                  jK                  || jH                  z         }t#        jL                  || jN                  d   | jN                  d         }|jS                  ||d| jF                        jC                         }|jS                  ||d| j<                        jC                         }|jS                  ||d| j<                        jC                         }|j]                  | j                  | j:                  z  d| j                        }|j]                  | j                  | j:                  z  d| j                        }| j^                  || j^                  z  z
  | j^                  z  }| jZ                  d	   ta        ||      z  }||d	   z  }|j                  |j                        |z  }||||fD  cg c]  } tc        | || j^                         c} \  }}}}|je                  dddd      }t#        jf                  |d      }!t#        j>                  ti        |            }"|d d d d d d d d d d d f   |d d d d d d d d d d d f   z  }#|#j%                  d      }$|$d	   |"je                  ddddd      d	   z  }%|%j%                  d      }&|&d	   |d d d d d f   z  j%                  d      }'t#        j>                  |!d d d d d d dd f   |!z
        }(||(je                  dddd      d	   z  })|)dd d d f   |d	   z  j%                  d      }*|r<|j                  | j                     d d d df   j                  |*j                        }+nt#        jj                  |*d d d df         }+t#        jl                  |+|*gd      }*t#        j>                  ti        t0        j2                  j5                  |!d d d d d d df   d                  },|,j/                  dd      },|,d
   |*d d d d d df   z  j%                  d      }-|-d d d df   |-d d df   }.}*t#        j>                  |!      }/|dd d d f   |*d d d d d df   z  }0|/je                  dddd      }1|0j%                  d      |1d	   z  }2|'|2z   }|jS                  |d| j                  | jF                        }||z   }|dkD  r|d d d |d d d d f   }|jS                  ||d      }|.1|/|j                  | j                     j9                  |.       d|_        | jo                  ||
      }3| jq                  |3j                  |            }4|4S c c} w )Nrj   rk   r!   r   )shiftsdimsrE   rA   .).N).NNr   rB   )rl   output_sizer      )r!   r   T)9rn   rD   r   r   r   r   r   r   rI   rL   r   rM   rollr   rC   r   r   r3   sumr   r   r   r   r   r   r   r   r]   r   r   r^   r   r   r   r   r   r   softplusclampr   r   reshape
contiguousr   bmmr   repeat_interleaver   r   r   permutecumsumr   
zeros_likero   r   r   )5r[   input_statesr   r   r   r\   r   r`   rD   r   r   r   r   r   rL   r   r   r   r   r   cache_devicer   dAdBdBxrM   ssm_states_reshaped
C_reshapedyr   pad_size
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr   state_decay_outC_times_statesstate_decay_out_permutedY_offr   contextualized_statess5                                                        r:   torch_forwardzBambaMixer.torch_forward  s]    ".!3!3
GQ"" 4L.Q<<5&6&<&<''GR '= '
#
 $ &//&1& ((8>>qA&&t~~6<<Q?& d*& q!A% 	 "7C7O7OPTP^P^7_7d7dlnuw7d7xL$$T^^4ARSTVWYZSZA[A^A^_k_w_wx|  yG  yG  `H  `O  `O  BPL$$T^^4Q2X> '224>>BEET[[M_M_MfMfEgK %		dkk0088;;! !!$58H8H$H! $): ; '/@/J/J1a/P, mm//043H3HKgKmKmnpKq3qst2u ((8>>{K $5F5P5PQRTU5V)WX[]e^e]eXe)f)p)pqrtu)v w89JN[#kk##T]]T5H5H%H$--Z^ZmZmJmn
q! YYtzz'')**!'224>>BIIL Aq!GQc\*Ba#**:rxx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC ##DNN399''7"<sB 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CC188[\[b[bCcJ",//*t~~2Mt}}^b^q^q"r
T^^ ;T=P=PRSTJ		-z:Az4>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''T\\(9:BR!5!5a!8$:N:Nq:QRB)11*gr4==Y__aM		*gr43F3FGMMOA		*gr43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'DOO*CCtVH	*-?x-XXJ *ByM9M](()B.A cpqrtuwxay%z\]&9!Xt&W%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99hq!Q|&<x&GIL,..q"b!<YGGGc4l+mI.FFKKPQKRF &"."9"9$.."I!TSV,"W"Z"Zbhbobo"Z"p"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*gr2A $)A''7==iH26/ii4(
 !%knnU.C D$$I &{s   vc                 r   t         rAd| j                  j                  j                  j                  v r| j                  |||||      S |t        d      |j                  }|B|j                  d   dkD  r0|j                  d   dkD  r||d d d d d f   z  j                  |      }| j                  ||||      S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r!   r   )r   r   r   rC   typer   NotImplementedErrorrD   rn   r   r  )r[   r   r   r   r   r.   kwargsrD   s           r:   forwardzBambaMixer.forward  s     "f0C0C0J0J0O0O&O,,]L.Zhjqrr%n  ##%.*>*>q*AA*E.J^J^_`JadeJe*^Aq$J-GGKKERM!!-~~^^r9   )NNNN)NNN)r/   r0   r1   r2   r"   r6   rG   r3   Tensorr   r=   r4   r7   r   r  r  rc   rd   s   @r:   r   r      sI   Ah{ Ahs AhL DH5915-1g||g ?@g !!1!12	g
 !.g %//*gZ DH5915M% ?@M% !!1!12	M%
 !.M%f DH5915-1_ ?@_ !!1!12	_
 !._ %//*_r9   r   c                       e Zd Zy)BambaMLPNrg   r8   r9   r:   r"  r"    rh   r9   r"  c                       e Zd Zy)BambaRMSNormNrg   r8   r9   r:   r$  r$    rh   r9   r$  c                   v    e Zd Zddededef fdZ	 	 	 	 	 	 	 ddej                  de	ej                     de	ej                     de	e   d	e	e   d
e	e   de	ej                     de	eej                  ej                  f      dee   deej                   e	eej                   ej                   f      f   fdZ xZS )BambaDecoderLayerr>   r   
layer_typec                     t         |           | `d}|dk(  rt        nd } ||      | _        || _        |dk(  rt        ||      | _        y |dk(  rt        ||      | _        y t        d      )Nr!   r@   )r>   r   	attentionzInvalid layer_type)
rF   rG   	self_attnr"  feed_forwardr'  r   r@   r   
ValueError)r[   r>   r   r'  num_expertsffn_layer_classra   s         r:   rG   zBambaDecoderLayer.__init__  sr    N&1Q&6(D+F3$ #6YGDJ;&+FI>DN122r9   r   r   rt   past_key_valueoutput_attentions	use_cacher   position_embeddingsr  returnc	                 J   |}
| j                  |      }| j                  dk(  r | j                  d||||d|	}d}n-| j                  dk(  r | j                  d||||||||d|	\  }}|
|z   }|}
| j	                  |      }| j                  |      }|
|z   }|f}|r|fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        r@   )r   r   r   r   Nr)  )r   r   rt   r/  r0  r1  r   r2  r8   )input_layernormr'  r@   r*  pre_ff_layernormr+  )r[   r   r   rt   r/  r0  r1  r   r2  r  residualself_attn_weightsoutputss                r:   r  zBambaDecoderLayer.forward  s    D !,,]; ??g%&DJJ ++--	
 M !%__+/=t~~ 
0+-)-"3#-$7
0 
0,M, !=0 !--m<))-8 =0 ")++Gr9   )r@   )NNNFFNN)r/   r0   r1   r"   r6   strrG   r3   r   r   r4   r=   boolr   r   r)   FloatTensorr  rc   rd   s   @r:   r&  r&    s   3{ 3s 3 3( 2637EI,1$)59KOK||K !.K u//0	K
 !!ABK $D>K D>K !!1!12K &eELL%,,,F&GHK 23K 
u  (51B1BEDUDU1U+V"WW	XKr9   r&  c                   8    e Zd ZeZdZdZdgZdZdZ	dZ
dZdZd Zy)BambaPreTrainedModelmodelTr&  past_key_valuesc                    | j                   j                  }t        |t        j                  t        j
                  f      rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        t        f      r&|j                  j                  j                  d       y t        |t        j                        rf|j                  j                  j                  d|       |j                  2|j                  j                  |j                     j                          y y t        |t               r|j"                  j                  j                  d       t%        j&                  t%        j(                  d|j*                  dz               |j,                  _        |j.                  j                  j                  d       y y )Nr   )meanstdg      ?r!   )r>   initializer_range
isinstancer   r   r   r   datanormal_r   zero_r   r$  fill_	Embeddingpadding_idxr   r   r3   r   r   r   r   r   )r[   modulerC  s      r:   _init_weightsz"BambaPreTrainedModel._init_weights7  sW   kk++fryy"))45MM&&CS&9{{&  &&( '!2L ABMM$$S)-MM&&CS&9!!-""6#5#56<<> .
+NN%%c* %		%,,q&:J:JQ:N*O PFLLHHMM$ ,r9   N)r/   r0   r1   r"   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_cache_class_is_statefulrM  r8   r9   r:   r>  r>  +  s=    L&*#,-"3!N L%r9   r>  c                       e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     de	e   d	e	e
j                     d
e	e   de	e   de	e   de	e
j                     dee   defd              Zde
j                  de
j                  de
j                  dedef
dZede
j                  dedede
j.                  de
j                  defd       Zd Z xZS )
BambaModelr>   c           	      Z   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        g }t        |j                        D ],  }|j                  t        |||j                  |                . t        j                  |      | _        |j                   | _        t#        |j                  |j$                        | _        t)        |      | _        d| _        | j/                          y )N)r   r'  r   )r>   F)rF   rG   pad_token_idrK  
vocab_sizer   rJ  rS   embed_tokensrO   rP   rX   r&  rH   
ModuleListlayers_attn_implementationr$  r   final_layernormrf   
rotary_embgradient_checkpointing	post_init)r[   r>   decoder_layersr_   ra   s       r:   rG   zBambaModel.__init__K  s     !.. ++LL):):F<N<NPTP`P`av//0 	rA!!"3FaTZTlTlmnTo"pq	rmmN3$*$?$?!+F,>,>FDWDWX.f=&+#r9   c                     | j                   S Nr\  )r[   s    r:   get_input_embeddingszBambaModel.get_input_embeddings^  s       r9   c                     || _         y rf  rg  )r[   values     r:   set_input_embeddingszBambaModel.set_input_embeddingsa  s
    !r9   	input_idsr   rt   r@  inputs_embedsr1  r0  output_hidden_statesr   r  r3  c
                 N   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|}|r|t        j                  d       |	.t        j                  |j                  d   |j                        }	||	j                  d      }| j                  |||	||      }| j!                  ||	      }| j#                  ||      }|rdnd }|rdnd }| j$                  D ]  }|j&                  d	k(  r|n|}|r||fz  }| j
                  r:| j                  r.| j)                  t+        |j,                  fi |
|||||||	|	      }n ||f||||||	|d
|
}|d   }|s}|d   ||d   fz  } | j/                  |      }|r||fz  }|r|j0                  sd|_        |sd n|}t3        ||||      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r!   rE   r   r8   r@   )r   rt   r/  r0  r1  r   r2  T)last_hidden_stater@  r   
attentions)r>   r0  rn  r1  r,  rb  r   r   r   r\  r3   r   rn   rC   rm   _update_causal_mask_update_mamba_maskra  r^  r'  _gradient_checkpointing_funcr   __call__r`  rI   r   )r[   rl  r   rt   r@  rm  r1  r0  rn  r   r  r   causal_mask
mamba_maskr2  all_hidden_statesall_self_attnsdecoder_layer
layer_masklayer_outputs
next_caches                        r:   r  zBambaModel.forwardd  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%0:
 !"\\-*=*=a*@I]I]^N)33A6L..M>?L]
 ,,^^L
 #oom\J"6BD0d![[ %	:M'4'?'?7'JP[J#!m%55!**t}} $ A AM22=f=! #%"'
! !.!
!#-!-#2&7'#1(;
! 
! *!,M  #/"}Q'7&99NK%	:N ,,];  -!11?#E#E15O.!*T
&+&+%	
 	
r9   input_tensorc           	         | j                   j                  dk(  r	|d|v r|S y ||j                         nd}| j                   j                  dk(  r&|s$t        j                  |||| j
                        ry |j                  }|j                  d   }t        |t        j                        r|j                  d   n||z   dz   }	| j                  |||	|||j                  d         }
| j                   j                  dk(  rQ|O|j                  j                  d	v r7|s5t        j                  |      j                  }t        j                   |
|      }
|
S )
Nflash_attention_2r   r   sdpa)rm  past_key_values_lengthis_trainingr!   rj   )sequence_lengthtarget_lengthrD   r   r\   )r  xpunpu)r>   r_  get_seq_lengthr   _ignore_causal_mask_sdpar   rD   rn   rE  r3   r   5_prepare_4d_causal_attention_mask_with_cache_positionrC   r  finfomin_unmask_unattended)r[   r   r~  r   r@  r0  past_seen_tokensrD   r  r  rv  	min_dtypes               r:   rr  zBambaModel._update_causal_mask  se    ;;++/BB)c^.C%%
 @O?Z?99;`a ;;++v5>O%>>*'7 MM	 ""&,,Q/ .%,,7   $!O3a7 	 PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr9   r  r  rD   r\   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	| ddddddf   | ddddddf   k(  dddd| dddf   j                  |      }
|ddddddd|	f   |
z   }|dk(  }|ddddddd|	f   j                  ||      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr   )
fill_valuerD   rC   r!   )diagonalrE   rj   r   )rl   r3   r  r  fullrC   triur   r   r   clonern   r   masked_fill)r   r  r  rD   r   r\   r  rv  r  mask_lengthpadding_attention_maskpadding_masks               r:   r  z@BambaModel._prepare_4d_causal_attention_mask_with_cache_position  s   < %.*<*<*>!*C(K, ) E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*8D$9I*Jn]^`dfgim]mNn*nq?*+Q.*"U) '  +1aL[L+@ADZZ+q05@Aq,;,AV5W5c5c )6Aq!\k\12 r9   c                 R    |}|d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr!   )r3   all)r[   r   r   rw  s       r:   rs  zBambaModel._update_mamba_maskI  s7     $
!q ^%?EIIn`aNaDbJr9   )	NNNNNNNNN)r/   r0   r1   r"   rG   rh  rk  r   r   r   r3   r4   r   r=   r<  r;  r   r)   r   r  rr  staticmethodr6   rD   r  rs  rc   rd   s   @r:   rX  rX  I  s   { &!"  151537FJ59$(,0/359m
E,,-m
 !.m
 u//0	m

 ""BCm
   1 12m
 D>m
 $D>m
 'tnm
 !!1!12m
 23m
 
!m
  m
^:: ll: 	:
 ::  :x 555 5 {{	5
 5 5 5n	r9   rX  c                   N    e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddeej
                     deej                     deej
                     dee   deej                     deej
                     dee	   dee	   d	ee	   d
eej
                     de
eej                  f   def fdZ	 	 	 	 	 	 ddZ xZS )BambaForCausalLMrl  r   rt   r@  rm  labelsr1  r0  rn  r   logits_to_keepr3  c                 8    t        |   |||||||||	|
|fi |S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BambaForCausalLM

        >>> model = BambaForCausalLM.from_pretrained("...")
        >>> tokenizer = AutoTokenizer.from_pretrained("...")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)rF   r  )r[   rl  r   rt   r@  rm  r  r1  r0  rn  r   r  r  ra   s                r:   r  zBambaForCausalLM.forwardV  sB    J w 
 
 	
r9   c           	      t   |d u }	|	sZ||d   |j                   d   k\  r|d d |j                   d    d f   }nc|j                   d   |j                   d   k7  rD|d d |f   }n:t        | j                  |j                   d   | j                  | j                        }|T|R|j                         j                  d      dz
  }|j                  |dk(  d       |	s|d d |j                   d    d f   }||	rd|i}
nd|j                         i}
|
j                  ||||| j                  j                  |d       |
S )Nrj   r!   r   rE   rm  rl  )rt   r@  r1  r   r  r   )rn   r=   r>   rD   rC   longr   masked_fill_r   updatenum_logits_to_keep)r[   rl  r@  r   rm  r   rt   r1  r  empty_past_kvmodel_inputss              r:   prepare_inputs_for_generationz.BambaForCausalLM.prepare_inputs_for_generation  sa    (4/ )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	>Y__Q/DKKO %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0"&++"@"@"0		
 r9   )NNNNNNNNNNr   )NNNNNT)r/   r0   r1   r   r3   r4   r   r=   r<  r;  r   r6   r   r  r  rc   rd   s   @r:   r  r  U  s    151537FJ59-1$(,0/359342
E,,-2
 !.2
 u//0	2

 ""BC2
   1 122
 ))*2
 D>2
 $D>2
 'tn2
 !!1!122
 c5<</02
 
 2
n 8r9   r  )rX  r  r>  )Nr!   )Nr2   	functoolsr   typingr   r   r   r   r3   torch.utils.checkpointr   (transformers.models.jamba.modeling_jambamodelsjambamodeling_jambatransformers.activationsr	   r
   (transformers.models.llama.modeling_llamar   r   r   r   r   r   *transformers.models.mamba2.modeling_mamba2r   r   r   r   modeling_attn_mask_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.import_utilsr   r   r    configuration_bambar"   +mamba_ssm.ops.triton.selective_state_updater#   !mamba_ssm.ops.triton.ssd_combinedr$   r%   causal_conv1dr&   r'   r  r   
get_loggerr/   r   r)   r=   rf   r}   r   r   r   Moduler   r"  r$  r&  r>  rX  r  __all__r8   r9   r:   <module>r     s  (   4 4    A A + O   ? O - & 
 r q , Rmm!DD-7**46FH\]^  
		H	%	 43u~'V'V 3ul	/ 	
%P	^ 		) 	__ __D	x 		< 	]2 ]@ %? % %: H% H HVm' m` Er9   