
    Uhj                     h   d Z ddlZddlmZmZmZmZmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZmZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z, ddl-m.Z.  e,       rddl/m0Z0m1Z1 ddl2m3Z3 nd\  Z3Z1Z0 e+       r	ddl4m5Z5m6Z6 nd\  Z6Z5 e7e3e1e5e6e0f      Z8 e)jr                  e:      Z; G d dejx                        Z= e&j|                  e=       de
j~                  de@de
j~                  fdZA G d d e      ZB	 d@d!ejx                  d"e
j~                  d#e
j~                  d$e
j~                  d%ee
j~                     d&eCd'eCfd(ZD G d) d*ejx                        ZE G d+ d,ejx                        ZF G d- d.ejx                        ZG G d/ d0ejx                        ZH G d1 d2ejx                        ZI G d3 d4ejx                        ZJe( G d5 d6e"             ZKe( G d7 d8eK             ZL G d9 d:eKe      ZM e(d;<       G d= d>eK             ZNg d?ZOy)AzPyTorch Zamba model.    N)AnyCallableDictListOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ALL_LAYERNORM_LAYERS)auto_docstringlogging)is_causal_conv1d_availableis_mamba_ssm_available   )ZambaConfig)mamba_inner_fnselective_scan_fn)selective_state_update)NNN)causal_conv1d_fncausal_conv1d_updateNNc                   ,     e Zd Zd fd	Zd Zd Z xZS )ZambaRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        ZambaRMSNorm is equivalent to T5LayerNorm
        N)super__init__r
   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      z/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/zamba/modeling_zamba.pyr,   zZambaRMSNorm.__init__A   s1     	ll5::k#:; #    c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor.   float32powmeanrsqrtr1   r0   )r2   hidden_statesinput_dtypevariances       r6   forwardzZambaRMSNorm.forwardI   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r7   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler0   shaper1   r2   s    r6   
extra_reprzZambaRMSNorm.extra_reprP   s*    ))*+6$2G2G1HIIr7   )gư>)__name__
__module____qualname__r,   rE   rJ   __classcell__r5   s   @r6   r)   r)   @   s    $;Jr7   r)   rB   n_repreturnc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)rH   expandreshape)rB   rP   batchnum_key_value_headsslenhead_dims         r6   	repeat_kvrY   X   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr7   c                   ~   e Zd ZdZej
                  dfdZ	 ddej                  dej                  dede	e
eef      deej                  ej                  f   f
d	Zd
ej                  fdZdde	e   defdZdeeej                     eej                     f   fdZedde	eeej(                           ddfd       Zy)ZambaHybridDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    Nc           
         || _         |j                  | _        d| _        |j                  |j                  z  | _        |j                  | _        |j                  | _	        |j                  | _
        g | _        g | _        g | _        i | _        i | _        i | _        t#        |j$                        D ]  }| xj                  t'        j(                  || j
                  | j                  ||      gz  c_        || j                  | j
                  | j                  z  | j                  f}| xj                  t'        j(                  |||      gz  c_        | j                  |   dk(  s| j                  j+                  |        t#        |j$                        D cg c]  }t'        j,                  g g|z  |       c}| _        t#        |j$                        D cg c]  }t'        j,                  g g|z  |       c}| _        y c c}w c c}w )NFdevicer<   hybridr^   )r<   layers_block_typehas_previous_statemamba_expandr3   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headsconv_states
ssm_statestransformer_layers_modules_parameters_buffersrangenum_hidden_layersr.   zerosappendtensor	key_cachevalue_cache)r2   config
batch_sizer<   r^   icache_shape_s           r6   r,   z ZambaHybridDynamicCache.__init__r   s   
!'!9!9"'!'!4!4v7I7I!I$22 & 3 3#11"$v//0 	2AJ(>(>@U@U^dlqr!  ""&&$*<*<<##	K OOKe TUUO%%a(H4''..q1	2 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts   "H"H

key_statesvalue_states	layer_idxcache_kwargsrQ   c                    | j                   |   j                  d   dk(  r|| j                   |<   || j                  |<   nft        j                  | j                   |   |gd      | j                   |<   t        j                  | j                  |   |gd      | j                  |<   | j                   |   | j                  |   fS )Nr:   r   r9   dim)ru   rH   rv   r.   cat)r2   r|   r}   r~   r   s        r6   updatezZambaHybridDynamicCache.update   s     >>)$**2.!3(2DNN9%*6DY'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEr7   beam_idxc                    t        t        | j                              D ]S  }| j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j	                  d|j                  |            | j                  |<   V y)zDReorders the cache for beam search, given the selected beam indices.r   N)	rp   lenru   r^   index_selectr=   rv   rj   rk   )r2   r   r~   r^   s       r6   reorder_cachez%ZambaHybridDynamicCache.reorder_cache   sD   s4>>23 		iI^^I.55F(,y(A(N(NqRZR]R]^dRe(fDNN9%%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'__Y/66F)-)C)P)PQRT\T_T_`fTg)hDOOI&		ir7   c                     || j                   vr| j                   d   n|}t        | j                        |k  ry| j                  |   j                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )rl   r   ru   rH   )r2   r~   s     r6   get_seq_lengthz&ZambaHybridDynamicCache.get_seq_length   sR     3<4CZCZ2ZD++A.`i	t~~)+~~i(..r22r7   c                     t        d      Nz@ZambaHybridDynamicCache does not have a legacy cache equivalent.NotImplementedErrorrI   s    r6   to_legacy_cachez'ZambaHybridDynamicCache.to_legacy_cache   s    !"deer7   past_key_valuesr   c                     t        d      r   r   )clsr   s     r6   from_legacy_cachez)ZambaHybridDynamicCache.from_legacy_cache   s    !"deer7   N)r   )rK   rL   rM   __doc__r.   float16r,   Tensorintr   r   strr   r   r   
LongTensorr   r   r   classmethodFloatTensorr    r7   r6   r[   r[   d   s    27t uJ 26FLLF llF 	F
 tCH~.F 
u||U\\)	*F$ie&6&6 i3 3c 3fuU\\':E%,,<O'O!P f fuUEVEV?W9X0Y fes f fr7   r[   modulequerykeyvalueattention_maskscalingdropoutc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr9   r   r   r:   )r   r<   )ptrainingr    )rY   num_key_value_groupsr.   matmul	transposerH   r
   
functionalsoftmaxr>   r=   r<   r   r   
contiguous)r   r   r   r   r   r   r   kwargsr|   r}   attn_weightscausal_maskattn_outputs                r6   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r7   c                        e Zd ZdZdedef fdZ	 ddej                  dede	ej                     de	e
   dee   d	eej                  e	ej                     e	eej                        f   fd
Z xZS )ZambaAttentiona9  
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://arxiv.org/pdf/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    rw   r~   c                 .   t         |           || _        || _        |j                  | _        |j
                  | _        |j                  |j                  z  | _	        |j                  | _
        | j                  dz  dz  | _        d| _        |j                  | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j&                  d      | _        y )Nr9         TFbias)r+   r,   rw   r~   attention_hidden_sizeattention_head_dimrX   num_attention_headsrV   r   max_position_embeddingsr   	is_causalattention_dropoutr
   Linearq_projk_projv_projr3   o_projr2   rw   r~   r5   s      r6   r,   zZambaAttention.__init__   s9   "%+%A%A"11$*$>$>&B\B\$\!'-'E'E$)d2!'!9!9ii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii : :T]] JFL^L^ejkr7   rB   r   past_key_valuer   rQ   c                 h   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
||j                  |	|
|      \  }	}
t        }| j                  j                  dk7  r^| j                  j                  dk(  r(|j                  dd      rt        j                  d       nt        | j                  j                     } || ||	|
|f| j                  sd	n| j                   | j"                  d
|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )Nr:   r    r9   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   )rH   rX   r   viewr   r   r   r   r   rw   _attn_implementationgetloggerwarning_oncer   r   r   r   rT   r   r   )r2   rB   r~   r   r   r   input_shapehidden_shapequery_statesr|   r}   attention_interfacer   r   s                 r6   rE   zZambaAttention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST%'5'<'<ZW`'a$J(?;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r7   r   )rK   rL   rM   r   r!   r   r,   r.   r   r   r[   r   r   r   rE   rN   rO   s   @r6   r   r      s    l{ ls l. =A))||)) )) !.	))
 !!89)) -.)) 
u||Xell3XeELL>Q5RR	S))r7   r   c                   l     e Zd ZdZdef fdZ	 d	dej                  defdZ	d	defdZ
d	defdZ xZS )
ZambaMambaMixeruE  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways:
    - Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head
    undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of
    `self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`.
    rw   c           	         t         |           || _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  |j                  z  | _
        |j                  | _        |j                  | _        | j                  | j                  z  | _        |j                  | _        |j"                  | _        t'        j(                  | j                  | j                  | j                   | j                  | j                  | j                  dz
        | _        |j,                  | _        t0        |j,                     | _        |j4                  | _        t'        j8                  | j                  | j                  dz  | j$                        | _        t'        j<                  t?        j@                  | j                  | j                  | j                  dz  z   | j                              | _!        t'        j<                  t?        j@                  | j                  | j                  | j                        dz
  dz  | j                  dz  z        | _"        t'        j<                  t?        j@                  | j                  | j                              | _#        t?        jH                  d| j                  dz   t>        jJ                        d d d f   }|jM                  | j                  d      jO                         }t'        j<                  t?        jP                  |      jS                  | j                  | j                  d            | _*        t'        j<                  t?        jV                  | j                  | j                              | _,        t'        j8                  | j                  | j                  | j$                        | _-        t\        st^        ja                  d       y y )	Nr    )in_channelsout_channelsr   kernel_sizegroupspaddingr9   r   g      ?r<   r:   ap  The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config)1r+   r,   rw   r~   r3   re   rf   rg   rh   rc   rd   mamba_dt_ranktime_step_rankri   mamba_head_dimmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr
   Conv1dconv1dhidden_mamba_act
activationr   actuse_mamba_kernelsuse_fast_kernelsr   in_projr-   r.   rr   x_proj_weightdt_proj_weightdt_proj_biasaranger>   rS   r   logrT   A_logr/   Dout_projis_fast_path_availabler   r   )r2   rw   r~   Ar5   s       r6   r,   zZambaMambaMixer.__init__5  s   "!--$22 & 3 3!'!4!4v7I7I!I$22#11"448J8JJ#33..ii..//##--))))A-
 !11&112 & 8 8 yy!1!143I3IA3MTXTaTab  \\&&''$*=*=*AA''
 !ll[[++T-@-@$BUBUVY\\!!3&'

 LLT5G5GI\I\)]^ LLD//!35==I$PQ'RHHT++R0;;=\\%))A,"6"6t7I7I4K^K^`b"cd
ejj););T=P=PQR		$"8"8$:J:JQUQ^Q^_%^ &r7   rB   cache_paramsc                    |j                   \  }}}|d uxr |j                  xr |dk(  }| j                  |      j                  dd      }|j	                  |dd|      j                  dd      \  }}	|j                  d      j                         }|	j                  d      }	|	j                  || j                  d|      j                  dd      }	| j                  j                  j	                  | j                  j                  j                  d      | j                  j                  j                  d            }
|ret        |j                  d      |j                  | j                     |
| j                  j                   | j"                        }|j%                  d      }n|,t'        j(                  |dk(        s||j%                  d      z  }|dt*        j,                  j/                  || j0                  |j                   d   z
  df      }|j                  | j                     j3                  |       t5        ||
| j                  j                   | j"                        }|,t'        j(                  |dk(        s||j%                  d      z  }|j                  d| j                  | j6                  |      j                  dd      }| j8                  d d d d d d d f   |z  j                  dd      }t'        j:                  || j<                  | j>                  | j>                  gd      \  }}}| j@                  d d d f   |j                  dd      z  }t'        jB                  | jD                  jG                                }| jH                  | jH                  jG                         nd }t'        jJ                  |d|f|jL                  |jN                        }|rtQ        | j                        D ]  }tS        |jT                  | j                     d d |f   ||d	df   ||d	df   ||   ||d d df   ||d d df   | jV                  |   |	|d	df   ||   d

      j%                  d      }t'        jX                  ||fd      } nAt'        jJ                  |d| j6                  | j>                  f|jL                  |jN                        }tQ        | j                        D ]  }t[        ||   ||   ||   ||   j                  dd      ||   j                  dd      | jV                  |   jG                         |	|   ||   d
d

      \  }}t'        jX                  ||fd      j                         }t'        jX                  ||j%                  d      fd      } |*|(|jT                  | j                     j3                  |       | j]                  |j                  dd            }|S )Nr    r9   r:   r   r   )r   r   r]   .T)dt_softplus)delta_softplusreturn_last_state)/rH   rb   r   r   r   chunksqueezer   rT   ri   r   r0   sizer&   rj   r~   r   r   	unsqueezer.   allr
   r   padrh   copy_r%   r   r   splitr   rf   r   expr   floatr   emptyr^   r<   rp   r$   rk   r   r   r#   r   )r2   rB   r   r   rx   seq_lenr{   use_precomputed_statesprojected_statesgateconv_weightsrj   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsnscan_outputs_	ssm_state
ssm_state_contextualized_statess                            r6   cuda_kernels_forwardz$ZambaMambaMixer.cuda_kernels_forwardt  s    "/!4!4
GQ!-T!9!nl>]>]!nbimnbn  <<6@@AF.33JAwOUUVW]^U_t%--a0;;=||A||J(:(:BHRRSTVWX {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)((8  M *33B7M)%))Na<O2P -0H0H0K K' mm//@U@UXeXkXklnXo@oqr?st((8>>{K,]L$++JZJZgkgvgvwM)%))Na<O2P -0H0H0K K
 &--b$2D2DdFYFY[bcmmnoqrs,,Qa];mKVVWY[]^++T00$2E2EtGZGZ[ac
	1a "00D9I<O<OPRTV<WWYYtzz'')** 7;6G6G6S**002Y]{{J7#;MDXDX`m`s`st!4--. O 6 ++DNN;AqDA!!S!),&q#qy1aDaAgJaAgJFF1ICO"1% $! )B-   %yy,)FANO  Q 3 3T5H5HI$++#))I
 4--. S,=!!$&q)aDaDNN1a(aDNN1a(FF1IOO%G"1%#'&*-)z  %yy,)FANYY[!IIy*2F2Fq2I&JPQR	S $)A''7==iH !%l.D.DQ.J K$$r7   c           
         |j                   \  }}}|j                  }| j                  |      j                  dd      }|j	                  |dd|      j                  dd      \  }	}
|	j                  d      j                         }	|
j                  d      }
|
j                  || j                  d|      j                  dd      }
t        |t              }|r|j                  | j                     j                   d   |k(  r| j                  r(|j                  | j                     j                         }n|j                  | j                     }|j!                  |	j"                        }|j$                  r|dk(  r|j&                  | j                     j                   d   |k(  r|j&                  | j                     }t)        j*                  |dd      }|	d d d d df   |d d d d df<   ||j&                  | j                  <   t)        j,                  || j.                  j0                  d d dd d f   z  d      }	| j2                  r|	| j.                  j4                  z  }	| j7                  |	      j!                  |      j9                  d      }	n|Ct)        j:                  |dk(        s+|	|d d |	j                   d    d f   j9                  d      z  }	t<        j>                  jA                  |	| jB                  |	j                   d   z
  df      }||j&                  | j                  <   | j7                  | j/                  |	      dd |f         }	|t)        j:                  |dk(        s|	|d d |	j                   d    d f   j9                  d      z  }	nt)        jD                  || j                  | jF                  | jH                  f|	j"                  |      }|,t)        j:                  |dk(        s|	|j9                  d      z  }	| j7                  | j/                  |	      dd |f         }	|,t)        j:                  |dk(        s|	|j9                  d      z  }	|	j                  d| j                  | jF                  |      j                  dd      }	| jJ                  d d d d d d d f   |	z  j                  dd	      }t)        jL                  || jN                  | jH                  | jH                  gd      \  }}}| jP                  d d d f   |j                  dd	      z  | jR                  d d d d d d f   z   }t<        j>                  jU                  |      }t)        jV                  | jX                  j[                                }t)        jV                  |d d d d d d d d f   |d d d d d d d d d f   z        }|d d d d d d d d d f   |d d d d d d d d d f   j[                         z  }||	d d d d d d d d d f   j[                         z  }g }t]        |      D ]  }|d d d d d d |d d f   j                  dd      |z  |d d d d d d |d d f   j                  dd      z   }t)        j^                  |j                  dd      j!                  |      |d d d d |d d f   j9                  d            }|ja                  |d d d d d d df           t)        jb                  |d      }||	| jd                  d d d d d d f   z  z   }|| j7                  |
      z  }|r||j                  | j                  <   | jg                  |j                  dd      j                  |d|      j                  dd            }|S )
Nr    r9   r:   r   r   )shiftsdims.r]   r   )4rH   r<   r   r   r   r   r   r   rT   ri   
isinstancer[   rk   r~   r   cloner=   r^   rb   rj   r.   rollsumr   r0   r   r   r   r   r  r
   r   r  rh   rr   r   rf   r   r  r   r   r   softplusr  r   r  rp   r   rs   stackr   r   )r2   input_statesr   r   rx   r  r{   r<   r
  rB   r  	use_cacher  
conv_stater  r  r  r  r  r   
discrete_A
discrete_BdeltaB_ur  ry   scan_outputr  s                              r6   slow_forwardzZambaMambaMixer.slow_forward  s   !-!3!3
GQ""<<5??1E.33JAwOUUVW]^U_t%--a0;;=||A||J(:(:BHRRSTVWX|-DE	00@FFqIZW}}(33DNNCIIK	(33DNNC	!]%9%9:I //qL ,,T^^<BB1ES)55dnnE
"ZZ
2BG
'4Q1W'=
1a8$;E((8 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB O!-eiiRS@S6T$1N1}GZGZ[]G^F^F`C`4a4k4klm4n$nM]]..}t?T?TWdWjWjkmWn?npq>rs
;E((8 $])CC'M)R S!-eiiRS@S6T$1N1}GZGZ[]G^F^F`C`4a4k4klm4n$nMT//1D1DdFYFYZ$++I
 )%))Na<O2P -0H0H0K K HHT[[%?XgX%NOM)%))Na<O2P -0H0H0K K &--b$2D2DdFYFY[bcmmnoqrs,,Qa];mKVVWY[]^++T00$2E2EtGZGZ[ac
	1a #11!T':Y=P=PQSUW=XX\`\m\mtQ]
 
  ]]334FG YYtzz'')**YYqD!T1!458J1aQRTUW[K[8\\]
'1aD(89AaD!Q>N<O<U<U<WW
aAq$.> ? E E GGw 	9A"1aAq=1;;AqAIMPXYZ\]_`bcefYfPgPqPqrsuvPwwI,,y':':1a'@'C'CE'JAaQRTUWXjMLcLcdfLghKAq!QJ 78	9 kk,B7!]TVVAtQ<L5M%MN!DHHTN26?L##DNN3 !%!!!Q'//
BHRRSTVWX!
 %$r7   c                     | j                   rGt        r"d| j                  j                  j                  vrt        d      | j                  |||      S | j                  |||      S )NcudazFast Mamba kernels are not available. Make sure to they are installed and that the mamba module is on a CUDA device. lease run 'pip install causal-conv1d>=1.2.0' and 'pip install mamba-ssm', or set use_mamba_kernels=False in the model's config.)r   )r   r   r   r^   type
ValueErrorr  r*  )r2   rB   r   r   s       r6   rE   zZambaMambaMixer.forward2  sm      )V4;M;M;T;T;Y;Y-Y i 
 ,,]LYg,hh  ^ \\r7   r'   )rK   rL   rM   r   r!   r,   r.   r   r[   r  r*  rE   rN   rO   s   @r6   r   r   (  sX    
={ =@ im_%"\\_%9P_%B[%7N [%z	]3J 	]r7   r   c                   $     e Zd Z fdZd Z xZS )ZambaMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y NFr   )r+   r,   rw   r3   rd   r
   r   	gate_projup_proj	down_projr   
hidden_actact_fnr2   rw   r5   s     r6   r,   zZambaMLP.__init__@  s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r7   c                     | j                  | j                  | j                  |            | j                  |      z        }|S r   )r5  r7  r3  r4  )r2   xr5  s      r6   rE   zZambaMLP.forwardJ  s6    NN4;;t~~a/@#ADLLQRO#ST	r7   )rK   rL   rM   r,   rE   rN   rO   s   @r6   r0  r0  ?  s    0r7   r0  c                       e Zd Zddedee   f fdZ	 	 	 	 ddej                  dej                  dedeej                     dee	   dee
   d	ee
   d
ee   deej                  eeej                  ej                  f      f   fdZ xZS )ZambaAttentionDecoderLayerrw   r~   c                     t         |           t        ||      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _        y )Nr4   )r+   r,   r   	self_attnr0  feed_forwardr)   r   rms_norm_epsinput_layernormr3   pre_ff_layernormr   s      r6   r,   z#ZambaAttentionDecoderLayer.__init__P  s_    '	:$V,+F,H,HfNaNab ,V-?-?VEXEX Yr7   rB   original_hidden_statesr   r   r   r$  r   rQ   c           
          t        j                  ||gd      }| j                  |      } | j                  d||||||d|\  }}	| j	                  |      }| j                  |      }|f}
|r|
|	fz  }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://arxiv.org/pdf/2405.16712).
            layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        r:   r   )rB   r~   r   r   r   r$  r   )r.   concatenaterB  r?  rC  r@  )r2   rB   rD  r~   r   r   r   r$  r   self_attn_weightsoutputss              r6   rE   z"ZambaAttentionDecoderLayer.forwardX  s    > ))=:P*QWYZ,,];+94>> ,
'))/,
 ,
(( --m<))-8 ")++Gr7   r   NNFF)rK   rL   rM   r!   r   r   r,   r.   r   r[   boolr   r   r   r   rE   rN   rO   s   @r6   r<  r<  O  s    Z{ Zx} Z 26<@,1$)3||3 !&3 	3
 !.3 !!893 $D>3 D>3 -.3 
u  (51B1BEDUDU1U+V"WW	X3r7   r<  c                   t    e Zd Zdedef fdZ	 	 	 	 	 	 	 	 	 ddej                  deej                     dee   deej                     deej                     dee	   d	ee
   d
ee
   deej                     deej                     deej                  eeej                  ej                  f      f   fdZ xZS )ZambaMambaDecoderLayerrw   r~   c                     t         |           t        ||      | _        t	        |j
                  |j                        | _        || _        y )N)rw   r~   r>  )	r+   r,   r   mambar)   r3   rA  rB  r~   r   s      r6   r,   zZambaMambaDecoderLayer.__init__  s>    $FiH
+F,>,>FDWDWX"r7   rB   rD  r   r   r   r   r$  cache_positiontransformer_hidden_statesrQ   c                     |}|
||
z   n|}| j                  |      }| j                  |||      }d}||z   }|f}|r||fz  }|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        N)rB   r   r   )rB  rN  )r2   rB   rD  r~   r   r   r   r   r$  rO  rP  r   residualrG  rH  s                  r6   rE   zZambaMambaDecoderLayer.forward  s    < !
 :S9^M55dq 	 ,,];

'') # 
 ! !=0 ")++G((Gr7   )	NNNNNFFNN)rK   rL   rM   r!   r   r,   r.   r   r   r[   rJ  r   r   r   rE   rN   rO   s   @r6   rL  rL    s   #{ #s # :>#'15.2<@,1$)59<@:||: !) 6: C=	:
 !.: ell+: !!89: $D>: D>: !!1!12: $,ELL#9: 
u  (51B1BEDUDU1U+V"WW	X:r7   rL  c                   l    e Zd Zdedej
                  def fdZ	 	 	 	 	 	 	 	 ddej                  de
ej                     de
e   de
ej                     d	e
ej                     d
e
e   de
e   de
e   de
ej                     deej                   e
eej                   ej                   f      f   fdZ xZS )ZambaHybridLayershared_transflinearrN  c                 L    t         |           || _        || _        || _        y r   )r+   r,   rU  rV  mamba_decoder)r2   rU  rV  rN  r5   s       r6   r,   zZambaHybridLayer.__init__  s%    *"r7   rB   rD  r~   r   r   r   r   r$  rO  rQ   c
           
          | j                  ||||||||	      }
|
d   }|r|
d   }| j                  |      }| j                  |||||||	      }
|r|
d   f|
dd z   }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        )rD  r~   r   r   r   r$  rO  r   r    )rP  r   r   r   r$  rO  r9   N)rU  rV  rX  )r2   rB   rD  r~   r   r   r   r   r$  rO  layer_outputsrP  rG  s                r6   rE   zZambaHybridLayer.forward  s    > **#9&)/) + 	
 %2!$4! -a 0$(KK0I$J!**&?))/) + 
 *1-/@AMRSRTDUUMr7   )NNNNNFFN)rK   rL   rM   r<  r
   r   rL  r,   r.   r   r   r   r[   rJ  r   r   r   rE   rN   rO   s   @r6   rT  rT    s   #&@ #")) #\r # :>#'15.2<@,1$)59>||> !) 6> C=	>
 !.> ell+> !!89> $D>> D>> !!1!12> 
u  (51B1BEDUDU1U+V"WW	X>r7   rT  c                        e Zd ZeZdZdZddgZdZdZ	dZ
dZdZd Zee	 	 	 	 ddeej"                     d	eeeeeef   f      d
edef fd              Z xZS )ZambaPreTrainedModelmodelTr<  rL  r   Fc                 T   | j                   j                  }t        |t        j                  t        j
                  f      rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j                  j                  j                  d|       |j                  2|j                  j                  |j                     j                          y y t        |t              r&|j                  j                  j                  d       y t        |t              r|j                   j                  j                  d|       | j                   j"                  dz  }t        j$                  j'                  |j(                  | |       | j                   j*                  | j                   j,                  z  | j                   j.                  z  }t1        j2                  t1        j4                  | j                   j.                  |      t7        j8                  | j                   j:                        t7        j8                  | j                   j<                        z
  z  t7        j8                  | j                   j<                        z         j?                  | j                   j@                        }|t1        j8                  t1        jB                  |              z   }|jD                  j                  jG                  |       t1        jH                  d|jJ                  dz   t0        jL                        d d d f   }|jO                  |jP                  d      jS                         }|jT                  j                  jG                  t1        j8                  |      jW                  |j.                  |jX                  d             |jZ                  j                  j                  d       y y )	Nr   )r@   stdg      ?r   )minr    r   r:   ).rw   initializer_ranger  r
   r   r   r0   datanormal_r   zero_	Embeddingpadding_idxr)   fill_r   r   r   inituniform_r   rc   r3   ri   r.   r  randmathr   time_step_maxtime_step_minclamptime_step_floorexpm1r   r  r   rf   r>   rS   rd   r   r   rT   r   r   )r2   r   r_  dt_init_stdr   dtinv_dtr   s           r6   _init_weightsz"ZambaPreTrainedModel._init_weights&  s   kk++fryy"))45MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> .-MM$$S)0  %%--3C-@++33T9KGGV22[L+N![[558O8OOSWS^S^SlSllN

4;;44nE88DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566F$$**62Q 5 5 9OPTVWPWXA1126AACALL##EIIaL$8$89M9MvOdOdfh$ijHHMM$% 1r7   torch_dtype
device_maphard_check_onlycheck_device_mapc                 `    t         |   |||||      }|s|j                  dk(  rd|_        |S )z
        Overloads `PreTrainedModel._check_and_enable_flash_attn_2` so as to DISABLE Flash Attention 2 by default on Zamba models.
        Flash attention 2 is currently not supported in the HuggingFace implementation of Zamba v1.
        )rw  rx  flash_attention_2r   )r+   _check_and_enable_flash_attn_2r   )r   rw   ru  rv  rw  rx  r5   s         r6   r{  z3ZambaPreTrainedModel._check_and_enable_flash_attn_2F  sE     7K__o 8 

 6#>#>BU#U*1F'r7   rI  )rK   rL   rM   r!   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_cache_class_is_statefulrt  r   r   r.   r<   r	   r   r   r   rJ  r{  rN   rO   s   @r6   r\  r\    s    L&*#57OP"3"N L%@  .2;? %!& ekk* U3S#X#678	
    r7   r\  c                   8    e Zd ZdZdef fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     d	e	e   d
e	e
j                     de	e   de	e   de	e   de	e   de	e
j                     deeef   fd       Zd Z xZS )
ZambaModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ZambaDecoderLayer`]

    Args:
        config: ZambaConfig
    rw   c           
         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        |      }g }g }|j                  | _
        t        |j                        D ]  }|j                  |   dk(  r|j                  t        ||             2|j                  |   dk(  sE|j                  t        j                  | j                   j                  | j                   j                  d             |j                  t        ||              t#        |      }t#        |      }g }g | _        t'        | j                        D ]  \  }}|dk(  r_d| d}	g d}
g | j$                  |
D cg c]  }|	|z   	 c}| _        |j                  t)        |t+        |      t+        |                   j|j                  t+        |              t        j,                  |      | _        |j0                  | _        t3        |j                  |j4                  	      | _        d| _        | j;                          y c c}w )
NrN  )r~   r_   Fr   zlayers..)	z%shared_transf.self_attn.q_proj.weightz%shared_transf.self_attn.k_proj.weightz%shared_transf.self_attn.v_proj.weightz%shared_transf.self_attn.o_proj.weightz+shared_transf.feed_forward.gate_proj.weightz)shared_transf.feed_forward.up_proj.weightz+shared_transf.feed_forward.down_proj.weightz$shared_transf.input_layernorm.weightz%shared_transf.pre_ff_layernorm.weightr>  )r+   r,   pad_token_idrf  
vocab_sizer
   re  r3   embed_tokensr<  ra   rp   rq   rs   rL  r   rw   iter_tied_weights_keys	enumeraterT  next
ModuleListlayersr   r)   rA  final_layernormgradient_checkpointing	post_init)r2   rw   blockmamba_layerslinear_layersry   r  layer_id
layer_typeprefix_name	tied_keysr   r5   s               r6   r,   zZambaModel.__init__h  s&    !.. ++LL):):F<N<NPTP`P`a*62!'!9!9v//0 	QA''*g5##$:6Q$OP))!,8$$RYYt{{/F/FH_H_fk%lm##$:6Q$OP	Q L)]+"$$-d.D.D$E 	2 HjX% 'z3
	 +pD,C,C*odmFn]`{UXGXFn*o'.ud=6I4P\K]^_d<01#	2$ mmF+$*$?$?!+F,>,>FDWDWX&+# Gos   .I7c                     | j                   S r   r  rI   s    r6   get_input_embeddingszZambaModel.get_input_embeddings  s       r7   c                     || _         y r   r  r2   r   s     r6   set_input_embeddingszZambaModel.set_input_embeddings  s
    !r7   	input_idsr   position_idsr   inputs_embedsr$  r   output_hidden_statesreturn_dictrO  rQ   c                 d   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|d u |d uz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|}t        j                  |      }|r|t        j                  d       |
.t        j                  |j                  d   |j                        }
||
j!                  d      }| j#                  |||
      }|rdnd }|rdnd }t%        | j&                        D ]r  \  }}|r||fz  }| j                  r1| j                  r%| j)                  |j*                  |||||||||

      }n ||||||||||
		      }|d   }|sd|d   j||d   fz  }t | j-                  |      }|r||fz  }|r|j.                  sd
|_        t1        ||r|nd ||      }|	r|S |j3                         S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fz{Zamba requires an initialized `ZambaHybridDynamicCache` to return a cache. None was provided, so no cache will be returned.r    r`   r   r   )rD  r~   r   r   r   r   r$  rO  T)last_hidden_stater   rB   
attentions)rw   r   r  r$  use_return_dictr.  r  r   r   r   r  r.   r  r   rH   r^   r   _update_causal_maskr  r  _gradient_checkpointing_func__call__r  rb   r   to_tuple)r2   r  r   r  r   r  r$  r   r  r  rO  rB   rD  r   all_hidden_statesall_self_attnsr~   layerrZ  outputs                       r6   rE   zZambaModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<s  &&4==Yj I  --i8M%!&]!; 0:
 !"\\-*=*=a*@I]I]^N)33A6L..~}n]"6BD0d )$++ 6 "	:Iu#!m%55!**t}} $ A ANN!*"#%"! !&!+A'#1 +#2&7'#1
! *!,M  #/"}Q'7&99NE"	:H ,,];  -!11?#E#E15O.(+/8Od+%	
 %v;&//*;;r7   c                    | j                   j                  dk(  r	|d|v r|S y |j                  |j                  }}t	        j
                  |      j                  }|j                  d   }|d   dz   }t	        j                  ||f|||      }	|dk7  rt	        j                  |	d      }	|	t	        j                  ||      |j                  dd      kD  z  }	|	d d d d d d f   j                  |j                  d   ddd      }	||	j                         }	|j                         d	k(  rd|j                  d   }
|	d
d |
f   j                  d      |d d d d d d f   j                  d      z  }|	d
d |
f   j!                  ||      |	d
d |
f<   | j                   j                  dk(  r0|.|j                  j"                  dv rt%        j&                  |	|      }	|	S )Nrz  r   r    r:   )
fill_valuer<   r^   )diagonalr`   r   r9   .r   )r,  xpunpu)rw   r   r<   r^   r.   finfor`  rH   fulltriur   rT   rS   r  r   eqmasked_fillr-  r   _unmask_unattended)r2   r   input_tensorrO  r<   r^   	min_dtypesequence_lengthtarget_lengthr   mask_lengthpadding_masks               r6   r  zZambaModel._update_causal_mask  s   ;;++/BB)c^.C%%$**L,?,?vKK&**	&,,Q/&r*Q.jj/=!Ai_dmsta**[1=Ku||M&ANDZDZ[]_`Daaa!$a"23::<;M;Ma;PRSUWY[\%%++-K!!#q(,2226*3+<=@@EWXZ^`dfgWgHhHkHkloHpp1<S,;,=N1O1[1[\hjs1tC+-. KK,,6*%%**.DD
 1CCKQZ[Kr7   
NNNNNNNNNN)rK   rL   rM   r   r!   r,   r  r  r   r   r.   r   r   r[   r   rJ  r	   r   r   rE   r  rN   rO   s   @r6   r  r  _  s(   -{ -^!"  151537=A59$(,0/3&*59k<E,,-k< !.k< u//0	k<
 ""9:k<   1 12k< D>k< $D>k< 'tnk< d^k< !!1!12k< 
u--	.k< k<\!r7   r  c                       e Zd Zdef fdZd Zd Zd Zd Zd Z	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 dd	eej                     d
eej                     deej                     dee   deej"                     deej                     dee   dee   dee   dee   deej                     deeej                  f   deeef   fd       Z	 	 	 	 	 	 ddZ xZS )ZambaForCausalLMrw   c                 $   t         |   |       t        |      | _        dg| j                  j                  | _        |j
                  | _        t        j                  |j                  |j
                  d      | _	        | j                          y )Nzlm_head.weightFr   )r+   r,   r  r]  r  r  r
   r   r3   lm_headr  r8  s     r6   r,   zZambaForCausalLM.__init__2  so     '
#3"Tdjj6S6S"T ++yy!3!3V5F5FUS 	r7   c                 .    | j                   j                  S r   r]  r  rI   s    r6   r  z%ZambaForCausalLM.get_input_embeddings<      zz&&&r7   c                 &    || j                   _        y r   r  r  s     r6   r  z%ZambaForCausalLM.set_input_embeddings?      "'

r7   c                     | j                   S r   r  rI   s    r6   get_output_embeddingsz&ZambaForCausalLM.get_output_embeddingsB  s    ||r7   c                     || _         y r   r  )r2   new_embeddingss     r6   set_output_embeddingsz&ZambaForCausalLM.set_output_embeddingsE  s	    %r7   c                     || _         y r   r]  )r2   decoders     r6   set_decoderzZambaForCausalLM.set_decoderH  s	    
r7   c                     | j                   S r   r  rI   s    r6   get_decoderzZambaForCausalLM.get_decoderK  s    zzr7   r  r   r  r   r  labelsr$  r   r  r  rO  logits_to_keeprQ   c                    ||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
| j	                  ||||||||	||

      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |}|
s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )ah  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ZambaForCausalLM

        >>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1")
        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)
r  r   r  r   r  r$  r   r  rO  r  r   r    losslogitsr   rB   r  )rw   r   r  r  r]  r  r   slicer  loss_functionr  r   r   rB   r  )r2   r  r   r  r   r  r  r$  r   r  r  rO  r  loss_kwargsrH  rB   slice_indicesr  r  r  s                       r6   rE   zZambaForCausalLM.forwardN  sL   P 2C1N-TXT_T_TqTq %9$D $++JjJj 	 &1%<k$++B]B] **)%+'/!5)#  
  
8B>SV8W~ot4]kmA}a,?@A%4%%ffdooUUDY,F'+'7D7V#CVC%#33!//))
 	
r7   c           	      t   |d u }	|	sZ||d   |j                   d   k\  r|d d |j                   d    d f   }nc|j                   d   |j                   d   k7  rD|d d |f   }n:t        | j                  |j                   d   | j                  | j                        }|T|R|j                         j                  d      dz
  }|j                  |dk(  d       |	s|d d |j                   d    d f   }||	rd|i}
nd|j                         i}
|
j                  ||||| j                  j                  |d       |
S )Nr:   r    r   )r<   r^   r  r  )r  r   r$  r   r  rO  )rH   r[   rw   r<   r^   longcumsummasked_fill_r   r   num_logits_to_keep)r2   r  r   r   r  rO  r  r$  r   empty_past_kvmodel_inputss              r6   prepare_inputs_for_generationz.ZambaForCausalLM.prepare_inputs_for_generation  sc    (4/  )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	5Y__Q/tzz$++O %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0"&++"@"@"0		
 r7   )NNNNNNNNNNNr   )NNNNNT)rK   rL   rM   r!   r,   r  r  r  r  r  r  r   r   r.   r   r   r[   r   rJ  r	   r   r   r   rE   r  rN   rO   s   @r6   r  r  1  s   { '(&  151537=A59-1$(,0/3&*5934O
E,,-O
 !.O
 u//0	O

 ""9:O
   1 12O
 ))*O
 D>O
 $D>O
 'tnO
 d^O
 !!1!12O
 c5<</0O
 
u,,	-O
 O
h 9r7   r  a  
    The Zamba Model with a sequence classification head on top (linear layer).

    [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                   L    e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deeeeej                     f      deej                     d	eej                     d
ee   dee   dee   dee   deeef   fd       Z xZS )ZambaForSequenceClassificationc                    t         |   |       |j                  | _        t        |      | _        | j                  j
                  | _        t        j                  |j                  | j                  d      | _	        | j                          y r2  )r+   r,   
num_labelsr  r]  r  r
   r   r3   scorer  r8  s     r6   r,   z'ZambaForSequenceClassification.__init__  se      ++'
"&**"?"?YYv114??O
 	r7   c                 .    | j                   j                  S r   r  rI   s    r6   r  z3ZambaForSequenceClassification.get_input_embeddings  r  r7   c                 &    || j                   _        y r   r  r  s     r6   r  z3ZambaForSequenceClassification.set_input_embeddings  r  r7   r  r   r  r   r  r  r$  r   r  r  rQ   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }||j                  d   }n|j                  d   }| j                   j
                  |dk7  rt        d      | j                   j
                  d}n||| j                   j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d       |t        j                  ||j                  	      |f   }d}||j                  |j                        }| j                   j"                  | j$                  dk(  rd
| j                   _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                   _        nd| j                   _        | j                   j"                  d
k(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }n |||      }n| j                   j"                  dk(  r=t1               } ||j3                  d| j$                        |j3                  d            }n,| j                   j"                  dk(  rt5               } |||      }|
s|f|dd z   }||f|z   S |S t7        |||j8                  |j:                  |j<                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r  r   r  r$  r   r  r  r   r    z=Cannot handle batch sizes > 1 if no padding token is defined.r:   r]   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r`   
regressionsingle_label_classificationmulti_label_classificationr  )rw   r  r]  r  rH   r  r.  r=   r^   r.   int32r   argmaxr   r   r5   rK   problem_typer  r<   r  r   r   r   r   r   r   r   r   rB   r  )r2   r  r   r  r   r  r  r$  r   r  r  transformer_outputsrB   r  rx   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctr  s                         r6   rE   z&ZambaForSequenceClassification.forward  s   ( &1%<k$++B]B]"jj)%+'/!5# ) 

 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaabYYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r7   r  )rK   rL   rM   r,   r  r  r   r   r.   r   r   r	   r   r   r   rJ  r   r   rE   rN   rO   s   @r6   r  r    s)   '(  151537KO59-1$(,0/3&*[
E,,-[
 !.[
 u//0	[

 "%tE4E4E/F(F"GH[
   1 12[
 ))*[
 D>[
 $D>[
 'tn[
 d^[
 
u66	7[
 [
r7   r  )r  r  r  r\  )r   )Pr   rk  typingr   r   r   r   r   r   r	   r.   torch.utils.checkpointr
   torch.nnr   r   r   activationsr   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   utils.import_utilsr   r   configuration_zambar!   &mamba_ssm.ops.selective_scan_interfacer"   r#   +mamba_ssm.ops.triton.selective_state_updater$   causal_conv1dr%   r&   r  r   
get_loggerrK   r   Moduler)   rs   r   r   rY   r[   r  r   r   r   r0  r<  rL  rT  r\  r  r  r  __all__r   r7   r6   <module>r     sd  (   D D D    A A ! . ) > B q q F & 1 , T , XR@P=-~DD-7**.0@BVXfg 
 
		H	%J299 J(    L )	UU\\ 	U# 	U%,, 	U[fl [fJ %II%<<% 
% <<	%
 U\\*% % %4I)RYY I)XS]bii S]nryy  < <~ARYY AHEryy EP A? A AH M% M Mbh+_ hV m
%9 m
m
` gr7   