
    Uh                        d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZmZ ddlm Z   ejB                  e"      Z# e       rddl$m%Z% ndZ% e       rddl&m'Z'm(Z( ddl)m*Z* nd\  Z*Z(Z' e       r	ddl+m,Z,m-Z- nd\  Z-Z, e.e*e(e,e-e'f      Z/ G d dej`                        Z1 G d dej`                        Z2 G d dej`                        Z3e G d de             Z4e G d de             Z5e G d  d!e             Z6e G d" d#e4             Z7 ed$%       G d& d'e4e             Z8g d(Z9y))zPyTorch MAMBA model.    N)	dataclass)AnyDictOptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN)
MambaCache)GenerationMixin)PreTrainedModel)ModelOutputauto_docstringlogging)is_causal_conv1d_availableis_mamba_ssm_availableis_mambapy_available   )MambaConfig)pscan)mamba_inner_fnselective_scan_fn)selective_state_updateNNN)causal_conv1d_fncausal_conv1d_update)NNc            
       @    e Zd ZdZdedef fdZ	 	 	 ddej                  de	e
   de	ej                     de	ej                     fd	Zdde	e
   de	ej                     de	ej                     fd
Z	 	 	 dde	e
   de	ej                     de	ej                     fdZ xZS )
MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    config	layer_idxc           	      |   t         |           || _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        |j                        | _
        || _        |j                  | _        t        j                  | j                  | j                  |j                  |j                  | j                  |j                  dz
        | _        |j                   | _        t$        |j                      | _        |j(                  | _        t        j*                  | j                  | j                  dz  |j,                        | _        t        j*                  | j                  | j                  | j
                  dz  z   d      | _        t        j*                  | j                  | j                  d      | _        t5        j6                  d| j
                  dz   t4        j8                        d d d f   }|j;                  | j                  d      j=                         }t        j>                  t5        j@                  |            | _!        t        j>                  t5        jD                  | j                              | _#        t        j*                  | j                  | j                  |j,                        | _$        |j,                  | _        tJ        sM| j(                  r+tM               rtN        jQ                  d	       y tS        d
      tN        jQ                  d       y y )Nr   )in_channelsout_channelsbiaskernel_sizegroupspadding   r&   FTdtypea7  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzuse_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py.a  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.)*super__init__r!   hidden_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeintermediate_sizeinttime_step_rankr"   use_conv_biasr	   Conv1dconv1d
hidden_act
activationr   actuse_mambapyLinearuse_biasin_projx_projdt_projtorcharangefloat32expand
contiguous	ParameterlogA_logonesDout_projis_fast_path_availabler   loggerwarning_onceImportError)selfr!   r"   A	__class__s       z/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/mamba/modeling_mamba.pyr0   zMambaMixer.__init__F   s{   !--$// & 2 2!'!9!9!&"7"78"#11ii..//%%**))&&*
 !++&++,!-- yy!1!143I3IA3MTZTcTcdii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!35==I$PQ'RHHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQWQ`Q`a%')''F & Z  ##J &    hidden_statescache_paramscache_positionattention_maskc                 	   | j                  |      j                  dd      }| j                  r%|"t        || j                  j
                  | j                  r| j                  j                  nd | j                  j
                  | j                  j
                  | j                  j
                  | j                  r$| j                  j                  j                         nd t        j                  | j                  j                                d d | j                   j                         | j                  j                  j                         d      }|S |j#                  dd      \  }}|||j%                  d      z  }| j                  j
                  j'                  | j                  j
                  j)                  d      | j                  j
                  j)                  d            }|m|d   dkD  ret+        |j-                  d      |j.                  | j0                     || j                  j                  | j2                        }|j%                  d      }n|Yt4        j6                  j9                  || j:                  |j<                  d   z
  df      }	|j?                  | j0                  |	|       tA        ||| j                  j                  | j2                        }|||j%                  d      z  }| j                  |j                  dd            }
t        jB                  |
| jD                  | jF                  | jF                  gd      \  }}}| j                  j
                  |j                  dd      z  }t        j                  | j                  j                                }tI        | j                  d	      r$| j                  j                  j                         nd }|e|d   dkD  r]tK        |jL                  | j0                     |d
   |d
   ||d d df   |d d df   | j                   |d
   |d
      j%                  d      }nptO        ||||j                  dd      |j                  dd      | j                   j                         ||dd
      \  }}|||jQ                  | j0                  |       | j                  |j                  dd            }|S )Nr   r*   T)
delta_biasdelta_softplusdimr   r.   )r=   r&   ).r   )dt_softplus)r_   return_last_state))rB   	transposetrainingr   r;   weightr9   r&   rC   rD   rO   rA   floatrE   exprL   rN   chunk	unsqueezeviewsizer   squeezeconv_statesr"   r=   r	   
functionalpadr5   shapeupdate_conv_stater   splitr8   r3   hasattrr   
ssm_statesr   update_ssm_state)rT   rY   rZ   r[   r\   projected_statescontextualized_statesgateconv_weightsrn   ssm_parameters	time_stepBCdiscrete_time_steprU   time_proj_biasscan_outputs	ssm_states                      rW   cuda_kernels_forwardzMambaMixer.cuda_kernels_forward   s7     <<6@@AF==\1$2 ""$($6$6  D""##$$.2mm""((*4::++-..<<,,224#%!p %$O #3"8"8"8"BM4) -0H0H0K K  ;;--224;;3E3E3J3J13Mt{{OaOaOfOfghOijL'N1,=,A 4!))"- ,,T^^< KK$$OO! !. 7 7 ;+"$--"3"3%(=(=@S@STV@W(WYZ'[#K !224>>;P^_ 0!<1A1Adoo! ) -0H0H0K K "[[)@)@A)FGN#kk!4!4d6I6I4K^K^ _egOIq! "&!4!4y7J7J1a7P!P4::++-..A:A$,,PV:WT\\..446]aN'N1,=,A5 ++DNN;!&)&v.adGadGFFL" $  )B-  +<!&KK1%KK1%FFLLN"#'&*+'i (\-E 11$..)L %)MM,2H2HA2N$O!$$rX   c           	      X   |j                   \  }}}|j                  }| j                  |      j                  dd      }	|	j	                  dd      \  }
}||
|j                  d      z  }
||j                  | j                     j                         }|j                  |
j                        }|j                   d   | j                  k(  rt        j                  j                  |
| j                  |
j                   d   z
  df      }|j                  | j                  ||       | j!                  | j#                  |
      dd |f         }
n9|j                  | j                  |
|      }|j                  | j"                  j$                  j                        }t'        j(                  || j"                  j$                  d d dd d f   z  d      }
| j*                  r|
| j"                  j,                  z  }
| j!                  |
      j                  |      j                  d      }
n`t'        j.                  || j0                  | j2                  f|
j                  |      }| j!                  | j#                  |
      dd |f         }
||
|j                  d      z  }
| j5                  |
j                  dd            }t'        j6                  || j8                  | j2                  | j2                  gd      \  }}}| j;                  |      }t        j                  j=                  |      j                  dd      }t'        j>                  | j@                  jC                                }t'        j>                  |d d d d d d f   |d d d d d d d f   z        }|d d d d d d d f   |d d d d d d d f   jC                         z  }||
d d d d d d d f   jC                         z  }| jD                  r| jF                  r|tI        |j                  dd      |j                  dd            }||j                  d      z  jK                  d      j                  dd      }||
| jL                  d d d d f   z  z   }|| j!                  |      z  }ng }tO        |      D ]}  }|d d d d |d d f   |z  |d d d d |d d f   z   }t'        jP                  |j                  |      |d d |d d f   j                  d            }|jS                  |d d d d df           t'        jT                  |d      }||
| jL                  d d d d f   z  z   }|| j!                  |      z  }|(|j                  | j                     jW                  |       | jY                  |j                  dd            }|S )	Nr   r*   r`   r   r.   .devicer-   r   )-rq   r-   rB   rd   ri   rj   ru   r"   clonetor   r5   r	   ro   rp   rr   r>   r;   rf   rE   sumr9   r&   zerosr6   r3   rC   rs   r8   rD   softplusrh   rL   rg   r?   re   r   rm   rN   rangematmulappendstackcopy_rO   )rT   input_statesrZ   r[   r\   
batch_sizeseq_len_r-   rw   rY   ry   r   
conv_stater{   r|   r}   r~   r   rU   
discrete_A
discrete_BdeltaB_uhsscan_outputr   irx   s                               rW   slow_forwardzMambaMixer.slow_forward   s   !-!3!3
GQ""<<5??1E.44QA4>t%)N,D,DQ,GGM #$//?EEGI!]%9%9:I ##A&$*?*??]]..!**]-@-@-DDaH

 ..t~~z>Z $])CC'M)R S);;DNNM[ij
']]4;;+=+=+D+DE
 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB OT33T5H5HI$++5I !HHT[[%?XgX%NOM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a "\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DD ,2Fz++Aq183E3Ea3KLBB/88;EEaKK%tQ}8M(MMK%6KL7^ :&q!Qz2Y>!QPQST*AUU	#ll9<<+>!Q'
@T@TUW@XY##K1a$89:  ++l;K%a9N)NOK&$7K'''7==iH !%k.C.CAq.I J$$rX   c                     t         r^d| j                  j                  j                  j                  v r2t
        j                  j                         s| j                  ||||      S | j                  ||||      S )Ncuda)
rP   rC   rf   r   typerE   _dynamois_compilingr   r   )rT   rY   rZ   r[   r\   s        rW   forwardzMambaMixer.forward:  se     "f0B0B0I0I0N0N&NW\WdWdWqWqWs,,]L.Zhii  nn]]rX   r   )__name__
__module____qualname____doc__r   r7   r0   rE   Tensorr   r   
LongTensorr   r   r   __classcell__rV   s   @rW   r    r    >   s   :{ :s :~ .25959c%||c% z*c% !!1!12	c%
 !!1!12c%LO%x
7K O%aijojzjza{ O%  S[  \a  \l  \l  Sm O%j .25959	^ z*	^ !!1!12		^
 !!1!12	^rX   r    c                   ,     e Zd Zd fd	Zd Zd Z xZS )MambaRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zL
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        N)r/   r0   r	   rJ   rE   rM   rf   variance_epsilon)rT   r1   epsrV   s      rW   r0   zMambaRMSNorm.__init__G  s1     	ll5::k#:; #rX   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr*   r.   T)keepdim)	r-   r   rE   rG   powmeanrsqrtr   rf   )rT   rY   input_dtypevariances       rW   r   zMambaRMSNorm.forwardO  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::rX   c                 R    | j                   j                  d    d| j                   S )Nr   z, eps=)rf   rq   r   rT   s    rW   
extra_reprzMambaRMSNorm.extra_reprV  s*    ++##A&'vd.C.C-DEErX   )gư>)r   r   r   r0   r   r   r   r   s   @rW   r   r   F  s    $;FrX   r   c                   t     e Zd Z fdZ	 	 	 ddee   deej                     deej                     fdZ xZ	S )
MambaBlockc                     t         |           || _        || _        |j                  | _        t        |j                  |j                        | _        t        ||      | _
        y )Nr   r"   )r/   r0   r!   r"   residual_in_fp32r   r1   layer_norm_epsilonnormr    mixer)rT   r!   r"   rV   s      rW   r0   zMambaBlock.__init__[  sR    " & 7 7 !3!39R9RS	)<
rX   rZ   r[   r\   c                    |}| j                  |j                  | j                   j                  j                              }| j                  r|j                  t
        j                        }| j                  ||||      }||z   }|S )Nr,   rZ   r[   r\   )r   r   rf   r-   r   rE   rG   r   )rT   rY   rZ   r[   r\   residuals         rW   r   zMambaBlock.forwardc  s     !		-"2"29I9I9O9O"2"PQ  {{5==1H

^dr # 
 !=0rX   r   )
r   r   r   r0   r   r   rE   r   r   r   r   s   @rW   r   r   Z  sQ    = .25959 z* !!1!12	
 !!1!12rX   r   c                   *    e Zd ZeZdZddgZdZdZd Z	y)MambaPreTrainedModelbackboner   r    Tc                    t        |t              rWd|j                  _        d|j                  _        | j
                  j                  dz  | j
                  j                  z  }| j
                  j                  dk(  r5t        j                  j                  |j                  j                  |       nO| j
                  j                  dk(  r6t        j                  j                  |j                  j                  | |       t        j                   t        j"                  | j
                  j$                        t'        j(                  | j
                  j*                        t'        j(                  | j
                  j,                        z
  z  t'        j(                  | j
                  j,                        z         j/                  | j
                  j0                        }|t        j(                  t        j2                  |              z   }t        j4                         5  |j                  j6                  j9                  |       ddd       d|j                  j6                  _        t        |t        j<                        rM|j6                  t?        |j6                  dd      st        j                  jA                  |j6                         nYt        |t        jB                        r?t        j                  jE                  |j                  | j
                  jF                  	       | j
                  jH                  r|jK                         D ]  \  }}|d
v st        j                  jM                  |t'        jN                  d             t        j4                         5  |t'        jN                  | j
                  jP                        z  }ddd        yy# 1 sw Y   xY w# 1 sw Y   xY w)zInitialize the weights.Tg      constantrandom)minN
_no_reinitF)std)zout_proj.weight   )a))
isinstancer    rL   _no_weight_decayrN   r!   r8   time_step_scaletime_step_init_schemer	   init	constant_rD   rf   uniform_rE   rh   randr6   mathrK   time_step_maxtime_step_minclamptime_step_floorexpm1no_gradr&   r   r   r@   getattrzeros_	Embeddingnormal_initializer_rangerescale_prenorm_residualnamed_parameterskaiming_uniform_sqrtnum_hidden_layers)rT   moduledt_init_stddtinv_dtnameps          rW   _init_weightsz"MambaPreTrainedModel._init_weights~  s   fj),0FLL)(,FHH%++44d:T[[=X=XXK{{00J>!!&.."7"7E22h>  !6!6kR

4;;88988DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566F 2##))&12-1FNN*fbii({{&v{{L%@GGNN6;;/-GGOOFMMt{{/L/LOM;;// "224 Fa..
 GG,,Q$))A,,? FTYYt{{'D'DEEF FF 02 22F Fs    &O"-O(O%(O1	N)
r   r   r   r   config_classbase_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr    rX   rW   r   r   v  s)    L"%|4&*#L-FrX   r   c                   |    e Zd ZU dZdZeej                     ed<   dZ	ee
   ed<   dZeeej                        ed<   y)MambaOutputa#  
    Class for the MAMBA model outputs.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cache_params (`MambaCache`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.

            Includes both the State space model state matrices after the selective scan, and the Convolutional states
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    Nlast_hidden_staterZ   rY   )r   r   r   r   r   r   rE   FloatTensor__annotations__rZ   r   rY   r   r   rX   rW   r   r     sH    $ 6:x 1 129)-L(:&-8<M8E%"3"345<rX   r   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
ee   ed<   dZeeej                        ed<   y)MambaCausalLMOutputa  
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        cache_params (`MambaCache`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.

            Includes both the State space model state matrices after the selective scan, and the Convolutional states
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    NlosslogitsrZ   rY   )r   r   r   r   r   r   rE   r   r   r   rZ   r   rY   r   r   rX   rW   r   r     s\    ( )-D(5$$
%,*.FHU&&'.)-L(:&-8<M8E%"3"345<rX   r   c                       e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee   dee   d	ee   d
ee   dee	j                     dee	j                     deeef   fd       Z xZS )
MambaModelc           	         t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _        t        |j
                  |j                        | _        | j!                  | j"                         | j%                          y c c}w )Nr   Fr   )r/   r0   r	   r   
vocab_sizer1   
embeddings
ModuleListr   r   r   layersgradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_init)rT   r!   idxrV   s      rW   r0   zMambaModel.__init__  s     ,,v'8'8&:L:LMmmRWX^XpXpRq$r3Z#%F$rs&+#"6#5#56;T;TU//? %ss   &Cc                 f    |D ],  }d|v s|j                  |      ||j                  dd      <    y  y )Nz
embedding.zembeddings.)popreplace)rT   
state_dictprefixargsks        rW   r
  zMambaModel.load_hook  s;     	Aq EO^^TUEV
199\=AB	rX   c                     | j                   S Nr  r   s    rW   get_input_embeddingszMambaModel.get_input_embeddings  s    rX   c                     || _         y r  r  rT   new_embeddingss     rW   set_input_embeddingszMambaModel.set_input_embeddings  s	    (rX   	input_idsinputs_embedsrZ   	use_cacheoutput_hidden_statesreturn_dictr[   r\   returnc	                    ||n| j                   j                  }||n#| j                  s| j                   j                  nd}||n| j                   j                  }|du |duz  rt        d      || j                  |      }| j                  r| j                  r|rd}|r|st        | j                   |j                  d      |j                  |j                        }t        j                  d| j                   j                  |j                        }n|t        d      d}|}	|rdnd}
| j                  D ]O  }| j                  r,| j                  r | j!                  |j"                  |	|||      }	n ||	|||	      }	|sJ|
|	fz   }
Q | j%                  |	      }	|r|
|	fz   }
|st'        d
 |	||
fD              S t)        |	|r||
      S d|
      S )a  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r   r   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyr   r   c              3   &   K   | ]	  }||  y wr  r   ).0vs     rW   	<genexpr>z%MambaModel.forward.<locals>.<genexpr>I  s     fqXYXefs   )r   rZ   rY   )r!   r  re   r  use_return_dict
ValueErrorr  r  r   rl   r   r-   rE   rF   r4   r  _gradient_checkpointing_func__call__r  tupler   )rT   r  r  rZ   r  r  r   r[   r\   rY   all_hidden_statesmixer_blocks               rW   r   zMambaModel.forward  s   ( %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]-t";<YZZ  OOI6M&&4==YI#)KK!3!3A!6}?S?S[h[n[n  "'a1H1HQ^QeQe!f' !;   L%"6BD;; 	IK**t}} $ A A((-~We! !,!!-#1#1	! $$58H$H!	I  M2 1]4D Df]LBS$Tfff+)2+
 	
8<+
 	
rX   )NNNNNNNN)r   r   r   r0   r
  r  r  r   r   rE   r   r   boolr   r   r   r   r   r   s   @rW   r  r    s    
)  1548-1$(/3&*5959Q
E,,-Q
   0 01Q
 z*	Q

 D>Q
 'tnQ
 d^Q
 !!1!12Q
 !!1!12Q
 
uk!	"Q
 Q
rX   r  z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                       e Zd ZdgZ fdZd Zd Zd Zd Z	 dde	de
eef   d	ed
e
eef   fdZ	 	 	 	 	 ddee   deej$                     deej$                     fdZe	 	 	 	 	 	 	 	 	 ddeej$                     deej$                     deej*                     dee   deej$                     dee   dee   dee   deej.                     d
eeef   fd       Z xZS )MambaForCausalLMzlm_head.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFr+   )
r/   r0   r  r   r	   r@   r1   r  lm_headr  )rT   r!   rV   s     rW   r0   zMambaForCausalLM.__init__[  sF     "6*yy!3!3V5F5FUSrX   c                     | j                   S r  r4  r   s    rW   get_output_embeddingsz&MambaForCausalLM.get_output_embeddingsb  s    ||rX   c                     || _         y r  r6  r  s     rW   set_output_embeddingsz&MambaForCausalLM.set_output_embeddingse  s	    %rX   c                 6    | j                   j                         S r  )r   r  r   s    rW   r  z%MambaForCausalLM.get_input_embeddingsh  s    }}1133rX   c                 8    | j                   j                  |      S r  )r   r  r  s     rW   r  z%MambaForCausalLM.set_input_embeddingsk  s    }}11.AArX   outputsmodel_kwargsnum_new_tokensr!  c                    |j                  dd       |d<   |j                  dd      rd|v r|d   |d   dd  |z   |d<   d|v r?|d   }t        j                  ||j                  |j                  d   df      gd	      |d<   |S )
NrZ   r  Tr[   r.   r\   r   r   r`   )getrE   catnew_onesrq   )rT   r<  r=  r>  kwargsr\   s         rW   #_update_model_kwargs_for_generationz4MambaForCausalLM._update_model_kwargs_for_generationn  s     (/{{>4'H^$[$/ L0-.:-9:J-KBC-PSa-aL)*|+)*:;N-2YY!8!8.:N:Nq:QST9U!VW]_.L)* rX   rZ   r[   r\   c                 <   |rh|t        d      |d   dkD  r|d d df   j                  d      }|9d }n6t        j                  d| j                  j
                  |j                        }||d|i}nd|j                         i}|j                  ||||d       |S )Nz`cache_position` should not be None as it should have been initialized in `model.generate`, you are responsible for passing in a valid `cache_position` if you are calling `prepare_inputs_for_generation` directly with `use_cache=True`r   r.   r#  r  r  )rZ   r  r[   r\   )	r)  rj   rE   rF   r!   r4   r   rI   update)	rT   r  r  r  rZ   r[   r\   rC  model_inputss	            rW   prepare_inputs_for_generationz.MambaForCausalLM.prepare_inputs_for_generation  s     % e 
 a 1$%ae,66r:	!-%)N "'a1H1HQZQaQa!b$)=+];L')=)=)?@L ,&"0"0		
 rX   r  r  labelsr  r   r  c
           
         ||n| j                   j                  }| j                  |||||||	|      }|d   }| j                  |j	                  | j                  j
                  j                              j                         }d}||j	                  |j                        }|dddddf   j                         }|dddf   j                         }t               } ||j                  d|j                  d            |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )aS  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        N)rZ   r  r  r   r  r[   r\   r   .r.   r   )r   r   rZ   rY   )r!   r(  r   r4  r   rf   r-   rg   r   rI   r
   rk   rl   r   rZ   rY   )rT   r  r\   r  rZ   rI  r  r   r  r[   rC  mamba_outputsrY   r   r   shift_logitsshift_labelsloss_fctoutputs                      rW   r   zMambaForCausalLM.forward  s_   2 &1%<k$++B]B]%'!5#)) & 	
 &a(m..t||/B/B/H/HIJPPRYYv}}-F!#ssA+.99;L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`DYqr!22F)-)9TGf$EvE"&33'55	
 	
rX   )r   )NNNNN)	NNNNNNNNN)r   r   r   _tied_weights_keysr0   r7  r9  r  r  r   r   strr   r7   rD  r   r   rE   r   rH  r   r   r/  r   r   r   r   r   r   r   s   @rW   r2  r2  R  s    ++&4B YZ"26sCx.RU	c3h, -15959.
 z*. !!1!12. !!1!12.`  155959-1-1/3&*$(15<
E,,-<
 !!1!12<
   1 12	<

 z*<
 ))*<
 'tn<
 d^<
 D><
 !.<
 
u))	*<
 <
rX   r2  )r2  r  r   ):r   r   dataclassesr   typingr   r   r   r   r   rE   torch.utils.checkpointr	   torch.nnr
   activationsr   cache_utilsr   
generationr   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   r   configuration_mambar   
get_loggerr   rQ   mambapy.pscanr   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater   causal_conv1dr   r   allrP   Moduler    r   r   r   r   r   r  r2  __all__r   rX   rW   <module>re     s     ! 4 4    % ! % ) - 
 k j , 
		H	%#EXR@P=-~DD-7**.0@BVXfg 
E^ E^PF299 F( 8 4F? 4F 4Fn =+ = =0 =+ = =6 k
% k
 k
\ V
+_ V
V
r ErX   