
    Uh+                     R   d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ  ej6                  e      Zdad Z G d dej@                  jB                        Z"d%dZ#d%dZ$ G d dejJ                        Z& G d dejJ                        Z' G d dejJ                        Z(e G d de             Z)e G d de             Z*e G d de             Z+e G d de)             Z, ed !       G d" d#e)e             Z-g d$Z.y)&zPyTorch RWKV model.    N)	dataclass)Path)ListOptionalTupleUnion)nn   )GenerationMixin)PreTrainedModel)ModelOutputauto_docstringis_bitsandbytes_availableis_ninja_availableis_torch_cuda_availablelogging   )
RwkvConfigc                    ddl m} t        t              j	                         j
                  j
                  j
                  dz  dz  }dD cg c]  }||z  	 }}t        t        j                  | k(  ry t        j                  d|  d       dd	d
dddd|  g} |d|  |t        j                         t        j                  k(  |      a| t        _        y c c}w )Nr   )loadkernelsrwkv)z
wkv_op.cppzwkv_cuda.cuzwkv_cuda_bf16.cuz2Loading CUDA kernel for RWKV at context length of .z
-res-usagez--maxrregcount 60z--use_fast_mathz-O3z-Xptxas -O3z--extra-device-vectorizationz-DTmax=wkv_)namesourcesverboseextra_cuda_cflags)torch.utils.cpp_extensionr   r   __file__resolveparentrwkv_cuda_kernelmax_seq_lengthloggerinfor   get_verbosityDEBUG)context_lengthload_kernelkernel_folderfcuda_kernel_filesflagss         x/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/rwkv/modeling_rwkv.pyload_wkv_cuda_kernelr0   .   s    = N**,33::AAIMPVVM4efq*ff #(8(G(G>(Y
KKD^DTTUVW 	&
.!"E #N#$!&&(GMM9	 '5#/ gs   Cc                   0    e Zd Zedd       Zedd       Zy)RwkvLinearAttentionNc                    |j                         \  }}}	|t        j                  kD  r t        d| dt        j                   d      ||	z  t	        |	d      z  dk7  rt        d| d|	 dt	        |	d       d	      |j
                  | _        |j                  j                  d
k7  sK|j                  j                  d
k7  s2|j                  j                  d
k7  s|j                  j                  d
k7  rt        d      t        j                  |j                         j                                }|j
                  t        j                  k(  r0|j                         }|j                         }|j                         }|j                         }|j                         }|j                         }t        j                  |t        j                        }
|s||Vt        j                   ||	dt        j"                  |j                  t        j                        }|d d d d dfxx   dz  cc<   nBt        j$                  |D cg c]  }|j'                  d       c}d      j                         }|j
                  t        j(                  k(  rt        j*                  }nt        j,                  } ||||||
|       nI|j
                  t        j(                  k(  rt        j.                  nt        j0                  } ||||||
       | j3                  |||||
       |4t        j4                  |dd      D cg c]  }|j7                  d       }}|
j9                  | j                        |fS c c}w c c}w )NzCannot process a batch with z+ tokens at the same time, use a maximum of z with this model.    r   zThe product of batch size (z) and hidden size (z") needs to be a round multiple of r   cudazUCalling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.memory_formatr
   )dtypedevicer7      籡*G)dim)sizer#   r$   
ValueErrorminr8   input_dtyper9   typetorchexpfloat
contiguousfloat16
empty_likecontiguous_formatzerosfloat32cat	unsqueezebfloat16forward_with_state_bf16forward_with_stateforward_bf16forwardsave_for_backwardchunksqueezeto)ctx
time_decay
time_firstkeyvaluestatereturn_state
batch_sizeseq_lenhidden_sizeoutputsforward_funcs                r/   rQ   zRwkvLinearAttention.forwardO   s   +.88:(
G[%444.wi7b#2233DF  #c+r&::a?-j\9L[M Z";34A7 
 )) ""f,  %%/zz&(||  F*tuuii
 0 0 2 = = ?@@
99%#))+J))+CKKME**,
nn  "!!#U5L5LM5,}--::"'"9"9 aAg$&		5"Aa1;;q>"AqITTVyyENN*/GG/BBZeVUK<?II<W+88]m]u]uLZeVDj*c5&I+0;;uaQ+GHaQYYq\HEHyy)500 #B Is   ?M+4M0c                 <   | j                   }| j                  \  }}}}}t        j                  |t        j                  |t        j
                  k(  rt        j
                  nt        j                        }	t        j                  |t        j                        }
t        j                  |t        j                        }t        j                  |t        j                        }|t        j                  k(  r|j                         }|t        j
                  k(  rt        j                  nt        j                  } |||||||j                         |	|
||
       |	j                  |      |
j                  |      |j                  |      |j                  |      d d fS )N)r7   r8   r6   )r@   saved_tensorsrB   rG   rH   rM   rJ   rF   rD   r#   backward_bf16backwardrE   rU   )rV   g_outputg_stater@   rW   rX   rY   rZ   r`   g_time_decayg_time_firstg_keyg_valuebackward_funcs                 r/   rf   zRwkvLinearAttention.backward   sJ    oo585F5F2
JUF''11$/5>>$A%..u}}

 ''
%BYBYZ  E4K4KL""58O8OP%--'~~'H:E:W(66]m]v]v!	
 OOK(OOK(HH[!JJ{#
 	
    NFN)__name__
__module____qualname__staticmethodrQ   rf    rn   r/   r2   r2   N   s)    <1 <1| %
 %
rn   r2   c                    |j                         \  }}}t        j                  |      }|t        j                  |d d df   t        j                        }	t        j                  |d d df   t        j                        }
t        j                  |d d df   t        j                        dz
  }n|\  }	}
}t        j                  |        } t        |      D ]  }|d d |f   j                         }|d d |f   }t        j                  |||z         }t        j                  ||z
        }t        j                  ||z   |z
        }||	z  ||z  z   }||
z  |z   }||z  j                  |j                        |d d |f<   t        j                  || z   |      }t        j                  || z   |z
        }t        j                  ||z
        }||	z  ||z  z   }	||
z  |z   }
|} |s||	|
|g}||fS )Nr   )r8   r;   )
r=   rB   
zeros_likerJ   rC   rangerD   maximumrU   r8   )rW   rX   rY   rZ   r[   r\   _
seq_lengthr`   	num_state	den_state	max_statecurrent_indexcurrent_keycurrent_valuemax_for_outpute1e2	numeratordenominatormax_for_states                        r/   rwkv_linear_attention_cpur      s    xxzAz1c"F}$$SAYemmD	$$SAYemmD	$$SAYemmDtK	*/'	9i
 ))J''Jz* "!]*+113a./ y+
2JKYYy>12YY{Z/.@ANR-%77	9nr)$-$;#?#?#Mq-  i*&<kJYYy:-=>YY{]23NR-%77	NR'	!	%"( u(Iy15=rn   c                     t        d | |||fD              }|j                  d      dk(  }t        |s|rt        | |||||      S t        j                  | |||||      S )Nc              3   N   K   | ]  }|j                   j                  d k7    yw)r5   N)r9   rA   ).0ts     r/   	<genexpr>z(rwkv_linear_attention.<locals>.<genexpr>   s     Xa!((--6)Xs   #%r   r[   r\   )anyr=   r#   r   r2   apply)rW   rX   rY   rZ   r[   r\   no_cuda	one_tokens           r/   rwkv_linear_attentionr      sm    XJ
CQV3WXXG q I7i(ZeSXgstt"((ZeUT`aarn   c                   0     e Zd Zd fd	ZddZddZ xZS )RwkvSelfAttentionc                 r   t         |           || _        t        d uxr t        j                  |j
                  k(  }t               r"t               r|s	 t        |j
                         || _        |j                  }|j                  |j                  n|}|| _        t        j                   t#        j$                  |            | _        t        j                   t#        j$                  |            | _        t        j                   t#        j$                  dd|            | _        t        j                   t#        j$                  dd|            | _        t        j                   t#        j$                  dd|            | _        t        j0                  d      | _        t        j4                  ||d      | _        t        j4                  ||d      | _        t        j4                  ||d      | _        t        j4                  ||d      | _        y # t        $ r t        j                  d       Y w xY w)Nz9Could not load the custom CUDA kernel for RWKV attention.r   r   r   r   Fbias)super__init__configr#   r$   r)   r   r   r0   	Exceptionr%   r&   layer_idr_   attention_hidden_sizer	   	ParameterrB   emptyrW   rX   time_mix_keytime_mix_valuetime_mix_receptance	ZeroPad2d
time_shiftLinearrY   rZ   
receptancer`   )selfr   r   kernel_loadedr_   r   	__class__s         r/   r   zRwkvSelfAttention.__init__   s   (4q9I9X9X\b\q\q9q$;$=mY$V%:%:; !((,2,H,H,TF((Ze 	 &;",,u{{3H'IJ,,u{{3H'IJLLQ;)GH ll5;;q![+IJ#%<<Aq+0N#O ,,}599[*?eLYY{,AN
))K1FUSii 5{O)  YWXYs   H H65H6c                 p   |j                  d      dk(  r||d   d d d d | j                  f   }n3| j                  |      }| |d   d d d d | j                  f   |d d df<   || j                  z  |d| j                  z
  z  z   }|| j                  z  |d| j                  z
  z  z   }|| j
                  z  |d| j
                  z
  z  z   }| j                  |      }| j                  |      }t        j                  | j                  |            }| |d d df   |d   d d d d | j                  f<   ||||fS Nr   r   r   )r=   r   r   r   r   r   rY   rZ   rB   sigmoidr   )r   hiddenr[   shiftedrY   rZ   r   s          r/   extract_key_valuez#RwkvSelfAttention.extract_key_value  s<   ;;q>Q5#4Ahq!T]]23Goof-G  %aAt}})< =1t(((7a$:K:K6K+LL,,,w!d>Q>Q:Q/RRd666AH`H`D`9aa
hhsm

5!]]4??:#>?
,21b5ME!HQ4==()3u,,rn   c                      j                  ||      \  }}}}|t         fd|dd  D              nd }t         j                   j                  ||||      \  }}|T|d   |d   d d d d  j
                  f<   |d   |d   d d d d  j
                  f<   |d   |d   d d d d  j
                  f<    j                  ||z        |fS )	Nr[   c              3   J   K   | ]  }|d d d d j                   f     y wrp   r   )r   ra   r   s     r/   r   z,RwkvSelfAttention.forward.<locals>.<genexpr>#  s!     FqAaDMM12Fs    #r:   r   r   r   r
      )r   tupler   rW   rX   r   r`   )	r   r   r[   	use_cacher   rY   rZ   layer_stater   s	   `        r/   rQ   zRwkvSelfAttention.forward!  s    (,(>(>vU(>(S%
CJOJ[eFE!"IFFae1OOOO"
k ",7NE!HQ4==(),7NE!HQ4==(),7NE!HQ4==(){{:,-u44rn   r   rp   ro   )rq   rr   rs   r   r   rQ   __classcell__r   s   @r/   r   r      s    P<-&5rn   r   c                   (     e Zd Zd fd	ZddZ xZS )RwkvFeedForwardc                 B   t         |           || _        || _        |j                  }|j
                  |j
                  nd|j                  z  }t        j                  d      | _        t        j                  t        j                  dd|            | _        t        j                  t        j                  dd|            | _        t        j                  ||d      | _        t        j                  ||d      | _        t        j                  ||d      | _        y )Nr   r   r   Fr   )r   r   r   r   r_   intermediate_sizer	   r   r   r   rB   r   r   r   r   rY   r   rZ   )r   r   r   r_   r   r   s        r/   r   zRwkvFeedForward.__init__6  s     (((.(@(@(LF$$RSV\VhVhRh 	 ,,}5LLQ;)GH#%<<Aq+0N#O 99[*;%H))K5IYY0+EJ
rn   c                 z   |j                  d      dk(  r||d   d d d d | j                  f   }n3| j                  |      }| |d   d d d d | j                  f   |d d df<   || j                  z  |d| j                  z
  z  z   }|| j                  z  |d| j                  z
  z  z   }t        j                  t        j                  | j                  |                  }| j                  |      }t        j                  | j                  |            }| |d d df   |d   d d d d | j                  f<   ||z  |fS r   )r=   r   r   r   r   rB   squarerelurY   rZ   r   r   )r   r   r[   r   rY   r   rZ   s          r/   rQ   zRwkvFeedForward.forwardG  s)   ;;q>Q5#4Ahq!T]]23Goof-G  %aAt}})< =1t(((7a$:K:K6K+LLd666AH`H`D`9aa
ll5::dhhsm45

3]]4??:#>?
,21b5ME!HQ4==()E!5((rn   r   rp   rq   rr   rs   r   rQ   r   r   s   @r/   r   r   5  s    K")rn   r   c                   &     e Zd Z fdZddZ xZS )	RwkvBlockc                    t         |           || _        || _        |dk(  r0t	        j
                  |j                  |j                        | _        t	        j
                  |j                  |j                        | _	        t	        j
                  |j                  |j                        | _
        t        ||      | _        t        ||      | _        y )Nr   )eps)r   r   r   r   r	   	LayerNormr_   layer_norm_epsilonpre_lnln1ln2r   	attentionr   feed_forward)r   r   r   r   s      r/   r   zRwkvBlock.__init__\  s     q=,,v'9'9v?X?XYDK<< 2 28Q8QR<< 2 28Q8QR*68<+FH=rn   c                    | j                   dk(  r| j                  |      }| j                  | j                  |      ||      \  }}||z   }| j	                  | j                  |      |      \  }}||z   }||f}|r||fz  }|S |dz  }|S )Nr   )r[   r   r   rp   )r   r   r   r   r   r   )r   r   r[   r   output_attentionsr   r   outputss           r/   rQ   zRwkvBlock.forwardj  s    ==A[[(F>>$((6*:%S\>]	5)#"//0@/Ne,&5/	|#G  wGrn   )NFFr   r   s   @r/   r   r   [  s    >rn   r   c                   0    e Zd ZeZdZdgZddgZdZdZ	d Z
y)RwkvPreTrainedModelr   r   rW   rX   Tc           	         t        |t              rv|j                  }|j                  j                  }|j                  j
                  }|j                  }||dz
  z  }d||z  z
  }t        j                  t        |      D cg c]  }||z  	 c}|j                  j                  |j                  j                        }	|	ddddf   }	t        |      D 
cg c]  }
dd|
|dz
  z  dd|z  z   z  z  z    }}
t        j                  ||j                  j                  |j                  j                        }t        j                  t        |      D cg c]  }|dz   d	z  dz
   c}|j                  j                  |j                  j                        d
z  }t        j                         5  ||j                  _        t        j"                  |j                  t%        j&                  d      z  |z         |j                  _        t        j(                  |	|      |j                  _        t        j(                  |	|      d|z  z   |j*                  _        t        j(                  |	d
|z        |j,                  _        ddd       yt        |t.              r|j                  }|j                  j                  }|j                  j
                  }d||z  z
  }t        j                  t        |      D cg c]  }||z  	 c}|j                  j                  |j                  j                        }	|	ddddf   }	t        j                         5  t        j(                  |	|      |j                  _        t        j(                  |	|      |j,                  _        ddd       yyc c}w c c}
w c c}w # 1 sw Y   yxY wc c}w # 1 sw Y   yxY w)zInitialize the weights.r   g      ?r8   r9   N   gffffff?g?r
   g      ?g333333?)
isinstancer   r   r   num_hidden_layersr_   r   rB   tensorrx   r   r8   r9   rW   rX   no_graddata	ones_likemathlogpowr   r   r   )r   moduler   r   r_   r   ratio_0_to_1ratio_1_to_almost0itime_weighthdecay_speedzigzags                r/   _init_weightsz!RwkvPreTrainedModel._init_weights  s]   f/0H & ? ? --33K$*$@$@!#'81'<=L!$3D(D!E,,*/*<=Q[=))//**11K
 &dD!m4K 45 Q!4q89sS<EW?WXXXK   ,,{&:K:K:Q:QZ`ZkZkZrZrsK.34I.JKa!eq[1_K ++11!,,33
    c)4!!&).9J9JTXXVY]9Z]c9c)d!!&+099[BT+U##(-2YY{DV-WZ]`lZl-l%%*27))KOaIa2b**/c c 0H & ? ? --33K!$3D(D!E,,*/*<=Q[=))//**11K
 &dD!m4K ]+099[BT+U##(27))KI[2\**/] ] 17 > Lc c >] ]s2   N&N+N0.CN55OAO5N>ON)rq   rr   rs   r   config_classbase_model_prefix_no_split_modules_keep_in_fp32_modulessupports_gradient_checkpointing_is_statefulr   ru   rn   r/   r   r   }  s1    L$)<8&*#L7]rn   r   c                       e Zd ZU dZdZeej                     ed<   dZ	ee
ej                        ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)
RwkvOutputa  
    Class for the RWKV model outputs.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlast_hidden_stater[   .hidden_states
attentions)rq   rr   rs   __doc__r   r   rB   FloatTensor__annotations__r[   r   r   r   r   ru   rn   r/   r   r     sw    , 6:x 1 129/3E8D**+,3=AM8E%"3"3S"89:A:>Ju00#567>rn   r   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)	RwkvCausalLMOutputa|  
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlosslogitsr[   .r   r   )rq   rr   rs   r   r   r   rB   r   r   r   r[   r   r   r   r   ru   rn   r/   r   r     s    0 )-D(5$$
%,*.FHU&&'./3E8D**+,3=AM8E%"3"3S"89:A:>Ju00#567>rn   r   c                       e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deeej                        dee   d	ee   d
ee   dee   deeef   fd       Zd Zd Z xZS )	RwkvModelc           	         t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j
                        | _        d| _        d| _        | j!                          y c c}w )Nr   F)r   r   r	   	Embedding
vocab_sizer_   
embeddings
ModuleListrx   r   r   blocksr   ln_outlayers_are_rescaledgradient_checkpointing	post_init)r   r   idxr   s      r/   r   zRwkvModel.__init__  s     ,,v'8'8&:L:LMmmPUV\VnVnPo$pYv%D$pqll6#5#56#( &+# 	 %qs   &C	c                     | j                   S rp   r  r   s    r/   get_input_embeddingszRwkvModel.get_input_embeddings  s    rn   c                     || _         y rp   r  r   new_embeddingss     r/   set_input_embeddingszRwkvModel.set_input_embeddings  s	    (rn   	input_idsattention_maskinputs_embedsr[   r   r   output_hidden_statesreturn_dictreturnc	           	         ||n| j                   j                  }||n| j                   j                  }||n#| j                  s| j                   j                  nd}||n| j                   j
                  }|t        j                  d       | j                  | j                  k(  r| j                          ||t        d      ||t        d      || j                  |      }|r||j                  d      | j                   j                  | j                   j                  f}	t        d      D 
cg c]A  }
t!        j"                  |	|
dk  r|j$                  nt         j&                  |j(                  d	C }}
|d
xx   dz  cc<   | j*                  r%| j                  r|rt        j                  d       d}|}|rdnd}|rdnd}t-        | j.                        D ]  \  }}| j*                  r0| j                  r$| j1                  |j2                  ||||      \  }}}n |||||      \  }}}| j                  r=| j                   j4                  dkD  r$|dz   | j                   j4                  z  dk(  r|dz  }|r||fz   }|s||fz   } | j7                  |      }|r||fz   }|st9        d ||||fD              S t;        ||||      S c c}
w )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        NFz<`attention_mask` was passed, but it is unused in this model.zDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsr      r   r   r   gꌠ9Y>)FzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...ru   )r[   r   r   r:   c              3   &   K   | ]	  }||  y wrp   ru   )r   xs     r/   r   z$RwkvModel.forward.<locals>.<genexpr>{  s     tqfgfsts   )r   r[   r   r   )r   r   r  trainingr   use_return_dictr%   warning_oncer  _rescale_layersr>   r  r=   r_   r   rx   rB   rI   r8   rJ   r9   r  	enumerater  _gradient_checkpointing_func__call__rescale_everyr  r   r   )r   r  r  r  r[   r   r   r  r  shaper   r   all_self_attentionsall_hidden_statesr
  blockr   s                    r/   rQ   zRwkvModel.forward  s   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]% ^_==D444  " ]%>cdd=#8TUU  OOI6M"''*DKK,C,CT[[EbEbcE
 q	  a-"5"5U]][h[o[oE  !HH&&4==##p "	%$5b4"6BD#DKK0 	JJC**t}}373T3TNNM5)EV40uj 49!)Wh40uj
 ((KK--11W 9 99Q> - 1#$58H$H! &9ZM&I#+	J. M2 1]4D Dt]E;LNa$bttt++*	
 	
es   5AKc           	         | j                   | j                   k(  ry | j                  j                  dkD  rt	        j
                         5  t        | j                        D ]  \  }}| j                  r|j                  j                  j                  j                  dt        || j                  j                  z        z         |j                  j                  j                  j                  dt        || j                  j                  z        z         t        |j                  j                  j                  d      r|j                  j                  j                  j                   j#                  dt        || j                  j                  z        z         |j                  j                  j                  j                   j#                  dt        || j                  j                  z        z         t        |j                  j                  j                  d      rN| j%                  |j                  j                  |       | j%                  |j                  j                  |       |j                  j                  j                  j#                  dt        || j                  j                  z        z         |j                  j                  j                  j#                  dt        || j                  j                  z        z          	 d d d        | j                   | _         y # 1 sw Y   xY w)Nr   r:   SCBquant_state)r  r  r   r$  rB   r   r!  r  r   r`   weightmul_intr   rZ   hasattrr*  div_ _bnb_4bit_dequantize_and_rescale)r   block_idr(  s      r/   r   zRwkvModel._rescale_layers  s`   ##DMM(9:;;$$q( r'0'= rOHe}}..55::1HPTP[P[PiPiDi@j;jk**0077<<Q#hRVR]R]RkRkFkBl=lm #5??#9#9#@#@%H!OO2299==BB1HX\XcXcXqXqLqHrCrs!..44;;??DDQ#hZ^ZeZeZsZsNsJtEtu$U__%;%;%B%BMR AA%//BXBXZbc AA%BTBTBZBZ\de!OO2299>>qCTXT_T_TmTmHmDn?no!..44;;@@c(VZVaVaVoVoJoFpApqrr" (,}}#4 #r rs   
KL77M c                    t               st        d      ddl}|j                  j	                  |j
                  j                  |j
                  j                        }|j                  dt        || j                  j                  z        z         |j                  j                  |j                  d      d      j                  |j                        }t!        |d|       y)	z
        Perform the dequantization and rescaling of the weights of a given layer. After that operation the layer will
        be quantized again.
        z/Please install bitsandbytes to use this method.r   Nr:   cpuF)requires_gradr,  )r   ImportErrorbitsandbytes
functionaldequantize_4bitr,  r   r+  r0  r.  r   r$  r	   
Params4bitrU   r9   setattr)r   target_layerr2  bnbdequant_weightsquant_weights         r/   r1  z*RwkvModel._bnb_4bit_dequantize_and_rescale  s    
 )*OPP"..889L9L9Q9QS_SfSfSrSrsQ#h$++2K2K&K"LLM vv((););E)BRW(X[[\k\r\rsh5rn   )NNNNNNNN)rq   rr   rs   r   r  r  r   r   rB   
LongTensorr   r   boolr   r   r   rQ   r   r1  r   r   s   @r/   r   r     s    )  15595937$(,0/3&*l
E,,-l
 !!1!12l
   1 12	l

 U../0l
 D>l
 $D>l
 'tnl
 d^l
 
uj 	!l
 l
\506rn   r   z
    The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                   0    e Zd ZdgZ fdZd Zd ZddZe	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     d	e	ee
j                        d
e	e
j                     de	e   de	e   de	e   de	e   deeef   fd       Z xZS )RwkvForCausalLMzhead.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFr   )
r   r   r   r   r	   r   r_   r  headr	  )r   r   r   s     r/   r   zRwkvForCausalLM.__init__  sH     f%	IIf00&2C2C%P	 	rn   c                     | j                   S rp   rF  r  s    r/   get_output_embeddingsz%RwkvForCausalLM.get_output_embeddings  s    yyrn   c                     || _         y rp   rH  r  s     r/   set_output_embeddingsz%RwkvForCausalLM.set_output_embeddings  s	    "	rn   c                 h    ||d d df   j                  d      }||d|i}nd|i}||d<   ||d<   |S )Nr   r  r  r[   r   )rL   )r   r  r[   r  r   kwargsmodel_inputss          r/   prepare_inputs_for_generationz-RwkvForCausalLM.prepare_inputs_for_generation  s]     !!R%(2226I $+];L'3L %W$-[!rn   r  r  r  r[   labelsr   r   r  r  r  c
           	      t   |	|	n| j                   j                  }	| j                  |||||||	      }|d   }| j                  |      }d}|* | j                  ||fd| j                   j
                  i|
}|	s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )aI  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        N)r  r[   r   r   r  r  r   r  r   )r   r   r[   r   r   )
r   r  r   rF  loss_functionr  r   r[   r   r   )r   r  r  r  r[   rP  r   r   r  r  rM  rwkv_outputsr   r   r   r`   s                   r/   rQ   zRwkvForCausalLM.forward  s    J &1%<k$++B]B]yy'/!5# ! 
 %Q=)%4%%  ;;11 	D Yab!11F)-)9TGf$EvE!$$&44#..
 	
rn   )NNN)	NNNNNNNNN)rq   rr   rs   _tied_weights_keysr   rI  rK  rO  r   r   rB   r@  r   r   rA  r   r   r   rQ   r   r   s   @r/   rD  rD    s    (#"  15595937-1$(,0/3&*F
E,,-F
 !!1!12F
   1 12	F

 U../0F
 ))*F
 D>F
 $D>F
 'tnF
 d^F
 
u((	)F
 F
rn   rD  )rD  r   r   ro   )/r   r   dataclassesr   pathlibr   typingr   r   r   r   rB   torch.utils.checkpointr	   
generationr   modeling_utilsr   utilsr   r   r   r   r   r   configuration_rwkvr   
get_loggerrq   r%   r#   r0   autogradFunctionr2   r   r   Moduler   r   r   r   r   r   r   rD  __all__ru   rn   r/   <module>rb     s]      !  / /    ) -  + 
		H	%  5@g
%..11 g
T)XbC5		 C5L#)bii #)L		 D ?]/ ?] ?]D ? ? ?: ? ? ?@ o6# o6 o6d i
)? i
i
X Brn   