
    |Uh#I             ,       t   d dl mZ d dlZd dlZd dlmZ d dlmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZmZ dd	lmZ i Z	 ej.                  ej0                  ej2                  fej4                  ej6                  fej8                  ej:                  fej<                  ej>                  ej@                  fejB                  ejD                  fej.                  ej0                  ej2                  fejF                  ejH                  ejJ                  fd
Z&ejN                  ejP                  fejR                  ejT                  fejV                  ejX                  fejZ                  ej\                  fejN                  ejP                  fejR                  ejT                  fdZ/ej`                  ejb                  ejd                  fejf                  ejh                  ejj                  fejl                  ejn                  ejp                  fejr                  ejt                  ejv                  fejx                  ejz                  ej|                  fej~                  ej                  ej                  fdZB G d d      ZC G d d      ZD G d d      ZE ej                  dd       ZGej                  j                         dkD  rdej                  fdZJnd dlKZKdej                  fdZJej                  eGddZMddej                  fdZNddZOddZPddZQdd ZRdd!ZSdd"ZTdd#ZU ed$eV%      dd&       ZWd'ee	ej                        fd(ZXd)ed*ej                  fd+ZZde	e   d*e	ej                     fd,Z[ ed$eV%      	 	 	 dded-e	ej                     d.e\d*efd/       Z] G d0 d1      Z^	 	 	 	 	 ddej                  d2e	ej                     d3e	ej                     d-e	ej                     d*e_ej                  e^f   f
d4Z`	 	 	 	 	 	 ddej                  d5e	e^   d3e	ej                     d2e	ej                     d-e	ej                     d6ead*ej                  fd7Zbdd9Zcddd8dej                  fdej                  d3e	ej                     d-e	ej                     fd:Zeddd8dej                  fdej                  d3e	ej                     d-e	ej                     fd;Zfddd8dd<ej                  fdej                  d3e	ej                     d-e	ej                     d*e_ej                  e^f   fd=Zg	 	 	 	 ddej                  d5e	e^   d3e	ej                     d-e	ej                     d6ead*ej                  fd>Zh	 	 	 	 ddej                  d5e	e^   d3e	ej                     d-e	ej                     d6ead*ej                  fd?Zi	 	 	 	 	 ddej                  d5e	e^   d3e	ej                     d-e	ej                     d6ead*ej                  fd@Zj ed$eV%      	 	 dded2e	ej                     d-e	ej                     d*e_ee_eef   f   fdA       Zk ed$eV%      	 	 	 	 ddedBe	e_eef      d3e	ej                     d2e	ej                     d-e	ej                     d*efdC       Zl ed$eV%      dded2ed-e	ej                     d*efdD       Zm ed$eV%      dded2ed-e	ej                     d*efdE       Zn	 	 	 	 	 	 	 	 	 ddFeodGedHedIedJe\dKe\dLeadMe\dNe	ej                     dOe\dPe\dQe\dRe\dSe\dTe	ej                     dUe\d*df"dVZp edWeV%      	 	 	 	 ddFeodGedHedIedNe	ej                     dJe\dOe\dKe\dLeadMe\dXedYe	ej                     dZed[e	ej                     d\ed]e	ej                     dRe\dSe\dTe	ej                     dUe\d*df*d^       Zq	 	 	 ddFeodGedHedIedNe	ej                     dJe\dOe\dPe\dQe\dKe\dLeadMe\dXedYe	ej                     d_ed`e	ej                     dRe\dSe\d*df&daZr ed$eV%      ddbedcedLeaddeafde       Zs ed$eV%      dfedgedhediefdj       Ztej                  fdkZv	 	 	 	 ddedled-e	ej                     fdmZw	 	 	 ddedled-e	ej                     fdnZx	 	 	 ddedled-e	ej                     fdoZydej                  fdej                  dlej                  d-e	ej                     fdpZ{	 	 ddej                  dqej                  drej                  d-e	ej                     dse	ej                     f
dtZ| ed$eV%      	 	 	 	 ddej                  dqe	ej                     dre	ej                     due	ej                     d*e_ej                  ej                  e	ej                     f   f
dv       Z} ed$eV%      ddej                  fdw       Z~ G dx dy      Z G dz d{      Z G d| d}      Zd~ Zd Zej                  fdZ	 	 	 	 	 ddej                  dre	ej                     dqe	ej                     de	ej                     de	ej                     f
dZdej                  dej                  fdZddej                  fdZ	 dde
eej                  f   dlej                  d-e	ej                     fdZddZdZ edeV%      dd       Z ed$eV%      ej                  dfd       Zy)    )IterableN)prod)AnyOptionalUnion)Tensor)
deprecated)pack_dict_to_tensorunpack_tensor_to_dict   )lib)adammomentumrmsproplionadagradlambademamix)r   r   r   r   r   lars)r   r   r   r   r   r   c                   4    e Zd ZdZd Zd Zed        ZddZy)GlobalPageManagerNc                     t        d      NzCall get_instance() insteadRuntimeErrorselfs    i/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/bitsandbytes/functional.py__init__zGlobalPageManager.__init__z       899    c                     g | _         y N)paged_tensorsr   s    r   
initializezGlobalPageManager.initialize}   s
    r!   c                     | j                   0| j                  |       | _         | j                   j                          | j                   S r#   	_instance__new__r%   clss    r   get_instancezGlobalPageManager.get_instance   6    == KK,CMMM$$&}}r!   c                 J    | j                   d d d   D ]  }t        ||        y )N)r$   prefetch_tensor)r   to_cputs      r   prefetch_allzGlobalPageManager.prefetch_all   s,     ##DbD) 	'AAv&	'r!   F)	__name__
__module____qualname__r(   r   r%   classmethodr,   r3    r!   r   r   r   w   s*    I:   'r!   r   c                   2    e Zd ZdZd Zd Zed        Zd Zy)CUBLAS_ContextNc                     t        d      r   r   r   s    r   r   zCUBLAS_Context.__init__   r    r!   c                     i | _         y r#   )contextr   s    r   r%   zCUBLAS_Context.initialize   s	    r!   c                     | j                   0| j                  |       | _         | j                   j                          | j                   S r#   r'   r*   s    r   r,   zCUBLAS_Context.get_instance   r-   r!   c                    |j                   | j                  vrt        j                  j	                         }t        j                  j                  |       t        j                  t        j                               | j                  |j                   <   t        j                  j                  |       | j                  |j                      S r#   )
indexr>   torchcudacurrent_device
set_devicectc_void_pr   get_context)r   deviceprev_devices      r   rH   zCUBLAS_Context.get_context   s}    <<t||+**335KJJ!!&))+S__5F)GDLL&JJ!!+.||FLL))r!   )	r5   r6   r7   r(   r   r%   r8   r,   rH   r9   r!   r   r;   r;      s*    I:  *r!   r;   c                   ,    e Zd ZdZd Zd Zed        Zy)Cusparse_ContextNc                     t        d      r   r   r   s    r   r   zCusparse_Context.__init__   r    r!   c                 \    t        j                  t        j                               | _        y r#   )rF   rG   r   get_cusparser>   r   s    r   r%   zCusparse_Context.initialize   s    {{3#3#3#56r!   c                     | j                   0| j                  |       | _         | j                   j                          | j                   S r#   r'   r*   s    r   r,   zCusparse_Context.get_instance   r-   r!   )r5   r6   r7   r(   r   r%   r8   r,   r9   r!   r   rL   rL      s%    I:7  r!   rL   rC   )rA   ac                 @    t         j                  j                  |       S r#   )rB   rC   	device_ofrQ   s    r   _cuda_device_ofrU      s    zz##A&&r!   c                 *    t        j                         S r#   )
contextlibnullcontextrT   s    r   rU   rU      s    %%''r!   dtyperI   c                    | j                   t        |      z  }t        j                  t	        j
                  |            }t	        j                  |t	        j                  t        j                              }t        j                  j                  ||      }t        j                  || t        |            j                  |      }d|_        |j                   |_        |S )N)shape)rZ   countT)itemsizer   r   cget_managed_ptrrF   c_size_tcastPOINTERc_intnp	ctypeslibas_arrayrB   
frombufferviewis_pagedrA   page_deviceid)rZ   rI   r\   	num_bytescuda_ptrc_ptr	new_arrayouts           r   	get_pagedrp      s    e,I##BKK	$:;HGGHbjj23E%%e5%9I


9Ee
E
J
J5
QCCLCJr!   FAc                     | j                   sJ d       |rd}n| j                  }t        j                  t	        |       t        j                  | j                        t        j                  |             y )Nz%Only paged tensors can be prefetched!r/   )	ri   rj   r   	cprefetchget_ptrrF   r`   nbytesc_int32)rq   r1   deviceids      r   r0   r0      sO    ::>>>:??MM'!*bkk!((3RZZ5IJr!   c           	         d }|j                   t        j                  k(  r+t        t        d|  dd       }t        j                  |      }nG|j                   t        j                  k(  r*t        t        d|  dd       }t        j                  |      }|t        d|        t        |dd      }|r|rt        |       |t        |        |t        |      t        |      t        j                  |j                                      |j                  s|j                  rt        j                  j!                          y y )Nc_fp32_uint8zFunction not implemented: 
is_managedF)rZ   rB   float32getattrr   rF   c_floatuint8c_uint8NotImplementedErrorr0   rt   c_int64numelri   rC   synchronize)	func_namerq   Bvalueprefetchfunccvaluer|   s           r   elementwise_funcr      s    Dww%--sa	{%0$7E"	
EKK	sa	{&148E"|!$>yk"JKKL%0Jh=AWQZAGGI)>?zzQZZ
 	

   r!   c                      t        d| d |       y )Nfillr   )rq   r   rI   r   s       r   r   r      s    VQe,r!   c                      t        d| |d       y )N_mulr   r   )rq   r   rI   s      r   r   r      s    VQ1%r!   c                 N   | rdnd}d|z  }|s|dk  r| sd|z  nd|z  dz
  }t        j                  |d|      }d|j                         z
  }|dk(  r|S |j                         dz  }t        j                  |d | j	                         dg|z  z   ||d  j	                         z         S )	N                    r         ?   r   )rB   linspacer   r   tolist)signed
total_bitsadd_zerosigntotal_valuesvaluesgapls           r   create_linear_mapr     s    4sDj=L:>
 -3q*}:8I^^D#|4F

C
axLLNa||F2AJ--/1#);fQRj>O>O>QQRRr!   c                    	 ddl m} |rv|j                  t	        j
                  | dd      d d       j                         }dgdz  }|j                  t	        j
                  | dd      d d        j                         }nu|j                  t	        j
                  | dd      d d       j                         }dgd	z  }|j                  t	        j
                  | dd      d d        j                         }||z   |z   }t	        j                  |      }|j                         j                  }||j                         z  }|j                         d
k(  sJ |S # t        $ r}t        d      |d }~ww xY w)Nr   )normzZScipy is required for `create_normal_map`. Install `bitsandbytes` with the `[test]` extra.g      ?	   r/      r      r   )scipy.statsr   ImportErrorppfrB   r   r   r   sortr   maxr   )	offsetuse_extra_valuer   iev1v2v3vr   s	            r   create_normal_mapr     sR   $ XXennVS!4Sb9:AACSHxxvsA6s;<<DDFXXennVS!4Sb9:AACSHxxvsA6s;<<DDF
R"A\\!_F[[]!!F
fjjlF<<>S   M/  h
	s   E 	E0E++E0c                 T   |}|}| rdnd}||z   ||z
  k(  sJ g }g }t        t        d||z
  z   d||z
  z  d            D ]  \  }	}
|j                  d|
z          g }t        t	        j
                  ddg|            }d|dz
  z  }t        d|z        D ]  }|D ]z  }|dk7  rdnd}t        t        |            D ]  \  }	}||d|	dz    z  z  z  } |dk(  r
|d| z  z  }n|d||z
  dz
   z  z  }|j                  |       | si|j                  |        |  t        |      d|z  k(  sJ |j                          |dk  r/dt        |      z
  }t        |      D ]  }	|j                  d        |j                          t        j                  |      }||j                         z  }|S )Nr   r   r   )repeatr   r   )	enumeraterangeappendlist	itertoolsproductlenr   rB   tensorr   )r   exponent_bitsprecision_bitsr   ephas_signevaluespvaluesivalr   lstbiasevaluebit_patternr   pvalr   codes                       r   create_fp8_mapr   2  s   AAqHq5J))))GGEA-(*B$C"DaM\dLdFeghij 3q#v F
y  !Q?
@C"#Dm,- & 	&K1A!E$T+%67 04Ah//0{T7
* v}q'8%9 99MM% uf%	&& v;!Z-'''
KKMA~CKs 	AMM!	
KKM<<DDHHJDKr!   c                    g }|dz
  }d||z
  z  dz
  }t        |      D ]  }t        | rd||z   |z
  z  dz   nd||z   |z
  dz   z  dz         }t        j                  dd|t        j                        }|dd |dd z   dz  }	|d|dz
   |z   z  |	z  j                         z  }| s|d|dz
   |z   z   |	z  j                         z  } |d	kD  r{t        j                  dd|dz   t        j                        }|dd |dd z   dz  }	|d|dz
   z   z  |	z  j                         z  }| r!|d|dz
   |z   z   |	z  j                         z  }|j                  d	       |j                  d
       t        |      d|z  k(  sJ dt        |      z
  }
t        |
      D ]  }|j                  d	        |j                          t        j                  |t        j                        S )a+  
    Creates the dynamic quantiztion map.

    The dynamic data type is made up of a dynamic exponent and
    fraction. As the exponent increase from 0 to -7 the number
    of bits available for the fraction shrinks.

    This is a generalization of the dynamic type where a certain
    number of the bits and be reserved for the linear quantization
    region (the fraction). n determines the maximum number of
    exponent bits.

    For more details see
    (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561]
    r   r   g?rZ   Nr/          @
   r   r   r   )
r   intrB   r   r}   r   r   r   r   r   )r   max_exponent_bitsr   datanon_sign_bitsadditional_itemsr   fraction_items
boundariesmeansr   s              r   create_dynamic_mapr   ]  s-   " D NM]->>?!C$% 
O !m#&7781<q=(+<<q@AAE

 ^^CN%--P
CR:ab>1S8",q01A56%?GGIIr 1A 56:;<uDLLNND
O !^^C,<q,@V
CR:ab>1S8",q01A56%?GGIIr 1A 56:;<uDLLNNDKKNKKt9:%%%
D	/C3Z A 	IIK<<EMM22r!   zDThis function is deprecated and will be removed in a future release.)categoryc                 D   t        | d|z  dz
        }|j                         }|j                  d       dt        |      z
  }t	        |      D ]  }|j                  d        |j                          t        |      }||j                         j                         z  }|S )Nr   r   )num_quantilesr   r   )	estimate_quantilesr   r   r   r   r   r   absr   )rq   r   qr   r   s        r   create_quantile_mapr     s    1AzMA,=>A	
AHHQK
A,C3Z 	 FFHq	A	AEEGKKMAHr!   tensorsc                    d}t               }| D ]G  }|t        |dd      r||j                  z  }|j                  |j                  j
                         I |s2t        d| D cg c]  }|j                  |j                  f c}       t        |      dkD  r2t        d| D cg c]  }|j                  |j                  f c}       |S c c}w c c}w )ap  Verifies that the input tensors are all on the same device.

    An input tensor may also be marked as `paged`, in which case the device placement is ignored.

    Args:
        tensors (`Iterable[Optional[torch.Tensor]]`): A list of tensors to verify.

    Raises:
        `RuntimeError`: Raised when the verification fails.

    Returns:
        `Literal[True]`
    Tri   FzZAll input tensors need to be on the same GPU, but found some tensors to not be on a GPU:
 r   zcInput tensors need to be on the same GPU, but found the following tensor and device combinations:
 )	setr~   is_cudaaddrI   rA   r   r\   r   )r   on_gpugpu_idsr2   s       r   	is_on_gpur     s#    FeG (=J!>aiiFKK'	( i  IP  kQ  DElmlslsuvu}u}k~  kQ  jR  S
 	
 7|ar  RY  tZ  MNuvu|u|~  G  G  uH  tZ  s[  \
 	
 M kQ
 tZs   %C
%C
r   returnc                     t        j                  t        j                  j	                  | j
                  j                              S r#   )rF   rG   rB   _C_cuda_getCurrentRawStreamrI   rA   )r   s    r   _get_tensor_streamr     s*    ;;uxx99&--:M:MNOOr!   c                 N    | yt        j                  | j                               S )zGets the memory address of the first element of a tenso

    Args:
        A (`Optional[Tensor]`): A PyTorch tensor.

    Returns:
        `Optional[ct.c_void_p]`: A pointer to the underlying tensor data.
    N)rF   rG   data_ptr)rq   s    r   rt   rt     s!     	y;;qzz|$$r!   ro   r   c           
      :   | j                         dk  rt        d| j                          d      |dkD  rt        d|       |dk  r|dk(  rdd|z  z  }|0t        j                  d	t        j                  | j
                  
      }t        |       5  t        | |g       | j                  t        j                  k(  r_t        j                  t        |       t        |      t        j                  |      t        j                  | j                                      n| j                  t        j                  k(  r_t        j                   t        |       t        |      t        j                  |      t        j                  | j                                      nt        d| j                         ddd       |dk  rQt#        d|z        }t        j$                  dd|      j'                         j)                  | j
                        }||   }|S # 1 sw Y   axY w)a  
    Estimates 256 equidistant quantiles on the input tensor eCDF.

    Uses SRAM-Quantiles algorithm to quickly estimate 256 equidistant quantiles
    via the eCDF of the input tensor `A`. This is a fast but approximate algorithm
    and the extreme quantiles close to 0 and 1 have high variance / large estimation
    errors. These large errors can be avoided by using the offset variable which trims
    the distribution. The default offset value of 1/512 ensures minimum entropy encoding -- it
    trims 1/512 = 0.2% from each side of the distrivution. An offset value of 0.01 to 0.02
    usually has a much lower error but is not a minimum entropy encoding. Given an offset
    of 0.02 equidistance points in the range [0.02, 0.98] are used for the quantiles.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor. Any shape.
    out : torch.Tensor
        Tensor with the 256 estimated quantiles.
    offset : float
        The offset for the first and last quantile from 0 and 1. Default: 1/(2*num_quantiles)
    num_quantiles : int
        The number of equally spaced quantiles.

    Returns
    -------
    torch.Tensor:
        The 256 quantiles in float32 datatype.
    r   zQQuantile estimation needs at least 256 values in the Tensor, but Tensor had only z values.zgCurrently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles=      `?r   r   N)r   rY   zNot supported data type r      )r   r   rB   zerosr}   rI   rU   r   rZ   r   cestimate_quantiles_fp32rt   rF   r   rc   float16cestimate_quantiles_fp16roundr   longto)rq   ro   r   r   stepidxs         r   r   r     s   F 	wwy3!_`a`g`g`i_jjrs
 	
 s!u  wD  vE  F
 	
 sv2a-'(
{kk&ahhG		 L1c(77emm#((WS\2::fCUWYW_W_`a`g`g`iWjkWW%((WS\2::fCUWYW_W_`a`g`g`iWjk%(@	&JKKL sS=()nnQ]388:==ahhG#hJL Ls   DHHc                       e Zd ZdZdZeD  cg c]  }d| 	 c}} Zg dZ	 	 	 	 	 	 	 ddZd Ze	de
eef   d	ej                  d
d fd       ZddZd Zd Zyc c}} w )
QuantStatezWcontainer for quantization state components to work with Params4bit and similar classes)fp4nf4bitsandbytes__)absmax	quant_mapnested_absmaxnested_quant_mapquant_state
quant_type	blocksizerZ   r\   nested_blocksizenested_dtypenested_offsetNc	                     || _         || _        || _        || _        || _        || _        || _        || _        |d u| _        y r#   )	r  r\   r   rZ   r
  r	  r   state2nested)	r   r  r\   r   r
  r	  rZ   r   r  s	            r   r   zQuantState.__init__5  sH     
	
"$D(r!   c                 B   | j                   rU| j                  | j                  | j                  | j                  | j
                  | j                  g| j                  g}||   S | j                  | j                  | j                  | j                  d| j                  g}||   S )a$  
        ensures compatibility with older quant state scheme with nested lists.
        assumes the following layout:
        state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
        state2 = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
        N)r  r  r\   rZ   r
  r   r  r	  )r   r   	list_reprs      r   __getitem__zQuantState.__getitem__J  s     ;;



dkk*I ~ djj$**dnndTXTcTcdI~r!   qs_dictrI   r   c                    |j                         D cg c]'  \  }}d|v st        |t        j                        s&|) }}}t	        |      sd|vrt        d      t	        |      dk7  s#|d   j                  d      d   | j                  vrt        d| j                   d	| d      t	        |      dk(  r.|d   }|j                  t        |j                  |                   |j                         D ci c]  \  }}|j                  d      d   | }}}t        |j                               j                  | j                        sJ d
|v rut        j                  t!        |d               j#                  |      } | |d
   j#                  |      |d   |d   j#                  |      t%        t        |d               }nd\  }} | |d   |d   j#                  |      |d   |d   j#                  |      t%        t        |d         |d   t        j&                  |d         nd||      }	|	S c c}}w c c}}w )aO  
        unpacks components of state_dict into QuantState
        where necessary, convert into strings, torch.dtype, ints, etc.

        qs_dict: based on state_dict, with only relevant keys, striped of prefixes.

        item with key `quant_state.bitsandbytes__[nf4/fp4]` may contain minor and non-tensor quant state items.
        r  r	  z<Expected packed or unpacked quant_state items, found neitherr   r   .r/   z@There should be exactly one `quant_state` item with ending from z.
Detected r  r  r  r  r  )r  r
  r   rZ   NNr  r
  r  rZ   r\   N)r	  r  r
  r   rZ   r\   r   r  )items
isinstancerB   r   r   
ValueErrorsplitvalid_qs_type_keysupdater   popr   keysissubsetvalid_qs_keysr   floatr   r~   Size)
r+   r  rI   kr   qs_keyfirst_qs_keyr   r  r  s
             r   	from_dictzQuantState.from_dict^  s9    !(f1=A3E*UVX]XdXdJe!ff6{|7:[\\[A!5b!9AWAW!WRSVSiSiRjjvw}v~~  A 
 v;!!!9LNN0\1JKL3:==?C41a1773<#Q&CC7<<>"++C,=,=>>>g%\\%(@"ABEEfMF/226:!"45/033F;eW^%<=	F (NFF|,8$''/k*%((0%!1229'2B2N%**WW-.TX	
 K g Ds   H7H7H79H=c                    | j                   | j                  | j                  | j                  t	        | j
                        j                  d      t        | j                        d}| j                  r|j                  | j                  j                  | j                  j                  | j                  j                  j                         t	        | j                  j
                        j                  d      | j                  j                         d       |s|S |j                         D ci c]#  \  }}t!        |t"        j$                        s!||% }}}|j                         D ci c]#  \  }}t!        |t"        j$                        r!||% }}}t'        |      |d| j                   z   <   |S c c}}w c c}}w )z
        returns dict of tensors and strings to use in serialization via _save_to_state_dict()
        param: packed -- returns dict[str, torch.Tensor] for state_dict fit for safetensors saving
        ztorch.)r	  r  r
  r  rZ   r\   )r  r  r  r  r  zquant_state.bitsandbytes__)r	  r  r
  r   strrZ   striptupler\   r  r  r  cloner   itemr  r  rB   r   r
   )r   packedr  r$  r   qs_packed_dictnon_tensor_dicts          r   as_dictzQuantState.as_dict  sT    //kk_**844::&
 ;;NN%)[[%7%7(,(=(=(,(8(8(>(>(@$'(9(9$:$@$@$J%)[[%5%5%7 N ,3==?Z41ajELL>Y!Q$ZZ,3MMO_DAq:aQVQ]Q]C^1a4__NabqNr84??JK [_s   /"F5F5-"F;F;c                    | j                   j                  |      | _         | j                  j                  |      | _        | j                  r| j                  j                  |      | _        | j
                  j                  j                  |      | j
                  _        | j
                  j                   j                  |      | j
                  _         y y r#   )r   r   r  r  r   r  )r   rI   s     r   r   zQuantState.to  s    IILL(	kknnV,;;++..0DK!%!3!3!6!6v!>DKK#{{//226:DKK r!   c                    t        |t              syt        j                  | j                  |j                  d      xr/ | j
                  |j
                  k(  xr t        j                  | j                  |j                  d      xr | j                  |j                  k(  xr | j                  |j                  k(  xr | j                  |j                  k(  xr | j                  %|j                  | j                  |j                  k(  n| j                  |j                  u xrI | j                  %|j                  | j                  |j                  k(  S | j                  |j                  u S )NFgư>)atol)r  r   rB   allcloser  r\   r   rZ   r
  r	  r   r  )r   others     r   __eq__zQuantState.__eq__  s&   %, NN4;;4@ 

ekk)tyy%**4@ 

ekk) %//1	
 5#3#33 ;;*u||/G u||+[[ELL0 ;;*u||/G u||+	
 [[ELL0	
r!   )NNNNNNNr4   )r5   r6   r7   __doc__valid_quant_typesr  r!  r   r  r8   dictr)  r   rB   rI   r'  r1  r   r7  ).0xs   00r   r   r   !  s    a&8IJ1N1#.JM$ )*( 0S#X 0 0 0 0d@;
k Ks   Ar   r   r  c                    |;dt         vr*t               j                  | j                        t         d<   t         d   }t        j
                  j                  j                  j                  | |j                  | j                        |      \  }}|r]|j                         }||z  }t        ||d      \  }	}
t        |	|j                  | j                  d      || j                  ||
      }n4t        ||j                  | j                  d      || j                        }||j                  |      n|}| |j                  |j                        |_        ||fS )aW  Quantize a tensor in blocks of values.

    The input tensor is quantized by dividing it into blocks of `blocksize` values.
    The the absolute maximum value within these blocks is calculated for scaling
    the non-linear quantization.

    Args:
        A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
        code (`torch.Tensor`, *optional*):
            A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
            For more details, see  (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
        absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 4096.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
        nested (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.

    Raises:
        ValueError: Raised when the input data type is not supported.

    Returns:
        `Tuple[torch.Tensor, QuantState]`: A tuple containing the quantization results.
        - `torch.Tensor`: The quantized tensor.
        - [`QuantState`]: The state object used to undo the quantization.
    dynamicF)r
  r  T)copy)r  r   r
  rZ   r   r  r  r   r
  rZ   )	name2qmapr   r   rI   rB   opsbitsandbytesquantize_blockwisedefaultmeanr   rZ   copy_r  )rq   r   r  ro   r
  r  _out_absmaxr   qabsmaxr  r  s               r   rD  rD    s:   F |I%#5#7#:#:188#DIi #II**==EE	MD' 6,W	RWX -''
 !dggahhTg6R^gopovovw !_#))D/$C #\\+*<*<=r!   r  r
  c                 v   ||J |=|;dt         vr*t               j                  | j                        t         d<   t         d   }|t	        |||t
        j                        }|j                  }|j                  r\t        |j                  |j                        }||j                  z  }|j                  t
        j                  k7  r|j                         }|rt
        j                  j                  j                  j!                  | ||j"                  j                  | j                        |j$                  |j                  |       |S t
        j                  j                  j                  j'                  | ||j"                  j                  | j                        |j$                  |j                        S )a  Dequantize a tensor in blocks of values.

    The input tensor is dequantized by dividing it into blocks of `blocksize` values.
    The the absolute maximum value within these blocks is used for scaling
    the non-linear dequantization.

    Args:
        A (`torch.Tensor`): The quantized input tensor.
        quant_state ([`QuantState`], *optional*):
            The quantization state as returned by [`quantize_blockwise`].
            Required if `absmax` is not provided.
        absmax (`torch.Tensor`, *optional*):
            A tensor containing the scaling values.
            Required if `quant_state` is not provided and ignored otherwise.
        code (`torch.Tensor`, *optional*):
            A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
            For more details, see  (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
            Ignored when `quant_state` is provided.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 4096.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
            Ignored when `quant_state` is provided.

    Raises:
        ValueError: Raised when the input data type is not supported.

    Returns:
        `torch.Tensor`:
            The dequantized tensor. The datatype is indicated by `quant_state.dtype` and defaults to `torch.float32`.
    r>  r@  ro   )rA  r   r   rI   r   rB   r}   r  r  dequantize_blockwiser  r   rZ   r"  rB  rC  ro   r   r
  rE  )rq   r  r  r   ro   r
  r  s          r   rM  rM    sw   R "f&888|+I%#5#7#:#:188#DIi # TYV[VcVcdF%k&8&8+:L:LM+$$$<<5==(\\^F
		3377)!! 	8 	
 
99!!66>>	AHH% r!   @   c                 d   |d}d }| dk(  r	 g d}n4| dk(  rg d}n*| dk(  rg d}n | dk(  r|d	k(  rg d
d d d   }nt        d      |t        d|  d      t        j                  ||      }|j                  |j	                         j                                |j                         dk(  sJ |S )NrC   r  )r   g    6Gg    fg    TFٿg   I4ҿg   ০ǿg    Or   g   __?g   `\?g   ?g   @g?g    4?g   ` ?g   `v"?r   r  )r   g      ?g       @g      (@g      @g      @r   g      @r   g      g       g      (g      g      g       g      int4)               r   r   r   r   r/   iiaf4rN  )r   g|8geg:Kڞ׿gH2퓊cпg}Yu-ÿgQ	#(Dr   gF?g`_?g
0E?gL_߹E?gƶ=?ga@?gкv-?r   r/   z94-bit AbnormalFloats currently only support blocksize 64.z	Typename z not supportedrI      )r   rB   r   div_r   r   r   )typenamerI   r
  r   s       r   get_4bit_typer_  f  s    ~D5		
$ 
U	 l	V	G	U	 ?" d#D& &&abb|!IhZ~"FGG<<V,DIIdhhjnn::<2Kr!   c           	      $    t        | ||||d|      S Nr  quantize_4bitrq   r  ro   r
  compress_statisticsquant_storages         r   quantize_fp4rg         FC4GP]^^r!   c           	      $    t        | ||||d|      S Nr  rb  rd  s         r   quantize_nf4rk    rh  r!   r  c           
         | j                   }t        j                  j                  j                  j                  | |||      \  }}	t        || j                        }
|rB|	j                         }t        |	|z
  d      \  }}~	t        ||| j                  ||
|||      }nt        |	|| j                  ||
|      }||j                  |      n|}| |j                  |j                        |_        ||fS )a  Quantize tensor A in blocks of 4-bit values.

    Quantizes tensor A by dividing it into blocks which are independently quantized.

    Args:
        A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
        absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 64.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
        compress_statistics (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
        quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.
        quant_storage (`torch.dtype`, *optional*): The dtype of the tensor used to store the result. Defaults to `torch.uint8`.

    Raises:
        ValueError: Raised when the input data type is not supported.

    Returns:
        Tuple[`torch.Tensor`, `QuantState`]: A tuple containing the quantization results.
        - `torch.Tensor`: The quantized tensor with packed 4-bit values.
        - [`QuantState`]: The state object used to undo the quantization.
    r[  r   )r
  )r  r\   rZ   r
  r   r	  r   r  )r  r\   rZ   r
  r   r	  )r\   rB   rB  rC  rc  rE  r_  rI   rF  rD  r   rZ   rG  r  )rq   r  ro   r
  re  r	  rf  input_shaperH  rI  r   r   rJ  r  states                  r   rc  rc    s    @ ''KII**88@@		MD' AHH5D,Wv-=M''!	
 ''!
 !_#))D/$C ||ELL1:r!   c                 "    t        | ||||d      S ra  dequantize_4bitrq   r  r  ro   r
  s        r   dequantize_fp4rs         1k63	5IIr!   c                 "    t        | ||||d      S rj  rp  rr  s        r   dequantize_nf4rv  %  rt  r!   c           	         |+||J t        ||j                  |j                  ||      }n|j                  }|j                  r\t        |j                  |j                        }||j                  z  }|j                  t        j                  k7  r|j                         }|ct        j                  j                  j                  j                  | ||j                  |j                   |j                  |j                  |       n`t        j                  j                  j                  j#                  | ||j                  |j                   |j                  |j                        }| j                  d   dk(  r|j%                         S |S )a  Dequantizes a packed 4-bit quantized tensor.

    The input tensor is dequantized by dividing it into blocks of `blocksize` values.
    The the absolute maximum value within these blocks is used for scaling
    the non-linear dequantization.

    Args:
        A (`torch.Tensor`): The quantized input tensor.
        quant_state ([`QuantState`], *optional*):
            The quantization state as returned by [`quantize_4bit`].
            Required if `absmax` is not provided.
        absmax (`torch.Tensor`, *optional*):
            A tensor containing the scaling values.
            Required if `quant_state` is not provided and ignored otherwise.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 64.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
        quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.

    Raises:
        ValueError: Raised when the input data type or blocksize is not supported.

    Returns:
        `torch.Tensor`: The dequantized tensor.
    )r  r\   rZ   r
  r	  rL  r   r   )r   r\   rZ   r  r  rM  r  r   rB   r}   r"  rB  rC  rq  ro   r
  r	  rE  r2   )rq   r  r  ro   r
  r	  s         r   rq  rq  /  sS   D !co55 ))))!
 ##%k&8&8+:L:LM+$$$<<5==(\\^F
		..22v{,,k.D.DkFWFWYdYjYjps 	3 	
 ii$$44<<!!""
 	wwqzQuuwJr!   c                    |Vdt         vr*t               j                  | j                        t         d<   t         d   }|j                  | j                        }t	        j
                  |       j                         }|j                  t        j                  k7  r|j                         }| |z  }t        |||      }|||ffS )Nr>  )rA  r   r   rI   rB   r   r   rZ   r}   r"  quantize_no_absmax)rq   r   ro   r  inps        r   quantizer{  x  s     |I%#5#7#:#:188#DIi #wwqxx YYq\F||u}}$
f*C
S$
,Cr!   rn  c                     ||J |X|Vdt         vr*t               j                  | j                        t         d<   t         d   }|j                  | j                        }|||f}t	        | |d   |      }||d   z  S )Nr>  r   r   )rA  r   r   rI   dequantize_no_absmax)rq   rn  r  r   ro   s        r   
dequantizer~    s      222|I%#5#7#:#:188#DIi #wwqxx }
q%(C
0Cq>r!   c           
      X   t        |       5  |%t        j                  | t        j                        }t	        | |g       t        j                  t        |      t        |       t        |      t        j                  | j                                      ddd       |S # 1 sw Y   |S xY w)a  
    Quantizes input tensor to 8-bit.

    Quantizes the 32-bit input tensor `A` to the 8-bit output tensor
    `out` using the quantization map `code`.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor, optional
        The output tensor. Needs to be of type byte.

    Returns
    -------
    torch.Tensor:
        Quantized 8-bit tensor.
    Nr   )rU   rB   
zeros_liker   r   r   	cquantizert   rF   rc   r   )rq   r   ro   s      r   ry  ry    s    , 
	 T;""1EKK8C1c(gdmWQZrxx	?RS	T JT Js   B	BB)c           
      r   t        |       5  |%t        j                  | t        j                        }t	        || |g       t        |       }t        j                  t        |      t        |       t        |      t        j                  | j                               |       ddd       |S # 1 sw Y   |S xY w)a  
    Dequantizes the 8-bit tensor to 32-bit.

    Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via
    the quantization map `code`.

    Parameters
    ----------
    A : torch.Tensor
        The 8-bit input tensor.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor
        The 32-bit output tensor.

    Returns
    -------
    torch.Tensor:
        32-bit output tensor.
    Nr   )rU   rB   r  r}   r   r   r   cdequantizert   rF   rc   r   )rq   r   ro   streams       r   r}  r}    s    , 
	 ^;""1EMM:C4C.!#A&wqz73<!'')ATV\]^ J^ Js   BB,,B6optimizer_namegr   state1beta1epsr   lrr  beta2beta3alphaweight_decaygnorm_scale	unorm_vec	max_unormc                    d}|dkD  r-t        j                  |j                  j                               }d}|j                  t         j
                  k(  rt        |    d   }n|j                  t         j                  k(  rt        |    d   }nd|j                  t         j                  k(  r"t        t        |          dk(  rt        |    d   }n%t        d|j                   d|j                         t        |||||g       t        |      5   |t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |      t        j                  |	      t        j                  |
      t        j                  |      t        j                  |      t        j                  |      t        j                   |      t        j                  |      t        j                  |      t        j"                  |      t        j                   |j%                                      ddd       y# 1 sw Y   yxY w)	az  
    Performs an inplace optimizer update with one or two optimizer states.

    Universal optimizer update for 32-bit state and 32/16-bit gradients/weights.

    Parameters
    ----------
    optimizer_name : str
        The name of the optimizer: {adam}.
    g : torch.Tensor
        Gradient tensor.
    p : torch.Tensor
        Parameter tensor.
    state1 : torch.Tensor
        Optimizer state 1.
    beta1 : float
        Optimizer beta1.
    eps : float
        Optimizer epsilon.
    weight_decay : float
        Weight decay.
    step : int
        Current optimizer step.
    lr : float
        The learning rate.
    state2 : torch.Tensor
        Optimizer state 2.
    beta2 : float
        Optimizer beta2.
    beta3 : float
        Optimizer beta3.
    alpha : float
        Optimizer alpha.
    gnorm_scale : float
        The factor to rescale the gradient to the max clip value.
    unorm_vec : torch.Tensor
        The tensor for the update norm.
    max_unorm : float
        The maximum update norm relative to the weight norm.
    skip_zeros : bool
        Whether to skip zero-valued gradients or not (default: False).
    r   Nr   r   rU  r   AGradient+optimizer bit data type combination not supported: grad , optimizer )rB   r   r   r"  rZ   r}   str2optimizer32bitr   bfloat16r   r  r   rU   rt   rF   r   rv   c_boolr   )r  r  r   r  r  r  r   r  r  r  r  r  r  r  r  r  
skip_zeros
param_norm
optim_funcs                      r   optimizer_update_32bitr    s   | J3ZZ/
Jww%--'7:
	
EMM	!'7:
	
ENN	"s+=n+M'NRS'S'7:
OPQPWPWyXdekeqeqdrs
 	
 q!VVY/0		 
AJAJFOFOIJJy!JJz"JJuJJuJJuJJuJJsOJJ|$JJtJJrNJJ{#IIj!JJqwwy!%	

 
 
s   
EII(zyThis function is deprecated and will be removed in a future release. Please use optimizer_update_8bit_blockwise instead. qmap1qmap2max1max2new_max1new_max2c                 h   d}|dkD  r-t        j                  |j                  j                               }t	        |      5  t        ||||||
|||||g       |j                  t         j                  k(  rt|j                  t         j                  k(  rVt        |    d   t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |	      t        |
      t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |j                                      n|j                  t         j                  k(  rs|j                  t         j                  k(  rUt        |    d   t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |	      t        |
      t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |j                                      n%t!        d|j                   d|j                         ddd       y# 1 sw Y   yxY w)a  
    Performs an inplace Adam update.

    Universal Adam update for 32/8-bit state and 32/16-bit gradients/weights.
    Uses AdamW formulation if weight decay > 0.0.

    Parameters
    ----------
    optimizer_name : str
        The name of the optimizer. Choices {adam, momentum}
    g : torch.Tensor
        Gradient tensor.
    p : torch.Tensor
        Parameter tensor.
    state1 : torch.Tensor
        Adam state 1.
    state2 : torch.Tensor
        Adam state 2.
    beta1 : float
        Adam beta1.
    beta2 : float
        Adam beta2.
    eps : float
        Adam epsilon.
    weight_decay : float
        Weight decay.
    step : int
        Current optimizer step.
    lr : float
        The learning rate.
    qmap1 : torch.Tensor
        Quantization map for first Adam state.
    qmap2 : torch.Tensor
        Quantization map for second Adam state.
    max1 : torch.Tensor
        Max value for first Adam state update.
    max2 : torch.Tensor
        Max value for second Adam state update.
    new_max1 : torch.Tensor
        Max value for the next Adam update of the first state.
    new_max2 : torch.Tensor
        Max value for the next Adam update of the second state.
    gnorm_scale : float
        The factor to rescale the gradient to the max clip value.
    unorm_vec : torch.Tensor
        The tensor for the update norm.
    max_unorm : float
        The maximum update norm relative to the weight norm.
    r   r   r   r  r  N)rB   r   r   r"  rU   r   rZ   r}   r   str2optimizer8bitrt   rF   r   rv   r   r   r  )r  r  r   r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  s                        r   optimizer_update_8bitr  G  s   Z J3ZZ/
		 51aE5$hX`ab77emm#(Cn-a0

	"

9%

:&

5!

5!

3

4 

2!!

<(

;'

1779%+. WW%&,,%++*En-a0

	"

9%

:&

5!

5!

3

4 

2!!

<(

;'

1779%+0 STUT[T[S\\hioiuiuhvw g5 5 5s    MN((N1absmax1absmax2c                    d }|j                   t        j                  k(  r*|j                   t        j                  k(  rt        |    d   }n|j                   t        j
                  k(  r*|j                   t        j                  k(  rt        |    d   }n|j                   t        j                  k(  r?|j                   t        j                  k(  r"t        t        |          dk(  rt        |    d   }n%t        d|j                    d|j                          t        ||||||||g       t        |      5   |t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |	      t        j                  |
      t        j                  |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |      t        j                  |j!                                      d d d        y # 1 sw Y   y xY w)Nr   r   rU  r   r  r  )rZ   rB   r}   r   str2optimizer8bit_blockwiser   r  r   r  r   rU   rt   rF   r   rv   r  r   )r  r  r   r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  s                       r   optimizer_update_8bit_blockwiser    s   * Jww%--FLLEKK$?0@C
	
EMM	!fllekk&A0@C
	5>>!LLEKK'+N;<A0@C
OPQPWPWyXdekeqeqdrs
 	
 q!VVUE7GDE		 
AJAJFOFOJJuJJuJJuJJuJJsOJJtJJrNENENGGJJ|$JJ{#IIj!JJqwwy!'	

 
 
s   0EI;;Jgrad	gnorm_vec
percentilec           
      6   t        |       5  t        | |g       | j                  t        j                  k(  r_t        j                  t        |       t        |      t        j                  |      t        j                  | j                                      n| j                  t        j                  k(  r_t        j                  t        |       t        |      t        j                  |      t        j                  | j                                      nt        d| j                   d      ddd       t        j                  ||dz           }t        j                  |      \  }}t        j                  ||         }d}||kD  r||z  }|||fS # 1 sw Y   exY w)a  Applies percentile clipping

    grad: torch.Tensor
        The gradient tensor.
    gnorm_vec: torch.Tensor
        Vector of gradient norms. 100 elements expected.
    step: int
        The current optimization steps (number of past gradient norms).

    zGradient type z not supported!Nd   r   )rU   r   rZ   rB   r}   r   cpercentile_clipping_g32rt   rF   rv   r   r   cpercentile_clipping_g16r  sqrtr   )	r  r  r   r  current_gnormvalsr   
clip_valuer  s	            r   percentile_clippingr    sF    
	 K4#$::&((	"

4 

4::<(	 ZZ5==(((	"

4 

4::<(	 ~djj\IJJ#K& JJy45M

9%ID#D,-JKz! =0*k117K Ks   DFF	histogramindex1index2sourcec                 ,   t        | j                        dk(  sJ | j                  t        j                  k(  sJ |j                  t        j                  k(  sJ |j                  t        j
                  k(  sJ |j                  t        j
                  k(  sJ | j                  j                  dk(  sJ |j                  j                  dk(  sJ |j                  j                  dk(  sJ |j                  j                  dk(  sJ t        j                  | j                  d         }t        j                  |j                               }t        | |||g       t        j                  t        |       t        |      t        |      t        |      ||       y )Nr   rC   r   )r   r\   rZ   rB   r}   int32rI   typerF   rv   r   r   r   chistogram_scatter_add_2drt   )r  r  r  r  maxdim1ns         r   histogram_scatter_add_2dr  :  s=   y1$$$??emm+++<<5==(((<<5;;&&&<<5;;&&&  F***=='''=='''=='''jj+,G


6<<>"Ay&&&12!!')"4gfowvX_`fXgiprstr!   c                    t         j                  j                         st         j                  j                          | j                  |k7  s|j                  |k7  r%t        d| j                   d|j                         | j                  }|j                  }|}|}	d}
t        |      dk(  rt        |      dk(  r|s%|	s#| j                  d   |j                  d   k7  rd}
n|r%|	s#| j                  d   |j                  d   k7  rd}
n|r%|	r#| j                  d   |j                  d   k7  rd}
n|s|	r| j                  d   |j                  d   k7  rld}
nht        |      dk(  rt        |      dk(  r|s%|	s#| j                  d   |j                  d   k7  rd}
n%|r$|	s"| j                  d   |j                  d   k7  rd}
n|r$|	r"| j                  d   |j                  d   k7  rd}
n|s|	r| j                  d   |j                  d   k7  rd}
nt        |      dk(  rt        |      dk(  r|s$|	s"| j                  d   |j                  d   k7  rd}
nq|r$|	s"| j                  d   |j                  d   k7  rd}
nK|r$|	r"| j                  d   |j                  d   k7  rd}
n%|s#|	r!| j                  d   |j                  d   k7  rd}
|a|j                  }|
syt        |      dk(  rjt        |      dk(  r[|d   |d   k(  rO|d   |d   k(  rC|d   |d   k(  r7|d   |d   k(  r+d}
n't        |      dk(  rJt        |      dk(  r<|s|	s|d   |d   f}n|r|	r|d   |d   f}n|r|	s|d   |d   f}n|s|	r|d   |d   f}nt        |      dk(  rZt        |      dk(  rL|s|	s|d   |d   |d   f}n|r|	r|d   |d   |d   f}n|r|	s|d   |d   |d   f}nz|sx|	rv|d   |d   |d   f}ngt        |      dk(  rYt        |      dk(  rK|s|	s|d   |d   |d   f}n8|r|	r|d   |d   |d   f}n%|r|	s|d   |d   |d   f}n|s|	r|d   |d   |d   f}|
st        d	| d
| d| d
|	 d	      S )Nz3Expected torch.int8 input tensors A and B, but got  and Tr   r   r   FrU  z?Tensor dimensions incorrect for matrix mulitiplication: A x B:  x z with transpose for A x B: r  )	rB   rC   is_initializedinitrZ   	TypeErrorr\   r   r  )rq   r   ro   transposed_Atransposed_Bexpected_typesAsBtAtBcorrectsouts               r   check_matmulr  M  sr   ::$$&

ww-177m#;MaggYV[\]\c\c[deff	
B	
B	B	BG
2w!|B1"qwwqz!9GqwwqzQWWQZ7GB1771:3GqwwqzQWWQZ7G	RA#b'Q,"qwwqz!9GqwwqzQWWQZ7GB1771:3GqwwqzQWWQZ7G	RA#b'Q,"qwwqz!9GqwwqzQWWQZ7GB1771:3GqwwqzQWWQZ7G
yy3r7a<CGqLAw"Q%DGr!u$4A"Q%BqEUWXYUZNr7a<CGqLb1r!u~1r!u~B1r!u~B1r!u~W\c"glb1r!ube,1r!ube,B1r!ube,B1r!ube,W\c"glb1r!ube,1r!ube,B1r!ube,B1r!ube,MbTQTUWTXXstvswwz{}z~~  A
 	
 Kr!   r   c           	         |t        d      |j                  }|j                  r#t        ||j                        |j
                  z   }|Zt        j                  j                  j                  j                  | ||j                  ||j                  |j                  |       |S t        j                  j                  j                  j                  | ||j                  ||j                  |j                        S )NzIstate cannot be None. gemv_4bit() requires the state from quantize_4bit()rL  )r  r  r  rM  r  r   rB   rB  rC  	gemv_4bitro   r\   r   r
  rE  )rq   r   ro   r  r  rn  r  s          r   r  r    s     }dee\\F||%fell;ellJ
		((,,KKJJOO 	- 	
 
99!!++33		

 r!   c                    t        | ||||      }|0t        j                  |t        j                  | j                        }t        | j                        dk(  rct        |j                        dk(  rK| j                  d   |j                  d   k(  r,| j                  d   |j                  d   k(  rt        | ||      S | j                  }|j                  }|rt        |      dk(  r|d   |d   f}n|rt        |      dk(  r|d   |d   |d   f}|rt        |      dk(  r|d   |d   f}n|rt        |      dk(  r|d   |d   |d   f}t        |      dk(  rx|j                         d   |j                  d   k(  rd}n%|j                         d   |j                  d   k(  rd}t        | j                        dk(  rL| j                         d   | j                  d   k(  rd}nq| j                         d   | j                  d   k(  rNd}nK| j                         d   | j                  d   k(  rd}n%| j                         d   | j                  d   k(  rd}t        |      dk(  r|d   }| j                         |rdnd   }	n,t        |      dk(  rt        |      dk(  r|d   |d   z  }|d   }	|d   }
|d   }|j                         |rdnd   }|d   }ngt        |      dk(  rYt        |      dk(  sJ |d   |d   k(  r|d   |d   k(  st        d| d	|       d}d}|d   }
|d   }|d   |d   z  }|
}|d   }	|
}t        j                         j                  | j                        }t        || |g       t        j                  |t!        j"                  |      t!        j"                  |      t!        j$                  
      t!        j$                        t!        j$                        t'        |      t'        |       t'        |      t!        j$                        t!        j$                  	      t!        j$                               |S )
NsizerZ   rI   rU  r   r   r   FTzMOnly bsi,bso->io supported for tensor contractions, but dims for A x B were: r  )r  rB   r   r  rI   r   r\   batched_igemmstrider  r;   r,   rH   r   r   cigemmrF   r  rv   rt   )rq   r   ro   r  r  r  r  r  r  ldbmr$  ldaldcptrs                  r   igemmr    s    1c<>D
{kkt5;;qxxH
177|qS\Q.771:#
aggaj(@ As++	
B	
BB1eRU^	#b'Q,eRUBqE"B1eRU^	#b'Q,eRUBqE" 2w!|88:a=AGGAJ& LXXZ]aggaj(Lqww<1xxz!}
*$A!''!*,#xxz!}
*$A!''!*,#r7a<1A((*,QA6CW\c"gl11AQ%CqEqEhhj|!4e	RA2w!||1A2a5BqE>_`b_ccfgifjk  qEqEqEBqEMe

%
%
'
3
3AHH
=C q!SkJJ
		,
		,


1


1


1




3


3


3 Jr!   c                    t        | j                        dk(  rt        |j                        dk(  s%t        d| j                   d|j                         t        | ||||      }|0t	        j
                  |t        j                  | j                        }|j                         r|j                         d   }d}n|j                         }|d   |j                  d   k7  r$|j                         }|j                         d   }n|d   |j                  d   k(  rd	}|j                         d   }n{|d   dk(  r$|j                         }|j                         d   }nO|d   dk(  r$|j                         }|j                         d   }n#|j                         }|j                         d   }| j                         r| j                         d   }d}n| j                         }|d   | j                  d   k7  r&| j                         } | j                         d   }d}nP|d   | j                  d   k(  r| j                         d   }d	}n%| j                         } | j                         d   }d}| j                  d   }	| j                  d   }
|j                  d   }|j                  d   }|}|j                  d   |j                  d   z  }| j                  d   | j                  d   z  }| j                  d   |j                  d   z  }t        j                         j                  | j                        }t        || |g       t        j                   |t#        j$                  |      t#        j$                  |      t#        j&                  |      t#        j&                  |
      t#        j&                  |      t)        |      t)        |       t)        |      t#        j&                  |      t#        j&                  |      t#        j&                  |      t#        j*                  |      t#        j*                  |      t#        j*                  |      t#        j,                  |	             |S )
NrU  z@Expected 3-dimensional tensors for bmm, but got shapes A and B: r  r  r   Fr   r   T)r   r\   r  r  rB   r   r  rI   is_contiguousr  
contiguousr;   r,   rH   r   r   cbatched_igemmrF   r  rv   rt   c_longc_uint32)rq   r   ro   r  r  r  r  sr  	num_batchr  r  r$  r  strideAstrideBstrideCr  s                     r   r  r  +  so    qww<1CLA$5[\]\c\c[ddijkjqjqirstt1c<>D
{kkt5;;qxxHhhjmHHJQ41771:A((*Q-CqTQWWQZL((*Q-CtqyLLNhhjm1LLNhhjmLLNhhjmhhjmHHJQ41771:A((*Q-C LqTQWWQZ((*Q-CLA((*Q-C L 
I	
A	
A	
A
Cggaj1771:%Gggaj1771:%Gggaj1771:%G

%
%
'
3
3AHH
=Cq!Sk
		,
		,


1


1


1




3


3


3
		'
		'
		'
I!$ Jr!   c                     |7t         j                  j                  j                  j	                  | ||       |S t         j                  j                  j                  j                  | |      S )aL  Performs an 8-bit integer matrix multiplication.

    A linear transformation is applied such that `out = A @ B.T`. When possible, integer tensor core hardware is
    utilized to accelerate the operation.

    Args:
        A (`torch.Tensor`): The first matrix operand with the data type `torch.int8`.
        B (`torch.Tensor`): The second matrix operand with the data type `torch.int8`.
        out (`torch.Tensor`, *optional*): A pre-allocated tensor used to store the result.
        dtype (`torch.dtype`, *optional*): The expected data type of the output. Defaults to `torch.int32`.

    Raises:
        `NotImplementedError`: The operation is not supported in the current environment.
        `RuntimeError`: Raised when the cannot be completed for any other reason.

    Returns:
        `torch.Tensor`: The result of the operation.
    )rB   rB  rC  int8_linear_matmulro   rE  )rq   r   ro   rZ   s       r   r  r    sS    & 		1155aC@
99!!44<<QBBr!   	row_stats	col_statsr   c                     t         j                  j                  j                  j	                  | ||t         j
                  |      }||j                  |      S |S )a  Performs dequantization on the result of a quantized int8 matrix multiplication.

    Args:
        A (`torch.Tensor` with dtype `torch.int32`): The result of a quantized int8 matrix multiplication.
        row_stats (`torch.Tensor`): The row-wise quantization statistics for the lhs operand of the matrix multiplication.
        col_stats (`torch.Tensor`): The column-wise quantization statistics for the rhs operand of the matrix multiplication.
        out (`torch.Tensor`, *optional*): A pre-allocated tensor to store the output of the operation.
        bias (`torch.Tensor`, *optional*): An optional bias vector to add to the result.

    Returns:
        `torch.Tensor`: The dequantized result with an optional bias, with dtype `torch.float16`.
    )rZ   r   )rB   rB  rC  int8_mm_dequantrE  r   rG  )rq   r  r  ro   r   results         r   r  r    sR    & YY##33;;Ay)[`[h[hos;tF yy  Mr!   nnz_block_ptrc                 0   | j                         sJ d}||z| j                         j                  d| j                  d         }|dkD  r||k\  }|j	                  |d       |t        | |      }|!|j                  dd      j                         }|||fS )a   "Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    The row-wise and column-wise absmax values are determined.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    <Tip>
    This function is useful for training, but for inference it is advised to use [`get_row_absmax`] instead.
    The column-wise quantization scales are not typically needed in inference scenarios.
    </Tip>

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): Input tensor.
        row_stats (`torch.Tensor`, *optional*): If provided, calculation of row statistics is skipped.
        col_stats (`torch.Tensor`, *optional*): If provided, calculation of column statistics is skipped.
        nnz_block_ptr (`torch.Tensor`, *optional*): Not used.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.
            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing quantization statistics.
        - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization statistics.
        - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization statistics.
        - `torch.Tensor` with dtype `torch.bool`, *optional*: A mask indicating the locations of outliers in the input tensor.
    Nr/   r   r   Fdimkeepdim)is_floating_pointr   rh   r\   masked_fill_get_row_absmaxamaxr"  )rq   r  r  r  	thresholdoutlier_maskabsAs          r   get_colrow_absmaxr    s    D    LI-uuw||B,s?9,LlC0 'q)4I		a	7==?Ii--r!   c                 *   | j                   t        j                  k(  sJ t        | j                  dd       }| j                  d   }t        j
                  |ft        j                  | j                        }t        | g       t        |       5  t        j                  t        |       t        |      t        j                  |      t        j                  |      t        j                  |      t!        |              ddd       |S # 1 sw Y   |S xY w)aT  Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.
            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `torch.Tensor` with dtype `torch.float32`: The absolute maximum value for each row, with outliers ignored.
    Nr/   rY   )rZ   rB   r   r   r\   emptyr}   rI   r   rU   r   cget_row_statsrt   rF   r   rv   r   )rq   r  rowscolsr  s        r   r  r    s      77emm###D772;DTG5==JIqcN		 
AJIJJy!JJtJJtq!	

 
 s   A/DDc                   h    e Zd Zdedededej
                  dej
                  dej
                  fdZy)	COOSparseTensorr  r  nnzrowidxcolidxr   c                    |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j	                         |k(  sJ |j	                         |k(  sJ |j	                         |k(  sJ || _        || _        || _        || _        || _	        || _
        y r#   )rZ   rB   r  r   r   r  r  r  r  r  r   )r   r  r  r  r  r  r   s          r   r   zCOOSparseTensor.__init__"  s     ||u{{***||u{{***||u}},,,||~$$$||~$$$||~$$$		r!   N)r5   r6   r7   r   rB   r   r   r9   r!   r   r  r  !  sE    "),6;llLQLLbgbnbnr!   r  c                       e Zd Zd Zy)CSRSparseTensorc                    |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j	                         |k(  sJ |j	                         |k(  sJ |j	                         |dz   k(  sJ || _        || _        || _        || _        || _	        || _
        y Nr   )rZ   rB   r  r   r   r  r  r  rowptrr  r   )r   r  r  r  r  r  r   s          r   r   zCSRSparseTensor.__init__5      ||u{{***||u{{***||u}},,,||~$$$||~$$$||~)))		r!   Nr5   r6   r7   r   r9   r!   r   r
  r
  4      r!   r
  c                       e Zd Zd Zy)CSCSparseTensorc                    |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j	                         |k(  sJ |j	                         |k(  sJ |j	                         |dz   k(  sJ || _        || _        || _        || _        || _	        || _
        y r  )rZ   rB   r  r   r   r  r  r  colptrr  r   )r   r  r  r  r  r  r   s          r   r   zCSCSparseTensor.__init__F  r  r!   Nr  r9   r!   r   r  r  E  r  r!   r  c                    t        j                  | j                  d      \  }}|j                  d       t        j                  | j
                  dz   ft         j                  | j                  j                        }|j                  |j                         |j                         d       |j                  d       t        | j
                  | j                  | j                  || j                  | j                         S NTreturn_countsr   rY   r   )rA   srcr  )rB   uniquer  add_r   r  r  rI   scatter_r   r   cumsum_r
  r  r  r  r   )cooAr   countsr  s       r   coo2csrr   V  s    \\$++TBNFF
KKN[[$))a-)T[[EWEWXF
OO&++-VZZ\qOA
NN1499dii64;;PTP[P[\\r!   c                 F   t        j                  | j                        \  }}| j                  |   }| j                  |   }t        j
                  |d      \  }}|j                  d       t        j                  | j                  dz   ft         j                  | j                  j                        }|j                  |j                         |j                         d       |j                  d       t        | j                   | j                  | j"                  |||      S r  )rB   r   r  r  r   r  r  r   r  r  rI   r  r   r   r  r  r  r  )r  r   
col2rowidxr  r   	colvaluesr  r  s           r   coo2cscr$  _  s    jj-OC[[$F[[$FS=IvNN1[[$))a-)T[[EWEWXF
OO)..*

!OD
NN1499dii666RRr!   c                     t        j                  |ft         j                  |      }t        j                  |ft         j                  |      }t        j                  |f||      }t        | |||||      S )NrY   )rB   r   r  r  )r  r  r  rI   rZ   r  r  r   s           r   	coo_zerosr&  k  s[    [[#u{{6BF[[#u{{6BF[[#uV<F4sFFFCCr!   out_colout_rowc                     |t        d      |t        d      |t        d      |t        d      t        j                  j                  j                  j                  | |      S )aL  Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    The statistics are determined both row-wise and column-wise (transposed).

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    <Tip>
    This function is useful for training, but for inference it is advised to use [`int8_vectorwise_quant`] instead.
    This implementation performs additional column-wise transposed calculations which are not optimized.
    </Tip>

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
        col_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantization scales.
        row_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantization scales.
        out_col (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantized data.
        out_row (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantized data.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.

            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing the quantized tensor and relevant statistics.
        - `torch.Tensor` with dtype `torch.int8`: The row-wise quantized data.
        - `torch.Tensor` with dtype `torch.int8`: The column-wise quantized data.
        - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization scales.
        - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization scales.
        - `torch.Tensor` with dtype `torch.int32`, *optional*: A list of column indices which contain outlier features.
    zUrow_stats must be None. int8_double_quant() does not support pre-allocated row_stats.zUcol_stats must be None. int8_double_quant() does not support pre-allocated col_stats.zQout_col must be None. int8_double_quant() does not support pre-allocated out_col.zQout_row must be None. int8_double_quant() does not support pre-allocated out_row.)r  )r  rB   rB  rC  int8_double_quantrE  )rq   r  r  r'  r(  r  s         r   r*  r*  r  ss    N pqqpqqlmmlmm99!!33;;A;SSr!   statsc                 j    t         j                  j                  j                  j	                  | |      S )aY  Dequantizes a tensor with dtype `torch.int8` to `torch.float32`.

    Args:
        A (`torch.Tensor` with dtype `torch.int8`): The quantized int8 tensor.
        stats (`torch.Tensor` with dtype `torch.float32`): The row-wise quantization statistics.

    Returns:
        `torch.Tensor` with dtype `torch.float32`: The dequantized tensor.
    )rB   rB  rC  int8_vectorwise_dequantrE  )rq   r+  s     r   r-  r-    s'     99!!99AA!UKKr!   c                 j    t         j                  j                  j                  j	                  | |      S )aw  Quantizes a tensor with dtype `torch.float16` to `torch.int8` in accordance to the `LLM.int8()` algorithm.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input tensor.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.

            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing the quantized tensor and relevant statistics.
        - `torch.Tensor` with dtype `torch.int8`: The quantized data.
        - `torch.Tensor` with dtype `torch.float32`: The quantization scales.
        - `torch.Tensor` with dtype `torch.int32`, *optional*: A list of column indices which contain outlier features.
    )rB   rB  rC  int8_vectorwise_quantrE  )rq   r  s     r   r/  r/    s'    $ 99!!77??9MMr!   r  c                    t        | t              s| j                  r| j                  t        j
                  k(  sJ d       t        | j                  d   | j                  d   | j                         | j                         d   j                         | j                         d   j                         | j                               } |Et	        j                  | j                  |j                  d   f|j                  |j                        }| j                  }| j                   j#                         |k(  sJ | j$                  j#                         |k(  sJ | j                  j#                         |k(  sJ | j&                  |j                  d   k(  sJ |j)                         rdnd}|j+                         |rdnd   }|j                  d   }t,        j/                         j0                  }t3        | j                         }t3        | j$                        }	t3        | j                        }
t3        |      }t3        |      }t5        j6                  | j                        }t5        j6                  | j                        }t5        j6                  | j&                        }t5        j6                  |j                  d         }t5        j6                  |      }t5        j6                  |      }t9        | j                   | j$                  | j                  ||g       t;        j<                  |||	|
||||||||t5        j>                  |             |S )Nz8Tensor must be `COOSparseTensor or a PyTorch COO tensor.r   r   )r  r  r  r  r  r   rI   rZ   FT) r  r  	is_sparselayoutrB   
sparse_coor\   _nnzindicesr   r   r  r  rI   rZ   r  r  r   r  r  r  r  rL   r,   r>   rt   rF   rv   r   r   	cspmm_coor  )r  r   ro   r  r  r  r  r  	ptrRowidx	ptrColidx	ptrValuesptrBptrCcnnzcrowsAccolsAccolsBcldbcldcs                      r   spmm_coorC    s   
 dO,~~$++1A1A"A 	
F	
A
 AA		<<>!$((*<<>!$((*;;=
 {kk499aggaj1!((!''R
((C;;#%%%;;#%%%;;#%%%99
"""OO-54L
((*<aQ
0C
''!*C

'
'
)
1
1C$I$I$I1:D3<D::dhhDZZ		"FZZ		"FZZ
#F::c?D::c?Dt{{DKKa=>MM
		,  Jr!   c                 d   |Ot        j                  | j                  |j                  d   f|j                  | j
                  j                        }| j                  }| j                  j                         |k(  sJ | j                  j                         |k(  sJ | j
                  j                         |k(  sJ | j                  |j                  d   k(  s J | j                   d|j                          |j                         rdnd}|j                         |rdnd   }|j                  d   }t        j                  | j                  d      \  }}	|	j                  d      j!                         }
t        j"                  |	d      \  }}|j!                         }|j!                         }|d   d	k  sJ d
|d    d       |j                  t         j$                  t         j&                  fv sJ t)        |
      }t)        |      }t)        |      }t)        | j                        }t)        | j                        }t)        | j
                        }t)        |      }t)        |      }t)        |      }t+        j,                  |	j                               }t+        j,                  | j                        }t+        j,                  | j                        }t+        j,                  | j                        }t+        j,                  |j                  d         }t+        j,                  |j                  d         }t+        j,                  |      }t+        j,                  |      }t/        |      5  t1        | j                  | j                  | j
                  |||g       |j                  t         j$                  k(  r#t3        j4                  ||||||||||||||       n?|j                  t         j&                  k(  r"t3        j6                  ||||||||||||||       d d d        |S # 1 sw Y   |S xY w)Nr   r1  r   z vs FTr  )
descending    z)Current max count per row is 8 but found r  )rB   r   r  r\   rI   r   rZ   r  r  r   r  r  r  r  r  cumsumr   r   r   int8rt   rF   rv   rU   r   r    cspmm_coo_very_sparse_naive_fp16 cspmm_coo_very_sparse_naive_int8)r  r   dequant_statsro   r  r  r  r  r   r  r   	max_countmax_idx	ptrOffsetptrMaxCount	ptrMaxIdxr8  r9  r:  r;  r<  ptrDequantStats	cnnz_rowsr=  r>  r?  crowsBr@  rA  rB  s                                 r   spmm_coo_very_sparserT  	  sd   
{kk499aggaj1!(($++J[J[\
((C;;#%%%;;#%%%;;#%%%99
"?tyykaggY$??"OO-54L
((*<aQ
0C
''!*C\\$++TBNFF]]1!!#FFt<IwkkmGIQ<2Z!J9UV<.XYZZ77u}}ejj1111I)$K I$I$I$I1:D3<Dm,O

6<<>*I::dhhDZZ		"FZZ		"FZZ
#FZZ
#F::c?D::c?D		 #4;;T[[!S-PQ77emm#00  WW

"00)#L JM#L Js   ,B/P%%P/g     _@ztThis function is deprecated and will be removed in a future release. Consider using `int8_vectorwise_quant` instead.vectorc                 (   |dk(  rmt        j                  |       j                         j                         }t        j                  | |z  dz        j                  t         j                        }||fS |dv rkt        j                  t        j                  |       |d      }t        j                  | t        |z  z        j                  t         j                        }||fS |dk(  r| j                  }| j                         } | j                         | j                         z
  }|dk(  rd}d	|z  }| j                         }t        j                  ||z        }	t        j                  || z  |	z
        |	z   } | |fS |d
v r| j                  }| j                         } t        j                  | |d      t        j                  | |d      z
  }d||dk(  <   d	|z  }t        j                  | |d      }t        j                  ||z        }	t        j                  || z  |	z
        |	z   } | |fS |dk(  rt        j                         5  t        j                  |       }
t        j                  |
|d      }|dz  }|
|j                  |
      kD  }t        j                  | |         }|j                  |
      |   |z  | |<   t        j                  | |z  t        z        j                  t         j                        }d d d        ||fS y # 1 sw Y   fS xY w)Nlinear   )rU  rowTr  	zeropointr   r   g     o@)vector-zeropointrow-zeropointtruncated-vectorgffffff?)rB   r   r   r"  r   r   rH  r  CrZ   minaminno_grad	expand_asr   )r<  r  r	  r  xqrZ   dynaqxminxzpxabsxr   r   s                r   vectorwise_quantri  `	  s    Xyy|!'')[[TC(++EJJ74x	(	(zz%))A,C>[[a$h(++EJJ74x	{	"GGIuuw 19DT\uuwkk$)$KKQ%+"u	<	<GGIzz!d3ejjUY6ZZTQYT\zz!d3kk$)$KKQ%+"u	)	)]]_ 	:99Q<D::dT:D#:D--C::af%D^^D)#.5AcFQX\*--ejj9B	: 4x	: 4xs   B5LLc                    |dk(  r5||z  t         t         z  z  }| j                         |z  j                  |      S |dk(  r*d||z  z  }| j                         |z  j                  |      S |dk(  rd||z  z  }| j                         }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r||z  }n||z  }|j                  |      S |dk(  r| j                         }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r	|d|z  z  }n|d|z  z  }|d|j                         z  z  }|j                  |      S |d	k(  r| j                         }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r|||z  t         t         z  z  z  }n|||z  t         t         z  z  z  }|j                  |      S |d
v r| j                         }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r)t        |j                        dk(  r|j                  d      }t        |j                        dk(  r||t         z  z  }n||t         z  z  }||t         z  z  }|j                  |      S y )NrW  rZ  r   r\  rU  r   r   r[  rY  )r]  rU  )r^  r"  r   r   r\   squeezer2   )rc  S1S2rZ   r	  r   r<  s          r   vectorwise_mm_dequantrn  	  s6   
 XBw!a% 
T!%%e,,	{	"b2g
T!%%e,,		&b2gHHJrxx=A#agg,!"3ABrxx=A#agg,!"3ABrxx=AIAIAttE{	)	)HHJrxx=A#agg,!"3ABrxx=A#agg,!"3ABrxx=ArMArMA	S2446\ttE{	u	HHJrxx=A#agg,!"3ABrxx=A#agg,!"3ABrxx=AbAE""AbAE""AttE{	5	5HHJrxx=A#agg,!"3ABrxx=A#agg,!"3ABrxx=AaKAaKA	R!VttE{r!   r4   )T)NTr#   )Tr   T)g+ew?T)TrS  r   r   )TrQ  r   )r   )Nr   r   )NNN   F)NNNNro  F)NrN  )NNNrN  )NNNrN  r  r  )NNNN)	Nr   r   r   r   r   Nr   F)r   r   Nr   )r   r   F)rS  )NFFN)NFF)NNNr   )r   )NNNNr   )r   rU  )collections.abcr   ctypesrF   r   mathr   typingr   r   r   numpyrd   rB   r   typing_extensionsr	   bitsandbytes.utilsr
   r   
cextensionr   rA  cadam32bit_grad_fp32cadam32bit_grad_fp16cadam32bit_grad_bf16cmomentum32bit_grad_32cmomentum32bit_grad_16crmsprop32bit_grad_32crmsprop32bit_grad_16clion32bit_grad_fp32clion32bit_grad_fp16clion32bit_grad_bf16cadagrad32bit_grad_32cadagrad32bit_grad_16cademamix32bit_grad_fp32cademamix32bit_grad_fp16cademamix32bit_grad_bf16r  cadam_static_8bit_grad_32cadam_static_8bit_grad_16cmomentum_static_8bit_grad_32cmomentum_static_8bit_grad_16crmsprop_static_8bit_grad_32crmsprop_static_8bit_grad_16clion_static_8bit_grad_32clion_static_8bit_grad_16r  cadam_8bit_blockwise_grad_fp32cadam_8bit_blockwise_grad_fp16cadam_8bit_blockwise_grad_bf16"cmomentum_8bit_blockwise_grad_fp32"cmomentum_8bit_blockwise_grad_fp16"cmomentum_8bit_blockwise_grad_bf16!crmsprop_8bit_blockwise_grad_fp32!crmsprop_8bit_blockwise_grad_fp16!crmsprop_8bit_blockwise_grad_bf16clion_8bit_blockwise_grad_fp32clion_8bit_blockwise_grad_fp16clion_8bit_blockwise_grad_bf16!cadagrad_8bit_blockwise_grad_fp32!cadagrad_8bit_blockwise_grad_fp16!cadagrad_8bit_blockwise_grad_bf16"cademamix_8bit_blockwise_grad_fp32"cademamix_8bit_blockwise_grad_fp16"cademamix_8bit_blockwise_grad_bf16r  r   r;   rL   rI   FIRST_CUDA_DEVICErC   device_countrU   rW   r}   rp   r0   r   r   r   r   r   r   r   FutureWarningr   r   rG   r   rt   r"  r   r   r+  rD  r   rM  r_  r   rg  rk  rc  rs  rv  rq  r{  r~  ry  r}  r)  r  r  r  r  r  rH  r  r  r  r  r  r  r  r  r  r  r
  r  r   r$  halfr&  r*  r-  r/  rC  rT  r^  ri  rn  r9   r!   r   <module>r     s$  
 %    ' '    ( I 	   	       	""""
 	!!!!
 	       	!!!!
 	       	$$$$$$9! J 	%%%%
 	))))
 	((((
 	%%%%
 	%%%%
 	))))+ : 	****** 	...... 	------ 	****** 	------ 	......5 D' '0* *2 " !ELLq1  	::q '5<< ' (5<< ( "MM2C Ku|| K!6-&S&:(V43n R]jk l !x 67 !HPv P"++ P
%x %HR[[$9 % R]jk #'	@@	%,,	@ @
 @ l@Fn
 n
f $(%)"&D||D
5<<
 D U\\"D 
%,,		D 5<<#$DR )-%)#'"&J||J*%J U\\"J 5<<
 	J
 
%,,	J J \\JZOh &*"&++_||_U\\"_ 
%,,	_ &*"&++_||_U\\"_ 
%,,	_ &*"&++J||JU\\"J 
%,,	J 5<<#$J^ )-%)"&J||J*%J U\\"J 
%,,		J
 J \\J )-%)"&J||J*%J U\\"J 
%,,		J
 J \\J )-%)"&F||F*%F U\\"F 
%,,		F
 F \\FR R]jk $("&
5<<
  
%,,	 65(()	 l& R]jk .2%)#'"&E&&.)* U\\" 5<<
 	
 
%,,	  l( R]jk&  Xell5K W]  l< R]jkF & x7M Y_  lP &*(,#d
d
d
 d
 	d

 d
 
d
 d
 	d
 U\\"d
 d
 d
 d
 d
 d
 %d
  !d
$ 
%d
N ;, (,)AAA A 	A
 U\\"A A A 
A A 	A A ELL!A A 5<<
 A A  u||$!A" #A$ %A& %'A( )A* 
+A
Aj '=
=
=
 =
 	=

 U\\"=
 =
 =
 =
 =
 
=
 =
 	=
 =
 ELL!=
 =
  ell#!=
" #=
$ %=
( 
)=
@ R]jk&2f &2 &2s &2PS &2 l&2R R]jku u u uX^ u lu$ GLjj Qn #'
""" 
%,,	"P #'bbb 
%,,	bP #']]] 
%,,	]@ X\chcncn C%,, C5<< Chu||>T C< #'#'|||| || 
%,,		
 5<<
 8 R]jk )-(,,06.||6.%6. %6. ELL)	6. 5<<x'==>6. l6.r R]jk"ell " l"J & " "]	S .3ZZ D )-(,&*&*0T||0T%0T %0T ell#	0T
 ell#0TfLu|| LELL LNU\\ N0 #'@
-
.@||@ 
%,,	@FOd 
 6
)
)X J -2JJ8 8	8r!   