
    Uh5                         d dl mZmZmZmZmZ ddlmZ erddlm	Z	 ddl
mZmZmZmZ ddlmZ  e       rd dlZ ej$                  e      Z G d	 d
e      Zy)    )TYPE_CHECKINGAnyDictListOptional   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_fbgemm_gpu_availableis_torch_availablelogging)get_module_from_nameNc                       e Zd ZdZdZdZddgZ fdZd Zdd	Z	d
dddde
dee
ef   fdZ	 dd
dddde
dddee
ef   deee
      fdZd dZ	 dd
ddeee
      fdZdee
   de
dee
   fdZd ZddZedefd       Z xZS )!FbgemmFp8HfQuantizerz/
    FP8 quantization using fbgemm kernels
    TFz
fbgemm-gpu
acceleratec                 4    t        |   |fi | || _        y N)super__init__quantization_config)selfr   kwargs	__class__s      ~/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/quantizers/quantizer_fbgemm_fp8.pyr   zFbgemmFp8HfQuantizer.__init__+   s    ,77#6     c                 0   t               st        d      t               st        d      t        d      st        d      t        j
                  j                         st        d      t        j
                  j                         }|\  }}|dk  rt        d      |j                  dd       }|t        j                  d	       y |N| j                  sAt        |t              r0d
|j!                         v sd|j!                         v rt        d      y y y y )NzUsing fbgemm fp8 quantization requires torch >= 2.1.0Please install the latest version of torch ( pip install --upgrade torch )zUsing fbgemm fp8 quantization requires fbgemm-gpu libraryPlease install the latest version of fbgemm-gpu library by following : https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-librariesz0.32.2z`Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)z=Using FP8 quantized models with fbgemm kernels requires a GPU	   zXFP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)
device_mapzYou have loaded an FP8 model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. cpudiskzYou are attempting to load an FP8 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r   ImportErrorr   r   torchcudais_availableRuntimeErrorget_device_capability
ValueErrorgetloggerwarning_oncepre_quantized
isinstancedictvalues)r   argsr   compute_capabilitymajorminorr    s          r   validate_environmentz)FbgemmFp8HfQuantizer.validate_environment/   s1   !#]  '(F 
 'x0r  zz&&(^__"ZZ==?)u19j  ZZd3
| #&&z40j//11Vz?P?P?R5R n  6S 1 ' $r   returnc                     |(t         j                  }t        j                  d|       |S |t         j                  k(  rt        d      |S )Na  Overriding torch_dtype=%s with `torch_dtype=torch.bloat16` due to requirements of `fbgemm-gpu` to enable model loading in fp8. Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass torch_dtype=torch.bfloat16 to remove this warning.zeYou cannot use FP8 with torch_dtype=torch.float16.We recommend you passing torch_dtype=torch.bfloat16)r$   bfloat16r+   infofloat16r)   )r   torch_dtypes     r   update_torch_dtypez'FbgemmFp8HfQuantizer.update_torch_dtype\   sV    ..KKKF  	 EMM)w  r   modelr   param_valueztorch.Tensor
param_name
state_dictc                 R   ddl m}m} t        ||      \  }}	t	        ||      rP| j
                  s|	dk(  r.|	dk(  r(|j                  t        j                  k7  rt        d      y|	dk(  rt        d      y	t	        ||      r(| j
                  s|	dk(  ry|	d
k(  s|	dk(  rt        d      y	y)Nr
   FbgemmFp8LinearFbgemmFp8Llama4TextExpertsbiasweightz6Expect quantized weights but got an unquantized weightFweight_scalez;Expect unquantized weights but got a quantized weight_scaleTgate_up_proj_scaledown_proj_scale)
integrationsrC   rD   r   r.   r-   dtyper$   float8_e4m3fnr)   )
r   r=   r>   r?   r@   r   rC   rD   moduletensor_names
             r   check_quantized_paramz*FbgemmFp8HfQuantizer.check_quantized_paraml   s     	O25*Efo.!![F%:(*{/@/@EDWDW/W$%]^^.0$%bccf89!![F%:"66+IZ:Z$%bccr   target_deviceztorch.deviceunexpected_keysc                    ddl m} t        ||      \  }}	t        ||      r|	dk(  r|j	                  dd      }
|
j
                  }|
j                  d|d         }t        j                  j                  j                  |      \  }}|j                  |      }|j	                  dd      }|j                  |d   d|d         }n|	dk(  r|j	                  dd      }
|
j
                  }|
j                  d|d         }t        j                  j                  j                  |      \  }}|j                  |      }|j	                  dd      }|j                  |d   |d   d      }t        j                  j                  j                  |            |j                  |	 d<   nt        j                  j                  j                  |      \  }}t        j                  j                  |j                  |j
                  d   d      j                  |            |j                  |	 d<   t        j                  j                  j                  |            |j                  |	<   |||v r|j!                  |       ~y	)
z@
        Quantizes weights into weight and weight_scale
        r
   )rD   gate_up_projr   r   	down_proj_scaleN)rJ   rD   r   r.   	transposeshapereshaper$   opsfbgemmquantize_fp8_per_rownn	Parameterto_parametersviewremove)r   r=   r>   r?   rP   r@   rQ   rD   rM   rN   transposed_paramoriginal_shapeflattened_paramnew_value_flatweight_scale_flat	new_valuerG   s                    r   create_quantized_paramz+FbgemmFp8HfQuantizer.create_quantized_param   sU    	>25*Ef89n, $/#8#8A#>  "2!7!7"2":":2~b?Q"R 5:II4D4D4Y4YZi4j1 1 +22>B	%//15	0889JA~^_O`a+ $/#8#8A#>  "2!7!7"2":":2~b?Q"R 5:II4D4D4Y4YZi4j1 1 +22>B	%//15	0889JN[\L]_`a9>9K9KLOO\iLj9kF+f56&+ii&6&6&K&KK&X#I|9>9K9K!!,"4"4Q"7;>>}M:F+f56 +0((*<*<Y\\-=X*Y;'&:+H"":.r   c                     |S r    )r   r=   r   s      r   #_process_model_after_weight_loadingz8FbgemmFp8HfQuantizer._process_model_after_weight_loading   s    r   keep_in_fp32_modulesc                 &   ddl m} |j                  }| j                  || j                  j
                  |      | _        |j                  } ||| j
                  | j                  | j                  ||      }| j                  |j                  _        y )Nr
   )replace_with_fbgemm_fp8_linear)modules_to_not_convertr   r-   configtp_plan)rJ   ro   _tp_planget_modules_to_not_convertr   rp   rq   r-   )r   r=   rm   r   ro   rr   rq   s          r   $_process_model_before_weight_loadingz9FbgemmFp8HfQuantizer._process_model_before_weight_loading   s     	B..&*&E&E4++BBDX'
# .#'#>#> $ 8 8,,
 ,0+C+C(r   missing_keysprefixc                 @   ddl m}m} g }|j                         D ]h  \  }}t	        ||      st	        ||      s|D ]E  }	||	v s
|| d|	 v s|	j                  d      r#|	j                  d      r5|j                  |	       G j |D 
cg c]	  }
|
|vs|
 c}
S c c}
w )Nr
   rB   .z.weightz.bias)rJ   rC   rD   named_modulesr.   endswithappend)r   r=   rv   rw   rC   rD   not_missing_keysnamerM   missingks              r   update_missing_keysz(FbgemmFp8HfQuantizer.update_missing_keys   s    N!//1 	9LD&&/2jIc6d+ 9GDvhay4I,I ' 0 0 ; ' 0 0 9(//89	9 (Ea14D+DEEEs   
	BBc                    d|j                   j                  v rqi ddddddddddddd	d
dddddddddddddddddd
dddddddd
dddd	}|j                         ||j                         _        |S ||_        |S |S )NLlama4z layers.*.self_attn.q_proj.weightlocal_colwisez&layers.*.self_attn.q_proj.weight_scalez layers.*.self_attn.k_proj.weightz&layers.*.self_attn.k_proj.weight_scalez layers.*.self_attn.v_proj.weightz&layers.*.self_attn.v_proj.weight_scalez layers.*.self_attn.o_proj.weightlocal_rowwisezlayers.*.self_attngatherzlayers.*.input_layernorm.weightsequence_parallelz(layers.*.post_attention_layernorm.weightznorm.weightz4layers.*.feed_forward.shared_expert.gate_proj.weightz:layers.*.feed_forward.shared_expert.gate_proj.weight_scalez2layers.*.feed_forward.shared_expert.up_proj.weightz8layers.*.feed_forward.shared_expert.up_proj.weight_scalez4layers.*.feed_forward.shared_expert.down_proj.weightzlayers.*.feed_forward.expertslocallocal_packed_rowwise)	zlayers.*.feed_forwardz0layers.*.feed_forward.experts.*.gate_proj.weightz6layers.*.feed_forward.experts.*.gate_proj.weight_scalez.layers.*.feed_forward.experts.*.up_proj.weightz4layers.*.feed_forward.experts.*.up_proj.weight_scalez0layers.*.feed_forward.experts.*.down_proj.weightz*layers.*.feed_forward.experts.gate_up_projz0layers.*.feed_forward.experts.gate_up_proj_scalez'layers.*.feed_forward.experts.down_proj)r   __name__get_text_configbase_model_tp_plan)r   rq   	text_plans      r   update_tp_planz#FbgemmFp8HfQuantizer.update_tp_plan   sU   v''000$ 3O	$
 9/$ 3O$ 9/$ 3O$ 9/$ 3O$ %h$ 23F$ ;<O$ 2$& G'$( Mo)$* Eo+$, KO-$. G/$0 01$2 *2DSJYBQHWDS ?UDZ;JG$IJ %%'3>G&&(; M -6)Mr   c                      y)NTrk   )r   safe_serializations     r   is_serializablez$FbgemmFp8HfQuantizer.is_serializable#  s    r   c                      y)NFrk   )r   s    r   is_trainablez!FbgemmFp8HfQuantizer.is_trainable&  s    r   )r;   torch.dtyper6   r   r   )r=   r   )r   
__module____qualname____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r5   r<   strr   r   rO   r   r   ri   rl   ru   r   r   r   propertyboolr   __classcell__)r   s   @r   r   r   !   s7    (,$ %|47+Z   $ 	
 cNJ 04= = $= 	=
 &= cN= "$s),=~ 59D D 'tCy1D2FtCy F# FRVWZR[ F-^ d  r   r   )typingr   r   r   r   r   baser	   modeling_utilsr   utilsr   r   r   r   quantizers_utilsr   r$   
get_loggerr   r+   r   rk   r   r   <module>r      sN    < ;  0 a a 2  
		H	%G; Gr   