from __future__ import annotations

import argparse
import copy
import importlib
import logging
import os

import numpy as np
import numpy.typing as npt
import onnx
from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto
from packaging import version

from onnxruntime.capi._pybind_state import quantize_matmul_4bits, quantize_qdq_matmul_4bits

from .calibrate import CalibrationDataReader
from .onnx_model import ONNXModel
from .quant_utils import QuantFormat, attribute_to_kwarg

logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)


class WeightOnlyQuantConfig:
    def __init__(self, algorithm, quant_format):
        """This is the Base class for Weight Only Quant Configuration.

        Args:
            algorithm:
                name of the weight-only quantization algorithm.
            quant_format: QuantFormat{QOperator, QDQ}.
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
        """
        self.algorithm = algorithm
        self.quant_format = quant_format


class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig):
    def __init__(self, ratios=None, quant_format=QuantFormat.QOperator):
        """
        This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration.
        RTN is the most straightforward way to quantize weight using scale maps.

        Args:
            ratios:
                percentile of clip. Defaults to {}.
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
        """
        assert quant_format == QuantFormat.QOperator, "RTN only supports QOperator format"
        if ratios is None:
            ratios = {}
        super().__init__(algorithm="RTN", quant_format=quant_format)
        self.ratios = ratios


class GPTQWeightOnlyQuantConfig(WeightOnlyQuantConfig):
    def __init__(
        self,
        # optional here so the CLI path can construct the config without a reader
        calibration_data_reader: CalibrationDataReader = None,
        percdamp=0.01,
        block_size=128,
        actorder=False,
        mse=False,
        perchannel=True,
        quant_format=QuantFormat.QOperator,
    ):
        """
        This is a class for GPTQ algorithm Weight Only Quant Configuration.
        GPTQ algorithm provides more accurate quantization but requires more computational resources.

        Args:
            calibration_data_reader:
                a calibration data reader. It enumerates calibration data and generates inputs for the original model.
            percdamp:
                percent of the average Hessian diagonal to use for dampening.
            block_size (int, optional):
                channel number in one block to execute a GPTQ quantization iteration.
            actorder (bool, optional):
                whether to rearrange the Hessian matrix, considering the diagonal's values.
            mse (bool, optional):
                whether to determine the scale and zero point using MSE error.
            perchannel (bool, optional):
                whether to quantize the weight per-channel.
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
        """
        assert quant_format == QuantFormat.QOperator, "GPTQ only supports QOperator format"
        super().__init__(algorithm="GPTQ", quant_format=quant_format)
        self.calibration_data_reader = calibration_data_reader
        self.percdamp = percdamp
        self.block_size = block_size
        self.actorder = actorder
        self.mse = mse
        self.perchannel = perchannel


class HQQWeightOnlyQuantConfig(WeightOnlyQuantConfig):
    def __init__(self, block_size=128, bits=4, axis=1, quant_format=QuantFormat.QOperator):
        """
        This is a class for HQQ algorithm Weight Only Quant Configuration.
        The HQQ algorithm quantizes weights without needing calibration data.

        Args:
            block_size (int, optional):
                channel number in one block to execute a HQQ quantization iteration.
            bits (int, optional):
                how many bits to represent weight.
            axis (int, optional):
                0 or 1. which axis to quantize. https://arxiv.org/pdf/2309.15531.pdf
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
        """
        assert quant_format == QuantFormat.QOperator, "HQQ only supports QOperator format"
        super().__init__(algorithm="HQQ", quant_format=quant_format)
        self.block_size = block_size
        self.bits = bits
        self.axis = axis


class DefaultWeightOnlyQuantConfig(WeightOnlyQuantConfig):
    def __init__(
        self,
        block_size: int = 128,
        is_symmetric: bool = False,
        accuracy_level: int | None = None,
        quant_format=QuantFormat.QOperator,
    ):
        """
        This is a class for weight only affine quantization configuration.

        Args:
            block_size (int, optional):
                channel number in one block to execute an affine quantization iteration.
            is_symmetric (bool, optional):
                whether to quantize the weight symmetrically.
            accuracy_level (int, optional):
                Accuracy level of the 4-bit quantized MatMul computation.
                Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details.
                (https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits)
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
        """
        super().__init__(algorithm="DEFAULT", quant_format=quant_format)
        self.block_size = block_size
        self.is_symmetric = is_symmetric
        self.bits = 4
        self.accuracy_level = accuracy_level
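
# Programmatic usage (a minimal sketch; the model paths are hypothetical):
#
#     import onnx
#     from onnxruntime.quantization.matmul_4bits_quantizer import (
#         DefaultWeightOnlyQuantConfig,
#         MatMul4BitsQuantizer,
#     )
#
#     model = onnx.load("model.onnx")
#     config = DefaultWeightOnlyQuantConfig(block_size=32, is_symmetric=True)
#     quantizer = MatMul4BitsQuantizer(model, algo_config=config)
#     quantizer.process()
#     quantizer.model.save_model_to_file("model_q4.onnx", True)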

def is_divisible(val1, val2):
    return int(val2 * np.ceil(val1 / val2)) == val1


class HQQWeightOnlyQuantizer:
    def __init__(self, config: HQQWeightOnlyQuantConfig):
        self.config = config

    # Proximal solver: refine scale/zero to minimize ||w - dequant(quant(w))||_p^p
    @staticmethod
    def optimize_weights(tensor, scale, zero, min_max: list[int], axis: int = 0, opt_params: dict = None, verbose=False):
        import torch

        opt_params = {"lp_norm": 0.7, "beta": 1e1, "kappa": 1.01, "iters": 20} if opt_params is None else opt_params
        lp_norm, beta, kappa, iters = (
            opt_params["lp_norm"],
            opt_params["beta"],
            opt_params["kappa"],
            opt_params["iters"],
        )

        dtype = torch.float16 if tensor.is_cuda else torch.float32
        w_f = tensor.to(dtype)
        scale = scale.to(dtype)
        zero = zero.to(dtype)

        if lp_norm == 1:
            # soft-thresholding (shrinkage) operator for the L1 norm
            def shrink_op(x, beta):
                return torch.sign(x) * torch.nn.functional.relu(torch.abs(x) - 1.0 / beta)

        else:
            # generalized shrinkage for an Lp norm with p < 1
            def shrink_op(x, beta, p=lp_norm):
                return torch.sign(x) * torch.nn.functional.relu(
                    torch.abs(x) - (1.0 / beta) * torch.pow(torch.abs(x) + 1e-8, p - 1)
                )

        best_error = 1e4
        for i in range(iters):
            w_q = torch.round(w_f * scale + zero).clamp(min_max[0], min_max[1])
            w_r = (w_q - zero) / scale
            w_e = shrink_op(w_f - w_r, beta)
            zero = torch.mean(w_q - (w_f - w_e) * scale, axis=axis, keepdim=True)
            beta *= kappa

            current_error = float(torch.abs(w_f - w_r).mean())
            if verbose:
                print(i, np.round(current_error, 6))
            if current_error < best_error:
                best_error = current_error
            else:
                break

        del w_f, w_q, w_r, w_e

        return scale, zero

    @staticmethod
    def pack_on_row_fast_248bit(pack_tensor, ori_int_tensor, bits):
        if pack_tensor.shape[0] == ori_int_tensor.shape[0]:
            ori_int_tensor = ori_int_tensor.T
            pack_tensor = pack_tensor.T
        if bits in [2, 4, 8]:
            compress_ratio = pack_tensor.element_size() * 8 // bits
            for j in range(compress_ratio):
                pack_tensor[0:] |= ori_int_tensor[j::compress_ratio] << (bits * j)
        else:
            raise NotImplementedError("Only 2,4,8 bits are supported.")
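
    # Packing sketch: with bits=4 a uint8 holds compress_ratio = 8 // 4 = 2 values.
    # For the 4-bit values [3, 10], pack_on_row_fast_248bit stores
    # 3 | (10 << 4) = 0xA3 in one byte; values are interleaved with stride
    # compress_ratio, low bits first.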

    # adapted from the official implementation of Half-Quadratic Quantization (HQQ)
    def quantize_internal(
        self, tensor, bits=4, channel_wise=True, group_size=64, optimize=True, round_zero=True, axis=1
    ):
        import torch

        weight = tensor.float()
        ori_shape = weight.shape

        # pad the quantization axis up to a multiple of group_size
        pad_len = (group_size - ori_shape[axis] % group_size) % group_size
        if axis == 1:
            weight = torch.nn.functional.pad(weight, (0, pad_len), "constant", 0)
        else:
            weight = torch.nn.functional.pad(weight, (0, 0, 0, pad_len), "constant", 0)
        shape = weight.shape

        # reshape for grouping
        if (group_size is not None) and channel_wise:
            weight = weight.reshape([-1, group_size]) if (axis == 1) else weight.reshape([group_size, -1])

        # get min/max values
        if channel_wise is False:
            _min, _max = weight.min(), weight.max()
            optimize = False
        else:
            _min = weight.min(axis=axis, keepdim=True)[0]
            _max = weight.max(axis=axis, keepdim=True)[0]

        max_v = 2**bits - 1
        min_v = 0
        min_max = [min_v, max_v]

        # work with the inverse of the scale (quantize via w*scale + zero);
        # clamp to avoid half-precision problems, and guard all-constant groups
        scale = (max_v / (_max - _min)).clamp(max=2e4)
        min_max_axis = _max - _min
        if (min_max_axis == 0).sum().item() > 0:
            min_max_axis[min_max_axis == 0] = max_v
            scale = (max_v / min_max_axis).clamp(max=2e4)
        zero = -_min * scale

        if round_zero:
            zero = torch.round(zero)

        # fine-tune scale and zero point
        if optimize:
            scale, zero = self.optimize_weights(tensor=weight, scale=scale, zero=zero, min_max=min_max, axis=axis)

        # quantize
        w_q = torch.round(weight * scale + zero).clamp(min_max[0], min_max[1])
        w_q = w_q.reshape(shape).int()

        # invert the scale back for dequantization
        scale = 1.0 / scale
        if axis == 1:
            scale = scale.reshape(shape[0], -1)
            zero = zero.reshape(shape[0], -1)
        else:
            scale = scale.reshape(-1, shape[-1])
            zero = zero.reshape(-1, shape[-1])
        del weight, _min, _max

        return w_q, scale.to(tensor.dtype), zero.to(tensor.dtype)

    def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]:
        """
        If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node.
        If QOperator format, return MatMulNbits. If QDQ format, return DeQuantizeLinear + MatMul.
        """
        if node.op_type != "MatMul":
            return [node]  # only care about MatMul for now
        import torch

        logger.info(f"start to quantize {node.name} ...")
        input_b = node.input[1]
        b_pb, bs_graph = get_initializer(input_b, graph_stack)
        if b_pb is None:
            logger.info("MatMul doesn't have const weight. Skip to quantize")
            return [node]  # only care about constant weight

        b_array = onnx.numpy_helper.to_array(b_pb)
        if len(b_array.shape) != 2:
            logger.info("MatMul weight is not 2D. Skip to quantize")
            return [node]  # can only process 2-D matrix
        b_array_torch = torch.from_numpy(b_array)
        if torch.cuda.is_available():
            b_array_torch = b_array_torch.cuda()
        quant_weight_torch, scales_torch, zero_points_torch = self.quantize_internal(
            b_array_torch.T, bits=self.config.bits, group_size=self.config.block_size
        )
        quant_weight_torch = quant_weight_torch.contiguous()
        scales_torch = scales_torch.contiguous()
        zero_points_torch = zero_points_torch.contiguous()

        packed_torch = torch.zeros(
            (quant_weight_torch.shape[0], quant_weight_torch.shape[1] // 2),
            dtype=torch.uint8,
            device=quant_weight_torch.device,
        )
        self.pack_on_row_fast_248bit(packed_torch, quant_weight_torch, self.config.bits)
        scales = scales_torch.cpu().numpy()
        zero_points = zero_points_torch.cpu().numpy()
        # reshape to the predefined shape in MatMulNBits
        scales = scales.reshape(-1)
        zero_points = zero_points.reshape(-1)
        rows, cols = b_array_torch.shape
        block_size = self.config.block_size
        blob_size = block_size // 2
        k_blocks = (rows + block_size - 1) // block_size
        packed_torch = packed_torch.reshape(cols, k_blocks, blob_size)

        b_quant = onnx.numpy_helper.from_array(packed_torch.cpu().numpy())
        b_quant.name = b_pb.name + "_Q4"
        for input_ in bs_graph.input:
            if input_.name == input_b:
                bs_graph.input.remove(input_)
                break

        scales_tensor = onnx.numpy_helper.from_array(scales)
        scales_tensor.name = b_pb.name + "_scales"
        bs_graph.initializer.extend([b_quant, scales_tensor])

        input_names = [node.input[0], b_quant.name, scales_tensor.name]
        zp_tensor = onnx.numpy_helper.from_array(zero_points)
        zp_tensor.name = b_pb.name + "_zero_points"
        bs_graph.initializer.extend([zp_tensor])
        input_names.append(zp_tensor.name)

        kwargs = {}
        rows, cols = b_array.shape
        kwargs["K"] = rows
        kwargs["N"] = cols
        kwargs["bits"] = self.config.bits
        kwargs["block_size"] = self.config.block_size

        matmul_q4_node = onnx.helper.make_node(
            "MatMulNBits",
            inputs=input_names,
            outputs=[node.output[0]],
            name=node.name + "_Q4" if node.name else "",
            domain="com.microsoft",
            **kwargs,
        )

        logger.info(f"complete quantization of {node.name} ...")

        return [matmul_q4_node]

def get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]:
    # search from the innermost graph outwards so the nearest enclosing scope wins
    for gid in range(len(graph_path) - 1, -1, -1):
        graph = graph_path[gid]
        for tensor in graph.initializer:
            if tensor.name == name:
                return tensor, graph
    return None, None
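
# For example, while quantizing a node inside an If branch, graph_path would be
# [main_graph, if_branch_graph]; an initializer defined in the branch shadows a
# same-named one in the main graph.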

class DefaultWeightOnlyQuantizer:
    def __init__(self, config: DefaultWeightOnlyQuantConfig):
        self.config = config

    def int4_block_quant(self, fp32weight: npt.ArrayLike) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """4b quantize fp32 weight to a blob"""

        if len(fp32weight.shape) != 2:
            raise ValueError("Current int4 block quantization only supports 2D tensors!")
        rows, cols = fp32weight.shape

        block_size = self.config.block_size
        k_blocks = (rows + block_size - 1) // block_size

        if self.config.quant_format == QuantFormat.QOperator:
            blob_size = block_size // 2
            padded_rows = k_blocks * block_size
            pad_len = padded_rows - rows
            if pad_len > 0:
                fp32weight = np.pad(fp32weight, ((0, pad_len), (0, 0)), "constant")

            # block wise quantization, each block comes from a single column
            packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8")
            zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8")
            scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype)
            quantize_matmul_4bits(
                packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric
            )
        else:
            packed = np.zeros((rows * cols + 1) // 2, dtype="uint8")
            zero_point = np.zeros((cols * k_blocks + 1) // 2, dtype="uint8")
            scales = np.zeros((k_blocks, cols), dtype=fp32weight.dtype)
            quantize_qdq_matmul_4bits(
                packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric
            )

        return (packed, scales, zero_point)
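
    # Shape sketch for the QOperator path: a (K=1024, N=4096) weight with
    # block_size=32 gives k_blocks = 32 and blob_size = 32 // 2 = 16, so the
    # packed tensor is (4096, 32, 16) uint8 and scales holds 4096 * 32 entries.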

    def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]:
        """
        If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node.
        If QOperator format, return MatMulNbits. If QDQ format, return DeQuantizeLinear + MatMul.
        """
        if node.op_type != "MatMul":
            return [node]  # only care about MatMul for now

        logger.info(f"start to quantize {node.name} ...")
        qtype = TensorProto.INT4 if self.config.is_symmetric else TensorProto.UINT4
        input_b = node.input[1]
        b_tensor, b_graph = get_initializer(input_b, graph_stack)
        if b_tensor is None:
            logger.info("MatMul doesn't have const weight. Skip to quantize")
            return [node]  # only care about constant weight

        b_ndarray = onnx.numpy_helper.to_array(b_tensor)
        if len(b_ndarray.shape) != 2:
            logger.info("MatMul weight is not 2D. Skip to quantize")
            return [node]  # can only process 2-D matrix

        packed, scales, zero_points = self.int4_block_quant(b_ndarray)

        if self.config.quant_format == QuantFormat.QOperator:
            b_quant = onnx.numpy_helper.from_array(packed, b_tensor.name + "_Q4")
            scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_scales")
        else:
            b_quant = onnx.helper.make_tensor(b_tensor.name + "_DQ_Q4", qtype, b_ndarray.shape, packed.tobytes(), True)
            scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_DQ_scales")

        for input_ in b_graph.input:
            if input_.name == input_b:
                b_graph.input.remove(input_)
                break

        b_graph.initializer.extend([b_quant, scales_tensor])

        output_nodes = []
        if self.config.quant_format == QuantFormat.QOperator:
            input_names = [node.input[0], b_quant.name, scales_tensor.name]
            if not self.config.is_symmetric:
                zp_tensor = onnx.numpy_helper.from_array(zero_points, b_tensor.name + "_zero_points")
                input_names.append(zp_tensor.name)
                b_graph.initializer.extend([zp_tensor])

            kwargs = {}
            rows, cols = b_ndarray.shape
            kwargs["K"] = rows
            kwargs["N"] = cols
            kwargs["bits"] = 4
            kwargs["block_size"] = self.config.block_size
            if self.config.accuracy_level is not None:
                kwargs["accuracy_level"] = self.config.accuracy_level

            matmul_q4_node = onnx.helper.make_node(
                "MatMulNBits",
                inputs=input_names,
                outputs=[node.output[0]],
                name=node.name + "_Q4" if node.name else "",
                domain="com.microsoft",
                **kwargs,
            )
            output_nodes.append(matmul_q4_node)
        else:
            dq_input_names = [b_quant.name, scales_tensor.name]
            dq_output_names = [b_quant.name + "_output"]
            matmul_input_names = [node.input[0], dq_output_names[0]]
            matmul_output_names = [node.output[0]]
            if not self.config.is_symmetric:
                zp_tensor = onnx.helper.make_tensor(
                    b_tensor.name + "_DQ_zero_points", qtype, scales.shape, zero_points.tobytes(), True
                )
                dq_input_names.append(zp_tensor.name)
                b_graph.initializer.extend([zp_tensor])
            dq_kwargs = {"axis": 0, "block_size": self.config.block_size}
            dq_node = onnx.helper.make_node(
                "DequantizeLinear",
                inputs=dq_input_names,
                outputs=dq_output_names,
                name=node.name + "_DQ_Q4" if node.name else "",
                **dq_kwargs,
            )
            matmul_node = onnx.helper.make_node(
                "MatMul",
                inputs=matmul_input_names,
                outputs=matmul_output_names,
                name=node.name + "_matmul_Q4" if node.name else "",
            )
            output_nodes.extend([dq_node, matmul_node])

        logger.info(f"complete quantization of {node.name} ...")
        return output_nodes
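
# Resulting graph patterns (sketch): for a MatMul node named "m", the QOperator
# format emits a single com.microsoft MatMulNBits node "m_Q4", while the QDQ
# format emits DequantizeLinear "m_DQ_Q4" feeding a standard MatMul "m_matmul_Q4".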

class MatMul4BitsQuantizer:
    """
    Perform 4b quantization of constant MatMul weights.
    If algo_config.quant_format is QOperator, the quantized weight is stored in a MatMulNBits node, which replaces the
    MatMul node.
    If algo_config.quant_format is QDQ, the quantized weight is stored in a DeQuantizeLinear node. The MatMul node is
    replaced by the DequantizeLinear + MatMul nodes.
    """

    def __init__(
        self,
        model: ModelProto | str,
        block_size: int = 128,
        is_symmetric: bool = False,
        accuracy_level: int | None = None,
        nodes_to_exclude=None,
        algo_config: WeightOnlyQuantConfig | None = None,
    ):
        if nodes_to_exclude is None:
            nodes_to_exclude = []
        self.model = ONNXModel(onnx.load(model)) if isinstance(model, str) else ONNXModel(model)
        self.model_path = model if isinstance(model, str) else None
        self.block_size = block_size
        self.is_symmetric = is_symmetric
        self.accuracy_level = accuracy_level
        self.nodes_to_exclude = set(nodes_to_exclude)
        self.node_quantizer = None
        if algo_config is None:
            algo_config = DefaultWeightOnlyQuantConfig(
                block_size=block_size, is_symmetric=is_symmetric, accuracy_level=accuracy_level
            )
        self.algo_config = algo_config
        if self.algo_config.algorithm == "HQQ":
            self.node_quantizer = HQQWeightOnlyQuantizer(self.algo_config)
        elif self.algo_config.algorithm == "DEFAULT":
            self.node_quantizer = DefaultWeightOnlyQuantizer(self.algo_config)

    def _process_subgraph(self, graph_stack: list[GraphProto]):
        new_nodes = []
        graph = graph_stack[-1]

        for node in graph.node:
            # recurse into graph-valued attributes (e.g. If/Loop/Scan bodies)
            graph_attrs = [
                attr
                for attr in node.attribute
                if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
            ]
            if len(graph_attrs):
                kwargs = {}
                for attr in node.attribute:
                    if attr.type == onnx.AttributeProto.GRAPH:
                        graph_stack.append(attr.g)
                        kv = {attr.name: self._process_subgraph(graph_stack)}
                    elif attr.type == onnx.AttributeProto.GRAPHS:
                        value = []
                        for subgraph in attr.graphs:
                            graph_stack.append(subgraph)
                            value.extend([self._process_subgraph(graph_stack)])
                        kv = {attr.name: value}
                    else:
                        kv = attribute_to_kwarg(attr)
                    kwargs.update(kv)
                node = onnx.helper.make_node(node.op_type, node.input, node.output, name=node.name, **kwargs)

            if node.name in self.nodes_to_exclude:
                logger.info(f"exclude to quantize {node.name} as specified by nodes_to_exclude...")
                out_nodes = [node]
            else:
                out_nodes = self.node_quantizer.quantize(node, graph_stack)
            new_nodes.extend(out_nodes)

        graph.ClearField("node")
        graph.node.extend(new_nodes)
        graph_stack.pop()
        return graph

    def _generate_q4_node_config(self):
        """Generate weight only quant configuration for nodes."""
        q4_node_config = {}
        template_config_q4 = {
            "bits": 4,
            "group_size": self.block_size,
            "scheme": "sym" if self.is_symmetric else "asym",
        }
        for node in self.model.model.graph.node:
            if node.op_type in ["MatMul"]:
                if not all(self.model.get_initializer(i) is None for i in node.input):
                    q4_node_config[node.name] = template_config_q4
        return q4_node_config
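
    # For a model whose only const-weight MatMul is named "/mlp/fc1/MatMul"
    # (hypothetical name), block_size=32 and is_symmetric=False would yield:
    #   {"/mlp/fc1/MatMul": {"bits": 4, "group_size": 32, "scheme": "asym"}}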

    def int4_quant_algo(self):
        """4b quantize a model with RTN or GPTQ algorithm. Please refer to
        https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md
        for more details on weight only quantization using Intel® Neural Compressor.
        """

        def inc_dataloader():
            data_reader = copy.deepcopy(self.algo_config.calibration_data_reader)
            for data in data_reader:
                yield data, None

        kwargs = {}
        if self.accuracy_level is not None:
            kwargs["accuracy_level"] = self.accuracy_level
        weight_only_node_config = self._generate_q4_node_config()

        algorithm = self.algo_config.algorithm
        logger.info(f"start to quantize model with {algorithm} algorithm...")
        if algorithm == "RTN":
            from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize

            kwargs["ratios"] = self.algo_config.ratios

            self.model = rtn_quantize(
                model=self.model_path if self.model_path is not None else self.model.model,
                weight_config=weight_only_node_config,
                **kwargs,
            )
        elif algorithm == "GPTQ":
            from neural_compressor.adaptor.ox_utils.weight_only import gptq_quantize

            kwargs["percdamp"] = self.algo_config.percdamp
            kwargs["blocksize"] = self.algo_config.block_size
            kwargs["actorder"] = self.algo_config.actorder
            kwargs["mse"] = self.algo_config.mse
            kwargs["perchannel"] = self.algo_config.perchannel
            kwargs["n_samples"] = -1
            dataloader = inc_dataloader()

            self.model = gptq_quantize(
                model=self.model_path if self.model_path is not None else self.model.model,
                weight_config=weight_only_node_config,
                dataloader=dataloader,
                **kwargs,
            )
        logger.info(f"complete quantization of model with {algorithm} algorithm.")

    def process(self):
        if self.algo_config.algorithm in ["HQQ", "DEFAULT"]:
            # use a stack to keep track of sub-graphs
            graph_stack = [self.model.graph()]

            # update domain opset
            if self.algo_config.quant_format == QuantFormat.QOperator:
                self.model.set_opset_import("com.microsoft", 1)
            else:
                opset_import = self.model.opset_import()
                for opset in opset_import:
                    if opset.domain in [None, "ai.onnx", ""] and opset.version < 21:
                        logger.warning(
                            "The opset of the input model is under 21 and doesn't support int4 data type. "
                            "Force to update it to opset 21, but the generated model may not be a valid model."
                        )
                        self.model.set_opset_import(opset.domain, 21)

            self._process_subgraph(graph_stack)
            self.model.clean_initializers()
        else:
            # use Intel® Neural Compressor for RTN or GPTQ weight-only quantization
            try:
                importlib.import_module("neural_compressor")
            except Exception as e:
                logging.error(f"{e}.")
                raise RuntimeError(
                    "neural-compressor is not correctly installed. Please check your environment."
                ) from e

            import neural_compressor

            assert version.parse(neural_compressor.__version__) >= version.parse(
                "2.3.2"
            ), "Require neural-compressor >= 2.3.2 to support weight only quantization!"

            self.int4_quant_algo()


def ort_convert_str_to_bool(value):
    return value.lower() in ("true", "1")

def parse_args():
    parser = argparse.ArgumentParser(
        description="""Blockwise int4 quantization for MatMul 2D weight matrices.

A weight matrix is partitioned into blocks, where each block is a
contiguous subset inside each column. Each block is quantized into a
set of 4b integers with a scaling factor and an optional offset.
"""
    )

    parser.add_argument("--input_model", required=True, help="Path to the input model file")
    parser.add_argument("--output_model", required=True, help="Path to the output model file")
    parser.add_argument("--block_size", required=False, default=32, type=int, help="Block size for quantization")
    parser.add_argument(
        "--quant_method",
        default="default",
        type=str,
        choices=["default", "hqq", "rtn", "gptq"],
        help="the algorithm used to quantize weight; rtn and gptq leverage Intel® Neural Compressor",
    )
    parser.add_argument("--bits", default=4, type=int, help="the target bits to represent weight")
    parser.add_argument(
        "--symmetric",
        required=False,
        default=True,
        const=True,
        nargs="?",
        type=ort_convert_str_to_bool,
        choices=[True, False],
        help="Indicate whether to quantize the model symmetrically; symmetric is not supported by hqq",
    )
    parser.add_argument(
        "--accuracy_level",
        required=False,
        type=int,
        help="Accuracy level of the 4-bit quantized MatMul computation. "
        "Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details "
        "(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).",
    )
    parser.add_argument("-v", "--verbose", required=False, action="store_true")
    parser.set_defaults(verbose=False)
    parser.add_argument(
        "--nodes_to_exclude",
        nargs="+",
        type=str,
        required=False,
        default=[],
        help="Specify the nodes to be excluded from quantization with node names",
    )
    parser.add_argument(
        "--quant_format",
        default="QOperator",
        type=str,
        choices=["QOperator", "QDQ"],
        help="QuantFormat {QOperator, QDQ}. "
        "QOperator format quantizes the model with quantized operators directly. "
        "QDQ format quantizes the model by inserting DeQuantizeLinear before the MatMul.",
    )

    return parser.parse_args()
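
# Example CLI invocation (hypothetical paths):
#   python -m onnxruntime.quantization.matmul_4bits_quantizer \
#       --input_model model.onnx --output_model model_q4.onnx \
#       --block_size 32 --symmetric True --quant_format QOperator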


r  __main__zfile z already existsr  zAsymmetric is not supportted by hqq, will force to symmetric=FalseF)r+   r2   r
if __name__ == "__main__":
    args = parse_args()
    if args.verbose:
        logger.setLevel(logging.DEBUG)

    input_model_path = args.input_model
    output_model_path = args.output_model
    quant_format = QuantFormat[args.quant_format]

    if os.path.exists(output_model_path):
        logger.error(f"file {output_model_path} already exists")
        raise Exception(f"file {output_model_path} already exists")

    if args.symmetric and args.quant_method == "hqq":
        logger.warning("Asymmetric is not supported by hqq, will force to symmetric=False")
        args.symmetric = False

    model = onnx.load(input_model_path)
    if args.quant_method == "hqq":
        quant_config = HQQWeightOnlyQuantConfig(block_size=args.block_size, bits=args.bits)
    elif args.quant_method == "default":
        quant_config = DefaultWeightOnlyQuantConfig(
            block_size=args.block_size,
            is_symmetric=args.symmetric,
            accuracy_level=args.accuracy_level,
            quant_format=quant_format,
        )
    elif args.quant_method == "rtn":
        quant_config = RTNWeightOnlyQuantConfig()
    elif args.quant_method == "gptq":
        quant_config = GPTQWeightOnlyQuantConfig(block_size=args.block_size)
    else:
        raise ValueError(f"Unsupported quantization method: {args.quant_method}")

    quant = MatMul4BitsQuantizer(
        model=model,
        accuracy_level=args.accuracy_level,
        nodes_to_exclude=args.nodes_to_exclude,
        algo_config=quant_config,
    )
    quant.process()
    quant.model.save_model_to_file(output_model_path, True)



