U
    U?h                     @   s   d dl Z d dlZd dlZd dlZd dlmZ ddlmZm	Z	 ddl
mZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ ddlm Z  G d	d
 d
eZ!dS )    N)onnx_pb   )BaseQuantizerQuantizationParams)
TensorData)	ONNXModel)TENSOR_NAME_QUANT_SUFFIXQuantizationModeQuantizedValueQuantizedValueType__producer____version__add_infer_metadataattribute_to_kwargcompute_scale_zpcompute_scale_zp_float8find_by_nameget_qmin_qmax_for_qTypeget_qrange_for_qType	ms_domain&save_and_reload_model_with_shape_infertensor_proto_to_array)CreateOpQuantizerc                   @   s   e Zd Zd:ddZdd Zdd Zdd	 Zd
d Zdd Zdd Z	dd Z
d;ddZdd Zdd Zdd Zdd Zd<ddZd=dd Zd!d" Zd>d$d%Zd&d' Zd?d(d)Zd@d+d,ZdAd.d/ZdBd0d1ZdCd2d3Zd4d5 Zd6d7 Zd8d9 ZdS )DONNXQuantizerNc                 C   s~  t | |||||||	|
|| |s| j  t| jj}dd |jjD | _| jdd |jj	D  | jdd |jj
D  t|| _|| _|| _| jdk| _d| jko| jd | _g | _d| _i | _| jdd |jj	D  | jd	d |jj
D  | jjjjD ]}| jd
d |j	D  q
| jtkrFtd| j |  | _d| _d| _d| _d| _i | _| j | _ d S )Nc                 S   s   i | ]}|j |qS  name).0vir   r   Y/var/www/html/venv/lib/python3.8/site-packages/onnxruntime/quantization/onnx_quantizer.py
<dictcomp>G   s      z*ONNXQuantizer.__init__.<locals>.<dictcomp>c                 S   s   i | ]}|j |qS r   r   r   Zotr   r   r   r    H   s      c                 S   s   i | ]}|j |qS r   r   r   itr   r   r   r    I   s      
   ZMatMulConstBOnly/c                 S   s   i | ]}|j d qS r   r   r!   r   r   r   r    U   s      c                 S   s   i | ]}|j d qS r&   r   r"   r   r   r   r    V   s      c                 S   s   i | ]
}|d qS r&   r   )r   output_namer   r   r   r    X   s      zunsupported quantization mode Zfixed_quantization_range_uint8Zfixed_quantization_range_int8Z
fixed_zeroZfixed_zero_zp)!r   __init__modelZreplace_gemm_with_matmulr   graph
value_infovalue_infosupdateoutputinputr   modestaticZopset_versionfuse_dynamic_quantextra_optionsZq_matmul_const_b_only	new_nodesgraph_scopetensor_namesnoder	   
ValueErrorcalculate_quantization_paramsquantization_paramsfixed_qrange_uint8_namefixed_qrange_int8_namefixed_zero_namefixed_zero_zp_namequantized_value_mapZget_non_initializer_inputsgenerated_value_names)selfr)   per_channelreduce_ranger0   r1   weight_qTypeactivation_qTypetensors_rangenodes_to_quantizenodes_to_excludeop_types_to_quantizer3   r7   r   r   r   r(   &   sP    


zONNXQuantizer.__init__c                 C   s~   t jj|d| jjjd}t| t|| j| j| j	| j
| j| j| j| j| j| j| j}| |_| j | d|_|  |jjjS )z
        generate submodel for the subgraph, so that we re-utilize current quantization implementation.
        quantize the submodel
        update subgraph and set it back to node
        onnx-quantizer)producer_nameZopset_importsr%   )onnxhelperZ
make_modelr)   opset_importr   r   rB   rC   r0   r1   rD   rE   rF   rG   rH   rI   r3   parentr5   quantize_modelr*   )rA   subgraphZ	graph_keyZwarped_modelZsub_quantizerr   r   r   quantize_subgrapho   s0    zONNXQuantizer.quantize_subgraphc           	      C   s  dd |j D }t|dkr |S |jr,|jn|j dt| j }i }|j D ]}|jtjjkr|j| 	|j
| d|j i}n\|jtjjkrg }|jD ].}|| 	|| d|j dt| g q|j|i}nt|}|| qLtjj|j|j|jfd|ji|S )z|
        Check subgraph, if any, quantize it and replace it.
        return new_nodes added for quantizing subgraph
        c                 S   s,   g | ]$}|j tjjks$|j tjjkr|qS r   )typerL   AttributeProtoGRAPHGRAPHS)r   attrr   r   r   
<listcomp>   s    z>ONNXQuantizer.quantize_node_with_sub_graph.<locals>.<listcomp>r   Z_node_count_:r   )	attributelenr   op_typer4   rS   rL   rT   rU   rR   grV   Zgraphsextendr   r-   rM   	make_noder/   r.   )	rA   r7   Zgraph_attrsZ	node_namekwargsrW   kvvaluerQ   r   r   r   quantize_node_with_sub_graph   s0    "
"
z*ONNXQuantizer.quantize_node_with_sub_graphc                 C   s   t dd | j D S )zQ
        Detect if model already has QuantizeLinear or DequantizeLinear.
        c                 s   s"   | ]}|j d kp|j dkV  qdS )QuantizeLinearDequantizeLinearN)r\   r   r7   r   r   r   	<genexpr>   s    z.ONNXQuantizer.has_QDQ_nodes.<locals>.<genexpr>)anyr)   nodes)rA   r   r   r   has_QDQ_nodes   s    zONNXQuantizer.has_QDQ_nodesc                 C   s2   t || j d k	rdS | jd k	r.| j|S dS )NTF)r   r)   initializerrO   find_initializer_in_path)rA   Zinitializer_namer   r   r   rl      s
    
z&ONNXQuantizer.find_initializer_in_pathc                 C   s2   | j | |D ]}|jD ]}| j| qqd S N)r4   r^   r.   r@   add)rA   ri   r7   r'   r   r   r   add_new_nodes   s    
zONNXQuantizer.add_new_nodesc                 C   sH  |   rtd | j D ]d}| jr0| |}t| j}t	| |}|
  t|t| jD ]"}| j| jD ]}| j| qlq\q|   | j d | j j| j | jd kr| j \}}t|dkrtdt| t| jj_t| jj_dd | jjjD }|s@dd | jD }	|	r@| jjj }
d|
_t|
_| jjS )	NzPlease check if the model is already quantized. Note you don't need to quantize a QAT model. OnnxRuntime support to run QAT model directly.r7   r   z0Invalid model with unknown initializers/tensors.c                 S   s   g | ]}|j tkr|qS r   )domainr   )r   opsetr   r   r   rX      s     
 z0ONNXQuantizer.quantize_model.<locals>.<listcomp>c                 S   s   g | ]}|j d kr|qS )zcom.microsoft)rp   rf   r   r   r   rX      s     
 r   ) rj   loggingwarningr)   ri   enable_subgraph_quantizationrc   r[   r4   r   quantizeranger.   r@   rn   _dequantize_outputsr*   Z
ClearFieldr7   r^   rO   Zclean_initializersRuntimeErrorstrr   rK   r   Zproducer_versionrN   versionr   rp   )rA   r7   Znumber_of_existing_new_nodesZop_quantizerir'   _Zinitializers_not_foundZms_opsetZms_nodesrq   r   r   r   rP      s<    





zONNXQuantizer.quantize_modelc                 C   s<   d| j kr(td|| j d  | j d S td|dd S )NZDefaultTensorTypezDget_tensor_type returns DefaultTensorType for tensor name %r, use %dz)Unable to find data type for weight_name=a7  . shape_inference failed to return a type probably this node is from a different domain or using an input produced by such an operator. This may happen if you quantize a model already quantized. You may use extra_options `DefaultTensorType` to indicate the default weight type, usually `onnx.TensorProto.FLOAT`.)r3   rr   inforx   rA   tensor_namer   r   r   _get_default_tensor_type   s    


z&ONNXQuantizer._get_default_tensor_typeFc                 C   s   t || j }|d k	r|jS || jkrd| j| }|jdrd|rZ|jjjdkrZ| 	|S |jjjS | j
rt| jd kr|r| 	|S d S | j|}|d k	r|S | j
r| jr| j|}|d k	r|S |r| 	|S d S )Ntensor_typer   )r   r)   rk   Z	data_typer,   rS   HasFieldr   	elem_typer   rt   rO   is_valid_quantize_weightget_tensor_type)rA   r   	mandatoryweightr   otyperesr   r   r   r     s.    





zONNXQuantizer.get_tensor_typec                 C   s   |  |r| |S || jkrp| j| }|jdrR|jjjtjj	tjj
fkrRdS td|d|j d dS | jr| jr| j|S td|d dS )	Nr   Tz<Inference failed or unsupported type to quantize for tensor z
, type is .Fz%Failed to infer data type of tensor: zS. Please add data type info for this tensor if your model has customized operators.)Zis_input_a_initializerr   r,   rS   r   r   r   
onnx_protoTensorProtoFLOATFLOAT16rr   rs   rt   rO   is_float_tensor)rA   r   r   r   r   r   r     s&    




zONNXQuantizer.is_float_tensorc                 C   sH   |t jjkr| |||S |t jjkr4| |||S td| ddS )a  
        Create nodes for dynamic quantization of input and add them to nodes_list.
            parameter input_name: Name of the input.
            parameter nodes_list: new nodes are appended to this list.
            parameter qType: type to quantize to.
            parameter initial_type: type to quantize from
            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
        zUnexpected value for qType=r   N)r   r   INT8+_get_dynamic_input_quantization_params_int8UINT8,_get_dynamic_input_quantization_params_uint8r8   )rA   
input_name
nodes_listqTypeinitial_typer   r   r   &_get_dynamic_input_quantization_params5  s
    	z4ONNXQuantizer._get_dynamic_input_quantization_paramsc                 C   s  t jj}|d }|d }tjjd|g|d g|dd}|| |d }tjjd|g|d g|dd}	||	 |d	 }
tjd
|jd g|
d g|
}|| |d	 }tjd
|	jd g|d g|}|| |d }tjd|jd |jd g|d g|}|| tj| j	|g t
|d g}| j| |d }tjd|jd | j	g|g|}|| tj| j|g dg}| j| || jg g fS )az  
        Create nodes for dynamic quantization of input to int8 and add them to nodes_list
            parameter input_name: Name of the input.
            parameter nodes_list: new nodes are appended to this list.
            parameter initial_type: initial weight type (FLOAT or FLOAT16)
            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
        _scale
_ReduceMin	ReduceMin:0r   Zkeepdims
_ReduceMax	ReduceMaxZ_AbsZAbsZ_Abs_MaxZMaxg       @Z	scale_DivDiv)r   r   r   rL   rM   r_   appendr.   make_tensorr<   r   r)   add_initializerr>   )rA   r   r   r   r   input_scale_namereduce_min_namereduce_min_nodereduce_max_namereduce_max_nodeZreduce_min_abs_nameZreduce_min_abs_nodeZreduce_max_abs_nameZreduce_max_abs_nodeZabs_max_nameZabs_max_nodeZinitializer_divscale_div_namescale_div_nodeZinitializer_zpr   r   r   r   D  s|    







z9ONNXQuantizer._get_dynamic_input_quantization_params_int8c                 C   s  t jj}|d }|d }|d }tjjd|g|d g|dd}|| |d }	tjjd	|g|	d g|	dd}
||
 tj| j|g t	|g}| j
| tj| j|g d
g}| j
| |d }tjd|
jd |jd g|d g|}|| |d }tjd|jd | jg|g|}|| |d }tjd| j|jd g|d g|}|| |d }tjd|jd |g|d g|}|| |d }tjd|j|d g|}|| |d }tjjd|j|g||d}|| ||g g fS )a{  
        Create nodes for dynamic quantization of input to uint8 and add them to nodes_list
            parameter input_name: Name of the input.
            parameter nodes_list: new nodes are appended to this list.
            parameter initial_type: initial weight type (FLAOT or FLOAT16)
            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
        r   _zero_pointr   r   r   r   r   r   r   g        Z
_scale_SubSubZ
_scale_Divr   Z_zero_point_SubZ_zero_point_DivZ_zero_point_FloorZFloorZ_zero_point_CastZCast)to)r   r   r   rL   rM   r_   r   r   r;   r   r)   r   r=   r.   )rA   r   r   r   r   r   Zinput_zp_namer   r   r   r   Zinitializer_qrangeZinitializer_qvalueZscale_sub_nameZscale_sub_noder   r   Zzp_sub_nameZzp_sub_nodeZzp_div_nameZzp_div_nodeZzp_floor_nameZzp_floor_nodeZzp_cast_nameZzp_cast_noder   r   r   r     s    







z:ONNXQuantizer._get_dynamic_input_quantization_params_uint8c                 C   s   | j }|dks|dkr| jdks,|| jkrBtd| d dS | j| }t|tsptdt| d|d|dkst|dkrt	d	| d
| t
|d g}t|d dr|d jt
jt
jfkrt	dt|d  d|t
|d g}|jt
jks
t|d }nRt
|g}t
|g}| j| }d|krT|d j}||}|jt
jksftg }	|d }
g }|d }tj|
||	|  }| j| |jt
jkrtjj}n.|jt
jkrtjj}nt	d|j d|tj||||d }| j| d||
||	fS )a\  
        Create initializers and inputs in the graph for zero point and scale of output.
        Zero point and scale values are obtained from self.quantization_params if specified.
            parameter param_name: Name of the quantization parameter.
            return: result, scale_name, zero_point_name, scale_shape, zero_point_shape.
        Nz$Quantization parameters for tensor:"z" not specified)F r   r   r   Unexpected type  for r      zbQuantization parameters should contain zero point, scale, quant type. Specified values for output z: 
zero_pointscaledtypez and param_name=
quant_typer   r   zUnexpected dtype=z for param_name=)T)rE   r:   rr   r}   
isinstancer   	TypeErrorrS   r[   r8   nparrayhasattrr   Zfloat32Zfloat16Zfloat64AssertionErrorZastyperL   rM   r   Zraveltolistr)   r   r   r   r   r   Zreshape)rA   
param_nameZ	use_scaleZuse_zeropointZzero_point_typeparamsZzero_point_valuesZscale_valuesr   Zzero_point_shapeZzero_point_namescale_shape
scale_nameZinit_zpZ
scale_typeZ
init_scaler   r   r   _get_quantization_params  sX    

$




   


z&ONNXQuantizer._get_quantization_paramsc              	   C   sH  |j | }|dkstd|t }|d }	|dk	rL|dk	rLd||  }
}}n| |\}
}}}}g }|
rtjd|||g|g|	}n| jrdS | jr|t	j
jkr|d }|d }tjd	|g|||g|	}n\|dk	std
|d| d| d| | j||||d\}}}}tjd|||g|g|	}t|||||| j|< ||fS )a  
        Given an input for a node (which is not a initializer), this function

        - add nodes to compute zero point and scale for this input if they don't exist.
        - add new QuantizeLinear node to quantize the input.

        :param node: node being quantized in NodeProto format.
        :param input_index: index of input in node.input.
        :param qType: type to quantize to.
        :param given_scale_name: if those inputs need to be quanitzed using this scale tensor.
        :param given_zp_name: if those inputs to be quantized using this zeropoint tensor.
        :param initial_type: type of the weight to quantize
        :return: List of newly created nodes in NodeProto format.
        r   z*Cannot access undefined variable in graph._QuantizeLinearNTrd   r   r   ZDynamicQuantizeLinearzCCannot quantize input without knowing the initial type, input_name=z, input_index=z, qType=z, node=r   )r/   r   r   r   rL   rM   r_   r1   r2   r   r   r   r   r
   r?   )rA   r7   input_indexr   Zgiven_scale_nameZgiven_zp_namer   r   r'   Zql_node_nameZ
data_foundr   zp_namer|   ri   qlinear_noder   Zzp_shaper   r   r   _get_quantize_input_nodes1  sV    

	z'ONNXQuantizer._get_quantize_input_nodesc                 C   s.   || j kr| j | S | jd k	r*| j|S d S rm   )r?   rO   find_quantized_value)rA   r   r   r   r   r   w  s
    


z"ONNXQuantizer.find_quantized_value      ?c              
   C   s   || j kr| j | jS | j | j}t|| j }t|}|| j krR| j | j}n0|| jkrr| |\}	}}	}	}	nt	d| dt|| j }
t|
}| 
||||\}}}}}}|| j kstt||||tj|jdkrdnd||d}|| j |< |S )z]
        Quantized the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale
        z	Expected z5 to be in quantized value map for static quantizationr   r   N)	node_type
node_qtype)r?   q_namer   r   r)   rk   r   r:   r   r8   Zquantize_bias_static_implr   r
   r   Initializersize)rA   Z	bias_namer   weight_namebetaZweight_scale_nameZweight_initializerZweight_scaler   r|   Zinputscale_initializerZinput_scaleZquantized_bias_nameZquantized_bias_scale_nameZquantized_bias_zp_nameZbias_scale_datar   r   quantized_valuer   r   r   quantize_bias_static~  sB    


	

z"ONNXQuantizer.quantize_bias_staticc                 C   s   || j kp|| jkp|| jkS )zq
        only check for value info and newly generated tensor names, initializers are checked separately
        )r,   r6   r@   r~   r   r   r   contains_tensor  s
    
zONNXQuantizer.contains_tensorc              	   C   s   | j ||dddd|dS )NFr   r7   indicesinitializer_use_weight_qTyperC   op_level_per_channelaxisfrom_subgraph_ONNXQuantizer__quantize_inputs)rA   r7   r   r   r   r   r   quantize_activation  s    z!ONNXQuantizer.quantize_activationr   c              	   C   s   | j ||d||||dS )NTr   r   )rA   r7   r   rC   r   r   r   r   r   r   quantize_weight  s    	zONNXQuantizer.quantize_weightTc              
   C   s  g }g }	g }
g }|D ]}|j | }|| jkr^| j| }||j |	|j |
|j q|s|
d |d |	d qt|| j }|dk	r| j	r|r| 
|j|r| jn| j||\}}}n | ||r| jn| j|\}}}|
| |	| || q| |r~| j|d | j| j }|dkr|j | }|| jkr| j| }|dsztd| d|jdstd| d|jjj}n&|| jkstd	|d
| j| }| j||| j|d}|dkr dS |r | | n
|| |d }|jdkrL|
|j ||j d  |	|j d  n0|
|jd  ||jd  |	|jd  q| jdk	r| jj ||g||||dd\}}}}|
|d  ||d  |	|d  qt!d| d| j" q|
|	||fS )a  
        Given a node, this function quantizes the inputs as follows:
            - If input is an initializer, quantize the initializer data, replace old initializer
              with new initializer
            - Else, add QuantizeLinear nodes to perform quantization
            parameter node: node being quantized in NodeProto format.
            parameter indices: input indices to quantize.
            return: (List of quantized input names,
                     List of zero point names used for input quantization,
                     List of scale names used for input quantization,
                     List of new QuantizeLinear nodes created)
        r   Nr   rS   zvalue_info=z has no type.r   z is not a tensor.zshape inference failed for zF and attribute 'tensor_names' does not have any value for this tensor.r   )NNNNr   rd   r      r   T)r   rC   r   r   r   z!Invalid tensor name to quantize: z @graph scope)#r/   r?   r   r   r   r   r   r)   rk   rB   quantize_weight_per_channelr   rD   rE   quantize_initializerr   find_node_by_namer4   r*   r,   r   r   rS   r   r   r6   r   ro   r^   r\   r.   rO   r   r8   r5   )rA   r7   r   r   rC   r   r   r   Zscale_namesZzero_point_namesZquantized_input_namesri   r   Z
node_inputr   rk   q_weight_namer   r   r   r   r+   r   Zquantize_input_nodesZparent_quantized_input_namesZparent_zero_point_namesZparent_scale_namesr|   r   r   r   Z__quantize_inputs  s    











  




   


zONNXQuantizer.__quantize_inputsc           	      C   sj   |j | jkr(| j|j  }|j|j|jfS | ||||\}}}t|j |||tjd}|| j|j < |||fS )a  
        :param weight: TensorProto initializer
        :param qType: type to quantize to
        :param keep_float_weight: Whether to quantize the weight. In some cases, we only want to qunatize scale and zero point.
                                  If keep_float_weight is False, quantize the weight, or don't quantize the weight.
        :return: quantized weight name, zero point name, scale name
        N)	r   r?   r   r   r   Zquantize_initializer_implr
   r   r   )	rA   r   r   rC   keep_float_weightr   r   r   r   r   r   r   r   \  s,    	   
z"ONNXQuantizer.quantize_initializerc           
      C   sd   || j kr$| j | }|j|j|jfS | |||||\}}}	t|||	|tjd }|| j |< |||	fS rm   )r?   r   r   r   Z quantize_weight_per_channel_implr
   r   r   )
rA   r   rD   Zchannel_axisrC   r   r   r   r   r   r   r   r   r   }  s.    	

    

z)ONNXQuantizer.quantize_weight_per_channelc                 C   s   || j kr|| jkr| j | }t|j| j }| jjjdksT| jjjdkrj|dk	rjtj	|j
dksjt|d }| j|| j| j }|dkr|j|j|jg}tjd||g|}|S ||jd kstdS )a  
        Given a value (input/output) which is quantized, add a DequantizeLinear node to dequantize
        it back to float32 or float16
            parameter value_name: value to dequantize
            parameter new_nodes_list: List of new nodes created before processing current node
            return: None if there is already a DequantizeLinear node that dequantizes it
                    A DequantizeLinear node otherwise
        rJ   Nr   Z_DequantizeLinearre   r   )r?   r@   r   r   r)   rk   rK   rL   Znumpy_helperZto_arrayr   r   r   r4   r*   r   r   rM   r_   r.   )rA   Z
value_namer   Z
scale_initZdqlinear_nameZdqlinear_nodeZdqlinear_inputsdequantize_noder   r   r   _dequantize_value  s2    	
   zONNXQuantizer._dequantize_valuec                 C   s6   | j  jD ]$}| |j}|dk	r| j| qdS )z
        Dequantize output if it is quantized
            parameter new_nodes_list: List of new nodes created before processing current node
            return: List of new nodes created
        N)r)   r*   r.   r   r   r4   r   )rA   r.   r   r   r   r   rw     s    z!ONNXQuantizer._dequantize_outputsc                 C   sB  | j d krd S |   i }| j D ]}| j | }t|tsTtdt| d|d| jj|i d}| j}d|kr||d j	}d|krd|kr|d |d  }}n|t
jjkrt||jd \}}nh|d	|jd
 }|d|jd }	|d| j}
|dd}t|||
d\}}t||	|||
| j\}}t|||d||< q |S )Nr   r   r   )Zdefault_valr   r   r   r   rminr   rmax	symmetricrC   F)rC   r   )r   r   r   )rF   Zadjust_tensor_rangesr   r   r   rS   Ztensor_quant_overridesZget_per_tensor_overridesrE   r   rL   r   ZFLOAT8E4M3FNr   Zavg_stdgetZrange_valueZis_activation_symmetricr   r   Zmin_real_ranger   )rA   r:   r   tdZquant_overridesr   zeror   r   r   r   rC   ZqminZqmaxr   r   r   r9     s0    



z+ONNXQuantizer.calculate_quantization_params)N)F)NN)NNN)r   )F)FFr   F)TFFr   F)FF)TF)__name__
__module____qualname__r(   rR   rc   rj   rl   ro   rP   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rw   r9   r   r   r   r   r   %   sT    
I"-
T^
<     
F
1

    
     
 
&  
 'r   )"rr   numpyr   rL   Zonnx.numpy_helperr   r   Zbase_quantizerr   r   Z	calibrater   Z
onnx_modelr   Zquant_utilsr   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   registryr   r   r   r   r   r   <module>   s   H