U
    T?hL                     @   st  d dl mZ d dlmZmZ d dlmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ d dlmZmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7m8Z8m9Z9 d dl:m;Z; ee<Z=G dd de;Z>dS )    )	getLogger)ListOptional)PackingMode)AttentionMaskFusionAttention)FusionBartAttention)FusionBiasGelu)FusionEmbedLayerNormalization)FusionFastGelu)
FusionGelu)FusionGeluApproximation)FusionGemmFastGelu)FusionLayerNormalizationFusionLayerNormalizationTF)AttentionMaskFormatFusionOptions)FusionQOrderedAttention)FusionQOrderedGelu) FusionQOrderedLayerNormalization)FusionQOrderedMatMul)FusionQuickGelu)FusionReshape)FusionRotaryEmbeddings)FusionShape)"FusionSimplifiedLayerNormalization&FusionSkipSimplifiedLayerNormalization) FusionBiasSkipLayerNormalizationFusionSkipLayerNormalization)FusionUtils)
ModelProtoTensorProtohelper)	OnnxModelc                       s.  e Zd ZdCeeed fddZdd Zdd Zd	d
 Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zeee ed#d$d%Zed&d'd(Zd)d* ZdDd-d.Zd/d0 Zd1d2 Zd3d4 Zd5d6 ZdEe e! ed9d:d;Z"d<d= Z#dFd>d?Z$dGed@dAdBZ%  Z&S )HBertOnnxModelr   )model	num_headshidden_sizec                    s   |dkr|dks(|dkr$|| dks(t t | || _|| _t| | _t| | j| j| j| _t	| | j| j| j| _
t| | _dS )aG  Initialize BERT ONNX Model.

        Args:
            model (ModelProto): the ONNX model
            num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically).
            hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically).
        r   N)AssertionErrorsuper__init__r&   r'   r   attention_maskr   attention_fusionr   qordered_attention_fusionr   utils)selfr%   r&   r'   	__class__ Z/var/www/html/venv/lib/python3.8/site-packages/onnxruntime/transformers/onnx_model_bert.pyr*   &   s    (
   zBertOnnxModel.__init__c                 C   s   | j   | j  d S N)r,   applyr-   r/   r2   r2   r3   fuse_attention;   s    
zBertOnnxModel.fuse_attentionc                 C   sD   t | }|  t| }|  t| }|  t| }|  d S r4   )r   r5   r   r   r   r/   fusionr2   r2   r3   	fuse_gelu@   s    zBertOnnxModel.fuse_geluc                 C   s   t | |}|  d S r4   )r	   r5   )r/   is_fastgelur9   r2   r2   r3   fuse_bias_geluK   s    
zBertOnnxModel.fuse_bias_geluc                 C   s   t | }|  d S r4   )r   r5   r8   r2   r2   r3   gelu_approximationO   s    z BertOnnxModel.gelu_approximationc                 C   s   t | }|  d S r4   )r   r5   r8   r2   r2   r3   fuse_gemm_fast_geluS   s    z!BertOnnxModel.fuse_gemm_fast_geluc                 C   s   t | }|  d S r4   )r   r5   r8   r2   r2   r3   fuse_add_bias_skip_layer_normW   s    z+BertOnnxModel.fuse_add_bias_skip_layer_normc                 C   s   t | }|  d S r4   )r   r5   r8   r2   r2   r3   fuse_reshape[   s    zBertOnnxModel.fuse_reshapec                 C   s   t | }|  d S r4   )r   r5   r8   r2   r2   r3   
fuse_shape_   s    zBertOnnxModel.fuse_shapec                 C   s   t | |}|  d S r4   )r
   r5   )r/   use_mask_indexr9   r2   r2   r3   fuse_embed_layerc   s    
zBertOnnxModel.fuse_embed_layerc                 C   s4   t | }|  t| }|  t| }|  d S r4   )r   r5   r   r   r8   r2   r2   r3   fuse_layer_normg   s    zBertOnnxModel.fuse_layer_normc                 C   s   t | }|  d S r4   )r   r5   r8   r2   r2   r3   fuse_simplified_layer_normr   s    z(BertOnnxModel.fuse_simplified_layer_normc                 C   s   t | }|  d S r4   )r   r5   r8   r2   r2   r3   fuse_skip_layer_normv   s    z"BertOnnxModel.fuse_skip_layer_normc                 C   s   t | }|  d S r4   )r   r5   r8   r2   r2   r3   fuse_skip_simplified_layer_normz   s    z-BertOnnxModel.fuse_skip_simplified_layer_normc                 C   s   t | }|  ttdd | jjj}ttdd |}d}|t	| jj
k r| jj
| }d|jkr~|j|kr~| jj
| q>|d7 }q>d S )Nc                 S   s   | j dko| jdkS )NRotaryEmbeddingcom.microsoft)op_typedomainnoder2   r2   r3   <lambda>       z6BertOnnxModel.fuse_rotary_embeddings.<locals>.<lambda>c                 S   s   | j S r4   )rK   rL   r2   r2   r3   rN      rO   r   rH      )r   r5   listfilterr%   graphrM   setmaplenZ	functionsnamerK   remove)r/   r9   Zrot_emb_nodesZnon_ms_domains_to_keepifnr2   r2   r3   fuse_rotary_embeddings~   s    z$BertOnnxModel.fuse_rotary_embeddingsc                 C   s   t | }|  d S r4   )r   r5   r8   r2   r2   r3   fuse_qordered_mamtul   s    z"BertOnnxModel.fuse_qordered_mamtul)rJ   input_indicescastedc           
         s   g }|   }| |}|D ]|  fdd|D }|D ]`}| |rR|s|| q4||kr4|| }	|	jdkr4| |	jd dk	r4|r4||	jd  q4q|S )z
        Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention).
        Returns a list of the graph input names based on the filter whether it is casted or not.
        c                    s$   g | ]}|t  jk r j| qS r2   )rV   input).0rY   rL   r2   r3   
<listcomp>   s      zABertOnnxModel.get_graph_inputs_from_node_type.<locals>.<listcomp>Castr   N)output_name_to_nodeget_nodes_by_op_typeZfind_graph_inputappendrJ   r_   )
r/   rJ   r]   r^   Zgraph_inputsrc   nodesZbert_inputsZ
bert_inputparentr2   rL   r3   get_graph_inputs_from_node_type   s    

z-BertOnnxModel.get_graph_inputs_from_node_typer^   c                 C   s,   |  ddddg|}||  ddg|7 }|S )NEmbedLayerNormalizationr   rP      	Attention   )rh   )r/   r^   inputsr2   r2   r3   !get_graph_inputs_from_fused_nodes   s    z/BertOnnxModel.get_graph_inputs_from_fused_nodesc                 C   sb   |   }d}d}|jD ].}| |tj\}}|r8|d7 }|t|7 }qtd| d| d dS )zPChange data type of all graph inputs to int32 type, and add Cast node if needed.r   rP   z)Graph inputs are changed to int32. Added z Cast nodes, and removed z Cast nodes.N)rS   r_   Zchange_graph_input_typer!   ZINT32rV   loggerinfo)r/   rS   Zadd_cast_countZremove_cast_countZgraph_inputnew_nodeZremoved_nodesr2   r2   r3   change_graph_inputs_to_int32   s    
z*BertOnnxModel.change_graph_inputs_to_int32
batch_sizemax_seq_lenc                 C   s   | j dd| j dd }| jjjD ]B}|j|kr"|jjjjd }||_	|dk	r"|jjjjd }||_	q"| jjj
D ]}|jjjjd }||_	qpdS )zD
        Update input and output shape to use dynamic axes.
        Tri   Fr   NrP   )ro   r%   rS   r_   rW   typeZtensor_typeshapedimZ	dim_paramoutput)r/   Zdynamic_batch_dimZdynamic_seq_lenZbert_graph_inputsr_   Z	dim_protory   r2   r2   r3   use_dynamic_axes   s    

zBertOnnxModel.use_dynamic_axesc                 C   s   |    d S r4   )adjust_reshape_and_expandr6   r2   r2   r3   
preprocess   s    zBertOnnxModel.preprocessc           
   	   C   s:  g }|   D ]}|jdkr| |jd }|d k	rd|jdkrd||g | |jd |jd  q| |ddddgddddg| 	 }|d k	r|d }| |jd }|d }| |jd }|d }	|d k	r|d k	rt
|d	krt
|dkr|d |d kr|	jd |jd< q|r6| | td
t
|  d S )NZReshaperP   r   ZExpandSlice   z"Removed Reshape and Expand count: )rf   rJ   Zget_constant_valuer_   sizeextendZreplace_input_of_all_nodesry   match_parent_pathrc   rV   remove_nodesrp   rq   )
r/   nodes_to_removerM   Zreshape_shapeZreshape_pathZexpand_nodeZexpand_shape_valueZreshape_before_expandZshape_valueZ
slice_noder2   r2   r3   r{      sD    





z'BertOnnxModel.adjust_reshape_and_expandc                 C   sv  |   }g }|  D ]P}dddd}|j|kr||j }| |ddddd	d
g|dddddg|}|d k	r|\}}}	}
}}|jd |  jd jkr|jd |jd< |   }|jdkr| |dddd
gddddg|}|d k	r|d jd |  jd jkrtj	d|jdt
|jd  |j|jd d}d|_|jtd| jg | || |j || q| | d S )NrP   r   rm   )rj   	ReduceSumrl   rb   ZConstantOfShapeZConcatZ	UnsqueezeZGatherZShaperl   r   r   Z_remove_mask)rn   outputsrW   rI   r&   )rc   rf   rJ   r   r_   rS   rW   ry   r"   Z	make_noderV   rK   	attributer   Zmake_attributer&   add_nodeZget_graph_by_nodere   r   )r/   rc   r   rM   Zop_input_idrY   Zparent_nodescastZconstantOfShapeconcatZ	unsqueezeZgatherrw   Zattention_noder2   r2   r3   clean_graph   sd    	




zBertOnnxModel.clean_graphc                 C   s   |    |   d S r4   )r   Zprune_graphr6   r2   r2   r3   postprocessB  s    zBertOnnxModel.postprocessNF)optionsadd_dynamic_axesc                 C   s  |d k	r|j s|   | j  | j  |d ks8|jrH|   |   |d ksV|jr^| 	  | 
  |   |d ks||jr|   |   |d ks|jr|   |d k	r| j|j |jrt| jtst| | j| j| j|j| _|d ks|jr|   |d ks|jr|   |   |d ks.|j rD|jt!j"k}| #| | j$  | %  |d ksh|j&r| j'dd | j'dd |d ks|j(r| )  |d k	r|j*r| +  |d k	r|j,r| -  | .  |r| /  t01d| 2   d S )NT)r;   Fzopset version: )3Zenable_shape_inferenceZdisable_shape_inferencer.   Zremove_identity_nodesZremove_useless_cast_nodesZenable_layer_normrD   rE   Zenable_gelur:   r|   r@   Zenable_skip_layer_normrF   rG   Zenable_rotary_embeddingsr[   r+   Zset_mask_formatZattention_mask_formatZuse_multi_head_attention
isinstancer,   r   r   r'   r&   Zenable_attentionr7   Zenable_qordered_matmulr\   rA   Zenable_embed_layer_normr   ZMaskIndexEndrC   Zremove_useless_reshape_nodesr   Zenable_bias_gelur<   Zenable_bias_skip_layer_normr?   Zenable_gelu_approximationr=   Zenable_gemm_fast_gelur>   Zremove_unused_constantrz   rp   rq   Zget_opset_version)r/   r   r   rB   r2   r2   r3   optimizeF  sb    



zBertOnnxModel.optimizec                 C   sd   i }ddddddddd	d
ddg}ddddg}|| D ]}|  |}t|||< q4td|  |S )z8
        Returns node count of fused operators.
        rj   rl   MultiHeadAttentionGeluFastGeluBiasGeluZGemmFastGeluLayerNormalizationSimplifiedLayerNormalizationSkipLayerNormalization SkipSimplifiedLayerNormalizationrH   QOrderedAttentionZQOrderedGeluZQOrderedLayerNormalizationZQOrderedMatMulzOptimized operators: )rd   rV   rp   rq   )r/   op_countZopsZq_opsoprf   r2   r2   r3   get_fused_operator_statistics  s0    
z+BertOnnxModel.get_fused_operator_statisticsc           	         s
   dkr|    td fdd}|d}|d|d |d }|d	|d
 |d }|d|d }|d|d }|dko|dko||ko|d| kp|d| k}|dkrtd |dkrtd |dkrtd |dkrtd |dkrtd |S )zA
        Returns True when the model is fully optimized.
        NZop_namec                    s     | pdS )Nr   )getr   fused_op_countr2   r3   r     s    z2BertOnnxModel.is_fully_optimized.<locals>.op_countrj   rl   r   r   r   r   r   r   r   r   r   r   r   zLayer Normalization not fusedz$Simple Layer Normalization not fusedzGelu (or FastGelu) not fusedz!EmbedLayerNormalization not fusedz+Attention (or MultiHeadAttention) not fused)r   strrp   debugwarning)	r/   r   r   ZembedZ	attentionZgeluZ
layer_normZsimple_layer_normZ
is_perfectr2   r   r3   is_fully_optimized  s4    





z BertOnnxModel.is_fully_optimized)use_symbolic_shape_inferc                 C   s   t | }|| d S r4   )r   convert)r/   r   Zpacking_moder2   r2   r3   convert_to_packing_mode  s    z%BertOnnxModel.convert_to_packing_mode)r   r   )rt   ru   )NF)N)F)'__name__
__module____qualname__r    intr*   r7   r:   r<   r=   r>   r?   r@   rA   rC   rD   rE   rF   rG   r[   r\   r   r   boolrh   ro   rs   rz   r|   r{   r   r   r   r   r   r   r   r   __classcell__r2   r2   r0   r3   r$   %   s8   
)BQ 
(r$   N)?loggingr   typingr   r   r   r   Zfusion_attentionr   r   Zfusion_bart_attentionr   Zfusion_biasgelur	   Zfusion_embedlayerr
   Zfusion_fastgelur   Zfusion_gelur   Zfusion_gelu_approximationr   Zfusion_gemmfastgelur   Zfusion_layernormr   r   Zfusion_optionsr   r   Zfusion_qordered_attentionr   Zfusion_qordered_gelur   Zfusion_qordered_layernormr   Zfusion_qordered_matmulr   Zfusion_quickgelur   Zfusion_reshaper   Zfusion_rotary_attentionr   Zfusion_shaper   Zfusion_simplified_layernormr   r   Zfusion_skiplayernormr   r   Zfusion_utilsr   Zonnxr    r!   r"   Z
onnx_modelr#   r   rp   r$   r2   r2   r2   r3   <module>   s6   