U
    T?hw                     @   s|  d dl mZ d dlmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZmZ d dlmZmZ d dlmZ d d	lmZmZmZmZmZ d d
lmZ eeZG dd dZG dd dZG dd dZG dd dZ G dd dZ!G dd dZ"G dd dZ#G dd dZ$G dd de
Z%G dd deZ&G dd  d e%Z'G d!d" d"e%Z(G d#d$ d$e%Z)G d%d& d&e%Z*G d'd( d(eZ+dS ))    )	getLogger)ListOptionalN)DynamoOnnxHelper)Fusion)AttentionOpTypeFusionOptions) FusionBiasSkipLayerNormalizationFusionSkipLayerNormalization)NumpyHelper)
ModelProto	NodeProtoTensorProtohelpernumpy_helper)	OnnxModelc                   @   s   e Zd Zdd ZdS )ProcessGemmWFuncc                 C   s   t |dS )N   r   )np	transposeselfx r   Y/var/www/html/venv/lib/python3.8/site-packages/onnxruntime/transformers/onnx_model_phi.py__call__   s    zProcessGemmWFunc.__call__N__name__
__module____qualname__r   r   r   r   r   r      s   r   c                   @   s   e Zd Zdd ZdS )ProcessMatMulQFuncc                 C   s   t t |ddd dS )N   r   r   r   r   splitr   r   r   r   r      s    zProcessMatMulQFunc.__call__Nr   r   r   r   r   r!      s   r!   c                   @   s   e Zd Zdd ZdS )ProcessMatMulKFuncc                 C   s   t t |ddd dS )Nr"   r   r   r   r#   r   r   r   r   r       s    zProcessMatMulKFunc.__call__Nr   r   r   r   r   r%      s   r%   c                   @   s   e Zd Zdd ZdS )ProcessMatMulVFuncc                 C   s   t t |ddd dS )Nr"   r      r   r#   r   r   r   r   r   %   s    zProcessMatMulVFunc.__call__Nr   r   r   r   r   r&   $   s   r&   c                   @   s   e Zd Zdd ZdS )ProcessBiasQFuncc                 C   s   t |ddd }|S )Nr"   r   r   r$   r   r   r   r   r   *   s    zProcessBiasQFunc.__call__Nr   r   r   r   r   r(   )   s   r(   c                   @   s   e Zd Zdd ZdS )ProcessBiasKFuncc                 C   s   t |ddd }|S )Nr"   r)   r   r*   r   r   r   r   r   0   s    zProcessBiasKFunc.__call__Nr   r   r   r   r   r+   /   s   r+   c                   @   s   e Zd Zdd ZdS )ProcessBiasVFuncc                 C   s   t |ddd }|S )Nr"   r)   r'   r*   r   r   r   r   r   6   s    zProcessBiasVFunc.__call__Nr   r   r   r   r   r,   5   s   r,   c                   @   s   e Zd Zdd ZdS )ProcessRotCacheFuncc                 C   s8   t |jdkst|jd dkr4|d d ddf S |S )Nr'   r       r      )lenshapeAssertionErrorr   r   r   r   r   <   s    zProcessRotCacheFunc.__call__Nr   r   r   r   r   r-   ;   s   r-   c                       s  e Zd Zeee d fddZedddZdd Z	d	d
 Z
dd Zdd Zd2ddZdd Zdd Zdd Zee eee dddZd3ee ee edddZd4ee ee eddd Zd5ee ee edd"d#Zd6ee ee edd$d%Zd7ee ee edd&d'Zd8ee ee edd(d)Zd9ee ee edd*d+Zd:ee ee edd,d-Zd;ee ee edd0d1Z  ZS )<Fission)modelnodes_to_findc                    s   t  |d| d S )NZDONOTUSEsuper__init__)r   r4   r5   	__class__r   r   r8   F   s    zFission.__init__attn_op_typec                 C   s
   || _ d S Nr;   )r   r<   r   r   r   set_attention_op_typeM   s    zFission.set_attention_op_typec                 C   s   |d t | S )N_)str)r   layer_idnamer   r   r   	get_unameP   s    zFission.get_unamec                 C   sB   |D ](}||ks$| |s$||r|  S qtd| dd S )NzEdge z
 not found)endswith
startswith
ValueError)r   edgesrB   edger   r   r   get_edge_by_nameS   s    
zFission.get_edge_by_namec                 C   s   |  |j|S r=   )rI   inputr   noderB   r   r   r   get_input_by_nameY   s    zFission.get_input_by_namec                 C   s   |  |j|S r=   )rI   outputrK   r   r   r   get_output_by_name\   s    zFission.get_output_by_nameNc                 C   sd   | j |}t|}||}tj|d kr2|d n|tj|j|	 
 dd}| j || j |jS )NZ
_processedTZ	data_typeZdimsvalsraw)r4   get_initializerr   to_arrayr   make_tensorr   FLOATr1   flattentobytesadd_initializerthis_graph_namerB   )r   Zinitializer_nameZfunctorZcustom_nameiZ
i_np_arrayZprocessed_i_np_arrayZ
new_tensorr   r   r   process_initializer_   s    

zFission.process_initializerc                 C   s&   | j  j }||_tj|jj_	d S r=   )
r4   graph
value_infoaddrB   r   rV   typetensor_type	elem_typer   rB   new_value_infor   r   r   add_fp32_value_infom   s    zFission.add_fp32_value_infoc                 C   s&   | j  j }||_tj|jj_	d S r=   )
r4   r]   r^   r_   rB   r   INT64r`   ra   rb   rc   r   r   r   add_int64_value_infor   s    zFission.add_int64_value_infoc                 C   s\   | j  jD ]$}|j|kr| j  j|  q2qtj|tj|d}| j  j	|g d S )Nrb   r1   )
r4   r]   r^   rB   remover   make_tensor_value_infor   rV   extend)r   rB   r1   r^   rd   r   r   r   replace_fp32_value_infow   s    
zFission.replace_fp32_value_info)subgraph_nodesrA   layer_known_edges_namesc                 C   s   |D ]}t |jD ]>\}}|dkr&qq||kr| |||j|< | |j|  qt |jD ]>\}}|dkrpq\q\||kr\| |||j|< | |j|  q\| ||j|_| j| | j| j	|j< qd S )N )
	enumeraterJ   rC   re   rN   rB   nodes_to_addappendrZ   node_name_to_graph_name)r   rm   rA   rn   new_noder[   rB   r   r   r   set_unique_name_and_add_nodes   s     z%Fission.set_unique_name_and_add_nodesro   )inputsoutputsprefixc                 C   s>   t |dkstt |dks ttjd|||d dd}|gS )Nr"   r   LayerNormalizationZ_LayerNormalizationg   >)rv   rw   rB   epsilonr0   r2   r   	make_noder   rv   rw   rx   rL   r   r   r   	layernorm   s    zFission.layernormc                 C   sr   t |dkstt |dks ttjd|d |d g|d g|d d}tjd|d |d g||d	 d}||gS )
Nr"   r   ZMatMulr   Z
matmul_outrv   rw   rB   Addr'   ZBiasr{   )r   rv   rw   rx   matmulr_   r   r   r   gemm   s    zFission.gemmr.   c              	   C   sB   t |dkstt |dks ttjd|||d d||d}|gS )N   r   ZRotaryEmbeddingcom.microsoft)rv   rw   rB   domainrotary_embedding_dim	num_headsr{   )r   rv   rw   rx   Zrot_dimr   rL   r   r   r   rotary   s    	zFission.rotaryc                 C   s>   t |dkstt |dks ttjd|||d dd}|gS )Nr   FastGelur   )rv   rw   rB   r   r{   r}   r   r   r   fastgelu   s    zFission.fastgeluc                 C   s<   t |dkstt |dks ttjd|||d d}|gS )Nr'   r   r   r   r{   r}   r   r   r   r_      s    zFission.addc              	   C   sB   t |dkstt |dks ttjd|||d d|dd}|gS )N   r"   MultiHeadAttentionr   r   )rv   rw   rB   r   r   unidirectionalr{   r   rv   rw   rx   r   rL   r   r   r   mha   s    	zFission.mhac              	   C   sB   t |dkstt |dks ttjd|||d d||d}|gS )N   r"   GroupQueryAttentionr   )rv   rw   rB   r   r   Zkv_num_headsr{   r   r   r   r   gqa   s    	zFission.gqac                 C   sF   t |dkstt |dks ttjd|||d d|dddd	}|gS )N   r'   	Attentionr   r   r.   )rv   rw   rB   r   r   r   Z	do_rotaryr   r{   r   r   r   r   	attention   s    zFission.attentionP      %?c                 C   sF   t |dkstt |dks ttjd|||d d||||d	}|gS )N   r   PagedAttentionzvllm.ort.ext)rv   rw   rB   r   r   Znum_kv_heads	head_sizescaler{   )r   rv   rw   rx   r   r   r   rL   r   r   r   
paged_attn  s    	zFission.paged_attn)N)ro   )ro   )ro   r.   r.   )ro   )ro   )ro   r.   )ro   r.   )ro   r.   )ro   r.   r   r   )r   r   r    r   r   r@   r8   r   r>   rC   rI   rM   rO   r\   re   rg   rl   r   intru   r~   r   r   r   r_   r   r   r   r   __classcell__r   r   r9   r   r3   E   sB   
      r3   c                       sX   e Zd Zeeed fddZedddZdd Ze	d	d
dZ
e	d	ddZ  ZS )Phi2PreProcessorr4   r   hidden_sizec                    s(   t  | d| _|| _|| _d| _d S )Nr.   Zmodeling_phi_PhiModel_model_1)r7   r8   num_hidden_layersnum_attention_headsr   	func_namer   r4   r   r   r9   r   r   r8     s
    zPhi2PreProcessor.__init__)returnc                 C   s   i }d|d< d|d< d|d< d|d< t d	| jd	D ]X}d
| |d| < d| |d| < d| |d| d< d| |d| d< q2dd | jjjD }d|krd|krd|d< d|d< n$d|krd|kstd|d< d|d< |S )NZlogitsZ	lm_head_1	input_idsZl_input_ids_Z
past_key_0Z
key_statesZpast_value_0Zvalue_statesr   Z	past_key_Zkey_states_Zpast_value_Zvalue_states_Zpresent_key_Zmodel_layers__1Zpresent_value_Z_1_1c                 S   s   g | ]
}|j qS r   rB   ).0or   r   r   
<listcomp>3  s     z7Phi2PreProcessor.get_phi2_edge_dict.<locals>.<listcomp>Zmodel_layers_0_1_1Zmodel_layers_0_1_2Zpresent_key_0Zpresent_value_0Zmodel_layers_0_1)ranger   r4   r]   rN   r2   )r   Z	edge_dictr[   rw   r   r   r   get_phi2_edge_dict'  s$    
z#Phi2PreProcessor.get_phi2_edge_dictc                 C   s<   d}| j jjD ](}|j|}|dkr|j|d  |_qd S )NZ)modeling_phi_PhiDecoderLayer_model_layersr)   )r4   r]   rL   op_typefind)r   Zphi2_transformer_layer_namerL   indexr   r   r   simplify_phi2_op_type=  s
    z&Phi2PreProcessor.simplify_phi2_op_typer;   c              
   C   s  |t jk| _|t jk| _| jj}g }|jD ]}d|jkrt	j
|j| jsNtjntjddgd}t	j
dtjdgd}t	j
dtjddgd}t	j
dtjddgd}t	j
d	tjdgd}	| js||||gn||||	g | jr.d
|jkrt	j
|jd
d|jjjdd| jd| j| j gd}
||
g q*| jrd
|jkrpt	j
|j|jjjdddddgd}
||
g d|jkrt	j
|j|jjjddddgd}
||
g q*d
|jksd|jkr*t	j
|j|jjjd| jd| j| j gd}
||
g q*|d |j| g }t|jD ]\}}|dkr<||g n| jrd|jkrt	j
|jdd|jjjdd| jd| j| j gd}
||
g n@| jrn6t	j
|j|jjjd| jd| j| j gd}
||
g q|d |j| d S )Nr   
batch_sizeseq_lenrh   stepr   position_idsattention_maskinput_metadatapast_keyZpastr'   Zpast_seq_lenZ
num_blocksr   Zhead_size_x
block_sizeZblock_x
past_valuer   rJ   r   present_keyZpresentZtotal_seq_lenrN   )r   r   Zuse_attnr   Zuse_vllmr4   r]   rJ   rB   r   rj   r   INT32rf   rk   replacer`   ra   rb   r   r   Z
ClearFieldrp   rN   )r   r<   r]   Z
new_inputsviZvi_iidZvi_stepZvi_pidZvi_maskZvi_metaZvi_cacheZnew_outputsr[   r   r   r   process_graph_ioD  s    










z!Phi2PreProcessor.process_graph_ioc                 C   s~   d }| j jD ]}|j| jr|j} q*q|d k	s6t| | | |   | 	  | 
  |tjkrp|   | | d S r=   )r4   Z	functionsrB   rD   r   r2   Zunroll_functionZupdate_edgesr   r   Zremove_dropout_layerr   r   Zremove_lm_head_layerr   )r   r<   Zfunction_namefuncr   r   r   preprocess_onnx  s    

z Phi2PreProcessor.preprocess_onnx)r   r   r    r   r   r8   dictr   r   r   r   r   r   r   r   r9   r   r     s
   }r   c                       s*   e Zd Zed fddZdd Z  ZS )FissionTransformerEmbeddingPhir4   c                    s   t  |dg d S )NZ6torch_nn_modules_sparse_Embedding_model_embed_tokens_1r6   r   r4   r9   r   r   r8     s    z'FissionTransformerEmbeddingPhi.__init__c           	      C   s   t d|j t|jdks tt|jdks2t|jd }|jd }| |d}|||g}tj	d||g|gddg}| 
|d| | j| d	| _d S )
NOptimizing %s...r'   r   r   zembed_tokens.weightGatherZEmbedding_Gatherr   T)loggerinforB   r0   rJ   r2   rN   rM   r   r|   ru   nodes_to_removerr   prune_graph)	r   rL   input_name_to_nodesoutput_name_to_noderJ   rN   Z	embeddingrn   rm   r   r   r   fuse  s"    


	z#FissionTransformerEmbeddingPhi.fuser   r   r    r   r8   r   r   r   r   r9   r   r     s   r   c                       s*   e Zd Zed fddZdd Z  ZS )FissionTransformerLayerNormPhir   c                    s   t  |dg d S )NZ@torch_nn_modules_normalization_LayerNorm_model_final_layernorm_1r6   r   r9   r   r   r8     s    z'FissionTransformerLayerNormPhi.__init__c           
      C   s   t d|j t|jdks tt|jdks2t|jd }|jd }| |d}| |d}||||g}g }	|	| 	|||g|gd | 
|	d| | |d	d
dg | |d	d
dg | j| d| _d S )Nr   r"   r   r   zfinal_layernorm.weightzfinal_layernorm.biasFinalc   r   r   r   T)r   r   rB   r0   rJ   r2   rN   rM   rk   r~   ru   rl   r   rr   r   )
r   rL   r   r   rJ   rN   	ln_weightln_biasrn   rm   r   r   r   r     s    

z#FissionTransformerLayerNormPhi.fuser   r   r   r9   r   r     s   r   c                       s*   e Zd Zed fddZdd Z  ZS )!FissionTransformerCausalLMHeadPhir   c                    s   t  |dg d S )NZ(torch_nn_modules_linear_Linear_lm_head_1r6   r   r9   r   r   r8     s    z*FissionTransformerCausalLMHeadPhi.__init__c           
      C   s   t d|j t|jdks tt|jdks2t|jd }|jd }| | |dt	 }| |d}||||g}g }	|	
| |||g|gd | |	d	| | |d
ddg | |d
ddg | j| d| _d S )Nr   r   r   r'   r   zlm_head.weightzlm_head.biasZLMHead_r   r   r   r   i   T)r   r   rB   r0   rJ   r2   rN   r\   rM   r   rk   r   ru   rl   r   rr   r   )
r   rL   r   r   rJ   rN   Z	fc_weightZfc_biasrn   rm   r   r   r   r     s    

z&FissionTransformerCausalLMHeadPhi.fuser   r   r   r9   r   r     s   r   c                       sD   e Zd Zeed fddZdd Zdd Zdd	 Zd
d Z	  Z
S )FissionTransformerBlockPhi)r4   r   c                    sT   || _ d}i | _g }t|D ]$}d| d}|| || j|< qt || d S )Nr.   Z*modeling_phi_PhiDecoderLayer_model_layers_r   )r   func_to_layer_idr   rr   r7   r8   )r   r4   r   Zmax_num_layersr5   layerr   r9   r   r   r8   6  s    
z#FissionTransformerBlockPhi.__init__c                 C   s   | j |j S r=   )r   r   )r   rL   r   r   r   get_layer_idF  s    z'FissionTransformerBlockPhi.get_layer_idc                 C   s   t jddgdgdtjdt jdddgdgd	d
t jdddgdgdd
t jddgdgdtjdt jddgdgdd
t jdddgdgdddt jddgdgdtjdg}|S )NZCastr   Z
mask_int64ZCast_gqa_aux_0)rv   rw   rB   toZ	ReduceSumoneZmask_row_sumsZReduceSum_gqa_auxr   SubZseqlens_k_int64ZSub_gqa_aux	seqlens_kZCast_gqa_aux_1ZShapeZ
mask_shapeZShape_gqa_aux_0r   Ztotal_seq_len_int64ZGather_gqa_aux_0r   )rv   rw   rB   axistotal_sequence_lengthZCast_gqa_aux_2)r   r|   r   rf   r   )r   gqa_aux_nodesr   r   r   get_gqa_aux_nodesI  sV    +z,FissionTransformerBlockPhi.get_gqa_aux_nodesc	                 C   sX  | j |}	| j |}
| j |}tt|	d}tt|
d}tt|d}tj|||fdd}| j |}| j |}| j |}t|}t|}t|}tj|||fdd}|jd }tj	|t
j||d g|  dd}| j || j tj	|t
j|d g|  dd}| j || j | |j | |j ||fS )Nr   r   )r   r   r"   TrP   )r4   rS   r   r   r   rT   stackr1   r   rU   r   rV   rW   rX   rY   rZ   re   rB   )r   Zq_wZk_wZv_wZq_bZk_bZv_bZweight_nameZ	bias_nameZq_weightZk_weightZv_weightZqwkwZvwZ
qkv_weightZq_biasZk_biasZv_biasZqbkbZvbZqkv_biasr   weightZbiasr   r   r   pack_qkv_gemmw  sD    






z(FissionTransformerBlockPhi.pack_qkv_gemmc           $      C   s  t d|j t d| j  | |}|jd }| |d}| |d}|jd }| |d}	| |d}
| |d	}| |d
}d\}}}}}}d\}}d\}}| jt	j
krH| | |dt }| | |dt }| | |dt }| |d}| |d}| |d}| | |dt }| | |dt }n\| | |d| |d| |d| |d| |d| |d| |d| |d\}}| | |dt }| |d}| | |dt }| | |dt }| |d}| |d}g }||||g |||	|
g |||g | jt	j
krf|||||||||g n|||g |||||||g |dddd d!d"g g }|| |||gd#g || d$||gd%gd& || d#||gd'gd( || d'gd)g || d)||gd*gd+ || d%d*gd,gd- || |d,g|gd. | jt	j
kr|| d#||gd/gd0 || d#||gd1gd2 || d#||gd3gd4 | jt	jkrd"nd}|| d/|||gd5gd0 || d1|||gd6gd2 | jt	jkrH|| d5d6d3d7dd7||gd$|	|
g n| jt	jkr|| d5d6d3||dd gd$|	|
g |dkr
|  } | D ] }!| j|! | j| j|!j< q| j !t"j#t$j%d8gd9d:d;d<| j n.| jt	jkrN|| &d5d6d3||d!gd$g nBd=| }"d>| }#||"|#g || 'd#||d|"gd$|#g | (||| | )|d?d@dAg | )|d?d@dAg | j*| dB| _+d S )CNr   zAttentionOpType: r   r   r   r)   r   Zpresent_valuezinput_layernorm.weightzinput_layernorm.bias)NNNNNN)NNzself_attn.q_proj.weightzself_attn.k_proj.weightzself_attn.v_proj.weightzself_attn.q_proj.biaszself_attn.k_proj.biaszself_attn.v_proj.biaszrotary_emb.cos_cachedzrotary_emb.sin_cachedattn_qkv_weightattn_qkv_biaszself_attn.dense.weightzself_attn.dense.biaszmlp.fc1.weightzmlp.fc2.weightzmlp.fc1.biaszmlp.fc2.biasr   r   r   r   r   r   Zln_outZattn_outZattn_add_outZOutProj_Zfc1_outZFC1_Zgelu_outZfc2_outZFC2_Zresidual_1_outZ
Residual_1Z
Residual_2queryZQ_keyZK_valueZV_Z	query_rotZkey_rotro   r   Zint64)Zdtyper   r   Zpast_Zpresent_r   r   r   T),r   r   rB   r<   r   rJ   rM   rN   rO   r   r   r\   r   r-   r   rC   rk   r~   r   r   r_   r   r   r   r   r   r   r   rq   rr   rZ   rs   r4   rY   r   Z
from_arrayr   arrayr   r   ru   rl   r   r   )$r   rL   r   r   rA   Zi_hidden_statesZi_key_cacheZi_value_cacheZo_hidden_statesZo_key_cacheZo_value_cacher   r   Zattn_q_weightZattn_q_biasZattn_k_weightZattn_k_biasZattn_v_weightZattn_v_biasr   r   Z	cos_cacheZ	sin_cacheZattn_out_weightZattn_out_biasZmlp_fc1_weightZmlp_fc2_weightZmlp_fc1_biasZmlp_fc2_biasrn   rm   Zpos_ids_namer   rt   Z	past_nameZpresent_namer   r   r   r     s$   



 
 
 
 
 








 	
 

 zFissionTransformerBlockPhi.fuse)r   r   r    r   r   r8   r   r   r   r   r   r   r   r9   r   r   5  s   .*r   c                       sR   e Zd Zeeed fddZdee ed fddZ	d	d
 Z
dddZ  ZS )PhiOnnxModelr   c                    sJ   t  | t| j||| _t| || _t| | _t	| | _
t| | _d S r=   )r7   r8   r   r4   phi2_preprocessorr   fission_transformer_blockr   fission_causal_lm_headr   fission_transformer_layernormr   fission_transformer_embeddingr   r9   r   r   r8   P  s    

zPhiOnnxModel.__init__NF)optionsadd_dynamic_axesc                    s   |d k	st |j}| j| | j| | j  | j  | j  | j	  t
   t| | _t| | _| j  | j  d S r=   )r2   Zattention_op_typer   r>   r   r   applyr   r   r   r7   r   r
   Zfuse_slnr	   Zfuse_bias_sln)r   r   r   r<   r9   r   r   optimizeX  s    







zPhiOnnxModel.optimizec              	   C   sN   i }ddddddddd	g	}|D ]}|  |}t|||< qtd
|  |S )z8
        Returns node count of fused operators.
        r   r   r   r   GeluBiasGelur   ry   SkipLayerNormalizationzOptimized operators: )Zget_nodes_by_op_typer0   r   r   )r   op_countZopsopZnodesr   r   r   get_fused_operator_statisticsm  s     
z*PhiOnnxModel.get_fused_operator_statisticsc                    s    dkr|    td fdd}|d|d |d |d }|d	|d
 |d }|d|d }|dko||ko||k}|dkrtd |dkrtd |dkrtd |S )zA
        Returns True when the model is fully optimized.
        NZop_namec                    s     | pdS )Nr   )getr   fused_op_countr   r   r     s    z1PhiOnnxModel.is_fully_optimized.<locals>.op_countr   r   r   r   r   r   r   ry   r   r   zLayer Normalization not fusedzGelu (or FastGelu) not fusedz+Attention (or MultiHeadAttention) not fused)r   r@   r   debugwarning)r   r  r   r   ZgeluZ
layer_normZ
is_perfectr   r   r   is_fully_optimized  s*    


zPhiOnnxModel.is_fully_optimized)NF)N)r   r   r    r   r   r8   r   r   boolr   r   r  r   r   r   r9   r   r   O  s   r   ),loggingr   typingr   r   numpyr   Zdynamo_onnx_helperr   Zfusion_baser   Zfusion_optionsr   r   Zfusion_skiplayernormr	   r
   Zfusion_utilsr   Zonnxr   r   r   r   r   Z
onnx_modelr   r   r   r   r!   r%   r&   r(   r+   r,   r-   r3   r   r   r   r   r   r   r   r   r   r   <module>   s:   
 Z 4"!!  