# NOTE: Recovered from a compiled (.pyc) artifact of
# onnxruntime/transformers/fusion_attention.py. Method bodies that could not be
# reconstructed reliably from the artifact are elided with "...".

from logging import getLogger
from typing import List, Optional, Tuple, Union

import numpy as np
from fusion_base import Fusion
from fusion_options import AttentionMaskFormat
from fusion_utils import FusionUtils, NumpyHelper
from onnx import NodeProto, TensorProto, helper, numpy_helper
from onnx_model import OnnxModel

logger = getLogger(__name__)


class AttentionMask:
    """
    Fuse Attention subgraph into one Attention node.
    """

    def __init__(self, model: OnnxModel):
        self.model = model
        # Lookup table: mask input name -> mask index (or processed mask) output name
        self.mask_indice = {}
        # Lookup table: mask input name -> output name of the int32 cast
        self.mask_casted = {}
        self.utils = FusionUtils(model)
        self.mask_format = AttentionMaskFormat.MaskIndexEnd
        self.opset_version = model.get_opset_version()

    def set_mask_format(self, mask_format: AttentionMaskFormat):
        self.mask_format = mask_format

    def set_mask_indice(self, mask, mask_index):
        if mask in self.mask_indice:
            assert mask_index == self.mask_indice[mask]
        self.mask_indice[mask] = mask_index

    def get_first_mask(self):
        assert len(self.mask_indice) > 0
        return next(iter(self.mask_indice))

    def process_mask(self, input: str) -> str:
        if self.mask_format == AttentionMaskFormat.NoMask:
            return None

        if input in self.mask_indice:
            return self.mask_indice[input]

        # Add a cast to convert the mask from int64 to int32.
        if self.model.find_graph_input(input):
            casted, input_name = self.utils.cast_graph_input_to_int32(input)
        else:
            input_name, cast_node = self.utils.cast_input_to_int32(input)
            casted = True

        if casted:
            self.mask_casted[input] = input_name

        # Attention supports an int32 attention mask (2D) directly.
        if self.mask_format == AttentionMaskFormat.AttentionMask:
            self.mask_indice[input] = input_name
            return input_name

        # Otherwise add a ReduceSum node to convert the attention mask to a mask index (1D).
        output_name = self.model.create_node_name("mask_index")
        if self.opset_version < 13:
            mask_index_node = helper.make_node(
                "ReduceSum",
                inputs=[input_name],
                outputs=[output_name],
                name=self.model.create_node_name("ReduceSum", "MaskReduceSum"),
            )
            mask_index_node.attribute.extend(
                [helper.make_attribute("axes", [1]), helper.make_attribute("keepdims", 0)]
            )
        else:
            # ReduceSum-13: axes is an input instead of an attribute.
            axes_name = "ort_const_1_reduce_sum_axes"
            if self.model.get_initializer(axes_name) is None:
                self.model.add_initializer(
                    helper.make_tensor(
                        name=axes_name,
                        data_type=TensorProto.INT64,
                        dims=[1],
                        vals=[1],
                        raw=False,
                    )
                )
            mask_index_node = helper.make_node(
                "ReduceSum",
                inputs=[input_name, axes_name],
                outputs=[output_name],
                name=self.model.create_node_name("ReduceSum", "MaskReduceSum"),
            )
            mask_index_node.attribute.extend([helper.make_attribute("keepdims", 0)])
        self.model.add_node(mask_index_node)

        self.mask_indice[input] = output_name
        return output_name


class FusionAttention(Fusion):
    """
    Fuse Attention subgraph into one Attention node.
    """

    def __init__(
        self,
        model: OnnxModel,
        hidden_size: int,
        num_heads: int,
        attention_mask: Optional[AttentionMask] = None,
        use_multi_head_attention: bool = False,
        disable_multi_head_attention_bias: bool = False,
        search_op_types: List[str] = ["SkipLayerNormalization", "LayerNormalization"],
    ):
        attention_op_name = "MultiHeadAttention" if use_multi_head_attention else "Attention"
        super().__init__(model, attention_op_name, search_op_types)
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.attention_mask = attention_mask if attention_mask else AttentionMask(model)
        self.use_multi_head_attention = use_multi_head_attention
        self.disable_multi_head_attention_bias = disable_multi_head_attention_bias
        self.mask_filter_value = None

        # Flags so that each warning is only shown once.
        self.num_heads_warning = True
        self.hidden_size_warning = True

        self.shape_infer = None
        self.shape_infer_done = False

    def get_num_heads_and_hidden_size_from_concat(self, concat: NodeProto) -> Tuple[int, int]:
        """
        Detect num_heads and hidden_size from Concat node in the following subgraph:

        SkipLayerNormalization or EmbedLayerNormalization
                        /        |
                     MatMul    Shape
                        |        |
                       Add     Gather(indices=0)
                        |        |
                        |      Unsqueeze
                        |        |
                        |     Concat (*, -1, 12, 64)
                        |     /
                       Reshape
                          |
                       Transpose
        """
        if len(concat.input) == 4:
            num_heads = self.model.get_constant_value(concat.input[2])
            head_size = self.model.get_constant_value(concat.input[3])
            if (
                isinstance(num_heads, np.ndarray)
                and num_heads.size == 1
                and isinstance(head_size, np.ndarray)
                and head_size.size == 1
            ):
                return num_heads[0], num_heads[0] * head_size[0]

        return self.num_heads, self.hidden_size

    def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
        """Detect num_heads and hidden_size from a reshape node.

        Args:
            reshape_q (NodeProto): reshape node for Q

        Returns:
            Tuple[int, int]: num_heads and hidden_size
        """
        # The shape input of the Reshape for Q is expected to look like [0, 0, num_heads, head_size].
        q_shape = self.model.get_initializer(reshape_q.input[1])
        if q_shape is None:
            concat = self.model.get_parent(reshape_q, 1)
            if concat is not None and concat.op_type == "Concat":
                return self.get_num_heads_and_hidden_size_from_concat(concat)
            logger.debug(f"{reshape_q.input[1]} is not initializer.")
            return self.num_heads, self.hidden_size  # Fall back to user specified value

        q_shape_value = NumpyHelper.to_array(q_shape)
        if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
            logger.debug(f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size].")
            return self.num_heads, self.hidden_size  # Fall back to user specified value

        num_heads = q_shape_value[2]
        head_size = q_shape_value[3]
        hidden_size = num_heads * head_size

        if self.num_heads > 0 and num_heads != self.num_heads:
            if self.num_heads_warning:
                logger.warning(
                    f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value."
                )
                self.num_heads_warning = False  # Do not show the warning more than once

        if self.hidden_size > 0 and hidden_size != self.hidden_size:
            if self.hidden_size_warning:
                logger.warning(
                    f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
                )
                self.hidden_size_warning = False  # Do not show the warning more than once

        return num_heads, hidden_size

    def get_add_qk_str(self, add_qk: NodeProto):
        if not self.shape_infer_done:
            self.shape_infer = self.model.infer_runtime_shape(update=True)
            self.shape_infer_done = True

        if self.shape_infer is None:
            return None

        input_0_shape = self.shape_infer.get_edge_shape(add_qk.input[0])
        input_1_shape = self.shape_infer.get_edge_shape(add_qk.input[1])

        if input_0_shape is None or input_1_shape is None:
            logger.debug(f"one of the inputs of {add_qk} is None")
            return None

        if input_0_shape != input_1_shape:
            logger.debug(f"the shape of two inputs of {add_qk} is not same")
            return None

        return add_qk.input[1]

    def reshape_add_qk(self, add_qk: str):
        # Broadcast the additive (Q x K') mask over all heads by concatenating it
        # num_heads times along axis 1.
        mask_output_name = add_qk + "_mask"

        # Reuse the Concat node if it has already been added.
        concat_node = list(filter(lambda node: node.output[0] == mask_output_name, self.nodes_to_add))
        if len(concat_node) == 1:
            return mask_output_name

        assert len(concat_node) == 0
        concat_node_name = self.model.create_node_name("Concat")
        concat_add_qk_fp32 = helper.make_node(
            "Concat",
            inputs=[add_qk for _ in range(self.num_heads)],
            outputs=[mask_output_name],
            name=concat_node_name,
            axis=1,
        )
        # Save the new node.
        self.nodes_to_add.append(concat_add_qk_fp32)
        self.node_name_to_graph_name[concat_node_name] = self.this_graph_name

        return mask_output_name

    def concat_kv(self, past_k: str, past_v: str) -> str:
        """Concatenate past_k and past_v inputs to create past_kv input.

        Args:
            past_k (str): name of past K value
            past_v (str): name of past V value

        Returns:
            kv_output_name (str): name of past KV value
        """
        # Unsqueezes past_k and past_v to 5D and concatenates them into one past_kv value.
        # (Remaining body not recoverable from the compiled artifact.)
        ...

    def reshape_kv(self, past_k: str, past_v: str) -> (str, str):
        """Reshape past_k and past_v from 4D to 3D to use as inputs for multihead attention node.

        Args:
            past_k (str): name of past K value of shape 4D
            past_v (str): name of past V value of shape 4D

        Returns:
            k_3d (str): name of past K value of shape 3D
            v_3d (str): name of past V value of shape 3D
        """
        # Adds Reshape nodes that flatten the 4D past K/V via a shared [0, 0, hidden_size]
        # shape initializer ("kv_4d_to_3d").
        # (Remaining body not recoverable from the compiled artifact.)
        ...

    def split_kv(self, present_k_name: str, present_v_name: str, kv_node: str):
        """Split kv_node containing present KV values into separate present K and present V values.

        Args:
            present_k_name (str): name of output to store present K value in
            present_v_name (str): name of output to store present V value in
            kv_node (str): name of present KV values
        """
        # Adds Gather nodes with indices 0 and 1 to split the combined present KV output.
        # (Remaining body not recoverable from the compiled artifact.)
        ...

    def transpose_kv(self, past_k: str, past_v: str) -> (str, str):
        """Transpose past_k and past_v from (B,N,P,H) to (B,P,N,H)

        Args:
            past_k (str): name of past K value of shape (B,N,P,H)
            past_v (str): name of past V value of shape (B,N,P,H)

        Returns:
            past_k_transpose (str): name of past K value of shape (B,P,N,H)
            past_v_transpose (str): name of past V value of shape (B,P,N,H)
        """
        # Adds Transpose nodes with perm=[0, 2, 1, 3] for both inputs.
        # (Remaining body not recoverable from the compiled artifact.)
        ...

    def create_combined_qkv_bias(
        self,
        q_add: NodeProto,
        k_add: Union[NodeProto, None],
        v_add: Union[NodeProto, None],
        name_prefix: str,
    ) -> Union[NodeProto, None]:
        # Stacks the Q/K/V bias initializers (using zeros where a bias is missing) into a single
        # "<name_prefix>_qkv_bias" initializer and returns its name.
        # (Remaining body not recoverable from the compiled artifact.)
        ...

    def create_packed_qkv_matmul_node(
        self,
        q_matmul: NodeProto,
        k_matmul: NodeProto,
        v_matmul: NodeProto,
        q_add: NodeProto,
        k_add: Union[NodeProto, None],
        v_add: Union[NodeProto, None],
        num_heads: int,
    ) -> Union[NodeProto, None]:
        """Create packed QKV MatMul node before MultiHeadAttention node.
           This is for the scenario where an Attention node should be created but cannot be created
           because past_key and past_value are separate inputs and not one concatenated input.

        Args:
            q_matmul (NodeProto): name of MatMul from Q path - (batch_size, sequence_length, hidden_size)
            k_matmul (NodeProto): name of MatMul from K path - (batch_size, sequence_length, hidden_size)
            v_matmul (NodeProto): name of MatMul from V path - (batch_size, sequence_length, hidden_size)
            q_add (NodeProto): name of Add from Q path
            k_add (NodeProto): name of Add from K path
            v_add (NodeProto): name of Add from V path
            num_heads (int): number of heads

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        """
        # Packs the Q/K/V weights into one "_qkv_weight" initializer, adds a single MatMul followed
        # by three Slice nodes for the Q/K/V outputs, and optionally re-attaches the bias Add nodes.
        # (Remaining body not recoverable from the compiled artifact.)
        ...

    def create_multihead_attention_node(
        self,
        q_matmul: NodeProto,
        k_matmul: Union[NodeProto, str, None],
        v_matmul: Union[NodeProto, str, None],
        q_add: NodeProto,
        k_add: Union[NodeProto, None],
        v_add: Union[NodeProto, None],
        num_heads: int,
        hidden_size: int,
        output: str,
        key_padding_mask: str = "",
        add_qk: str = "",
        past_k: str = "",
        past_v: str = "",
        present_k: str = "",
        present_v: str = "",
        packed_qkv: bool = False,
    ) -> Union[NodeProto, None]:
        """Create a MultiHeadAttention node.

        Args:
            q_matmul (NodeProto): name of MatMul from Q path - (batch_size, sequence_length, hidden_size)
            k_matmul (NodeProto): name of MatMul from K path - (batch_size, sequence_length, hidden_size) or (batch_size, num_heads, past_sequence_length, head_size)
            v_matmul (NodeProto): name of MatMul from V path - (batch_size, sequence_length, hidden_size) or (batch_size, num_heads, past_sequence_length, head_size)
            q_add (NodeProto): name of Add from Q path
            k_add (NodeProto): name of Add from K path
            v_add (NodeProto): name of Add from V path
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            output (str): output name of MHA
            key_padding_mask (str): name of key padding mask
            add_qk (str): name of add after Q x K'
            past_k (str): name of past K value - (batch_size, num_heads, past_sequence_length, head_size)
            past_v (str): name of past V value - (batch_size, num_heads, past_sequence_length, head_size)
            present_k (str): name of present K value - (batch_size, num_heads, sequence_length, head_size)
            present_v (str): name of present V value - (batch_size, num_heads, sequence_length, head_size)
            packed_qkv (bool): whether to combine MatMuls from Q, K, V paths
                               Note: This is for the scenario where an Attention node should be created but cannot be created
                               because past_key and past_value are separate inputs and not one concatenated input.

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        """
        # Validates that hidden_size is a multiple of num_heads, assembles the MHA inputs
        # (packed or separate Q/K/V, optional mask, add_qk and past/present KV), and emits a
        # "MultiHeadAttention" node in the "com.microsoft" domain with a "num_heads" attribute.
        # (Remaining body not recoverable from the compiled artifact.)
        ...

    def create_attention_node(
        self,
        mask_index: str,
        q_matmul: NodeProto,
        k_matmul: NodeProto,
        v_matmul: NodeProto,
        q_add: NodeProto,
        k_add: NodeProto,
        v_add: NodeProto,
        num_heads: int,
        hidden_size: int,
        input: str,
        output: str,
        add_qk_str: str = "",
        past_k: str = "",
        past_v: str = "",
        present_k: str = "",
        present_v: str = "",
        scale: Optional[float] = None,
        causal: bool = False,
    ) -> Union[NodeProto, None]:
        """Create an Attention node.

        Args:
            mask_index (str): mask input
            q_matmul (NodeProto): MatMul node in fully connection for Q
            k_matmul (NodeProto): MatMul node in fully connection for K
            v_matmul (NodeProto): MatMul node in fully connection for V
            q_add (NodeProto): Add bias node in fully connection for Q
            k_add (NodeProto): Add bias node in fully connection for K
            v_add (NodeProto): Add bias node in fully connection for V
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            input (str): input name
            output (str): output name
            add_qk_str (str): name of Add node after Q x K'
            past_k (str): name of input for past K value
            past_v (str): name of input for past V value
            present_k (str): name of output to store present K value
            present_v (str): name of output to store present V value
            scale: scale before softmax
            causal: whether it is uni-directional mask.

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        """
        # Packs the Q/K/V weights (and biases) into combined initializers, then emits either an
        # "Attention" node or a "MultiHeadAttention" node in the "com.microsoft" domain with
        # num_heads and, when applicable, unidirectional, scale, qkv_hidden_sizes and
        # mask_filter_value attributes.
        # (Remaining body not recoverable from the compiled artifact.)
        ...

    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
        # Matches the attention subgraph (V, Q x K', Q, K and mask paths) that feeds a
        # SkipLayerNormalization/LayerNormalization node, detects num_heads and hidden_size from
        # the Reshape node of Q, replaces the subgraph with the node returned by
        # create_attention_node, and prunes the now-dead nodes.
        # (Remaining body not recoverable from the compiled artifact.)
        ...
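

# ---------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the original module). It
# shows how these fusion classes are typically driven. It assumes the sibling
# onnxruntime.transformers modules are importable as above, and "model.onnx"
# is a hypothetical BERT-like model; adjust num_heads and hidden_size to match
# that model.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import onnx

    model_proto = onnx.load("model.onnx")  # hypothetical input path
    onnx_model = OnnxModel(model_proto)

    # Reuse one AttentionMask helper so equivalent mask subgraphs map to the same mask index.
    attention_mask = AttentionMask(onnx_model)
    fusion = FusionAttention(onnx_model, hidden_size=768, num_heads=12, attention_mask=attention_mask)
    fusion.apply()  # Fusion.apply() walks matching nodes and calls fuse() on each

    onnx_model.save_model_to_file("model_fused.onnx")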