U
    T?hc                     @   s   d dl mZ d dlmZmZmZmZmZ d dlm	Z	 d dl
mZ d dlmZmZmZ d dlmZ eeZG dd de	ZG d	d
 d
eZdS )    )	getLogger)DictListOptionalTupleUnion)Fusion)FusionUtils)	NodeProtoTensorProtohelper)	OnnxModelc                       s
  e Zd ZdZd*eed fddZeede	eef f ddd	Z
eeeee f eed
ddZdd Zdd Zdd Zdd Zdd Zee	eedef f dddZd+eeeeedef ee dddZdd Zd d! Zd,d"d#Zd$d% Zd&d' Zd(d) Z  ZS )-FusionEmbedLayerNoMaskz
    Fuse embedding layer into one node (EmbedLayerNormalization).
    It supports the following model types: BERT, DistilBert, ALBert.
    no mask)modeldescriptionc                    s<   t  |dddg| t|| _d | _d| _d | _d | _d S )NEmbedLayerNormalizationLayerNormalizationSkipLayerNormalizationF)super__init__r	   utilsshape_infershape_infer_done	attention
embed_node)selfr   r   	__class__ \/var/www/html/venv/lib/python3.8/site-packages/onnxruntime/transformers/fusion_embedlayer.pyr      s    
zFusionEmbedLayerNoMask.__init__N)addreturnc                 C   sP   | j |dgdg}|d kr d S | j |dgdg}|d kr@d S |d |d fS )NGatherr      )r   match_parent_path)r   r!   gather_0_pathgather_1_pathr   r   r    match_two_gather&   s    z'FusionEmbedLayerNoMask.match_two_gather)	layernorminput_name_to_nodesis_distil_bertr"   c           
   	   C   s  | j j|d|dd| _| jdk	r$dS |jd |kr6dS ||jd  }tdd |D }|d	d	d	d
gkr|D ]^}|jd
krj| j |dd	dd	gddddg}|dk	rj|d jd |jd krj|d | _ dS qjt	|dkr|d jd	kr|d jd |kr||d jd  }t	|dkr|d jdkr|d jd |kr||d jd  }	|	D ]}|jdkrV|| _ dS qVtdd |	D }|r|d	d	d	dd
gkr|dd	d	d	ddgkr|dd	d	d	dgkrt
d dS n2|dd	d	d	gkr|d	d	d	d
gkrt
d dS dS )a  Check that LayerNormalization has a child of Attention node or subgraph like Attention.

        Args:
            layernorm (NodeProto): LayerNormalization node
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            is_distil_bert (bool): whether it is DistilBert or not

        Returns:
            bool: whether there is Attention node or subgraph like Attention
        	AttentionF)	recursiveNTr   c                 S   s   g | ]
}|j qS r   op_type.0childr   r   r    
<listcomp>K   s     zCFusionEmbedLayerNoMask.check_attention_subgraph.<locals>.<listcomp>ZMatMulr   AddMultiHeadAttention   r$   c                 S   s   g | ]
}|j qS r   r.   r0   r   r   r    r3   h   s     Shapez<No Attention like subgraph in children of LayerNormalization)r   Zfind_first_child_by_typer   outputsortedr/   r%   inputcross_attentionlenloggerdebug)
r   r)   r*   r+   childrenZchildren_typesnodepath1ZgrandchildrenZnodesr   r   r    check_attention_subgraph1   s|       



 
2



z/FusionEmbedLayerNoMask.check_attention_subgraphc              	   C   s,  | j |ddgddg}|dkrL| j |ddddgddddg}|dkrLd	S |d |d
  }}|jd |krpd	S | j |dddddgdddddgfddddgddddgfg|\}}}|dkrd	S |d }	| j|	ddr| j|	ddsd	S |d }
| j|
ddsd	S |d
 }|jd |kr(d	S dS )az    Match position embedding path from input_ids to Gather for DistilBert.

        Pattern is like the following:
                 (input_ids)
                      |
                     Shape
                       |                          |    Gather (indices=1)
                       |       |
                       |      Cast (optional)
                       |       |
                       |      Range (start=0, end=*, delta=1)
                       |       |
                       |    Unsqueeze
                       |    /
                      Expand
                        |
                      Gather
        ZExpandr8   r$   NZWhereZReshaper7   r   Fr6   	UnsqueezeRangeCastr#   T)r   r%   r;   Zmatch_parent_pathsr   check_node_input_value)r   position_embedding_gather	input_idsoutput_name_to_noderB   expandshape_Zpath2Z
range_nodeZgather_nodeZ
shape_noder   r   r    #match_position_embedding_distilbert   sD    


z:FusionEmbedLayerNoMask.match_position_embedding_distilbertc                 C   s   dS )aY  Match position embedding path from input_ids to Gather for Roberta.

        Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id):
          (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Mul -- Cast(to=7) -- Add(B=1) -- Cast(to=7)* --> Gather
                                                |                              ^
                                                V                              |
                                                +------------------------------+

        Roberta new pattern from transformers v4.9:
           (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Add(B=0) -- Mul -- Cast(to=7) -- Add(B=1) --> Gather
                                                |                                           ^
                                                V                                           |
                                                +-------------------------------------------+

        start_node = position_embedding_gather
        start_index = 1

        # match optional Cast node.
        parent = self.model.get_parent(start_node, start_index, output_name_to_node)
        if parent is None:
            return
        if parent.op_type == "Cast":
            if OnnxModel.get_node_attribute(parent, "to") != 7:
                return
            start_node = parent
            start_index = 0

        i, path, return_indices = self.model.match_parent_paths(
            start_node,
            [ (['Add', 'Cast', 'Mul', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0]),
              (['Add', 'Cast', 'Mul', 'Add', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0, 0])],
            output_name_to_node)

        if path is not None:
            # constant input of Add shall be 1.
            i, value = self.model.get_constant_input(path[0])
            if value != 1:
                return False

            _, self.padding_word_id = self.model.get_constant_input(path[-1])

            return input_ids == path[-1].input[0]
        Fr   r   rI   rJ   rK   r   r   r     match_position_embedding_roberta   s    -z7FusionEmbedLayerNoMask.match_position_embedding_robertac                 C   s  | j |ddgddg|}|dkr&dS |\}}| j |jd }|dk	rt|jdkr|jd dkr| j|ddgr| j|ddgrt|jd	ks| j|d	dgsdS | j  }|d
k rt	
|ddgsdS n| j|ddgsdS | j |d|}	|	dkrdS |	jdkr<| j|	dds*dS | j |	d|}
n|	}
|
dksV|
jdkrZdS | j|
ddspdS | j |
d|}|dks|jdkrdS ||jd kS )a	    Match position embedding path from input_ids to Gather for BERT.

        BERT Embedding Layer Pattern:
                                    (input_ids)
                                   /                                          /          Shape
                                /              |
                              /              Gather (indices=1)
                             /                  |
                            /                  Add (optional, B=0)
                           /                    |
                        Gather (segment_ids) Unsqueeze (axes=0)
                           \        |           |
                            \     Gather      Slice (data[1,512], starts=0, ends=*, axes=1, steps=1)
                              \    /            |
                                Add          Gather
                                   \       /
                                      Add
                                       |
                                LayerNormalization
        SlicerD   r$   r7   NFr            Zaxesr4   r#   r8   )r   r%   get_constant_valuer;   r=   rM   r   rH   Zget_opset_versionr	   Zcheck_node_attributeZ
get_parentr/   )r   rI   rJ   rK   pathsliceZ	unsqueezeZslice_weightZopset_versionrA   ZgatherrM   r   r   r    match_position_embedding_bert   s^    

z4FusionEmbedLayerNoMask.match_position_embedding_bertc                 C   s(   |  |||rdS | |||r$dS dS )NTF)rY   rO   rP   r   r   r    match_position_embedding:  s
    z/FusionEmbedLayerNoMask.match_position_embeddingc                 C   s  |j d }|r|j d nd}|j d }| jsB| jjdd| _d| _| jdk	r| j|}| j|}|rl|sptt|dkrt|dkr|d |d kst	d| d|  dS |r| j
||st	d	| d
| j|  dS | j|j d }	|	dkst|	jdkr t	d dS | j|j d }
|
dksbt|
jdksb|	jd |
jd krpt	d dS |r| j|j d }|dkst|jdks|	jd |jd krt	d dS |	jd |
jd krtd|j d  d|	jd  d|j d  d|
jd   |r|	jd |jd krltd|j d  d|	jd  d|j d  d|jd   |
jd |jd krtd|j d  d|
jd  d|j d  d|jd   dS )zXSanity check of embedding weights, and match hidden_size of weights and shape of inputs.r$   NT)updater7   z^Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: z vs FzYCannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: z != r   zICannot fuse EmbedLayerNormalization: word embedding table is not expectedzMCannot fuse EmbedLayerNormalization: position embedding table is not expectedzLCannot fuse EmbedLayerNormalization: segment embedding table is not expectedzword_embedding_table (z) size z <= position_embedding_table (z <= segment_embedding_table (zposition_embedding_table ()r;   r   r   Zinfer_runtime_shaper   Zget_edge_shapeAssertionErrorr=   r>   infoZcompare_shaperV   rM   warning)r   word_embedding_gathersegment_embedding_gatherrI   rJ   segment_idsposition_idsZinput_ids_shapeZposition_ids_shapeZword_embedding_tableZposition_embedding_tableZsegment_embedding_tabler   r   r    check_embeddingH  s|    







222z&FusionEmbedLayerNoMask.check_embedding)
input_namer"   c                 C   sX   d}| j |}|dk	r@|jjjtjkr:| j|\}}qP|}n| j|\}}||fS )a  Cast a graph input or node input to int32.

        Args:
            input_name (str): name of graph input or node input

        Returns:
            A tuple of casted input name and the cast node.
            int32_output (str): If input is int32, it is the input name, Otherwise it is output name of Cast node.
            input_cast_node (Union[None, NodeProto]): Cast node. It could be None if input is int32.
        N)	r   find_graph_inputtypeZtensor_typeZ	elem_typer   ZINT32r   Zcast_input_to_int32)r   rd   Zinput_cast_nodeZgraph_inputZint32_outputr   r   r    cast_to_int32  s    z$FusionEmbedLayerNoMask.cast_to_int32F)rJ   r)   r_   rI   r`   rb   c	                 C   s  g }	|  |\}}
| jd}|jdkr>|jd }|jd }n|jd }|jd }d}|dk	r|  |jd \}}
|||jd |jd |jd ||g}n|d|jd |jd d||g}|dk	r|d |  |\}}
|| |d	 |d
 g}|r|dk	r|n|d }|| tjd|||d}d|_|j	D ] }|j
dkr4|j	|g q4t|j	dkr||j	tddg |	| |	D ]}| j| j|j
< q| j|	 || _|S )ag  Create an EmbedLayerNormalization node. Note that segment embedding is optional.

        Args:
            input_ids (str): input_ids for word embeddings
            layernorm (NodeProto): LayerNormalization or SkipLayerNormalization node.
            word_embedding_gather (NodeProto): the Gather node for word embedding
            position_embedding_gather (NodeProto): the Gather node for position embedding
            segment_embedding_gather (Union[None, NodeProto]): the Gather node for segment embedding, or None.

        Returns:
            NodeProto: the EmbedLayerNormalization node created.
        r   r   r$   r7   rS   Nr    _outputZ_dummy_mask_indexZ_embedding_sum)outputsnamezcom.microsoftepsilong-q=)rg   r   Zcreate_node_namer/   r;   appendr   Z	make_nodedomain	attributerk   extendr=   Zmake_attributeZthis_graph_nameZnode_name_to_graph_namenodes_to_addr   )r   rJ   r)   r_   rI   r`   rb   embedding_sum_outputembedding_sum_namerq   rN   Z	node_namegammabetaZembed_node_inputsra   Zembed_node_outputsrk   r   ZattrA   r   r   r    create_fused_node  sj    









z(FusionEmbedLayerNoMask.create_fused_nodec                 C   s$   | j |jd |jd  d| _d S )Nr   T)r   replace_input_of_all_nodesr9   Zprune_graph)r   r)   r   r   r   r    finish_fusion  s    z$FusionEmbedLayerNoMask.finish_fusionc                 C   s*   |j dko(t|jdko(t|jd dkS )Nr   rS   r   )r/   r=   r9   )r   rA   r   r   r    "is_skip_layer_norm_with_sum_output  s    z9FusionEmbedLayerNoMask.is_skip_layer_norm_with_sum_outputc              
   C   s  |  |}|d krdS |\}}|jd }	|jd }
| j||ddsFdS | |d |sXdS |jdkr| |}d}|}|r|jd nd }|d k	o| j|d k	}n|}|jdkrdnd}t	|j|kr|j| nd }|d k	o| j|d k	}|o||kot	|| dk}|d k	o(|jdkp(|p(|}| j
|	|||||
||rF|nd d}|rxd	|j|< |sx| j||jd
  | || dS )NFr$   r+   r   rS   r4   r   )rr   rs   Z_no_use__to_be_removed_r7   T)r(   r;   rC   rc   r/   ry   r9   r   Zfind_graph_outputr=   rv   rw   rx   )r   r)   add_before_layernormr*   rK   optional_segment_gather
two_gatherr_   rI   rJ   rb   Zneed_embedding_sum_outputZsum_output_indexZnode_with_sum_outputZ
sum_outputZis_sum_graph_outputZis_sum_used_by_multiple_nodesr   r   r   r    	fuse_gpt2  sX    






z FusionEmbedLayerNoMask.fuse_gpt2c           
      C   s   |  |}|dkrdS |\}}|jd }| j||dds<dS | |||sNdS | |d|s`dS | ||||d}	| ||	 dS )a  Fuse embedding layer for DistilBert
        Args:
            layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
            add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
        NFr$   Trz   )r(   r;   rC   rZ   rc   rv   rx   )
r   r)   r{   r*   rK   r}   r_   rI   rJ   r   r   r   r    fuse_distilbertd  s(    

    z&FusionEmbedLayerNoMask.fuse_distilbertc                 C   s   | j |dgdg}|dkr dS | |d }|dkr:dS |\}}|jd }	| j||dds`dS | j |dgdg}
|
dkrdS |
d }| ||	|s| ||	|sdS |}|}|}| |||sdS | |	||||}| || dS )	a  Fuse embedding layer for Bert
        Args:
            layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
            add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
        r4   r   NFr$   rz   r#   T)	r   r%   r(   r;   rC   rZ   rc   rv   rx   )r   r)   r{   r*   rK   Zadd_2_gatherr}   r_   r`   rJ   Zposition_embedding_pathrI   tempr   r   r   r    	fuse_bert  s>    	
z FusionEmbedLayerNoMask.fuse_bertc           	      C   s  | j |dgdg}|jdkr8|d kr*d S |d }d }n| j |dgdg}| j |dgdg}|d kr|d k	r|d kr|d S |d }|d }nJ|d k	r|d kr| j |dgdg}|d krd S |d }|d }n|}d }| |||||rd S | ||||rd S | ||||rd S d S )Nr4   r   r   r#   r$   )r   r%   r/   r~   r   r   )	r   rA   r*   rK   Zfirst_add_pathr{   r|   r&   r'   r   r   r    fuse  sB    


    zFusionEmbedLayerNoMask.fuse)r   )NFN)N)__name__
__module____qualname____doc__r   strr   r
   r   r   r(   r   r   boolrC   rO   rQ   rY   rZ   rc   rg   r   rv   rx   ry   r~   r   r   r   __classcell__r   r   r   r    r      s@    T>/HJ    
b 
Q)2r   c                       s8   e Zd Zd	ed fddZdd Z fddZ  ZS )
FusionEmbedLayerNormalizationF)r   c                    s   t  |d || _d S )Nz	with mask)r   r   use_mask_index)r   r   r   r   r   r    r     s    z&FusionEmbedLayerNormalization.__init__c                 C   s   | j }t|jdkr0|j| td|j nDt|jdkrb|jd sb||jd< td|j ntd|j d S |D ]H}td|j |jdkr|jd |jd< qx|jd	krx|jd |jd
< qxd S )N   zappend mask to %szreplace mask in %szskip mask in %szupdate mask_index in %sr,   r$   rS   r5   rT   )	r   r=   r;   rm   r>   r?   rk   r/   r9   )r   
mask_int32attention_nodesr   Zattention_noder   r   r    replace_mask  s    


z*FusionEmbedLayerNormalization.replace_maskc                    sh  d | _ d | _d | _t ||| | jd kr0d S | jsNtd | d d S | j d krz| jd krztd | d d S | j r| j j	d }n| jj	d }|| }| j
|rdd |D }| || | d d S ||krtd	| | d d S || }|jd
krddd |D }|jdkrN|j	d }t|t|krN| j| | || | d d S )NzG--use_mask_index is not set: EmbedLayerNormalization will not have maskz EmbedLayerNormalization(no mask)zLEmbedLayerNormalization will not have mask since attention node is not foundrS   rT   c                 S   s   g | ]}|j d kr|qS )r,   r5   r.   r1   rA   r   r   r    r3     s     
 z6FusionEmbedLayerNormalization.fuse.<locals>.<listcomp>z"EmbedLayerNormalization(with mask)zHEmbedLayerNormalization will not have mask since %s is not a node output)	ReduceSumrF   c                 S   s   g | ]}|j d kr|qS r   r.   r   r   r   r    r3   %  s     
 r   r   )r   r<   r   r   r   r   r>   r?   Zincrease_counterr;   r   re   r   r/   r=   Znodes_to_removerm   )r   rA   r*   rK   r   Zchildren_nodesr   r   r   r    r     sH    







z"FusionEmbedLayerNormalization.fuse)F)r   r   r   r   r   r   r   r   r   r   r   r    r     s   r   N)loggingr   typingr   r   r   r   r   Zfusion_baser   Zfusion_utilsr	   Zonnxr
   r   r   Z
onnx_modelr   r   r>   r   r   r   r   r   r    <module>   s        X