U
    T?hf^                     @   s0  d dl Z d dlZd dlZd dlZd dlZd dlmZ dddgZd'ddZdd	 Z	d
d Z
d(ddZd)ddZdd Zdd Zdd Zdd Zd*ddZdd Zdd Zdd  Zd!d" Zed#kr,e Zed$e d d%lmZ eej ejsej std&eeZnejZeeeZ e D ]Z!ee! qdS )+    N)TensorProtoZScanZLoopIfc                 C   s  t  }|jdddtdd |jdddtdd |jd	d
dtddd |jdddtddd |jddtddd |jddtddd |jddtddd |jddtddd |jddtddd |jddtd d d |jd!dtd d"d |jd#dtd d$d |jd%dd&d'd(d)d&gd*d+ |jd,d-dd.d/d0 |jdd1 |jd2dtd3d4d |jd5dd.d6d0 |jdd7 |jd8dd.d9d0 |jdd: |jd;d<dd.d= |jdd> || S )?Nz-iz--inputFz2Set the input file for reading the profile results)requiredtypehelpz-mz--modelzIonnx model path to run profiling. Required when --input is not specified.z-bz--batch_size   zbatch size of input)r   r   defaultr   z-sz--sequence_length    zsequence length of inputz--past_sequence_lengthzpast sequence length for gpt2z--global_lengthz&number of global tokens for longformerz	--samplesi  z\number of samples to test. Set it large enough to reduce the variance of performance result.z--thresholdg{Gz?zfThreshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.z--thread_numznumber of threads to usez--input_ids_namez"input name for input IDs, for bertz--segment_ids_namez$input name for segment IDs, for bertz--input_mask_namez'input name for attention mask, for bertz--dummy_inputsr   bertgpt2
longformerzEType of model inputs. The default will create dummy inputs with ones.)r   r   choicesr   z-gz	--use_gpu
store_truezuse GPU)r   actionr   )use_gpuz
--providercudazExecution provider to usez--basic_optimizationz_Enable only basic graph optimizations. By default, all optimizations are enabled in OnnxRuntime)basic_optimizationz--kernel_time_onlyz.Only include the kernel time and no fence time)kernel_time_onlyz-vz	--verbose)r   r   )verbose)argparseArgumentParseradd_argumentstrintfloatset_defaults
parse_args)argvparser r    S/var/www/html/venv/lib/python3.8/site-packages/onnxruntime/transformers/profiler.pyparse_arguments   s    		
r"   c                 C   sD   ddl m} || ||| |dd}|D ]}|d |}	q&| }
|
S )Nr   )create_onnxruntime_sessionT)Zenable_all_optimizationnum_threadsZenable_profiling)benchmark_helperr#   runZend_profiling)Zonnx_model_pathr   providerr   
thread_num
all_inputsr#   sessioninputs_profile_filer    r    r!   run_profile   s    	r.   c              	   C   s@   t d|  d t| }t|}W 5 Q R X t|ts<t|S )Nzloading profile output z ...)printopenjsonload
isinstancelistAssertionError)r-   Zopened_file	sess_timer    r    r!   load_profile_json   s
    
r7   c                 C   sV  i }i }i }d}d}| D ]}|d dkr8|d dkr8d}|s>q|d dkrd	|krd
|krd|d
 kr|d }|d
 d }	|	t krq|	sd| d}	||kr||  |d	 7  < ||  d7  < n|d	 ||< d||< |	||< ||d	 7 }q|sdgS g }
|
d|d dd |
d |
d t| dd ddD ]f\}}|| }||k rZq<|| }|t| }|
|dd|d dd|dd|dd| 	 q<i }| D ]6\}}	|| }|	|kr||	  |7  < n|||	< q|
d |
d |
d  t| d!d ddD ]4\}	}|| }|
|dd|d dd|	  q|
S )"  Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.

    Args:
        sess_time (List[Dict]): profile data
        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
        threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.

    Returns:
        List[str]: lines of string for output.
    r   FcatSessionnameZsession_initializationTZKerneldurargsop_name()r   zNo kernel record found!z%
Top expensive kernels with Time% >= d   .2f:@----------------------------------------------------------------u&   Total(μs)	Time%	Calls	Avg(μs)	Kernelc                 S   s   | d S Nr   r    xr    r    r!   <lambda>       z&parse_kernel_results.<locals>.<lambda>keyreverse10d	      Y@5.2f5d8.1fz
Group kernel time by operator:u   Total(μs)	Time%	Operatorc                 S   s   | d S rE   r    rF   r    r    r!   rH     rI   )NODES_TYPE_CONTAINING_SUBGRAPHappendsorteditemsr   )r6   	thresholdZkernel_name_to_op_namekernel_timeZkernel_freqtotalZsession_inititemZkernel_namer>   linesdurationratiocallsavg_timeZop_timer    r    r!   parse_kernel_results   sd    (


6



&r`   Fc                 C   s  g }i }i }i }d}| D ]2}|d dkrd|krd|krd|d kr|d  dd	 d
d	 dd	}	d|d kr|d d dkrd}
n*|d d dkrd}
n|d d dkrd}
|	|kr|
||	< q||	 |
kstn|rq|d d }|tkrq|	|kr"||	  |d 7  < ||	  d7  < n|d ||	< d||	< ||	 ||d 7 }qdddg}d}|D ]}	||	 }||	 }|t| }|| d }||	d	}||7 }||dd|dd|dd|dd|dd|dd|	  q`|d|d  d!d" |d |d# t| d$d% d&d'D ]\}	}|| }||k rBq$||	 }|t| }|| d }||	d	}||dd|dd|dd|dd|dd|	  q$|S )(r8   r   r9   Noder<   r=   r>   r;   Z_kernel_time Z_fence_beforeZ_fence_afterr'   ZCPUExecutionProviderZCPUZCUDAExecutionProviderCUDAZDmlExecutionProviderZDMLr   z
Nodes in the original order:rD   u3   Total(μs)	Time%	Acc %	Avg(μs)	Calls	Provider	Nodeg        rO   rM   rN   rP   rR   rQ   8sz#
Top expensive nodes with Time% >= rA   rB   rC   u-   Total(μs)	Time%	Avg(μs)	Calls	Provider	Nodec                 S   s   | d S rE   r    rF   r    r    r!   rH   V  rI   z$parse_node_results.<locals>.<lambda>TrJ   )replacer5   rS   rT   r   getrU   rV   )r6   r   rW   Znode_name_listZ	node_timeZ	node_freqZnode_providerrY   rZ   Z	node_nameZdevicer>   r[   Zbefore_percentager\   r^   r_   
percentager'   r]   r    r    r!   parse_node_results  sv    
(


6


:rh   c                 C   sD  i }i }d}i }i }i }i }	d}
i }| D ]}|d dkr(d|kr(d|kr(d|d kr(|d d }|t krlq(d|d krd|d	 kr(||	kr|	|  |d 7  < n|d |	|< |
|d 7 }
q(|d dd
}||kr||  d7  < nd||< | d| }||kr,||  |d 7  < ||  d7  < n|d ||< d||< ||kr`||  |d 7  < n|d ||< ||kr||  |d 7  < ||  d7  < n|d ||< d||< ||d 7 }q(d
dg}|d |d t| dd ddD ]\}}|	|d}|| }|| }|||
  }|| }|| }||dd|d dd|dd|d dd|dd|dd|dd|  q|d
dg7 }|d |d t| dd ddD ]\}}|d}|d }|d }|dd
}|| }|| }|||  }||dd|d dd|dd|dd|d d|  q|S )!a  Group results by operator name.

    Args:
        sess_time (List[Dict]): profile data
        kernel_time_only (bool): Only include items for kernel time.
        use_gpu (bool): GPU is used in profiling or not.

    Returns:
        List[str]: lines of string for output.
    r   r9   ra   r<   r=   r>   r'   Zfencer;   rb   r   rC   zGrouped by operatorrD   uM   Total(μs)	Time%	Kernel(μs)	Kernel%	Calls	AvgKernel(μs)	Fence(μs)	Operatorc                 S   s   | d S rE   r    rF   r    r    r!   rH     rI   z$group_node_results.<locals>.<lambda>TrJ   rM   rN   rO   rP   Z11drQ   z14.1fzGrouped by provider + operatoru<   Kernel(μs)	Provider%	Calls	AvgKernel(μs)	Provider	Operatorc                 S   s   | d S rE   r    rF   r    r    r!   rH     rI   ZExecutionProviderz9.2frd   )rS   rf   rT   rU   rV   splitre   )r6   r   r   Zop_kernel_timeZop_kernel_recordsZtotal_kernel_timeZprovider_op_kernel_timeZprovider_op_kernel_recordsZprovider_kernel_timeZop_fence_timeZtotal_fence_timeZprovider_counterrZ   r>   r'   rK   r[   rX   Z
fence_timeZkernel_time_ratio
total_timeZ
time_ratioZkernel_callsZavg_kernel_timepartsZshort_epr^   Zprovider_time_ratior    r    r!   group_node_resultsd  s    
(




F


2rl   c                 C   s&   t | dtkr"t| | dS d S )Nvalue)r   Z
WhichOneofr   getattr)dimr    r    r!   get_dim_from_type_proto  s    rp   c                 C   s   dd | j jjD S )Nc                 S   s   g | ]}t |qS r    )rp   ).0dr    r    r!   
<listcomp>  s     z-get_shape_from_type_proto.<locals>.<listcomp>)tensor_typeshapero   )Z
type_protor    r    r!   get_shape_from_type_proto  s    rv   c                    s  i  |   D ]}t|j}g }t|D ]\}}t|tr&|| q&t|dkrV dS t|dkrn|||d < t|dkr|||d < |jjj	}	|	t
jt
jt
jfkst|	t
jkrtjn|	t
jkrtjntj}
tj||
d}| |j< q fddt|D }|S )a  Create dummy inputs for ONNX model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        samples (int): number of samples

    Returns:
        List[Dict]: list of inputs
       Nr   r   Zdtypec                    s   g | ]} qS r    r    rq   r,   dummy_inputsr    r!   rs     s     z'create_dummy_inputs.<locals>.<listcomp>)'get_graph_inputs_excluding_initializersrv   r   	enumerater3   r   rT   lenrt   	elem_typer   FLOATINT32INT64r5   numpyfloat32int64int32onesr;   range)
onnx_model
batch_sizesequence_lengthsamplesgraph_inputru   Zsymbol_dimsiro   r   	data_typedatar)   r    rz   r!   create_dummy_inputs  s.    


r   c                 C   sB   ddl m}m} || |||\}	}
}||||dd|	|
|dd	}|S )a-  Create dummy inputs for BERT model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        samples (int): number of samples
        input_ids_name (str, optional): Name of graph input for input IDs. Defaults to None.
        segment_ids_name (str, optional): Name of graph input for segment IDs. Defaults to None.
        input_mask_name (str, optional): Name of graph input for attention mask. Defaults to None.

    Returns:
        List[Dict]: list of inputs
    r   )find_bert_inputsgenerate_test_data{   F)Z
test_casesseedr   	input_idssegment_ids
input_maskZrandom_mask_length)Zbert_test_datar   r   )r   r   r   r   input_ids_namesegment_ids_nameinput_mask_namer   r   r   r   r   r)   r    r    r!   create_bert_inputs  s    r   c                    s   ||||| d}i  |   D ]}t|j}t|D ]6\}}	t|	tr4|	|kr^td|	 q4||	 ||< q4|jjj}
|
t	j
t	jt	jfkst|
t	j
krtjn|
t	jkrtjntj}tj||d}| |j< q fddt|D }|S )a  Create dummy inputs for GPT-2 model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        past_sequence_length (int): past sequence length
        samples (int): number of samples

    Raises:
        RuntimeError: symbolic is not supported. Use the tool convert_to_onnx.py to export ONNX model instead.

    Returns:
        List[Dict]: list of inputs
    )r   Zseq_lenZpast_seq_lenZtotal_seq_lensymbol is not supported: rx   c                    s   g | ]} qS r    r    ry   rz   r    r!   rs   O  s     z&create_gpt2_inputs.<locals>.<listcomp>)r|   rv   r   r}   r3   r   RuntimeErrorrt   r   r   r   r   r   r5   r   r   r   r   r   r;   r   )r   r   r   past_sequence_lengthr   symbolsr   ru   r   ro   r   r   r   r)   r    rz   r!   create_gpt2_inputs#  s.    


r   c                    s  ||d}i  |   D ]}t|j}t|D ]6\}}	t|	tr,|	|krVtd|	 q,||	 ||< q,|jjj}
|
t	j
t	jt	jfkst|
t	j
krtjn|
t	jkrtjntj}d|jkrtj||d}d|ddd|f< ntj||d}| |j< q fddt|D }|S )	a  Create dummy inputs for Longformer model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        global_length (int): number of global tokens
        samples (int): number of samples

    Raises:
        RuntimeError: symbolic is not supported. Use the tool convert_longformer_to_onnx.py to export ONNX model instead.

    Returns:
        List[Dict]: list of inputs
    )r   r   r   globalrx   r   Nc                    s   g | ]} qS r    r    ry   rz   r    r!   rs   ~  s     z,create_longformer_inputs.<locals>.<listcomp>)r|   rv   r   r}   r3   r   r   rt   r   r   r   r   r   r5   r   r   r   r   r;   Zzerosr   r   )r   r   r   global_lengthr   r   r   ru   r   ro   r   r   r   r)   r    rz   r!   create_longformer_inputsS  s,    




r   c                 C   s@   t | }t||j}|t||j|j7 }|t||j|j7 }|S )N)r7   r`   rW   rh   r   rl   r   )r-   r=   Zprofile_recordsr[   r    r    r!   process_results  s
    r   c                 C   s  | j dkr| j n
tjdd}dtjkr4t|tjd< ddlm} ddlm	} ||| j
}d }| jdkrt|| j| j| j| j| j| j}n\| jdkrt|| j| j| j| j}n8| jd	krt|| j| j| j| j}nt|| j| j| j}t| j
| j| j| j| j |}|S )
Nr   F)ZlogicalZOMP_NUM_THREADS)r2   )	OnnxModelr   r   r   )r(   psutil	cpu_countosenvironr   onnxr2   r   r   modelr{   r   r   r   r   r   r   r   r   r   r   r   r   r.   r   r'   r   )r=   r$   r2   r   r   r)   r-   r    r    r!   r&     sV    

	

	r&   __main__	Arguments)setup_loggerzMrequires either --model to run profiling or --input to read profiling results)N)r   )Fr   )NNN)"r   r1   r   r   r   r   r   rS   r"   r.   r7   r`   rh   rl   rp   rv   r   r   r   r   r   r&   __name__	argumentsr/   r%   r   r   inputr   r5   r-   resultsliner    r    r    r!   <module>   sH   


 

P
Wd/   
)0/6




