"""Benchmarking the inference of pretrained transformer models.
    PyTorch/TorchScript benchmark is based on https://github.com/huggingface/transformers/blob/master/examples/benchmarks.py.
    One difference is that random input_ids are generated in this benchmark.

    For onnxruntime, this script will convert a pretrained model to ONNX, and optimize it when -o parameter is used.

    Example commands:
        Export all models to ONNX, optimize and validate them:
            python benchmark.py -b 0 -o -v -i 1 2 3
        Run OnnxRuntime on GPU for all models:
            python benchmark.py -g
        Run OnnxRuntime on GPU for all models with fp32 optimization:
            python benchmark.py -g -o
        Run OnnxRuntime on GPU with fp16 optimization:
            python benchmark.py -g -o -p "fp16"
        Run TorchScript on GPU for all models:
            python benchmark.py -e torchscript -g
        Run TorchScript on GPU for all models with fp16:
            python benchmark.py -e torchscript -g -p "fp16"
        Run OnnxRuntime and TorchScript on CPU for all models with quantization:
            python benchmark.py -e torchscript onnxruntime -p "int8" -o
        Run OnnxRuntime with the ROCM provider and graph optimization script:
            python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm
        Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support:
            python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm

    It is recommended to use run_benchmark.sh to launch the benchmark.
"""

import argparse
import logging
import os
import timeit
from datetime import datetime

import numpy
import psutil
from benchmark_helper import (
    ConfigModifier,
    OptimizerInfo,
    Precision,
    create_onnxruntime_session,
    get_latency_result,
    inference_ort,
    inference_ort_with_io_binding,
    output_details,
    output_fusion_statistics,
    output_summary,
    setup_logger,
)
from fusion_options import FusionOptions
from huggingface_models import MODEL_CLASSES, MODELS
from onnx_exporter import (
    create_onnxruntime_input,
    export_onnx_model_from_pt,
    export_onnx_model_from_tf,
    load_pretrained_model,
)
from packaging import version
from quantize_helper import QuantizeHelper

logger = logging.getLogger("")

cpu_count = psutil.cpu_count(logical=False)

# Set the OMP thread count before torch and onnxruntime are imported.
if "OMP_NUM_THREADS" not in os.environ:
    os.environ["OMP_NUM_THREADS"] = str(cpu_count)

import torch  # noqa: E402
from transformers import AutoConfig, AutoTokenizer, LxmertConfig  # noqa: E402


def run_onnxruntime(
    use_gpu, provider, model_names, model_class, config_modifier, precision, num_threads,
    batch_sizes, sequence_lengths, repeat_times, input_counts, optimizer_info, validate_onnx,
    cache_dir, onnx_dir, verbose, overwrite, disable_ort_io_binding, use_raw_attention_mask,
    model_fusion_statistics, model_source, enable_arm64_bfloat16_fastmath_mlas_gemm, args,
):
    import onnxruntime

    results = []
    if use_gpu and not any(
        ep in onnxruntime.get_available_providers()
        for ep in ("CUDAExecutionProvider", "ROCMExecutionProvider", "DmlExecutionProvider")
    ):
        logger.error(
            "Please install onnxruntime-gpu or onnxruntime-directml package instead of onnxruntime, "
            "and use a machine with GPU for testing gpu performance."
        )
        return results

    warm_up_repeat = 0
    if provider == "tensorrt":
        optimizer_info = OptimizerInfo.NOOPT
        warm_up_repeat = 5
        if "TensorrtExecutionProvider" not in onnxruntime.get_available_providers():
            logger.error(
                "Please install onnxruntime-gpu-tensorrt package, and use a machine with GPU "
                "for testing gpu performance."
            )
            return results

    if optimizer_info == OptimizerInfo.NOOPT:
        logger.warning(
            f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied."
        )

    for model_name in model_names:
        all_input_names = MODELS[model_name][0]
        for num_inputs in input_counts:
            if num_inputs > len(all_input_names):
                break
            input_names = all_input_names[:num_inputs]
            fusion_options = FusionOptions.parse(args)

            # Export the pretrained model to ONNX, then optimize and validate it on demand.
            if "pt" in model_source:
                with torch.no_grad():
                    onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_pt(
                        model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3],
                        model_class, config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision,
                        optimizer_info, validate_onnx, use_raw_attention_mask, overwrite,
                        model_fusion_statistics, fusion_options,
                    )
            if "tf" in model_source:
                onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_tf(
                    model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3],
                    model_class, config_modifier, cache_dir, onnx_dir, input_names, use_gpu, precision,
                    optimizer_info, validate_onnx, use_raw_attention_mask, overwrite,
                    model_fusion_statistics, fusion_options,
                )

            if not is_valid_onnx_model:
                continue

            ort_session = create_onnxruntime_session(
                onnx_model_file, use_gpu, provider,
                enable_all_optimization=True,
                num_threads=num_threads,
                verbose=verbose,
                enable_mlas_gemm_fastmath_arm64_bfloat16=enable_arm64_bfloat16_fastmath_mlas_gemm,
            )
            if ort_session is None:
                continue

            ort_output_names = [node_arg.name for node_arg in ort_session.get_outputs()]
            output_buffers = []
            device = "cuda" if use_gpu else "cpu"
            config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
            max_last_state_size = numpy.prod(
                [max(batch_sizes), max(sequence_lengths), max_sequence_length, config.hidden_size]
            )
            max_pooler_size = numpy.prod([max(batch_sizes), config.hidden_size])

            for batch_size in batch_sizes:
                if batch_size <= 0:  # batch_size 0 means export only
                    continue
                for sequence_length in sequence_lengths:
                    if max_sequence_length is not None and sequence_length > max_sequence_length:
                        continue

                    input_value_type = numpy.int64 if "pt" in model_source else numpy.int32
                    ort_inputs = create_onnxruntime_input(
                        vocab_size, batch_size, sequence_length, input_names, config, input_value_type
                    )
                    result_template = {
                        "engine": "onnxruntime",
                        "version": onnxruntime.__version__,
                        "providers": provider,
                        "device": device,
                        "optimizer": optimizer_info,
                        "precision": precision,
                        "io_binding": not disable_ort_io_binding,
                        "model_name": model_name,
                        "inputs": num_inputs,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }

                    if config.model_type in ["vit", "swin"]:
                        logger.info(
                            f"Run onnxruntime on {model_name} with input shape "
                            f"{[batch_size, 3, config.image_size, config.image_size]}"
                        )
                    else:
                        logger.info(f"Run onnxruntime on {model_name} with input shape {[batch_size, sequence_length]}")

                    if disable_ort_io_binding:
                        result = inference_ort(
                            ort_session, ort_inputs, result_template, repeat_times, batch_size, warm_up_repeat
                        )
                    else:
                        # Get output shapes from a dry run, then bind pre-allocated output buffers.
                        ort_outputs = ort_session.run(ort_output_names, ort_inputs)
                        output_buffer_max_sizes = []
                        for i in range(len(ort_outputs)):
                            if i == 2 and MODELS[model_name][3] == "gpt":
                                # GPT models use a different buffer size for this output.
                                output_buffer_max_sizes.append(max_pooler_size)
                            else:
                                output_buffer_max_sizes.append(max_last_state_size)

                        data_type = numpy.longlong if "pt" in model_source else numpy.intc
                        result = inference_ort_with_io_binding(
                            ort_session, ort_inputs, result_template, repeat_times, ort_output_names,
                            ort_outputs, output_buffers, output_buffer_max_sizes, batch_size, device,
                            data_type, warm_up_repeat,
                        )
                    logger.info(result)
                    results.append(result)

    return results


def run_pytorch(
    use_gpu, model_names, model_class, config_modifier, precision, num_threads,
    batch_sizes, sequence_lengths, repeat_times, torchscript, torch2, cache_dir, verbose,
):
    results = []
    if use_gpu and not torch.cuda.is_available():
        logger.error("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.")
        return results

    torch.set_grad_enabled(False)

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript, cache_dir=cache_dir)
        config_modifier.modify(config)
        model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class)

        if config.model_type == "vit":
            max_input_size = None
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
            max_input_size = tokenizer.max_model_input_sizes.get(model_name)

        logger.debug(f"Model {model}")
        logger.debug(f"Number of parameters {model.num_parameters()}")

        if precision == Precision.FLOAT16:
            model.half()

        device = torch.device("cuda:0" if use_gpu else "cpu")
        model.to(device)

        if precision == Precision.INT8:
            model = QuantizeHelper.quantize_torch_model(model)

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue
            for sequence_length in sequence_lengths:
                if config.model_type == "vit":
                    logger.info(
                        f"Run PyTorch on {model_name} with input shape "
                        f"{[batch_size, 3, config.image_size, config.image_size]}"
                    )
                    input_ids = torch.randn(
                        size=(batch_size, 3, config.image_size, config.image_size),
                        dtype=torch.float16 if precision == Precision.FLOAT16 else torch.float32,
                        device=device,
                    )
                else:
                    if max_input_size is not None and sequence_length > max_input_size:
                        continue
                    logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, sequence_length]}")
                    input_ids = torch.randint(
                        low=0,
                        high=config.vocab_size - 1,
                        size=(batch_size, sequence_length),
                        dtype=torch.long,
                        device=device,
                    )
                try:
                    inference = (
                        torch.jit.trace(model, input_ids)
                        if torchscript
                        else torch.compile(model)
                        if torch2
                        else model
                    )
                    inference(input_ids)  # warm up

                    runtimes = timeit.repeat(lambda: inference(input_ids), repeat=repeat_times, number=1)

                    result = {
                        "engine": "torchscript" if torchscript else "torch2" if torch2 else "torch",
                        "version": torch.__version__,
                        "providers": "NA",
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    torch.cuda.empty_cache()

    return results


def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
    from functools import wraps

    import tensorflow as tf

    def run_func(func):
        @wraps(func)
        def run_in_eager_mode(*args, **kwargs):
            return func(*args, **kwargs)

        @wraps(func)
        @tf.function(experimental_compile=use_xla)
        def run_in_graph_mode(*args, **kwargs):
            return func(*args, **kwargs)

        if do_eager_mode is True:
            assert (
                use_xla is False
            ), "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`."
            return run_in_eager_mode
        else:
            return run_in_graph_mode

    return run_func


def run_tensorflow(
    use_gpu, model_names, model_class, config_modifier, precision, num_threads,
    batch_sizes, sequence_lengths, repeat_times, cache_dir, verbose,
):
    results = []

    import tensorflow as tf

    tf.config.threading.set_intra_op_parallelism_threads(num_threads)

    if not use_gpu:
        tf.config.set_visible_devices([], "GPU")

    if use_gpu and not tf.test.is_built_with_cuda():
        logger.error("Please install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.")
        return results

    if use_gpu:  # restrict TensorFlow to the first GPU
        physical_devices = tf.config.list_physical_devices("GPU")
        try:
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
            tf.distribute.OneDeviceStrategy(device="/gpu:0")
        except RuntimeError as e:
            logger.exception(e)

    if precision == Precision.FLOAT16 or precision == Precision.INT8:
        raise NotImplementedError("Mixed precision is currently not supported.")

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
        config_modifier.modify(config)

        model = load_pretrained_model(
            model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class, is_tf_model=True
        )

        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue
            for sequence_length in sequence_lengths:
                max_input_size = tokenizer.max_model_input_sizes.get(model_name)
                if max_input_size is not None and sequence_length > max_input_size:
                    continue

                logger.info(f"Run Tensorflow on {model_name} with input shape {[batch_size, sequence_length]}")

                import random

                rng = random.Random()
                values = [rng.randint(0, config.vocab_size - 1) for i in range(batch_size * sequence_length)]
                input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)

                try:
                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def encoder_forward():
                        return model(input_ids, training=False)

                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def encoder_decoder_forward():
                        return model(input_ids, decoder_input_ids=input_ids, training=False)

                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def lxmert_forward():
                        feats = tf.random.normal([1, 1, config.visual_feat_dim])
                        pos = tf.random.normal([1, 1, config.visual_pos_dim])
                        return model(input_ids, visual_feats=feats, visual_pos=pos, training=False)

                    inference = encoder_forward
                    if config.is_encoder_decoder:
                        inference = encoder_decoder_forward
                    if isinstance(config, LxmertConfig):
                        inference = lxmert_forward

                    inference()  # warm up

                    runtimes = timeit.repeat(lambda: inference(), repeat=repeat_times, number=1)

                    result = {
                        "engine": "tensorflow",
                        "version": tf.__version__,
                        "providers": "NA",
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    from numba import cuda

                    device = cuda.get_current_device()
                    device.reset()

    return results


def parse_arguments():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-m", "--models", required=False, nargs="+", type=str,
        default=["bert-base-cased", "roberta-base", "gpt2"],
        choices=list(MODELS.keys()),
        help="Pre-trained models in the list: " + ", ".join(MODELS.keys()),
    )
    parser.add_argument(
        "--model_source", required=False, nargs="?", default="pt",
        choices=["pt", "tf"], help="Export onnx from pt or tf",
    )
    parser.add_argument(
        "--model_class", required=False, type=str, default=None,
        choices=list(MODEL_CLASSES),
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
    )
    parser.add_argument(
        "-e", "--engines", required=False, nargs="+", type=str, default=["onnxruntime"],
        choices=["onnxruntime", "torch", "torch2", "torchscript", "tensorflow"],
        help="Engines to benchmark",
    )
    parser.add_argument(
        "-c", "--cache_dir", required=False, type=str, default=os.path.join(".", "cache_models"),
        help="Directory to cache pre-trained models",
    )
    parser.add_argument(
        "--onnx_dir", required=False, type=str, default=os.path.join(".", "onnx_models"),
        help="Directory to store onnx models",
    )
    parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="Run on gpu device")
    parser.add_argument("--provider", required=False, type=str, default=None, help="Execution provider to use")
    parser.add_argument(
        "-p", "--precision", type=Precision, default=Precision.FLOAT32, choices=list(Precision),
        help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization",
    )
    parser.add_argument("--verbose", required=False, action="store_true", help="Print more information")
    parser.add_argument("--overwrite", required=False, action="store_true", help="Overwrite existing models")
    parser.add_argument(
        "-o", "--optimizer_info", type=OptimizerInfo, default=OptimizerInfo.BYSCRIPT, choices=list(OptimizerInfo),
        help="Optimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_opt",
    )
    parser.add_argument("-v", "--validate_onnx", required=False, action="store_true", help="Validate ONNX model")
    parser.add_argument(
        "-f", "--fusion_csv", required=False, default=None,
        help="CSV file for saving summary results of graph optimization.",
    )
    parser.add_argument("-d", "--detail_csv", required=False, default=None, help="CSV file for saving detail results.")
    parser.add_argument("-r", "--result_csv", required=False, default=None, help="CSV file for saving summary results.")
    parser.add_argument(
        "-i", "--input_counts", required=False, nargs="+", default=[1], type=int, choices=[1, 2, 3],
        help="Number of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.",
    )
    parser.add_argument(
        "-t", "--test_times", required=False, default=100, type=int,
        help="Number of repeat times to get average inference latency.",
    )
    parser.add_argument("-b", "--batch_sizes", nargs="+", type=int, default=[1])
    parser.add_argument("-s", "--sequence_lengths", nargs="+", type=int, default=[4, 8, 16, 32, 64, 128, 256])
    parser.add_argument(
        "--disable_ort_io_binding", required=False, action="store_true",
        help="Disable running ONNX Runtime with bound inputs and outputs.",
    )
    parser.set_defaults(disable_ort_io_binding=False)
    parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[0], help="Threads to use")
    parser.add_argument(
        "--force_num_layers", required=False, type=int, default=None, help="Manually set the model's layer number"
    )
    parser.add_argument(
        "--enable_arm64_bfloat16_fastmath_mlas_gemm", required=False, action="store_true",
        help="Enable bfloat16 mlas gemm kernels on aarch64. Supported only for CPU EP",
    )
    parser.set_defaults(enable_arm64_bfloat16_fastmath_mlas_gemm=False)

    FusionOptions.add_arguments(parser)

    args = parser.parse_args()
    return args


def main():
    args = parse_arguments()

    setup_logger(args.verbose)

    if args.precision == Precision.FLOAT16 and not args.use_gpu:
        logger.error("fp16 is for GPU only")
        return

    if args.precision == Precision.INT8 and args.use_gpu and args.provider != "migraphx":
        logger.error("int8 is for CPU only")
        return

    if len(args.models) == 1 and MODELS[args.models[0]][3] in ["vit", "swin"]:
        args.sequence_lengths = [""]

    args.num_threads = sorted({cpu_count if x <= 0 else x for x in args.num_threads})

    logger.info(f"Arguments: {args}")

    if not os.path.exists(args.cache_dir):
        try:
            os.mkdir(args.cache_dir)
        except OSError:
            logger.error("Creation of the directory %s failed", args.cache_dir)

    enable_torch = "torch" in args.engines
    enable_torch2 = "torch2" in args.engines
    enable_torchscript = "torchscript" in args.engines
    enable_onnxruntime = "onnxruntime" in args.engines
    enable_tensorflow = "tensorflow" in args.engines

    if enable_torch2 and version.parse(torch.__version__) < version.parse("2.0.0"):
        logger.error(f"PyTorch version must be >=2.0.0 and you are using {torch.__version__}")
        return

    config_modifier = ConfigModifier(args.force_num_layers)

    results = []
    model_fusion_statistics = {}

    for num_threads in args.num_threads:
        torch.set_num_threads(num_threads)
        logger.debug(torch.__config__.parallel_info())

        if enable_torch or enable_torch2 or enable_torchscript:
            if args.input_counts != [1]:
                logger.warning("--input_counts is not implemented for torch or torchscript engine.")

            if enable_torch:
                results += run_pytorch(
                    args.use_gpu, args.models, args.model_class, config_modifier, args.precision,
                    num_threads, args.batch_sizes, args.sequence_lengths, args.test_times,
                    False, False, args.cache_dir, args.verbose,
                )
            if enable_torch2:
                results += run_pytorch(
                    args.use_gpu, args.models, args.model_class, config_modifier, args.precision,
                    num_threads, args.batch_sizes, args.sequence_lengths, args.test_times,
                    False, True, args.cache_dir, args.verbose,
                )
            if enable_torchscript:
                results += run_pytorch(
                    args.use_gpu, args.models, args.model_class, config_modifier, args.precision,
                    num_threads, args.batch_sizes, args.sequence_lengths, args.test_times,
                    True, False, args.cache_dir, args.verbose,
                )

        if enable_tensorflow:
            results += run_tensorflow(
                args.use_gpu, args.models, args.model_class, config_modifier, args.precision,
                num_threads, args.batch_sizes, args.sequence_lengths, args.test_times,
                args.cache_dir, args.verbose,
            )

        if enable_onnxruntime:
            try:
                use_raw_attention_mask = not args.use_mask_index
                results += run_onnxruntime(
                    args.use_gpu, args.provider, args.models, args.model_class, config_modifier,
                    args.precision, num_threads, args.batch_sizes, args.sequence_lengths,
                    args.test_times, args.input_counts, args.optimizer_info, args.validate_onnx,
                    args.cache_dir, args.onnx_dir, args.verbose, args.overwrite,
                    args.disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics,
                    args.model_source, args.enable_arm64_bfloat16_fastmath_mlas_gemm, args,
                )
            except Exception:
                logger.exception("Exception")

    time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    if model_fusion_statistics:
        csv_filename = args.fusion_csv or f"benchmark_fusion_{time_stamp}.csv"
        output_fusion_statistics(model_fusion_statistics, csv_filename)

    if len(results) == 0:
        if args.batch_sizes != [0]:
            logger.warning("No result is available.")
        return

    csv_filename = args.detail_csv or f"benchmark_detail_{time_stamp}.csv"
    output_details(results, csv_filename)

    csv_filename = args.result_csv or f"benchmark_summary_{time_stamp}.csv"
    output_summary(results, csv_filename, args)


if __name__ == "__main__":
    main()