"""Benchmark GPT-2 model inference with ONNX Runtime, and optionally validate
ONNX Runtime outputs and latency against PyTorch."""

import argparse
import csv
import logging
import os
from datetime import datetime

import psutil
import torch
from benchmark_helper import (
    Precision,
    create_onnxruntime_session,
    get_ort_environment_variables,
    prepare_environment,
    setup_logger,
)
from gpt2_helper import DEFAULT_TOLERANCE, MODEL_CLASSES, PRETRAINED_GPT2_MODELS, Gpt2Helper
from packaging import version
from quantize_helper import QuantizeHelper
from transformers import AutoConfig
from transformers import __version__ as transformers_version

logger = logging.getLogger("")
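# Note: benchmark_helper, gpt2_helper and quantize_helper ship alongside this
# script in the onnxruntime/transformers tree and are expected to be importable
# from it; psutil and packaging are third-party dependencies.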

def parse_arguments(argv=None):
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-m",
        "--model_name_or_path",
        required=True,
        type=str,
        help="Model path, or pretrained model name selected in the list: " + ", ".join(PRETRAINED_GPT2_MODELS),
    )
    parser.add_argument(
        "--model_class",
        required=False,
        type=str,
        default="GPT2LMHeadModel",
        choices=list(MODEL_CLASSES.keys()),
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--cache_dir",
        required=False,
        type=str,
        default=os.path.join(".", "cache_models"),
        help="Directory to cache pre-trained models",
    )
    parser.add_argument(
        "--onnx_dir",
        required=False,
        type=str,
        default=os.path.join(".", "onnx_models"),
        help="Directory to store onnx models",
    )
    parser.add_argument(
        "--test_times",
        required=False,
        default=100,
        type=int,
        help="Number of repeat times to get average inference latency.",
    )
    parser.add_argument("-v", "--validate_onnx", required=False, action="store_true", help="Validate ONNX model")
    parser.add_argument(
        "-o",
        "--optimize_onnx",
        required=False,
        action="store_true",
        help="Use optimizer.py to optimize onnx model",
    )
    parser.set_defaults(optimize_onnx=False)
    parser.add_argument(
        "--stage",
        type=int,
        default=0,
        required=False,
        choices=[0, 1, 2],
        help="Stage in generation: 1 (initial decoder), 2 (decoder), 0 (both). "
        "1 - decode the first token when past_sequence_length is zero; "
        "2 - decode the remaining tokens when past_sequence_length is not zero; "
        "0 - one onnx model for both stages 1 and 2. "
        "Note that we will optimize 1 and 2 differently for best performance.",
    )
    parser.add_argument("--use_gpu", required=False, action="store_true", help="use GPU for inference")
    parser.set_defaults(use_gpu=False)
    parser.add_argument(
        "-p",
        "--precision",
        type=Precision,
        default=Precision.FLOAT32,
        choices=list(Precision),
        help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization",
    )
    parser.add_argument("--torchscript", required=False, action="store_true", help="use Torchscript")
    parser.set_defaults(torchscript=False)
    parser.add_argument("-b", "--batch_sizes", nargs="+", type=int, default=[1], help="batch size")
    parser.add_argument(
        "--sequence_lengths", nargs="+", type=int, default=[1], help="sequence lengths (excluding past)"
    )
    parser.add_argument(
        "-s",
        "--past_sequence_lengths",
        nargs="+",
        type=int,
        default=[8, 16, 32, 64, 128, 256],
        help="past sequence lengths",
    )
    parser.add_argument(
        "-r", "--result_csv", required=False, default=None, help="CSV file for saving summary results."
    )
    parser.add_argument("--thread_num", required=False, type=int, default=-1, help="Threads to use")
    parser.add_argument("--include_copy_output_latency", required=False, action="store_true")
    parser.set_defaults(include_copy_output_latency=False)
    parser.add_argument("--verbose", required=False, action="store_true")
    parser.set_defaults(verbose=False)
    parser.add_argument("--output_torch_latency", required=False, action="store_true")
    parser.set_defaults(output_torch_latency=False)
    parser.add_argument("--disable_io_binding", required=False, action="store_true")
    parser.set_defaults(disable_io_binding=False)

    args = parser.parse_args(argv)
    return args
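# Example invocations (illustrative sketches based on the flags defined above;
# "gpt2" stands in for any supported model name or path):
#
#   python benchmark_gpt2.py -m gpt2 -b 1 8 --sequence_lengths 1 -s 32 128
#   python benchmark_gpt2.py -m gpt2 --use_gpu -o -p fp16   # fp16 requires -o and --use_gpu
#   python benchmark_gpt2.py -m gpt2 -o -p int8             # int8 quantization runs on CPU only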

def main(args):
    if version.parse(transformers_version) < version.parse("3.1.0"):
        raise RuntimeError("This tool requires transformers 3.1.0 or later.")

    logger.info(f"Arguments:{args}")

    if args.precision == Precision.FLOAT16:
        assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    if args.stage == 1:
        assert args.past_sequence_lengths == [0], "past_sequence_lengths shall be 0 for stage==1 (init decoder)"

    torch.set_num_threads(psutil.cpu_count(logical=True) if args.thread_num <= 0 else args.thread_num)
    print(torch.__config__.parallel_info())

    cache_dir = args.cache_dir
    output_dir = args.onnx_dir
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    model_class = MODEL_CLASSES[args.model_class][0]

    gpt2helper = Gpt2Helper
    config = AutoConfig.from_pretrained(args.model_name_or_path, torchscript=args.torchscript, cache_dir=cache_dir)
    model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.to(device)

    # Models with many layers exceed the 2GB protobuf limit, so store weights
    # in external data format for them.
    use_external_data_format = config.n_layer > 24

    onnx_model_paths = gpt2helper.get_onnx_paths(
        output_dir, args.model_name_or_path, args.model_class, has_past=True, new_folder=use_external_data_format
    )
    onnx_model_path = onnx_model_paths["raw"]

    use_padding = MODEL_CLASSES[args.model_class][2]
    gpt2helper.export_onnx(
        model,
        device,
        onnx_model_path,
        args.verbose,
        use_external_data_format,
        has_position_ids=use_padding,
        has_attention_mask=use_padding,
    )

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        onnx_model_path = onnx_model_paths[str(args.precision) if args.precision != Precision.INT8 else "fp32"]
        gpt2helper.optimize_onnx(
            onnx_model_paths["raw"],
            onnx_model_path,
            args.precision == Precision.FLOAT16,
            model.config.num_attention_heads,
            model.config.hidden_size,
            use_external_data_format,
            auto_mixed_precision=True,
            stage=args.stage,
        )

        if args.precision == Precision.INT8:
            logger.info("quantizing model...")
            QuantizeHelper.quantize_onnx_model(onnx_model_path, onnx_model_paths["int8"], use_external_data_format)
            model = QuantizeHelper.quantize_torch_model(model)
            logger.info("finished quantizing model")
            onnx_model_path = onnx_model_paths["int8"]

    if args.torchscript:
        model = gpt2helper.torchscript(
            model, config, device, has_position_ids=use_padding, has_attention_mask=use_padding
        )

    session = create_onnxruntime_session(
        onnx_model_path, args.use_gpu, enable_all_optimization=False, num_threads=args.thread_num, verbose=args.verbose
    )
    if session is None:
        return

    # Allocate output buffers large enough for the biggest test configuration,
    # so IO binding can reuse them across all runs.
    max_output_shapes = gpt2helper.get_output_shapes(
        max(args.batch_sizes), max(args.past_sequence_lengths), max(args.sequence_lengths), config, args.model_class
    )
    output_buffers = gpt2helper.get_output_buffers(max_output_shapes, device, args.precision == Precision.FLOAT16)

    csv_filename = args.result_csv or "benchmark_result_{}.csv".format(datetime.now().strftime("%Y%m%d-%H%M%S"))
    with open(csv_filename, mode="a", newline="") as csv_file:
        column_names = [
            "model_name",
            "model_class",
            "stage",
            "environment_variables",
            "gpu",
            "precision",
            "optimizer",
            "torchscript",
            "batch_size",
            "sequence_length",
            "past_sequence_length",
            "disable_io_binding",
            "torch_latency",
            "onnxruntime_latency",
        ]
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        csv_writer.writeheader()

        for batch_size in args.batch_sizes:
            for sequence_length in args.sequence_lengths:
                for past_sequence_length in args.past_sequence_lengths:
                    assert batch_size > 0 and sequence_length > 0 and past_sequence_length >= 0
                    logger.debug(
                        "Running test for batch_size=%d sequence_length=%d past_sequence_length=%d ...",
                        batch_size,
                        sequence_length,
                        past_sequence_length,
                    )
                    dummy_inputs = gpt2helper.get_dummy_inputs(
                        batch_size,
                        past_sequence_length,
                        sequence_length,
                        config.num_attention_heads,
                        config.hidden_size,
                        config.n_layer,
                        config.vocab_size,
                        device,
                        float16=(args.precision == Precision.FLOAT16),
                        has_position_ids=use_padding,
                        has_attention_mask=use_padding,
                    )
                    output_shapes = gpt2helper.get_output_shapes(
                        batch_size, past_sequence_length, sequence_length, config, args.model_class
                    )
                    try:
                        if args.validate_onnx or args.output_torch_latency:
                            outputs, torch_latency = gpt2helper.pytorch_inference(model, dummy_inputs, args.test_times)

                            # Dump PyTorch output shapes for debugging.
                            for i, value in enumerate(outputs):
                                if isinstance(value, tuple):
                                    logger.debug(
                                        f"torch output {i} is tuple of size {len(value)}, shape {value[0].shape}"
                                    )
                                else:
                                    logger.debug(f"torch output {i} shape {value.shape}")
                        else:
                            outputs = None
                            torch_latency = None

                        if args.disable_io_binding:
                            ort_outputs, ort_latency = gpt2helper.onnxruntime_inference(
                                session, dummy_inputs, args.test_times
                            )
                        else:
                            ort_outputs, ort_latency = gpt2helper.onnxruntime_inference_with_binded_io(
                                session,
                                dummy_inputs,
                                output_buffers,
                                output_shapes,
                                args.test_times,
                                return_numpy=False,
                                include_copy_output_latency=args.include_copy_output_latency,
                            )

                        if args.validate_onnx:
                            copy_outputs = ort_outputs
                            if not args.disable_io_binding:
                                # Outputs of IO binding may live on GPU; copy to CPU for comparison.
                                copy_outputs = []
                                for output in ort_outputs:
                                    copy_outputs.append(output.cpu().numpy())

                            if gpt2helper.compare_outputs(
                                outputs,
                                copy_outputs,
                                model_class=args.model_class,
                                rtol=DEFAULT_TOLERANCE[args.precision],
                                atol=DEFAULT_TOLERANCE[args.precision],
                            ):
                                logger.info(
                                    f"Pytorch and ONNX Runtime outputs are all close "
                                    f"(tolerance={DEFAULT_TOLERANCE[args.precision]})."
                                )

                        logger.info(
                            "batch_size=%d, sequence_length=%d, past_sequence_length=%d, onnxruntime_latency=%.2f %s %s",
                            batch_size,
                            sequence_length,
                            past_sequence_length,
                            ort_latency,
                            "(disable_io_binding)" if args.disable_io_binding else "",
                            f", torch_latency={torch_latency}" if torch_latency else "",
                        )

                        row = {
                            "model_name": args.model_name_or_path,
                            "model_class": args.model_class,
                            "stage": args.stage,
                            "environment_variables": get_ort_environment_variables(),
                            "gpu": args.use_gpu,
                            "precision": args.precision,
                            "optimizer": args.optimize_onnx,
                            "torchscript": args.torchscript,
                            "batch_size": batch_size,
                            "sequence_length": sequence_length,
                            "past_sequence_length": past_sequence_length,
                            "disable_io_binding": args.disable_io_binding,
                            "torch_latency": f"{torch_latency:.2f}" if torch_latency else "None",
                            "onnxruntime_latency": f"{ort_latency:.2f}",
                        }
                        csv_writer.writerow(row)
                    except Exception:
                        logger.error("Exception", exc_info=True)
                        return None

    logger.info(f"Results are saved to file {csv_filename}")
    return csv_filename
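# Programmatic use is also possible (a minimal sketch; assumes this file is
# importable as module benchmark_gpt2):
#
#   from benchmark_gpt2 import main, parse_arguments
#   result_csv = main(parse_arguments(["-m", "gpt2", "--batch_sizes", "1"]))
#   # main() returns the result CSV file name, or None when a run fails.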
.r   __main__)N)!r,   rj   loggingr3   r   r`   r_   Zbenchmark_helperr   r   r   r   r   Zgpt2_helperr   r	   r
   r   	packagingr   Zquantize_helperr   Ztransformersr   r   rZ   	getLoggerr\   r>   r   __name__r;   r)   r<   r<   r<   r=   <module>	   s*   

  s
