U
    T?h;                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 eeZdd Zdd Zdd	 Zd
d Zdd Zedkre  dS )    N)setup_logger)BenchmarkRecordc                  C   s  t  } | jddtdd | jddtdd | jdd	td
d | jddtdd | jddddd | jddddd | jdtddd | jdtddd | jdtddd | jdtddd | jd td!d"d# | jd$td!d%d&d'd(gd)d* | jd+td!d,d-d.gd/d* | jd0td1d2d | jd3ddd4d | jd5td6d7d | jd8td d9d |  }t|d:|jd;d< 	d=d> d?|j
 d@|j }|js||_tj|jd!dA | jdB9  _|S )CNz-b--batch-sizesz1 2)typedefaultz-s--sequence-lengthsz8 16 32 64 128 256 512z-w--warmup-runs   z-n
--num-runs  z--hf-pt-eagerF
store_truez,Benchmark in PyTorch without `torch.compile`)r   actionhelpz--hf-pt-compilez)Benchmark in PyTorch with `torch.compile`--hf-ort-dir-path zDPath to folder containing ONNX models for Optimum + ORT benchmarking)r   r   r   z--ort-msft-model-pathzAPath to ONNX model from https://github.com/microsoft/Llama-2-Onnxz --ort-convert-to-onnx-model-pathz'Path to ONNX model from convert_to_onnx--cache-dirz./model_cachez-Cache dir where Hugging Face files are stored--model-nameTzModel name in Hugging Face)r   requiredr   --precisionZint4Zint8Zfp16Zfp32zPrecision to run model)r   r   choicesr   --devicecpucudaZrocmzDevice to benchmark modelsz--device-idr   zGPU device IDz	--verbosezPrint detailed logsz	--timeout
   z8Number of mins to attempt the benchmark before moving on--log-folderz'Path to folder to save logs and results
model_size/.-z./_)exist_ok<   )argparseArgumentParseradd_argumentstrint
parse_argssetattr
model_namesplitreplacer   	precision
log_folderosmakedirstimeout)parserargsZlog_folder_name r4   e/var/www/html/venv/lib/python3.8/site-packages/onnxruntime/transformers/models/llama/benchmark_all.pyget_args   s    
 r6   c              
   C   s  g }d\}}}d\}}}	}
d}d}d}d}d}d}d	}t |`}|D ]R}|d
d}||krvt|t|d  }qF||krt|t|d  }qF||krd}qF||krd}qF||krt|t||d }|d }qF||krt|t||d }	qF||krFd|kr:t||dd |d d }
n@|||t| d  dd}t||  d }t|d }
|||||||	|
g }|	| qFW 5 Q R X |S )N)NNN)NNNNzBatch Size: zSequence Length: zto get past_key_valueszwith past_key_valuesz	Latency: zThroughput: zpeak=
r   promptz	per-token r   ZCPU=   z MB'"Zmax_used_MB)
openr,   r'   lenfloatrfindfindjsonloadsappend)	device_idlog_filebase_resultsentries
batch_sizeZsequence_lengthstepZ	latency_sZ
latency_msZ
throughputZmemoryZbatch_patternZsequence_patternZprompt_step_patternZper_token_step_patternZlatency_patternZthroughput_patternZmemory_patternfZ
input_linelineZpeakusageentryr4   r4   r5   process_log_file   sV    




&"	rP   c                 C   sv  dd l }|j| dddddddd	d
ddddgd}|d d|d< |d d|d< |d d|d< |d	 d|d	< |d d|d< |d d|d< |d d|d< |d d|d< dd l}|j}tdd |D }d}d}|r|d dd }|d dd }g }	| D ]\}
}|d dkrVt|d |d d|d ||}nT|d dkrt|d |d d|d t	j
t	j}n"t|d |d |d |d dd}|d |j_|d |j_|d |j_|d	 |j_|d
 |jjd< |d |jjd< |d |jjd< |d |j_|d |jjd< |d |j_|	| qt||	 t|dd |	 td!| d" d S )#Nr   zWarmup RunszMeasured Runsz
Model NameZEngineZ	PrecisionZDevicez
Batch SizezSequence LengthZStepzLatency (s)zLatency (ms)zThroughput (tps)zMemory (GB))columnsr'   r@   c                 S   s(   g | ] }|j d kr|j  d|j qS ))zort-nightly-gpuzort-nightlyonnxruntimezonnxruntime-gpu==)keyversion).0ir4   r4   r5   
<listcomp>   s   
z save_results.<locals>.<listcomp>r   rS   r;   )optimum-ortrR   rR   )pytorch-eagerpytorch-compileZpytorchZmeasure_stepengineZlatency_s_meanZthroughput_tps.csvz.jsonzResults saved in !)ZpandasZ	DataFrameZastypepkg_resourcesworking_setsortedr+   Ziterrowsr   torch__name____version__configwarmup_runsZmeasured_runsrJ   Z
seq_lengthZ
customizedmetricsZlatency_ms_meanZmax_memory_usage_GBrE   Zsave_as_csvZsave_as_jsonr,   loggerinfo)resultsfilenamepddfr_   Zinstalled_packagesZinstalled_packages_listZort_pkg_nameZort_pkg_versionrecordsr    rowrecordr4   r4   r5   save_results   s              "rq   c           	   	   C   s   | dt j  dd}tj| j|}t|dF}tj|||d}z|	| j
 W n tjk
rr   |  Y nX W 5 Q R X td | j| j| j|| j| jg}t| j||}|S )Nr    %Y-%m-%d_%H:%M:%Sz.logw)stdoutstderrz Gathering data from log files...)datetimenowr/   pathjoinr.   r>   
subprocessPopenwaitr1   TimeoutExpiredkillrh   ri   rf   num_runsr*   r-   devicerP   rF   )	r3   benchmark_cmdr\   Zlog_filenamelog_pathrG   processrH   rj   r4   r4   r5   	benchmark(  s    
r   c                  C   s  t  } t| j t| j dtjj_	g }t
| jtjd< | jrdddddd| jd	| jd
| jd| jd| jdt
| jdt
| jd| jd| jdg}td t	| |d}|| | jr$dddddd| jd	| jd
| jd| jd| jdt
| jdt
| jd| jd| jdg}td t	| |d}|| | jrdddddd| jd| jd	| jd
| jd| jd| jdt
| jdt
| jd| jd| jdg}td t	| |d}|| | jrdddddd| jd| jd	| jd
| jd| jd| jdt
| jdt
| jd| jd| jg}td t	| |d}|| | jrdddddd| jd| jd	| jd
| jd| jd| jdt
| jdt
| jd| jd| jg}td t	| |d }|| | j d!| j d!tj d"d#}t |tj!"| j| d S )$NTZCUDA_VISIBLE_DEVICESpythonz-mzmodels.llama.benchmarkz--benchmark-typezhf-pt-eagerr   r   r   r   r   r   r
   r   r   z--authz'Benchmark PyTorch without torch.compilerZ   zhf-pt-compilez$Benchmark PyTorch with torch.compiler[   zhf-ortr   z Benchmark Optimum + ONNX RuntimerY   zort-msftz--ort-model-pathz)Benchmark Microsoft model in ONNX Runtimezort-convert-to-onnxz/Benchmark convert_to_onnx model in ONNX RuntimerR   r    rr   r]   )#r6   r   verboserh   ri   __dict__rb   backendsZcudnnr   r&   rF   r/   environZhf_pt_eagerr*   r-   Zbatch_sizesZsequence_lengthsr   rf   r   r.   	cache_dirextendZhf_pt_compileZhf_ort_dir_pathZort_msft_model_pathZort_convert_to_onnx_model_pathr   rv   rw   rq   rx   ry   )r3   Zall_resultsr   rj   Zcsv_filer4   r4   r5   main:  s:   











"r   __main__)r#   rv   rC   loggingr/   rz   rb   Zbenchmark_helperr   rg   r   	getLoggerrc   rh   r6   rP   rq   r   r   r4   r4   r4   r5   <module>   s$   
 9Q 2