from __future__ import annotations

import argparse
import datetime
import gc
import itertools
import json
import logging
import os
import textwrap
import time

import numpy as np
import pandas as pd
import torch
from benchmark_helper import setup_logger
from llama_inputs import add_io_bindings_as_tensors, get_initial_inputs_and_outputs
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

import onnxruntime as ort

logger = logging.getLogger(__name__)
def get_model(args: argparse.Namespace):
    if args.benchmark_type in {"pt-eager", "pt-compile"}:
        model = None
        if args.onnx_precision == "int4" and args.device == "cuda":
            # Load a 4-bit quantized model with bitsandbytes
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
            )
            model = AutoModelForCausalLM.from_pretrained(
                args.hf_dir_path if args.hf_dir_path != "" else args.model_name,
                cache_dir=args.cache_dir, torch_dtype=args.torch_dtype, use_auth_token=args.auth,
                trust_remote_code=args.trust, use_cache=True, attn_implementation="flash_attention_2",
                quantization_config=bnb_config, max_memory={args.device_id: "80GB"},
            )
        else:
            try:
                model = AutoModelForCausalLM.from_pretrained(
                    args.hf_dir_path if args.hf_dir_path != "" else args.model_name,
                    cache_dir=args.cache_dir, torch_dtype=args.torch_dtype, use_auth_token=args.auth,
                    trust_remote_code=args.trust, use_cache=True,
                    attn_implementation=("flash_attention_2" if args.device == "cuda" else "sdpa"),
                ).to(args.target_device)
            except Exception as e:
                # Fall back to the eager attention implementation when FlashAttention/SDPA is unavailable
                print(f"Try to load a model using eager mode: {e}")
                model = AutoModelForCausalLM.from_pretrained(
                    args.hf_dir_path if args.hf_dir_path != "" else args.model_name,
                    cache_dir=args.cache_dir, torch_dtype=args.torch_dtype, use_auth_token=args.auth,
                    trust_remote_code=args.trust, use_cache=True, attn_implementation="eager",
                ).to(args.target_device)

        model.eval()

        if args.benchmark_type == "pt-compile":
            model = torch.compile(model)

    else:
        sess_options = ort.SessionOptions()
        ep = (
            ("CUDAExecutionProvider", {"device_id": args.device_id})
            if args.device == "cuda"
            else "CPUExecutionProvider"
        )
        model = ort.InferenceSession(args.onnx_model_path, sess_options=sess_options, providers=[ep])

    return model


def run_inference(args, model, runs, inputs, outputs):
    # Run once outside the timed loop for torch.compile so compilation time is excluded
    if args.benchmark_type == "pt-compile":
        with torch.no_grad():
            outputs = model(**inputs)

    # Synchronize inputs; for ONNX Runtime, bind inputs/outputs once so the timed loop only measures execution
    io_binding = None
    if args.benchmark_type in {"pt-eager", "pt-compile"}:
        if args.device != "cpu":
            torch.cuda.synchronize(args.target_device)
    else:
        io_binding = add_io_bindings_as_tensors(model, inputs, outputs, args.use_fp16, args.use_buffer_share)
        io_binding.synchronize_inputs()

    # Run inference and average the latency over `runs` iterations
    start = time.perf_counter()
    for _ in range(runs):
        if args.benchmark_type in {"pt-eager", "pt-compile"}:
            with torch.no_grad():
                outputs = model(**inputs)
                if args.device != "cpu":
                    torch.cuda.synchronize(args.target_device)
        else:
            model.run_with_iobinding(io_binding)
            io_binding.synchronize_outputs()
    end = time.perf_counter()

    avg = (end - start) / runs
    return avg, outputs


def prepare_model_for_inference(args, model, config, tokenizer, prompt_length, prompt):
    clear_cache()
    inputs, outputs = get_initial_inputs_and_outputs(
        config, tokenizer, prompt_length, prompt, args.target_device, args.use_fp16, args.use_buffer_share, args.engine
    )
    _, outputs = run_inference(args, model, args.warmup_runs, inputs, outputs)
    return inputs, outputs


def clear_cache():
    gc.collect()
    torch.cuda.empty_cache()


def save_results(results, filename, gen_length):
    df = pd.DataFrame(
        results,
        columns=[
            "Batch Size",
            "Prompt Length",
            "Prompt Processing Latency (ms)",
            "Prompt Processing Throughput (tps)",
            "Sampling Latency (ms)",
            "Sampling Throughput (tps)",
            "First Token Generated Latency (ms)",
            "First Token Generated Throughput (tps)",
            f"Average Latency of First {gen_length // 2} Tokens Generated (ms)",
            f"Average Throughput of First {gen_length // 2} Tokens Generated (tps)",
            f"Average Latency of First {gen_length} Tokens Generated (ms)",
            f"Average Throughput of First {gen_length} Tokens Generated (tps)",
            "Wall-Clock Latency (s)",
            "Wall-Clock Throughput (tps)",
        ],
    )

    df.to_csv(filename, index=False)
    logger.info(f"Results saved in {filename}!")
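# Minimal sketch of driving the helpers above directly for one configuration.
# The model name, prompt, and lengths are illustrative; `args` is assumed to be
# the namespace produced by get_args() defined below:
#
#   config = AutoConfig.from_pretrained("meta-llama/Llama-2-7b-hf")
#   tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
#   model = get_model(args)
#   inputs, outputs = prepare_model_for_inference(args, model, config, tokenizer, 16, ["example prompt"])
#   latency_s, outputs = run_inference(args, model, args.num_runs, inputs, outputs)
#   print(f"Average prompt processing latency: {latency_s * 1000:.2f} ms")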
def get_args():
    parser = argparse.ArgumentParser()

    parser.add_argument("-bt", "--benchmark-type", type=str, required=True, choices=["pt-eager", "pt-compile", "ort"])
    parser.add_argument(
        "-m", "--model-name", type=str, required=False,
        help="Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf')",
    )
    parser.add_argument(
        "-a", "--auth", default=False, action="store_true",
        help="Use Hugging Face authentication token to access model",
    )
    parser.add_argument(
        "-t", "--trust", default=False, action="store_true",
        help="Whether or not to allow for custom models defined on the Hugging Face Hub in their own modeling files",
    )
    parser.add_argument(
        "-c", "--cache-dir", type=str, default=os.path.join(".", "model_cache"),
        help="Path to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). "
        "Use when loading model as `AutoModel.from_pretrained(model_name, cache_dir=cache_dir)`.",
    )
    parser.add_argument(
        "--hf-dir-path", type=str, default="",
        help="Path to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). "
        "Use when loading model as `AutoModel.from_pretrained(folder_path)`.",
    )
    parser.add_argument("-o", "--onnx-model-path", required=False, help="Path to ONNX model")
    parser.add_argument(
        "-f", "--prompts-file", required=True, default=os.path.join(".", "models", "llama", "prompts.json"),
        help="JSON file containing entries in the format 'prompt length: prompt' where prompt length = tokenized length of prompt",
    )
    parser.add_argument(
        "--use_buffer_share", default=False, action="store_true",
        help="Use when GroupQueryAttention (GQA) is in ONNX model",
    )
    parser.add_argument(
        "--anomaly-filtering", default=False, action="store_true",
        help="Use this flag to filter anomaly accelerator times for tokens generated. \
              This may give more accurate latency and throughput metrics for tokens generated. \
              Wall-clock metrics are still reported with anomaly times though.",
    )
    parser.add_argument("-b", "--batch-sizes", default="1 2")
    parser.add_argument("-s", "--prompt-lengths", default="16 64 256 1024")
    parser.add_argument(
        "-p", "--precision", required=True, type=str, default="fp32", choices=["int4", "int8", "fp16", "fp32"],
        help="Precision for model. For ONNX models, the model's precision should be set before running this script.",
    )
    parser.add_argument("-g", "--generation-length", type=int, default=256, help="Number of new tokens to generate")
    parser.add_argument(
        "-d", "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", choices=["cpu", "cuda"]
    )
    parser.add_argument("-id", "--device-id", type=int, default=0)
    parser.add_argument("-w", "--warmup-runs", type=int, default=5)
    parser.add_argument("-n", "--num-runs", type=int, default=100)
    parser.add_argument("--seed", type=int, default=2)

    args = parser.parse_args()

    # Set seed properties
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Set runtime properties for ONNX Runtime
    if "ort" in args.benchmark_type:
        setattr(args, "execution_provider", f"{args.device.upper()}ExecutionProvider")
        if args.execution_provider == "CUDAExecutionProvider":
            args.execution_provider = (args.execution_provider, {"device_id": args.device_id})

    # A path to the ONNX model must be provided when benchmarking ONNX Runtime
    if args.benchmark_type == "ort":
        assert args.onnx_model_path, "Please specify a path to `--onnx-model-path`"

    args.batch_sizes = args.batch_sizes.split(" ")
    args.prompt_lengths = args.prompt_lengths.split(" ")

    # Use FP32 precision for FP32, INT8, and INT4 CPU models; use FP16 precision for FP16 and INT4 GPU models
    setattr(args, "onnx_precision", args.precision)
    args.precision = (
        "fp32" if args.precision in {"int8", "fp32"} or (args.precision == "int4" and args.device == "cpu") else "fp16"
    )

    target_device = f"cuda:{args.device_id}" if args.device != "cpu" else args.device
    torch_dtype = torch.float16 if args.precision == "fp16" else torch.float32
    engine = "ort" if args.benchmark_type == "ort" else "pt"
    setattr(args, "target_device", target_device)
    setattr(args, "torch_dtype", torch_dtype)
    setattr(args, "engine", engine)
    setattr(args, "use_fp16", args.precision == "fp16")

    args.use_buffer_share = args.use_buffer_share and engine == "ort"

    return args


def main():
    args = get_args()
    setup_logger(False)
    logger.info(args.__dict__)

    # Load prompts, mapping tokenized prompt length -> prompt string
    size_to_prompt = None
    with open(args.prompts_file) as f:
        size_to_prompt = json.load(f, object_hook=lambda d: {int(k): v for k, v in d.items()})

    # Get config, tokenizer, and model
    config = AutoConfig.from_pretrained(
        args.hf_dir_path if args.hf_dir_path != "" else args.model_name,
        cache_dir=args.cache_dir, use_auth_token=args.auth, trust_remote_code=args.trust,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        args.hf_dir_path if args.hf_dir_path != "" else args.model_name,
        cache_dir=args.cache_dir, use_auth_token=args.auth, trust_remote_code=args.trust,
    )
    model = get_model(args)

    all_csv_metrics = []
    for batch_size, prompt_length in itertools.product(args.batch_sizes, args.prompt_lengths):
        batch_size, prompt_length = int(batch_size), int(prompt_length)
        logger.info(f"Running batch size = {batch_size}, prompt length = {prompt_length}")
        clear_cache()
        max_length = prompt_length + args.generation_length

        if prompt_length not in size_to_prompt:
            raise NotImplementedError(
                textwrap.dedent(
                    f"""
                    A prompt of size {prompt_length} was not found in '{args.prompts_file}'. There are a couple of solutions to fix this.
                    1) You can change one of the keys in '{args.prompts_file}' to be {prompt_length}.
                        If {prompt_length} < actual prompt's length, the benchmark E2E tool will repeat the first word in the prompt until {prompt_length} = actual prompt's length.
                        If {prompt_length} > actual prompt's length, the benchmark E2E tool will automatically trim the actual prompt's length so that {prompt_length} = actual prompt's length.
                    2) You can add a new key-value entry in '{args.prompts_file}' of the form '{prompt_length}': 'your prompt goes here'.
                    """
                )
            )
        prompt = [size_to_prompt[prompt_length]] * batch_size
        csv_metrics = [batch_size, prompt_length]

        try:
            # Measure prompt processing
            logger.info("Measuring prompt processing...")
            inputs, outputs = prepare_model_for_inference(args, model, config, tokenizer, prompt_length, prompt)
            accelerator_prompt_latency_s, outputs = run_inference(args, model, args.num_runs, inputs, outputs)

            accelerator_prompt_latency_ms = accelerator_prompt_latency_s * 1000
            accelerator_prompt_thrpt = batch_size * (prompt_length / accelerator_prompt_latency_s)
            logger.info(f"Average Latency of Prompt Processing: {accelerator_prompt_latency_ms} ms")
            logger.info(f"Average Throughput of Prompt Processing: {accelerator_prompt_thrpt} tps")
            csv_metrics.extend([accelerator_prompt_latency_ms, accelerator_prompt_thrpt])

            # Measure token generation
            logger.info("Measuring token generation...")
            clear_cache()
            inputs, outputs = prepare_model_for_inference(args, model, config, tokenizer, prompt_length, prompt)

            all_token_ids = inputs["input_ids"].clone()
            current_length = all_token_ids.shape[-1]
            num_heads = config.num_key_value_heads
            head_size = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads

            has_eos = torch.zeros(batch_size, device=args.target_device, dtype=torch.bool)

            accelerator_times = []  # 0th entry is prompt processing, 1st onwards are token generation
            sampling_times = []  # cost to sample after each model run

            wall_clock_start_time = time.perf_counter()
            while current_length <= max_length:
                # Run inference
                accelerator_time_latency_s, outputs = run_inference(args, model, 1, inputs, outputs)
                accelerator_times.append(accelerator_time_latency_s)

                # Sample with argmax (greedy search)
                sampling_start_time = time.perf_counter()
                if outputs["logits"].shape[1] > 1:
                    # Prompt processing step: gather the logits of the last real token in each prompt
                    prompt_end_indices = inputs["attention_mask"].sum(1) - 1
                    idxs = (
                        prompt_end_indices.unsqueeze(dim=1)
                        .repeat(1, config.vocab_size)
                        .view(batch_size, 1, config.vocab_size)
                    )
                    next_token_logits = torch.gather(outputs["logits"], 1, idxs).squeeze()
                else:
                    next_token_logits = outputs["logits"][:, -1, :]
                next_tokens = torch.argmax(next_token_logits, dim=-1)

                # Keep appending EOS token ids for batch entries that have already finished (ragged batching)
                has_eos = has_eos | (next_tokens == tokenizer.eos_token_id)
                tokens_to_add = next_tokens.masked_fill(has_eos, tokenizer.eos_token_id).reshape([batch_size, 1])
                sampling_end_time = time.perf_counter()
                sampling_times.append(sampling_end_time - sampling_start_time)

                all_token_ids = torch.cat([all_token_ids, tokens_to_add], dim=-1)
                current_length += 1

                # Update inputs for the next inference run
                inputs["input_ids"] = tokens_to_add
                inputs["attention_mask"] = torch.cat(
                    [inputs["attention_mask"], (~has_eos).to(torch.int64).reshape(batch_size, 1)], 1
                )
                if "position_ids" in inputs:
                    inputs["position_ids"] = torch.max(inputs["position_ids"], dim=1)[0].reshape(batch_size, 1) + 1

                # Re-use the logits buffer with a sequence length of 1 for the next run
                if outputs["logits"].shape[1] != 1:
                    outputs["logits"] = outputs["logits"][:, :1, :].contiguous()
                outputs["logits"].zero_()

                # Update KV caches for the next inference run
                if args.engine == "pt":
                    inputs["past_key_values"] = outputs["past_key_values"]
                elif not args.use_buffer_share:
                    for i in range(config.num_hidden_layers):
                        inputs[f"past_key_values.{i}.key"] = outputs[f"present.{i}.key"]
                        inputs[f"past_key_values.{i}.value"] = outputs[f"present.{i}.value"]

                    new_sequence_length = inputs["attention_mask"].shape[1]
                    for i in range(config.num_hidden_layers):
                        present_key = torch.zeros(
                            batch_size, num_heads, new_sequence_length, head_size,
                            device=args.target_device, dtype=args.torch_dtype,
                        )
                        present_value = torch.zeros(
                            batch_size, num_heads, new_sequence_length, head_size,
                            device=args.target_device, dtype=args.torch_dtype,
                        )
                        outputs.update({f"present.{i}.key": present_key, f"present.{i}.value": present_value})

            wall_clock_end_time = time.perf_counter()

            # Drop the prompt processing entry and optionally filter anomalously slow accelerator times
            accelerator_times.pop(0)
            if args.anomaly_filtering:
                anomaly_threshold_factor = 10
                min_time_s = min(accelerator_times)
                orig_size = len(accelerator_times)
                accelerator_times = list(
                    filter(lambda acc_time: acc_time < anomaly_threshold_factor * min_time_s, accelerator_times)
                )
                new_size = len(accelerator_times)
                logger.info(
                    f"Filtered out {orig_size - new_size} anomaly accelerator times that are "
                    f"{anomaly_threshold_factor}x greater than {min_time_s * 1000} ms..."
                )

            # Calculate sampling metrics
            avg_sampling_latency_s = sum(sampling_times) / len(sampling_times)
            avg_sampling_latency_ms = avg_sampling_latency_s * 1000
            avg_sampling_thrpt = batch_size * (1 / avg_sampling_latency_s)
            logger.info(f"Average Latency of Sampling: {avg_sampling_latency_ms} ms")
            logger.info(f"Average Throughput of Sampling: {avg_sampling_thrpt} tps")

            # Calculate first token generated metrics
            first_token_latency_s = accelerator_times[0]
            first_token_latency_ms = first_token_latency_s * 1000
            first_token_thrpt = batch_size * (1 / first_token_latency_s)
            logger.info(f"Latency of First Token Generated: {first_token_latency_ms} ms")
            logger.info(f"Throughput of First Token Generated: {first_token_thrpt} tps")

            # Calculate metrics for the first half of the generated tokens
            halfway = args.generation_length // 2
            halfway_token_latency_s = sum(accelerator_times[:halfway]) / len(accelerator_times[:halfway])
            halfway_token_latency_ms = halfway_token_latency_s * 1000
            halfway_token_thrpt = batch_size * (1 / halfway_token_latency_s)
            logger.info(f"Average Latency of First {halfway} Tokens Generated: {halfway_token_latency_ms} ms")
            logger.info(f"Average Throughput of First {halfway} Tokens Generated: {halfway_token_thrpt} tps")

            # Calculate metrics for all generated tokens
            all_token_latency_s = sum(accelerator_times) / len(accelerator_times)
            all_token_latency_ms = all_token_latency_s * 1000
            all_token_thrpt = batch_size * (1 / all_token_latency_s)
            logger.info(f"Average Latency of First {args.generation_length} Tokens Generated: {all_token_latency_ms} ms")
            logger.info(f"Average Throughput of First {args.generation_length} Tokens Generated: {all_token_thrpt} tps")

            # Calculate wall-clock metrics
            wall_clock_latency_s = wall_clock_end_time - wall_clock_start_time
            wall_clock_thrpt = batch_size * ((prompt_length + args.generation_length) / wall_clock_latency_s)
            logger.info(f"Wall-Clock Latency: {wall_clock_latency_s} s")
            logger.info(f"Wall-Clock Throughput: {wall_clock_thrpt} tps")

            logger.info("Adding results to CSV")
            csv_metrics.extend(
                [
                    avg_sampling_latency_ms, avg_sampling_thrpt,
                    first_token_latency_ms, first_token_thrpt,
                    halfway_token_latency_ms, halfway_token_thrpt,
                    all_token_latency_ms, all_token_thrpt,
                    wall_clock_latency_s, wall_clock_thrpt,
                ]
            )
            all_csv_metrics.append(csv_metrics)

        except Exception as e:
            logger.info(f"Could not benchmark at batch size = {batch_size}, prompt length = {prompt_length} - {e}")

    filename = f"benchmark_{args.engine}_e2e_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.csv"
    save_results(all_csv_metrics, filename, args.generation_length)


if __name__ == "__main__":
    main()