U
    T?hh                     @   sr  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZ	d dl
Z
d dlZd dlZd dlmZmZ d dlmZmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZmZmZ d dl m!Z! d dl"m#Z#m$Z$m%Z% d dl&Z'e(e)Z*d	d
 Z+e j,e-dddZ.e j,dddZ/dd Z0dd Z1dd Z2dd Z3dd Z4dd Z5d"ddZ6dd  Z7e)d!krne7  dS )#    N)measure_memorysetup_logger)get_rankget_size)add_io_bindings_as_ortvalues%get_merged_sample_with_past_kv_inputsget_msft_sample_inputsget_sample_inputsget_sample_with_past_kv_inputsverify_ort_inputs)ORTModelForCausalLM)ProfilerActivityprofilerecord_function)trange)
AutoConfigAutoModelForCausalLMAutoTokenizerc                 C   sR   | j dkrdS | j dkrFzt|jW S  tk
rD   t|jj Y S X t| S )N   hf-pt-eagerhf-pt-compiler   hf-ort)benchmark_typelenZinputs_names	ExceptiondecoderZinput_names
get_inputsargsmodel r    a/var/www/html/venv/lib/python3.8/site-packages/onnxruntime/transformers/models/llama/benchmark.pyget_ort_model_inputs_len(   s    

r"   )r   ort_model_inputs_lenc                 C   s  d\}}| j dkrdn| jj}| j dkrdt| j| j| j| jdd}t| j| j| j| j| jdd}nh| j dkr|d	krt| j| j| j| jdd}t| j| j| j| j| jdd}nPt	| j| j| j| jd
|| j| j
ddd
}t	| j| j| jd| j|| j| j
ddd
}n| j dkrjt	| j| j| j| jd
|| j| j
dd| jd}t	| j| j| jd| j|| j| j
dd| jd}nb| j dkr|dk}t| j| jd
| j|| j| j
|d}t| j| j| jd|| j| j
|d}ntd||fS )NNNort-msfti   r   T)return_dict)use_fp16r&   >   r      r   pt)seq_lenpast_seq_lenmax_seq_lenr'   use_buffer_shareenginer&      ort-convert-to-onnxort)r*   r+   r,   r'   r-   r.   r&   
world_size   )r+   r*   r,   r'   r-   split_kvz/Unable to auto-detect inputs for provided model)r   configZmax_position_embeddingsr	   target_device
batch_sizesequence_lengthr
   r'   r   r-   r2   r   r   )r   r#   init_inputsiter_inputsr,   r4   r    r    r!   r   5   s    

	

r   r   c                 C   s0  d\}}d\}}| j dkr| jr&| jn| j}t }tj|| jrFtjntj	| j
| j
d| jd| j}t }| j dkrt|}n>| j dkrt }| j|_| jrd|_d|_ntd| j  | j d	krt| jtkr| jd
 n| j}t| jtkr| jd nd }d }d }	t| jD ]`}
d|
ksd|
ksd|
krDqd|
ksX|
dkr\|
}d|
krj|
}	d|
kr|
}|
}	qt }tj| j||	| j
| j
d|dkrdnd |||d
}t }| j dkrtd| j !| j"  t }tj#| j !| j"|| jgd}t }td||  d |S )Nr$   r   T)Ztorch_dtypeuse_auth_tokentrust_remote_codeZ	use_cache	cache_dirr   >   r   r0   r%   r/   Cannot recognize r   r   z.onnxz
.onnx_dataz
.onnx.dataZdecoder_modelz
model.onnxZdecoder_with_past_modelZdecoder_merged_model)	decoder_file_namedecoder_with_past_file_namer<   r=   Zuse_io_bindingZ
use_mergedproviderprovider_optionsZsession_options   r0   r%   zLoading model from )	providerszLoaded model in  s)$r   Zhf_pt_dir_path
model_nametimer   from_pretrainedr'   torchZfloat16Zfloat32authr>   tor6   compiler1   ZSessionOptionsr   Zenable_profilingverboseZlog_verbosity_levelZlog_severity_levelr   typeexecution_providertupleoslistdirhf_ort_dir_pathr   loggerinfoort_model_pathformatrankZInferenceSession)r   r   Zsess_options
start_timeend_timesourcerB   rC   r@   rA   filenamer    r    r!   	get_model   s    	




r^   c                    sX   j dkrt jnt jtjdd} jr>||}t|  fdd} fdd}|D ]}|  || |  qZd} j dkrt j	nt j	tjdd}	|	D ]4}|  t

 }
|| |  t

 }|||
 7 }q j dkrtd	 | j	 } j| } jdkrTtd
 j  td j  td| d td| d d S )NrD   zWarm up)filedescc                     s*    j dkr jdkr j S  fddS )NcpurD   c                     s&    j dkrtj rtj S dd S )Nra   c                  W   s   d S Nr    kwargsr    r    r!   <lambda>      =time_fn.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>devicerJ   cudais_availableZsynchronizerc   r;   r    r!   re     s
    +time_fn.<locals>.<lambda>.<locals>.<lambda>)ri   r   
io_bindingZsynchronize_inputsrc   r;   r    r!   re     s
    ztime_fn.<locals>.<lambda>c                     s*    j dkr jdkr j S  fddS )Nra   rD   c                     s&    j dkrtj rtj S dd S )Nra   c                  W   s   d S rb   r    rc   r    r    r!   re   (  rf   rg   rh   rc   r;   r    r!   re   %  s
    rl   )ri   r   rm   Zsynchronize_outputsrc   r;   r    r!   re   "  s
    r   Z	Benchmark zBatch Size: zSequence Length: z	Latency: rF   zThroughput: z tps)r   rangeZwarmup_runsr   sysstdoutrN   rU   rV   Znum_runsrH   r7   rY   r8   )r   fninputsZwarmup_rangeoutputsZ
input_syncZoutput_sync_
total_timeZbench_rangerZ   r[   ZlatencyZ
throughputr    r;   r!   time_fn  sF    






rw   c           	      C   s  d| j  d| j d| j  d| j d| j d|jdd d| dtj	 d}d }| jdkrt
tjtjgddd"}td	 || W 5 Q R X W 5 Q R X |jd
dj| j| jd}tj| j| d}t|d}|| W 5 Q R X n|| | d}|S )NbZ_sru   -z%Y-%m-%d_%H:%M:%Sr   T)Z
activitiesZrecord_shapesZprofile_memoryZmodel_inferencer3   )Zgroup_by_stack_n)Zsort_byZ	row_limitz.logwz.json)r7   r8   r   lower	precisionri   __name__replacedatetimenowr   r   ZCPUCUDAr   Zkey_averagestableZpt_filter_byZpt_num_rowsrR   pathjoin
log_folderopenwrite)	r   rr   rs   Zinputs_typeprefixr]   ZprofZ	prof_datafr    r    r!   
profile_fnR  s$    T

  

r   c                    s   t  }t|}|jdd   | jdkrVtd|jd dtjdd  d t	
  tj  t| jdk fd	d
d tj  d S )Ng?)intervalr   zCPU usage: F)Zlogical%ra   c                      s    S rb   r    r    rr   rs   r    r!   re   |  rf   zmeasure_fn.<locals>.<lambda>)Zis_gpufunc)rR   getpidpsutilProcessZcpu_percentrY   rU   rV   	cpu_countgcZcollectrJ   rj   Zempty_cacher   ri   rp   rq   flush)r   rr   rs   pidprocessr    r   r!   
measure_fno  s    

&
r   c                    s    fdd}|}| j dkr*|| || | jrt| ||d}| j dkr jj }td| d|  t	|tj
| j| t| ||d}| j dkrԈ jj }td| d|  t	|tj
| j| d S td	 t| || t| || td
 t| || t| || d S )Nc                    s    f | }|S rb   r    rs   rt   r   r    r!   
get_logits  s    
z$run_hf_inference.<locals>.get_logitsr   promptr   	Renaming  to token7
Evaluating `model(inputs)` step to get past_key_values5
Evaluating `model(inputs)` step with past_key_values)r   r   r   r   sessionend_profilingrU   warningrR   renamer   r   r   Zdecoder_with_pastrV   rw   r   )r   r9   r:   r   r   generate_fnnew_lognameold_lognamer    r   r!   run_hf_inference  s.    




r   c                    sV   fdd}fdd}fdd} j dkr4|n|}i } jr|||\}	}t ||	d}
 }td	| d
|
  t|tj	 j
|
 t |||\}}t ||d}
 }td	| d
|
  t|tj	 j
|
 d S td |||\}	}t ||	 t ||	 td |||\}}t || t || d S )Nc                    sP   t | }  jdkrHt|  jt j j|\}}t d| ||fS | |fS )Nra   rm   )r   ri   r   intrY   r-   setattr)rs   kv_cache_ortvaluesrm   r   r    r!   prepare_ort_inputs  s    

     z-run_ort_inference.<locals>.prepare_ort_inputsc                    s     |  d S rb   )Zrun_with_iobinding)rm   r   r    r!   with_io_binding  s    z*run_ort_inference.<locals>.with_io_bindingc                    s     d | }|S rb   )runr   r   r    r!   without_io_binding  s    z-run_ort_inference.<locals>.without_io_bindingra   r   r   r   r   r   r   )ri   r   r   r   rU   r   rR   r   r   r   r   r^   rV   rw   r   )r   r9   r:   r   r   r   r   r   r   Zort_init_inputsr   r   Zort_iter_inputsr    r   r!   run_ort_inference  s4    

r   c                 C   sH   | j dkrt| ||| n*| j dkr4t| ||| ntd| j  d S )N>   r   r   r   rD   r?   )r   r   r   r   )r   r9   r:   r   r    r    r!   run_inference  s
    

r   c              
   C   s   t  }|jddtddddddgd	 |jd
dtddd |jdddddd |jdddtdddddgdd |jdtddd |jd tdd!d |jd"tdd#d |jd$d%d&d' |jd(d)d*d' |jd+d,ttj rd-nd.d.d-d/gd0 |jd1d2td3d4 |jd5d6td7d4 |jd8d9td:d4 |jd;td<d4 |jd=td>d4 |jd?td@d4 |jdAdddB |jdCtdDdEd |jdFtdGdHd |jdIdddB |jdJttj	
dKdLd |jdMtddNdOdP | }tj|j t|j dQ|jkrJt|dR|j  dS |jdTkr(|jdU| if|_n"|jdVkrJ|jdU| if|_d-|_|jdkrf|jsftdW|jdXkr|jstdY|jdZ|_|jdZ|_|jd[ks|jdkr|jd.krdnd|_|jrt|jd@krt|jd@kstd\|S )]Nz-btz--benchmark-typeTr   r   r   r%   r0   )rO   requiredchoicesz-mz--model-namez<Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf'))rO   r   helpz-az--authF
store_truez5Use Hugging Face authentication token to access model)defaultactionr   z-pz--precisionfp32Zint4int8fp16zePrecision for model. For ONNX models, the model's precision should be set before running this script.)r   rO   r   r   r   z--hf-pt-dir-pathrn   zNPath to directory containing all PyTorch files (e.g. tokenizer, PyTorch model))rO   r   r   z--hf-ort-dir-pathzhPath to directory containing all ONNX files (e.g. tokenizer, decoder_merged, decoder, decoder_with_past)z--ort-model-pathzPath to ONNX modelz-bz--batch-sizesz1 2)r   z-sz--sequence-lengthsz32 64 128 256 512z-dz--devicerj   ra   Zrocm)rO   r   r   z-idz--device-idr   )rO   r   z-wz--warmup-runsr3   z-nz
--num-runs
   z--seed   z--max-length    z--num-return-sequencesr/   z	--profile)r   r   z--pt-filter-byZself_cpu_time_totalz"What to filter PyTorch profiler byz--pt-num-rowsi  z.Number of rows for PyTorch profiler to displayz	--verbosez--log-folder.zFolder to cache log filesz--cache-dirz./model_cachez-Cache dir where Hugging Face files are stored)rO   r   r   r   r1   rP   ZExecutionProviderZCUDAExecutionProviderZ	device_idZROCMExecutionProviderz,Please specify a path to `--hf-ort-dir-path`rD   z+Please specify a path to `--ort-model-path` >   r   r   zOPlease provide only one (batch_size, sequence_length) combination for profiling)argparseArgumentParseradd_argumentstrrJ   rj   rk   r   rR   r   r   
parse_argsnprandomseedZmanual_seedr   r   ri   upperrP   rT   AssertionErrorrW   batch_sizessplitsequence_lengthsr|   r   r   )rY   parserr   r    r    r!   get_args  s        
	   *r   c                  C   s  t  } t }t| }t|j t|j dtj	j
_| |_||_tj|j|j|j|jd}tj|j|j|j|jd}|jdkrd|j n|j}|jdk}t|d| t|d| t|d| t|d	| t|}t||}|jd
krBtj|j|jdd}	ttdd |	j j!}
|o2t"|
dko2|jdk}t|d| nt|dd t#$|j%|j&D ]j\}}|jdkrtd| d| d t|dt'| t|dt'| t(||\}}t)|||| q^d S )NT)r>   r<   r=   ra   zcuda:r   	tokenizerr5   r6   r'   rD   F)Zload_external_datac                 S   s
   | j dkS )NZGroupQueryAttention)Zop_type)noder    r    r!   re     rf   zmain.<locals>.<lambda>r   r-   z
Batch size = z and sequence length = z...r7   r8   )*r   r   r   r   rN   rU   rV   __dict__rJ   backendsZcudnnZ	benchmarkrY   r2   r   rI   rG   r>   rK   r   ri   r|   r   r^   r"   r   onnxZ
load_modelrW   rX   listfiltergraphr   r   	itertoolsproductr   r   r   r   r   )rY   r2   r   r   r5   r6   r'   r   r#   Z
onnx_modelZ	gqa_nodesr-   r7   r8   r9   r:   r    r    r!   main  sR    

      

r   __main__)r   )8r   r   r   r   loggingrR   rp   rH   numpyr   r   r   rJ   Zbenchmark_helperr   r   Zdist_settingsr   r   Zllama_inputsr   r   r   r	   r
   r   Zoptimum.onnxruntimer   Ztorch.profilerr   r   r   Ztqdmr   Ztransformersr   r   r   Zonnxruntimer1   	getLoggerr}   rU   r"   	Namespacer   r   r^   rw   r   r   r   r   r   r   r   r    r    r    r!   <module>   sF    
 UF>;	
 	2
