from __future__ import annotations

import argparse
import logging
import os
import time

import numpy as np
import torch
from benchmark_helper import setup_logger
from dist_settings import get_rank, get_size
from llama_inputs import (
    add_io_bindings_as_ortvalues,
    convert_inputs_for_ort,
    get_merged_sample_with_past_kv_inputs,
    get_sample_inputs,
    get_sample_with_past_kv_inputs,
    verify_ort_inputs,
)
from llama_torch import setup_torch_model
from transformers import AutoConfig

import onnxruntime as ort

logger = logging.getLogger("")


def get_sequence_lengths(args: argparse.Namespace, config: AutoConfig):
    past_sequence_length, curr_sequence_length = (8, 1) if args.use_past_kv else (0, 8)
    max_sequence_length = config.max_position_embeddings
    return past_sequence_length, curr_sequence_length, max_sequence_length


def get_inputs(args: argparse.Namespace, config: AutoConfig):
    # Create dummy inputs with shapes matching what the exported ONNX model expects
    world_size = get_size()
    batch_size = 2
    past_sequence_length, sequence_length, max_sequence_length = get_sequence_lengths(args, config)

    if args.merged:
        inputs = get_merged_sample_with_past_kv_inputs(
            config,
            args.device,
            batch_size,
            seq_len=sequence_length,
            past_seq_len=past_sequence_length,
            max_seq_len=max_sequence_length,
            use_fp16=args.use_fp16,
            use_buffer_share=args.use_buffer_share,
            return_dict=True,
            world_size=world_size,
        )
    elif args.use_past_kv:
        inputs = get_sample_with_past_kv_inputs(
            config,
            args.device,
            batch_size,
            sequence_length,
            use_fp16=args.use_fp16,
            return_dict=True,
            world_size=world_size,
        )
    else:
        inputs = get_sample_inputs(config, args.device, batch_size, sequence_length, return_dict=True)

    return inputs


def verify_parity(
    args: argparse.Namespace,
    location: str,
    use_auth_token: bool,
    kv_cache_ortvalues: dict,
    pytorch_model: None | torch.nn.Module = None,
    config: None | AutoConfig = None,
):
    py_model = pytorch_model
    if py_model is None:
        config, py_model = setup_torch_model(
            args,
            location,
            use_auth_token,
            torch_dtype=(torch.float16 if args.use_fp16 else torch.float32),
            device=args.device,
        )

    inputs = get_inputs(args, config)

    # Run inference with PyTorch
    if args.execution_provider != "cpu":
        torch.cuda.synchronize()
    start_time = time.time()
    pt_outputs = py_model(**inputs).logits.detach().cpu().numpy()
    if args.execution_provider != "cpu":
        torch.cuda.synchronize()
    end_time = time.time()
    logger.info(f"PyTorch took {end_time - start_time} s")

    # On small-memory GPUs, unload the PyTorch model early to free memory for ONNX Runtime
    if args.small_gpu and py_model is not None:
        del py_model
        torch.cuda.empty_cache()

    # Run inference with ONNX Runtime
    past_sequence_length, _, max_sequence_length = get_sequence_lengths(args, config)
    inputs = convert_inputs_for_ort(
        inputs,
        use_buffer_share=args.use_buffer_share,
        past_seq_len=past_sequence_length,
        max_seq_len=max_sequence_length,
    )

    ep = f"{args.execution_provider.upper()}ExecutionProvider"
    if ep == "CUDAExecutionProvider":
        ep = (ep, {"device_id": args.rank})
    ort_model = ort.InferenceSession(args.onnx_model_path, sess_options=ort.SessionOptions(), providers=[ep])
    inputs = verify_ort_inputs(ort_model, inputs)

    # Add IO bindings for non-CPU execution providers
    if args.execution_provider != "cpu":
        io_binding, kv_cache_ortvalues = add_io_bindings_as_ortvalues(
            ort_model,
            ort_inputs=inputs,
            device=args.execution_provider,
            device_id=int(args.rank),
            use_buffer_share=args.use_buffer_share,
            kv_cache_ortvalues=kv_cache_ortvalues,
        )

        io_binding.synchronize_inputs()
        start_time = time.time()
        ort_model.run_with_iobinding(io_binding)
        io_binding.synchronize_outputs()
        end_time = time.time()

        ort_outputs = io_binding.copy_outputs_to_cpu()[0]  # Get logits
        del ort_model
    else:
        start_time = time.time()
        ort_outputs = ort_model.run(None, inputs)
        end_time = time.time()

        ort_outputs = ort_outputs[0]  # Get logits

    logger.info(f"ONNX Runtime took {end_time - start_time} s")

    # Compare PyTorch and ONNX Runtime outputs (quantized models get a looser tolerance)
    tol = 2e1 if "int4" in args.onnx_model_path or "int8" in args.onnx_model_path else 5e-1
    parity = np.allclose(pt_outputs, ort_outputs, rtol=tol, atol=tol)
    logger.warning(f"Are PyTorch and ONNX Runtime results close? {parity}")
    if not parity:
        logger.warning(f"Max diff: {np.max(pt_outputs - ort_outputs)}")
    return kv_cache_ortvalues


def get_args(argv: list[str]):
    parser = argparse.ArgumentParser()

    parser.add_argument("-m", "--model_name", required=False, help="Model name in Hugging Face")
    parser.add_argument(
        "-t",
        "--torch_model_directory",
        required=False,
        default=os.path.join("."),
        help="Path to folder containing PyTorch model and associated files if saved on disk",
    )
    parser.add_argument(
        "-o",
        "--onnx_model_path",
        required=True,
        help="Path to ONNX model (with external data files saved in the same folder as the model)",
    )
    parser.add_argument(
        "-ep",
        "--execution_provider",
        required=False,
        default="cpu",
        choices=["cpu", "cuda", "rocm"],
        help="Execution provider to verify parity with",
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="Print verbose logs")
    parser.set_defaults(verbose=False)
    parser.add_argument(
        "-p",
        "--use_past_kv",
        action="store_true",
        help="Use past key and past value as inputs to the model. Necessary for decoder_with_past_model.onnx models.",
    )
    parser.set_defaults(use_past_kv=False)
    parser.add_argument(
        "-g",
        "--use_buffer_share",
        action="store_true",
        help="Use if model has GroupQueryAttention and you want to enable past-present buffer sharing",
    )
    parser.set_defaults(use_buffer_share=False)
    parser.add_argument("--merged", action="store_true", help="Use merged model (i.e. decoder_merged_model.onnx).")
    parser.set_defaults(merged=False)
    parser.add_argument(
        "-fp", "--precision", required=True, choices=["int4", "int8", "fp16", "fp32"], help="Precision of model"
    )
    parser.add_argument(
        "--cache_dir",
        required=False,
        type=str,
        default="./model_cache",
        help="model cache dir to override default HF cache dir to avoid overflood the /home dir",
    )
    parser.add_argument(
        "--small_gpu",
        action="store_true",
        help="Load the llama in GPU every time for parity_check if it's running in a machine which GPU memory < 36GB. ",
    )

    args = parser.parse_args() if argv == [] else parser.parse_args(argv)

    # Use FP32 for FP32/INT8 models and for INT4 on CPU; use FP16 otherwise
    args.precision = (
        "fp32"
        if args.precision in {"int8", "fp32"} or (args.precision == "int4" and args.execution_provider == "cpu")
        else "fp16"
    )
    return args


def main(argv: list[str] = []):
    args = get_args(argv)
    setup_logger(args.verbose)
    logger.info(f"Arguments: {args}")

    rank = get_rank()

    # Derive per-rank device settings from the parsed arguments
    setattr(args, "use_fp16", args.precision == "fp16")
    args.rank = rank
    setattr(args, "device_name", "cpu" if args.execution_provider == "cpu" else f"cuda:{rank}")
    setattr(args, "device", torch.device(args.device_name))

    use_auth_token = args.torch_model_directory == os.path.join(".")
    location = args.model_name if use_auth_token else args.torch_model_directory

    kv_cache_ortvalues = {}
    if not args.merged:
        verify_parity(args, location, use_auth_token, kv_cache_ortvalues)
    else:
        config = llama = None
        if not args.small_gpu:
            config, llama = setup_torch_model(
                args,
                location,
                use_auth_token,
                torch_dtype=(torch.float16 if args.use_fp16 else torch.float32),
                device=args.device,
            )

        # Verify prompt processing (no past KV) first, then token generation (with past KV) in the merged model
        args.use_past_kv = False
        kv_cache_ortvalues = verify_parity(
            args, location, use_auth_token, kv_cache_ortvalues, pytorch_model=llama, config=config
        )
        args.use_past_kv = True
        verify_parity(args, location, use_auth_token, kv_cache_ortvalues, pytorch_model=llama, config=config)


if __name__ == "__main__":
    seed = 2
    np.random.seed(seed)
    torch.manual_seed(seed)
    main()
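# Illustrative usage note (not part of the original module): the model name, output paths, and module
# invocation below are assumptions chosen only to match the arguments defined in get_args() above.
#
# Parity check for a non-merged FP32 decoder model on CPU:
#   python -m models.llama.llama_parity -m meta-llama/Llama-2-7b-hf \
#       -o ./llama2-7b-fp32/rank_0_decoder_model.onnx -ep cpu -fp fp32
#
# Parity check for a merged FP16 model on CUDA (runs both prompt processing and token generation):
#   python -m models.llama.llama_parity -m meta-llama/Llama-2-7b-hf \
#       -o ./llama2-7b-fp16/rank_0_decoder_merged_model.onnx -ep cuda -fp fp16 --merged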