U
    T?hC                     @   s^   d dl Zd dlZd dlmZ d dlZejejej	ej
dZdd ZG dd dZdd
dZdS )    N)AutoTokenizer)ztorch.int32ztorch.int64ztorch.float32ztorch.float16c                 C   s8   ddl m} ||  | | |  |jj d S )Nr   )cudart)cudar   Z
cudaMemcpydata_ptrZelement_sizeZnelementZcudaMemcpyKindZcudaMemcpyDeviceToDevice)dstsrcr    r   h/var/www/html/venv/lib/python3.8/site-packages/onnxruntime/transformers/models/phi2/inference_example.pycuda_memcpy   s    r
   c                   @   s\   e Zd Zdd Zdd Zdd Zejeeddd	Z	dddZ
dddZdd Zdd ZdS )ORTGeneratorc                 C   s:   || _ d| _d| _d| _d| _d| _d| _d| _i | _d S )N    P   i   r   F)	onnx_decoder_path	num_heads	head_size
num_layersmax_sequence_length	device_iduse_cuda_graphuse_traced_inputsstatic_inputs_map)selfZdecoder_pathr   r   r	   __init__    s    zORTGenerator.__init__c                 C   s  || j krd S td}td| j}i }tj|dftj|d|d< tjdgtj|d|d< tj|dg tj|d|d< tjdgtj|d|d	< || j| j	| j
f}t| jD ]@}tj||tjd
}|d| | d| |  i qtj|ddftj|d|d< || j |< d S )Ncpur      )dtypedevice	input_idsr   step	seqlens_ktotal_sequence_lengthr   r   	past_key_past_value_   logits)r   torchr   r   zerosint32tensorint64r   r   r   ranger   float16update
contiguousclone)r   
batch_sizeZ
cpu_deviceZcuda_deviceZ	static_ioZcache_shapeicacher   r   r	   append_static_inputs+   s    

,z!ORTGenerator.append_static_inputsc              	   C   s@  | j rtjntj| _tj|d | jtjd}tj|d | jtjd}|j\}}| j	oj|| j
koj| joj| j | _| jstjdg| jtjdn| j
| d }| jstj|dg | jtjdn| j
| d }t||ddtj | jstjdgtdtjdn| j
| d	 }||d< | | d
}	| jrB| |	d< | j	rh| |	d< | |	d	< |	d= | jrv| jnd}
| jrd|| j|
| jfn|| j|
| jf}| jst| jD ]d}tj|| j| jd}| js |	d| | d| |  in|	d| | i qnXt| jD ]L}|	d| | j
| d|   d| | j
| d|   i q(tj||d| j| jd}d| i}| js8| jrd|| j|| jfn|| j|| jf}t| jD ]`}tj|| j| jd}| js|d| | d| | in|d| | i q|	|fS )Nr   r!   attention_maskr   r   r   r   r   r    )r   r4      r"   r#   past_r$   r%   present_key_present_value_present_)use_fp16r&   r,   float32torch_dtyper)   r   r(   shaper   r   use_buffer_share	packed_kvr   r*   r
   sumsubtor.   use_stepr   r   r   r+   r   r'   r-   r/   )r   encodings_dictr   r4   r0   sequence_lengthr   r   Ztotal_seq_lengthinputsZpast_seq_lengthZ
past_shaper1   pastr%   outputspresent_shapepresentr   r   r	   get_initial_inputs_and_outputsB   s    
	.  z+ORTGenerator.get_initial_inputs_and_outputs)modelrF   rH   c           
   	   C   s  |  }d }| D ]P\}}|j||jj|jjdkr8dn|jjtt|j t	|j
| d |j}q| D ]}|j}	| jrd|	kr||	dd }|j|	|jj|jj| jrtjntjt	|j
| d qn||	 }|j|	|j|jdkrdn|j| jrtjntjt	|j
| d qn|S )Nr   r   )nameZdevice_typer   Zelement_typer=   Z
buffer_ptrrJ   rG   )
io_bindingitemsZ
bind_inputr   typeindexpt_to_npreprr   tupler=   r   get_outputsrM   r>   replaceZbind_outputr:   npr,   r;   )
r   rL   rF   rH   rN   r   kvoutputrM   r   r   r	   apply_io_binding   sD    		zORTGenerator.apply_io_bindingTFc           	      C   s   || _ t }d|_d|_|| _| j dkr<d| j | jdfnd}tj| j||gd| _t	 | _
tj rxtd| j ntd| _|| _|| _|| _|| _tjd	d
d| _d| j_d S )N   r   ZCUDAExecutionProvider)r   Zenable_cuda_graphZCPUExecutionProvider)sess_options	providersr   r   zmicrosoft/phi-2T)Ztrust_remote_codez[PAD])r   ortZSessionOptionsZlog_verbosity_levelZlog_severity_levelr   InferenceSessionr   sessZ
RunOptionsror&   r   Zis_availabler   r:   r>   r?   rC   r   Zfrom_pretrained	tokenizerZ	pad_token)	r   r   r:   r>   r?   rC   r   r]   epr   r   r	   create_session   s$    
$zORTGenerator.create_sessionc              
   C   s  |  |\}}|d  }|j\}}	|	}
tj|| jtjd}|rLdd l}g }d}|
|k rP| | j	||}|rv| }|
  |r| jr| jdd | j	|| j | jr| jd| jrt|nd d}n| j	|| j |  |r| }|||  |d d d d	d d f }tj|d	d
}||B | jjk}||| jj|dg}tj||gd	d
}t|rvqP|
d7 }
|tj|d< | jrt| j| d |d  | j| d |d< | jrtj|
d g| jtj d|d< | jrt| j| d |d  | j| d |d< | jr|d }|| |d tj|d< |
|d d< | jrt| j| d |d  | j| d |d< |d d | j| d d< | j| d |d< n*t|d | |dgdtj|d< |d jd dkr0|d d d d dd d f ! |d< | jr0| j| d |d< |d "  | j#sPt$| j%D ]X}| j&s|d|  |d| < |d|  |d| < n|d|  |d| < qL|d jd }| j&rd|| j'|| j(fn|| j'|| j(f}t$| j%D ]d}tj|| j| j)d}| j&s2|*d| |! d| | ! in|*d| |! i qqP|rt+d| d|	 d||	   t+dd|d   ddt,-|dd    d d S | jj.|dd}|S )Nr   r!   r   TZgpu_graph_idz-1Fr%   )dimr   r   r   r    r4   r7   r"   r8   r#   r9   r6   r5   zBatch size: z, Sequence length: z, Token num: zPrompt letency: i  zms, Token latency: ms)Zskip_special_tokens)/rK   r/   r=   r&   r'   r   booltimer[   ra   Zsynchronize_inputsr   rb   Zadd_run_config_entryZrun_with_iobindingr   strZsynchronize_outputsappendZargmaxrc   Zeos_token_idZmasked_fillZreshapecatallrB   r(   r
   r   rC   r)   r*   r.   Zzero_r>   r+   r   r?   r   r   r<   r-   printrW   ZmeanZbatch_decode)r   rD   
max_lengthcuda_graph_annotation	benchmarkrF   rH   Zall_token_idsr0   rE   Zcurrent_lengthZhas_eosrj   ZlatencyZ
prompt_runrN   startendZnext_token_logitsZnext_tokensZtokens_to_addZprevious_seqlens_kr1   Znew_sequence_lengthrI   rJ   textsr   r   r	   generate_impl   s    

  &  
0zORTGenerator.generate_implc                 C   s   | j j|dd}| |||S )NT)padding)rc   Zbatch_encode_plusrv   )r   promptrp   rq   rD   r   r   r	   generatea  s    zORTGenerator.generatec                 C   sx   |\}}|| }i }t jdd||ft jd |d< t j||ft jd |d< | j|||dd | j|||dd d S )	Nr   iX  )r   r   r4   F)rr   T)r&   randintr(   tolistZonesrv   )r   prompt_shape	token_numrq   r0   rE   rp   rD   r   r   r	   generate_benchmarkf  s     zORTGenerator.generate_benchmarkN)TTFFF)F)__name__
__module____qualname__r   r3   rK   r_   r`   dictr[   re   rv   ry   r~   r   r   r   r	   r      s   ])         

 r   FTc                    s~   t |   |||||  fdd}dg}	|s<||	 |rzd}
dD ]0} | dD ]}||f} j||
|d qZqHd S )Nc                    sZ   t | }r j|d  j| d|d}tt |D ] }td| |  td||  q4d S )N)r0      )rp   rq   zPrompt: zTexts: )lenr3   ry   r+   ro   )rx   Zexample_batch_sizeru   r1   	generatorr   r   r	   
simple_run  s    zrun_phi2.<locals>.simple_runzV```python
    def print_prime(n):
    """
    Print all primes between 1 and n
    """r   )r   r5   r\      )   i   )rq   )r   re   r3   r~   )Zonnx_model_pathr>   r   r?   r:   rC   r   Zrun_benchmarkr   rx   r}   r0   rE   r|   r   r   r	   run_phi2u  s    

r   )FTFFF)numpyrW   r&   Ztransformersr   Zonnxruntimer_   r(   r*   r;   r,   rR   r
   r   r   r   r   r   r	   <module>   s$     \     