U
    T?hD:                     @   sr  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
Z
ddlZddlZddlmZ dd Zejdd	d
Zd&ee	e dddZejeedddZejeedddZejeejdddZeejedddZejeeeeedddZejeeeedddZe ee	e eeed d!d"Zd#d$ Z e!d%krne  Z"ee"j#e"j$e"j%e"j&e"j' dS )'z
Export LLM to onnx
    N)Path)Optional)nnc                   C   st   dd t jj_dd t jj_dd t jj_dd t jj_dd t jj_dd t jj_dd t jj_	d	d t jj_
d
S )z1do not init model twice as it slow initializationc                 _   s   | S N xargskwargsr   r   _/var/www/html/venv/lib/python3.8/site-packages/onnxruntime/transformers/large_model_exporter.py<lambda>       z*disable_huggingface_init.<locals>.<lambda>c                 _   s   | S r   r   r   r   r   r   r      r   c                 _   s   | S r   r   r   r   r   r   r      r   c                 _   s   | S r   r   r   r   r   r   r      r   c                 _   s   | S r   r   r   r   r   r   r      r   c                 _   s   | S r   r   r   r   r   r   r      r   c                 _   s   | S r   r   r   r   r   r   r       r   c                 _   s   | S r   r   r   r   r   r   r   !   r   N)torchr   initZkaiming_uniform_Zuniform_Znormal_Z	constant_Zxavier_uniform_Zxavier_normal_Zkaiming_normal_Zorthogonal_r   r   r   r   disable_huggingface_init   s    r   )modelc                 C   s   d}d}|   D ]$}|| |  7 }|| 7 }qd}d}|  D ]$}|| |  7 }|| 7 }qF|| d d }|S )z-to calculate how much memory this model needsr      )
parametersZnelementZelement_sizebuffers)r   Z
param_sizeZ	param_sumparambuffer_sizeZ
buffer_sumbufferZall_sizer   r   r   get_model_parameter_size$   s    r   )hf_model	cache_dirc                 C   sP   t   tjj| tj|dd}|dkr(| }tj|}t|ddd }||fS )zU
    get the pretrained torch model from hugginface,
    and sample model-inputs
    T)Ztorch_dtyper   Ztrust_remote_codeNzHello, my dog is cutept)Zreturn_tensors)	r   transformersZAutoModelForCausalLMZfrom_pretrainedr   Zfloat16ZAutoTokenizertuplevalues)r   r   	tokenizerr   sample_inputsr   r   r   "initialize_model_and_sample_inputs4   s       r!   )r   gpulistr    c                    s  dd } fdd |   } g }|| j|dd tt|  d }|  D ]0\}}| D ]\}}	||	j|dd t|	tjj	fkrt
t|	t| }
t|	D ]f\}}||j|dd |t||
 t| }||  || td| d	| d	| d
|  qqb|	|d  td| d	| d
|d   qbtt| dkrP||d  td| d
|d   qPt  | |d |d d W 5 Q R X | S )z/Make the model executable across multiple GPUs.c           	      S   s$  g }d }|D ]}t |tjk	r*|| nt| drJ||| jj nt| drvt| 	 |j}||| n`tt| 
 d dr||t| 
 jj n.|d k	r|j|kr||| n
|| |d kr|d j}q| D ]$\}}t |tjkr||||< qt||fS )Nweightr   r   )typer   Tensorappendhasattrtor#   devicenextr   childrenitemsr   )	modinputsr
   Zmodifyed_inputsZ	first_devZlayer_inputr)   keyvaluer   r   r   input_gpu_device_hookJ   s*    


z5auto_pipeline_parallel.<locals>.input_gpu_device_hookc                    s*   |  | |  D ]} |d | qd S )N   )r(   named_children)r-   devlayermove_layer_to_device_rurcr   r   r7   c   s    
z9auto_pipeline_parallel.<locals>.move_layer_to_device_rurcTZwith_kwargsr   zmove .z to r2   attention_mask)halfr&   register_forward_pre_hookr*   iterr3   r$   r   r   Z
ModuleListmathfloorlen	enumerateminr(   printlistno_grad)r   r"   r    r1   Z	all_hooksZpre_fixZtop_nameZ
top_modulenamemoduleZnum_layers_on_each_gpuidxZ
attn_layerZto_devr   r6   r   auto_pipeline_parallelG   s2    

$ 
rJ   )r   r    	with_pastc                    s2  g fdd}| j |dd}t| jj t  } fdd|D }| |d |d d	}|  d |}td D ]\}	}
d |	 ||	< q~d 	 D ]\}}|
|}	|||	< qtt||D ]V\}	\}}t|tjkr|| j d
|kr|||	< |r | |d |d |dn|}q|||jfS )zn
    auto retrieve onnx inputs from torch model as we can't enumlate all possibilities
    for all models
    c                    s     ||f  d S )Nr   )r&   )_r.   r
   )user_inputsr   r   hook_for_inputs   s    z-retrieve_onnx_inputs.<locals>.hook_for_inputsTr8   c                    s   g | ]}  |jqS r   )getdefault).0r/   )forward_paramsr   r   
<listcomp>   s     z(retrieve_onnx_inputs.<locals>.<listcomp>r   r2   r:   	use_cache)r;   rT   )r=   inspect	signatureforwardr   rE   keysremoverB   r,   indexzipr$   r   r%   r(   r)   past_key_values)r   r    rK   rN   Zhook_handle
input_keysdefault_valuesoutonnx_inputsrI   Z_valr/   r0   r   )rR   rM   r   retrieve_onnx_inputs   s,    

"ra   )r   sample_inputs_tpreturnc                 C   s   t jdjd d }tdt| d  d td|d  d t| |d krdd tt j D }t|d	krtt| d
 t	| ||} qtd | 
  } ntd |   } | S )a	  
    According to the model size, we will upload it to
    CPU if has no GPU or enough GPU memory,
    Single GPU if has only one GPU in local or model size is enough to fit one GPU
    Multiple GPU if there is more than one gpu in local and model is too large
    r   r   zModel_Size = z GBztotal_mem_per_cpu = g?c                 S   s   g | ]}t |qS r   )r   r)   rQ   ir   r   r   rS      s     z.move_to_appropriate_device.<locals>.<listcomp>r2   zk GPUs are used to export onnx,                    Please set CUDA_VISIBLE_DEVICES to use specific GPU groupz5!!!! convert model to float and export onnx using CPUzExport model on a single GPU)r   cudaZget_device_propertiesZtotal_memoryrD   r   rangeZdevice_countrA   rJ   cpufloatr<   )r   rb   Ztotal_mem_per_cpuZdevice_collectionr   r   r   move_to_appropriate_device   s    rj   )r    r)   rc   c                 C   s>   g }| D ],}t |tjr*||| q|| qt|S )zmove inputs to device)
isinstancer   r%   r&   r(   r   )r    r)   Zsample_inputs_Z
sample_intr   r   r   adapt_inputs_to_device   s    rl   )r   r`   torch_input_namesr\   rK   input_with_pastc                    s>  d}ddi}|dk	rht |}t|d d jd jd k d}| dksXtdd| di}|st| j	j
}tfddtt D }	d	|	krd
|	kstdd}
ddddddd}t|	D ]6\ }||kr· fddt   D }|||< q|rXt|D ]B}|	d| df7 }	|	d| df7 }	|||	d < |||	d < q|sd|rt|D ]*}|
d| df7 }
|
d| df7 }
qltD ]\ }|r|dkr| < nn|d
kr  }tj|tj|jd df|j|jdfdd < n*|d	kr  }|ddddf  < q|	|
|fS )z"fetch onnx inputs and outputs namer   Z
batch_sizeNr2   Zseq_lenc                    s$   g | ]}t  | tjr| qS r   )rk   r   r%   rd   )r`   rm   r   r   rS      s      z2fetch_onnx_inputs_outputs_name.<locals>.<listcomp>	input_idsr;   z6input_ids and attention_mask must be existed in inputs)Zlogits)r   r2   )rp   r;   c                    s   i | ]}|  d | qS )Z__unknown_dims__r   rd   )rI   r   r   
<dictcomp>   s      z2fetch_onnx_inputs_outputs_name.<locals>.<dictcomp>zpast_key_values.z.keyz.valuezpresent.r\   )r)   dtype)dim)rA   r   ZtensorshapeZnonzeroviewZnumelAssertionErroritemconfigZnum_hidden_layersr   rg   rB   rt   catZonesr)   rs   )r   r`   rm   r\   rK   rn   Znum_of_past_keyZkv_cache_axisZ	seq_indexonnx_inp_namesonnx_out_namesonnx_dynamic_axesrG   Zunknown_dimsre   Z	attn_maskrp   r   )rI   r`   rm   r   fetch_onnx_inputs_outputs_name   sd    	,



 
r~   )r   onnx_io_tupler`   	onnx_pathopsetc                 C   s   |j }|\}}}t }	tj|	d}
tjj| t	||
d||||d |j
dd |j| d j
dd tt|
}tj|t|tt|	dkd| dddd	 W 5 Q R X d
S )z do export with torch.onnx.exportztmp.onnxF)r   r	   fverboseZopset_versionZinput_namesZoutput_namesZdynamic_axesT)
missing_okz	_ext.datar2   r   )Zsave_as_external_dataZall_tensors_to_one_filelocationZsize_thresholdZconvert_attributeN)rG   tempfileTemporaryDirectoryospathjoinr   onnxZexportr   unlinkparentloadstrZ
save_modelrA   listdir)r   r   r`   r   r   onnx_model_namer{   r|   r}   Z
tmpdirnameZtmp_onnxZ
onnx_modelr   r   r   do_export_internal  s4    

r   )r   r   onnx_path_strrK   r   c                 C   s   t | |\}}t||}t|t| j}t|||\}}	}
t||	||
|d}d}t|	 }|j
dkrr|| }t|||	|| |sdS t||	||
|d}d}|j| }t|||	|| dS )z
    do export
    model: torch model
    onnx_path: where the onnx model saved to
    sample_inputs_tp: inputs for torch model
    Fz
model.onnxz.onnxNTzmodel_with_past.onnx)r!   rj   rl   r*   r   r)   ra   r~   r   absolutesuffixr   r   )r   r   r   rK   r   r   rb   r    r]   r`   Zpast_key_valuer   r   r   r   r   r   export_onnx6  s     


r   c                  C   sx   t  } | jdddtdgdd | jddd	td
dd | jdd	tddd | jddd	dd | jdd	tddd |  S )zarguments parsing.z-mz--modelTzmeta-llama/Llama-2-70b-hfz+Pre-trained models in huggingface model hub)requiredr$   rP   helpz-sz--saved_pathFz./onnx_models/z"where the onnx model will be savedz--cache_dirNz[cache directly of huggingface, by setting this to avoid useless downloading if you have onez--with_past
store_truez;The tool will export onnx without past-key-value by default)actionrP   r   z--opset   zothe opset to save onnx model,               try to increase it if this opset doens't have new features you want)argparseArgumentParseradd_argumentr   int
parse_args)parserr   r   r   parse_argumentsZ  sL    
r   __main__)N)(__doc__r   rU   r?   r   r   pathlibr   typingr   r   r   r   r   r   Moduler   r   r!   rE   r   rJ   boolra   rj   r)   rl   r~   r   r   rF   r   r   __name__r	   r   r   Z
saved_pathrK   r   r   r   r   r   <module>   s@   =#D$#.
