"""
This converts a GPT-2 or T5 model to ONNX with a beam search operator (greedy search and sampling are also supported).

Example 1: convert gpt2 model with beam search:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx

Example 2: convert gpt2 model with beam search containing specific cuda optimizations:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx --use_gpu --past_present_share_buffer --use_decoder_masked_attention

Example 3: convert gpt2 model with beam search with mixed precision and enable SkipLayerNorm strict mode:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx --use_gpu -p fp16 --use_sln_strict_mode

Example 4: convert T5 model with beam search in two steps:
    cd ./models/t5
    python convert_to_onnx.py -m t5-small
    cd ../..
    python convert_generation.py -m t5-small --model_type t5 --decoder_onnx ./models/t5/onnx_models/t5-small_decoder.onnx --encoder_decoder_init_onnx ./models/t5/onnx_models/t5-small_encoder_decoder_init.onnx --output ./models/t5/onnx_models/t5_small_beam_search.onnx

Example 5: convert T5 model with beam search. All in one step:
    python convert_generation.py -m t5-small --model_type t5 --output ./models/t5/onnx_models/t5_small_beam_search.onnx

Example 6: convert T5 model with beam search containing specific cuda optimizations. All in one step:
    python convert_generation.py -m t5-small --model_type t5 --output ./models/t5/onnx_models/t5_small_beam_search.onnx --use_gpu --past_present_share_buffer --use_decoder_masked_attention

Example 7: convert an MT5 model with an external data file (mt5-base-beamsearch.onnx.data is written next to the output in the example below):
    python convert_generation.py -m google/mt5-base --model_type mt5 --output mt5-base-beamsearch.onnx -e

Example 8: convert gpt2 model with greedy search:
    python convert_generation.py -m gpt2 --output gpt2_greedy_search.onnx --num_beams 1 --num_return_sequences 1

Example 9: convert gpt2 model with sampling:
    python convert_generation.py -m gpt2 --output gpt2_sampling.onnx --num_beams 1 --num_return_sequences 1 --top_p 0.6
    N)Enum)Path)AnyDictListOptionalUnion)	Precisionsetup_logger)NumpyHelper)
GraphProto
ModelProtoTensorProto)	OnnxModel)
GPT2ConfigGPT2LMHeadModelGPT2Tokenizer	MT5ConfigMT5ForConditionalGenerationT5ConfigT5ForConditionalGenerationT5Tokenizer)GraphOptimizationLevelInferenceSessionSessionOptionsget_available_providers)main)PRETRAINED_GPT2_MODELS)export_onnx_models)PRETRAINED_MT5_MODELSPRETRAINED_T5_MODELS c                   @   s    e Zd ZdZdZdZdd ZdS )GenerationTypeZbeam_searchZgreedy_searchZsamplingc                 C   s   | j S )N)value)self r%   ]/var/www/html/venv/lib/python3.8/site-packages/onnxruntime/transformers/convert_generation.py__str__U   s    zGenerationType.__str__N)__name__
__module____qualname__
BEAMSEARCHGREEDYSEARCHSAMPLINGr'   r%   r%   r%   r&   r"   P   s   r"   )argvreturnc                 C   s"  t  }|d}|jdddtddtt t  d |jdd	td
def parse_arguments(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse arguments

    Args:
        argv (Optional[List[str]], optional): Arguments to parse. Defaults to None.

    Returns:
        argparse.Namespace: Parsed arguments.
    zInput optionsz-m--model_name_or_pathTzEPytorch model checkpoint path, or pretrained model name in the list: , )requiredtypehelpz--model_typeFgpt2t5mt5z*Model type (default is gpt2) in the list: )r2   r3   defaultchoicesr4   --cache_dir.Zcache_modelsz%Directory to cache pre-trained models)r2   r3   r8   r4   z--decoder_onnxr!   zLPath of onnx model for decoder. Specify it when you have exported the model.z--encoder_decoder_init_onnxzgPath of ONNX model for encoder and decoder initialization. Specify it when you have exported the model.z	--verbose
store_truezPrint more information)r2   actionr4   )verbosezOutput options--outputz,Output path for onnx model with beam search.z-p--precisionzTPrecision of model to run. fp32 for full precision, fp16 for half or mixed precisionz-b--op_block_list*autozDisable certain onnx operators when exporting model to onnx format. When using defaultvalue for gpt2 type of model fp16 precision, it will be set to ["Add", "LayerNormalization", "SkipLayerNormalization", "FastGelu"]. Other situation, it will be set to [])r2   nargsr8   r4   z-e--use_external_data_formatz!save external data for model > 2G)use_external_data_formatz-sz--run_shape_inferencezrun shape inference)run_shape_inferencez-dpvsz--disable_pad_vocab_sizezDo not pad logits MatMul weight to be a multiple of 8 along the dimension where dim value is the vocab size. The logits MatMul may hence be of poor performance for fp16 precision.)disable_pad_vocab_sizez-dsgdz,--disable_separate_gpt2_decoder_for_init_runzDo not create separate decoder subgraphs for initial and remaining runs. This does not allow for optimizations based on sequence lengths in each subgraph)*disable_separate_gpt2_decoder_for_init_runz-iz--disable_shared_initializerszdo not share initializers in encoder and decoder for T5 or in the init decoder and decoder for GPT2. It will increase memory usage of t5/mt5/gpt2 models.)disable_shared_initializersz6Beam search parameters that stored in the output modelz--output_sequences_scoreszoutput sequences scores)output_sequences_scoresz--output_token_scoreszoutput token scores)output_token_scoresz--early_stopping)r2   r=   )early_stoppingz--no_repeat_ngram_sizer   zNo repeat ngram size)r3   r2   r8   r4   z--vocab_maskz\Enable vocab_mask. This mask applies only to every generated token to filter some bad words.)
vocab_maskz--past_present_share_bufferzWUse shared buffer for past and present, currently work for gpt2 greedy/sampling search.)past_present_share_bufferz--use_decoder_masked_attentionzUses `DecoderMaskedSelfAttention` or `DecoderMaskedMultiHeadAttention` to optimize the decoding Attention computation. Must be used with `past_present_share_buffer`. Currently, only Attention head sizes of 32, 64 and 128 are supported.)use_decoder_masked_attentionz--prefix_vocab_maskzeEnable prefix_vocab_mask. This mask can be used to filter bad words in the first generated token only)prefix_vocab_maskz--custom_attention_maskz]Enable custom_attention_mask. This mask can be used to replace default encoder attention mask)custom_attention_maskz--presence_maskz!Presence mask for custom sampling)presence_maskz--seedzRandom seed for sampling op)seedzYBeam search parameters not stored in the output model, for testing parity and performancez--min_length   zMin sequence lengthz--max_length2   zMax sequence lengthz--num_beams   z	Beam sizez--num_return_sequencesz&Number of return sequence <= num_beamsz--length_penaltyz<Positive. >1 to penalize and <1 to encourage short sentence.z--repetition_penaltyz-Positive. >1 to penalize and <1 to encourage.z--temperature      ?z6The value used to module the next token probabilities.z--top_pzTop P for samplingz--filter_valueZInfzFilter value for Top P samplingz--min_tokens_to_keepzAMinimum number of tokens we keep per batch example in the output.z--presence_penalty        z%presence penalty for custom sampling.z--customz&If 1 customized top P logic is appliedz--vocab_sizezIVocab_size of the underlying model used to decide the shape of vocab maskz--eos_token_idzKcustom eos_token_id for generating model with existing onnx encoder/decoderz--pad_token_idzKcustom pad_token_id for generating model with existing onnx encoder/decoderz0Other options for testing parity and performancez--use_sln_strict_modez_Enable strict mode for SLN in CUDA provider. This ensures a better accuracy but will be slower.)use_sln_strict_mode	--use_gpuz)use GPU for inference. Required for fp16.)use_gpuz--disable_parityzdo not run parity test)disable_parityz--disable_perf_testzdo not run perf test)disable_perf_testz--torch_performanceztest PyTorch performance)torch_performancez--total_runsz4Number of times of inference for latency measurementz--save_test_dataz-save test data for onnxruntime_perf_test tool)save_test_data)argparseArgumentParseradd_argument_groupadd_argumentstrjoinr   r    r   ospathset_defaultsr	   FLOAT32FLOAT16intfloat
parse_args)r.   parserZinput_groupZoutput_groupZmodel_groupZbeam_parameters_groupZ
test_groupargsr%   r%   r&   parse_argumentsY   s   	
		


    

   
rr   )rq   c                 C   s   | j }d|d| jdd| jtjkr$dndddd	d
dg}| jrJ|d| jg | jrZ|d | j	rj|d t
| jr|dg || j | jtjkr| jstd| jrtd|  t|d dS )zqConvert GPT-2 model to onnx

    Args:
        args (argparse.Namespace): arguments parsed from command line
    r0   r?   z--optimize_onnxr@   fp32fp16z--test_runs1z--test_cases10z--overwriter:   r\   rE   rA   zEfp16 or mixed precision model cannot run in CPU. Please add --use_gpuzarguments for convert_to_onnx:)r.   N)model_name_or_pathdecoder_onnx	precisionr	   rk   	cache_dirextendr]   appendrF   lenop_block_listrl   AssertionErrorr>   loggerinfoconvert_gpt2_to_onnx)rq   Z
model_name	argumentsr%   r%   r&   gpt2_to_onnx  s8    


r   c                 C   s   t | j| jt| jj| j| j| jt	j
k| jdddddd| jd}td|d   td|d   |d | _|d | _dS )	znConvert T5 model to onnx

    Args:
        args (argparse.Namespace): arguments parsed from command line
    FT)r]   rF   Zoptimize_onnxry   r>   Zuse_decoder_start_tokenZmerge_encoder_and_decoder_init	overwriteZdisable_auto_mixed_precisionZuse_int32_inputs
model_typezonnx model for encoder: r   zonnx model for decoder: rU   N)export_t5_onnx_modelsrw   rz   r   outputparentr]   rF   ry   r	   rl   r   r   debugencoder_decoder_init_onnxrx   )rq   pathsr%   r%   r&   
t5_to_onnx  s(    


r   T)	onnx_pathrF   c                 C   sN   ddl m} tj| dd}|j|ddd}|r@tj|| |d n
td d	S )
zShape inference on an onnx file, which will be overwritten.

    Args:
        onnx_path (str): Path of onnx model
        use_external_data_format(bool): output tensors to external data or not.
    r   )SymbolicShapeInferenceTZload_external_dataF)Z
auto_mergeZguess_output_ranksave_as_external_dataz4Failed to run symbolic shape inference on the model.N)	Z&onnxruntime.tools.symbolic_shape_inferr   onnx
load_modelZinfer_shapesr   saver   warning)r   rF   r   modeloutr%   r%   r&   shape_inference(  s    r   )r   rF   r/   c                 C   s  t j| dd}|jjd j}t|}| }||ks8t|| }|jdkrNdS d}|	|j
d }|dkr||dd}	|	dkrdS |	|	j
d }|dkrdS d}|jtjjkrdS t|jd	krdS |jd }
|
d
 dkrdS t|
d
 d
 }||
 }|jr|rHtj|jd |ftjd}tjt||fdd}||jd< n<tj||jd ftjd}tjt||fdd}||jd< | |_ndS tj|| |d dS )zPad the logits MatMul weight in the provided decoder model, which will be overwritten.

    Args:
        onnx_path (str): Path of onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   MatMulFrU   NZ	Transpose      Zdtypeaxisr   )r   r   graphr   namer   output_name_to_noder   op_typeget_initializerinputmatch_parent	data_typer   ZDataTyperl   r}   dimsmathceilraw_datanpZzerosZfloat16concatenater   to_arraytobytesr   )r   rF   decoder_model_protologits_output_namedecoder_modelr   matmul_nodeZpad_along_axis_1Zlogits_weightZtranspose_before_matmulZactual_vocab_sizeZpadded_vocab_sizepaddingZpadding_dataZweight_with_paddingr%   r%   r&   pad_weights_of_logits_matmul:  sN    


r   )
model_pathr]   r[   r/   c                    sz   t  }tj|_|rddgndg}|rhdt kr8tdn
td |rhddi}d|i  fdd|D }t| ||d	}|S )
a  Create OnnxRuntime session.

    Args:
        model_path (str): onnx model path
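def _example_pad_vocab_dim_to_multiple_of_8():
    # Illustrative sketch added for documentation; it is not part of the original converter.
    # It shows the padding arithmetic used by pad_weights_of_logits_matmul above on a tiny
    # made-up fp16 weight. GPT-2's 50257 vocab size is only an example value here.
    actual_vocab_size = 50257
    padded_vocab_size = math.ceil(actual_vocab_size / 8) * 8  # 50264
    weight = np.ones((4, actual_vocab_size), dtype=np.float16)
    padding = np.zeros((4, padded_vocab_size - actual_vocab_size), dtype=np.float16)
    padded_weight = np.concatenate((weight, padding), axis=1)
    assert padded_weight.shape == (4, padded_vocab_size)
    return padded_weight.shape
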
def create_ort_session(model_path: str, use_gpu: bool, use_sln_strict_mode: bool) -> InferenceSession:
    """Create OnnxRuntime session.

    Args:
        model_path (str): onnx model path
        use_gpu (bool): use GPU or not
        use_sln_strict_mode (bool): use strict mode for skip layer normalization or not

    Raises:
        RuntimeError: CUDAExecutionProvider is not available when --use_gpu is specified.

    Returns:
        onnxruntime.InferenceSession: The created session.
    """
    sess_options = SessionOptions()
    sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
    execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] if use_gpu else ["CPUExecutionProvider"]
    if use_gpu:
        if "CUDAExecutionProvider" not in get_available_providers():
            raise RuntimeError("CUDAExecutionProvider is not available for --use_gpu!")
        logger.info("use CUDAExecutionProvider")

        if use_sln_strict_mode:
            cuda_provider_options = {"enable_skip_layer_norm_strict_mode": True}
            provider_options = {"CUDAExecutionProvider": cuda_provider_options}
            execution_providers = [
                (name, provider_options[name]) if name in provider_options else name for name in execution_providers
            ]

    ort_session = InferenceSession(model_path, sess_options, providers=execution_providers)
    return ort_session

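def _example_session_with_cuda_provider_options(model_path: str):
    # Illustrative sketch added for documentation; it is not part of the original converter.
    # It shows the provider-options form used by create_ort_session above: each provider can be
    # passed as a (name, options) tuple, and here the CUDA provider gets the strict
    # SkipLayerNormalization mode that --use_sln_strict_mode enables.
    sess_options = SessionOptions()
    providers = [
        ("CUDAExecutionProvider", {"enable_skip_layer_norm_strict_mode": True}),
        "CPUExecutionProvider",
    ]
    return InferenceSession(model_path, sess_options, providers=providers)
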
    ZCUDAExecutionProviderZCPUExecutionProviderz5CUDAExecutionProvider is not available for --use_gpu!zuse CUDAExecutionProviderZ"enable_skip_layer_norm_strict_modeTc                    s$   g | ]}| kr| | fn|qS r%   r%   ).0r   Zprovider_optionsr%   r&   
<listcomp>  s    z&create_ort_session.<locals>.<listcomp>)	providers)	r   r   ZORT_DISABLE_ALLZgraph_optimization_levelr   RuntimeErrorr   r   r   )r   r]   r[   Zsess_optionsZexecution_providersZcuda_provider_optionsort_sessionr%   r   r&   create_ort_session  s    



r   )r   ry   c              	   C   s  |t jk}t| j}|d }|dks(tdddgdd t|D  }t| jt|krttdt| d	t| j t|D ]\}}| j| j|krtd
| d| d	| j| j t	j
}|dkr|rt	jnt	j}| j| jjj}	|	|kr|td
| d| d	|	 q|td dgdd t|D  }
t| jt|
kr\tdt|
 d	t| j t|
D ]\}}| j| j|krtd| d| d	| j| j |rt	jnt	j}| j| jjj}||krdtd
| d| d	| qdtd dS )a  Verify GPT-2 subgraph

    Args:
        graph (onnx.GraphProto): onnx graph of GPT-2
        precision (Precision): Precision (FLOAT16 or FLOAT32) of the model.

    Raises:
        ValueError: Number of inputs not expected.
        ValueError: Input name is not expected.
        ValueError: Input data type is not expected.
        ValueError: Number of outputs not expected.
        ValueError: Output name is not expected.
        ValueError: Output data type is not expected.
       rU   	input_idsposition_idsattention_maskc                 S   s   g | ]}d | qS )Zpast_r%   r   ir%   r%   r&   r     s     z(verify_gpt2_subgraph.<locals>.<listcomp> Number of inputs expected to be . Got Input  is expected to be $ is expected to have onnx data type z:Verifying GPT-2 graph inputs: name and data type are good.logitsc                 S   s   g | ]}d | qS )Zpresent_r%   r   r%   r%   r&   r     s     !Number of outputs expected to be Output z;Verifying GPT-2 graph outputs: name and data type are good.N)r	   rl   r}   r   r   range
ValueError	enumerater   r   INT32FLOATr3   tensor_type	elem_typer   r   r   )r   ry   
is_float16input_countlayer_countexpected_inputsr   expected_inputexpected_type
input_typeexpected_outputsexpected_outputoutput_typer%   r%   r&   verify_gpt2_subgraph  s:    

"
"

r   c              	   C   sD  |t jk}|rtjntj}t| j}|d d }|dks<tddg}t|D ]$}|d|  |d|  qLt|D ]$}|d|  |d	|  qzt| jt|krt	d
t| dt| j t
|D ]\}}| j| j|krt	d| d| d| j| j |dk r$tjn|}	| j| jjj}
|
|	krt	d| d|	 d|
 qdg}t|D ]&}|d|  |d|  qjt| jt|krt	dt| dt| j t
|D ]r\}}| j| j|krt	d| d| d| j| j | j| jjj}||krt	d| d| d| qdS )  Verify T5 decoder subgraph

    Args:
        graph (onnx.GraphProto): onnx graph of T5 decoder
        precision (Precision): Precision (FLOAT16 or FLOAT32) of the model.

    Raises:
        ValueError: Number of inputs not expected.
        ValueError: Input name is not expected.
        ValueError: Input data type is not expected.
        ValueError: Number of outputs not expected.
        ValueError: Output name is not expected.
        ValueError: Output data type is not expected.
    r   rW   rU   r   encoder_attention_maskpast_key_self_past_value_self_Zpast_key_cross_Zpast_value_cross_r   r   r   r   r   r   present_key_self_present_value_self_r   r   N)r	   rl   r   r   r}   r   r   r   r|   r   r   r   r   r3   r   r   r   )r   ry   r   Z
float_typer   r   r   r   r   r   r   r   r   r   r%   r%   r&   verify_t5_decoder_subgraph  sB    

""
r   c              	   C   s  |t jk}t| jd d }|dks(tdddg}t| jt|krbtdt| dt| j t|D ]r\}}| j| j|krtd	| d
| d| j| j t	j
}| j| jjj}||krjtd	| d| d| qjddg}	t|D ]$}|	d|  |	d|  qt|D ]&}|	d|  |	d|  qt| jt|	krvtdt|	 dt| j t|	D ]\}}
| j| j|
krtd| d
|
 d| j| j |rt	jnt	j}| j| jjj}||kr~td| d| d| q~td dS )r   r   rW   rU   Zencoder_input_idsr   Zdecoder_input_idsr   r   r   r   r   r   Zencoder_hidden_statesr   r   Zpresent_key_cross_Zpresent_value_cross_r   r   zMT5 encoder graph verified: name and data type of inputs and outputs are good.N)r	   rl   r}   r   r   r   r   r   r   r   r   r3   r   r   r   r|   r   r   r   )r   ry   r   r   r   r   r   r   r   r   r   r   r%   r%   r&   'verify_t5_encoder_decoder_init_subgraph-  s<    

""
r   shared_   )graph1graph2shared_prefixmin_elementssignature_cache1signature_cache2c                 C   s  i }i }g }g }	g }
| j D ]}|jrt|j|ks4q|j D ]z}|jr:t|j|ksTq:t||||r:||j ||j< || |j|kr||j }|||j< |	| |
|  qq:qtd|
  | j	D ]:}t
t|jD ]&}|j| |
krtd|j|  qq|j	D ]@}t
t|jD ]*}|j| |
kr"td|j|  q"q|	D ]}|j | qV|jD ]}|j|krp||j |_qp|j	D ]n}t
t|jD ]X}|j| |kr||j|  }td|j d| d|j|  d|  ||j|< qq|D ]}| j | q
| jD ]}|j|kr$||j |_q$| j	D ]n}t
t|jD ]X}|j| |kr\||j|  }td|j d| d|j|  d|  ||j|< q\qJ|	D ]}||j |_q|	D ]@}tj|j}tj|j|j|}| j| |j| q|	S )	a  Remove initializers with same value from two graphs.

    Args:
        graph1 (GraphProto): the first graph to process
        graph2 (GraphProto): the second graph to process
        shared_prefix (str): add prefix to the shared initializers among two graphs
        min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.
        signature_cache1 (dict): Optional dictionary to store data signatures of tensors in graph1 in order to speed up comparison
        signature_cache2 (dict): Optional dictionary to store data signatures of tensors in graph2 in order to speed up comparison
    zshared initializers:zname is found in graph 1: zname is found in graph 2: zgraph 2 rename node z input z from z to zgraph 1 rename node )initializerr   sumr   Zhas_same_valuer   r|   r   r   noder   r}   r   r   remove
value_infor   numpy_helperr   shapehelpermake_tensor_value_infor   )r   r   r   r   r   r   Zmapping_initializers_1Zmapping_initializers_2Zshared_initializers_1Zshared_initializers_2Zshared_initializers_namesZinitializer1Zinitializer2Zshared_namer   jr   r   new_namer   r%   r%   r&   remove_shared_initializersq  sv    











*

*r   )encoder_modelr   c                 C   s`   t | }t |}|d |d i i  }}|| || t|jj|jjd||d}|S )NZe_Zd_Zs_)r   r   r   )r   Zadd_prefix_to_namesZremove_duplicated_initializerr   r   r   )r   r   encoderdecoderr   r   initializersr%   r%   r&   get_shared_initializers  s    




r   )r   r   r/   c                 C   s   g }| j D ]$}|jr
t|j|ks$q
|| q
|D ]}| j | q4|D ]2}tj|j}tj	
|j|j|}| j| qJ|S )a^  Remove initializers of a graph, when they have number of elements larger than a threshold.

    Args:
        graph (GraphProto): the graph.
        min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.

    Returns:
        List[TensorProto]: initializers that are removed from the graph.
    )r   r   r   r|   r   r   r   r   r   r   r   r   r   r   )r   r   Zmoved_initializerstensorr   r   r   r%   r%   r&   move_initializers  s    
r   c                 C   s   | j dkrtd| j d| j dkr.| j}n| j dkr@| j}n| j dkrR| j}n| j dkrd| j}n| j dkrv| j}nt| j d	kr| j}nb| j d
kr| j	}nP| j dkr| j
}n>| j dkr| j}n,| j dkr| j}ntd| j d| j  d| j|fS )z
    Convert attribute to kwarg format for use with onnx.helper.make_node.
        :parameter attribute: attribute in AttributeProto format.
        :return: attribute in {key: value} format.
    r   z
attribute z does not have type specified.rU   r   r   rW            r   	   
   z has unsupported type r;   )r3   r   r   fr   stgZfloatsZintsstringsZtensorsZgraphs)	attributer#   r%   r%   r&   _attribute_to_pair  s0    










r  c                 C   sD   i }| j D ]}t|\}}|||i q
| jr@|d| ji |S )Ndomain)r  r  updater  )r   kwargsattrkeyr#   r%   r%   r&   	kwargs_of1  s    
r  c                 C   s   t dd | jjjjD S )Nc                 S   s   g | ]}|j r|j n|jqS r%   )	dim_param	dim_value)r   dr%   r%   r&   r   <  s     zshape_of.<locals>.<listcomp>)tupler3   r   r   dim)vir%   r%   r&   shape_of;  s    r  )subgc              
   C   s  d}d}g }t | jD ]X\}}||krbt|}tjj|j|jjj	|d |d |d d|d gd}|
|g q|
tjjdtjjdgd	g | d
 | j
| g }t | jD ]Z\}}||krt|}tjj|j|jjj	|d |d |d d|d gd}|
|g q| d | j
| g }| jD ]}	|	jdkrt|	}
|
ddi g }|
|	j t|dk r|
dg qft|dk r|
dg tjjd||	jfd|	ji|
}	|
|	g q0| d | j
| | S )Nr   rU   r   r   max_seq_lenrW   r   r   past_sequence_lengthr   r   r   	AttentionrO   r  r!   r  r   r   )r   r   r  r   r   r   r   r3   r   r   r{   r   r   
ClearFieldr   r   r   r  r  r}   	make_node)r  Zinput_past_0Zoutput_past_0
new_inputsr   r  r   new_outputs	new_nodesr   r  nisr%   r%   r&   1update_decoder_subgraph_past_present_share_buffer?  sV     



 
r&  )r  is_beam_searchswitch_attentionr/   c                 C   s  |rg }t | jD ]\}}||g q|tjjdtjjdgdg |tjjdtjjdddgdg | d | j| |rdd	d
ddg}g }| j	D ]}|j
dkrt|}	|	 D ]<}
|
dkr  dS |
|kr|
dkrtd|
 d |	|
= qg }||j |rft|dk r2|dg qt|dk rL|dg t|dk rf|dg tjjd||jfd|ji|	}||g q| d | j	| dS )aS  Update the Attention nodes to DecoderMaskedSelfAttention.

    Args:
        subg (GraphProto): GraphProto of the decoder subgraph
        is_beam_search (bool): Boolean specifying if the sampling algo is BeamSearch
        switch_attention (bool): Boolean specifying if `Attention` is to be switched with `DecoderMaskedSelfAttention`
    
beam_widthrU   r  cache_indirection
batch_sizer  r   rO   	num_headsscalemask_filter_valuer  r  Zqkv_hidden_sizesFZunidirectionalzRemoving attribute: zB from Attention node while switching to DecoderMaskedSelfAttentionr  r!   r   r  ZDecoderMaskedSelfAttentionr   r   T)r   r   r{   r   r   r   r   r   r   r   r   r  copyr   r   r}   r!  r   r   )r  r'  r(  r"  Z_ir  'decoder_masked_attention_supported_attrr$  r   r  kr%  r%   r%   r&   4update_decoder_subgraph_use_decoder_masked_attentiono  sr    
   


  
r2  c                 C   s  t  }g }dd t| jD }i }i }| jD ]N}|jD ]*}|r6||krR|g||< q6|| | q6|jD ]}|rh|||< qhq,| jD ]}|jdkr|jd r|jd sq|jd |jd  }	}
d}| jD ]}|j|
kr|} qq|dkrqt	j
|}|jdkr| dkr|jd |kr||	 }|jdkr|jd r|jd |kr|jd d	sf|jd d
r||jd  || t||jd  dkr|| q||fS )az  Correct graph which originally use dim of past_seq_len from input_ids's shape which is fixed to max_seq_len after
       shared past/present buffer

    Args:
        subg (GraphProto): GraphProto of the decoder subgraph
    return:
        tensor_names_to_rename : set of tensor names which is equal to past_sequence_length
        nodes_to_remove : list of node to remove
    c                 S   s   i | ]\}}|j |qS r%   r   )r   indexinpr%   r%   r&   
<dictcomp>  s      z+find_past_seq_len_usage.<locals>.<dictcomp>GatherrU   r   Nr   Shaper   r   )setr   r   r   r|   r   r   r   r   r   r   r   sizeitem
startswithaddr}   )r  tensor_names_to_renamenodes_to_removegraph_input_namesZinput_name_to_nodesr   r   Z
input_nameZoutput_nameZshape_tensor_nameZshape_index_nameZini_gather_indicesr   Zgather_indices_arr
shape_noder%   r%   r&   find_past_seq_len_usage  sX    






$	
rB  rU   rZ   )r   	attn_maskkv_num_heads
world_sizewindow_sizec           1      C   sb  |  tjjdtjdgdgd tjjd|dg|d g| dd}tjjd|d dgdg| dd}tjjd	dgd
g| d	tjd}tjjd|g|d g| dd}tjjd|d dgdg| ddd}	tjjd	dgdg| d	tjd}
| j	j
j|||||	|
g ttdd | j	j
j}t|D ]*\}}| |dddgdddg}| |ddgddg}d\}}}|d k	r|\}}}n|d k	r|\}}| |dddgdddg}| |ddgddg}d\}}}|d k	r|\}}}n|d k	r|\}}| |ddgddg}| |dgdg}d\}}|d k	rB|\}}n|d k	rT|d }d}|d k	r|d k	r|jD ]}|jdkrr|j}qrd}|jD ]}|jdkr|j}q|jd |jd ko|jd |jd k}|d k	o|d k	o|d k	} |d ko|d ko|d k}!d\}"}#}$|r<| s.|!r<t| |jd }%t| |jd }&t| |jd }'|%jd }(tj|%|&|'fdd|(d |( })tjj|)d!| d"})|  |) tjjd|jd |)jg|)j d#g| dd}*| j	j
j|*g | j	j
j| | j	j
j| | j	j
j| |*jd }"| rZt| |jd }+t| |jd },t| |jd }-|+jd }(tj|+|,|-fddd |( }.tjj|.d$| d"}.|  |. tjjd|*jd |.jg|.j d#gd%}/| j	j
j|/g | j	j
j| | j	j
j| | j	j
j| |/jd }"n|jd }"|jd }#|jd }$tjjd&|"|#|$|jd' |jd( |jd |
jd |d k	r|jd nd)|d k	r|jd  nd)g	|j|j d*d&d+|| |dkr|| n|| |t!|d k	o|d k	|d,
}0| j	j
j| | j	j
j|0g |d k	r@| j	j
j| |d k	r0| j	j
j| q0| S )-NonerU   r   r   r   valsZ	ReduceSumZ	_row_sumsinputsoutputsr   SubZseqlens_k_int64CastZ	seqlens_k)rK  rL  r   tor8  _shaper7  Ztotal_seq_len_int64r   )rK  rL  r   r   Ztotal_seq_lenc                 S   s
   | j dkS )NMultiHeadAttention)r   )r   r%   r%   r&   <lambda>V      z&replace_mha_with_gqa.<locals>.<lambda>ZRotaryEmbeddingAddr   )NNNr   )NNinterleavedr,  )r!   r!   r!   rZ   r   r   ZQKV_Weight_r3  _outputZ	QKV_Bias_)rK  rL  ZGroupQueryAttentionr  r  r!   rQ  com.microsoft)	rK  rL  r   r  r,  rD  Zlocal_window_sizeZ	do_rotaryZrotary_interleaved)"add_initializerr   r   make_tensorr   INT64r!  create_node_namer   r   r   r   r{   listfilterr   match_parent_pathr  r   r   r   r   r   r   r   r   stackreshaper   Z
from_arrayr   r   replacerm   )1r   rC  rD  rE  rF  Zreduce_sum_nodeZsub_nodeZseqlen_k_cast_noderA  Zgather_nodeZtotal_seqlen_cast_nodeZ	mha_nodesidxr   Zq_path_1Zq_path_2Zq_rotaryZq_addq_matmulZk_path_1Zk_path_2Zk_rotaryZk_addk_matmulZv_path_1Zv_path_2Zv_addv_matmulrU  Zattr,  Zroot_input_is_sameZall_paths_have_biasZall_paths_have_no_biasZq_input_to_attentionZk_input_to_attentionZv_input_to_attentionqwkwvwr  
qkv_weightZpacked_matmul_nodeZqbkbZvbZqkv_biasZpacked_add_nodeZgqa_noder%   r%   r&   replace_mha_with_gqa  s4   















*

 








rk  c              	      s  d}dd j D }|dk r4|| ds4|d7 }qd}tj| d }d| |   fddt|D }td	|  tj   }td
|  |d }|d }|d }	d}
jD ]}|jdkr|j d |krtd|j	 d|j  |
d7 }
||j d  }d| }dgdt|j  }|
| |j| |jtjddg tj|tj||d|	g}j|g q|
|krtd| d|
 d S )NrU   c                 S   s   g | ]
}|j qS r%   r3  r   gir%   r%   r&   r     s     zBupdate_decoder_subgraph_output_cross_attention.<locals>.<listcomp>r   pastr   c                    s"   i | ]}j |d     j|qS )r   )r   r   )r   layerinput_cross_past_0r  r%   r&   r6    s      zBupdate_decoder_subgraph_output_cross_attention.<locals>.<dictcomp>z    --past_key_cross_inputs=zpast_key_cross_0_shape is r   DecoderMaskedMultiHeadAttentionz'    -- add cross QK output from: node: z with output: Zoutput_cross_qk_r!   Z	output_qkz#Did not add cross QK for all layersz vs )r   r<  r}   r   r   printr  r   r   r   r|   r{   r  r   r   make_attributer   r   r   r   )r  input_self_past_0r@  Zoutput_self_present_0
num_layersZpast_key_cross_inputsZinput_past_key_cross_0_shapeZbatch_size_dimZnum_heads_dimZcross_seq_len_dimZnum_layer_output_qkr   ro  Zcross_attention_out_nameZappended_namesZcross_attentionr%   rp  r&   .update_decoder_subgraph_output_cross_attention  sB    



  

rw  c              	   C   s&  d}dd | j D }|dk r4|| ds4|d7 }qd}tt| j | d }d| | }g }g }| jD ]}|jdkrh||g qht||k rd	S d }	| jD ]}|jd
kr|}	 qqdddddg}
d}t| \}}t|dkrf|D ]}td| d|  q|D ]}td|j d|j	  qt
jjddgdgdd}t
jjddg|gdtjd}|||g | jD ]}t|jdkr|	d k	r|jd |	j d krt
jjddgdgdtjd}|jd |j d< ||g |jdkr4t|}| D ]}||
kr||= q|j d |j d |j d g}|t|j dkrH|j d nd g |t|j d!krn|j d! nd g |t|j d"kr|j d" nd g |t|j d#kr|j d# nd g |dg |d$g |d%g |t|j dkr|j d nd g d|d< t
jjd&||jfd'|j	i|}||krlt|j D ]\}}||krH||j |< qH||g ql| d( | j| d)d | j D }g }t| j D ]`\}}||kr||k rt|}t
jj|j	|jjj|d |d d*|d gd+}||g qd|kr8|t
jjdt
jjdgd,g d$|krb|t
jjd$t
jjdgd,g d%|kr|t
jjd%t
jjd-d$d*gd,g | d. | j | g }t| jD ]V\}}||krt|}t
jj|j	|jjj|d |d d*|d gd+}||g q| d/ | j| d0S )1NrU   c                 S   s   g | ]
}|j qS r%   r3  rl  r%   r%   r&   r     s     zSupdate_decoder_subgraph_share_buffer_and_use_decoder_masked_mha.<locals>.<listcomp>r   rn  rW   r   rQ  FZRelativePositionBiasrO   r,  r-  r.  r  Z#past_sequence_length_squeezed_int64r   zFound tensor name z to be renamed to zFound node to removed: type:z, name:ZSqueezer  Zpast_sequence_length_squeezedZ!node_past_sequence_length_squeezer3  rN  Z&node_past_sequence_length_squeeze_cast)r   rO  Zpast_sequence_length_int64Zpast_sequence_length_castr!   r  r  r  r)  r*  rr  r   r   c                 S   s   g | ]
}|j qS r%   r3  )r   r5  r%   r%   r&   r   q  s     r  r  r  r+  r   r   T)r   r<  rm   r}   r   r   r{   rB  rs  r   r   r   r!  r   rZ  r   r  r/  r   r   r  r   r3   r   r   r   )r  ru  r@  Zoutput_self_past_0rv  rq  r$  Z	old_nodesr   Zrel_pos_bias_noder0  Ztarget_squeezed_past_seq_namer>  r?  Zname_to_renamenrZsqueeze_nodeZ	cast_noder  r1  r%  r4  r   Zorig_input_namesr"  r   r  r   r#  r%   r%   r&   ?update_decoder_subgraph_share_buffer_and_use_decoder_masked_mha  s   




0

&&&&&  




 
  


ry  )model_protoc                 C   s  t | }| }g }g }| D ]}|jdkr d|jd krNd|jd krNq ||jd  }||jd  }||jd  }||jd }	||jd }
||jd }|	r|
r|s dS t|	}t|
}t|}tj	|||gdd}|j
d	d
d}tjj|d |	jdkrtjntj|jd |jd g|  d}| jj|g tjjd	|jd |d g|d g|d}|jd |jd< d|jd< d|jd< ||g ||||g q || || |  |  dS )Nrr  Zpast_key_crossrU   Zpast_value_crossr   r   Fr   r   Z
MatMul_QKV)Zname_prefixZ_weightrH  Z_outrJ  r!   T)r   r   Znodesr   r   r   r   r   r   r   r[  r   r   rY  r   r   r   rl   r   flattentolistr   r   r{   r!  r   Z	add_nodesZremove_nodesZupdate_graphtopological_sort)rz  
onnx_modelr   Znodes_to_addr?  r   rc  rd  re  Zq_weightZk_weightZv_weightrf  rg  rh  ri  Zmatmul_node_nameweightr   r%   r%   r&   pack_qkv_for_decoder_masked_mha  sX    








r  )decoder_onnx_pathrF   c                 C   s   t j| dd}tt|jjD ]X}|jj| jdksF|jj| jdkr|jj| jjj	j
d }|drp|  d|_qtj|| |d dS )aQ  Update the input shapes for the inputs "input_ids" and "position_ids" and make the sequence length dim value 1 for each of them.
       The decoder model will be over-written.

    Args:
        decoder_onnx_path (str): Path of GPT-2 decoder onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   r   rU   r  r   )r   r   r   r}   r   r   r   r3   r   r   r  HasFieldClearr  r   r   )r  rF   r   r   Zshape_dim_protor%   r%   r&   *update_input_shapes_for_gpt2_decoder_model  s    	
r  )r  init_decoder_onnx_pathrF   r/   c                 C   sV  t j| dd}|jjd j}t|}| }||ks8t|| }|jdkrNdS |	|dddddddd	dddddgdddd
dddddddddg}|dkr|	|dddddd	ddddg
ddd
dddddddg
}|dkr2|	|ddddd	dddgddd
dddddg}|dkr2|	|ddd	ddgdd
dddg}|dkr@dS |d }	|	jdk}
|
sd}|	|	ddddg|dddg}|dkrd
}|	|	ddddg|dddg}|dkrd}|	|	dddg|ddg}|dkrd
}|	|	dddg|ddg}nd}|	|	dddg|ddg}|dkr>d
}|	|	dddg|ddg}|dkrbd}|	|	ddg|dg}|dkrd
}|	|	ddg|dg}|dkrdS |d
krdnd
}|
s|
|	d|}n|
|	d|}|dkrdS |d }|d }t jjdtjd
gdgd}t jjdtjd
gdgd}t jjdtjd
gd
gd}t jjdtjd
gdgd}|| || || || d|jd  }t jjd|jd ddddg|g|ddd}|
s|jd n|jd }d|jd  }t jjd|ddddg|g|ddd}|| || |||jd | ||	|| |  tj|||d dS )a  Generates the initial decoder GPT2 subgraph and saves it for downstream use.
       The initial decoder model will be saved to init_decoder_onnx_path.

    Args:
        decoder_onnx_path (str): Path of GPT-2 decoder onnx model
        init_decoder_onnx_path (str): Path of GPT-2 init decoder onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   r   FrN  LayerNormalizationrT  FastGelurU   NSkipLayerNormalizationrZ   r  ZSliceLastTokenStartsrH  ZSliceLastTokenEndsZSliceLastTokenAxesZSliceLastTokenStepsZedge_modified_SliceZGatherLastToken_0_rJ  r   ZGatherLastToken_1_r   )r   r   r   r   r   r   r   r   r   r^  r   r   rY  r   r   rX  r!  r[  add_nodeZreplace_node_inputr}  r   )r  r  rF   Zinit_decoder_model_protor   gpt2_init_decoder_modelr   Zlogits_matmul_nodeZ"logits_matmul_to_residual_add_pathZresidual_add_nodeZis_skiplayernorm_pathZ&residual_add_to_attention_parent_indexZresidual_add_to_attention_pathZ residual_add_to_add_parent_indexZadd_before_residual_addZ	attentionZmatmul_after_attentionZslice_startsZ
slice_endsZ
slice_axesZslice_stepsZslice_0_output_nameZslice_node_0Zadd_before_residual_add_outputZslice_1_output_nameZslice_node_1r%   r%   r&   generate_gpt2_init_decoder  s   




 
 




  
    
  
  
  
    








r  c           	      C   s   t d}t |j}t |j}t |j}| jjD ]J}|jjjj	D ]8}|
dr>|j||||fkr>t|j}|  ||_q>q.| jjD ]J}|jjjj	D ]8}|
dr|j||||fkrt|j}|  ||_qqdS )zoMake dim_proto numeric.

    Args:
        model: T5 encoder and decoder model.
        config: T5 config.
    rU   r  N)rf   r,  Zd_modelZd_kvr   r   r3   r   r   r  r  r  rm   r  r  r   )	r   configsequence_lengthr,  Zhidden_sizeZ	head_sizer   Z	dim_protor  r%   r%   r&   make_dim_proto_numeric_t5  s4    





r  )rq   generation_typec           ,      C   s  | j dk}|tjk}|tjk}|tjk}| j}td|  t| j	dkr| j	d dkr|r| j
tjkrdddd	g| _	td
| j	  td ng | _	|s|r|std| jrtd| jrtd|r|r| jstd| jr|std| jr| jstd|r| jr>tj| jr>td| j  nd| js~d| j| j
tjkr`dnd}tt| jj| | _td| j d| j d t|  nH| jr| jrtd| j d| j  ntd| j d t|  d}| j sP| j
tjkrP|rP|s|s|rPtd| j d t!| j| j"}|sPt#d  d}	d!}
| j$s|r|sx|sx|rtd"| j d# d$| j
tjkrdnd}tt| jj| }
t%| j|
| j"}	|	st#d% |	rt&| j| j"std&|s| j's|	rVtd'| j d t(| j| j" |	rVtd'|
 d t(|
| j" |rpt)j*| j| j+d(}n2| j d)krt,j*| j| j+d(}nt-j*| j| j+d(}| j.rtd*|  |j/}|r|j/n|j0}|j1}| j1d+kr| j1}| j/d+kr| j/}| j0d+kr| j0}t2j3| jd,d-}| j  d.|j4_5d!}| j dkrt6|j4| j
 |	rt2j3|
d,d-}| j  d/|j4_5t6|j4| j
 nt7|j4| j
 d!}|rd0d1d2d3d4d5d6g}n|s|rd0d1d2d6g}| j8r|9d7 n
|9d8 | j:r|9d9 n
|9d8 | j;r|9d: n
|9d8 |r\| j<r@| j=r@|9d; n
|9d8 | j>r\|9d< d=g}| jrt|9d> | jr| jst?d?|9d@ d!}|rt2j@jAdA||dB| j  dC}nF|rt2j@jAdD||dE| j  dC}n"|rt2j@jAdF||dG| j  dC}dH|_Bd!}|rtt2j@CdI|t2j@CdJ|t2j@CdK| jDt2j@CdL| jErNdndt2j@CdM| j dkrjdndg}n|rt2j@CdI|t2j@CdJ|t2j@CdM| j dkrdndt2j@CdK| jDg}n|rbt2j@CdI|t2j@CdJ|t2j@CdM| j dkrdndt2j@CdK| jDt2j@CdN| jFt2j@CdO| jGt2j@CdP| jHt2j@CdQ| jIt2j@CdR| j<t2j@CdS| jJg
}|r~|Kt2j@CdT|g |jLK| g }| j dUkr| j'rtdV| j d t(| j| j" t2j3| jd,d-}| j  dW|j4_5tM|j4| j
 tN|| tN|| |rh| jstdXtdY tO|j4r>tdZ n
td[ tP|r^td\ n
td] | jQstR||}tt| d^d_d` |D  da |jLKt2j@Cdb|j4t2j@Cdc|j4t2j@Cddt|j4jSdekr|jTnd+g n |		r~| jQ	s*tR||}tt| d^dfd` |D  dg |	rDtdh tU|j4 | j	rdtV|j4|d	sdtdi|jL9t2j@Cdj|j4 ntW|j4}tt| dk |	rtdl tU|j4 | j	rtV|j4|d,	stdm|jL9t2j@Cdc|j4 t2j@Xd0tYjZdndog}t2j@Xd1tYjZdg}t2j@Xd2tYjZdg}t2j@Xd3tYjZdg}t2j@Xd4tYjZdg}t2j@Xd5tYj[dg}t2j@Xd6tYj[dg}d!}|
r|||||||g}n|
s|
r||||g}| j8
rt2j@Xd7tYjZ|g} |9|  | j:r t2j@Xd9tYjZdn|g}!|9|! | j;r(t2j@Xd:tYjZdndog}"|9|" | j<rX| j=rXt2j@Xd;tYjZdn|g}#|9|# |r| j>rt2j@Xd<tYjZdg}$|9|$ d!}%|rt2j@Xd=tYjZdnd4d1g}%n"|s|rt2j@Xd=tYjZdnd1g}%|%g}&| jrt2j@Xd>tYj[dnd4g}'|&9|' | jr$t2j@Xd@tYj[dpdnd3|g}(|&9|( t2j@\|g|s@| j  dqn
| j  dr||&|})t2j@j]|)ds|j^dt}*| j"rddul_m`}+ |+at2jb|+advk rt#dw tcjd|*| jd,d,dx nt2d|*| j tdy| j  d!S )zzConvert model according to command line arguments.

    Args:
        args (argparse.Namespace): arguments parsed from command line
    r5   z**** past_present_share_buffer=rU   r   rC   rT  r  r  r  z**** Setting op_block_list to zI**** use --op_block_list if you want to override the block operator list.z<Currently only gpt2 with greedy search/sampling is supportedzLoutput_sequences_scores currently is not supported in greedy search/samplingzHoutput_token_scores currently is not supported in greedy search/samplingzi`use_decoder_masked_attention` MUST be turned on to use `past_present_share_buffer` in case of BeamSearchzS`past_present_share_buffer` MUST be turned on to use `use_decoder_masked_attention`z?`use_decoder_masked_attention` option is only supported on GPUsz)skip convert_to_onnx since path existed: z{}_past_{}.onnxrt   rs   zConvert GPT model z	 to onnx z ...z,skip convert_to_onnx since paths specified: z and zConvert model z to onnx ...Fz=Pad logits MatMul weights for optimal MatMul perf in fp16 on z. The file will be overwritten.z]Tried and failed to pad logits MatMul weights. Performance may be sub-optimal for this MatMulNz*Creating an initial run GPT2 decoder from z. zgpt2_init_past_{}.onnxzuTried and failed to generate the init decoder GPT2 model. Performance may be sub-optimal for the initial decoding runzGCould not update the input shapes for the non-initial decoder subgraph.z Run symbolic shape inference on rz   r6   zConfig=rZ   Tr   z decoderz init decoderr   
max_length
min_length	num_beamsnum_return_sequenceslength_penaltyrepetition_penaltyrN   r!   rQ   r   rS   rT   	sequencessequences_scoresz8--output_token_scores requires --output_sequences_scoresscoresZ
BeamSearchZBeamSearch_rJ  ZGreedySearchZGreedySearch_ZSamplingZ	Sampling_rW  eos_token_idpad_token_idno_repeat_ngram_sizerM   r   temperaturetop_pfilter_valuemin_tokens_to_keepcustompresence_penalty
vocab_sizer6   r7   zSymbolic shape inference on z encoder and decoder initzMpast_present_share_buffer is only supported with use_decoder_masked_attentionzl*****update t5 decoder subgraph to share past/present buffer and use decoder_masked_multihead_attention*****z4*****update t5 decoder subgraph successfully!!!*****zF*****DecoderMaskedMultiHeadAttention is not applied to T5 decoder*****z9*****pack qkv for decoder masked mha successfully!!!*****z3*****pack qkv for decoder masked mha failed!!!*****z shared initializers (c                 S   s   g | ]
}|j qS r%   r3  r   r%   r%   r&   r   c	  s     z,convert_generation_model.<locals>.<listcomp>z>) in encoder and decoder subgraphs are moved to the main graphr   r   decoder_start_token_idr   c                 S   s   g | ]
}|j qS r%   r3  r   r%   r%   r&   r   	  s     zC) in decoder and init decoder subgraphs are moved to the main graphzY*****update init decoder subgraph to make past and present share buffer******************zLCould not update the init decoder subgraph to use DecoderMaskedSelfAttentionZinit_decoderz: initializers from the decoder are moved to the main graphzT*****update decoder subgraph to make past and present share buffer******************zGCould not update the decoder subgraph to use DecoderMaskedSelfAttentionr+  r  zmax_length - sequence_lengthz beam searchz greedy searchzonnxruntime.transformers)Zproducer_nameZopset_imports)versionz1.12.0z0Require onnx >= 1.12 to save large (>2GB) model!)r   Zall_tensors_to_one_filezmodel save to )er   r"   r+   r,   r-   rO   r   r   r}   r~   ry   r	   rl   NotImplementedErrorrK   rL   rP   r   r]   rx   rh   ri   existsformatrw   r   r   r   as_posixr   r   r   rH   r   rF   r   rI   r  r  rG   r   r   from_pretrainedrz   r   r   r>   r  r  r  r   r   r   r   r   r   rN   r|   rQ   rR   r  rS   rT   r   r   r!  r  rt  r  rM   r  r  r  r  r  r{   r  r   r  ry  r  rJ   r   r   r  r&  r2  r   r   r   r   r   Z
make_graphZ
make_modelZopset_import	packagingr  parse__version__r   r   ),rq   r  Zis_gpt2Zis_beamsearchZis_greedysearchZis_samplingrO   Zonnx_filenameZlogits_matmul_weight_paddedZgpt2_init_decoder_generatedZgpt2_init_decoder_onnx_pathZgpt2_init_decoder_onnx_filenamer  r  r  r  r   r  rK  rL  r   Zattr_to_extendr   r   r   r  r  r  r  r  r  Zgraph_inputsrN   rQ   r   rS   rT   r  Zgraph_outputsr  r  Z	new_graphZ	new_modelr  r%   r%   r&   convert_generation_model&  s   




 

   	



















  



  	
  
  
  

  


	
r  )rq   r   r   r   r  r  bad_words_idsr/   c                 C   s   | j rtj std| jtjkr,|  t	| j r:dnd}|
| td |
|}|
|}g }t| jD ]f}	t }
|j||| j| j| j| j| j||| j| j| j|r|ndd| jp| jd}	|t |
  qv|jd }dd	lm} |||S )
a  Test PyTorch performance of text generation.

    Args:
        args (argparse.Namespace): arguments parsed from command line
        model (Union[GPT2LMHeadModel, T5ForConditionalGeneration]): PyTorch model
        input_ids (torch.Tensor): input_ids
        attention_mask (torch.Tensor): Attention mask
        eos_token_id (int): EOS token ID
        pad_token_id (int): Padding token ID
        bad_words_ids (List[List[int]]): Words shall not be generated.

    Raises:
        RuntimeError: PyTorch with CUDA is not available for --use_gpu

    Returns:
        Dict[str, Any]: A dictionary with string with metric name, and value can be integer or string.
    z=Please install PyTorch with Cuda for testing gpu performance.zcuda:0cpuFNTr   r   r  r  r  rM   r  r  r  r  r  r  r  Zreturn_dict_in_generateZoutput_scoresr   get_latency_result)r]   torchcudaZis_availabler   ry   r	   rl   ZhalfdevicerO  Zset_grad_enabledr   
total_runstimegenerater  r  r  rM   r  r  r  r  rK   rL   r|   r   benchmark_helperr  )rq   r   r   r   r  r  r  r  Ztorch_latency_startr+  r  r%   r%   r&   test_torch_performance
  sB    






r  c                 C   sp   t j| jt jd}t| jd D ]J}d}t| jd D ]2}| | | |kr`|dkr`d|| |< q6|d7 }q6q |S )Nr   r   rU   )r   onesr   int32r   )r   r  r   r   Zabs_posr   r%   r%   r&   create_attention_mask\
  s    r  F)rq   	sentences	is_greedyc           +      C   s  | j dksttj| j| jd}d|_|j|_t	j| j| j|j
d}|dkrVdddg}||d	d
d}|d }|d }d}|j|d
d}	dd |	D }	| jrtd|	 ng }	|j}
|
j
}|
j
}|
j}g }d}| jstd td |j||| j| j| j| j| j||| j| j| j|	r|	ndd
| jp*| jd}td| td td|j | jrbtd|j | jrvtd|j  t!|jD ]4\}}|j"|d
d}|#| t| d|  qtd td |r|$ % &t'j(t'j)| jgt'j(dt'j)| jgt'j(dt'j)| jgt'j*dd}n|$ % &t'j(t'j)| jgt'j(dt'j)| jgt'j(dt'j)| jgt'j(dt'j)| jgt'j(dt'j)| jgt'j*dt'j)| jgt'j*dd}| jrt'j+|t'j(d}| jr|	D ]}d ||< q||d!< | j,rt-|||d< |j.d  }| j/r*t0d" t'j+||ft'j(d}||d#< | j1rt2| j3j45 }td$| d d%l6m7} t0d&| d' |g}t!|D ]*\}}t8j9:|d(t;| }||| qztd)| | j<rdS td* t=| j3| j>| j?}td+ |@d|}g }tA| jBD ],}tCC }|@d|}|#tCC |  qd d,lDmE}  |j.d  }| ||}!td- |d  }"td|" | jr|td|d.  | jrtd|d/  |r|"j.\}}#g }$tA|D ]6}|j"|"| d
d}|$#| td0| d1|  qnj|"j.\}}%}#g }$tA|D ]P}tA|%D ]@}&|j"|"| |& d
d}|$#| td0| d2|& d|  qq |r|jF|| jd3}'tGH|"}(td td4 t|' t| td td5 t|( t|$ td ||$k})td6|)rd7nd8 |)|!d9< | jIrtJ| ||||||	}*td:|* td;|! |!S )<a9  Test GPT-2 model

    Args:
        args (argparse.Namespace): arguments parsed from command line
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r5   r  left)rz   r  NzThe product is releasedzI enjoy walking in the parkzTest best way to investptTZreturn_tensorsr   r   r   walk in park)Zadd_prefix_spacec                 S   s   g | ]
}|gqS r%   r%   r   Zword_idr%   r%   r&   r   
  s     z"test_gpt_model.<locals>.<listcomp>r  2--------------------------------------------------CTest PyTorch model and beam search with huggingface transformers...r  !huggingface transformers outputs:r  r  r  Zskip_special_tokens: 'Testing beam search with onnxruntime...r   )r   r  r  r  r   r  r  r  r  r  r  r   rN   zYUse prefix vocab mask with all ones in ORT, but no corresponding setting for Torch model.rQ   test_data_diroutput_test_datazSaving test_data to z/test_data_set_* ...test_data_set_
ORT inputszCreating ort session......zRun ort session......r  ORT outputs:rU   r   batch z sequence: 
 sequence rZ   Torch Sequences:ORT Sequences:Torch and ORT result is same	differentparityTorch LatencyORT)Kr   r   r   r  rw   rz   padding_sideZ	eos_tokenZ	pad_tokenr   r  encoderN   r   r   r  r  r^   rs  r  r  r  r  rM   r  r  r  r  rK   rL   r  r  r  r   decoder|   r  numpyastyper   r  arrayfloat32r  rR   r  r   rQ   r   ra   r   r   r   r  bert_test_datar  rh   ri   rg   rf   r_   r   r]   r[   runr   r  r  r  r  r`  r  
LongTensorr`   r  )+rq   r  r  	tokenizerr   rK  r   r   	bad_wordsr  r  r  r  r  torch_decoded_sequencesbeam_outputsr   sequencedecoded_sequencerN   bad_word_idr+  rQ   r  r  
all_inputsdirr   resultlatencyr  r  r  r   r  r  ort_decoded_sequencesnum_sequencesr   torch_sequencesort_sequencesis_sametorch_latency_outputr%   r%   r&   test_gpt_modelh
  sD   













"
	

r  )rq   r  c           )      C   s4  | j dkst| jr"td dS tj| j| jd}d|_	| j dkrXt
j| j| jd}ntj| j| jd}|dkrzddg}||d	d
d}|d }|d }d}||dd }dd |D }| jrtd| ng }|j}	|	j}
|	j}|	j}td|
 d| d|  g }| jstd td |j||| j| j| j| j| j|
|| j| j| j|rZ|ndd
| jpj| jd}td| td td|j | jrtd|j  | jrtd|j! t"|jD ]4\}}|j#|d
d}|$| t| d|  qtd td t%j&|t%j'd }| jr2|D ]}d!||< q"|( ) *t%j't%j+| jgt%j'd t%j+| jgt%j'd t%j+| jgt%j'd t%j+| jgt%j'd t%j+| jgt%j,d t%j+| jgt%j,d d"}| jr||d#< | j-rt.|||d< | j/rFt0| j1j23 }td$| d!d%l4m5} |g}t"|D ]*\}}t6j78|d&t9| }||| qtd'| t:| j1| j;| j<}g }t=| j>D ],}t?? }|@d|}|$t?? |  qr|jAd! }d!d(lBmC} |||}td) |d! } td|  | jrtd|d*  | jrtd|d+  | jA\}}!}"g }#t=|D ]P}t=|!D ]@}$|j#| | |$ d
d}|#$| td,| d-|$ d|  q*q| js |jD|| jd}%tEF| }&td td. t|% t| td td/ t|& t|# td ||#k}'td0|'rd1nd2 |'|d3< | jGr&tH| ||||
||}(td4|( td5| |S )6a=  Test T5 or MT5 model

    Args:
        args (argparse.Namespace): arguments parsed from command line
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r  zLSkipping parity test as prefix vocab mask is not implemented by Hugging FaceNr  r  r6   z4translate English to French: The product is releasedzsummarize: research continues to show that pets bring real health benefits to their owners. Having a dog around can lead to lower levels of stress for both adults and kids.r  Tr  r   r   r  rZ   c                 S   s   g | ]
}|gqS r%   r%   r  r%   r%   r&   r   h  s     z!test_t5_model.<locals>.<listcomp>r  zeos_token_id:z, pad_token_id:z, vocab_size:r  r  r  r  r  r  r  r  r  r  r   r   r  rN   r  r  r  r  r  r  rU   r   r  r  r  r  r  r  r  r  r  r  )Ir   r   rQ   r   r   r   r  rw   rz   r  r   r   r  rN   r  r  r  r  r^   rs  r  r  r  r  rM   r  r  r  r  rK   rL   r  r  r  r   r  r|   r   r  r  r  r  r  r  r  rR   r  ra   r   r   r   r  r  r  rh   ri   rg   rf   r   r]   r[   r   r  r  r  r   r  r  r`  r  r  r`   r  ))rq   r  r  r   rK  r   r   r  r  r  r  r  r  r  r  r   r  r  rN   r  r  r  r  r  r   r  r  r  r  r+  r  r   r  r  r  r  r   r  r  r  r  r%   r%   r&   test_t5_model;  s   









"
	

r  )r.   r  c                 C   sv  t | }t|j |jdkr|jr@tj|js@td|j |j	rdtj|j	sdtd|j	 |jrp|j	r||j	r|jstd|j
dko|jdk}|jdkr|r|jdkr|jdk rt|tj td	 |jd
ks|js|jrdS nt|tj nt| td |jdkr&t||d}nt|||d}|rr|jr`td|j d|j d ntd|j  |S )a/  Main entry function

    Args:
        argv (Optional[List[str]], optional): _description_. Defaults to None.
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Raises:
        ValueError: Path does not exist: --encoder_decoder_init_onnx
        ValueError: Path does not exist: --decoder_onnx
        ValueError: --decoder_onnx and --encoder_decoder_init_onnx are not used together for T5

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r  z1Path does not exist: --encoder_decoder_init_onnx z$Path does not exist: --decoder_onnx zB--decoder_onnx shall use together with --encoder_decoder_init_onnxrU   r5   rY   rX   zThe test for gpt2_sampling onnx model is limited to non-custom model with small top_p(e.g <=0.01) value. The result should be the same as gpt2 greedy search.g{Gz?Nzstart testing model...)r  )r  r  zOutput files: r1   z.datazOutput file: )rr   r
   r>   r   r   rh   ri   r  r   rx   r  r  r  r  r"   r-   r   r   r  rT   r,   r  r  rF   r   )r.   r  rq   r  r  r%   r%   r&   r     sB    


r   __main__)N)T)T)r   r   NN)r   )r   rU   rZ   )T)T)NF)N)NN)`__doc__rb   loggingr   rh   r  enumr   pathlibr   typingr   r   r   r   r   r  r   r   r  r  r	   r
   Zfusion_utilsr   r   r   r   r~  r   Ztransformersr   r   r   r   r   r   r   r   Zonnxruntimer   r   r   r   Z4onnxruntime.transformers.models.gpt2.convert_to_onnxr   r   Z0onnxruntime.transformers.models.gpt2.gpt2_helperr   Z2onnxruntime.transformers.models.t5.convert_to_onnxr   r   Z,onnxruntime.transformers.models.t5.t5_helperr   r    	getLoggerr   r"   rf   	Namespacerr   r   r   boolr   r   r   r   r   r   rm   dictr   r   r   r  r  r  r&  r2  rB  rk  rw  ry  r  r  r  r  r+   r  ZTensorr  r  r  r  r(   r%   r%   r%   r&   <module>   s   '(
	   -N!8LG    j %
1  P=          d' ;     %   x


B T >";

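
def _example_programmatic_conversion():
    # Illustrative sketch added for documentation; it is not part of the original converter.
    # It shows main() being invoked from Python instead of the command line; the argument
    # values, output path, and test sentence are arbitrary examples, not required defaults.
    return main(
        argv=["-m", "gpt2", "--output", "gpt2_beam_search.onnx", "--num_beams", "4"],
        sentences=["The product is released"],
    )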