U
    T?hL                     @   sX  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 dddddZdd	d
ddZdd Zd@ddZeeedddZeeeedddZeeeedddZeedddZeedddZeeeeeeeeeeedd d!ZdAeeedd#d$Zeedd%d&Zeeeeeeeeeed'
d(d)ZdBeeeeeeeeeeeed+d,d-ZdCeeeeeeeeeeeeed.d/d0ZdDeeeeeeeeeeed1d2d3ZdEeeeeeeeeeeed1d4d5Zeeeeeeeeeed6
d7d8Zd9d: Z dFd;d<Z!d=d> Z"e#d?krTd dl$Z$z
e"  W n$ e%k
rR   e$j&e'   Y nX dS )G    Nmeasure_memoryzrunwayml/stable-diffusion-v1-5zstabilityai/stable-diffusion-2z stabilityai/stable-diffusion-2-1z+stabilityai/stable-diffusion-xl-refiner-1.0)1.5z2.02.1zxl-1.0CUDAExecutionProviderROCMExecutionProviderZMIGraphXExecutionProviderZTensorrtExecutionProvider)cudarocmZmigraphxtensorrtc               
   C   s$   ddddddddd	d
g
} d}| |fS )Nz.a photo of an astronaut riding a horse on marsz@cute grey cat with blue eyes, wearing a bowtie, acrylic paintingzia cute magical flying dog, fantasy art drawn by disney concept artists, highly detailed, digital paintingzdan illustration of a house with large barn with many cute flower pots and beautiful blue sky sceneryzgone apple sitting on a table, still life, reflective, full color photograph, centered, close-up productzWbackground texture of stones, masterpiece, artistic, stunning photo, award winner photozSnew international organic style house, tropical surroundings, architecture, 8k, hdrznbeautiful Renaissance Revival Estate, Hobbit-House, detailed painting, warm colors, 8k, trending on Artstationzcblue owl, big green eyes, portrait, intricate metal design, unreal engine, octane render, realisticzldelicate elvish moonstone necklace on a velvet background, symmetrical intricate motifs, leaves, flowers, 8kz*bad composition, ugly, abnormal, malformed )promptsnegative_promptr   r   l/var/www/html/venv/lib/python3.8/site-packages/onnxruntime/transformers/models/stable_diffusion/benchmark.pyexample_prompts#   s    r   c                 C   s   t d|| |dS )NT)Zis_gpufuncmonitor_typestart_memoryr   )r   r   r   r   r   r   measure_gpu_memory6   s    r   )
model_name	directorydisable_safety_checkerc           	      C   s   ddl m}m} dd l}|d k	rJtj|s0t| }|j	|||d}n|j	| d|dd}|
|jj|_|jdd |rd |_d |_|S )Nr   )DDIMSchedulerOnnxStableDiffusionPipeline)providerZsess_optionsZonnxT)revisionr   Zuse_auth_tokendisable)	diffusersr   r   onnxruntimeospathexistsAssertionErrorZSessionOptionsfrom_pretrainedfrom_config	schedulerconfigset_progress_bar_configsafety_checkerfeature_extractor)	r   r   r   r   r   r   r   session_optionspiper   r   r   get_ort_pipeline:   s,    r,   )r   r   enable_torch_compileuse_xformersc           	      C   s   ddl m}m} ddlm}m} |j| |dd}|jj|d |rN|	  |rt
|j|_t
|j|_t
|j|_td ||jj|_|jdd	 |rd |_d |_|S )
Nr   )r   StableDiffusionPipeline)channels_lastfloat16)Ztorch_dtyper   )Zmemory_formatz)Torch compiled unet, vae and text_encoderTr   )r   r   r/   torchr0   r1   r#   toZunetZ*enable_xformers_memory_efficient_attentioncompileZvaeZtext_encoderprintr$   r%   r&   r'   r(   r)   )	r   r   r-   r.   r   r/   r0   r1   r+   r   r   r   get_torch_pipelineX   s"    r6   )enginer   
batch_sizer   c                 C   s6   | dd dd}|  d| d| |r0dnd S )	N/zstable-diffusion-sd__b Z_safe)splitreplace)r7   r   r8   r   Zshort_model_namer   r   r   get_image_filename_prefixs   s    rA   )r8   image_filename_prefixc
                    sN  ddl m}
 t|
stt \}} fdd}t|	||}t|	||}|  g }t|D ]\}}||krx qt|D ]}t }|g  |g  ddj	}t }|| }|
| td|dd	 t|D ]*\}}|| d
| d
| d
| d qqqbddlm} d| ||t|t| t|||dS )Nr   )r   c                      s   d d d S Nwarm up)num_inference_stepsnum_images_per_promptr   r   r8   heightr+   stepswidthr   r   warmup   s    z run_ort_pipeline.<locals>.warmup      @)rE   r   guidance_scaleInference took .3f secondsr<   .jpg__version__r   r7   versionrH   rJ   rI   r8   batch_countnum_promptsaverage_latencymedian_latencyfirst_run_memory_MBsecond_run_memory_MB)r   r   
isinstancer"   r   r   	enumeraterangetimeimagesappendr5   saver   rS   sumlen
statisticsmedian)r+   r8   rB   rH   rJ   rI   rW   rV   r   memory_monitor_typer   r   r   rK   first_run_memorysecond_run_memorylatency_listipromptjinference_startr`   inference_endlatencykimageort_versionr   rG   r   run_ort_pipelinex   sR    

(rt   c
                    sJ  t  \}
} fdd}t|	||}t|	||}|  td g }t|
D ]\}}||krh qtj  t|D ]}t }|g  d|g  d dj	}tj  t }|| }|
| td|dd t|D ]*\}}|| d	| d	| d	| d
 qqzqRdtj ||t|t| t|||dS )Nc                      s   d d d S rC   r   r   rG   r   r   rK      s    z"run_torch_pipeline.<locals>.warmupFrL   )rl   rH   rJ   rE   rM   r   	generatorrN   rO   rP   r<   rQ   r2   rT   )r   r   r2   set_grad_enabledr]   r   Zsynchronizer^   r_   r`   ra   r5   rb   rS   rc   rd   re   rf   )r+   r8   rB   rH   rJ   rI   rW   rV   r   rg   r   r   rK   rh   ri   rj   rk   rl   rm   rn   r`   ro   rp   rq   rr   r   rG   r   run_torch_pipeline   sT    





(rw   )r   r   r   r8   r   rH   rJ   rI   rW   rV   tuningc                 C   s   |}|r|dkr|dddf}t   }t| |||}t   }td||  d td| ||}t||||||||	|
|
}|| ||dd|d	d
 |S )N)r   r      )Ztunable_op_enableZtunable_op_tuning_enableModel loading took rP   ZortExecutionProviderr>   Fr   r   r   r   enable_cuda_graph)r_   r,   r5   rA   rt   updater@   )r   r   r   r8   r   rH   rJ   rI   rW   rV   r   rg   rx   Zprovider_and_options
load_startr+   load_endrB   resultr   r   r   run_ort   s:    
	r   Tc                 C   s   ddl m}m} |d k	rRtj|rRd| kr@|j||d dd}q|j||dd}nDd| krz|j| d|d dd}|| n|j| d|dd	}|| |rd |_d |_	|S )
Nr   ORTStableDiffusionPipelineORTStableDiffusionXLPipelinexlF)r   r*   use_io_binding)r   r   T)exportr   r*   r   )r   r   r   )
optimum.onnxruntimer   r   r   r    r!   r#   Zsave_pretrainedr(   r)   )r   r   r   r   r   r   pipeliner   r   r   get_optimum_ort_pipeline0  sD    
r   c
                    sH  ddl m}
m} t|
|fs"tt } fdd}t|	||}t|	||}|  g }t|D ]\}}||kr| qt|D ]}t		 }|d d dj
}t		 }|| }|| td|dd	 t|D ]*\}}|| d
| d
| d
| d qqqfddlm} d| ||t|t| t|||dS )Nr   r   c                      s   d d d S rC   r   r   rG   r   r   rK   q  s    z(run_optimum_ort_pipeline.<locals>.warmupg        )rE   r   rM   rF   rN   rO   rP   r<   rQ   rR   Zoptimum_ortrT   )r   r   r   r\   r"   r   r   r]   r^   r_   r`   ra   r5   rb   r   rS   rc   rd   re   rf   )r+   r8   rB   rH   rJ   rI   rW   rV   r   rg   r   r   r   rK   rh   ri   rj   rk   rl   rm   rn   r`   ro   rp   rq   rr   rs   r   rG   r   run_optimum_ort_pipeline_  sT    	
(r   )
r   r   r   r8   r   rH   rJ   rI   rW   rV   c                 C   s|   t   }t| |||}t   }td||  d td| ||}t||||||||	|
|
}|| ||dd|dd |S )Nrz   rP   optimumr{   r>   Fr|   )r_   r   r5   rA   r   r~   r@   )r   r   r   r8   r   rH   rJ   rI   rW   rV   r   rg   r   r+   r   rB   r   r   r   r   run_optimum_ort  s4    
	r   F)work_dirrU   r8   r   rH   rJ   rI   rW   rV   max_batch_sizenvtx_profileuse_cuda_graphc           .         sH  t d ddlm} |   |ks&tddlm} ||}| }ddlm}m	} ddl
m} |j}|| ||\}}}}}||d|d|||||d		jj|||d
 dddtj d    fdd}t|
||	}t|
||	}|  td| |}g }t \} }!t| D ]\}"}#|"|kr8 qt|D ]}$t }%j|#g  |!g  ddd\}&}'t }(|(|% })||) t d|)dd|'  t|&D ],\}*}+|+| d|" d|$ d|* d qq@q   ddlm}, ddlm}- |  d|-d|, d| ||t!|t"| t#$|||||dS )Nzd[I] Initializing ORT TensorRT EP accelerated StableDiffusionXL txt2img pipeline (static input shape)r   init_trt_pluginsPipelineInfo
EngineTypeget_engine_pathsr/   DDIMFr%   
output_dirverboser   r   r   framework_model_direngine_type   T)opt_image_heightopt_image_widthopt_batch_sizestatic_batchZstatic_image_shapeZmax_workspace_sizeZ	device_idc                      s&   j dg  dg  dd d S NrD   negativeT)denoising_stepsrK   runr   r8   rH   r   rI   rJ   r   r   rK     s         z"run_ort_trt_static.<locals>.warmuport_trtrL   {   r   ZguidanceseedEnd2End took rO    seconds. Inference latency: r<   rQ   rR   r   z	tensorrt())r   r7   rU   r   r   rH   rJ   rI   r8   rV   rW   rX   rY   rZ   r[   r   r}   )%r5   trt_utilitiesr   r"   diffusion_modelsr   
short_nameengine_builderr   r   pipeline_stable_diffusionr/   ORT_TRTbackendZbuild_enginesr2   r   Zcurrent_deviceload_resourcesr   rA   r   r]   r^   r_   r   ra   rb   teardownr
   rS   r   namerc   rd   re   rf   ).r   rU   r8   r   rH   rJ   rI   rW   rV   r   rg   r   r   r   r   r   pipeline_infor   r   r   r/   r   onnx_dir
engine_dirr   r   r<   rK   rh   ri   rB   rj   r   r   rk   rl   rm   rn   r`   pipeline_timero   rp   rq   rr   trt_versionrs   r   r   r   run_ort_trt_static  s    


	
.
r   )r   rU   r   r8   r   rH   rJ   rI   rW   rV   r   r   r   c           1         sT  t d ddlm} ddlm} |   |ks2tddlm} ||}ddlm	}m
} ddlm} |j}|| ||\}}}}}||d|d	||d
|djj|||d d
d
d	|d tj j }||\}}j|    fdd}t|||
} t|||
}!|  td| |}"g }#t \}$}%t|$D ]\}&}'|&|krd qt|	D ]}(t })j|'g  |%g  ddd\}*}+t },|,|) }-|#|- t d|-dd|+  t|*D ],\}.}/|/|" d|& d|( d|. d qؐqlqL  dd l}0d|0j d |	|t!|#t"|# t#$|#| |!|dS )N][I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)r   cudartr   r   r   r   r   FT)r%   r   r   r   r   r   r   r   r   r   r   Z
onnx_opsetr   r   r   r   Zstatic_shapeZenable_all_tacticstiming_cachec                      s&   j dg  dg  dd d S r   r   r   r   r   r   rK     s         z#run_tensorrt_static.<locals>.warmuptrtrL   r   r   r   rO   r   r<   rQ   r
   default)r7   rU   r   rH   rJ   rI   r8   rV   rW   rX   rY   rZ   r[   r}   )%r5   r   r   r   r   r"   r   r   r   r   r   r   r/   TRTr   load_enginesmaxmax_device_memory
cudaMallocactivate_enginesr   r   rA   r   r]   r^   r_   r   ra   rb   r   r
   rS   rc   rd   re   rf   )1r   rU   r   r8   r   rH   rJ   rI   rW   rV   r   rg   r   r   r   r   r   r   r   r   r   r/   r   r   r   r   r   r   r   r<   shared_device_memoryrK   rh   ri   rB   rj   r   r   rk   rl   rm   rn   r`   r   ro   rp   rq   rr   r   r   r   r   run_tensorrt_staticU  s      


	
.r   )r   rU   r8   r   rH   rJ   rI   rW   rV   r   r   c           *         s  t d dd l}ddlm} ddlm} d dksHd dkr^td d d|  ksptdd	lm	} dd
l
m m  f	dd}ddlm} ||}|||tj j }||\}}j|  d
fdd			fdd}t|
||	}t|
||	}|  | }td||}g }t \}}t|D ]\} }!| |kr qDt|D ]}"t }#r|  	|!g |g dd\}$}%r|  t }&|&|# }'||' t d|'dd|%  t|$D ],\}(})|)| d|  d|" d|( d qqqp   |d|j!d
||t"|t#| t$%|||dS )Nr   r   r   r      zCImage height and width have to be divisible by 8 but specified as: z and .r   r   c           	         s\    j }||\}}}}}| |d|d||d	}|jj|||dddd|d |S )Nr   Fr   r   Tr   )r   r   r   )	Zpipeline_classr   r   r   r   r   r   r   r   )	r   r8   r   rH   r   r   r   rJ   r   r   r   init_pipeline  s>      z-run_tensorrt_static_xl.<locals>.init_pipeliner   c              	      s   j | | d|dS Ng      @r   r   rl   r   r   )image_heightimage_widthr   rI   r   r   run_sd_xl_inference3  s    z3run_tensorrt_static_xl.<locals>.run_sd_xl_inferencec                      s   dg  dg   d S NrD   r   r   r   r8   r   r   r   rK   >  s    z&run_tensorrt_static_xl.<locals>.warmupr   r   r   r   rO   r   r<   .pngr
   r   r   r7   rU   r   rH   rJ   rI   r8   rV   rW   rX   rY   rZ   r[   r}   )N)&r5   r
   r   r   r   r   
ValueErrorr"   r   r   r   r   r   r   r/   r   r   r   r   r   r   r   r   rA   r   r]   r^   r_   cudaProfilerStartcudaProfilerStopra   rb   r   rS   rc   rd   re   rf   )*r   rU   r8   r   rH   rJ   rI   rW   rV   r   rg   r   r   r   r   r   r   r   r   r/   r   r   r<   r   rK   rh   ri   r   rB   rj   r   r   rk   rl   rm   rn   r`   r   ro   rp   rq   rr   r   )r   r8   r   rH   r   r   r   r   r   r   rI   r   rJ   r   r   run_tensorrt_static_xl  s    #



.r   c           &         s  ddl m} ddlm} |||j| || dddlm}  |ksJt  dfdd	 fdd	}t	|
||	}t	|
||	}|  j
 }td
| |}g }t \}}t|D ]\}}||kr qt|D ]}t }|r|  |g  |g  dd\}}|r(|  t }|| } ||  td| dd|  t|D ]:\}!}"| d| d| d|! d}#|"|# td|# q`qqƈ  ddlm}$ ddlm}% |d|%d|$ d ||t|t| t||||dS )Nr   )initialize_pipeline)r   )rU   r   r   rH   rJ   r   r   r   r   c              	      s   j | | d|dS r   r   r   )rH   r   rI   rJ   r   r   r     s    z+run_ort_trt_xl.<locals>.run_sd_xl_inferencec                      s   dg  dg   d S r   r   r   r   r   r   rK     s    zrun_ort_trt_xl.<locals>.warmupr   r   r   r   rO   r   r<   r   zImage saved torR   r   r
   r   r   )N)Z
demo_utilsr   r   r   r   r   r   r"   r   r   r   r   rA   r   r]   r^   r_   r   r   ra   r5   rb   r   r
   rS   r   rc   rd   re   rf   )&r   rU   r8   r   rH   rJ   rI   rW   rV   r   rg   r   r   r   r   r   r   rK   rh   ri   r   rB   rj   r   r   rk   rl   rm   rn   r`   r   ro   rp   rq   rr   filenamer   rs   r   )r8   rH   r   r   rI   rJ   r   run_ort_trt_xlt  sz    




r   )
r   r8   r   r-   r.   rH   rJ   rI   rW   rV   c                 C   s   dt jj_dt jj_t d t }t| |||}t }td||  d t	d| ||}|st 
   t||||||||	|
|
}W 5 Q R X nt||||||||	|
|
}|| d |rdn
|rdnd|dd	 |S )
NTFrz   rP   r2   r4   Zxformersr   r|   )r2   backendsZcudnnZenabledZ	benchmarkrv   r_   r6   r5   rA   Zinference_moderw   r~   )r   r8   r   r-   r.   rH   rJ   rI   rW   rV   r   rg   r   r+   r   rB   r   r   r   r   	run_torch  sV    



	r   c                  C   s  t  } | jdddtdddddgdd	 | jd
ddtdtt dd	 | jddddd | jdddttt ddd | jdddtd dd | jdddtddd | jd ddd!d" | jdd# | jd$ddd%d" | jdd& | jd'ddd(d" | jdd) | jd*d+t	d,d,d-d.d/d0d1d2d3gd4d5 | jd6dt	d7d8d | jd9dt	d7d:d | jd;d<dt	d=d>d | jd?d@dt	d,dAd | jdBdCdt	t
d,dDdEdFd | jdGdHdt	t
d,d2d/dId | jdJdKdddLd" | jddM |  }|S )NNz-ez--engineFr   r   r2   r
   z-Engines to benchmark. Default is onnxruntime.)requiredtyper   choiceshelpz-rz
--providerr   z8Provider to benchmark. Default is CUDAExecutionProvider.z-tz--tuning
store_truezsEnable TunableOp and tuning. This will incur longer warmup latency, and is mandatory for some operators of ROCm EP.)actionr   z-vz	--versionr   z>Stable diffusion version like 1.5, 2.0 or 2.1. Default is 1.5.)r   r   r   r   r   z-pz
--pipelinez[Directory of saved onnx pipeline. It could be the output directory of optimize_pipeline.py.)r   r   r   r   z-wz
--work_dirr   z?Root directory to save exported onnx models, built engines etc.z--enable_safety_checkerzEnable safety checker)r   r   r   )enable_safety_checkerz--enable_torch_compilez#Enable compile unet for PyTorch 2.0)r-   z--use_xformerszUse xformers for PyTorch)r.   z-bz--batch_sizery            r   
          z)Number of images per batch. Default is 1.)r   r   r   r   z--heighti   z$Output image height. Default is 512.z--widthz#Output image width. Default is 512.z-sz--steps2   zNumber of steps. Default is 50.z-nz--num_promptsz Number of prompts. Default is 1.z-cz--batch_count      z(Number of batches to test. Default is 5.z-mz--max_trt_batch_sizezdMaximum batch size for TensorRT. Change the value may trigger TensorRT engine rebuild. Default is 4.z-gz--enable_cuda_graphz/Enable Cuda Graph. Requires onnxruntime >= 1.16)r}   )argparseArgumentParseradd_argumentstrlist	PROVIDERSkeys	SD_MODELSset_defaultsintr^   
parse_args)parserargsr   r   r   parse_arguments!  s   





					

r  c                    sL   dd l }|t }| D ]( | r<t fdddD rt j qd S )Nr   c                 3   s   | ]}| j kV  qd S )N)r    ).0xlibr   r   	<genexpr>  s     z)print_loaded_libraries.<locals>.<genexpr>)ZlibcuZlibnvr
   )psutilProcessr   getpidZmemory_mapsanyr5   r    )Zcuda_related_onlyr  pr   r  r   print_loaded_libraries  s
    r  c                  C   s  t  } t|  | jdkr| jdkr,dtjd< ddlm} ddlm} |	||	dkrbdtjd	< | j
r| jdkr| jd
kr| jd kstd|	||	dk rtdtjdd | jdkrdnd}t|d }td| t| j }t| j }| jdkr| jdkrd| jkrVtd t| j| j| jd| j| j| j| j| j||| jd| j
d}nDtd t| j| j| j| j | j| j| j| j| j||| jd| j
d}n| jdkr |dkr d| jkrdtjd	< t|| j|| j| j | j| j| j| j| j||d}nr| jdkrz| jr$tj | js,t!dtd| d| j"  t#|| j|| j| j | j| j| j| j| j||| j"d }n| jdkrd| jkrtd! t$| j| j| jd| j| j| j| j| j||| jd| j
d}n| jdkr$td" t%| j| j|| jd| j| j| j| j| j||| jd| j
d#}nNtd$| j& d%| j' d& t(|| j| j | j&| j'| j| j| j| j| j||d'}t| t)d(d)d*d+L}d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<g}	t*j+||	d=}
|
,  |
-| W 5 Q R X | jd>krt.| jd
k d S )?Nr   )r   1ZORT_DISABLE_TRT_FLASH_ATTENTIONr   )rU   rR   z1.16.0Z!ORT_ENABLE_FUSED_CAUSAL_ATTENTION)r   r
   z:The stable diffusion pipeline does not support CUDA graph.z1.16z.CUDA graph requires ONNX Runtime 1.16 or laterz%(funcName)20s: %(message)s)fmtr	   r   z&GPU memory used before loading models:r
   r   zNTesting Txt2ImgXLPipeline with static input shape. Backend is ORT TensorRT EP.TF)r   rU   r8   r   rH   rJ   rI   rW   rV   r   rg   r   r   r   zLTesting Txt2ImgPipeline with static input shape. Backend is ORT TensorRT EP.r   r   )r   r   r   r8   r   rH   rJ   rI   rW   rV   r   rg   z?--pipeline should be specified for the directory of ONNX modelsz/Testing diffusers StableDiffusionPipeline with z provider and tuning=)r   r   r   r8   r   rH   rJ   rI   rW   rV   r   rg   rx   zGTesting Txt2ImgXLPipeline with static input shape. Backend is TensorRT.zETesting Txt2ImgPipeline with static input shape. Backend is TensorRT.)r   rU   r   r8   r   rH   rJ   rI   rW   rV   r   rg   r   r   r   zNTesting Txt2ImgPipeline with dynamic input shape. Backend is PyTorch: compile=z, xformers=r   )r   r8   r   r-   r.   rH   rJ   rI   rW   rV   r   rg   zbenchmark_result.csvar>   )modenewliner   r   r7   rU   r   r   rH   rJ   rI   r8   rV   rW   rX   rY   rZ   r[   r}   )
fieldnamesry   )/r  r5   r7   rU   r   environ	packagingr   rS   parser}   r   r   r   coloredlogsinstallr   r   r   r   r   r8   rH   rJ   rI   rW   rV   Zmax_trt_batch_sizer   r   r   r    isdirr"   rx   r   r   r   r-   r.   r   opencsv
DictWriterwriteheaderwriterowr  )r  rU   rs   rg   r   Zsd_modelr   r   Zcsv_fileZcolumn_namesZ
csv_writerr   r   r   main  s\   










r"  __main__)N)r   T)FT)FT)FT)FT)T)(r   r  r   re   sysr_   __init__r  r2   Zbenchmark_helperr   r   r   r   r   r   boolr,   r6   r  rA   rt   rw   r   r   r   r   r   r   r   r   r   r  r  r"  __name__	traceback	Exceptionprint_exceptionexc_infor   r   r   r   <module>   s.  
ED5  1E:         %  lB !
	 F

