U
    hG                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	 ddl
mZ dd Zd&d
dZd'ddZd(ddZG dd dZG dd dZdd Zd)ddZd*ddZdd Zed+d"d#Zd,d$d%ZdS )-    N)contextmanager)AnyDictList   )languagec                 C   sL   d | } dddd|  dg}t|}|tjjd}dd |D }|S )	N,
nvidia-smi-i0z--query-gpu=z--format=csv,noheader,nounitsc                 S   s   g | ]}t |qS  )int.0xr   r   @/var/www/html/venv/lib/python3.8/site-packages/triton/testing.py
<listcomp>   s     znvsmi.<locals>.<listcomp>)join
subprocesscheck_outputdecodesysstdoutencodingsplit)attrscmdoutretr   r   r   nvsmi
   s    

r      meanc              	   C   s  ddl }|dkst|j |j kr0td|   |dk	r`|D ]}|  |d d|_qB|j	 }|j
| |   W 5 Q R X |j  |jjdd}|jjdd}|  |  |  |j  ||}	tdt||	 }
|j	 }|j
|8 t|
D ](}|dk	r,|D ]}d|_q|   qW 5 Q R X |j  g }d}t|D ]X}|jjdd}|jjdd}|  |  |  |j  ||||
 g7 }qZ||}t||| S )	a+  
    Benchmark the runtime of the provided function.

    :param fn: Function to benchmark
    :type fn: Callable
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    r   Nminmaxr!   ZmedianzQCannot capture graph in default stream. Please use side stream in benchmark code.TZenable_timingr   
   )torchAssertionErrorcudaZcurrent_streamZdefault_streamRuntimeErrorZdetach_Zrequires_grad_gradZ	CUDAGraphgraphsynchronizeEventrecordZreplayelapsed_timer$   r   rangetensorgetattritem)fnrepgrad_to_nonereturn_moder'   r   gstart_event	end_eventestimate_msn_repeatir   Z	n_retriestimesr   r   r   do_bench_cudagraph   sT    










r@      d   Tc                    s  |dkst ddl |    j  |r@ jtd jdd}n jtd jdd} jjdd	} jjdd	}	|  t	d
D ]}
|
  |   q|	   j  ||	d
 }tdt|| }tdt|| } fddt	|D } fddt	|D }	t	|D ]}
|   qt	|D ]H}|dk	rD|D ]}d|_q6|
  ||   |   |	|   q$ j   jdd t||	D  jd}|dk	r؈ | j| jd }t|dkr|d }|S t || S )a  
    Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
    the 20-th and 80-th performance percentile.

    :param fn: Function to benchmark
    :type fn: Callable
    :param warmup: Warmup time (in ms)
    :type warmup: int
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    :param quantiles: Performance percentile to return in addition to the median.
    :type quantiles: list[float]
    :param fast_flush: Use faster kernel to flush L2 between measurements
    :type fast_flush: bool
    r"   r   Ng    Ar)   )dtypedeviceg    ATr%      r   c                    s   g | ]} j jd dqS Tr%   r)   r.   r   r>   r'   r   r   r      s     zdo_bench.<locals>.<listcomp>c                    s   g | ]} j jd dqS rF   rG   rH   rI   r   r   r      s     c                 S   s   g | ]\}}| |qS r   )r0   )r   ser   r   r   r      s     )rC   )r(   r'   r)   r-   emptyr   int8r.   r/   r1   Zzero_r0   r$   r+   r2   zipfloatZquantiletolistlenr3   r4   )r5   Zwarmupr6   r7   Z	quantilesZ
fast_flushr8   cacher:   r;   _r<   Zn_warmupr=   r>   r   r?   r   r   rI   r   do_benchR   sL    





 
rT    c                 C   sN  dd l }dd l}t| |js&|| } t||js<||}|d krHd}t|rZ|| jn|}|d krjd}t|r||| jn|}t| |jr| j|jkr|  } | 	 
   } t||jr|j|jkr| }|	 
   }| jdks|jdkr|jj| |||dd d S |j| |||dsJt| d|  d	| d
| d| d
d S )Nr   g{Gz?g        r   T)atolrtolZ	equal_nan)rV   rW    z is not close to z (atol=z, rtol=))numpyr'   
isinstanceZTensorr2   callablerC   bfloat16rO   cpudetachsizeZtestingZassert_allcloseZallcloser(   )r   yrV   rW   err_msgnpr'   r   r   r   assert_close   s2    

rd   c                   @   sL   e Zd ZdZdee ee eee ee eeeef eeeedddZ	dS )		Benchmarkzk
    This class is used by the :code:`perf_report` function to generate line plots with a concise API.
    rU   FN)x_namesx_valsline_arg	line_vals
line_names	plot_nameargsxlabelylabelx_logy_logc                 C   sL   || _ || _|
| _|| _|| _|| _|| _|| _|| _|	| _	|| _
|| _dS )a  
        Constructor.
        x_vals can be a list of scalars or a list of tuples/lists. If x_vals is a list
        of scalars and there are multiple x_names, all arguments will have the same value.
        If x_vals is a list of tuples/lists, each element should have the same length as
        x_names.

        :param x_names: Name of the arguments that should appear on the x axis of the plot.
        :type x_names: List[str]
        :param x_vals: List of values to use for the arguments in :code:`x_names`.
        :type x_vals: List[Any]
        :param line_arg: Argument name for which different values correspond to different lines in the plot.
        :type line_arg: str
        :param line_vals: List of values to use for the arguments in :code:`line_arg`.
        :type line_vals: List[Any]
        :param line_names: Label names for the different lines.
        :type line_names: List[str]
        :param plot_name: Name of the plot.
        :type plot_name: str
        :param args: Dictionary of keyword arguments to remain fixed throughout the benchmark.
        :type args: Dict[str, Any]
        :param xlabel: Label for the x axis of the plot.
        :type xlabel: str, optional
        :param ylabel: Label for the y axis of the plot.
        :type ylabel: str, optional
        :param x_log: Whether the x axis should be log scale.
        :type x_log: bool, optional
        :param y_log: Whether the y axis should be log scale.
        :type y_log: bool, optional
        N)rf   rg   ro   rh   ri   rj   rp   stylesrm   rn   rk   rl   )selfrf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   colorrq   r   r   r   __init__   s    .zBenchmark.__init__)rU   rU   FFNN)
__name__
__module____qualname____doc__r   strr   r   boolrt   r   r   r   r   re      s(         
re   c                   @   s4   e Zd Zdd ZdeeeedddZdd	d
ZdS )Markc                 C   s   || _ || _d S N)r5   
benchmarks)rr   r5   r}   r   r   r   rt     s    zMark.__init__F   )bench	save_path
show_plots
print_datac              
      sv  dd l }dd lm}	 dd l}
|j}dd |jD }dd |jD }t|j}|
j|| | | d}|jD ] t	 tt
fs fdd|D  t t|krtdt| d  tt| }g g g   }}}|jD ]t}| jf ||j|i|j|}z|\}}}W n& tk
r0   |d d   }}}Y nX ||g7 }||g7 }||g7 }qt | | | |jt|< qj|jr|	  |	 }|d }t|jD ]\}}||d	  ||d
   }}|jr|j| d nd }|jr|j| d nd }|j|| || |||d |  s|  s|t}|t}|j|| ||d|d q|   |!|j"pp| |#|j$ |%|j&rdnd |'|j(rdnd |r|	)  |r|	*|j+,||j d |||j  }|r"|j-d dkr"|j./ \}}|| ||  |d< |rBt0|jd  t0|1  |rr|j2|j+,||j dd| ddd |S )Nr   c                 S   s   g | ]}| d qS )-minr   r   r   r   r   r     s     zMark._run.<locals>.<listcomp>c                 S   s   g | ]}| d qS )-maxr   r   r   r   r   r     s     )columnsc                    s   g | ]} qS r   r   )r   rS   r   r   r   r     s     z	Expected z values, got r   r   r   )labelrs   Zlsg333333?)alphars   logZlinearz.png   Diff:z.csvz%.fF)Zfloat_formatindex)3osZmatplotlib.pyplotZpyplotZpandasrj   listrf   Z	DataFramerg   r[   tuplerQ   
ValueErrordictrN   ri   r5   rh   rl   	TypeErrorlocrk   ZfigureZsubplot	enumeraterq   ZplotZisnullallZastyperO   Zfill_betweenZlegendZ
set_xlabelrm   Z
set_ylabelrn   Z
set_xscalero   Z
set_yscalerp   showZsavefigpathr   shaper   rP   printZ	to_stringZto_csv)rr   r   r   r   r   Zdiff_colZsave_precisionZkwragsr   ZpltpdZy_meanZy_minZy_maxrf   dfZx_argsZrow_meanZrow_minZrow_maxra   r   ZaxZfirst_xr>   colZstyZcol0Zcol1r   r   r   _run  sv    



 

"z	Mark._runrU   c                 K   s   t | jt}|r| jgn| j}g }|rRtj|dd ttj|dd}	|	d |D ]6}
|	| j
|
|||f| |rV|	d|
j d qV|r|	d |	  |r|r|d	 S |S d S )
NT)exist_okzresults.htmlwz<html><body>
z<image src="z.png"/>
z</body></html>
r   )r[   r}   re   r   makedirsopenr   r   writeappendr   rk   close)rr   r   r   r   Z	return_dfkwargsZhas_single_benchr}   Z
result_dfshtmlr   r   r   r   runP  s&    

zMark.runN)Fr~   )FFrU   F)	ru   rv   rw   rt   re   ry   rz   r   r   r   r   r   r   r{     s
     Er{   c                    s    fdd}|S )z
    Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value.

    :param benchmarks: Benchmarking configurations.
    :type benchmarks: List of :class:`Benchmark`
    c                    s
   t |  S r|   )r{   )r5   r}   r   r   <lambda>o      zperf_report.<locals>.<lambda>r   )r}   wrapperr   r   r   perf_reporth  s    r   c                 C   s^   ddl }ddlm} | s"|j } |jj| d }|jj| d }|| d d d	 }|S )
z return DRAM bandwidth in GB/s r   Nr   driverZmem_clock_rateZmem_bus_widthr   g    .A   )r'   runtimer   r)   current_deviceactiveutilsget_device_properties)rD   r'   r   Zmem_clock_khzZ	bus_widthZbw_gbpsr   r   r   get_dram_gbpss  s    
r   c           	      C   s   dd l }ddlm} |s"|j }|jj|d d }|j|}|d dk rd| |j	ks^t
d}nV| |j|jfkrzd}n@| |j	|j|jfkrd}n&| |jtjtjtjfkrd	}ntd
|| | d }|S )Nr   r   r   multiprocessor_count   r      i   i   dtype not supported&.>)r'   r   r   r)   r   r   r   r   get_device_capabilityfloat16r(   float32Zint32r]   Zint16rM   tlZ
float8e4nvZfloat8e4b15Zfloat8e5r*   	rC   Z
clock_raterD   r'   r   Znum_subcoresZ
capabilityZops_per_sub_coretflopsr   r   r   get_max_tensorcore_tflops  s$    
r   c                     s    fdd}|S )Nc                    s   t   fdd}|S )Nc            
         s   dd l }|t  }  | k}|r|dkrtjjd }tj	d dd}d|ksht
d|d jjj}| d	j d
| d}tjddd|gd|d}	|	jdkst
ddt|	jkst
n
| | d S )Nr   zcuda-memcheck__file__PATH1)r   ZPYTORCH_NO_CUDA_MEMORY_CACHINGrequestz@memcheck'ed test must have a (possibly unused) `request` fixturez::[]Zpytestz-vsT)capture_outputenvz7cuda-memcheck returned an error: bounds checking failedzERROR SUMMARY: 0 errors)psutilProcessr   getppidnameitemsr   realpath__globals__environr(   nodeZcallspecidru   r   r   
returncodery   r   )
rl   r   r   Z	ppid_nameZrun_cuda_memcheckr   r   Ztest_idr   r   )target_kwargstest_fnr   r   r     s    z1cuda_memcheck.<locals>.decorator.<locals>.wrapper)	functoolswraps)r   r   r   )r   r   	decorator  s    z cuda_memcheck.<locals>.decoratorr   )r   r   r   r   r   cuda_memcheck  s    r   F    c                 c   s  zt dddddg t dddd|  d	|  g t dddd
| d	| g tdgd }tdgd }t||  dk std|  dt|| dk std| dd|  }d| d }||fV  W 5 t dddddg t ddddg t ddddg X d S )Nr	   r
   r   z-pmz-rgcz-rmcr   z--lock-gpu-clocks=r   z--lock-memory-clocks=zclocks.current.smr   zclocks.current.memoryr&   zGPU SMs must run at z MHzg 3O?i   gMbP?)r   r   r   absr(   )Zref_sm_clockZref_mem_clockZcur_sm_clockZcur_mem_clockr   Zgbpsr   r   r   set_gpu_clock  s0      r   c           	      C   s   dd l }ddlm} |s"|j }|jj|d d }|j }|d dk rx| |j	kr^d}q| |j
krnd}qtd	n.| |j	krd}n| |j
|jfkrd}ntd	|| | d
 }|S )Nr   r   r   r   r   r       @   r   r   )r'   r   r   r)   r   r   r   r   r   r   r   r*   r]   r   r   r   r   get_max_simd_tflops  s&    





r   )r    Nr!   )rA   rB   NNTr!   )NNrU   )N)N)r   r   )N)r   r   r   r   
contextlibr   typingr   r   r   rU   r   r   r   r@   rT   rd   re   r{   r   r   r   r   r   r   r   r   r   r   <module>   s&   	
?
L
%Bc

