U
    yh'                     @   s   d dl Z d dlZd dlmZ d dlZd dlmZ ddlmZm	Z	m
Z
 dddd	d
dgZdd Zdd Zdd Zdd Ze jG dd dZdd Zdd ZdS )    N)defaultdict)
DeviceType   )create_bandwidth_info_strdo_bench_gpuget_num_bytesZforeachZpersistent_reductionZ	pointwiseZ	reductionZ
split_scantemplatec                    s.    fddt D }t|dkr&|d S dS dS )z
    Similar to get_kernel_category but use the source code. Call this API
    if we have not compile the src_code to module yet.
    c                    s   g | ]}d |  kr|qS )z@triton_heuristics. .0chsrc_coder	   S/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/wrapper_benchmark.py
<listcomp>   s     z6get_kernel_category_by_source_code.<locals>.<listcomp>r   r   unknownN_kernel_category_choiceslen)r   choicesr	   r   r   "get_kernel_category_by_source_code   s    
r   c                    s.    fddt D }t|dkr&|d S dS dS )a  
    Given the module defining a triton kernel, return the category of the kernel.
    Category can be one of:
    - pointwise
    - reduction
    - persistent_reduction

    Currently we simply decide the category depending on what decorator is imported
    by the kernel.
    c                    s   g | ]}| j kr|qS r	   )__dict__r
   
kernel_modr	   r   r   1   s     
 z'get_kernel_category.<locals>.<listcomp>r   r   r   Nr   )r   r   r	   r   r   get_kernel_category&   s    r   c                    s<   ddl m   fdd| j D }t|dks4t|d S )Nr   CachingAutotunerc                    s(   g | ] \}}| d rt| r|qS )triton_)
startswith
isinstance)r   kvr   r	   r   r   ;   s   
 
z%get_triton_kernel.<locals>.<listcomp>r   )Z)torch._inductor.runtime.triton_heuristicsr   r   itemsr   AssertionError)modZ	cand_listr	   r   r   get_triton_kernel8   s    
r%   c              	      s  ddl m} d}|j D ]p\}tdrtds:qt}t}  tdd |j	j
D }|jdddkrt d	|id
 dfdd	}| dd|dd   d|dd  }	|r(tdst }
t|	 |
 D ].\}}td|||j|j|j d|j  qn\t fddddd}t|jdksXtd|jd }t|||j|j|j|	 dd |d7 }q|dkrtd dS )aX  
    An experimental API used only when config.benchmark_kernel is true.

    Run the kernel benchmarks for all the kernels cached in PyCodeCache.
    Used in the compiled modules.

    Put this method here rather than codegen it for convenience since its implementation
    does not change based on different graph modules being compiled.
    r   )PyCodeCacheget_argscallc                 S   s   g | ]}| d r|qS )Z
in_out_ptr)r   )r   arg_namer	   r	   r   r   Y   s   
z)benchmark_all_kernels.<locals>.<listcomp>Zkernel_num_gbNZnum_in_out_argsg    eA c                    sZ   t dd |||fD s8d|dd|dd|dd}nd	} | d
  }t|  |||dS )Nc                 s   s   | ]}|d kV  qd S Nr	   )r   xr	   r	   r   	<genexpr>d   s     z>benchmark_all_kernels.<locals>.get_info_str.<locals>.<genexpr>  3z regs  z	 spills  8z shared memr*   g     @@)prefixsuffix)anyr   )msn_regsn_spillssharedr1   Zkernel_detail_strZgb_per_s)num_gbr	   r   get_info_strc   s        z+benchmark_all_kernels.<locals>.get_info_strZ20    
   benchmark_all_configsr.   z @ c                      s
     S r+   )r(   r	   )argsr   r	   r   <lambda>|       z'benchmark_all_kernels.<locals>.<lambda>(   T)repZ
fast_flushr   z.Autotuner should have selected the best config)r1   zpNo kernel with benchmark functionality found. Make sure you run inductor with config.benchmark_kernel being True)r*   )Ztorch._inductor.codecacher&   cacher"   hasattrr%   r   r'   r   fn	arg_namesZinductor_metagetr   upperr#   r=   printr5   r6   r7   configr   Z	launchers)benchmark_namer=   r&   ZnfoundZ
kernel_keyZtriton_kernelZkernel_categoryZnum_in_out_ptrsr9   Zkernel_descZbench_resultlauncherr4   r	   )r>   r   r8   r   benchmark_all_kernelsD   s^    
(
 



rM   c                   @   s.   e Zd ZU eed< eed< eed< eed< dS )ProfileEventcategorykeyself_cuda_time_mscountN)__name__
__module____qualname__str__annotations__floatr	   r	   r	   r   rN      s   
rN   c                    s   fddt t  fdd}|D ]t}|jr:td|jtjkrHq(d}|jdr|jdrjd	}n(|jd
r|d}n|jdrd}nd}||| q(fdd fdd}|  d S )Nc                    s   | j d   S )zT
        ev.self_cuda_time_total is in microsecond. Convert to millisecond.
          )self_cuda_time_totalev)nrunsr	   r   get_self_cuda_time   s    z4parse_profile_event_list.<locals>.get_self_cuda_timec                    s.   t || j| | j d} | | d S )N)rO   rP   rQ   rR   )rN   rP   rR   append)r\   rO   Z
profile_ev)
all_eventsr^   r]   r	   r   	add_event   s    z+parse_profile_event_list.<locals>.add_eventz!Don't support the legacy profilerr   r   Z
triton_poitriton_pointwiseZ
triton_redtriton_reductionZ
triton_pertriton_persistent_reductiontriton_unknownc                    s   ddl m } |jdd dd g }d}td|  d	 |D ]D}||j7 }|j  d
 dd}||jd d |j|j|g q:|d|d|  d
 ddg t||ddddgd |S )Nr   )tabulatec                 S   s   | j S r+   )rQ   r[   r	   r	   r   r?      r@   zCparse_profile_event_list.<locals>.report_category.<locals>.<lambda>T)rP   reverse        z
  == z category kernels == d   .2f%x   ZTotalr*   ZKernelzSelf CUDA TIME (ms)ZCountZPercent)headers)rf   sortrI   rQ   r_   rP   rR   )rO   Zprofile_eventsrf   rows
total_timer\   percent)wall_time_msr	   r   report_category   s&    
" 
z1parse_profile_event_list.<locals>.report_categoryc                     s  dddddg} t   t | s6tt   i }d}| D ]*}| krB| | }|||< ||7 }qB| d dd	}td
|  tddd d }| D ].}||d d dd	}|d| 7 }q|d| ddd7 }t| d S )Nrb   rc   rd   re   r   rh   ri   rj   rk   z#
Percent of time when GPU is busy: zTotal wall time z.3fz mszOutput for tabulate: z, r4   )setkeysissubsetr#   listrI   rG   )Zcategory_listZper_category_wall_timeZtotal_cuda_msrO   _timeZgpu_busy_percentZtabulate_linerq   )r`   rK   rs   rr   r	   r   report   s8    

z(parse_profile_event_list.<locals>.report)	r   rw   Z	is_legacyr#   Zdevice_typer   ZCPUrP   r   )rK   
event_listrr   r]   ra   r\   rO   ry   r	   )r`   rK   r^   r]   rs   rr   r   parse_profile_event_list   s(    	&r{   c              	   C   s  ddl }| }|jddddd |jdd	dd
d |jddddd | }|jrbt| |j nd}d}|||dd }|jsdS tj	jdd}|||d W 5 Q R X t
  d}	||	 td|  d td|	  |jdd}
t|
jddd t| |
|||  dS )zM
    This is the function called in __main__ block of a compiled module.
    r   Nz--benchmark-kernelsz-k
store_truez,Whether to benchmark each individual kernels)actionhelpz--benchmark-all-configsz-cz8Whether to benchmark each individual config for a kernelz	--profilez-pz&Whether to profile the compiled moduler<   )timesrepeatrY   T)Zrecord_shapesz/compiled_module_profile.jsonz4Profiling result for a compiled module of benchmark :z+Chrome trace for the profile is written to )Zgroup_by_input_shaperZ   )Zsort_byZ	row_limit)argparseArgumentParseradd_argument
parse_argsZbenchmark_kernelsrM   r=   ZprofiletorchZprofilertempfile
gettempdirZexport_chrome_tracerI   Zkey_averagestabler{   )rK   Zbenchmark_compiled_module_fnr   parserr>   r   r   rr   ppathrz   r	   r	   r   compiled_module_main   sT    
   r   )dataclassesr   collectionsr   r   Ztorch.autogradr   Zruntime.runtime_utilsr   r   r   r   r   r   r%   rM   	dataclassrN   r{   r   r	   r	   r	   r   <module>   s(   
N	d