from __future__ import annotations

import contextlib
import ctypes
import dataclasses
import functools
import logging
import os
import queue
import time
import warnings
from concurrent.futures import ThreadPoolExecutor
from ctypes import byref, c_size_t, c_void_p, CDLL
from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Sequence,
    TYPE_CHECKING,
    Union,
)

import torch
import torch._inductor.async_compile  # noqa: F401
from torch import multiprocessing
from torch._dynamo.testing import rand_strided
from torch._inductor import ir
from torch._inductor.codecache import (
    CppCodeCache,
    CUDACodeCache,
    DLLWrapper,
    get_hash,
    PyCodeCache,
)

if TYPE_CHECKING:
    from multiprocessing.process import BaseProcess
    from multiprocessing.queues import Queue
    from types import ModuleType

    from torch._inductor.select_algorithm import TritonTemplateCaller

from . import config
from .runtime.runtime_utils import do_bench_cpu, do_bench_gpu
from .virtualized import V

CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES"
EXIT_HANDLER_REGISTERED = False

log = logging.getLogger(__name__)


class Ping:
    pass


class Pong:
    pass


class NonzeroWorkspaceNotSupportedError(Exception):
    pass
dV  W 5 |dkrJt jt= n
|t jt< X dS )z
    Context manager to set the CUDA_VISIBLE_DEVICES environment variable to the
    specified single device. If device is None, don't manipulate the environment.
    N)osenvirongetr"   str)r.   currentr(   r(   r)   set_cuda_visible_deviceG   s    

r4   c                   @  s   e Zd ZU dZdZded< dZded< dZded< dZded	< e	d
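

# Example (sketch): restrict newly spawned workers to a single GPU. Inside the
# block only device 1 is visible; the previous CUDA_VISIBLE_DEVICES setting
# (or its absence) is restored on exit. `spawn_worker` is a hypothetical helper.
#
#     with set_cuda_visible_device(1):
#         spawn_worker()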


@dataclasses.dataclass
class TuningProcess:
    """
    Abstraction for launching a helper process to benchmark kernels. Spawns
    the parent process and uses multiprocessing queues to send benchmark
    requests and return results.
    """

    device: Optional[int] = None
    process: Optional[BaseProcess] = None
    request_queue: Optional[Queue[Any]] = None
    response_queue: Optional[Queue[Any]] = None

    @staticmethod
    def process_main(
        request_queue: Queue[Any],
        response_queue: Queue[Any],
    ) -> None:
        """
        Entry point for the child process.
        """
        log.debug(
            "Entering TuningProcess child. Visible devices = %s",
            os.environ.get(CUDA_VISIBLE_DEVICES),
        )
        try:
            TuningProcess.workloop(request_queue, response_queue)
        except Exception as ex:
            log.exception("Exception in TuningProcess")

    @staticmethod
    def workloop(request_queue: Queue[Any], response_queue: Queue[Any]) -> None:
        """
        Work loop for the benchmarking subprocess.
        """
        while True:
            obj = request_queue.get()

            if obj is None:
                break  # None is a sentinel for the child to terminate
            elif isinstance(obj, Ping):
                response_queue.put(Pong())
            elif isinstance(obj, BenchmarkRequest):
                response_queue.put(obj.benchmark())
            else:
                raise RuntimeError(f"Invalid request type {type(obj)}")

    def valid(self) -> bool:
        """
        True if the sub-process has been initialized.
        """
        return (
            self.process is not None
            and self.request_queue is not None
            and self.response_queue is not None
        )

    def clear(self) -> None:
        """
        Reset to an uninitialized state.
        """
        self.process = self.request_queue = self.response_queue = None

    def initialize(self) -> None:
        """
        Create child process, request/response queues, and do the warm up.
        Set the environment to make only the provided GPU device visible
        to the process.
        """
        if self.valid():
            return

        # The CUDA runtime does not work with "fork"; use "spawn" to start processes.
        ctx = multiprocessing.get_context("spawn")
        self.request_queue = ctx.Queue()
        self.response_queue = ctx.Queue()

        self.process = ctx.Process(
            target=self.process_main,
            args=(
                self.request_queue,
                self.response_queue,
            ),
        )
        assert self.process is not None
        with set_cuda_visible_device(self.device):
            self.process.start()

    def put(self, obj: Any) -> None:
        """
        Push a work item to the child process.
        """
        # In case of a prior crash, ensure the subprocess is running.
        self.initialize()
        assert self.request_queue is not None
        self.request_queue.put(obj)

    def get(
        self,
        result_timeout: float = 120.0,
        graceful_timeout: float = 3.0,
        terminate_timeout: float = 1.0,
    ) -> Any:
        """
        Get a response from the child process. Raises queue.Empty on timeout
        or if the process dies.

        This method is (so far) only used by TuningProcessPool, where torch._inductor.config entries are being used
        to populate the timeouts:

        Arguments:

            @param result_timeout: Timeout in seconds, defaults to 120.0 or to
                                   config.max_autotune_subproc_result_timeout_seconds when called by TuningProcessPool
            @param graceful_timeout: Timeout in seconds to allow graceful shutdown (SIGTERM is sent after this time).
                                    Defaults to 3.0 or to config.max_autotune_subproc_graceful_timeout_seconds
            @param terminate_timeout: Timeout in seconds after SIGTERM, until we send SIGKILL if the process
                                      remains alive. Defaults to 1.0 or to
                                      config.max_autotune_subproc_terminate_timeout_seconds.
        Returns:
            A response from the child process (Any type)
        """
        assert self.process is not None
        assert self.response_queue is not None
        while True:
            try:
                remaining_timeout = result_timeout
                res = None
                while remaining_timeout is not None and remaining_timeout >= 1.0:
                    remaining_timeout -= 0.5
                    try:
                        res = self.response_queue.get(timeout=0.5)
                        break
                    except queue.Empty:
                        if not self.process.is_alive():
                            raise  # is being caught a few lines below
                if res is None:
                    res = self.response_queue.get(timeout=remaining_timeout)
                return res
            except queue.Empty:
                status = self.process.exitcode
                if status is None:
                    self.kill(
                        graceful_timeout=graceful_timeout,
                        terminate_timeout=terminate_timeout,
                    )
                else:
                    # The child process crashed; clear the state so a later
                    # request can respawn it.
                    self.clear()
                # Re-raise queue.Empty so the caller can handle the failure.
                raise

    def terminate(self) -> None:
        """
        Signal the child process to terminate.
        """
        if self.valid():
            assert self.process is not None
            assert self.request_queue is not None
            self.request_queue.put(None)

    def wait(self) -> None:
        """
        Wait for the child process to exit.
        """
        if self.process is not None:
            self.process.join()
            self.clear()

    def kill(
        self, graceful_timeout: float = 5.0, terminate_timeout: float = 1.0
    ) -> None:
        # Try to shut the process down gracefully first; escalate to SIGTERM
        # and finally SIGKILL if it stays alive.
        if self.process is not None:
            self.terminate()
            self.process.join(timeout=graceful_timeout)
            if self.process.is_alive():
                log.warning(
                    "Sending SIGTERM to process with PID %d",
                    self.process.pid,
                )
                self.process.terminate()
                self.process.join(timeout=terminate_timeout)
                if self.process.is_alive():
                    log.error(
                        "Sending SIGKILL to process with PID %d",
                        self.process.pid,
                    )
                    self.process.kill()
            self.clear()
ZddddZddddZ	dddddZ
dddddZdS )TuningProcessPoolz
    Maintains a pool of TuningProcesses to benchmark kernels in parallel
    across devices. By default, we create one TuningProcess per device and
    set the sub-process environment to make only that device visible.
    Nz$Optional[queue.Queue[TuningProcess]]	processeszOptional[ThreadPoolExecutor]executorr9   rJ   c                 C  s   | j dk| jdkkst| j dk	r&dS |  }td| t | _ |D ].}t|d}|	  |
t  | j 
| qH| j jD ]}t|jddtstqtt|d| _tsdaddl}|| j dS )z,
        Start the child processes.
        Nz$Sub-process autotune device list: %sr-   )rd   )max_workersTr   )rr   rs   rT   get_device_listr;   r<   r_   r   r5   rW   rC   r#   rB   r1   r*   r   lenEXIT_HANDLER_REGISTEREDatexitregisterrg   )rM   devicesr.   prx   r(   r(   r)   rW   $  s$    


zTuningProcessPool.initializezSequence[Optional[int]]c                 C  sZ   t jsdgS tj }ttjkrNdd tjt dD }t	||ksJt
|S tt|S )zD
        Gather the list of devices to be used in the pool.
        Nc                 S  s   g | ]}t |qS r(   )int).0dr(   r(   r)   
<listcomp>U  s     z5TuningProcessPool.get_device_list.<locals>.<listcomp>,)r   Zautotune_multi_devicetorchcudaZdevice_countr"   r/   r0   splitrv   rT   listrange)rM   countrz   r(   r(   r)   ru   I  s    

z!TuningProcessPool.get_device_listc                 C  sZ   | j dk	r| j   d| _ | jdk	rV| jjD ]}|  q,| jjD ]}|  qBd| _dS )z:
        Signal all child processes to terminate.
        N)rs   shutdownrr   r_   rg   ri   )rM   r{   r(   r(   r)   rg   [  s    




zTuningProcessPool.terminater   float)choicer:   c              	   C  s   |j dk	st| jdk	st| j }||j  zTz|tjtjtjW W :S  t	j
k
r   td| d td Y W S X W 5 | j| X dS )z
        Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
        remove it from the queue, execute the benchmark in that subprocess, and return
        the TuningProcess to the queue.
        NzFailed to benchmark choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.inf)ZbmreqrT   rr   r1   rC   r   Z+max_autotune_subproc_result_timeout_secondsZ-max_autotune_subproc_graceful_timeout_secondsZ.max_autotune_subproc_terminate_timeout_secondsr_   r`   warningswarnr   )rM   r   r6   r(   r(   r)   rQ   j  s     


zTuningProcessPool.targetList[TritonTemplateCaller]!Dict[TritonTemplateCaller, float]choicesr:   c                 C  sP   | j dk	std| jdk	s ti }t|| j| j|D ]\}}|||< q:|S )z>
        Benchmark each choice in a separate process.
        Nz&Tuning process pool is not initialized)rr   rT   rs   zipmaprQ   )rM   r   resultsr   resultr(   r(   r)   rE     s    
zTuningProcessPool.benchmark)r%   r&   r'   rn   rr   ro   rs   rW   ru   rg   rQ   rE   r(   r(   r(   r)   rq     s   


@dataclasses.dataclass
class TensorMeta:
    device: torch.device
    dtype: torch.dtype
    sizes: torch._prims_common.ShapeType
    strides: torch._prims_common.StrideType
    offset: int
    name: Optional[str] = None

    @classmethod
    def from_irnodes(
        cls, irnodes: Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]
    ) -> Union[TensorMeta, List[TensorMeta]]:
        if isinstance(irnodes, (list, tuple)):
            result = [cls.from_irnodes(x) for x in irnodes]
            assert all(isinstance(x, TensorMeta) for x in result)
            return result

        node = irnodes
        if isinstance(node, ir.Layout):
            node = ir.Buffer("fake", node)

        dtype = node.get_dtype()
        assert dtype is not None

        return TensorMeta(
            device=node.get_device(),
            dtype=dtype,
            sizes=V.graph.sizevars.size_hints(
                node.get_size(),
                fallback=config.unbacked_symint_fallback,
            ),
            strides=V.graph.sizevars.size_hints(
                node.get_stride(),
                fallback=config.unbacked_symint_fallback,
            ),
            offset=V.graph.sizevars.size_hint(
                node.get_layout().offset,
                fallback=config.unbacked_symint_fallback,
            ),
            name=node.get_name(),
        )

    def to_tensor(self) -> torch.Tensor:
        return rand_strided(
            self.sizes,
            self.strides,
            device=self.device,
            dtype=self.dtype,
            extra_size=self.offset,
        )
ddZddddZdddddd
ddZdddddd
ddZdS )rD   a1  
    Only handle triton template benchmark for now. The extern kernel benchmark
    can be done inside the same process since they usually don't cause crash.

    Important: Instances of this class and subclasses have to be serializable
    across process boundaries. Do not put CUDA Tensors in here!
    r2   r   Iterable[Any])kernel_nameinput_tensor_metaoutput_tensor_meta
extra_argsc                 C  sR   || _ t|tr|g}|| _t|ttfrBt|dks:t|d }|| _|| _	d S )Nr   r   )
r   rB   r   r   tupler   rv   rT   r   r   )rM   r   r   r   r   r(   r(   r)   __init__  s    
zBenchmarkRequest.__init__r   Callable[[], None]input_tensorsoutput_tensorr:   c                G  s   t d S r   NotImplementedErrorrM   r   r   r(   r(   r)   make_run_fn  s    zBenchmarkRequest.make_run_fnr9   rJ   c                 C  s   d S r   r(   rL   r(   r(   r)   cleanup_run_fn  s    zBenchmarkRequest.cleanup_run_fnNr   Optional[torch.Tensor]r   c                G  s   t d S r   r   rM   fnr   r   r(   r(   r)   do_bench  s    zBenchmarkRequest.do_benchc          
      G  s   t tj}|rt }|d krNt|dks0ttdd | jD }| j	
 }|rft | }t }z| j|d|i}W n& tk
r   t d td Y S X |rt | }t }| j|f||f }|rt | }	t dt| |||	 |   |S )Nr   c                 s  s   | ]}|  V  qd S r   )r   r   r(   r(   r)   r     s     z-BenchmarkRequest.benchmark.<locals>.<genexpr>r   z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)r;   isEnabledForloggingDEBUGtimerv   rT   r   r   r   r   r   r+   infor   r   r<   r2   r   )
rM   r   r   r<   Zstart_tsZcreate_tensor_elapser   Zload_elapseoutZbench_elapser(   r(   r)   rE     s<    

zBenchmarkRequest.benchmark)	r%   r&   r'   rn   r   r   r   r   rE   r(   r(   r(   r)   rD     s   rD   c                   @  s:   e Zd ZdZddddddZddd	d
ddddZdS )TestBenchmarkRequestz
    Supports unit testing. Defined in this file so that the TuningProcess
    sub-process knows how to unpickle these objects.
    NzOptional[float]r9   )valuer:   c                 C  s
   || _ d S r   )r   )rM   r   r(   r(   r)   r   5  s    zTestBenchmarkRequest.__init__r   r   r   r   r   c                G  s   | j d krtd| j S )NzFailed to run)r   r>   r   r(   r(   r)   rE   8  s    
zTestBenchmarkRequest.benchmark)N)r%   r&   r'   rn   r   rE   r(   r(   r(   r)   r   /  s   r   c                   @  s$   e Zd ZddddddddZdS )	GPUDeviceBenchmarkRequestNr   r   r   r   r   c             	   G  s   dd ||fD }t |dks.td| t |dkrHtt|}n
tj }tj| t|}tj	  W 5 Q R X |S )Nc                 S  s2   h | ]*}t |tjr|jr|jjd k	r|jjqS r   )rB   r   ZTensorZis_cudar.   indexr}   Ztensorr(   r(   r)   	<setcomp>G  s
   z5GPUDeviceBenchmarkRequest.do_bench.<locals>.<setcomp>r   zCan not mix devices )
rv   rT   nextiterr   r   Zcurrent_devicer.   r    synchronize)rM   r   r   r   Zdevice_idx_setZ
device_idxr   r(   r(   r)   r   A  s    
z"GPUDeviceBenchmarkRequest.do_benchr%   r&   r'   r   r(   r(   r(   r)   r   @  s   r   c                      s^   e Zd Zdddddddddddd
 fdd	Zd
d
ddddZdd ZddddZ  ZS )TritonBenchmarkRequestr   r2   r   r   z	List[int]r|   )
r   r   r   r   module_pathmodule_cache_keygrid
num_stages	num_warpsmatrix_instr_nonkdimc                   s:   t  |||| || _|| _|| _|| _|	| _|
| _d S r   )superr   r   r   r   r   r   r   )rM   r   r   r   r   r   r   r   r   r   r   	__class__r(   r)   r   ^  s    zTritonBenchmarkRequest.__init__r   r   r   c          	      G  s   t | j| j}td| j| j t|| jj}t	| j
}i }dd l}d||jkr^d|d< ddlm} tjjr| jdkrtj|f||f| j
d| ji|d|| jjjiS tj|f||f| j
d| ji|d|| jjjiS d S )Nz"benchmark module key: %s, path: %sr   ZwarmupF)_cuda_getCurrentRawStreamr   stream)r   load_by_key_pathr   r   r;   r<   getattrr   runr   r   inspect	signature
parametersZtorch._Cr   r   versionZhipr   	functoolspartialr   r   r.   r   )	rM   r   r   mod
run_methodr   Z
warmup_argr   Zget_raw_streamr(   r(   r)   r   s  sT    

z"TritonBenchmarkRequest.make_run_fnc                 C  s$   t | j| j}t|| j  d S r   )r   r   r   r   r   r   
precompile)rM   r   r(   r(   r)   r     s    z!TritonBenchmarkRequest.precompilerJ   c                 C  s   d| j d| jd| jS )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r   r   r   rL   r(   r(   r)   __str__  s    zTritonBenchmarkRequest.__str__)r   )r%   r&   r'   r   r   r   r   __classcell__r(   r(   r   r)   r   [  s
    &,r   c                      sv   e Zd Zdddddd fddZdd Zd	d	d
dddZddddZdd ZddddZddddZ	  Z
S )CUDABenchmarkRequestr2   r   r   r   r   r   r   source_codec                   sV   t  |||| || _d| _d | _d | _d| _d| _d| _t	
| jd\| _| _d S )Nr   F so)r   r   r   workspace_size	workspaceDLL_workspace_size_updatedhash_keysource_filer   writerM   r   r   r   r   r   r   r(   r)   r     s    zCUDABenchmarkRequest.__init__c                 C  s*   t d|  t| jd t d|  d S )NPrecompiling %sr   Done precompiling %s)r;   r<   r   compiler   rL   r(   r(   r)   r     s    zCUDABenchmarkRequest.precompiler   r   r   c             	   G  s   |    |   dd t||g D }td| j| j| j| j|| j	 t
tj j}t| j| j}t
d}| jdkrtj| jd d tj|jd| _t
| j }tj|f|| j	d ||f S )Nc                 S  s   g | ]}t | qS r(   )r   data_ptrr   r(   r(   r)   r     s   z4CUDABenchmarkRequest.make_run_fn.<locals>.<listcomp>zqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         )r   r.   )ensure_dll_loadedupdate_workspace_sizer   r;   r<   r   r   r   r   r   r   r   r   current_streamcuda_streamr   r   ZzerosZfloat64r.   r   r   r   r   )rM   r   r   rR   
stream_ptrr   Zworkspace_ptrr(   r(   r)   r     sB    	
z CUDABenchmarkRequest.make_run_fnr9   rJ   c              
   C  s   | j r
d S |   tdd | jD }dd t|d D }ttj j	}t
| j| j}t }||| jt|d |f  tj  |j| _td| j| j| j| j| j|| j d| _ d S )Nc                 S  s   h | ]
}|j qS r(   )r   )r}   metar(   r(   r)   r     s     z=CUDABenchmarkRequest.update_workspace_size.<locals>.<setcomp>c                 S  s   g | ]}t d qS r   )r   )r}   _r(   r(   r)   r     s     z>CUDABenchmarkRequest.update_workspace_size.<locals>.<listcomp>r   zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)r   r  rv   r   r   r   r   r   r  r  r   r   r   r   r   r   r   r   r   r;   r<   r   r   )rM   Zunique_input_countrR   r  r   Zc_workspace_sizer(   r(   r)   r    s<    	

z*CUDABenchmarkRequest.update_workspace_sizec                 C  s(   | j d kr$t| jd\| _ | _| _d S )Nr   )r   r   loadr   r   r   rL   r(   r(   r)   r    s
    
 z&CUDABenchmarkRequest.ensure_dll_loadedc                 C  s   | j d k	r| j   d | _d S r   )r   closer   rL   r(   r(   r)   r     s    

z#CUDABenchmarkRequest.cleanup_run_fnc                 C  s   d| j d| jd| jS )Nr   z, self.source_file=z, self.hash_key=)r   r   r   rL   r(   r(   r)   r     s    zCUDABenchmarkRequest.__str__)r%   r&   r'   r   r   r   r  r  r   r   r   r(   r(   r   r)   r     s   '"r   c                   @  s$   e Zd ZddddddddZdS )	CPUDeviceBenchmarkRequestNr   r   r   r   r   c                G  s   t |S r   )r   r   r(   r(   r)   r     s    z"CPUDeviceBenchmarkRequest.do_benchr   r(   r(   r(   r)   r    s   r  c                      s`   e Zd Zdddddd fddZdd Zd	d	d
dddZddddZddddZ  ZS )CppBenchmarkRequestr2   r   r   r   c                   s,   t  |||| || _t|| _d | _d S r   )r   r   r   r   r   r   r   r   r(   r)   r   *  s    
zCppBenchmarkRequest.__init__c                 C  s,   t d|  tj| jdd t d|  d S )Nr   Fr   r   )r;   r<   r   r
  r   rL   r(   r(   r)   r   7  s    zCppBenchmarkRequest.precompiler   r   r   c                G  sv   t j| jdd| _dd t||g D }td| j| j|| j t	| j| j}t
jgt| |_tj|f|| j S )NFr  c                 S  s   g | ]}|  qS r(   )r   r   r(   r(   r)   r   C  s     z3CppBenchmarkRequest.make_run_fn.<locals>.<listcomp>zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%s)r   r
  r   r   r   r;   r<   r   r   r   ctypesZc_ulonglongrv   Zargtypesr   r   )rM   r   r   rR   r   r(   r(   r)   r   >  s"    zCppBenchmarkRequest.make_run_fnr9   rJ   c                 C  s   | j d k	r| j   d S r   )r   r  rL   r(   r(   r)   r   U  s    
z"CppBenchmarkRequest.cleanup_run_fnc                 C  s   d| j S )Nr   )r   rL   r(   r(   r)   r   Y  s    zCppBenchmarkRequest.__str__)	r%   r&   r'   r   r   r   r   r   r   r(   r(   r   r)   r  &  s
   r  r   r   r   c                 C  s


def benchmark_in_sub_process(
    choices: List[TritonTemplateCaller],
) -> Dict[TritonTemplateCaller, float]:
    """
    Do benchmarking in a subprocess and return the perf number (latency).
    """
    return tuning_pool.benchmark(choices)
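

# Example (sketch): the autotuner-side entry point. Each `choice` is a
# TritonTemplateCaller whose `bmreq` attribute carries a picklable
# BenchmarkRequest; the pool returns one latency per choice, using
# float("inf") for choices that failed to benchmark in the subprocess.
#
#     timings = benchmark_in_sub_process(choices)
#     best_choice = min(timings, key=timings.__getitem__)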