import functools
import math
from enum import IntEnum

import sympy

import torch
from . import ir
from .utils import get_dtype_size, sympy_product
from .virtualized import V


class NCCL_COLL(IntEnum):
    ALL_REDUCE = 0
    ALL_GATHER = 1
    REDUCE_SCATTER = 2


class NVIDIA_GPU_TYPE(IntEnum):
    VOLTA = 0
    AMPERE = 1
    HOPPER = 2


@functools.lru_cache
def get_gpu_type() -> NVIDIA_GPU_TYPE:
    gpu_info = torch.utils.collect_env.get_gpu_info(torch.utils.collect_env.run) or ""
    if "V100" in gpu_info:
        return NVIDIA_GPU_TYPE.VOLTA
    elif "A100" in gpu_info:
        return NVIDIA_GPU_TYPE.AMPERE
    elif "H100" in gpu_info:
        return NVIDIA_GPU_TYPE.HOPPER
    else:
        # Fall back to Ampere when the GPU type cannot be detected.
        return NVIDIA_GPU_TYPE.AMPERE


def get_collective_type(node: ir.IRNode) -> NCCL_COLL:
    if not isinstance(node, ir._CollectiveKernel):
        raise ValueError(f"node is not a collective kernel: {node}")

    kernel_name = node.python_kernel_name
    assert kernel_name is not None
    if "all_reduce" in kernel_name:
        return NCCL_COLL.ALL_REDUCE
    elif "all_gather" in kernel_name:
        return NCCL_COLL.ALL_GATHER
    elif "reduce_scatter" in kernel_name:
        return NCCL_COLL.REDUCE_SCATTER
    else:
        raise ValueError(f"Unsupported collective kernel: {kernel_name}")


def get_collective_input_size_bytes(node: ir.IRNode) -> int:
    sz_bytes = 0
    for inp in node.inputs:
        numel = sympy_product(inp.layout.size)
        if isinstance(numel, sympy.Integer):
            numel = int(numel)
        else:
            # Fall back to a size hint for symbolic shapes.
            numel = V.graph.sizevars.size_hint(numel, fallback=0)
        sz_bytes += numel * get_dtype_size(inp.layout.dtype)
    return sz_bytes


def get_collective_group_size(node: ir.IRNode) -> int:
    if type(node) == ir._CollectiveKernel:
        from torch.distributed.distributed_c10d import _get_group_size_by_name

        return _get_group_size_by_name(node.constant_args[-1])
    else:
        raise TypeError(f"Unsupported collective type: {node}")


# The constants and heuristics below are adapted from
# https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc


class NCCL_HW(IntEnum):
    NVLINK = 0
    PCI = 1
    NET = 2


class NCCL_ALGO(IntEnum):
    TREE = 0
    RING = 1


class NCCL_PROTO(IntEnum):
    LL = 0  # Low-latency


# Base latencies in us: len(NCCL_ALGO) x len(NCCL_PROTO)
baseLat = [
    [6.8],  # Tree (LL)
    [6.6],  # Ring (LL)
]

# Per-hop latencies in us: len(NCCL_HW) x len(NCCL_ALGO) x len(NCCL_PROTO)
hwLat = [
    [[0.6], [0.6]],  # NVLINK: Tree (LL), Ring (LL)
    [[1.0], [1.0]],  # PCI:    Tree (LL), Ring (LL)
    [[5.0], [2.7]],  # NET:    Tree (LL), Ring (LL)
]

# LL max bandwidths in GB/s.
# Rows: GPU generation (Volta/Ampere/Hopper), used for single-node runs;
# multi-node runs always use row 0. Columns: 1 node / 2 nodes / >2 nodes.
llMaxBws = [
    [39.0, 39.0, 20.4],  # Volta
    [87.7, 22.5, 19.0],  # Ampere
    [87.7, 22.5, 19.0],  # Hopper
]


def estimate_nccl_collective_runtime(node: ir.IRNode) -> float:
    """
    Returns estimated NCCL collective runtime in nanoseconds (ns).

    The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
    We aim to estimate the runtime as accurately as possible.

    Assumptions:
    - only ring algorithm (NCCL_ALGO_RING) is used
    - only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
    - 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
    - collective is one of: allreduce, reducescatter, allgather
    """
    tensor_storage_size_bytes = get_collective_input_size_bytes(node)
    # Convert bytes to GB
    tensor_storage_size_GB = tensor_storage_size_bytes / 1024 / 1024 / 1024

    # Currently assumes each node has 8 gpus. And when >1 node is used, assumes each node uses all 8 gpus.
    # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
    num_gpus_per_node = 8
    group_size = get_collective_group_size(node)
    nNodes = math.ceil(group_size / num_gpus_per_node)
    nRanks = group_size  # total number of GPUs participating in this collective

    if nRanks <= 1:
        return 0

    # Assume ring algorithm with the low-latency protocol
    nccl_algo = NCCL_ALGO.RING
    nccl_proto = NCCL_PROTO.LL
    coll = get_collective_type(node)

    # =============== bandwidth computation ===============
    # First compute bandwidth in GB/s, then convert it to GB/ns at the end.

    bwIntra = torch._inductor.config.intra_node_bw
    bwInter = torch._inductor.config.inter_node_bw

    compCapIndex = get_gpu_type()
    index2 = nNodes - 1 if nNodes <= 2 else 2
    # LL: for a single node, the GPU type selects the row; multi-node uses row 0.
    index1 = compCapIndex if nNodes == 1 else 0
    llMaxBw = llMaxBws[index1][index2]

    # Each ring step is synchronized and bottlenecked by the slowest link,
    # which for multi-node rings is the inter-node interconnect.
    bw = bwIntra if nNodes == 1 else bwInter
    nChannels = 2  # Assume # channels is 2
    busBw = nChannels * bw

    # Various model refinements
    busBw = min(
        llMaxBw,
        busBw
        * (1.0 / 4.0 if (nNodes > 1 or coll == NCCL_COLL.ALL_REDUCE) else 1.0 / 3.0),
    )

    if coll == NCCL_COLL.ALL_REDUCE:
        nsteps = 2 * (nRanks - 1)
    elif coll in (NCCL_COLL.REDUCE_SCATTER, NCCL_COLL.ALL_GATHER):
        nsteps = nRanks - 1

    # Convert bus BW to algorithm BW (tensor bytes / algoBW = actual execution time)
    ratio = (1.0 * nRanks) / nsteps
    bandwidth = busBw * ratio
    # Convert GB/s to GB/ns
    bandwidth_GB_per_ns = bandwidth / 1e9

    # =============== latency computation ===============
    intraHw = NCCL_HW.NVLINK

    if coll == NCCL_COLL.ALL_REDUCE:
        if nNodes > 1:
            nInterSteps = 2 * nNodes
        else:
            nInterSteps = 0
    elif coll in (NCCL_COLL.REDUCE_SCATTER, NCCL_COLL.ALL_GATHER):
        nInterSteps = nNodes - 1

    # First compute latency in us, then convert it to ns at the end.
    latency = baseLat[nccl_algo][nccl_proto]
    intraLat = hwLat[intraHw][nccl_algo][nccl_proto]
    interLat = hwLat[NCCL_HW.NET][nccl_algo][nccl_proto]

    # Inter-node rings still have to launch nsteps * net overhead.
    netOverhead = 0.0
    if nNodes > 1:
        netOverhead = 1.0
    intraLat = max(intraLat, netOverhead)
    latency += (nsteps - nInterSteps) * intraLat + nInterSteps * interLat
    # Convert us to ns
    latency_ns = latency * 1e3

    # =============== final result ===============
    transport_ns = tensor_storage_size_GB / bandwidth_GB_per_ns
    return transport_ns + latency_ns
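

# ---------------------------------------------------------------------------
# Illustrative sanity check (not part of the original module): a minimal
# sketch of the bandwidth/latency model above for a hypothetical single-node,
# 8-GPU all_reduce of 256 MB on an Ampere node, assuming intra_node_bw of
# 300 GB/s. All inputs here are assumptions chosen for the example, not
# values read from any real graph.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    size_GB = 256 / 1024  # assumed 256 MB payload
    nRanks = 8  # assumed single node with 8 GPUs
    bwIntra = 300.0  # assumed NVLink bandwidth in GB/s
    # Single-node all_reduce: busBw = 2 channels * bw * 1/4, capped at llMaxBw.
    busBw = min(llMaxBws[NVIDIA_GPU_TYPE.AMPERE][0], 2 * bwIntra * 0.25)
    nsteps = 2 * (nRanks - 1)  # ring all_reduce takes 2*(n-1) steps
    bandwidth_GB_per_ns = busBw * nRanks / nsteps / 1e9
    transport_ns = size_GB / bandwidth_GB_per_ns
    latency_us = (
        baseLat[NCCL_ALGO.RING][NCCL_PROTO.LL]
        + nsteps * hwLat[NCCL_HW.NVLINK][NCCL_ALGO.RING][NCCL_PROTO.LL]
    )
    # Expect roughly 5 ms of transport plus a negligible ~15 us of latency.
    total_ms = (transport_ns + latency_us * 1e3) / 1e6
    print(f"estimated all_reduce runtime: {total_ms:.2f} ms")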