U
    zh9                     @   s  d dl mZ d dlZddlmZmZmZ ddlmZ ddl	m
Z
mZmZ ejedZed ed d	d
dZed ed d	ddZdd Zdd Zed dddZed dd	ddZdedddZed ed d	ddZdd Zdd Zed ed d	d d!ZdS )"    )ListN   )configir	scheduler)WeakDep)is_collectiveis_waittuple_sortedZoverlapzscheduler.BaseSchedulerNode)snodesreturnc                 C   sr   g }t  }| D ]P}t|jr(|| qt|D ]"}||jkr0|| || q0|| q|t| |S )z~
    Greedily moves waits as late as possible (i.e. until we reach a use). Optimal in terms of
    communication overlap.
    )	setr	   nodeaddr
   
node_usersappendremoveextend)r   Z	new_orderZ	cur_waitssnodewait r   G/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/comms.py
sink_waits   s    


r   c                    s   g }g }t | D ]x t jr*|  q|D ]}t|jdks.tq.t|dkr~t fdd|D r~|d}|| qF|  qt|dkst|	t
| |ddd S )a  
    Greedily moves comms as early as possible (i.e. until we reach an input).
    Optimal in terms of communication overlap.

    TODO: We might want to adjust this in the future to account for memory limitations.
    e.g. when we are compiling FSDP, this heuristics will cause the all-gathers to be prefetched as soon as possible,
    which is the beginning of the forwards pass. We'll have to either do a special pass for FSDP,
    or we'll want to redo this pass with memory considerations so we handle the FSDP case in a general way.
    r   c                 3   s   | ]} |j kV  qd S N)inverse_users).0commr   r   r   	<genexpr>9   s    zraise_comms.<locals>.<genexpr>r   N)reversedr   r   r   lenr   AssertionErroranypopr   r
   )r   Znew_order_reversedZ	cur_commsr   r   r   r   raise_comms%   s     

r%   c                 C   sX   t  }| g}t|dkrTg }|D ],} | jD ] }||kr*|| || q*q |}q|S Nr   )r   r!   r   r   r   )r   Z	ancestors	cur_nodes	new_nodesinpr   r   r   get_ancestorsD   s    

r*   c                 C   sX   t  }| g}t|dkrTg }|D ],} | jD ] }||kr*|| || q*q |}q|S r&   )r   r!   r   r   r   )r   Zdescendantsr'   r(   r)   r   r   r   get_descendantsR   s    

r+   )nodesc                 C   sD   dd | D }t dt|D ]"}|| t||d    qdS )z
    Decide global ordering of comms, by just enforcing the ordering that's in the input graph
    (might not be the same ordering as the eager mode program).
    TODO: Come up with a better approach
    c                 S   s   g | ]}t |jr|qS r   r   r   )r   nr   r   r   
<listcomp>f   s     
 z3decide_global_ordering_of_comms.<locals>.<listcomp>r   N)ranger!   Zadd_fake_depr   get_name)r,   
comm_nodesir   r   r   decide_global_ordering_of_comms`   s    r4   c                 C   s   t dd | D rtd S )Nc                 s   s   | ]}t |jV  qd S r   r-   )r   r   r   r   r   r   m   s     z'assert_no_comm_nodes.<locals>.<genexpr>)r#   r"   )r   r   r   r   assert_no_comm_nodesl   s    r5   )r   r   c                 C   s0   t jdkr|  }ntt js"tt | }|S )z:
    Returns estimated op runtime in nanoseconds (ns)
    default)r   estimate_op_runtimeZget_estimated_runtimecallabler"   )r   runtimer   r   r   r7   p   s
    


r7   c                    s`  g g | D ]}t |jr| qtdkr6| S dd D  dd D }t| d| D ](}|jD ]}|krl|  d7  < qlqbfdd| D t t| fdd	fd
d}tdkst|t	 d  d g  d}t
dt D ]B} |  ||d    @ }t| |tdd |D  }t|d  }	|t| |}
|
|	kr~nt||d    }t|  fdd}t||d}|D ]D}||	kr qt|}|	| |d krq| ||7 }q||
 } |  @ }|t	| || g t|dk}|rLd}n|}q| S )a:  
    Decides a global ordering of all compute and communication nodes,
    assuming that we already have a global ordering of communication nodes.

    Overall scheduling procedure is:
        Step 1: Given that we've currently scheduled comm N, we now schedule all compute nodes
            that are required for comm N + 1 but do not depend on comm N, to run at the same time with comm N.
        Step 2: If all those compute nodes are sufficient to overlap comm N, we're done.
            Otherwise, we now need to look elsewhere to find compute that overlaps with comm N.
            We prioritize compute nodes that are needed sooner.
        Step 3: We schedule the compute nodes dependent on comm N and required for comm N + 1.
        Step 4: We schedule comm N + 1.
        Repeat this for subsequent comm nodes.
    r   c                 S   s   i | ]}|t |qS r   )r*   r   r   r   r   r   
<dictcomp>   s      z/reorder_compute_for_overlap.<locals>.<dictcomp>c                 S   s   i | ]}|t |qS r   )r+   r:   r   r   r   r;      s      r   c                    s   h | ]} | d kr|qS )r   r   r:   )indegr   r   	<setcomp>   s      z.reorder_compute_for_overlap.<locals>.<setcomp>c                    sx   | kst | kst |  |   |  t| jD ]2}|kr@|  d8  < | dkr@| q@dS )z)
        Schedule a single node.
        r   r   N)r"   r   r   r
   r   r   )r   user)final_orderr<   ready_to_schedule_nodesunscheduled_nodesr   r   schedule_node   s    


z2reorder_compute_for_overlap.<locals>.schedule_nodec                    sp   t | }tfdd|D s"tt|dkrld}t|D ]"}| kr:| || d}q:|s"tdq"dS )z\
        Schedules all nodes in `snodes` in an arbitrary topologically valid order.
        c                 3   s   | ]}| kV  qd S r   r   r:   )rA   r   r   r      s     zFreorder_compute_for_overlap.<locals>.schedule_nodes.<locals>.<genexpr>r   FTznUnable to find a free node (indeg == 0). This is an impossible state to reach. Please report a bug to PyTorch.N)r   allr"   r!   r
   r   )r   Z	all_nodesprogressr   )r@   rB   rA   r   r   schedule_nodes   s    
z3reorder_compute_for_overlap.<locals>.schedule_nodesc                 s   s   | ]}t |V  qd S r   )r7   r:   r   r   r   r      s   z.reorder_compute_for_overlap.<locals>.<genexpr>c                    s2   t tD ]}|  |  kr|  S qtS r   )r0   r!   )r   idx)comm_ancestorsr2   r   r   earliest_comm_descendant   s    
z=reorder_compute_for_overlap.<locals>.earliest_comm_descendant)key   )r   r   r   r!   dictfromkeysr   r   r"   listr0   r5   sumr7   r
   sorted)r   r   Zcomm_descendantsr>   rE   Zrolled_over_compute_costrF   Z+needed_by_next_comm_and_ready_compute_nodesZtotal_compute_runtime_costZprev_comm_runtime_costZstep1_runtime_costZready_to_schedule_compute_nodesrH   Zcompute_runtime_costZrollable_compute_costZneeded_by_next_comm_nodesZis_prev_comm_blocking_next_commr   )rG   r2   r?   r<   r@   rB   rA   r   reorder_compute_for_overlap|   s    




 
rP   c                 C   s   d}t | jtjr"d| jj d}d}t| jdrlt| jjdrlt| jjdrld| jjj d| jjj d}d}t| jd	r| jj	}| jj
j | | d| dS )
N z ()layoutsizestridez (size=z	, stride=name)
isinstancer   r   ZExternKernelOutZpython_kernel_namehasattrrS   rT   rU   rV   	__class____name__)r   detailZout_tensor_infoZ	node_namer   r   r   node_summary   s     
r\   c                 C   s   d}d }| D ]}|d krht |jr6|t|7 }|j}n t|jrJtdn|t|7 }tt|  qt |jr|tdqt|jrtt|  d }qtdt|  qtd|d d   d S )Ng        z8Wait is not expected when there is no collective runningzkFound two collectives running at the same time. `visualize_overlap` needs to be updated to handle this casez| zEst. runtime (ms): i  )r   r   r7   r	   r"   overlap_logdebugr\   )orderZtotal_est_runtimeZcur_comm_noder   r   r   r   visualize_overlap3  s0    



r`   c                 C   s   | }t jD ]}t|tr,|t kr,t | }tj dkrt	d| d zt
| W n0 tk
r } zt	t| W 5 d }~X Y nX ||}tj dkr
t	d| d zt
| W q
 tk
r } zt	t| W 5 d }~X Y q
X q
|S )Nr   z.==== Visualize overlap before reordering pass z ====z-==== Visualize overlap after reordering pass )r   Z'reorder_for_compute_comm_overlap_passesrW   strglobalstorchdistributedZget_rankr]   r^   r`   	Exception)r   r_   per   r   r   $reorder_compute_and_comm_for_overlapR  s,    


 
"rh   )typingr   rc   rQ   r   r   r   dependenciesr   utilsr   r	   r
   Z_loggingZgetArtifactLoggerrZ   r]   r   r%   r*   r+   r4   r5   floatr7   rP   r\   r`   rh   r   r   r   r   <module>   s2    % 