U
    yh                  	   @   s  U d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZmZ d dlmZmZm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d d	l'm(Z) d d
l*m+Z+m,Z, e-dZ.ej/e0d< ej1j2Z2e Z3e4 Z5ee6 e0d< e 7e4Z8ee6ee6 f e0d< e 7e4Z9ee6ee6 f e0d< da:e6e0d< d\e6dddZ;ee ee edddZ<eddG dd dZ=ej>e=dddZ?e$eee6df e6f ee= d d!d"Z@eeed#d$d%ZAeee&d#d&d'ZBe$eee eeeej>d(d)d*ZCe$e=ee= eej>eDf dd+d,d-ZEe$ee= eej>eDf e=d.d/d0ZFe$ee= dd1d2d3ZGe<g g d4e$eDdd5d6d7ZHe<eHgg d4e$dd8d9d:ZIe<g g d4e$dd8d;d<ZJe Kd=d>d?d@dAdBdCgZLeddG dDdE dEZMeddG dFdG dGZNeddG dHdI dIZOej>eOdJdKdLZPe$eee6df e6f eeO dMdNdOZQe$eOeej> eeOeOf dPdQdRZRe$eOeej> eeOeOf dSdTdUZSe<eJgeIgd4e$e6e6ddVdWdXZTe$eej> eej> dYdZd[ZUdS )]    N)	dataclassfield)wraps)AnyCallablecastDefaultDictDictIterableListOptionalSetTupleUnion)
FakeTensorFakeTensorMode)CommTypedump_graphs_to_files	find_node
get_outputOP)IterGraphModule)TensorMetadata)_pytree)tree_flattentree_unflattenZgraph_optimizationlogger_optimized_func_prerequisite_sets_apply_before_sets _dump_graph_folderfolderc                 C   s   | st  } | ad S N)tempfilemkdtempr!   r"    r'   \/var/www/html/venv/lib/python3.8/site-packages/torch/distributed/_spmd/graph_optimization.pyenable_graph_optimization_dump5   s    r)   )prerequisitesapply_afterreturnc                    s   t t d fdd}|S )a  Define the contract of a graph optimization pass.

    All the passes should be wrapped with this decorator.
    `prerequisites` is used to annotate the prerequisite passes of the this pass.
    `apply_after` means that this wrapped pass must be applied after the passes
    in `apply_after`. The difference between `prerequisites` and `apply_after`
    is that all the passes in `prerequisites` must be applied to the graph and
    must be applifed before the wrapped pass while the passes `apply_after` are
    optional. But if a pass in `apply_after` is applied to the graph, it has to
    be done before the wrapped pass.
    Optimizer pass developers are required to add these fields accordingly and
    users need to follow the restrictions to avoid the assert.

    Current design has one limitation: users can only apply the optimizations
    once.  In some cases, we may need to run multiple the same optimization
    multiple time, e.g., optimization passes -> profiling the result -> apply
    optimization passes with the profiling result again. This limitation will be
    addressed limitation in the future.

    Args:
        prerequisites (Iterable[Callable]): the list of string to the names of
            passes which are the prerequisites of this pass.
        apply_after (Iterable[Callable]): the list of string to the names of
            passes that can not be applied after the wrapped pass.
    funcr,   c                    sz   t tddd fddD t< D ]}t|  q2t ttjt	f t
t
d d fdd}|S )	Nr-   c                 S   s   | j  d| j S )N.)
__module____name__)r.   r'   r'   r(   make_key_   s    z8graph_optimization_pass.<locals>.inner.<locals>.make_keyc                    s   h | ]} |qS r'   r'   ).0f)r2   r'   r(   	<setcomp>c   s     z9graph_optimization_pass.<locals>.inner.<locals>.<setcomp>)gmargskwargsr,   c                    s:  t   }t| tjtfs tdtks8td dt t}|r^t| d dt	 
tstt	 t  d dt d | f|| | j  | j  |   t d j }tr t| trt| d	| j| d
| j| d| jit nt|| it tdt   |  d S )NzPThe first argument of the pass must be either fx.GraphModule or IterGraphModule.zCannot apply z twice.z must be applied after r/   z are the prerequisites of z+ but are not applified. Applied passes are Zafter_Z	_setup_gmZ_main_gmZ_cleanup_gmzSpent %f seconds applying %s)time
isinstancefxGraphModuler   AssertionErrorr   r   intersectionr   issubsetgraphZlinteliminate_dead_codeZ	recompileaddr1   r!   r   Zsetup_gmZmain_gmZ
cleanup_gmr   info)r6   r7   r8   beginZinvalid_passesprefix)r.   func_keyr'   r(   pass_wrapperg   sB    


   	z<graph_optimization_pass.<locals>.inner.<locals>.pass_wrapper)r   strr   r   rB   r   r   r;   r<   r   r   )r.   Zapply_after_passrG   r+   r*   )r.   rF   r2   r(   inner^   s      )z&graph_optimization_pass.<locals>.inner)r   )r*   r+   rJ   r'   rI   r(   graph_optimization_pass@   s    5rK   T)unsafe_hashc                   @   s^   e Zd ZU eej ed< eej	 ed< eej	 ed< eej	 ed< ej	ed< e
ej	 ed< dS )	CommBlockshape	node_listinputs
wait_nodes	comm_nodeoutputsN)r1   r0   __qualname__r   torchSize__annotations__r   r;   Noder   r'   r'   r'   r(   rM      s   

rM   )rR   r,   c                 C   s  d}g }g }t j| j| j}dd |D }d}d}d}t| dg}	|	r|dk r|	 }
|
dkrx|d7 }|	rD|	d qD||
 |
j	|r||
 qD|
j
D ]}t|tjr|	| qqD|std	t }t|}	|	rB|	 }
|
dk	st|
j
D ]D}t|tjr0|j	|r0|	| || q||
  qqq|d jd
d}t|rrtdd |jD nd||| ||dS )a-  Find out all the nodes belong to this communcation given a collective node (e.g., allreduce).

    Args:
        comm_node(fx.Node): The target communication/collective node.

    Returns:
        The CommBlock that encapsulates the related nodes (e.g., wait_node) of
        the given comm_node.
       c                 S   s   g | ]}t |tjr|qS r'   r:   r;   rX   )r3   Zinpr'   r'   r(   
<listcomp>   s      z"get_comm_block.<locals>.<listcomp>r   )Z	wait_commZwait_tensor)splitreshapegetitemdetachaliasN   z?The wait nodes are too far away from the comm node {comm_node}.tensor_metac                 s   s   | ]}t |V  qd S r$   )int)r3   sr'   r'   r(   	<genexpr>   s     z!get_comm_block.<locals>.<genexpr>rN   rO   rQ   rR   rP   rS   )pytreearg_tree_leavesr7   r8   collectionsdequepopleftappendname
startswithusersr:   r;   rX   RuntimeErrorsetr=   rB   metagetrM   rU   rV   rN   )rR   ZMAX_WAIT_DISTANCErO   rQ   rP   Zinput_nodesdistanceZwait_prefixesZnon_end_users_nodesnodesnodechildrS   userrb   r'   r'   r(   get_comm_block   s\    






ry   .)r6   comm_opsr,   c                    s    fdd| j jD S )Nc                    s    g | ]}|j  rt|qS r'   )rm   rn   ry   r3   rv   rz   r'   r(   r[      s   z'get_all_comm_blocks.<locals>.<listcomp>r@   ru   )r6   rz   r'   r|   r(   get_all_comm_blocks   s    
r~   )fake_tensor_modevalr,   c                 C   s"   t | tj|j|jd|jd|jS )Nrr   )dtypedevicerequires_grad)r   rU   emptyrN   r   r   r   r   r   r'   r'   r(   _create_meta_val   s    r   c              	   C   s   t |j|j|j|jd di dS )NF)rN   r   r   stridememory_formatZis_quantizedZqparams)r   rN   r   r   r   r   r'   r'   r(   _create_meta_tensor_meta   s    r   )r6   r   meta_valfunctionr7   r8   r,   c                 O   s   | j |||}|d krt||f\}}g }	d }
|D ]6}t|tjsP|	| q4|jd }|	t|| q4t	|	|\}}|||}n|}||jd< t
|||jd< |S )Nr   rb   )r@   call_functionr   r:   r;   rX   rl   rr   r   r   r   )r6   r   r   r   r7   r8   rv   Z	flat_argsspecZnew_flat_argsr   argr   Z	fake_argsZfake_kwargsZnew_meta_valr'   r'   r(   _call_function  s"    


r   )r6   fused_comm_blockcomm_blocksnode_indicesr,   c                    s  d}| j jD ]$}||jkr q2t |||}q|j}|jd }| j |$ | j tj	|dd |D f}W 5 Q R X g }	|}
| j | t
|D ]\}}|jd }tt|j}|r| }t|tjsq | |k r|	| |t|j q| j tj||f}| j | | j tj||jf}W 5 Q R X | j || q|
|krZ|}
W 5 Q R X t|	 fddd}	| j |	|
 | j   dS )zzScatter the result of the fused communication node to the original users -- splitting the output and reshape each subitem.r   c                 S   s"   g | ]}t ttj|j qS r'   )rc   r   rU   rV   rN   numel)r3   cbr'   r'   r(   r[   F  s     z(_scatter_wait_result.<locals>.<listcomp>c                    s    |  S r$   r'   )rv   r   r'   r(   <lambda>g      z&_scatter_wait_result.<locals>.<lambda>)keyN)r@   ru   rR   maxrs   rQ   inserting_afterr   atenr\   	enumerateri   rj   listro   rk   r:   r;   rX   rl   extendoperatorr^   r]   rN   Znode_replace_all_uses_withsorted
move_afterrA   )r6   r   r   r   Zlast_wait_node_idxrv   fused_comm_nodefused_wait_nodeZ
split_nodeZneed_sort_nodesZlast_split_reshape_nodeidx
comm_blockZ	orig_waitru   Z	user_nodeZsplit_idx_nodeZwait_output_noder'   r   r(   _scatter_wait_result.  sT    

 



 
r   )r6   r   r   r,   c              
   C   s  |d j d }d}g }|D ]Z}|j d }|jdrFttj|jd }|| || }||kr||kslt|}|}q| j	
|. g }	|D ]}|	t| tdtjj| qW 5 Q R X | j	
|	d  t| tdtj|	}
W 5 Q R X |d }|j}|jd }| j	
|
N t|j|jf\}}|
|d< t||\}}t| t|
jd |jf||}W 5 Q R X | j	
|N t|j|jf\}}||d< t||\}}t| t|
jd |jf||}W 5 Q R X |	|
||g }| j	|| |
jd}t|j||g|g||
g|hd}t| ||| |S )zLFuse the CommBlocks using concat given a list of CommBlock (only allreduce).r   cloneNr   rb   rf   )rP   rm   rn   r   r;   rX   r7   rl   r=   r@   r   r   r   r   flattenZ
using_intscatrR   rQ   r   r8   r   rr   targetr   rs   rM   rN   r   )r6   r   r   Zlast_input_nodeZlast_input_indexZall_input_nodesr   Z
input_nodeindexZ
cat_inputsZcat_nodeZ	last_commZlast_comm_nodeZlast_wait_nodeZflatten_argsr   r7   r8   r   r   Znodes_to_moverb   r   r'   r'   r(   _fuse_with_catm  s    

    


	r   )r6   r   r,   c                 C   sb   dd t | jjD }|D ]B}|j}d}|jD ]}|| }||kr.|}|}q.| j||j qd S )Nc                 S   s   i | ]\}}||qS r'   r'   r3   irv   r'   r'   r(   
<dictcomp>  s      z&_expedite_comm_ops.<locals>.<dictcomp>r   )r   r@   ru   rR   rP   Znode_append)r6   r   r   r   Z
last_inputZlast_input_idxinputZ	input_idxr'   r'   r(   _expedite_comm_ops  s    
r   )r*   r+   )r6   bucket_size_mbr,   c           	      C   s   t | tjdf}t| | t | tjdf}dd t| jjD }d}|d }d } }}|t|k r|tt	j
|| j d 7 }|d7 }||k rqXt| ||| | |}|}d}qX|t|k rt| ||| | dS )	zoRun fuse communication with concat.

    This implementation uses concat to concat the bucketed gradients.
    
all_reducec                 S   s   i | ]\}}||qS r'   r'   r   r'   r'   r(   r     s      z+comm_fusion_with_concat.<locals>.<dictcomp>i   r      ra   N)r~   r   	ALLREDUCEr   r   r@   ru   lenr   rU   rV   rN   r   r   )	r6   r   r   r   Zbucket_sizeZbucket_cap_sizerD   endZ	curr_sizer'   r'   r(   comm_fusion_with_concat  s$    
r   )r6   r,   c                 C   s  t | tjdf}t }|D ]}|jD ]}||j q$qdd t| jj	D }|D ]}t
|jdksttd| dtttt|jj}d}dd	 |jD D ]}|| }	|	|k r|}|	}qd
}
t|jD ]\}
}||jd kr qq|
dkst| j|j|
d | qRdS )zFDelay the execution of wait tensors of allreduce until its first user.r   c                 S   s   i | ]\}}||qS r'   r'   r   r'   r'   r(   r     s      z&schedule_comm_wait.<locals>.<dictcomp>ra   z1Found a allreduce that has zero outputs/users -- r/   l        c                 s   s   | ]}|j D ]
}|V  qqd S r$   )ro   )r3   outputrx   r'   r'   r(   re     s       z%schedule_comm_wait.<locals>.<genexpr>r   r   N)r~   r   r   rq   rS   updatero   r   r@   ru   r   r=   nextiterrO   rQ   Zmove_before)r6   r   Zallreduce_usersZ	allreducer   r   Ztarget_nodeZtarget_node_indexrx   r   Zwait_idxrv   r'   r'   r(   schedule_comm_wait  s0    

r   c                 C   s8  d}t  }t| jjD ]}|jr"q|jtjks|jt	j
jkr>qt  }t|dg}d}d}|r||k r| }|dkr|d7 }|rZ|d qZ|| |jtjkrt|jdrd}tj|j|j}	|	D ]}
t|
tjr||
 qqZ|r|| qt| jjD ],}|jrq||kr$q| j| qdS )a  Erase the orphant copy_ that generated when tracing optimizer.

    Two reasons why we could not simply use the DCE of fx.Graph.
    1. fx.Graph treats copy_ as a side-effect node and does not erase it.
    2. Users may want to preserve some orphan `copy_` that is not from the
       optimizer.
    If the second reason does not hold, this pass can be rewritten as using
    DCE from fx.Graph (with the overwrite to the side-effect node list).
    rY   Nr   Fra   )zaten._foreach_zaten._fused_T)rq   reversedr@   ru   ro   opr   CALL_FUNCTIONr   r   copy_defaultri   rj   rk   rl   rB   rH   rn   rg   rh   r7   r8   r:   r;   rX   r   
erase_node)r6   ZMAX_COPY_DISTANCEZremove_candidatesrv   Zcopy_ancestorsru   rt   Zshould_removeZvisitingparentsparentr'   r'   r(   remove_copy_from_optimizer  sF    


r   AdamArgsparamsgradsZexp_avgsZexp_avg_sqsmax_exp_avg_sqsstate_stepsc                   @   s   e Zd ZU ejed< eed< eedZ	e
ej ed< eedZe
ej ed< eedZe
ej ed< eedZe
ej ed< eedZe
ej ed< d	d
 Zdd Zdd ZdS )FusedAdamBlock
optim_nodegenerate_outputdefault_factoryparam_outputsgrad_outputsexp_avgs_outputsexp_avg_sqs_outputsr   c                    s4    fdd}|d j  |d j |d j d S )Nc              
      s    j j}| j  |tj j | f}W 5 Q R X t j j|  D ]b\}}|| |tj||f}W 5 Q R X || |tj	||f}W 5 Q R X |
| qDd S r$   )r   r@   r   r   r   r^   r   r7   r   r   rl   )Zarg_idxoutput_listr@   optim_getitemr   r   updated_argoutput_copyselfr'   r(   _generate_outputsj  s      z:FusedAdamBlock.generate_outputs.<locals>._generate_outputsr         r   r   r   )r   r   r'   r   r(   generate_outputsg  s    zFusedAdamBlock.generate_outputsc                    s4    fdd}|d j  |d j |d j d S )Nc           	         sB   j } j jD ]8}|jtjks0td j  d|jd | kr|} qHq| j ksbtd j  | j gtt	t
tj  j jd   |jD ]d}|jtjkstd|j d|jd }tt|j}t|jdstd|j d|||< qt|D ]$\}}| j ks t| d	q |s>td
 j  dd S )NzThe user of z is not getitem.ra   z!Cannot find the getitem node for r   Unexpected node target r/   
aten.copy_th output is not replaced.The output for 
 is empty.)r   ro   r   r   r^   r=   r7   r   r   r   r   r;   rX   r   r   rH   rn   r   )	Zargs_idxr   r   rx   r   r   r   r   r   r   r'   r(   _populate_outputs  s@    

"




z:FusedAdamBlock.populate_outputs.<locals>._populate_outputsr   r   r   r   )r   r   r'   r   r(   populate_outputs~  s    zFusedAdamBlock.populate_outputsc                 C   s&   | j r
d S | jr|   n|   d S r$   )r   r   r   r   r   r'   r'   r(   __post_init__  s
    
zFusedAdamBlock.__post_init__N)r1   r0   rT   r;   rX   rW   boolr   r   r   r   r   r   r   r   r   r   r   r'   r'   r'   r(   r   [  s   

$r   c                   @   sP   e Zd ZU ejed< eed< eedZ	e
ej ed< dd Zdd Zd	d
 ZdS )ForeachAddBlockadd_noder   r   rS   c              
   C   s   | j j}ttttdf | j jd D ]h\}}|| j  |t	j
| j |f}W 5 Q R X || |tj||f}W 5 Q R X | j| q&| jstd| j  dd S )N.r   r   r   )r   r@   r   r   r   r   r7   r   r   r   r^   r   r   rS   rl   r=   )r   r@   r   r   r   r   r'   r'   r(   r     s    &z ForeachAddBlock.generate_outputsc                    s    fddt ttdf  jjd D  _ jjD ]l}|jtj	ksRt
d|j t t|jd }tt|j}t|jdst
dt|j | j|< q2t jD ] \}}| jkst
| d	qd S )
Nc                    s   g | ]
} j qS r'   )r   )r3   _r   r'   r(   r[     s    z4ForeachAddBlock.populate_outputs.<locals>.<listcomp>.r   r   ra   r   z'The execpted output node is different, r   )r   r   r   r   r7   rS   ro   r   r   r^   r=   rc   r   r   rH   rn   r   )r   r   r   r   r   r   r'   r   r(   r     s$    



z ForeachAddBlock.populate_outputsc                 C   s&   | j r
d S | jr|   n|   d S r$   )rS   r   r   r   r   r'   r'   r(   r     s
    
zForeachAddBlock.__post_init__N)r1   r0   rT   r;   rX   rW   r   r   r   rS   r   r   r   r   r'   r'   r'   r(   r     s   

r   c                   @   s   e Zd ZU eed< eed< dS )FusedOptimizerBlockstepoptimN)r1   r0   rT   r   rW   r   r'   r'   r'   r(   r     s   
r   )r   r,   c                 C   s   d}t | dg}| }d}|r||k r| }|dkrP|d7 }|r|d qq|jtjkrtt|j	drt|}qq|
dd tj|j|jD  q|| krtd|  d	| d
t|dd}t| dd}t||S )z@Given a fused optimizer node and return the FusedOptimizerBlock.rY   Nr   ra   zaten._foreach_addc                 s   s   | ]}t |tjr|V  qd S r$   rZ   )r3   ar'   r'   r(   re     s   z,get_fused_optimizer_block.<locals>.<genexpr>z;Cannot find step node (foreach_add) for the optimizer node z with z? BFS distance. The API design does not match the tracing graph.Fr   )ri   rj   rk   rl   r   r   r   rH   r   rn   r   rg   rh   r7   r8   rp   r   r   r   )r   ZMAX_STEP_DISTANCEru   Z	step_nodert   rv   r   r   r'   r'   r(   get_fused_optimizer_block  s4    


r   )r6   	optim_opsr,   c                    s    fdd| j jD S )zQFind all the FusedOptimizerBlock that the optimizer operators are in `optim_ops`.c                    s    g | ]}|j  rt|qS r'   )rm   rn   r   r{   r   r'   r(   r[   	  s   z2get_all_fused_optimizer_blocks.<locals>.<listcomp>r}   )r6   r   r'   r   r(   get_all_fused_optimizer_blocks  s    
r   )r6   orig_optim_blocksplit_gradientsr,   c           %   
      s  t |jjj }t g g g g g g t g g g g g g f}g g f}g g f}t|jD ]\}}||kr`dnd}	||	 | t|||	 D ]\}
}|
r||
|  q||	 jd }t	|j
dstd|j
 d|jd }dt	|j
kstd|j
 d	|jd }||	 | qLtd
d || D s0tdt| j}g }t|j|jf\ }ttt D ]&\}}t|tjrd| | qdtjtjd fdd}tdD ]}	g }g }| j|jjf ||	 D ]@}|tttjdf |jjjd |  ||jj |  q| j!t"j#j$|df}W 5 Q R X t%|dd}t|j D ]\\}}|| }||| ||	 j| |kstd| d||	 j|  |||	 j|< qN| j|j d $ | j!t"j&j'||	 |jjj}W 5 Q R X t(|dd}t||	 D ]B\}}d}|D ].}t)|j|} t)||}!|| | |!|  qq |t*|| qt+ |\}"}#| j,||" | j-||# t./|jj0|jj1|jj2D ]}$| j3|$ q| j4  |jj D ]}$| j3|$ q| j4  |d |d fS )a  Split the `orig_optim_block` into two FusedOptimizerBlock.

    The first one will be the optimizer that optimize `split_gradients`. The second one is
    used to optimize the remaining gradients.
    An assert will be raised if one of the optimizer optimize zero gradients.
    r   ra   r   r   zThe copy output is z, expect aten.copy_r^   zThe copy getitem is z, expect operator.getitemc                 s   s   | ]
}|V  qd S r$   r'   )r3   lr'   r'   r(   re   :  s     z$_split_fused_adam.<locals>.<genexpr>z1At least one split optimizer does not have input.)	orig_nodenew_nodec                    s   |  D ]}| |< qd S r$   r'   )r   r   r   Zflatten_output_argsZflatten_output_args_indicesr'   r(   replace_flatten_output_argsG  s    z6_split_fused_adam.<locals>.replace_flatten_output_argsr   .Tr   z*The expected step output node mismatched,  r   )5r   r   r   r7   r   r   rl   zipr   rH   r   rn   r=   all
ValueErrorr   r@   r   r8   ri   defaultdictrq   r:   r;   rX   rB   ranger   r   r   r   r   rS   r   r   Z_foreach_addZScalarr   _fused_adamr   r   getattrr   r   Znode_set_argsZnode_set_kwargs	itertoolschainr   r   r   r   rA   )%r6   r   r   Zorig_optim_args
optim_argsZorig_optim_indicesZorig_step_indicesr   ZgradientZ	group_idxZorig_argZ	optim_argZorig_step_outputZorig_step_getitemZorig_step_idxr   resultsr   Z
output_argr   Z	step_argsZorig_step_outputsr   Z
step_blockr   Zstep_outputr   optim_blockZcurr_idxZorig_idxZ
list_namesrm   Z	orig_listZ	curr_listZoutput_argsZoutput_kwargsZcopy_outputr'   r   r(   _split_fused_adam  s    $







r
  )r6   r	  r   r,   c                 C   s8   |st dt|jjjdr,t| ||S tdd S )Nz#The given split_gradients is empty.zaten._fused_adamz Only fused_adam is supported now)r   rH   r   r   r   rn   r
  NotImplementedError)r6   r	  r   r'   r'   r(   split_fused_optimizer  s
    r  )r6   target_comm_nodetarget_dest_noder,   c                    s   t | dD ]}|jj|kr
 q.q
td| t| d}|D ].}t|jjj }t	t
|j}||jkr< qzq<t| dt| ||j\}}	t| |j|jjg}
t| j fddd }| j|
| dS )	zExtract a comm block and split out a new optimizer and step for it.

    This subgraph is then moved to the forward graph.
    r   zCannot find r  z$ is not used by any fused optimizer.c                    s
   | j  kS r$   )rm   )nr  r'   r(   r     r   z0iter_move_grads_and_optimizers.<locals>.<lambda>r   N)r~   rR   rm   r   r   r   r   r   r7   r   r   rS   r   r  find_all_descendantsr   r   r   r@   Zmove_to_next_iter_before)r6   r  r  r   Zoptim_blocksr	  r  Z
one_outputZ
move_optimr   
move_nodesZ	stop_noder'   r  r(   iter_move_grads_and_optimizers  s$    

 r  )r6   parent_nodesr,   c                    sx   t |dkstdt| jt|}t  |r^| } | |fdd|j	D 7 }q. fdd| jj
D }|S )zBIdentify the list of nodes to move during FX graph transformation.r   zNo parent nodes are given.c                    s$   g | ]}t |tjr| kr|qS r'   rZ   )r3   u)r   r'   r(   r[     s      z(find_all_descendants.<locals>.<listcomp>c                    s   g | ]}| kr|qS r'   r'   r{   )move_node_setr'   r(   r[     s      )r   r=   r   r@   ri   rj   rq   rk   rB   ro   ru   )r6   r  Zdq_parent_nodesrv   r  r'   )r  r   r(   r    s    



r  )r    )Vri   r  loggingr   r%   r9   dataclassesr   r   	functoolsr   typingr   r   r   r   r	   r
   r   r   r   r   r   rU   Ztorch.fxr;   Ztorch._subclasses.fake_tensorr   r   Z#torch.distributed._spmd.graph_utilsr   r   r   r   r   Z)torch.distributed._spmd.iter_graph_moduler   Ztorch.fx.passes.shape_propr   Ztorch.utilsr   rg   Ztorch.utils._pytreer   r   	getLoggerr   LoggerrW   Zopsr   r   rq   r   rH   r  r   r   r!   r)   rK   rM   rX   ry   r~   r   r   r   rc   r   r   r   r   r   r   
namedtupler   r   r   r   r   r   r
  r  r  r  r'   r'   r'   r(   <module>   s    4V	F @U!#5O.& 
 
$