U
    yht                     @   s  U d dl Z d dlZd dlmZ d dlmZmZ d dlmZ d dl	m
Z
mZmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
l m!Z! d dl"m#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z,m-Z- d dl.m/Z0 d dl1m2Z2m3Z3m4Z4m5Z5 da6ee j7 e8d< ej9j:Z:G dd deZ;eG dd dZ<eG dd dZ=e
e>dddZ?dLej@jAee
df eeeBe
f  eeejCeejDeee' ee' f f  e
dddZEdd ZFejjGe!dd d!d"ZHeejGe
f e
e
d#d$d%ZIeee=eJf  eeeeJ ee' f d&d'd(ZKejGee
df ed)d*d+ZLejGee
df ed)d,d-ZMejGee
df eeBe
f eed.d/d0ZNejGee
df eeBe
f eed.d1d2ZOejGee
df eeBe
f eed.d3d4ZPe:jQjReLe:jSjReLe:jTjReMe:jUjReLiZVeej@jAef e8d5< e:jWjReNe:jXjReOe:jXjYeOiZZeej@jAef e8d6< e:j[jRePe:jXjYePe:j\jRePiZ]eej@jAef e8d7< d8dd9ejGeejGe
f e>ee eej^ d:d;d<Z_eeejGe
f eej^e
f d=d>d?Z`ej^ejGeejGe
f ejGd@dAdBZaej^eejjGejj^f ddCdDdEZbejceejGeejG f dFdGdHZddMej^eejC ee< ee e>eej^eeBe<f f dIdJdKZedS )N    N)	dataclass)autoEnum)partial)	AnyCallablecastDictListOptionalSequenceTupleUnion)_get_tracer)OP)
get_logger)
DeviceMeshDTensor)OpSchema)redistribute_local_tensor)_PartialDTensorSpec	Placement	ReplicateShard
TensorMeta)make_fx
proxy_slot)_pytree)tree_flattentree_maptree_map_onlytree_unflattenloggerc                   @   s   e Zd Ze Ze ZdS )TrainingPhaseN)__name__
__module____qualname__r   ZFORWARDZBACKWARD r(   r(   T/var/www/html/venv/lib/python3.8/site-packages/torch/distributed/_spmd/distribute.pyr$   &   s   r$   c                   @   s"   e Zd ZU eed< ee ed< dS )Schemamesh
placementsN)r%   r&   r'   r   __annotations__r
   r   r(   r(   r(   r)   r*   +   s   
r*   c                   @   sP   e Zd ZU dZeed< eed< eed< edddZe	e
jed dd	d
ZdS )DSymInta;  DSymInt represents a value retrieved by a SymInt op from a DTensor.

    DSymInt helps View and Factory ops to determine the placement and shape of the
    output tensor, as those operators either do not have an input DTensor or
    the input DTensor is insufficient to determine the output tensor's placement.
    global_valuelocal_valuer+   )returnc                 C   s   | j | jkS N)r0   r/   )selfr(   r(   r)   is_shard>   s    zDSymInt.is_shard)nodedtensorr1   c                 C   s   d}|j tjkr@tt|jd }| ||| ||jdS |j tj	krh| |
 | 
 |jdS |j tjkrtt|jd }| ||| ||jdS td|j  d S )Nr      )r/   r0   r+   zDSymInt does not support )targetatensym_sizer   intargssizeto_localdevice_mesh	sym_numelZnumel
sym_stridestrideNotImplementedError)clsr5   r6   dimr(   r(   r)   	from_nodeA   s,    
zDSymInt.from_nodeN)r%   r&   r'   __doc__r;   r-   r   boolr4   classmethodfxNoder   rF   r(   r(   r(   r)   r.   1   s   
r.   )objr1   c                 C   s4   t | tsdS d}| jD ]}t |trd} q0q|S )zECheck if object is 1) DTensor and  2) with any placement of _Partial.FT)
isinstancer   r,   r   )rL   Z
is_partialZ	placementr(   r(   r)   _is_partial_dtensor\   s    


rN   .)op
local_argskwargsspecsr1   c                    s<   |d kri } d kri  t t d fdd}| t|||S )Nargr1   c                    sl    |  \}}}}t ||  | jd}t|t||d}t|t||d}t| tjrh|  krht| ||S | S )N)rB   dtype)tensor_meta)	r   rB   rU   r   tuplerM   torchTensorr   )rT   Ztensor_shaper+   Zcurrent_placementZtarget_placementrV   Zcurrent_specZtarget_specrR   r(   r)   redistributez   s,        
z2_dispatch_with_local_tensors.<locals>.redistribute)r   r    )rO   rP   rQ   rR   r[   r(   rZ   r)   _dispatch_with_local_tensorsj   s    r\   c           
      C   s|   t | \}}t|j}i }t|D ]D\}}t|tr$|r^| || j|j	|| j	f||j
< |j
||< q$t||}	||	fS r2   )r   pytreetree_leavesargs_schema	enumeraterM   r   r=   r+   r,   _local_tensorr"   )
r<   Ztarget_schemar[   flatten_argsargs_tree_specZflatten_args_schemarR   irT   Zunflattened_argsr(   r(   r)   _update_specs_for_redistribute   s    
	


re   )r5   	op_schemar1   c                 C   s   t | j\}}t|j}tttjj	f t
ddd}t|t|ksJttt||D ]&\}\}}||rXt|trX|||< qXt||}	t|	D ]\}
}| |
| qd S )NrS   c                 S   s.   t | tjjr$| jtjtjtjfkS t | t	S r2   )
rM   rX   rJ   rK   r8   r9   r:   r@   rA   r;   )rT   r(   r(   r)   is_sym_int_or_int   s    z6_update_node_from_op_schema.<locals>.is_sym_int_or_int)r   r<   r]   r^   r_   r   r;   rX   rJ   rK   rH   lenAssertionErrorr`   ziprM   r"   Z
update_arg)r5   rf   	flat_argsrc   Zflat_args_schemarg   rd   rT   Z
arg_schemar<   idxr(   r(   r)   _update_node_from_op_schema   s    	

rm   )node_to_objrT   r1   c                 C   s>   t |tjjr6| | }t r2ttttf |jt	= |S |S d S r2   )
rM   rX   rJ   rK   r   r   r	   r   __dict__r   )rn   rT   rL   r(   r(   r)   
_remap_arg   s    rp   )sizesr+   r1   c                 C   sZ   dd | D }dd t | D p&t g}t||jksRtdt| d|j d||fS )Nc                 S   s    g | ]}t |tr|jn|qS r(   rM   r.   r0   .0sr(   r(   r)   
<listcomp>   s    z)unpack_sizes_and_dims.<locals>.<listcomp>c                 S   s*   g | ]"\}}t |tr| rt|qS r(   )rM   r.   r4   r   )rt   rd   ar(   r(   r)   rv      s   
 z"The number of sharded dimensions (z2) must match number of dimensions in device mesh (z).)r`   r   rh   ndimri   )rq   r+   local_sizesr,   r(   r(   r)   unpack_sizes_and_dims   s    rz   )r5   r<   r1   c                 C   s   t |dks"td| j d| t|d tsBtd|d  t|d tsbtd|d  t|d |d j\}}| jd |f| _t	t
jj| j}tj||d j||d j|dd	S )
N   zExpect two args but got op z with args r   z*Expect 1st argument to be DTensor but got r7   $Expect 2nd argument as list but got FZlocal_tensorr?   r,   	run_check)rh   ri   r8   rM   r   listrz   r?   r<   r   rX   _ops
OpOverload
from_localra   )r5   r<   ry   r,   rO   r(   r(   r)   binop_sym_int_consumer_rule   s"    "  r   c           
   	   C   s\   |\}}}}}}dd |D }t j||j|jd}	tjt |	| |||||j|j	ddS )Nc                 S   s    g | ]}t |tr|jn|qS r(   rr   rs   r(   r(   r)   rv     s    z7slice_backwad_sym_int_consumer_rule.<locals>.<listcomp>)devicerU   Fr}   )
rX   zerosr   rU   r   r   Zslice_scatterr>   r?   r,   )
r5   r<   Zgrad_outputZinput_sizesrE   startendstepry   Zinput_tensorr(   r(   r)   #slice_backwad_sym_int_consumer_rule  s,           r   )r5   r<   rQ   default_meshr1   c                 C   s   t j| }tdd |D r4td| j d| dt|d tsTtd|d  t|d |\}}|f|dd  | _t	t
jj| j}tj|| j|||d	d
S )Nc                 s   s   | ]}t |tV  qd S r2   )rM   r   rt   rw   r(   r(   r)   	<genexpr>   s     z*factory_with_sizes_rule.<locals>.<genexpr>z4Not expect DTensor argument for factory op, but got z with arguments .r   r|   r7   Fr}   )r]   arg_tree_leavesanyri   r8   rM   r   rz   r<   r   rX   r   r   r   r   )r5   r<   rQ   r   rk   ry   r,   rO   r(   r(   r)   factory_with_sizes_rule  s    
 
r   c                 C   s>   t dd || _ttjj| j}tj|| j||t	 gddS )Nc                 S   s   t | tr| jS | S r2   rr   rw   r(   r(   r)   <lambda>7      z%factory_arange_rule.<locals>.<lambda>Fr}   )
r    r<   r   rX   r   r   r8   r   r   r   r5   r<   rQ   r   rO   r(   r(   r)   factory_arange_rule1  s    
r   c                 C   s>   || | _ | _ttjj| j}tj|| j | j|t	 gddS )NFr}   )
r<   rQ   r   rX   r   r   r8   r   r   r   r   r(   r(   r)   default_factory_op_ruleA  s    r   VIEW_SYM_INT_CONSUMERSFACTORY_SYM_INT_CONSUMERSFACTORY_OPSF)force_make_fxr   )r5   rn   r   r   r1   c          	   
   C   s  t   ttt|| j}ttt|| j}tt jj	| j
}tdd tj| D r|tkrt|dksztd| t| | ||| < W 5 Q R  d S |tkr|d k	stdt| | ||||| < W 5 Q R  d S tttjsttd|| | j
tjjkrtjj}tdd |}td	d |}|tkrVt| | ||||| < W 5 Q R  d S tt|||d
}t|dd|}|j  |W  5 Q R  S Q R X d S )Nc                 s   s    | ]}t |tr| V  qd S r2   )rM   r.   r4   r   r(   r(   r)   r   y  s   
z._get_dtensor_dispatch_graph.<locals>.<genexpr>r   zExpect empty kwargs, but got z%Requires default mesh for factory opszYAssuming using local_value from SymInt for %sis mathematically correct. Full args are %s.c                 S   s   t | tr| jS | S r2   rr   r   r(   r(   r)   r     r   z-_get_dtensor_dispatch_graph.<locals>.<lambda>c                 S   s   t | tr| jS | S r2   rr   r   r(   r(   r)   r     r   )rQ   rR   F)Z_allow_non_fake_inputs) rX   Zno_gradr    r   rp   r<   rQ   r   r   r   r8   r   r]   r   r   rh   ri   r   rM   r#   loggingLoggerwarningr9   viewdefaultZreshaper   r\   r   grapheliminate_dead_code)	r5   rn   r   r   r<   rQ   Zop_overloaddispatchgmr(   r(   r)   _get_dtensor_dispatch_graphk  sd        
   
r   )dtrn   r1   c           	      C   s   t jt jt jddd}| j}t | j}t|||}dd |jjD }dd |jjD }t|dksltt|dks|t| ||d	 < t	j
|| jt gd
d||d < t|d	 |dd}|dk	st|||d	  fS )zCreate a graph for a dummy add function from a partial DTensor.

    This dummy add is used for triggering all_reduce on a Partial DTensor
    during the DTensor expansion of the traced graph.
    Also returns the actual DTensor after resharding.
    )gradzeror1   c                 S   s   | | S r2   r(   )r   r   r(   r(   r)   	dummy_add  s    z)_build_dummy_add_graph.<locals>.dummy_addc                 S   s   g | ]}|j tjkr|qS r(   )rO   r   PLACEHOLDERrt   nr(   r(   r)   rv     s      z*_build_dummy_add_graph.<locals>.<listcomp>c                 S   s   g | ]}|j tjkr|qS r(   )rO   r   CALL_FUNCTIONr   r(   r(   r)   rv     s      r{   r7   r   Fr~   T)r   N)rX   rY   ra   Z
zeros_liker   r   nodesrh   ri   r   r   r?   r   r   )	r   rn   r   r   r   Z
traced_addZplaceholdersZcall_functionstraced_dispatchr(   r(   r)   _build_dummy_add_graph  s,    
     r   )r   r5   rn   r1   c                    s  g }d}|j d D ]}t|tjs0|| q|| }t|sL|| qd}tt|}t||\}}	dd |j	j
D }
dd |j	j
D }t|
dkrt|dkst|d |
d  |j	  |	||
d < i  |j	j
D ]}|jtjkr| |< q|jtjkrt|j dkr(t|j d dksDtd|j  d	t|j  | |j d d   ||j d d  | |j d d  < q|jtjkrt| |jt||j | j	|  | j	| fd
d |< W 5 Q R X qq|r| j	| | j	|S |S d S )NFr   Tc                 S   s$   g | ]}|j d ks|j dkr|qS )Z	wait_commZwait_tensornamer   r(   r(   r)   rv     s   
 
z#_convert_output.<locals>.<listcomp>c                 S   s   g | ]}|j d kr|qS )addr   r   r(   r(   r)   rv     s     
 r7   !Expecting single output, but got  c                    s    |  S r2   r(   r   Zvalue_remapr(   r)   r   (  r   z!_convert_output.<locals>.<lambda>)r<   rM   rJ   rK   appendrN   r   r   r   r   r   rh   ri   replace_all_uses_withr   rO   r   r   OUTPUTZGET_ATTRsetattrr8   getattrinserting_before	node_copy
erase_nodeoutput)r   r5   rn   new_argsZhas_partialargumentrL   r   r   Z
result_objwaitr   dtnr(   r   r)   _convert_output  s\    




&
(r   )r   node_replacementsr1   c              
      s  | j jD ]}||krq|| }tj|j }di  } |j jD ]$}|jtjkr>||  |< |d7 }q>| j | |j jD ]h}|jtjkrq||jtj	krt
|jdkstd|j dt
|jd  |jd }t
|dkr|d }nd }	t|D ]\}}
|
d krq|
jdkst|
jjdks.t|
jjdks@t|	d ks^|	|
jd ks^t|
jd }	|
jd |kstq|	d k	st|	} | }|| q|| j | fdd	 |< td
d ||fD r|| |   qq|| j | W 5 Q R X q| j   |   d S )Nr   r7   r   r   Zcall_function	_operatorgetitemc                    s    |  S r2   r(   r   r   r(   r)   r   k  r   z _rebuild_graph.<locals>.<lambda>c                 s   s.   | ]&}t |jtjjo$|jjjd V  qdS ))zaten::_foreachzaten::_fused_adamN)rM   r8   rX   r   r   Z_schemar   
startswithr   r(   r(   r)   r   l  s
   
z!_rebuild_graph.<locals>.<genexpr>)r   r   r]   r   r<   rO   r   r   r   r   rh   ri   r`   r8   r&   r%   r   r   allr   r   Z	recompile)r   r   r5   r   rb   rd   r   outputsr   sourceoutnew_noder(   r   r)   _rebuild_graph0  sZ    






r   )r   r1   c                    sl   i i t jt jd dfdd t| jD ]8t jj fdd t jj fdd q.S )N)arg_nodeconsumerr1   c                    s&   | kr"|| <   |g |  d S r2   )
setdefaultr   )r   r   )last_consumer_to_nodesnode_to_last_consumerr(   r)   _register_final_consumer  s    z=_get_last_consumer_to_nodes.<locals>._register_final_consumerc                    s
    | S r2   r(   )r   r   r5   r(   r)   r     r   z-_get_last_consumer_to_nodes.<locals>.<lambda>c                    s
    | S r2   r(   )Z
kwarg_noder   r(   r)   r     r   )rJ   rK   reversedr   r5   Zmap_argr<   rQ   )r   r(   )r   r   r5   r   r)   _get_last_consumer_to_nodes  s     r   )r   inpsschemasr   _allow_partialr1   c                    s  t dadd tjD }i }i }t| j}i }	t| jjD ]\}
}tdk	sPtt	d|
|j
|j |j
tjkr|
t|k std|
d  dt| d	tj||
  ||
 j||
 jd
d||< nt|jtjjrtt||jd  }t||||< nt|jtjjr8t|||d}|dk	r|||< n|j
tjkr|sXt| ||}|jd D ]<}t|t j!rb|| }t|trbt"|j#|j|	|j$< qbnB|j
tj%krt&t't(||j}t&t't(||j)}t*t+dd |t,|-   |j|krt dkrt. fdd D s.tdt/tdd |}t/tdd |}t/tdd |}t/tdd |}||_||_)t|j|||j|| d jd||< n0t dkstd|j d|j||||< nt0d|j
 ||kr:|| D ]}||= qq:t1| | | |	fS )zTransform a graph module to a distributed graph module.

    Returns:
        - transformed graph module
        - map from output name to DTensorSpec

    Zspmd_expc                 S   s   h | ]}t t|qS r(   )r   operator)rt   r   r(   r(   r)   	<setcomp>  s     z*_convert_to_distributed.<locals>.<setcomp>Nznode%s: op=%s target=%szgot more placeholder nodes (r7   z) than inputs ()Fr   r   )r   c                 S   s
   t | tS r2   )rM   r.   r   r(   r(   r)   r     r   z)_convert_to_distributed.<locals>.<lambda>c                 3   s   | ]} d  j |j kV  qdS )r   N)r+   )rt   dZdsymintsr(   r)   r     s    z*_convert_to_distributed.<locals>.<genexpr>z&all DSymInts must have the same mesh. c                 S   s   | j S r2   r0   r   r(   r(   r)   r     r   c                 S   s   | j S r2   r   r   r(   r(   r)   r     r   c                 S   s   | j S r2   r/   r   r(   r(   r)   r     r   c                 S   s   | j S r2   r   r   r(   r(   r)   r     r   )r0   r/   r+   zBSPMD expansion does not support SymInt in non-operator nodes, got r   zUnrecognized node.op type )2r   r#   r   __all__r   r   r`   r   ri   inforO   r8   r   r   rh   r   r   cloner+   r,   rM   rX   r   ZOpOverloadPacketr   r<   r.   rF   r   r   r   r   rJ   rK   r*   r?   r   r   r    r   rp   rQ   r   filterrW   valuesr   r!   
ValueErrorr   )r   r   r   r   r   	operatorsrn   r   r   Zoutput_schemasrd   r5   r6   replacementZinp_argrL   r<   rQ   rP   Zlocal_kwargsZglobal_argsZglobal_kwargsr   r(   r   r)   _convert_to_distributed  s    

  
 


r   )NN)NF)fr   r   dataclassesr   enumr   r   	functoolsr   typingr   r   r   r	   r
   r   r   r   r   rX   Z(torch.distributed._spmd.experimental_opsZtorch.fxrJ   Z#torch.distributed._spmd.comm_tensorr   Z#torch.distributed._spmd.graph_utilsr   Z!torch.distributed._spmd.log_utilsr   Ztorch.distributed._tensorr   r   Z$torch.distributed._tensor._op_schemar   Z'torch.distributed._tensor._redistributer   Z)torch.distributed._tensor.placement_typesr   r   r   r   r   r   Z"torch.fx.experimental.proxy_tensorr   r   Ztorch.utilsr   r]   Ztorch.utils._pytreer   r    r!   r"   r#   r   r-   Zopsr9   r$   r*   r.   rH   rN   r   r   strrY   Sizer\   re   rK   rm   rp   r;   rz   r   r   r   r   r   Z_unsafe_viewr   expandZslice_backwardr   r   fullZaranger   r   Zscalar_tensorr   r   ZGraphModuler   r   r   r   ZGraphr   r   r(   r(   r(   r)   <module>   s   , *  
+   






       
   W &KW  