import contextlib
from typing import cast, Dict, Optional, Tuple

import torch
import torch._prims_common as utils
import torch.distributed._functional_collectives as funcol
import torch.distributed.distributed_c10d as c10d
from torch import Tensor
from torch.distributed._tensor import DTensor, Replicate, Shard
from torch.distributed._tensor.ops.embedding_ops import _MaskPartial
from torch.distributed._tensor.ops.math_ops import (
    _skip_dim,
    Reduction,
    replicate_reduction_dims,
)
from torch.distributed._tensor.placement_types import DTensorSpec, Placement, TensorMeta
from torch.distributed.device_mesh import DeviceMesh

aten = torch.ops.aten

__all__ = ["loss_parallel"]


@contextlib.contextmanager
def loss_parallel():
    """
    A context manager that enables loss parallelism, where efficient parallelized loss computation
    can be performed when the input is sharded on the class dimension. Currently only the cross-entropy
    loss is supported.

    Within this context manager, one can use :func:`~torch.nn.functional.cross_entropy` or
    :class:`~torch.nn.CrossEntropyLoss` as usual, with the following assumptions on the input parameters.
    The corresponding ``backward()`` call, if any, also needs to happen under this context manager.

    Args:
        input (:class:`DTensor`):
            Input logits. Assumed to be sharded on the class dimension.
        target (Union[:class:`torch.Tensor`, :class:`DTensor`]):
            Must be ground truth class indices (class probabilities currently not supported).
            Assumed to be replicated across the ``DeviceMesh``.
        weight (Union[:class:`torch.Tensor`, :class:`DTensor`], optional):
            If given, assumed to be replicated across the ``DeviceMesh``.
        label_smoothing:
            Currently not supported.

    Returns:
        A replicated :class:`DTensor`.

    Example:
        A sharded DTensor is manually created here to showcase the usage.
        In practice, it is usually the output of a TP module.

        >>> # xdoctest: +SKIP("distributed")
        >>> from torch.distributed.tensor.parallel import loss_parallel
        >>> from torch.distributed.device_mesh import init_device_mesh
        >>> ...
        >>> device_mesh = init_device_mesh("cuda", (8,))
        >>> input = torch.randn(4, 16, device="cuda", requires_grad=True)
        >>> dist_input = distribute_tensor(input, device_mesh, placements=[Shard(1)])
        >>> target = torch.randint(16, (4,), device="cuda")
        >>> with loss_parallel():
        >>>     loss = F.cross_entropy(dist_input, target, reduction="mean")
        >>>     loss.backward()
        >>> ...
    """
    _enable_custom_loss_ops()

    yield

    _disable_custom_loss_ops()


def _find_all_reduce_mesh_dim(placements: Tuple[Placement, ...], dim: int) -> int:
    if not len(placements) == 1:
        raise ValueError(
            "Currently loss_parallel() only supports input on one-dimensional DeviceMesh."
        )
    if not placements[0].is_shard(dim):
        raise ValueError(
            f"loss_parallel() should be enabled only when the input tensor is sharded on dimension {dim}."
        )
    return 0


def _cast_to_dtensor(
    tensor, placements: Tuple[Placement, ...], mesh: DeviceMesh
) -> DTensor:
    if isinstance(tensor, DTensor):
        if tensor.placements == placements:
            return tensor
        else:
            raise RuntimeError(f"Expected {placements} but got {tensor.placements}.")
    elif isinstance(tensor, torch.Tensor):
        return DTensor.from_local(
            tensor, device_mesh=mesh, placements=placements, run_check=False
        )
    else:
        raise TypeError(f"Unsupported type {type(tensor)}")


def _propagate_tensor_meta(
    op_call: torch._ops.OpOverload,
    args: Tuple[object, ...],
    kwargs: Dict[str, object],
) -> TensorMeta:
    op_info = DTensor._op_dispatcher.unwrap_to_op_info(op_call, args, kwargs)
    tensor_meta = DTensor._op_dispatcher.sharding_propagator._propagate_tensor_meta(
        op_info.schema
    )
    if isinstance(tensor_meta, TensorMeta):
        return tensor_meta
    elif isinstance(tensor_meta, tuple):
        return tensor_meta[0]
    else:
        raise RuntimeError(f"Unexpected tensor meta type: {type(tensor_meta)}.")


def _log_softmax(x, dim, half_to_float, mesh, mesh_dim):
    x = x.contiguous()
    if half_to_float:
        assert x.dtype == torch.half
    computation_dtype, result_dtype = utils.elementwise_dtypes(
        x, type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
    )
    x = x.to(computation_dtype)
    if x.numel() == 0:
        shifted = x
    else:
        # the max over the sharded class dim needs a MAX all-reduce
        x_max = torch.amax(x, dim, keepdim=True)
        x_max = funcol.all_reduce(
            x_max, reduceOp=c10d.ReduceOp.MAX.name, group=(mesh, mesh_dim)
        )
        shifted = x - x_max
    # the normalizer sums over the sharded class dim, hence a SUM all-reduce
    shifted_sumexp = torch.sum(torch.exp(shifted), dim, keepdim=True)
    shifted_sumexp = funcol.all_reduce(
        shifted_sumexp, reduceOp=c10d.ReduceOp.SUM.name, group=(mesh, mesh_dim)
    )
    shifted_logsumexp = torch.log(shifted_sumexp)
    result = shifted - shifted_logsumexp
    if not half_to_float:
        result = result.to(result_dtype)
    return result


def _log_softmax_handler(
    op_call: torch._ops.OpOverload,
    args: Tuple[object, ...],
    kwargs: Dict[str, object],
) -> object:
    x = cast(DTensor, args[0])
    dim = cast(int, args[1])
    half_to_float = cast(bool, args[2])

    spec = x._spec
    mesh_dim = _find_all_reduce_mesh_dim(spec.placements, dim)

    output_tensor_meta = _propagate_tensor_meta(op_call, args, kwargs)

    res = _log_softmax(x._local_tensor, dim, half_to_float, spec.mesh, mesh_dim)

    # the output keeps the same sharding as the input
    res_spec = DTensorSpec(
        spec.mesh,
        spec.placements,
        tensor_meta=output_tensor_meta,
    )

    return DTensor(
        res,
        res_spec,
        requires_grad=res.requires_grad,
    )


def _log_softmax_backward_handler(
    op_call: torch._ops.OpOverload,
    args: Tuple[object, ...],
    kwargs: Dict[str, object],
) -> object:
    # no computation here: the log_softmax backward is fused into
    # _nll_loss_and_log_softmax_backward below
    grad_output = cast(DTensor, args[0])
    input_dtype = cast(torch.dtype, args[3])
    return grad_output.to(input_dtype)

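# How the distributed nll_loss forward below works, sketched with hypothetical
# values (editorial illustration, not part of the original module): with the
# class dimension sharded, each rank holds log-probabilities for only a slice
# of the classes, so the per-sample loss -logp[target] can be gathered locally
# only on the rank that owns the target class. _MaskPartial masks out targets a
# rank does not own, the local gather then reads 0 for them, and a single
# all-reduce(SUM) completes the value on every rank:
#
#     world_size = 2, 16 classes, rank 0 owns [0, 8), rank 1 owns [8, 16)
#     sample with target = 11:
#       rank 0 gathers 0.0        (index 11 is masked out by _MaskPartial)
#       rank 1 gathers logp[11]   (index 11 - 8 = 3 in its local shard)
#     all_reduce(SUM) -> both ranks hold logp[11]; the loss is -logp[11].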
def _nll_loss_forward(
    x: Tensor,
    target: Tensor,
    weight: Optional[Tensor],
    local_weight: Optional[Tensor],
    reduction: int,
    ignore_index: int,
    channel_dim_size: int,
    mesh: DeviceMesh,
    mesh_dim: int,
) -> Tuple[Tensor, Tensor]:
    n_dims = x.dim()
    channel_dim = 1
    if n_dims < 2:
        channel_dim = 0

    def _weight_view(weight: Tensor) -> Tensor:
        if n_dims > 1:
            shape = [1] * n_dims
            shape[channel_dim] = weight.shape[0]
            w = weight.view(shape)
        else:
            w = weight
        return w

    if weight is not None:
        w = _weight_view(weight)
        assert local_weight is not None
        local_w = _weight_view(local_weight)
        # x is sharded on channel_dim, so multiply with the local weight shard
        x = x * local_w
    safe_target = torch.where(target != ignore_index, target, 0)
    safe_target_ = safe_target.unsqueeze(channel_dim)

    # The following code block is a distributed version of
    #   result = -torch.gather(x, channel_dim, safe_target_).squeeze(channel_dim)
    partial_placement = _MaskPartial(logical_dim_size=channel_dim_size)
    safe_target_partial_ = partial_placement._partition_value(
        safe_target_, mesh, mesh_dim
    )
    result_partial = torch.gather(x, channel_dim, safe_target_partial_)
    # an all_reduce happens here
    result_reduced = partial_placement._reduce_value(result_partial, mesh, mesh_dim)
    result = -result_reduced.squeeze(channel_dim)

    result = torch.where(target != ignore_index, result, 0)

    if reduction == Reduction.NONE.value and n_dims > 1:
        total_weight = x.new_full((), 0.0)
        return result, total_weight

    if weight is not None:
        new_shape = list(x.shape)
        new_shape[channel_dim] = -1
        w = w.expand(new_shape)
        wsum = torch.gather(w, channel_dim, safe_target_).squeeze(channel_dim)
        wsum = torch.where(target != ignore_index, wsum, 0)
        total_weight = wsum.sum()
    else:
        total_weight = (target != ignore_index).sum().to(x)

    if reduction == Reduction.SUM.value:
        result = result.sum()
    elif reduction == Reduction.MEAN.value:
        result = result.sum() / total_weight

    return result, total_weight

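# The handler below is what loss_parallel() registers for aten.nll_loss_forward
# and aten.nll_loss2d_forward. It converts plain-tensor target/weight inputs to
# DTensors with the expected placements (and checks DTensor inputs against
# them), derives the local weight shard without any communication, and then
# runs _nll_loss_forward on the local tensors.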
def _nll_loss_forward_handler(
    op_call: torch._ops.OpOverload,
    args: Tuple[object, ...],
    kwargs: Dict[str, object],
) -> object:
    x = cast(DTensor, args[0])
    target = args[1]
    weight = args[2]
    reduction = cast(int, args[3])
    ignore_index = cast(int, args[4])

    channel_dim = 1 if x.dim() >= 2 else 0
    channel_dim_size = x.shape[channel_dim]
    spec = x._spec
    mesh_dim = _find_all_reduce_mesh_dim(spec.placements, channel_dim)

    # if target and weight are not DTensors, convert them to DTensors
    target_placements = _skip_dim(
        replicate_reduction_dims(spec.placements, [channel_dim]), channel_dim
    )
    all_replicate_placements = (Replicate(),) * spec.mesh.ndim
    target = _cast_to_dtensor(target, target_placements, spec.mesh)
    local_weight = None
    if weight is not None:
        weight = _cast_to_dtensor(weight, all_replicate_placements, spec.mesh)

        # For local computation, both the replicated weight and the sharded
        # local_weight are needed in _nll_loss_forward(). local_weight is
        # generated here via the DTensor API without incurring communication.
        sharded_placements = [
            Shard(0) if i == mesh_dim else Replicate() for i in range(spec.mesh.ndim)
        ]
        local_weight = weight.redistribute(spec.mesh, sharded_placements)._local_tensor
        assert local_weight.shape[0] == x._local_tensor.shape[channel_dim]

    if reduction == Reduction.NONE.value:
        output_placements = target_placements
    else:
        output_placements = all_replicate_placements

    # tensor inputs to _propagate_tensor_meta need to be DTensors
    args = list(args)
    args[1], args[2] = target, weight
    output_tensor_meta = _propagate_tensor_meta(op_call, tuple(args), kwargs)

    result, total_weight = _nll_loss_forward(
        x._local_tensor,
        target._local_tensor,
        weight._local_tensor if weight is not None else None,
        local_weight,
        reduction,
        ignore_index,
        channel_dim_size,
        spec.mesh,
        mesh_dim,
    )
    out_spec = DTensorSpec(spec.mesh, output_placements, tensor_meta=output_tensor_meta)

    return (
        DTensor(
            result,
            out_spec,
            requires_grad=result.requires_grad,
        ),
        total_weight,
    )

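# Cross-entropy backward normally runs in two steps: nll_loss backward followed
# by log_softmax backward. The function below fuses both steps, which is why
# _log_softmax_backward_handler above is a pass-through: since x holds
# log-probabilities, torch.exp(x) recovers softmax(x), and the final
# (grad_input + torch.exp(x)) * grad_output folds the log_softmax gradient into
# the nll_loss gradient without an extra all_gather over the sharded class dim.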
def _nll_loss_and_log_softmax_backward(
    grad_output: Tensor,
    x: Tensor,
    target: Tensor,
    weight: Optional[Tensor],
    reduction: int,
    ignore_index: int,
    total_weight: Tensor,
    channel_dim_size: int,
    mesh: DeviceMesh,
    mesh_dim: int,
) -> Tensor:
    channel_dim = 0 if x.dim() < 2 else 1
    if reduction == Reduction.MEAN.value:
        grad_output = grad_output / total_weight

    target = target.unsqueeze(channel_dim)
    safe_target = torch.where(target != ignore_index, target, 0)
    grad_input = torch.zeros_like(x)

    # The following code block is a distributed version of
    #   grad_input = torch.scatter(grad_input, channel_dim, safe_target, -1.0)
    partial_placement = _MaskPartial(logical_dim_size=channel_dim_size)
    safe_target = safe_target.squeeze(channel_dim).flatten()
    masked_safe_target = partial_placement._partition_value(safe_target, mesh, mesh_dim)
    # only update grad_input to -1 if not masked
    assert partial_placement.mask_buffer.data is not None
    grad_update = partial_placement.mask_buffer.data.float() - 1.0
    arange_1d = torch.arange(
        masked_safe_target.shape[0], device=masked_safe_target.device
    )
    # the first two cases are for aten.nll_loss_backward.default;
    # the last case is for aten.nll_loss2d_backward.default
    if x.dim() == 1:
        grad_input[masked_safe_target] = grad_update
    elif x.dim() == 2:
        grad_input[arange_1d, masked_safe_target] = grad_update
    else:
        grad_input_t = grad_input.transpose(channel_dim, -1)
        intermediate_shape = grad_input_t.shape
        grad_input_2d = grad_input_t.reshape(-1, x.shape[channel_dim])
        grad_input_2d[arange_1d, masked_safe_target] = grad_update
        grad_input = grad_input_2d.view(intermediate_shape).transpose(channel_dim, -1)

    if grad_input.dim() > grad_output.dim() > 0:
        grad_output = grad_output.unsqueeze(channel_dim)

    if weight is not None:
        new_shape = [1 for _ in range(x.dim())]
        new_shape[channel_dim] = weight.shape[0]
        weight = weight.reshape(new_shape)
        # expand the weight to x's shape and pick the per-target weights, so
        # that the fused log_softmax backward below is weighted as well
        new_shape = list(x.shape)
        new_shape[channel_dim] = -1
        w = weight.expand(new_shape)
        w_target = torch.gather(w, channel_dim, target)
        grad_output = grad_output * w_target

    grad_output = torch.where(target != ignore_index, grad_output, 0)

    # fused backward: the log_softmax gradient is applied here instead of in
    # _log_softmax_backward_handler; torch.exp(x) is softmax(x) since x holds
    # log-probabilities
    return (grad_input + torch.exp(x)) * grad_output


def _nll_loss_backward_handler(
    op_call: torch._ops.OpOverload,
    args: Tuple[object, ...],
    kwargs: Dict[str, object],
) -> object:
    grad_output = cast(DTensor, args[0])
    x = cast(DTensor, args[1])
    target = args[2]
    weight = args[3]
    reduction = cast(int, args[4])
    ignore_index = cast(int, args[5])
    total_weight = cast(Tensor, args[6])

    channel_dim = 1 if x.dim() >= 2 else 0
    channel_dim_size = x.shape[channel_dim]
    spec = x._spec
    mesh_dim = _find_all_reduce_mesh_dim(spec.placements, channel_dim)

    # if target and weight are not DTensors, convert them to DTensors
    target_placements = _skip_dim(
        replicate_reduction_dims(spec.placements, [channel_dim]), channel_dim
    )
    all_replicate_placements = (Replicate(),) * spec.mesh.ndim
    target = _cast_to_dtensor(target, target_placements, spec.mesh)
    if weight is not None:
        weight = _cast_to_dtensor(weight, all_replicate_placements, spec.mesh)

    # tensor inputs to _propagate_tensor_meta need to be DTensors
    args = list(args)
    args[2], args[3] = target, weight
    args[6] = _cast_to_dtensor(total_weight, all_replicate_placements, spec.mesh)
    output_tensor_meta = _propagate_tensor_meta(op_call, tuple(args), kwargs)

    result = _nll_loss_and_log_softmax_backward(
        grad_output._local_tensor,
        x._local_tensor,
        target._local_tensor,
        weight._local_tensor if weight is not None else None,
        reduction,
        ignore_index,
        total_weight,
        channel_dim_size,
        spec.mesh,
        mesh_dim,
    )
    # the output sharding is the same as the input sharding
    out_spec = DTensorSpec(
        spec.mesh,
        spec.placements,
        tensor_meta=output_tensor_meta,
    )

    return DTensor(
        result,
        out_spec,
        requires_grad=result.requires_grad,
    )

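# Handler table installed by loss_parallel(): while the context manager is
# active, each aten op below is redirected to its custom handler above.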
customized_loss_ops = {
    aten._log_softmax.default: _log_softmax_handler,
    aten._log_softmax_backward_data.default: _log_softmax_backward_handler,
    aten.nll_loss_forward.default: _nll_loss_forward_handler,
    aten.nll_loss2d_forward.default: _nll_loss_forward_handler,
    aten.nll_loss_backward.default: _nll_loss_backward_handler,
    aten.nll_loss2d_backward.default: _nll_loss_backward_handler,
}


def _enable_custom_loss_ops():
    DTensor._op_dispatcher._custom_op_handlers.update(customized_loss_ops)


def _disable_custom_loss_ops():
    for custom_op in customized_loss_ops:
        DTensor._op_dispatcher._custom_op_handlers.pop(custom_op)

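# Dispatch sketch (editorial illustration): while loss_parallel() is active,
# DTensor's op dispatcher consults _custom_op_handlers before its default
# sharding-propagation path, so a call like
#
#     F.cross_entropy(dist_input, target)   # decomposes to aten._log_softmax
#                                           # + aten.nll_loss_forward
#
# is routed to _log_softmax_handler and _nll_loss_forward_handler; exiting the
# context pops the entries and restores default dispatch for these six ops.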