   @   s  d dl Z d dlZd dlmZmZmZmZmZmZ d dl	Z	d dl
m  m  mZ d dlm  m  mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZmZmZmZmZm Z  d dlm!Z!m"Z" d dl#m$Z$m%Z% d	d
dgZ&e	j'j(Z(G dd de	j)j*Z+G dd de	j)j*Z,G dd	 d	e	j-Z.de	j-ee% eee  e.ddd
Z/dej0ee% eee1ej0e%gdf  eeej0ee%gdf  eeej0ee%gdf  ej0dddZ2dS )    N)AnyCallablecastOptionalSequenceTuple)mesh_broadcast)Redistributeredistribute_local_tensor)compute_global_tensor_info)DTensorSpecPartial	Placement	ReplicateShard
TensorMeta)is_rng_supported_meshOffsetBasedRNGTracker)_mesh_resources
DeviceMeshDTensordistribute_tensordistribute_modulec                   @   s<   e Zd Zedeee  dddZeej	dddZ
dS )	_ToTorchTensorr   )inputgrad_placementsc                 C   s   |j | _|| _|j}||S N)_specdtensor_specr   _local_tensorview_as)ctxr   r   local_tensor r#   O/var/www/html/venv/lib/python3.8/site-packages/torch/distributed/_tensor/api.pyforward?   s    z_ToTorchTensor.forwardgrad_outputc           	      C   sl   | j }|j}| j}|j}t|||j\}}t|}|p:|j}t||t|j	||j
dd}t|||jdd fS )Nshapestridedtypetensor_metarequires_grad)r   meshr   r-   r   
placementstupler   r   r)   r+   r   r/   )	r!   r'   r   r0   r   Zdtensor_meta_tensor_strideZ	grad_specr#   r#   r$   backwardN   s6      
z_ToTorchTensor.backwardN)__name__
__module____qualname__staticmethodr   r   r   r%   torchTensorr5   r#   r#   r#   r$   r   >   s   
class _FromTorchTensor(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        input: torch.Tensor,
        device_mesh: DeviceMesh,
        placements: Tuple[Placement, ...],
        run_check: bool,
        shape: Optional[torch.Size] = None,
        stride: Optional[Tuple[int, ...]] = None,
    ) -> "DTensor":
        ctx.previous_placement = placements
        ctx.previous_device_mesh = device_mesh

        if shape and stride:
            tensor_shape, tensor_stride = shape, stride
        elif not shape and not stride:
            # if shape/stride are not passed, compute the global shape/stride
            # assuming the tensor is evenly sharded across ranks
            global_shape, global_stride = compute_global_tensor_info(
                input, device_mesh, placements
            )
            tensor_shape, tensor_stride = torch.Size(global_shape), tuple(global_stride)
        else:
            raise RuntimeError(
                f"Found shape:{shape}, stride:{stride}.",
                "Please pass both shape and stride at the same time.",
            )

        if device_mesh.get_coordinate() is None:
            # if the global rank does not participate in the device mesh,
            # simply use an empty local tensor
            input = input.new_empty(0, requires_grad=input.requires_grad)
        elif run_check:
            # broadcast the tensor from the first rank on every mesh dimension
            # that is marked as replicated
            for idx, placement in enumerate(placements):
                if placement.is_replicate():
                    input = input.contiguous()
                    mesh_broadcast(input, device_mesh, mesh_dim=idx)

        dist_spec = DTensorSpec(
            device_mesh,
            placements,
            tensor_meta=TensorMeta(
                tensor_shape,
                tensor_stride,
                input.dtype,
            ),
        )

        dist_tensor = DTensor(
            input,
            dist_spec,
            # requires_grad of the dist tensor depends on if input requires_grad or not
            requires_grad=input.requires_grad,
        )
        return dist_tensor

    @staticmethod
    def backward(ctx, grad_output: "DTensor"):
        previous_placement = ctx.previous_placement
        previous_device_mesh = ctx.previous_device_mesh

        # reshard to the placement used when the DTensor was created so that the
        # gradient layout matches and the local gradient can be returned directly
        if grad_output.placements != previous_placement:
            current_spec = grad_output._spec
            target_spec = DTensorSpec(
                previous_device_mesh,
                previous_placement,
                tensor_meta=grad_output._spec.tensor_meta,
            )
            local_tensor = grad_output._local_tensor
            output = redistribute_local_tensor(
                local_tensor, current_spec, target_spec, is_backward=True
            )
            return output, None, None, None, None, None

        return grad_output.to_local(), None, None, None, None, None
class DTensor(torch.Tensor):
    _local_tensor: torch.Tensor
    _spec: DTensorSpec
    __slots__ = ["_local_tensor", "_spec"]

    # class attribute that handles operator placements propagation rules
    _op_dispatcher: op_dispatch.OpDispatcher = op_dispatch.OpDispatcher()

    @staticmethod
    @torch._disable_dynamo
    def __new__(
        cls,
        local_tensor: torch.Tensor,
        spec: DTensorSpec,
        *,
        requires_grad: bool,
    ) -> "DTensor":
        """
        Construct a DTensor from a local tensor, device mesh, and placement and
        other tensor properties (i.e. shape, requires_grad, strides, etc).
        Note: This is not a public API and it's only supposed to be used by the
            operator implementations and internals. If you want to construct a
            DTensor from a local tensor, consider using `DTensor.from_local`, if
            you want to construct a DTensor from a "global" tensor (where you
            already have a tensor initialized and want to shard this tensor),
            consider using `distribute_tensor`.
        """
        if local_tensor.requires_grad and not requires_grad:
            warnings.warn(
                "To construct DTensor from torch.Tensor, it's recommended to "
                "use local_tensor.detach() and make requires_grad consistent."
            )

        # __new__ builds the wrapper tensor from local_tensor and attaches the
        # placement spec; it does not do any actual distribution
        assert spec.tensor_meta is not None, "TensorMeta should not be None!"
        r = torch.Tensor._make_wrapper_subclass(
            cls,
            spec.tensor_meta.shape,
            strides=spec.tensor_meta.stride,
            dtype=local_tensor.dtype,
            device=local_tensor.device,
            layout=local_tensor.layout,
            requires_grad=requires_grad,
        )

        r._spec = spec
        r._local_tensor = local_tensor
        return r

    def __repr__(self):
        return f"DTensor(local_tensor={self._local_tensor}, device_mesh={self._spec.mesh}, placements={self._spec.placements})"

    def __tensor_flatten__(self):
        """
        protocol to inform how to flatten a DTensor to local tensor
        for PT2 tracing
        """
        return ["_local_tensor"], (self._spec, self.requires_grad)

    @staticmethod
    def __tensor_unflatten__(inner_tensors, flatten_spec, outer_size, outer_stride):
        assert (
            flatten_spec is not None
        ), "Expecting spec to be not None from `__tensor_flatten__` return value!"
        local_tensor = inner_tensors["_local_tensor"]
        spec, requires_grad = flatten_spec
        unflatten_tensor_meta = TensorMeta(
            shape=outer_size,
            stride=outer_stride,
            dtype=spec.tensor_meta.dtype,
        )
        unflatten_spec = DTensorSpec(
            spec.mesh,
            spec.placements,
            tensor_meta=unflatten_tensor_meta,
        )
        return DTensor(
            local_tensor,
            unflatten_spec,
            requires_grad=requires_grad,
        )

    def __coerce_tangent_metadata__(self):
        if not any(isinstance(p, Partial) for p in self.placements):
            return self
        placements = [
            Replicate() if isinstance(p, Partial) else p for p in self.placements
        ]
        return self.redistribute(device_mesh=self.device_mesh, placements=placements)

    def __coerce_same_metadata_as_tangent__(self, metadata_tensor):
        return self.redistribute(
            device_mesh=self.device_mesh,
            placements=metadata_tensor.placements,
        )

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        return DTensor._op_dispatcher.dispatch(
            func,
            args,
            kwargs or {},
        )

    @staticmethod
    def from_local(
        local_tensor: torch.Tensor,
        device_mesh: Optional[DeviceMesh] = None,
        placements: Optional[Sequence[Placement]] = None,
        *,
        run_check: bool = True,
        shape: Optional[torch.Size] = None,
        stride: Optional[Tuple[int, ...]] = None,
    ) -> "DTensor":
        """
        Create a :class:`DTensor` from a local torch.Tensor on each rank
        according to the `device_mesh` and `placements` specified.

        Args:
            local_tensor (torch.Tensor): local torch.Tensor on each rank.
            device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to place the
                tensor, if not specified, must be called under a DeviceMesh
                context manager, default: None
            placements (List[:class:`Placement`], optional): the placements that
                describe how to place the local torch.Tensor on DeviceMesh, must
                have the same number of elements as `device_mesh.ndim`. If not
                specified, we will by default replicate the tensor across the
                `device_mesh` from the first rank of each dimension of the `device_mesh`.

        Keyword args:
            run_check (bool, optional): indicate whether to run check across ranks
                to check meta information and data. If there is a :class:`Replicate` in
                `placements`, the data on first rank of the device mesh dimension
                will be broadcasted to other ranks.
            shape (torch.Size, optional): A List of int which specifies the size of
                DTensor which build on top of `local_tensor`. Note this needs to be
                provided if the shape of `local_tensor` is different across the ranks.
                If not provided, `shape` will be computed assuming the given distributed
                tensor is evenly sharded across ranks.
            stride (tuple, optional): A List of int which specifies the stride of DTensor.
                If not provided, `stride` will be computed assuming the given distributed
                tensor is evenly sharded across ranks.

        Returns:
            A :class:`DTensor` object

        .. note:: `from_local` is differentiable, the `requires_grad` of the created
            `DTensor` object will depend on if `local_tensor` requires_grad or not.
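
        Example (illustrative sketch; assumes 4 ranks with an initialized default
        process group and one GPU per rank)::

            >>> # xdoctest: +SKIP("requires a multi-rank distributed setup")
            >>> import torch
            >>> from torch.distributed._tensor import DTensor, DeviceMesh, Shard
            >>> mesh = DeviceMesh("cuda", list(range(4)))
            >>> local = torch.randn(4, 8)  # each rank holds its own (4, 8) shard
            >>> dt = DTensor.from_local(local, mesh, [Shard(0)])
            >>> dt.shape  # global shape, torch.Size([16, 8]) assuming even sharding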
        """
        device_mesh = device_mesh or _mesh_resources.get_current_mesh()
        device_type = device_mesh.device_type

        # convert the local tensor to the device mesh's device type if needed
        if device_type != local_tensor.device.type and not local_tensor.is_meta:
            local_tensor = local_tensor.to(device_type)

        # set default placements to replicated if not specified
        if placements is None:
            placements = [Replicate() for _ in range(device_mesh.ndim)]
        else:
            placements = list(placements)
            for idx, placement in enumerate(placements):
                # normalize shard dim to be positive
                if placement.is_shard():
                    placement = cast(Shard, placement)
                    if placement.dim < 0:
                        placements[idx] = Shard(placement.dim + local_tensor.ndim)

        # `from_local` is differentiable: gradients of the DTensor created here
        # flow back to the local tensor that was passed in
        return _FromTorchTensor.apply(
            local_tensor,
            device_mesh,
            tuple(placements),
            run_check,
            shape,
            stride,
        )

    def to_local(
        self, *, grad_placements: Optional[Sequence[Placement]] = None
    ) -> torch.Tensor:
        """
        Get the local tensor of this DTensor on its current rank. For sharding it returns
        a local shard of the logical tensor view, for replication it returns the replica on
        its current rank.

        Keyword args:
            grad_placements (List[:class:`Placement`], optional): the placements that describe
                the future layout of any gradient layout of the Tensor returned from this
                function.
                `to_local` converts DTensor to local tensor and the returned local tensor
                might not be used as the original DTensor layout later in the code. This
                argument is the hint that user can give to autograd in case the gradient
                layout of the returned tensor does not match the original DTensor layout.
                If not specified, we will assume the gradient layout remains the same
                as the original DTensor and use that for gradient computation.

        Returns:
            A :class:`torch.Tensor` or `AsyncCollectiveTensor` object. It represents the
            local tensor on its current rank.

        .. note:: `to_local` is differentiable, the `requires_grad` of the local tensor returned
            will depend on if the `DTensor` requires_grad or not.
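
        Example (illustrative; continues the ``from_local`` sketch, where ``dt``
        is sharded on dim 0 of a 4-rank mesh with a ``(4, 8)`` local shard)::

            >>> # xdoctest: +SKIP("requires a multi-rank distributed setup")
            >>> local = dt.to_local()
            >>> local.shape  # this rank's shard only, torch.Size([4, 8])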
        """
        if not torch.is_grad_enabled():
            return self._local_tensor

        if grad_placements is not None and not isinstance(grad_placements, tuple):
            grad_placements = tuple(grad_placements)
        return _ToTorchTensor.apply(self, grad_placements)

    def redistribute(
        self,
        device_mesh: Optional[DeviceMesh] = None,
        placements: Optional[Sequence[Placement]] = None,
        *,
        async_op: bool = False,
    ) -> "DTensor":
        """
        `redistribute` performs necessary collective operations that redistribute the current
        DTensor from its current placements to new placements, or from its current DeviceMesh
        to a new DeviceMesh. i.e. we can turn a Sharded DTensor to a Replicated DTensor by
        specifying a Replicate placement for each dimension of the DeviceMesh.

        Args:
            device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to place the
                DTensor, if not specified, must be called under a DeviceMesh
                context manager, default: None
            placements (List[:class:`Placement`], optional): the new placements that
                describe how to place the DTensor into the DeviceMesh, must
                have the same number of elements as `device_mesh.ndim`.

        Keyword args:
            async_op (bool, optional): whether to perform the DTensor redistribute operation
                asynchronously or not. Default: False

        Returns:
            A :class:`DTensor` object

        .. note:: `redistribute` is differentiable.
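
        Example (illustrative; ``dt`` is assumed to be sharded on dim 0 of a
        1-D mesh, as in the ``from_local`` sketch)::

            >>> # xdoctest: +SKIP("requires a multi-rank distributed setup")
            >>> from torch.distributed._tensor import Replicate
            >>> replicated = dt.redistribute(placements=[Replicate()])  # all-gathers the shards
            >>> replicated.placements  # (Replicate(),)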
        """
        # NOTE: redistribute is always out of place: it creates a new DTensor
        # and leaves the original one unchanged
        device_mesh = device_mesh or self.device_mesh
        if placements is None:
            raise RuntimeError("placements is needed for redistribute!")

        placements = list(placements)
        for i, placement in enumerate(placements):
            if placement.is_partial():
                raise RuntimeError(
                    "Can not redistribute to Partial, redistributing to Partial is for internal use only!"
                )
            elif isinstance(placement, Shard) and placement.dim < 0:
                # normalize shard dim to be positive
                placements[i] = Shard(placement.dim + self.ndim)
        placements = tuple(placements)

        return Redistribute.apply(self, device_mesh, placements, async_op)

    def full_tensor(
        self, *, grad_placements: Optional[Sequence[Placement]] = None
    ) -> torch.Tensor:
        """
        Return the full tensor of this DTensor. It will perform necessary collectives
        to gather the local tensors from other ranks in its DeviceMesh and concatenate
        them together. It is syntactic sugar for the following code:

        `dtensor.redistribute(placements=[Replicate()] * mesh.ndim).to_local()`

        Keyword args:
            grad_placements (List[:class:`Placement`], optional): the placements that describe
                the future layout of any gradient layout of the full Tensor returned from this
                function.
                `full_tensor` converts DTensor to a full torch.Tensor and the returned torch.Tensor
                might not be used as the original replicated DTensor layout later in the code. This
                argument is the hint that user can give to autograd in case the gradient
                layout of the returned tensor does not match the original replicated DTensor layout.
                If not specified, we will assume the gradient layout of the full tensor to be replicated.

        Returns:
            A :class:`torch.Tensor` object that represents the full tensor of this DTensor.

        .. note:: `full_tensor` is differentiable.
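
        Example (illustrative; ``dt`` is assumed to be sharded on dim 0 of a
        4-rank mesh with ``(4, 8)`` local shards)::

            >>> # xdoctest: +SKIP("requires a multi-rank distributed setup")
            >>> full = dt.full_tensor()
            >>> full.shape  # plain torch.Tensor, torch.Size([16, 8]) on every rank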
        """
        redist_res = self.redistribute(
            placements=[Replicate()] * self.device_mesh.ndim, async_op=False
        )
        return _ToTorchTensor.apply(redist_res, grad_placements)

    @property
    def device_mesh(self) -> DeviceMesh:
        """
        The :class:`DeviceMesh` attribute that associates with this DTensor object.

        .. note:: device_mesh is a read-only property, it can not be set.
        """
        return self._spec.mesh

    @property
    def placements(self) -> Sequence[Placement]:
        """
        The placements attribute of this DTensor that describes the layout of this
        DTensor on its DeviceMesh.

        .. note:: placements is a read-only property, it can not be set.
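
        Example (illustrative; ``dt`` is assumed to be a DTensor sharded on
        dim 0 of a 1-D mesh)::

            >>> # xdoctest: +SKIP("requires a multi-rank distributed setup")
            >>> dt.placements  # e.g. (Shard(dim=0),)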
        """
        return self._spec.placements


def distribute_tensor(
    tensor: torch.Tensor,
    device_mesh: Optional[DeviceMesh] = None,
    placements: Optional[Sequence[Placement]] = None,
) -> DTensor:
    """
    Distribute a leaf torch.Tensor (i.e. nn.Parameter) to the ``device_mesh`` according
    to the ``placements`` specified. The rank of ``device_mesh`` and ``placements`` must be
    the same. If you want to construct a DTensor in the middle of the Autograd computation,
    please use ``DTensor.from_local`` instead.

    Args:
        tensor (torch.Tensor): torch.Tensor to be distributed. Note that if you
            want to shard a tensor on a dimension that is not evenly divisible by
            the number of devices in that mesh dimension, we use ``torch.chunk``
            semantic to shard the tensor and scatter the shards.
        device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to distribute the
            tensor, if not specified, must be called under a DeviceMesh context
            manager, default: None
        placements (List[:class:`Placement`], optional): the placements that
            describes how to place the tensor on DeviceMesh, must have the same
            number of elements as `device_mesh.ndim`. If not specified, we will
            by default replicate the tensor across the `device_mesh` from the
            first rank of each dimension of the `device_mesh`.

    Returns:
        A :class:`DTensor` or `XLAShardedTensor` object.

    Note:
        When initializing the DeviceMesh with the `xla` device_type, `distribute_tensor`
        returns `XLAShardedTensor` instead. See [link](https://github.com/pytorch/pytorch/issues/92909)
        for more details. The XLA integration is experimental and subject to change.
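
    Example (illustrative sketch; assumes 4 ranks with an initialized default
    process group and one GPU per rank)::

        >>> # xdoctest: +SKIP("requires a multi-rank distributed setup")
        >>> import torch
        >>> from torch.distributed._tensor import DeviceMesh, Shard, distribute_tensor
        >>> mesh = DeviceMesh("cuda", list(range(4)))
        >>> big = torch.randn(16, 8)  # same full tensor on every rank
        >>> dt = distribute_tensor(big, mesh, [Shard(0)])  # each rank keeps a (4, 8) shard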
    ztorch.dtensor.distribute_tensorxlar   )xla_distribute_tensorDTo use DTensor API with xla, you must install the torch_xla package!NzY`distribute_tensor` should be used to distribute leaf tensors! but found non-leaf tensor!c                 S   s   g | ]
}t  qS r#   rp   rq   r#   r#   r$   rd   `  s     z%distribute_tensor.<locals>.<listcomp>zW`placements` must have the same length as `device_mesh.ndim`! Found placements length: z, and device_mesh.ndim: r@   z-Cannot distribute a DTensor with device mesh z to a different device mesh z,Cannot distribute a DTensor with placements z to a different placements z-. do you want to call `redistribute` instead?z8Trying to distribute tensor with unsupported placements z on device mesh dimension !z(distributing a tensor should not be Noner(   )r0   r1   r-   r.   )/r:   _C_log_api_usage_oncer   rr   rs   torch_xla.distributed.spmdr   ImportErrorrandomZ_rng_trackerr   r   Zis_leafrD   rQ   rt   ru   rv   rw   rx   len
ValueErrorr_   r   r=   r1   r2   detachry   rE   rz   r   r   r{   Z_shard_tensorrF   r   Z_replicate_tensorrU   r   r   sizer*   r+   Zrequires_grad_r/   )r   r=   r1   rs   r   emsgr"   rH   rI   rO   r#   r#   r$   r     s    "  





	
)moduler=   partition_fninput_fn	output_fnr?   c              
      s  t jd  pt   j}|dkrvzddlm} ||  |W S  tk
rt } zd}t||W 5 d}~X Y nX t	j
tdddd	}	|dkr|  D ]\}
}|	|  qn(|  D ]\}
}||
|  |	|  qdk	rRttj}|d
kr"tjdtd
d |  fdd n0|dkrB|  fdd ntd| ddk	rttj}|d
krtjdtd
d |  fdd n0|dkr|  fdd ntd| d| S )a9  
    This function exposes three functions to control the tensors inside the module:
    1. To perform sharding on the module before runtime execution by specifying the
        ``partition_fn`` (i.e. allow user to convert Module parameters to :class:`DTensor`
        parameters according to the `partition_fn` specified).
    2. To control the inputs or outputs of the module during runtime execution by
        specifying the ``input_fn`` and ``output_fn``. (i.e. convert the input to
        :class:`DTensor`, convert the output back to torch.Tensor)

    Args:
        module (:class:`nn.Module`): user module to be partitioned.
        device_mesh (:class:`DeviceMesh`): the device mesh to place the module.
        partition_fn (Callable): the function to partition parameters (i.e. shard certain
            parameters across the `device_mesh`). If `partition_fn` is not specified,
            by default we replicate all module parameters of `module` across the mesh.
        input_fn (Callable): specify the input distribution, i.e. could control how the
            input of the module is sharded. `input_fn` will be installed as a module
            `forward_pre_hook` (pre forward hook).
        output_fn (Callable): specify the output distribution, i.e. could control how the
            output is sharded, or convert it back to torch.Tensor. output_fn will be
            installed as a module `forward_hook` (post forward hook).

    Returns:
        A module that contains parameters/buffers that are all `DTensor`s.

    Note:
        When initializing the DeviceMesh with the `xla` device_type, `distribute_module`
        returns nn.Module with PyTorch/XLA SPMD annotated parameters. See [link](https://github.com/pytorch/pytorch/issues/92909)
        for more details. The XLA integration is experimental and subject to change.
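
    Example (illustrative sketch; assumes 4 ranks with an initialized default
    process group; ``MyModel`` is a placeholder for any ``nn.Module``)::

        >>> # xdoctest: +SKIP("requires a multi-rank distributed setup")
        >>> import torch.nn as nn
        >>> from torch.distributed._tensor import (
        ...     DeviceMesh, Shard, distribute_module, distribute_tensor
        ... )
        >>> mesh = DeviceMesh("cuda", list(range(4)))
        >>> def shard_linear_params(name, submod, mesh):
        ...     # shard nn.Linear weights on the output dim; everything else is left
        ...     # to the default replication performed by distribute_module
        ...     if isinstance(submod, nn.Linear):
        ...         weight = distribute_tensor(submod.weight, mesh, [Shard(0)])
        ...         submod.register_parameter("weight", nn.Parameter(weight))
        >>> sharded = distribute_module(MyModel(), mesh, partition_fn=shard_linear_params)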
    ztorch.dtensor.distribute_moduler   r   )xla_distribute_moduler   N)mr0   r?   c              
   S   s   t  g|j }| j D ]6\}}|d k	rt|ts| |tt	|j
|| q| j D ],\}}|d k	rZt|tsZt	|||| j|< qZd S r   )r   rx   _parametersitemsr_   r   Zregister_parameternn	Parameterr   data_buffers)r   r0   Zfull_replicatekeyparambufferr#   r#   r$   replicate_module_params_buffers  s    z:distribute_module.<locals>.replicate_module_params_buffers   zDeprecating input_fn that takes two arguments (inputs, device_mesh), please use input_fn that takes in (module, inputs, device_mesh) instead!)
stacklevelc                    s
   | S r   r#   )r3   inputsr=   r   r#   r$   <lambda>      z#distribute_module.<locals>.<lambda>   c                    s   | | S r   r#   )modr   r   r#   r$   r   	  r   z-input_fn should take in 3 arguments, but got z arguments!zDeprecating output_fn that takes two arguments (inputs, device_mesh), please use output_fn that takes in (module, inputs, device_mesh) instead!c                    s
   | S r   r#   r   r   outputsr=   r   r#   r$   r     r   c                    s   | | S r   r#   r   r   r#   r$   r     r   z.output_fn should take in 3 arguments, but got )r:   r   r   r   rr   rs   r   r   r   r   Moduler   Znamed_modulesr   inspect	signature
parametersrS   rT   FutureWarningZregister_forward_pre_hookr   Zregister_forward_hook)r   r=   r   r   r   rs   r   r   r   r   nameZsubmodnum_argsr#   )r=   r   r   r$   r     st    &    







)NN)NNNN)3r   rS   typingr   r   r   r   r   r   r:   Z#torch.distributed._tensor._dispatchdistributedZ_tensorZ	_dispatchr   Z torch.distributed._tensor.randomr   Ztorch.nnr   Z+torch.distributed._tensor._collective_utilsr   Z'torch.distributed._tensor._redistributer	   r
   Z torch.distributed._tensor._utilsr   Z)torch.distributed._tensor.placement_typesr   r   r   r   r   r   r   r   Ztorch.distributed.device_meshr   r   __all__ZopsZatenZautogradFunctionr   r<   r;   r   r   r   strr   r#   r#   r#   r$   <module>   sP     
0]  Q  
     