import logging
from collections import abc, defaultdict
from typing import (
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    overload,
    Sequence,
    Tuple,
    Union,
)

import torch
import torch.distributed as dist
from torch.amp.grad_scaler import _MultiDeviceReplicator, GradScaler, OptState
from torch.distributed.distributed_c10d import ProcessGroup

logger = logging.getLogger(__name__)


def _refresh_per_optimizer_state() -> Dict[str, Any]:
    return {"stage": OptState.READY, "found_inf_per_device": {}}


def _is_supported_device(tensor: torch.Tensor) -> bool:
    return tensor.is_cuda or tensor.device.type in (
        "xla",
        "cpu",
        "hpu",
        torch._C._get_privateuse1_backend_name(),
    )


class _GeneralMultiDeviceReplicator(_MultiDeviceReplicator):
    """
    Lazily serves tensor to request device. This class extends
    _MultiDeviceReplicator to allow support for "cpu" as a device.
    """

    def __init__(self, master_tensor: torch.Tensor) -> None:
        assert _is_supported_device(master_tensor)
        self.master = master_tensor
        self._per_device_tensors: Dict[torch.device, torch.Tensor] = {}


class ShardedGradScaler(GradScaler):
    """
    ShardedGradScaler helps perform gradient scaling in a shard aware manner. It extends
    functionality from GradScaler:
    * Supports PyTorch DDP and FSDP implementations
    * Supports CPU offloaded tensors (as used in fully sharded data parallel [FSDP])
    * Supports the custom Mixed Precision loss dtype (fp16, bf16) that FSDP returns
    * Syncs inf/NaN for scaled gradient tensors on any torch.device (where tensors are placed)
      across nodes

    Example::

        # Creates a ShardedGradScaler once at the beginning of training.
        scaler = ShardedGradScaler()

        for epoch in epochs:
            for input, target in data:
                optimizer.zero_grad()
                output = model(input)
                loss = loss_fn(output, target)

                # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
                scaler.scale(loss).backward()

                # scaler.step() first unscales gradients of the optimizer's params.
                # If gradients don't contain infs/NaNs, optimizer.step() is then called,
                # otherwise, optimizer.step() is skipped.
                scaler.step(optimizer)

                # Updates the scale for next iteration.
                scaler.update()

    See :class:`GradScaler` for explanation of scaling/unscaling and more use cases.

    Args:
        init_scale (float, optional, default=2.**16):  Initial scale factor.
        growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
        backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
            :meth:`update` if inf/NaN gradients occur in an iteration.
        growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients
            that must occur for the scale to be multiplied by ``growth_factor``.
        enabled (bool, optional):  If ``False``, disables gradient scaling. :meth:`step` simply
            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
            Default: ``True``
        process_group (ProcessGroup, optional, default=torch.distributed.group.WORLD):
            process group for sharding
    """

    def __init__(
        self,
        device: str = "cuda",
        init_scale: float = 2.0**16,
        backoff_factor: float = 0.5,
        growth_factor: float = 2.0,
        growth_interval: int = 2000,
        enabled: bool = True,
        process_group: Optional[ProcessGroup] = dist.group.WORLD,
    ) -> None:
        super().__init__(
            device,
            init_scale=init_scale,
            backoff_factor=backoff_factor,
            growth_factor=growth_factor,
            growth_interval=growth_interval,
            enabled=enabled,
        )
        if self._enabled:
            self.process_group = process_group
            self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)

    @overload
    def scale(self, outputs: torch.Tensor) -> torch.Tensor:
        ...

    @overload
    def scale(self, outputs: List[torch.Tensor]) -> List[torch.Tensor]:
        ...

    @overload
    def scale(self, outputs: Tuple[torch.Tensor, ...]) -> Tuple[torch.Tensor, ...]:
        ...

    @overload
    def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]:
        ...

    def scale(
        self, outputs: Union[torch.Tensor, Iterable[torch.Tensor]]
    ) -> Union[torch.Tensor, Iterable[torch.Tensor]]:
        if not self._enabled:
            return outputs

        if isinstance(outputs, torch.Tensor):
            assert _is_supported_device(outputs)
            if self._scale is None:
                self._lazy_init_scale_growth_tracker(outputs.device)
            assert self._scale is not None
            scaled_output = outputs * self._scale.to(
                device=outputs.device, non_blocking=True
            )
            # Return the scaled loss in the dtype of ``outputs`` so that an fp16/bf16
            # mixed-precision loss from FSDP keeps its dtype after scaling.
            return scaled_output.type(outputs.dtype)

        stash: List[_GeneralMultiDeviceReplicator] = []

        def apply_scale(val: Union[torch.Tensor, Iterable[torch.Tensor]]):
            if isinstance(val, torch.Tensor):
                assert _is_supported_device(val)
                if len(stash) == 0:
                    if self._scale is None:
                        self._lazy_init_scale_growth_tracker(val.device)
                    assert self._scale is not None
                    stash.append(_GeneralMultiDeviceReplicator(self._scale))
                scaled_val = val * stash[0].get(val.device)
                # As above, preserve the original dtype of each scaled tensor.
                return scaled_val.type(val.dtype)
            if isinstance(val, abc.Iterable):
                iterator = map(apply_scale, val)
                if isinstance(val, (list, tuple)):
                    return type(val)(iterator)
                return iterator
            raise ValueError("outputs must be a Tensor or an iterable of Tensors")

        return apply_scale(outputs)
    def _foreach_non_finite_check_and_unscale_cpu_(
        self,
        grads: Sequence[torch.Tensor],
        found_inf: torch.Tensor,
        inv_scale: torch.Tensor,
    ) -> None:
        if len(grads) == 0:
            return
        assert inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."
        assert found_inf.numel() == 1, "found_inf must be a 1-element tensor."

        for grad in grads:
            if grad.device.type != "cpu":
                logger.error(
                    "tensor device is %s but was expected to be ``cpu``",
                    grad.device,
                )
                raise ValueError(
                    "Gradients were found on a non-CPU device when"
                    " expected to be on CPU."
                )
            if (
                torch.isinf(grad).any().item() is True
                or torch.isnan(grad).any().item() is True
            ):
                found_inf.data = torch.tensor([1.0])
                break
            else:
                grad.data *= inv_scale.item()
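    # NOTE: the CPU helper above is a pure-Python counterpart of the fused
    # ``torch._amp_foreach_non_finite_check_and_unscale_`` kernel that
    # ``_unscale_grads_`` below uses for non-CPU gradients: it raises the
    # ``found_inf`` flag on any inf/NaN and multiplies finite grads by
    # ``inv_scale``.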
    def _unscale_grads_(
        self,
        optimizer: torch.optim.Optimizer,
        inv_scale: torch.Tensor,
        found_inf: torch.Tensor,
        allow_fp16: bool = True,
    ) -> Dict[torch.device, torch.Tensor]:
        per_device_inv_scale = _GeneralMultiDeviceReplicator(inv_scale)
        per_device_found_inf = _GeneralMultiDeviceReplicator(found_inf)

        # To set up _amp_foreach_non_finite_check_and_unscale_, bucket the grads
        # by device and dtype so each bucket can be handled in a single call.
        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))
        with torch.no_grad():
            for group in optimizer.param_groups:
                for param in group["params"]:
                    if param.grad is None:
                        continue
                    if (not allow_fp16) and param.grad.dtype == torch.float16:
                        raise ValueError("Attempting to unscale FP16 gradients.")
                    if param.grad.is_sparse:
                        # Coalesce sparse grads so duplicate indices are summed before
                        # the inf/NaN check; coalesce() does not support float16, so
                        # round-trip through float32 in that case.
                        if param.grad.dtype is torch.float16:
                            param_grad_fp32 = param.grad.type(torch.float32).coalesce()
                            param.grad = param_grad_fp32.type(torch.float16)
                        to_unscale = param.grad._values()
                    else:
                        to_unscale = param.grad

                    per_device_and_dtype_grads[to_unscale.device][
                        to_unscale.dtype
                    ].append(to_unscale)

            for device, per_dtype_grads in per_device_and_dtype_grads.items():
                for grads in per_dtype_grads.values():
                    if grads[0].device.type == "cpu":
                        self._foreach_non_finite_check_and_unscale_cpu_(
                            grads,
                            per_device_found_inf.get(device),
                            per_device_inv_scale.get(device),
                        )
                    else:
                        torch._amp_foreach_non_finite_check_and_unscale_(
                            grads,
                            per_device_found_inf.get(device),
                            per_device_inv_scale.get(device),
                        )
        # Some ranks may hold no (non-zero sized) parameter shards; make sure a
        # found_inf tensor still exists for the scale's device in that case.
        if not per_device_found_inf._per_device_tensors:
            assert self._scale is not None
            per_device_found_inf.get(self._scale.device)
        return per_device_found_inf._per_device_tensors
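    # ``unscale_`` below applies the inverse scale via ``_unscale_grads_`` and then
    # all-reduces each per-device ``found_inf`` flag across the ranks of
    # ``self.process_group`` so every rank agrees on whether to skip the step;
    # flags that live on CPU are first moved to the communication device.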
    def unscale_(self, optimizer: torch.optim.Optimizer) -> None:
        if not self._enabled:
            return

        self._check_scale_growth_tracker("unscale_")

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.UNSCALED:
            raise RuntimeError(
                "unscale_() has already been called on this optimizer since the last update()."
            )
        elif optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError("unscale_() is being called after step().")

        # Compute the reciprocal in FP64 to avoid FP32 division precision issues.
        assert self._scale is not None
        inv_scale = self._scale.double().reciprocal().float()
        found_inf = torch.full(
            (1,), 0.0, dtype=torch.float32, device=self._scale.device
        )

        optimizer_state["found_inf_per_device"] = self._unscale_grads_(
            optimizer, inv_scale, found_inf, True
        )
        optimizer_state["stage"] = OptState.UNSCALED

        # Synchronize the detected inf across the ranks.
        optimizer_state = self._per_optimizer_states[id(optimizer)]
        works = []
        found_inf_on_cpus = []
        found_inf_on_devices = []

        for found_inf in optimizer_state["found_inf_per_device"].values():
            if self._device != "cpu" and found_inf.device.type == "cpu":
                found_inf_on_cpus.append(found_inf)
                found_inf_on_device = found_inf.to(self._device)
                found_inf_on_devices.append(found_inf_on_device)
                works.append(
                    dist.all_reduce(
                        found_inf_on_device, async_op=True, group=self.process_group
                    )
                )
            else:
                works.append(
                    dist.all_reduce(found_inf, async_op=True, group=self.process_group)
                )
        for work in works:
            work.wait()
        if found_inf_on_cpus:
            torch._foreach_copy_(found_inf_on_cpus, found_inf_on_devices)

    def _amp_update_scale_cpu_(self, found_inf: torch.Tensor) -> None:
        """
        If found_inf is 1.0 (True), then scale is multiplied by backoff_factor and growth_tracker is set to zero.
        Otherwise, scale is multiplied by the growth factor when the growth interval is reached.
        """
        assert self._scale is not None and self._growth_tracker is not None

        if found_inf.item() >= 1.0:
            self._scale *= self._backoff_factor
            self._growth_tracker.fill_(0)
        else:
            successful = self._growth_tracker + 1
            if successful == self._growth_interval:
                self._scale *= self._growth_factor
                self._growth_tracker.fill_(0)
            else:
                self._growth_tracker = successful

    def update(self, new_scale: Optional[Union[float, torch.Tensor]] = None) -> None:
        """
        Updates the scale factor.
        If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
        to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
        the scale is multiplied by ``growth_factor`` to increase it.
        Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
        used directly, it's used to fill GradScaler's internal scale tensor. So if
        ``new_scale`` was a tensor, later in-place changes to that tensor will not further
        affect the scale GradScaler uses internally.)
        Args:
            new_scale (float or :class:`torch.Tensor`, optional, default=None):  New scale factor.
        .. warning::
            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
            been invoked for all optimizers used this iteration.
        Nupdateznew_scale should be a float or a 1-element torch.cuda.FloatTensor or                     torch.FloatTensor with requires_grad=False.rN   Fc                    s.   g | ]&}|d    D ]}|j jddqqS )r   Tr9   )ra   rJ   r   ).0staterL   r=   r   r   
<listcomp>p  s    z,ShardedGradScaler.update.<locals>.<listcomp>r   z,No inf checks were recorded prior to update.r   )r2   rf   r;   rj   r=   rq   r   r   rl   r    rP   Zrequires_gradZcopy_r3   ra   r<   rangert   r   Z_amp_update_scale_ro   rs   rp   rr   r   r   )r"   ru   ro   reasonZ
found_infsZfound_inf_combinedir   ry   r   rv   L  s<    


zShardedGradScaler.update)T)N)!r$   r%   r&   r'   rm   rb   ZWORLDstrrj   intboolr   r   r#   r	   r   r(   r8   r   r   r   r   r
   rY   ZoptimZ	Optimizerr   r   rd   re   rt   rv   __classcell__r   r   r4   r   r)   '   sZ   2&-# ?4r)   )"loggingcollectionsr   r   typingr   r   r   r   r   r	   r
   r   r   r   Ztorch.distributeddistributedrm   Ztorch.amp.grad_scalerr   r   r   Z"torch.distributed.distributed_c10dr   	getLoggerr$   rQ   r~   r   r(   r   r   r   r)   r   r   r   r   <module>   s   ,
	
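# Illustrative sketch (not part of the original module): how ShardedGradScaler is
# typically combined with an FSDP-wrapped model under a float16 MixedPrecision policy.
# ``model``, ``loader`` and ``loss_fn`` are assumed to exist and the default process
# group is assumed to be initialized.
#
#     from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, MixedPrecision
#     from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
#
#     fsdp_model = FSDP(model, mixed_precision=MixedPrecision(param_dtype=torch.float16))
#     optimizer = torch.optim.SGD(fsdp_model.parameters(), lr=1e-3)
#     scaler = ShardedGradScaler()
#     for inputs, target in loader:
#         optimizer.zero_grad()
#         loss = loss_fn(fsdp_model(inputs), target)  # fp16 loss under MixedPrecision
#         scaler.scale(loss).backward()               # scale() preserves the loss dtype
#         scaler.unscale_(optimizer)                  # optional, e.g. before gradient clipping
#         scaler.step(optimizer)                      # skipped if any rank saw inf/NaN
#         scaler.update()                             # back off or grow the scale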