U
    yh>                     @   sF   d dl Z d dlZd dlm  m  m  mZ G dd dejjZ	dS )    Nc                   @   sj   e Zd ZdZejjejdddZ	e
dd Zdd Zd	d
 Zdd Zdd ZdedddZdd ZdS )PostLocalSGDOptimizera  
    Wraps an arbitrary :class:`torch.optim.Optimizer` and runs `post-local SGD <https://arxiv.org/abs/1808.07217>`_,
    This optimizer runs local optimizer at every step.
    After the warm-up stage, it averages parameters periodically afer the local optimizer is applied.

    Args:
        optim: The local optimizer.
        averager: A model averager instance to run post-localSGD algorithm.

    Example::

        >>> # xdoctest: +SKIP("undefined variables")
        >>> import torch
        >>> import torch.distributed as dist
        >>> import torch.distributed.algorithms.model_averaging.averagers as averagers
        >>> import torch.nn as nn
        >>> from torch.distributed.optim import PostLocalSGDOptimizer
        >>> from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
        >>>   PostLocalSGDState,
        >>>   post_localSGD_hook,
        >>> )
        >>>
        >>> model = nn.parallel.DistributedDataParallel(
        >>>    module, device_ids=[rank], output_device=rank
        >>> )
        >>>
        >>> # Register a post-localSGD communication hook.
        >>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
        >>> model.register_comm_hook(state, post_localSGD_hook)
        >>>
        >>> # Create a post-localSGD optimizer that wraps a local optimizer.
        >>> # Note that ``warmup_steps`` used in ``PostLocalSGDOptimizer`` must be the same as
        >>> # ``start_localSGD_iter`` used in ``PostLocalSGDState``.
        >>> local_optim = torch.optim.SGD(params=model.parameters(), lr=0.01)
        >>> opt = PostLocalSGDOptimizer(
        >>>     optim=local_optim,
        >>>     averager=averagers.PeriodicModelAverager(period=4, warmup_steps=100)
        >>> )
        >>>
        >>> # In the first 100 steps, DDP runs global gradient averaging at every step.
        >>> # After 100 steps, DDP runs gradient averaging within each subgroup (intra-node by default),
        >>> # and post-localSGD optimizer runs global model averaging every 4 steps after applying the local optimizer.
        >>> for step in range(0, 200):
        >>>    opt.zero_grad()
        >>>    loss = loss_fn(output, labels)
        >>>    loss.backward()
        >>>    opt.step()
    )optimaveragerc                 C   s   || _ | j j| _|| _d S N)r   param_groupsr   )selfr   r    r   a/var/www/html/venv/lib/python3.8/site-packages/torch/distributed/optim/post_localSGD_optimizer.py__init__:   s    
zPostLocalSGDOptimizer.__init__c                 C   s   | j jS r   )r   stater   r   r   r	   r   ?   s    zPostLocalSGDOptimizer.statec                 C   s
   | j  S r   )r   __repr__r   r   r   r	   r   C   s    zPostLocalSGDOptimizer.__repr__c                 C   s   | j  }| jj|d< |S )z
        This is the same as :class:`torch.optim.Optimizer` :meth:`state_dict`,
        but adds an extra entry to record model averager's step to the checkpoint
        to ensure reload does not cause unnecessary warm up again.
        step)r   
state_dictr   r   )r   Zoptim_state_dictr   r   r	   r   F   s    
z PostLocalSGDOptimizer.state_dictc                 C   s8   | j | d|kr"|d | j_ntd d| j_dS )aW  
        This is the same as :class:`torch.optim.Optimizer` :meth:`load_state_dict`,
        but also restores model averager's step value to the one
        saved in the provided ``state_dict``.

        If there is no ``"step"`` entry in ``state_dict``,
        it will raise a warning and initialize the model averager's step to 0.
        r   z]Loaded state dict does not contain a step counter for an averager. Setting step counter to 0.r   N)r   load_state_dictr   r   warningswarn)r   r   r   r   r	   r   P   s    	z%PostLocalSGDOptimizer.load_state_dictc                 C   s   | j   | jj| jd dS )zI
        Performs a single optimization step (parameter update).
        )paramsN)r   r   r   Zaverage_parametersr   r   r   r   r	   r   c   s    
zPostLocalSGDOptimizer.stepTset_to_nonec                 C   s   | j j|d d S )Nr   )r   	zero_grad)r   r   r   r   r	   r   j   s    zPostLocalSGDOptimizer.zero_gradc                 C   s   | j | d S r   )r   add_param_group)r   Zparam_groupr   r   r	   r   m   s    z%PostLocalSGDOptimizer.add_param_groupN)T)__name__
__module____qualname____doc__torchr   	Optimizer	averagersZModelAveragerr
   propertyr   r   r   r   r   boolr   r   r   r   r   r	   r      s   1

r   )
r   r   Z6torch.distributed.algorithms.model_averaging.averagersdistributedZ
algorithmsZmodel_averagingr   r   r   r   r   r   r   r	   <module>   s   