U
    Mhc                     @   s  d dl mZmZmZmZmZ d dlZd dlmZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZ ddgZG dd deZd	d
e de de	 de d	 e_ee ee ee ee ee ee eeeeeeeeeeedddZee ee ee ee ee ee eeeeeeeeeeedddZeeddee ee ee ee ee ee eee eeeeeeeeeedddZdS )    )castListOptionalTupleUnionN)Tensor   )_capturable_doc_default_to_fused_or_foreach_differentiable_doc_disable_dynamo_if_unsupported_dispatch_sqrt_foreach_doc!_get_capturable_supported_devices_get_scalar_dtype
_get_value_maximize_doc_stack_if_compiling_use_grad_for_differentiable_view_as_real	OptimizerParamsTNAdamnadamc                       sr   e Zd Zddddddeeeeef eeeeee eeed	 fd
dZ fddZ	dd Z
edddZ  ZS )r   Mb`?g?g+?:0yE>r   Mbp?FN)foreachmaximize
capturabledifferentiable)paramslrbetasepsweight_decaymomentum_decaydecoupled_weight_decayr   r   r    r!   c                   s   d|kst d| d|ks,t d| d|d   krDdk sXn t d|d  d|d   krpdk sn t d|d  d|kst d	| d|kst d
| t|||||||	||
|d
}t || d S )N        zInvalid learning rate: zInvalid epsilon value: r         ?z#Invalid beta parameter at index 0: r   z#Invalid beta parameter at index 1: zInvalid weight_decay value: zInvalid momentum_decay value: )
r#   r$   r%   r&   r'   r(   r   r   r    r!   )
ValueErrordictsuper__init__)selfr"   r#   r$   r%   r&   r'   r(   r   r   r    r!   defaults	__class__ C/var/www/html/venv/lib/python3.8/site-packages/torch/optim/nadam.pyr.      s2    zNAdam.__init__c                    s  t  | | jD ]}|dd |dd  |dd |dd |dd |d D ]}| j|g }t|dkrZt|d	 st	|d	 }|d rtj
|t |jd
ntj
|t d|d	< t|d sZ|d }|d rtj
|t |jd
ntj
|t d|d< qZqd S )Nr   Fr   r    r!   r(   r"   r   stepdtypedevicer7   
mu_product)r-   __setstate__param_groups
setdefaultstategetlentorchZ	is_tensorfloattensorr   r8   )r/   r>   grouppZp_stateZstep_valZmu_prod_valr1   r3   r4   r;   E   s<    
    zNAdam.__setstate__c                 C   s,  d}|d D ]}	|	j d k	r|t|	O }||	 |	j jrDtd||	j  | j|	 }
t|
dkr|d rtjdt	 |	j
dntjdt	 d	|
d
< |d rtjdt	 |	j
dntjdt	 d	|
d< tj|	tjd|
d< tj|	tjd|
d< ||
d  ||
d  ||
d  ||
d
  q|S )NFr"   z'NAdam does not support sparse gradientsr   r    r3   r6   r)   r9   r5   r*   r:   )Zmemory_formatexp_avg
exp_avg_sq)gradrA   
is_complexappendZ	is_sparseRuntimeErrorr>   r@   Zzerosr   r8   rC   ZonesZ
zeros_likeZpreserve_format)r/   rD   params_with_gradgradsexp_avgsexp_avg_sqsmu_productsstate_stepshas_complexrE   r>   r3   r3   r4   _init_groupc   s>    



 
 
zNAdam._init_groupc                 C   s   |    d}|dk	r.t  | }W 5 Q R X | jD ]}g }g }g }g }g }g }	ttttf |d \}
}| |||||||	}t||||||	|
||d |d |d |d |d |d |d	 |d
 |d |d q4|S )zPerforms a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr$   r#   r&   r'   r%   r   r(   r   r    r!   )beta1beta2r#   r&   r'   r%   r   r(   r   r    r!   rR   )	Z _cuda_graph_capture_health_checkrA   Zenable_gradr<   r   r   rB   rS   r   )r/   closureZlossrD   rL   rM   rN   rO   rP   rQ   rT   rU   rR   r3   r3   r4   r5      sV    


z
NAdam.step)r   r   r   r   r   F)N)__name__
__module____qualname__r   rB   r   boolr   r.   r;   rS   r   r5   __classcell__r3   r3   r1   r4   r      s8         

)2a  Implements NAdam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma_t \text{ (lr)}, \: \beta_1,\beta_2 \text{ (betas)},
                \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)}                   \\
            &\hspace{13mm} \: \lambda \text{ (weight decay)}, \:\psi \text{ (momentum decay)}    \\
            &\hspace{13mm} \: \textit{decoupled\_weight\_decay}, \:\textit{maximize}             \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                v_0 \leftarrow 0 \text{ ( second moment)}                                 \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}\textbf{if} \: \textit{maximize}:                                       \\
            &\hspace{10mm}g_t           \leftarrow   -\nabla_{\theta} f_t (\theta_{t-1})         \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})          \\
            &\hspace{5mm} \theta_t \leftarrow \theta_{t-1}                                       \\
            &\hspace{5mm} \textbf{if} \: \lambda \neq 0                                          \\
            &\hspace{10mm}\textbf{if} \: \textit{decoupled\_weight\_decay}                       \\
            &\hspace{15mm} \theta_t \leftarrow \theta_{t-1} - \gamma \lambda \theta_{t-1}                    \\
            &\hspace{10mm}\textbf{else}                                                          \\
            &\hspace{15mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm} \mu_t \leftarrow \beta_1 \big(1 - \frac{1}{2}  0.96^{t \psi} \big)     \\
            &\hspace{5mm} \mu_{t+1} \leftarrow \beta_1 \big(1 - \frac{1}{2} 0.96^{(t+1)\psi}\big)\\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow \mu_{t+1} m_t/(1-\prod_{i=1}^{t+1}\mu_i)\\[-1.ex]
            & \hspace{11mm} + (1-\mu_t) g_t /(1-\prod_{i=1}^{t} \mu_{i})                         \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\theta_t \leftarrow \theta_t - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Incorporating Nesterov Momentum into Adam`_.
    a  
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 2e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        momentum_decay (float, optional): momentum momentum_decay (default: 4e-3)
        decoupled_weight_decay (bool, optional): whether to use decoupled weight
            decay as in AdamW to obtain NAdamW (default: False)
        z	
        z

    .. _Incorporating Nesterov Momentum into Adam:
        https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101

    )r"   rM   rN   rO   rP   rQ   rT   rU   r#   r&   r'   r%   r(   r   r    r!   rR   c                C   sb  t | D ]R\}}|s|| n||  }|| }|| }|| }|| }t|rzt|}t|}t|}t|}tj s|rt }|jj|jj  kr|jjkrn n|jj|kst	d| d|d7 }|r|}nt
|}d||  }|	dkr&|r|d||	   n|j||	d}|ddd||
     }|ddd|d |
     }||9 }||d|  ||j||d| d	 || }|s|r||}|| }|| d|  d|   }|| | d|   }||| ||| qt
|| }|| |j||| d|  dt
|  d	 |j||| | d|  d	 qd S )
NzVIf capturable=True, params, mu_products and state_steps must be on supported devices: .r   r   alphar*         ?Q?)value)	enumeraterA   rI   Zview_as_real_utilsis_compilingr   r8   typeAssertionErrorr   Zmul_addZlerp_Zaddcmul_divsqrtZaddcdiv_Zadd_)r"   rM   rN   rO   rP   rQ   rT   rU   r#   r&   r'   r%   r(   r   r    r!   rR   iparamrH   rF   rG   r:   Zstep_tcapturable_supported_devicesr5   Zbias_correction2mumu_nextdenomZmu_product_nextr3   r3   r4   _single_tensor_nadam  sp    




 


	


    rp   c          "         s  t | dkrd S |rtdtj sb|rbtddtfddt| ||D sbtd dt	| |||||g}|
 D ]\\}}}}}}}|rt|||| |rt|}|d jrtj|tjd	d
dd	d nt|d |	dkr6|rt|d|	   n(|r&tj|||	d ntj|||	d}t||d   t| t|||d  t|}|r t|}td|}t|d t|d	 t|  t| td|}t|d t|d	 t|  ~t|}t|d	 t| t| n:fdd|D } fdd|D } fdd|D }t|| t|| t|| ~|r&t|d	 t| t|d	}t| t|| |}~t||}t| t|d	 t|| |} ~t||}!t|!| | t||!| qtfddt||D }tfddt||D } t|||| t||||  qd S )Nr   z#_foreach ops don't support autogradF)Zsupports_xlac                 3   sD   | ]<\}}}|j j|j j  ko*|j jkn  o:|j j kV  qd S N)r8   re   ).0rE   mpr5   )rl   r3   r4   	<genexpr>  s   $z&_multi_tensor_nadam.<locals>.<genexpr>zWIf capturable=True, params, mu_products, and state_steps must be on supported devices: r\   r*   cpu)r8   r]   r   r`   g      c                    s    g | ]}t d  t|  qS )r   )r   r   rr   r5   )rU   r3   r4   
<listcomp>  s   z'_multi_tensor_nadam.<locals>.<listcomp>c                    s(   g | ] } d ddt |     qS )r*   r_   r`   r   rv   rT   r'   r3   r4   rw     s   c                    s,   g | ]$} d ddt |d      qS )r*   r_   r`   r   rx   rv   ry   r3   r4   rw     s   c                    s0   g | ](\}}t  d |  d t |  d qS r*   rx   )rr   r:   rm   r#   r3   r4   rw      s   c                    s0   g | ](\}}t  | d t ||   d qS rz   rx   )rr   r:   rn   r|   r3   r4   rw   &  s   )r@   rf   rA   rc   rd   r   allzipr   Z"_group_tensors_by_device_and_dtypevaluesr   Z_foreach_negZis_cpuZ_foreach_add_rC   Z_foreach_mul_Z_foreach_addZ_foreach_lerp_Z_foreach_addcmul_Z_foreach_sqrtZ_foreach_mulZ_foreach_powZ_foreach_sub_Z_foreach_neg_Z_foreach_sqrt_Z_foreach_div_Z_foreach_subZ_foreach_addcdiv_r   )"r"   rM   rN   rO   rP   rQ   rT   rU   r#   r&   r'   r%   r(   r   r    r!   rR   Zgrouped_tensorsZgrouped_paramsZgrouped_gradsZgrouped_exp_avgsZgrouped_exp_avg_sqsZgrouped_mu_productsZgrouped_state_steps_Zexp_avg_sq_sqrtexponentZmusZmu_nextsZbias_correction_sqrtro   Zstep_size_gradsZstep_size_expavg	numeratorr3   )rT   rU   rl   r#   r'   r4   _multi_tensor_nadamp  s   




   

  
       





      r   )Zsingle_tensor_fnF)r"   rM   rN   rO   rP   rQ   r(   r   r    r!   rR   r   rT   rU   r#   r&   r'   r%   c                C   s   t dd |D stdt dd |D s4td|dkrNt| |	dd\}}|rdtj rdtd	|rxtj sxt}nt}|| |||||||||||||||	|
d
 dS )zpFunctional API that performs NAdam algorithm computation.

    See :class:`~torch.optim.NAdam` for details.
    c                 s   s   | ]}t |tjV  qd S rq   
isinstancerA   r   rr   tr3   r3   r4   rt   V  s     znadam.<locals>.<genexpr>zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsc                 s   s   | ]}t |tjV  qd S rq   r   r   r3   r3   r4   rt   [  s     zPAPI has changed, `mu_products` argument must contain a list of singleton tensorsNF)Z	use_fusedz6torch.jit.script not supported with foreach optimizers)rT   rU   r#   r&   r'   r   r(   r%   r    r!   rR   )r}   rK   r
   rA   ZjitZis_scriptingr   rp   )r"   rM   rN   rO   rP   rQ   r(   r   r    r!   rR   r   rT   rU   r#   r&   r'   r%   r   funcr3   r3   r4   r   9  sL      
)FNFFFF) typingr   r   r   r   r   rA   r   Z	optimizerr	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   __all__r   __doc__rB   rZ   rp   r   r   r3   r3   r3   r4   <module>   s   D 6'D_ J
      