U
    MhPO                     @   s  d dl mZmZ d dlZd dlmZ d dlmZ ddlmZm	Z	m
Z
mZmZmZmZmZmZmZ ddgZG d	d deZd
de
 de de	 d e_dee ee ee ee ee ee ee eee eeeeeeedddZdd Zee ee ee ee ee ee eeeeeeeedddZee ee ee ee ee ee eeeeeeeedddZee ee ee ee ee ee eeeeeeeeddddZdS )    )ListOptionalN)Tensor)$_get_fused_kernels_supported_devices   )
_default_to_fused_or_foreach_differentiable_doc_foreach_doc_get_scalar_dtype
_get_value_maximize_doc_use_grad_for_differentiable_view_as_real	OptimizerParamsTAdagradadagradc                       sr   e Zd Zdddddeeeeeeee eeee d
 fdd	Z fd
dZdd Z	dd Z
edddZ  ZS )r   {Gz?r   绽|=NF)maximizedifferentiablefused)
paramslrlr_decayweight_decayinitial_accumulator_valueepsforeachr   r   r   c                   s  d|kst d| d|ks,t d| d|ksBt d| d|ksXt d| d|ksnt d| t||||||||	|
d	}t || |
r|	rtdd	| _t   d
 t fdd| j	D std  d|rtd| j	D ]}|d D ]z}| j
| }|d r6tjdt|d d|jdntjdt d|d< t|r`t||n|}tj||tjd|d< qqd S )Ng        zInvalid learning rate: zInvalid lr_decay value: zInvalid weight_decay value: z)Invalid initial_accumulator_value value: zInvalid epsilon value: )	r   r   r   r   r   r   r   r   r   z)`fused` does not support `differentiable`Tcudac                 3   s2   | ]*}|d  D ]}|j j ko&t|V  qqdS )r   N)devicetypetorchZis_floating_point).0ZpgpZfused_supported_devices E/var/www/html/venv/lib/python3.8/site-packages/torch/optim/adagrad.py	<genexpr>G   s   
 z#Adagrad.__init__.<locals>.<genexpr>zX`fused=True` requires all the params to be floating point Tensors of supported devices: .z0`fused` and `foreach` cannot be `True` together.r   r   r&   Zis_fused)dtyper    r+   step)Zmemory_formatsum)
ValueErrordictsuper__init__RuntimeErrorZ_step_supports_amp_scalingr   removeallparam_groupsstater"   Zzerosr
   r    tensor
is_complexcomplexZ	full_likeZpreserve_format)selfr   r   r   r   r   r   r   r   r   r   defaultsgroupr$   r7   Z
init_value	__class__r%   r'   r2      st    



  zAdagrad.__init__c                    s   t  | d }| jD ]4}|dd  |dd |dd |dd }qt| j }t|dkovt	|d d }|s|D ]$}tj
t|d t|dd	|d< qd S )
Nr   r   Fr   r   r   r-   r*   r,   )r1   __setstate__r6   
setdefaultlistr7   valueslenr"   Z	is_tensorr8   floatr
   )r;   r7   r   r=   Zstate_valuesZstep_is_tensorsr>   r&   r'   r@   h   s"    


 zAdagrad.__setstate__c                 C   s4   | j D ](}|d D ]}| j| }|d   qqd S )Nr   r.   )r6   r7   Zshare_memory_)r;   r=   r$   r7   r&   r&   r'   share_memory}   s    

zAdagrad.share_memoryc           
      C   s~   d\}}|d D ]d}|j d k	r||j jO }|t|O }|| ||j  | j| }	||	d  ||	d  q||fS )N)FFr   r.   r-   )grad	is_sparser"   r9   appendr7   )
r;   r=   params_with_gradgrads
state_sumsstate_stepshas_sparse_gradhas_complexr$   r7   r&   r&   r'   _init_group   s    


zAdagrad._init_groupc           
      C   s   d}|dk	r&t   | }W 5 Q R X | jD ]}g }g }g }g }| |||||\}}	t|||||d |d |d |d ||d |d |d |	|d	 t| d
dt| ddd q,|S )zPerform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   r   r   r   r   r   r   
grad_scale	found_inf)r   r   r   r   rO   r   r   r   rP   r   rR   rS   )r"   Zenable_gradr6   rQ   r   getattr)
r;   closureZlossr=   rK   rL   rM   rN   rO   rP   r&   r&   r'   r-      sF    

    

zAdagrad.step)r   r   r   r   r   N)N)__name__
__module____qualname__r   rE   r   boolr2   r@   rG   rQ   r   r-   __classcell__r&   r&   r>   r'   r      s6         
Pa[  Implements Adagrad algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)}, \: f(\theta)
                \text{ (objective)}, \: \lambda \text{ (weight decay)},                          \\
            &\hspace{12mm}    \tau \text{ (initial accumulator value)}, \: \eta\text{ (lr decay)}\\
            &\textbf{initialize} :  state\_sum_0 \leftarrow \tau                          \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm} \tilde{\gamma}    \leftarrow \gamma / (1 +(t-1) \eta)                  \\
            &\hspace{5mm} \textbf{if} \: \lambda \neq 0                                          \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm}state\_sum_t  \leftarrow  state\_sum_{t-1} + g^2_t                      \\
            &\hspace{5mm}\theta_t \leftarrow
                \theta_{t-1}- \tilde{\gamma} \frac{g_t}{\sqrt{state\_sum_t}+\epsilon}            \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adaptive Subgradient Methods for Online Learning
    and Stochastic Optimization`_.
    aH  
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): initial value of the
            sum of squares of gradients (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)
        z	
        a  
        fused (bool, optional): whether the fused implementation (CPU only) is used.
            Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16`
            are supported. (default: None). Please note that the fused implementations does not
            support sparse or complex gradients.
    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html

    F)r   rL   rM   rN   r   rR   rS   rO   r   r   rP   r   r   r   r   r   c                C   s   t dd |D std|dkr<|dkr<t| |	dd\}}|dkrHd}|dkrTd}|rjtj rjtd|rtj rtd|rtj st}n|rtj st}nt}|| ||||||||||	|
||d	 dS )
ztFunctional API that performs Adagrad algorithm computation.

    See :class:`~torch.optim.Adagrad` for details.
    c                 s   s   | ]}t |tjV  qd S N)
isinstancer"   r   )r#   tr&   r&   r'   r(     s     zadagrad.<locals>.<genexpr>zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsNF)Z	use_fusedz6torch.jit.script not supported with foreach optimizersz4torch.jit.script not supported with fused optimizers
r   r   r   r   rO   r   r   rP   rR   rS   )	r5   r3   r   r"   ZjitZis_scripting_fused_adagrad_multi_tensor_adagrad_single_tensor_adagrad)r   rL   rM   rN   r   rR   rS   rO   r   r   rP   r   r   r   r   r   _funcr&   r&   r'   r      sN      
c                 C   s   |   }t|||S r[   )sizer"   Zsparse_coo_tensor)rH   grad_indicesrC   rd   r&   r&   r'   _make_sparse=  s    rf   )r   rL   rM   rN   rR   rS   r   r   r   r   rO   r   r   rP   c             	   C   s  |d kr|d kst t| |||D ]\\}}}}|d7 }t|}|sH|n| }|dkrr|jrdtd|j||d}|d|d |   }|jr| }| }| }|	t
|||d ||}|  	|	}|j	t
|||| | d q"t|}|r$t|}t|}t|}|j||dd |rH| |	 }n| 	|	}|j||| d |r"t|}t|}q"d S )Nr   r   z;weight_decay option is not compatible with sparse gradientsalpha   value)AssertionErrorzipr   rI   r3   addZcoalesceZ_indicesZ_valuesZadd_rf   powZsparse_maskZsqrt_r"   r9   Zview_as_realZaddcmul_sqrtZaddcdiv_Zview_as_complex)r   rL   rM   rN   rR   rS   r   r   r   r   rO   r   r   rP   paramrH   Z	state_sumZstep_tr-   Zclrre   Zgrad_valuesstdZ
std_valuesr9   r&   r&   r'   ra   B  sH    
 




ra   c                   s  |rt d|d kr|d ks t t| dkr0d S t| |||g}| D ]J\\}}}}}|
optdd |D }|rt|||| ||	d|||||d qJ|rt||| |rt	|}|d j
rtj|tjddd	dd
 nt|d |dkr|rtj|||d
 ntj|||d
} fdd|D }tj|||dd t|}t||	 |dksj|r|t|| |}nt||}t||| qJd S )Nz#_foreach ops don't support autogradr   c                 s   s   | ]}|j V  qd S r[   )rI   )r#   rH   r&   r&   r'   r(     s    z(_multi_tensor_adagrad.<locals>.<genexpr>Tr^   g      ?cpu)r    rg   r   c                    s&   g | ]}  d t |d     qS )r   )r   )r#   r-   r   r   r&   r'   
<listcomp>  s    z)_multi_tensor_adagrad.<locals>.<listcomp>rj   )rl   rD   r   "_group_tensors_by_device_and_dtyperC   anyra   r   r"   Z_foreach_negZis_cpu_foreach_add_r8   Z_foreach_addZ_foreach_addcmul_Z_foreach_sqrtZ_foreach_mul_Z_foreach_mulZ_foreach_addcdiv_)r   rL   rM   rN   rR   rS   r   r   r   r   rO   r   r   rP   Zgrouped_tensorlistsdevice_paramsdevice_gradsdevice_state_sumsdevice_state_stepsrb   Zdevice_has_sparse_gradZ	minus_clrrr   	numeratorr&   rt   r'   r`     s    



  
  
r`   )r   rL   rM   rN   rR   rS   r   r   r   r   rO   r   r   rP   returnc                C   s>  | sd S |
s|rt d|r$t d|d k	r6|j|ind }|d k	rL|j|ind }t| |||g}| D ]\\}}\\}}}}}d\}}|d k	r|d k	r||kr|j|dd||< || }|d k	r|d k	r||kr|j|dd||< || }t|d tj||||||||	|||d |d k	rjt	||gt
|  qjd S )Nz5`fused` does not support sparse grad or complex paramz<adagrad with fused=True does not support differentiable=True)NNT)Znon_blockingr   )r   r   r   r   r   rR   rS   )r3   r    r   rv   itemstor"   rx   Z_fused_adagrad_Z_foreach_sub_rD   )r   rL   rM   rN   rR   rS   r   r   r   r   rO   r   r   rP   Zgrad_scale_dictZfound_inf_dictZgrouped_tensorsr    rb   ry   rz   r{   r|   Zdevice_grad_scaleZdevice_found_infr&   r&   r'   r_     sd    
 r_   )NNNFNFF)typingr   r   r"   r   Ztorch.utils._foreach_utilsr   Z	optimizerr   r   r	   r
   r   r   r   r   r   r   __all__r   __doc__rY   rE   r   rf   ra   r`   r_   r&   r&   r&   r'   <module>   s   0 *9       J>g