U
    yh                     @   s  U d dl Z d dlZd dlZd dlZd dlmZmZmZ d dlm	Z	 d dl
mZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlm Z m!Z!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d d	l)m*Z*m+Z+m,Z-m.Z.m/Z/m0Z0m1Z1m2Z2 d d
l3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z: d dl;m<Z< dddddddddddddgZ=dZ>dZ?dZ@dZAeeB ZCee&eejDeEeFeBf ZGeeGeeG eeG eeBdf f ZHeeBeHf ZIeeI ZJeeBeeIeJf f ZKeL ZMee eNd< e jOd d! ZPeG d"d dZQeG d#d$ d$eQZRejSdd%diejTeBeUeUeCd'd(d)ZVG d*d+ d+ZWd,d- ZXddd.ejTeejYjZd/f eUeeejT  eeQ eRd0d1d2Z[eeBeHf eKeRdd3d4d5Z\eejTejYjZf eBed6d7d8Z]eeBef eReeBef d9d:d;Z^ejTeReeBeHf d<d=d>Z_ejTeeBeHf eRe7d?d@dAZ`ejYjZddBdCdDZaeKeeBeHf dEdFdGZbejYjZeeBeHf eReKdHdIdJZcejTeejYjZd/f eReKdKdLdMZdejTejYjZeKeReKdNdOdPZeejTeejYjZd/f eKeRddQdRdSZfddd.ejTeeejT  eeQ eeBeHf dTdUdZgddd.ejTeejYjZeejYjZ f eeejT  eeQ eKdVdWdZhddd.ejTeejYjZeejYjZ f eeejT  eeQ eeeBeHf eKf dVdXdZiejTeeejTeeBeHf f eeBeHf f eeBeHf dYdZd[Zjdd\ejTeeBeHf eeQ e7d]d^dZkdd\ejTeejYjZeejYjZ f eKeeQ dd_d`dZldd\ejTeejYjZeejYjZ f eeBeHf eKeeQ e7dadbdZmedd\ejTeeQ ddcdddeZnedd\ejTeejYjZd/f eeQ ddfdgdhZodS )j    N)asdict	dataclassfield)chain)AnyCallablecastDict	GeneratorIterableListno_type_checkOptionalSetTupleUnion)ShardedTensor)_broadcast_state_dict_flatten_state_dict_gather_state_dict_offload_state_dict_to_cpu_unflatten_state_dict)DTensor)_CHECKPOINT_PREFIX)FullOptimStateDictConfigFullStateDictConfigFullyShardedDataParallelOptimStateDictConfigShardedOptimStateDictConfigShardedStateDictConfigStateDictConfigStateDictType)._get_module_fsdp_state_if_fully_sharded_moduleFSDP_WRAPPED_MODULE)_IncompatibleKeys)DistributedDataParallel)tree_map_onlyFQNS_TPrimitiveType	ValueTypeDictValueTypeListDictValueTypeOptimizerStateTypeStateDictOptionsget_model_state_dictget_optimizer_state_dictget_state_dictset_model_state_dictset_optimizer_state_dictset_state_dictZ_flat_paramparam_groupsparamsstate_patched_state_dictc                  c   s.   t  } t   z
d V  W 5 | r(t   X d S N)gc	isenableddisableenable)
is_enabled r>   Y/var/www/html/venv/lib/python3.8/site-packages/torch/distributed/checkpoint/state_dict.py_gc_context[   s    
r@   c                   @   sf   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed	< dZeed
< dS )r-   ap  
    This dataclass specifies how get_state_dict/set_state_dict will work.

    - ``full_state_dict``: if this is set to True, all the tensors in the
      returned state_dict will be gathered. No ShardedTensor and DTensor
      will be in the returned state_dict.

    - ``cpu_offload``: offload all the tensors to cpu. To prevent CPU OOM, if
      ``full_state_dict`` is also true, then only the rank0 will get the
      state_dict and all other ranks will get empty state_dict.

    - ``ignore_frozen_params``: if the value is True, the returned state_dict
      won't contain any frozen parameters -- the ``requires_grad`` is False.
      The default value is False.

    - ``keep_submodule_prefixes`` (deprecated): when ``submodules`` is not None, this option
      indicates whether to keep the submodule prefixes from the state_dict keys.
      or example, if the submodule is ``module.pretrain`` and the full FQN of
      the parameter is ``pretrain.layer1.weight`` of the param. When this option
      is True, the parameter's key in the returned state_dict will be
      ``pretrain.layer1.weight``. If the options is False, the key will be
      ``layer1.weight``.
      Note that if ``keep_submodule_prefixes`` is False, there may be conflicted
      FQNs, hence there should be only one submodule in ``submodules``.

    - ``strict``: the ``strict`` option when ``set_state_dict`` calls
      model.load_state_dict().

    - ``broadcast_from_rank0``: when the option is True, rank0 should receive a
       full state_dict and will broadcast the tensors in the state_dict/
       optim_state_dict one by one to other ranks. Other ranks will receive
       the tensors and shard according to the local shards in the model and
       optimizer. ``full_state_dict`` must be set to True when using this option.
       This option currently only supports DTensor, not the legacy ShardedTensor.
    Ffull_state_dictcpu_offloadignore_frozen_paramsTkeep_submodule_prefixesstrictbroadcast_from_rank0flatten_optimizer_state_dictN)__name__
__module____qualname____doc__rA   bool__annotations__rB   rC   rD   rE   rF   rG   r>   r>   r>   r?   r-   f   s   
$c                   @   s   e Zd ZU eedZeeee	j
f eee	j
f f ed< eedZeeee	j
f eee	j
f f ed< eedZee ed< dZeed< dZeed< ejZeed< eedZeej ed	< d
S )_StateDictInfo)default_factoryfqn_param_mappingshared_params_mappingsubmodule_prefixesThandle_modelhandle_optimfsdp_contextfsdp_modulesN)rH   rI   rJ   r   dictrP   r	   r   strtorchTensorr'   rM   rQ   setrR   r   rS   rL   rT   
contextlibnullcontextrU   r   listrV   r   nnModuler>   r>   r>   r?   rN      s   
rN   )maxsizeT)modelnameskip_ddp_prefixskip_compiler_prefixreturnc           
         s  | td}d|kr|hS |d}g }| }t|D ]8\}}t|trj|dksTt|j}|sh|| q4t|t	r|t
|d k r||d  tkrd| t|t}	 r  d  fdd|	jD   S t|t}|tkr|| t||}q4t|tjjjr.|dkst|j}|sn|| q4|| |tjjjkrd|t
|d krntdq4t||}q4d| tdhS )	a  
    This API is used to convert the name of a parameter to the FQNs. For FSDP
    without `use_orig_params`, the name of FlatParameter can be mapped to
    multiple original parameters. As a result, the return type of this function
    is `Set[str]`.

    Args:
        module (nn.Module): the root model.
        name (str): the name
        skip_ddp_prefix (bool): whether to skip DDP's `module` prefix

    Returns:
        The canonical FQNs based on the model traversal.
     .module   c                    s   h | ]}  | qS r>   r>   .0fqnprefixr>   r?   	<setcomp>   s     z_get_fqns.<locals>.<setcomp>	_orig_modz-Expect `_extra_state` to be the last obj name)replacer   split	enumerate
isinstanceDDPAssertionErrorri   appendFSDPlen_FLAT_PARAMjoingetattrZ_fqnsr#   rY   Z_dynamoZ
eval_frameZOptimizedModulerq   r_   modules_EXTRA_STATE_KEY_SUFFIXRuntimeError)
rb   rc   rd   re   Z	obj_namesZfqn_obj_namesZcurr_objiZcurr_obj_nameZ
flat_paramr>   rn   r?   	_get_fqns   sD    


 






r   c                   @   s   e Zd ZdS )_EXTRA_STATEN)rH   rI   rJ   r>   r>   r>   r?   r      s   r   c                 #   s4   t  tjttd fdd  | dE d H  d S )N)ri   curr_fqnrf   c                 3   s    |  |r| dnd}|  D ].\}}|kr6q$| | } ||E d H  q$t| jdd| jddD ]*\}}|| jkrqn| | }||fV  qnt| jdtj	j
tj	j
kr| tjjj }|t fV  d S )Nrh   rg   F)recurseget_extra_state)addZnamed_childrenr   Znamed_buffersnamed_parametersZ_non_persistent_buffers_setr}   	__class__r_   r`   r   r~   ri   r   r   )ri   r   rc   	submodulenew_fqnobjr   Zvisited_modulesr>   r?   r      s*    

 

z+_iterate_valid_model_state.<locals>.recurserg   )r[   r_   r`   rX   r
   )rb   r>   r   r?   _iterate_valid_model_state   s    r   )
submodulesoptions.)rb   optims
optim_onlyr   r   rf   c                C   s6  |rt dt |r |s td|p(t }i }i }t| D ]\}}t|trNq:t| |}	|	|d}
|
dk	rt
tt || |	 || ||< n|	 ||< |	D ]}
t|ts|||
< qq:t| D ]"\}}|D ]}
t
tj|||
< qqt }|rVt|}|  D ]L\}}||krqt| |}	t|	dks>td|dd |	D  q|jrn|jsntdt| }|r|jrt|j|jd	}t|j|jp|jd	}tj }nt!|jd
}t"|jd
}tj#}t$j%dd }t&j'|| |||d}nt$j(}t)f t*|||||t
t+t,j- || t|dkdS )zW
    Verify the model and options passed by the user and generates _StateDictInfo.
    zGetting submodules only model/optim state_dict is deprecated and will be removed in 2.5. This feature can be achieved by manually filtering out the state_dict returned from get_state_dict.z;Optimizers are not passed in but optim_only is set to True.Nrj   z)Submodule FQN should only have 1 instancec                 s   s   | ]}| d V  qdS )rh   Nr>   rk   r>   r>   r?   	<genexpr>A  s     z"_verify_options.<locals>.<genexpr>z?full_state_dict must be True when broadcast_from_rank0 is True.)offload_to_cpuZ
rank0_only)r   c              
   s   s<   t  * tj| |||d d V  W 5 Q R X W 5 Q R X d S )Nri   state_dict_typestate_dict_configoptim_state_dict_config)warningscatch_warningsry   r   r   r>   r>   r?   $fsdp_state_dict_type_without_warning_  s    
z=_verify_options.<locals>.fsdp_state_dict_type_without_warningr   r   )rP   rQ   rR   rU   rV   rS   rT   ).r   warnFutureWarningr   r-   r   ru   r   r   getr   r   rX   updatecopyr^   itemsrY   rZ   r[   named_modulesrz   rw   rF   rA   
ValueErrorry   rV   r   rB   r   r!   ZFULL_STATE_DICTr   r   ZSHARDED_STATE_DICTr\   contextmanager	functoolspartialr]   rN   r   r   r_   r`   )rb   r   r   r   r   rP   rQ   rc   paramfqnsrm   Zparam_Zfqns_rR   ri   rV   r   r   r   r   rU   r>   r>   r?   _verify_options  s    






 

r   )model_state_dictoptim_state_dictinforf   c                 C   s   |j D ]}t|}|d k	stdq|jrf| sf|jsf|jsf|jrF|jsf|jrf|j	sft
dt d|jr|s|jr||js|j	st
d| |  D ] }t|krt
| dt dqd S )Nz)Expected a fsdp_state with a fsdp module.z}The option indicates that model state_dict is required to save or load, but model state_dict is empty.rank = dist.get_rank()=rh   zgThe option indicates that model state_dict is required to save, or load but optim state_dict is empty. z
 contains z6. This can happen if the model is not the root module.)rV   r"   rw   rS   rR   rC   rB   rA   rE   rF   r   distget_rankrT   keysr{   )r   r   r   ri   Z
fsdp_statekeyr>   r>   r?   _verify_state_dict  sN    
	r   )r   apirf   c                 C   s,   t | |}|tkr(tjt | j|| d}|S )N)self)r}   r7   r   r   r   )r   r   callr>   r>   r?   _state_dict_fn  s    
r   )
state_dictr   rf   c                 C   sF   |j r0|jrtj st nd}t| |j|dS |jr>t| S | S d S )N)r   )rB   
ranks_only)rA   rB   rY   distributedZis_initializedtupler   r   )r   r   r   r>   r>   r?   _maybe_full_or_cpu_state_dict  s      r   )rb   r   rf   c              	   C   s  |j s
i S |  t| d }W 5 Q R X t| D ]t}t| |}t|dks\t||ftt	|}||kr6t
ddd}|||std| d| ||||< q6|jri }| D ]L}|jD ]@}||sq|jr|| ||< q|t|d  }	|| ||	< qq|}|jrZ|  D ]6\}}
|
jr6q"t| |}|D ]}|| qDq"t| D ](\}}t|rf|jrf|| qft||S )Nr   rj   )rf   c                 S   s   t |t | krdS |d}| d}d}t|D ]P\}}||| krr|d7 }|t |kr|t |d k  S q4|dkr~q4q4 dS q4dS )NFrh   r   rj   )ri   rq   T)rz   rs   rt   )r   rm   Z	fqn_splitZ	key_splitZfqn_idxZkey_idxZkey_namer>   r>   r?   verify  s    

z%_get_model_state_dict.<locals>.verifyzAn unexpected key, z, exists. FQN is )rS   rU   r   r^   r   r   rz   rw   nextiterrL   r   poprR   
startswithrD   rC   r   requires_gradr   rY   	is_tensoris_metar   )rb   r   r   r   r   rm   r   new_state_dictro   r   r   pr>   r>   r?   _get_model_state_dict  sF    





r   )rb   r   r   rf   c              
   C   sT  |j r|s|jsti i S i }t| D ]f\}}t| |}t| |ddd}t||D ]8\}}	|jrlt dkr||	kr||||	< |||	< qRq&|jrd }
|	 D ]<\}}t
|r| dkr|
d kr|j}
q|
|jkstq|
d k	stt|||
|jd |	 D ]\}}|||< q| ( ttt| d||jdW  5 Q R  S Q R X d S )NF)rd   re   r   )devicerE   load_state_dict)r   rE   )rS   rF   r$   r   r   zipr   r   r   r   rY   r   dimr   rw   r   rE   rU   r   r   )rb   r   r   local_state_dictr   valuer   Zfqns_with_prefixrm   Zfqn_with_prefixr   Zlocal_stater>   r>   r?   _load_model_state_dict  sX    

   
   
 r   )optimrf   c                 C   s   | j r
dS | jD ]6}|t D ](}|jdk	r2td|jrt||_qqg }| jD ]"}d|krR||d  d|d< qR| j	dd | jD ]}d|kr|
d|d< q| jdd dS )	zH
    Initialize optim states by calling the step() with zero grads.
    Na  state_dict can only be used if the optimizer states are initialized (usually after one step() with gradients) or gradients are None. For the later case, state_dict will fake the gradients as zero to initialize the optimizer states. However, the gradients are not None.lrg        )closurer   T)Zset_to_none)r6   r4   _PARAMSZgradr   r   rY   Z
zeros_likerx   stepr   Z	zero_grad)r   param_groupr   Zlrsr>   r>   r?   _init_optim_state<  s(    




r   )r   rf   c           	   
   C   s   dd }i }t t| t  D ]@\}}t t| D ](\}}|| ||t d| d| < q4qt t| t D ]L}|t}t tt	 |D ].}| D ] \}}||t d| d| < qqqn|S )aI  
    This API flattens the optimizer state_dict to support optimizer resharding for
    MPMD, e.g., pipeline parallelism.

    Without the API, the original optimizer state_dict looks like:
    {
        "state": {
            "layer1.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
            "layer2.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
        },
        "param_group": [
            {
                "lr": 0.0,
                "betas": (0.9, 0.95), ...,
                "params": ["layer1.weight", "layer2.weight"]
            }
        ]
    }

    With this API, the optimizer state_dict looks like:
    {
        "state.layer1.weight.step": 10,
        "state.layer2.weight.step": 10,
        "state.layer1.weight.exp_avg": SomeTensor,
        "state.layer2.weight.exp_avg": SomeTensor,
        "state.layer1.weight.exp_avg_sq": SomeTensor,
        "state.layer2.weight.exp_avg_sq": SomeTensor,
        "param_group.layer1.weight.lr" : 0.1,
        "param_group.layer2.weight.lr" : 0.1,
        "param_group.layer1.weight.betas" : (0.9, 0.95),
        "param_group.layer2.weight.betas" : (0.9, 0.95),
    }

    Note that if any of the value is a container, like the betas in the example,
    this API won't flattent it.
    c                 S   s*   t | tjttfs&tdt|  dd S )NzUFlattening optimizer state_dict only supports tensor, int, float states now. Type is rh   )ru   rY   rZ   intfloatNotImplementedErrortype)vr>   r>   r?   _raise_if_type_not_supported  s    z?_flatten_optim_state_dict.<locals>._raise_if_type_not_supportedrh   )
r   r*   _STATEr   r+   _PGr   r   r   rX   )	r   r   retrm   r6   kr   r   r   r>   r>   r?   _flatten_optim_state_dictb  s    *
r   )r   r   r   rf   c                 C   s`  i }g }t |t|i}| jD ]>}|tg i |t D ]}|j| D ]r}|d t }	t|	tsbt|	| |j	stqDi ||< | j
|  D ]*}
|t  d| d|
  tt|| |
< qqDq6ttt |d t d }| D ]|}|tkrq|t d| d|  }||d kr||d |< q|d | |krtd| d| d| d|d |  d	qq|S )z
    This API unflattens the state_dict generated by _flatten_optim_state_dict().
    See the docstring of _flatten_optim_state_dict() for more detail.
    rh   r   zaAll the parameters in the same parameter group should have the same saved param_group value. But z is z while other(s) is )r   r   r4   rx   r   rP   ru   r^   rw   r   r6   r   r   r*   r   rX   r   )r   r   r   r6   pg_state
return_osdr   r   rm   r5   Z
state_nameZfirst_param_fqnr   r   r>   r>   r?   _unflatten_optim_state_dict  s:    	
$r   )rb   
optimizersr   rf   c              
      s  |j s
i S ti tg i}|D ]}t| t|d }|jr|  t| ||}W 5 Q R X |sbqt	|t 
 D ]*}d|krr|t ||t |dd< qr|t D ]}dd |t D }||t< qnt	tdd |jD }tt|tt|}	i  |  D ]X\}
}t| |
}t|d	ks(ttt|}||	krBq|	| }| |< | |< qt	|t 
 D ]$}
 |
 }|t |
|t |< qn|t D ] } fd
d|t D |t< q|sqtt|t |t  tt|t |t  q|jrtt t!|}t"||S )Nr   rq   
_orig_mod.rg   c                 S   s   g | ]}| d dqS )r   rg   rr   rl   r   r>   r>   r?   
<listcomp>  s     z)_get_optim_state_dict.<locals>.<listcomp>c                 s   s   | ]}|t  V  qd S r8   )r   )rl   gr>   r>   r?   r     s     z(_get_optim_state_dict.<locals>.<genexpr>rj   c                    s   g | ]} | qS r>   r>   )rl   pidZfqn_pid_mappingr>   r?   r     s     )#rT   r   r   r   r   rV   rU   ry   r   r^   r   r   rr   r   r   from_iterabler4   rW   r   rangerz   r   r   rw   r   r   r   r*   r   r+   extendrG   r,   r   r   )rb   r   r   r   r   Zosdr   r   r5   Zparam_pid_mappingr   r   r   rm   r   groupr>   r   r?   _get_optim_state_dict  sX    

 

 r   )rb   r   r   r   rf   c              	   C   s  i }g }t |t|i}i }tdd tt|t   D r<|S |jD ]}|tg i |t D ]}	|j	|	 D ]}
|
|j
krd}tt|t D ]"}|
ttt |t krd} qqnd}|sql|d t }t|tst||
 |	jrtt|t  |
 ||
< tt|t D ]6}|
ttt |t kr
t|t d |t|< q
qlq^qBtt|t D ]R}|t|d}|dkrxqV| D ]$\}}|tkrq||| |< qqV|S )a  
    Extract the corresponding optim state_dict from ``optim_state_dict`` for
    ``optim`` and return the result optim state_dict.

    Args:
        model (nn.Module): the root model.
        optim (torch.optim.Optimizer): the optimizer.
        optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
            contains the optim state_dict of ``optim``.
        info (_StateDictInfo): state dict information.

    Returns:
        The optim state_dict of ``optim``.
    c                 s   s   | ]}t |tV  qd S r8   )ru   r   r   r>   r>   r?   r   (  s    z*_split_optim_state_dict.<locals>.<genexpr>FTr   rj   )r   r   allr   r*   r   r4   rx   r   rP   rQ   r+   r   rX   ru   r^   rw   r   rz   idr   r   )rb   r   r   r   r6   r   r   Z
pg_mappingr   r   rm   Z	in_paramsZloaded_param_groupr5   idxr   r   r>   r>   r?   _split_optim_state_dict  sX    
 
 "

r   )rb   r   r   r   rf   c              
      s.  |j s
d S |D ]}t| |rTt|kr8t| |||}qXt|ttttf ||}ni }|j	rZ| 
 D ]\}}t| |}t| |dd}	||	krqht|dkst| |	 |t D ]6}
ttttf |
}fdd|t D }||t< qtt|t }t| D ]&}|kr||||< qqh|  t| ||}W 5 Q R X n|jrd|_t| |f|}d|_d   fdd}ttj||} d k	stt|\}}t|\}}t || d	 | D ]6}||kr||kst|| ||< || ||< qt!||}t"|d
|d qd S )NF)re   rj   c                    s   g | ]}|  qS r>   r   )rl   r   )rm   fqn_with_compilerr>   r?   r   |  s    z*_load_optim_state_dict.<locals>.<listcomp>Tc                    s2   |   dkr. d kr| j n | jkr.td| S )Nr   zDevice mismatch)r   r   r   )tr   r>   r?   _device  s    
z'_load_optim_state_dict.<locals>._devicer   r   r   )#rT   r   r   r   r   r   r	   rX   r)   rV   r   r   rz   rw   r   r   r   r   r*   r^   r   rr   rU   ry   Zoptim_state_dict_to_loadrF   rA   r   r&   rY   rZ   r   r   r   r   )rb   r   r   r   r   r   Zoriginal_fqn_r   Zfqns_with_compilerr   valr5   Z	osd_stater   r   r   Zflatten_osdZosd_mappingZflatten_local_osdZlocal_osd_mappingZ	optim_keyr>   )r   rm   r   r?   _load_optim_state_dictV  s    
     
  


  	
 r   )rb   r   r   rf   c             
   C   sL   t  < t| t d||d}t| |}t|i | |W  5 Q R  S Q R X dS )aH  
    Return the model state_dict of ``model``.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``model``.

    :rtype: typing.Dict[str, ValueType]
    Fr   r   r   N)r@   r   r   r   r   )rb   r   r   r   r   r>   r>   r?   r.     s    
)rb   r   r   r   rf   c             
   C   sh   t  X t|tjjr|fnt|}t| |d||d}t| ||}ti || |W  5 Q R  S Q R X dS )a  
    Return the combined state_dict for optimizers.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``optimizers``.

    :rtype: OptimizerStateType
    Tr   N)	r@   ru   rY   r   	Optimizerr   r   r   r   )rb   r   r   r   r   r   r>   r>   r?   r/     s    c             
   C   sv   t  f t|tjjr|fnt|}t| |d||d}t| |}t| ||}t	||| ||fW  5 Q R  S Q R X dS )a  
    Return the model state_dict and optimizers state_dict.

    ``get_state_dict`` can process any module that is parallelized by PyTorch
    FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
    combination of these parallelisms. The main functions of ``get_state_dict``
    are: 1.) returning a model and optimizer state_dict that can be resharded
    with a different number of trainers and/or different parallelisms.
    2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
    these APIs.
    3.) sanity checking the result state_dict.

    The keys of the result state dictionary are the canonical FQNs (Fully
    Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
    position in an nn.Module hierarchy. More specifically, a canonical FQN to a
    parameter is the FQN returned by ``module.named_parameters()`` or
    ``module.named_buffers()`` when the module is not distributed by any
    parallelisms. Since the optimizer internally uses parameter IDs to represent
    a parameter, there will be a conversion from the parameter IDs to the
    canonical FQNs when calling this API.

    ``get_state_dict`` can also process a module that is not parallelized. In
    such a case, ``get_state_dict`` only performs one function -- converting the
    optimizer parameter IDs to the canonical FQNs.

    Example:
        >>> # xdoctest: +SKIP
        >>> import torch
        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        >>> from torch.nn.parallel import DistributedDataParallel as DDP
        >>> from torch.distributed.checkpoint.state_dict import get_state_dict

        >>> fsdp_model = FSDP(copy.deepcopy(model))
        >>> fsdp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        >>> ddp_model = DDP(copy.deepcopy(model))
        >>> ddp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)


        >>> ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
        >>> fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(fsdp_model, fsdp_optim)

        >>> # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
        >>> # the asserts will fail.
        >>> assert ddp_state_dict == fsdp_state_dict
        >>> assert ddp_optim_state == fsdp_optim_state_dict


    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        ``Tuple`` that contain model state_dict and optimizer state_dict.

    :rtype: typing.Tuple[typing.Dict[str, ValueType], OptimizerStateType]
    Fr   N)
r@   ru   rY   r   r   r   r   r   r   r   )rb   r   r   r   r   r   r   r>   r>   r?   r0     s     F
)rb   r   rf   c           	         s   |si S t tt| tjrtdt t	t
tjt
ttf f |}i }| D ]p\}}|  D ]^\}}||krvqdt| |}t|dkstdtt| d | fdd| D  qdqT|S t	t
ttf |S d S )NzPassing model_state_dict as a ``Dict[nn.Module, Dict[str, Any]]``is deprecated and will be removed in 2.5. If you need this feature, please preprocessing the model_state_dict to achieve the same functionality.rj   z/FQNs for a submodule should only have 1 elementrh   c                    s   i | ]\}} | |qS r>   r>   )rl   Zsubfqnr   rn   r>   r?   
<dictcomp>u  s      z/_unflatten_model_state_dict.<locals>.<dictcomp>)ru   r   r   r   r_   r`   r   r   r   r   r	   rX   r)   r   r   r   rz   rw   r   )	rb   r   Zcast_state_dictr   r   Zsub_state_dictrc   mr   r>   rn   r?   _unflatten_model_state_dict[  s*    
r   )r   )rb   r   r   rf   c             
   C   sR   t | |}t 8 t| t d|d}t|i | t| ||W  5 Q R  S Q R X dS )a=  Load the model state_dict.

    The counterpart of ``get_model_state_dict`` to set the state_dict to the
    model. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        model_state_dict: (Dict[str, ValueType]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys
            * **unexpected_keys** is a list of str containing the unexpected keys

    :type model_state_dict: typing.Dict[str, ValueType]
    Fr   r   N)r   r@   r   r   r   r   )rb   r   r   r   r>   r>   r?   r1   |  s     )rb   r   r   r   rf   c             	   C   s\   t  L t|tjjr|fnt|}t| |d|d}ti || t| ||| W 5 Q R X dS )a  Load the optimizers state_dict.

    The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
    optimizers. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        None

    :type optim_state_dict: typing.OptimizerStateType
    Tr  N)	r@   ru   rY   r   r   r   r   r   r   )rb   r   r   r   r   r>   r>   r?   r2     s    )rb   r   r   r   r   rf   c             
   C   s|   t | |}t b t|tjjr&|fnt|}t| || |d}t||| t	| ||| t
| ||W  5 Q R  S Q R X dS )a4  Load the model state_dict and optimizers state_dict.

    The counterpart of ``get_state_dict`` to set the state_dict to the model and
    optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
    have to be returned by ``get_state_dict`` but must meet the following
    requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
    2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
    3) optimizer state_dict cannot contain the parameter IDs; the keys should be
    the canonical FQNs.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys of the model state_dict.
            * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.

    :type model_state_dict: typing.Dict[str, ValueType]
    :type optim_state_dict: typing.OptimizerStateType
    r  N)r   r@   ru   rY   r   r   r   r   r   r   r   )rb   r   r   r   r   r   r>   r>   r?   r3     s$    *    )rb   r   rf   c                   sj   t jt| |dfdd}|| _t jt| |d tttf d fdd}|| _t	
| t	
| dS )a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rb   r   c                      s     S r8   r>   r>   _state_dict_callr>   r?   state_dict_call)  s    z0_patch_model_state_dict.<locals>.state_dict_callr   c                    s    | d d S )N)r   r>   r   _load_state_dict_callr>   r?   load_state_dict_call4  s    z5_patch_model_state_dict.<locals>.load_state_dict_callN)r   r   r.   r   r1   r	   rX   r   r   r7   r   )rb   r   r  r  r>   r  r  r?   _patch_model_state_dict  s     
r	  )rb   r   r   rf   c                   s   t jt| ||dfdd}t jt| ||d tttf d fdd}t| t| t	|t
jjrr|fnt|}|D ]}||_||_q~dS )a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Note that if there are multiple optimizers, all of the optimizers will be patched.
    So users only need to call one of the state_dict() to get the full result.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rb   r   r   c                      s     S r8   r>   r>   r  r>   r?   r  e  s    z4_patch_optimizer_state_dict.<locals>.state_dict_callr   c                    s    | d d S )N)r   r>   r   r  r>   r?   r  o  s    z9_patch_optimizer_state_dict.<locals>.load_state_dict_callN)r   r   r/   r2   r	   rX   r   r7   r   ru   rY   r   r   r   r   r   )rb   r   r   r  r  r   r>   r  r?   _patch_optimizer_state_dict?  s.    

r
  )TT)pr\   r   r9   r   dataclassesr   r   r   	itertoolsr   typingr   r   r   r	   r
   r   r   r   r   r   r   r   rY   Ztorch.distributedr   r   Ztorch.nnr_   Z'torch.distributed._shard.sharded_tensorr   Z#torch.distributed._state_dict_utilsr   r   r   r   r   Ztorch.distributed._tensorr   Z;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   Ztorch.distributed.fsdpr   r   r   ry   r   r   r   r    r!   Z$torch.distributed.fsdp._common_utilsr"   r#   Ztorch.nn.modules.moduler$   Ztorch.nn.parallelr%   rv   Ztorch.utils._pytreer&   __all__r{   r   r   r   rX   r'   rZ   r   r   r(   r)   r*   r+   r,   r[   r7   rM   r   r@   r-   rN   	lru_cacher`   rL   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r.   r/   r0   r   r1   r2   r3   r	  r
  r>   r>   r>   r?   <module>   sv   8(


.
  >$ 
- 
 
 
D
.&A
.@I]
)1Z$
%
,-
>6