# torch/distributed/fsdp/_init_utils.py (recovered file path:
# /var/www/html/venv/lib/python3.8/site-packages/torch/distributed/fsdp/_init_utils.py)
# Reconstructed from a compiled-bytecode (.pyc) dump: the imports, constants,
# signatures, docstrings, and error messages below are recovered from that dump;
# function bodies that could not be read back are elided with `...`.
import collections
import itertools
import os
import warnings
from typing import (
    Any,
    Callable,
    Deque,
    Dict,
    Generator,
    Iterable,
    Iterator,
    List,
    no_type_check,
    Optional,
    Set,
    Tuple,
    TYPE_CHECKING,
    Union,
)

import torch
import torch.distributed as dist
import torch.distributed.fsdp._exec_order_utils as exec_order_utils
import torch.distributed.fsdp._traversal_utils as traversal_utils
import torch.distributed.fsdp.fully_sharded_data_parallel as fsdp_file
import torch.nn as nn
from torch.distributed.algorithms._comm_hooks import default_hooks
from torch.distributed.device_mesh import _mesh_resources, DeviceMesh
from torch.distributed.distributed_c10d import _get_default_group
from torch.distributed.fsdp._common_utils import (
    _FSDPDeviceHandle,
    _FSDPState,
    _get_module_fsdp_state,
    _is_fsdp_flattened,
    _named_parameters_with_duplicates,
    clean_tensor_name,
    TrainingState,
)
from torch.distributed.fsdp._flat_param import (
    _FSDP_USE_FULL_PREC_IN_EVAL,
    FlatParameter,
    FlatParamHandle,
    HandleShardingStrategy,
)
from torch.distributed.fsdp._limiter_utils import _FreeEventQueue
from torch.distributed.fsdp.api import (
    BackwardPrefetch,
    CPUOffload,
    FullOptimStateDictConfig,
    FullStateDictConfig,
    MixedPrecision,
    ShardingStrategy,
    StateDictConfig,
    StateDictType,
)
from torch.distributed.fsdp.wrap import _Policy
from torch.distributed.tensor.parallel.fsdp import DTensorExtensions
from torch.distributed.utils import _sync_params_and_buffers
from torch.utils._python_dispatch import is_traceable_wrapper_subclass

if TYPE_CHECKING:
    from torch.utils.hooks import RemovableHandle

_TORCHDISTX_AVAIL = True
try:
    from torchdistx import deferred_init, fake  # type: ignore[import]
except ImportError:
    _TORCHDISTX_AVAIL = False

# Bucket size (bytes) used when broadcasting module states from rank 0; the exact
# integer constant is not legible in the dump, so this value is an assumption.
PARAM_BROADCAST_BUCKET_SIZE = int(250 * 1024 * 1024)
FSDP_SYNCED = "_fsdp_synced"

# Specification of process groups for hybrid sharding strategies.
HybridShardProcessGroupType = Tuple[dist.ProcessGroup, dist.ProcessGroup]
# Overall specification of process group.
ProcessGroupType = Optional[Union[dist.ProcessGroup, HybridShardProcessGroupType]]

SHARDING_STRATEGY_MAP = {
    ShardingStrategy.FULL_SHARD: HandleShardingStrategy.FULL_SHARD,
    ShardingStrategy.SHARD_GRAD_OP: HandleShardingStrategy.SHARD_GRAD_OP,
    ShardingStrategy.NO_SHARD: HandleShardingStrategy.NO_SHARD,
    ShardingStrategy.HYBRID_SHARD: HandleShardingStrategy.HYBRID_SHARD,
    ShardingStrategy._HYBRID_SHARD_ZERO2: HandleShardingStrategy._HYBRID_SHARD_ZERO2,
}
HYBRID_SHARDING_STRATEGIES = [
    ShardingStrategy.HYBRID_SHARD,
    ShardingStrategy._HYBRID_SHARD_ZERO2,
]
NO_RESHARD_AFTER_FORWARD_STRATEGIES = (
    ShardingStrategy.SHARD_GRAD_OP,
    ShardingStrategy._HYBRID_SHARD_ZERO2,
)
@no_type_check
def _init_process_group_state(
    state: _FSDPState,
    process_group: ProcessGroupType,
    sharding_strategy: ShardingStrategy,
    policy: Optional[_Policy],
    device_mesh: Optional[DeviceMesh] = None,
) -> _FSDPState:
    if process_group is not None and device_mesh is not None:
        raise ValueError(
            "Cannot pass both process_group and device_mesh at the same time. "
            "Please just pass only one of them."
        )
    is_hybrid_strategy = sharding_strategy in HYBRID_SHARDING_STRATEGIES
    if is_hybrid_strategy:
        if process_group is None and policy is None and device_mesh is None:
            # Manual wrapping with a hybrid strategy needs explicit groups.
            raise ValueError(
                f"Manual wrapping with {sharding_strategy} requires explicit "
                "specification of process group or device_mesh."
            )
        else:
            state = _init_process_group_state_for_hybrid_shard(
                state, process_group, device_mesh
            )
    else:
        if device_mesh:
            state._device_mesh = device_mesh
            state.process_group = device_mesh.get_group(mesh_dim=0)
        else:
            state.process_group = (
                process_group if process_group is not None else _get_default_group()
            )

    state.rank = state.process_group.rank()
    state.world_size = state.process_group.size()
    data_parallel_world_size = state.world_size
    if is_hybrid_strategy:
        data_parallel_world_size *= state._inter_node_pg.size()
    state._gradient_predivide_factor = (
        default_hooks.DefaultState._get_gradient_predivide_factor(
            data_parallel_world_size
        )
    )
    state._gradient_postdivide_factor = (
        data_parallel_world_size / state._gradient_predivide_factor
    )
    return state


@no_type_check
def _init_process_group_state_for_hybrid_shard(
    state: _FSDPState,
    process_group: ProcessGroupType,
    device_mesh: DeviceMesh,
) -> _FSDPState:
    # Accepts either a 2D ``DeviceMesh``, ``None`` (derive the intra-/inter-node
    # groups from the default group), or an ``(intra_node_pg, inter_node_pg)``
    # tuple, and records the sharding (intra-node) group, the replication
    # (inter-node) group, and the inter-node all-reduce state on ``state``.
    ...  # body not recovered from the bytecode dump


def _is_valid_hybrid_shard_pg_type(process_group: Any) -> bool:
    return (
        isinstance(process_group, tuple)
        and len(process_group) == 2
        and all(isinstance(pg, dist.ProcessGroup) for pg in process_group)
    )


def _is_valid_hybrid_shard_device_mesh(device_mesh: DeviceMesh) -> bool:
    return isinstance(device_mesh, DeviceMesh) and device_mesh.ndim == 2


def _init_intra_node_process_group(num_devices_per_node: int) -> dist.ProcessGroup:
    """
    Return a process group across the current node.

    For example, given each row is a distinct node:
    0  1  2  3  4  5  6  7
    8  9 10 11 12 13 14 15
    This API would return an intra-node subgroup across
    [0, 1, ..., 7] or [8, 9, ..., 15] depending on the process's rank.
    For example, rank 3 would get [0, 1, ..., 7].
    """
    intra_node_subgroup, _ = dist.new_subgroups(num_devices_per_node)
    return intra_node_subgroup
def _init_inter_node_process_group(
    global_process_group: dist.ProcessGroup,
    num_devices_per_node: int,
) -> dist.ProcessGroup:
    """
    Return an inter-node process group where each contained rank has the same local rank.

    For example, given each row is a distinct node:
    0  1  2  3  4  5  6  7
    8  9 10 11 12 13 14 15
    This API would return inter-node process group [0, 8], [1, 9], [2, 10], and so forth
    depending on the process's rank. For example, rank 1 would get [1, 9], rank 5
    would get [5, 13].
    """
    inter_node_pg = None
    sharding_backend = dist.get_backend(global_process_group)
    world_size = dist.get_world_size(global_process_group)
    # Assuming a fully homogeneous setup.
    num_nodes = world_size // num_devices_per_node
    my_local_rank = dist.get_rank(global_process_group) % num_devices_per_node
    for local_rank in range(num_devices_per_node):
        ranks_for_inter_group = [
            local_rank + (i * num_devices_per_node) for i in range(num_nodes)
        ]
        # Every rank participates in every `new_group` call but only keeps the
        # group containing its own local rank.
        grp = dist.new_group(ranks=ranks_for_inter_group, backend=sharding_backend)
        if local_rank == my_local_rank:
            inter_node_pg = grp
    assert (
        inter_node_pg is not None
    ), f"{my_local_rank} expected to assign inter-node pg, but did not"
    return inter_node_pg


def _init_intra_and_inter_node_groups(
    global_process_group: dist.ProcessGroup,
    num_devices_per_node: int,
) -> Tuple[dist.ProcessGroup, dist.ProcessGroup]:
    """
    Initialize intra and inter-node process groups and return the ones corresponding to this process's rank.

    This function can be used to initialize process groups for ``HYBRID_SHARD`` or
    ``_HYBRID_SHARD_ZERO2`` in FSDP.
    This function assumes each node has an equal number of CUDA-enabled devices.

    Returns:
        Tuple[dist.ProcessGroup, dist.ProcessGroup]: Intra and inter-node process group.
    """
    return (
        _init_intra_node_process_group(num_devices_per_node),
        _init_inter_node_process_group(global_process_group, num_devices_per_node),
    )
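# Illustrative usage sketch (not part of the original module): how the intra-/
# inter-node groups produced by the helpers above pair up under ``HYBRID_SHARD``.
# The 2-node x 8-GPU layout is an assumption made for the example; the default
# process group must already be initialized on all 16 ranks.
def _example_hybrid_shard_groups() -> None:
    global_pg = _get_default_group()
    intra_pg, inter_pg = _init_intra_and_inter_node_groups(
        global_pg, num_devices_per_node=8
    )
    # Parameters are sharded inside the node (8 ranks) and replicated across nodes
    # among the ranks that share the same local rank (2 ranks).
    assert intra_pg.size() == 8
    assert inter_pg.size() == 2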
  s    rG   )r/   moduleignored_modulesignored_statesr4   c                 C   s   |d k	r|d k	rt dd }|d k	}|r<t|}t|d ng }t|d k	rRt|ng d t|dkrt|d tjr||}n|}t||| _t	|| j|| _
t|| j| _| S )NzfCannot pass both ignored_modules and ignored_states at the same time. Please just pass ignored_states.TFr   )r6   list_check_ignored_statesrT   rN   nn	Parameter_get_ignored_modules_ignored_modules_get_ignored_params_ignored_params_get_ignored_buffer_names_ignored_buffer_names)r/   rc   rd   re   ignored_parameterspassed_as_ignored_statesZignored_states_listr@   r@   rA   _init_ignored_module_states  s:    	 	rr   )re   rq   r4   c                 C   s   t | dkrdS |rftdd | D }tdd | D }|s|stdd | D td}td	| n6td
d | D stdd | D td}td| dS )z
    Check that the ignored states are uniformly parameters or uniformly modules.

    We may remove this check in the future if we permit mixing.
    r   Nc                 s   s   | ]}t |tjV  qd S rM   )rN   rh   ri   rQ   r/   r@   r@   rA   rR   W  s     z(_check_ignored_states.<locals>.<genexpr>c                 s   s   | ]}t |tjV  qd S rM   rN   rh   Modulers   r@   r@   rA   rR   X  s     c                 S   s   h | ]}t |qS r@   rJ   rs   r@   r@   rA   	<setcomp>[  s     z(_check_ignored_states.<locals>.<setcomp>)keyzUignored_states expects all nn.Parameter or all nn.Module list elements but got types c                 s   s   | ]}t |tjV  qd S rM   rt   rs   r@   r@   rA   rR   a  s     c                 S   s   h | ]}t |qS r@   rv   rs   r@   r@   rA   rw   b  s     z>ignored_modules expects nn.Module list elements but got types )rT   rU   sortedreprr6   )re   rq   
all_paramsZall_modulesZsorted_typesr@   r@   rA   rg   L  s    rg   )r/   rc   ignored_params	device_idr4   c                 C   s   d}|dk	r&t |tjr|nt|}|dkrt||D ]J}|jjdkrJq8|dkrZ|j}q8|jj|jkr8td|j d|jj q8|ptdtj }t	|| _
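# Illustrative usage sketch (not part of the original module): how the ignored
# states validated above are typically supplied through the public constructor.
# Assumes `torch.distributed` is initialized and a CUDA device has been set.
def _example_ignored_modules_usage() -> "fsdp_file.FullyShardedDataParallel":
    model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 8))
    # Keep `model[1]` out of FSDP's flattening and sharding entirely. Passing its
    # parameters via the `ignored_states` argument is also accepted, but
    # parameters and modules must not be mixed in one list (see
    # `_check_ignored_states` above).
    return fsdp_file.FullyShardedDataParallel(
        model,
        ignored_modules=[model[1]],
        device_id=torch.cuda.current_device(),
    )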
| S )a  
    Determine device handle used for initializing FSDP.

    If a device is specified by ``device_id``,
    then returns device handle corresponds to that device type. Otherwise, If the
    module is already on a non-CPU device, then the device type is that non-CPU device type.
    If the module is on CPU or meta, then the device type is the current cuda device.

    This method will be called once ignored paramters was determined, as the device handle maybe needed
    for other initialization.
    N>   cpumetazLFSDP does not support modules with different device types but got params on z and cuda)rN   torchdevice_get_orig_paramsrJ   RuntimeErrorr   current_devicer   Zfrom_devicerH   )r/   rc   r|   r}   Zdetermined_deviceparamr@   r@   rA   _init_device_handlei  s,    
 r   )r/   rc   r4   c                 C   s<   t || _i }| D ]\}}t|}|j||< q|| _| S rM   )_get_buffer_namesZ_buffer_namesnamed_buffersr   Zdtype_buffer_name_to_orig_dtype)r/   rc   r   buffer_namebufferr@   r@   rA   _init_buffer_state  s    
r   )	r/   r1   mixed_precisioncpu_offloadlimit_all_gathersuse_orig_paramsbackward_prefetch_limitforward_prefetch_limitr4   c                 C   s  | j dkr4|tjkr,td|p"tj d tj}n|tjkrNtjdtdd |pVtj| _|pbt | _	|d k	rt
jdt| j	  tjtdd	k| _|pt | _|| _|| _tj| _d | _t | _t | _t !| j||| _"d | _#t$ }|| _%d }	|	| _&g }
|
| _'| S )
NrC   z/FSDP is switching to use `NO_SHARD` instead of z since the world size is 1.zoThe `NO_SHARD` sharding strategy is deprecated. If having issues, please use `DistributedDataParallel` instead.   )
stacklevelz'torch.distributed.fsdp.mixed_precision. 1)(r=   r%   NO_SHARDwarningswarn
FULL_SHARDFutureWarningr1   r$   r   r   Z_CZ_log_api_usage_oncestrosenvirongetr   Z_use_full_prec_in_evalr!   r   r   _use_orig_paramsr   ZIDLEZtraining_stateZ_is_rootr   Z_free_event_queuerO   Zget_debug_levelZ_debug_levelexec_order_utilsZ_ExecOrderDataZ_exec_order_dataZ_unshard_eventdict_fully_sharded_module_to_handle_handleparams)r/   r1   r   r   r   r   r   r   r   r   r   r@   r@   rA   _init_core_state  sP    



r   )r/   r4   c                 C   s4   g }|| _ g }|| _g }|| _d| _d | _d | _| S )NT)_root_pre_forward_handles_pre_forward_handles_post_forward_handlesZ_sync_gradientsZ
_comm_hookZ_comm_hook_state)r/   r   r   r   r@   r@   rA   _init_runtime_state  s    r   )r/   backward_prefetchforward_prefetchr4   c                 C   s   || _ || _| S rM   )r   r   )r/   r   r   r@   r@   rA   _init_prefetching_state  s    r   )r/   r3   r4   c                 C   s,   |r"t | jd k	r"t| j| _nd | _| S rM   )r   Zget_parent_meshr9   r)   rH   _fsdp_extension)r/   r3   r@   r@   rA   _init_extension  s    r   c                 C   s*   t j| _t }t | _|| _i }|| _| S rM   )r'   ZFULL_STATE_DICTZ_state_dict_typer#   r"   Z_optim_state_dict_configZ_state_dict_configZ_unshard_params_ctx)r/   Zstate_dict_configZunshard_params_ctxr@   r@   rA   _init_state_dict_state  s    r   )r/   fully_sharded_moduler}   param_init_fnsync_module_statesr4   c           
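# Illustrative usage sketch (not part of the original module): the public
# constructor arguments that the ``_init_core_state`` / ``_init_prefetching_state``
# helpers above record on the FSDP state. Assumes an initialized process group and
# a current CUDA device; the specific dtype and flags are example choices only.
def _example_core_state_knobs(
    module: nn.Module,
) -> "fsdp_file.FullyShardedDataParallel":
    return fsdp_file.FullyShardedDataParallel(
        module,
        sharding_strategy=ShardingStrategy.FULL_SHARD,
        backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
        mixed_precision=MixedPrecision(param_dtype=torch.bfloat16),
        cpu_offload=CPUOffload(offload_params=False),
        limit_all_gathers=True,
        use_orig_params=True,
        device_id=torch.cuda.current_device(),
    )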
@no_type_check
def _init_param_handle_from_module(
    state: _FSDPState,
    fully_sharded_module: nn.Module,
    device_id: Optional[Union[int, torch.device]],
    param_init_fn: Optional[Callable[[nn.Module], None]],
    sync_module_states: bool,
) -> _FSDPState:
    """Initialize a ``FlatParamHandle`` from a module ``fully_sharded_module``."""
    # Checks for a single-device module, materializes meta / torchdistX deferred
    # modules (via ``param_init_fn`` when given), moves the module to the compute
    # device, optionally broadcasts module states from rank 0, and finally builds
    # the ``FlatParamHandle`` over the managed parameters.
    ...  # body not recovered from the bytecode dump


@no_type_check
def _init_param_handle_from_params(
    state: _FSDPState,
    params: List[nn.Parameter],
    fully_sharded_module: nn.Module,
):
    # Constructs the ``FlatParamHandle`` for ``params``, shards it, and registers
    # it on ``state`` (including ``state._fully_sharded_module_to_handle``).
    ...  # body not recovered from the bytecode dump


def _get_ignored_modules(
    root_module: nn.Module,
    _ignored_modules: Optional[Iterable[torch.nn.Module]],
) -> Set[nn.Module]:
    """
    Check that ``_ignored_modules`` is an iterable of ``nn.Module`` s without any FSDP instances.

    Return the modules contained in their module
    subtrees as a :class:`set`. Nested FSDP instances are excluded, but their
    already-computed ignored modules are included.

    ``_ignored_modules`` represents the argument passed by the user to FSDP.
    """
    ...  # body not recovered from the bytecode dump
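# Illustrative sketch (not part of the original module): with the default
# ``use_orig_params=False``, the handle built by the helpers above replaces each
# FSDP unit's original parameters with one sharded ``FlatParameter``. This
# hypothetical inspection helper just tallies the local shard sizes.
def _example_count_flat_param_numel(
    wrapped: "fsdp_file.FullyShardedDataParallel",
) -> int:
    flat_params = [p for p in wrapped.parameters() if isinstance(p, FlatParameter)]
    # Each element holds this rank's shard (plus padding) of one FSDP unit.
    return sum(p.numel() for p in flat_params)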
def _get_ignored_params(
    root_module: torch.nn.Module,
    ignored_modules: Set[torch.nn.Module],
    ignored_parameters: Optional[Iterable[torch.nn.Parameter]] = None,
) -> Set[torch.nn.Parameter]:
    """
    Return the parameters of the modules in ``ignored_modules`` and the parameters in ``ignored_parameters``.

    :class:`FlatParameter` s are excluded from the result.
    """
    ...  # body not recovered from the bytecode dump


def _get_ignored_buffer_names(
    root_module: torch.nn.Module,
    ignored_modules: Set[torch.nn.Module],
) -> Set[str]:
    """Return the cleaned buffer FQNs in ``ignored_modules``."""
    ...  # body not recovered from the bytecode dump


def _get_buffer_names(root_module: nn.Module) -> Set[str]:
    """Return the fully prefixed names of all buffers in the module hierarchy rooted at ``root_module`` as a :class:`set`."""
    return {
        clean_tensor_name(buffer_name)
        for buffer_name, _ in root_module.named_buffers()
    }


def _check_single_device_module(
    module: nn.Module,
    ignored_params: Set[nn.Parameter],
    device_id: Optional[Union[int, torch.device]],
) -> None:
    """
    Raise an error if ``module`` has original parameters on multiple devices, ignoring the parameters in ``ignored_params``.

    Thus, after this method, the
    module must be either fully on the CPU or fully on a non-CPU device.
    """
    ...  # body not recovered from the bytecode dump


def _get_device_from_device_id(
    device_id: Optional[Union[int, torch.device]],
    rank: int,
) -> Optional[torch.device]:
    """
    Return a ``torch.device`` for the specified ``device_id``.

    Processes ``device_id`` and returns either the corresponding device or
    ``None`` if ``device_id`` is ``None``.
    """
    ...  # body not recovered from the bytecode dump
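# Illustrative usage sketch (not part of the original module): passing an explicit,
# indexed device avoids the "does not have an explicit index" warning emitted by
# the helper above. Assumes one process per GPU with `LOCAL_RANK` exported by the
# launcher (e.g. torchrun); both names are assumptions of the example.
def _example_explicit_device_id(
    module: nn.Module,
) -> "fsdp_file.FullyShardedDataParallel":
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)
    return fsdp_file.FullyShardedDataParallel(
        module, device_id=torch.device("cuda", local_rank)
    )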
def _need_to_materialize_module(
    module: nn.Module,
    ignored_params: Set[nn.Parameter],
    ignored_modules: Set[nn.Module],
) -> Tuple[bool, bool]:
    """
    Return if ``module`` has parameters on meta device and if ``module`` is using torchdistX deferred initialization.

    At most one of the returned bools can
    be ``True``. If either is ``True``, then ``module`` needs to be
    materialized.
    """
    ...  # body not recovered from the bytecode dump


def _materialize_with_param_init_fn(
    root_module: nn.Module,
    param_init_fn: Callable[[nn.Module], None],
    ignored_modules: Set[nn.Module],
) -> None:
    if not callable(param_init_fn):
        raise ValueError(
            f"Expected {param_init_fn} to be callable but got {type(param_init_fn)}"
        )
    modules_to_materialize = _get_modules_to_materialize(root_module, ignored_modules)
    for module in modules_to_materialize:
        param_init_fn(module)


def _materialize_meta_module(
    root_module: nn.Module,
    device_from_device_id: Optional[torch.device],
    ignored_modules: Set[nn.Module],
):
    # Materializes meta-device modules via ``Module.to_empty()`` followed by
    # ``reset_parameters()``, reporting modules whose ``reset_parameters()`` fails.
    ...  # body not recovered from the bytecode dump


def _get_modules_to_materialize(
    root_module: nn.Module, ignored_modules: Set[nn.Module]
) -> List[nn.Module]:
    # Run BFS to collect the modules to materialize, stopping at modules that
    # already have FSDP applied and at ignored modules.
    modules_to_materialize: List[nn.Module] = []
    queue = collections.deque([root_module])
    visited_modules: Set[nn.Module] = {root_module}
    while queue:
        module = queue.popleft()
        modules_to_materialize.append(module)
        for child_module in module.children():
            if (
                child_module not in visited_modules
                and _get_module_fsdp_state(child_module) is None
                and child_module not in ignored_modules
            ):
                visited_modules.add(child_module)
                queue.append(child_module)
    return modules_to_materialize
def _move_module_to_device(
    module: nn.Module,
    ignored_params: Set[nn.Parameter],
    ignored_buffers: Set[torch.Tensor],
    device_from_device_id: Optional[torch.device],
) -> None:
    """
    Move ``module`` depending on ``device_from_device_id`` and its current device.

    This includes moving ignored modules' parameters.

    - If ``device_from_device_id`` is not ``None``, then this moves
      ``module`` to the device.
    - If ``device_from_device_id`` is ``None``, then this does not move
      ``module`` but warns the user if it is on CPU.

    Precondition: ``_check_single_device_module()``.
    """
    ...  # body not recovered from the bytecode dump


def _move_states_to_device(
    params: List[nn.Parameter],
    buffers: List[torch.Tensor],
    device_from_device_id: Optional[torch.device],
) -> None:
    """
    Move states to the specified device.

    Precondition: ``_check_single_device_module()`` and module's parameters and
    buffers have been materialized if needed.
    """
    ...  # body not recovered from the bytecode dump


def _warn_cpu_init():
    warnings.warn(
        "The passed-in `module` is on CPU and will thus have FSDP's sharding "
        "initialization run on CPU, which may be slower than on GPU. We recommend "
        "passing in the `device_id` argument for FSDP to move `module` to GPU for "
        "the sharding initialization. `module` must also be on GPU device to work "
        "with the `sync_module_states=True` flag since that requires GPU "
        "communication."
    )


def _get_compute_device(
    module: nn.Module,
    ignored_params: Set[nn.Parameter],
    device_from_device_id: Optional[torch.device],
    rank: int,
) -> torch.device:
    """
    Determine and return this FSDP instance's compute device.

    If a device is
    specified by ``device_id``, then returns that device. Otherwise, if the
    module is already on a non-CPU device, then the compute device is that non-CPU
    device. If the module is on CPU, then the compute device is the current
    device.

    Since this method should be called after materializing the module, any
    non-CPU device should not be meta device. For now, the compute device is
    always a CUDA GPU device with its explicit index.

    Precondition: ``_check_single_device_module()`` and
    ``_move_module_to_device()``.
    """
    ...  # body not recovered from the bytecode dump


def _sync_module_params_and_buffers(
    module: nn.Module,
    params: List[nn.Parameter],
    process_group: dist.ProcessGroup,
) -> None:
    """
    Synchronize module states (i.e. parameters ``params`` and all not-yet-synced buffers) by broadcasting from rank 0 to all ranks.

    Precondition: ``sync_module_states == True`` and ``self.process_group`` has
    been set.
    """
    # Gathers detached parameters and not-yet-synced buffers (marking buffers with
    # the ``FSDP_SYNCED`` attribute and flattening traceable tensor subclasses),
    # validates that they are on GPU, and broadcasts them from rank 0 in buckets of
    # ``PARAM_BROADCAST_BUCKET_SIZE`` bytes via ``_sync_params_and_buffers``.
    ...  # body not recovered from the bytecode dump


def _check_module_states_for_sync_module_states(
    module_states: List[torch.Tensor],
) -> None:
    if module_states and any(
        tensor.device == torch.device("cpu") for tensor in module_states
    ):
        raise ValueError(
            "The module has CPU parameters or buffers when `sync_module_states=True`, "
            "which requires them to be on GPU. Please specify the `device_id` argument "
            "or move the module to GPU before passing it to FSDP."
        )
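# Illustrative usage sketch (not part of the original module):
# ``sync_module_states=True`` triggers the rank-0 broadcast implemented above, so
# only rank 0 needs to hold the real weights. The checkpoint path is a placeholder
# assumption; an initialized process group and a current CUDA device are required.
def _example_sync_module_states(
    module: nn.Module,
) -> "fsdp_file.FullyShardedDataParallel":
    if dist.get_rank() == 0:
        # Hypothetical checkpoint file; other ranks keep their (arbitrary) init.
        module.load_state_dict(torch.load("checkpoint.pt", map_location="cpu"))
    return fsdp_file.FullyShardedDataParallel(
        module,
        device_id=torch.cuda.current_device(),
        sync_module_states=True,  # broadcast rank 0's params/buffers to all ranks
    )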
def _get_orig_params(
    module: nn.Module, ignored_params: Set[nn.Parameter]
) -> Iterator[nn.Parameter]:
    """
    Return an iterator over the original parameters in ``module``.

    The iterator does not return
    the parameters in ``ignored_params``, any ``FlatParameter`` s (which may be
    present due to nested FSDP wrapping), or any original parameters already
    flattened (only relevant when ``use_orig_params=True``).
    """
    param_gen = module.parameters()
    try:
        while True:
            param = next(param_gen)
            if param not in ignored_params and not _is_fsdp_flattened(param):
                yield param
    except StopIteration:
        pass


def _check_orig_params_flattened(
    fsdp_module,
    ignored_params: Set[nn.Parameter],
) -> None:
    """
    Check that original parameters in ``fsdp_module`` have been flattened.

    The flattened parameters are made
    invisible to ``named_parameters()`` for the module hierarchy rooted at
    ``fsdp_module``. This should be called as a sanity check after flattening
    the wrapped module's parameters.
    """
    for param_name, param in _named_parameters_with_duplicates(fsdp_module):
        if param not in ignored_params and not _is_fsdp_flattened(param):
            raise RuntimeError(
                f"Found an unflattened parameter: {param_name}; "
                f"{param.size()} {param.__class__}"
            )
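# Illustrative sketch (not part of the original module): with
# ``use_orig_params=True`` the original (unflattened) parameters remain visible to
# ``named_parameters()``, which is what allows per-parameter optimizer options;
# with the default ``False``, only ``FlatParameter`` s are visible, as the check
# above enforces. Assumes an initialized process group.
def _example_use_orig_params(module: nn.Module) -> None:
    wrapped = fsdp_file.FullyShardedDataParallel(module, use_orig_params=True)
    for name, param in wrapped.named_parameters():
        # Names keep their original FQNs (modulo FSDP wrapper prefixes, which
        # `clean_tensor_name` strips), so e.g. weight-decay groups can key off `name`.
        assert not isinstance(param, FlatParameter)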
def _get_default_comm_hook(sharding_strategy: ShardingStrategy):
    # All-reduce for ``NO_SHARD``; reduce-scatter for the sharded strategies.
    return (
        default_hooks.allreduce_hook
        if sharding_strategy == ShardingStrategy.NO_SHARD
        else default_hooks.reduce_scatter_hook
    )


def _get_default_comm_hook_state(
    process_group: dist.ProcessGroup,
) -> default_hooks.DefaultState:
    return default_hooks.DefaultState(process_group=process_group)