U
    yhT                     @   s   d dl Z d dlZd dlZd dlmZ ddlmZ eejdsledejj	d< edejj	d< edejj	d< d d	l
mZmZmZ d
d Zdd ZG dd dejjZG dd dZdddZdS )    N)_pytree   )_dummy_typeZ_CudaStreamBase
_CUDAGraph_graph_pool_handle_cuda_isCurrentStreamCapturing)r   r   r   c                   C   s   t  S )zReturn True if CUDA graph capture is underway on the current CUDA stream, False otherwise.

    If a CUDA context does not exist on the current device, returns False without initializing the context.
    )r    r   r   C/var/www/html/venv/lib/python3.8/site-packages/torch/cuda/graphs.pyis_current_stream_capturing   s    r
   c                   C   s   t  S )zReturn an opaque token representing the id of a graph memory pool.

    See :ref:`Graph memory management<graph-memory-management>`.

    .. warning::
        This API is in beta and may change in future releases.
    )r   r   r   r   r	   graph_pool_handle!   s    r   c                       sv   e Zd ZdZ fddZd fdd	Z fdd	Z fd
dZ fddZ fddZ	 fddZ
 fddZ  ZS )	CUDAGraphzrWrapper around a CUDA graph.

    .. warning::
        This API is in beta and may change in future releases.
    c                    s   t  | S N)super__new__)cls	__class__r   r	   r   4   s    zCUDAGraph.__new__Nglobalc                    s   t  j||d dS )a  Begin capturing CUDA work on the current stream.

        Typically, you shouldn't call ``capture_begin`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_begin`` internally.

        Arguments:
            pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
                :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
                with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
            capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
                Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
                may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
                actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting
                unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_
        )poolcapture_error_modeN)r   capture_begin)selfr   r   r   r   r	   r   7   s    zCUDAGraph.capture_beginc                    s   t    dS )aG  End CUDA graph capture on the current stream.

        After ``capture_end``, ``replay`` may be called on this instance.

        Typically, you shouldn't call ``capture_end`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_end`` internally.
        N)r   capture_endr   r   r   r	   r   J   s    	zCUDAGraph.capture_endc                    s   t    dS )z,Replay the CUDA work captured by this graph.N)r   replayr   r   r   r	   r   U   s    zCUDAGraph.replayc                    s   t    dS )z1Delete the graph currently held by this instance.N)r   resetr   r   r   r	   r   Y   s    zCUDAGraph.resetc                    s
   t   S )zReturn an opaque token representing the id of this graph's memory pool.

        This id can optionally be passed to another graph's ``capture_begin``,
        which hints the other graph may share the same memory pool.
        )r   r   r   r   r   r	   r   ]   s    zCUDAGraph.poolc                    s
   t   S )z/Enable debugging mode for CUDAGraph.debug_dump.)r   enable_debug_moder   r   r   r	   r   e   s    zCUDAGraph.enable_debug_modec                    s   t  |S )z
        Arguments:
            debug_path (required): Path to dump the graph to.

        Calls a debugging function to dump the graph if the debugging is
        enabled via CUDAGraph.enable_debug_mode()
        )r   
debug_dump)r   Z
debug_pathr   r   r	   r   i   s    zCUDAGraph.debug_dump)Nr   )__name__
__module____qualname____doc__r   r   r   r   r   r   r   r   __classcell__r   r   r   r	   r   -   s   r   c                   @   sD   e Zd ZU dZdZejd ed< dedddZ	d	d
 Z
dd ZdS )grapha  Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph` object for later replay.

    See :ref:`CUDA Graphs <cuda-graph-semantics>` for a general introduction,
    detailed use, and constraints.

    Arguments:
        cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture.
        pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) hinting this graph's capture
            may share memory from the specified pool. See :ref:`Graph memory management<graph-memory-management>`.
        stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context.
            If not supplied, ``graph`` sets its own internal side stream as the current stream in the context.
        capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
            Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
            may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
            actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting
            unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_

    .. note::
        For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture
        used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture.

    .. warning::
        This API is in beta and may change in future releases.

    .. _cudaStreamCaptureMode:
        https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
    Nztorch.cuda.Streamdefault_capture_streamr   )r   c                 C   sr   | j jd krtj | j _|d kr&dn|f| _|d k	r:|n| j j| _| jd k	sRttj| j| _	|| _
|| _d S )Nr   )r   r$   torchcudaStreamr   Zcapture_streamAssertionErrorstream
stream_ctx
cuda_graphr   )r   r+   r   r)   r   r   r   r	   __init__   s    
zgraph.__init__c                 C   s@   t j  t  t j  | j  | jj	| j
d| ji d S )Nr   )r%   r&   synchronizegcZcollectZempty_cacher*   	__enter__r+   r   r   r   r   r   r   r	   r/      s    


zgraph.__enter__c                 C   s   | j   | j||| d S r   )r+   r   r*   __exit__)r   exc_type	exc_value	tracebackr   r   r	   r0      s    
zgraph.__exit__)NNr   )r   r   r    r!   r$   typingOptional__annotations__strr,   r/   r0   r   r   r   r	   r#   t   s   
   r#      Fc           &         s2  t  rt  rtdd}t| ts6d}| f} |f}g  t| |D ]\}}t|t jjrt	|j
dkrt	|jdkrt	|jdkstdtdd | D stdtj| } t| td	d |D sDtd
qDdd  D }	dd | D  fddtt	| D }
dd tt	| D }dd tt	| D }|dkrRt n|}t j  t jt j  t| ||
D ]n\}}}t|D ]T}t|| }t jjtdd |D tdd |D tdd |D d|d}q~~qW 5 Q R X t j  g }g }t| ||D ]V\}}}t jj||d || }W 5 Q R X t|\}}|t| || qg }g }tt|
t|t|tD ]\}}}}tdd |D }t jj||dF t jjtdd |D tdd |D tdd |D d|d}W 5 Q R X g }d}|D ]0} | jr@|||  |d7 }n
|d qt|}|| || q|   |   dd }!g }"t!| D ]\}#}|!||# ||# |# |	|# ||# |
|# ||# ||# ||# 	}$t|t jjrdd }%|%||j"|$|j#|_#|"| n
|"|$ q|r*|"d S t|"S ) a  Accept callables (functions or :class:`nn.Module<torch.nn.Module>`\ s) and returns graphed versions.

    Each graphed callable's forward pass runs its source callable's
    forward CUDA work as a CUDA graph inside a single autograd node.

    The graphed callable's forward pass also appends
    a backward node to the autograd graph. During backward, this node runs the
    callable's backward work as a CUDA graph.

    Therefore, each graphed callable should be a drop-in replacement for its source callable
    in an autograd-enabled training loop.

    See :ref:`Partial-network capture<partial-network-capture>` for detailed use and constraints.

    If you pass a tuple of several callables, their captures will use the same memory pool.
    See :ref:`Graph memory management<graph-memory-management>` for when this is appropriate.

    Arguments:
        callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph.
            See :ref:`Graph memory management<graph-memory-management>` for when passing a tuple of callables
            is appropriate.  If you pass a tuple of callables, their order in the tuple must be the same order
            they'll run in the live workload.
        sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable.
            If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors.
            If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors.
        num_warmup_iters (int): The number of warmup iterations. Currently, ``DataDistributedParallel`` needs
            11 iterations for warm up. Default: ``3``.
        allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs
            (and therefore their grad is always zero) is an error. Defaults to False.
        pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
            with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
    .. note::
        The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state
        that's expected for the corresponding real input in the training loop.

    .. warning::
        This API is in beta and may change in future releases.

    .. warning::
        ``sample_args`` for each callable must contain only Tensors. Other types are not allowed.

    .. warning::
        Returned callables do not support higher order differentiation (e.g., double backward).

    .. warning::
        In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters
        may be trainable. Buffers must have ``requires_grad=False``.

    .. warning::
        After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`,
        you may not add or remove any of that Module's parameters or buffers.

    .. warning::
        :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks
        registered on them at the time they are passed. However, registering hooks on modules *after* passing them
        through :func:`~torch.cuda.make_graphed_callables` is allowed.

    .. warning::
        When running a graphed callable, you must pass its arguments in the same order and format
        they appeared in that callable's ``sample_args``.

    .. warning::
        The automatic mixed precision is supported in :func:`~torch.cuda.make_graphed_callables` only with disabled
        caching. The context manager `torch.cuda.amp.autocast()` must have `cache_enabled=False`.
    z_make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`.FTr   zModules must not have hooks registered at the time they are passed. However, registering hooks on modules after passing them through make_graphed_callables is allowed.c                 s   s   | ]}|j d kV  qdS )FNrequires_grad.0br   r   r	   	<genexpr>  s     z)make_graphed_callables.<locals>.<genexpr>zIn any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have ``requires_grad=False``.c                 s   s   | ]}t |tjV  qd S r   )
isinstancer%   ZTensor)r<   argr   r   r	   r>   "  s     zfIn the beta API, sample_args for each callable must contain only Tensors. Other types are not allowed.c                 S   s   g | ]}t |qS r   )len)r<   argsr   r   r	   
<listcomp>)  s     z*make_graphed_callables.<locals>.<listcomp>c                 S   s*   g | ]"}t |tjjr"t| nd qS )r   )r?   r%   nnModuletuple
parameters)r<   cr   r   r	   rC   *  s   c                    s   g | ]} | |  qS r   r   r<   iZflatten_sample_argsZper_callable_module_paramsr   r	   rC   .  s   c                 S   s   g | ]}t j qS r   r%   r&   r   r<   _r   r   r	   rC   3  s     c                 S   s   g | ]}t j qS r   rL   rM   r   r   r	   rC   4  s     Nc                 s   s   | ]}|j r|V  qd S r   r9   r<   or   r   r	   r>   C  s      c                 s   s   | ]}|j r|V  qd S r   r9   rI   r   r   r	   r>   D  s      c                 s   s   | ]}|j rt|V  qd S r   r:   r%   Z
empty_likerO   r   r   r	   r>   E  s     )outputsinputsZgrad_outputsZonly_inputsZallow_unused)r   c                 s   s"   | ]}|j rt|nd V  qd S r   rQ   rO   r   r   r	   r>   h  s    c                 s   s   | ]}|j r|V  qd S r   r9   rO   r   r   r	   r>   n  s      c                 s   s   | ]}|j r|V  qd S r   r9   rI   r   r   r	   r>   o  s      c                 s   s   | ]}|d k	r|V  qd S r   r   rO   r   r   r	   r>   p  s         c	           
         s8   G 	fdddt jj  fdd}	|	S )Nc                       s@   e Zd ZefddZeejjj fddZ	dS )zOmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphedc                    s`   t D ].}|  ||  kr| ||  q   ttsNttdd D S )Nc                 s   s   | ]}|  V  qd S r   detachrO   r   r   r	   r>     s     zjmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forward.<locals>.<genexpr>)rangedata_ptrcopy_r   r?   rF   r(   )ctxrS   rJ   )	fwd_graphlen_user_argsstatic_input_surfacestatic_outputsr   r	   forward  s    zWmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forwardc                    sr   t |t kstt|D ]*\}}|d k	r| | kr|| q   tts`ttdd D S )Nc                 s   s"   | ]}|d k	r|  n|V  qd S r   rU   r;   r   r   r	   r>     s    zkmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backward.<locals>.<genexpr>)rA   r(   ziprX   rY   r   r?   rF   )rZ   Zgradsggrad)	bwd_graphstatic_grad_inputsstatic_grad_outputsr   r	   backward  s    zXmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backwardN)
r   r   r    staticmethodr_   r%   autogradfunctionZonce_differentiablerf   r   )rc   r[   r\   rd   re   r]   r^   r   r	   Graphed  s
   	rj   c                     s(   t j|  } jt|  }t |S r   )r   arg_tree_leavesapplyrF   Ztree_unflatten)	user_argsZflatten_user_argsout)rj   module_paramsoutput_unflatten_specr   r	   functionalized  s    
zVmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.functionalized)r%   rh   Function)
r[   rc   ro   r\   rp   r]   r^   re   rd   rq   r   )
rj   rc   r[   r\   ro   rp   rd   re   r]   r^   r	   make_graphed_autograd_function  s    $z>make_graphed_callables.<locals>.make_graphed_autograd_functionc                    s    fdd}|S )Nc                     s    j kr|  S |  S d S r   )training)rm   funcgraph_training_stategraphedorig_fwdr   r	   new_fwd  s    
zEmake_graphed_callables.<locals>.make_graphed_forward.<locals>.new_fwdr   )rv   rw   rx   ry   rz   r   ru   r	   make_graphed_forward  s    z4make_graphed_callables.<locals>.make_graphed_forward)$r%   Zis_autocast_enabledZis_autocast_cache_enabledRuntimeErrorr?   rF   r`   rD   rE   rA   Z_backward_hooksZ_forward_hooksZ_forward_pre_hooksr(   allbuffersr   rk   appendrW   r   r&   r-   r)   r'   Ztree_leavesrh   rb   r#   Ztree_flattenreversedr:   reverse	enumeratert   r_   )&Z	callablesZsample_argsZnum_warmup_itersZallow_unused_inputr   Zjust_one_callablerH   rB   Zflatten_argZper_callable_len_user_argsZ"per_callable_static_input_surfacesZ
fwd_graphsZ
bwd_graphsZmempoolrv   r]   rN   rR   Zgrad_inputsZper_callable_static_outputsZ"per_callable_output_unflatten_specr[   Zflatten_outputsspecZ per_callable_static_grad_outputsZper_callable_static_grad_inputsr^   rc   ro   re   rd   Zgrad_idxr@   rs   retrJ   rx   r{   r   rK   r	   make_graphed_callables   s    E



  
	


3r   )r8   FN)r.   r4   r%   Ztorch.utilsr   Z_utilsr   hasattrZ_C__dict__Ztorch._Cr   r   r   r
   r   r   r#   r   r   r   r   r	   <module>   s&   	GK     