import copy
import logging
import os
import pickle
import random
from contextlib import contextmanager
from functools import partial
from typing import Callable, Union

import sympy

import torch
import torch.fx as fx
import torch.nn as nn
import torch.utils._pytree as pytree
from torch import SymInt
from torch._decomp import get_decompositions
from torch.fx.experimental.symbolic_shapes import bind_symbols

from .aot_autograd import aot_function, aot_module, make_boxed_compiler
from .compile_utils import strip_overloads
from .partitioners import (
    default_partition,
    draw_graph,
    min_cut_rematerialization_partition,
)

log = logging.getLogger(__name__)


def _canonicalize(fx_g):
    # Rewrite aten._to_copy calls into aten.to so downstream compilers see a
    # single canonical conversion op.
    for node in fx_g.graph.find_nodes(
        op="call_function", target=torch.ops.aten._to_copy
    ):
        node.target = torch.ops.aten.to
    fx_g.recompile()
    return fx_g


@contextmanager
def _disable_jit_autocast():
    # TorchScript's autocast pass interferes with the graphs compiled here, so
    # turn it off for the duration of the compile and restore the old flag.
    old_jit_autocast_flag = torch._C._jit_set_autocast_mode(False)
    try:
        yield
    finally:
        torch._C._jit_set_autocast_mode(old_jit_autocast_flag)


@make_boxed_compiler
def ts_compile(fx_g: fx.GraphModule, inps) -> Callable:
    """
    Compiles the :attr:`fx_g` with the TorchScript compiler.

    .. warning::
        This API is experimental and likely to change.

    Args:
        fx_g(fx.GraphModule): The input Fx graph module to be compiled.

    Returns:
        Torch scripted model.
    """
    with _disable_jit_autocast():
        strip_overloads(fx_g)

        for node in fx_g.graph.find_nodes(
            op="call_function", target=torch.ops.aten._to_copy
        ):
            if len(node.args) == 1 and len(node.kwargs) == 1 and "dtype" in node.kwargs:
                node.target = torch.ops.aten.to

        for node in fx_g.graph.nodes:
            new_kwargs = {}
            for k, v in node.kwargs.items():
                if isinstance(v, torch.device):
                    v = v.type
                new_kwargs[k] = v
            node.kwargs = new_kwargs

        fx_g.graph.lint()
        fx_g.recompile()

        f = torch.jit.script(fx_g)

        torch._C._jit_pass_remove_mutation(f.graph)

        f = torch.jit.freeze(f.eval())
        f = torch.jit.optimize_for_inference(f)
        # Warm up the scripted module, but only with real tensors.
        if not any(isinstance(t, torch._subclasses.FakeTensor) for t in inps):
            f(*inps)
    return f


def _draw_graph_compile(fx_g, _, name, clear_meta=True):
    print(fx_g.code)
    draw_graph(fx_g, name, clear_meta=clear_meta)
    return fx_g


def draw_graph_compile(name):
    return make_boxed_compiler(partial(_draw_graph_compile, name=name))
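
# Illustrative sketch (fn is a placeholder function): ts_compile is a boxed
# compiler, so it can be handed directly to aot_function as the compiler for
# the generated forward (and optionally backward) graph.
#
#   from functorch.compile import aot_function, ts_compile
#
#   def fn(x, y):
#       return (x + y).sin()
#
#   compiled_fn = aot_function(fn, fw_compiler=ts_compile)
#   out = compiled_fn(torch.randn(4), torch.randn(4))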


@make_boxed_compiler
def nop(fx_g: fx.GraphModule, _) -> Callable:
    """
    Returns the :attr:`fx_g` Fx graph module as it is. This is a no-op compiler
    and can be used to check accuracy.

    .. warning::
        This API is experimental and likely to change.

    """
    return fx_g
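
# Illustrative sketch (fn and inps are placeholders for your own function and
# example inputs): routing through AOT Autograd with the no-op compiler keeps
# eager numerics, which makes it a baseline when bisecting accuracy issues.
#
#   from functorch.compile import aot_function, nop
#
#   def fn(x):
#       return torch.sin(x) + x
#
#   inps = [torch.randn(8)]
#   baseline = aot_function(fn, fw_compiler=nop)
#   torch.testing.assert_close(baseline(*inps), fn(*inps))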


class DebugInterpreter(fx.Interpreter):
    def run(self, *args):
        self.symbol_mapping = bind_symbols(self.module, *args)
        super().run(*args)

    def run_node(self, n):
        def subst_symint(ni):
            if not isinstance(ni, SymInt):
                return ni
            r = sympy.expand(ni.node.expr.xreplace(self.symbol_mapping))
            assert r.is_number, r
            return int(r)

        def subst_symint_tuple(nis):
            return tuple(subst_symint(ni) for ni in nis)

        def check_significant_strides(a, b):
            # Strides only matter for tensors with elements, and only along
            # dimensions of size greater than one.
            if subst_symint(a.numel()) > 0:
                for idx in range(a.ndim):
                    if (
                        subst_symint(a.stride(idx)) != b.stride(idx)
                        and subst_symint(a.size(idx)) > 1
                    ):
                        return False
            return True

        def check(nv, rv, desc):
            assert callable(desc)
            assert nv.dtype == rv.dtype, f"{desc()}: {nv.dtype} != {rv.dtype}"
            assert (
                subst_symint_tuple(nv.size()) == rv.size()
            ), f"{desc()}: {nv.size()} aka {subst_symint_tuple(nv.size())} != {rv.size()}"
            same_strides = check_significant_strides(nv, rv)
            assert (
                same_strides
            ), f"{desc()}: {nv.stride()} aka {subst_symint_tuple(nv.stride())} != {rv.stride()}"

        r = super().run_node(n)
        if "val" in n.meta:
            n_vals, n_spec = pytree.tree_flatten(n.meta["val"])
            r_vals, r_spec = pytree.tree_flatten(r)
            # The traced spec may disagree with the runtime spec (e.g., an op
            # recorded as returning a list that actually returns a single
            # tensor), so only the flattened values are compared here.
            assert len(n_vals) == len(r_vals), f"{len(n_vals)} != {len(r_vals)}"
            for i, nv, rv in zip(range(len(n_vals)), n_vals, r_vals):
                if not isinstance(rv, torch.Tensor):
                    continue
                check(nv, rv, lambda: f"output {i} where {self.symbol_mapping}")
        return r


@make_boxed_compiler
def debug_nop(fx_g: fx.GraphModule, _) -> Callable:
    """
    Returns a (slow) interpreter over the FX graph module that also checks
    various debugging properties (e.g., that tracing strides matched real
    strides).
    """
    return DebugInterpreter(fx_g).run


@make_boxed_compiler
def simple_ts_compile(fx_g, _):
    strip_overloads(fx_g)
    f = torch.jit.script(fx_g)
    f = torch.jit.freeze(f.eval())
    return f


def nnc_jit(f):
    return aot_function(f, simple_ts_compile)


aten = torch.ops.aten
default_decompositions = {
    aten.detach,
    aten.gelu_backward,
    aten.leaky_relu_backward,
    aten.sigmoid_backward,
    aten.threshold_backward,
    aten.hardtanh_backward,
    aten.hardsigmoid_backward,
    aten.hardswish_backward,
    aten.tanh_backward,
    aten.silu_backward,
    aten.elu_backward,
    aten.cudnn_batch_norm,
    aten.cudnn_batch_norm_backward,
    aten.masked_fill.Scalar,
    aten.masked_fill.Tensor,
    aten.elu,
    aten.leaky_relu,
    aten.hardtanh,
    aten.hardswish,
    aten.hardsigmoid,
    aten.conj_physical,
    aten.is_same_size,
}
default_decompositions = get_decompositions(default_decompositions)


@make_boxed_compiler
def print_compile(fx_g, _):
    print(fx_g.code)
    return fx_g
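
# Illustrative sketch (f is a placeholder function): nnc_jit wraps a function
# with AOT Autograd and compiles the generated graphs through TorchScript via
# simple_ts_compile above.
#
#   def f(x):
#       return torch.sin(x) + x
#
#   jit_f = nnc_jit(f)
#   jit_f(torch.randn(8))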


def memory_efficient_fusion(
    fn: Union[Callable, nn.Module],
    **kwargs,
):
    """
    Wrapper function over :func:`aot_function` and :func:`aot_module` to perform
    memory efficient fusion. It uses the
    :func:`min_cut_rematerialization_partition` partitioner to perform efficient
    recomputation. It uses NVFuser to compile the generated forward and backward
    graphs.

    .. warning::
        This API is experimental and likely to change.

    Args:
        fn (Union[Callable, nn.Module]): A Python function or a ``nn.Module``
            that takes one or more arguments. Must return one or more Tensors.
        **kwargs: Any other overrides you want to make to the settings

    Returns:
        Returns a ``Callable`` or ``nn.Module`` that retains the eager behavior
        of the original :attr:`fn`, but whose forward and backward graphs have
        gone through recomputation optimizations, and the graphs have been
        compiled with NVFuser.

    """
    config = {
        "fw_compiler": ts_compile,
        "bw_compiler": ts_compile,
        "partition_fn": min_cut_rematerialization_partition,
        "decompositions": default_decompositions,
    }
    config.update(kwargs)
    if isinstance(fn, torch.nn.Module):
        return aot_module(fn, **config)
    else:
        return aot_function(fn, **config)
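
# Illustrative sketch (assumes a CUDA device, since the TorchScript fusers
# target GPU graphs): wrapping a module keeps its eager call signature while
# swapping in the fused forward/backward graphs.
#
#   mod = nn.Sequential(nn.Linear(16, 16), nn.GELU()).cuda()
#   fused = memory_efficient_fusion(mod)
#   loss = fused(torch.randn(4, 16, device="cuda")).sum()
#   loss.backward()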


def debug_compile(fx_g, inps):
    fx_g.to_folder("foo")
    print(
        f"""
##############################################################
# To minimize FX graph, copy and paste the below and run it  #
##############################################################

import torch
import torch.fx as fx
from functorch.compile import minifier, check_nvfuser_subprocess, check_nvfuser_correctness_subprocess

inps = {[(i.shape, i.dtype) for i in inps]}
inps = [torch.ones(shape, dtype=dtype, device='cuda') for (shape, dtype) in inps]
from foo import FxModule
mod = FxModule().cuda()

with torch.jit.fuser("fuser2"):
  # check_nvfuser_subprocess can be replaced with check_nvfuser_correctness_subprocess
  minifier(fx.symbolic_trace(mod), inps, check_nvfuser_subprocess)
"""
    )
    from foo import FxModule

    FxModule().cuda()(*inps)

    return ts_compile(fx_g, inps)


graph_index = 0


def get_inputs(input_data_path):
    """
    Return a random input for the given inputs meta generated from _save_fx_default.
    """
    inputs = []
    with open(input_data_path, "rb") as f:
        inputs_meta = pickle.load(f)
        for meta in inputs_meta:
            if len(meta) == 1:
                # Scalar input: meta is (type,), e.g. (int,) or (float,).
                (elem_type,) = meta
                inp = elem_type(random.random())
            else:
                elem_type, shape, stride, dtype, device = meta
                if dtype in {
                    torch.int,
                    torch.int32,
                    torch.int64,
                    torch.bool,
                    torch.uint8,
                    int,
                    float,
                }:
                    inp = torch.randint(0, 1, shape, dtype=dtype, device=device)
                else:
                    inp = torch.rand(shape, dtype=dtype, device=device)
            inputs.append(inp)
    return inputs


def _save_fx_default(current_name, folder_name, dump_example_input, gm, example_inputs):
    """
    The forward, backward, and joint computation graph will be stored in
    {folder_name}/{current_name}/{current_name}_forward_{graph_index},
    {folder_name}/{current_name}/{current_name}_backward_{graph_index}, and
    {folder_name}/{current_name}/{current_name}_joint_{graph_index} respectively.
    The input shapes of the graphs will be stored in the .input files.
    These files can be loaded with pickle,
    and each is a list of format (type, shape, stride, dtype, device).
    In the case of type = int or float, it is just (type,).
    For joint graph input, it is a nested list [[],[]]
    where the two inner lists have the same format.
    If dump_example_input is True, example_inputs will be stored in a .pt file.
    Since each function might produce multiple graphs,
    the graph_index is used to distinguish different graphs.
    """
    from functorch.compile import aot_module_simplified

    def get_input_meta(args):
        input_meta = []
        if len(args) > 0 and isinstance(args[0], tuple):  # joint input
            input_meta += get_input_meta(args[0])
            input_meta += get_input_meta(args[1])
            return input_meta
        for arg in args:
            if type(arg) == int or type(arg) == float:
                input_meta.append((type(arg),))
            else:
                input_meta.append(
                    (type(arg), arg.shape, arg.stride(), arg.dtype, arg.device)
                )
        return input_meta

    def graph_saver_helper(gm_to_save, args, type_name):
        global graph_index
        if len(gm_to_save.graph.nodes) == 0:
            log.log(
                logging.WARNING,
                "No nodes in graph {%s}_{%s}_{%s}.",
                current_name,
                type_name,
                graph_index,
            )
            return

        gm = copy.deepcopy(gm_to_save)
        gm.graph.set_codegen(torch.fx.graph.CodeGen())  # remove codegen
        gm.recompile()

        input_meta = get_input_meta(args)

        base = f"{folder_name}/{current_name}/{current_name}_{type_name}_{graph_index}"
        os.makedirs(f"{folder_name}/{current_name}", exist_ok=True)
        gm.to_folder(base)
        with open(f"{base}/{current_name}_{type_name}_{graph_index}.input", "wb") as f:
            pickle.dump(input_meta, f)
        if dump_example_input:
            torch.save(args, f"{base}/{current_name}_{type_name}_{graph_index}.pt")

    def graph_saver_forward(gm, fw_args):
        graph_saver_helper(gm, fw_args, "forward")
        return gm

    def graph_saver_backward(gm, bw_args):
        graph_saver_helper(gm, bw_args, "backward")
        global graph_index
        graph_index += 1
        return gm

    def graph_saver_joint(gm, joint_args):
        graph_saver_helper(gm, joint_args, "joint")
        return default_partition(gm, joint_args)

    return aot_module_simplified(
        gm,
        example_inputs,
        fw_compiler=graph_saver_forward,
        bw_compiler=graph_saver_backward,
        partition_fn=graph_saver_joint,
        decompositions=default_decompositions,
    )


def graph_dumper_aot(current_name, folder_name, dump_example_input=False):
    """
    Dump the forward, backward, and joint computation graphs.
    Example Usage:
    save_fx_func = graph_dumper_aot(current_name, folder_name, dump_example_input=False)
    optimize_ctx = torchdynamo.optimize(
        save_fx_func
    )
    with torch.enable_grad():
        with optimize_ctx:
            result = forward_and_backward_pass(model, example_inputs)
    """
    global graph_index
    graph_index = 0
    return partial(_save_fx_default, current_name, folder_name, dump_example_input)