# torch/_inductor/graph.py -- skeleton recovered from the compiled bytecode cache of
# /var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/graph.py.
# Only symbol names, signatures and docstrings survive in the dump; the bodies below
# are reconstructed from those names, and bodies that cannot be reconstructed are
# elided with `...`.

import functools
import itertools
import logging
import operator
import os
import re
import sys
import time
from collections import defaultdict
from contextlib import contextmanager
from typing import (
    Any, Callable, DefaultDict, Dict, List, Optional, Set, Tuple, TYPE_CHECKING, Union,
)

import sympy

import torch
import torch._logging
import torch.fx
from torch._decomp import get_decompositions
from torch._dynamo.utils import defake, dynamo_timed
from torch._logging import LazyString, trace_structured
from torch._prims_common import make_channels_last_strides_for
from torch._subclasses.fake_tensor import FakeTensor
from torch.fx.experimental._backward_state import BackwardState
from torch.fx.experimental.sym_node import magic_methods, method_to_operator
from torch.fx.experimental.symbolic_shapes import (
    free_unbacked_symbols,
    has_free_symbols,
    resolve_unbacked_bindings,
    RuntimeAssert,
    ShapeEnv,
    SymTypes,
)
from torch.utils._mode_utils import no_dispatch

from . import config, ir
from .codegen.common import (
    DeviceOpOverrides,
    get_device_op_overrides,
    get_scheduling_for_device,
    get_wrapper_codegen_for_device,
    register_backend_for_device,
)
from .codegen.cpp_wrapper_cpu import CppWrapperCpu
from .codegen.cpp_wrapper_cuda import CppWrapperCuda
from .codegen.wrapper import WrapperCodeGen
from .exc import (
    CppWrapperCodeGenError,
    LoweringException,
    MissingOperatorWithDecomp,
    MissingOperatorWithoutDecomp,
)
from .ir import (
    Constant,
    FixedLayout,
    InputBuffer,
    Pointwise,
    Reduction,
    StorageBox,
    TensorBox,
    TorchBindObject,
)
from .lowering import (
    constrain_to_fx_strides,
    FALLBACK_ALLOW_LIST,
    fallback_handler,
    fallback_node_due_to_unsupported_type,
    layout_constraints,
    lowerings,
    make_fallback,
    needs_realized_inputs,
    unsupported_output_tensor,
)
from .sizevars import SizeVarAllocator
from .utils import (
    convert_shape_to_inductor,
    gather_origins,
    get_cloned_parameter_buffer_name,
    get_sympy_Expr_dtype,
    maybe_get_suppress_shape_guards_ctx,
    should_assume_input_aligned,
)
from .virtualized import NullHandler, V

if TYPE_CHECKING:
    from torch._higher_order_ops.effects import _EffectType

log = logging.getLogger(__name__)
perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints")
output_code_log = torch._logging.getArtifactLogger(__name__, "output_code")

aten = torch.ops.aten

_post_grad_graph_counter = itertools.count()

if config.is_fbcode():
    from torch._inductor.fb.utils import log_module_code
else:

    def log_module_code(*args, **kwargs):
        pass


def supported_dtype_of_cpp_wrapper(dtype, cuda):
    supported_dtype = {
        torch.float32,
        torch.float64,
        torch.int64,
        torch.int32,
        torch.int16,
        torch.int8,
        torch.uint8,
        torch.bool,
        torch.bfloat16,
        torch.complex32,
        torch.complex64,
        torch.complex128,
        torch.float16,
    }
    if cuda:
        supported_dtype.add(torch.float8_e4m3fn)
        supported_dtype.add(torch.float8_e5m2)
        supported_dtype.add(torch.float8_e4m3fnuz)
        supported_dtype.add(torch.float8_e5m2fnuz)

    return dtype in supported_dtype


def may_get_constant_buffer_dtype(constant_buffer):
    assert isinstance(
        constant_buffer, (sympy.Symbol, sympy.Expr, sympy.core.numbers.Integer)
    ), "get_constant_buffer_dtype only supports input of sympy.Symbol, sympy.Expr or sympy.core.numbers.Integer"
    if isinstance(constant_buffer, sympy.core.numbers.Integer):
        return torch.int64

    if isinstance(constant_buffer, sympy.Symbol):
        return get_sympy_Expr_dtype(constant_buffer)

    if constant_buffer.is_integer:
        return torch.int64
    elif constant_buffer.is_float:
        return torch.float32
    else:
        return None


def is_magic_method(op):
    magic_ops = {method_to_operator(m) for m in magic_methods}
    return op in magic_ops


def getattr_recursive(obj, target):
    # Walk a dotted fx target ("sub.linear.weight") one attribute at a time,
    # raising a descriptive error naming the missing hop if any is absent.
    target_atoms = target.split(".")
    attr_itr = obj
    for i, atom in enumerate(target_atoms):
        if not hasattr(attr_itr, atom):
            raise RuntimeError(
                f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}"
            )
        attr_itr = getattr(attr_itr, atom)
    return attr_itr

def mark_nodes_dislike_padding(g):
    """
    Nodes like convolution/convolution_backward want their inputs to be dense.
    If we pad their inputs, we incur extra calls to copy kernels!  On the other hand,
    padding usually helps reduction.

    The pass finds nodes that dislike padding. These are nodes that can be reached
    from a convolution/convolution_backward in the backward direction without
    going through a reduction.
    """
    if not config.comprehensive_padding:
        return
    ops_dislike_padding = {
        aten.convolution,
        aten.convolution_backward,
    }
    # padding is usually helpful for these reduction-style ops
    ops_like_padding = {
        aten.var_mean,
        aten.sum,
        aten.mean,
        aten.prod,
        aten.any,
        aten.amin,
        aten.amax,
        aten.min,
        aten.max,
        aten.argmin,
        aten.argmax,
        aten.scatter_reduce,
    }

    def _get_overload_packet(node):
        return (
            node.target._overloadpacket
            if node.op == "call_function" and hasattr(node.target, "_overloadpacket")
            else None
        )

    for cur in reversed(g.nodes):
        op = _get_overload_packet(cur)
        if not op:
            continue
        if op in ops_dislike_padding:
            cur.meta["dislike_padding"] = True

        if cur.meta.get("dislike_padding", False):
            # propagate dislike_padding to producers that are not reductions
            for prior in cur.all_input_nodes:
                prior_op = _get_overload_packet(prior)
                if not prior_op:
                    continue
                if prior_op not in ops_like_padding:
                    prior.meta["dislike_padding"] = True


class GraphLowering(torch.fx.Interpreter):
    graph_outputs: List[ir.IRNode]

    def symbolic_sizes_strides(self, ex: torch.Tensor):
        """
        Support dynamic shapes and dynamic strides by assigning variables
        to each dimension.  We duck-shape tensors, so if two tensors
        have the same size they get assigned the same symbolic variable.
        """
        ...

    def static_sizes_strides(self, ex: torch.Tensor):
        """
        Primarily used for weights.
        """
        ...

    def init_backend_registration(self):
        # Registers the CPU (CppScheduling), CUDA (CUDACombinedScheduling) and
        # XPU (TritonScheduling) codegen backends on first use.
        ...

    def __init__(
        self,
        gm: torch.fx.GraphModule,
        example_inputs: Optional[List[torch.Tensor]] = None,
        shape_env=None,
        graph_id=None,
        cpp_wrapper=False,
        aot_mode=False,
        user_visible_outputs=None,
        layout_opt=None,
        extern_node_serializer=None,
        is_inference=False,
        is_const_graph=False,
        const_output_index=None,
        const_code=None,
        const_module=None,
        name=None,
    ):
        ...

    @staticmethod
    def decide_layout_opt(gm, *, is_inference) -> bool:
        """
        Decide if we should enable layout optimization for this graph based on
        heuristics.
        """
        ...

    def qualify_name(self, name: str) -> str:
        """Prepend the given name with the graph name if any."""
        if self.name is not None:
            return f"{self.name}_{name}"
        return name

    def make_subgraph(
        self,
        gm: torch.fx.GraphModule,
        example_inputs: List[torch.Tensor],
        subgraph_name: str,
    ) -> "GraphLowering":
        """
        Make a subgraph of the current graph with all inherited
        parts, except the graph module (`gm`) and `example_inputs`.
        The subgraphs are lowered separately, but intended to be
        inlined in the parent graph's codegen. Hence the need
        for maintaining the same `shape_env` and other properties.
        The subgraph name is qualified by the parent graph's name.
        """
        ...

    def find_nodes_prefer_channels_last(self):
        """
        The rule to decide if a node prefers channels last is simple.
        1. if it's input/output of a convolution
        2. if one of its users prefers channels last

        We have rule 1 because cudnn runs a faster convolution kernel for channels last inputs;
        Rule 2 is also important. It makes sure that indirect inputs to convolution also prefer
        channels last.

        Consider the scenario: conv -> batch-norm -> relu -> conv
        Without rule 2, batch-norm output may use a contiguous layout. That will cause 2 extra copies:
        1. the output of batch-norm should be channels last initially since its input is a conv's output.
           Forcing the batch-norm's output to be contiguous results in the first copy
        2. The second conv's input is initially contiguous. This layout is propagated from the batch-norm's output.
           We need to convert it to channels last layout, which results in the second copy.
        With rule 2, we make sure all the tensors in the chain use channels last layout, so both copies
        can be saved.
        """
        ...

    def warn_fallback(self, name):
        if name not in self._warned_fallback:
            self._warned_fallback.add(name)
            perf_hint_log.info("Using FallbackKernel: %s", name)

    def add_device_info(self, device: torch.device):
        ...

    @property
    def fake_mode(self):
        return V.fake_mode

    def get_buffer(self, buffer_name: str):
        ...

    def get_dtype(self, buffer_name: str):
        ...

    def get_numel(self, buffer_name: str):
        ...

    def run(self, *args):
        return super().run(*args)

    def register_buffer(self, buffer: ir.Buffer, *, set_name: bool = False):
        ...

    def register_list(self, buffer_names: List[str]):
        ...

    def register_users_of(self, node_output):
        ...

    def mark_buffer_mutated(self, name: str):
        """
        When a buffer is mutated we need to make sure all the reads to
        the old version are realized before the mutation happens.
        """
        ...

    def get_original_value_of_constant(self, name: str):
        """
        In AOTI, module buffers may have been mutated during the tracing and compilation.
        Thus we need to read from previously stored original buffers, to make sure the
        generated model.so uses correct initial values.
        """
        ...

    def allocate_non_dup_const_name(self, name, data):
        ...

    def add_tensor_constant(self, data, name=None):
        ...

    def constant_name(self, name: str, device_override: Optional[torch.device]):
        """
        We AOT copy constants to the devices they are needed on.
        If device_override doesn't match the constant's device, then
        copy it and return a different name.
        """
        ...

    def placeholder(self, target: str, args, kwargs):
        ...

    def call_function(self, target, args, kwargs):
        ...

    @staticmethod
    def can_inline_constant(t: torch.Tensor) -> bool:
        """
        True if this is a small constant attr that will be inlined.
        """
        ...

    def get_attr(self, target, args, kwargs):
        ...

    def call_module(self, target, args, kwargs):
        ...

    def call_method(self, target, args, kwargs):
        ...

    def output(self, target, args, kwargs):
        ...

    def finalize(self):
        ...

    @contextmanager
    def set_current_node(self, node):
        ...

    def try_match_insignificant_strides(
        self,
        tensor,
        meta_strides_inp: Tuple[Union[int, torch.SymInt], ...],
    ) -> ir.TensorBox:
        """
        Tries to match the strides of the tensor to those in the meta_strides. Strides of insignificant
        dimensions - size 0 or 1 - will be updated.

        If there are real stride differences (NHWC vs NCHW) then the input will be returned.
        """
        ...

    def run_node(self, n: torch.fx.Node):
        ...

    def validate_can_generate_cpp_wrapper(self):
        ...

    def init_wrapper_code(self):
        ...

    def codegen_with_cpp_wrapper(self):
        """
        For CPU, the cpp wrapper codegen is done in one pass.
        For GPU, the cpp wrapper codegen is done in two steps: JIT-compile the model with python
        wrapper code and run it to generate autotuned kernel binaries in the first pass; and then
        generate cpp wrapper code and compile it to a dynamic library in the second pass.
        """
        ...

    def codegen(self):
        ...

    def codegen_subgraph(self, parent_graph):
        """
        This is a more compact version of the `codegen()` above
        where we codegen this graph as a subgraph of some parent
        graph. The parent graph is passed as an argument: the
        intention is to inline codegen of the subgraph in
        the parent graph's wrapper code (including the generated
        kernels). The wrapper code is not finalized (via `.generate()`
        call), as this will be done in the parent graph's `codegen()`.
        """
        ...

    def count_bytes(self):
        ...

    @dynamo_timed(phase_name="code_gen", fwd_only=False)
    def compile_to_module(self):
        ...

    def compile_to_fn(self):
        ...

    def get_output_names(self):
        return [
            node.get_name()
            for node in self.graph_outputs
            if not isinstance(node, ir.NoneAsConstantBuffer)
            and not isinstance(node, ir.ShapeAsConstantBuffer)
        ]

    def is_unspec_arg(self, name: str):
        # 0-dim CPU tensor inputs are treated as unspecialized scalars
        return (
            name in self.graph_inputs.keys()
            and self.graph_inputs[name].get_numel() == 1
            and self.graph_inputs[name].get_device().type == "cpu"
        )