U
    yh                     @  sd  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+ d dlm,Z, d dl-Z-d dl.Z.d dl/Z.d dl0m1  m2Z3 d d	l4m5Z5 d d
l6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z?m@Z@mAZAmBZB d dlCmDZDmEZE d dlFmGZGmHZH ddlImJZJ ddlKmLZLmMZN eOePZQe&dZRee-jSe-jSf ZTdZUdZVeVeVd @ d kr.eVdks6tWddd ZXddddZYG dd  d e-jZZ[dBd#d$d%d&d'Z\e]dd(d)d*d+Z^d,d-d.d/d0Z_d1d2 Z`d3d4 Zad5d6d7d8d9Zbd:d:d:d;d<d=ZMd>d? Zcd@dAdBdCdDZddEdFdBdGdHZedIdJdKdLZfdMdN ZgdOdP ZhdCdRdSdTdUZidDdVdWdRd$dXdYdZZjdEdRdSd^d_Zkd`dRdadbdcZld`dddedfdgZmdWd)dhdiZndjdk Zodldm Zpe+dnZqe&dodpdqZrG drds dse#eeqerf Zsdtdud%dvdwZtdxdy Zudzd{ Zvd|d} ZwdFd~ddddZxdd ZyddRdddZzdd Z{ddWddddZ|dd Z}dRddddZ~ddddddZd`d(dddZd`d(dddZdd Zdd ZddddZg Zded< d`dddZdd ZejdGddZdd)ddZe]ddd ZG dd de!ZG dd dZG dd deZejdd ZG dd dZe]dd(d)ddZd(d)ddZdd(dÜddńZdRd(dƜddȄZddʜdd̄Zdd΄ ZddЄ Zdd҄ ZddԄ ZG ddք dփZdd؄ Zddڄ Zdd܄ Zddބ Zejdd ZdHddZdd Zdd Zdd Zdd Zdd ZdddddZejdd Zdd Ze]ddd Ze]ddd Zdd Zdd Zdd Zd(d)dd ZddddZdd ZG dd dejZd	d
 Zdd Zdd ZdWdWdddZddddZejG dd dZejdd ZdRdddZdRdSddZdRdSddZd d! ZdId"d#d$Zd%d& Zd'd(d)d*Zd'd+d,d-ZÐd.d/ ZdRdRd0d1d2ZdRd3d4d5ZdRdRdRd6d7d8Zddddɐd9dRdRdRd(dVd:d;d<d<d(d(d=d>d?ZȐd@dA ZdS (I      )annotationsN)datetime)StringIO)Path)AnyCallableDictGenericIterableList
NamedTupleOptionalProtocolSetTupleTypeVarUnion
ValuesView)Concatenate	ParamSpec)mock)get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)	ShapeProp)CeilDivCleanDivFloorDivModularIndexing)make_symbolSymT)bound_sympyValueRanges   )config)	cache_dirceildiv_T   @      zmust be power of 2c                 C  s   | t  d t  @ S )z/Round up to the nearest multiple of ALIGN_BYTESr$   )ALIGN_BYTES)nbytes r.   G/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/utils.py_alignI   s    r0   z
sympy.Exprvc                 C  s<   t | tjtjfr"ttt| jS t | tp:t	| t
t
kS )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddZMaxallmap_is_alignedargsaligngcdr,   r1   r.   r.   r/   r8   N   s    r8   c                   @  s$   e Zd ZdZdZdZedd ZdS )r:   z<Symbolically round up to the nearest multiple of ALIGN_BYTES)r$   Tc                 C  s,   t |ttjfrtt|S t|r(|S d S N)r3   intr4   Integerr0   r8   )clsvaluer.   r.   r/   eval[   s    z
align.evalN)__name__
__module____qualname____doc__nargs
is_integerclassmethodrA   r.   r.   r.   r/   r:   U   s
   r:      d   zCallable[[], Any]float)fnreturnc              	     s  |   t j  t jtdt jdd}t jjdd}t jjdd}|  tdD ]}|  |   qR|  t j  |	|d }t
dt|| }t
dt|| }	t|D ]
}|   qt jjt jjjgd,}
t|	D ]}|  |   qt j  W 5 Q R X td	 t|
 jd
dd tdd |
 D }t||	 dkr`tdt||	t||	  t fddt|D }|  | }td t|jdd tdd |D d |	 }td| |S )aR  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
    g    Acuda)dtypedeviceT)Zenable_timing   r$   )Z
activitiesz
raw eventsZself_cuda_time_total)Zsort_by	row_limitc                 S  s&   g | ]}|j tjkr|jd kr|qS )zContext Sync)device_typer   CUDAname.0eventr.   r.   r/   
<listcomp>   s    
z,do_bench_using_profiling.<locals>.<listcomp>r   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %sc                   s    g | ]\}}|  d kr|qS )r   r.   )rX   irY   Znum_event_per_groupr.   r/   rZ      s   zprofiling time breakdown)rS   c                 s  s   | ]}|j V  qd S r<   )Zdevice_time_totalrW   r.   r.   r/   	<genexpr>   s     z+do_bench_using_profiling.<locals>.<genexpr>g     @@zprofiling results: %s ms)torchrN   synchronizeemptyr=   EventrecordrangeZzero_Zelapsed_timemaxprofilerprofileZProfilerActivityrU   logdebugZkey_averagestabler   eventslenRuntimeError	enumerateZ_build_treesum)rL   ZwarmuprepcacheZstart_eventZ	end_event_Zestimate_msZn_warmupZn_repeatpr[   Zfiltered_eventsZactual_eventsresr.   r\   r/   do_bench_using_profilingc   sf    	




rt   boolrM   c               
   C  s   z8ddl m}  tjdd | d k	o6tttjdd dW S  tk
rN   Y dS  t	k
r } zdt
|ksntW Y dS d }~X Y nX d S )	Nr   )	roi_alignztorchvision::nmsZMetaZtorchvisionrw   Fztorchvision::nms does not exist)Ztorchvision.opsrw   r^   Z_CZ%_dispatch_has_kernel_for_dispatch_keyhasattrgetattropsImportErrorrl   strAssertionError)rw   er.   r.   r/   has_torchvision_roi_align   s    
 r   z"Union[Optional[torch.device], str]ztorch.device)rP   rM   c                 C  s`   | d krt djS t| tr(t | } | jdkr\| jd kr\t| j}t j| j|j	 dS | S )Ng        )cpumeta)index)
r^   tensorrP   r3   r|   typer   r   ZWorkerZcurrent_devicerP   Zdevice_interfacer.   r.   r/   decode_device   s    


r   c                 C  s   t tj| tdS Nr$   )	functoolsreduceoperatormulr4   r>   itr.   r.   r/   sympy_product   s    r   c                 C  s2   t | t |kstttdd t| |D S )Nc                 s  s   | ]\}}|| V  qd S r<   r.   )rX   abr.   r.   r/   r]      s     zsympy_dot.<locals>.<genexpr>)rk   r}   r4   expandrn   zip)Zseq1Zseq2r.   r.   r/   	sympy_dot   s    r   zIterable[_T]zValuesView[_T])r   rM   c                 C  s   dd | D   S )Nc                 S  s   i | ]}t ||qS r.   )idrX   xr.   r.   r/   
<dictcomp>   s      zunique.<locals>.<dictcomp>)valuesr   r.   r.   r/   unique   s    r   zUnion[int, sympy.Expr])numerdenomrM   c              	   C  sr   t | tjst |tjr.tt| t|S t | trBt |tsht|  dt|  d| dt| t| |S )Nz: , )	r3   r4   Exprr   sympifyr=   r}   r   runtime_ceildiv)r   r   r.   r.   r/   r'      s      r'   c                 C  s   | d krdS t | dd }dddddddd	d
ddddddddddd}t| D ]}|||< qVt| t rr| S d||  S )Nz*i8.rR   i1Zfp8e4nvZfp8e5Zfp8e4b15Z
fp8e4b15x4Zfp16Zbf16Zfp32Zfp64i8Zi16Zi32Zi64u8u16u32Zu64)ru   Z
float8e4nvZfloat8e5Zfloat8e4b15Zfloat8e4b15x4Zfloat8_e4m3fnZfloat8_e5m2float16bfloat16float32float64Zint8Zint16int32int64Zuint8Zuint16Zuint32Zuint64*)r|   splitlistr   r3   )keyZ	dtype_strZtysr2   r.   r.   r/   _type_of   s4    
r   z"Iterable[Union[int, torch.SymInt]]zList[sympy.Expr])lstrM   c                 C  s   dd | D S )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    c                 S  s*   g | ]"}t |tjr|jjnt|qS r.   )r3   r^   SymIntnodeexprr4   r>   rX   r[   r.   r.   r/   rZ     s    z-convert_shape_to_inductor.<locals>.<listcomp>r.   r   r.   r.   r/   convert_shape_to_inductor  s    r   z Iterable[Union[int, sympy.Expr]]zList[Union[int, torch.SymInt]]c                   s   ddl m   fdd| D S )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    r$   Vc                   sB   g | ]:}t |tr|n&t |tjr*t|n jjjj|d dqS )N)hint)r3   r=   r4   r>   graphsizevars	shape_envZcreate_symintnoder   r   r.   r/   rZ   $  s   

z+convert_shape_to_symint.<locals>.<listcomp>)virtualizedr   r   r.   r   r/   convert_shape_to_symint  s    
r   ztorch._ops.OpOverloadopc                 C  s(   t | tjjsttdd | jjD S )z-
    Does this op overload have aliasing
    c                 s  s   | ]}|j d k	V  qd S r<   )Z
alias_inforX   r   r.   r.   r/   r]   3  s     zis_view.<locals>.<genexpr>)r3   r^   _ops
OpOverloadr}   any_schema	argumentsr   r.   r.   r/   is_view.  s    r   c                 C  sh   | j dksdS t| jtjjs.| jtjks.dS | jtjksDt| jrXt	dd | j
D S tjj| jjkS )Ncall_functionFc                 s  s   | ]}t |V  qd S r<   )is_pointwise_use)rX   ur.   r.   r/   r]   @  s     z#is_pointwise_use.<locals>.<genexpr>)r   r3   targetr^   r   r   r   getitemr   r6   usersTagZ	pointwisetags)Zuser.   r.   r/   r   6  s    

r   c           
      C  s   t j }g }g }t|D ]@\}}t|t jrP||d|  || q|| qtdd |	 D svt
|| t||}t| jjdkrt| jjd jdkr|f}|| t ji |}	|	|fS )Nargc                 s  s   | ]}t |tj V  qd S r<   r3   r^   Tensorr   r.   r.   r/   r]   O  s     z$gen_gm_and_inputs.<locals>.<genexpr>r$   r   r   )r^   ZfxZGraphrm   r3   r   appendplaceholderr6   r   r}   r   tuplerk   r   Zreturnsr|   r   outputZGraphModule)
r   r9   kwargsgZg_argsZa_argsnr   r   gmr.   r.   r/   gen_gm_and_inputsE  s$    

r   rN   r|   rP   c                 C  s(   | dkrd S t | }| r$|  d S Nr   )r   Zis_availabler_   r   r.   r.   r/   r_   \  s
    r_   zCallable[..., Any]r=   )modeltimesrP   rM   c                 C  sT   t | td t }t|D ]}| | }t | q"t }|d k	sLt|| S )Ni9  )r_   r^   Zmanual_seedtimeperf_counterrc   r}   )r   Zexample_inputsr   rP   t0rq   resultt1r.   r.   r/   timedd  s    

r   r.   
         ?c                   sD   t  fddt|D }t | }t|| d |S )Nc                   s   g | ]}t  qS r.   )r   )rX   rq   r9   rP   rL   r   r.   r/   rZ   v  s     z%print_performance.<locals>.<listcomp>z.6f)r^   r   rc   Zmedianprint)rL   r9   r   repeatZbaselinerP   ZtimingsZtookr.   r   r/   print_performances  s    "r   r   objmethodc                   s$   t | |  t| | fdd dS )zKReplace obj.method() with a new method that returns a precomputed constant.c                     s    S r<   r.   r.   r   r.   r/   <lambda>      z#precompute_method.<locals>.<lambda>N)ry   setattrr   r.   r   r/   precompute_method|  s    r   z	List[str])r   methodsc                 C  s   |D ]}t | | qdS )zFReplace methods with new methods that returns a precomputed constants.N)r   )r   r   r   r.   r.   r/   precompute_methods  s    r   c                 C  s   t | |kt | |k  S r<   )r=   )r   r   r.   r.   r/   cmp  s    r   c                 C  s*   t | dkr"t| | d g| S | S d S )Nr$   r   )rk   r   )r   sizer.   r.   r/   pad_listlike  s    r   c                 C  s$   t | dkrg S dd }t| |dS )Nr   c                 S  s   t | tr| S |  S d S r<   )r3   r|   get_name)elemr.   r.   r/   	sort_func  s    
ztuple_sorted.<locals>.sort_funcr   )rk   sorted)r   r   r.   r.   r/   tuple_sorted  s    r   PRVT)	covariantc                   @  s0   e Zd ZeddddZddddd	d
ZdS )CachedMethodNonerv   c                 C  s   d S r<   r.   selfr.   r.   r/   clear_cache  s    zCachedMethod.clear_cachezP.argszP.kwargsr  )r9   r   rM   c                 O  s   d S r<   r.   r  r9   r   r.   r.   r/   __call__  s    zCachedMethod.__call__N)rB   rC   rD   staticmethodr  r	  r.   r.   r.   r/   r    s   r  z!Callable[Concatenate[Any, P], RV]zCachedMethod[P, RV]c                   s<   d j  dt  fdd}fdd}||_|S )N___cachec                   s$   t | st|  |  t| S r<   )rx   r   ry   r  rL   r   r.   r/   wrapper  s    
zcache_on_self.<locals>.wrapperc                   s   t |  rt|   d S r<   )rx   delattrr  r   r.   r/   r    s    
z"cache_on_self.<locals>.clear_cache)rB   r   wrapsr  )rL   r  r  r.   r  r/   cache_on_self  s    r  c                 C  sN   ddl m} t| tr2ttjdd | D t S t| |j	rD| j
S t S d S )Nr$   irc                 S  s$   g | ]}t |d r|jr|jjqS )r   )rx   r   origins)rX   r   r.   r.   r/   rZ     s   
 z%aggregate_origins.<locals>.<listcomp>) r  r3   r   r   r   r   or_setZExternKernelr  )node_scheduler  r.   r.   r/   aggregate_origins  s    
	r  c                 C  s   t | }|dkr,dd |D }tt|}n|dkrg }|D ]T}|jdkr<d|jkr<|jd d }t|d tr||d  q<||d j q<tt|}n|d	krd
d |D }nt	|}d
dg| S )Noriginal_atenc                 S  s<   g | ]4}|j d krd|jkr|jd dk	r|jd jjqS )r   r  N)r   r   _overloadpacketrB   rX   originr.   r.   r/   rZ     s
   

z)get_fused_kernel_name.<locals>.<listcomp>r^   r   Zsource_fn_stackrR   r$   Zinductor_nodec                 S  s   g | ]}|j d kr|jqS r   )r   rV   r  r.   r.   r/   rZ     s    
 rq   Zfused)r  r   r  r   r   r3   r|   r   rB   NotImplementedErrorjoin)r  Zdescriptive_namesall_originssourcesr  Z	source_fnr.   r.   r/   get_fused_kernel_name  s,    r#  c                 C  s  t | }dd |D }tt}tt}|D ]h}d|jkrj|jd d k	rjt|jd j}|| |j d|jkr.|jd d d }|| |j q.|j	 dd
t|  dd
t|  d	}g }	t| D ].\}
}|	|j	 d
|
 dd
t|  q|d
|	fS )Nc                 S  s   g | ]}|j d kr|qS r  r   r  r.   r.   r/   rZ     s     
 z'get_kernel_metadata.<locals>.<listcomp>r  Z	from_noder   z Source Nodes: [r   z], Original ATen: [] z => 
)r  collectionsdefaultdictr   r   r|   r  r   rV   commentr   r   keysitems)r  r  r!  Zinductor_nodesZfrom_node_dictZoriginal_aten_dictr   r   metadataZdetailed_metadataZoriginal_nodenodesr.   r.   r/   get_kernel_metadata  s&    


2r.  zIterable[torch.fx.Node]zSet[torch.fx.Node])initial_queuerM   c                 C  sX   t | } t| }| rT|  }|jD ].}|r4||r4q"||kr"|| | | q"q|S )zJReturns the set of nodes whose values depend on those within initial_queue)r   r  popr   addr   )r/  Zskip_filterZdominated_setr   userr.   r.   r/   dominated_nodes  s    

r3  c                   s\   dd l }ddlm   fddfdd| D }fdd| D }t|j|| S )	Nr   r$   r  c                   sD   t |  jr| jS t |  jr,| jS t |  joBt |  jS r<   )r3   	TensorBoxdata
StorageBoxIRNodeZ	Pointwise)r   r  is_unrealized_noder.   r/   r9  (  s
    

z*gather_origins.<locals>.is_unrealized_nodec                   s   g | ]} |r|j qS r.   r  )rX   valr9  r.   r/   rZ   /  s      z"gather_origins.<locals>.<listcomp>c                   s   g | ]} |r|j qS r.   r:  )rX   r   r<  r.   r/   rZ   0  s      )	itertoolsr  r  r   r  chain)r9   r   r=  Zkwarg_originsZarg_originsr.   r8  r/   gather_origins#  s    r?  )r   rM   c                 C  s   t | tjr| jS t | tjr0dtt| jS t | tj	rNdtt| jS t | t
ttfr| jj ddtt| j dS t| S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    z + z * (r   ))r3   r4   SymbolrV   r5   r   r7   	sympy_strr9   ZMulr   r   r   funcrB   r|   )r   r.   r.   r/   rC  4  s    "rC  c                 C  sB   ddl m} tjr6t|jdd  }r6|jdkr6t| S t	 S d S )Nr$   r   Zcurrent_nodeZ
index_expr)
r   r   r%   Zcompute_all_boundsry   interpreterr   r"   r#   unknown)r   r   Zfx_noder.   r.   r/   get_bounds_index_exprF  s    rG  r!   zsympy.Symbol)prefixidxrM   c                 C  s   | t jkstt| |dddS )9
    Used to generate an integer-nonnegative symbol.
    TintegerZnonnegative)r!   ZSIZEr}   r    )rH  rI  r.   r.   r/   sympy_index_symbol_with_prefixT  s    rM  c                 C  s   | s
t jot jS r<   )r%   Zdebug_index_assertsZassert_indirect_indexing)checkr.   r.   r/   generate_assert`  s    rO  )rV   rM   c                 C  s    | d dkst tj| dddS )rJ  r   sTrK  )r}   r4   rB  rV   r.   r.   r/   sympy_index_symbold  s    rR  zDict[sympy.Expr, Any])r   replacementsrM   c                   s*   dd  t |  fdd| D S )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    c                 S  s6   t | tjstt |tr.tj|| j| jdS |S d S )NrK  )r3   r4   r   r}   r|   rB  rG   Zis_nonnegative)Zreplacedreplacementr.   r.   r/   	to_symbolv  s    
zsympy_subs.<locals>.to_symbolc                   s   i | ]\}}| ||qS r.   r.   )rX   kr2   rU  r.   r/   r     s      zsympy_subs.<locals>.<dictcomp>)r4   r   Zxreplacer+  )r   rS  r.   rW  r/   
sympy_subsp  s    
rX  )r   rM   c                 C  s:   t | tjp8t | tjo8tdd t|  |  D S )Nc                 s  s   | ]}t |V  qd S r<   is_symbolicr   r.   r.   r/   r]     s     zis_symbolic.<locals>.<genexpr>)	r3   r^   r   r   r   r=  r>  r   stride)r   r.   r.   r/   rZ    s     rZ  )r9   rM   c                  G  s   t dd | D S )Nc                 s  s   | ]}t |V  qd S r<   rY  r   r.   r.   r/   r]     s     z"any_is_symbolic.<locals>.<genexpr>r   )r9   r.   r.   r/   any_is_symbolic  s    r]  c                 C  s   ddl m} ddddddd	d
dh	}t rJ|dddddddddddh | jjD ]>}t|j|krl|  S |j	
d }d k	rR||rR|  S qRd S )Nr   )free_unbacked_symbolsz,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzaten.multinomial.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultZrun_and_save_rng_stateZrun_with_rng_statezaten._local_scalar_densezaten._assert_scalarzaten._unsafe_index_put.defaultzaten.index_put.defaultzaten.index_put_.defaultzaten.scatter.srczaten.scatter.reducezaten.scatter.value_reducezaten.scatter_add_zaten.scatter_add.defaultzaten.scatter_reduce.twozaten.scatter_reduce_.twozaten.scatter_reduce.two_outr;  )Z%torch.fx.experimental.symbolic_shapesr^  r^   $are_deterministic_algorithms_enabledupdater   r-  r|   r   r   get)r   r^  Zforbidden_setr   r;  r.   r.   r/   %get_first_incompatible_cudagraph_node  s@    
rb  c                 C  s   t | d k	S r<   )rb  r   r.   r.   r/   has_incompatible_cudagraph_ops  s    rd  ztorch.fx.GraphModulerc  c                 C  s&   t tt| jj}|jdks"t|S )z$Get the output node from an FX graphr   )nextiterreversedr   r-  r   r}   )r   Z	last_noder.   r.   r/   output_node  s    rh  z	List[Any]_registered_cachesr   c                 C  s0   t | drt| js"t|  dt|  | S )zq
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_inductor_cache().
    cache_clearz# does not have a cache_clear method)rx   callablerk  AttributeErrorri  r   rj  r.   r.   r/   clear_on_fresh_inductor_cache  s    
rn  c                  C  s   t D ]} |   qdS )z&
    Clear all registered caches.
    N)ri  rk  rj  r.   r.   r/   clear_inductor_caches  s    ro  c              
   #  s   t   t }zztjtjd|i tj	|d tjtjd iX dV  t
| trt| dksptdtj rt }|  fdd|D  W 5 Q R X W 5 Q R X t| W n" tk
r   td	|  Y nX W 5 t   X dS )
z
    Contextmanager that provides a clean tmp cachedir for inductor.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    TORCHINDUCTOR_CACHE_DIRZtritonZTRITON_CACHE_DIRNr   z!expected empty cache_entries dictc              	     s,   i | ]$}d |kr|t jt j |qS ).lock)ospathgetsizer   )rX   fZtriton_cache_dirr.   r/   r     s    z(fresh_inductor_cache.<locals>.<dictcomp>z(on error, temporary cache dir kept at %s)ro  tempfilemkdtempr   patchdictrr  environrs  r   r3   rk   r}   existslistdirr`  shutilrmtree	Exceptionrg   warning)Zcache_entriesZinductor_cache_dirfilesr.   rv  r/   fresh_inductor_cache  s0     


r  z	List[int]c                 C  s(   | j }tt| }ttt||ddS )NT)r   reverse)__getitem__rc   rk   r   rg  r   )seqgetterZa_rr.   r.   r/   argsort  s    r  c                 C  s   t jd| d S )Nr.   rO   )r^   r`   Zelement_sizer  r.   r.   r/   get_dtype_size
  s    r  c                   @  s   e Zd ZU ded< dS )LineContextr   contextN)rB   rC   rD   __annotations__r.   r.   r.   r/   r    s   
r  c                   @  s   e Zd ZdZd-ddZddddZd	dd
dZd	dddZdd Zdd Z	dd Z
dd Zdd Zdd Zd.ddZd/ddZd0dd Zd1d"d#Zd$d d%d&d'Zd(d) Zd*d+ Zd,S )2IndentedBuffer   r   c                 C  s   g | _ || _d S r<   )_lines_indent)r  initial_indentr.   r.   r/   __init__  s    zIndentedBuffer.__init__z)tuple[str, list[tuple[int, LineContext]]]rv   c                 C  s   t  }d}g }| jD ]p}t|tr4| }|d krPqnt|trP|||jf qt|ts^t|	| |	d |d|
d 7 }q| |fS )Nr$   r&  )r   r  r3   DeferredLineBaser  r   r  r|   r}   writecountgetvalue)r  bufrr   Zlinemapliner.   r.   r/   getvaluewithlinemap  s     




z"IndentedBuffer.getvaluewithlinemapr|   c                 C  s   |   \}}|S r<   )r  )r  r2   rq   r.   r.   r/   r  ,  s    zIndentedBuffer.getvaluec                 C  s   t  }| jD ]l}t|tr,| }|d kr8qnt|tr8qt|tsFt|drd||d d  q|| |d q|	 S )N\rR   r&  )
r   r  r3   r  r  r|   r}   endswithr  r  )r  r  r  r.   r.   r/   getrawvalue0  s    




zIndentedBuffer.getrawvaluec                 C  s   | j   d S r<   )r  clearr  r.   r.   r/   r  B  s    zIndentedBuffer.clearc                 C  s
   t | jS r<   )ru   r  r  r.   r.   r/   __bool__E  s    zIndentedBuffer.__bool__c                 C  s   d| j | j  S )Nr%  )r  tabwidthr  r.   r.   r/   rH  H  s    zIndentedBuffer.prefixc                 C  s   |  d d S )Nr&  	writeliner  r.   r.   r/   newlineK  s    zIndentedBuffer.newlinec                 C  sl   t |tr| j| nPt |tr:| j||   n.| r\| j|   |  n| jd d S Nr  )r3   r  r  r   r  with_prefixrH  stripr  r  r.   r.   r/   r  N  s    

zIndentedBuffer.writelinec                 C  s   |D ]}|  | qd S r<   r  )r  linesr  r.   r.   r/   
writelinesX  s    zIndentedBuffer.writelinesr$   c                   s   t j fdd}| S )Nc                	   3  s.    j  7  _ z
d V  W 5  j  8  _ X d S r<   r  r.   offsetr  r.   r/   ctx]  s    
z"IndentedBuffer.indent.<locals>.ctx)
contextlibcontextmanager)r  r  r  r.   r  r/   indent\  s    zIndentedBuffer.indentc                 C  s   |  j |7  _ d S r<   r  r  r  r.   r.   r/   	do_indentg  s    zIndentedBuffer.do_indentc                 C  s   |  j |8  _ d S r<   r  r  r.   r.   r/   do_unindentj  s    zIndentedBuffer.do_unindentFc                 C  s   t |trtd}|jD ],}t |ts|rt|t|t|  }qt	|rTd}|jD ]4}t |trv| j
| qZt| |t|d   qZn@t|}|r| }|sd S | }|dD ]}| | qd S )Ninfr   r&  )r3   r  rK   r  r  minrk   lstripmathisinfr   r  r=   textwrapdedentrstripr   )r  Z
other_coder  r  r  r.   r.   r/   splicem  s&    





zIndentedBuffer.splicezCallable[[Any], Any])rD  rM   c                   s&   t | jd} fdd| jD |_|S )Nr  c                   s   g | ]} |qS r.   r.   )rX   r  rD  r.   r/   rZ     s     z&IndentedBuffer.map.<locals>.<listcomp>)r  r  r  )r  rD  rs   r.   r  r/   r7     s    zIndentedBuffer.mapc                 C  s   t |  d|   dS )Nr@  rA  )r   r  r  r.   r.   r/   __repr__  s    zIndentedBuffer.__repr__c                 C  s8   | j |j kstt| j d}|| j ||j |S )Nr  )r  r}   r  r  r  )r  otherrs   r.   r.   r/   __add__  s
    zIndentedBuffer.__add__N)r   )r$   )r$   )r$   )F)rB   rC   rD   r  r  r  r  r  r  r  rH  r  r  r  r  r  r  r  r7   r  r  r.   r.   r.   r/   r    s$   





r  c                      s$   e Zd Z fddZdd Z  ZS )FakeIndentedBufferc                   s   t    d S r<   )superr  r  	__class__r.   r/   r    s    zFakeIndentedBuffer.__init__c                 C  s(   |dkrt | |S td| dd S )Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__rl   )r  rV   r.   r.   r/   r    s
    
z#FakeIndentedBuffer.__getattribute__)rB   rC   rD   r  r  __classcell__r.   r.   r  r/   r    s   r  c                 c  s   z
d V  W 5 | t _|t _X d S r<   )sysstdoutstderr)Zinitial_stdoutZinitial_stderrr.   r.   r/   restore_stdout_stderr  s    
r  c                   @  s^   e Zd ZdZdd ZddddZdd d	d
dZdd Zdd Zdd Z	dd Z
dd ZdS )r  z.A line that can be 'unwritten' at a later timec                 C  s   |  sd}|| _d S r  )r  r  r  r.   r.   r/   r    s    zDeferredLineBase.__init__zOptional[str]rv   c                 C  s   t dS )zJReturns either self.line or None to indicate the line has been 'unwritten'Nr  r  r.   r.   r/   r	    s    zDeferredLineBase.__call__r|   )r  rM   c                 C  s   t dS )z3Returns a new deferred line with the same conditionNr  r  r.   r.   r/   	_new_line  s    zDeferredLineBase._new_linec                 C  s   |  | | j S r<   r  r  )r  rH  r.   r.   r/   r    s    zDeferredLineBase.with_prefixc                 C  s   |  | j S r<   )r  r  r  r  r.   r.   r/   r    s    zDeferredLineBase.lstripc                 C  s   |  | j| S r<   r  )r  r   r.   r.   r/   r    s    zDeferredLineBase.__getitem__c                 C  s
   t | jS r<   )ru   r  r  r.   r.   r/   r    s    zDeferredLineBase.__bool__c                 C  s
   t | jS r<   )rk   r  r  r.   r.   r/   __len__  s    zDeferredLineBase.__len__N)rB   rC   rD   rE   r  r	  r  r  r  r  r  r  r.   r.   r.   r/   r    s   r  c                 C  s6   d}t j| j}||k r2tjd||dd dS dS )ND   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extraFT)r^   rN   get_device_propertiesZmulti_processor_countrg   r  )r   r  r  r.   r.   r/   
is_big_gpu  s    r  c                   C  s   t jpt jpt jS r<   )r%   Zmax_autotuneZmax_autotune_gemmZsearch_autotune_cacher.   r.   r.   r/   use_max_autotune  s    r  zList[torch.dtype])allowed_layout_dtypesrM   c                 C  s,   t  o*| jjdko*| j|ko*t| jjp(dS )NrN   r   )r  rP   r   rO   r  r   )layoutr  r.   r.   r/   _use_template_for_cuda  s    
r  )backendrM   c                 C  s"   |   dd tj  dD kS )Nc                 S  s   g | ]}|  qS r.   )r  r   r.   r.   r/   rZ     s    z)_use_autotune_backend.<locals>.<listcomp>,)upperr%   Zmax_autotune_gemm_backendsr   )r  r.   r.   r/   _use_autotune_backend  s    r  F)enable_int32c                C  s:   t jt jt jg}|r(t jt jt jt jg}t| |o8tdS )NZTRITON)r^   r   r   r   r   r  r  )r  r  layout_dtypesr.   r.   r/   use_triton_template  s    r  c           	      C  s   ddl m} |jjj|| | dd}|dks:|tjjk r>dS ddlm	} t
jjrVdS t
jt
jt
jt
jg}t| |oztd}|r| std	 dS |S )
Nr$   r   rR   )fallbackr   F)try_import_cutlassZCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir is set correctly. Skipping CUTLASS backend for now.)r   r   r   r   Z	size_hintr%   rN   Zcutlass_backend_min_gemm_sizeZcodegen.cuda.cutlass_utilsr  r^   versionZhipr   r   r   r   r  r  rg   r  )	r  mr   rV  r   Z	gemm_sizer  r  rs   r.   r.   r/   use_cutlass_template  s$    r  c                 C  s   t  o| jjdkS r   )r  rP   r   )r  r.   r.   r/   _use_template_for_cpu  s    r  c                 C  s   ddl m} ddlm} ddlm} t| r4tds8dS tj	j
sDdS tjg}|||\}}}	} }}t||	frrdS t||jr| }|d|||	| jt d}
| j|ko|
d k	o||
jd  d	ko| d
 dkot||jo| S )Nr$   r  )create_micro_gemm)mm_argsCPPF
micro_gemm)Znum_threadsr   rR   )r  r  Zcodegen.cpp_micro_gemmr  Zkernel.mm_commonr  r  r  r%   cppZweight_prepackr^   r   has_free_symbolsr3   BaseViewZunwrap_viewrO   parallel_num_threadsZregister_blocking
get_strider6  Zis_module_buffer)r  Zmat1Zmat2r  r  r  r  r  r   rV  r  r.   r.   r/   use_cpp_packed_gemm_template  s@         

r  c                   C  s   t   ptdS )NZATEN)r  r  r.   r.   r.   r/   use_aten_gemm_kernels7  s    r  c                   @  s8   e Zd ZU edZded< dd Zdd Zdd	 Z	d
S )DebugDirManagerr   r|   prev_debug_namec                 C  s   t tj| _d S r<   )re  r  counterr   r  r.   r.   r/   r  ?  s    zDebugDirManager.__init__c                 C  s0   t jjj| _| j d| j | _| jt jj_d S )NZ_tmp_)r^   _dynamor%   debug_dir_rootr  r   new_namer  r.   r.   r/   	__enter__B  s    zDebugDirManager.__enter__c                 G  s   t | j | jtjj_d S r<   )r~  r  r  r  r^   r  r%   r  )r  r9   r.   r.   r/   __exit__G  s    zDebugDirManager.__exit__N)
rB   rC   rD   r=  r  r  r  r  r  r  r.   r.   r.   r/   r  ;  s
   

r  c              
     sv   ddl m} |j g  fdd}tddi6 tj|d| tj	  | ||}W 5 Q R X W 5 Q R X |fS )Nr$   GraphLoweringc              	     s0    | }t |j}|  W 5 Q R X |S r<   )open__file__r   read)r  modru  compile_to_modulesource_codesr.   r/   patched_compile_to_moduleR  s    z3run_and_get_code.<locals>.patched_compile_to_modulefx_graph_cacheFr  )
r   r  r  r%   ry  r   r  r^   r  reset)rL   r9   r   r  r  r   r.   r  r/   run_and_get_codeL  s      
r  c              
     sp   ddl m} g  dd fdd}tddi6 tj|d	| tj  | ||}W 5 Q R X W 5 Q R X  S )
zLGet the inductor-generated code, but skip any actual compilation or running.r$   r  r  r  c                   s8   G dd d}| j r|  n|  \}} | | S )Nc                   @  s    e Zd ZdZdd Zdd ZdS )z@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulec                 S  s   d S r<   r.   r  r.   r.   r/   r  l  s    zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__c                 _  s   d S r<   r.   r  r.   r.   r/   callo  s    zEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.callN)rB   rC   rD   rE   r  r  r.   r.   r.   r/   DummyModulei  s   r	  )Zcpp_wrapperZcodegen_with_cpp_wrapperZcodegenr   )r  r	  coderq   r  r.   r/   r  h  s
    
z+get_code.<locals>.patched_compile_to_moduler  Fr  )	r   r  r%   ry  r   r  r^   r  r  )rL   r9   r   r  r  rq   r.   r  r/   get_codeb  s      
r  c                 O  sD   t | f||}dt|  kr(dks<n tdt| |d S Nr$      z%expected one or two code outputs got r   )r  rk   r}   )rL   r9   r   r  r.   r.   r/   get_triton_code  s     
r  c                 O  sH   t | f||\}}dt|  kr,dks@n tdt| |d S r  )r  rk   r}   )rL   r9   r   rq   r  r.   r.   r/   run_and_get_triton_code  s     
r  c              	   c  sD   ddl m} |j|  }zt|||j| < dV  W 5 ||j| < X dS )z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)Ztorch._inductorr  Z	loweringsr   partial)Zaten_opZoverride_fnr  orig_fnr.   r.   r/   override_lowering  s    

r  c                   s4   ddl m} |j  fdd}tjj|d|S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerc                   s&   | |  | |}r"| | |S r<   r.   )Z	schedulerr-  outr  post_fnpre_fnr.   r/   r    s
    


z(add_scheduler_init_hook.<locals>.wrapperr  )torch._inductor.schedulerr  r  unittestr   ry  r  )r  r  r  r  r.   r  r/   add_scheduler_init_hook  s    r  c                 C  s    t jrt|  n
t|  dS )z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)r%   Zdeveloper_warningsrg   r  info)msgr.   r.   r/   developer_warning  s    r  c                  C  s   z^t jd} | d tt jk r\tt j| d  dkr\t j| d  d dkr\t j| d  W S W n tk
rr   Y nX t jD ]"}|drz|tdd   S qzdS )a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr$   r   -z--only=N)r  argvr   rk   
ValueError
startswith)rI  r   r.   r.   r/   get_benchmark_name  s    

r$  c                 C  s   t dd | D S )Nc                 s  s   | ]}|d kV  qdS )r$   Nr.   r   r.   r.   r/   r]     s     zis_ones.<locals>.<genexpr>r6   r+  r.   r.   r/   is_ones  s    r'  c                 C  s   t dd | D S )Nc                 s  s   | ]}|d kV  qdS )r   Nr.   r   r.   r.   r/   r]     s     zis_zeros.<locals>.<genexpr>r%  r&  r.   r.   r/   is_zeros  s    r(  c                 C  s   t dd | D S )Nc                 s  s*   | ]"}t |tjr|jtd kV  qdS )r   N)r3   r^   r   rP   )rX   itemr.   r.   r/   r]     s   z is_cpu_device.<locals>.<genexpr>r%  )inputsr.   r.   r/   is_cpu_device  s    r+  ztorch.dtype)r;  rM   c                 C  s*   t | tjstd| jr tjS tjS d S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)r3   r4   r   r}   rG   r^   r   r   )r;  r.   r.   r/   get_sympy_Expr_dtype  s     r,  c              	   o  s0   | r&t jj||}|V  W 5 Q R X nd V  d S r<   )r^   re   rf   )Zshould_profiler9   r   rr   r.   r.   r/   maybe_profile  s    r-  c                  C  s   t jj} | dk rt } | S r   )r%   r  threadsr^   Zget_num_threads)r.  r.   r.   r/   r  
  s    r  c                 C  s   ddl m}m} | tjtjtjfks(tt	|j
drddlm} | }| tjtjfkrf|| |S tjjjjr~|tj|S |tj|S n8| tjtjfkr|| S tjjjjr|tjS |tjS d S )Nr   )get_max_simd_tflopsget_max_tensorcore_tflopsZ
clock_rate)max_clock_rate)triton.testingr/  r0  r^   r   r   r   r}   inspect	signature
parametersra  Ztorch._utils_internalr1  backendsrN   matmulZ
allow_tf32)rO   r/  r0  r1  Zsm_clockr.   r.   r/   get_device_tflops  s    

r8  c                  C  s   ddl m}  |  S )Nr   get_dram_gbps)r2  r:  r9  r.   r.   r/   get_gpu_dram_gbps-  s    r;  c                  C  s"   ddl m}  | jjdddS )Nr   driverZmax_shared_mem)Ztriton.runtimer=  Zactiveutilsr  ra  r<  r.   r.   r/   get_gpu_shared_memory4  s    r?  c                 C  s
   |  dS )NZwelford)r#  reduction_typer.   r.   r/   is_welford_reduction:  s    rB  c                 C  s   t | rdS dS )N   r$   )rB  r@  r.   r.   r/   reduction_num_outputs>  s    rD  c                   C  s   t  dkS )NLinux)platformsystemr.   r.   r.   r/   is_linuxB  s    rH  zIterable[Any]itrc                 C  s   t dd | D S )Nc                 s  s"   | ]}t |tjo|j V  qd S r<   )r3   r4   r   Z	is_numberr   r.   r.   r/   r]   G  s     z#has_free_symbols.<locals>.<genexpr>r\  rI  r.   r.   r/   r  F  s    r  c                  G  s   ddl m} | D ]}t||jrPt|j sHt|jdrt|j r dS qt||j	|j
|jfrt|drzt|ds~tt| st| r dS qt||jsqqtdt| qdS )Nr$   r  r  Tget_sizezunexpected type for is_dynamic F)r  r  r3   r4  r  r5  rK  rx   r  r6  r  ZComputedBufferr}   r7  	TypeErrorr   )r9   r  tr.   r.   r/   
is_dynamicJ  s"    
rN  c                   @  s   e Zd ZdZdZdS )PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)rB   rC   rD   rP  rQ  r.   r.   r.   r/   rO  `  s   rO  c              	   C  s   ddl m} tjdddd}t }t }t|t|dj|  t	d|j
 |d	 t	|j
|d	 t }| |j
 t | }	||j
 |j
  |  t	d
|j
 |d	 t	|j
|d	 | | k}
td||j|
|	 W 5 Q R X d S )Nr$   )stable_topological_sortwzutf-8F)modeencodingdelete)r   	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)Zpattern_matcherrR  rw  NamedTemporaryFileior   r   r   	propagater   r   r   nowZlintZ	recompiler  rg   r  rV   )rD  r   Zinpr  rR  ru  Z	before_ioZafter_io
start_timeZtime_elapsedrM  r.   r.   r/   pass_execution_and_savej  s8    


r^  c                 C  s   ddl m} t| |jkS Nr$   r  )r  r  r   Z_CollectiveKernelr   r  r.   r.   r/   is_collective  s    ra  c                 C  s   ddl m} t| |jkS r_  )r  r  r   Z_WaitKernelr`  r.   r.   r/   is_wait  s    rb  )dynamo_gm_num_inputsaot_fw_gm_num_inputsc                 C  s   t jjjrdnd}||  | S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r  r   )r^   Z
_functorchr%   Zfunctionalize_rng_ops)rc  rd  Znum_rng_seed_offset_inputsr.   r.   r/   num_fw_fixed_arguments  s    re  )fx_gc                 C  sb   dd }d}g }| j jD ](}|jdkr||r8|| |d7 }q|ttt|ksZtt|S )z>
    Infers which inputs are static for a backwards graph
    c                 S  s   d| j kod| j kod| j kS )NZtangentsZbwd_seedZbwd_base_offsetrQ  )r   r.   r.   r/   is_saved_tensor  s
    
z'count_tangents.<locals>.is_saved_tensorr   r   r$   )r   r-  r   r   r   rc   rk   r}   )rf  rg  	arg_countZstatic_arg_idxsr   r.   r.   r/   count_tangents  s    


ri  c                   @  s*   e Zd ZU ded< dd Zedd ZdS )	BoxedBoolru   r@   c                 C  s   | j S r<   )r@   r  r.   r.   r/   r    s    zBoxedBool.__bool__c                 C  s   t | trd| _| S dS NF)r3   rj  r@   rj  r.   r.   r/   disable  s    
zBoxedBool.disableN)rB   rC   rD   r  r  r
  rl  r.   r.   r.   r/   rj    s   
rj  c              	   #  sH   ddl m} |j fdd}tjj|d| d V  W 5 Q R X d S )Nr$   )WrapperCodeGenc                   s      | | |||f||S r<   )r   )r  rV   Zkernel_coder,  r9   r   kernel_listZorig_define_kernelr.   r/   new_define_kernel  s    
z2collect_defined_kernels.<locals>.new_define_kerneldefine_kernel)Zcodegen.wrapperrm  rq  r  r   ry  r  )ro  rm  rp  r.   rn  r/   collect_defined_kernels  s
    rr  rQ  c                 C  s   | d S )NZ__original__r.   rQ  r.   r.   r/    get_cloned_parameter_buffer_name  s    rs  c                 C  s   | dkS )N)rN   Zxpur.   r   r.   r.   r/   is_gpu  s    rt  c                 C  s   t | tstt| S r<   )r3   r|   r}   rt  r   r.   r.   r/   device_need_guard  s    ru  c                 C  s   | t jt jt jhkS r<   )r^   r   ru   r   r  r.   r.   r/   ,needs_fallback_due_to_atomic_add_limitations  s    rv  )op_overloadc                 C  s   | j tjjjkrdnd}|d |hkp|r8t|r8t|p| j tjjjkrv|dkrv|rv|dkrvtj	j
rvtj	jpt dkp||kr|tjtjhkpt S )Nr1  rn   r   r$   )Zoverloadpacketr^   rz   ZatenZscatter_rt  rv  Zscatter_reduce_r%   r  Zfallback_scatter_reduce_sumZdynamic_threadsr  ru   r   r_  )rw  rA  Z
self_dtypeZ	src_dtypeZsrc_device_typeZsrc_is_tensorZ	reduce_tyr.   r.   r/   use_scatter_fallback  s.    	rx  c                 C  s  ddl m}m} ddlm} tdt|  d t| D ]\}}td|dd ||krdtd	 q8||krvtd
 q8t||r|	 }t|rdnd d |r|j
dk	sttd|j
jj  td |jjD ]}t| qtd |jjD ]}t| qq8tdt| q8dS )z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr%  3:zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )Ztorch._inductor.codegen.simdry  rz  r  r{  r   rk   rm   r3   Zis_reductionr   r}   r5  Zreduction_hintZread_writesZreadsZwritesrl   r   )r  ry  rz  r{  rI  r   Zis_reddepr.   r.   r/   dump_node_schedule  s,    


r  ztorch.Tensorr   c                 C  s   |   t| j t dkS )Nr   )Zstorage_offsetr  rO   GPU_ALIGN_BYTESr  r.   r.   r/   tensor_is_aligned&  s
    r  Zexample_inputc                 C  s   t | jjsdS tjpt| S rk  )rt  rP   r   r%   Zassume_aligned_inputsr  r  r.   r.   r/   should_assume_input_aligned2  s    r  c                  C  s4   t jj } | st S | jj}|s,t S | S r<   )	r^   Z_guardsZTracingContextZtry_getr  nullcontextrW  r   Zsuppress_guards)Ztracing_contextr   r.   r.   r/   #maybe_get_suppress_shape_guards_ctx;  s    r  	namespacerP   c                 C  s   t t d |  | S )NZ
aoti_eager)r   r&   r  r.   r.   r/   aoti_eager_cache_dirL  s    r  )op_func_name_with_overloadc                 C  sB   ddl m} ddlm}m} |  d}| }|tj|||dS )Nr   )FileLock)get_lock_dirLOCK_TIMEOUTrq  )timeout)filelockr  Ztorch._inductor.codecacher  r  rr  rs  r   )r  r  r  r  Zop_conf_lock_fileZlock_dirr.   r.   r/   aoti_eager_op_conf_lockP  s
    
r  )nsr  rT   c           
      C  s
  t | |}|| d }| s$g S t| t|}t|}|D ]}||d  }| |d< | sg   W  5 Q R  W  5 Q R  S |d D ]D}	|	d rtd|	d dkrd|	d	< tt	|	d
 
dd |	d
< qqF|W  5 Q R  W  5 Q R  S Q R X W 5 Q R X d S )N.jsonkernel_path	meta_inforN  !Only support static shape for nowrT   r   rR   device_indexrO   r   )r  r|  r  r  jsonloadas_posixr}   ry   r^   r   )
r  r  rT   Zdevice_kernel_cacheop_confru  	json_datar)  Zkernel_lib_abs_pathr,  r.   r.   r/   load_aoti_eager_cache[  s,    



  r  )dynamic_shapesoptionsremove_runtime_assertionsdisable_constraint_solverz
Tuple[Any]zDict[str, Any]zOptional[Dict[str, Any]])r  r  rT   dynamicru  r9   r   r  r  r  r  c                  s0  |rt dttjttjttji}t|  tj	||}t
 fdd|D sXtdt| |}| sv|jdd |d }| s|  tjtjd|  iz z<tjj||||||	|
d	d
}g }|D ]}i }||d< t|tjrR|jj |d< t|grd|d< n|jj|d< |j |d< t|  |d< t|! |d< nXt| sbt ||d< |dkrxdnd|d< |t|  |d< g |d< g |d< ||d< |"| qi }||d< t#|$| |d< g }d}|| d }| rdnd}t%| t&||}zt'(|}W n( t)k
rL } zg }W 5 d}~X Y nX t|ts^t |D ].}t|tsvt |d |krbd	} qqbW 5 Q R X |r|"| t&|d}t'j*||dd W 5 Q R X W 5 Q R X |W W  5 Q R  S  t)k
r  } zW Y W 5 Q R  dS d}~X Y nX W 5 Q R X dS )zO
    Compile the given function with persistent cache for AOTI eager mode.
    r  c                 3  s   | ]}t | tjfV  qd S r<   r   )rX   inputZsupported_scalar_typesr.   r/   r]     s   z5aoti_compile_with_persistent_cache.<locals>.<genexpr>z-Only support tensor, int, float, bool for nowT)parentslibrp  F)r  r  r  r  Zsame_signaturerN  rT   rR   r  rO   sizesstridesr   r   Zscalar_valuer  r  r  rrS  Nr  )r  r  )+r}   r=   r^   r   rK   ru   r   r*  pytreeZarg_tree_leavesr6   r  r  r|  mkdirr   ry  rz  rr  r{  absoluter  Z_exportZaot_compiler3   r   rP   r   r+  r   rO   r   r   r[  r   r   relative_tor  r  r  r  r  dump)r  r  rT   r  ru  r9   r   r  r  r  r  Ztype_to_torch_dtypeZflattened_inputsZpersistent_cacheZpersistent_cache_libZkernel_lib_pathZkernel_metadata_itemsr  r,  Zkernel_meta_infor  Zupdate_jsonr  rT  Zop_conf_filer~   r)  r.   r  r/   "aoti_compile_with_persistent_cachex  s    



$r  c              	   O  s   t jjtdd tj  dd l}dd l	}|
 }||}ddlm} || |j}||j | ||}	| }
|| || W 5 Q R X |	|
fS )Nrh   Tr   )output_code_log)r  r   ry  r  r%   r^   r  r  rZ  loggingr   StreamHandlerZtorch._inductor.graphr  
addHandlerlevelsetLevelDEBUGr  removeHandler)rL   r9   r   rZ  r  Zlog_capture_stringchr  Z
prev_levelr   rP  r.   r.   r/   run_and_get_cpp_code  s    




r  )rI   rJ   )rN   )r$   rN   )r.   r   r   r   rN   )N)N)N)
__future__r   r'  r  dataclassesenumr   r3  rZ  r=  r  r  r  r   rr  rF  r~  r  rw  r  r   r  r   r   pathlibr   typingr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r4   r^   Ztorch._exportZtorch.utils._pytreer>  Z_pytreer  Ztorch._dynamo.device_interfacer   Ztorch._dynamo.utilsr   Ztorch.autogradr   Ztorch.autograd.profiler_utilr   Ztorch.fx.passes.shape_propr   Ztorch.utils._sympy.functionsr   r   r   r   Ztorch.utils._sympy.symbolr    r!   Ztorch.utils._sympy.value_rangesr"   r#   r  r%   Zruntime.runtime_utilsr&   r'   r   	getLoggerrB   rg   r(   r   Z	VarRangesr  r,   r}   r0   r8   Functionr:   rt   	lru_cacher   r   r   r   r   r   r   r   r   r   r   r_   r   r   r   r   r   r   r   r   r  r  r  r  r#  r.  r3  r?  rC  rG  rM  rO  rR  rX  rZ  r]  rb  rd  rh  ri  r  rn  ro  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r$  r'  r(  r+  r,  r-  r  r8  r;  r?  rB  rD  rH  r  rN  EnumrO  r^  ra  rb  re  ri  	dataclassrj  rr  rs  rt  ru  rv  rx  r  r  r  r  r  r  r  r  r  r.   r.   r.   r/   <module>   sh   @
$R#	          	
!*$
 
 		 %		





! 	&6t