U
    yh                    @   sx  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlmZmZ d dlmZ d dlmZm Z m!Z! d dl"m#Z# d d	l$m%Z% d d
l&m'Z'm(Z( ddl)m*Z*m+Z+m,Z, ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@ ddlAmBZBmCZC erd dlDZDddlEmFZF e@ jGZHeejIejJeKf ZLe,jMeLdddZNejOeKdddZPejOeKdddZQejRjSeKd d!d"ZTeeKeUf ZVeeeeUejf d#f eeVgeeUd#f f f ZWdCeKed$ eeW ed% eeKeKf d&d'd(ZXejYG d)d* d*ZZd+Z[G d,d- d-Z\G d.d/ d/Z]ejYG d0d1 d1e]Z^ejYG d2d3 d3e]Z_ejYG d4d5 d5e]Z`G d6d7 d7e]ZaejYG d8d9 d9e]ZbejYG d:d; d;ebZcejYG d<d= d=ebZdejYG d>d? d?ebZeG d@dA dAebZfeKZgG dBd% d%e=ZhdS )D    N)count)
AnyCallableDictIteratorListOptionalSetTupleTYPE_CHECKINGUnion)Expr)countersdynamo_timed)MultiKernelState)ConvertIntKeyDivideByKeySymTypes)_get_qualified_name)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)ReinterpretView)triton_heuristics)DeviceProperties)cache_on_selfget_benchmark_nameLineContextsympy_product	sympy_str)V   )maybe_hipify_code_wrapper)CodeGenDeferredLineIndentedBufferPythonPrinter)	config_ofsignature_to_meta)GraphLowering)nodereturnc                 C   s&   |   |  ttjj| j fS N)	
get_device	get_dtyper#   r$   graphsizevarssimplifylayoutZstorage_size)r.    r7   Q/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyA   s    r9   )argr/   c           
      C   s   ddl m}m} t| j}|dkrL| jd k	r@| jjr@d| dS d| dS ||kr`|| }|S | D ]d\}}t	|d |}t
|dkrh|d }||kstd	| d
| || }	| d|	 d  S qhtd| d S )Nr%   )CONTAINER_PYTHON_TO_CPPPYTHON_TO_CPPTensorzat::&z const&z\[([a-zA-Z_]+)]r   zunsupported z type in convert_arg_type: <>zunsupport python_type: )cppr;   r<   repr	real_type
alias_infoZis_writeitemsrefindalllenAssertionError)
r:   r;   r<   python_typecpp_typeZpy_containerZcpp_containerZcontainer_matchZcontained_typeZcpp_contained_typer7   r7   r8   convert_arg_typeL   s(    
rL   )retr/   c                 C   sT   t | j}ddd}||d }|d k	s6td| |dkrP| jd k	rP|d7 }|S )Nz
at::Tensorzstd::vector<at::Tensor>)r=   zList[Tensor]zNYI return type: r=   r>   )rB   rC   getrI   rD   )rM   rJ   Zpython_to_cpprK   r7   r7   r8   convert_return_typek   s    
rO   )kernelr/   c                 C   s   | j j}| j j}t|}|dks(td|dkr>t|d }n(|dkrfddd |D }d| d}d	d |D }| d
d| dS )Nr   z#must have at least one return valuer%   , c                 S   s   g | ]}t |qS r7   )rO   ).0rr7   r7   r8   
<listcomp>   s     z%get_cpp_op_schema.<locals>.<listcomp>zstd::tuple<r@   c                 S   s    g | ]}t | d |j qS ) )rL   name)rR   r:   r7   r7   r8   rT      s     ())Z_schema	argumentsreturnsrH   rI   rO   join)rP   argsrZ   Znum_returnsZcpp_return_valueZtuple_returnsZcpp_arg_typer7   r7   r8   get_cpp_op_schema}   s    r]   .ztriton.ConfigWrapperCodeGen)rV   configsgridswrapperr/   c              	      s0  t  }tttjf tjddd td fdd}d|  }|d| d	 |  t|d
kr||d }|d|  nt|d
kst	t|t|kst	t
 }t||D ]^\}}	dd |	j D }
d|
}
||}d|
 d| }||krq|| || qW 5 Q R X || fS )N)itemr/   c                 S   s   t | tjr| S t| S r0   )
isinstancesympyr   Integer)rb   r7   r7   r8   _convert_to_sympy_expr   s    z@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_expr)gridc                    s4   d kst | r| S t fdd| D }|S )Nc                 3   s   | ]} |V  qd S r0   r7   )rR   g)rf   r7   r8   	<genexpr>   s     zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>)callabletuplecodegen_shape_tuple)rg   Z
sympy_gridrf   ra   r7   r8   determine_grid   s    z8user_defined_kernel_grid_fn_code.<locals>.determine_gridZgrid_wrapper_for_zdef z(meta):r%   r   zreturn c                 S   s    g | ]\}}d | d| qS )zmeta['z'] == r7   )rR   rV   valr7   r7   r8   rT      s     z4user_defined_kernel_grid_fn_code.<locals>.<listcomp>z and if z	: return )r)   r   intrd   r   
TritonGrid	writelineindentrH   rI   setzipkwargsrE   r[   addgetvalue)rV   r_   r`   ra   outputrn   fn_namerg   seencZguardsZ	statementr7   rm   r8    user_defined_kernel_grid_fn_code   s,    




r~   c                   @   s(   e Zd ZU eed< ejed< dd ZdS )SymbolicCallArginnerZ
inner_exprc                 C   s
   t | jS r0   )strr   selfr7   r7   r8   __str__   s    zSymbolicCallArg.__str__N)__name__
__module____qualname__r   __annotations__rd   r   r   r7   r7   r7   r8   r      s   

r   i  c                       sN   e Zd Z fddZeedddZeddddZedd	d
ddZ  Z	S )MemoryPlanningStatec                    s    t    tt| _d| _d S Nr   )super__init__collectionsdefaultdictlist
reuse_pooltotal_allocated_buffer_sizer   	__class__r7   r8   r      s    
zMemoryPlanningState.__init__)keyr/   c                 C   s   t | j|d S r0   )boolr   rN   )r   r   r7   r7   r8   __contains__   s    z MemoryPlanningState.__contains__FreeIfNotReusedLinec                 C   s   | j |  }|jrt|S r0   )r   pop	is_reusedrI   r   r   rb   r7   r7   r8   r      s    
zMemoryPlanningState.popN)r   rb   r/   c                 C   s   |j r
t| j| | d S r0   )r   rI   r   appendr   r7   r7   r8   push   s    
zMemoryPlanningState.push)
r   r   r   r   ReuseKeyr   r   r   r   __classcell__r7   r7   r   r8   r      s   r   c                   @   s   e Zd ZdS )WrapperLineNr   r   r   r7   r7   r7   r8   r      s   r   c                   @   s.   e Zd ZU ded< ded< eddddZdS )	EnterSubgraphLiner^   ra   r-   r3   Ncoder/   c                 C   s   | j | j |  d S r0   )ra   push_codegened_graphr3   	do_indentr   r   r7   r7   r8   codegen   s    zEnterSubgraphLine.codegenr   r   r   r   r)   r   r7   r7   r7   r8   r      s   
r   c                   @   s&   e Zd ZU ded< eddddZdS )ExitSubgraphLiner^   ra   Nr   c                 C   s   | j   |  d S r0   )ra   pop_codegened_graphdo_unindentr   r7   r7   r8   r      s    
zExitSubgraphLine.codegenr   r7   r7   r7   r8   r      s   
r   c                   @   s2   e Zd ZU eed< ee ed< eddddZdS )EnterDeviceContextManagerLine
device_idxlast_seen_device_guard_indexNr   c                 C   s   t jjr|d t jjr\| jd krFtjr6|d qZ|td q| j| j	kst
dq| jd kr|tjr~d| j	 dntd| j	 d q|d| j	 d n<|d	t jj| j	 d
 |  |t jj| j	 d S )N
z<AOTICudaStreamGuard stream_guard(stream, this->device_idx_);zcat::cuda::CUDAStreamGuard stream_guard(at::cuda::getStreamFromExternal(stream, this->device_idx_));z4AOTInductor only supports running on one CUDA devicezAOTICudaGuard device_guard(z);z!at::cuda::CUDAGuard device_guard(zdevice_guard.set_index(zwith :)r$   r3   cpp_wrapperrs   aot_moder   r   Zabi_compatibler&   r   rI   
device_opsZdevice_guardr   Z
set_devicer   r7   r7   r8   r     s:    



z%EnterDeviceContextManagerLine.codegen)r   r   r   rq   r   r   r)   r   r7   r7   r7   r8   r      s   
r   c                   @   s   e Zd ZeddddZdS )ExitDeviceContextManagerLineNr   c                 C   s   t jjs|  d S r0   )r$   r3   r   r   r   r7   r7   r8   r   -  s    z$ExitDeviceContextManagerLine.codegen)r   r   r   r)   r   r7   r7   r7   r8   r   ,  s   r   c                   @   sD   e Zd ZU ded< ed dddZedddd	Zed
ddZ	dS )MemoryPlanningLiner^   ra   stater/   c                 C   s   | S )zFirst pass to find reuser7   r   r   r7   r7   r8   plan6  s    zMemoryPlanningLine.planNr   c                 C   s   dS )zSecond pass to output codeNr7   r   r7   r7   r8   r   :  s    zMemoryPlanningLine.codegenr/   c                 C   sr   g }t | D ]F}|jdkrqt| |j}||j d|jtjkrJ| n|  qt| j	 dd
| dS )zF
        Emits a string representation that fits on one line.
        ra   =rW   rQ   rX   )dataclassesfieldsrV   getattrr   typer   Bufferget_namer   r[   )r   r\   fieldro   r7   r7   r8   r   >  s    
"zMemoryPlanningLine.__str__)
r   r   r   r   r   r   r)   r   r   r   r7   r7   r7   r8   r   2  s   
r   c                   @   s8   e Zd ZU ejed< eedddZe	ddddZ
dS )	AllocateLiner.   r   c                 C   s   | j  tjjkrt| jS t| j }tj	rV||krV|
|}d|_t| j|j | j S | j  jdkr| j| j }|d k	r| jtttj|d7  _| S )NTcpur%   )r.   r   r$   r3   removed_buffersNullLinera   r9   r   allow_buffer_reuser   r   	ReuseLiner1   r   static_shape_for_buffer_or_noner   rq   	functoolsreduceoperatormul)r   r   r   Z	free_lineZstatic_shaper7   r7   r8   r   Q  s    


zAllocateLine.planNr   c                 C   s2   | j  tjjkst| j| j }|| d S r0   )	r.   r   r$   r3   r   rI   ra   make_buffer_allocationrs   )r   r   liner7   r7   r8   r   e  s    zAllocateLine.codegen)r   r   r   r   r   r   r   r   r   r)   r   r7   r7   r7   r8   r   M  s   

r   c                   @   sD   e Zd ZU ejed< dZeed< ee	dddZ
eddd	d
ZdS )r   r.   Fr   r   c                 C   sl   t | j dkr| S t| jjtjr*| S | jr4t| j	 t
jjkrPt| jS tjrh|t| j|  | S r   )rH   r.   Zget_inputs_that_alias_outputrc   r6   r   ZMultiOutputLayoutr   rI   r   r$   r3   r   r   ra   r   r   r   r9   r   r7   r7   r8   r   p  s    

zFreeIfNotReusedLine.planNr   c                 C   s4   | j  tjjkst| js0|| j	| j  d S r0   )
r.   r   r$   r3   r   rI   r   rs   ra   make_buffer_freer   r7   r7   r8   r   |  s    zFreeIfNotReusedLine.codegen)r   r   r   r   r   r   r   r   r   r   r   r)   r   r7   r7   r7   r8   r   k  s   

r   c                   @   sN   e Zd ZU ejed< ejed< dZeed< ee	dddZ
edd	d
dZdS )r   r.   	reused_asT
delete_oldr   c                 C   sL   | j  tjjkr2| j tjjks(tt| jS | j tjjksHt| S r0   )	r.   r   r$   r3   r   r   rI   r   ra   r   r7   r7   r8   r     s
    
zReuseLine.planNr   c                 C   sL   | j  tjjkst| j tjjks,t|| j	| j | j| j
 d S r0   )r.   r   r$   r3   r   rI   r   rs   ra   make_buffer_reuser   r   r7   r7   r8   r     s
    zReuseLine.codegen)r   r   r   r   r   r   r   r   r   r   r   r)   r   r7   r7   r7   r8   r     s
   


r   c                   @   s   e Zd ZdS )r   Nr   r7   r7   r7   r8   r     s   r   c                
       sb  e Zd ZdZ fddZeeddddZddd	d
ZeddddZ	e
edddZeee dddZddddZddddZddddZddddZdeedddZdd Zdd  Zd!d" Zedd#d$Zeddd%d&Zddd'd(Zee dd)d*d+Zedd,d-d.Zedd,d/d0Zd1d2 Zd3d4 Zeee e ee d5d6d7Z!dd8d9Z"d:d; Z#d<d= Z$deeeee eeee e%j&j' d?d@dAZ(e)dBdC Z*dDdE Z+dFdG Z,edHdIdJZ-edHdKdLZ.ee/ee0j1f dMdNdOZ2e3j4dPdQdRZ5dSdT Z6dUdVe7e8edWdXdYZ9e7edZd[d\Z:eeeed]d^d_Z;e<e7d`f edadbdcZ=e<e7d`f edadddeZ>eddfdgZ?eddhdiZ@djdk ZAdldm ZBdndo ZCdpdq ZDdrds ZEdeee e dtdudvZFdwdx ZGedydzd{ZHd|d} ZId~d ZJdd ZKdd ZLdd ZMdd ZNdd ZOeeeP dddZQdedddZRdd ZSdd ZTdd ZUdddZVdd ZWdd ZXdddZYdd ZZee dddZ[eeedddZ\e8dddZ]dd Z^dd Z_dd Z`dddZadd Zbdd Zcdd Zddd Zedd Zfdd Zgdd Zhdd ZiejddÄ Zkejddń ZlejddǄ ZmejddɄ Znejdd˄ Zo  ZpS )r^   zB
    Generate outer wrapper in Python that calls the kernels.
    c                    sV  t    t  _t  _t  _t  _t  _i  _	t
  _g  _d _d _d _d _d _d _d _d _d _d _d  _d _t _i  _t
  _d  _i  _t
  _g  _  !   "  t#j$j%st#j$j&' D ]\}} (|| qt
  _)t
  _*t+  _,t-.d  j/ _/t-.d t0d d	 fd
d}| _1i  _2t3  _4d S )N []#Nonezsize()zstride()T)r   r/   c                    s    j |  d S r0   headerrs   )r   r   r7   r8   add_import_once  s    z0WrapperCodeGen.__init__.<locals>.add_import_once)5r   r   r   _names_iterr)   r   prefixsuffixwrapper_callZsrc_to_kernelru   kernel_numel_exprlinesdeclaredeclare_maybe_referenceendingopen_bracketclosed_bracketcomment	namespaceZnone_strsizestrider   supports_intermediate_hookspexprexpr_printeruser_defined_kernel_cacheunbacked_symbol_declsallow_stack_allocationstack_allocated_bufferscomputed_sizescodegened_graph_stackwrite_headerwrite_prefixr$   r3   r   Zconstant_reprsrE   write_constant	allocatedfreeddictreusesr   	lru_cachewrite_get_raw_streamr   r   _metasr   Zmulti_kernel_state)r   rV   hashedr   r   r   r8   r     sX    
zWrapperCodeGen.__init__N)rV   r   r/   c                 C   s   | j | d|  d S )Nz = None  # r   )r   rV   r   r7   r7   r8   r     s    zWrapperCodeGen.write_constantr   c                 C   sN   t jj }d}|d k	r.|jd k	r.d|j }| jd| dtj d d S )Nr   z
# AOT ID: z
                a  
                from ctypes import c_void_p, c_long
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align

                from torch import device, empty_strided
                from a    import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                from torch._inductor.codegen.multi_kernel import MultiKernelCall

                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()

            )	torchZ_guardsZTracingContextZtry_getZaot_graph_namer   splicer   r   )r   contextZaot_config_commentr7   r7   r8   r     s    zWrapperCodeGen.write_headerc                 C   s$   | j dtjtjjd d S )Nz
            import triton
            import triton.language as tl
            from {} import grid, split_scan_grid, start_graph, end_graph
            {}
            Zget_raw_stream)	r   r   formatr   r   r$   r3   r   Zimport_get_raw_stream_asr   r7   r7   r8   write_triton_header_once
  s    z'WrapperCodeGen.write_triton_header_once)metar/   c                 C   sL   t |}|| jkrBdt| j }|| j|< | j| d|  | j| S )Nr   = )rB   r   rH   r   rs   )r   r  varr7   r7   r8   add_meta_once  s    

zWrapperCodeGen.add_meta_oncec                    s    fddt jjD S )Nc                    s   g | ]}|  jqS r7   )codegen_referencer   rR   xr   r7   r8   rT   "  s     z2WrapperCodeGen.get_output_refs.<locals>.<listcomp>)r$   r3   graph_outputsr   r7   r   r8   get_output_refs   s    zWrapperCodeGen.get_output_refsc                 C   s   d S r0   r7   r   r7   r7   r8   mark_output_type$  s    zWrapperCodeGen.mark_output_typec              
   C   sv   t jj D ]d\}}t|tjr"qt| dkr4q| 	| }| 	|
 }| jd| d| d| d qd S )Nr   zassert_size_stride(rQ   rX   )r$   r3   graph_inputsrE   rc   rd   r   r"   get_sizerl   
get_strider   rs   )r   rV   bufr   r   r7   r7   r8   codegen_input_size_asserts'  s    z)WrapperCodeGen.codegen_input_size_assertsc                 C   sd   | j d tjj D ]F\}}t|tjr.qd| d}| j | d| d}| j | qd S )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	r   rs   r$   r3   r  rE   rc   rd   r   )r   rV   r  r   r7   r7   r8   codegen_input_nan_asserts3  s    z(WrapperCodeGen.codegen_input_nan_assertsc              	   C   s   | j d | j   tjjr4| j tjj	
  tjjrdtjj}ttjjdkrb|d7 }| j | d | j d | | j tjj tjr|   tjr|   W 5 Q R X d S )Nzs

            async_compile.wait(globals())
            del async_compile

            def call(args):
            rQ   r%   ,z = argszargs.clear())r   r   rt   r   tritondebug_sync_graphrs   r$   r3   r   synchronizer  r[   Zgraph_input_namesrH   codegen_inputsZsize_assertsr  Znan_assertsr  )r   lhsr7   r7   r8   r   >  s"    	zWrapperCodeGen.write_prefix)r   r/   c                 C   s,   |    d| }| | d| d |S )Nstreamz = get_raw_stream(rX   )r  rs   )r   r   r3   rV   r7   r7   r8   r   [  s    
z#WrapperCodeGen.write_get_raw_streamc                 C   s
   | j d S )N)r   r   r7   r7   r8   get_codegened_grapha  s    z"WrapperCodeGen.get_codegened_graphc                 C   s   | j | d S r0   )r   r   )r   r3   r7   r7   r8   r   d  s    z#WrapperCodeGen.push_codegened_graphc                 C   s
   | j  S r0   )r   r   r   r7   r7   r8   r   g  s    z"WrapperCodeGen.pop_codegened_graphc                 C   s   t | j S r0   )nextr   r   r7   r7   r8   next_kernel_suffixj  s    z!WrapperCodeGen.next_kernel_suffixc                 C   s   |  t|| j || _d S r0   )rs   r   r   )r   r   r7   r7   r8   codegen_device_guard_enterm  s    
z)WrapperCodeGen.codegen_device_guard_enterc                 C   s   |  t  d S r0   )rs   r   r   r7   r7   r8   codegen_device_guard_exits  s    z(WrapperCodeGen.codegen_device_guard_exit)output_refsr/   c                 C   s0   |r | j dd| d  n| j d d S )Nzreturn (rQ   , )z	return ())r   rs   r[   )r   r!  r7   r7   r8   generate_returnv  s    zWrapperCodeGen.generate_return)resultr/   c                 C   s   d S r0   r7   r   r$  r7   r7   r8   generate_before_suffix|  s    z%WrapperCodeGen.generate_before_suffixc                 C   s   d S r0   r7   r%  r7   r7   r8   generate_end  s    zWrapperCodeGen.generate_endc                 C   s   |  || d S r0   )generate_extern_kernel_alloc)r   Zfallback_kernelr\   r7   r7   r8   generate_fallback_kernel  s    z'WrapperCodeGen.generate_fallback_kernelc              
   C   s   |  }| }| }| j}tjr6d|kr6d| }| | j | d| dd| d|  | j	rtj
r|d k	rtd d  d	7  < | d
|jd| d d S )NZview_as_complexz.clone()r  rW   rQ   rX   ZinductorZintermediate_hooksr%   zrun_intermediate_hooks()r   Zget_origin_nodeZget_kernel_namer   r   memory_planningrs   r   r[   r   Zgenerate_intermediate_hooksr   rV   )r   Zextern_kernelr\   Zoutput_nameZorigin_nodekernel_namer   r7   r7   r8   r(    s&    
$z+WrapperCodeGen.generate_extern_kernel_alloc)rP   outout_viewr\   c                 C   s8   | d|r|n|  | | dd| d d S )Nzout=rW   rQ   rX   )r   rs   r[   )r   rP   r,  r-  r\   r7   r7   r8   generate_extern_kernel_out  s    z)WrapperCodeGen.generate_extern_kernel_outc              
   C   sv   t |||| d\}}|dD ]}| | qtjj }	| |	jtj}
| | dd	| d| d|
 d d S )N)ra   r   .run(rQ   , grid=	, stream=rX   )
r~   splitrs   r$   r3   	schedulerget_current_device_or_throwr   indexr[   )r   r+  rg   r_   r\   triton_meta	arg_typesr   r   current_devicestream_namer7   r7   r8   #generate_user_defined_triton_kernel  s       
 z2WrapperCodeGen.generate_user_defined_triton_kernelc           	      C   sf   | dd tt| }|dr:|d dg| 7 }n|rP|dt| 7 }|d7 }| | d S )NrW   r  zaten.scatter_reducerQ   r   z	, reduce=rX   )r[   mapr   
startswithrB   rs   )	r   rz   inputscpp_kernel_namepython_kernel_nameZsrc_is_tensorr   rw   r   r7   r7   r8   generate_scatter_fallback  s    

z(WrapperCodeGen.generate_scatter_fallbackc                 C   s<   | j  d| | j }||||g}| | || d S )NrQ   )r   r[   r   rs   wrap_kernel_call)r   rP   r
  indicesvalues
accumulateZindices_strr\   r7   r7   r8   generate_index_put_fallback  s    z*WrapperCodeGen.generate_index_put_fallbackr   )buf_namer?  r>  codegen_argscpp_op_schemacpp_kernel_keycpp_kernel_overload_nameop_overloadc              	   C   s&   |  | d| dd| d d S )Nr  rW   rQ   rX   )rs   r[   )r   rF  r?  r>  rG  rH  rI  rJ  rK  Zraw_argsoutputsr7   r7   r8   6generate_extern_kernel_alloc_and_find_schema_if_needed  s    zEWrapperCodeGen.generate_extern_kernel_alloc_and_find_schema_if_neededc              	   C   s  t jr|   t }|| j tjjr>tjj	r>tjj
r>t }t }|| j  t jrh| | t jrv|   |rt jr|   d| _n|   t jjr|   | jD ](}t|tr|| j q| j| q|  }|    t jj!r| jtjj"#  t jr| $  t jjr(| %  | &| W 5 Q R X | '  || j( |  || j W 5 Q R X | )| || j* | +| | ,| |- S )NF).r   Zprofile_bandwidthr  r)   r   r   r$   r3   r   r   Zis_const_graph
contextlib	ExitStackenter_contextr   rt   Zprofiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphr*  memory_planr   memory_plan_reuser  Zstore_cubin!generate_reset_kernel_saved_flagsr   rc   r   r   rs   r  r  r  r   r  generate_end_graph generate_save_uncompiled_kernelsr#  finalize_prefixr   r&  r   r'  add_benchmark_harnessZgetvaluewithlinemap)r   Zis_inferencer$  stackr   r!  r7   r7   r8   generate  sP    










zWrapperCodeGen.generatec                 C   s"   ddl m} || | j| _d S )Nr%   )MemoryPlanner)r*  r\  r   r   )r   r\  r7   r7   r8   rS    s    zWrapperCodeGen.memory_planc                 C   s  t j }| jr>t| jd tr>| jd jj|kr>| j  q
t	 g}g }t
t| jD ]^}| j| }t|tr||d | j|< qXt|tr|t	  qXt|trX||  qX||  t|dksttdd |D }| jdk	otjo|tk| _d S )Nr  r   c                 s   s   | ]}|j V  qd S r0   )r   )rR   sr7   r7   r8   ri   9  s    z3WrapperCodeGen.memory_plan_reuse.<locals>.<genexpr>F)r$   r3   get_output_namesr   rc   r   r.   rV   r   r   rangerH   r   r   r   r   rI   sumr   r   MAX_STACK_ALLOCATION_SIZE)r   Z	out_namesZplanning_statesZpast_planning_statesir   r   r7   r7   r8   rT    s8    





z WrapperCodeGen.memory_plan_reuse)r   c              	   C   s,   | | j | d| d| j | j  d S )Nz_size = .)rs   r   r   r   r   r   rV   r7   r7   r8   codegen_input_size_var_declC  s    z*WrapperCodeGen.codegen_input_size_var_declc              	   C   s,   | | j | d| d| j | j  d S )Nz
_stride = rc  )rs   r   r   r   rd  r7   r7   r8   codegen_input_stride_var_declF  s     z,WrapperCodeGen.codegen_input_stride_var_decl)r   r  c                    s  t d fdd}t d fdd}t }dd tt| }ttfdd	| }|D ]F\}}	t|	tjrn|	|krn 	j
 |	 d
| j  ||	 qn|D ]l\}}
|
 }t|D ]R\}}	t|	tjr|	|kr҈ 	j
 |	 d
|| d| dj  ||	 qq|D ]t\}}
|
 }t|D ]X\}}	t|	tjrD|	|krD 	j
 |	 d
|| d| dj  ||	 qDq,dS )z$Assign all symbolic shapes to localsNc                    s     |  |  dS )N_size)re  rV   r   r   r7   r8   sizeofP  s    z-WrapperCodeGen.codegen_inputs.<locals>.sizeofc                    s     |  |  dS )NZ_stride)rf  rh  ri  r7   r8   strideofU  s    z/WrapperCodeGen.codegen_inputs.<locals>.strideofc                 S   s   t | d tjS )Nr%   )rc   rd   r   r
  r7   r7   r8   is_expr]  s    z.WrapperCodeGen.codegen_inputs.<locals>.is_exprc                    s
    |  S r0   r7   rl  )rm  r7   r8   <lambda>b      z/WrapperCodeGen.codegen_inputs.<locals>.<lambda>r  r   r   )r   r   ru   r   filterrE   rc   rd   Symbolrs   r   r   rx   r  	enumerater  )r   r   r  rj  rk  Z
bound_varsZgraph_inputs_exprZgraph_inputs_tensorsrV   shapevalueZshapesdimr7   )r   rm  r   r8   r  K  s<     $$zWrapperCodeGen.codegen_inputs)symc                 C   sj   t |tjrft|tjrf|| jkr&d S | j| tj	j
j| }| | j | d| | | j  d S Nr  )rc   rd   rq  r   r   ZPRECOMPUTED_SIZEr   rx   r$   r3   r4   Zinv_precomputed_replacementsrs   r   r   r   )r   rv  exprr7   r7   r8   ensure_size_computed|  s    
z#WrapperCodeGen.ensure_size_computedc                 C   s   d S r0   r7   r   r7   r7   r8   rX    s    zWrapperCodeGen.finalize_prefixTr5   )r
  r5   r/   c                C   s   t ||dS )Nrz  r   )r   r
  r5   r7   r7   r8   codegen_python_sizevar  s    z%WrapperCodeGen.codegen_python_sizevar)r
  r/   c                 C   s
   |  |S r0   )r|  )r   r
  r7   r7   r8   codegen_sizevar  s    zWrapperCodeGen.codegen_sizevar)basenamerV   r5  r/   c                 C   s   | d| dS )Nr   r   r7   )r   r~  rV   r5  r7   r7   r8   codegen_tuple_access  s    z#WrapperCodeGen.codegen_tuple_access.)rs  r/   c                 C   sN   t t| j|}t|dkr dS t|dkr<d|d  dS dd| dS )Nr   z()r%   rW   r"  rQ   rX   )r   r;  r|  rH   r[   )r   rs  partsr7   r7   r8   codegen_python_shape_tuple  s    z)WrapperCodeGen.codegen_python_shape_tuplec                 C   s
   |  |S r0   )r  )r   rs  r7   r7   r8   rl     s    z"WrapperCodeGen.codegen_shape_tuplec                 C   s.   d d|t|t|| || |gS )Nzalloc_from_pool({})rQ   )r  r[   r   r   rl   )r   rV   offsetdtypers  r   r7   r7   r8   codegen_alloc_from_pool  s    z&WrapperCodeGen.codegen_alloc_from_poolc              	   C   s@   |  |}|  |}| |}d|  d| d| d| d	S )Nzreinterpret_tensor(rQ   rX   )rl   r}  r   )r   datar   r   r  writerr7   r7   r8   codegen_reinterpret_view  s    


z'WrapperCodeGen.codegen_reinterpret_viewc                 C   s   |  | d| d d S )Nz.copy_(rX   rs   )r   srcdstr7   r7   r8   codegen_device_copy  s    z"WrapperCodeGen.codegen_device_copyc                 C   s$   |  | j | d| | j  d S rw  )rs   r   r   )r   rV   rt  r7   r7   r8   codegen_multi_output  s    z#WrapperCodeGen.codegen_multi_outputc                 C   s4  dd |j D \}t|jdkr:| |j d| d nt|jdkrrt|jd trr| |j d| d nt|jdkrt|jd tr| |j d	| d | d
|j d|jd j d|j d|jd j d	 | |j d|j d|jd j  nt	d|j | |
  d d S )Nc                 s   s   | ]}|  V  qd S r0   r  )rR   tr7   r7   r8   ri     s     z8WrapperCodeGen.codegen_dynamic_scalar.<locals>.<genexpr>r   r  .item()r%   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // zunrecognized keypath z = None)r=  rH   Zkeypathrs   rv  rc   r   r   ZdivisorrI   r   )r   r.   r  r7   r7   r8   codegen_dynamic_scalar  s    "0z%WrapperCodeGen.codegen_dynamic_scalarc              	      s   fdd} fdd} fdd}  dddg   x  jd	d
d tjj D ]4\}} d|  ||| |	 |j
|j q\ttjjdkr܈ d tjj D ]"\}} d|  ||| qtjj D ]\}}t|tjrttjjj|d trqt|tjrB||tjjj|dd qdd | D }dd | D }	||||	| |  qddtjj  d}
 d|
   d W 5 Q R X d S )Nc                    s8     |  d| d| d| d| d
 d S )Nz = rand_strided(rQ   
, device='	', dtype=rX   )rs   r  )rV   rs  r   devicer  rz   r   r7   r8   add_fake_input  s    ,z@WrapperCodeGen.benchmark_compiled_module.<locals>.add_fake_inputc                    s     |  d|  d S rw  r  )rV   ro   rz   r7   r8   add_expr_input  s    z@WrapperCodeGen.benchmark_compiled_module.<locals>.add_expr_inputc                    s(   dd l } |  d||d d S )Nr   z = pickle.loads(rX   )picklers   dumps)rV   rt  r  r  r7   r8   add_torchbind_input  s    zEWrapperCodeGen.benchmark_compiled_module.<locals>.add_torchbind_inputr   z3def benchmark_compiled_module(times=10, repeat=10):z
                from torch._dynamo.testing import rand_strided
                from torch._inductor.utils import print_performance
                Tstripzglobal r   zimport pickle*   fallbackc                 S   s   g | ]}t jjj|d dqS r  r  r$   r3   r4   	size_hintr	  r7   r7   r8   rT     s   z<WrapperCodeGen.benchmark_compiled_module.<locals>.<listcomp>c                 S   s   g | ]}t jjj|d dqS r  r  r	  r7   r7   r8   rT     s   zcall([rQ   z])zfn = lambda: z8return print_performance(fn, times=times, repeat=repeat))
writelinesrt   r   r$   r3   	constantsrE   rs   r   r   r  r  rH   torchbind_constantsr  rc   rd   rq  r4   Z
var_to_valrN   r   r   r  r  r  r1   r2   r[   keys)r   rz   r  r  r  rV   rt  Ztorchbind_objrs  r   Zcall_strr7   r  r8   benchmark_compiled_module  sb        
 z(WrapperCodeGen.benchmark_compiled_modulec              	   C   sT   t js
dS | | |dddg |  |ddt  dg W 5 Q R X dS )zL
        Append a benchmark harness to generated code for debugging
        Nr   zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzcompiled_module_main('z', benchmark_compiled_module))r   Zbenchmark_harnessr  r  rt   r    )r   rz   r7   r7   r8   rY    s    

z$WrapperCodeGen.add_benchmark_harness)rV   rP   metadatac                 C   s2   |r| dnd}| j d| | d|  d S )Nr   r   z

r  )r   r   )r   rV   rP   r  cudaZmetadata_commentr7   r7   r8   define_kernel-  s    zWrapperCodeGen.define_kernelc              	      s  ddl m} |  |j}ddlm}m}m} g }	i }
g }g }t|jD ]\}}||krXqF|| }||j	krt||
|< qF|
| t|tjr|	
||| | d qFt|tjr|	
|||j | |jjd qF|	
||| t|ttjfrFtjj|drF|
| qFd}t|	||dttjj |
t !|dt"|	|d	gd
}t#|j$g}t%|dkr|& D ]$}t|tjtjfsx|
| qx|
t'| t(|}|| j)kr| j)| S | dt%| j) }||f| j)|< t* +d|d ddl,m-}m.} /|  d|i|0 }dd |D }/d|d|d|d j/|j1dd ddl,m2  ddl3m4 |h fdd| tjj }+d|j5 d t67|j$\}}t68|j$}d| d| }| 9|: | ||fS ) Nr   )patch_triton_dtype_reprr%   )KernelArgTypeSizeArg	TensorArg)rV   bufferr  )rV   r  r  r  ztl.int32)Z
size_dtyperB  )rB  )	signaturer  r  r_   _zasync_compile.triton(z, ''')gen_common_triton_importsTritonKernelr+  c                 S   s   g | ]}|j |j|jd qS )rw   Z	num_warpsZ
num_stagesr  )rR   r   r7   r7   r8   rT     s
   zDWrapperCodeGen.define_user_defined_triton_kernel.<locals>.<listcomp>zG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            Tr  )JITFunction)	constexprc           	   	      s  dd t | jD }| jjdi }| jjjD ]l}|kr@q0|| jjkr0| jj| }t| r  	d j
|jdd | | q0t|tttfrN  t|rd|jd}n|}|| }r.d	}t|tr
d
|j d|j }n
d
|}	| | d|  n	| d| | q0||kr0|dkr0t|dr0|jdr0	d|j d|j d|  | q0d S )Nc                 S   s   h | ]}|j d kr|jqS )LOAD_GLOBAL)opnameargval)rR   instr7   r7   r8   	<setcomp>  s   
zUWrapperCodeGen.define_user_defined_triton_kernel.<locals>.traverse.<locals>.<setcomp>r   z@triton.jitTr  ztl.constexpr(rX   r   : rc  r  tlr   r  zfrom z import z as )disBytecodefn__globals__rN   __code__co_namesrc   newliners   r   r  rx   rq   r   r   rt  r   r   r   hasattrr<  )	Z
cur_kernelZunqualified_loadsZglobal_annotationsZsymbol_namesymbolZ
symbol_str
annotationZannotion_codeZannotation_coder  Zcompile_wrapperr  Zsymbols_includedtraverser7   r8   r    sV    







zBWrapperCodeGen.define_user_defined_triton_kernel.<locals>.traversez''', device_str='z')z# Original path: r   );torch.utils._tritonr  r   commonr  r  r  rr  	arg_namesZ
constexprsr   rc   r   r   r   r2   r   r  r6   r  rq   rd   re   r$   r3   r4   Zstatically_known_equalsr,   r   creater3  r4  r   fromkeysr+   idr  rH   rC  r   rk   r   r)   rs   r  r  r  r   Zinductor_meta_commonr  r  Ztriton.languager  r   inspectgetsourcelinesgetsourcefiler  ry   )r   rP   r_   rw   r  original_namer  r  r  r  r  Znon_constant_indicesZequal_to_1_arg_idxidxr   r:   Zindex_dtyper6  	cache_keyrV   r  r  Zinductor_metar8  r  linenosrcfiler  r7   r  r8   !define_user_defined_triton_kernel3  s    


	  


 	;z0WrapperCodeGen.define_user_defined_triton_kernel)r+  c                 C   s   | d|j  d}|tjf| jkr^| j|tjf | | j | d| |j | j	  n"| | d| |j | j	  t
||jS )Nr  numelr  )r   r$   r3   r   rx   rs   r   r   r  r   r   )r   r+  treerx  r7   r7   r8   generate_numel_expr  s     "z"WrapperCodeGen.generate_numel_exprc                 C   s<   | j d|tj|fdd}| | |r8| d| j  d S )NZ	workspace)r%   )rs  r   zworkspace.zero_())make_allocationr   Zuint8rs   r   )r   nbytesr  Z	zero_fillr   r7   r7   r8   generate_workspace_allocation  s        
z,WrapperCodeGen.generate_workspace_allocationc                 C   s   | dd | d| j S )NrW   rQ   rX   )r[   r   )r   rV   	call_argsr7   r7   r8   rA     s    zWrapperCodeGen.wrap_kernel_callc                 C   s8   | j d | j dtjj d || j   d S )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)r   rs   r$   r3   Zgraph_idrP  rt   )r   rZ  r7   r7   r8   rQ  #  s
    z2WrapperCodeGen.generate_profiler_mark_wrapper_callc                 C   s   | j d d S )Nzstart_graph())r   rs   r   r7   r7   r8   rR  *  s    z#WrapperCodeGen.generate_start_graphc                 C   s   | j dtjd d S )Nz
end_graph(rX   )r   rs   r   Zprofile_bandwidth_outputr   r7   r7   r8   rV  -  s    z!WrapperCodeGen.generate_end_graphc                 C   s   | j dtj d d S )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            r   r   r   r   r   r7   r7   r8   rU  0  s
    z0WrapperCodeGen.generate_reset_kernel_saved_flagsc                 C   s   | j dtj d dS )a[  
        Precompile and save the CUBINs of the Triton kernels that haven't
        been precompiled and saved as a side effect of running the generated
        JIT model (Python wrapper). This can happen when the model contains
        control flow: only one pass through the control flow operators covers
        the kernels that are saved, the remaining kernels are not launched,
        hence not saved. The main purpose of this codegen is to compile and
        save the Triton kernels outside the active control flow path for
        subsequent AOTInductor code generation and compilation.
        r  a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_cuda_kernel(
                            grid=(0, 0, 0),   # use dummy grid
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            Nr  r   r7   r7   r8   rW  9  s
    z/WrapperCodeGen.generate_save_uncompiled_kernels)rV   	grid_argsc                 C   s   |S r0   r7   )r   rV   r  r7   r7   r8   generate_default_gridS  s    z$WrapperCodeGen.generate_default_gridrg   )grid_fnc
              
   C   s   |rd dd |D }
tjj }| |jtj}|rd dd |D }| d| d}| | d|
 d| d	| d qd
| d}| | d| d|
 d| d n| | || dS )a7  
        Generates kernel call code.

        cuda: Defines whether the backend is GPU. Otherwise the backend is CPU.

        triton: Defines whether the GPU backend uses Triton for codegen.
                Otherwise it uses the CUDA language for codegen.
                Only valid when cuda == True.
        rQ   c                 s   s   | ]}t |V  qd S r0   r{  rR   rb   r7   r7   r8   ri   l  s     z6WrapperCodeGen.generate_kernel_call.<locals>.<genexpr>c                 s   s   | ]}t |V  qd S r0   r{  r  r7   r7   r8   ri   p  s     rW   rX   r/  r0  r1  z	c_void_p(rc  N)	r[   r$   r3   r3  r4  r   r5  rs   rA  )r   rV   r  rg   Zdevice_indexr  r  r7  r  r6  Zcall_args_strr8  r9  Zgrid_strZ
stream_ptrr7   r7   r8   generate_kernel_callV  s    $z#WrapperCodeGen.generate_kernel_callc                 C   s   | j | d S r0   )r   r   )r   r   r7   r7   r8   rs   {  s    zWrapperCodeGen.writelinec                 C   s   |D ]}|  | qd S r0   r  )r   r   r   r7   r7   r8   r  ~  s    zWrapperCodeGen.writelinesc                 C   s   | j t| d S r0   )r   r   r!   )r   ctxr7   r7   r8   rP    s    zWrapperCodeGen.enter_contextc                    s   ddl m}m} | rdd l}t|tr4t|jjS t|t	j
rHt|S t|ttfrtjG dd d tt| fdd|D S t|tjjrt|S t|tjtfr| S | rt||jjr||S t|S d S )Nr   )dtype_to_stringhas_triton_packagec                   @   s   e Zd ZU eed< dd ZdS )z+WrapperCodeGen.val_to_arg_str.<locals>.Shimrefc                 S   s   | j S r0   )r  r   r7   r7   r8   __repr__  s    z4WrapperCodeGen.val_to_arg_str.<locals>.Shim.__repr__N)r   r   r   r   r   r  r7   r7   r7   r8   Shim  s   
r  c                 3   s   | ]}  |V  qd S r0   )val_to_arg_str)rR   ar  r   r7   r8   ri     s     z0WrapperCodeGen.val_to_arg_str.<locals>.<genexpr>)r  r  r  r  rc   r   r   r.   rx  rd   r   rk   r   r   	dataclassrB   r   r   _ops
OpOverloadr   r   r   r   r  languager  )r   r]  type_r  r  r  r7   r  r8   r    s$    
 zWrapperCodeGen.val_to_arg_strc                 C   s>   |  }| }t| }t| }| | ||||S r0   )r1   r2   rk   r  r  r  r   )r   r  r  r  rs  r   r7   r7   r8   r     s
    z%WrapperCodeGen.make_buffer_allocationc              
   C   sj   |j dkr:| d|j  d| | d| | d| d
S | d| | d| | d|j  d| d
S )	N)r   r  z = empty_strided_rW   rQ   rX   z = empty_strided(r  r  )r   rl   )r   rV   r  r  rs  r   r7   r7   r8   r    s
    
..zWrapperCodeGen.make_allocationc              	   C   s(   | j  | d| | j d| j d| 	S )Nr    rU   )r   r   r   )r   new_nameold_namer   r7   r7   r8   make_tensor_alias  s    z WrapperCodeGen.make_tensor_aliasc                 C   s   d|   S )Ndel )r   )r   r  r7   r7   r8   r     s    zWrapperCodeGen.make_buffer_free)names_to_delc                 C   s   dd dd |D  S )Nr  rQ   c                 s   s   | ]
}|V  qd S r0   r7   )rR   rV   r7   r7   r8   ri     s     z4WrapperCodeGen.make_free_by_names.<locals>.<genexpr>)r[   )r   r  r7   r7   r8   make_free_by_names  s    z!WrapperCodeGen.make_free_by_names)r  r  del_linec              	   C   s(   | j  | d| | | j d| j d	S )Nr  r   reuse)r   r   r   )r   r  r  r  r7   r7   r8   codegen_exact_buffer_reuse  s    z)WrapperCodeGen.codegen_exact_buffer_reuse)r   c                 C   s   |  |  kst| }| }d}|tj krJ|rJd| | }| | kr| | kr|| j	kr~|| j	|< | 
|||S | || | d| j}|| j	kr|| j	|< | j | d| | d| j dS )N;z; r   r  r  r  )r2   rI   r   r$   r3   r^  r   r  r  r   r  r  r   r   r   )r   oldnewr   r  r  r  Zreinterpret_viewr7   r7   r8   r     s(     

    

z WrapperCodeGen.make_buffer_reusec                 C   s:   |  t|| j | d|j  | j d| j d d S )Nr  r  z alias)rs   r(   r   viewr  r   r   )r   rV   r6   r7   r7   r8   codegen_deferred_allocation  s    (z*WrapperCodeGen.codegen_deferred_allocationc                 C   s   |  }|tjjks|| jkr"d S | j| t|tjtj	frDd S |
 }t|tjr\d S t|tjrt|jtjstdt|j d|j | |jj | || d S | t| | d S )Nzunexpected r  )r   r$   r3   r   r   rx   rc   r   ZExternKernelAllocZMultiOutputZ
get_layoutZMutationLayoutSHOULDREMOVEZNonOwningLayoutr  r   rI   r   codegen_allocationr  r  rs   r   )r   r  rV   r6   r7   r7   r8   r    s.    
 z!WrapperCodeGen.codegen_allocationc                 C   sj   |  dkstd| }t|tjr<| | | d S | |sJd S | j	
| | t| | d S )Nr   z)Only support zero workspace size for now!)Zget_workspace_sizerI   r   rc   r   ZInputBufferrs   r   	can_reuser   rx   r   )r   r  rV   r7   r7   r8   codegen_free  s    

zWrapperCodeGen.codegen_freec                 C   sV   |  }|tjjksN|tjjksN|tjjksN|tjjksN|tjjksN|| jkrRdS dS )NFT)	r   r$   r3   r   r  r  r  Znever_reuse_buffersr   )r   input_bufferoutput_bufferrV   r7   r7   r8   r    s    




zWrapperCodeGen.can_reusec                 C   s$   |  | jko"| j|   |  kS r0   )r   r   )r   r  Zreused_bufferr7   r7   r8   	did_reuse  s    zWrapperCodeGen.did_reusec                 C   sf   t |t |kst| | | j|  | j|  | | j| < | t	| || d S r0   )
r9   rI   r  r   rx   r   r   r   rs   r   )r   r  r	  r7   r7   r8   codegen_inplace_reuse%  s    
z$WrapperCodeGen.codegen_inplace_reusec                 C   s0   t |}|| jkr|S | j| | j| S d S r0   )r   r   rx   r   )r   r  rV   r7   r7   r8   codegen_unbacked_symbol_decl-  s
    
z+WrapperCodeGen.codegen_unbacked_symbol_declc                 C   s<   t |jj|D ](\}}| | j | d| | j  qd S rw  )rv   r3   r  rs   r   r   )r   subgraphouter_inputsouter_outputsZinner_inputZouter_inputr7   r7   r8   codegen_subgraph_prefix6  s    z&WrapperCodeGen.codegen_subgraph_prefixc                 C   s:   t |jj|D ]&\}}| | d|  | j  qd S rw  )rv   r3   r  rs   r  r   )r   r  r  r  Zinner_outputZouter_outputr7   r7   r8   codegen_subgraph_suffix:  s     z&WrapperCodeGen.codegen_subgraph_suffixc              	   C   s   zp| |j | | j d|j  | ||| tj}t|j |jj	|d W 5 Q R X | 
||| W 5 |    X d S )Nz subgraph: )parent_graph)r   r   r3   rs   r   rV   r  r$   Zset_graph_handlercodegen_subgraphr  )r   r  r  r  r  r7   r7   r8   r  B  s    zWrapperCodeGen.codegen_subgraphc                    s  |   |   dt|j  dd |jD } fddtt|jD }|j }t|jt	j
sp| d}|   dt|j  | d| d | t| |jj | |j|| | t|  | d | t| |jj | |j|| | t|  d S )	N = [None] * c                 S   s   g | ]}|  qS r7   r  rR   r  r7   r7   r8   rT   U  s     z6WrapperCodeGen.codegen_conditional.<locals>.<listcomp>c                    s   g | ]}  d | dqS r   r   r7   rR   rb  rh  r7   r8   rT   V  s     r  rp   r   zelse:)r   rs   rH   rL  Zoperandsr_  	predicater  rc   r   ZShapeAsConstantBufferr   Ztrue_subgraphr3   r  r   Zfalse_subgraph)r   Zconditionalr  r  r  r7   rh  r8   codegen_conditionalP  s     


z"WrapperCodeGen.codegen_conditionalc           
         s6  |   dd |jD }dd |jD }|   dt|  t|D ]"\}}|   d| d|  qH fddtt|D |}  dg}t|}|d t| }	| d	 | t| |j	j
 | |j	|| | d
|d  d | t|  | t| |jj
 | |j||	 | t|  d S )Nc                 S   s   g | ]}|  qS r7   r  r  r7   r7   r8   rT   i  s    z5WrapperCodeGen.codegen_while_loop.<locals>.<listcomp>c                 S   s   g | ]}|  qS r7   r  r  r7   r7   r8   rT   l  s    r  r   z] = c                    s   g | ]}  d | dqS r  r7   r  rh  r7   r8   rT   v  s     Z_cond_resultzwhile True:zif not r   z.item(): break)r   Zcarried_inputsZadditional_inputsrs   rH   rr  r_  r   r   Zcond_subgraphr3   r  r   Zbody_subgraph)
r   Z
while_loopZouter_carried_inputsZouter_additional_inputsrb  ZinpZcond_outer_inputsZcond_outer_outputsZbody_outer_inputsZbody_outer_outputsr7   rh  r8   codegen_while_loopg  sH    
    z!WrapperCodeGen.codegen_while_loopc                 C   sF   z*t | dd rW d S tjj| }t|W S  tk
r@   Y d S X d S )NZfree_symbols)r   r$   r3   Z
_shape_envZ_maybe_evaluate_staticrq   	Exception)r
  ro   r7   r7   r8   statically_known_int_or_none  s    
z+WrapperCodeGen.statically_known_int_or_nonec                 C   s4   g }| D ]&}t |}|d kr$ d S || q|S r0   )r^   r  r   )lstr$  r
  numr7   r7   r8   %statically_known_list_of_ints_or_none  s    
z4WrapperCodeGen.statically_known_list_of_ints_or_nonec                 C   s   t | d k	S r0   )r^   r  )r  r7   r7   r8    is_statically_known_list_of_ints  s    z/WrapperCodeGen.is_statically_known_list_of_intsc                 C   s   t |  S r0   )r^   r  r  r  r7   r7   r8   r     s    z.WrapperCodeGen.static_shape_for_buffer_or_nonec                 C   s   t | d k	S r0   )r^   r   r!  r7   r7   r8   !can_prove_buffer_has_static_shape  s    z0WrapperCodeGen.can_prove_buffer_has_static_shape)N)N)r   NNN)NT)NNTTNrg   N)N)r   )N)qr   r   r   __doc__r   r   r   r   r   r  TritonMetaParamsr  r   r  r  r  r  r   rq   r   r  r   r   r  r  r   r#  r)   r&  r'  r)  r(  r   r.  r:  r@  rE  r   r  r  rM  r   r[  rS  rT  re  rf  r   r   Z	TensorBoxr  rd   rq  ry  rX  r   r   r|  r}  r  r
   r  rl   r  r  r  r  r  r  rY  r  r  r  r  rA  rQ  rR  rV  rU  rW  r   r  r  rs   r  rP  r  r   r  r  r   r  r  r   r  r  r  r  r
  r  r  r  r  r  r  r  staticmethodr  r  r   r   r"  r   r7   r7   r   r8   r^     s  ?%    
    

>& 1
P      S	       	%

	
	*

	

)N)ir   rN  r   r  r   r  r   rF   	itertoolsr   typingr   r   r   r   r   r   r	   r
   r   r   rd   r   r   Z
torch._opsZtorch._dynamo.utilsr   r   Z$torch._inductor.codegen.multi_kernelr   Z%torch.fx.experimental.symbolic_shapesr   r   r   Ztorch.fx.noder   Z torch.utils._sympy.singleton_intr   Ztorch.utils._sympy.symbolr   r   r   r   r   r   r   runtimer   Zruntime.hintsr   utilsr   r    r!   r"   r#   Zvirtualizedr$   Zaoti_hipify_utilsr&   r  r'   r(   r)   r*   Ztriton_utilsr+   r,   r  r3   r-   Zdoprintr   r  r  r   r   r   r9   ZArgumentrL   rO   r  r  r]   rq   r$  rr   r~   r  r   ra  r   r   r   r   r   r   r   r   r   r   r   Z
BufferNamer^   r7   r7   r7   r8   <module>   s   0*	 
*	.