U
    yhO}                    @  s@  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
mZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d	d
l m!Z!m"Z"m#Z# d	dl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? ddl@mAZBmCZCmDZDmEZEmFZF ddlGmHZH ddlImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQ ddlRmSZSmTZTmUZUmVZVmWZW ddlXmYZYmZZZm[Z[ erddl(m\Z\ e]e^Z_ej`ae^dZbej`ae^dZcej`ae^dZdeddd Zeeddd  ZfejgG d!d" d"ZhejgG d#d$ d$Zid%d&d&d'd(d)ZjG d*d+ d+eOZkek jlZmd,d- Znd.d/ Zod0d1 ZpG d2d3 d3eKZqG d4d5 d5eNZrersd6 d5d7d8d9d:ZtG d;d< d<erZud<d7d8d=d>ZvG d?d@ d@ZwG dAdB dBeVZxG dCdD dDeWZydS )E    )annotationsN)	lru_cache)
AnyCallablecastDictListOptionalSetTupleTYPE_CHECKINGUnion)preserve_rng_state)AutotuneHintDeviceProperties)is_integer_dtype)has_triton_package   )free_symbol_is_typesymbol_is_typeSymT)ValueRanges   )configir)	code_hashget_pathPyCodeCache)is_metric_table_enabledlog_kernel_metadata)ReductionHintTRITON_MAX_BLOCK)do_bench_gpuget_max_y_gridnext_power_of_2)cache_on_selfget_bounds_index_exprget_fused_kernel_nameget_kernel_metadatais_welford_reductionPlaceholder	sympy_dot
sympy_subs)_ops
OpsHandlerReductionType	StoreModeV)"get_kernel_category_by_source_code   )CSECSEVariableDeferredLineIndentedBufferOpOverridesPythonPrinterSizeArg	TensorArg)constant_reprIterationRangesEntrypexpr
SIMDKernelSIMDScheduling)	config_ofsignature_ofsignature_to_meta)IRNodeZ
perf_hintsZscheduleZfusionc                  C  s,   t  s
dS ddl} t| jjdr$dS dS dS )zd
    import AttrsDescriptor if the triton version is new enough to have this
    class defined.
     r   NZAttrsDescriptorz4from triton.compiler.compiler import AttrsDescriptor)r   Ztriton.compiler.compilerhasattrcompiler)triton rI   P/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/codegen/triton.pygen_attr_descriptor_importK   s    rK   c                  C  s6   t  } | d t  }r$| | | d |  S )NzD
        import triton
        import triton.language as tl
        a,  
        from torch._inductor.runtime import triton_helpers, triton_heuristics
        from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties
        )r7   splicerK   	writelinegetvalue)ZimportsZ	attr_descrI   rI   rJ   gen_common_triton_imports\   s    

rO   c                   @  sf   e Zd ZU ded< ded< ded< ded< ded	< d
ed< dd Zdd Zdd Zdd Zdd ZdS )IndexingOptionsstr	index_strSet[sympy.Symbol]	mask_varsmask_strzOptional[str]
expand_strbool_has_rindex
sympy.Exprindexc                 C  s
   t | jS N)rW   rT   selfrI   rI   rJ   has_mask{   s    zIndexingOptions.has_maskc                 C  s   t | jtjS r[   )r   rZ   r   TMPr\   rI   rI   rJ   has_indirect~   s    zIndexingOptions.has_indirectc                 C  s   | j S r[   )rX   r\   rI   rI   rJ   
has_rindex   s    zIndexingOptions.has_rindexc                 C  s
   d| j kS )NtmprU   r\   rI   rI   rJ   has_tmpmask   s    zIndexingOptions.has_tmpmaskc                 C  s
   d| j kS )NZrmaskrc   r\   rI   rI   rJ   	has_rmask   s    zIndexingOptions.has_rmaskN)	__name__
__module____qualname____annotations__r^   r`   ra   rd   re   rI   rI   rI   rJ   rP   r   s   
rP   c                   @  s   e Zd ZU ded< ded< ded< ded< ded	< ded
< ded< ded< eddddd dddZd'dddddZeddddZdd Z	dd Z
dd Zd d! Zd"d# Zd$d% Zd&S )(BlockPtrOptionsrY   constant_offsetzList[sympy.Expr]shapestrides	List[str]block_shapez	List[int]orderoffsetsrS   rT   reshape_suffixzList[IterationRangesEntry])rm   rk   range_treesrT   returnc           	   
     s  dd |D }|}dd | D  t  D ]\}}|r*d||< q*tjjrd|d jdksZt|d tjjst| ttjj	d krtjj	d dkr|
d  fd	d
}ttjj|dd ||D ttjjj|| ||tjj|| |dd |D ||dS )z,Helper to create a  BlockPtrOptions instancec                 S  s   g | ]}|j   d qS )BLOCK)prefixupper.0trI   rI   rJ   
<listcomp>   s     z*BlockPtrOptions.create.<locals>.<listcomp>c                 S  s   g | ]}|d kqS )r   rI   ry   srI   rI   rJ   r{      s     1r   xr3   c                   s(   t | t  kstdd t|  D S )z3Removes any broadcasting dims from a given sequencec                 S  s   g | ]\}}|s|qS rI   rI   )ry   itemis_broadcastingrI   rI   rJ   r{      s   z:BlockPtrOptions.create.<locals>.filter.<locals>.<listcomp>)lenAssertionErrorzip)itZbroadcasting_dimrI   rJ   filter   s    z&BlockPtrOptions.create.<locals>.filterc                 S  s   g | ]}t jj|jqS rI   )r1   graphsizevarslookup_precomputed_sizenumelrx   rI   rI   rJ   r{      s   c                 S  s   g | ]}|j  d qS )offsetrv   rx   rI   rI   rJ   r{      s     )rk   rl   rm   ro   rp   rq   rT   rr   )	enumerater1   kernelno_x_dimrv   r   popinside_reductionr   numelsappendrj   r   r   r   mapZguarded_order)	rm   rk   rs   rT   ro   rr   ir   r   rI   r   rJ   create   s:    


	zBlockPtrOptions.createTrQ   namert   c                 C  s   t jj}| j}|s"d||d< | jdkrB| d|| j dn|d|| j d|| j d|| j d	|| j	 d
|| g}dd
| dS )a  
        Codegen a call to tl.make_block_ptr()

        Args:
            name: variable name for pointer
            roffset: should roffset be included in offsets=..., for use with tl.advance()

        Returns:
            "tl.make_block_ptr(...)"
        0roffsetr    + ()zshape=zstrides=zblock_shape=zorder=zoffsets=ztl.make_block_ptr(, )r1   r   index_to_strrq   rZ   rk   rl   rm   ro   rp   join)r]   r   r   frq   argsrI   rI   rJ   format   s    
zBlockPtrOptions.formatrt   c                 C  s   g }t t| jD ]l}| j| dkrtjj| j| dstjj	| j| t
| j| d  stjjrt| j| dks|| q|S )z6List of indices to pass to tl.load(boundary_check=...)r~   r   XBLOCK)ranger   rl   ro   r1   r   r   Zstatically_known_equalsrm   Zstatically_known_multiple_ofr!   r   r   r   )r]   checkr   rI   rI   rJ   boundary_check   s"    	zBlockPtrOptions.boundary_checkc                 C  s,   dgt | j }d|| jd< tj|S )z/Codegen string to pass to tl.advance(name, ...)r   RBLOCKr   )r   rl   rq   rZ   r1   r   r   )r]   advancerI   rI   rJ   advance_roffset   s    zBlockPtrOptions.advance_roffsetc                 C  s   dS NFrI   r\   rI   rI   rJ   r`      s    zBlockPtrOptions.has_indirectc                 C  s
   d| j kS )Nr   )ro   r\   rI   rI   rJ   ra      s    zBlockPtrOptions.has_rindexc                 C  s   |   S r[   )ra   r\   rI   rI   rJ   re     s    zBlockPtrOptions.has_rmaskc                 C  s   dS r   rI   r\   rI   rI   rJ   rd     s    zBlockPtrOptions.has_tmpmaskc                 C  s   t |  S r[   )rW   r   r\   rI   rI   rJ   r^     s    zBlockPtrOptions.has_maskN)T)rf   rg   rh   ri   staticmethodr   r   r%   r   r   r`   ra   re   rd   r^   rI   rI   rI   rJ   rj      s&   
3rj   rQ   rn   )value	old_shape	new_shapec                 C  s   t |trt |tst||kr$| S dd |D |krNd|  dd| dS d}g }|D ]F}|t|k r||| kr|d |d	7 }qZ|d
kst|d qZ|t|kst|  dd| dS )z7Workaround https://github.com/openai/triton/issues/2836c                 S  s   g | ]}|d kr|qS )r~   rI   r|   rI   rI   rJ   r{     s      z"triton_reshape.<locals>.<listcomp>ztl.reshape(z, [r   z])r   :r3   r~   None[])
isinstancelistr   r   r   r   )r   r   r   idxexpandsizerI   rI   rJ   triton_reshape  s    

r   c                   @  s   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Zd-d. Zd/d0 Zd1d2 Zd3S )4TritonPrinterc                 C  s4   t |jdkstd| |jd  dtjj dS )Nr3   libdevice.trunc(r   ).to(r   r   r   r   _printr1   r   index_dtyper]   exprrI   rI   rJ   _print_TruncToInt&  s     zTritonPrinter._print_TruncToIntc                 C  s.   t |jdkst| | |jd  dS )Nr3   r   z.to(tl.float64))r   r   r   parenr   r   rI   rI   rJ   _print_ToFloat,  s    zTritonPrinter._print_ToFloatc                 C  s   d t| jt| j|jS )N % )r   r   r   r   r   r   rI   rI   rJ   _print_PythonMod5  s    zTritonPrinter._print_PythonModc                 C  sF   |j s
t|j\}}| | |}| | |}d| d| dS )N( // r   )
is_integerr   r   r   doprint)r]   r   r   divrI   rI   rJ   _print_FloorDiv<  s
    

zTritonPrinter._print_FloorDivc                 C  s0   |j \}}| | | d| | | S )Nz / )r   r   r   )r]   r   lhsrhsrI   rI   rJ   _print_IntTrueDivE  s    
zTritonPrinter._print_IntTrueDivc                 C  s4   t |jdkstd| |jd  dtjj dS Nr3   libdevice.floor(r   r   r   r   r   rI   rI   rJ   _print_floorK  s     zTritonPrinter._print_floorc                 C  s4   t |jdkstd| |jd  dtjj dS r   r   r   rI   rI   rJ   _print_FloorToIntQ  s     zTritonPrinter._print_FloorToIntc                 C  s4   t |jdkstd| |jd  dtjj dS Nr3   libdevice.ceil(r   r   r   r   r   rI   rI   rJ   _print_ceilingW  s    zTritonPrinter._print_ceilingc                 C  s4   t |jdkstd| |jd  dtjj dS r   r   r   rI   rI   rJ   _print_CeilToInt[  s    zTritonPrinter._print_CeilToIntc                 C  s   d|  | dS )Nlibdevice.sqrt(z.to(tl.float32)))r   r   rI   rI   rJ   _helper_sqrt_  s    zTritonPrinter._helper_sqrtc                 C  sH   |  |jd }|  |jd }|  |jd }d| d| d| dS )Nr   r3   r   	tl.where(r   r   )r   r   )r]   r   cpqrI   rI   rJ   _print_Whereb  s    zTritonPrinter._print_Wherec                 C  s|   t |j}t |jdkr(| |jd S t |jd }| tj|jd |  }| tj|j|d   }d| d| dS )Nr3   r   r   ztl.minimum(r   r   )r   r   r   sympyZMinr]   r   nargsmidabrI   rI   rJ   
_print_Minh  s    
zTritonPrinter._print_Minc                 C  s|   t |j}t |jdkr(| |jd S t |jd }| tj|jd |  }| tj|j|d   }d| d| dS )Nr3   r   r   ztl.maximum(r   r   )r   r   r   r   ZMaxr   rI   rI   rJ   
_print_Maxr  s    
zTritonPrinter._print_Maxc                 C  s*   t |jdkstd| |jd  dS )Nr3   tl_math.abs(r   r   r   r   r   r   r   rI   rI   rJ   
_print_Abs}  s    zTritonPrinter._print_Absc                 C  s*   t |jdkstd| |jd  dS )Nr3   zlibdevice.cos((r   ).to(tl.float32))r   r   rI   rI   rJ   _print_OpaqueUnaryFn_cos  s    z&TritonPrinter._print_OpaqueUnaryFn_cosc                 C  s*   t |jdkstd| |jd  dS )Nr3   zlibdevice.cosh((r   r   r   r   rI   rI   rJ   _print_OpaqueUnaryFn_cosh  s    z'TritonPrinter._print_OpaqueUnaryFn_coshc                 C  s*   t |jdkstd| |jd  dS )Nr3   zlibdevice.acos((r   r   r   r   rI   rI   rJ   _print_OpaqueUnaryFn_acos  s    z'TritonPrinter._print_OpaqueUnaryFn_acosc                 C  s*   t |jdkstd| |jd  dS )Nr3   zlibdevice.sin((r   r   r   r   rI   rI   rJ   _print_OpaqueUnaryFn_sin  s    z&TritonPrinter._print_OpaqueUnaryFn_sinc                 C  s*   t |jdkstd| |jd  dS )Nr3   zlibdevice.sinh((r   r   r   r   rI   rI   rJ   _print_OpaqueUnaryFn_sinh  s    z'TritonPrinter._print_OpaqueUnaryFn_sinhc                 C  s*   t |jdkstd| |jd  dS )Nr3   zlibdevice.asin((r   r   r   r   rI   rI   rJ   _print_OpaqueUnaryFn_asin  s    z'TritonPrinter._print_OpaqueUnaryFn_asinc                 C  s*   t |jdkstd| |jd  dS )Nr3   zlibdevice.tan((r   r   r   r   rI   rI   rJ   _print_OpaqueUnaryFn_tan  s    z&TritonPrinter._print_OpaqueUnaryFn_tanc                 C  s*   t |jdkstd| |jd  dS )Nr3   zlibdevice.tanh((r   r   r   r   rI   rI   rJ   _print_OpaqueUnaryFn_tanh  s    z'TritonPrinter._print_OpaqueUnaryFn_tanhc                 C  s*   t |jdkstd| |jd  dS )Nr3   zlibdevice.atan((r   r   r   r   rI   rI   rJ   _print_OpaqueUnaryFn_atan  s    z'TritonPrinter._print_OpaqueUnaryFn_atanc                 C  s*   t |jdkstd| |jd  dS )Nr3   zlibdevice.llrint(r   r   r   r   rI   rI   rJ   _print_RoundToInt  s    zTritonPrinter._print_RoundToIntc                 C  sb   t |jdkst|j\}}|jr>|dk s.ttd| dd| d| | | d|  S )Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .zlibdevice.nearbyint(1e * z) * 1e)r   r   r   r   
ValueErrorr   r   )r]   r   numberndigitsrI   rI   rJ   _print_RoundDecimal  s    

z!TritonPrinter._print_RoundDecimalN)rf   rg   rh   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rI   rI   rI   rJ   r   %  s2   		
r   c                 C  sn   t | dd }|dkr d}nD|dkr.d}n6|dkr<d}n(|d	krJd
}n|dkrXd}n|d	krdd}d| S )Nr   r   rW   Zint1)float16bfloat16Zfloat32float8_e4m3fn
float8e4nvfloat8_e5m2float8e5Zfloat8_e4m3fnuzZ
float8e4b8Zfloat8e5b16tl.rQ   splitdtypeZtriton_type_namerI   rI   rJ   triton_compute_type  s    r  c                 C  sD   t | dd }|dkr d}n|dkr.d}n|dkr:d}d	| S )
Nr   r   rW   int8r   r   r  r  r  r  r  rI   rI   rJ   triton_store_type  s    r
  c                 C  s2   t | r*| jr*| tjkrdnd}d| S t| S )N@       ztl.int)r   	is_signedtorchint64r  )r  nbitsrI   rI   rJ   triton_acc_type  s    
r  c                      s*   e Zd Zdd fddZdd Z  ZS )TritonCSEVariablezValueRanges[Any]boundsc                   s   t  || t | _d S r[   )super__init__setrT   )r]   r   r  	__class__rI   rJ   r    s    zTritonCSEVariable.__init__c                 C  s\   |D ]R}t |tr"| j|j qt |tjr|jd dkr| j|jd  dh qd S )Nr   Zxyrmask)r   r  rT   updater   Symbolr   )r]   r   r   kwargsargrI   rI   rJ   update_on_args  s
    
z TritonCSEVariable.update_on_args)rf   rg   rh   r  r  __classcell__rI   rI   r  rJ   r    s   r  c                   @  s  e Zd ZdZeddddddZeddddd	Zed
d Zedd Z	edd Z
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zed d! Zed"d# Zed$d% Zed&d' Zed(d) Zed*d+ Zed,d- Zed.d/ Zed0d1 Zed2d3 Zed4d5 Zed6d7 Zed8d9 Zed:d; Z ed<d= Z!ed>d? Z"ed@dA Z#edBdC Z$edDdE Z%edFdG Z&edHdI Z'edJdK Z(edLdM Z)edNdO Z*edPdQ Z+edRdS Z,edTdU Z-edVdW Z.edXdY Z/edZd[ Z0ed\d] Z1ed^d_ Z2ed`da Z3edbdc Z4eddde Z5edfdg Z6edhdi Z7edjdk Z8edldm Z9edndo Z:edpdq Z;edrds Z<edtdu Z=edvdw Z>edxdy Z?edzd{ Z@ed|d} ZAed~d ZBedd ZCedd ZDedd ZEedd ZFedd ZGedd ZHedd ZIedd ZJedd ZKedd ZLedd ZMedd ZNdS )TritonOverrideszMap element-wise ops to TritonNtorch.dtypezOptional[torch.dtype])r  	src_dtypec                 C  sp   dddddd}|d k	r2t |||tjjtj_|tjkrHd|  dS |tjkr\|  dS |  d	t| d
S )Nr"  int)r#  	dst_dtypert   c                 S  sl   | |krdS t jt jh}| |kr8||kr8| |kr8td| t jksL|t jkrPdS | t jksd|t jkrhdS dS )Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!   r   )r  r   r  r   )r#  r%  Z
fp8_dtypesrI   rI   rJ   _get_min_elements_per_thread  s$    z>TritonOverrides.to_dtype.<locals>._get_min_elements_per_threadr   z != 0)z.to(tl.int8).to(tl.uint8).to(r   )maxr1   r   min_elem_per_threadr  rW   Zuint8r  )r   r  r#  r'  rI   rI   rJ   to_dtype  s    


zTritonOverrides.to_dtypec                 C  sh   t |}|tjtjfkrTt|dd }|  d| d}| d| d}| dS |  d| dS d S )Nr   r   z.to(tl.r   r(  z, bitcast=True).to(tl.float32))r  r  r   r   rQ   r  )r   r  r#  triton_dtypeZtriton_src_dtypeZcast_xrI   rI   rJ   to_dtype_bitcast  s    
z TritonOverrides.to_dtype_bitcastc                 C  sD   t j|}t|| }t|}|dkr,|S d| d| d| dS )Nz
tl.float32tl.full(r   r   )r  Z_prims_commonZdtype_to_typer<   r  )r   r  rl   type_Z
triton_valZtriton_typerI   rI   rJ   _shaped_constant.  s    z TritonOverrides._shaped_constantc                 C  s   | j ||g dS )Nrl   )r1  )clsr   r  rI   rI   rJ   constant<  s    zTritonOverrides.constantc                 C  s   d|  dS )Nr   r   rI   r   rI   rI   rJ   abs@  s    zTritonOverrides.absc                 C  s   d|  dS )Nzlibdevice.abs(r   rI   r5  rI   rI   rJ   libdevice_absD  s    zTritonOverrides.libdevice_absc                 C  s   d|  dS )Nztl_math.exp(r   rI   r5  rI   rI   rJ   expH  s    zTritonOverrides.expc                 C  s   d|  dS )Nzlibdevice.exp(r   rI   r5  rI   rI   rJ   libdevice_expL  s    zTritonOverrides.libdevice_expc                 C  s   d|  dS )Nzlibdevice.exp2(r   rI   r5  rI   rI   rJ   exp2P  s    zTritonOverrides.exp2c                 C  s   d|  dS )Nzlibdevice.expm1(r   rI   r5  rI   rI   rJ   expm1T  s    zTritonOverrides.expm1c                 C  s   d|  dS Nr   r   rI   r5  rI   rI   rJ   sqrtX  s    zTritonOverrides.sqrtc                 C  s   d|  dS r<  rI   r5  rI   rI   rJ   libdevice_sqrt\  s    zTritonOverrides.libdevice_sqrtc                 C  sp   t jj}|dkrdS |dkr.d|  d|  dS |dkr@|  dS |d kr^ttd	tj| S td
|d S )NZcompile_errorzcompile error!Zruntime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", r   Zaccuracyz + 1r   z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )	r   rH   Zinject_relu_bug_TESTING_ONLYopsmaximumr4  r  int32r   )r   bugrI   rI   rJ   relu`  s    
zTritonOverrides.reluc                 C  s   d|  d| dS )Nztriton_helpers.minimum(r   r   rI   r   r   rI   rI   rJ   minimumr  s    zTritonOverrides.minimumc                 C  s   d|  d| dS )Nztriton_helpers.maximum(r   r   rI   rD  rI   rI   rJ   r@  v  s    zTritonOverrides.maximumc                 C  s   d|  d| d| dS )Nr   r   r   rI   )r   r   r   rI   rI   rJ   wherez  s    zTritonOverrides.wherec                 C  s   d|  dS )Nztl_math.cos(r   rI   r5  rI   rI   rJ   cos~  s    zTritonOverrides.cosc                 C  s   d|  dS )Nzlibdevice.cos(r   rI   r5  rI   rI   rJ   libdevice_cos  s    zTritonOverrides.libdevice_cosc                 C  s   d|  dS )Nztl_math.sin(r   rI   r5  rI   rI   rJ   sin  s    zTritonOverrides.sinc                 C  s   d|  dS )Nzlibdevice.sin(r   rI   r5  rI   rI   rJ   libdevice_sin  s    zTritonOverrides.libdevice_sinc                 C  s   t dd S )Nz/ops.index_expr not implemented outside a kernelNotImplementedError)r3  r   r  rI   rI   rJ   
index_expr  s    zTritonOverrides.index_exprc                 C  s   t dd S )Nz+ops.masked not implemented outside a kernelrK  )r  bodyotherrI   rI   rJ   masked  s    zTritonOverrides.maskedc                 C  s   d|  dS )Nzlibdevice.lgamma(r   rI   r5  rI   rI   rJ   lgamma  s    zTritonOverrides.lgammac                 C  s   d|  dS )Nzlibdevice.erf(r   rI   r5  rI   rI   rJ   erf  s    zTritonOverrides.erfc                 C  s   d|  dS )Nzlibdevice.cosh(r   rI   r5  rI   rI   rJ   cosh  s    zTritonOverrides.coshc                 C  s   d|  dS )Nzlibdevice.sinh(r   rI   r5  rI   rI   rJ   sinh  s    zTritonOverrides.sinhc                 C  s   d|  dS )Nzlibdevice.acos(r   rI   r5  rI   rI   rJ   acos  s    zTritonOverrides.acosc                 C  s   d|  dS )Nzlibdevice.acosh(r   rI   r5  rI   rI   rJ   acosh  s    zTritonOverrides.acoshc                 C  s   d|  dS )Nzlibdevice.asin(r   rI   r5  rI   rI   rJ   asin  s    zTritonOverrides.asinc                 C  s   d|  dS )Nzlibdevice.asinh(r   rI   r5  rI   rI   rJ   asinh  s    zTritonOverrides.asinhc                 C  s   d|  d| dS )Nzlibdevice.atan2(r   r   rI   r   yrI   rI   rJ   atan2  s    zTritonOverrides.atan2c                 C  s   d|  dS )Nzlibdevice.atan(r   rI   r5  rI   rI   rJ   atan  s    zTritonOverrides.atanc                 C  s   d|  dS )Nzlibdevice.atanh(r   rI   r5  rI   rI   rJ   atanh  s    zTritonOverrides.atanhc                 C  s   d|  d| dS )Nzlibdevice.copysign(r   r   rI   rY  rI   rI   rJ   copysign  s    zTritonOverrides.copysignc                 C  s   d|  dS )Nzlibdevice.erfc(r   rI   r5  rI   rI   rJ   erfc  s    zTritonOverrides.erfcc                 C  s   d|  dS )Nzlibdevice.erfinv(r   rI   r5  rI   rI   rJ   erfinv  s    zTritonOverrides.erfinvc                 C  s   d|  d| dS )Nzlibdevice.hypot(r   r   rI   rY  rI   rI   rJ   hypot  s    zTritonOverrides.hypotc                 C  s   d|  dS )Nzlibdevice.log10(r   rI   r5  rI   rI   rJ   log10  s    zTritonOverrides.log10c                 C  s   d|  dS )Nzlibdevice.log2(r   rI   r5  rI   rI   rJ   log2  s    zTritonOverrides.log2c                 C  s   d|  d| dS )Nzlibdevice.nextafter(r   r   rI   rY  rI   rI   rJ   	nextafter  s    zTritonOverrides.nextafterc                 C  s   |  d| S N & rI   rD  rI   rI   rJ   logical_and  s    zTritonOverrides.logical_andc                 C  s
   |  dS )Nz == 0rI   r   rI   rI   rJ   logical_not  s    zTritonOverrides.logical_notc                 C  s   |  d| S Nz | rI   rD  rI   rI   rJ   
logical_or  s    zTritonOverrides.logical_orc                 C  s   d|  d| dS )Nr    ^ r   rI   rD  rI   rI   rJ   logical_xor  s    zTritonOverrides.logical_xorc                 C  s   |  d| S re  rI   rD  rI   rI   rJ   bitwise_and  s    zTritonOverrides.bitwise_andc                 C  s
   d|  S )N~rI   rh  rI   rI   rJ   bitwise_not  s    zTritonOverrides.bitwise_notc                 C  s   |  d| S rj  rI   rD  rI   rI   rJ   
bitwise_or  s    zTritonOverrides.bitwise_orc                 C  s   |  d| S )Nrl  rI   rD  rI   rI   rJ   bitwise_xor  s    zTritonOverrides.bitwise_xorc                 C  s   |  d| S )Nz << rI   rD  rI   rI   rJ   bitwise_left_shift  s    z"TritonOverrides.bitwise_left_shiftc                 C  s   |  d| S )Nz >> rI   rD  rI   rI   rJ   bitwise_right_shift  s    z#TritonOverrides.bitwise_right_shiftc                 C  s   d| d}d|  d| dS )Nr   ).to(tl.uint32)ztl.rand(r   r   rI   seedr   rI   rI   rJ   rand  s    zTritonOverrides.randc                 C  s   d| d}d|  d| dS )Nr   ru  z	tl.randn(r   r   rI   rv  rI   rI   rJ   randn  s    zTritonOverrides.randnc              	   C  s*   d| d}d|  d| d| d| d	S )Nr   ru  ztriton_helpers.randint64(r   r   rI   )rw  r   lowhighrI   rI   rJ   	randint64  s    zTritonOverrides.randint64c                 C  s   t dd S )Nz.ops.load_seed not implemented outside a kernelrK  )r   r   rI   rI   rJ   	load_seed  s    zTritonOverrides.load_seedc                 C  s   d|  dS )Nzlibdevice.rsqrt(r   rI   r5  rI   rI   rJ   rsqrt  s    zTritonOverrides.rsqrtc                 C  s   d|  dS )Nzlibdevice.log1p(r   rI   r5  rI   rI   rJ   log1p  s    zTritonOverrides.log1pc                 C  s   d|  dS )Nzlibdevice.tan(r   rI   r5  rI   rI   rJ   tan!  s    zTritonOverrides.tanc                 C  s   d|  dS )Nzlibdevice.tanh(r   rI   r5  rI   rI   rJ   tanh%  s    zTritonOverrides.tanhc                 C  s   d|  dS )Nztl.sigmoid(r   rI   r5  rI   rI   rJ   sigmoid)  s    zTritonOverrides.sigmoidc                 C  s   d|  d|  d|  dS )Nzlibdevice.signbit(z) if (z).dtype is tl.float32 else z < 0rI   r5  rI   rI   rJ   signbit-  s    zTritonOverrides.signbitc                 C  s   d|  d| dS )Nzlibdevice.fmod(r   r   rI   rD  rI   rI   rJ   fmod2  s    zTritonOverrides.fmodc                 C  s   d|  d| dS )Nzlibdevice.pow(r   r   rI   rD  rI   rI   rJ   pow6  s    zTritonOverrides.powc                 C  s   d|  dS )Nztl_math.log(r   rI   r5  rI   rI   rJ   log:  s    zTritonOverrides.logc                 C  s   d|  dS )Nzlibdevice.log(r   rI   r5  rI   rI   rJ   libdevice_log>  s    zTritonOverrides.libdevice_logc                 C  s   d|  dS )Nzlibdevice.isinf().to(tl.int1)rI   r5  rI   rI   rJ   isinfB  s    zTritonOverrides.isinfc                 C  s   d|  dS )Nzlibdevice.isnan(r  rI   r5  rI   rI   rJ   isnanF  s    zTritonOverrides.isnanc                 C  s   d|  dS )Nzlibdevice.nearbyint(r   rI   r5  rI   rI   rJ   roundJ  s    zTritonOverrides.roundc                 C  s   d|  dS )Nr   r   rI   r5  rI   rI   rJ   floorN  s    zTritonOverrides.floorc                 C  sF   |  d| }|  d| }d|  d| d| d| d| d| d	S )
Nr   r   z
tl.where((z
 < 0) != (z < 0), tl.where(z != 0, z - 1, ), r   rI   )r   r   quotremrI   rI   rJ   floordivR  s    zTritonOverrides.floordivc                 C  sV   t dtj}t t || tj}t t | |tj}t ||}| d|  dS )Nr   r(  .dtype))r?  r4  r  rA  r+  ltr	  sub)r   zleftrightr  rI   rI   rJ   sign[  s
    zTritonOverrides.signc                 C  s   d|  dS )Nr   r   rI   r5  rI   rI   rJ   truncc  s    zTritonOverrides.truncc                 C  s   |  d| S )Nr   rI   rD  rI   rI   rJ   truncdivg  s    zTritonOverrides.truncdivc                 C  s   d|  dS )Nr   r   rI   r5  rI   rI   rJ   ceilm  s    zTritonOverrides.ceil)N)Orf   rg   rh   __doc__r   r+  r.  r1  classmethodr4  r6  r7  r8  r9  r:  r;  r=  r>  rC  rE  r@  rF  rG  rH  rI  rJ  rM  rP  rQ  rR  rS  rT  rU  rV  rW  rX  r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  rg  ri  rk  rm  rn  rp  rq  rr  rs  rt  rx  ry  r|  r}  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rI   rI   rI   rJ   r!    s&  -





































































r!  rH   zOpsHandler[str])hrt   c                 C  s   | S r[   rI   r  rI   rI   rJ   _typecheck_TritonOverridesv  s    r  c                   @  sL   e Zd ZdZedd Zedd Zedd Zedd	 Z	ed
d Z
dS )TritonKernelOverridesa   Map element-wise ops to Triton within a TritonKernel

    Unlike TritonOverrides, these assume the code is going to be inserted into
    the body of the main triton kernel and so it may use indexing and mask
    variables which are assumed to already be defined in the current scope.
    c                 C  s$   t j }dg| }| j|||dS )Nr3   r2  )r1   r   triton_tensor_ndimr1  )r3  r   r  ndimrl   rI   rI   rJ   r4    s    

zTritonKernelOverrides.constantc                 C  st   t jj|dd}t|tstt jjjt jj|j	t
|d}|tjtjhkrht jjt jj| ||}|j|_|S )NF	block_ptrr  )r1   r   indexingr   rP   r   csegeneratecomputerR   r&   r  rA  r  r+  rT   )r3  r   r  r  varrI   rI   rJ   rM    s      z TritonKernelOverrides.index_exprc              	   C  s   t j| }| }W 5 Q R X |jjr.t|}t jjjt jjd| dt	| d| dt
|d}t|||}|j| |S )Nr/  z.shape, r   r  r  )r1   r   Z
mask_loadsr  Zis_boolrW   r  r  r  r<   r   wrapr?  rF  rT   discard)r  rN  rO  Znew_maskresultretrI   rI   rJ   rP    s    zTritonKernelOverrides.maskedc                 C  s,   t jj| }d| dt jjd| dS )Ntl.load(z + Zload_seed_offsetr   )r1   r   r   inputseed_offset)r   r   r  rI   rI   rJ   r}    s    zTritonKernelOverrides.load_seedc                 C  sz   d|  d}|t jjjkr(t jjj| S t jj }t jj }t jj| d| d|  d ||ft jjj|< ||fS )Nzfrexp(r   r   z = triton_helpers.frexp()r1   r   r  cachenewvarr  rM   )r   	cache_keyZmantissaexponentrI   rI   rJ   frexp  s    zTritonKernelOverrides.frexpN)rf   rg   rh   r  r  r4  rM  r   rP  r}  r  rI   rI   rI   rJ   r  z  s   



r  c                 C  s   | S r[   rI   r  rI   rI   rJ    _typecheck_TritonKernelOverrides  s    r  c                   @  sP   e Zd ZU dZded< ded< dd Zdd	d
d
dddZdd Zdd ZdS )HelperFunctionsz#An ordered set of helper functions.zDict[str, str]_templates_seenrn   finalized_helpersc                 C  s   i | _ g | _d S r[   )r  r  r\   rI   rI   rJ   r    s    zHelperFunctions.__init___triton_helper_fn	base_namerQ   )template_codert   c                C  sL   | j |}|dk	r|S | t| j }|| j |< | j|j|d |S )a9  This accepts a function definition with the function name
        left as a format specifier e.g.

            @triton.jit
            def {name}(arg0, arg1):
                return arg0 + arg1

        We add the templated code to the function set and return the name
        assigned to that function.

        Nr   )r  getr   r  r   r   )r]   r  r  Zexisting_namer   rI   rI   rJ   add  s    
zHelperFunctions.addc                 C  s
   t | jS r[   )iterr  r\   rI   rI   rJ   __iter__  s    zHelperFunctions.__iter__c                 C  s
   | j | S r[   )r  )r]   r   rI   rI   rJ   __getitem__  s    zHelperFunctions.__getitem__N)	rf   rg   rh   r  ri   r  r  r  r  rI   rI   rI   rJ   r    s   
r  c                      s  e Zd ZU eZded< eZded< dZdde	j
ddd	d
dd fddZdd Zdd ZddddZdd Zed
dddZdddddddddZdtd
d
d!d"d#d$d%Zdud&d'Zddddd(d)d*Zd+d, Zd
dd-d.d/Zdvd
dd0d1d2d3d4d5Zd0d
dd6dd0d7d8d9Zd:d; Zd6d6d<d=d=d>d?d@Zd
dd0dAdBdCZd
ddDdEZdFdGdHdHdIdJdKZdLdM ZdwdNdOZ dPdQ Z!dRdS Z"e#dTdU Z$dxdVdWZ%dXdY Z&dZd[ Z'd\d] Z(d^d_ Z)dyd
d`dadbdcZ*ddde Z+dfdg Z,dhdidjdkZ-dldm Z.dndo Z/dpdq Z0drds Z1  Z2S )zTritonKernelr  helper_functionszCallable[[sympy.Expr], str]kexprTNr   F)	mutations	pid_cachereduction_hintr*  disable_persistent_reductionrQ   zOptional[Set[str]])r   r  c                  s\   t  j||||||d t | _t | _|| _t | _	t
 | _t | _d | _|   d S )N)r   r  r  r  r  )r  r  r7   suffixr  outside_loop_varsr*  	itertoolscountblock_ptr_idr  r  autotune_hintstriton_metacodegen_range_tree)r]   r   r  r  r  r*  r  groupsr  rI   rJ   r    s     

zTritonKernel.__init__c                 C  sT   | j D ]}|js| || j q| jrP| j d jrP| jd| | j d   d S )Nr   zrbase = )rs   is_loopiteration_ranges_codegen_headerrN  r   rM   iteration_ranges_ranges_code)r]   treerI   rI   rJ   r    s    
zTritonKernel.codegen_range_treec                 C  s   dS )z
        Indicate whether we need provide numel as arguments for the generated
        kernel calls in the benchmark.

        Should be true for pointwise/reduction kernels but false for triton
        matmul kernels.
        TrI   r\   rI   rI   rJ   need_numel_args  s    zTritonKernel.need_numel_argsrW   r   c                 C  sP   | j rtjjsdS tjdi| jd}tjjr6|d9 }| j	d }t
jj||S )z^
        Heuristic to set self.persistent_reduction and add guards
        if needed.
        Fi   r     r   )r   r   rH   Zpersistent_reductionsr    INNERr  r  Zmulti_kernelr   r1   r   r   statically_known_leq)r]   	thresholdZ
last_numelrI   rI   rJ   should_use_persistent_reduction&  s      
z,TritonKernel.should_use_persistent_reductionc                 C  s6   | j tjko4| jo4t| jdko4tjj	| jd dS )Nr   r      )
r  r    r  persistent_reductionr   r   r1   r   r   Zstatically_known_geqr\   rI   rI   rJ   want_no_x_dim:  s    zTritonKernel.want_no_x_dimc                 C  s   dS )Nztl.device_assertrI   r\   rI   rI   rJ   assert_functionB  s    zTritonKernel.assert_function)
copy_shapedense_indexingoverride_maskr  rY   )rZ   c             	     s@  |  |}|j}d}t }|D ]}	t|	tjs2t|p@t|	tj	}|rHqt|	tj
rp| jj|	j }
||
j qt|	tjtjtjtjtjtjfrqt|	tj	tjtjfst|	j||	jd  d qtjjs|s| jdk	o|dk}d}d}t }|  D ]0}||jrd}nd}||j  d q |r.| j!r.tjj"r.|s.| js.t#|| dkr.| $|s.|r.| j%dkr.t&|dd | j'( D }| jdd	}d
d |D fddD }tj)dd}|*t+||   r.| ,| ddlm-} |. fdd|D  | ||S d}| /|}t|tj0r|rZ| dn| 1 }d| d| d}t2|t d|||S |r|s|r| dn| 1 }d| d| d}|}n"|s|rd| d| d}|}|r|h}| jr|| j | ,| |r*d3t4t5t6|nd}t2||||||S )zO
        Compute the index and mask to pass to tl.load() or tl.store()
        Fr   r  NTtl.int32c                 S  s   i | ]\}}||j qS rI   )r   )ry   vrz   rI   rI   rJ   
<dictcomp>  s      z)TritonKernel.indexing.<locals>.<dictcomp>)Zreorderc                 S  s   g | ]}|  qS rI   )symbolrx   rI   rI   rJ   r{     s     z)TritonKernel.indexing.<locals>.<listcomp>c                   s    g | ]}t jd |  dqS )Zstride_exclude)r   Wildr|   )symbolsrI   rJ   r{     s     _offsetr  r3   )rj   c                   s   g | ]} | qS rI   rI   r|   )mrI   rJ   r{     s     z.shaper/  r   z, tl.int32)r   tl.broadcast_to(r   .shape)rf  )7Zprepare_indexingZfree_symbolsr  r   r   r  r   r   r   ZRINDEXr_   r  Zvarname_mapr   r  rT   ZUNBACKED_INTZSIZEZPRECOMPUTED_SIZEZINDEXFLOATZUNBACKED_FLOATr   ZYBLOCKr  r   rH   r  
_load_maskactive_range_treesintersectionZvar_listrv   allow_block_ptrZuse_block_ptrr   is_indirect_indexingr   r,   Zrange_tree_nodesitemsr  matchr+   filter_masksrj   r   r   Integerdense_size_strrP   r   sortedr   rQ   )r]   rZ   r  r  r  r  Z
index_varsra   rT   r  Zcse_varZ
need_denseZ
have_denseZhave_loop_varsZdense_mask_varsr  Zindex_relative_to_xyr_indexrs   rm   r   rj   rV   rR   rU   rI   )r  r  rJ   r  F  s    
 
 

     
zTritonKernel.indexingrE   rj   z'Tuple[str, Optional[DeferredLine], str])r   r  r  rt   c              
   C  s   d }|  }|sd}n(|r4|dks&td|d}n
d|}| jr| jd jr| rdt| j }| j	t
|| d|j|dd	  t
|| d
| d|  d}n
||}|||fS )NrE   , other=0.0, boundary_check=z, padding_option='zero'r   r   = F)r   z = tl.advance(r   r   )r   r   r   rs   r  ra   nextr  rN  rM   r6   r   r   )r]   r   r  r  rO  advance_block_ptrr   r  rI   rI   rJ   codegen_block_ptr  s6    

 
zTritonKernel.codegen_block_ptrc                 C  s\   d| d|  |j d}t||j|j}| dttj| d}d| d| | dS )Nr  r   r   r(  	tl.store()r   rr   r   ro   r
  r1   r   	get_dtype)r]   r   r  r  r   rO  rI   rI   rJ   codegen_block_ptr_store_line  s
    z)TritonKernel.codegen_block_ptr_store_line)r   r   lowerrw   c                 C  s   |s|sd S t |tjst| j|dd}t |ts8t|j}| rL|jnd }|rft	j
| |nd }| ||rxdnd ||}	| |ptdd |jD }
| |}| jj||	dd d S )NFr  r   c                 s  s   | ]}t |tV  qd S r[   r   r  )ry   r  rI   rI   rJ   	<genexpr>  s    z,TritonKernel.check_bounds.<locals>.<genexpr>)Z
assignment)r   r   Exprr   r  rP   rR   r^   rU   r1   r   Zsexprrename_indexingZindirect_assertr  anyrT   get_load_bufferr  r  )r]   r   r   r  rw   r  rR   rU   size_strlineZindirectbufferrI   rI   rJ   check_bounds  s&     
  
zTritonKernel.check_boundsc                 C  s@   |  s| r| jS | jr6| jd jr6| s6| jS | jS d S )Nr   )	r`   rd   r  r   rs   r  ra   rN  loads)r]   r  rI   rI   rJ   r  	  s    
zTritonKernel.get_load_buffer)r   rZ   c              	   C  sz  | j |}| |}|}| j|dd}| }| }tdd | | D }	| 	|rdd}
nv|	snd}
nl| j
r| jd jr|| j jkrt| j j| j}n|h}t|| j@ dk}| o|p|}|rd}
qd}
nd	}
|s|r| rd
}nd	}d }d }tj|r|}nt|trX| ||||\}}}d| | |
 d}t||j|j}nLt|tjrd| d| d}|j}n$d| d|j d|j  |
 | d	}tj!|}|t"j#t"j$fkr|d7 }|t"j%krt"j&j'd kr|d7 }| (|}| j)*||}t|t+st,|j-|_-|rBd| d| d}| j)*||}|rR|.| | j
rj|/ sv|sv| j01| |S )NTr  c                 s  s   | ]}|d kV  qdS )r3   NrI   ry   r   rI   rI   rJ   r  *  s    z$TritonKernel.load.<locals>.<genexpr>z, eviction_policy='evict_last'r   r   z, eviction_policy='evict_first'rE   r  r  r   r   ))r  r,  z.to(tl.int1)r  r   )2r   r  r  r  ra   rd   r  Zget_strides_of_loadvaluesis_broadcastedr   rs   r  inplace_buffersr  Zother_namesr   Z
last_usager^   r1   r   is_unspec_argr   rj   r  r   ro   rr   r   r  rV   rR   rU   r  r  r   r   rW   versionhipr  r  r  r  r   rT   rM   re   r  r  )r]   r   rZ   r  Zindirect_indexingoriginal_indexr  ra   rd   Zis_coalescedepnamesZlast_useZ
evict_lastrO  r  Zappend_broadcastr  r  r  Zload_buffer
result_varrI   rI   rJ   load  s    

   
  $

zTritonKernel.loadr5   r0   r   )r   rZ   r   modert   c              	   C  s,  | j |}|}| j|d|d kd}|| j jk}| |}	|rT|	rT| jt|d d }
t|t	r| 
|||\}}
}| |||||}nf|d krd| d|j d| d|j d	}n:|d	krd
| d|j d| d|j d	}ntd| | jt|| |
r| j|
 | js(| j| d S )NT)r  r  ztl.debug_barrier()r  r   r  r   r   Z
atomic_addztl.atomic_add(zstore mode=)r   outputr  r  r  storesrM   r6   r   rj   r  r  rR   rU   rL  r   r  r  )r]   r   rZ   r   r  r  r  r  Z
is_inplacer  r  r  rO  r  rI   rI   rJ   storeq  s>    

  
    $$zTritonKernel.storer"  )r  offsets_nameoffsets_sizeindexing_dtyper  rt   c                 C  s   | j tj | j|}|  }| |}|tj	kr<d}	n|tj
krLd}	ntd| j| jd| d| d|	 d| d| d| d}
|
S )z3
        See [Note: Inductor bucketize op]
        r  tl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(r   r   )r  r  r   ZELEMENTS_PER_WARP_32r   r  r  r   r  rA  r  rL  r  r  r  )r]   r  r#  r$  r%  r  Zoffsets_ptr
block_sizeZoffsets_size_strr-  r  rI   rI   rJ   	bucketize  s     


(zTritonKernel.bucketizec                 C  sD   |   }|dkrd| dS dg| }d|d< | dd| d	S )
Nr3   z!triton_helpers.promote_to_tensor(r   r   r   r   r   r   r   )r  r   )r]   r   ZndimssizesrI   rI   rJ   reduction_resize  s    
zTritonKernel.reduction_resizer/   z+Union[CSEVariable, Tuple[CSEVariable, ...]])r  r#  reduction_typer   rt   c                   s  j s
tdd jD }| t|}jr>|j jd j} 	fdd|}fdd}fdd	}||f}	|	j
jkrj
j|	 S  d
 t|}
j
 }dd |D |_d|fddjrXtj|}	t|}fdd t|trD fddt||D }n
 ||}dkrtj
jd| d| d}ddd |j||| ndkr||}ndkr@|\}}}d| d| d| d d	}fdd td!D \}}}j| d| d| d"|  tfd#d |||fD }nj
j||}nd$| }tj|}	t|}t|tsj | d%  d| d|
 d dkrd$| d&}t!"t!j#j$}j | d%  d| d' ddd j%d(| d)| d* d+| d| d| d| d,| d"| d-| d.| d"| d-| d. |j&||| nt'r| d/}| d0}| d1}j | d2  d|
 d j | d2  d|
 d j | d2  d|
 d dkrh|\}}}j%d3| d)| d)| d4| d| d| d5| d| d| d6 nFdksvtj%d3| d)| d)| d7| d| d| d| d8 j%d(| d"| d-| d.| d"| d-| d.| d"| d-| d. |}j
 }j
 }j&%d(| d9| d9| d:| d| d| d d;| d"(| d< d.| d"(| d< d.| d"(| d< d. |||f}nt)|}|||}j| d"||  |t!j*kr| d=}t+|}j&| d"|| d>| d nj&| d"||  |j
j|	< t|trzt,d?d  |D sft j-t.|O  _-nt|t/stj-0| |S )@Nc                 S  s   h | ]}|j  d qS r  r   ry   r  rI   rI   rJ   	<setcomp>  s     z)TritonKernel.reduction.<locals>.<setcomp>r   c                   s   j jd|  d  dS )Nr  r   r   r  r  r  )r  )r  r]   rI   rJ   <lambda>  s    z(TritonKernel.reduction.<locals>.<lambda>c              
     s`   dk}|rdnd}dkr> | d d|  d  dS  | d d	|  d  dS )
N>   prodr  r)  minZtriton_helperstl>   r)  r2  r   z2(r   r   r   )r*  )r   Z
use_helpermodule)dimr+  r]   rI   rJ   final_reduction  s    z/TritonKernel.reduction.<locals>.final_reductionc                   sF   |  d| d d| d| d  d| d| d d d S )	Nz                _, z_tmp = triton_helpers.z_with_index(r   z)
                r  _tmp
                )rL   r*  )r  r  r   rZ   )r5  root_opr]   rI   rJ   final_argreduce  s"    z/TritonKernel.reduction.<locals>.final_argreducer3   c                 S  s   h | ]}|d  dkr|qS )r   rrI   )ry   r  rI   rI   rJ   r.     s      rf  c                   s    s| S t  | |S r[   )r  rF  )ZtvalZfval)condrI   rJ   
where_cond  s    z*TritonKernel.reduction.<locals>.where_condc                   s    j  j| |S r[   r/  )r   default)r]   r=  rI   rJ   _mask_value  s    z+TritonKernel.reduction.<locals>._mask_valuec                   s   g | ]\}} ||qS rI   rI   )ry   r  d)r?  rI   rJ   r{     s     z*TritonKernel.reduction.<locals>.<listcomp>>   argminargmaxr  zindex, r  r)  r2  )rB  rA  Zwelford_reduceZwelford_combineztriton_helpers.welford(r   r   c                 3  s   | ]} j  V  qd S r[   r  r  ry   _r\   rI   rJ   r  '  s     z)TritonKernel.reduction.<locals>.<genexpr>r   r  c                 3  s$   | ]} j  j |V  qd S r[   )r  r  r  r*  )ry   var_namer\   rI   rJ   r  *  s   rE   = tl.full(_indexz, tl.int64)z                z_next, z_next = triton_helpers.z%imum_with_index(
                    z(index
                )
                _nextr8  Z_meanZ_m2Z_weightz = tl.zeros(z                    z@_next = triton_helpers.welford_combine(
                        z,
                        z+
                    )
                    z?_next = triton_helpers.welford_reduce(
                        z9, roffset == 0
                    )
                    z_tmp, z3_tmp = triton_helpers.welford(
                    z#
                )
                r7  z.to(tl.int8)r(  c                 s  s   | ]}t |tV  qd S r[   r  )ry   r   rI   rI   rJ   r    s     )1r   r   rs   r  r  r  r   rv   r  Z_map_tuple_or_scalarr  Zreduction_cacher  r  r  rT   r   r  r   Z	Reductiondefault_valuer<   r   tupler   rQ   r  r  Zwelford_reduce_fallbackr   rM   Zdefault_accumulatorrN  r  Ziinfor  r)  rL   r  r)   r*  Zget_reduction_combine_fnrW   r  allr  r  r  r  )r]   r  r#  r+  r   masksreduction_range_prefixr6  r:  r  acc_typer  r>  Zmasked_valueZaccumulator_indexZmeanm2weightZwelfordaccumulatorZlong_maxZaccumulator_m2Zaccumulator_weightZresult_meanZ	result_m2Zresult_weight
combine_fnupdatedZresult_typerI   )r?  r<  r  r5  r+  r9  r]   r=  rJ   	reduction  s   


	




   


"
 

	





	




zTritonKernel.reduction)r   rZ   r   c                 C  s   | j s
td| _ | j|dd}d| _ | j|}t|trl| jt	|| 
|||||d|  n>t|tszt| jt	|d| d|j d| d|j d		 d S )
NFTr  r  r  r   r  r   r   )r   r   r  r   r   r   rj   r  rM   r6   r  r   r   rP   rR   rU   )r]   r   rZ   r   r  r  rI   rI   rJ   store_reduction  s2    

 zTritonKernel.store_reductionc              
     s   t  d fddtdD }dtj|}d| d tddd	 tt	
 d
G  fddd} J t	| 2 || }ddd |D }d|  W 5 Q R X W 5 Q R X | jj dS )Nz@triton.jitc                   s&   g | ] t  fd dtD qS )c                 3  s   | ]}d   d| V  qdS )r  rE  NrI   ry   nr   rI   rJ   r    s     z7TritonKernel._lift_helper.<locals>.<listcomp>.<genexpr>)rK  r   )ry   )num_argsrY  rJ   r{     s     z-TritonKernel._lift_helper.<locals>.<listcomp>r   r   zdef {name}():rE   )rv   r  r  c                      s&   e Zd Zddd fddZdS )z+TritonKernel._lift_helper.<locals>.CSEProxyrQ   zCallable[..., CSEVariable]r   c                   s    fdd}|S )Nc                    s&   d 7   t| |S )NrE  )r  getattr)r   r  )r  helperhelper_namer   	overridesrI   rJ   inner  s
    zFTritonKernel._lift_helper.<locals>.CSEProxy.__getattr__.<locals>.innerrI   )r]   r   r`  r  r]  r^  r_  r  rJ   __getattr__  s    z7TritonKernel._lift_helper.<locals>.CSEProxy.__getattr__N)rf   rg   rh   rb  rI   ra  rI   rJ   CSEProxy  s   rc  c                 s  s   | ]}t |V  qd S r[   )rQ   )ry   r   rI   rI   rJ   r    s     z,TritonKernel._lift_helper.<locals>.<genexpr>return r  )r7   rM   r   r   r  chainfrom_iterabler4   r!  r1   ZMockHandlerindentZset_ops_handlerr  r  rN   )r]   fnrZ  r   	signaturerc  outputsrI   )r  r]  r^  rZ  r_  rJ   _lift_helper  s    
$zTritonKernel._lift_helperzTuple[torch.dtype, ...]zUCallable[[Tuple[CSEVariable, ...], Tuple[CSEVariable, ...]], Tuple[CSEVariable, ...]]zTuple[CSEVariable, ...])dtypesrS  r  rt   c                   s  j s
tdd jD }| t|}jr:tdjd j}g }g }tj	j
j |t|} d }	t||D ]\}
}t|}d|}j	
j|
 dt| d}j	
jd	| d
  d}
||
 t|}d|}jsj	 } }d|d< dd
| d}|jr<dnd}j| d| d
| d
| d || qdd fdd}|d| d|	 d
| dt||}jsndd }|t|}d
dd |D }|d| d| dt||}|t||}|t||} fddt||D }t|||D ]*\}}}j| d | d
| d q@n|}|D ]}||_qvt|S )!Nc                 S  s   h | ]}|j  d qS r,  r   r-  rI   rI   rJ   r.    s     z$TritonKernel.scan.<locals>.<setcomp>z(ops.scan not supported inside ops.maskedr   r3   rf  r(  r   r  r   r~   r   r   zfloat('nan')z-1rG  c                 S  s   d dd | D S )N c                 s  s   | ]}| d V  qdS ),NrI   )ry   r   rI   rI   rJ   r    s     z1TritonKernel.scan.<locals>.csv.<locals>.<genexpr>)r   )r  rI   rI   rJ   csv  s    zTritonKernel.scan.<locals>.csvc                   s    fddt |D }tfdd|D r@fdd|D S fddt |D }j| d   t||D ]\}}r|_|jj|< qzt|S )Nc                   s    g | ]}  d | d  qS )r   rI   r  r  rM  rI   rJ   r{   "  s     z;TritonKernel.scan.<locals>.cse_multiple.<locals>.<listcomp>c                 3  s   | ]}| j jkV  qd S r[   r  r  ry   r  r\   rI   rJ   r  #  s     z:TritonKernel.scan.<locals>.cse_multiple.<locals>.<genexpr>c                   s   g | ]} j j| qS rI   rq  rr  r\   rI   rJ   r{   $  s     c                   s   g | ]} j  qS rI   rC  rD  r\   rI   rJ   r{   %  s     r  )	r   rL  r  rM   r   rT   r  r  rK  )r  rX  rM  Z
cache_keysresult_varsr  r  )ro  r]   rp  rJ   cse_multiple!  s    z'TritonKernel.scan.<locals>.cse_multipleztl.associative_scan((r  c                 S  s   dd t | |D S )Nc                 S  s   g | ]\}}t ||qS rI   )r?  r  )ry   ZaiZbirI   rI   rJ   r{   8  s     z5TritonKernel.scan.<locals>.sum_fn.<locals>.<listcomp>)r   rD  rI   rI   rJ   sum_fn7  s    z!TritonKernel.scan.<locals>.sum_fnc                 s  s   | ]}| d V  qdS )z * (rbase == (RBLOCK - 1))NrI   )ry   Zscan_varrI   rI   rJ   r  ;  s   z$TritonKernel.scan.<locals>.<genexpr>ztl.reduce((z), -1, z, keep_dims=True)c                   s&   g | ]\}} d | d| dqS )ztl.where(roffset > 0, r   r   rI   )ry   Z	full_scanZpartial_scan)cse_computerI   rJ   r{   I  s   z%TritonKernel.scan.<locals>.<listcomp>z = tl.where(roffset > 0, )r   r   rs   r  r  r  rv   	functoolspartialr  r  r  rk  r   r  r   r  r   r  r  r   r  r  Zdense_size_listZis_floating_pointrN  rM   rK  rT   )r]   rl  rS  r  rM  rN  Zbroadcasted_valuesZaccumulatorsZcombine_helper_fnr5  r   r  rO  r<  Zvalue_dtyperR  Zreduced_sizer>  rt  Zpartial_scan_varsru  Zsum_helper_fnZpre_reduce_varsZpartial_reduce_varsZ	accs_nextZfull_scan_varsrs  Zacc_nextZpartial_reducer  rI   )rv  ro  r]   rJ   scan  s    







  

zTritonKernel.scanc              	   C  s<  | j s"| js"| js"| js"| js"dS | jr| jd jr| j	d | j
 R | | jd | j | j| j  | j| j | j| j | j| j W 5 Q R X | j| j | jd   n8| j| j  | j| j | j| j | j| j | j| j | j   | j  | j  | j  | j  dS )a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        Nr   z(for roffset in range(0, rnumel, RBLOCK):)indexing_coder  r!  r  r  r   rs   r  rN  rM   rg  r  rL   r  Z
invalidater  cache_clearclearr\   rI   rI   rJ   codegen_body[  s>    



zTritonKernel.codegen_bodyc                 C  s  t  }| j \}}}}|dddg | j t }g }	t||D ]2\}
}dt| }t	j
|
}|r|| dt	j
j|  dt	j
j|  d|  d|  d
 n|
t	j
jkr$t	j
j|
 }|| dt	j
j|  dt	j
j|  d|j d|j d
 nPt|trft	j
j|j}d	|jkrPd
}|| d|  ntd|
 |	| qJ|dd|	 d W 5 Q R X |dddg |d krHg }g }d }|   D ]:}t!t	j
j|j"}|| |j#dkr|| q| $ r,dt%t&|d }nd}| dd| d}n
d| }t	j
j'( }|j)}|  |dt	j
j*+| d | \ |t	j
j*,| d| }|| d| d |t&t-j. d| d| d W 5 Q R X W 5 Q R X |dddg | h |dt	j
j*+| d | 8 |t	j
j*,| |dt&t-j. d| d W 5 Q R X W 5 Q R X |dddg | R |d |d |d |d |d |  |d! |d" W 5 Q R X |S )#NrE   zdef get_args():Zarg_z = rand_strided(r   z
, device='z	', dtype=r   r  r   r  z*Don't find the buffer or const tensor for rd  rn  
zdef call(args):r;  z
grid=grid(zgrid=zwith r   streamz = get_raw_stream(z.run(*args, z	, stream=z def benchmark_all_configs(args):z.benchmark_all_configs(*args, zif __name__ == '__main__':z#from triton.testing import do_benchzargs = get_args()z:ms = do_bench(lambda: call(args), rep=40, fast_flush=True)z	num_gb = zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s"))/r7   r   python_argdefs
writelinesrg  r  r  r   r   r1   r   Z
get_bufferrM   r   
size_hintsget_sizeZ
get_strideZ
get_devicer  	constantsr   Zstridedevicer  r   r:   	size_hintr   r   KeyErrorr   r   r  r>   r   rv   r  r   rQ   	schedulerget_current_device_or_throwrZ   
device_opsZdevice_guardZ
set_devicer*   KERNEL_NAME)r]   num_gbgridr  argdefs	call_argsri  rE  Zname_cntZ	var_namesarg_nameZarg_sigrF  bufZconst_tensorZsymval_hint
extra_argsZextra_args_strr  r   Zgrid_argcurrent_devicerZ   Zstream_namerI   rI   rJ   codegen_kernel_benchmark  s    D@"













z%TritonKernel.codegen_kernel_benchmarkc                 C  s   t dtjjdS )Nz
            from torch._dynamo.testing import rand_strided
            {}
            import torch
            from torch._inductor.runtime.triton_heuristics import grid, split_scan_grid
        Zget_raw_stream)textwrapdedentr   r1   r   r  Zimport_get_raw_stream_asr\   rI   rI   rJ   imports_for_benchmark_kernel  s
    z)TritonKernel.imports_for_benchmark_kernelc                 C  s"   | j r| jstdS | jrdS dS )Nr  rU  Z	pointwise)r  r   r   r\   rI   rI   rJ   _get_heuristic  s    
zTritonKernel._get_heuristicc                  C  s   t jj t  tjtjtjj	tj
tjtjtjtjtjjtjjtjjd} t jjd k	r^d| d< t rnd| d< tjrtj| d< tj| d< tj| d< tjrtj| d< tj| d	< tj| d
< | S )N)Zbackend_hash$are_deterministic_algorithms_enabledassert_indirect_indexingautotune_local_cacheautotune_pointwiseautotune_remote_cacheforce_disable_cachesdynamic_scale_rblockmax_autotunemax_autotune_pointwisemin_split_scan_rblockspill_thresholdstore_cubinTZis_hip	is_fbcodeprofile_bandwidthprofile_bandwidth_regexprofile_bandwidth_outputcoordinate_descent_tuning coordinate_descent_search_radius'coordinate_descent_check_all_directions)r  utilsZ_tritonZtriton_hash_with_backendr  r   r  r  rH   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )inductor_metarI   rI   rJ   inductor_meta_common  sH    



z!TritonKernel.inductor_meta_commonc                 C  s6  t  }g }| jD ]>}tjj|}t|ttj	fs8d}nt
t|}|| q| js^|  |  }|d kr|t  tjr||   | j \}}	}
}	t|
D ]H\}}t|trttj|j}|tjjjkrt|jtjjj| |
|< qt }| jD ]~}|| jjkr"| | jj|  || jj!kr^|tjj"kr^|| j"kr^| | jj!| j# || jj$kr| | jj$|  qt%|}t&|
| j'd}|t()tjj*+ i d}t| j,t-t.j/|| j0| j1| j2d| 3 }d }tjstj4r| 5 d }||d< | 6 D ]L}t|j7 d|j8}|
| t9|| j'd|t:|< ||j7 d q
t;|
g|d< |d d	 j<D ]}d
|d |< qt|| _=| j>D ]D}|j7dkr| j?rq|j@d krq||j7A  d q| B  | jCD ]}|Dd || q| jr8| jE}d| d|d| d|d|d}nVd}t:|dkrbt:|
dkr^d}nd}d| d|d| d|d|d| jF d}|| |Dd|pt-t.jG ddH| d |I F | J| | jK D ]\}}|D| d|  q|| jL W 5 Q R X tjr.|| M| |N S ) Ni    )Z
size_dtype)ri  r  r  )r  kernel_nameZmutated_arg_namesr   num_loadnum_reductiong    eAZkernel_num_gbr   Zconfigsr   r3   r  r;  zBLOCK : tl.constexprrE   z$
                @triton_heuristics.z!(
                    size_hints=z%,
                    reduction_hint=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            r   r&  ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,r   zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r   r[  r  )Or7   r   r1   r   r   Zsymbolic_hintr   r$  r   r  r$   r   r   r   r  rL   rO   r   benchmark_kernelr  r   r  r   r:   r   r  r   Zinv_precomputed_replacementsr   r  r  Zinput_buffersr  r  Zremoved_buffersZ
inner_nameZoutput_buffersr  rC   r   r   r   r  r  r  rQ   r*   DESCRIPTIVE_NAMEr   r  r  r  r  Zestimate_kernel_num_bytesr  rv   r   rB   r   rA   Z
equal_to_1r  rs   r  
tensor_dimrw   r}  r  rM   r  r*  r  r   rg  codegen_static_numelsaliasesrN  r  rN   )r]   r   coder  r   Z
numel_hintr  Z
heuristicsr  rE  ri  r   r  r  Zmutated_argsZmutationZtriton_meta_signaturer  r  r  r  ZsizeargZarg_numr]  r  Zheuristics_lineZ	tile_hintoldnewrI   rI   rJ   codegen_kernel!  s    

 



 
	

 






"

zTritonKernel.codegen_kernelc                 C  s   | j D ]}|jdks| jrTtjj|j}t|t	j
tfrT||j dt|  |jdkr| jrtjj|j}t|t	j
tfrt|}t|}n4d}tjj||s|dkstd| |d9 }q|d|  |jdkr| jr|d	 qd
S )a  
        We get a small speedup from hard coding numels if they are static.

        This code stomps on the passed-in values by writing an constant to the top of the kernel.

        In a kernel like:
        def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):

        We would add
        xnumel = 4096
        rnumel = 768

        After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
        a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
        knows that its a static numel, as that you just plop a constant into the kernel.
        r;  znumel =    i @  z!Failed to find static RBLOCK for r   zRBLOCK: tl.constexpr = r   zXBLOCK: tl.constexpr = 1N)rs   rv   r   r1   r   r   simplifyr   r   r   r  r$  rM   r  r$   r  r   r   )r]   r  r  Zsimplified_tree_numelvalrI   rI   rJ   r    s.    

 
z"TritonKernel.codegen_static_numelsc                 C  s   dS )Nr  rI   r\   rI   rI   rJ   _get_grid_fn  s    zTritonKernel._get_grid_fnc                 C  sx   | j D ]l}t|jtjtjfr&|j}ntjj	||}|j
dksF| jr^|| |t| |jd k	r|| qd S )Nr;  )rs   r   r   r   r  r  r1   r   wrapper_codeZgenerate_numel_exprrv   r   r   typegrid_dim)r]   r   r  	arg_typesr  r  r   rI   rI   rJ   add_numel_to_call_args_and_grid  s    


z,TritonKernel.add_numel_to_call_args_and_gridc                 C  sL   | j  \}}}}tt|D ]$}tj|| r|| d ||< q||fS )Nz.item())r   r  r   r   r1   r   r  )r]   rE  r  r  r   rI   rI   rJ   get_call_args	  s
    zTritonKernel.get_call_argszOptional[IRNode])r   nodec           	      C  s   t jj}|  \}}g }| |||| t jj }| jjd k	rZ| jj}|	|j
||j |||}|j||||jdd||  | jd	 | jjd k	r||dg d S )NT)cudarH   r  Zgrid_fnr  Z	workspace)r1   r   r  r  r  r  r  r   Zworkspace_argZgenerate_workspace_allocationnbytesZ	zero_fillZgenerate_default_gridZgenerate_kernel_callrZ   r  r  rM   Zmake_free_by_names)	r]   r   r  wrapperr  r  r  r  wsrI   rI   rJ   call_kernel	  s4      zTritonKernel.call_kernelc                 C  s   t jj}| j \}}}}t||D ]\}}t|tr$t jjrxt	j
r^|d| d| d q|d| d| d q$d| d}|| d| d}|| q$d S )	Nz:AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_check_inf_and_nan("z", z));zassert_inf_and_nan("z);zassert not z.isnan().any().item()z.isinf().any().item())r1   r   r  r   r  r   r   r;   Zcpp_wrapperr   Zabi_compatiblerM   )r]   r  rE  r  r  r  Zarg_typer  rI   rI   rJ   codegen_nan_check,	  s    

zTritonKernel.codegen_nan_checkc                 O  s
   t ||S r[   )r  )r]   r   r  rI   rI   rJ   create_cse_var>	  s    zTritonKernel.create_cse_varr=   )entryc                 C  sD   |j  d| | |j }|jjr4| j| n| j| d S )Nr  )	r   r  r
  r   rootr  rz  rM   rN  )r]   r  r  rI   rI   rJ   codegen_iteration_ranges_entryA	  s    z+TritonKernel.codegen_iteration_ranges_entryc                 C  sR   |j d k	st| |j }| j}|dkr4d| dnd}d|j  d| | S )Nr  r(  r   rE   ztl.arange(0, zBLOCK))r  r   Zindexing_size_strr   rv   rw   )r]   r  r   r   convertrI   rI   rJ   r  I	  s
    z)TritonKernel.iteration_ranges_ranges_codec                 C  s0   | j }|  }dg| }d| d| d| dS )Nr3   r/  r   r   )r   r  )r]   r  r   r   r  r   rI   rI   rJ   iteration_ranges_scalar_codeP	  s    
z)TritonKernel.iteration_ranges_scalar_codec                 C  s   |j d k	std|j  d}|j dkrd|jsdt|jtrD|jt ksdd| d|j d  d|j  d}|j||}| j	dkr| d	| j	 dS |S )
Nztl.program_id(r   r3   r   z + tl.program_id(z) * tl.num_programs(r  r  r(  )
r  r   Zhas_zdimr   r   r$  r#   r  r  r   )r]   r  keypidrI   rI   rJ   iteration_ranges_get_pidV	  s    

 
z%TritonKernel.iteration_ranges_get_pidc                 C  s   |j }|jr,||j d| d| d n|jd krd||j d| |  || d nh|jd k	r| d| | }n| || d}|| d| 	| d|
  d|j d| g || d	|j d
| d d S )Nr  z	offset + basez
offset = 0r   z	offset = r   ru   zmask = z < r   )rv   r  rM   r   r  r  r  r  r  r  rw   )r]   r  r  r   r  rI   rI   rJ   r  i	  s     

z,TritonKernel.iteration_ranges_codegen_header)rE   )rE   )N)N)N)N)3rf   rg   rh   r  r_  ri   texprr  r  r    DEFAULTr  r  r  r  r  propertyr  r  r  r  r  r  r  r"  r(  r*  rU  rV  rk  ry  r}  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   rI   rI   r  rJ   r    sp   

|  
Z (&	 a%q,
a
$
 **
r  c                   @  s6   e Zd ZdZdZeZdd Zdd Ze	 dd Z
d	S )
TritonSchedulingr  r&  c                   s   t jj}t||\}}|r$|| tjr~ddlm m	 t
fdd|D s~ fdd|D }||j dd|  d S )	Nr   BaseSchedulerNodeForeachKernelSchedulerNodec                 3  s   | ]}t | V  qd S r[   )r   rW  )r  rI   rJ   r  	  s    z3TritonScheduling.codegen_comment.<locals>.<genexpr>c                   s   g | ]}t | r| qS rI   )r   get_namerW  )r  rI   rJ   r{   	  s   
z4TritonScheduling.codegen_comment.<locals>.<listcomp>z Fused node name list: r   )r1   r   r  r(   rM   r   Zdebug_fusionZtorch._inductor.schedulerr  r  r  commentr   )r]   node_scheduler  originsdetailed_originsZ
node_namesrI   r  rJ   codegen_comment	  s    

z TritonScheduling.codegen_commentc                 C  s`  t jj}||jkr |j| }n<tjjr6t|tjjnd}t|d d }d	d|||
 g}||j|< tjjrv|nd}|ttj|}|ttj|}|dd}tt| d\}	}
}t }|d	|d
 |j|dd t jj }|d|j d d| }t||\}}|d| d | 7 }||| | tdr\t||| |S )NrE   r   rE  rH   triton_z#pragma CMT#pyzasync_compile.triton(z, '''T)stripz''', device_str='z')z# kernel path: r~  Zkernel_metadata)r1   r   r  Zsrc_to_kernelr   rH   Zdescriptive_namesr'   r2   r   Znext_kernel_suffixZunique_kernel_namesreplacerQ   r*   r  r  r   r   r  r7   rM   rL   r  r  r  r(   define_kernelrN   r   r   )r]   src_coder  r   r  r  Z
fused_nameZkernel_categoryZ	subs_namebasenamerE  Zkernel_pathZcompile_wrapperr  Zmetadata_commentr  r  rI   rI   rJ   r  	  sB    


  
zTritonScheduling.define_kernelc              
     sv  | j |dd}t|fddfdd}fdd}td	d
d |D j | d k	rpjfS   jjzj	  d  W nV t
k
r } z8td|dd |D  td|  jf W Y S d }~X Y nX j}t|dks
t|d jdkr$tdn*t fddt fdd tddd |D  |  jfS )NT)r  c                     s$    j d k	sttj j d d S )Nr   z.kernel_perf)__file__r   ospathsplitextrI   )modrI   rJ   cache_file_path	  s    z?TritonScheduling.benchmark_fused_nodes.<locals>.cache_file_pathc               
     s>     } t j| r:t| }t| W  5 Q R  S Q R X d S r[   )r  r  existsopenfloatreadr  fd)r  rI   rJ   
load_cache	  s
    
z:TritonScheduling.benchmark_fused_nodes.<locals>.load_cachec               	     s.     } t | d}|t W 5 Q R X d S )Nw)r  writerQ   r  )r  msrI   rJ   store_cache	  s    z;TritonScheduling.benchmark_fused_nodes.<locals>.store_cachez%kernel src code for %s written to: %sc                 S  s   h | ]}|  qS rI   r  rW  rI   rI   rJ   r.  	  s     z9TritonScheduling.benchmark_fused_nodes.<locals>.<setcomp>r   z*Exception (%s) in compiling fused nodes %sc                 S  s   h | ]}|  qS rI   r  rW  rI   rI   rJ   r.  	  s     infr3   c                     s   j   d S )Nr   
clone_argsrI   )r   callwrapped_jit_functionrI   rJ   r0  
      z8TritonScheduling.benchmark_fused_nodes.<locals>.<lambda>c                     s
   j   S r[   r  rI   )r   r  rI   rJ   r0  
  r  z+The fused kernel for %s took %.3f ms to runc                 S  s   h | ]}|  qS rI   r  rW  rI   rI   rJ   r.  
  s     )Zgenerate_kernel_code_from_nodesr   r  r  debugr  get_argsr   r  r  	Exceptionr  	launchersr   r   Zn_spillsr"   )r]   Znodesr  r  r  er  rI   )r   r  r   r  r  r  rJ   benchmark_fused_nodes	  sP    

 
z&TritonScheduling.benchmark_fused_nodesN)rf   rg   rh   Z
int32_typeZ
int64_typer  Zkernel_typer  r  r   r  rI   rI   rI   rJ   r  	  s   3r  )z
__future__r   dataclassesrw  r  loggingr  r  r   typingr   r   r   r   r   r	   r
   r   r   r   r   r  Ztorch._loggingZtorch._dynamo.utilsr   Ztorch._inductor.runtime.hintsr   r   Ztorch._prims_commonr   Ztorch.utils._tritonr   Zutils._sympy.symbolr   r   r   Zutils._sympy.value_rangesr   rE   r   r   Z	codecacher   r   r   Zmetricsr   r   Zruntime.hintsr    r!   Zruntime.runtime_utilsr"   r#   r$   r  r%   r&   r'   r(   r)   r*   r+   r,   Zvirtualizedr-   r?  r.   r/   r0   r1   Zwrapper_benchmarkr2   commonr4   r5   r6   r7   r8   r9   r:   r;   Zsimdr<   r=   r>   r?   r@   Ztriton_utilsrA   rB   rC   rD   	getLoggerrf   r  Z_loggingZgetArtifactLoggerZperf_hint_logZschedule_logZ
fusion_logrK   rO   	dataclassrP   rj   r   r   r   r  r  r
  r  r  r!  Z_initialize_pointwise_overridesr  r  r  r  r  r  rI   rI   rI   rJ   <module>   s   0(
(



     
G'           