U
    yhz                    @   sx  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
mZ d dlmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dl m!Z!m"Z"m#Z# d d	l$m%Z%m&Z&m'Z' d d
l(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z< ddl=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZG ddlHmIZImJZJmKZKmLZL ddlMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZY ddlZm[Z[m\Z\m]Z]m^Z^m_Z_ ej`aebdZcddddddhZdddddddddd d d!
Zeddd"d#d$d%d&hZfd'd(d)d*d+d,d-d.d/d0d1
Zgd2d3d4ZhejiejjgZkd5d6d7d8d9d:gZld;d< Zmd=d> Znd?d@ ZodAdB ZpdCdD ZqdEdF ZrdasdGdH ZtejuejvejwdIdJdKZxejuejvejveydLdMdNZzejuejvejweydLdOdPZ{G dQdR dRe:Z|G dSdT dTZ}ej~jeYdUdVdWZeYdXdYdZZG d[d\ d\eQZG d]d^ d^eXZed_ G d`da daeZedb G dcdd ddeZG dedf dfeVZG dgdh dheZG didj djeZG dkdl dleZG dmdn dneZG dodp dpeZG dqdr dreZG dsdt dte8ZG dudv dvZG dwdx dxeZG dydz dzZejG d{d| d|ZejG d}d~ d~ZdS )    N)copydeepcopy)Enum)	AnycastDictListOptionalSequenceSetTupleUnion)dependencies)is_float_dtype)_pytree)CeilDivFloorDivModularIndexing)free_symbol_is_typesymbol_is_typeSymT)bound_sympyValueRanges   )counters   )	codecacheconfigirmetrics)WrapperCodeGen)range_expressable_in_32_bits)BaseSchedulerNodeBaseSchedulingForeachKernelSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)
cache_on_selfget_bounds_index_exprget_fused_kernel_nameis_welford_reductionparallel_num_threadsPlaceholdersympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_subs)NullKernelHandleropsOpsValueV   )BracesBufferCppWrapperKernelArgsCSECSEVariableDataTypePropagationDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferKernel
KernelArgsOpOverridesOptimizationContext)cexprcexpr_indexDTYPE_TO_CPP
INDEX_TYPEvalue_to_cppZschedule+*^z||minmaxargminargmaxZwelford)
sumprodxor_sumrK   rL   rM   rN   anywelford_reducewelford_combinerO   rP   rQ   rS   rT   z
at::Tensorlongdoubleboolzstd::stringzc10::ScalarTypezat::MemoryFormatz
at::Layoutz
at::Devicez
at::Scalar)
ZTensorintfloatrW   strZ
ScalarTypeZMemoryFormatZLayoutZDevicenumberzstd::vectorzc10::optional)r   r	   eqnelegeltgtc                 C   s   |t krtj}| dkrdS | dkr&dS | dkrVt|rFdt|  dS dt|  d	S | d
krt|rvdt|  dS dt|  dS t| rdt|  dS t| d S )N)rQ   rO   rR   r   rP   r6   >   rN   rL   z-std::numeric_limits<z>::infinity()zstd::numeric_limits<z>::min()>   rM   rK   z>::max()Welford<>())DTYPE_LOWP_FPtorchfloat32r   rE   r+   AssertionError)reduction_typedtype rj   M/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/codegen/cpp.pyreduction_initx   s&    rl   c                 C   s0   | dkst tt|  }t| r,d| dS |S N   rM   rN   rb   >)rg   rE   r=   r+   )rh   ri   scalar_typerj   rj   rk   reduction_acc_type   s
    rq   c              	   C   s   | dkr| d| S | dkr,| d| S | dkrB| d| S | dkrX| d| S | d	krv|  d
| d| dS | dkrd| d| dS | dkrt |tr|\}}}nt| |\}}}d| d| d| d| d	S t| d S )NrO    + rP    * rQ    ^ rR    || )rK   rL   z_propagate_nan(, )rS   welford_combine(rT   , {}))
isinstancetuplereduction_projectrg   )rh   var
next_valuemeanm2weightrj   rj   rk   reduction_combine   s$    
r   c                 C   s:   t | r$| d| d| dfS | dkr6| dS |S )Nz.meanz.m2z.weightrn   z.index)r+   )rh   accrj   rj   rk   r}      s
    
r}   c                    s   ddg}t  fdd|D S )Nzconvert<half>zconvert<bfloat16>c                 3   s   | ]}| kV  qd S Nrj   ).0Zto_exprexprrj   rk   	<genexpr>   s     z#is_to_lowp_dtype.<locals>.<genexpr>)rR   )r   Zto_exprsrj   r   rk   is_to_lowp_dtype   s    r   c                 C   s4   t |trd|  dS t |ts$td|  dS d S )Nzat::vec::convert<float>(rw   zc10::convert<float>()r{   CppVecKernel	CppKernelrg   )Zlowp_varkernelrj   rj   rk   get_lowp_to_fp32_expr   s    
r   c           	      C   s   t jjrdnt }dt }td7 ad| dt|  d| d| dt| | d	g}| d| d
t| | d	g}| d| d}| d| dg}|||fS )Nmax_threadsZIndexValue_r6   zstruct z {size_t index; z	 value;}; z{0, z};z
_local{0, _arr[];)r   cppdynamic_threadsr,   index_value_name_counterrE   rl   )	rh   	src_dtypetmpvarnum_threadsZstruct_nameprefix
local_initZtmpvar_per_thdparallel_prefixrj   rj   rk   argmax_argmin_prefix   s    
r   )indexr~   c                 C   s$   ||d i}t | |}t||  S Nr6   )r1   sympysimplify)r   r~   replacement	new_indexrj   rj   rk   	stride_at   s    
r   )r   r~   
vec_lengthc                    s   d d fdd}fdd}| }t jddd}| trV| t||} t jd	dd}| tr| t|||} t | } | |krt| S | S )
a  
    Simplifies the index expression within the range of a vectorized loop.
    Given a vectorized loop variable `var` in the range of a loop with `vec_length`,
    this function transforms the `index` into an equivalent form. It handles
    simplifications for cases where `var` can be expressed as `vec_length * a + b`,
    where `b` ranges from 0 to `vec_length - 1`. The function reduces occurrences
    of `FloorDiv` and `ModularIndexing` in the `index` with best-effort optimizations.

    NOTE:
    The simplified index expression is intended for analysis purposes only, not
    for code generation. It replaces `FloorDiv` and `ModularIndexing` with free variables
    which are not dependent on the loop variable `var` in the vectorized range. Check
    https://github.com/pytorch/pytorch/pull/117221#discussion_r1449746217 for more details.

    Examples:
    1. If `var` is `x3` and `vec_length` is 16, and `x3 = 16*a + b`, then
       `FloorDiv(x3, div)` or `ModularIndexing(x3, div, mod)` becomes a free variable
       when `div` is divisible by 16.
    2. `ModularIndexing(x3, 1, mod)` can be simplified to `x3 + c` where `c` is a free
       variable when `mod` is divisible by 16.
    r   c                    s:   t | }t| kr6t d  } d7  |S )NZ_div_cr6   )r   r   gcdSymbol)divisorresult)div_freevar_idr~   r   rj   rk   visit_indexing_div  s
    
z7simplify_index_in_vec_range.<locals>.visit_indexing_divc                    sv   t | |}t| kr:t d  } d7  n8| dkrrt|krrt d   } d7  |S )NZ_mod_cr6   )r   r   r   r   )r   modulusr   )mod_freevar_idr~   r   rj   rk   visit_modular_indexing
  s    
z;simplify_index_in_vec_range.<locals>.visit_modular_indexingr   T)integerr   )r   ZWildhasr   replacer   r   simplify_index_in_vec_range)r   r~   r   r   r   Zoriginal_indexdivmodrj   )r   r   r~   r   rk   r      s    


r   c                 C   s   t | ||}t||S r   )r   r   )r   r~   r   Zindex_vec_simplifiedrj   rj   rk   stride_at_vec_range&  s    r   c                       s\   e Zd ZeeedddZdeeee	f  d fddZ
dd	 Zd
d Zdd Z  ZS )OuterLoopFusedSchedulerNode)node1node2c                 C   s   |j |j ksttdd ||fD s*ttdd ||fD r| |j t|tkr^t| n|gt|tkrzt| n|g |S | |j ||g|S d S )Nc                 s   s    | ]}t |tttfkV  qd S r   )typer   r'   r%   r   noderj   rj   rk   r   2  s   z3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>c                 s   s   | ]}t |tkV  qd S r   r   r   r   rj   rj   rk   r   ;  s     )	schedulerrg   allrR   r   r   listget_outer_nodes)clsr   r   outer_loop_fusion_depthrj   rj   rk   fuse-  s&    	

z OuterLoopFusedSchedulerNode.fuser&   )r   outer_fused_nodesc                    sR   || _ || _g }| j D ](}t|ttfs,t|t|  qt	 
|| d S r   )r   r   r{   r'   r%   rg   extendr   	get_nodessuper__init__)selfr   r   r   Zflatten_snodes_node	__class__rj   rk   r   Q  s    
z$OuterLoopFusedSchedulerNode.__init__c                 C   s   | j S r   )r   r   rj   rj   rk   r   a  s    z+OuterLoopFusedSchedulerNode.get_outer_nodesc                    s~   t t ttd fdd tt|d D ]P}|| j}||d  j}tdd ||fD sr |jd |jd |s( dS q(d	S )
N)left_loop_levelright_loop_levelloop_fusion_depthreturnc                    s   ddddg}t  fdd|D s(dS |dks4t|d  }d	kr jd krXjd ks\ttd
d  fD s jd	 jd	 |sdS dS )Nr~   sizeoffsetstepsc                 3   s"   | ]}t  |t |kV  qd S r   )getattr)r   Zattr_comparer   r   rj   rk   r   z  s   zaOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner.<locals>.<genexpr>Fr6   r   c                 s   s   | ]}t |jd kV  qdS r6   N)leninner)r   Z
loop_levelrj   rj   rk   r     s   T)r   rg   r   rR   r   )r   r   r   Zouter_loops_attr_compare_list_innerr   rk   r   m  s6    
zNOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._innerr6   c                 s   s   | ]}t |jd kV  qdS r   )r   root)r   	loop_nestrj   rj   rk   r     s   zQOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>.<genexpr>r   FT)	LoopLevelrX   rW   ranger   r   rR   r   )r   cpp_kernel_proxy_listr   idxZleft_loop_nestZright_loop_nestrj   r   rk   "check_outer_fusion_loop_level_attrd  s&    
+
  z>OuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attrc                    sb   dd |D }t jt| |d jttd  d fdd  dd |D | j |d S )	Nc                 S   s   g | ]
}|j qS rj   )r   r   r   rj   rj   rk   
<listcomp>  s    zJOuterLoopFusedSchedulerNode.merge_outer_fusion_kernels.<locals>.<listcomp>r   r   )loop_level_nested_listc                    s   |dkst tdd | D s"t |d  }dkrLdd | D } || nJt}| d d }tt| D ]}|jt| | d  qlg |_||_d S )Nr6   c                 s   s   | ]}t |d kV  qdS r   r   r   Zloop_level_listrj   rj   rk   r     s    zrOuterLoopFusedSchedulerNode.merge_outer_fusion_kernels.<locals>._merge_outer_fusion_loop_levels.<locals>.<genexpr>c                 S   s   g | ]}|d  j qS )r   )r   r   rj   rj   rk   r     s   zsOuterLoopFusedSchedulerNode.merge_outer_fusion_kernels.<locals>._merge_outer_fusion_loop_levels.<locals>.<listcomp>r   )	rg   r   OuterLoopFusedKernelr   r   r   appendr   r   )r   r   Znext_loop_level_nested_listZouter_loop_fused_kernelZloop_level_of_first_kernelZ
kernel_idx_merge_outer_fusion_loop_levelskernel_grouprj   rk   r     s(    z_OuterLoopFusedSchedulerNode.merge_outer_fusion_kernels.<locals>._merge_outer_fusion_loop_levelsc                 S   s   g | ]
}|j qS rj   )r   )r   Z
_loop_nestrj   rj   rk   r     s     )r   Z!cpp_outer_loop_fused_inner_countsr   r   r   r   r   )r   r   Zloop_nest_listrj   r   rk   merge_outer_fusion_kernels  s    

z6OuterLoopFusedSchedulerNode.merge_outer_fusion_kernels)__name__
__module____qualname__classmethodr"   r   r   r   r%   r'   r   r   r   r   __classcell__rj   rj   r   rk   r   ,  s    %Br   c                   @   s<   e Zd ZdedddZdd Zdd Zd	d
 Zdd ZdS )RecordOptimizationContext )	func_namec                 C   s   || _ d | _d | _d S r   )r   current_nodeopt_ctx)r   r   rj   rj   rk   r     s    z"RecordOptimizationContext.__init__c                 C   sr   t js
tt jjstt jj| _| jd k	s.ttj| jjkrN| jjtj | _nt | _| jd k	sdt| j| j_	| S r   )
r5   interpreterrg   r   rB   keymetar   r   Zops_namer   rj   rj   rk   	__enter__  s    


z#RecordOptimizationContext.__enter__c                 C   s(   | j s
t| jst| j| j jtj< d S r   )r   rg   r   r   rB   r   r   exc_typeexc_valexc_tbrj   rj   rk   __exit__  s    

z"RecordOptimizationContext.__exit__c                 C   s   | j S r   )r   r   rj   rj   rk   get_opt_ctx  s    z%RecordOptimizationContext.get_opt_ctxc                 C   s   | j s
t| j S r   )r   rg   r   rj   rj   rk   get_fx_node  s    
z%RecordOptimizationContext.get_fx_nodeN)r   )	r   r   r   rZ   r   r   r   r  r  rj   rj   rj   rk   r     s
   r   r   r   c                 C   s   | j tjd S r   )r   getrB   r   r   rj   rj   rk   r    s    r  r   c                   C   s   t jjsttt jjS r   )r5   r   r   rg   r  rj   rj   rj   rk   get_current_node_opt_ctx  s    r  c                       sV   e Zd Zee d fddZdd Zdd Zej	dd	d
Z
ejdddZ  ZS )CppCSEVariableboundsc                    s&   t  || d| _d | _t | _d S NF)r   r   is_vecri   setdependent_itervars)r   namer
  r   rj   rk   r     s    zCppCSEVariable.__init__c                 C   s.   d| j  d| j d| j d| j d| j dS )NzCppCSEVariable(name: z
, bounds: z
, is_vec: z	, dtype: z, dependent_itervars: rw   )r  r
  r  ri   r  r   rj   rj   rk   __repr__  s    ,zCppCSEVariable.__repr__c                 C   s   |dkr|  |d  nD| jjdd |D   |dkrD|  |d  tdd |D r\d	| _ttjd
r|t d k	r|t j	| _	|t
krtj| _	d S )Nloadr6   c                 S   s   g | ]}t |tr|jqS rj   )r{   r  r  r   argrj   rj   rk   r     s   
z1CppCSEVariable.update_on_args.<locals>.<listcomp>
index_exprr   c                 s   s   | ]}t |tr|jV  qd S r   r{   r  r  r  rj   rj   rk   r     s     
 z0CppCSEVariable.update_on_args.<locals>.<genexpr>Tr   )_set_dependent_itervarsr  updaterR   r  hasattrr5   r   r  ri   BIN_CMP_OPSre   rW   )r   r  argskwargsrj   rj   rk   update_on_args  s$    


zCppCSEVariable.update_on_argsr   c                 C   sT   |j D ]H}|tjjkr$| j| q|jtjjjkr| j	tjjj|j j qdS )z
        Set the relevant itervars for this variable based on the `index` expression.
        This includes the itervars directly used in the `index` as well as relevant itervars
        of other cse variables used in the `index`.
        N)
free_symbolsr5   r   itervarsr  addr  csevarname_mapr  )r   r   srj   rj   rk   r  0  s    
z&CppCSEVariable._set_dependent_itervars)itervarc                 C   s
   || j kS r   )r  )r   r$  rj   rj   rk   
depends_on>  s    zCppCSEVariable.depends_on)r   r   r   r   r   r   r  r  r   Exprr  r   r%  r   rj   rj   r   rk   r    s
   "r  c                   @   s  e Zd ZdZedd Zedd Zedd Zedd	d
Zedd Z	edd Z
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd  Zed!d" Zed#d$ Zed%d& Zed'd( Zed)d* Zed+d, Zed-d. Zed/d0 Zed1d2 Zed3d4 Zed5d6 Zed7d8 Zed9d: Z ed;d< Z!ed=d> Z"ed?d@ Z#edAdB Z$edCdD Z%edEdF Z&edGdH Z'edIdJ Z(edKdL Z)edMdN Z*edOdP Z+edQdR Z,edSdT Z-edUdV Z.edWdX Z/edYdZ Z0ed[d\ Z1ed]d^ Z2ed_d` Z3edadb Z4edcdd Z5ededf Z6edgdh Z7edidj Z8edkdl Z9edmdn Z:edodp Z;edqdr Z<edsdt Z=edudv Z>edwdx Z?edydz Z@ed{d| ZAed}d~ ZBedd ZCedd ZDedd ZEedd ZFeeGjHeGjHdddZIeeGjHeGjHdddZJeeGjHeGjHdddZKedd ZLedd ZMdS )CppOverrideszMap element-wise ops to C++c                 C   s   d|  d|  d| dS )N	decltype()(rr   rw   rj   abrj   rj   rk   r   E  s    zCppOverrides.addc                 C   s   d|  d|  d| dS )Nr(  r)   - rw   rj   r*  rj   rj   rk   subI  s    zCppOverrides.subc                 C   s   d|  d|  d| dS )Nr(  r)  rs   rw   rj   r*  rj   rj   rk   mulM  s    zCppOverrides.mulNc                 C   s2   |t kst| dt ddt |  d|  dS )N missing from .DTYPE_TO_CPPc10::convert<>(rw   )rE   rg   r   xri   r   rj   rj   rk   to_dtypeQ  s    zCppOverrides.to_dtypec                 C   s   |t kst| dt d|tjtjfkrpdt |  d|  d}dt |  d| d}dt tj  d| dS dt |  d|  dS d S )Nr0  r1  r2  r3  rw   zc10::bit_cast<)rE   rg   r   re   float16bfloat16rf   )r5  ri   r   Zcast_xrj   rj   rk   to_dtype_bitcastV  s    zCppOverrides.to_dtype_bitcastc                 C   s   d|  dS )Nz	std::abs(rw   rj   r5  rj   rj   rk   absf  s    zCppOverrides.absc                 C   s   d|  dS )Nz	std::sin(rw   rj   r:  rj   rj   rk   sinj  s    zCppOverrides.sinc                 C   s   d|  dS )Nz	std::cos(rw   rj   r:  rj   rj   rk   cosn  s    zCppOverrides.cosc                 C   s   d|  d|  dS )Nr(  z)(-rw   rj   r:  rj   rj   rk   negr  s    zCppOverrides.negc                 C   s   d|  dS )Nz	std::exp(rw   rj   r:  rj   rj   rk   expv  s    zCppOverrides.expc                 C   s   d|  dS )Nz
std::exp2(rw   rj   r:  rj   rj   rk   exp2{  s    zCppOverrides.exp2c                 C   s   d|  dS )Nzstd::expm1(rw   rj   r:  rj   rj   rk   expm1  s    zCppOverrides.expm1c                 C   s   d|  dS )Nz	std::erf(rw   rj   r:  rj   rj   rk   erf  s    zCppOverrides.erfc                 C   s   d|  dS )Nz
std::erfc(rw   rj   r:  rj   rj   rk   erfc  s    zCppOverrides.erfcc                 C   s   d|  dS )Nzcalc_erfinv(rw   rj   r:  rj   rj   rk   erfinv  s    zCppOverrides.erfinvc                 C   s   d|  dS )Nz
std::sqrt(rw   rj   r:  rj   rj   rk   sqrt  s    zCppOverrides.sqrtc                 C   s   d|  dS )Nz1 / std::sqrt(rw   rj   r:  rj   rj   rk   rsqrt  s    zCppOverrides.rsqrtc                 C   sF   t jj}|dkr |  d|  dS |d kr4d|  dS td|d S )Naccuracy + decltype()(1)zstd::log1p(rw   8unrecognized config cpp.inject_log1p_bug_TESTING_ONLY = r   r   Zinject_log1p_bug_TESTING_ONLYrg   r5  bugrj   rj   rk   log1p  s    zCppOverrides.log1pc                 C   s   d|  dS )Nz	std::tan(rw   rj   r:  rj   rj   rk   tan  s    zCppOverrides.tanc                 C   s   d|  dS )Nz
std::tanh(rw   rj   r:  rj   rj   rk   tanh  s    zCppOverrides.tanhc                 C   s   d|  dS )Nzstd::signbit(rw   rj   r:  rj   rj   rk   signbit  s    zCppOverrides.signbitc                 C   s   d|  d| dS )Nz	std::pow(rv   rw   rj   r*  rj   rj   rk   pow  s    zCppOverrides.powc                 C   s   d|  dS )Nz	std::log(rw   rj   r:  rj   rj   rk   log  s    zCppOverrides.logc                 C   s   d|  dS )Nzstd::nearbyint(rw   rj   r:  rj   rj   rk   round  s    zCppOverrides.roundc                 C   s   d|  dS )Nzstd::floor(rw   rj   r:  rj   rj   rk   floor  s    zCppOverrides.floorc                 C   sF   |  d| }|  d| }d|  d| d| d| d| d| d	S )
N /  % ((z
 < 0) != (z	 < 0) ? (z != 0 ? z - 1 : z) : rw   rj   )r+  r,  quotremrj   rj   rk   floordiv  s    zCppOverrides.floordivc                 C   s   d|  dS )Nz
std::ceil(rw   rj   r:  rj   rj   rk   ceil  s    zCppOverrides.ceilc                 C   s   d|  dS )Nzstd::trunc(rw   rj   r:  rj   rj   rk   trunc  s    zCppOverrides.truncc                 C   s   |  d| S NrV  rj   r*  rj   rj   rk   truncdiv  s    zCppOverrides.truncdivc                 C   s   d|  d| dS )Nz
std::fmod(rv   rw   rj   r*  rj   rj   rk   fmod  s    zCppOverrides.fmodc                 C   s   d|  dS )Nzstd::isinf(rw   rj   r:  rj   rj   rk   isinf  s    zCppOverrides.isinfc                 C   s   d|  dS )Nzstd::isnan(rw   rj   r:  rj   rj   rk   isnan  s    zCppOverrides.isnanc                 C   s   d|  dS )Nzstd::lgamma(rw   rj   r:  rj   rj   rk   lgamma  s    zCppOverrides.lgammac                 C   s   d|  dS )Nz
std::acos(rw   rj   r:  rj   rj   rk   acos  s    zCppOverrides.acosc                 C   s   d|  dS )Nzstd::acosh(rw   rj   r:  rj   rj   rk   acosh  s    zCppOverrides.acoshc                 C   s   d|  dS )Nz
std::cosh(rw   rj   r:  rj   rj   rk   cosh  s    zCppOverrides.coshc                 C   s   d|  dS )Nz
std::sinh(rw   rj   r:  rj   rj   rk   sinh  s    zCppOverrides.sinhc                 C   s   d|  dS )Nz
std::asin(rw   rj   r:  rj   rj   rk   asin  s    zCppOverrides.asinc                 C   s   d|  dS )Nzstd::asinh(rw   rj   r:  rj   rj   rk   asinh  s    zCppOverrides.asinhc                 C   s   d|  d| dS )Nzstd::atan2(rv   rw   rj   r5  yrj   rj   rk   atan2  s    zCppOverrides.atan2c                 C   s   d|  dS )Nz
std::atan(rw   rj   r:  rj   rj   rk   atan  s    zCppOverrides.atanc                 C   s   d|  dS )Nzstd::atanh(rw   rj   r:  rj   rj   rk   atanh  s    zCppOverrides.atanhc                 C   s   d|  d| dS )Nzstd::copysign(rv   rw   rj   rj  rj   rj   rk   copysign  s    zCppOverrides.copysignc              	   C   s   d|  dd|  df}t dd |D r<tdd |D S t }tjj }tjj }|d| d |d	| d
|  d| d tjj	| ||f}t
||D ]\}}|tjjj|< q||fS )Nzfrexp(z)[0]z)[1]c                 s   s   | ]}|t jjjkV  qd S r   r5   r   r!  cacher   	cache_keyrj   rj   rk   r     s     z%CppOverrides.frexp.<locals>.<genexpr>c                 s   s   | ]}t jjj| V  qd S r   rp  rr  rj   rj   rk   r     s     zint32_t r   auto z = std::frexp(z, &);)r   r|   r7   r5   r   r!  newvar	writelinecomputespliceziprq  )r5  Z
cache_keyscodeexponentZmantissaZcse_varsrs  cse_varrj   rj   rk   frexp  s    zCppOverrides.frexpc                 C   s   d|  d| dS )Nzstd::hypot(rv   rw   rj   rj  rj   rj   rk   hypot  s    zCppOverrides.hypotc                 C   s   d|  dS )Nzstd::log10(rw   rj   r:  rj   rj   rk   log10   s    zCppOverrides.log10c                 C   s   d|  dS )Nz
std::log2(rw   rj   r:  rj   rj   rk   log2$  s    zCppOverrides.log2c                 C   s   d|  d| dS )Nzstd::nextafter(rv   rw   rj   rj  rj   rj   rk   	nextafter(  s    zCppOverrides.nextafterc                 C   sj   t jj}|dkrdS |dkr&|  dS |dkr>|  d|  dS |d krXd|  d	|  d
S td|d S )Ncompile_errorcompile error!runtime_error	; throw 1rG  rH  rI  z	std::max(, decltype()(0))7unrecognized config cpp.inject_relu_bug_TESTING_ONLY = r   r   Zinject_relu_bug_TESTING_ONLYrg   rL  rj   rj   rk   relu,  s    
zCppOverrides.reluc                 C   s   d|  d| dS )Nzmin_propagate_nan(rv   rw   rj   r*  rj   rj   rk   minimum<  s    zCppOverrides.minimumc                 C   s   d|  d| dS )Nzmax_propagate_nan(rv   rw   rj   r*  rj   rj   rk   maximum@  s    zCppOverrides.maximumc                 C   s   |  d| d| S )N ?  : rj   )r+  r,  crj   rj   rk   whereD  s    zCppOverrides.wherec                 C   s   d|  d| dS )Nzmod(rv   rw   rj   r*  rj   rj   rk   r   H  s    zCppOverrides.modc                 C   s>   t  }|r|jd k	st||j}|tkr0tj}t| t| S r   )r  ri   rg   rd   re   rf   rG   rE   )valri   r   rj   rj   rk   constantL  s    zCppOverrides.constantc                 C   sV   t  }|r|jd k	st|j}ttj| }tjjjtjj	|t
| d}t||S )Nr	  )r  ri   rg   rC   r5   r   rename_indexingr!  generaterx  r)   r3   r6  )r   ri   r   Zidx_strr~   rj   rj   rk   r  W  s      zCppOverrides.index_exprc              
   C   s   t  }tjj }|d| d tj|2 |  | }|d| d W 5 Q R X W 5 Q R X |d tjj	| t
|d| d}|  d| d| S )	Nrt   = [&]return r   r(  z())r  z() : )r7   r5   r   r!  rv  rw  swap_buffersindentrx  ry  rG   )maskbodyotherr{  Zbody_varr   
other_coderj   rj   rk   maskedc  s    &
zCppOverrides.maskedc                 C   s   |  d| S )Nz && rj   r*  rj   rj   rk   logical_andt  s    zCppOverrides.logical_andc                 C   s
   d|  S )N!rj   r+  rj   rj   rk   logical_notx  s    zCppOverrides.logical_notc                 C   s   |  d| S )Nru   rj   r*  rj   rj   rk   
logical_or|  s    zCppOverrides.logical_orc                 C   s   |  d| S )N != rj   r*  rj   rj   rk   logical_xor  s    zCppOverrides.logical_xorc                 C   s   d|  d|  d| dS )Nr(  r)   & rw   rj   r*  rj   rj   rk   bitwise_and  s    zCppOverrides.bitwise_andc                 C   s   d|  d|  dS )Nr(  z)(~rw   rj   r  rj   rj   rk   bitwise_not  s    zCppOverrides.bitwise_notc                 C   s   d|  d|  d| dS )Nr(  r)   | rw   rj   r*  rj   rj   rk   
bitwise_or  s    zCppOverrides.bitwise_orc                 C   s   d|  d|  d| dS )Nr(  r)  rt   rw   rj   r*  rj   rj   rk   bitwise_xor  s    zCppOverrides.bitwise_xorc                 C   s   d|  d|  d| dS )Nr(  r)  z << rw   rj   r*  rj   rj   rk   bitwise_left_shift  s    zCppOverrides.bitwise_left_shiftc                 C   s   d|  d|  d| dS )Nr(  r)  z >> rw   rj   r*  rj   rj   rk   bitwise_right_shift  s    z CppOverrides.bitwise_right_shiftseedr   c                 C   s   d|  d| dS )Nznormalized_rand_cpu(rv   rw   rj   r  rj   rj   rk   rand  s    zCppOverrides.randc                 C   s   d|  d| dS )Nz
randn_cpu(rv   rw   rj   r  rj   rj   rk   randn  s    zCppOverrides.randnc              	   C   s   d|  d| d| d| d	S )Nzrandint64_cpu(rv   rw   rj   )r  r   lowhighrj   rj   rk   	randint64  s    zCppOverrides.randint64c                 C   s   d|  d|  d|  dS )Nr(  z)(1) / (decltype(z)(1) + std::exp(-z))rj   r:  rj   rj   rk   sigmoid  s    zCppOverrides.sigmoidc              
   C   s   t  }d|  d}d|  d}|d | L |d|  d| d| d |d	|  d
| d| d |d W 5 Q R X |d |S )Nr(  )(0)rI  [&]()auto left = z > 0 ? r  r   auto right = z < 0 ? return left - right;()r7   rw  r  )r5  r{  Zscalar_zeroZ
scalar_onerj   rj   rk   sign  s    


zCppOverrides.sign)N)Nr   r   r   __doc__staticmethodr   r.  r/  r6  r9  r;  r<  r=  r>  r?  r@  rA  rB  rC  rD  rE  rF  rN  rO  rP  rQ  rR  rS  rT  rU  r[  r\  r]  r_  r`  ra  rb  rc  rd  re  rf  rg  rh  ri  rl  rm  rn  ro  r~  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r   r&  r  r  r  r  r  rj   rj   rj   rk   r'  B  s  


































































r'  r   c                       s.  e Zd ZdZ fddZedd Zedd Zedd	 Zed
d Z	edd Z
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zed d! Zed"d# Zed$d% Zed&d' Zed(d) Zed*d+ Zed,d- Zed.d/ Zed0d1 Zed2d3 Zed4d5 Zed6d7 Zed8d9 Z ed:d; Z!ed<d= Z"ed>d? Z#ed@dA Z$edBdC Z%edDdE Z&edFdG Z'edHdI Z(edJdK Z)edLdM Z*edNdO Z+edPdQ Z,edRdS Z-edTdU Z.edVdW Z/edXdY Z0edZd[ Z1ed\d] Z2ed^d_ Z3ed`da Z4edbdc Z5eddde Z6edfdg Z7edhdi Z8edjdk Z9edldm Z:edndo Z;edpdq Z<edrds Z=edtdu Z>edvdw Z?edxdy Z@edzd{ ZAed|d} ZBedddZCedd ZDedd ZEedd ZF  ZGS )CppVecOverridesz.Map element-wise ops to aten vectorization C++c                    s^   t  |   fdd}tt D ]2\}}t|dd tkr&|dkr&t |||j q& S )Nc                    s    fdd}|S )Nc                     s  dd | D }dd | D }t | }|rV|rVg }|d j}| D ]}t|ttjfrtj}t }|slt	|jd k	r||j}t|tjr|j
st||}nt||}t|tr|jn|}t|trJ|jsJttjtst	t| dkr2|j|kr2t||}t|tr|jn|}t|ts,t	||_tj|}	||	 qB|| qB|rf||S tt}
t|
j|
j}|d k	st	|| |S d S )Nc                 S   s0   g | ](}t |ttjfs(t |tr|js|qS rj   )r{   rX   r   r&  r  r  r  rj   rj   rk   r     s
   
 zJCppVecOverrides.__new__.<locals>.wrap.<locals>.wrapper.<locals>.<listcomp>c                 S   s    g | ]}t |tr|jr|qS rj   r  r  rj   rj   rk   r     s   
 r   r   )r   ri   r{   rX   r   r&  re   int64r  rg   	is_numberr3   r  r  r4   valuer  r  r5   r   r   r   r6  	broadcastr   r   r  r   r   __getattr__)r  r  ZscalarsZvectorsnew_args	vec_dtyper  Z	arg_dtyper   Znew_argZ
scalar_opsZscalar_func)r   funcr   rj   rk   wrapper  sR    




  
z6CppVecOverrides.__new__.<locals>.wrap.<locals>.wrapperrj   )r  r  )r   r   )r  rk   wrap  s    6z%CppVecOverrides.__new__.<locals>.wrapr   )r  r  )	r   __new__varsr  itemsr   r  setattr__func__)r   r  Zkargsr  r  methodr   r   rk   r    s    EzCppVecOverrides.__new__c                 C   s   |  d| S )Nrr   rj   r*  rj   rj   rk   r     s    zCppVecOverrides.addc                 C   s   |  d| S )Nr-  rj   r*  rj   rj   rk   r.    s    zCppVecOverrides.subc                 C   s   |  d| S Nrs   rj   r*  rj   rj   rk   r/    s    zCppVecOverrides.mulc                 C   s   |  d| S r^  rj   r*  rj   rj   rk   truediv  s    zCppVecOverrides.truedivc                 C   s
   |  dS )Nz.abs()rj   r:  rj   rj   rk   r;     s    zCppVecOverrides.absc                 C   s
   |  dS )Nz.sin()rj   r:  rj   rj   rk   r<  $  s    zCppVecOverrides.sinc                 C   s
   |  dS )Nz.cos()rj   r:  rj   rj   rk   r=  (  s    zCppVecOverrides.cosc                 C   s
   |  dS )Nz.exp()rj   r:  rj   rj   rk   r?  ,  s    zCppVecOverrides.expc                 C   s
   |  dS )Nz.exp2()rj   r:  rj   rj   rk   r@  0  s    zCppVecOverrides.exp2c                 C   s   d|  d}|  d| S )Nr(  rI  z	.exp() - rj   r5  vec_onerj   rj   rk   rA  4  s    zCppVecOverrides.expm1c                 C   s
   |  dS )Nz.erf()rj   r:  rj   rj   rk   rB  :  s    zCppVecOverrides.erfc                 C   s
   |  dS )Nz.erfc()rj   r:  rj   rj   rk   rC  >  s    zCppVecOverrides.erfcc                 C   s
   |  dS )Nz	.erfinv()rj   r:  rj   rj   rk   rD  B  s    zCppVecOverrides.erfinvc                 C   s
   |  dS )Nz.sqrt()rj   r:  rj   rj   rk   rE  F  s    zCppVecOverrides.sqrtc                 C   sL   t tjtstt | tst| jd k	s,ttj| j d|  d| dS )N( == rw   r{   r5   r   r   rg   r  ri   _get_mask_typerj  rj   rj   rk   r\   J  s    zCppVecOverrides.eqc                 C   sL   t tjtstt | tst| jd k	s,ttj| j d|  d| dS )Nr  r  rw   r  rj  rj   rj   rk   r]   Q  s    zCppVecOverrides.nec                 C   sL   t tjtstt | tst| jd k	s,ttj| j d|  d| dS )Nr   < rw   r  rj  rj   rj   rk   r`   X  s    zCppVecOverrides.ltc                 C   sL   t tjtstt | tst| jd k	s,ttj| j d|  d| dS )Nr  z > rw   r  rj  rj   rj   rk   ra   _  s    zCppVecOverrides.gtc                 C   sL   t tjtstt | tst| jd k	s,ttj| j d|  d| dS )Nr   <= rw   r  rj  rj   rj   rk   r^   f  s    zCppVecOverrides.lec                 C   sL   t tjtstt | tst| jd k	s,ttj| j d|  d| dS )Nr  z >= rw   r  rj  rj   rj   rk   r_   m  s    zCppVecOverrides.gec                 C   s   |  d| S Nr  rj   rj  rj   rj   rk   and_t  s    zCppVecOverrides.and_c                 C   s
   |  dS )Nz.rsqrt()rj   r:  rj   rj   rk   rF  x  s    zCppVecOverrides.rsqrtc                 C   s   |  d| dS )Nz.pow(rw   rj   r*  rj   rj   rk   rR  |  s    zCppVecOverrides.powc                 C   s
   |  dS )Nz.log()rj   r:  rj   rj   rk   rS    s    zCppVecOverrides.logc                 C   s
   |  dS )Nz.round()rj   r:  rj   rj   rk   rT    s    zCppVecOverrides.roundc                 C   s
   |  dS )Nz.floor()rj   r:  rj   rj   rk   rU    s    zCppVecOverrides.floorc                 C   s
   |  dS )Nz.ceil()rj   r:  rj   rj   rk   r\    s    zCppVecOverrides.ceilc                 C   s
   |  dS )Nz.trunc()rj   r:  rj   rj   rk   r]    s    zCppVecOverrides.truncc                 C   s   |  d| dS )Nz.fmod(rw   rj   r*  rj   rj   rk   r`    s    zCppVecOverrides.fmodc                 C   s
   |  dS )Nz	.lgamma()rj   r:  rj   rj   rk   rc    s    zCppVecOverrides.lgammac                 C   s   |  d| S r  rj   r*  rj   rj   rk   r    s    zCppVecOverrides.logical_andc                 C   s
   d|  S )N~rj   r  rj   rj   rk   r    s    zCppVecOverrides.logical_notc                 C   s   |  d| S )Nr  rj   r*  rj   rj   rk   r    s    zCppVecOverrides.logical_orc                 C   s   |  d| S )Nrt   rj   r*  rj   rj   rk   r    s    zCppVecOverrides.logical_xorc                 C   s
   |  dS )Nz.tan()rj   r  rj   rj   rk   rO    s    zCppVecOverrides.tanc              	   C   sD   d|  d}d|  d}d|  d}| d| d| d|  d| 	S )	Nr(  rI  z)(2)z)(-2)z / ( + (rs   z).exp()) - rj   )r+  r  Zvec_twoZvec_minus_tworj   rj   rk   rP    s    zCppVecOverrides.tanhc                 C   s
   |  dS )Nz.reciprocal()rj   r  rj   rj   rk   
reciprocal  s    zCppVecOverrides.reciprocalc                 C   s
   |  dS )Nz.atan()rj   r:  rj   rj   rk   rm    s    zCppVecOverrides.atanc                 C   s
   |  dS )Nz.acos()rj   r:  rj   rj   rk   rd    s    zCppVecOverrides.acosc                 C   s
   |  dS )Nz.asin()rj   r:  rj   rj   rk   rh    s    zCppVecOverrides.asinc                 C   s
   |  dS )Nz.cosh()rj   r:  rj   rj   rk   rf    s    zCppVecOverrides.coshc                 C   s
   |  dS )Nz.sinh()rj   r:  rj   rj   rk   rg    s    zCppVecOverrides.sinhc                 C   s
   |  dS )Nz.log10()rj   r:  rj   rj   rk   r    s    zCppVecOverrides.log10c                 C   s
   |  dS )Nz.log2()rj   r:  rj   rj   rk   r    s    zCppVecOverrides.log2c                 C   s   |  d| dS )Nz.nextafter(rw   rj   rj  rj   rj   rk   r    s    zCppVecOverrides.nextafterc                 C   s   |  d| dS )Nz
.copysign(rw   rj   r*  rj   rj   rk   ro    s    zCppVecOverrides.copysignc                 C   s   |  d| dS )Nz.atan2(rw   rj   r*  rj   rj   rk   rl    s    zCppVecOverrides.atan2c                 C   s   |  d| dS )Nz.hypot(rw   rj   r*  rj   rj   rk   r    s    zCppVecOverrides.hypotc              
   C   s:   d|  d}d|  d}| d| d|  d| d|  d
S )	Nr(  rI  z)(0.5)z * ((rr   z)/(r-  z)).log()rj   )r5  r  Zvec_one_halfrj   rj   rk   rn    s    zCppVecOverrides.atanhc              	   C   s*   d|  d}d|  d| d|  d|  d	S )Nr(  rI  r  r  rr   rI   z).sqrt()).log()rj   r  rj   rj   rk   ri    s    zCppVecOverrides.asinhc                 C   s
   |  dS )Nz.acosh()rj   r:  rj   rj   rk   re    s    zCppVecOverrides.acoshc                 C   sj   t jj}|dkrdS |dkr&|  dS |dkr>|  d|  dS |d krXd|  d	|  d
S td|d S )Nr  r  r  r  rG  rH  rI  zat::vec::clamp_min(r  r  r  r  rL  rj   rj   rk   r    s    
zCppVecOverrides.reluc                 C   s   d|  d|  d|  dS )Nr(  z)(1)/(decltype(z)(1) + z.neg().exp())rj   r:  rj   rj   rk   r  	  s    zCppVecOverrides.sigmoidc                 C   s
   |  dS )Nz.neg()rj   r:  rj   rj   rk   r>    s    zCppVecOverrides.negc                 C   sx   d|  d}|  d| }d|  d| d| d}d|  d	| d
| d	| d	}| d| d| d| d| d| dS )Nr(  rw   rV  r  rW  r  z(0))rX  r  z	(0)) != (z(0)))z	::blendv(rv   r-  z(1), r  rj   )r+  r,  Z_trY  Zhas_remZis_negrj   rj   rk   r[    s
    zCppVecOverrides.floordivc                 C   s   |  d| S r^  rj   r*  rj   rj   rk   r_    s    zCppVecOverrides.truncdivc                 C   s   d|  d| dS )Nat::vec::minimum(rv   rw   rj   r*  rj   rj   rk   r    s    zCppVecOverrides.minimumc                 C   s   d|  d| dS )Nat::vec::maximum(rv   rw   rj   r*  rj   rj   rk   r  #  s    zCppVecOverrides.maximumc                 C   s   |  d|  S r  rj   r  rj   rj   rk   square'  s    zCppVecOverrides.squarec                 C   s   t tjtst|jtjkr|jtjks,ttj| tj	 }tj|tj	 }tj|tj	 }d| d| d| d| d	S d| d| d| dtj| |j d	S d S )Nr(  
)::blendv(rv   rw   )
r{   r5   r   r   rg   ri   re   rW   _get_mask_castrY   )r+  r,  r  Zblendv_aZblendv_bZblendv_crj   rj   rk   r  +  s    zCppVecOverrides.wherec                 C   s   t  }d|  d}d|  d}d|  d| d| d| d|  d}d|  d| d| d|  d| d}|d | 4 |d	| d
 |d| d
 |d W 5 Q R X |d |S )Nr(  r  rI  r  rv   r  rw   r  r  r   r  r  r  r  )r5  r{  Zvec_zeror  Zblendv_lZblendv_rrj   rj   rk   r  7  s    $$


zCppVecOverrides.signNc           	      C   s  |t jt jt jt jt jt jt jt jfks:t	t
 d| tjj}|rTt|t jjsXt	t|jd }|snt	|jd k	s|t	ttjtst	|j}t| }tj|}t| }tj|}|t jkr|t jkrtj| d| d| d|  dS |jt jkr"|t jkr"|  d| d| dS ||kr~||  krDdkrZn nd	| d|  dS d	| d| d| d| d|  dS d
|  dS )Nz does not support r6   z::from<,r3  rw   z.to<rc   zat::vec::convert<r  )re   rW   rY   r8  r7  uint8int8int32r  rg   r   r5   r   r   r{   fxNoder  r  ri   r   r   rE   _get_num_vectorsr  )	r5  ri   r   r   Z	opt_ctx_xZsrc_cpp_typeZsrc_num_vectorsZdst_cpp_typeZdst_num_vectorsrj   rj   rk   r6  F  s@    	
$
$zCppVecOverrides.to_dtypec                 C   sD   t jj}|dkr |  d|  dS |d kr2|  dS td|d S )NrG  rH  rI  z.log1p()rJ  rK  rL  rj   rj   rk   rN  h  s    
zCppVecOverrides.log1pc                 C   s  t tjtstt }tjj }tj| \}|	d| d tj
|2 |  | }|	d| d W 5 Q R X W 5 Q R X W 5 Q R X |	d tjj| |j}| d}|jr|ntj| d| d}	t|t| }
|tjkr
tj  d|
 dntj| d|
 d}t |ts6t||jrlt }|	d	 tj
| |  |	d
| d |  |	d| d W 5 Q R X |	d |  tjjtjj|	}tjjtjj|}t |tst|t |tst|||_||_|	dtjj||| d W 5 Q R X W 5 Q R X W 5 Q R X |	d tjjtjj|}nR|jrtjjtjj|  d|	 d| }n$tjjtjj|  d| d|
 }|d| |||fi  |S )Nrt  r  r  r   r  r  rw   ::from([&]if (z.all_zero())elser  r  r  )r{   r5   r   r   rg   r7   r!  rv  r  rw  r  r  rx  ry  ri   r  _get_vec_typerG   rE   re   rW   r  r  r  	overridesr  r  )r  r  r  r{  r~   Znew_maskr   ri   Z	body_codeZbody_code_vecr  Zother_code_vecZbody_vec_varZother_vec_varcsevarrj   rj   rk   r  t  s|    0





"
  zCppVecOverrides.maskedc           	      C   s   t  }|r|jd k	st|j}ttjts.ttj| }tjjtjj	 }tj
||}|dkrlt| |S |d k	rtjjjtjjt|t| d}t||}t|tr|j}tj||}ntjd ||tjj}|d| |fi  |S )Nr   r	  r  )r  ri   rg   r{   r5   r   r   r  r  
tiling_idx_try_get_const_strider'  r  r!  r  rx  rC   r)   r3   r6  r4   r  arange_load_or_store_non_contiguousr  )	r   ri   r   r   
tiling_varstrider   r  r  rj   rj   rk   r    s6      
   zCppVecOverrides.index_expr)N)Hr   r   r   r  r  r  r   r.  r/  r  r;  r<  r=  r?  r@  rA  rB  rC  rD  rE  r\   r]   r`   ra   r^   r_   r  rF  rR  rS  rT  rU  r\  r]  r`  rc  r  r  r  r  rO  rP  r  rm  rd  rh  rf  rg  r  r  r  ro  rl  r  rn  ri  re  r  r  r>  r[  r_  r  r  r  r  r  r6  rN  r  r  r   rj   rj   r   rk   r    s  P




























































!

Fr  Zcppvecc                   @   s   e Zd Zedd ZdS )CppTile2DOverridesc                 C   s(   t tjtsttj| } t| |S r   )r{   r5   r   CppTile2DKernelrg   transform_indexingr  r  )r   ri   rj   rj   rk   r    s    zCppTile2DOverrides.index_exprN)r   r   r   r  r  rj   rj   rj   rk   r    s   r  c                       sP  e Zd ZeZeZdZdZ fddZ	e
edfddZedd	d
Zdd Zejdd Zdd Zd>ejdddZejedddZejejdddZejejdddZdd  Zejejeed!d"d#Zeejd$d%d&Zd?d'd(Zd)d* Z d+d, Z!d-d. Z"d/d0 Z#d1d2 Z$d3d4 Z%e&ed5d6d7Z'd8d9 Z(ejd:d; Z)d<d= Z*  Z+S )@r   rt  r   c                    s   t  | d | _g | _g | _d | _t | _t | _t | _	t | _
t | _t | _d| _t | _t| j| jdd| _t | _t | _|| _i | _d S )NFZtmp_acc)Zname_prefix)r   r   call_rangesrangesr  reduction_depthr>   reduction_prefixreduction_suffixparallel_reduction_prefixparallel_reduction_suffixlocal_reduction_initlocal_reduction_storesis_reductionnon_parallel_reduction_prefixr9   newvar_prefixsuffixreduction_csepreloads
poststoresr   Zreduction_omp_dec)r   r  r   r   rj   rk   r     s$    zCppKernel.__init__Nc                 C   sP  t jjr| js| jd | d}t jjr0dnt }	| d|	 d}
|
d|	 dd}| j| d| d	||| d
 | j| d|
 d
 | jd|	 ddd| d	||| d
dg | j	| d	| d
g | j
d|	 ddd| d	|||| d
dg |dkrL|rLt| drLd| krL| j|||	 d S )Nz(int max_threads = omp_get_max_threads();_localr   r   r   [[tid]r    = r   for (int tid = 0; tid < ; tid++){    }rS   weight_recp_vec_rangevec)r   r   r   r  rw  r,   r   r  
writelinesr  r  r  )r   r   acc_typerh   ri   reduction_combine_fnreduction_init_fn welford_weight_reciprocal_vec_fn	acc_localr   acc_per_threadacc_local_in_arrayrj   rj   rk   _gen_parallel_reduction_buffers  sT    



	z)CppKernel._gen_parallel_reduction_bufferslinec                 C   s   t d|S )Nztmp_acc[0-9]+researchr   r!  rj   rj   rk   get_reduction_var_pattern2  s    z#CppKernel.get_reduction_var_patternc                 C   sV   t | jjD ]D\}}t|tr| |}|r|d}||| d| jj|< qd S )Nr   r  )	enumeratestores_linesr{   rZ   r&  groupr   )r   ir!  mvar_namerj   rj   rk   %update_stores_with_parallel_reduction5  s    


z/CppKernel.update_stores_with_parallel_reductionc                 c   sZ   | j }|r<t||}t|tr<|j}t|ts4ttj	|_
|| _ z
|V  W 5 || _ X dS )z>Context manager to add an additional mask to loads and stores.N)
_load_maskr3   r  r{   r4   r  r  rg   re   rW   ri   )r   r  priorrj   rj   rk   r  =  s    

zCppKernel.maskedc                 C   s>   |j tkrdS dd }||| jj}|r:|| jjt|| < dS )an  
        https://github.com/pytorch/pytorch/issues/115260
        For FusedSchedulerNode[node1, node2], the node2 loads what node1 stores and the buffer is
        in low-precision floating point data type. When the output of node1 also serves as the output of the
        kernel, the result of nodes would be different from the case when output of node1 is not the output
        of the kernel (where we don't need to insert `to_dtype` for legalization). To address the problem, on
        storing the lowp node1 output, we also add the inverse dtype conversion to high precision data type
        to the cse cache.

        Example (pseudo code):
            node1_output = ...
            node1_output_lowp = to_dtype(node1_output, dtype=torch.bfloat16)
            store(buf, node1_output_lowp)
            node2_input_lowp = load(buf)
            node2_input = to_dtype(node2_input_lowp, dtype=torch.float)

        Without cse cache trick:
            node1_output = ...
            node1_output_lowp = to_dtype(node1_output, dtype=torch.bfloat16)
            store(buf, node1_output_lowp)
            node2_input_lowp = node_output_lowp # hit store cache
            node2_input = to_dtype(node2_input_lowp, dtype=torch.float)

        With cse cache trick:
            node1_output = ...
            node1_output_lowp = to_dtype(node1_output, dtype=torch.bfloat16)
            # also add `to_dtype(node1_input_lowp, dtype=torch.float)` -> `node1_output` to cse cache
            store(buf, node1_output_lowp)
            node2_input_lowp = node_output_lowp # hit store cache
            node2_input = node1_output # hit cse cache
        Nc                 S   sz   d }d }|  D ]4\}}|| krt|rtd|}|d k	r| }q|rv| D ]}|j|krR|} qjqR|d k	svt|S )Nztmp\d+)r  r   r#  r$  r*  valuesr  rg   )r~   rq  Zfp32_cse_varZfp32_cse_var_namer   r}  r,  rj   rj   rk   find_fp32_varu  s    

zECppKernel.cache_fp32_cse_var_before_lowp_store.<locals>.find_fp32_var)ri   rd   r!  rq  r   )r   Zvar_to_storer2  Zfp32_varrj   rj   rk   $cache_fp32_cse_var_before_lowp_storeP  s    !
z.CppKernel.cache_fp32_cse_var_before_lowp_storer6   r   r  c                 C   s(   | j | }||| | i}t||}|S r   )r  r1   )r   r   scaleitervar_idxr   r~   r   r   rj   rj   rk   scale_index_with_offset  s    

z!CppKernel.scale_index_with_offsetr   r   c                 C   s   t | |S )z
        Convert an index expr to a string that can be used in cpp code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the cpp kernel.
        )rC   r  r   r   rj   rj   rk   index_to_str  s    zCppKernel.index_to_strr   r$  c                    s   t  fdd|jD S )z]
        Check if an index has free symbol CppCSEVariable that depends on `itervar`.
        c                 3   sD   | ]<}|j jjkrtjj|j  trjj|j   V  qd S r   )r  r!  r"  r{   r  r%  r   r#  r$  r   rj   rk   r     s   z6CppKernel.index_indirect_depends_on.<locals>.<genexpr>)rR   r  r   r   r$  rj   r=  rk   index_indirect_depends_on  s    z#CppKernel.index_indirect_depends_onc                 C   s   ||j kp| ||S r   )r  r?  r>  rj   rj   rk   index_depends_on  s     zCppKernel.index_depends_onc                 C   s   t t| j| jS r   )dictrz  r  r  r   rj   rj   rk   
var_ranges  s    zCppKernel.var_rangesr   r   lowerupperc                 C   s   |s|sd S t |tj}|r6t|tjj}tj	j
}n8tj	j
}z| jtj	_
t|tjj}W 5 |tj	_
X | j}|rtj	| |nd }	| ||rdnd |	}
| jj||
dd d S )N0F)Z
assignment)r   r   TMPr3   r  re   r  r  r5   r   rx  loadssexprr  indirect_assertr!  r  )r   r   r   rD  rE  Zindirectr  bufferZprior_computesize_strr!  rj   rj   rk   check_bounds  s    


zCppKernel.check_boundsr  r   c                 C   sp   | j |}| |}| dt| d}tj|tjfkrJd| d}| j	
| j|}|d||fi  |S )Nr  r   zstatic_cast<float>(rw   r  )r  inputr  rD   r5   graph	get_dtypere   r7  r!  r  rH  r  )r   r  r   r~   r!  r  rj   rj   rk   r    s    
zCppKernel.loadc                 C   s   d|kst | j|}| | | |}|d krP| dt| d| d}n|dkrtjjs| j	dkr| dt| d| d}qt
j|}dt|  d	| d
}d| dt| d| d}ntd| | jt|| d S )Nbufr  ] = r   Z
atomic_addr6   z] += zstatic_cast<r3  rw   zatomic_add(&z], ru  zstore mode=)rg   r  outputr3  r  rD   r   r   r   r   r5   rP  rQ  rE   NotImplementedErrorr(  rw  r<   )r   r  r   r  moder~   r!  ri   rj   rj   rk   store  s    

zCppKernel.storec                 C   sZ  |dk}|||f}|| j jkr*| j j| S | j j| jd| dd}d| _|rt|||\}}	}
| j|
 | j| | j	|	 |dkrdnd}| j
d k	st| j| j
 }t| j
d	 t| jD ]}|| j|  | j|  }q| jd
| d| d| d| dt| dd| dt| d| d| d	dg | d}t }| d| d}|d| dd}| jd| ddd| d| d| d| d| dd | d| d!| d| d"	d#dg | j| d$| dg n^t||}| j| d%| d$t|| d | j| d$t||| d | |||| t||}|| j j|< |S )&Nrn   
reduction FwriteTrN   Zgreater_or_nanZless_or_nanr6   zif(!(r  z.value, rv   z.index, z))) {r  z	.index = ; z	.value = r   r  r  r   r   r  r  r  r  r  z	    if(!(z.index))) {z        z.index; z.value;z    }r  r   )r	  reduction_cacher  rH  r  r   r  r  r  r  r  rg   r  r   r   r  r(  rD   r,   r   r  r  rq   rw  rl   r   r  r}   )r   ri   r   rh   r  Zargmax_or_argminreduction_keyr   r   r   r   
compare_opr   r+  r  r   r  r  r  r   rj   rj   rk   	reduction  sv    
    
& 

"


zCppKernel.reductionc              
   C   sB   |  |}| j|}| jt|| dt| d| d d S )Nr  rS  r   )r  r  rT  r   rw  r<   rD   )r   r  r   r  r~   rj   rj   rk   store_reduction(  s
    
zCppKernel.store_reductionc                    s    j rR j t|t| ks>t j  dt| dt|  jt|kstnLt|t|  _  fdd j D  _dd tt jD  _t| _ jd  j  j jd  fS )Nr  rr   c                    s   g | ]}  |qS rj   )r  )r   r5  r   rj   rk   r   7  s     z(CppKernel.set_ranges.<locals>.<listcomp>c                 S   s   g | ]}t tj|qS rj   )r/   r   ZXBLOCKr   nrj   rj   rk   r   8  s   )r  r|   rg   r  r   r  r   r  )r   lengthsZreduction_lengthsrj   r   rk   
set_ranges/  s     

zCppKernel.set_rangesc                 C   s   t jjjt| jddS )N    fallback)r5   rP  sizevars	size_hintr0   r  r   rj   rj   rk   ri  B  s     zCppKernel.size_hintc              	      sh  t  | jd k	st }tdd |D rft|dks>tt|d tsPt|d 	 }n| 	 }t
 }|r r  n
 | ndkrʈ r|   tdfdd fdd	ddddtt d fddtd fdd|   jrPj n
j W 5 Q R X d S )Nc                 s   s   | ]}t |tV  qd S r   )r{   r   r   rj   rj   rk   r   K  s     z/CppKernel.codegen_loops_impl.<locals>.<genexpr>r6   r   )loopc                    sR   dd }|   }t|dks tt|d tsB|| rB|d    |d  d S )Nc                 S   s   |   }|jo|jS r   )get_rootr  parallel)rj  r   rj   rj   rk   is_parallel_reductionc  s    zTCppKernel.codegen_loops_impl.<locals>.gen_loop_kernel.<locals>.is_parallel_reductionr6   r   )get_kernelsr   rg   r{   r   r.  )rj  rm  kernels)
gen_kernelrj   rk   gen_loop_kernelb  s     z5CppKernel.codegen_loops_impl.<locals>.gen_loop_kernelc              
      s   t | trX| jD ]D}|jr*|j|j qt }|   | W 5 Q R X qnt `}| sjtt	| dr 
| j |   |    
| j  
| j  
| j W 5 Q R X t	| dr܈ 
| j d S )Ncodegen_inner_loops)r{   r   r   r  
contextlib	ExitStackenter_contextr  rg   r  ry  r
  rr  rH  rx  r(  r  )r   rj  stackr{  rq  	gen_loopsrj   rk   rp  o  s$    






z0CppKernel.codegen_loops_impl.<locals>.gen_kernelr   c                 S   s   |dkst | D ]}| D ]x}|dkr<|j|jf    S |dkrf|j}|jrZ|j| }|    S |j}|jr~||j }n
||j	 }|    S qqd S )N)r   r  localry  r  )
rg   rn  r  r  r   rl  r  r  r  r  )loopsrK  rj  r   r  r   rj   rj   rk   get_reduction_code_buffer  s"    

z?CppKernel.codegen_loops_impl.<locals>.get_reduction_code_bufferF)rz  c              	      s   t  }d  }}| r| d }|jrL|sL| }|rB|    |  r|jr| d\}} |r|s~t | | D ]}| q| r| d } r|jr|r | 	  |jr|s | d W 5 Q R X d S )Nr   ry  r  )
rs  rt  r  ru  r  ry  is_reduction_onlyrl  rg   close)rz  Zin_reductionZstack_outerr  r  rj  r  )r{  gen_loopr{  r   threadsworksharingrj   rk   rx    s8    







z/CppKernel.codegen_loops_impl.<locals>.gen_loopsc              	      sl   t  Z}|  }|d kr(W 5 Q R  d S  | |   | jrV| j| j n|  W 5 Q R X d S r   )rs  rt  linesr  ru  r  r   r  )rj  rv  Z
loop_linesrw  rj   rk   r~    s    

z.CppKernel.codegen_loops_impl.<locals>.gen_loop)r   )F)r,   r  rg   rn  rR   r   r{   r   decide_parallel_depthmax_parallel_depthrs  rt  r|  r}  rl  mark_parallelsingleru  r  r   r   r   r   )r   r   r{  r  ro  	par_depthrv  rj   )	r{  rp  r~  rq  rx  r{  r   r  r  rk   codegen_loops_implG  s@      



" zCppKernel.codegen_loops_implc                 C   s   t | }| ||| d S r   )LoopNestWithSplitbuildr  )r   r{  r  r   rj   rj   rk   codegen_loops  s    
zCppKernel.codegen_loopsr  c                 C   s   t jjrdS dS d S )NZAOTI_TORCH_CHECKZTORCH_CHECK)r5   rP  Zaot_moder   rj   rj   rk   assert_function  s    zCppKernel.assert_functionc           	      C   s   | j d k	st| j d | }|  }d}d}|D ]Z}tjjj|dd}|d| ksZ||kr^ q|| tjjk rr q|d7 }||9 }|| }q0tjj	r|dkrt
|dkrd}|S )Nr6   r   re  rf  r   )r  rg   ri  r5   rP  rh  r   r   Zmin_chunk_sizer   r   )	r   r  r  r  seqZpardepthr   hintrj   rj   rk   r    s"    
zCppKernel.decide_parallel_depthc                 c   s   | j | j| j| jf}t | _ t | _t | _| j | _d V  | j| j  | j| j | j| j |\| _ | _| _| _d S r   )rH  rx  r(  r!  r>   cloner   ry  )r   r0  rj   rj   rk   write_to_suffix  s    zCppKernel.write_to_suffixc                 O   s
   t ||S r   )r  )r   r  r  rj   rj   rk   create_cse_var  s    zCppKernel.create_cse_var)r6   r4  r   )N),r   r   r   r'  r  rC   rI  r  r  r   r   rl   r  rZ   r&  r.  rs  contextmanagerr  r3  r   r&  r7  r:  r   r?  r@  rB  rW   rM  r  rW  r_  r`  rd  ri  r  r  propertyr  r  r  r  r   rj   rj   r   rk   r     sX   
7
;     

A 
r   c                	       s  e Zd ZeZddejf fdd	Zej	ej
dddZejedd	d
ZejedddZejfejedddZeejedddZedddZd9eej	ejee dddZd:ee ej	ejee eeeef  ee dddZeej	d fddZeeef eej	ejdd d!Zd;d"d#Zd$d% Zd&d' Zeed(d)d*Zeej
ed+d,d-Z d.d/ Z!d0d1 Z"d<d2d3Z#d=d5d6Z$d> fd7d8	Z%  Z&S )?r   r   r4  c                    sH   t  || t | _| js"t|dkr8| jj|d}|| _|| _d S )Nr   ri   )	r   r   r   pick_vec_isavec_isarg   	nelementstiling_factorr  )r   r  r   r  r  tiling_dtyper   rj   rk   r   	  s    

zCppVecKernel.__init__r;  c                    s`     ||rd S  fdd|jD D ]}t|ts6t|jr$ d S q$t|| j}|jr\|S d S )Nc                 3   s(   | ] }t |tjr jj|j V  qd S r   r   r   rG  r!  r"  r  r<  r   rj   rk   r     s   z5CppVecKernel._try_get_const_stride.<locals>.<genexpr>)	r?  r  r{   r  rg   r  r   r  r  )r   r   r$  indirect_varr  rj   r   rk   r    s    

z"CppVecKernel._try_get_const_strideri   r   c                 C   s0   t | j|j d | j  }|dks,t|S )N   r6   )mathr\  r  itemsizer  Z	bit_widthrg   r   ri   num_vectorsrj   rj   rk   r  '  s
    zCppVecKernel._get_num_vectorsc                 C   s<   |  |}|dkr"dt|  dS dt|  d| dS d S )Nr6   at::vec::Vectorized<ro   zat::vec::VectorizedN<r  )r  rE   r  rj   rj   rk   r  .  s    
zCppVecKernel._get_vec_typec                 C   s.   |t jkrdS | |}dt|  d| dS )Nr   zat::vec::VecMask<r  ro   )re   rW   r  rE   r  rj   rj   rk   r  5  s    

zCppVecKernel._get_mask_type)r  ri   r   c                 C   s<   |j tjkstt|| |}| dt|  d| dS )Nz.template cast<r  rc   )ri   re   rW   rg   reprr  rE   )r   r  ri   r  rj   rj   rk   r  ;  s    
zCppVecKernel._get_mask_castr   c                 C   s   t d|S )Nztmp_acc[0-9]+_vecr"  r%  rj   rj   rk   r&  @  s    z&CppVecKernel.get_reduction_var_patternN)r~   r   ri   	load_maskc                 C   s   t  }|dk	stt| }| |}d}|r\|jsL| tj d| d}n| |tj }|dkrv| dt	| n|}	|tj
kr|   d|	 d}
n>|r| d| d| d|	 dn| | d	|	 d
| j d}
|
S )a  
        Get a load line str that loads a vector from `var` at `index` of type `dtype`.
        If `load_mask` is not None, we do a masked load accordingly.
        Notes on the `dtype`:
        1. We always load `self.tiling_factor` number of elements regardless of the `dtype`.
           It means we load half of the vector lanes for 16-bit data types and quarter of the
           vector lanes for 8-bit data types.
        2. `torch.bool` and `torch.uint8` could mean masks and we load them as float mask vectors.
        Nr  rw   r   rr   z.template loadu<r  r3  z::loadu(rv   )r  rg   rE   r  r  r  re   rY   r  rD   rW   r  r  )r   r~   r   ri   r  r   Zcpp_typer  Zload_mask_strloadbufr!  rj   rj   rk   _get_vec_load_lineC  s"    

zCppVecKernel._get_vec_load_line)r~   r   ri   rK  store_valuer   c                    s>  |r|dk	st d dkr"j tjtdfddttd fdd}t }|dk	sbt t }|d	 |	 h |}	d
t
|  d|	 d}
||
 |r|| d tjj  d}i }fdd|jD D ]4}t|tst |jr||}| d| d||< qj|j|d}d}jdk	r|rNt dtjtsft jjjrj d| d}nj d}t r|dj  n|dj  |d| d| dj d| d	 |	  t }t|}|D ]"}td|  d || |}q|dk	rD| d| dn| }|rp|d | d ||	  |r|| d!| d" n|d#| d$| d% W 5 Q R X W 5 Q R X |sd&d'|}|d(| d% W 5 Q R X |d) |r|d%  | dS j |}t|ts0t d*|_|S dS )+a  
        Load or store a vector in a non-contiguous way. The vector is initialized from an array that is
        filled in an inner loop over the tiling factor.
        :param var: buffer to load from or store to, i.e. `var[transformed(index)]`. If None, we load the index
                    as index expression, i.e. `transformed(index)`.
        :param index: index into the `var` or the index expression by its own if `var` is None.
                      The `index` could contain indirect indexing or the tiling itervar. When used in
                      the inner loop, the index is transformed as follows:
                      1. the index is linearized along the tiling dim.
                      2. the indirect indexing vector variables are transformed into arrays over the tiling dim.
        :param dtype: data type of `var` or `index` if `var` is None.
        :param buffer: the code buffer to write the generated code to. If None, we write to `self.loads`.
        :param store_value: the value to store. If None, we load the vector.
        :return: a CppCSEVariable that represents the loaded vector or None if it is a store.
        Nzstore var must be providedr  c                    s$   | j dk r jd| j   S  jS d S )N   )r  r  r  r   rj   rk   get_result_size  s    
zCCppVecKernel._load_or_store_non_contiguous.<locals>.get_result_size)vec_varr   c              	      s   | j s
tt }|d | j | j}|d k	s6t|tjkrFtj}|}|dt	|  d| d |  d}|| |d W 5 Q R X |d j
 |}t|tst|S )Nr  __at_align__ std::array<rv   	> tmpbuf;.store(tmpbuf.data());zreturn tmpbuf;r  )r  rg   r7   rw  r  ri   re   rW   rY   rE   r!  r  r{   r  )r  r{  r  result_sizer!  r  rK  r  r   rj   rk   vec_to_array  s&    






z@CppVecKernel._load_or_store_non_contiguous.<locals>.vec_to_arrayr  r  rv   r  r  r   c                 3   s(   | ] }t |tjr jj|j V  qd S r   r  r<  r   rj   rk   r     s   z=CppVecKernel._load_or_store_non_contiguous.<locals>.<genexpr>r  r   r6  r   zunexpected store with load maskz.is_masked(rw   z != 0z#pragma GCC unroll z#pragma unroll 
for (long  = 0; r  r[  ++)z\br  z
 = tmpbuf[z];ztmpbuf[rS  r   ztmpbuf.data()r   r  r  T) rg   rH  re   ri   rX   r  r  r7   rw  r  rE   r.   r  r  r  r{   r  r7  r/  r   is_gccr  rs  rt  rD   r#  r.  ru  r  ry  r!  r  )r   r~   r   ri   rK  r  r  r   r{  r  Zresult_declareZitervar_innerZreplacementsr  Z	array_varr  rv  Zindex_crhsZ	load_liner  rj   r  rk   r  j  s    



  

 ,


z*CppVecKernel._load_or_store_non_contiguousrN  c           
         s   t  }| j|}| |}tj|}| j| j }| 	||}|dkrVt
 ||S |dkr| |||| j}| j| j|}	n| |||}	t|	tst|	d||fi  d|	_|	S )Nr   r6   r  T)r  r  rO  r  r5   rP  rQ  r  r  r  r   r  r  r/  r!  r  rH  r  r{   r  rg   r  r  )
r   r  r   r   r~   ri   r  r  r!  r  r   rj   rk   r    s     
zCppVecKernel.load)r  r~   r   ri   c           	      C   s   t |ts"t |tr|js"t|| j| j }| dt| }| ||}t	 }|dkr|t
jkr||| d| d q|| d| d| j d n| j|||||d |S )a2  
        Get a store line buffer that stores `value` into `var` at `index` of `dtype`. It handles
        both contiguous and non-contiguous store cases.
        :param value: Vectorized type templaterized on `dtype`.
        :param var: buffer to store into.
        :index: index into the `var`.
        rr   r6   .store(ru  rv   )rK  r  )r{   rZ   r  r  rg   r  r  rD   r  r>   re   rY   rw  r  r  )	r   r  r~   r   ri   r  Zvar_exprr  r{  rj   rj   rk   _get_store_line  s.    

     zCppVecKernel._get_store_linec                    s   d kst |d kst t|ts*t ||js:| |}t }| j }| | | 	|}| 
|||tj }| j| fdd d S )NrR  c                    s
   t  | S r   r<   r:  r  rj   rk   <lambda>-	      z$CppVecKernel.store.<locals>.<lambda>)rg   r{   r  r  r  r  r  rT  r3  r  r  r5   rP  rQ  r(  ry  map)r   r  r   r  rV  r   r~   r{  rj   r  rk   rW  !	  s    


zCppVecKernel.storec              
   C   s  |dkst ||kst |tjtjfks,t t|ts>t ||jsN| |}|||f}|| jj	krp| jj	| S d}| dt
|  d}t||}| ||}	| jj| jd| dd}
|
 d}d	| _| j| d
|
 dt|| d | j|	 d
| d| || d tdd | j| jd  }|dkr| j| jkrL| jnd}t||| _| j| |d  | j| d| |||d	 d n"| j| d| ||| d |  |
||| | j ||	||| j| j| jd | j| jkrt!|r$| "|dkst dd| d}nXd| |dd d }dt
|  d}dt
|  d}| d| d| d| d| d
}| j#|
 dt$||
| d |
}n|}t%||}|| jj	|< |S )N>   rO   rQ   rT   rS   rK   rP   rL   zat::vecz::Vectorized<ro   rX  FrY  Z_vecTr   r  r   c                 S   s   | | S r   rj   rj  rj   rj   rk   r  V	  r  z(CppVecKernel.reduction.<locals>.<lambda>rS   r6   )r  r  r  z4Welford reduction does not support VectorizedN (N>1)zwelford_vec_reduce_all(rw   z	{ return r5  rk  z; }r  zat::vec::vec_reduce_all<z([](z& x, z& y) rv   )&rg   re   rY   r  r{   r  r  r  r	  r\  rE   rq   reduction_acc_type_vecr  rH  r  r  rw  rl   reduction_init_vec	functoolsreducer  r  r  r  r   r  r  welford_weight_reciprocal_vecr(  reduction_combine_vecr  r+   r  r   r   r}   )r   ri   r   rh   r  r]  Zvec_nsr  r  Zacc_type_vecr   Zacc_vecZreduction_sizeZreduction_factorr   Zreduce_all_bodyZvec_reduce_all_funcr   r   rj   rj   rk   r_  /	  s    	


  
 



"
zCppVecKernel.reductionc           	   
      s   |  |}| j }tj }|jr.tjntj	}t
 }| j| jkrr|| dt| dt|  d| d nV||krt|  d| }|d| dt|  d| d |}|| |||| | j| fdd	 d S )
Nr  z] = static_cast<r3  ru  _rt  z = at::vec::convert<c                    s
   t  | S r   r  r:  r  rj   rk   r  	  r  z.CppVecKernel.store_reduction.<locals>.<lambda>)r  r  rT  r5   rP  rQ  Zis_floating_pointre   rY   r  r>   r  r  rw  rD   rE   ry  r  r   r  )	r   r  r   r  r~   Z	out_dtyperi   r{  Zconverted_valuerj   r  rk   r`  	  s"    
"zCppVecKernel.store_reduction)
scalar_varr   c                 C   s   |j r
t|jtjkr:| j| j|   d|j	 d}n4|jd k	sHt| j| j| 
|j d|j	 d}t|ts|t|j|_|j|_d|_ |S )Nr  rw   r  T)r  rg   ri   re   rW   r!  r  rx  r  r  r  r{   r  r  )r   r  r  rj   rj   rk   r  	  s     
 zCppVecKernel.broadcast)r   r  r   c              	   C   sb   |j r
t|jd k	st| j| j| |j d| d| d}t|tsPt|j|_d|_ |S )Nz	::arange(rv   rw   T)	r  rg   ri   r!  r  rx  r  r{   r  )r   r   r  r  rj   rj   rk   r  	  s    
zCppVecKernel.arangec                 C   s@   t | }| |}t|r&d| dS t||}| d| dS )Nrb   rc   r  rw   )r=   r  r+   rl   )r   rh   ri   rp   vec_typeZscalar_initrj   rj   rk   r  	  s    

zCppVecKernel.reduction_init_vecc                 C   s6   |dkst t| }| |}t|r2d| dS |S rm   )rg   r=   r  r+   )r   rh   ri   rp   r  rj   rj   rk   r  	  s    
z#CppVecKernel.reduction_acc_type_vecc                 C   s6   |rt | j|n| j}t|}d| | d| dS )Nzstatic WeightRecp<z> weight_recps(ru  )r   r  rD   r  )r   ri   r   Zvec_num_range_threadZvec_num_range_thread_exprrj   rj   rk   r  	  s    z*CppVecKernel.welford_weight_reciprocal_vecFc              	   C   s   |dkrd| d| dS |dkr4d| d| dS |dkrJ| d| S |d	kr`| d
| S |dkrv| d| S |dkr|rd| d| dS d| d| dS nR|dk rt |tr|\}}}nt||\}}}d| d| d| d| d	S td S )NrL   r  rv   rw   rK   r  rO   rr   rP   rs   rQ   rt   rS   rx   z, &weight_recps)rT   ry   rz   )r{   r|   r}   rU  )r   rh   r~   r   Zuse_weight_recpsr   r   r   rj   rj   rk   r  	  s(    

z"CppVecKernel.reduction_combine_vecc           	   	      s4  |rt dt|tst |jd k	s(t |js@t ||||S |}|}|rd| |j d| d}|r| |j d| d}|r|rd| d| d| d| d	}| d| d| }nF|r| d| }| d| }n$|st | d| }| d| }d| |j d| d}| j	 d| d| d	S )
Nz2do not support mask in indirect_indexing assertionr  rw   r  z) & (r  z)).all_masked()z, "index out of bounds: z"))
rg   r{   r  ri   r  r   rJ  r  r  r  )	r   r~   rD  rE  r  Zlower_scalarZupper_scalarZcondZ
cond_printr   rj   rk   rJ   
  s,    zCppVecKernel.indirect_assert)N)NN)N)N)F)N)'r   r   r   r  r  re   rY   r   r   r&  r   r  ri   rX   r  rZ   r  r  r  r  r&  r	   r  r>   r   r  r  r  rW  r_  r`  r  r  r  r  r  r  rJ  r   rj   rj   r   rk   r     s\    ,   
"
d
	

 
r   c                       s   e Zd ZdZeZ fddZdd Zdd Zdd	 Z	e
ejd
 fddZd fdd	Zdd Z fddZejejdddZ  ZS )r  an  
    A vector kernel that handles the 2d tiles with the tile size defined in `tiling_factor` on
    the inner-most loop level and one of the outer loop level (`outer_tiling_idx`). When the data
    tile is accessed in a contiguous way from the outer loop axis, a transposition is applied on the
    tile to make the access contiguous from the inner-most loop axis. Then, the same vectorization
    logic from its parent `CppVecKernel` is leveraged for load/store/compute. The transposed tile load
    and store are generated into kernel.preloads and kernel.poststores buffers.

    The loop structure looks like below:
    for ...
      for i_outer ...
        for ...
          for inner_most ...
            // generated by CppTile2DKernel
            float tmp0[16*16]; at::vec::transpose_mxn<...>(tmp0, in_ptr0 + ..., ...); // into kernel.preloads
            float tmp1[16*16]; // into kernel.preloads
            for i_inner ... { // the kernel inner loop
              vectorized loads/compute/stores (e.g., load tmp0, store tmp1) // into kernel.loads/compute/stores
            }
            at::vec::transpose_mxn(out_ptr0 + ..., tmp1, ...) // into kernel.poststores
          for inner_most ... (tail)
            // generated by CppVecKernel
            ...
      for i_outer ... (tail)
        for ...
          for ...
            // generated by CppKernel
            ...
    c                    s"   t  ||||d | || _d S r   )r   r   tiling_indices)r   r  r   r  r  r  r   rj   rk   r   ;
  s        zCppTile2DKernel.__init__c                 C   s   t | j| j  dS )Nr   )r.   r  	outer_idxr   rj   rj   rk   inner_itervarA
  s    zCppTile2DKernel.inner_itervarc                 C   sh   | j | j }| j | j }t||| j}t||| j}| jd kof|dkof||of|| of|| S r   )r  r  r  r   r  r/  r   )r   r   Z	outer_varZ	inner_varZouter_strideZinner_striderj   rj   rk   need_vec_transposeD
  s    


z"CppTile2DKernel.need_vec_transposec                 C   sN  t j|}| j}| dt| }d}tt|| j| j | j }	| }
|rb|| }}|
|	 }	}
d}dt|  d| d| d| d|	 d| d|
 d}|r| j	
 }n2|| j	jkr| j	j| j|d	d
}nd	}| j	j| }|rt|  d| d| d| d| d
}| j| |dt|}|r>| jt|| n| j| |S )Nrr   Z__place_holder__Tzat::vec::transpose_mxn<r  r3  rv   ru  FrY  r   r  rI   z] __attribute__ ((aligned (z)));)r5   rP  rQ  r  rD   r   r  r  rE   r!  rv  rq  r  r
  rw  r   rZ   r  r<   )r   r  r~   r   is_storeri   factorsrcdstZld_srcZld_dstZneed_defineZload_or_storetile_varZdefine_linerj   rj   rk   gen_transposed_tile_load_storeQ
  s2    

4&z.CppTile2DKernel.gen_transposed_tile_load_storerN  c                    s   t  }| j|}| |}|  }| |r| j|||dd}| dt|| j  }t	j
|}| |d|}	| j| j|	}
|
d||fi  t|
tstd|
_|
S | |}t ||S d S )NFr  rr   r   r  T)r  r  rO  r  r  r  r  rD   r  r5   rP  rQ  r  r!  r  rH  r  r{   r  rg   r  r  r   r  )r   r  r   r   r~   r   r  r  ri   r!  r  r   r   rj   rk   r  s
  s*    

   
zCppTile2DKernel.loadNc                    s  d|kst t }| j|}|  }| |}|d ks<t | |r| j|||dd}| dt|| j	  }	t
j|tkr| d|	 d| j	 d}
nBt
j|tjtjfkr| d|	 d| j	 d}
n| d|	 d}
| jt||
 n| |}t |||| d S )NrR  Tr  rr   r  rv   ru  )rg   r  r  rT  r  r  r  r  rD   r  r5   rP  rQ  rd   re   r  r  r(  rw  r<   r  r   rW  )r   r  r   r  rV  r   r~   r   r  Zstorebufr!  r   r   rj   rk   rW  
  s,    

   
zCppTile2DKernel.storec                 C   s2   |   }|d| d| d| j d| d	 d S )Nr  r  r  r[  r  )r  rw  r  )r   r{  r   rj   rj   rk   rr  
  s    z#CppTile2DKernel.codegen_inner_loopsc                    s:   t  ||}| jd | jk r$| jnt| j\| _| _|S r   )r   rd  r  r  reversedr  r  )r   r*  reduction_groupr  r   rj   rk   rd  
  s    
zCppTile2DKernel.set_rangesr8  c                 C   s   | j || j|  dS )Nr  )r7  r  r  r9  rj   rj   rk   r  
  s
    z"CppTile2DKernel.transform_indexing)N)r   r   r   r  r  r  r   r  r  r  rZ   r   r&  r  rW  rr  rd  r  r   rj   rj   r   rk   r  
  s   "
r  c                       s|   e Zd Zd fdd	ZdddZeejddd	Zdd
dZ	dd Z
ejejeedddZdd Zdd Zdd Z  ZS )CppVecKernelCheckerr4  c                    s   t  |||| t jd8  _d | _d| _g | _tj	 D ]\}}t
|tr<| j| q<t | _tjtjtjtjtjtjtjtjg| _d S )Nr6   T)r   r   r   generated_kernel_count_orig_wrapper_codesimd_vecfast_vec_listr  __dict__r  r{   r  r   rs  rt  
exit_stackre   rY   r8  r7  rW   r  r  r  r  supported_dtypes)r   r  r   r  r  kvr   rj   rk   r   
  s$    

zCppVecKernelChecker.__init__Nc                 C   s"   t tjrt d| d| _d S )NzDisabled vectorization: %sF)schedule_logisEnabledForloggingDEBUGdebugr  )r   msgrj   rj   rk   disable_vec
  s    zCppVecKernelChecker.disable_vecrN  c              
   C   s   t t}tj|}| }|s&t||_| j	 }t
| jdkr^| d |W  5 Q R  S || jkr|| j| j st|tjr| | d |W  5 Q R  S |W  5 Q R  S Q R X d S )Nr   
not a loopz not supported by load)r   r   r5   rP  rQ  r  rg   ri   r!  rv  r   r  r  r  r   r  r   r   rG  )r   r  r   node_ctx
load_dtyper   r~   rj   rj   rk   r  
  s"    




zCppVecKernelChecker.loadc              
   C   s   t t}t| jdkr4| d | jW  5 Q R  S tj|}|	 }|sPt
||_|| jkr| | d | jW  5 Q R  S d|kst
| |}|r| d|  | jW  5 Q R  S | jW  5 Q R  S Q R X d S )Nr   r  z not supported by storerR  zstore mode: )r   r   r   r  r  r  r5   rP  rQ  r  rg   ri   r  r  )r   r  r   r  rV  r  store_dtyper   rj   rj   rk   rW  
  s"    



zCppVecKernelChecker.storec                 C   sj   |t jkr|t jksL|t jkr0|t jkr0|tksL| d| d| d|  t|rdt| jgd S | jS )Nzreduction: dtype z, src_dtype z, reduction_type r   )re   rY   r  VECTORIZABLE_RTYPESr  r+   r|   r  )r   ri   r   rh   r  rj   rj   rk   r_    s     zCppVecKernelChecker.reductionrC  c                 C   s   | j S r   r  )r   r   r   rD  rE  rj   rj   rk   rM    s    z CppVecKernelChecker.check_boundsc                 C   s   | j S r   r  )r   r  r   r  rj   rj   rk   r`  !  s    z#CppVecKernelChecker.store_reductionc                 C   s,   | j d k	st| j tj_| j||| d S r   )r  rg   r5   rP  wrapper_coder  r   r   rj   rj   rk   r   $  s    
zCppVecKernelChecker.__exit__c                    sZ   t jj_t t j_t   G  fddd}jt |  jt 	 S )Nc                       s   e Zd Ze fddZeeejdfddZedfdd	Z	efd	d
Z
efddZeejejeedfddZefddZefddZedddZefddZedfdd	ZdS )z6CppVecKernelChecker.__enter__.<locals>.VecCheckerProxyc                    s    fdd}|S )Nc                     s>    j krd   t | |}tfdd|S )Nzop: c                    s    j S r   r  )r  r   rj   rk   r  >  r  zcCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.__getattr__.<locals>.inner.<locals>.<lambda>)r  r  r   pytreeZtree_map)r  r  Z
parent_val)r  parent_handlerr   rj   rk   r   9  s    
zQCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.__getattr__.<locals>.innerrj   )r  r   r  r   r  rk   r  7  s    zBCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.__getattr__rN  c                    s     | |S r   r  rN  r   rj   rk   r  B  s    z;CppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.loadNc                    s    j | |||dS )N)rV  )rW  )r  r   r  rV  r   rj   rk   rW  F  s    z<CppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.storec                    s     | |||S r   )r_  )ri   r   rh   r  r   rj   rk   r_  J  s    z@CppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.reductionc                    s     | ||S r   )r`  )r  r   r  r   rj   rk   r`  N  s    zFCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.store_reductionrC  c                    s     | |||S r   )rM  rC  r   rj   rk   rM  R  s    zCCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.check_boundsc              
      s   t t}| }|stttj}|tjkrb| |jkrb| |j	krbt
dd |jjD rbtj|_ttj}|tjkr| |jkr| |j	ks| tjks| tj krtj|_|j jkrȈ d|j  | W  5 Q R  S Q R X d S )Nc                 s   s   | ]}|j tkV  qd S r   targetr  r   userrj   rj   rk   r   e  s   zRCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.constant.<locals>.<genexpr>zconstant dtype: )r   r   r  rg   re   iinfor  r  rL   rK   r   r   usersri   Zfinforf   rV   infr  r  )r  ri   r  r   	i32_iinfoZ	f32_iinfor   rj   rk   r  X  s:    
	

z?CppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.constantc              
      s   t jt jkst fdd}tt}t jt jksHt| }|sXt|tjkr| rt	dd |j
jD rtj|_nd  d|  j }|W  5 Q R  S Q R X d S )Nc                     s   t j  fddtjjD } tdd |  D r@dS dd |  D }|rft|t krt	
t	j}jo|jko|jkS t|}t|jst|jrdS ttt|jt|jd S )	Nc                    s   i | ]\}}| kr||qS rj   rj   r   r  r  r  rj   rk   
<dictcomp>  s    zlCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.index_expr.<locals>.can_use_int32.<locals>.<dictcomp>c                 s   s   | ]}|d kV  qdS )r   Nrj   )r   r  rj   rj   rk   r     s     zkCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.index_expr.<locals>.can_use_int32.<locals>.<genexpr>Tc                 S   s2   i | ]*\}}t |tjr|jr|td |d qS )r   r6   )r{   r   r&  r  r   r  rj   rj   rk   r    s
     Fr6   )r   r  rz  r  r  rR   r1  r  r   re   r  r  r  rL   rK   r   r  ra  rD  rE  r!   r   rX   )sizesZvars_rangesr  Zexpr_ranges)r   r   r  rk   can_use_int32}  s2    


 zXCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.index_expr.<locals>.can_use_int32c                 s   s   | ]}|j tkV  qd S r   r  r  rj   rj   rk   r     s   zTCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.index_expr.<locals>.<genexpr>zindex_expr: z, dtype )r   r  r  rg   r   r   r  re   r  r   r   r  r  ri   r  r!  rv  )r   ri   r  r  r   Ztmp_varr   r   rk   r  y  s$    #


zACppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.index_exprTc                 S   s   t t| S r   )r.   rZ   )Z	index_varr   checkrj   rj   rk   indirect_indexing  s    zHCppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.indirect_indexingc                    s   |   j  S r   )r!  rv  )r  r  r  r   rj   rk   r    s    z=CppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.maskedc                    s   | j kr d|  | S )Nz
to_dtype: )r  r  r4  r   rj   rk   r6    s    
z?CppVecKernelChecker.__enter__.<locals>.VecCheckerProxy.to_dtype)N)T)N)r   r   r   r  r  rZ   r   r&  r  rW  r_  r`  rW   rM  r  r  r  r  r6  rj   r  rj   rk   VecCheckerProxy6  s4   
    9r  )
r5   rP  r  r  r    ZMockHandlerr  ru  Zset_ops_handlerset_kernel_handler)r   r  rj   r  rk   r   *  s    

 zCppVecKernelChecker.__enter__)r4  )N)N)r   r   r   r   r  rZ   r   r&  r  rW  r_  rW   rM  r`  r   r   r   rj   rj   r   rk   r  
  s   

   r  c                       sj   e Zd Z fddZdd ZedddZdd	 Zej	fd
dZ
dd Zee dddZdd Z  ZS )CppKernelProxyc                    s4   t  |j|jj || _d | _d | _t	 | _
d S r   )r   r   r  wsr   r   r   r  r   r  picked_vec_isar   r   r   rj   rk   r     s
    zCppKernelProxy.__init__c                 C   s&   |D ]}t |tstt| qd S r   )r{   r'   rg   r;   propagate_scheduler_node)r   nodesr   rj   rj   rk   data_type_propagation  s    z$CppKernelProxy.data_type_propagation)scheduler_nodec                 C   s   t |jtjsdS d }t| |jjgt|jj	  }|D ]}|j
jD ]}|jdksJ|jdkrdqJ|jdkrv  dS t|dr|jrtj|jkst|jtj }|jr|jtkr  dS |r||jkstdq|j}qJ  dS qJq>||_dS )NTplaceholder)	get_indexr  )r  rW  r;  r>  rT  Fr   z+scheduler node do not support bf16/fp16 mix)r{   _bodyr   LoopBodyr;   r  
root_blockr   	subblocksr1  rP  r  opr  r  r   rB   r   rg   ri   rd   _lowp_fp_type)r   r  r  
sub_blocks	sub_blockr   r   rj   rj   rk   is_lowp_fp_scheduler  s6    



z#CppKernelProxy.is_lowp_fp_schedulerc                    s  t jjddd tjd fdd}tfdd|D r|D ]t}|jjgt|jj	
  }|D ]P}|jjD ]B}|jd	krn|jsttj|jkst|jtj }|jtksntqnqbq@d S |D ]R}t|tstt|jtjst|}td
dd}	|	| }
|
r|j}|| qd S )N	sub_graphc              
      s  t jjddd}t jjddd}t| j}g  |D ]}||rtdd |jD rZq6|jd }| |@ | j	d	||t j
fd
}|j}|| ||_t jd7  _W 5 Q R X q6||rB|j\}}}	}
}	|
jdkrtdd |
jD rq6tj|}| |4 | j	d	||
|fd
}||
| t jd7  _W 5 Q R X q6|jdkr|j\}}}}}|tkr|t j
t jt jt jfkst||tkrt j
n|t j
||f|_q6|jd	kr6|jd tkr6|j\}}}	 | ||t j
f|_q6q6t jjd fdd}||  d S )Nr  c                 S   s:   | j dkrdS t| jdks ttj| jd }|tkS )Nr  Fr   r6   )r  r   r  rg   r5   rP  rQ  rd   )r   r  rj   rj   rk   is_lowp_fp_load  s
    
zTCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.is_lowp_fp_loadc                 S   s2   | j dkrdS | j\}}}}}tj|}|tkS )NrW  F)r  r  r5   rP  rQ  rd   )r   r  Z	store_varr  rj   rj   rk   is_lowp_fp_store  s
    
zUCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.is_lowp_fp_storec                 s   s   | ]}|j d kV  qdS rW  Nr  r  rj   rj   rk   r     s     zNCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.<genexpr>r   r6  r  r6   r  c                 s   s   | ]}|j d kV  qdS r  r  r  rj   rj   rk   r   )  s    r_  r4  r  c                    s"   t jjd fdd}||  d S )Nr  c                    s   t jjddd dd | jD } fdd|D }|D ]n}| D ]`\}| jkrDtfdd|D skrDtd	d |D rDjd
 }| |  qDq8| j	d kr| 
  d S )NZto_nodec                 S   s   t dd | jD S )Nc                 s   s   | ]}|j d kV  qdS )r6  Nr  r   usrrj   rj   rk   r   k  s     zCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to.<locals>.<genexpr>)r   r  r  rj   rj   rk   _used_by_toj  s    zCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_toc                 S   s   g | ]}|j d kr|qS )r6  r  r   rj   rj   rk   r   m  s    
 zCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<listcomp>c                    s   g | ]} |r||j iqS rj   )r  r   )r  rj   rk   r   p  s     c                 3   s"   | ]}|j d   j d  kV  qdS r4  Nr  r  r  rj   rk   r   v  s     zCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>c                 s   s   | ]}|j d  tkV  qdS r  )r  rd   r  rj   rj   rk   r   y  s    r4  )re   r  r  r  r  r   Zall_input_nodesreplace_all_uses_withZ
erase_nodeZowning_moduleZlint)r  Zall_to_nodesZall_to_nodes_and_usersZ
node_usersr  Zval_nodeZto_lowp_fp_legalized_nodes)r  r   rk   _eliminate_duplicate_to_nodec  s.    

	

	
z}CppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node)re   r  Graph)r  r"  r!  rj   rk   eliminate_to_dtypeb  s    )zWCppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype.<locals>.eliminate_to_dtype)re   r  r  r   r  r   r  r  Zinserting_afterZcall_methodrY   r   r   Zcpp_to_dtype_countr  r5   rP  rQ  Zinserting_beforeZreplace_input_withrd   r8  r7  r  rg   r   r#  )r  r  r  Zsub_graph_nodesr   r3   Zto_type_nodeZto_type_node_argsr  r  	value_varri   r   rh   r  r5  r$  rj   r!  rk   add_to_dtype  sx    


 


 

,z;CppKernelProxy.legalize_lowp_fp_dtype.<locals>.add_to_dtype)	loop_bodyc                    s.   | j gt| j  }|D ]} |j qd S r   )r  r   r  r1  rP  )r'  r  r  )r&  rj   rk   _legalize_lowp_fp  s    z@CppKernelProxy.legalize_lowp_fp_dtype.<locals>._legalize_lowp_fpc                 3   s"   | ]}t |to |V  qd S r   )r{   r'   r  r   r   r   rj   rk   r     s   z8CppKernelProxy.legalize_lowp_fp_dtype.<locals>.<genexpr>)r  rW  r  c                 S   s$   | j j}t|dko"d|ko"d|kS )Nr   r  rW  )Zread_writes	op_countsr   )r   r*  rj   rj   rk   is_memory_copy_scheduler_node  s    zLCppKernelProxy.legalize_lowp_fp_dtype.<locals>.is_memory_copy_scheduler_node)re   r  r#  r   r  r   r  r  r   r  r1  rP  r  r  r   rg   rB   r   ri   rd   r{   r'   )r   r  r(  r   r  r  Zfx_noder   r   r+  Zshould_legalizer  rj   )r&  r   rk   legalize_lowp_fp_dtype  s:     





z%CppKernelProxy.legalize_lowp_fp_dtypec              	      sX  t  t kstjtdd d\ fdd} fdd|t}tj j|jO  _tj j	|j	O  _	t
|_jsd S  fdd	tjftjd
fdd}tjjjddr ||\}}t |t |kstt |dkr||t|d |d |}	t jd7  _jj|d |d d\}
}|
|	 || d|
_d|_|d d |_nt |dkrJ|d t jd kr|d |d kst|t|d ||}|t|d |d |}	t jd7  _jj|d |d d\}}|| |j|d |d  |d d\}}|| ||	 W 5 Q R X d S )Nc                 S   s   t | d S r   r   )r  rj   rj   rk   r    r  z2CppKernelProxy.codegen_functions.<locals>.<lambda>r   c              
      sB    j | f| (}t jd8  _| |W  5 Q R  S Q R X d S r   )
new_kernelr   r  )r   r  r   )r   runrj   rk   codegen_kernel  s    z8CppKernelProxy.codegen_functions.<locals>.codegen_kernelc              
      s   |  \}}d}t D ]\}}|fttdffkrZ|rNt||| qd}|dfkstd| d d |   ||d W 5 Q R X qd S )NFrj   Tzunexpected group: r  rv   )rd  rz  r|   	itertoolschainrg   r  )r   r  Zreduction_varsZ	in_suffixfn	var_sizes)fn_listr*  r  var_sizes_listrj   rk   r/    s$    
z-CppKernelProxy.codegen_functions.<locals>.runc                    s  g }t  D ]6\}}tj|f| }|dd t|j|jD 7 }qt }g }t }t }|D ]}	|	jD ]}
t	
d|
js~qjt|	|
| }|dkrqjqj|dkr|t|
jdd   |t|
jdd   qjtdd |jD r|t|
jdd   qj|t|
jdd   qjq`|| | }t|dkrDtjd gS |rZt|dd  S ||@ | }t|}t|d	kr|d |kr|d tjd kr|S t||jd
dd  S )Nc                 S   s   g | ]
}|j qS rj   r  )r   deprj   rj   rk   r     s     zSCppKernelProxy.codegen_functions.<locals>.select_tiling_indices.<locals>.<listcomp>z^d\d+$r   r6   c                 s   s   | ]}t |tjV  qd S r   )r   r   ZSIZEr<  rj   rj   rk   r     s     zRCppKernelProxy.codegen_functions.<locals>.select_tiling_indices.<locals>.<genexpr>r4  r   r-  )rz  r   Zextract_read_writesr1  r2  ZreadsZwritesr  r  r#  r$  r  r   r   rX   r   r   r   r  sortedcount)r  Z	all_indexr3  r4  rwZcontig_varsZcontig_vars_listZnon_contig_stride_constZnon_contig_stride_otherr   r~   r  Zcontig_onlyZcontig_and_const_strideZcontig_vars_sorted)r5  r   r6  rj   rk   select_tiling_indices  sN     



z?CppKernelProxy.codegen_functions.<locals>.select_tiling_indicesr  c              
      s   j j| d}|}|rd}|D ]L}ttjjt ||*} | |oP|j}|sdW 5 Q R   qpW 5 Q R X q"|rt|dkr|g|fS t|dkr||g|fS g g fS )Nr  Tr6   r   )	r  r  r  r   r   r  r,   r  r   )ri   r  r  Z	could_vecZtiling_indiceZvec_checker)r/  r;  r   rj   rk   select_tiling  s,    


z7CppKernelProxy.codegen_functions.<locals>.select_tilingF)Zinplace_buffersr6   r   )r  Tr   )r   rg   r   rL   rd  r   r5   rP  removed_buffersZinplaced_to_remover  r  r   r  re   rY   ri   Z	_inductorr   patchr   r   Zgenerated_cpp_vec_kernel_countsplit_with_tiling
set_kernelr  simd_ompsimd_nelementsr  r  )r   r5  r6  r  r0  Zscalar_kernelr<  Ztiling_factorsr  Z
vec_kernel	main_loop	tail_loopZtile2d_kernelZouter_main_loopZouter_tail_loopZinner_main_loopZinner_tail_looprj   )r5  r*  r   r  r/  r;  r   r6  rk   codegen_functions  s    	+    


       

 
z CppKernelProxy.codegen_functionsc                 C   s$   |D ]}t | q| || d S r   )r;   Zpropagate_loopbodyrE  )r   Zloop_bodiesr6  r  rj   rj   rk   codegen_loop_bodies]  s    z"CppKernelProxy.codegen_loop_bodies)r  c                    s   |  | | | t|dks$t|d  t fdd|D rH jntj}dd fdd|D }d	d |D }| ||| d S )
Nr6   r   c                 3   s$   | ]}t |d o|j jkV  qdS )r  N)r  r  r)  )
first_noderj   rk   r   l  s   
z/CppKernelProxy.codegen_nodes.<locals>.<genexpr>c                 W   s4   |    |   ttjtr&| j| S | |S d S r   )decide_inplace_updatemark_runr{   r5   r   r2   r  Zcodegen)r   Z
index_varsrj   rj   rk   r3  t  s
    
z(CppKernelProxy.codegen_nodes.<locals>.fnc                    s   g | ]}t  |qS rj   )r  partialr   )r3  rj   rk   r   |  s     z0CppKernelProxy.codegen_nodes.<locals>.<listcomp>c                 S   s   g | ]}|j d  qS )r6   )r*  r   rj   rj   rk   r   }  s     )	r,  r  r   rg   r   r  re   rY   rE  )r   r  r  r5  r6  rj   )rG  r3  rk   codegen_nodesc  s    


zCppKernelProxy.codegen_nodesc                 C   s   |  | j|| d S r   )r  r   )r   r{  r  rj   rj   rk   r    s    zCppKernelProxy.codegen_loops)r   r   r   r   r  r'   r  r,  re   rY   rE  rF  r   rK  r  r   rj   rj   r   rk   r    s   1 6 #r  c                       s*   e Zd Z fddZedddZ  ZS )r   c                    s   t  |j|jj g | _d S r   )r   r   r  r  r   r   r  r   rj   rk   r     s    zOuterLoopFusedKernel.__init__r  c                    sv   g }dd | j D }|D ]N}|d j  d k	s2tt fdd|D sLt||d t | qt|t|S )Nc                 S   s   g | ]}|  qS rj   )rn  )r   rj  rj   rj   rk   r     s    z>OuterLoopFusedKernel.decide_parallel_depth.<locals>.<listcomp>r   c                 3   s   | ]}|j  kV  qd S r   r  r   rL  rj   rk   r     s     z=OuterLoopFusedKernel.decide_parallel_depth.<locals>.<genexpr>)	r   r  rg   r   r   r  r   rK   rL   )r   r  r  Zkernels_parallel_depthZnested_kernelsro  rj   rL  rk   r    s    
z*OuterLoopFusedKernel.decide_parallel_depth)r   r   r   r   rX   r  r   rj   rj   r   rk   r     s   r   c                   @   s   e Zd ZdZdZdZdS )ReasonFusedNodesZsame_vars_reduceZcompatible_reductionZcompatible_ranges_no_reductionN)r   r   r   SAME_VARS_REDUCECOMPATIBLE_REDUCTIONCOMPATIBLE_RANGES_NO_REDUCTIONrj   rj   rj   rk   rM    s   rM  c                   @   s   e Zd ZdZdd ZedddZdd Zd	d
 Zdd Z	e
e dddZdd Zdd Zdd Zdd Zdd Zdd Zdd Zeeeef ddd Zeed!d"d#Zeee d$d%d&Zd'd( Zd)d* Zd+d, Zd2d.d/Z d0d1 Z!d-S )3CppSchedulingi  c                 C   s   || _ |   d| _d S r  )r   reset_kernel_group_ready_to_flush)r   r   rj   rj   rk   r     s    zCppScheduling.__init__)statusc                 C   s
   || _ d S r   rS  )r   rT  rj   rj   rk   _set_flush_status  s    zCppScheduling._set_flush_statusc                 C   s   t dd |D S )Nc                 s   s"   | ]}t ttjjj|V  qd S r   )r|   r  r5   rP  rh  r   r<  rj   rj   rk   r     s     z)CppScheduling.group_fn.<locals>.<genexpr>)r|   )r   r  rj   rj   rk   group_fn  s    zCppScheduling.group_fnc                 C   s4   ddl m} |  ttjj|r(t | _nt | _d S )Nr6   )CppWrapperCpu)	Zcpp_wrapper_cpurX  r{   r5   rP  r  CppWrapperKernelGroupr   KernelGroup)r   rX  rj   rj   rk   rR    s
    
z CppScheduling.reset_kernel_groupc                    s|  |  s|  rt||S | r<| r0tt||S | ||tjkrHt	|t
tfsbtt	|t
tfstt|j\}\}}|j\}\}}|dkr|dkst||f fdd t|t|k r|n|}t	|t
stt|t|k r|n|}	 |	}
|j|
d |j\}\}}|j\}\}}||ks<t||ft||S | ||rlt||| ||S t||S d S )Nrj   c           	         s   t | tr~t| jdks"t| jd }t }| jD ]>} |\}}|d krN|}||ksft||| jf|| q2|t|fS t | tst| j	}t |t
jst| \}}}|jt|j fS d S Nr   )r{   r%   r   snodesrg   r  r  r   r'   r   r   ComputedBufferZget_default_sizes_bodyrB  indexing_exprsr1  )	r   rB  r^  snoder  ZexprsZcomp_bufferr  r  get_indexing_ranges_exprsrj   rk   ra    s     

z5CppScheduling.fuse.<locals>.get_indexing_ranges_exprs)extra_indexing_constraints)Z
is_foreachr$   r   is_templaterg   r%   _why_fuse_nodesrM  rP  r{   r'   r*  r   Zrecompute_size_and_bodycan_fuse_vertical_outer_loopr   _get_outer_loop_fusion_depth)r   r   r   r  vars1reduce1vars2reduce2node_to_recompref_noderb  rj   r`  rk   r     sB    
  
zCppScheduling.fuser  c                 C   sb   |j \}\}}|j \}\}}||kr2||kr2tjS |dkrL||| krLtjS | ||r^tjS d S )Nrj   )r*  rM  rN  rO  &_can_fuse_nodes_with_compatible_rangesrP  )r   r   r   r  rg  rh  ri  rj  rj   rj   rk   rd    s    zCppScheduling._why_fuse_nodesc                 C   s  |j \}\}}|j \}\}}|dko*|dk}t|t|k}	t|dkpVt|dk}
|rd|	rd|
shdS t|t|k r||n|}t|t|k r|n|}t|trdS t|tstt|jt	j
rdS t|jt	jst|jj }d }t|trlt }|jD ]F}t|jt	j
r qHt|jt	js.t|t|jj  q t|dkrZdS ttt|}n0t|ts|tt|jt	jst|jj }||krdS dS )Nrj   r6   FT)r*  r  rP   r   r{   r%   r'   rg   r   r   ZTemplateBufferr]  dataget_sizer  r\  r   r|   r   nextiter)r   r   r   r  rg  rh  ri  rj  c1c2c3rk  rl  Zranges2Zranges1Z
ranges_setr_  rj   rj   rk   rm    sB    


z4CppScheduling._can_fuse_nodes_with_compatible_rangesc                 C   sN   t |ttfstt |ttfs$ttdd ||fD r>dS | ||d k	S )Nc                 s   s   | ]}t |tV  qd S r   )r{   r   r   rj   rj   rk   r   @  s    z:CppScheduling._can_fuse_horizontal_impl.<locals>.<genexpr>F)r{   r%   r'   rg   rR   rd  r   r   r   rj   rj   rk   _can_fuse_horizontal_impl=  s    z'CppScheduling._can_fuse_horizontal_implc                 C   sD   |  s|  rdS t| t|  tjjkr8dS | ||S r  )rc  r   r   r   r   Zmax_horizontal_fusion_sizerv  ru  rj   rj   rk   can_fuse_horizontalF  s    z!CppScheduling.can_fuse_horizontalc                 C   sd  d}t dd ||fD s|S t|tr4| d n|}t|ttfsJtt|tr`| d n|}t|ttfsvt|j\}\}}|j\}\}	}
|dkr|	dkr|dkr|
dkr|S t dd ||fD r|j|jkr|jS |S t	t
|t
|	}|dkr`|d | |	d | kr`tdd ||fD r\t|tkrB|n|}|j|krV|S |S n|S |S )	Nr   c                 s   s    | ]}t |tttfkV  qd S r   )r   r   r%   r'   r   rj   rj   rk   r   S  s   z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>r4  rj   c                 s   s   | ]}t |tkV  qd S r   r   r   rj   rj   rk   r   l  s     r6   c                 s   s   | ]}t |tkV  qd S r   r   r   rj   rj   rk   r   w  s    )r   r{   r   r   r%   r'   rg   r*  r   rK   r   rR   r   )r   r   r   ZDISABLE_OUTER_LOOP_FUSIONZ_node1Z_node2r  rg  rh  ri  rj  r   Z_compare_noderj   rj   rk   rf  Q  sN     

z*CppScheduling._get_outer_loop_fusion_depthc                 C   sJ   |   oH|   oH| |j@ oH| ||o6|   oH| ||dkS r   )rc  	get_namesZ	ancestorsrv  r  rf  ru  rj   rj   rk   re    s    
z*CppScheduling.can_fuse_vertical_outer_loopc                 C   s   |  ||rdS dS d S )Nr6   r   )re  ru  rj   rj   rk   get_fusion_pair_priority  s    z&CppScheduling.get_fusion_pair_priorityc                 C   s@   |  rdS |  r|  S | ||r4|  p>| ||S r  )rc  r  rv  re  ru  rj   rj   rk   can_fuse_vertical  s    

zCppScheduling.can_fuse_verticalr  c                 C   s   | j }t|trg }g }| D ]D}t|ttfs6t| }t|}|	| |
| |
| q |||jr||}||dd |D  qt||D ]\}	}||	| qn&| }
t|}|	|
 |||
 |  }|tjkr| d dS )zC
        Turn an set of pre-fused nodes into a C++ kernel.
        c                 S   s   g | ]}|D ]}|qqS rj   rj   )r   _nodesr   rj   rj   rk   r     s       z.CppScheduling.codegen_node.<locals>.<listcomp>TN)r   r{   r   r   r%   r'   rg   r   r  rK  r   r   r   r   finalize_kernelrz  _get_scheduled_num_argsrQ  MAX_FUSED_KERNEL_ARGS_NUMrV  )r   r   r   r   Z
nodes_listr   r{  Zcpp_kernel_proxyZouter_fusion_cpp_kernel_proxyZ_kernel_proxyr  args_numrj   rj   rk   codegen_node  s>    


 

zCppScheduling.codegen_noder  c                 C   s   t |tot |jtjS r   )r{   r'   r   r   CppTemplateBuffer)r   r   rj   rj   rk   is_cpp_template  s     zCppScheduling.is_cpp_template)template_nodeepilogue_nodesc              	   C   s(  t d d  t|7  < | |s*tdtt|}|j\}\}}|dksNtttj|j	}dd |D }t
dd |D std	|j||d
\}}|, |f|D ]}	|	  |	  q| }
W 5 Q R X t|  |f|}| |
||j}W 5 Q R X ||| tj j|jO  _| j  dS )zG
        Codegen a CPP template, possibly with fused epilogues
        ZinductorZcpp_epilogue_fusion_counterzlTemplate node passed to CppScheduler.codegen_template must be a SchedulerNode that wraps a CppTemplateBufferrj   c                 S   s   g | ]
}|j qS rj   r  ra  rj   rj   rk   r     s     z2CppScheduling.codegen_template.<locals>.<listcomp>c                 s   s   | ]}t |tjV  qd S r   )r{   r   r]  ra  rj   rj   rk   r     s    z1CppScheduling.codegen_template.<locals>.<genexpr>z9Epilogue nodes must all be instances of ir.ComputedBuffer)r  N)r   r   r  rg   r   r'   r*  r   r  r   r   Zmake_kernel_renderrH  rI  r5   r   define_kernelr  call_kernelrP  r=  r   Zfree_buffers)r   r  r  r  ZrnumelZctbZepilogue_ir_nodesr   renderr   src_codeZnode_schedulekernel_namerj   rj   rk   codegen_template  s8    



zCppScheduling.codegen_templatec                 C   s
   | j  S r   )r   get_num_argsr   rj   rj   rk   r}    s    z%CppScheduling._get_scheduled_num_argsc                 C   s   | j S r   rU  r   rj   rj   rk   ready_to_flush  s    zCppScheduling.ready_to_flushc                 C   s   d S r   rj   r   rj   rj   rk   codegen_sync  s    zCppScheduling.codegen_syncNc                 C   s   t jj}tjjrt|tjjnd}dd|| g}t jj	rB|nd}|
ttj|}|
ttj|}|
dd}t }|d kr| jjn|}	|	 \}
}
}t jj	s|d|d |j|d	d
 t jj	s|d |j|| dd |S )Nr   r  r   r   z#pragma CMTz//zasync_compile.cpp_pybinding(z, '''T)stripz''')F)cuda)r5   rP  r  r   r   Zdescriptive_namesr*   joinZnext_kernel_suffixZcpp_wrapperr   rZ   r-   KERNEL_NAMEDESCRIPTIVE_NAMEr>   r   r  cpp_argdefsrw  ry  r  getvalue)r   r  r  Zkernel_argsr  Z
fused_namer  kernel_decl_nameZcompile_wrapperr  r  	arg_typesrj   rj   rk   r    s(    
zCppScheduling.define_kernelc                 C   sF   | j  }|r0| || j j}| j tjj| |   | 	d d S r  )
r   codegen_groupr  scheduled_nodesr  r5   rP  r  rR  rV  )r   r  r  rj   rj   rk   flush"  s    
 zCppScheduling.flush)N)"r   r   r   r~  r   rW   rV  rW  rR  r   r	   rM  rd  rm  rv  rw  rf  re  ry  rz  r   r   r%   r'   r  r"   r  r
   r  r}  r  r  r  r  rj   rj   rj   rk   rQ    s2   	;8	64"
rQ  c                       sL   e Zd Z fddZdd Zdd Zdd Zded
ddZdd Z	  Z
S )rZ  c                    sH   t    t | _t | _t| j| _t	 | _
| j
| j g | _d S r   )r   r   r@   r  r7   
loops_codeWorkSharingr  rs  rt  rv  ru  r  r   r   rj   rk   r   .  s    

zKernelGroup.__init__c                 G   s   || j t f| S r   )r  r,   )r   r   r  rj   rj   rk   r.  7  s    zKernelGroup.new_kernelc                 C   s*   |  j |7  _ | j}| j}||| d S r   )r  r  r  r  )r   r.  r  r{  r  rj   rj   rk   r|  :  s    zKernelGroup.finalize_kernelc                 C   s   | j  \}}}t|}|S r   )r  r  r   )r   arg_defs	call_argsr  r  rj   rj   rk   r  @  s    zKernelGroup.get_num_argsNr  c              	   C   sD  | j   | jsdS t }tjjo*tjdk}|r<|	dg |
t  |d kr\ttjn|}|d krrttjn|}| j \}}}dd|}|
d| d| d |  |rtjj}|d k	rd	t| d
 nd}	|	d|	|  dg | j D ]"\}
}|
d|
 d| d q|| j W 5 Q R X | S )Nr   linuxz!#include <ATen/record_function.h>z,
   zextern "C" void r  rw   Zgraph_r  zRECORD_FUNCTION("z#", c10::ArrayRef<c10::IValue>({}));rt  r  r   )rv  r}  r  r7   r   r   enable_kernel_profilesysplatformr  rw  r   Z
cpp_prefixrZ   r-   r  r  r  r  ljustr  r  r5   rP  graph_idaliasesry  r  r  )r   r  r{  r  r  r  r  r  r  r   oldnewrj   rj   rk   r  E  s4    

zKernelGroup.codegen_groupc                 C   s&   | j  \}}}|j||d|d d S )NF)r  r  )r  r  Zgenerate_kernel_call)r   r  r  r  r  r  rj   rj   rk   r  i  s       zKernelGroup.call_kernel)N)r   r   r   r   r.  r|  r  rZ   r  r  r   rj   rj   r   rk   rZ  -  s   	$rZ  c                       s   e Zd Z fddZ  ZS )rY  c                    s   t    t | _d S r   )r   r   r8   r  r   r   rj   rk   r   q  s    
zCppWrapperKernelGroup.__init__)r   r   r   r   r   rj   rj   r   rk   rY  p  s   rY  c                   @   s<   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd ZdS )r  c                 C   s    || _ d| _d | _t | _d S r  )r{  in_parallelr   rs  rt  rv  )r   r{  rj   rj   rk   r   w  s    zWorkSharing.__init__c                 C   sv   | j r|| jkr|   | j sr|| _d| _ tjjr@| jd n| jd| d | j	| j
  | jd d S )NTz#pragma omp parallelz!#pragma omp parallel num_threads(rw   zint tid = omp_get_thread_num();)r  r   r}  r   r   r   r{  rw  rv  ru  r  )r   r  rj   rj   rk   rl  }  s    zWorkSharing.parallelc                 C   s   | j r| jd | j S )Nz#pragma omp single)r  r{  rw  r   rj   rj   rk   r    s    zWorkSharing.singlec                 C   s   | j   d| _d S r  )rv  r}  r  r   rj   rj   rk   r}    s    
zWorkSharing.closec                 C   s   | j   | S r   )rv  r   r   rj   rj   rk   r     s    
zWorkSharing.__enter__c                 C   s   | j ||| d S r   )rv  r   r   rj   rj   rk   r     s    zWorkSharing.__exit__N)	r   r   r   r   rl  r  r}  r   r   rj   rj   rj   rk   r  v  s   r  c                   @   s(  e Zd ZU dZeej ed< dZeej ed< e	dZ
ejed< e	dZejed< dZeed< d	Zeed
< d	Zeed< d	Zeed< d	Zeed< dZed  ed< ejedZed  ed< dZee ed< dd Zee dddZdd ZedddZed  dddZdd Z d d! Z!d"d# Z"dS )$r   Nr~   r   r   r   r6   r   rl  FrA  r  	collapsedr  parent)default_factoryr   r   c                 C   s   t  }|r| nd| _d S r[  )r   r  r  rB  )r   r  rj   rj   rk   __post_init__  s    	zLoopLevel.__post_init__r  c                 C   s.   | j r| j gS g }| jD ]}|| 7 }q|S z,Get all kernel objects under this loop level)r   r   rn  r   ro  rj  rj   rj   rk   rn    s    
zLoopLevel.get_kernelsc                 C   s   | }|j r|j }q|S r  r  )r   r   rj   rj   rk   rk    s    zLoopLevel.get_rootr   c                 C   sF   | j s || _| }|dk	stdS t| j dks2t| j d | dS )zj
        Set the kernel under this loop level. No split is allowed under
        this loop level.
        Nr6   r   )r   r   rg   r   r@  )r   r   rj  rj   rj   rk   r@    s    zLoopLevel.set_kernelc                 C   s8   |dkr| gS g }| j D ]}|||d 7 }q|S d S )Nr   r6   )r   get_loops_atr   r  rz  rj  rj   rj   rk   r    s    
zLoopLevel.get_loops_atc                    s   fdd  fdd}|dkrV| \}}j }|rN||g|_||_ ||_ ||fS tjdkshtjd |d S d S )Nc                     s(   g }  j r$ j D ]}| |  q| S r   )r   r   r  )r   rj  r   rj   rk   clone_inner  s
    
z0LoopLevel.split_with_tiling.<locals>.clone_innerc                     s   t } tj| |  }tj|}| |_j|_d|_j	|_	  |_
|j
rb|j
D ]
}||_qVtjj}||_j|_d|_j	|_	  |_
|j
r|j
D ]
}||_q||fS r  )r   Integerr   r   r   r~   r   rl  r  r  r   r  r   )Zsympy_factorr   rC  rj  rD  r  r  r   rj   rk   do_split_with_tiling  s*    


z9LoopLevel.split_with_tiling.<locals>.do_split_with_tilingr   r6   )r  r   r   rg   r?  )r   r  r  r  rC  rD  r  rj   r  rk   r?    s    

zLoopLevel.split_with_tilingc                 C   sJ   t | }g |_| jr:| jD ]}| }||_|j| qt| j|_|S r   )r   r   r  r  r   r   r   )r   rj  Z
inner_loopZinner_loop_clonerj   rj   rk   r    s    
zLoopLevel.clonec           	      C   s0  t | j}t | j}tjjr(||kr(d S | jrF| jdkrFd| j dnd}| jrd}| jdkrp|d| j d7 }| jr|	dd| }n6| j
rd}n*| jrd	| }n| jst rd
}nd}t d| j d| }| j d| }| j dt | j }d| d| d| d}| js"|s(|gS ||gS )Nr6   zsimd simdlen(z) r   z#pragma omp forz
 collapse(rw   z for z#pragma omp z#pragma GCC ivdepr   =<z+=zfor(r[  )rD   r   r   r   r   Zno_redundant_loopsrA  rB  rl  r   r  r  r   r  rF   r~   r   r  )	r   Zoffset_exprZ	size_exprZsimdline1Z
offset_strrL  Z	steps_strline2rj   rj   rk   r    s<    


zLoopLevel.lines)#r   r   r   r~   r	   r   r&  __annotations__r   r  r   r   rl  rX   rA  rW   r  r  r  r  dataclassesfieldr   r   r   r   r   r  rn  rk  r@  r  r?  r  r  rj   rj   rj   rk   r     s(   
		.r   c                   @   s   e Zd ZU dZdZeee  ed< dZ	ee
 ed< ee
dddZdd	 Zee d
ddZedd Zdd Zdd Zdd Zee
 d
ddZdS )r  a  
    A loop-nest like structure but with some loop level split along
    the loop range into the main tiling loop and the tail. It is built
    with the `build` method as a loop nest and then split with
    `split_with_tiling` at some depth.

    A typical case is for vectorization where we typically split at the inner-most
    loop level. A more complicated case is 2D tiling where we split at
    both inner-most and outer levels.
    Nr   r   r  c                 C   s   | j }| j}| j}|dk	stg }|}d}tt||D ]:\}\}}	t||	|d}||krb| j|_|| |j	}q8t
|}
|r| |_n| |
_|
S )z4Build a LoopNest with the given `kernel` as the leafNr  )r  r  r  rg   r'  rz  r   r  r   r   r  r   )r   r  r  r  r   Zlevelsrj  Zloop_idxr~   r   r   rj   rj   rk   r  M  s$    
zLoopNestWithSplit.buildc                 C   s
   t | jS r   )rW   r   r   rj   rj   rk   __bool__e  s    zLoopNestWithSplit.__bool__r  c                 C   s0   g }| j dk	st| j D ]}|||7 }q|S )zJGet all the loop levels at the given `depth` (most outer loop has depth 0)N)r   rg   r  r  rj   rj   rk   r  h  s
    
zLoopNestWithSplit.get_loops_atc                 C   sl   d}| j dk	st| j }t|dkr(dS |r6|d jnd}t|dkrh|d j|krh|d7 }|d j}q:|S )z
        Maximal allowed depth for parallelism:
        1) Levels without splitting and
        2) All reduction or non-reduction levels
        When the loop is split at the top level, the max depth is 1.
        r   Nr6   F)r   rg   r   r  r   )r   	max_depthrz  r  rj   rj   rk   r  p  s    z$LoopNestWithSplit.max_parallel_depthc                 C   s$   | j dk	o"t| j dko"| j d jS )zr
        Whether all the loops are for reduction. Reduction loops
        are always the inner most ones.
        Nr   )r   r   r  r   rj   rj   rk   r|    s    "z#LoopNestWithSplit.is_reduction_onlyc                 C   s`   ||   kstd| jd k	s"t| j}|D ]
}||_q,td|D ]}|d j}d|d _qBd S )Nz?Parallel depth cannot exceed the maximal allowed parallel depthr6   r   T)r  rg   r   rl  r   r   r  )r   r  rz  rj  r+  rj   rj   rk   r    s    

zLoopNestWithSplit.mark_parallelc                 C   s<   |  |}t|dkst|d d|}|dkr8|| _|S )a  
        Split the loop into main and tail loops at given `depth` so that the range
        of the main loop has range `floor_div(range, factor) * factor` and
        the tail loop handles the remainder. The main loop is tiled
        according to the `factor`.
        r6   r   )r  r   rg   r?  r   )r   r  r  rz  Zsplit_loopsrj   rj   rk   r?    s    
z#LoopNestWithSplit.split_with_tilingc                 C   s<   | j r| j gS g }| jdk	s t| jD ]}|| 7 }q&|S )z+Get all kernel objects under this loop nestN)r   r   rg   rn  r  rj   rj   rk   rn    s    
zLoopNestWithSplit.get_kernels)r   r   r   r  r   r	   r   r   r  r   r   r  r  r  r  r(   r  r|  r  r?  rn  rj   rj   rj   rk   r  =  s   

	r  )rs  r  r  r1  r  r  r#  r  r   r   enumr   typingr   r   r   r   r	   r
   r   r   r   r   re   Ztorch.fxZtorch._inductorr   Ztorch._prims_commonr   Ztorch.utilsr   r  Ztorch.utils._sympy.functionsr   r   r   Ztorch.utils._sympy.symbolr   r   r   Ztorch.utils._sympy.value_rangesr   r   Z_dynamo.utilsr   r   r   r   r   r   Zcodegen.wrapperr    Zoptimize_indexingr!   r   r"   r#   r$   r%   r&   r'   utilsr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   Zvirtualizedr2   r3   r4   r5   commonr7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   Z	cpp_utilsrC   rD   rE   rF   rG   Z_loggingZgetArtifactLoggerr   r  ZNATIVE_OMP_RTYPESZRTYPE_TO_CPPr  ZPYTHON_TO_CPPZCONTAINER_PYTHON_TO_CPPr8  r7  rd   r  rl   rq   r   r}   r   r   r   r   	lru_cacher&  r   r   rX   r   r   r   r   r  r  r  r  r  r'  Z_initialize_pointwise_overridesr  r  r   r   r  r  r  r   rM  rQ  rZ  rY  r  	dataclassr   r  rj   rj   rj   rk   <module>   s  , 08	> ,!A  z
    
    )     "     @   C( 