U
    yh0                    @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZmZ d dlmZmZm Z  ddl!m"Z" dd	l#m$Z$m%Z%m&Z& dd
l'm(Z( ddl)m*Z*m+Z+m,Z,m-Z- ddl%m.Z. ddl/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5m6Z6 ddl&m7Z7m8Z8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZA ddlBmCZCmDZDmEZE ddlFmGZGmHZHmIZImJZJ ddlKmLZL eMeNZOejPQeNdZRejPQeNdZSejPQeNdZTeJ jUZVejWG dd dZXG dd deXZYG dd deXZZdd  Z[G d!d" d"eIZ\G d#d$ d$e8Z]ejWG d%d& d&Z^G d'd( d(Z_G d)d* d*Z`G d+d, d,eaZbdS )-    )annotationsN)AnyCallableCounterDefaultDictDictIterableListOptionalSequenceSetTupleUnion)FloorDivModularIndexing)free_symbol_is_typesymbol_is_typeSymT   )counters   )configir	scheduler)	code_hash)Dep	MemoryDepStarDepWeakDep)TritonTemplateBuffer)!indexing_dtype_strength_reduction)ReductionHintTRITON_MAX_BLOCK)
green_textyellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)get_dtype_sizeIndentedBufferPlaceholdersympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernelZ
perf_hintsZscheduleZfusionc                	      sN   e Zd ZdZededdddddddd	d
 fddZdd Z  ZS )IterationRangesa  
    Each range tree represents multiple sets of iteration indexing
    in a single tiled dimension in the output kernel.

    If you have two loops ranges one (4, 3, 2) and another (4, 6),
    then the range tree will be:
            4 (i0)
        3 (i1)  6 (i3)
        2 (i2)
    Where i0 is shared between both loops, but then the split into
    different indexing vars.  All loop ranges must iterate over
    the same number of elements.
    r2   )divisorlengthstrzList[sympy.Symbol]zDict[sympy.Symbol, sympy.Expr]
sympy.Expr
SIMDKernelIterationRangesRootnamevar_list
var_rangesnumelprefixkernelrootc          
        sD   t    || _|| _|| _|| _|| _|| _|| _|| _	|	| _
d S N)super__init__r@   rA   rB   rC   rD   r9   r:   rE   rF   )
selfr@   rA   rB   rC   rD   rE   r9   r:   rF   	__class__ N/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/codegen/simd.pyrI   S   s    
zIterationRanges.__init__c                 C  s
   t | jS rG   )r+   r@   rJ   rM   rM   rN   symbolk   s    zIterationRanges.symbol)	__name__
__module____qualname____doc__sympyIntegerrI   rP   __classcell__rM   rM   rK   rN   r8   C   s
   "r8   c                      sv   e Zd Zddddddddddd	 fd	d
Zdd Zdd Zdd ZddddZddddZddddZ	  Z
S )r>   Nr;   r<   intr=   boolzOptional[int])	r@   rC   rD   indexrE   is_loop
tensor_dimgrid_dimhas_zdimc             	     sl   |d kri }t  j|g i |||| d || _i | _|| _|rP|dkrL|	d ksPt|| _|| _|	| _|
| _	d S )Nr?   r)
rH   rI   rZ   nodes	pid_cacheAssertionErrorr[   r\   r]   r^   )rJ   r@   rC   rD   rZ   rE   ra   r[   r\   r]   r^   rK   rM   rN   rI   p   s&    	zIterationRangesRoot.__init__c                 C  s   d| j d| j dS )NzIterationRangesRoot(, z, ...))r@   rC   rO   rM   rM   rN   __repr__   s    zIterationRangesRoot.__repr__c                 C  s   | j  D ]}|  q
d S rG   )r`   valuescache_clear)rJ   noderM   rM   rN   rf      s    zIterationRangesRoot.cache_clearc                 C  s   t jj|| | jr.tt| j d|}ntt| j d||}|| j	krt
| j tt jj |||| }|t jj| < | j|  || j| < || j	|< | j	| S )zF
        Lookup a given RangeTreeEntry, creating it if needed
        rZ   )r1   graphsizevarsstatically_known_equalsrC   r   r+   rD   r   r`   IterationRangesEntrynextrE   iter_vars_countrange_tree_nodesrP   rA   appendrB   )rJ   r9   r:   exprrg   rM   rM   rN   lookup   s(      

zIterationRangesRoot.lookupzList[sympy.Expr]lengthsc                 C  sB   t d}g }t|D ]}|| || || }qtt|S Nr2   )rU   rV   reversedro   rq   list)rJ   rs   r9   itervarsr:   rM   rM   rN   construct_entries   s    

z%IterationRangesRoot.construct_entriesc                 C  s   dd |  |D S )Nc                 S  s   g | ]}|  qS rM   )rP   ).0erM   rM   rN   
<listcomp>   s     z1IterationRangesRoot.construct.<locals>.<listcomp>)rx   rJ   rs   rM   rM   rN   	construct   s    zIterationRangesRoot.constructrZ   c                   s   dd |j D }fdd|D }|jdd d td g g  fdd	}|D ]<}tjj|j s|	 t
|j  |j || qXtjjj s|	 t
j  ttttfS )
z,Figure out vars from this tree used in indexc                 S  s   g | ]}t jj|qS rM   )r1   rE   rn   getry   srM   rM   rN   r{      s     z6IterationRangesRoot.vars_and_sizes.<locals>.<listcomp>c                   s    g | ]}|r|j  j kr|qS rM   rD   ry   nrO   rM   rN   r{      s       c                 S  s   t jj| jS rG   )r1   rh   ri   	size_hintr9   xrM   rM   rN   <lambda>       z4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>keyr2   c                   s(    |    | j  | j  d S rG   )ro   rP   r:   rg   )r9   
index_varssizesrM   rN   add   s    z/IterationRangesRoot.vars_and_sizes.<locals>.add)free_symbolssortrU   rV   r1   rh   ri   rj   r9   rq   r   rC   rv   ru   )rJ   rZ   r`   r   rg   rM   )r9   r   rJ   r   rN   vars_and_sizes   s    

z"IterationRangesRoot.vars_and_sizes)N)rQ   rR   rS   rI   rd   rf   rq   rx   r}   r   rW   rM   rM   rK   rN   r>   o   s   	 $+r>   c                      sb   e Zd Zdddddd fddZdd Zd	d
 Zdd Zdd Zdd Zdd Z	dd Z
  ZS )rk   r;   r<   r8   )r@   r9   r:   rp   parentc                   sP   t  j||j| |j|j|j|||j|jd	 || _t	
d | j| _|| _d S )N)	r@   rC   rA   rB   rD   r9   r:   rE   rF   )rH   rI   rC   rA   rB   rD   rE   rF   r   	functools	lru_cache_codegencodegenrp   )rJ   r@   r9   r:   rp   r   rK   rM   rN   rI      s    zIterationRangesEntry.__init__c                 C  s.   d| j  d| j d| j d| j d| j dS )NzIterationRangesEntry(rc   ))r@   r9   r:   rp   rB   rO   rM   rM   rN   rd      s    zIterationRangesEntry.__repr__c                   s$    fdd| _ dd | j _ | _d S )Nc                     s    S rG   rM   rM   r@   rM   rN   r      r   z/IterationRangesEntry.set_name.<locals>.<lambda>c                   S  s   d S rG   rM   rM   rM   rM   rN   r      r   )r   rf   r@   )rJ   r@   rM   r   rN   set_name   s    zIterationRangesEntry.set_namec                 C  s   | j   d S rG   )r   rf   rO   rM   rM   rN   rf     s    z IterationRangesEntry.cache_clearc                 C  s   t j|  | jS rG   )r1   rE   codegen_iteration_ranges_entryr@   rO   rM   rM   rN   r     s    zIterationRangesEntry._codegenc                 C  s   g }t | jtjr|S t | jttfs4tt| j| jjdd  D ]D}t |tj	tjfsD|j
}t|dkrDtdd |D rD|| qD|S )Nr2   r   c                 s  s   | ]}t |tjV  qd S rG   )r   r   SIZEr   rM   rM   rN   	<genexpr>  s    z8IterationRangesEntry.precomputed_args.<locals>.<genexpr>)
isinstancerp   rU   Symbolr   r   rb   typeargsrV   r   lenallro   )rJ   precomputed_argsargsymbolsrM   rM   rN   r   	  s    z%IterationRangesEntry.precomputed_argsc                 C  s
   t | jS rG   )hashr@   rO   rM   rM   rN   __hash__  s    zIterationRangesEntry.__hash__c                 C  s   | j |j kS rG   r   )rJ   otherrM   rM   rN   __eq__  s    zIterationRangesEntry.__eq__)rQ   rR   rS   rI   rd   r   rf   r   r   r   r   rW   rM   rM   rK   rN   rk      s   rk   c                 C  s6   | t dkrdS | t dkr dS t| r.dS t| S )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)valuerM   rM   rN   constant_repr  s    
r   c                      s  e Zd ZU dZeZded< dZddej	ddddd	 fd
dZ
dd Zdd ZddddddZddddZdd Zdd Zdd Zddd d!Zd"d# Zd$d% Zdd&d'd(d)Zdd&d'd*d+Zd,d- Zd.d/ Zd0d1 Zed2d3d4d5d6Zed2d3d4d7d8Zd9d:d;d<Zdd=d>d?Zdd=d@dAZ dddBdCdDZ!dd=dEdFZ"dbdGdHZ#dIdJ Z$ddKdLdMZ%e&j'dNdO Z(dd=dPdQZ)edRdS Z*dTdU Z+dVdW Z,dXdY Z-dZd[ Z.d\d] Z/d^d_d`daZ0  Z1S )cr=   zo
    Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
    zCallable[[sympy.Expr], str]kexprFN)	mutationsra   reduction_hintdisable_persistent_reductionr;   zOptional[Set[str]])index_dtyper   c                  s   |d kri }t    t  _t  _dd |D  _|d k	rB|nt  _g  _i  _	t
  _ jd dk _| _| _t  _tt _| o   _   _d  _td dd fdd}| _ | d S )	Nc                 S  s   g | ]}t jj|qS rM   )r1   rh   ri   simplifyr   rM   rM   rN   r{   @  s     z'SIMDKernel.__init__.<locals>.<listcomp>r2   r<   r~   c                   s6   t jj|   }  jD ]} | |} q | S rG   )r1   rh   ri   simplify_with_rangesrB   range_treescombine_contiguous_dimscombine_modular_indexing_pairsrZ   treerO   rM   rN   simplify_indexingQ  s    
z.SIMDKernel.__init__.<locals>.simplify_indexing)rH   rI   r)   bodyZindexing_codenumelssetr   r   rn   	itertoolscountrm   inside_reductionr   r   
last_usagecollectionsdefaultdictrv   buf_accessesshould_use_persistent_reductionpersistent_reductionwant_no_x_dimno_x_dimr   r   r   r   initialize_range_tree)rJ   r   r   ra   r   r   groupsr   rK   rO   rN   rI   2  s2    	


zSIMDKernel.__init__c                 C  s   dS NFrM   rO   rM   rM   rN   r   \  s    zSIMDKernel.want_no_x_dimc                   s   | j  p| jd dk}d}|t| j d   d}| jr>d}n|rHd}nd}d fdd	|D }t D ]\}}|dk}||kr||nd }	|rd n||}
|
d kr|n|
}| jt	| d
| j| ||| ||o| j
 |	|
d kd
 qld S )Nr   r2   Zzyxrxyzr_   Zxyzr c                 3  s   | ]}| kr|V  qd S rG   rM   )ry   pZactive_prefixesrM   rN   r   m  s      z3SIMDKernel.initialize_range_tree.<locals>.<genexpr>rZ   z)ra   r[   r\   r]   r^   )r   r   r   r   join	enumeratefindr   ro   r>   r   )rJ   ra   Zno_r_dimprefixesZ	grid_dimsZtensor_dimsirD   is_reductionr\   r]   rZ   rM   r   rN   r   _  s:    
z SIMDKernel.initialize_range_treer<   r3   )r@   rZ   r   c                 C  s,   | j }d| _ z| |||W S || _ X d S r   )r   store)rJ   r@   rZ   r   priorrM   rM   rN   store_reduction  s
    zSIMDKernel.store_reductionrY   returnc                 C  s   dS r   rM   rO   rM   rM   rN   r     s    z*SIMDKernel.should_use_persistent_reductionc                 C  s   t tjdd | jD S )Nc                 s  s   | ]}|j  V  qd S rG   )rB   itemsry   r   rM   rM   rN   r     s    z(SIMDKernel.var_ranges.<locals>.<genexpr>)dictr   chainfrom_iterabler   rO   rM   rM   rN   rB     s
    zSIMDKernel.var_rangesc                 C  s   t dd | jD S )Nc                 s  s   | ]}t |jd k	V  qd S rG   )rX   r\   r   rM   rM   rN   r     s     z0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>)sumr   rO   rM   rM   rN   triton_tensor_ndim  s    zSIMDKernel.triton_tensor_ndimc                 C  s(   dg|    }d||< dd| dS )NNone:[rc   ])r   r   )rJ   r   r   rM   rM   rN   indexing_size_str  s    zSIMDKernel.indexing_size_strz	List[str]c                 C  sP   dg|    }| jD ]6}|jd kr$q|jdks4| jr|j  d||j< q|S )N1r_   ZBLOCK)r   r   r\   rD   r   upper)rJ   r   r   rM   rM   rN   dense_size_list  s    

zSIMDKernel.dense_size_listc                 C  s   |   }dd| dS )Nr   rc   r   )r   r   rJ   r   rM   rM   rN   dense_size_str  s    zSIMDKernel.dense_size_strc                 C  sN   t |ts|S |jd }| j| }d kr0|S t|||ji}tjj	
|S Nr   )r   r   r   rn   r   r-   rp   r1   rh   ri   r   )rJ   rZ   r   Z	tree_node	new_indexrM   rM   rN   r     s    

z)SIMDKernel.combine_modular_indexing_pairsr>   r   c                 C  s<   t jj| }r,|\}}t| |||S | ||S d S rG   )r1   rh   ri   Zexpand_floor_divr   _combine_contiguous_dims)rJ   rZ   r   Z
expand_resr   denominatorrM   rM   rN   r     s    z"SIMDKernel.combine_contiguous_dimsc           
      C  s   t |tjtjfr|S ||\}}t|dkr4|S tjj	||t
|g||\}}}||krb|S ||}t|tt|||}	|	S )zI
        More aggressive simplification to merge contiguous dims
        r2   )r   rU   rV   r   r   r   r1   rh   ri   Z_simplify_loopsr4   r}   r-   r   zip)
rJ   rZ   r   r   r   Z	new_sizesZreindexZpruneZnew_index_varsr   rM   rM   rN   r     s      

z#SIMDKernel._combine_contiguous_dimsc                 C  s0   | j r| jrd S ttjdd |D | _d S )Nc                 s  s   | ]}|t k	r|jV  qd S rG   )EnableReductionr   r   rM   rM   rN   r     s     z,SIMDKernel.set_last_usage.<locals>.<genexpr>)r   r   r   r   r   r   r   )rJ   r`   rM   rM   rN   set_last_usage  s    zSIMDKernel.set_last_usagec                   s&    j d jtj fdd}| S )Nr   c                   3  sX    j d dkr" jrtd V  d S r.   d _zd V  rH   W 5 d _X d S )Nr   r2   FT)r   r   rb   codegen_bodyrM   rJ   Zshould_flushrM   rN   ctx  s    
z)SIMDKernel.disable_reduction.<locals>.ctx)r   r[   
contextlibcontextmanager)rJ   r   rM   r   rN   disable_reduction  s    zSIMDKernel.disable_reductionc                 G  s,   t |t | jkstdd t|| jD S )Nc                 S  s   g | ]\}}| |qS rM   )r}   )ry   r:   rangesrM   rM   rN   r{     s   z)SIMDKernel.set_ranges.<locals>.<listcomp>)r   r   rb   r   r|   rM   rM   rN   
set_ranges  s    
zSIMDKernel.set_rangeszIterable[sympy.Expr]zSequence[Sequence[sympy.Expr]])r   rs   c              
     sn  t jjdd | D  fdd| D t  fdd}dd }g }d}|D ]}g }|D ]}|d	r|d
d  qb|tk r| d	r|d	7 }q|d	 tk r|| r	|| st
| }	t|| }
|||
|||	||d	 |
 qb|t||| qb|| qVtdd D sftd d|  |fS )Nc                 S  s   g | ]}g qS rM   rM   )ry   _rM   rM   rN   r{     s     z6SIMDKernel._split_iteration_ranges.<locals>.<listcomp>c                   s   g | ]}  |qS rM   r   )ry   g)svrM   rN   r{     s     c                   sF    |}|  |stt|  || <  |  | tS rG   )r   statically_known_multiple_of	CantSplitr   ro   rl   )r   rp   
new_ranges	remainingr  Z	var_countrM   rN   	add_range  s    
z5SIMDKernel._split_iteration_ranges.<locals>.add_rangec                   s    fdd}|S )Nc                   s   |    |   S rG   rM   )Z	flat_varsidx1idx2sizerM   rN   getter  s    zISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getterrM   )r  r  r  r  rM   r  rN   make_combined  s    z9SIMDKernel._split_iteration_ranges.<locals>.make_combinedr   r2   c                 S  s
   t dS r   )rU   rV   )r  rM   rM   rN   r     r   z4SIMDKernel._split_iteration_ranges.<locals>.<lambda>c                 s  s    | ]}t jj|d kV  qdS )r2   Nr1   rh   ri   r   r   rM   rM   rN   r   3  s    z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>zfailed to set ranges  )r1   rh   ri   r   r   rj   ro   r   Zstatically_known_gtr  r  r   operator
itemgetterr   rb   )r   rs   r
  r  return_getters_groupsZcurrent_groupZlength_groupZreturn_gettersr  Zsize1Zsize2rM   r  rN   _split_iteration_ranges  s^    	 
  z"SIMDKernel._split_iteration_rangesc                 C  s.   z|  || W dS  tk
r(   Y dS X d S )NTF)r  r  )clsr   rs   rM   rM   rN   is_compatible9  s
    zSIMDKernel.is_compatiblezList[List[sympy.Expr]]rr   c                   s   dd | j D }| js$td|d< t|t| j krXtdd t||D rX| j| S | ||\}}t	t
j| j|   fdd|D S )a  
        We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

        To do this we need to split up the iteration space of i0 into something like:
            for i1 in s0:
              for i2 in s1:
                i0 = i1*s1 + i2
                ....

        This function matches and resplits lengths to the groups of
        this kernel to enable tiled + non-tiled fusions.
        c                 S  s   g | ]
}|j qS rM   )rC   )ry   rtrM   rM   rN   r{   P  s     z3SIMDKernel.split_and_set_ranges.<locals>.<listcomp>r2   r   c                 s  s,   | ]$\}}t jjt|| d kV  qdS r   Nr1   rh   ri   r   r,   )ry   r   r  rM   rM   rN   r   T  s   z2SIMDKernel.split_and_set_ranges.<locals>.<genexpr>c                   s   g | ]} fd d|D qS )c                   s   g | ]}| qS rM   rM   )ry   fnrw   rM   rN   r{   ^  s     z>SIMDKernel.split_and_set_ranges.<locals>.<listcomp>.<listcomp>rM   )ry   Zfnsr  rM   rN   r{   ^  s     )r   r   rU   rV   r   r   r   r   r  rv   r   r   r   )rJ   rs   r   r  r  rM   r  rN   split_and_set_rangesC  s    
 zSIMDKernel.split_and_set_rangesr~   c                 C  s   t |tjS rG   )r   r   TMPrJ   rZ   rM   rM   rN   is_indirect_indexing`  s    zSIMDKernel.is_indirect_indexingc                   s   |  |rdS dgt| j }|jD ]@}|| jkr4q$| j| }t|jtsNt||jj	  |j
9  < q$tjjj t fddt|| jD S )NFr2   c                 3  s"   | ]\}} | |kV  qd S rG   rM   )ry   Z	idx_rangeZ
iter_ranger  rM   rN   r   u  s   z,SIMDKernel.is_broadcasted.<locals>.<genexpr>)r!  r   r   r   rn   r   r   r>   rb   rZ   r:   r1   rh   ri   r   anyr   )rJ   rZ   Zindex_numelsrP   entryrM   r  rN   is_broadcastedd  s    





zSIMDKernel.is_broadcasted)rZ   r   c                 C  s4   t |tr$ddt| j| dS | | |S )a  
        Convert an index expr to a string that can be used in output code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

        Index expressions often need to be passed in as arguments to the triton kernel.
        Rename_indexing and codegen_indexing keep track of the needed indices and add
        new parameters to the function signature.
        r   rc   r   )r   rv   r   mapindex_to_strr   Zrename_indexingr   rM   rM   rN   r&  z  s    	
zSIMDKernel.index_to_strc                 C  s   |  |}t|tjjj}t|tj	s:t|tj
rJ|tjjj}t|tj
r|tj
D ]D}|j}t|dkrftdd |D rf|tjj|i}t||}qf| |  |S )Nr   c                 s  s    | ]}t |tjtjfV  qd S rG   )r   r   r   ZPRECOMPUTED_SIZEr   rM   rM   rN   r     s   z.SIMDKernel.prepare_indexing.<locals>.<genexpr>)r   r-   r1   rh   ri   Zprecomputed_replacementsr   ZatomsrU   floorZceilingsubsr   r   lookup_precomputed_sizecodegen_indexing)rJ   rZ   ar   replacementsrM   rM   rN   prepare_indexing  s    
 zSIMDKernel.prepare_indexingc                   s    fdd j D }|rt|dkrtdd |D }ddd |d | D d| d  ksztd	d |d | D t|d | |d |< |S )
Nc                   s    g | ]}|j d ks jr|qS )r_   )rD   r   ry   trO   rM   rN   r{     s    
  z1SIMDKernel.active_range_trees.<locals>.<listcomp>r2   c                 s  s   | ]}|j d kV  qdS )r   Nr   r.  rM   rM   rN   r     s     z0SIMDKernel.active_range_trees.<locals>.<genexpr>r   c                 s  s   | ]}|j V  qd S rG   r   r.  rM   rM   rN   r     s     Zzyxc                 S  s   g | ]
}|j qS rM   r   r.  rM   rM   rN   r{     s    )r   r   r   r   rb   ru   )rJ   ZreorderZtreesr   rM   rO   rN   active_range_trees  s    
2
zSIMDKernel.active_range_treesc                 C  sx   | j D ]l}tjj|jdr0||j d q|j t	kr@qt	|j  }tjj
|j|r||j d qd S )Nr2   mask)r   r1   rh   ri   rj   rC   discardrD   r   r"   r  )rJ   Z	mask_varsr   Z	max_blockrM   rM   rN   filter_masks  s    
zSIMDKernel.filter_masksrp   c                 C  s   t jj||  }t|jtdD ]l}|| jkr"i }| j| 	 D ]}t jj
|||< qBt|dkrt| j| j|| j| _| j|   q"|S )Nr   r   )r1   rh   ri   r   rB   sortedr   r;   rn   r   r)  r   r-   rp   r   )rJ   rp   symr,  ZpsrM   rM   rN   r*    s    

 zSIMDKernel.codegen_indexingc                 c  s>   | j }|rt||}t|}|| _ z
|V  W 5 || _ X dS )z:Context manager to add an additional mask to tl.load/storeN)Z
_load_maskr/   logical_andr0   _unwrap)rJ   r1  r   rM   rM   rN   
mask_loads  s    

zSIMDKernel.mask_loadsc                 C  s\   dd | j  D }t||}i }| jD ].}t|j}t||dit||di ||< q(|S )a\  
        This gets the stride of the index for each of the tiling variables
        (technically, it does it at index 0)

        For example, if
        xindex = x0 + 512*x1 + 1024*r0
        x0 = (xindex//512)
        x1 = (xindex % 512)
        r0 = rindex // 1024

        this function would return
        {xindex: 512, rindex: 1024}
        c                 S  s   i | ]\}}||j qS rM   r4  )ry   kvrM   rM   rN   
<dictcomp>  s      z2SIMDKernel.get_strides_of_load.<locals>.<dictcomp>r2   r   )rn   r   r-   r   r+   r@   )rJ   rZ   Zindex_to_tile_indexesZindex_in_tile_varsstridesZ
range_treer   rM   rM   rN   get_strides_of_load  s    


 zSIMDKernel.get_strides_of_loadc                 C  s    t |trtt| |S | |S rG   )r   tupler%  )r  r   rM   rM   rN   _map_tuple_or_scalar  s    
zSIMDKernel._map_tuple_or_scalarc                 C  s"  g }t t| jj }| j \}}}}tjj	t
| j}t|D ]\}}|| jkrd|d qFtj|}tjj	|}	|	|krt }
d}| j| D ]8}t|ttfr|
d|  |d7 }q|
|j qt |
| }n|	}tj|}t|}||| dt||k    qFt|S )a+  
        Try the best to estimate the total size (in bytes) of the
        kernel's inputs and outputs, which is used for estimating the memory
        throughput of this kernel. This information is used for checking how
        far we are from the peak memory bandwidth. It's important that
        we want to avoid overestimating the sizes of the inputs and outputs,
        because it can wrongfully give us a very large memory traffic value,
        which may be even larger than the theoretical bandwidth and thus
        become very misleading. This is particularly problematic for cases
        where we slice some inputs. In those cases, we should only count
        the size of the "slices" instead of the original inputs, because
        only the slices contribute to the real memory traffic.
        r   Zno_index_dep_r2   )r   r.   r   inplace_buffersre   python_argdefsr1   rh   ri   r   r,   r   r   r   ro   Z	get_numelr   r   r   r   r   rZ   Z	get_dtyper(   rX   r   )rJ   nbytesZninplace_argsr  	call_argsZ	out_numelr   r   Z	arg_numelbuf_sizeindicesZno_index_dep_countdeprC   dtypeZ
dtype_sizerM   rM   rN   estimate_kernel_num_bytes  s0    


 z$SIMDKernel.estimate_kernel_num_bytesc                 C  sb  t | jjdkr4t | jjdkr4t | jjdkr4dS | j \}}}}d}|D ]}tj|}|rNt |j	j
dkrNt dd |j	j
D dkrqNt|j	j}	|dkr|	}qN||	krNtd| d	d
|	 d|  }
t|
 dd |D }dd |D }dd |D }td| d| d| d| d| d }
t|
  dS qNtd| d}
t|
 dS )zr
        Print message if the kernel have mixed layout inputs.
        Only care about 4D tensor for now.
        r2   r   N   c                 S  s   g | ]}|d kr|qS )r2   rM   )ry   r   rM   rM   rN   r{   R  s      z.SIMDKernel.warn_mix_layout.<locals>.<listcomp>r   zExpected stride order z, but found stride orderr  z for kernel c                 S  s2   g | ]*}t j|r*tt j|jjnd qS rG   )r1   rh   
get_bufferr   get_stride_orderlayoutstridery   r@   rM   rM   rN   r{   ^  s   
c                 S  s,   g | ]$}t j|r$t j|jjnd qS rG   )r1   rh   rK  rM  r  rO  rM   rM   rN   r{   d  s   
c                 S  s0   g | ](}|t jjkrd n|t jjkr(dndqS )Z
GraphInputZIntermediateBufferN)r1   rh   Zgraph_inputsZname_to_bufferrO  rM   rM   rN   r{   j  s   

z  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r   r   Zinput_buffersZoutput_buffersrA  rB  r1   rh   rK  rM  r  r   rL  rN  r$   logwarningr#   )rJ   kernel_nameZargdefsrD  	signaturer  Zuniform_stride_orderarg_namebufZstride_ordermsgZstride_order_listZ	size_listZsource_listrM   rM   rN   warn_mix_layout=  sX    

	

zSIMDKernel.warn_mix_layoutc           	      C  sr   t ||d|}d| _t | jd |}t ||}d| _t ||}t ||}t ||d|}t	|||fS )Nr   Fr   T)
r/   Z	reductionr   Z
index_exprr   truedivsubmulr0   r8  )	rJ   rH  r   Zsum_rnumelZmeanZdxZdx2m2rM   rM   rN   welford_reduce_fallback~  s    z"SIMDKernel.welford_reduce_fallbackc                 C  s   t d S rG   NotImplementedErrorrO   rM   rM   rN   codegen_kernel  s    zSIMDKernel.codegen_kernelc                 C  s   d S rG   rM   rO   rM   rM   rN   r     s    zSIMDKernel.codegen_bodyrk   )r#  c                 C  s   t d S rG   r_  )rJ   r#  rM   rM   rN   r     s    z)SIMDKernel.codegen_iteration_ranges_entry)F)2rQ   rR   rS   rT   pexprZsexpr__annotations__Zallow_block_ptrr!   DEFAULTrI   r   r   r   r   rB   r   r   r   r   r   r   r   r   r   r   staticmethodr  classmethodr  r  r!  r$  r&  r-  r0  r3  r*  r   r   r9  r>  r@  rI  rX  r^  ra  r   r   rW   rM   rM   rK   rN   r=   )  s^   
*$
		C	


>Ar=   c                   @  s  e Zd ZeZdZdZdd Zdd Zdd Z	e	Z
e	Zd	d
 ZddddZedd ZeddddddZedd Zdd Zdd Zdd Zdd  Zd;d"d#d$d%Zd&d' Zd(d) Zeed*d+d, Zeed-fd.d/Zd0d1 Z dd#d2d3Z!d<d4d5Z"d6d7 Z#d8d9 Z$d:S )=SIMDSchedulingztorch.int32ztorch.int64c                 C  s
   || _ d S rG   )r   )rJ   r   rM   rM   rN   rI     s    zSIMDScheduling.__init__c                 C  s   t dd |D S )Nc                 s  s    | ]}t jjt|V  qd S rG   r  r   rM   rM   rN   r     s     z*SIMDScheduling.group_fn.<locals>.<genexpr>)r?  r   rM   rM   rN   group_fn  s    zSIMDScheduling.group_fnc                   s  t |tjst |tjr&tj||S |j\}\}}|j\}\ t||}| rn| sn| r|d n | r| s| r|d | r| r| ko|k}|s|d| | |S | s| s| kr|ks|d| | dS | r&t |j	t
}|s"|d |S | | ||}	| | ||}
| | |  ||}tjjrd}t|	dkrt|
dkr|	|
  ko|kn  }n|	|k}nt|
dkr|
|k}|s|d|	|
| dS dS | s| r|d	krd	kst|  krt fd
d| D sJ|d dS tjjr| s| | ||d	f d	ffk}|s|d |S dS | kr|d | kS | r| rt| ||S )z
        Hook called by Scheduler to determine if the Triton backend
        can fuse node1 and node2.  These nodes might already be
        FusedSchedulerNodes.
        z&Split scan cannot fuse with reductionsz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)z5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)Fz!node1 is not TritonTemplateBufferTr   ztiling mismatch (%s, %s, %s)r2   c                 3  s"   | ]}t  f| V  qd S rG   )r=   r  
get_rangesr   Znumel2Zrnumel2rM   rN   r     s   z*SIMDScheduling.can_fuse.<locals>.<genexpr>z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ZForeachKernelSchedulerNodecan_fusegroupr'   is_split_scanr   is_templaterg   r   select_tiling	get_nodesr   tritonZ tiling_prevents_pointwise_fusionr   rb   r   Z tiling_prevents_reduction_fusioncan_fuse_horizontal)rJ   Znode1Znode2r  Znumel1Zrnumel1whyZreduction_can_fuseZis_triton_templateZtiling1Ztiling2Ztiling3ZcondZis_reduction_tiling_validrM   rj  rN   rk    s     


  


 
zSIMDScheduling.can_fusec              
     s  g t  }t  d t  	fdd	fdd} fdd
tj 
f	dd	}tD ]\krqr 	fd
d}r|r|  W 5 Q R X 
 qr|r|   W 5 Q R X qrtd d	 djd  qrS )NFc                   s2   | j \}\}}| kr|kp0|  ko0|dkS rt   rl  r   r  Z
node_numelZnode_rnumelrC   r\  rM   rN   fits_in_main_body  s    z@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_bodyc                   s&   | j \}\}}| ko$|dko$dkS rt   rt  ru  rv  rM   rN   fits_outside_reduction!  s    zESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reductionc                   s\    |  |  d |  rXt| tjrXt| jtjrXt| jj	tj
sX |   d S NT)r   ro   r   r   r   SchedulerNoderg   r   ZComputedBufferdataZScanget_namer   )current_loop_has_writescurrent_loop_reduced_writesdonenode_schedulerM   rN   schedule_node_in_loop%  s    


zDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loopc                  3  s    r<d d  D ]&} kr| r| j @ s qrVd tkrV  n
t d V  t   d d S )Nr2   r   F)	ancestorsr   popro   DisableReductionclear)Z
other_node)	r~  r  r  rw  rZ   rg   r  r`   r  rM   rN   end_current_reduction_loop4  s     



zISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loopc                   s<   dkrdS  | j @ sdS |r0t|d ttfr4tt S )Nr2   Fr   )r  r   r   r  rb   rY   )rg   r  )r  r\  rM   rN   #requires_closing_previous_reductionO  s    
 zRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reductionzunexpected group: (rc   z) != r2   )r   r   r   r   r   ro   r`  rl  )rJ   r`   rC   r\  Zcurrent_loop_writesrx  r  r  rM   )r~  r  r  rw  rZ   rg   r  r`   rC   r\  r  rN   generate_node_schedule  s6    




z%SIMDScheduling.generate_node_schedulez<Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode]r   c           	      C  s   |  }t|dd dj\}\}}| |||}tt}|D ]*}|jj|jj	B D ]}||j
 | qRq>td| | ||||S )zK
        Given a set of pre-fused nodes, generate a Triton kernel.
        c                 S  s   t |  S rG   rX   r   r   rM   rM   rN   r   r  r   z-SIMDScheduling.codegen_node.<locals>.<lambda>r   zSchedule:
 %s)rp  maxrl  r  r   r   rv   read_writesreadswritesr@   ro   schedule_logdebugcodegen_node_schedule)	rJ   rg   r`   r  rC   r\  r  r   accessrM   rM   rN   codegen_nodei  s    
zSIMDScheduling.codegen_nodec                 C  sB   |   sttdd t| jj| jjD r4tj	S | j
jjS d S )Nc                 s  s   | ]}|  V  qd S rG   )is_contiguousry   rG  rM   rM   rN   r     s   z0SIMDScheduling.reduction_hint.<locals>.<genexpr>)r   rb   r   r   r   r  r  r  r!   INNERrg   r{  r   r   rM   rM   rN   r   ~  s    zSIMDScheduling.reduction_hintr<   z(Iterable[Union[ir.Buffer, ir.TensorBox]]rY   )rC   buffersr   c                   s   t t jjtjjjtjjjj	  fdd| s@dS dd |D }t
fdd|D shdS tjj|  |D ]}tjj| q|dS )	Nc                   s*   t jj| krdS  | o(| kS ry  )r1   rh   ri   Zis_expr_static_and_true)rz   )has_hintint_maxr   rM   rN   within_32bit  s    z;SIMDScheduling.can_use_32bit_indexing.<locals>.within_32bitFc                 S  s(   g | ] }t | tjs|  qS rM   )r   Z
get_layoutr   ZMultiOutputLayoutZstorage_size)ry   rV  rM   rM   rN   r{     s   z9SIMDScheduling.can_use_32bit_indexing.<locals>.<listcomp>c                 3  s   | ]} |V  qd S rG   rM   )ry   r  )r  rM   rN   r     s     z8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr>T)torchZiinfoZint32r  r1   rh   ri   r   Z	shape_envr  r   Z	guard_leq)rC   r  Z	buf_sizesr  rM   )r  r  r   r  rN   can_use_32bit_indexing  s    
z%SIMDScheduling.can_use_32bit_indexingc           	      C  sx   t  }|D ].}t|tjsq
||  ||  q
ddddd}dd |D }|| }t||rr| j	S | j
S )Nr;   zUnion[ir.Buffer, ir.TensorBox])r@   r   c                 S  s&   t j| }|d kr"td|  |S )Nz$Failed to find buffer matching name )r1   rh   rK  RuntimeError)r@   rV  rM   rM   rN   _get_buffer  s    z6SIMDScheduling.select_index_dtype.<locals>._get_bufferc                 S  s   g | ]}t j|qS rM   )r1   rh   rK  rO  rM   rM   rN   r{     s     z5SIMDScheduling.select_index_dtype.<locals>.<listcomp>)r   r   r   r%   update	get_namesZused_buffer_namesrg  r  
int32_type
int64_type)	r  r  rC   reduction_numelZbuffer_namesrg   r  r  Ztotal_numelrM   rM   rN   select_index_dtype  s    z!SIMDScheduling.select_index_dtypec                   sN   t t fdd|}|D ],}tdd t|jj|jjD s dS qdS )Nc                   s,   | t tfko*|   o*| jd d   kS )Nr2   r   )r   r  r   rl  r}  rv  rM   rN   r     s   zJSIMDScheduling.has_non_contiguous_pw_in_reduction_kernel.<locals>.<lambda>c                 s  s<   | ]4}t |t p2| p2t |jtjtfp2| V  qd S rG   )r   r   r  rZ   rU   rV   rX   Zstride1_for_last_dimr  rM   rM   rN   r     s   zKSIMDScheduling.has_non_contiguous_pw_in_reduction_kernel.<locals>.<genexpr>TF)rv   filterr   r   r   r  r  r  )rJ   r  rC   r\  Zpointwise_nodesrg   rM   rv  rN   )has_non_contiguous_pw_in_reduction_kernel  s     	z8SIMDScheduling.has_non_contiguous_pw_in_reduction_kernelc           
        s   t tdd |}t|dkrv fdd|D }||d t|krP|d }ntj}|tjkr| |||r|tj}ntj}t }|D ]}t	|dr|
|  q |||}	|||	fS )Nc                 S  s   | t tfko|  S rG   )r   r  r   r}  rM   rM   rN   r     s   z0SIMDScheduling.get_kernel_args.<locals>.<lambda>r   c                   s   g | ]}  |qS rM   )r   r   rO   rM   rN   r{     s     z2SIMDScheduling.get_kernel_args.<locals>.<listcomp>get_mutations)rv   r  r   r   r!   rd  r  r  r   hasattrr  r  r  )
rJ   r  rC   r  Z
reductionshintsreduction_hint_valr   rg   r   rM   rO   rN   get_kernel_args  s6    
  
zSIMDScheduling.get_kernel_argsc              	   C  s  ddl m} | |||}| |||\}}}	tdd |D }
|
rH|n| j}|}|||	d}|||}||_| || t	| |
 }W 5 Q R X | |||}td| ||_t||_|jr>tjjr>| j||ddi}| || t	| |
 }W 5 Q R X | |||}||_t||_t||g}n|}t	|( |D ]}|ttfkrR|  qRW 5 Q R X | | ||j tjr|  tjr|| tj j|jO  _tj j|jO  _tjj j!r~tj"r~|j#$ }|D ]}t%|t&j'sq|( }||kr$q|j)d k	s4t*|j)+ }|d k	rt,d	 d
  d7  < tjj -d|j.d| d q| j&/  d S )Nr   )TritonSplitScanKernelc                 s  s    | ]}t |to| V  qd S rG   )r   r%   rm  ry   rg   rM   rM   rN   r     s   z7SIMDScheduling.codegen_node_schedule.<locals>.<genexpr>r   r   r   z+Generating kernel code with kernel_name: %sr   TZinductorZintermediate_hooksr2   zrun_intermediate_hooks(rc   r   )0Z)torch._inductor.codegen.triton_split_scanr  ro  r  r"  kernel_typer   !codegen_node_schedule_with_kernelr1   set_kernel_handlerra  define_kernelrQ  r  rS  r   r   r   rq  multi_kernelr7   r   r  mark_runcodegen_commentcall_kernelZnan_assertsZcodegen_nan_checkrX  rh   removed_buffersinplaced_to_removewrapper_codeZsupports_intermediate_hooksZgenerate_intermediate_hooksr   Zlive_output_buffersr   r   r%   r|  rg   rb   Zget_origin_noder   	writeliner@   free_buffers)rJ   r  r   rC   r  r  tiled_groupsr  r   r   rm  r  Zkernel_argsZkernel_kwargsrE   src_coderS  Zkernel2Z	src_code2Zkernel_name2Zfinal_kernelrg   Z	live_outsr@   Zorigin_noderM   rM   rN   r    s    







z$SIMDScheduling.codegen_node_schedulec              	   C  s   dd }| t  }||| |D ]}|ttfkr(|  q(t|D ]j\}}|tkrj||  qJ|tkr|	  ||||d   qJt
|j || }|| qJW 5 Q R X d S )Nc                 S  s   t dd | S )Nc                 S  s   | t k	S rG   )r  r}  rM   rM   rN   r   e  r   zcSIMDScheduling.codegen_node_schedule_with_kernel.<locals>.current_reduction_nodes.<locals>.<lambda>)r   	takewhile)r`   rM   rM   rN   current_reduction_nodesd  s    zQSIMDScheduling.codegen_node_schedule_with_kernel.<locals>.current_reduction_nodes)r   	ExitStackr   r   r  Zdecide_inplace_updater   enter_contextr   closer    _bodyr  ri  r   )rJ   r  rE   r  stackrg   r   r   rM   rM   rN   r  c  s     

z0SIMDScheduling.codegen_node_schedule_with_kernelFOptional[str]r   c              
   C  s  |j \}\}}|dkst|j|j\}}|\ |sN|f|D ]}	|	  q@| }
|d$ |D ]}	|	||	  qdW 5 Q R X W 5 Q R X t	|
t
s|
d t| |d& t	|
t
r|
}n|
d |
j}W 5 Q R X |f|}tjr`| d }tjj|j}|jdk	s*td|j||jf }|  d| d|||  }|rv|W  5 Q R  S | |||}W 5 Q R X | | |||j tj j|jO  _tj j|jO  _| j !  dS )z
        Codegen a triton template

        If `only_gen_src_code` the src code will be returned instead of codegen'd into the wrapper
        r2   z<STORE_OUTPUT>z<DEF_KERNEL>g    eANzmeta is NonerP  )"rl  rb   rg   Zmake_kernel_renderr  Zset_subgraph_bodyr   r  ri  r   r;   Zfinalize_hookr1   r  coder   benchmark_kernelrI  rh   ri   Z
size_hintsZ
call_sizesmetaZgrid_fnZimports_for_benchmark_kernelZcodegen_kernel_benchmarkgetvaluer  r  r  r  r  r   r  )rJ   template_nodeepilogue_nodesonly_gen_src_coder  rC   r\  rE   renderrg   Zpartial_coder  r  Znum_gbZ	grid_argsgridrS  rM   rM   rN   codegen_templatez  sH    
*




"

zSIMDScheduling.codegen_templatec                 C  s   t jjt jj  d S rG   )r1   rh   r  r  Z
device_opsZsynchronizerO   rM   rM   rN   codegen_sync  s    zSIMDScheduling.codegen_syncc                 C  s  ddl m} || | D ]}| }|D ]\}}}}| |||}	| |	||\}
}}|j||
||d}| |	| t	|$ |	D ]}|t
tfkr|  qW 5 Q R X tj j|jO  _tj j|jO  _q*| }| ||g|}| |g |tjj| q| j  d S )Nr2   )ForeachKernelr  )Ztriton_foreachr  Zhorizontal_partitionZget_subkernel_nodesr  r  Zcreate_sub_kernelr  r1   r  r   r  r  rh   r  r  ra  r  r  r  r  r   r  )rJ   Zforeach_noder  Zpartitions_with_metadatarE   r`   r  rC   r\  r  r  r   r   Z	subkernelrg   r  rS  rM   rM   rN   codegen_foreach  sD     zSIMDScheduling.codegen_foreach    c              	   C  s  |   \}}t|dkrdS |  }t|jt|ks:t|j|jg}tdd tj	
|D sdtdd tj	
|D }dd |jD }g }|D ]X}tjj|j|j}	t|	t|kstz@|	dd }
|
t|krW qtd	d |	|
d  D rW qW n tk
r   Y qY nX tjjt|d |
 tjjt||
d  f}tjjtd
d t||	D }|j|kr|d9 }t|d r|d9 }t|d r|d9 }tjj|tt	|| dkr|t|||j q|S )Nr2   rM   c                 s  s   | ]}t |ttfV  qd S rG   )r   r   r   r  rM   rM   rN   r     s   z3SIMDScheduling.candidate_tilings.<locals>.<genexpr>c                 S  s(   g | ] }|j tjjkrt|tr|qS rM   )r@   r1   rh   r  r   r   r  rM   rM   rN   r{     s    
z4SIMDScheduling.candidate_tilings.<locals>.<listcomp>c                 S  s   h | ]
}|j qS rM   r   r  rM   rM   rN   	<setcomp>  s     z3SIMDScheduling.candidate_tilings.<locals>.<setcomp>c                 s  s   | ]}|d kV  qdS r  rM   r   rM   rM   rN   r     s     c                 s  s   | ]\}}|d kr|V  qdS r  rM   )ry   r  rN  rM   rM   rN   r     s     r   r   )ri  r   Zpointwise_read_writesZ
range_varsrb   r  r  r   r   r   r   r1   rh   ri   Zstride_hintsrZ   
ValueErrorr   r,   r   r   r@   CandidateTilingis_good_sizero   )rg   r   Zreduction_rangesrwZdep_sourcesdepsZwrite_namesZtilingsrG  r=  splitr  scorerM   rM   rN   candidate_tilings  s^    



z SIMDScheduling.candidate_tilingsr2   c                   s  |dkst jjdkrXtjtjkrPt|D ]$}t	| 
|dkr*td  qPq*||fS t }t }t|D ]@}| 
|D ]0}|j|krq~||j ||j  |j7  < q~qpdd | D }t jjdkrtdt	|D ]}	|d \}
}||	 \}}tjj|| dkrqtjj|| dk rF||	 \}
}|d \}}tjj|| dksbttjj||r|
t|||f}|g| } qqt	|dkrtd| |D ]0}||f t fdd	|D r   S q||fS )
z
        Heuristics to decide how to tile kernels.
        Currently, we tile based on stride-1 dimensions.

        Returns:
            `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

        r2   r   z"reduction over non-contiguous dimsc                 S  s   g | ]\}}|qS rM   rM   )ry   tilingr  rM   rM   rN   r{   @  s     z0SIMDScheduling.select_tiling.<locals>.<listcomp>r   zpossibly bad tiling: %sc                 3  s*   | ]"}t |tjrt | V  qd S rG   )r   r   rz  r=   r  ri  r  Z
new_groupsrM   rN   r   ^  s   z/SIMDScheduling.select_tiling.<locals>.<genexpr>)r   rq  Z	max_tilesperf_hint_loglevelloggingWARNINGr   r  r   r  infor   r   r   r@   r   r  r  most_commonranger1   rh   ri   r   rb   r  r   r   )r  r  rC   r  rg   Z
seen_namesZcandidate_tilesr  Zranked_tilingsr   Za0Za1Zb0b1r  rM   r  rN   ro  #  sN    





zSIMDScheduling.select_tilingc                 C  s   d S rG   rM   rO   rM   rM   rN   flushg  s    zSIMDScheduling.flushc                 C  s   dS r   rM   rO   rM   rM   rN   ready_to_flushj  s    zSIMDScheduling.ready_to_flushc              
     s8  t jG dd d  fdd|D }|D ]}t |_q*|d  st|dd dj\}\}}| |||}| |||}	| 	|||\}
}}| j
|	|
||d	}| || td
|$ t| | }W 5 Q R X W 5 Q R X n<|d }|dd  }td
| | j||dd}W 5 Q R X |ttjd}|S )Nc                   @  s&   e Zd ZU ded< ded< dd ZdS )zGSIMDScheduling.generate_kernel_code_from_nodes.<locals>.LastUsageHolderr   r   r   c                 S  s   | j | j_ d S rG   )r   r   rO   rM   rM   rN   __del__s  s    zOSIMDScheduling.generate_kernel_code_from_nodes.<locals>.LastUsageHolder.__del__N)rQ   rR   rS   rc  r  rM   rM   rM   rN   LastUsageHoldern  s   
r  c                   s   g | ]} ||j qS rM   )r   r   r  rM   rN   r{   v  s     zBSIMDScheduling.generate_kernel_code_from_nodes.<locals>.<listcomp>r   c                 S  s   t |  S rG   r  r   rM   rM   rN   r   }  r   z@SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>r   r  r  r2   T)r  Ztriton_)dataclasses	dataclassr   r   rn  r  rl  r  ro  r  r  r  r   patchr1   r  ra  r  replacer;   r*   ZKERNEL_NAME)rJ   r`   r  Zlast_usage_holdersr   r  rC   r\  r  r  r  r   r   rE   r  r  r  rM   r  rN   generate_kernel_code_from_nodesm  sL    
  
   z.SIMDScheduling.generate_kernel_code_from_nodesc                 C  s   d S rG   rM   )rJ   r  rM   rM   rN   r    s    zSIMDScheduling.codegen_commentc                 C  s   t d S rG   r_  )rJ   r  r  rE   rM   rM   rN   r    s    zSIMDScheduling.define_kernelN)F)F)%rQ   rR   rS   r=   r  r  r  rI   rh  rk  Zcan_fuse_verticalrr  r  r  re  r   r  rf  r  r  r  r  r  r  r  r  r   r   r  rU   rV   ro  r  r  r  r  r  rM   rM   rM   rN   rg    sD   oX

$
"^ 8)CC
0rg  c                   @  s6   e Zd ZU ded< ded< dZded< edd	 ZdS )
r  zTuple[sympy.Expr, sympy.Expr]r  rX   r  Nr  r@   c                 C  s"   t jj| } | dko | d dkS )z@Somewhat arbitrary heuristic used to boost scores for some sizesr  r   r  )r   rM   rM   rN   r    s    zCandidateTiling.is_good_size)rQ   rR   rS   rc  r@   re  r  rM   rM   rM   rN   r    s
   
r  c                   @  s   e Zd ZdZdS )r  z
    Marker to invoke `kernel.disable_reduction()`.  This closes a
    reduction loop and allows for pointwise ops to occur on the output
    of a reduction.
    N)rQ   rR   rS   rT   rM   rM   rM   rN   r    s   r  c                   @  s   e Zd ZdZedd ZdS )r   z1
    Marker to end a DisableReduction block.
    c                 c  s4   d}| D ]&}|t tfkr"|tk}q|r(q|V  qdS )zf
        Get the nodes from node_schedule skipping those in a
        DisableReduction block.
        FN)r   r  )r  disabledrg   rM   rM   rN   r    s    
zEnableReduction.filterN)rQ   rR   rS   rT   re  r  rM   rM   rM   rN   r     s   r   c                   @  s   e Zd ZdS )r  N)rQ   rR   rS   rM   rM   rM   rN   r    s   r  )c
__future__r   r   r   r  r   r   r  r   r  typingr   r   r   r   r   r   r	   r
   r   r   r   r   rU   r  Ztorch._loggingZtorch.utils._sympy.functionsr   r   Ztorch.utils._sympy.symbolr   r   r   Z_dynamo.utilsr   r   r   r   r   Z	codecacher   dependenciesr   r   r   r   r   Zoptimize_indexingr    Zruntime.hintsr!   r"   Zruntime.runtime_utilsr#   r$   r%   r&   r'   utilsr(   r)   r*   r+   r,   r-   r.   Zvirtualizedr/   r0   r1   commonr3   r4   r5   r6   r  r7   	getLoggerrQ   rQ  Z_loggingZgetArtifactLoggerr  r  Z
fusion_logZdoprintrb  r  r8   r>   rk   r   r=   rg  r  r  r   	Exceptionr  rM   rM   rM   rN   <module>   sl   8$	
+s=
    o      