U
    zh                    @   s&  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dl
mZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZmZ d dlmZmZ d dl m!Z! d dl"m#Z#m$Z$ d dl%m&Z& dd	l'm(Z(m)Z)m*Z*m+Z+m,Z, dd
l-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3 ddl*m4Z4m5Z5m6Z6m7Z7 ddl+m8Z8m9Z9m:Z: ddl;m<Z<m=Z= ddl>m?Z? ddl@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZK ddlLmMZM eNeOZPejQReOdZSG dd dZTG dd dZUeeVdddZWG dd dZXeTeeVeTf ddddZYejZj[j\ejZj[j]ejZj[j^ejZj[j_d Z`G d!d" d"eTZaG d#d$ d$eTZbG d%d& d&eTZcG d'd( d(eTZdG d)d* d*edZed9eeef  eejg eefd,f eef d-d.d/ZhejiG d0d1 d1Zjek ZlG d2d3 d3ZmG d4d5 d5Zneecedf eeV d6d7d8ZodS ):    N)AnyCounterDefaultDictDictGenericListOptionalSequenceSetTupleTypeVarUnion)countersdynamo_timed)get_metric_tableis_metric_table_enabled)free_unbacked_symbols)free_symbol_is_typeSymT)
has_triton   )commsconfigdependenciesirmetrics)
write_text)get_scheduling_for_deviceKernel) estimate_nccl_collective_runtime)Dep	MemoryDepStarDepWeakDep)ComputedBufferMultiOutputMultiOutputLayout)
green_textred_text)SimplifyIndexing)cache_on_selfcmpdevice_need_guardget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsIndentedBufferis_collectiveis_gpuis_waitsympy_product)VZfusionc                   @   s  e Zd ZU eejeeejdf df f ed< e	j
ed< ee ed< dejdddd	Zed
ddZed
ddZed
ddZdd
ddZeeef ddddZeddddZed ddddZee eeef ddddZee d
d d!Zee d
d"d#Zed
d$d%Ze	j
dd&d'd(Z e!e d
d)d*Z"ee d
d+d,Z#ee d
d-d.Z$dd
d/d0Z%dd
d1d2Z&eed f dd3d4d5Z'ed
d6d7Z(ed
d8d9Z)ee d
d:d;Z*ed  d
d<d=Z+ejd
d>d?Z,ed
d@dAZ-ed
dBdCZ.ed
dDdEZ/ed
dFdGZ0ed
dHdIZ1e	jedJdKdLZ2ed
dMdNZ3dd
dOdPZ4dd
dQdRZ5ed
dSdTZ6d_e7eddVdWdXZ8e9d
dYdZZ:e;d
d[d\Z<e=ej> d
d]d^Z?dS )`BaseSchedulerNode.groupread_writesunmet_dependencies	SchedulerN	schedulernodereturnc                 C   sN   || _ || _g | _g | _g | _| |  t | _|  |  t | _	d| _
d S NF)r<   r=   usersinverse_users
node_usersset_read_writesZget_read_writesset	ancestors
last_usagewrittenselfr<   r=    rJ   K/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/scheduler.py__init__G   s    zBaseSchedulerNode.__init__r>   c                 C   s   t | j d|  dS )Nz(name=)type__name__get_namerI   rJ   rJ   rK   __repr__V   s    zBaseSchedulerNode.__repr__c              	   C   s   |   }| dt| j dtt| ddj d| dt| jj | dt| j | dt| jj| j  | d	| j	 g}z|| 
 g7 }W n" tk
r   tjd
dd Y nX d| S )#Longer form printout for trace logs: (r=   NrN   
.writes = .unmet_dependencies = .met_dependencies = 	.users = Ignoring error in debug_str()Texc_info
)rR   rP   rQ   getattrpformatr8   writesr9   readsr@   debug_str_extra	Exceptionlogwarningjoinrstrip)rI   namelinesrJ   rJ   rK   	debug_strY   s    (
zBaseSchedulerNode.debug_strc                 C   s   dS )N rJ   rS   rJ   rJ   rK   rd   l   s    z!BaseSchedulerNode.debug_str_extrac                 C   s   t d| | j| jj d S )Nz(%s: unmet_dependencies = %s, writes = %s)rf   infor9   r8   rb   rS   rJ   rJ   rK   log_detailso   s    zBaseSchedulerNode.log_detailsrenamesr>   c                 C   s   |  | j| d S N)rC   r8   renamerI   rq   rJ   rJ   rK   update_mutated_namesw   s    z&BaseSchedulerNode.update_mutated_namesdepr>   c                 C   s   |  | j| d S rr   )rC   r8   Z	with_readrI   rw   rJ   rJ   rK   add_fake_depz   s    zBaseSchedulerNode.add_fake_depNodeUserr@   r>   c                 C   s\   i }|D ]@}t |j|kr:||t |j |t |j< q||t |j< qt| | _d S rr   )idr=   mergelistvaluesr@   )rI   r@   resultuserJ   rJ   rK   	set_users}   s     zBaseSchedulerNode.set_usersfuture_used_buffersmutation_real_namer>   c                    s(   |   } fdd|D }|| | _d S )Nc                    s   h | ]}  ||qS rJ   )get).0kr   rJ   rK   	<setcomp>   s     z3BaseSchedulerNode.set_last_usage.<locals>.<setcomp>)used_or_aliased_buffer_namesrF   )rI   r   r   Zused_buffersrJ   r   rK   set_last_usage   s    z BaseSchedulerNode.set_last_usagec                 C   s   | j d k	st| j  S rr   )r=   AssertionErrorget_inputs_that_alias_outputrS   rJ   rJ   rK   get_aliases   s    zBaseSchedulerNode.get_aliasesc                 C   s   | j d k	st| j  S rr   )r=   r   get_mutation_namesrS   rJ   rJ   rK   get_mutations   s    zBaseSchedulerNode.get_mutationsc                 C   s   t |  p|  S rr   )boolr   r   rS   rJ   rJ   rK   has_aliasing_or_mutation   s    z*BaseSchedulerNode.has_aliasing_or_mutation)rwr>   c                 C   s   || _ | j j| _|   d S rr   )r8   rc   r9   
prune_deps)rI   r   rJ   rJ   rK   rC      s    
z!BaseSchedulerNode.set_read_writesc                 C   s   | j jS rr   )r8   	op_countsrS   rJ   rJ   rK   r      s    zBaseSchedulerNode.op_countsc                 C   s   dd t | jj| jjD S )Nc                 S   s   h | ]
}|j qS rJ   rj   r   rw   rJ   rJ   rK   r      s   z6BaseSchedulerNode.used_buffer_names.<locals>.<setcomp>)	itertoolschainr8   rc   rb   rS   rJ   rJ   rK   used_buffer_names   s    z#BaseSchedulerNode.used_buffer_namesc                 C   s~   t  }dd t| jj| jjD }t|dkrz| }|| t	j
j|r$t	j
j|  D ]}||kr`|| q`q$|S )Nc                 S   s   g | ]
}|j qS rJ   r   r   rJ   rJ   rK   
<listcomp>   s   zBBaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>r   )rD   r   r   r8   rc   rb   lenpopaddr5   graphname_to_bufferr   r   append)rI   Z
used_namesdepsrw   aliasrJ   rJ   rK   r      s    
z.BaseSchedulerNode.used_or_aliased_buffer_namesc                    s    fdd j D  _ d S )Nc                    s   h | ]}|j  jjkr|qS rJ   )rj   r<   available_buffer_namesr   rS   rJ   rK   r      s   z/BaseSchedulerNode.prune_deps.<locals>.<setcomp>r9   rS   rJ   rS   rK   r      s    
zBaseSchedulerNode.prune_depsc                    s<   t tddd  fdd| jjD }| | j| d S )Nrv   c                 S   s   t | to| jtjjkS rr   )
isinstancer#   rj   r5   r   removed_buffers)rw   rJ   rJ   rK   should_prune   s    z7BaseSchedulerNode.prune_weak_deps.<locals>.should_prunec                    s   h | ]} |r|qS rJ   rJ   r   r   rJ   rK   r      s      z4BaseSchedulerNode.prune_weak_deps.<locals>.<setcomp>)r    r   r8   rc   rC   remove_reads)rI   Z	to_removerJ   r   rK   prune_weak_deps   s    z!BaseSchedulerNode.prune_weak_depsname_to_fused_noder>   c                 C   s   t | | d S rr   )_prune_redundant_deps)rI   r   rJ   rJ   rK   prune_redundant_deps   s    z&BaseSchedulerNode.prune_redundant_depsc                 C   s   | j d k	st| j  S rr   )r=   r   rR   rS   rJ   rJ   rK   rR      s    zBaseSchedulerNode.get_namec                 C   s   |   S rr   rR   rS   rJ   rJ   rK   get_first_name   s    z BaseSchedulerNode.get_first_namec                 C   s
   |   hS rr   r   rS   rJ   rJ   rK   	get_names   s    zBaseSchedulerNode.get_namesc                 C   s   | gS rr   rJ   rS   rJ   rJ   rK   	get_nodes   s    zBaseSchedulerNode.get_nodesc                 C   s   | j d k	st| j  S rr   )r=   r   
get_devicerS   rJ   rJ   rK   r      s    zBaseSchedulerNode.get_devicec                 C   s   dS r?   rJ   rS   rJ   rJ   rK   is_reduction   s    zBaseSchedulerNode.is_reductionc                 C   s   dS r?   rJ   rS   rJ   rJ   rK   is_split_scan   s    zBaseSchedulerNode.is_split_scanc                 C   s   dS r?   rJ   rS   rJ   rJ   rK   is_template   s    zBaseSchedulerNode.is_templatec                 C   s   dS r?   rJ   rS   rJ   rJ   rK   	is_extern   s    zBaseSchedulerNode.is_externc                 C   s   dS r?   rJ   rS   rJ   rJ   rK   
is_foreach   s    zBaseSchedulerNode.is_foreachread_depr>   c                 C   s   dS r?   rJ   rI   r   rJ   rJ   rK   can_inplace   s    zBaseSchedulerNode.can_inplacec                 C   s   dS r?   rJ   rS   rJ   rJ   rK   has_side_effects   s    z"BaseSchedulerNode.has_side_effectsc                    s
   j dk	st j  sdS t tfr@ j  s< j  r@dS t tfrtjrtt	j
tjjjjr~tt	j
dddk	rddlm} t jjdd d}|D ]`} jj|j}|rt	jj| rt|ts|jdk	st fdd	|jD }t|dkr|d
 j r|d
 j  kr|j dk	rt|j ! t"j#t"j$fst|j t"j%t"j&frft|j  d
ks||j | j krt't	j
drt	j
j()|*  *  tt	j
tjjjjrt	j
j+,|*  t	j
j+, *   j-.|*  |* t	j
j/ * <  qqdS )z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        N	mutationsr   )buffer_reuse_keyc                 S   s   | j S rr   r   xrJ   rJ   rK   <lambda>	      z9BaseSchedulerNode.decide_inplace_update.<locals>.<lambda>keyc                    s"   g | ]}|j   jjkr|qS rJ   )r=   rR   r<   r   r   r   rS   rJ   rK   r     s
   z;BaseSchedulerNode.decide_inplace_update.<locals>.<listcomp>r   args)0r=   r   should_allocater   SchedulerNoder   r   r   inplace_buffersr5   kerneltorch	_inductorcodegenZsimdZ
SIMDKernelr`   Zcodegen.wrapperr   sortedr8   rc   r<   name_to_noder   rj   r   wrapper_codeZ	can_reuseNopKernelSchedulerNoder@   r   r   
get_layoutr   r&   ZMutationLayoutSHOULDREMOVEZFallbackKernelr%   hasattrr   Zmake_inplacerR   r   r   rF   discardinplace_update_buffers)rI   r   Zordered_readsreadZ
input_nodeZremaining_usesrJ   rS   rK   decide_inplace_update   s    




 
  
z'BaseSchedulerNode.decide_inplace_updatec                 C   s   | j d k	st| j  sd S t| tfrP| j  s<| j  rPtjj	
| j  d S ttjdr|  tjjkrtjj	| jjtjj|    j | j  ntjj	
| j  d S )Nr   )r=   r   r   r   r   r   r   r5   r   r   Zcodegen_allocationr   r   rR   r   Zcodegen_inplace_reuser<   r   rS   rJ   rJ   rK   allocateG  s*    

zBaseSchedulerNode.allocatec                 C   sD   | j d k	stt| j jtjr"dS | jD ]}t|j tr( dS q(dS )NFT)r=   r   r   layoutr   Z
NoneLayoutr@   
OutputNode)rI   r   rJ   rJ   rK   can_free`  s    
zBaseSchedulerNode.can_freeT)buffer	only_oncer>   c           	      C   s  t js
d S |r| jrd S | jd k	s&t| jj}g }|D ]}|jdkrFq6|d |d d|j d|j }d|j	kr|d|j	d   }|| d|j	kr6|j	d  }|
d	d
 }|d|dddddd  |d |d q6t|dkrd S || d| _d S )Noutputrm   z#pragma CMT ORIGIN:z#pragma CMT  Zseq_nrz seq_nr:stack_trace|{z{{}z}}r_   \z#pragma CMT END ORIGINr   T)r   Zcomment_originrG   r=   r   originsopr   targetmetasplitreplacer   
writelines)	rI   r   r   r   Z	out_linesoZop_info_strr   Zstack_trace_last_linerJ   rJ   rK   codegen_originating_infoj  sH    






  

z*BaseSchedulerNode.codegen_originating_infoc                    s  t trdS t tr(t jtr(dS tjtdddt trjt	
 d t	
 d  ntdtt}jjjjB D ]}||j | qdd jjD }d	d jjD }ttt td
fddt trfdd|D }|| }|| }d}||B D ]}tfdd|| D }|tjjkrVtjj| }	n |tjjkrtjj| }	nqtttj tj!f  td fdd  |	}
|t"|
|t#|	$  7 }q|S )aM  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size
        r   )sr>   c                 S   s   t jjj| ddS )Nr   )fallback)r5   r   sizevarsZ	size_hint)r   rJ   rJ   rK   try_size_hint  s    zEBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.try_size_hintr       eAc                 S   s   h | ]
}|j qS rJ   r   r   rJ   rJ   rK   r     s     zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<setcomp>c                 S   s   h | ]
}|j qS rJ   r   r   rJ   rJ   rK   r     s     )bufsnodesr>   c                    s:   dd  j j|  jD }dd |D }t|t| dkS )Nc                 S   s   g | ]}|j s|qS rJ   is_weakr   userrJ   rJ   rK   r     s   z[BaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materialized.<locals>.<listcomp>c                 S   s   h | ]
}|j qS rJ   r=   r   rJ   rJ   rK   r     s     zZBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materialized.<locals>.<setcomp>r   )r<   r   r@   r   rD   )r   r   r@   Zbuf_usesrS   rJ   rK   is_materialized  s
    zGBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materializedc                    s   h | ]} |j s|qS rJ   r   r   )r   rI   rJ   rK   r     s     c                 3   s   | ]
} V  qd S rr   rJ   r   )
node_numelrJ   rK   	<genexpr>  s     zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<genexpr>)r   r>   c                    s   | sdS t | jtrnjj|   j}d}|D ]:}t |jtsBt	t |jjt
rb| |jj7 }q. dS q.|S t|  S d S Nr   )r   r   r&   r<   r   rR   r@   r=   r6   r   r%   r4   get_size)r   r@   Ztotr   )get_buf_elemsrI   r   rJ   rK   r    s    zEBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.get_buf_elems)%r   r   ExternKernelSchedulerNoder=   r%   sympyExprintr   r4   
get_rangescollectionsdefaultdictr~   r8   rc   rb   rj   r   strr	   r6   r   FusedSchedulerNodesumr5   r   r   graph_inputsr   r   r   BufferZ	TensorBoxminr.   	get_dtype)rI   Zbuf_accessesrw   rc   rb   r   Z
node_bytesbuf_nameZbuf_accessed_elemsr   Z	buf_elemsrJ   )r  r   r   rI   r   rK   get_read_write_buffers_sizes  sR    
 

	(z.BaseSchedulerNode.get_read_write_buffers_sizesc                    s  d}d}t | dr| jstt| ttfs8tdt| | jsBt| jd jsRdS | jd j }| jd j	 }n| j }| j	 }|j
dk	rt|j
jsdS t| jr | jdk	stzt| jW S  tk
 r } zt| W Y dS d}~X Y nX nt| jrdS zt }t|d }W n tk
r>   Y dS X t| trt| jtjsptdt| jtt| jddd}|dk	rdd	lm} dd
lm} tdd | jj D rdS | }	|dd}
t!"| jj# t!$|	 ddlm%   fdd| jj D }| jj&}|j'|f|| jj( d}|
) }| * }|| | d }|| }t+||W  5 Q R  W  5 Q R  W  5 Q R  W  5 Q R  S Q R X W 5 Q R X W 5 Q R X W 5 Q R X n&t| tst| jt,r| * | S dS )zB
        Returns estimated op runtime in nanoseconds (ns)
        Nr=   type(self)=r   l    J)type(self.node)=python_kernel_namerm   )FakeTensorMode)FlopCounterModec                 s   s"   | ]}t t| d kV  qdS r   N)r   r   	get_numelr   nrJ   rJ   rK   r  7  s   z:BaseSchedulerNode.get_estimated_runtime.<locals>.<genexpr>F)displayr   ir_node_to_tensorc                    s   g | ]} |d dqS )F)Zguard_shaperJ   )r   inputr  rJ   rK   r   H  s   z;BaseSchedulerNode.get_estimated_runtime.<locals>.<listcomp>g      ?r   )-r   r=   r   r  ForeachKernelSchedulerNoder   rP   r   r   r  devicer2   r1   r   
ValueErrorrf   rn   r3   r/   r-   re   r  r   ExternKernelkernel_name_to_opr   r`   Ztorch._subclasses.fake_tensorr  Ztorch.utils.flop_counterr  anyinputsr5   set_current_nodeZfx_nodeZset_fake_moder   	__class__Zprocess_kernelkwargsZget_total_flopsr  maxr$   )rI   r   ZdtypeeZgpu_memory_bandwidthZ	gpu_flopsr   r  r  Z	fake_modeZflop_counter_modeZfake_inputsclsfactorZcounted_flopsZcounted_bytesZcompute_timeZtransfer_timerJ   r  rK   get_estimated_runtime  s     



$ 


 
` z'BaseSchedulerNode.get_estimated_runtimec                 C   s   d S rr   rJ   rS   rJ   rJ   rK   get_template_nodea  s    z#BaseSchedulerNode.get_template_node)T)@rQ   
__module____qualname__r   r   r#  r  r  __annotations__r   
ReadWritesr
   r    r   r  rL   r  rT   rl   rd   ro   r   ru   ry   r   r   r   r	   r   r   r   r   rC   r   r   r   r   r   r   r   rR   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r0   r   r  r  floatr0  r   TemplateBufferr1  rJ   rJ   rJ   rK   r6   B   sf   
$
 
	
V  ,gdr6   c                   @   sb   e Zd ZU ddddgZeed< eedf ed< eedddZ	eed	d
ddZ
edddZd	S )	WhyNoFusenode1node2reasonr   .r9  r:  c                 C   s   || _ || _d S rr   r<  rI   r9  r:  rJ   rJ   rK   rL   l  s    zWhyNoFuse.__init__N)r;  r   r>   c                 G   s   || _ || _t|  d S rr   )r;  r   
fusion_logdebug)rI   r;  r   rJ   rJ   rK   __call__p  s    zWhyNoFuse.__call__rM   c                 C   s*   d| j   d| j  d| j| j  S )Nzcannot fuse z with rV   )r9  rR   r:  r;  r   rS   rJ   rJ   rK   __str__u  s    
zWhyNoFuse.__str__)rQ   r2  r3  	__slots__r  r4  r   r   r6   rL   r@  rA  rJ   rJ   rJ   rK   r8  e  s   
r8  )objr>   c                 C   sB   t | trt| td} tj| dd}d|kr>dt|d S |S )Nr      )indentr_       )r   rD   r   r  pprintra   textwraprE  )rC  r   rJ   rJ   rK   ra   {  s    
ra   c                   @   sN   e Zd ZeddddZedddZee ddd	Z	edd
dZ
e
ZdS )r   Nrv   c                 C   s   |h| _ g | _d S rr   )r9   rA   rx   rJ   rJ   rK   rL     s    zOutputNode.__init__rM   c                 C   s   dS r?   rJ   rS   rJ   rJ   rK   r     s    zOutputNode.is_reductionc                 C   s   dS )NrJ   rJ   rS   rJ   rJ   rK   r     s    z'OutputNode.get_inputs_that_alias_outputc                 C   s   dS )NZOUTPUTrJ   rS   rJ   rJ   rK   rR     s    zOutputNode.get_name)rQ   r2  r3  r"   rL   r   r   r	   r  r   rR   rT   rJ   rJ   rJ   rK   r     s
   r   )r=   r   r>   c                    s   t   jD ](}t|ts |j    d7  < qttd fddfddjD }|rj| _	j
| dS )am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r   rv   c                    s>   t | tr6 | j   dk}| j k}|p4|S dS d S )Nr   F)r   r#   rj   rR   )rw   Zis_redundantZis_self_dep)name_to_dep_countr   r=   rJ   rK   r     s    
z+_prune_redundant_deps.<locals>.should_prunec                    s   h | ]} |r|qS rJ   rJ   r   r   rJ   rK   r     s      z(_prune_redundant_deps.<locals>.<setcomp>N)r
  r   r9   r   r#   rj   rR   r    r   rC   r8   r   )r=   r   rw   Zdeps_to_prunerJ   )rI  r   r=   r   rK   r     s    

r   )zextern_kernels.convolutionzextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmc                   @   s6   e Zd ZedddZedddZedddZdS )	r  rM   c                 C   s   |    dt| jdd  S )Nz.node.kernel = r  )rR   r`   r=   rS   rJ   rJ   rK   rd     s    z)ExternKernelSchedulerNode.debug_str_extrac                 C   s   dS NTrJ   rS   rJ   rJ   rK   r     s    z#ExternKernelSchedulerNode.is_externc                 C   s$   | j d k	stt| j do"| j  S )Nr   )r=   r   r   r   rS   rJ   rJ   rK   r     s    z*ExternKernelSchedulerNode.has_side_effectsN)rQ   r2  r3  r  rd   r   r   r   rJ   rJ   rJ   rK   r    s   r  c                   @   s   e Zd ZdS )r   N)rQ   r2  r3  rJ   rJ   rJ   rK   r     s   r   c                       s  e Zd Zdeejejf dd fddZd(ee	e
eef ee f  ddddZe	e
eef ee f ddd	d
ZedddZeeej  dddZedddZedddZedddZeej dddZeej ddddZddddZeeej  e
ejejf dddZeeej  dddd Zejdd!d"Z ej!ed#d$d%Z"e#e$e dd&d'Z%  Z&S ))r   r:   Nr;   c                    s   t  || |   d S rr   )superrL   _compute_attrsrH   r*  rJ   rK   rL     s    zSchedulerNode.__init__)extra_indexing_constraintsr>   c                 C   s   t | jtjtjfst| jj|d\| _| _| j	
| j j}| j || jf| _t | jtjrx| | j  n | tj| jf| jddi d S )NrN  	normalizeT)r   r=   r   r$   r7  r   Zsimplify_and_reorder_sizes_bodyr<   get_backendr   group_fnr7   rC   Znormalized_read_writesr   extract_read_writes)rI   rN  rT  rJ   rJ   rK   rL    s"    zSchedulerNode._compute_attrsc                 C   s   | j |d d S )NrO  )rL  )rI   rN  rJ   rJ   rK   recompute_size_and_body  s    z%SchedulerNode.recompute_size_and_bodyrM   c                 C   s4  |   }| d| jd  | d| jd  | d| j g}| j D ]0}|j}tj|}|	| dt
|j  qH|  r|	| dt
|    |  r|	| dt
|    t| jtjr|	d	| d
 |	t| j d | jd k	s
tt| j r*|t|  d|S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = z.aliases = z.mutations = zclass z_loop_body:rF  r_   )rR   r7   rQ  r8   reads_and_writesrj   r5   r   
get_bufferr   ra   r   r   r   r   rR  r   LoopBodyrH  rE  rl   r=   r   	is_tritonr   extenddebug_triton_coderh   )rI   rj   rk   rw   r  r   rJ   rJ   rK   rd     s(    zSchedulerNode.debug_str_extrac                 C   s   | j S rr   )rQ  rS   rJ   rJ   rK   r	    s    zSchedulerNode.get_rangesc                 C   s6   t | jtjtjfs(tdt| jt| j S Nr  )	r   r=   r   r$   r7  r   rP   r   Zget_reduction_typerS   rJ   rJ   rK   r     s     
zSchedulerNode.is_reductionc                 C   sF   t | jtjtjfs(tdt| jt | jtjoDt | jjtjS r]  )	r   r=   r   r$   r7  r   rP   dataZ	SplitScanrS   rJ   rJ   rK   r     s     
 zSchedulerNode.is_split_scanc                 C   s   t | jtjS rr   r   r=   r   r7  rS   rJ   rJ   rK   r   !  s    zSchedulerNode.is_templatec                 C   s   t | jtjr| jS d S rr   r_  rS   rJ   rJ   rK   r1  $  s    zSchedulerNode.get_template_node)
index_varsr>   c                 G   s   |    |   | | d S rr   )r   mark_runr   )rI   r`  rJ   rJ   rK   run'  s    zSchedulerNode.runc                 C   s   |    d S rr   )r   rS   rJ   rJ   rK   ra  ,  s    zSchedulerNode.mark_runc                 C   sH   | j }ttt|ttt|ks&ttttj	|tj	|}|S rr   )
rQ  r  mapr   r   dictzipr   r   from_iterable)rI   r`  sizes
var_rangesrJ   rJ   rK   ranges_from_index_vars/  s     

z$SchedulerNode.ranges_from_index_varsc              
   C   sz   |  |}zFttt |( tj|  | j|  W 5 Q R X W 5 Q R X W n$ tk
rt   t	
d| j  Y nX d S )NzError in codegen for %s)ri  r5   Zset_ops_handlerr)   Zget_ops_handlerr   r)  rR  re   rf   fatalr=   )rI   r`  rh  rJ   rJ   rK   r   <  s    

"zSchedulerNode.codegenc                    s2   j \} ttj td fdd}t||S )zH
        Get the memory dependencies in the non-reduction axis.
        )indexr>   c                    s    | dd  D S )Nc                 S   s   g | ]}t d qS )r   )r  Integer)r   _rJ   rJ   rK   r   N  s     zCSchedulerNode.pointwise_read_writes.<locals>.fn.<locals>.<listcomp>)rR  )rk  Zreduction_sizesrI   rJ   rK   fnM  s    z/SchedulerNode.pointwise_read_writes.<locals>.fn)rQ  r	   r  Symbolr  r   rU  )rI   rg  ro  rJ   rn  rK   pointwise_read_writesG  s    
z#SchedulerNode.pointwise_read_writesr   c                 C   sz   |   s|  rdS t| jjdkrvt|tjrvtt	| jj}t|tjs^t
dt||j|jkot|j|jkS dS )NFr   ztype(write_dep)=)r   r   r   r8   rb   r   r   r!   nextiterr   rP   rk  size)rI   r   	write_deprJ   rJ   rK   r   R  s     zSchedulerNode.can_inplacec                 C   s   t  }t| jtjr| j D ]}|jdkr|jdkrd|jkrN|jd dksjt	|j
dkr|j
d dkr|d|jkr|jd nt	|j
dkr|j
d	 nd
 q|S )NZcall_methodstoremode
atomic_add   rD  rj      r   rm   )rD   r   rR  r   rY  r   r   r   r+  r   r   r   )rI   Zbuffers_store_as_atomic_addr=   rJ   rJ   rK   _get_atomic_add_buffers]  s*    z%SchedulerNode._get_atomic_add_buffers)N)'rQ   r2  r3  r   r   r$   r7  rL   r   r   r   r   r   rL  rV  r  rd   r	   r  r  r	  r   r   r   r   r1  rb  ra  ri  r   r   r5  rq  r    r   r*   r
   r{  __classcell__rJ   rJ   rM  rK   r     s8   
 r   c                       s  e Zd ZdZeeed dddZdee dddd	Ze	e
d
ddZe
d
ddZe	ee
 d
ddZe
d
ddZee
 ee
e
f dd fddZe	ee
 d
ddZe	ee
 d
ddZee d
ddZe
d
ddZe	ed
ddZe	ed
d d!Ze	ed
d"d#Ze	eej d
d$d%Zejd
d&d'Z e	ed
d(d)Z!e	e"e
 d
d*d+Z#ee
e
f dd,d-d.Z$e%dd/d0d1Z&e'd2 dd3d4d5Z(ee
 d
d6d7Z)e'e
 d
d8d9Z*e+j%ed:d;d<Z,dd
d=d>Z-ed
d?d@Z.e
d
dAdBZ/  Z0S )Cr  z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r9  r:  r>   c                 C   sX   |j |j kstt|ttfs"tt|ttfs4ttt| | }| |j |S rr   )	r<   r   r   r   r  r~   r   r   r   )r.  r9  r:  nodesrJ   rJ   rK   fusey  s
    zFusedSchedulerNode.fuser:   N)r<   r   r>   c                    s   | _ | _d  _g  _g  _g  _t|dd dj _tj	dd |D   _
 tjdd |D   fddtj	d	d |D  D  jj  _td
d  j D  _tdd  j D  _d S )Nc                 S   s   t |  S rr   )r  r   r   rJ   rJ   rK   r     r   z-FusedSchedulerNode.__init__.<locals>.<lambda>r   c                 S   s   g | ]}|j d k	r|j qS rr   )rE   r   rJ   rJ   rK   r     s     
 z/FusedSchedulerNode.__init__.<locals>.<listcomp>c                 S   s   g | ]
}|j qS rJ   )r8   r   rJ   rJ   rK   r     s     c                    s   h | ]}|j   kr|qS rJ   rj   r   r   rS   rJ   rK   r     s   z.FusedSchedulerNode.__init__.<locals>.<setcomp>c                 S   s   g | ]
}|j qS rJ   r   r   rJ   rJ   rK   r     s     c                 s   s   | ]}|j V  qd S rr   	min_orderr   rJ   rJ   rK   r    s     z.FusedSchedulerNode.__init__.<locals>.<genexpr>c                 s   s   | ]}|j V  qd S rr   )	max_orderr   rJ   rJ   rK   r    s     )r   r<   r=   r@   rA   rB   r,  r7   rD   unionrE   rC   r   r5  
merge_listr8   rb   r9   r  r  r  )rI   r<   r   rJ   rS   rK   rL     s(    
zFusedSchedulerNode.__init__rM   c                 C   s   d dd | jD S )Nrm  c                 S   s   g | ]}|  qS rJ   r   r   rJ   rJ   rK   r     s     z/FusedSchedulerNode.get_name.<locals>.<listcomp>)rh   r   rS   rJ   rJ   rK   rR     s    zFusedSchedulerNode.get_namec                 C   s   | j d  S r  )r   rR   rS   rJ   rJ   rK   r     s    z!FusedSchedulerNode.get_first_namec                 C   s   t jdd | jD  S )Nc                 S   s   g | ]}|  qS rJ   )r   r   rJ   rJ   rK   r     s     z0FusedSchedulerNode.get_names.<locals>.<listcomp>rD   r  r   rS   rJ   rJ   rK   r     s    zFusedSchedulerNode.get_namesc                    sf    fddt  jD } jd j}|d k	s0t| }t|rP|t  t	
d| dS )Nc                    s,   g | ]$\}}    d | d|  qS )z.snodes[z] =
)rR   rl   )r   ir=   rS   rJ   rK   r     s   z6FusedSchedulerNode.debug_str_extra.<locals>.<listcomp>r   r_   rF  )	enumerater   r=   r   r   r   rZ  r[  r\  rH  rE  rh   ri   )rI   rk   r=   r#  rJ   rS   rK   rd     s    

z"FusedSchedulerNode.debug_str_extrar   c                    s@   t  || t }t| jD ]}||| ||j qd S rr   )rK  r   rD   reversedr   updaterF   )rI   r   r   r=   rM  rJ   rK   r     s
    z!FusedSchedulerNode.set_last_usagec                 C   s   t jdd | jD  S )Nc                 S   s   g | ]}|  qS rJ   )r   r   rJ   rJ   rK   r     s     z8FusedSchedulerNode.used_buffer_names.<locals>.<listcomp>r  rS   rJ   rJ   rK   r     s    z$FusedSchedulerNode.used_buffer_namesc                 C   s   t jdd | jD  S )Nc                 S   s   g | ]}|  qS rJ   )r   r   rJ   rJ   rK   r     s     zCFusedSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>r  rS   rJ   rJ   rK   r     s    z/FusedSchedulerNode.used_or_aliased_buffer_namesc                 C   s   | j S rr   r   rS   rJ   rJ   rK   r     s    zFusedSchedulerNode.get_nodesc                 C   s   t | j d|   dS )Nz(nodes=rN   rO   rS   rJ   rJ   rK   rT     s    zFusedSchedulerNode.__repr__c                 C   s   t dd | jD S )Nc                 s   s   | ]}|  V  qd S rr   )r   r   rJ   rJ   rK   r    s     z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>r'  r   rS   rJ   rJ   rK   r     s    zFusedSchedulerNode.is_reductionc                 C   s   t dd | jD S )Nc                 s   s   | ]}|  V  qd S rr   )r   r   rJ   rJ   rK   r    s     z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>r  rS   rJ   rJ   rK   r     s    z FusedSchedulerNode.is_split_scanc                 C   s   t dd | jD S )Nc                 s   s   | ]}|  V  qd S rr   )r   r   rJ   rJ   rK   r    s     z1FusedSchedulerNode.is_template.<locals>.<genexpr>r  rS   rJ   rJ   rK   r     s    zFusedSchedulerNode.is_templatec                 C   s$   | j D ]}| r|   S qd S rr   )r   r   r1  rI   r=   rJ   rJ   rK   r1    s    
z$FusedSchedulerNode.get_template_nodec                 C   s
   | j d S r  )r7   rS   rJ   rJ   rK   r     s    zFusedSchedulerNode.get_devicec                 C   s   t dd | jD S )Nc                 s   s   | ]}|  V  qd S rr   )r   r   rJ   rJ   rK   r    s     z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>r  rS   rJ   rJ   rK   r     s    z+FusedSchedulerNode.has_aliasing_or_mutationc                 C   s&   t  }| jD ]}||  q|S rr   )r
  r   r   r  r   )rI   r   r=   rJ   rJ   rK   r     s    
zFusedSchedulerNode.op_countsrp   c                 C   s   t d S rr   NotImplementedErrorrt   rJ   rJ   rK   ru     s    z'FusedSchedulerNode.update_mutated_namesrj   r>   c                 C   s   t d S rr   r  rI   rj   rJ   rJ   rK   ry     s    zFusedSchedulerNode.add_fake_deprz   r{   c                 C   s   t d S rr   r  )rI   r@   rJ   rJ   rK   r     s    zFusedSchedulerNode.set_usersc                 C   s   t d S rr   r  rS   rJ   rJ   rK   r     s    zFusedSchedulerNode.get_aliasesc                 C   s   t d S rr   r  rS   rJ   rJ   rK   r      s    z FusedSchedulerNode.get_mutationsr   c                 C   s   t d S rr   r  r   rJ   rJ   rK   r     s    zFusedSchedulerNode.can_inplacec                 C   s   t d S rr   r  rS   rJ   rJ   rK   r     s    zFusedSchedulerNode.allocatec                 C   s   t d S rr   r  rS   rJ   rJ   rK   r   	  s    zFusedSchedulerNode.can_freec                 C   s   |   }ddd | jD }| dt| j d| d| dt| jj | dt| j | d	t| jj	| j  | d
| j
 g}z||  g7 }W n" tk
r   tjddd Y nX d| S )rU   ,c                 s   s   | ]}t |jV  qd S rr   )rP   rQ   r  rJ   rJ   rK   r    s     z/FusedSchedulerNode.debug_str.<locals>.<genexpr>rV   rW   rN   rX   rY   rZ   r[   r\   Tr]   r_   )rR   rh   r   rP   rQ   ra   r8   rb   r9   rc   r@   rd   re   rf   rg   ri   )rI   rj   Znode_typestrrk   rJ   rJ   rK   rl     s    
zFusedSchedulerNode.debug_str)1rQ   r2  r3  __doc__classmethodr6   r  r	   rL   r*   r  rR   r   r
   r   rd   r   r   r   r   r   rT   r   r   r   r   r   r   r7  r1  r   r#  r   r   r   r   ru   r    ry   r   r   r   r   r   r   r   r   rl   r|  rJ   rJ   rM  rK   r  r  s`    
  
r  c                       s  e Zd ZdZeee dddZeee dddZeeee	dd	d
Z
eeed dddZd$dee ee ee dd fddZddddZddddZe	dddZe	dddZee dddZee dddZeddd Zeeef dd!d"d#Z  ZS )%r"  z{Scheduler node which consists of a list of scheduler nodes that each operate on a
    distinct tensor in a list of tensors.)producerr>   c                 C   s    |  | jkr| j|   S d S rr   )rR   read_to_node)rI   r  rJ   rJ   rK   get_consumer_subnode_for%  s    z3ForeachKernelSchedulerNode.get_consumer_subnode_for)consumerr>   c                 C   s.   |j jD ] }|j| jkr| j|j   S qd S rr   )r8   rc   rj   r   )rI   r  rdrJ   rJ   rK   get_producer_subnode_for-  s    z3ForeachKernelSchedulerNode.get_producer_subnode_for)r  r  r>   c                    s.  t  |}  rv| rvtt  tt|}t jt|jk}|sR|d |ott fddt j|jD S | rʈ 	 r|d dS tt|}|
 }|d k	r|j |S |d dS   r"|	 r|d dS tt   |}|d k	r j||S |d dS td	d S )
Nzforeach do not have same lengthc                 3   s    | ]\}} j ||V  qd S rr   )r<   can_fuser   lrr  rJ   rK   r  ?  s   z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r8  r   typingcastr"  r   r   allre  r   r  r<   r  r  r   )r.  r  r  whyZforeach_matchconsumer_subnodeproducer_subnoderJ   r  rK   r  6  sJ    




z#ForeachKernelSchedulerNode.can_fusec           
      C   s<  |  s|  std }d }|  r^|  r^tt|}tt|}dd t|j|jD }n|  rtt|}||}g }|}d }|jD ]2}||krt	||}|}|
| q|
| qnh|  r,tt|}||}	g }|}d }|jD ]4}||	kr t	||}|}|
| q|
| q| |j|||S )Nc                 S   s   g | ]\}}t ||qS rJ   )r  r  r  rJ   rJ   rK   r   p  s   z3ForeachKernelSchedulerNode.fuse.<locals>.<listcomp>)r   r   r  r  r"  re  r   r  r  r  r   r  r<   )
r.  r  r  prev_node_1prev_node_2fused_nodesr  r=   new_noder  rJ   rJ   rK   r  e  sD    





zForeachKernelSchedulerNode.fuseNr:   )r<   r~  r  r  r>   c           
         s  i  _ i  _|d ks|d krht || |D ]6}|jjD ]}| j |j< q:| D ]}| j|< qTq.n| _| _	d  _
g  _ tj|j|jg  fddt|j|jD  jj  _t|j|jg _t|j|jg _| r
t|tst|| }}	nt|tst|| }}	|j _ j|	j |j _|	 D ]}|	 j|< qJ|d  t dfff _!t  _"d S )Nc                    s   h | ]}|j   kr|qS rJ   r  r   rS   rJ   rK   r     s   z6ForeachKernelSchedulerNode.__init__.<locals>.<setcomp>r   Zforeach)#r  r   rK  rL   r8   rc   rj   r   r<   r   r=   r@   rC   r   r5  r  rD   r  r9   rb   r  r  r,  r  r   r   r"  r   rE   r  r   r  r  r7   r   )
rI   r<   r~  r  r  r=   r   rj   Zforeach_node
other_noderM  rS   rK   rL     sP    

 

z#ForeachKernelSchedulerNode.__init__rM   c                 C   s   t d S rr   r  rS   rJ   rJ   rK   ra    s    z#ForeachKernelSchedulerNode.mark_runc                 C   s<   t | jtjs"tdt| j| j | j   d S r]  )r   r=   r   r$   r   rP   Zget_store_functionZmake_loaderrS   rJ   rJ   rK   r     s    "z"ForeachKernelSchedulerNode.codegenc                 C   s   t d S rr   r  rS   rJ   rJ   rK   r     s    z#ForeachKernelSchedulerNode.can_freec                 C   s   dS rJ  rJ   rS   rJ   rJ   rK   r     s    z%ForeachKernelSchedulerNode.is_foreachc                 C   s
   t | jS )zReturns a list of nodes which comprise the foreach kernel, operating on corresponding elements of our input lists.
        These nodes may be vertically fused.)r~   r   rS   rJ   rJ   rK   get_subkernel_nodes  s    z.ForeachKernelSchedulerNode.get_subkernel_nodesc                 C   s   t tjdd | jD S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c                 s   s   | ]}|  V  qd S rr   )r   r   rJ   rJ   rK   r    s     z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>)r~   r   r   rf  r   rS   rJ   rJ   rK   r     s    z$ForeachKernelSchedulerNode.get_nodesc                 C   s   | j d  S r  )r   r   rS   rJ   rJ   rK   r     s    z)ForeachKernelSchedulerNode.get_first_namer   c                 C   s$   t | | | jD ]}|| qd S rr   )r   r   r   )rI   r   r=   rJ   rJ   rK   r     s    

z/ForeachKernelSchedulerNode.prune_redundant_deps)NN)rQ   r2  r3  r  r6   r   r  r  r  r   r  r  r	   rL   ra  r   r   r   r   r  r   r  r   r   r   r|  rJ   rJ   rM  rK   r"  !  sB   		. 1  <
r"  rJ   .)stride_lengthsrg  priority_idxr>   c                    sj   t jtttd fdd}ttttd }t|dkrTfdd|D tjrf|j	|d |S )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    )abr>   c                    s     dks dkr2t   dk dkS  fddD }fddD }tdd t||D }tdd t||D }||krdS ||krdS t  S )	Nr   c                    s   g | ]}t |  qS rJ   absr   sl)r  rJ   rK   r     s     z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>c                    s   g | ]}t |  qS rJ   r  r  )r  rJ   rK   r     s     c                 s   s"   | ]\}}|d kp||k V  qdS r  rJ   r   Zsl_aZsl_brJ   rJ   rK   r  	  s    z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>c                 s   s"   | ]\}}|d kp||k V  qdS r  rJ   r  rJ   rJ   rK   r    s    r   )r+   r  re  )r  r  Zstride_len_aZstride_len_bZa_firstZb_firstrg  r  )r  r  rK   	index_cmp  s    z"pick_loop_order.<locals>.index_cmpr   c                    s   g | ]} | qS rJ   rJ   )r   pi)r  rJ   rK   r     s     z#pick_loop_order.<locals>.<listcomp>r   )
	functools
cmp_to_keyr  r~   r  ranger   r   Zpick_loop_orderssort)r  rg  r  r  orderrJ   r  rK   pick_loop_order  s    
r  c                   @   sr   e Zd ZU eeef ed< dZeed< dZ	eed< e
dddZeedd	d
ZedddZd d dddZdS )rz   r=   Fr   r   rM   c                 C   s   t | j | j| jfS rr   )hashr=   rR   r   r   rS   rJ   rJ   rK   __hash__)  s    zNodeUser.__hash__otherr>   c                 C   s2   t |to0|  | ko0| j|jko0| j|jkS rr   )r   rz   rR   r   r   rI   r  rJ   rJ   rK   __eq__,  s    


zNodeUser.__eq__c                 C   s
   | j  S rr   )r=   rR   rS   rJ   rJ   rK   rR   4  s    zNodeUser.get_namec                 C   s.   | j |j kstt| j | jo |j| jo*|jS rr   )r=   r   rz   r   r   r  rJ   rJ   rK   r}   7  s    

zNodeUser.mergeN)rQ   r2  r3  r   r6   r   r4  r   r   r   r  r  objectr  r  rR   r}   rJ   rJ   rJ   rK   rz      s   
rz   c                       s  e Zd ZU eeef ed< eee	j
 dd fddZejdddZddd	d
ZeddddZe	j
edddZddddZddddZddddZddddZddddZddddZddddZee eeef ddd Zddd!d"Zeee d#d$d%Z!ddd&d'Z"ddd(d)Z#eeeef  dd*d+Z$eee d#d,d-Z%eee d#d.d/Z&eeeed0f ed1d2d3Z'eee d#d4d5Z(eee d#d6d7Z)ee*e d8d9d:Z+eeee e eef d#d;d<Z,eed=d>d?Z-eeed#d@dAZ.eeeef  eeeef  dBdCdDZ/eeef ee e eef ddEdFZ0dddGdHZ1dddIdJZ2dddKdLZ3eddMdNdOZ4eddMdPdQZ5dddRdSZ6e7ddTdUdVZ8ejdWdXdYdZZ9ejdWdXd[d\Z:eddd]d^Z;eddd_d`Z<ee	j=dadbdcZ>  Z?S )dr:   _Scheduler__dep_size_hint_cacheNr~  r>   c                    s  t    i  _ tj_i  _tt _	tjj
 tjj tjj  _ fdd|D  _ jtjj   jD ]}|  qzdd  jD  _t  _i  _i  _         tjrt j    t j t! j7  _ tj"# j t! j _$dd  jD  _ %     t&  _' (   )  tjrf *  t+ j _ ,  tj"- j tj". j  /  d  _0t&  _1i  _2t3d4 fdd d S )	Nc                    s   g | ]}  |qS rJ   )create_scheduler_noder  rS   rJ   rK   r   T  s     z&Scheduler.__init__.<locals>.<listcomp>c                 S   s   i | ]}|  |qS rJ   r   r  rJ   rJ   rK   
<dictcomp>[  s     z&Scheduler.__init__.<locals>.<dictcomp>c                 S   s   i | ]}|  |qS rJ   r   r  rJ   rJ   rK   r  }  s      Zgraph_statsc                      s    j  jt jdS )N)graph_idZnum_nodes_before_fusionZnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr   r~  rJ   rS   rJ   rK   r     s    z$Scheduler.__init__.<locals>.<lambda>)5rK  rL   r  r5   r   r<   backendsrr  _post_grad_graph_counterr  r  keys	constantsZtorchbind_constantsr   r~  r  r   r   rd  r   r   mutation_renamescompute_dependenciestopological_sort_scheduledead_node_eliminationr   Z reorder_for_compute_comm_overlapr   Zdecide_global_ordering_of_commscompute_ancestorsr   Zir_nodes_pre_fusionr   r?  Zir_pre_fusionr  create_foreach_nodesrD   logged_slow_fusion
fuse_nodesfinalize_multi_template_bufferscompute_node_usersZ$reorder_compute_and_comm_for_overlapcompute_last_usageZir_post_fusionZgraph_diagramdebug_draw_graphcurrent_devicebuffer_names_to_freeorigin_to_indexr   add_row)rI   r~  r=   rM  rS   rK   rL   F  s`    






	

zScheduler.__init__rM   c                 C   s   | j  }r|S tdd S )NzNo current device)r  RuntimeErrorrI   r#  rJ   rJ   rK   get_current_device_or_throw  s    
z%Scheduler.get_current_device_or_throwc                 C   s0   t jdddkr,ddlm} || jdd dS )z,Generate an image of the graph for debuggingZINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)Zprint_graph)osenvironr   r?  r  r~  )rI   r  rJ   rJ   rK   r    s    zScheduler.debug_draw_graph)labelr>   c                 C   s0   t tjr,t d| | jD ]}|  qd S )Nz%s:)rf   isEnabledForloggingINFOrn   r~  ro   )rI   r  r=   rJ   rJ   rK   debug_print_nodes  s    
zScheduler.debug_print_nodesr=   r>   c                 C   sb   |j d k	std| r$t| |S t|tjtjfr@t| |S t|tj	rVt
| |S t|d S )Nz2All nodes passed to scheduling must have an origin)r   r   Zis_no_opr   r   r   r$   r7  r   r%  r  r  r  rJ   rJ   rK   r    s    


zScheduler.create_scheduler_nodec                    s   t  g }j  tjj D ]b} fdd|D }|s>q | fdd|D }t|}|	| |D ]}|j|< qrq fddj
D t| _
d S )Nc                    s(   g | ] }| krt j| ts|qS rJ   )r   r   r   r   rj   )kept_node_namesrI   rJ   rK   r     s   z2Scheduler.create_foreach_nodes.<locals>.<listcomp>c                    s   g | ]} j | qS rJ   )r   r  rS   rJ   rK   r     s     c                    s   g | ]}|   kr|qS rJ   r   )r   r=   )removed_node_namesrJ   rK   r     s     )rD   r   r  r5   r   listsr   r  r"  r   r~  r~   )rI   Zfe_nodesnamesr   Zfe_noderj   rJ   )r  r  rI   rK   r    s(    




zScheduler.create_foreach_nodesc                    s  t dG  fdddt  t jD ]}| }| D ]~}|kr|kr| }| }|| } D ]$}| |ks| |krz||< qzqF|kr| |< qF| |< qFq2ttdfddtt	t dfdd	dtt
ttf ttddfdd}i }	tjj D ].\}
}t|tjr.|jD ]}d|	|< qJq.jD ]*}td|j |jdk	stt|j dd d}|D ].}t|tjst||	kr| |	|< qt|j dd d}|D ]B}||	kst| d|	 |	|  }dk	r|t| qt |j!j"dkrlt#t$|j!j" }rlt|t%rl|j&}nd}t |' dkst|' D ]z}|}||| |t||d | jD ]D}| }| }||kr|t(| |||dd qq|j!j)D ]&}t|t(}||j*||+|| q|,j- |' D ]>}| j-|< | j-|< j./||j.| < qNqdtj0 D ]$}td| ||tt| qtjj1D ]h}| D ]X}||	kst| d|	  |	|  }dk	rtd|| ||tt| q֐qʈj-D ]V}
|
tjjkrn||
tt|
 tjj23|
 n |
tjj4kr:||
tt|
 q:dd t5tjj D fddtjj2D tj_6jD ]}|7|  j q̈jD ]"}|j8D ]}|jj9:| qqdS ) zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        Tc                       sX   e Zd ZdZdee  ee  ddddZddddZd	d	d
 fddZ	dS )z1Scheduler.compute_dependencies.<locals>.DedupListag  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a set/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            N)items
membershipr>   c                 S   s   |pt  | _|pt | _d S rr   )r~   r  rD   r  )rI   r  r  rJ   rJ   rK   rL     s    z:Scheduler.compute_dependencies.<locals>.DedupList.__init__)	node_userr>   c                 S   s*   || j krd S | j| | j | d S rr   )r  r  r   r   )rI   r  rJ   rJ   rK   r     s    
z8Scheduler.compute_dependencies.<locals>.DedupList.appendzDedupList[T]r  c                    s4   t  j|j} j fdd|jD  }||S )Nc                    s   g | ]}| j kr|qS rJ   )r  r   rS   rJ   rK   r     s    
 zMScheduler.compute_dependencies.<locals>.DedupList.__add__.<locals>.<listcomp>)rD   r  r  r  )rI   r  Znew_membershipZ	new_items)	DedupListrS   rK   __add__  s
    z9Scheduler.compute_dependencies.<locals>.DedupList.__add__)NN)
rQ   r2  r3  r  r   r   r
   rL   r   r  rJ   )r  r  rJ   rK   r    s     

r  r  r>   c                    s   | j kr j |  S | S rr   )r  r  )rs   rI   rJ   rK   rs     s    
z.Scheduler.compute_dependencies.<locals>.rename)	node_namer>   c                    s~   | h}j |  }tt|jj}|jjD ]P}|jj kr(t|tj	r(t|tj	r(|j
|j
kr(|j|jkr(| |j q(|S rr   )r   rr  rs  r8   rb   rc   rj   r   r   r!   rk  rt  r  )r  Zreachable_namesr=   ru  r   )dep_closurerI   rJ   rK   r    s     





z3Scheduler.compute_dependencies.<locals>.dep_closureFN)used_by_name	user_noder   r   r>   c                    s    |   t||| d S rr   )r   rz   )r  r  r   r   )name_to_usersrs   rJ   rK   add_user,  s    
z0Scheduler.compute_dependencies.<locals>.add_userzscheduling %sc                 S   s   | j S rr   r   r   rJ   rJ   rK   r   G  r   z0Scheduler.compute_dependencies.<locals>.<lambda>r   c                 S   s   | j S rr   r   r   rJ   rJ   rK   r   R  r   z not in r   )rw  Tr   zscheduling output %sz+scheduling output %s for unbacked symint %sc                 S   s   i | ]\}}||qS rJ   rJ   )r   rk  rj   rJ   rJ   rK   r    s     z2Scheduler.compute_dependencies.<locals>.<dictcomp>c                    s   g | ]} | qS rJ   rJ   r  )	inp_namesrJ   rK   r     s    z2Scheduler.compute_dependencies.<locals>.<listcomp>)FF);r   r   r
  r  r~  rR   r   r  r  r
   r   r6   r   r   r5   r   r  r  r   r  r  Zfree_symbolsrf   r?  r=   r   r   Zget_unbacked_symbol_defsrp  Zget_unbacked_symbol_usesry   r"   r   r8   rb   rr  rs  r!   rw  r   r#   rc   rj   r   ru   r  r   r   get_output_namesZgraph_outputsZmutated_inputsr   r  r  Zmutated_input_idxsr   r@   rA   r   )rI   r9  
node1_name
node2_nameZlist1Zlist2combinedr   r  Zunbacked_symbol_to_origin_noderj   valfsr=   Zunbacked_symbol_defsr   Zunbacked_symbol_usesr  rw   Z	node_modeZalt_namer  Z
other_nameZknown_dep_node_namesr   r   r  r   rJ   )r  r  r  r  r  rs   rI   rK   r    s    
  


 
 


   





zScheduler.compute_dependenciesc           
      C   s   i }| j D ]2}t|tr0|jD ]}||| < q||| < q
| j D ]}g |_g |_qD| j D ]<}g }|jD ]&}|j|ks|t	||j }|
| qj||_q\i }| j D ]"}|jD ]}||g 
| qq| D ]\}}	|	|_qd S rr   )r~  r   r  r   rR   rB   rA   r9   rj   r   r   
setdefaultr  )
rI   Zbuf_to_snoder=   r   rA   rw   Zdep_nodeZnode_to_usersZinverse_userr@   rJ   rJ   rK   r    s,    








zScheduler.compute_node_usersc                    s   d}|rg }| j D ]h}ttddd |  oFt fdd|jD }|sX|| qtd|	  t
jj|	  qt| j t|k}|| _ q| j D ]}|  qdS )	z0
        Remove any nodes without users
        T)r   r>   c                 S   s   | j p|  tjjkS rr   )r   rR   r5   r   r   )r   rJ   rJ   rK   can_eliminate_user  s    z;Scheduler.dead_node_elimination.<locals>.can_eliminate_userc                 3   s   | ]} |V  qd S rr   rJ   )r   ur  rJ   rK   r    s    z2Scheduler.dead_node_elimination.<locals>.<genexpr>zremoved dead node: %sN)r~  rz   r   r   r  r@   r   rf   r?  rR   r5   r   r   r   r   r   )rI   ZagainZupdated_nodesr=   Zcan_eliminaterJ   r	  rK   r    s     

zScheduler.dead_node_eliminationc                    sj   t  t  g tdd fdd| jD ]}| D ]}| |< q<q0| jD ]}| qR| _dS )zD
        Ensure self.nodes is in topologically sorted order
        Nr  c                    sF   | krB |  t| jdd dD ]} |j  q$|  d S )Nc                 S   s   | j S rr   r   )drJ   rJ   rK   r     r   zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>r   )r   r   r9   rj   r   )r  rw   r   r   seenvisitrJ   rK   r    s
    
z2Scheduler.topological_sort_schedule.<locals>.visit)rD   rd  r6   r~  r   )rI   r=   rj   rJ   r  rK   r    s    


z#Scheduler.topological_sort_schedulec                 C   sr   i }| j D ]B}t }|jD ]}||j |||j O }q||| < ||_q
t| j D ]\}}||_||_	qXdS )z.
        Populate each node.ancestors
        N)
r~  rD   r9   r   rj   rR   rE   r  r  r  )rI   Zname_to_ancestorsr=   rE   rw   r  rJ   rJ   rK   r    s    

zScheduler.compute_ancestorsc                 C   sx   t dD ]j}t| j}td|d | |   t| j}td|d || ||ks^|dkrtd|d   qtqdS )zO
        Mutates self.nodes to combine nodes into FusedSchedulerNodes.
        
   z/===== attempting fusion (%d/10): %d nodes =====r   z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)r  r   r~  r>  r?  fuse_nodes_once)rI   r  Zold_lenZnew_lenrJ   rJ   rK   r    s$    

zScheduler.fuse_nodesc                 C   s6   t |dkst|d  }|| _| |}||S )
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   )r   r   r   r  rS  benchmark_fused_nodes)rI   r~  r#  backendrJ   rJ   rK   r  -  s
    
zScheduler.benchmark_fused_nodesc                 C   s8  t jt jd ddd}t| jD ]\}}t|tr t|jt jr |j}| \}}t|t	j
j jrr|j| q | }|j}t|t jst|j}	t|	t jst|j|	_|||	 | |	}
|
| j|< |
| j| < |
| j| < |j|
_|j|
_|j|
_|j|
_|
jD ]"}|jj| |jj|
 qq d S )N)	orig_noder  r>   c                 S   sn   |j }|  }t|tr"t|ts&ttjj|= ||_ tjj	| }tjj
| |tjj|< |tjj|< d S rr   )rj   rR   r   r  r   r5   r   r   buffersrk  remove)r  r  Zreplaced_name	orig_nameorigrJ   rJ   rK   replace_buffer;  s    
zAScheduler.finalize_multi_template_buffers.<locals>.replace_buffer)r   MultiTemplateBufferr  r  r~  r   r   r=   get_min_choicer   r   TritonTemplateCallerBasefinalize_as_triton_callerZoutput_noder^  
StorageBoxr   r   r  r   rR   r   r@   r  r  rF   rA   r  r   )rI   r  r  r=   
multi_nodeZmin_node_unfusedrm  Zout_tensorboxZout_storageZ
out_bufferZnew_scheduler_noder   rJ   rJ   rK   r  :  sF      



z)Scheduler.finalize_multi_template_buffersr}  c              
      s    ot tj}tjs&|s&dS   r>t tjrN sN rRdS 	 }|d 
 }|jdkrtdS 	 }tt||}tdd |D rdS ddlm} t}	tttddfd	d
}
ttrtjtjrj}|j}| \} | |\td}d}d}| D ]\}}t|tjjjsJq,|  kr\q,|d7 }|tjkrv qj|& | |\}|k r}|}W 5 Q R X q,|
|  |  k r|dk	rj| dS dS nz|| |\ t ! r|	d W dS | |\t !r>|	d W dS | |\t !rf|	d W dS W n> |k
r } zdt"|krW Y 
dS  W 5 d}~X Y nX |
  t#dr  krf| j$kr| j$%f t&d' fdd   k S )z
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        Tr   cpuc                 s   s@   | ]8}t |jd o6|jdk	o6t |jjdo6|jjjdkV  qdS )r^  Nscatter_moderx  )r   r=   r^  r   r  rJ   rJ   rK   r    s   z.Scheduler.speedup_by_fusion.<locals>.<genexpr>)CompilationErrorN)ms_fusedms1ms2r>   c              	      sn   t tjrj| || k rBt d   t|| |  d n(t d   t| ||  d d S )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedupz.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r>  r  r  DEBUGr?  r   r'   r(   )r"  r#  r$  r<  rJ   rK   
log_fusion  s    z/Scheduler.speedup_by_fusion.<locals>.log_fusioninfr   Fz%register spilling of the first kernelz&register spilling of the second kernelz%register spilling of the fused kernelzLoop-carried variableZslow_fusionc                	      s       dS )N)Zkernel1_pathZkernel1_latencyZkernel2_pathZkernel2_latencyZfused_kernel_pathZfused_kernel_latencyZslow_down_ratiorJ   rJ   )r#  r$  r"  path1path2
path_fusedrJ   rK   r     s    
z-Scheduler.speedup_by_fusion.<locals>.<lambda>)(r   r   r1  r   r  r   Zbenchmark_fusionZTritonTemplateBufferr   r   r   rP   r~   r   r   r'  Ztriton.compiler.errorsr!  r8  r6  r   r=   choice_timingsr  r  r  r   r   r  Z max_epilogue_benchmarked_choicesZswap_as_triton_callerr  mathisinfr  r   r  r   r   r  )rI   r9  r:  Zis_multi_templateZnode_list_1r#  Znode_list_2Znode_list_fusedr!  r  r&  r  r+  rm  Zmin_ms_fusedZms_fused_choiceZtriton_choiceschoiceZunfused_timer-  rJ   )r#  r$  r"  r9  r:  r(  r)  r*  rK   speedup_by_fusionn  s    
 


 



zScheduler.speedup_by_fusionc                    s   t | j}|  D ]\}}| j|  }| j|  }| ||r| ||s| ||s\qt	d|
 |
  | }| ||| || || |  | j fdd  D  qt|dd d| _|   |   dS )a  
        Mutates self.nodes to combine nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfusing %s with %sc                    s   i | ]}|   qS rJ   r   r  Znode3rJ   rK   r     s      z-Scheduler.fuse_nodes_once.<locals>.<dictcomp>c                 S   s   | j S rr   r  r   rJ   rJ   rK   r   "  r   z+Scheduler.fuse_nodes_once.<locals>.<lambda>r   N)rD   r~  get_possible_fusionsr   r   r  will_fusion_create_cycler/  r>  r?  rR   r   rS  r  r  r   r  r   r   r  r   )rI   r  r9  r:  r#  rJ   r0  rK   r    s4    
   


zScheduler.fuse_nodes_oncec                 C   s   | j D ]}|| j qd S rr   )r~  r   r   r  rJ   rJ   rK   r   &  s    
zScheduler.prune_redundant_depsc                    s   g  t  tt dd fdd}tt}jD ] }| D ]}|| | qBq6|	 D ]}|| q`t
jrtt}jD ]"}t|dd}|r|| | q|	 D ]}|| q   jjdd tdt   S )	z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        Nr  c                    s   t | D ]|\}}| |d d  D ]b}||f}|kr6q | ||rX | q | sh| r ||r  ||f q qd S )Nr   )r  r   r  r   r   r   )r~  Znode1_indexr9  r:  r   possible_fusionsr  rI   rJ   rK   check_all_pairs1  s    
 z7Scheduler.get_possible_fusions.<locals>.check_all_pairsr7   T)r   reversezfound %d possible fusions)rD   r   r6   r
  r  r~   r~  r   r   r   r   aggressive_fusionr`   *get_possible_fusions_with_highest_priorityr  score_fusion_keyr>  r?  r   )rI   r5  Zbuffer_names_groupingr=   r   Znode_groupingZgroup_groupingr7   rJ   r3  rK   r1  *  s.    





zScheduler.get_possible_fusionsc                    sp   t  ttd fdd| | B |j|jB   tfdd D }|rlt||d |S )z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        r  c                    s^   t | trZ| krZ|  |   r.dS t| j@ pXtfdd| j  D S dS )NFc                 3   s   | ]} j | V  qd S rr   r   r  
found_pathrI   rJ   rK   r  s  s   zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>)r   r  r   r   issubsetr   rE   r'  r   Zcombined_ancestorsZcombined_namesr<  rI   visitedrJ   rK   r<  b  s    

z6Scheduler.will_fusion_create_cycle.<locals>.found_pathc                 3   s   | ]} j | V  qd S rr   r:  r  r;  rJ   rK   r  {  s     z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>zwill create cycle)rD   r6   r   r   rE   r'  r8  )rI   r9  r:  cyclerJ   r>  rK   r2  X  s    z"Scheduler.will_fusion_create_cyclec                 C   s*   t t|j|j t|j|j }|dkS )aB  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heurisitic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r,  r  r  r  )rI   r9  r:  proximity_scorerJ   rJ   rK   can_fusion_increase_peak_memory  s
    z)Scheduler.can_fusion_increase_peak_memory.)r9  r:  common_buf_namesr>   c                 C   sL  i }dd |j  D }dd |j  D }|D ]}tj|}|| }	|| }
|	 |
 krd|	  d|
  ||< q0t|	jt|
jkrd||< q0t|	t	rt|
t	sdt
|	 dt
|
 ||< q0|	 }|
 }||krd| d| ||< q0|	 |
 kr&d	|	 d|
 ||< q0d
|	 d|
 d|j ||< q0t|S )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        c                 S   s   i | ]}|j |qS rJ   r   r   rJ   rJ   rK   r    s      z7Scheduler.decide_fusion_fail_reason.<locals>.<dictcomp>c                 S   s   i | ]}|j |qS rJ   r   r   rJ   rJ   rK   r    s      zdifferent numel: z v.s. 	broadcastznot MemoryDep: zdifferent offset: zMismatch loop orders: zUnknown reason: z
. Layout: )r8   rW  r5   r   rX  r  r4   rt  r   r!   rP   Z
get_offsetZnormalize_with_stride_orderr   r  )rI   r9  r:  rD  reasonsZnode1_name2depZnode2_name2depr  r   Zlhs_depZrhs_depZlhs_offZrhs_offrJ   rJ   rK   decide_fusion_fail_reason  sL    

z#Scheduler.decide_fusion_fail_reasonc                    s  krdS t }tttfr8 s8|d dS tttfrZ sZ|d dS  j@ rt|d dS  r|d dS  r s st	j
s|d dS  } }||kr|d|| dS ~dk}|rnt	jr s rntd	rbj j @  t dkrbtd	 fd
d |d dS |d dS  s st t  t	jkr|d dS  j@ rsdS |S r|d dS |S dS )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        Fznode1 is extern or nopznode2 is extern or nopznode1 must go before node2z!templates can only fuse epiloguesztemplate epilogue not satisfiedzdevice mismatch (%s vs %s)r   Z'fusion_failure_due_to_indexing_mismatchc                      sD   t jjt jj  t t t  dS )N)Zpre_grad_graph_idr  r  r  Znode1_debug_strZnode2_debug_strZcommon_buffer_namesZfailure_reason)	r5   r   r  r  rR   r   rl   r~   rG  rJ   rD  r9  r:  rI   rJ   rK   r   	  s    

  z$Scheduler.can_fuse.<locals>.<lambda>z'no shared data due to indexing mismatchzno shared datazexceeds max fusionzwill increase peak memoryN)r8  r   r  r   r   r   rE   r   r   r   Zepilogue_fusionr   score_fusion_memoryr7  r   r8   Zbuffer_namesr   r   r  r   r   Zmax_fusion_sizecan_fuse_verticalrS  rC  can_fuse_horizontal)rI   r9  r:  r  r#  Zdevice2Zno_shared_datarJ   rH  rK   r    s    

zScheduler.can_fusec                 C   s  |  }t }t||}|jjD ]2}t|ts0q |jD ]}| ||r6|	| q6q dd |j| D }||@ r||d dS |D ]"}	|| j
|	 j@ r|d  dS q|jjD ]T}
t|
tsq|jjD ]:}|
j| j|j|jkrq| ||
s|d   dS qqdS )a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.

        We also disable fusion of a write subsequent to a read if the reads
        and writes do not align.
        c                 S   s   h | ]
}|j qS rJ   r   r   rJ   rJ   rK   r   L	  s     z.Scheduler.can_fuse_vertical.<locals>.<setcomp>zmemory deps did not matchFz(intermediate nodes between node1 & node2z:fusing a write into a read with different indexing formulaT)r   rD   r8  r8   rb   r   r!   r9   fusable_read_and_writer   r   rE   rc   rj   r  r   )rI   r9  r:  Znode1_namesZcomputed_depsr  cdr  Zremaining_depsrj   writer   rJ   rJ   rK   rJ  4	  s6    



zScheduler.can_fuse_vertical)r   rN  r>   c                 C   s   t |tr|j|jkr$|jd k	r$dS |j}|| jkr>| j| }||jkot|jtj ot|jtj o|j|jkot	|j
t	|j
ko|j
d t	|j
 |j
kS t |tr| j|j|j}| j|j|j}|j|jkr|jd k	r||krdS dS )NTF)r   r!   rw  rj   r  r   rk  r   TMPr   rt  r"   r   )rI   r   rN  Z	read_nameZ
write_namerJ   rJ   rK   rL  m	  s8    






z Scheduler.fusable_read_and_writec                 C   sb   |  ||}tt|j|j t|j|j  }| tjkoD|dk| | koZ|dk||fS )a\  
        Assign a score (higher comes first) to the fusion of node1
        and node2.  When different fusions conflict with each other,
        this is the way we decide what order to run them in.

        Our current score is based on:
        - Estimate of the saved memory operations
        - Fusions closer together in original order
        r   )	rI  r,  r  r  r  r   r   Zepilogue_fusion_firstr   )rI   r9  r:  Zmemory_scorerB  rJ   rJ   rK   score_fusion	  s    zScheduler.score_fusionrv   c                 C   sR   d}|| j krDz| s | }W n tk
r6   Y nX || j |< n
| j | }|S r  )r  Zhas_unbacked_symbolsZnumbytes_hintKeyError)rI   rw   resrJ   rJ   rK   dep_size_hint	  s    

zScheduler.dep_size_hintc                    s6   |j j|j jB |j j|j jB @ }t fdd|D S )zn
        The first term in our fusion score that estimates number of saved
        memory operations.
        c                 3   s   | ]}  |V  qd S rr   )rS  r   rS   rJ   rK   r  	  s     z0Scheduler.score_fusion_memory.<locals>.<genexpr>)r8   rc   rb   r  )rI   r9  r:  Zcommon_memory_depsrJ   rS   rK   rI  	  s    zScheduler.score_fusion_memory)r4  r>   c                 C   s   t |dkr|S i }|D ]d\}}| | ks4t| }t| |||}||krj||fg||< q|| ||f qt| t	
ddd }t |dkst|S )Nr   r   r   )r   r   r   r  rS  get_fusion_pair_priorityr   r  r  operator
itemgetter)rI   r4  Z"possible_fusions_group_by_priorityr9  r:  r#  Zfusion_pair_priorityZ&possible_fusions_with_highest_priorityrJ   rJ   rK   r8  	  s0    
 z4Scheduler.get_possible_fusions_with_highest_priorityc                 C   s   |\}}|  ||S )z-
        Shim for list.sort(key=...)
        )rP  )rI   r~  r9  r:  rJ   rJ   rK   r9  	  s    zScheduler.score_fusion_keyc                 C   s<   t tj }t| jD ]}||| j ||j	 qdS )zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
rD   r5   r   r   r  r~  r   r   r  rF   )rI   r   r=   rJ   rJ   rK   r  	  s    zScheduler.compute_last_usagec                 C   s   t | jtjj tjjj D ]t}|| jkrN| j| }| rtjj	|j
 q|tjjkrtjj| j}t|tjr|| sttjj	|j q| j  dS )z*Free any buffers that are no longer neededN)r   r  r5   r   r   r   Zfreedr   r   Zcodegen_freer=   r  r^  r   r   r  Zis_input_bufferr   clear)rI   rj   r=   ZstoragerJ   rJ   rK   free_buffers	  s     

zScheduler.free_buffersc                    s   t jj}g  t jjD ]>}j| j}|dk	s0tdd |D }||r | qtt	dfdd}t
t|   D ]v}|t jjjkrt jjj| }t|tr|drqzt fdd	|jD }|rֈ| t jj| qz| qzdS )
zr
        Any buffers that are both created and have a last use in the
        same kernel can be removed.
        Nc                 S   s   h | ]}|j s| qS rJ   )r   rR   r   rJ   rJ   rK   r   
  s      z8Scheduler.remove_kernel_local_buffers.<locals>.<setcomp>r  c                    s.   | t jjko,| t jjjko,|  jko,|  jkS rr   )r5   r   Zmust_keep_buffersr   Zinput_buffersr  r   r  rS   rJ   rK   remove_filter
  s    z<Scheduler.remove_kernel_local_buffers.<locals>.remove_filterREMOVEDc                 3   s   | ]}| kV  qd S rr   rJ   r  )names_to_removerJ   rK   r  !
  s     z8Scheduler.remove_kernel_local_buffers.<locals>.<genexpr>)r5   r   Zstore_buffer_namesr   r@   r   r=  r   r  r   r~   filterr   r   r   
startswithr  Zother_namesremove_inplace_bufferZinplaced_to_remover   remove_buffer)rI   Zfused_node_namesZout_bufr@   rY  rj   r   r  rJ   )r[  rI   rK   remove_kernel_local_buffers
  s(    

z%Scheduler.remove_kernel_local_buffersr  c                 C   s,   t d| dtjjj|< tjj| d S )Nzremove_buffer(%r)rZ  )rf   r?  r5   r   r   Zoutput_buffersr   r   r  rJ   rJ   rK   r_  (
  s    zScheduler.remove_bufferc                 C   sD   t d| tjjj| j}|ddtjjj|< tjj	| d S )Nzremoving_inplace_buffer(%r)Z
in_out_ptrrZ  )
rf   r?  r5   r   r   r   
inner_namer   r   r   )rI   rj   ra  rJ   rJ   rK   r^  0
  s     zScheduler.remove_inplace_bufferc                 C   s$   | j  D ]}|  q
|   d S rr   )r  r   flushrX  )rI   r  rJ   rJ   rK   rb  8
  s    
zScheduler.flush)scheduler_noder>   c              	   C   s   t |tsttd d  d7  < ttdd |  |  W 5 Q R X |j	}t |t
jsrtdt||tjj |   d S )NZinductorZextern_callsr   F)Zincrease_kernel_countztype(node)=)r   r  r   r   r5   Zset_kernel_handlerr   r   r   r=   r   r%  rP   r   r   r   rX  )rI   rc  r=   rJ   rJ   rK   codegen_extern_call=
  s    zScheduler.codegen_extern_callBaseScheduling)r#  r>   c                 C   s   t |jr"|jd k	s"t| dtj| t|j}|d krPtd|j t	 s|jdkrt
j| }jdk rtd|j d|j d|j nt |jrtd|| S )	Nz( should have been normalized in loweringzUnsupported device type: cuda   zFound z which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 7.0, but your device is of CUDA capability .zCannot find a working triton installation. More information on installing Triton can be found at https://github.com/openai/triton)r2   rP   rk  r   r5   r   Zadd_device_infor   r  r   r   rf  Zget_device_propertiesmajorrj   minor)rI   r#  Zdevice_schedulingZdevice_propsrJ   rJ   rK   create_backendL
  s.    

zScheduler.create_backendc                 C   s$   || j kr| || j |< | j | S rr   )r  rk  r  rJ   rJ   rK   rS  e
  s    
zScheduler.get_backendc                    sf   t jjtdfdd  fdd| D }t| }|rbt|t	dd\}}t
jj| d S )Nr  c                    s2   |  j kr( j dd t| jjD   j |  S )Nc                 S   s   i | ]\}}||qS rJ   rJ   )r   r  r  rJ   rJ   rK   r  m
  s      z>Scheduler.enter_context.<locals>.get_order.<locals>.<dictcomp>)r  r  r  r   r~  r  rS   rJ   rK   	get_orderk
  s    
z*Scheduler.enter_context.<locals>.get_orderc                    s2   i | ]*}|j d k	r|j jD ]} ||fd qqS rr   )r=   r   )r   r  r-  )rl  rJ   rK   r  q
  s   

 
 z+Scheduler.enter_context.<locals>.<dictcomp>r   r   )r   ZfxNoder  r   r~   r  r,  rU  rV  r5   r   r   enter_context)rI   r=   r   rm  lastrJ   )rl  rI   rK   rn  j
  s    
zScheduler.enter_contextc           	      C   s~  | j D ]F}ztd| |  W n2 tk
rX } ztd|  W 5 d }~X Y nX | | t|ts|	  }r|| j
ks| s| r|   || j
kr| j
rt| j
jrtjj  t|jr|jd k	stdtjj|j || _
| j|j | r.| ^}}| ||| n| rPtt|}|  | n|! rtt"|}| |}ddl#m$} ddl%m&} t|||fr|}ntdt| |'| n:t|t(t)fr| |*| nt|tst|+  t,j-j.r| |/  | j0|1  t|ts|	 }|d k	r| |2 r|   q| j
rrt| j
jrrtjj  |   d S )Nz5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0zdevice should have an indexr   CUDACombinedScheduling)SIMDSchedulingr  )3r~  rf   r?  rR   r0  re   rn  r   r   r   r  r   r   rb  r,   rP   r5   r   r   Zcodegen_device_guard_exitrk  r   Zcodegen_device_guard_enterr  r  rF   r   rS  codegen_templater  r  r  rd  r   r"  Z codegen.cuda_combined_schedulingrq  Zcodegen.simdrr  Zcodegen_foreachr  r   codegen_noder   r   ZtritonZdebug_sync_kernelcodegen_syncr   r   ready_to_flush)	rI   r=   r-  r#  epilogueZbackend_rq  rr  r  rJ   rJ   rK   r   |
  s|    











zScheduler.codegen)r  r>   c                 C   s"   | j | }|jd k	st|j S rr   )r   r=   r   r   )rI   r  r=   rJ   rJ   rK   get_buffer_layout
  s    
zScheduler.get_buffer_layout)@rQ   r2  r3  r   r    r  r4  r   r   r   r  rL   r   r#  r  r  r  r  r6   r  r  r  r  r  r  r  r  r	   r   r6  r  r  r   r/  r  r   r1  r2  rC  rG  r  rJ  r!   rL  rP  rS  rI  r8  r9  r  rX  r`  r_  r^  rb  r  rd  rk  rS  rn  r   ZLayoutrx  r|  rJ   rJ   rM  rK   r:   C  s   
U V!
5  "/ ) 
;_ 9  !
	'Nr:   c                   @   s   e Zd ZeeedddZeeedddZeeedddZe	e	e
j  eee
jdf df d	d
dZee	e ee dddZeeef ddddZddddZedddZddddZe	e eeef dddZeeedddZdS )re  r}  c                 C   s   t dS )zO
        Check whether node1 and node2 can be vertically fused or not.
        Nr  r=  rJ   rJ   rK   rJ  
  s    z BaseScheduling.can_fuse_verticalc                 C   s   t dS )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        Nr  r=  rJ   rJ   rK   rK  
  s    z"BaseScheduling.can_fuse_horizontalc                 C   s,   |  s|  rt||S t||S dS )z 
        Fuse two nodes
        N)r   r"  r  r  r=  rJ   rJ   rK   r  
  s    zBaseScheduling.fuse.)rg  r>   c                 C   s   t dS )z[
        Process the iteration sizes in case a transformation needs to be applied.
        Nr  )rI   rg  rJ   rJ   rK   rT  
  s    zBaseScheduling.group_fn)template_nodeepilogue_nodesr>   c                 C   s   t dS )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        Nr  )rI   ry  rz  rJ   rJ   rK   rs  
  s    zBaseScheduling.codegen_templateNr  c                 C   s   t dS )zD
        Generate a kernel given a list of pre-fused nodes.
        Nr  r  rJ   rJ   rK   rt    s    zBaseScheduling.codegen_noderM   c                 C   s   t dS )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        Nr  rS   rJ   rJ   rK   ru    s    zBaseScheduling.codegen_syncc                 C   s   dS )z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        FrJ   rS   rJ   rJ   rK   rv    s    zBaseScheduling.ready_to_flushc                 C   s   t dS )z]
        Flush the generated kernel and python wrapper code to the source code file.
        Nr  rS   rJ   rJ   rK   rb    s    zBaseScheduling.flushr  c                 C   s   t dS )r  Nr  )rI   r~  rJ   rJ   rK   r    s    z$BaseScheduling.benchmark_fused_nodesc                 C   s   dS )z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   rJ   r=  rJ   rJ   rK   rT  $  s    z'BaseScheduling.get_fusion_pair_priority)rQ   r2  r3  r6   r   rJ  rK  r  r  r	   r  r  r   rT  r   r  rs  r   r   rt  ru  rv  rb  r6  r  r  rT  rJ   rJ   rJ   rK   re  
  s<    	 	 


 re  r  c           
      C   s   g }|   }|d ks$t|tjs$t|rH|jd krH||   d nddlm	} ddl
m} t| trp| fn| j}|d  }| j|}t|||fst|tjj_tj}|| }	|t_||   d |t|	d |S )Nz" Unfinalized multi template bufferr   rp  )TritonSchedulingz Triton code:rF  )r1  r   r   r  r   Zmake_kernel_renderr   rR   Z0torch._inductor.codegen.cuda_combined_schedulingrq  Ztorch._inductor.codegen.tritonr{  r   r   r   r<   rS  r5   r   r  r   Zgenerated_kernel_countZgenerate_kernel_code_from_nodesstriprH  rE  )
r=   rk   Zmulti_templaterq  r{  r   r#  r  Zold_generated_kernel_countZtriton_coderJ   rJ   rK   r\  .  s$    
r\  )rJ   )pr
  dataclassesr  r   r  r,  rU  r  rG  rH  r  r   r   r   r   r   r   r   r	   r
   r   r   r   r  r   Ztorch._inductor.async_compileZtorch._dynamo.utilsr   r   Ztorch._inductor.metricsr   r   Z%torch.fx.experimental.symbolic_shapesr   Ztorch.utils._sympy.symbolr   r   Ztorch.utils._tritonr   rm   r   r   r   r   r   Z	codecacher   Zcodegen.commonr   r   Zcomm_analysisr   r    r!   r"   r#   r$   r%   r&   Zruntime.runtime_utilsr'   r(   r   r)   utilsr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   Zvirtualizedr5   	getLoggerrQ   rf   Z_loggingZgetArtifactLoggerr>  r6   r8  r  ra   r   r   ZopsZatenZconvolutionmmZbmmZaddmmr&  r  r   r   r  r"  r  r  r  	dataclassrz   countr  r:   re  r\  rJ   rJ   rJ   rK   <module>   s   84
    '
 
'  0 U 

.           ]