U
    yh                 -   @   s  U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
mZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlmZ d dlmZ d dlmZm Z m!Z! d dl"m#Z#m$Z$m%Z% d	d
l&m'Z'm(Z( d	dl)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ d	dl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 ej78e9dZ:dd Z;ej<G dd dZ=ej<G dd dZ>ej<G dd dZ?ej<G dd dZ@ee=e>e?f ZAi ZBeeCe@f eDd< G dd dZEi ZFeeCeEf eDd< eGdfeCe
eGeGdddZHeCdd d!ZIdeCeJd#d$d%ZKeejL d&d'd(ZMeCeEd)d*d+ZNeCdd,d-ZOePdd.d/ ZQejRejSejTejSid0d1 ejJejUejVejWejXejYejZej[ej\ej]ej^fD Z_G d2d3 d3Z`G d4d5 d5eZaG d6d7 d7eaZbG d8d9 d9Zcej<G d:d; d;Zdeeedejfd<d= d>d?edejfd@d= dAd= dBdCedejfdDd= dEd= dFdCedejfdGd= dHd= dIdCedejfdJd= dKd= dLdCedejfdMd= dNd= dOdPedejfdQd= dRd= dSdCedejfdTd= dUd= dVd= dWdXedejfdYd= dZd?edejfd[d= d\d?edejfd]d= d^d?edejfd_d= d`d?edejfdad= dbd= dcd= dddeedejfdfd= dgd= dhdPedejfdid= djd= dkdCedejfdld= dmd?edejfdnd= dod?edejfdpd= dqd= drdCedejfdsd= dtd= dudCedejfdvd= dwd?edejfdxd= dyd?edejfdzd= d{d?edejfd|d= d}d?edejfd~d= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?edejfdd= dd?d(ZgeeCedf eDd< ece2eC dddZhG dd de*ZiG dd de,ZjG dd deZkG dd dZlG dd dZmG dd delZnG dd dZoG dd dZpG dd dZqG dd depZrej<G dd dZsePddd ZtG dd dZudS )    N)chain)
AnyCallableClassVarDictList
NamedTupleOptionalSetTupleUnion)Printer)ELEMENTWISE_TYPE_PROMOTION_KIND)_pytree)free_symbol_is_typesymbol_is_typeSymT)bound_sympyValueRangeAnalysisValueRanges   )configmetrics)DeferredLineBasegenerate_assertIndentedBuffer	sympy_dot
sympy_subsunique)ops
OpsHandlerOpsValueReductionType	StoreModeVZschedulec                 C   s   t tjrt d|  d S )NzData type propagation: %s)schedule_logisEnabledForloggingDEBUGdebug)msg r+   P/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/codegen/common.pydata_type_logger1   s    r-   c                   @   s$   e Zd ZU dZejed< eed< dS )WorkspaceArgzA temporary buffer used for a single kernel, then discarded.

    Not registered as a traditional buffer since there are no users,
    so it would be dead code eliminated.
    nbytes	zero_fillN)__name__
__module____qualname____doc__sympyExpr__annotations__boolr+   r+   r+   r,   r.   6   s   

r.   c                   @   s<   e Zd ZU eed< eed< ejed< edZ	ej
ed< dS )	TensorArgnamebufferdtyper   offsetN)r1   r2   r3   strr7   torchr<   r5   Integerr=   r6   r+   r+   r+   r,   r9   B   s   

r9   c                   @   s    e Zd ZU eed< ejed< dS )SizeArgr:   exprN)r1   r2   r3   r>   r7   r5   r6   r+   r+   r+   r,   rA   J   s   
rA   c                   @   s.   e Zd ZU eed< eed< edZeed< dS )DeviceCodegen
schedulingwrapper_codegenNcpp_wrapper_codegen)r1   r2   r3   r   r7   typerF   r+   r+   r+   r,   rC   P   s   
rC   device_codegensc                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
DeviceOpOverridesc                 C   s   t d S NNotImplementedErrorselfr:   r+   r+   r,   import_get_raw_stream_as]   s    z*DeviceOpOverrides.import_get_raw_stream_asc                 C   s   t d S rJ   rK   rN   Z
device_idxr+   r+   r,   
set_device`   s    zDeviceOpOverrides.set_devicec                 C   s   t d S rJ   rK   rN   r+   r+   r,   synchronizec   s    zDeviceOpOverrides.synchronizec                 C   s   t d S rJ   rK   rP   r+   r+   r,   device_guardf   s    zDeviceOpOverrides.device_guardN)r1   r2   r3   rO   rQ   rS   rT   r+   r+   r+   r,   rI   \   s   rI   device_op_overrides_dictdeviceZdevice_schedulingZdevice_wrapper_codegenZdevice_cpp_wrapper_codegenc                 C   s   t |||t| < d S rJ   )rC   rH   rV   r+   r+   r,   register_backend_for_device   s
      rX   rW   c                 C   s   | t krt |  jS d S rJ   )rH   rD   rY   r+   r+   r,   get_scheduling_for_device   s    rZ   F)rW   cpp_wrapperc                 C   s(   | t kr t |  }|r|jS |jS d S d S rJ   )rH   rF   rE   )rW   r[   Zwrapper_codegen_objr+   r+   r,   get_wrapper_codegen_for_device   s    r\   )indexc                 C   s"   ddl m} | t|||fS )Nr   )FlexibleLayout)irr^   r   Zcontiguous_strides)r]   Z
index_varssizesr^   r+   r+   r,   index_prevent_reordering   s    ra   rW   device_op_overridesc                 C   s   |t | < d S rJ   )rU   rb   r+   r+   r,   register_device_op_overrides   s    rd   c                 C   sF   t | tstt s.ddlm} ddlm} | t krBt|  S d S )N   )rc   )
isinstancer>   AssertionErrorrU   keyscudarc   Zxpu)rW   rc   Zxpu_op_overridesr+   r+   r,   get_device_op_overrides   s    rj   c                   C   s   dS )N)Zis_infis_nanbitwise_xorlogical_notsignbitleltgegteqner+   r+   r+   r+   r,   boolean_ops   s    ru   c                 C   s   i | ]
}||qS r+   r+   ).0r<   r+   r+   r,   
<dictcomp>   s    rw   c                   @   s   e Zd ZddddZejjdddZejjddd	Zejjdd
dZ	ejj
dddZdd Zedd Zedd ZdS )DataTypePropagationNreturnc                 C   s8   || _ d|jji| _|j D ]\}}|j| j|< qd S Nroot)bodyZ
root_blockgraphgraphsZ	subblocksitems)rN   r}   kvr+   r+   r,   __init__   s     zDataTypePropagation.__init__)nodec                 C   sV   |j }dd |D }t|dkr$d S tdd |D }|s>d S ttjdd |D S )Nc                 S   s(   g | ] }t |tjjr|jd kr|qS )placeholder)rf   r?   fxNodeoprv   nr+   r+   r,   
<listcomp>   s     
 zCDataTypePropagation.deduce_node_dtype_by_inputs.<locals>.<listcomp>r   c                 s   s,   | ]$}t j|jko"|jt j jd k	V  qd S rJ   )OptimizationContextkeymetar<   r   r+   r+   r,   	<genexpr>   s   zBDataTypePropagation.deduce_node_dtype_by_inputs.<locals>.<genexpr>c                 S   s   g | ]}|j tj jqS r+   )r   r   r   r<   r   r+   r+   r,   r      s     )Zall_input_nodeslenall	functoolsreducer?   Zpromote_types)rN   r   inputsZinput_nodesZall_input_nodes_propagatedr+   r+   r,   deduce_node_dtype_by_inputs   s    z/DataTypePropagation.deduce_node_dtype_by_inputsc                 C   s"   | j |j }| |}|st|S rJ   )r   targetpropagate_graphrg   )rN   r   Z	sub_graphr<   r+   r+   r,   deduce_node_dtype_by_subgraph   s    
z1DataTypePropagation.deduce_node_dtype_by_subgraphc                 C   s  |j t krtjS |jdkr d S |j dkr<t|jdkr<d S |j dkrP|jd S |j dkr`tjS |j dkrptjS |j dkr|jd }t	j
|S |j tjkr| |jd	 S t|j tst|j d
kr|jd S |j dkrt|jd  S |j dr| |S | |S )Nr   outputre   )to_dtype
index_expr)ZrandZrandn)	get_indexr   Z	randint64)loadstorestore_reductionr   	reductionconstantZmasked_subblock)r   ru   r?   r8   r   r   argsfloatint64r$   r~   	get_dtypeoperatorgetitemdeduce_node_dtyperf   r>   rg   DTYPE_TO_COMPUTATION_DTYPE
startswithr   r   )rN   r   Zbuf_namer+   r+   r,   r      s4    











z%DataTypePropagation.deduce_node_dtype)r~   c                 C   sf   |j s
td }|j D ]L}tj|jkr2|jtj }nt }| ||_||jtj< |jdkr|j}q|S )Nr   )Znodesrg   r   r   r   r   r<   r   )rN   r~   Zgraph_dtyper   opt_ctxr+   r+   r,   r   5  s    


z#DataTypePropagation.propagate_graphc                 C   s   |  | jd  d S r{   )r   r   rR   r+   r+   r,   	propagateG  s    zDataTypePropagation.propagatec                 C   s   | |  S rJ   )r   )clsr}   r+   r+   r,   propagate_loopbodyJ  s    z&DataTypePropagation.propagate_loopbodyc                 C   sF   ddl m} ddlm} t||s&tt|j|s6tt|j d S )Nr   )LoopBody)SchedulerNode)	r_   r   	schedulerr   rf   rg   _bodyrx   r   )r   r   r   r   r+   r+   r,   propagate_scheduler_nodeN  s
    z,DataTypePropagation.propagate_scheduler_node)r1   r2   r3   r   r?   r   r   r   r   r   ZGraphr   r   classmethodr   r   r+   r+   r+   r,   rx      s   7
rx   c                       s   e Zd Zedd Zdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Zd-d.ed. fd/d0Z  ZS )1ExprPrinterc                 C   sV   dd }t | ts:td| tjs:td| tjs:| dkr>| S || rJ| S d|  dS )Nc                 S   s   | d dkst | dk rdS d}t| dd  D ]H\}}|dkrJ|d7 }n|dkrZ|d8 }|dkr0|t | d kr0 dS q0|dkstdS )Nr   (r   Fre   )T)r   	enumeraterg   )stringcounticharr+   r+   r,   all_in_parens]  s    
z(ExprPrinter.paren.<locals>.all_in_parensz^[a-z0-9_.]+$z^\([^)]*\)$ r   r   )rf   CSEVariablerematchI)r   r   r+   r+   r,   paren[  s    zExprPrinter.parenc                 C   s&   d|j  dt| jt| j|jS )N )Zrel_opjoinmapr   _printr   rN   rB   r+   r+   r,   _print_Relationalw  s    zExprPrinter._print_Relationalc                 C   s   d t| jt| j|jS )N*r   r   r   r   r   r   r+   r+   r,   
_print_Mulz  s    zExprPrinter._print_Mulc                 C   s   d t| jt| j|jS )Nz + r   r   r+   r+   r,   
_print_Add}  s    zExprPrinter._print_Addc                 C   s   d t| jt| j|jS N % r   r   r+   r+   r,   
_print_Mod  s    zExprPrinter._print_Modc                 C   s0   |j \}}| | | d| | | S Nz / r   r   r   rN   rB   lhsrhsr+   r+   r,   _print_FloatTrueDiv  s    
zExprPrinter._print_FloatTrueDivc                 C   s
   |  |S rJ   )_print_FloorDivr   r+   r+   r,   _print_CleanDiv  s    zExprPrinter._print_CleanDivc                 C   s   d t| jt| j|jS )Nz >= r   r   r+   r+   r,   _print_GreaterThan  s    zExprPrinter._print_GreaterThanc                 C   s*   t |jdkstd| |jd  dS )Nre   zalign(r   r   r   r   rg   r   r   r+   r+   r,   _print_align  s    zExprPrinter._print_alignc                 C   sb   |j \}}| |}|t|ks(t|t|}|dks<t|dkrZd| |g| S dS d S )Nr   r   1)r   r   intrg   r   r   rN   rB   baseexpr+   r+   r,   
_print_Pow  s    

zExprPrinter._print_Powc                 C   s   t dt|  d S )Nz#_print_ToFloat not implemented for rL   rG   r   r+   r+   r,   _print_ToFloat  s    zExprPrinter._print_ToFloatc                 C   s   t dt|  d S )Nz$_print_Infinity not implemented for r   r   r+   r+   r,   _print_Infinity  s    zExprPrinter._print_Infinityc                 C   s   t dt|  d S )Nz,_print_NegativeInfinity not implemented for r   r   r+   r+   r,   _print_NegativeInfinity  s    z#ExprPrinter._print_NegativeInfinityc                 C   s   t dt|  d S )Nz$_print_FloorDiv not implemented for r   r   r+   r+   r,   r     s    zExprPrinter._print_FloorDivc                 C   s   t dt|  d S )Nz%_print_PythonMod not implemented for r   r   r+   r+   r,   _print_PythonMod  s    zExprPrinter._print_PythonModc                 C   s   t dt|  d S )Nz&_print_IntTrueDiv not implemented for r   r   r+   r+   r,   _print_IntTrueDiv  s    zExprPrinter._print_IntTrueDivc                 C   s   t dt|  d S )Nz(_print_PowByNatural not implemented for r   r   r+   r+   r,   _print_PowByNatural  s    zExprPrinter._print_PowByNaturalc                 C   s   t dt|  d S )Nz$_print_FloatPow not implemented for r   r   r+   r+   r,   _print_FloatPow  s    zExprPrinter._print_FloatPowc                 C   s   t dt|  d S )Nz&_print_TruncToInt not implemented for r   r   r+   r+   r,   _print_TruncToInt  s    zExprPrinter._print_TruncToIntc                 C   s   t dt|  d S )Nz&_print_RoundToInt not implemented for r   r   r+   r+   r,   _print_RoundToInt  s    zExprPrinter._print_RoundToIntc                 C   s   t dt|  d S )Nz(_print_RoundDecimal not implemented for r   r   r+   r+   r,   _print_RoundDecimal  s    zExprPrinter._print_RoundDecimalc                 C   s   t dt|  d S )Nz(_print_TruncToFloat not implemented for r   r   r+   r+   r,   _print_TruncToFloat  s    zExprPrinter._print_TruncToFloatT)simplifyc                   s6   |r*t |tjr*ttjdr*tjj|}t 	|S )Nsizevars)
rf   r5   r6   hasattrr$   r~   r   r   superdoprint)rN   rB   r   	__class__r+   r,   r     s    zExprPrinter.doprint)r1   r2   r3   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r8   r   __classcell__r+   r+   r   r,   r   Z  s0   

r   c                   @   s   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Zd-d. Zd/d0 Zd1d2 Zd3d4 Zd5d6 Zd7d8 Zd9d: Zd;d< Z d=S )>PythonPrinterc                 C   s*   t |jdkstd| |jd  dS )Nre   zfloat(r   r   r   r   r+   r+   r,   r     s    zPythonPrinter._print_ToFloatc                 C   sd   |j \}}}| | |}| | |}| | |}|dkrVd| d| d}| d| S )Nr   r    // r   r   r   r   r   )rN   rB   xdivmodr+   r+   r,   _print_ModularIndexing  s    z$PythonPrinter._print_ModularIndexingc                 C   s   dS )Nzmath.infr+   r   r+   r+   r,   r     s    zPythonPrinter._print_Infinityc                 C   s   dS )Nz	-math.infr+   r   r+   r+   r,   r     s    z%PythonPrinter._print_NegativeInfinityc                 C   s   d t| jt| j|jS r   r   r   r+   r+   r,   r     s    zPythonPrinter._print_PythonModc                 C   s<   |j \}}| | |}| | |}d| d| dS )Nr   r   r   r   )rN   rB   r   r   r+   r+   r,   r      s    
zPythonPrinter._print_FloorDivc                 C   s0   |j \}}| | | d| | | S r   r   r   r+   r+   r,   r     s    
zPythonPrinter._print_IntTrueDivc                 C   s   d|  | dS )Nz
math.sqrt(r   )r   r   r+   r+   r,   _helper_sqrt  s    zPythonPrinter._helper_sqrtc                 C   s   |  |jd S Nr   )r   r   r   r+   r+   r,   _print_OpaqueUnaryFn_sqrt  s    z'PythonPrinter._print_OpaqueUnaryFn_sqrtc                 C   s0   |j \}}| | | d| | | S Nz ** r   r   r+   r+   r,   r     s    
zPythonPrinter._print_FloatPowc                 C   s0   |j \}}| | | d| | | S r  r   r   r+   r+   r,   r     s    
z!PythonPrinter._print_PowByNaturalc                 C   s*   t |jdkstd| |jd  dS Nre   zmath.floor(r   r   r   r   r+   r+   r,   _print_floor  s    zPythonPrinter._print_floorc                 C   s*   t |jdkstd| |jd  dS r  r   r   r+   r+   r,   _print_FloorToInt  s    zPythonPrinter._print_FloorToIntc                 C   s*   t |jdkstd| |jd  dS )Nre   zmath.trunc(r   r   r   r   r+   r+   r,   r   #  s    zPythonPrinter._print_TruncToIntc                 C   s*   t |jdkstd| |jd  dS Nre   z
math.ceil(r   r   r   r   r+   r+   r,   _print_ceiling(  s    zPythonPrinter._print_ceilingc                 C   s*   t |jdkstd| |jd  dS r  r   r   r+   r+   r,   _print_CeilToInt,  s    zPythonPrinter._print_CeilToIntc                 C   s*   t |jdkstd| |jd  dS )Nre   zabs(r   r   r   r   r+   r+   r,   
_print_Abs0  s    zPythonPrinter._print_Absc                 C   s.   t |jdkstddt| j|j dS )Nr   zmax(, r   r   r   rg   r   r   r   r   r+   r+   r,   
_print_Max7  s    zPythonPrinter._print_Maxc                 C   s.   t |jdkstddt| j|j dS )Nr   zmin(r
  r   r  r   r+   r+   r,   
_print_Min;  s    zPythonPrinter._print_Minc                 C   s*   t |jdkstd| |jd  dS )Nre   z	math.cos(r   r   r   r   r+   r+   r,   _print_OpaqueUnaryFn_cos?  s    z&PythonPrinter._print_OpaqueUnaryFn_cosc                 C   s*   t |jdkstd| |jd  dS )Nre   z
math.cosh(r   r   r   r   r+   r+   r,   _print_OpaqueUnaryFn_coshC  s    z'PythonPrinter._print_OpaqueUnaryFn_coshc                 C   s*   t |jdkstd| |jd  dS )Nre   z
math.acos(r   r   r   r   r+   r+   r,   _print_OpaqueUnaryFn_acosG  s    z'PythonPrinter._print_OpaqueUnaryFn_acosc                 C   s*   t |jdkstd| |jd  dS )Nre   z	math.sin(r   r   r   r   r+   r+   r,   _print_OpaqueUnaryFn_sinK  s    z&PythonPrinter._print_OpaqueUnaryFn_sinc                 C   s*   t |jdkstd| |jd  dS )Nre   z
math.sinh(r   r   r   r   r+   r+   r,   _print_OpaqueUnaryFn_sinhO  s    z'PythonPrinter._print_OpaqueUnaryFn_sinhc                 C   s*   t |jdkstd| |jd  dS )Nre   z
math.asin(r   r   r   r   r+   r+   r,   _print_OpaqueUnaryFn_asinS  s    z'PythonPrinter._print_OpaqueUnaryFn_asinc                 C   s*   t |jdkstd| |jd  dS )Nre   z	math.tan(r   r   r   r   r+   r+   r,   _print_OpaqueUnaryFn_tanW  s    z&PythonPrinter._print_OpaqueUnaryFn_tanc                 C   s*   t |jdkstd| |jd  dS )Nre   z
math.tanh(r   r   r   r   r+   r+   r,   _print_OpaqueUnaryFn_tanh[  s    z'PythonPrinter._print_OpaqueUnaryFn_tanhc                 C   s*   t |jdkstd| |jd  dS )Nre   z
math.atan(r   r   r   r   r+   r+   r,   _print_OpaqueUnaryFn_atan_  s    z'PythonPrinter._print_OpaqueUnaryFn_atanc                 C   s*   t |jdkstd| |jd  dS )Nre   round(r   r   r   r   r+   r+   r,   r   c  s    zPythonPrinter._print_RoundToIntc                 C   sD   t |jdkst|j\}}t|tjs,td| | d| dS )Nr   r  r
  r   )r   r   rg   rf   r5   r@   r   )rN   rB   numberndigitsr+   r+   r,   r   g  s    
z!PythonPrinter._print_RoundDecimalN)!r1   r2   r3   r   r   r   r   r   r   r   r   r  r   r   r  r  r   r  r  r	  r  r  r  r  r  r  r  r  r  r  r  r   r   r+   r+   r+   r,   r     s<   	r   c                       s  e Zd Z fddZdd Zedd Zedd Zed	d
 Zedd Z	edd Z
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd  Zed!d" Zed#d$ Zed%d& Zed'd( Zed)d* Zed+d, Zed-d. Zed/d0 Zed1d2 Zed3d4 Zed5d6 Zed7d8 Zed9d: Z ed;d< Z!ed=d> Z"ed?d@ Z#edAdB Z$edCdD Z%edEdF Z&edGdH Z'e(dIdJ Z)  Z*S )KOpOverridesc                    s   t    || _d S rJ   )r   r   _parent)rN   parentr   r+   r,   r   o  s    
zOpOverrides.__init__c                 C   s   t | j|S rJ   )getattrr  )rN   itemr+   r+   r,   __getattr__s  s    zOpOverrides.__getattr__c                 C   s   | S rJ   r+   )valuer+   r+   r,   identityv  s    zOpOverrides.identityc                 C   s   t | S rJ   )repr)r   r<   r+   r+   r,   r   {  s    zOpOverrides.constantc                 C   s   t t dtj| S Nre   )r   truedivr   r?   int32r   r+   r+   r,   
reciprocal  s    zOpOverrides.reciprocalc                 C   s   t | | S rJ   )r   mulr&  r+   r+   r,   square  s    zOpOverrides.squarec                 C   s   t t dtjt | S r#  )r   subr   r?   float32erfr&  r+   r+   r,   erfc  s    zOpOverrides.erfcc                 C   s   t t t | t | S rJ   )r   r(  r   r)  r-  r&  r+   r+   r,   erfcx  s    zOpOverrides.erfcxc                 C   s   t t | t dtjS r#  )r   r*  r   r   r?   r+  r&  r+   r+   r,   expm1  s    zOpOverrides.expm1c              	   C   s&   t t | t dtd tjS )Nre   
   r   r(  logr   mathr?   r+  r&  r+   r+   r,   log10  s    zOpOverrides.log10c              	   C   s&   t t | t dtd tjS )Nre   r   r1  r&  r+   r+   r,   log2  s    zOpOverrides.log2c              
   C   s"   t t | t tdtjS )Nr   )r   r   r(  r   r3  r2  r?   r+  r&  r+   r+   r,   exp2  s    zOpOverrides.exp2c              	   C   s   t t | t dtjS r#  )r   r2  addr   r?   r%  r&  r+   r+   r,   log1p  s    zOpOverrides.log1pc                 C   s.   t dtj}t |t |t t | S r#  )r   r   r?   r%  r$  r7  r   negr   oner+   r+   r,   sigmoid  s    zOpOverrides.sigmoidc                 C   s.   t dtj}t |t |t t | S r#  )r   r   r?   r%  r$  r7  libdevice_expr9  r:  r+   r+   r,   libdevice_sigmoid  s    zOpOverrides.libdevice_sigmoidc                 C   s   t | t dtjS r   )r   maximumr   r?   r%  r&  r+   r+   r,   relu  s    zOpOverrides.reluc                 C   s
   t | S rJ   )r   absr&  r+   r+   r,   libdevice_abs  s    zOpOverrides.libdevice_absc                 C   s
   t | S rJ   )r   sqrtr&  r+   r+   r,   libdevice_sqrt  s    zOpOverrides.libdevice_sqrtc                 C   s
   t | S rJ   )r   cosr&  r+   r+   r,   libdevice_cos  s    zOpOverrides.libdevice_cosc                 C   s
   t | S rJ   )r   sinr&  r+   r+   r,   libdevice_sin  s    zOpOverrides.libdevice_sinc                 C   s
   t | S rJ   )r   r2  r&  r+   r+   r,   libdevice_log  s    zOpOverrides.libdevice_logc                 C   s
   t | S rJ   )r   r   r&  r+   r+   r,   r=    s    zOpOverrides.libdevice_expc                 C   s   dt |  S )N~r   r   r&  r+   r+   r,   bitwise_not  s    zOpOverrides.bitwise_notc                 C   s   t |  dS )Nz == 0rK  )ar+   r+   r,   rm     s    zOpOverrides.logical_notc                 C   s   t |  dt | S )Nz & rK  r   yr+   r+   r,   bitwise_and  s    zOpOverrides.bitwise_andc                 C   s   t |  dt | S )Nz | rK  rN  r+   r+   r,   
bitwise_or  s    zOpOverrides.bitwise_orc                 C   s   t |  dt | S )Nz ^ rK  rN  r+   r+   r,   rl     s    zOpOverrides.bitwise_xorc                 C   s   t |  dt | S )Nz << rK  rN  r+   r+   r,   bitwise_left_shift  s    zOpOverrides.bitwise_left_shiftc                 C   s   t |  dt | S )Nz >> rK  rN  r+   r+   r,   bitwise_right_shift  s    zOpOverrides.bitwise_right_shiftc              	   C   sT   t | |}t t |t dtjt t |t |}t |t 	|||S r   )
r   r   and_rt   r   r?   r%  rn   wherer7  )rM  brcondr+   r+   r,   	remainder  s    zOpOverrides.remainderc                 C   s   t t | |S rJ   )r   r   truncrM  r<   r+   r+   r,   trunc_to_int  s    zOpOverrides.trunc_to_intc                 C   s   t t | |S rJ   )r   r   floorr[  r+   r+   r,   floor_to_int  s    zOpOverrides.floor_to_intc                 C   s   t t | |S rJ   )r   r   ceilr[  r+   r+   r,   ceil_to_int  s    zOpOverrides.ceil_to_intc                 C   s   t t | |S rJ   )r   r   roundr[  r+   r+   r,   round_to_int  s    zOpOverrides.round_to_intc                 C   s   t | |S rJ   )r   r$  )rM  rV  r+   r+   r,   int_truediv  s    zOpOverrides.int_truedivc                 C   s   t | t|S rJ   )r   r   r5   r@   )r:   r=   r+   r+   r,   	load_seed  s    zOpOverrides.load_seedc                 C   sJ   |dkst |t D ],\}}t||}|d kr4qt| |t| qd S )N>   cppvectritoncpp)rg   pointwise_overrides_datar   r  setattrr   )r   r   funcnamedataimplr+   r+   r,   _initialize_pointwise_overrides	  s    
z+OpOverrides._initialize_pointwise_overrides)+r1   r2   r3   r   r  r   r!  r   r'  r)  r-  r.  r/  r4  r5  r6  r8  r<  r>  r@  rB  rD  rF  rH  rI  r=  rL  rm   rP  rQ  rl   rR  rS  rY  r\  r^  r`  rb  rc  rd  r   rm  r   r+   r+   r   r,   r  n  s   

































r  c                   @   sd   e Zd ZU eed< edef ed< dZeedef  ed< dZeedef  ed< e	j
Ze	ed< dS )OverridesDatar:   .rg  Nrf  re  type_promotion_kind)r1   r2   r3   r>   r7   r   rf  r	   re  r   DEFAULTro  r+   r+   r+   r,   rn    s   
rn  c                 C   s   d|  dS )Nzairy_ai_forward(r   r+   r&  r+   r+   r,   <lambda>&      rq  Zspecial_airy_ai)ro  rg  r:   c                 C   s   d|  dS )Nzbessel_j0_forward(r   r+   r&  r+   r+   r,   rq  +  rr  c                 C   s   d|  dS )Nzlibdevice.j0(r   r+   r&  r+   r+   r,   rq  ,  rr  Zspecial_bessel_j0)ro  rg  rf  r:   c                 C   s   d|  dS )Nzbessel_j1_forward(r   r+   r&  r+   r+   r,   rq  1  rr  c                 C   s   d|  dS )Nzlibdevice.j1(r   r+   r&  r+   r+   r,   rq  2  rr  Zspecial_bessel_j1c                 C   s   d|  dS )Nzbessel_y0_forward(r   r+   r&  r+   r+   r,   rq  7  rr  c                 C   s   d|  dS )Nzlibdevice.y0(r   r+   r&  r+   r+   r,   rq  8  rr  Zspecial_bessel_y0c                 C   s   d|  dS )Nzbessel_y1_forward(r   r+   r&  r+   r+   r,   rq  =  rr  c                 C   s   d|  dS )Nzlibdevice.y1(r   r+   r&  r+   r+   r,   rq  >  rr  Zspecial_bessel_y1c                 C   s   d|  dS )Nzcalc_digamma(r   r+   r&  r+   r+   r,   rq  C  rr  c                 C   s
   |  dS )Nz
.digamma()r+   r&  r+   r+   r,   rq  D  rr  digamma)ro  rg  re  r:   c                 C   s   d|  dS )Nzcalc_erfcx(r   r+   r&  r+   r+   r,   rq  K  rr  c                 C   s   d|  dS )Nzlibdevice.erfcx(r   r+   r&  r+   r+   r,   rq  L  rr  Zspecial_erfcxc                 C   s   d|  d| d| dS )Nz	std::fma(r
  r   r+   r   rO  zr+   r+   r,   rq  Q  rr  c                 C   s   d|  d| d| dS )Nzfmadd(r
  r   r+   rt  r+   r+   r,   rq  R  rr  c                 C   s   d|  d| d| dS )Nzlibdevice.fma(r
  r   r+   rt  r+   r+   r,   rq  S  rr  fma)ro  rg  re  rf  r:   c                 C   s   d|  d| dS Nzcalc_igamma(r
  r   r+   rN  r+   r+   r,   rq  Y  rr  igammac                 C   s   d|  d| dS Nzcalc_igammac(r
  r   r+   rN  r+   r+   r,   rq  ^  rr  igammacc                 C   s   d|  d| dS rw  r+   rN  r+   r+   r,   rq  c  rr  Zspecial_gammaincc                 C   s   d|  d| dS ry  r+   rN  r+   r+   r,   rq  h  rr  Zspecial_gammainccc                 C   s   d|  dS )Nzcalc_i0(r   r+   r&  r+   r+   r,   rq  m  rr  c                 C   s   d|  dS Nzlibdevice.cyl_bessel_i0(r   r+   r&  r+   r+   r,   rq  n  rr  c                 C   s
   |  dS )Nz.i0()r+   r&  r+   r+   r,   rq  o  rr  i0)ro  rg  rf  re  r:   c                 C   s   d|  dS )Nz	calc_i0e(r   r+   r&  r+   r+   r,   rq  t  rr  c                 C   s
   |  dS )Nz.i0e()r+   r&  r+   r+   r,   rq  u  rr  Zspecial_i0ec                 C   s   d|  dS )Nzcalc_i1(r   r+   r&  r+   r+   r,   rq  z  rr  c                 C   s   d|  dS Nzlibdevice.cyl_bessel_i1(r   r+   r&  r+   r+   r,   rq  {  rr  Z
special_i1c                 C   s   d|  dS )Nz	calc_i1e(r   r+   r&  r+   r+   r,   rq    rr  Zspecial_i1ec                 C   s   d|  dS )Nzcalc_log_ndtr(r   r+   r&  r+   r+   r,   rq    rr  Zspecial_log_ndtrc                 C   s   d|  dS )Nzmodified_bessel_i0_forward(r   r+   r&  r+   r+   r,   rq    rr  c                 C   s   d|  dS r{  r+   r&  r+   r+   r,   rq    rr  Zspecial_modified_bessel_i0c                 C   s   d|  dS )Nzmodified_bessel_i1_forward(r   r+   r&  r+   r+   r,   rq    rr  c                 C   s   d|  dS r}  r+   r&  r+   r+   r,   rq    rr  Zspecial_modified_bessel_i1c                 C   s   d|  dS )Nzmodified_bessel_k0_forward(r   r+   r&  r+   r+   r,   rq    rr  Zspecial_modified_bessel_k0c                 C   s   d|  dS )Nzmodified_bessel_k1_forward(r   r+   r&  r+   r+   r,   rq    rr  Zspecial_modified_bessel_k1c                 C   s   d|  dS )Nz
calc_ndtr(r   r+   r&  r+   r+   r,   rq    rr  Zspecial_ndtrc                 C   s   d|  dS )Nzcalc_ndtri(r   r+   r&  r+   r+   r,   rq    rr  Zspecial_ndtric                 C   s   d| d|  dS )Nzcalc_polygamma(r
  r   r+   rN  r+   r+   r,   rq    rr  	polygammac                 C   s   d|  dS )Nz"scaled_modified_bessel_k0_forward(r   r+   r&  r+   r+   r,   rq    rr  Z!special_scaled_modified_bessel_k0c                 C   s   d|  dS )Nz"scaled_modified_bessel_k1_forward(r   r+   r&  r+   r+   r,   rq    rr  Z!special_scaled_modified_bessel_k1c                 C   s   d|  dS )Nzspherical_bessel_j0_forward(r   r+   r&  r+   r+   r,   rq    rr  Zspecial_spherical_bessel_j0c                 C   s   d|  d| dS )Nzzeta(r
  r   r+   rN  r+   r+   r,   rq    rr  Zspecial_zetac                 C   s   d|  d| dS )Nzchebyshev_polynomial_t_forward(r
  r   r+   rN  r+   r+   r,   rq    rr  Zspecial_chebyshev_polynomial_tc                 C   s   d|  d| dS )Nzchebyshev_polynomial_u_forward(r
  r   r+   rN  r+   r+   r,   rq    rr  Zspecial_chebyshev_polynomial_uc                 C   s   d|  d| dS )Nzchebyshev_polynomial_v_forward(r
  r   r+   rN  r+   r+   r,   rq    rr  Zspecial_chebyshev_polynomial_vc                 C   s   d|  d| dS )Nzchebyshev_polynomial_w_forward(r
  r   r+   rN  r+   r+   r,   rq    rr  Zspecial_chebyshev_polynomial_wc                 C   s   d|  d| dS )Nzlegendre_polynomial_p_forward(r
  r   r+   rN  r+   r+   r,   rq    rr  Zspecial_legendre_polynomial_pc                 C   s   d|  d| dS )Nz'shifted_chebyshev_polynomial_t_forward(r
  r   r+   rN  r+   r+   r,   rq    rr  Z&special_shifted_chebyshev_polynomial_tc                 C   s   d|  d| dS )Nz'shifted_chebyshev_polynomial_u_forward(r
  r   r+   rN  r+   r+   r,   rq    rr  Z&special_shifted_chebyshev_polynomial_uc                 C   s   d|  d| dS )Nz'shifted_chebyshev_polynomial_v_forward(r
  r   r+   rN  r+   r+   r,   rq    rr  Z&special_shifted_chebyshev_polynomial_vc                 C   s   d|  d| dS )Nz'shifted_chebyshev_polynomial_w_forward(r
  r   r+   rN  r+   r+   r,   rq    rr  Z&special_shifted_chebyshev_polynomial_wc                 C   s   d|  d| dS )Nzhermite_polynomial_h_forward(r
  r   r+   rN  r+   r+   r,   rq    rr  Zspecial_hermite_polynomial_hc                 C   s   d|  d| dS )Nzhermite_polynomial_he_forward(r
  r   r+   rN  r+   r+   r,   rq    rr  Zspecial_hermite_polynomial_hec                 C   s   d|  d| dS )Nzlaguerre_polynomial_l_forward(r
  r   r+   rN  r+   r+   r,   rq    rr  Zspecial_laguerre_polynomial_l)(Zairy_aiZ	bessel_j0Z	bessel_j1Z	bessel_y0Z	bessel_y1rs  r.  rv  rx  rz  ZgammaincZ	gammainccr|  Zi0ei1Zi1eZlog_ndtrZmodified_bessel_i0Zmodified_bessel_i1Zmodified_bessel_k0Zmodified_bessel_k1ZndtrZndtrir~  Zscaled_modified_bessel_k0Zscaled_modified_bessel_k1Zspherical_bessel_j0zetaZchebyshev_polynomial_tZchebyshev_polynomial_uZchebyshev_polynomial_vZchebyshev_polynomial_wZlegendre_polynomial_pZshifted_chebyshev_polynomial_tZshifted_chebyshev_polynomial_uZshifted_chebyshev_polynomial_vZshifted_chebyshev_polynomial_wZhermite_polynomial_hZhermite_polynomial_heZlaguerre_polynomial_lrh  hrz   c                 C   s   | S rJ   r+   r  r+   r+   r,   _typecheck_OpOverrides  s    r  c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )DeferredLinezHA line that can be 'unwritten' by adding name to V.graph.removed_buffersc                    s$   t  | || _t|tr td S rJ   )r   r   r:   rf   r   rg   )rN   r:   liner   r+   r,   r     s    zDeferredLine.__init__c                    s8   t  fddtjjtjjtjjtjjfD r4 jS d S )Nc                 3   s   | ]} j |kV  qd S rJ   r:   rv   r   rR   r+   r,   r     s   z(DeferredLine.__call__.<locals>.<genexpr>)r   r$   r~   removed_bufferskernelinplaced_to_remover  rR   r+   rR   r,   __call__  s    	zDeferredLine.__call__c                 C   s   t | j|S rJ   )r  r:   )rN   r  r+   r+   r,   	_new_line  s    zDeferredLine._new_line)r1   r2   r3   r4   r   r  r  r   r+   r+   r   r,   r  
  s   r  c                   @   s   e Zd ZdddZdS )BracesBufferre   c                    s   t j fdd}| S )Nc                  3   s   t  D ]} d  jd7  _qt   D ]}  jd8  _d q0d V  t   D ]} d  jd7  _q^t  D ]}  jd8  _d qd S )N{re   })range	writeline_indent)_r=   rN   r+   r,   ctx%  s    

z BracesBuffer.indent.<locals>.ctx)
contextlibcontextmanager)rN   r=   r  r+   r  r,   indent$  s    zBracesBuffer.indentN)re   )r1   r2   r3   r  r+   r+   r+   r,   r  #  s   r  c                   @   s"   e Zd ZU eed< ee ed< dS )InplacedBuffer
inner_nameother_namesN)r1   r2   r3   r>   r7   r   r+   r+   r+   r,   r  8  s   
r  c                   @   s   e Zd Zedd Zd'ddZdd Zdd	 Zd
d Zdd Z	dd Z
ejedddZdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& ZdS )(
KernelArgsc                 C   s8   t |ttjfst||kr0|  t| ||< || S rJ   )rf   r>   r5   Symbolrg   r   )prefixZodictr:   r+   r+   r,   _lookup>  s    zKernelArgs._lookupNc                 C   s.   t  | _t  | _t  | _|p t  | _d | _d S rJ   )dictinput_buffersoutput_buffersinplace_buffersr   workspace_arg)rN   r   r+   r+   r,   r   E  s
    zKernelArgs.__init__c              
   C   s&   d dtt| j| j| j| jgS )NzKernelArgs({})r
  )formatr   r   r"  r  r  r  r   rR   r+   r+   r,   __repr__L  s    zKernelArgs.__repr__c                 C   s   t |to|dS )NZREMOVED)rf   r>   r   rM   r+   r+   r,   _buffer_is_marked_removed[  s    z$KernelArgs._buffer_is_marked_removedc                 C   s   t jjrt jjj||}|t jjks.t||| jkrB| j| S || jkrX| j| j	S |
drr| d| j|S | d| j|S )NseedZin_ptr)r$   r~   r   mutation_real_namegetr  rg   r  r  r  r   r  r  rM   r+   r+   r,   input^  s    



zKernelArgs.inputc                 C   sT   t jjrt jjj||}|t jjks.t||| jkrD| j| jS | 	d| j
|S )NZout_ptr)r$   r~   r   r  r  r  rg   r  r  r  r  rM   r+   r+   r,   r   j  s    
zKernelArgs.outputc                 C   st   || j kst|| j kr:| j | }|j| || j |< n6tdtt| j   ||g}|| j |< || j |< d S )NZ
in_out_ptr)r  rg   r  appendr  r   r   values)rN   Z
input_nameZoutput_namebufr+   r+   r,   make_inplacer  s    


zKernelArgs.make_inplace)r/   r0   c                 C   sF   | j d krt||| _ dS | j j}|p,| j j}t|| || _ d|fS )N)ws_ptrr   r  )r  r.   r/   r0   )rN   r/   r0   r=   r+   r+   r,   	workspace  s    
zKernelArgs.workspacec                    sT   || j kr| j | S  | j  krF  t fdd| j  D    | j |<  S )Nc                 3   s   | ]}|  rd V  qdS )re   N)r   )rv   r   r  r+   r,   r     s     
 z)KernelArgs.seed_offset.<locals>.<genexpr>)r   r  sum)rN   r:   r   r+   r  r,   seed_offset  s    

"
zKernelArgs.seed_offsetc                 C   s*   t |dkrd| jd< dS | d| j|S )Nr  ks)r>   r   r  rM   r+   r+   r,   size  s    
zKernelArgs.sizec                 C   s   t | j | j | j S rJ   )r   r  rh   r  r   rR   r+   r+   r,   
call_names  s
      zKernelArgs.call_namesc                 C   s   |S rJ   r+   )rN   r  r<   r+   r+   r,   wrap_ptr_arg  s    zKernelArgs.wrap_ptr_argc                 C   s   t |S rJ   )r>   rN   r  r+   r+   r,   wrap_size_arg  s    zKernelArgs.wrap_size_argc                 C   s  ddl m}m} g }g }g }t| j D ]j}| |r:q*|jd }|j}t	j
|}	||	 }
||
 d|  || ||	 ||
 d q*| j D ]b\}}|| jkrqt	j
|}	||	 }
|d|
 d|  || ||	 |d|
 d q| j D ]p\}}|| jks| |r2qt	j
|}	||	 }
||
 d|  || ||	 ||
 d q| j D ]X\}}|d| d|  || | |d|  t	j
jrt	j
j| q| jd kstd|||fS )	Nre   )DTYPE_TO_CPP
INDEX_TYPEr   z* r   zconst r   zWorkspace not supported on CPU )	cpp_utilsr  r  r   r  r  r  r  r  r$   r~   r   r  r  r  r   r  r   r  wrapper_codeensure_size_computedr  rg   )rN   r  r  	call_argsarg_defs	arg_typesinplacedouterinnerr<   Z	cpp_dtyper+   r+   r,   cpp_argdefs  sL    



zKernelArgs.cpp_argdefsc              
   C   s  g }g }g }g }t | j D ]n}| |r.q||j ||jd  |tj	|jd  |t
|j|jd tj	|jd d qt| j | j D ]`\}}|| jks| |rq|| || |tj	| |t
||tj	|d q| j D ]T\}}|| || |t| |t|| tjjrtjj| q| jd k	r|d |d || j ||||fS )Nr   )r:   r;   r<   r  r  )r   r  r  r  r  r  r  r$   r~   r   r9   r   r  r   r  r   rG   rA   r  r  r  )rN   r  r  r  Zprecompile_argsr  r  r  r+   r+   r,   python_argdefs  sZ    
 







zKernelArgs.python_argdefsc                 c   s   t | j D ]n}| |rq|jD ]V}|tjjks$|tjjkrBq$|| j	kr^| j	| |j
fV  || jkr$| j| |j
fV  q$qd S rJ   )r   r  r  r  r  r$   r~   r  r  r  r  r  )rN   r  otherr+   r+   r,   aliases  s    





zKernelArgs.aliasesc                    s$    fdd}|| j o"|| jS )Nc                    s   | |kp  ||  S rJ   )r  )r:   buffersrR   r+   r,   _is_removed  s    z*KernelArgs.is_removed.<locals>._is_removed)r  r  )rN   r:   r  r+   rR   r,   
is_removed  s
     zKernelArgs.is_removedc                 C   sn   t  }t| j D ] }| |r$q||jd  q| j D ](\}}|| jks@| |r^q@|| q@|S )Nr   )	setr   r  r  r  r7  r  r  r   )rN   Z	live_outsr  r  r  r+   r+   r,   live_output_buffers  s    
zKernelArgs.live_output_buffers)N)r1   r2   r3   r   r  r   r  r  r  r   r  r5   r6   r8   r  r  r  r  r  r  r  r  r  r  r  r+   r+   r+   r,   r  =  s&   



)/r  c                   @   sV   e Zd ZdZee dddZdd Zeddd	Z	e
dd
dZdd Zdd ZdS )r   aD  A CSEVariable is just a name for an expression but it is useful to be able to annotate them on a backend dependent basis.
    To do so, the backends can simply overload `Kernel.create_cse_var`
    The "CSEVariable.update_on_args" method gives you a hook for annotations
    See example of TritonCSEVariable in triton.py
    boundsc                 C   s$   t |tst|| _|| _d| _d S r#  )rf   r   rg   r:   r  	use_count)rN   r:   r  r+   r+   r,   r   +  s    zCSEVariable.__init__c                 C   s   | j S rJ   r  rR   r+   r+   r,   __str__1  s    zCSEVariable.__str__ry   c                 C   s
   t | jS rJ   )hashr:   rR   r+   r+   r,   __hash__4  s    zCSEVariable.__hash__c                 C   s   t |t | ko|j| jkS rJ   )rG   r:   )rN   r  r+   r+   r,   __eq__7  s    zCSEVariable.__eq__c                 C   s   d S rJ   r+   )rN   r:   r   kwargsr+   r+   r,   update_on_args:  s    zCSEVariable.update_on_argsc                 C   s   | j j d| jdS )Nr   r   )r   r1   r:   rR   r+   r+   r,   r  =  s    zCSEVariable.__repr__N)r1   r2   r3   r4   r   r   r   r  r   r  r8   r  r  r  r+   r+   r+   r,   r   $  s   r   c                   @   s   e Zd Zdd Zdd ZdS )CppWrapperKernelArgsc                 C   s0   ddl m} tjr|S d||  d| dS d S )Nre   )r  r   z*)(z.data_ptr()))r  r  r   Zabi_compatible)rN   r  r<   r  r+   r+   r,   r  B  s    z!CppWrapperKernelArgs.wrap_ptr_argc                 C   s   | S rJ   r+   r  r+   r+   r,   r  L  s    z"CppWrapperKernelArgs.wrap_size_argN)r1   r2   r3   r  r  r+   r+   r+   r,   r  A  s   
r  c                   @   s   e Zd ZdZdddZee ddd	Zd
d Ze	
 dddeeeeeef e	e edddZe	
 fe	e edddZdS )CSEz Common subexpression eliminationr   tmpNc                 C   sP   || _ || _i | _|| _|pi | _|p(i | _|p6t | _t	 | _
|pHi | _d S rJ   )r  suffixcachename_prefixstore_cachereduction_cache	itertoolsr   iter_buffer_idsr  invalidated_storesvarname_map)rN   r  r  r  iter_buffersr  r  r  r+   r+   r,   r   S  s    


zCSE.__init__	keep_varsc                    sR   t | j D ]$\}}| kr| j|= | j| q fdd| j D | _d S )Nc                    s   i | ]\}}| kr||qS r+   r+   )rv   r   r   r  r+   r,   rw   l  s       z"CSE.invalidate.<locals>.<dictcomp>)listr  r   r  r7  r  )rN   r  r:   r  r+   r  r,   
invalidateg  s
    zCSE.invalidatec                 C   s    t | j| j| j| j| j| jdS )N)r  r  r  r  r  r  )r  r  r  r  r  r  r  rR   r+   r+   r,   clonen  s    z	CSE.cloneT)r  write
assignment)r;   rB   r  rz   c          	      C   sX  t |tr|j}t |tttfs,tt||s8|s8tt |trb|j	||_| j
d7  _
|S t |trt| n|}| j|d }|s8| |}|| j|< |rTtjjrtjjj|dd t |tr|r|| j | d || || j n:|r| j | d| | j }n| | j }|| n|j	||_| j
d7  _
|S )Nre   T)Z	only_oncez =z = )rf   r!   r   r>   r   r   rg   rG   r  Ztightenr  getvaluer  r  newvarr$   r  current_nodeZcodegen_originating_infor  r  Zsplicer  )	rN   r;   rB   r  r  r  	cache_keyvarr  r+   r+   r,   generatey  s>    	



 

zCSE.generate)r  rz   c                 C   s0   | j  t| j }tj||}|| j|< |S rJ   )r  nextr  r$   r  create_cse_varr  )rN   r  var_namer  r+   r+   r,   r    s    
z
CSE.newvar)r   r   r  NNNN)r1   r2   r3   r4   r   r
   r>   r  r  r   unknownr   r   r   r!   r   r  r  r+   r+   r+   r,   r  P  s*          
0r  c                       s,   e Zd Z fddZdd Zdd Z  ZS )CodeGenc                    s   t    t | _d S rJ   )r   r   r  	ExitStack
exit_stackrR   r   r+   r,   r     s    
zCodeGen.__init__c                 C   s   | j   | S rJ   )r  	__enter__rR   r+   r+   r,   r    s    
zCodeGen.__enter__c                 C   s   | j ||| d S rJ   )r  __exit__rN   exc_typeexc_valexc_tbr+   r+   r,   r    s    zCodeGen.__exit__)r1   r2   r3   r   r  r  r   r+   r+   r   r,   r    s   r  c                   @   s6   e Zd Zdd Zdd Zdd Zdd Zdd
dZd	S )
ScopedDictc                 C   s   || _ i | _d S rJ   )original_dict	new_items)rN   r   r+   r+   r,   r     s    zScopedDict.__init__c                 C   s   || j kr| j | S | j| S rJ   r  r   rN   r   r+   r+   r,   __getitem__  s    

zScopedDict.__getitem__c                 C   s   || j |< d S rJ   )r  )rN   r   r   r+   r+   r,   __setitem__  s    zScopedDict.__setitem__c                 C   s   || j kp|| jkS rJ   r  r  r+   r+   r,   __contains__  s    zScopedDict.__contains__Nc                 C   s"   || j kr| j | S | j||S rJ   )r  r   r  )rN   r   defaultr+   r+   r,   r    s    

zScopedDict.get)N)r1   r2   r3   r   r  r  r  r  r+   r+   r+   r,   r    s
   r  c                	       s  e Zd ZU dZdZdZeeee	 gee	 f  e
d< dZde
d< dZde
d< d9 fdd	Zejd	d
 Zejd:ddZeejedddZeejdddZeejedddZd;eejeeddddZejejeeeeedf f eeeedf f dddZ eejdf eeedf eedf geedf f eedf eedf dddZ!d d! Z"eeejeje#ed"d#d$Z$e%ed%d&d'Z&d<eeef ee ee ee ed(d)d*Z'ejeje#e#d+d,d-Z(ejed.d/d0Z) fd1d2Z* fd3d4Z+ejd%d5d6Z,d7d8 Z-  Z.S )=Kernelr   N	overridesload_formatstore_formatTc                    s   t    |rt jd7  _|p$t | _t | _t | _t | _	d| _
d| _t| j| j| _t | _t | _d | _d | _d | _t | _t | _t | _d| _d | _d S )Nre   r   )r   r   r   Zgenerated_kernel_countr  r   r   loadscomputestoresnum_loadnum_reductionr  newvar_prefixr  cser  must_keep_buffersstore_buffer_namesZ
_load_maskr  node_to_boundsr  r  r  Zinplace_update_buffersZmin_elem_per_threadZkernel_name)rN   r   Zincrease_kernel_countr   r+   r,   r     s(    
zKernel.__init__c                 c   s4   | j }|| _ |j  | _z
d V  W 5 || _ X d S rJ   )r  r   r  Z
get_boundsr  )rN   r   priorr+   r+   r,   set_current_node  s    
zKernel.set_current_nodec           	      c   sr   dd }|d kr|}| j }| j}| j}| j}|| _ || _|| _||| _z
d V  W 5 || _ || _|| _|| _X d S )Nc                 S   s0   |   }t| j|_t| j|_t| j|_|S rJ   )r  r  r  r  r  )r  Znew_cser+   r+   r,   	scope_cse  s
    z&Kernel.swap_buffers.<locals>.scope_cse)r  r  r  r  )	rN   Zlbcbsbr  r  r  r  r  r+   r+   r,   swap_buffers  s"    

zKernel.swap_buffersr:   r]   rz   c                 C   s   t d S rJ   rK   )rN   r:   r]   r+   r+   r,   r     s    zKernel.load)r:   r]   c                 C   s,   | j }z| j| _ | ||W S || _ X dS )z+A load the depends on an index we have readN)r  r  r   )rN   r:   r]   r  r+   r+   r,   indirect_load"  s
    zKernel.indirect_loadr:   r]   r   c                 C   s   t d S rJ   rK   )rN   r:   r]   r   r+   r+   r,   r   ,  s    zKernel.store_reductionr:   r]   r   moderz   c                 C   s   t d S rJ   rK   )rN   r:   r]   r   r   r+   r+   r,   r   /  s    zKernel.store.r<   	src_dtypereduction_typer   rz   c                 C   s   t d S rJ   rK   )rN   r<   r"  r#  r   r+   r+   r,   r   4  s    zKernel.reductiondtypes
combine_fnr  rz   c                 C   s   t d S rJ   rK   )rN   r%  r&  r  r+   r+   r,   scan=  s    zKernel.scanc                 C   s   t d S rJ   rK   rR   r+   r+   r,   
var_rangesG  s    zKernel.var_rangesr  offsets_nameoffsets_sizeindexing_dtyperightrz   c                 C   s   t dS )z3
        See [Note: Inductor bucketize op]
        NrK   )rN   r  r*  r+  r,  r-  r+   r+   r,   	bucketizeJ  s    zKernel.bucketizery   c                 C   s   t d S rJ   rK   rR   r+   r+   r,   assert_functionW  s    zKernel.assert_function)r  loweruppermaskrz   c              	   C   s   t |trt|}t |ts t|d ks6t |ts6t|d ksLt |tsLt|r|rd| d| d| d| d	}| d| d| }n2|r| d| }|}n|st| d| }|}|rd| d| d}| j d| d| dS )	Nr   z <= z) & (z < r   z) | ~(z, "index out of bounds: z"))rf   r   r>   rg   r/  )rN   r  r0  r1  r2  rX  Z
cond_printr+   r+   r,   indirect_assert[  s"    
zKernel.indirect_assertrB   r  r0  r1  c                 C   s   t d S rJ   rK   )rN   rB   r  r0  r1  r+   r+   r,   check_boundsy  s    zKernel.check_bounds)r]   rz   c                 C   s   t d S rJ   rK   )rN   r]   r+   r+   r,   index_to_str~  s    zKernel.index_to_strc                    sv   G  fddd  t t ddd}t   js>tt j	t
   j	t S )Nc                
       s  e Zd Zd_e Zeeede	f d fddZ
e fddZed"e	eejef ed	fd
dZeejejeedfddZeeeje	dfddZed#eeje	eddfddZeeeje	dfddZeejejeee	ee	df f ee	ee	df f dfddZeeejdf eee	df ee	df gee	df f ee	df ee	df dfddZee	eejejee	dfd d!ZdS )$z"Kernel.__enter__.<locals>.CSEProxyCSEProxy.)r:   rz   c                    s    fdd}|S )Nc                     sB   j f t } fdd}t||S )Nc                    s&   j jj| d}|  |S )Nr  )r  r  r  r  )r   Zcsevar)r   r  r  r:   rN   r+   r,   do_cse  s    zMKernel.__enter__.<locals>.CSEProxy.__getattr__.<locals>.inner.<locals>.do_cse)_bound_variabler  pytreeZtree_map)r   r  r   r8  )r7  r:   parent_handlerrN   )r   r  r  r,   r    s    z=Kernel.__enter__.<locals>.CSEProxy.__getattr__.<locals>.innerr+   )r:   r  r7  r;  rN   r  r,   r    s    z.Kernel.__enter__.<locals>.CSEProxy.__getattr__c                    s   ddl m} ttj|r t S tjj  j	| kr^j
dk	r^tj
tsLtj
 t S tjrtt| rt fdddD rt S |rtdd }tt||}tj| | S t S dS )	z
                If the variable comes from an FX node, we forward the bound we have already computed
                Else, if the variable when codegen'ing another op, we try to compute its bounds
                r   )TritonTemplateKernelNc                 3   s   | ]}| j kV  qd S rJ   )r   )rv   sZfx_noder+   r,   r     s   zEKernel.__enter__.<locals>.CSEProxy._bound_variable.<locals>.<genexpr>)Zset_indirectr   r'  c                 S   s,   t | tr| jS t | tjr$t| S | S d S rJ   )rf   r   r  r5   r6   r   r&  r+   r+   r,   arg_to_bound  s
    
zHKernel.__enter__.<locals>.CSEProxy._bound_variable.<locals>.arg_to_bound)Zselect_algorithmr=  rf   r$   r  r   r  interpreterr  r   r  r  rg   r  r   Zcompute_all_boundsr   r   anyr  r   r  vr_analysis)r:   r   r  r=  r@  Z
arg_bounds)r7  rN   r?  r,   r9    s"    z2Kernel.__enter__.<locals>.CSEProxy._bound_variableT)r  r  checkc                    sN  t |trt|}t |tjs(t|| jjdk rt	| t
|tj}| jjdkrpt| d}t||| }t }| jt krt |tjr| jttj d@ }t|j| |j| }| jjdkr| jtdtj@ }||B }jjj||d}  | ||}t|rJ| jjdk }	t |tj p8| jj|k  }
|||	|
 |S )Nr   r   r  )rf   r   r5   r@   r6   rg   r  r0  r   r7  r   r?   longr1  rp   rU  r   r  NumberZoor  r  r  indirect_indexingr   r5  )r  r  rD  Zstmrp   Z
new_boundsZ
neg_boundsposZ	sympy_varZassert_lowerZassert_upper)r;  rN   r+   r,   rG    s<    

  

z4Kernel.__enter__.<locals>.CSEProxy.indirect_indexingr4  c                    s     | |||S rJ   )r5  r4  rR   r+   r,   r5    s    z/Kernel.__enter__.<locals>.CSEProxy.check_boundsr  c                    sr   |  j jkrtjj|  t|tjr2 	| |S  j j
}| |krJ||  S  | |}|jdkrn  jd7  _|S r#  )r  r  r$   r  r  r7  r   r   TMPr  r  r   r  r  )r:   r]   r  outrR   r+   r,   r     s    
z'Kernel.__enter__.<locals>.CSEProxy.loadNr  c                    sh    j |  |d krB| jj| <  jrB j D ]}| jj|< q0| tjjkr` j	| |||dS d S d S )N)r   )
r  r7  r  r  r  get_mutationsr$   r~   r  r   )r:   r]   r   r   
other_namerR   r+   r,   r     s    z(Kernel.__enter__.<locals>.CSEProxy.storer  c                    sX    j |  | jj| <  jr: j D ]}| jj|< q(| tjjkrT 	| ||S d S rJ   )
r  r7  r  r  r  rK  r$   r~   r  r   )r:   r]   r   rL  rR   r+   r,   r     s    z2Kernel.__enter__.<locals>.CSEProxy.store_reductionr!  c                    s     j d7  _  | |||S r#  )r  r   )r<   r"  r#  r   rR   r+   r,   r   !  s    z,Kernel.__enter__.<locals>.CSEProxy.reductionr$  c                    s     | ||S rJ   )r'  )r%  r&  r  rR   r+   r,   r'  +  s    	z'Kernel.__enter__.<locals>.CSEProxy.scanr)  c                    s     | ||||S )ay  
                [Note: Inductor bucketize op]

                Given values (tensor) and offsets_name (reference to the name of a 1D
                tensor), calculate the bucket that each value belongs to.

                e.g. for values [-1, 0, 1, 2, 3, 4, 5, 9], offsets [0, 4, 4, 8], right=True
                return =        [ 0, 1, 1, 1, 1, 3, 3, 4].

                When right == False, bucket i refers to range (offsets[i], offsets[i+1]].
                When right == True,  bucket i refers to range [offsets[i], offsets[i+1]).

                Offsets must be non-decreasing or the result is undefined.
                )r.  )r  r*  r+  r,  r-  rR   r+   r,   r.  6  s        z,Kernel.__enter__.<locals>.CSEProxy.bucketize)T)N)r1   r2   r3   r:   r   rC  r   r>   r   r   r  r9  r   r5   r6   r   r8   rG  r5  r   r#   r   r   r?   r<   r"   r   r   r'  r.  r+   r<  r+   r,   r7    sr   "*   ,       
	



r7  r  c                 S   s   | S rJ   r+   r  r+   r+   r,   _typecheck_CSEProxyQ  s    z-Kernel.__enter__.<locals>._typecheck_CSEProxy)r    r   r   r  r	  rg   r$   Zget_ops_handlerr  enter_contextZset_ops_handlerZset_kernel_handler)rN   rM  r   r<  r,   r    s     O

zKernel.__enter__c                    s(   t jjrt jj  t ||| dS )zj
        Note that V.graph.scheduler can be None when codegening triton template
        kernels.
        N)r$   r~   r   Zremove_kernel_local_buffersr   r  r  r   r+   r,   r  [  s    zKernel.__exit__c                    s\   t |ttfr  fdd|D S tjj|}t|jdd d} fdd|D }t	||S )Nc                    s   g | ]}  |qS r+   )rename_indexingr  rR   r+   r,   r   h  s     z*Kernel.rename_indexing.<locals>.<listcomp>c                 S   s   | j S rJ   r  )r>  r+   r+   r,   rq  j  rr  z(Kernel.rename_indexing.<locals>.<lambda>)r   c                    s0   i | ](}t |tjtjtjfr| j|qS r+   )r   r   ZUNBACKED_INTZSIZEZPRECOMPUTED_SIZEr   r  r  rR   r+   r,   rw   k  s    z*Kernel.rename_indexing.<locals>.<dictcomp>)
rf   r  tupler$   r~   r   r   sortedZfree_symbolsr   )rN   r]   Zsorted_symbolsZreplacementsr+   rR   r,   rO  d  s    
zKernel.rename_indexingc                 O   s
   t ||S rJ   )r   )rN   r   r  r+   r+   r,   r  y  s    zKernel.create_cse_var)NT)NN)N)N)/r1   r2   r3   r  r  r	  r	   r   r    r   r7   r
  r  r   r  r  r  r  r>   r5   r6   r   r   r  r   r#   r   r?   r<   r"   r   r   r   r'  r(  r8   r.  propertyr/  r3  r5  r6  r  r  rO  r  r   r+   r+   r   r,   r    s~   
"
	
    "


 
    [	r  c                   @   s<   e Zd ZU dZee ed< dZee	j ed< dZ
eed< dS )r   r   r   Nr<   r   ops_name)r1   r2   r3   r   r   r>   r7   r<   r	   r?   rS  r+   r+   r+   r,   r   }  s   
r   c                  C   s4   zdd l } | j| jdW S  tk
r.   Y d S X d S )Nr   )Z	undefined)jinja2EnvironmentZStrictUndefinedImportError)rT  r+   r+   r,   
jinja2_env  s    rW  c                   @   sb   e Zd ZdZedeedddZedd Zedd	 Z	ed
ddZ
dd ZddddZdS )KernelTemplatezg
    Base class for defining kernel templates.

    Children classes: TritonTemplate, CUDATemplate
       )sourcenum_indentsc                    sD   |  d}t|dkr: fdd|dd  D |dd < d|S )NTre   c                    s   g | ]}d    | qS )r   r+   )rv   r  indents_spacingr[  r+   r,   r     s    z6KernelTemplate.indent_except_first.<locals>.<listcomp>r   )
splitlinesr   r   )rZ  r[  r]  linesr+   r\  r,   indent_except_first  s    

z"KernelTemplate.indent_except_firstc                 C   s(   t  }|d k	r$tj|jd< || S d S )Nr`  )rW  rX  r`  filtersZfrom_string)rZ  envr+   r+   r,   _template_from_string  s
    
z$KernelTemplate._template_from_stringc                    s   t jj  fdd}|S )Nc                    s   |   kr S  | S rJ   )get_namer   r  Z_get_dtype_realfake_outr+   r,   r     s    z1KernelTemplate._fake_get_dtype.<locals>.get_dtype)r$   r~   r   )rf  r   r+   re  r,   _fake_get_dtype  s    zKernelTemplate._fake_get_dtyper  c                 C   s
   || _ d S rJ   r  rM   r+   r+   r,   r     s    zKernelTemplate.__init__c                 K   s0   z| | jf | W n tk
r*   Y nX dS )z
        Maybe generates a new ChoiceCaller and appends it into existing choices.

        choices: A list of ChoiceCallers.
        kwargs: Additional kwargs to be passed to self.generate() to generate a new ChoiceCaller.
        N)r  r  rL   )rN   choicesr  r+   r+   r,   maybe_append_choice  s    z"KernelTemplate.maybe_append_choiceztorch._inductor.ir.ChoiceCallerry   c                 K   s   t dS )zM
        Generates a ChoiceCaller instance from the given arguments.
        NrK   )rN   r  r+   r+   r,   r    s    zKernelTemplate.generateN)rY  )r1   r2   r3   r4   r   r>   r   r`  rc  rg  r   ri  r  r+   r+   r+   r,   rX    s   


rX  )F)vr  dataclassesr   r  r'   r3  r   r   r   typingr   r   r   r   r   r   r	   r
   r   r   r5   Zsympy.printing.printerr   r?   Ztorch.fxZtorch._prims_commonr   Ztorch.utilsr   r:  Ztorch.utils._sympy.symbolr   r   r   Ztorch.utils._sympy.value_rangesr   r   r   r   r   r   utilsr   r   r   r   r   r   Zvirtualizedr   r    r!   r"   r#   r$   Z_loggingZgetArtifactLoggerr1   r%   r-   	dataclassr.   r9   rA   rC   ZKernelArgTyperH   r>   r7   rI   rU   rG   rX   rZ   r8   r\   r6   ra   rd   rj   	lru_cacheru   Zbfloat16r   Zfloat16r+  Zfloat64Zint8Zint16r%  r   Zuint8Zuint16Zuint32Zuint64r   rx   r   r   r  rn  r  ZINT_TO_FLOATrh  r  r  r  r  r  r   r  r  r  r  r  r   rW  rX  r+   r+   r+   r,   <module>   sz   0  
      '  d h`   -
