U
    Mh(/                  	   @   s  d dl Z d dlmZ d dlmZ e edZe r|d dlZd dlmZ	 ej
dddd	Zej
ddd
ddZejejddidddejddidddejddidddejddidddgg dej
ddddZejejddddddejddddddejddddddejddddddgg dej
dddddZej
ddddZej
ddddZej
ddddZej
d d! Zej
ddd"d#d$Zej
ddd%d&d'Zej
ddd(d)d*Zej
e	jdd+d,Zej
e	jdd-d.Zd d/lmZmZ ej
ddd0d1Zej
ddd2d3Zej
ddd4d5Z ej
ddd6d7Z!ej
ddd8d9Z"dS ):    N)HAS_CUDA)
has_tritonzrequires cuda)languageztl.constexpr
BLOCK_SIZEc                 C   sp   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
|	|
 }t j|| ||d d S Nr   Zaxismasktl
program_idarangeloadstorein_ptr0in_ptr1out_ptr
n_elementsr   pidblock_startoffsetsr
   xyoutput r   V/var/www/html/venv/lib/python3.8/site-packages/torch/testing/_internal/triton_utils.py
add_kernel   s    r   )ARGS_PASSEDr   c                 C   s~   t jdd}|| }|t d| }||k }	t j| | |	d}
|dkrbt j|| |	d}|
| }n|
}t j|| ||	d d S )Nr   r   r	   twor   )r   r   r   r   r   r   r   r   r   r
   r   r   r   r   r   r   add_kernel_with_optional_param    s    	
r!   r            )Z
num_stagesZ	num_warps   @   )Zconfigskeyc                 C   sp   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
|	|
 }t j|| ||d d S r   r   r   r   r   r   add_kernel_autotuned5   s    r(   )BLOCK_SIZE_XBLOCK_SIZE_Yc                 C   s   t d| }|t d|d d d f  }||k }	t d| }
|
t d|d d d f  }||k }|}|}t | |||   |	|@ }t | |||   |	|@ }|| }t ||||   ||	|@  d S )Nr      r   )r   r   r   Z
x_elementsZ
y_elementsr)   r*   ZxoffsetZxindexZxmaskZyoffsetZyindexZymaskx1Zy0Ztmp0Ztmp1Ztmp2r   r   r   add_kernel_2d_autotunedO   s    r-   c                 C   st   t jdd}|| }|t d| }||k }	t j| | |	d}
t j|| |	d}|
| | }t j|| ||	d d S r   r   )r   r   r   r   Zscaling_factorr   r   r   r   r
   r   r   r   r   r   r   add_kernel_with_scalingw   s    	r.   c           
      C   s^   t jdd}|| }|t d| }||k }t j| | |d}d| }	t j|| |	|d d S Nr   r   r	      r   )
r   r   r   r   r   r   r   r
   r   r   r   r   r   mul2_kernel   s    r1   c           	      C   s^   t jdd}|| }|t d| }||k }t j| | |d}d| }t j| | ||d d S r/   r   )	Zptrr   r   r   r   r   r
   r   r   r   r   r   mul2_inplace_kernel   s    r2   c                 C   s   t | dk| dS )Nr   )r   where)r   r   r   r   	zero_negs   s    r4   )r   
ACTIVATIONc           
      C   s   t jdd}|| }|t d| }||k }|dkrDt| ||d n|dkr^t| | |||d t j| | |d}	t j|| |	|d d S )Nr   r   r2   r   r   r	   )r   r   r   r2   r   r   r   )
r   r   r   r   r5   r   r   r   r
   r   r   r   r   indirection_kernel   s    r6   )X_BLOCK_SIZEY_BLOCK_SIZEc                 C   s   t jdd}t jdd}|| }|| }	|t d| }
|	t d| }|d d d f | |
d d d f  }|d d d f | |
d d d f  }t | | }t || |d  d S )Nr   r   r+   g       @r   )Zin_ptrr   Zin_y_strideZout_y_strider7   r8   xidZyidZx_startZy_startZ	x_offsetsZ	y_offsetsZsrc_offsetsZdst_offsetssrcr   r   r   double_strided_kernel   s    	$$r;   )nBLOCKc           	      C   sx   t | t d| }t |t d| }t |g|t j}t jdd|||gt jddd}t |t d| | d S )Nr   zshf.l.wrap.b32 $0, $1, $2, $3;z
=r,r, r, rTr+   )Zdtypeis_purepack)r   r   r   fullZint32Zinline_asm_elementwiser   )	XYZr<   r=   r   r   szr   r   r   inline_asm_kernel   s    rF   c           
   	   C   s   t jdd}|| }t jt j| |gdg|g|gdgddgd}t jt j||gdg|g|gdgddgd}|| }	t jt j||gdg|g|gdgd|	dgd d S Nr   r   r+   )baseshapestridesr   Zblock_shapeorder)Zboundary_checkr   r   r   Zmake_block_ptrr   )
x_ptrZy_ptr
output_ptrr   r   r   r   r   r   r   r   r   r   add_kernel_with_block_ptr   sJ    rO   c              	   C   s   t jdd}|| }t jt j| |dgddg|dg|dgddgddgd}|}t jt j||dgddg|dg|dgddgd|dgd d S rG   rL   )rM   rN   r   r   r   r   r   r   r   r   r   kernel_with_block_ptr_2d  s4    rP   )r   r   c                 C   sj   t jdd}|| }|t d| }||k }t| | |d}	t|| |d}
|	|
 }t|| ||d d S r   r   r   r   r   r   add_kernel_with_import2  s    rQ   c                 C   s   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
t ddkrh|	|
 }n|	|
 }t j|| ||d d S r   r   r   r   r   r   cond_op_kernelC  s    
rR   c                 C   sp   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
|	|
 }t j|| ||d d S r   )r   r   r   r   Z
atomic_addr   r   r   r   atomic_add_kernelW  s    rS   c                 C   s   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
tdD ] }|	|
 }t j|| ||d qXd}|dkr|d8 }|	|
 }t j|| ||d q~d S )Nr   r   r	   r0   r+   )r   r   r   r   ranger   )r   r   r   r   r   r   r   r   r
   r   r   ir   r   r   r   add_4_times_kernelh  s    rV   c                 C   sp   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
|	|
 }t j|| ||d d S r   r   )r   r   r   r   r   r   r   r   r
   r   r   r   r   r   r   add_kernel_out_of_order_fn2  s    rW   )#ZunittestZ&torch.testing._internal.inductor_utilsr   Ztorch.utils._tritonr   Z
skipUnlessZrequires_cudaZtritonr   r   Zjitr   r!   ZautotuneZConfigr(   r-   r.   r1   r2   r4   r6   r;   rF   Z	constexprrO   rP   Ztriton.languager   r   rQ   rR   rS   rV   rW   r   r   r   r   <module>   s   	        
-!