U
    yh9                     @  s:  d dl mZ d dlZd dlZd dlmZmZmZmZm	Z	m
Z
mZ d dlZddlmZmZ ddlmZmZmZmZ ddlmZmZmZ ddlmZmZmZmZmZm Z  dd	l!m"Z" d
dl#m$Z$ e
rddlm%Z% e&e'Z(ej)j*Z*dd Z+dddddddddddddddddddddgZ,e-dd e,D Z.ej/j0rPe-dd e.D Z.ej1e$e.dZ2dZ3ede+de3 d e3 d  d!Z4eej5d"d#e*j5j6d$Z7d%d& Z8ee8dZ9G d'd( d(eZ:d)d)d*d+d,d,d-d,d.d/d0
d1d2Z;d3d4 Z<d5d6 Z=ee*j5d)d)d)d7d7d7d-d7d.d8	d9dZ5ee*j>d:d; Z>d<d= Z?ee*j5e? dS )>    )annotationsN)castListOptionalSequenceTupleTYPE_CHECKING	TypedDict   )configir)add_layout_constraintconstrain_to_fx_strides	loweringsregister_lowering)autotune_select_algorithmExternKernelChoiceTritonTemplate)ceildivis_onesis_zerospad_listlikesympy_productuse_triton_template)V   )filtered_configs)	TensorBoxc                 C  s*   t | | | |d t ||d |d fS )NZBLOCK_MZBLOCK_NGROUPS)r   )nchwmeta r$   M/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/kernel/conv.py	conv_grid+   s    r&   )@         r
      T)r   cond)r(   r'   r)   r
   r*   )i   r)   r)   r      )   r-       r
   r,   )r'   r'   r.   r
   r*   )r'   r(   r.   r
   r,   )r(   r'   r.   r
   r,   c                 c  s2   | ]*}|d  rt ttttttf |d V  qdS )r+   r   N)r   r   int.0r   r$   r$   r%   	<genexpr>A   s   r2   c                 c  s,   | ]$}|d  |d |d d|d fV  qdS )r   r   r
   r*   Nr$   r0   r$   r$   r%   r2   I   s    )Zconfigsa  
        idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
        idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
        idx_x_c = tl.arange(0, BLOCK_K) + k

        x_ptrs = x_base + (
            (idx_x_h * stride_xh)[:, None]
            + (idx_x_w * stride_xw)[:, None]
            + (idx_x_c * stride_xc)[None, :]
        )
        mask_x = (
            (idx_n < BATCH)[:, None]
            & (idx_x_h >= 0)[:, None]
            & (idx_x_h < IN_H)[:, None]
            & (idx_x_w >= 0)[:, None]
            & (idx_x_w < IN_W)[:, None]
            & (idx_x_c < GROUP_IN_C)[None, :]
        )
        matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)

        w_ptrs = w_base + (
            (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww)
        )
        mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
        matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
        acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
convolutionag  
{{def_kernel("X", "W")}}
    # Tensor dimensions
    BATCH = {{size("X", 0)}}
    IN_C = {{size("X", 1)}}
    IN_H = {{size("X", 2)}}
    IN_W = {{size("X", 3)}}
    OUT_C = {{size(None, 1)}}
    OUT_H = {{size(None, 2)}}
    OUT_W = {{size(None, 3)}}

    # Strides:
    stride_xn = {{stride("X", 0)}}
    stride_xc = {{stride("X", 1)}}
    stride_xh = {{stride("X", 2)}}
    stride_xw = {{stride("X", 3)}}
    stride_wc_out = {{stride("W", 0)}}
    stride_wc_in = {{stride("W", 1)}}
    stride_wh = {{stride("W", 2)}}
    stride_ww = {{stride("W", 3)}}

    nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    idx_y_w = nhw % OUT_W
    nh = nhw // OUT_W
    idx_y_h = nh % OUT_H
    idx_n = nh // OUT_H
    idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)

{% if GROUPS == 1 %}
    group = 0
    GROUP_IN_C = IN_C
    GROUP_OUT_C = OUT_C
{% else %}
    group = tl.program_id(2)
    GROUP_IN_C = IN_C // GROUPS
    GROUP_OUT_C = OUT_C // GROUPS
{% endif %}

    x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
    w_base = (
        W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
    )

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

{% if UNROLL %}
{% for i in range(KERNEL_H) %}
{% for j in range(KERNEL_W) %}
    i = {{i}}
    j = {{j}}
    for k in range(0, GROUP_IN_C, BLOCK_K):
        a  
{% endfor %}
{% endfor %}
{% else %}
    # Could be simplified, but slightly slower:
    # for i in range(KERNEL_H):
    #     for j in range(KERNEL_W):
    #         for k in range(0, GROUP_IN_C, BLOCK_K):
    BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
    for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
        k = (ijk % BLOCK_K_COUNT) * BLOCK_K
        ij = ijk // BLOCK_K_COUNT
        i = ij // KERNEL_W
        j = ij % KERNEL_W
        a  
{% endif %}

    mask = (
        (idx_n < BATCH)[:, None]
        & (idx_y_h < OUT_H)[:, None]
        & (idx_y_w < OUT_W)[:, None]
        & (idx_y_c < GROUP_OUT_C)[None, :]
    )
    idx_n = idx_n[:, None]
    idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
    idx_h = idx_y_h[:, None]
    idx_w = idx_y_w[:, None]

    # inductor generates a suffix
    {{store_output(("idx_n", "idx_c", "idx_h", "idx_w"), "acc", "mask")}}
)namegridsourcezat::convolutionF)Zhas_out_variantZop_overloadc             	   C  sD   t t |dd}t j| dddd|dd|dddddS )Nr   r
      r   )out)torchsqueezematmulpermute)xr"   r9   r$   r$   r%   conv1x1_via_mm   s     
 r?   c                   @  s>   e Zd ZU ded< ded< ded< ded< ded< ded	< d
S )ConvLayoutParamstuple[int, ...]stridepaddingdilationbool
transposedoutput_paddingr/   groupsN)__name__
__module____qualname____annotations__r$   r$   r$   r%   r@      s   
r@   r   zOptional[TensorBox]zSequence[int]rA   rE   r/   z	ir.Layout)
r>   weightbiasrB   rC   rD   rF   rG   rH   returnc	                 C  s   t jj| tjjtj| ddtj|ddtj|ddt jj	
|t jj	
|||t jj	
||	}	t|	 }
t|	 }W 5 Q R X t|  |  |
|S )z)Determine output layout for a convolutionT)Zguard_shape)r   graphZ	fake_moder:   opsatenr3   r   Zir_node_to_tensorsizevars
size_hintsZconvert_shape_to_inductorsizerB   ZFixedLayoutZ
get_deviceZ	get_dtype)r>   rM   rN   rB   rC   rD   rF   rG   rH   outputsizesr$   r$   r%   conv_layout   s(    
rX   c                 C  s&   t tt| }|d|d |S )Nr   r7   )listreversedrangeinsertpop)rankorderr$   r$   r%   channels_last_order  s    r`   c           
      C  s8  t | }t|d D ]}ttj |dd}qttj |ddg}|  d dkrhtj	| t
|} n|   |   tt|}||d ttj | |} |  ^ }}ttj | t||g} |d krttj | |}nttj || |}ttj ||d}tt|}	|	d|	d ttj ||	S )Nr
   r7   dimr   r   )r7   )lenget_sizer[   LrR   r;   r=   r   ExternKernelrequire_stride_orderr`   realizefreeze_layoutrY   appendr]   Zreshaper   mmZaddmmr\   )
r>   rM   rN   r^   _Z	x_permuterW   in_chanresultZresult_permuter$   r$   r%   convert_1x1_conv_to_mm  s(    ro   z	List[int])	r>   rM   rN   rB   rC   rD   rF   rG   rH   c	                   s  t |}t |}t |}t |}t|ts8tjj|}t|tsFt||||||d t	 t	 d krt
tj tt
tj d	 |f ddS tjj	 ^}	}
}t|t|}t|}t|}t|} fdd}tjptj}tjs$|r| rt|rt|rt|rt|r|st|r|dkrtjjt	 drt|S |d k	rtdkrtd f }t
tj |t
tj ||	 d gdg  S     tjjrDdkrDtj j d7  _ tj!"tj!"t#d f }nBt#d f }t$tjj%|j&}tj!'|tj!'|d	d
ddddg}|d krg}d  d< |(dd n,|g}|  |)  tjj|	  t*j+|||f g}t,|rdkrt|r|st|rtjj-|
	 d rt|rt|rt|r|dkr|.t/+|| t0t	 d f	 dd  |	|
D ]`}t1j2|ff||d |d |d |d |d |d |t|t3j4j5j6|j7|j8d|j9 qt:d|||S )N)rB   rC   rD   rF   rG   rH   r   r   ra   c                    sD   t jjrdkrdS td f } tt jj| j}|tj	kS )Nr
   T)
r   rP   
layout_optrX   r   get_stride_orderrS   rT   rB   ZNHWC_STRIDE_ORDER)layoutreq_stride_orderkwargsndimrM   r>   r$   r%   channels_last_convZ  s    z'convolution.<locals>.channels_last_convcpur
   rB   rC   rD   rF   rG   rH   rN   )Zinput_nodesrr   ZKERNEL_HZKERNEL_WZSTRIDE_HZSTRIDE_WZ	PADDING_HZ	PADDING_Wr   ZUNROLLZ
ALLOW_TF32
num_stages	num_warpsr3   )r   );tuple
isinstancer/   r   rP   rS   Zevaluate_static_shapeAssertionErrorrc   rd   re   rR   r;   r3   expandZevaluate_static_shapesr   r   Zmax_autotuneZmax_autotune_gemmZconv_1x1_as_mmr   r   Zstatically_known_gtr   ro   r   Zget_device_typeaddviewrh   rp   Znum_channels_last_convrf   Zrequire_channels_lastrX   rq   rT   rB   rg   r\   ri   aten_convolutionbindr   Zstatically_known_equalsrj   aten_conv1x1_via_mmconv_configsconv2d_templateZmaybe_append_choicer:   backendsZcudnn
allow_tf32ry   rz   ru   r   )r>   rM   rN   rB   rC   rD   rF   rG   rH   Zout_chanrm   Zkernel_shaperw   Zautotuning_gemmrn   rr   rs   Zordered_kwargs_for_cpp_kernelargschoicescfgr$   rt   r%   r3   .  s   
	"





	 "


 
c              
   C  s   t | ||||||||	S N)r3   )r>   rM   rN   rB   rC   rD   rF   rG   rH   Z	benchmarkZdeterministicZcudnn_enabledr   r$   r$   r%   _convolution  s            r   c                 O  s:   | j tjjjjksttjj	r&||fS t
| f||S d S r   )targetr:   rQ   rR   r3   defaultr}   r   rP   rp   r   )Zfx_noder   ru   r$   r$   r%   constrain_conv_to_fx_strides  s    r   )@
__future__r   	functoolsloggingtypingr   r   r   r   r   r   r	   r:    r   r   Zloweringr   r   r   re   r   Zselect_algorithmr   r   r   utilsr   r   r   r   r   r   Zvirtualizedr   Z	mm_commonr   r   	getLoggerrI   logrQ   rR   r&   Zkernel_configsr{   Zplatform_configsversionZhippartialr   Z	LOOP_BODYr   r3   r   r   r?   r   r@   rX   r`   ro   r   r   r$   r$   r$   r%   <module>   s   $ 

!45DE[
	 #  )
