U
    yh=                     @   s  d dl Z d dlZd dlmZmZmZmZ d dlZd dlm	Z	 d dl
mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZmZm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) e*e+Z,ej-j.Z.ede(ddZ/eej0dZ1eej2de.j2j3dZ4eej5dZ6dd Z7ddddddZ8ee8dZ9ee.j0ddddddZ:dd  Z;ee.j5ddddd!d"Z<ee.j2dddddd#d$d%Z=d&d' Z>ee>dZ?e @deeA eBd(d)d*ZCd+d, ZDddd-d.ZEdS )/    N)AnyDictListOptional)CppPackedGemmTemplate)V   )config)CUTLASSGemmTemplate)WrapperCodeGen)FlexibleLayout)register_lowering)autotune_select_algorithmExternKernelChoiceNoValidChoicesErrorTritonTemplate)use_aten_gemm_kernelsuse_cpp_packed_gemm_templateuse_cutlass_templateuse_max_autotuneuse_triton_template   )addmm_epilogueint8_mm_configsmixed_mm_configsmm_args
mm_configsmm_grid
mm_optionsmma  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1):
        ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        ram = rm % M
    if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1):
        rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        rbn = rn % N
    rk = tl.arange(0, BLOCK_K)
    A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
    B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    for k in range(K, 0, -BLOCK_K):
        if EVEN_K:
            a = tl.load(A)
            b = tl.load(B)
        else:
            a = tl.load(A, mask=rk[None, :] < k, other=0.)
            b = tl.load(B, mask=rk[:, None] < k, other=0.)
        if B_PROLOGUE_CAST_TYPE is not None:
            b = b.to(B_PROLOGUE_CAST_TYPE)
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)
        A += BLOCK_K * stride_ak
        B += BLOCK_K * stride_bk

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
)namegridsourcez
at::mm_outzat::addmm_out)Zop_overloadzat::_int_mmc                 C   s   |   tjtjfkS )N)	get_dtypetorchZint8Zuint8)Zmat r%   K/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/kernel/mm.py_is_int8_matv   s    r'   outalphabetac                C   sL   |  ddks| ddkr6tj| d |||||dS tj| |||||dS )z
    Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
    kernel under the hood.  There are a few shapes where this is slower,
    but they are rare.
    r   r   r(   )stridesizer$   addmm)inpmat1mat2r)   r*   r+   r%   r%   r&   
bias_addmmz   s    r2   )Ztype_promotion_kindlayoutc             
   C   s  t | ||d\}}}}} }|}t s8t|j|j|jd}t rPt| |f|gng }t	| |g|\}}	|	rt
|rt|||D ],}
tj|f| |f|dt|
|||| q~|r|	rt||||rt||| |g t|| |rt||| |g t|dkr.t s.tjr.td t| |f| S ztd|| |g|W S  tk
r   tjs^ td t| |f|  Y S X d S )Nr3   devicedtyper-   Zinput_nodesr4   r   3No choices for GEMM, using ATen backend as fallbackr   AAll choices for GEMM were invalid, using ATen backend as fallback)r   r   r   r6   r7   r-   r   aten_mmbind_is_static_problemr   r   mm_templatemaybe_append_choicer   r   r
   add_cutlass_gemm_choicesr   r   add_choicesleninductor_configautotune_fallback_to_atenlogwarningoutput_noder   r   )r0   r1   r4   mnkZaten_layoutchoicesstatic_shape
is_nonzeror	   r%   r%   r&   tuned_mm   sX      


rN   c           	      C   sx   d}t |j}|d krRd}|jD ]&}t |}|d k	r"|dkr"d} qJq"d|fS d}|D ]}||9 }qZ|dk}||fS )NTr   Fr   )r   Z%statically_known_list_of_ints_or_noner-   statically_known_int_or_none)	Zinputs_tensorsr4   rL   Zstatic_sizeZnonzerosszZnumeldimr%   r%   r&   r=      s    


r=   c             
   C   sn  t | ||tjd\}}}}} }t| |g|\}}|oD|oDt||||}t r^t| |f|gng }	|srt|ddrvg }	|rt	j
|	|| |gddd |rt|ddrt|||D ],}
tj|	f| |f|dt|
|||| qt|	dkrtd t| |f|g}	ztd|	| |g|W S  tk
rh   tjs6 td	 t| |f|g}	td|	| |g| Y S X d S )
Nr4   	out_dtypeT)Zenable_int32ZfuseableZnon_fuseabler8   r   z^No choices for integer GEMM avaialbe using configured backends, using ATen backend as fallbackint_mmr:   )r   r$   int32r=   r   r   aten__int_mmr<   r   r
   r@   r   r>   r?   r   rB   rE   rF   r   r   rC   rD   )r0   r1   r4   rH   rI   rJ   rL   rM   Zuse_cutlassrK   r	   r%   r%   r&   tuned_int_mm   sV           
rY   )r*   r+   r4   c             
   C   s  d}t ||| |d\}}}	}}}}
t| ||g|\}}|r@t sddlm}m} t||rn||j|j|j	d}t
 rtj| ||f|||dgng }td|| ||g|S t
 rtj|
||f|||dgng }t
 r|
 d dkr|
 jdkrtjjr|dtj|
||f|||d |r~t|r~t|||	D ]B}tj|f|
||f|d	t||||	|d
t|j||d q:|r|rt||||	rt|
jj d dkrt!j"|||||
g||d t#|||rt$j%|||
||g||d d}t&|dkrt'(d d}|r|)tj|
||f||||d |
 d dkr|
 jdkrtjjr|dtj|
||f|||d ztd||
||g|W S  t*k
r   tj+s t'(d tj| ||f||||d}|,  Y S X d S )N)r+   r*   r3   r   )FixedLayoutr   r5   )r*   r+   r.   cudar8   r   )Zprefix_argsepilogue_fnFr9   Tr:   )-r   r=   r   Ztorch._inductor.irrZ   r   
isinstancer6   r7   r-   r   
aten_addmmr<   r   Z
get_strideZ
get_devicetyperC   ZtritonZautotune_cublasLtinsertaten_bias_addmmr   r   r>   r?   r   r   r   r   rO   r4   r,   r
   r@   r   r   rA   rB   rE   rF   appendr   rD   rG   )r/   r0   r1   r*   r+   r4   Zordered_kwargs_for_cpp_kernelrH   rI   rJ   Zinp_expandedrL   rM   rZ   r   rK   r	   Zadd_aten_fallbackZfallback_choicer%   r%   r&   tuned_addmm   s   

  		   	
      
rd   c                C   s   t j| || j|dS )N)r)   )r$   r   tor7   )r0   r1   r)   r%   r%   r&   fallback_mixed_mm  s    rf   )indexreturnc                 C   s   t j| pd}|jdkS )Nr      )r$   r[   Zget_device_propertiesmajor)rg   propsr%   r%   r&   _is_sm7x_or_older_gpu  s    rl   c                 C   s:  t | |d d\}}}}} }t| |g|\}}t| |f|}	|	g}
| jjtjkrf|j pb|j	  ppt
|jj}tjr|g }
|sd| dd}t| pt|}t||||dD ].}tj|
f| |f|dt|||||| q|r|rt||||rtj|
|| |gddd |r(|
s(|	g}
td	|
| |g|S )
Nr3   ztl.ztorch. )has_int8_tensorr8   TrU   Zmixed_mm)r   r=   aten_fallback_mixed_mmr<   r4   r7   r$   Zfloat32Zis_contiguousZis_transposedrl   r6   rg   rC   Zforce_mixed_mmreplacer'   r   r>   r?   r   r   r
   r@   r   )r0   r1   Z
mat2_dtyperH   rI   rJ   r4   rL   rM   fallbackrK   Zskip_tritonZb_prologue_cast_typern   r	   r%   r%   r&   tuned_mixed_mm  sD    
    rr   c          
      C   s   |d krt | t jn|}t| ||||d\}}}}} }}g }t|||D ]B}	tj|f| ||f|dtt	|	||||dddt
jjd qNtd|| ||g|S )NrS   r8   ztl.int32)ZACC_TYPEr   )Zsuffix_argsr\   rV   )r$   Zpromote_typesr#   rW   r   r   r>   r?   dictr   r   opsmulr   )
r0   r1   Zmat3rT   r4   rH   rI   rJ   rK   r	   r%   r%   r&   tuned_fused_int_mm_mul  s0        rv   )F	functoolsloggingtypingr   r   r   r   r$   Z)torch._inductor.codegen.cpp_gemm_templater   Ztorch._inductor.virtualizedr   rm   r	   rC   Zcodegen.cuda.gemm_templater
   Zcodegen.wrapperr   Zirr   Zloweringr   Zselect_algorithmr   r   r   r   utilsr   r   r   r   r   Z	mm_commonr   r   r   r   r   r   r   	getLogger__name__rE   rt   Zatenr>   r   r;   r.   defaultr_   Z_int_mmrX   r'   r2   rb   rN   r=   rY   rd   rf   ro   	lru_cacheintboolrl   rr   rv   r%   r%   r%   r&   <module>   sX   $

D  
2+ 	
)