U
    h)$                     @   s  d dl Z ddlmZmZmZmZmZ ddlmZ ddl	m
Z
mZ e je je je jgZdd Zd	d
 Zdd Zdd Zeeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddge  dddge
edddedd d! ieejejejejejejejejejejd"
d#d$ZG d%d& d&e jjZejZdS )'    N   )Configautotunecdiv
heuristicsjit)language   )early_config_pruneestimate_matmul_timec                 C   s   dt | krtjS | S )NZfp8)strtorchfloat16)a r   C/var/www/html/venv/lib/python3.8/site-packages/triton/ops/matmul.pyupcast_if_fp8
   s    r   c                 C   sb   t | } t |}| |kr| S | tks(t|tks4ttD ]$}| |krL|  S ||kr8|   S q8d S N)r   _ordered_datatypesAssertionError)r   bdr   r   r   get_higher_dtype   s    r   c                    s    fddS )Nc                    s   |     S r   )Zzero_)nargsnamer   r   <lambda>!       zinit_to_zero.<locals>.<lambda>r   r   r   r   r   init_to_zero    s    r   c                  C   s   g } dD ]~}dD ]t}dD ]j}dD ]`}|dkr0dnd}|  t|||dd	||d
 dD ](}|  t||||d	||tdd qVq qqq| S )N)r               )       )r$   @   )r$   r%         r%   r   r    r	   BLOCK_MBLOCK_NBLOCK_KSPLIT_K
num_stages	num_warps)r   r       r#   C)r.   r/   Zpre_hook)appendr   r   )configsr.   Zblock_mZblock_kZblock_nr/   Zsplit_kr   r   r   get_configs_io_bound$   s*       r4   r&   r'   r$   r(   r   r0   r-   r%   r    r!   MNK
   )r
   Z
perf_modelZtop_k)r3   keyZprune_configs_byEVEN_Kc                 C   s   | d | d | d   dkS )Nr7   r+   r,   r   r   )argsr   r   r   r   U   r   r   )
	acc_dtypeinput_precisionfp8_fast_accumr)   r*   r+   GROUP_Mr,   r:   AB_DTYPEc           +      C   s  t d}t d}t ||}t ||}|| }|| }t|||  |}|| ||  }|| | }|| t d| }|| t d| } t t || ||}!t t | | ||}"|| t d| }#| |!d d d f | |#d d d f |   } ||#d d d f | |"d d d f |	   }t j||f|d}$tdt ||| D ]}%|rvt 	| }&t 	|}'nb||%||   }(t jd|j
jd})t j	| |#d d d f |(k |)d}&t j	||#d d d f |(k |)d}'|d k	r|&|}&|'|}'|rt j|&|'|$||d}$n|$t j|&|'||d7 }$| || | 7 } ||| | 7 }qV|$|j
j}$|| t d| }|| t d| } ||d d d f |
 | d d d f |   }||k d d d f | |k d d d f @ }*|dkrt j||$|*d nt j||$|*d d S )Nr   r	   )dtype)r	   r	   )maskother)Z	out_dtyper=   )rB   )tlZ
program_idr   minZarangeZmax_contiguousZmultiple_ofZzerosrangeloadrA   Z
element_tytodotstoreZ
atomic_add)+ABr1   r5   r6   r7   Z	stride_amZ	stride_akZ	stride_bkZ	stride_bnZ	stride_cmZ	stride_cnr<   r=   r>   r)   r*   r+   r?   r,   r:   r@   pidZpid_zZgrid_mZgrid_nwidthZgroup_idZ
group_sizeZpid_mZpid_nZrmZrnramZrbnZrkacckr   r   Zk_remainingZ_0rB   r   r   r   _kernel6   sR    -

,,
  


,(
rR   c                   @   s.   e Zd ZeZi Zedd ZedddZdS )_matmulc                    s  | j }| ddkr*| ddkr*|  } |ddkrN|ddkrN| }| jd |jd ksjtd| j\ }|j\}t| j|j}	|d kr|	}tj f||d}
tj	tj
tj	ftjtj
tjftj
tj
ftjtjfi}|d kr||	 d }nFt|tjstd||| j ks$td|||j ks<tddd	 }||}||	}	||}| jtjtjfkr|jtjtjfkrd }	 fd
d}t| | ||
 || d| d|d|d|
d|
d|||d|	d |
S )Nr   r	   zincompatible dimensions)devicerA   zacc_dtype must be a torch.dtypez+acc_dtype not compatible with the type of az+acc_dtype not compatible with the type of bc                 S   s   t tt| dd S )N.)getattrrD   r   split)tyr   r   r   
to_tl_type   s    z!_matmul._call.<locals>.to_tl_typec                    s$   t  | d t | d  | d fS )Nr)   r*   r,   )r   )ZMETAr5   r6   r   r   r      r   z_matmul._call.<locals>.<lambda>r0   )r<   r=   r>   r?   r@   )rT   Zstride
contiguousshaper   r   rA   r   emptyr   float32bfloat16int8Zint32
isinstancerD   Z
float8e4nvZfloat8e5rR   )r   r   r<   r=   r>   output_dtyperT   r7   _Zab_dtypecZsupported_acc_dtypesrZ   gridr   r[   r   _call   sj    

 
  
   (         	z_matmul._callNTc                 C   s   t j||||||dS )N)r<   r=   r>   rc   )rS   rg   )ctxr   r   r<   r=   r>   rc   r   r   r   forward   s    z_matmul.forward)NNTN)	__name__
__module____qualname__rR   ZkernelZ_locksstaticmethodrg   ri   r   r   r   r   rS      s   
9rS   )r    r   r   r   r   r   r   rD   Zmatmul_perf_modelr
   r   ra   r   r`   r_   r   r   r   r   r4   Z	constexprrR   ZautogradFunctionrS   applymatmulr   r   r   r   <module>   sl         >E