U
    h)                     @   s   d dl Z d dlZd dlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZ e  dd Zdd	 Zd
d Zdd ZdddZdd ZdS )    N   )cdiv)driver)get_dram_gbpsget_max_simd_tflopsget_max_tensorcore_tflopsnvsmic                  C   sZ   zt dgd d W S  tk
rT   dd l} |   | d}| || jd  Y S X d S )Nzclocks.max.smr   g     @@)r   FileNotFoundErrorpynvmlZnvmlInitZnvmlDeviceGetHandleByIndexZnvmlDeviceGetMaxClockInfoZNVML_CLOCK_SM)r
   handle r   N/var/www/html/venv/lib/python3.8/site-packages/triton/ops/matmul_perf_model.pyget_clock_rate_in_khz   s    
r   c                 C   sD   |t |d }tjj| d d }t ||| t|t |  }|S z# return compute throughput in TOPS    multiprocessor_count)minr   activeutilsget_device_propertiesr   r   devicenum_ctas	num_warpsdtypeZtotal_warpsZnum_subcoresZtflopsr   r   r   get_tensorcore_tflops   s      r   c                 C   sD   |t |d }tjj| d d }t ||| t|t |  }|S r   )r   r   r   r   r   r   r   r   r   r   r   get_simd_tflops    s    r   c                 C   s>   t j| }|d dk r0|t jkr0t| |||S t| |||S )Nr      )torchcudaget_device_capabilityfloat32r   r   )r   r   r   r   
capabilityr   r   r   
get_tflops(   s    r#   Fc           +      K   s  t j }|j}| }t||}t||	}|}|| | }t||t||	 }}d| | | d }t||| |}|| }tj	j
|d }td|| }td|d }ttd|d d d}t||d |d	   }|d
 }|| | dd|d    }|| | d |d  }|| | dd|d    } || | d |d  }!||  d }"||! d }#|"| |#|  }$|d }%|| | | d }&|dkr|&|% }'n(|%}(|&|( }'|| d d |% })|'|)7 }'t||$|' }*|r
td|* d| d|$ d|' d|d  d |*S )zO return estimated running time in ms
          = max(compute, loading) + store r   i   @r          L   r   gffffff?g?r   g?g?i   g333333?zTotal time: zms, compute time: zms, loading time: zms, store time: zms, Activate CTAs: d   %)r   r   current_devicer   element_sizer   maxr#   r   r   r   r   r   r   print)+r   
num_stagesABCMNKBLOCK_MBLOCK_NBLOCK_KSPLIT_Kdebugkwargsr   r   dtsizeZ	num_cta_mZ	num_cta_nZ	num_cta_kr   Z	total_opsZtputZ
compute_msZnum_smZactive_cta_ratioZactive_cta_ratio_bw1Zactive_cta_ratio_bw2Zdram_bwZl2_bwZload_a_dramZ	load_a_l2Zload_b_dramZ	load_b_l2Z
total_dramZtotal_l2Zload_msZstore_bwZstore_c_dramZstore_msZ	reduce_bwZzero_msZtotal_time_msr   r   r   estimate_matmul_time/   sH    





,r;   c                    s  t j }t j }|d  }|d j}g }| D ]d}|j}	|	d |	d |	d |jf\}
}}}tj	j
|d }|
| | | | }||kr2|| q2|} |t jt jfkrdd | D } i }| D ]t}|j}	|	d |	d |	d |	d |j|jf\}
}}}}}|
||||f}||kr(|| ||f q||fg||< qg }| D ]\}}|\}
}}}}|d	 d
kr|
| | d }|td| d
 }d}||  tjd| fddd}|D ]}||d	  qn|d	 d	 }d|_|| qD|S )Nr.   r4   r5   r6   Zmax_shared_memc                 S   s   g | ]}|j d  dkr|qS )r7   r$   )r9   ).0configr   r   r   
<listcomp>   s      z&early_config_prune.<locals>.<listcomp>r7   r   r   i   r   i,  r   c                    s0   | d   dk r$dt | d    S | d   S )Nr$   r   
   )abs)xZoptimal_num_stagesr   r   <lambda>   s    z$early_config_prune.<locals>.<lambda>)key)r   r   r)   r    r*   r   r9   r-   r   r   r   r   appendZfloat16r!   r   itemsr   heapq	nsmallest)ZconfigsZ
named_argsr9   r   r"   r:   r   Zpruned_configsr=   kwr4   r5   r6   r-   Zmax_shared_memoryZrequired_shared_memoryZconfigs_mapr7   r   rD   kvZmmasZ
mma_cyclesZldgsts_latencyZnearestnZrandom_configr   rB   r   early_config_prunep   sX    



"
  
rM   )F)	functoolsrG   r    r   runtimer   Ztestingr   r   r   r   	lru_cacher   r   r   r#   r;   rM   r   r   r   r   <module>   s   
	 
A