U
    h3                     @   s  d dl mZmZ d dlmZmZmZmZ d dlm	Z	 d dl
Z
d dlmZmZmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ e
 eddd	Ze
 d
d Ze
 edddZe
ddd Ze	ddG dd dZG dd deZdS )    )BaseBackend	GPUTarget)irpassesllvmnvidia)	dataclassN)AnyTupleOptional)Path)binaryc                 C   s   t jd|   ddt jt jtd| g}|D ]j}t j|r6t j	|r6t
j|dgt
jd}|d k	r6tjd|dtjd	}|d k	r6||d
f  S q6td|  d S )NZTRITON_Z_PATH bin	--version)stderrz.*release (\d+\.\d+).*utf-8flags   zCannot find )osenvirongetupperpathjoindirname__file__existsisfile
subprocesscheck_outputSTDOUTresearchdecode	MULTILINEgroupRuntimeError)r   pathsr   resultversion r,   Q/var/www/html/venv/lib/python3.8/site-packages/triton/backends/nvidia/compiler.py_path_to_binary   s    r.   c                  C   s    t tdd dgd} | S )Nptxasr   r   r   )r    r!   r.   r%   )r+   r,   r,   r-   get_ptxas_version!   s    r0   returnc                 C   s^   t | tsttt| d\}}|dkr2d| S |dkrBd| S |dkrRd| S tdd	S )
zK
    Get the highest PTX version supported by the current CUDA driver.
    .   P      F   
   ?   z'Triton only support CUDA 10.0 or higherN)
isinstancestrAssertionErrormapintsplitr(   )cuda_versionmajorminorr,   r,   r-   ptx_get_version'   s    rC   c              
   C   s4   t | d }t|  W  5 Q R  S Q R X d S )Nrb)openhashlibsha256read	hexdigest)r   fr,   r,   r-   	file_hash7   s    rK   T)frozenc                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZe	e ed< d	Z
eed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZeed< dZeed< dZeed< dZeed< dd Zdd ZdS )CUDAOptions   	num_warpsr   num_ctas   
num_stagesNmaxnreg)r   r   r   cluster_dimsptx_versionTenable_fp_fusionFallow_fp8e4nvallow_fp8e4b15tf32default_dot_input_precision)rY   Ztf32x3Zieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsdebugcudabackend_namec                 C   s   t tjd }| jd kri nt| j}|dd sJtdt|d |d< t	
| dt|  | jdkr~| j| jd @ dkstdd S )	NlibZ	libdeviceZTRITON_LIBDEVICE_PATHzlibdevice.10.bcr]   r   r   znum_warps must be a power of 2)r   r   parentr]   dictr   r   getenvr;   object__setattr__tupleitemsrO   r<   )selfZdefault_libdirr]   r,   r,   r-   __post_init__Q   s     zCUDAOptions.__post_init__c                 C   sX   t | j}tdd t|d D |d< ddd t| D }t|d	 S )Nc                 s   s   | ]\}}|t |fV  qd S N)rK   ).0kvr,   r,   r-   	<genexpr>\   s     z#CUDAOptions.hash.<locals>.<genexpr>r]   _c                 S   s   g | ]\}}| d | qS )-r,   )rl   namevalr,   r,   r-   
<listcomp>]   s     z$CUDAOptions.hash.<locals>.<listcomp>r   )
rc   __dict__rg   sortedr   rh   rF   rG   encoderI   )ri   Z	hash_dictkeyr,   r,   r-   hashZ   s    
zCUDAOptions.hash)__name__
__module____qualname__rO   r>   __annotations__rP   rR   rS   r   rT   rg   rU   rV   boolrW   rX   rZ   r;   r[   r
   r\   r]   rc   r^   r`   rj   ry   r,   r,   r,   r-   rM   =   s"   
	rM   c                       s   e Zd ZeedddZedd fddZedd	d
Zdd Z	dd Z
dd Zedd Zedd Zedd Zedd Zedd Zdd Ze dd Z  ZS )CUDABackendtargetc                 C   s
   | j dkS )Nr_   )backendr   r,   r,   r-   supports_targetc   s    zCUDABackend.supports_targetN)r   r2   c                    s.   t  | |j| _t| jts$td| _d S )Ncubin)super__init__arch
capabilityr:   r>   r<   Z
binary_ext)ri   r   	__class__r,   r-   r   g   s    zCUDABackend.__init__r1   c                    sT    fddt j D }| jdk|d< | jdk |d< | jdkrBdnd|d	< t f |S )
Nc                    s   i | ]}| kr| | qS r,   r,   )rl   rm   optsr,   r-   
<dictcomp>n   s       z-CUDABackend.parse_options.<locals>.<dictcomp>Y   rW   Z   rX   i   @r   r\   )rM   __dataclass_fields__keysr   )ri   r   argsr,   r   r-   parse_optionsm   s
    zCUDABackend.parse_optionsc                 C   s(   |j |j|j|jd |jd |jd fS )Nr   r      )rO   rP   sharedrT   )ri   metadatar,   r,   r-   pack_metadatat   s    zCUDABackend.pack_metadatac                 C   s6   dd l m  m  m} d| jdkr*|jn|ji}|S )Nr   Zconvert_custom_typesr5   )Ztriton.language.extra.cudalanguageextrar_   r   Zconvert_custom_float8_sm80Zconvert_custom_float8_sm70)ri   r_   Zcodegen_fnsr,   r,   r-   get_codegen_implementation~   s
    z&CUDABackend.get_codegen_implementationc                 C   s   t | d S rk   )r   load_dialects)ri   ctxr,   r,   r-   r      s    zCUDABackend.load_dialectsc                 C   s   t | j}|  tj| tj| tj	| tj
| tj| tj| tj| tj| ||  | S rk   )r   pass_managercontextenable_debugr   commonZadd_inlinerttirZadd_rewrite_tensor_pointerZadd_combineadd_canonicalizerZadd_reorder_broadcastadd_cseZadd_licmadd_symbol_dcerun)modr   optpmr,   r,   r-   	make_ttir   s    
zCUDABackend.make_ttirc                 C   s  t  }|jd k	r6|jd |_|jd |_|jd |_t| j}|	  t
j|d| |jd|j t
j| |d dkrt
j| t j
j|| t
j| t
j| t
j| t
j| t
j||dk t
j| |d dkrt
j| t
j||j t
j| t
j||dk t
j| t
j| t
j| t
j| t
j | |d d	krt j
j!| t j
j"| t
j#| |$|  |j|j|jf|d
< | S )Nr   r   r   zcuda:    r8      r5   	   rT   )%r   ZClusterInforT   ZclusterDimXZclusterDimYZclusterDimZr   r   r   r   r   r   Zadd_convert_to_ttgpuirrO   rP   ttgpuirZadd_coalesceZadd_f32_dot_tc	ttnvgpuirZadd_plan_ctaZadd_remove_layout_conversionsZadd_optimize_thread_localityZadd_accelerate_matmulZadd_optimize_dot_operandsr   r    add_combine_tensor_select_and_ifZadd_pipelinerR   Zadd_prefetchZadd_reduce_data_duplicationZadd_reorder_instructionsr   Zadd_fence_insertionZadd_tma_loweringr   r   )r   r   r   r   Zcluster_infor   r,   r,   r-   
make_ttgir   sF    

zCUDABackend.make_ttgirc                 C   s  |  d}|d k	r"|d  |9  < | }t|j}|  tjj| tj	| tj
| tj
| tj| tjj|| tjj| tj
| tj| tj| tj| tjdddkrtj| || t  t }t||}t| |j d k	rP|! D ]&}	|	" s(|	# r(|	$|j  q(|j%rtdd |j%D }
t&||
 t'|tj( |  d|d< t)|}~~|S )	Nz"triton_gpu.num-warp-groups-per-ctarO   TRITON_DISABLE_LINE_INFO0c                 S   s   g | ]\}}|qS r,   r,   )rl   rr   r   r,   r,   r-   rt      s     z)CUDABackend.make_llir.<locals>.<listcomp>ztriton_gpu.sharedr   )*Zget_int_attrr   r   r   r   r   r   r   Z%add_decompose_unsupported_conversionsr   convertZadd_scf_to_cfZadd_index_to_llvmirZadd_allocate_shared_memoryZadd_to_llvmirr   Zadd_nvgpu_to_llvmZadd_arith_to_llvmirr   r   r   r   r   r   r   ZllvmirZadd_di_scoper   r   Zinit_targetsZ	to_moduleZset_nvvm_reflect_ftzrS   Zget_functionsZis_declarationZis_external_linkageZset_nvvm_maxnregr]   Zlink_extern_libsZoptimize_moduleZOPTIMIZE_O3r;   )srcr   optionsr   Znum_warp_groupsr   r   r   Zllvm_modrm   r)   retr,   r,   r-   	make_llir   sJ    


zCUDABackend.make_llirc              	   C   s   |j }|d kr"td\}}t|}td|}d}|dkr<dnd| }	d| }
t| ||	|
dg|jd	}td
|}t	|dkst
|d |d< |d  d|d  }tjdd| |tjd}tdd|}tjdddkrtd t| |S )Nr/   S   znvptx64-nvidia-cudar   Zsm_90aZsm_z+ptxznvptx-short-ptrFz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r   r   rr   r8   r3   z\.version \d+\.\d+z	.version r   z,\s*debug|debug,\s*r   ZNVPTX_ENABLE_DUMPr   1z // -----// NVPTX Dump //----- //)rU   r.   rC   minr   Ztranslate_to_asmrV   r#   findalllenr<   subr&   r   r   r   print)r   r   r   r   rU   rp   r@   Zllvm_ptx_versionZtripleprocfeaturesr   namesr,   r,   r-   make_ptx   s&    

zCUDABackend.make_ptxc                 C   s6  t d\}}tjdddd}tjdddd}||  |  |jd }tjd	r`d
nd}	|j	rnd
nd}
|dkr~dnd}tjdddkr| |	 |
 d| | |j d| d|j }n.| |	 |
 d| | |j d| d|j }zztj|ddd W n tjk
r } z~t|j}| }W 5 Q R X |jdkrXtd| n@|jdtj krtd|j d| ntd|j d| W 5 d }~X Y nX W 5 tj
|jrt|j tj
|jrt|j X t|d }| }W 5 Q R X tj
|rt| W 5 Q R X W 5 Q R X |S )!Nr/   Fwz.ptx)deletemodesuffixrz.logz.or   r   z
 -lineinfoz --fmad=falser   za  ZDISABLE_PTXAS_OPTr   r   z  -v --opt-level 0 --gpu-name=sm_z -o z 2> z -v --gpu-name=sm_T)shellcheck   z$Internal Triton PTX codegen error: 
   zPlease run `ptxas z+` to confirm that this is a bug in `ptxas`
z`ptxas` failed with error code z: 
rD   )r.   tempfileNamedTemporaryFilewriteflushrr   r   r   r   rV   r   r   remover    r   CalledProcessErrorrE   rH   
returncoder(   signalSIGSEGV)r   r   r   r   r/   rp   fsrcZflogZfbinZ	line_infoZfmadr   cmdeZlog_filelogrJ   r   r,   r,   r-   
make_cubin  sF    

0.,zCUDABackend.make_cubinc                    s^    fdd|d<  fdd|d<  fdd|d<  fdd|d	<  fd
d|d< d S )Nc                    s    | | S rk   )r   r   r   r   ri   r,   r-   <lambda><      z(CUDABackend.add_stages.<locals>.<lambda>r   c                    s    | | jS rk   )r   r   r   r   r,   r-   r   =  r   Zttgirc                    s    | | jS rk   )r   r   r   r   r,   r-   r   >  r   Zllirc                    s    | | jS rk   )r   r   r   r   r,   r-   r   ?  r   ptxc                    s    | | jS rk   )r   r   r   r   r,   r-   r   @  r   r   r,   )ri   Zstagesr   r,   r   r-   
add_stages;  s
    zCUDABackend.add_stagesc                 C   s   t  }| d| j S )Nrq   )r0   r   )ri   r+   r,   r,   r-   ry   B  s    zCUDABackend.hash)rz   r{   r|   staticmethodr   r   r   r	   r   r   r   r   r   r   r   r   r   r   	functools	lru_cachery   __classcell__r,   r,   r   r-   r   a   s(   


(
0

(r   ) Ztriton.backends.compilerr   r   Ztriton._C.libtritonr   r   r   r   dataclassesr   r   typingr	   r
   r   rF   r#   r   r   r   r    pathlibr   r   r;   r.   r0   r>   rC   rK   rM   r   r,   r,   r,   r-   <module>   s,   

#