U
    h3                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ ejejeZejedgZejedZd	gZe  d
d Ze  dd Zdd ZG dd deZdd Zdd ZG dd deZG dd deZ dS )    N)Path)_build)get_cache_manager)	GPUTarget)	GPUDriverincludelibcudac                  C   s   t d} | r| gS tddg }dd | D }dd |D }t d}|rl|sldd |d	D }d
}|r|dt| 7 }|d7 }n|d7 }|d7 }tdd |D st	||S )NZTRITON_LIBCUDA_PATHz/sbin/ldconfigz-pc                 S   s    g | ]}d |kr|  d qS )libcuda.so.1)split).0line r   O/var/www/html/venv/lib/python3.8/site-packages/triton/backends/nvidia/driver.py
<listcomp>   s      z libcuda_dirs.<locals>.<listcomp>c                 S   s   g | ]}t j|qS r   )ospathdirname)r   locr   r   r   r      s     ZLD_LIBRARY_PATHc                 S   s&   g | ]}t jt j|d r|qS )r
   r   r   existsjoin)r   dirr   r   r   r      s      :zlibcuda.so cannot found!
z!Possible files are located at %s.z:Please create a symlink of libcuda.so to any of the files.z<Please make sure GPU is set up and then run "/sbin/ldconfig"z- (requires sudo) to refresh the linker cache.c                 s   s$   | ]}t jt j|d V  qdS )r
   Nr   )r   r   r   r   r   	<genexpr>'   s     zlibcuda_dirs.<locals>.<genexpr>)
r   getenv
subprocesscheck_outputdecode
splitlinesr   stranyAssertionError)Zenv_libcuda_pathZlibslocsdirsZenv_ld_library_pathmsgr   r   r   libcuda_dirs   s"    


r'   c                   C   s   t ft S N)libdevice_dirr'   r   r   r   r   library_dirs+   s    r*   c              
   C   s   t | d }t|}|| d}|d krt x}tj	
|d}t|d}||  W 5 Q R X t|||t tt}t|d }|j| | ddd}W 5 Q R X W 5 Q R X dd l}	|	j||}
|	j|
}|
j| |S )	Nzutf-8z.sozmain.cwrbT)binaryr   )hashlibsha256encode	hexdigestr   Zget_filetempfileTemporaryDirectoryr   r   r   openwriter   r*   include_dir	librariesputreadimportlib.utilutilspec_from_file_locationmodule_from_specloaderexec_module)srcnamekeycache
cache_pathZtmpdirZsrc_pathfso	importlibspecmodr   r   r   compile_module_from_src0   s     
.rJ   c                       s$   e Zd Z fddZdd Z  ZS )	CudaUtilsc                    s"   t | dstt| | | _| jS )Ninstance)hasattrsuperrK   __new__rL   )cls	__class__r   r   rO   J   s    
zCudaUtils.__new__c                 C   sP   t ttjtd d}|j| _|j| _|j	| _	|j
| _
|j| _|j| _d S )Nzdriver.cZ
cuda_utils)rJ   r   r   r   r   r   	read_textZload_binaryZget_device_propertiesZcuOccupancyMaxActiveClustersZset_printf_fifo_sizeZfill_1d_tma_descriptorZfill_2d_tma_descriptor)selfrI   r   r   r   __init__O   s    zCudaUtils.__init__)__name__
__module____qualname__rO   rU   __classcell__r   r   rQ   r   rK   H   s   rK   c                 C   s8   | d dkrdS ddddddd	d
dddddddd|  S )Nr   *ZCUdeviceptrint32_tint8_tint16_tint64_tuint32_tuint8_tuint16_tuint64_tfloatdouble)i1i8Zi16Zi32Zi64u1u8u16u32Zu64Zfp16Zbf16Zfp32Zf32Zfp64r   tyr   r   r   	ty_to_cpp^   s(    rm   c           	         s>  d dd | D }dd  dd d  fd	d
| D }d| }t|dkrvdd dd | D  nd}fdd
| D }dt|dkrd| nd dd dd |D  dd  fdd
| D  d| d| dd dd
 | D  dt|dkr0dd dd | D  nd d}|S )N, c                 s   s$   | ]\}}t | d | V  qdS )z argNrm   r   irl   r   r   r   r   w   s     z make_launcher.<locals>.<genexpr>c                 S   s   | d dkrdS t | S )Nr   rZ   	PyObject*ro   rk   r   r   r   _extracted_typey   s    z&make_launcher.<locals>._extracted_typec                 S   s"   dddddddddd	d
dd|  S )NOrE   dlbhrq   BHIK)rr   rc   rd   longr\   r]   r[   r^   r`   ra   r_   rb   r   rk   r   r   r   	format_of~   s    z make_launcher.<locals>.format_of c                    s   g | ]} |qS r   r   )r   rl   )rs   r~   r   r   r      s     z!make_launcher.<locals>.<listcomp>Z	iiiKKOOOOr   c                 s   s   | ]\}}d | V  qdS )z&_argNr   rp   r   r   r   r      s     c                    s   g | ]}| kr|qS r   r   r   rq   )	constantsr   r   r      s      a	  
#include "cuda.h"
#include <stdbool.h>
#include <Python.h>
#include <dlfcn.h>

static inline void gpuAssert(CUresult code, const char *file, int line)
{
   if (code != CUDA_SUCCESS)
   {
      const char* prefix = "Triton Error [CUDA]: ";
      const char* str;
      cuGetErrorString(code, &str);
      char err[1024] = {0};
      strcat(err, prefix);
      strcat(err, str);
      PyGILState_STATE gil_state;
      gil_state = PyGILState_Ensure();
      PyErr_SetString(PyExc_RuntimeError, err);
      PyGILState_Release(gil_state);
   }
}

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

static cuLaunchKernelEx_t getLaunchKernelExHandle() {
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }
  return cuLaunchKernelExHandle;
}

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction functionz) {
  void *params[] = { c                 s   s   | ]}d | V  qdS )z&argNr   r   r   r   r   r      s     a   };
  if (gridX*gridY*gridZ > 0) {
    if (num_ctas == 1) {
      CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
    } else {
      CUlaunchAttribute launchAttr[2];
      launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      launchAttr[0].value.clusterDim.x = clusterDimX;
      launchAttr[0].value.clusterDim.y = clusterDimY;
      launchAttr[0].value.clusterDim.z = clusterDimZ;
      launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      launchAttr[1].value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
      CUlaunchConfig config;
      config.gridDimX = gridX * clusterDimX;
      config.gridDimY = gridY * clusterDimY;
      config.gridDimZ = gridZ * clusterDimZ;
      config.blockDimX = 32 * num_warps;
      config.blockDimY = 1;
      config.blockDimZ = 1;
      config.sharedMemBytes = shared_memory;
      config.hStream = stream;
      config.attrs = launchAttr;
      config.numAttrs = 2;
      static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
      if (cuLaunchKernelExHandle == NULL) {
        cuLaunchKernelExHandle = getLaunchKernelExHandle();
      }
      CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
    }
  }
}

typedef struct _DevicePtrInfo {
    CUdeviceptr dev_ptr;
    bool valid;
} DevicePtrInfo;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
  if(ptr){
    PyObject *empty_tuple = PyTuple_New(0);
    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
    Py_DECREF(empty_tuple);
    Py_DECREF(ptr);
    if (!PyLong_Check(ret)) {
      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
      ptr_info.valid = false;
      return ptr_info;
    }
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
    if(!ptr_info.dev_ptr)
      return ptr_info;
    uint64_t dev_ptr;
    int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
    if (status == CUDA_ERROR_INVALID_VALUE) {
        PyErr_Format(PyExc_ValueError,
                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
        ptr_info.valid = false;
    }
    ptr_info.dev_ptr = dev_ptr;
    Py_DECREF(ret);  // Thanks ChatGPT!
    return ptr_info;
  }
  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
  ptr_info.valid = false;
  return ptr_info;
}

static PyObject* launch(PyObject* self, PyObject* args) {
  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
   c                    s$   g | ]\}} | d | dqS )z _arg; r   rp   )rs   r   r   r     s     z
  if(!PyArg_ParseTuple(args, "z", &gridX, &gridY, &gridZ, &_stream, &_function,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hook ai  )) {
    return NULL;
  }

  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }

  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }

  // raise exception asap
  r   c                 S   s>   g | ]6\}}|d  dkr6d| d| d| d| d	ndqS )r   rZ   zDevicePtrInfo ptr_infoz = getPointer(_argrn   z); if (!ptr_infoz.valid) return NULL;r   r   rp   r   r   r   r   1  s     z;
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_functionc                 s   s4   | ],\}}|d  dkr"d| dnd| V  qdS )r   rZ   Zptr_infoz.dev_ptrZ_argNr   rp   r   r   r   r   3  s     aW  );
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {
    return NULL;
  }

  if(launch_exit_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;

  }

  // return None
  Py_INCREF(Py_None);
  return Py_None;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)r   itemsvalueslenkeys)	r   	signatureidsZ	arg_declsZargs_formatformatZ	args_listparamsr@   r   )rs   r   r~   r   make_launchert   sD    ,./     
     ,  Jr   c                   @   s   e Zd Zdd Zdd ZdS )CudaLauncherc                    s   dt drjjnt i}t dr,jnt }fdd  fdd| D } fddj D }t|||t	d	}|j
| _
d S )
NZids_of_const_exprsfnr   c                    s   t | tr jj| S | S r(   )
isinstancer!   r   	arg_namesindex)rq   )r@   r   r   <lambda>e      z'CudaLauncher.__init__.<locals>.<lambda>c                    s   i | ]\}} ||qS r   r   r   rB   valuecst_keyr   r   
<dictcomp>f  s      z)CudaLauncher.__init__.<locals>.<dictcomp>c                    s   i | ]\}} ||qS r   r   r   r   r   r   r   g  s      Z__triton_launcher)rM   r   Z
constexprstupler   dictr   r   r   rJ   launch)rT   r@   metadatar   r   r   rI   r   )r   r@   r   rU   b  s    
zCudaLauncher.__init__c                 O   s   | j || d S r(   )r   )rT   argskwargsr   r   r   __call__l  s    zCudaLauncher.__call__N)rV   rW   rX   rU   r   r   r   r   r   r   `  s   
r   c                       s0   e Zd Z fddZdd Zedd Z  ZS )
CudaDriverc                    s   t  | _t| _t   d S r(   )rK   utilsr   Zlauncher_clsrN   rU   )rT   rQ   r   r   rU   r  s    zCudaDriver.__init__c                 C   s6   |   }| |}|d d |d  }d}td||S )Nr   
          r	   )Zget_current_deviceZget_device_capabilityr   )rT   ZdeviceZ
capabilityZ	warp_sizer   r   r   get_current_targetw  s
    
zCudaDriver.get_current_targetc                  C   s   dd l } | j o| jjd kS )Nr   )torchr	   Zis_availableversionZhip)r   r   r   r   	is_active~  s    zCudaDriver.is_active)rV   rW   rX   rU   r   staticmethodr   rY   r   r   rQ   r   r   p  s   r   )!	functoolsr   r.   r   r2   pathlibr   Ztriton.runtime.buildr   Ztriton.runtime.cacher   Ztriton.backends.compilerr   Ztriton.backends.driverr   r   r   realpath__file__r   r6   r)   r7   	lru_cacher'   r*   rJ   objectrK   rm   r   r   r   r   r   r   r   <module>   s0   

 m