U
    Mh$                    @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ ddlm	Z	 d dl
mZmZ eeddZd	d
 Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zdkd!d"Zd#d$ Zd%d& Zd'd( Zd)d* Z dd+d,d-Z!dld.d/Z"dmd0d1Z#G d2d3 d3Z$eed4d5d6 Z%dnd8d9Z&dod:d;Z'dddd<ddd=ej(ej(ej(eej( e)eeee ee ee f  ee* d>d?d@Z+e rd dl,Z,d dl-m.Z/ e,j0e/j1e/j1e/j1e/j1e/j1e/j1dAdBdCZ2e,j0e/j1e/j1e/j1e/j1e/j1dDdEdFZ3dGdH Z4dIdIdd<ddJej(ej(ej(eej( e)eeee ee ee f  dKdLdMZ5dd<dddNej(ej(eej( e)eeee ee ee f  ee* dOdPdQZ6e,j0e/j1e/j1dRdSdTZ7dpdUdVZ8dqej(ej(ej(eej( e9e)ee9 dXdYdZZ:e,j0e/j1e/j1e/j1e/j1e/j1e/j1e/j1d[d\d]Z;ej(ej(ej(ej(ej(d^d_d`Z<e,j0e/j1e/j1e/j1e/j1e/j1e/j1e/j1e/j1dadbdcZ=drej(ej(ej(ej(ej(ej(e*ej(e)de	dfdgZ>e,j0e/j1e/j1e/j1e/j1e/j1e/j1e/j1e/j1e/j1e/j1dh
didjZ?ndZ8dZ6dZ5dZ:dZ<dZ>dZ?dS )s    N)	lru_cache)
has_triton   )get_meta)OptionalTuple*TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE   c                 C   s   | st |d S N)
ValueError)Zcondmsg r   J/var/www/html/venv/lib/python3.8/site-packages/torch/sparse/_triton_ops.pycheck   s    r   c                 C   s   t |jtjk|  d d S )Nz@(): only BSR sparse format is supported for the sparse argument.)r   layouttorch
sparse_bsr)f_nametr   r   r   check_bsr_layout   s    
r   c                 C   s&   t |j|ko|jjdk|  d d S )Ncudaz9(): all inputs are expected to be on the same GPU device.)r   devicetype)r   r   r   r   r   r   check_device   s    r   c                 C   s   t | dko| dk|  d|  d|  d |jdd  \}}|jdd  \}}t ||k|  d| d| d d S )Nr	   zc(): all inputs involved in the matrix product are expected to be at least 2D, but got lhs.dim() == z and rhs.dim() == .zw(): arguments' sizes involved in the matrix product are not compatible for matrix multiplication, got lhs.shape[-1] == z( which is not equal to rhs.shape[-2] == )r   dimshape)r   lhsrhsmklkrnr   r   r   check_mm_compatible_shapes!   s    r$   c                 G   sF   t |j|ko(|jtjtjtjft|  k|  d| d|j d d S )Nz\(): all inputs are expected to be of the same dtype and one of (half, bfloat16, float32) or z, but got dtype == r   )r   dtyper   halfbfloat16floattuple)r   r   r%   Zadditional_dtypesr   r   r   check_dtype2   s
    
r*   c                    sP   t |dkstdd   fdd}t|||  d|d  d|d	  d
 d S )Nr	   c                 S   s   | | d @  S Nr   r   )vr   r   r   is_power_of_two?   s    z(check_blocksize.<locals>.is_power_of_twoc                    s&   d}| D ]}|dko |o|}q|S )NT   r   )bres	blocksizer-   r   r   is_compatible_blocksizeB   s    z0check_blocksize.<locals>.is_compatible_blocksizez(): sparse inputs' blocksize (r   z, r   z;) should be at least 16 and a power of 2 in each dimension.)lenAssertionErrorr   )r   r1   r3   r   r2   r   check_blocksize<   s    r6   c                 C   s    t |  dkr|  S | S dS )a  Return input as a triton-contiguous tensor.

    A triton-contiguous tensor is defined as a tensor that has strides
    with minimal value equal to 1.

    While triton kernels support triton-non-contiguous tensors (all
    strides being greater than 1 or having 0 strides) arguments, a
    considerable slow-down occurs because tensor data is copied
    element-wise rather than chunk-wise.
    r   N)minstride
contiguous)r   r   r   r   make_triton_contiguousP   s    r:   c                 G   s@   zt jdd |D  W S  tk
r:   td|  d Y nX d S )Nc                 s   s   | ]}|j d d V  qd S Nr   r   .0r   r   r   r   	<genexpr>e   s     z'broadcast_batch_dims.<locals>.<genexpr>Fz3(): inputs' batch dimensions are not broadcastable!)r   broadcast_shapes	Exceptionr   )r   tensorsr   r   r   broadcast_batch_dimsc   s    rC   c                 g   s2   |D ](}t d g|  }||| < || V  qd S r
   )slicer   )r   Zslice_rangerB   r   slicesr   r   r   slicerj   s    rF   c                 g   sN   |D ]D}t d g|  }t| |D ]\}}|d k	r$|||< q$|| V  qd S r
   )rD   r   zip)ZdimsrE   rB   r   sdZd_slicer   r   r   multidim_slicerq   s    
rJ   c                  g   s"   | D ]}|V  |  E d H  qd S r
   )r8   )rB   r   r   r   r   ptr_stride_extractorz   s    rK   c           	      #   s   dt    krdksn tdt   kr6dks<n tdd l} fdd}fdd}|j|  D ]H}dd t |D }d	d t||D }|d d d
 f||V  qjd S )Nr      c                  3   s&   t  D ]\} }td| |V  q
d S )Nr   )rG   range)fgmg)	full_gridgrid_blocksr   r   generate_grid_points   s    z.grid_partitioner.<locals>.generate_grid_pointsc                 3   s(      D ]\}}tt|| |V  qd S r
   )itemsnextrJ   )rE   r   Zt_dims)tensor_dims_mapr   r   generate_sliced_tensors   s    z1grid_partitioner.<locals>.generate_sliced_tensorsc                 S   s    g | ]\}}}t || |qS r   )r7   )r>   rN   gprO   r   r   r   
<listcomp>   s     z$grid_partitioner.<locals>.<listcomp>c                 S   s   g | ]\}}t ||| qS r   )rD   )r>   rW   gr   r   r   rX      s     )r4   r5   	itertoolsproductrG   )	rP   rQ   rU   r[   rR   rV   Z
grid_pointgridrE   r   )rP   rQ   rU   r   grid_partitioner   s    r^   c                    sh   dd d d }|d kr|}n$dd  t  fddt||D }t|||D ]^}}| |f|  qLd S )N)i  r_   rZ   c                 S   s    | d kr|S t dt| |S d S r+   )maxr7   )rY   rO   r   r   r   valid_grid_dim   s    z%launch_kernel.<locals>.valid_grid_dimc                 3   s   | ]\}} ||V  qd S r
   r   )r>   rY   rO   ra   r   r   r?      s    z launch_kernel.<locals>.<genexpr>)r)   rG   r^   )kernelrU   rP   rQ   Zcuda_max_gridr]   sliced_tensorsr   rb   r   launch_kernel   s    re   c                    s   |   d}|  d}t|  d}dd |D }tj|jd d fdd |D  dd   |d	} |d	} ||jdd  } fd
d|D }|||f|S )Nr   c                 S   s   g | ]}t |d qS r   )r:   	unsqueezer=   r   r   r   rX      s     z"prepare_inputs.<locals>.<listcomp>c                 s   s   | ]}|j d d V  qd S r;   r<   r=   r   r   r   r?      s     z!prepare_inputs.<locals>.<genexpr>c                 S   s   |  || dt|d S )Nr   r   )broadcast_toflattenr4   )r   Z
batch_dimsZinvariant_dimsr   r   r   batch_broadcast_and_squash   s     
z2prepare_inputs.<locals>.batch_broadcast_and_squashrZ   c                    s"   g | ]} ||j d d qS )r   Nr<   r=   rk   Zbatch_dims_broadcastedr   r   rX      s    )crow_indicesrg   col_indicesr:   valuesr   r@   r   )bsrZdense_tensorsrn   ro   rp   rB   r   rm   r   prepare_inputs   s2    $      rr   c                 G   s|   t | |f| }| |d }| |d }| || jdd   }||jdd   }tj|||||jdS )Nrl   rh   r   sizer   )	rC   rn   ri   ro   rp   r   r   sparse_compressed_tensorr   )r   rq   rB   Zbatch_shapern   ro   rp   rt   r   r   r   broadcast_batch_dims_bsr   s     rv   c                 C   sH   | j ^ }}}|||d  |d ||d  |d g }| |ddS )Nr   r   rh   r   )r   view	transpose)r   r1   restr    r#   Z	new_shaper   r   r   tile_to_blocksize   s    

rz   c                 C   sJ   | j dk r| d} q | j dkr2| d| j d } | j dksFt| j| S )zReturn tensor as 3D tensor by either prepending new dimensions to
    the tensor shape (when ``tensor.ndim < 3``), or by collapsing
    starting dimensions into the first dimension (when ``tensor.ndim >
    3``).
    rL   r   )ndimrg   rj   r5   r   )tensorr   r   r   	as1Dbatch   s    

r}   accumulatorsc          0      C   s  |d }| j dkst| j\}}}|dkrV|dd \}}	|j dksJt|j\}
}}||ksbt|dkr|jd d }tj|||f| j| jd}n$|j\}}}||kst||kst|d s|d s|d stdkrBt|jd d D ]R}|| }||d  }t||D ].}|	| \}}||  | | ||  7  < qqnt| |||	| |S |dkrT|j}t	|}|j\}}}|| dkst|dd \}}}}}|d	 }|dkr||
  d |  } tj|dd
 | |f| j| jd}n |jd
d \} }!|!|kst|j}"t	|}|| }|d sP|d sP|d sPtdkr4|  t|D ]}#t|jd D ]}||  }$||  }||d   }t|$|\}%}&||#|%|%| |&|&| f }'t||D ]T}|| ||  }}t| |\}(})|'| | ||#|(|(| |)|)| f  7 }'qԐqrq`nt| ||||||| ||"S |dkrx|j}t	|}|j\}}}|| dkst|dd \}}}}|d	 }|dkr||
  d |  } tj|dd
 | |f| j| jd}n |jd
d \} }!|!|kst|j}"t	|}|| }|d sL|d sL|d sLtdkrDt|D ]}#tt|D ]}*t||*  |\}%}&|%| }+|&| },||+  }-||+d   }.||#|%|%| |&|&| f }'tt|-|.D ]b\}/}||,|. ||, |-  |/   }t||\}(})|'| | ||#|(|(| |)|)| f  7 }'q֐qdqTn*tjd|j|jd}t| ||||||| ||"S t|dS )ad  Scattered matrix multiplication of tensors.

    A scattered matrix multiplication is defined as a series of matrix
    multiplications applied to input tensors according to the input
    and output mappings specified by indices data.

    The following indices data formats are supported for defining a
    scattered matrix multiplication operation (:attr:`indices_data[0]`
    holds the name of the indices data format as specified below):

    - ``"scatter_mm"`` - matrix multiplications scattered in batches
      of tensors.

      If :attr:`blocks` is a :math:`(* 	imes M 	imes K) tensor,
      :attr:`others` is a :math:`(* 	imes K 	imes N)` tensor,
      :attr:`accumulators` is a :math:`(* 	imes M 	imes N)` tensor,
      and :attr:`indices = indices_data['indices']` is a :math:`(*
      	imes 3)` tensor, then the operation is equivalent to the
      following code::

        c_offsets, pq = indices_data[1:]
        for r in range(len(c_offsets) - 1):
            for g in range(c_offsets[r], c_offsets[r + 1]):
                p, q = pq[g]
                accumulators[r] += blocks[p] @ others[q]

    - ``"bsr_strided_mm"`` - matrix multiplications scattered in
      batches of tensors and a tensor.

      If :attr:`blocks` is a :math:`(Ms 	imes Ks) tensor,
      :attr:`others` is a :math:`(* 	imes K 	imes N)` tensor,
      :attr:`accumulators` is a :math:`(* 	imes M 	imes N)` tensor, then
      the operation is equivalent to the following code::

        c_indices, r_offsets, p_offsets, q_offsets, meta = indices_data[1:]
        for b in range(nbatches):
            for i, r in enumerate(r_offsets):
                r0, r1 = divmod(r, N)
                acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns]
                for g in range(c_indices[i], c_indices[i+1]):
                    p = p_offsets[g]
                    q0, q1 = divmod(q_offsets[g], N)
                    acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns]

      where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
      integer multiples of ``Ms`` and ``Ks``, respectively.

    - ``"bsr_strided_mm_compressed"`` - matrix multiplications
      scattered in batches of tensors and a tensor. A memory and
      processor efficient version of ``"bsr_strided_mm"`` format.  If
      :attr:`blocks` is a :math:`(Ms 	imes Ks) tensor, :attr:`others`
      is a :math:`(* 	imes K 	imes N)` tensor, :attr:`accumulators`
      is a :math:`(* 	imes M 	imes N)` tensor, then the operation is
      equivalent to the following code::

        c_indices, r_offsets, q_offsets, meta = indices_data[1:]
        for b in range(nbatches):
            for r in r_offsets:
                m = (r // N) // Ms
                n = (r % N) // Ns
                r0, r1 = divmod(r, N)
                c0, c1 = c_indices[m], c_indices[m + 1]
                acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns]
                for i, p in enumerate(range(c0, c1)):
                    q = q_offsets[n * c1 + (SPLIT_N - n) * c0 + i]
                    q0, q1 = divmod(q, N)
                    acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns]

      where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
      integer multiples of ``Ms`` and ``Ks``, respectively.

      Notice that the order of ``r_offsets`` items can be arbitrary;
      this property enables defining swizzle operators via
      rearrangements of ``r_offsets`` items..

    Auxilary functions are provided for pre-computing
    :attr:`indices_data`. For example,
    :func:`bsr_scatter_mm_indices_data` is used to define indices data
    for matrix multiplication of BSR and strided tensors.

    Parameters
    ----------
    blocks (Tensor): a 3-D tensor of first matrices to be multiplied

    others (Tensor): a tensor of second matrices to be multiplied. If
      ``indices_data[0]=="scatter_mm"``, the tensor is a 1-D batch
      tensor of second input matrices to be multiplied. Otherwise, the
      second input matrices are slices of the :attr:`others` tensor.
    indices_data (tuple): a format data that defines the inputs and
      outputs of scattered matrix multiplications.

    Keyword arguments
    -----------------

    accumulators (Tensor, optional): a tensor of matrix product
      accumulators. If ``indices_data[0]=="scatter_mm"``, the tensor
      is a 1-D batch tensor of output matrices. Otherwise, output
      matrices are slices of the :attr:`accumulators` tensor.
    r   rL   
scatter_mmr   Nr%   r   r.   bsr_strided_mmSPLIT_Nr   bsr_strided_mm_compressedrf   )r{   r5   r   r   zerosr%   r   _scatter_mm2rM   r}   r`   item_scatter_mm6zero_divmodrw   r4   	enumerateemptyNotImplementedError)0blocksothersindices_datar   indices_formatPMsKsZ	c_offsetsZpqQZKs_NsRZMs_ZNs_rg0g1rY   pqZothers_shapeBKN	c_indices	r_offsets	p_offsets	q_offsetsmetar   MN_Zaccumulators_shaper/   Zr_r0r1accq0q1jr    r#   Zc0c1ir   r   r   r      s    d
"$

&(8


&( 8
r   c              
   K   s  ||||	|
|hd hkrt j }td| ||||f|dt jdfd}|d k	r\|jf | |S | ||fdkr||fdkrd}d}d}d	}d}
d	}	nr||fd
krd}d}d}d	}d}
d	}	nL||fdkrd}d}d}d	}d}
d	}	n&||fdkrd}d}d}d}d}
d	}	n| ||fdkr||fdkr@d}d}d}d}d}
d}	nv||fd
krhd}d}d}d	}d}
d}	nN||fdkrd	}d}d}d	}d}
d	}	n&||fdkrd}d}d}d	}d}
d	}	n8| ||fdkr||fdkrd	}d}d}d}d}
d}	n||fd
krd}d}d}d}d}
d}	nv||fdkrBd}d}d}d	}d}
d}	nN||fdkrjd}d}d}d	}d}
d	}	n&||fdkrd}d}d}d}d}
d	}	n^| ||fdkrl||fdkrd	}d}d}d}d}
d}	n||fd
krd	}d}d}d	}d}
d}	nv||fdkrd	}d}d}d	}d}
d	}	nN||fdkrDd}d}d}d	}d}
d	}	n&||fdkrd	}d}d}d}d}
d	}	n| ||fdkr||fdkrd}d}d}d}d}
d}	nN||fd
krd}d}d}d}d}
d}	n&||fdkrd}d}d}d}d}
d	}	|d kr4ddd	ddddddd	|d}|dkr4|dkr4d}|| }|d kr^t|dk rVdnd|}|d krt|dk rxdnd|}|
pd}
|	d krt| |dkrdddd|d	}	n\t| |dkrdddd|d	}	n6t| |dkrdd	d|d	}	nddd|d	}	|pd	}||ks:tt	||d||ksTtt	||d|| ksntt	| |d ||kstt	||d!||kstt	||d"t	f ||||
|	|d#|S )$Nr   r         ?version)   r   r   )r.   r.   r   r.      )    r   r	   r   )@   r   )   r   )   r   r      r   r   )   r   r   )r   r   )   r   r   )   r   r   r   )	r.   r   r   r   r   r   r   r   i    r   r   r   )r.   r   r   )r.   r   )TILE_Mr   )TILE_Nr   )r   r   )r   r   )r   r   )r   r   
GROUP_SIZE
num_stages	num_warpsr   )
r   r   get_device_namer   float16updategetr7   r5   dict)r   r   r   r   r   r   r   r   r   r   r   extradevice_namer   r   r   r   r   scatter_mm_meta  s~   

                                                                                                         
 





  r   c              	   K   sl  |d krt j}|d krd}||	|
|hd hkr"t j }| |||||dk|dk|dkf}td||d||fd}|d kr|dkrtd||d|dfd}|d krtd|d d d|dd  |d|dfd}t|pi D ],}|| }||d	  dkr|d |kr|}q|d k	r"|jf | |S |p4t|| d}|p>d
}|
pHd}
|	pRd
}	tf |||
|	d|S )Nr   r   r   bsr_dense_addmmr   r	   *rL   r   r   )r   GROUP_SIZE_ROWr   r   )r   )	r   r   r   r   r   sortedr   r`   r   )r   r   r   r   r   betaalphar   r   r   r   sparsityr%   r   r   keyr   Zmatching_metaZmkeyZmeta_r   r   r   bsr_dense_addmm_meta/  sF    
   
 



r   c                   @   s4   e Zd ZdZdd Zdd Zdd Zedd	 Zd
S )TensorAsKeyaS  A light-weight wrapper of a tensor that enables storing tensors as
    keys with efficient memory reference based comparision as an
    approximation to data equality based keys.

    Motivation: the hash value of a torch tensor is tensor instance
    based that does not use data equality and makes the usage of
    tensors as keys less useful. For instance, the result of
    ``len({a.crow_indices(), a.crow_indices()})`` is `2`, although,
    the tensor results from `crow_indices` method call are equal, in
    fact, these share the same data storage.
    On the other hand, for efficient caching of tensors we want to
    avoid calling torch.equal that compares tensors item-wise.

    TensorAsKey offers a compromise in that it guarantees key equality
    of tensors that references data in the same storage in the same
    manner and without accessing underlying data. However, this
    approach does not always guarantee correctness. For instance, for
    a complex tensor ``x``, we have ``TensorAsKey(x) ==
    TensorAsKey(x.conj())`` while ``torch.equal(x, x.conj())`` would
    return False.
    c                 C   s   dd }t || _|jtjkr,||| _nf|jtjtjhkrZ||	 ||
 f| _n8|jtjtjhkr|| || f| _n
t|jt| j| _d S )Nc                 S   s8   | j js| j jrt| j |  |  | j|  | j fS r
   )r%   Zis_floating_pointZ
is_complexr5   data_ptrZstorage_offsetr   r8   )objr   r   r   get_tensor_keyj  s    
z,TensorAsKey.__init__.<locals>.get_tensor_key)weakrefref_obj_refr   r   Zstridedr   Z
sparse_csrr   rn   ro   Z
sparse_cscZ
sparse_bscZccol_indicesZrow_indicesr   hash_hash)selfr   r   r   r   r   __init__h  s    
zTensorAsKey.__init__c                 C   s   | j S r
   )r   r   r   r   r   __hash__  s    zTensorAsKey.__hash__c                 C   s6   t |tsdS | jd ks"|jd kr*| |kS | j|jkS )NF)
isinstancer   r   r   )r   otherr   r   r   __eq__  s
    
zTensorAsKey.__eq__c                 C   s   |   S )z'Return object if alive, otherwise None.)r   r   r   r   r   r     s    zTensorAsKey.objN)	__name__
__module____qualname____doc__r   r   r   propertyr   r   r   r   r   r   Q  s   	r   )maxsizec	           #   	   C   s^  |j }	|	d k	st|	 |	  }
}|
j}tj}| dkr,|| }g }tj|||d| }t|| D ]X}|
| 	 }|
|d  	 }||krqf|
||| ||  ||||   qft|}|
 }| }|||  }|| d}|
}|| |}|jddd\}}|| }| |||fS | dkrX|| }g }g }tj|||d| }t|| D ]|}|
| 	 }|
|d  	 }||krqf|
tj||||d| |
||| ||  ||||   qft|}|
 }| }|||  }|| d}t|
d d t|| |df}t|}| ||||fS | d	krJ|}dg}g }t|D ]}t|| D ]}|
| 	 }|
|d  	 }t|| D ]j}|
|d | |  t|| D ]@} ||  }!||! 	 |||   ||  | }"|
|!|"g qڐqqqx| tj|||dtj|||dfS td
| dd S )Nr   r   r   rZ   T)Z
descendingZstabler   r   r   zInvalid indices_format=z>. Expected bsr_strided_mm_compressed|bsr_strided_mm|scatter_mm)r   r5   rn   ro   r   r   Zint32arangerM   r   appendrepeatZrepeat_interleavecatdiffZnonzerorw   sortZcumsumr|   r   )#r   r   r   r   r   r   nbatchesr   Zcompressed_sparse_tensor_as_keyrq   rn   ro   r   Zindices_dtyper   Zq_offsets_lstr/   r    r   r   r   Zcrow_indices_diffZnon_zero_row_indicesar   r   Znnz_per_rowindicesZp_offsets_lstr   
pq_offsetsr#   r   r   r   r   r   r   _bsr_scatter_mm_indices_data  s    
0


2


$r   r   c                 K   s"  |   dkst| jdkst|  }|  }|  jdd }| j\}}|\}	}
|jdd \}}||kspt|jdd  }t||||	|
f|}d|kr|j	| j
tjtjhkd |d }t|||||	|
||t| 	}|dkr|j	d	d
 ||f S |dkr|j	dd
 ||f S |S dS )zkComputes indices data for :func:`scatter_mm` used in BSR and
    strided tensor matrix multiplication.
    r   r	   r   N
allow_tf32r   r   r   T)is_compressedr   F)Z	dense_dimr5   r{   rn   ro   rp   r   numelr   r   r%   r   r   r'   r   r   )rq   r   r   Z
meta_inputrn   ro   r1   r   r   r   r   K_r   r   r   r   r   r   r   r   bsr_scatter_mm_indices_data  s@    
        


r   c              
   C   s  | j dkst|j dkst| jd | jd |jd   }}}|  jdd }|dkrft| |dd}|d }|dkrtj|jdd ||f| j| jd}|j}	t	|}| 
 dkr|  n"|d	kr|  t|  |||d
 n|dkr|jdd  }
tj|
| |d  | |d  |d |d f| j| jd}t	|dd|
||d  |d ||d  |d dddd}t|  |||d
 ||d|
||d  ||d  fdd|
||dd nt|||	S )zBSR @ strided -> strided
    r	   r   rZ   Nr   )r   r   r   >   r   r   r~   r   r   )rL   r   r   r	   )r   r	   rL   r   )r{   r5   r   rp   r   r   r   r%   r   r}   _nnzr   r   r   r   rx   rw   Zmovedimrj   copy_Z	unflattenreshaper   )rq   r   r   outr   r   r   r1   r   Z	out_shaper   r   r   r   r   r   bsr_scatter_mm   sn    "&
,   
  
        r   F)r   r   r   skip_checksmax_gridr   )inputrq   denser   r   r   r   c                   s  d}	|  }
| }| }| d }|j||d  \}}|
j|d |d  }|jd }|d krt|	||}||||f }| dksdks|dks|dks|dkrdkr|  n|	|  dkr|
 |S d kr:td| |d  |d  ||   d}t||||d |d ||jd	|}t|| ||\}}}
} }}|\ d| }|| |}t|f}t| f}t| f} tjtjtjtjtjtjtjtji|j |d}|dd }|d	}|||f}|d k	r:t|d d d d d d
dt|d d    }nd }|
d|d|d| d|d|di}dksht fdd}t|||| | | kr|	||j |S )Nr   r   r	   rL   rZ   r   )r   r%   r   rh   r
   r   NNr   NrZ   )r   rh   )r   rh   Nc                    sF   t |  t| fdkdkdk tjkd d S )Nr   r   )beta_is_onebeta_is_nonzeroalpha_is_oneBLOCKSIZE_ROWBLOCKSIZE_INNERBLOCKSIZE_COLr   	acc_dtype)_bsr_strided_addmm_kernelrK   tlfloat32r]   rd   ZBKZBMBNr   r   dot_out_dtyper   r   r   rc   |  s      zbsr_dense_addmm.<locals>.kernel)rp   rn   ro   r   r   rC   	new_emptyr   r   r   mul_roundr   r%   rr   r   rz   r   r   r  r  r'   float64rt   r)   r4   r5   re   r   rw   )r   rq   r  r   r   r   r   r   r   r   rp   rn   ro   Z
batch_ndimr   r   r1   r   original_batch_dims_broadcastedr   
out_backupr   Zout_untiled	n_batchesn_block_rowsn_block_colsrP   rQ   rU   rc   r   r  r   r   /  s    
,



*$   



4      	r   )IS_BETA_ZEROr  r
  TILE_Kr  r   c            5   	   C   s4  t jdd} t jdd}!|||   ||!  }"t |"}#t |"| }$|$|# }%|%dkrXd S t d|}&t d|}'|||   |	|#  |
|&d d d f   ||'d d d f   }(|||   ||#  })|||   ||!  ||&d d d f   }*|||   ||'d d d f   }+t d|},t|%D ]}-t j||f|d}.t |)}/td||D ]}0|0|, }1|1|k }2t j|*||1d d d f   |2d d d f dd}3t j|+||/  ||1d d d f   |2d d d f dd}4|.t j|3|4||d7 }.qJ|r|.| 9 }.n| |. |t |(  }.t |(|.|j	j
 |(|	7 }(|)|7 })qd S )Nr   Zaxisr   r%           maskr   r   	out_dtype)r  
program_idloadr   rM   r   dotstoretor%   
element_ty)5r   r   r  r  r
  kr  
values_ptrvalues_batch_stridevalues_nnz_stridevalues_row_block_stridevalues_col_block_stridecrow_indices_ptrcrow_indices_batch_stridecrow_indices_stridecol_indices_ptrcol_indices_batch_stridecol_indices_strideZmat1_ptrZmat1_batch_strideZmat1_tiled_row_strideZmat1_tiled_col_strideZmat1_row_block_strideZmat1_col_block_strideZmat2_ptrZmat2_batch_strideZmat2_tiled_row_strideZmat2_tiled_col_strideZmat2_row_block_strideZmat2_col_block_strider  r   	batch_pidrow_block_pidcrow_indices_offset_ptr
nnz_offsetnnz_offset_nextrow_nnzrow_block_arangecol_block_arangevalues_block_ptrscol_index_nnz_ptrZmat1_block_ptrsZmat2_block_ptrsZk_tile_arange_	acc_block	col_blockZk_tileZ	k_offsetsZmask_kZ
mat1_blockZ
mat2_blockr   r   r   _sampled_addmm_kernel  s    #
		
  
rD  )r  r
  r  r   r   c           0      C   s  t jdd}t jdd}t jdd}t jdd}t jdd} t |||| |\}}|||  ||  }!t |!}"t |!| }#|#|" }$|$dkrd S t d|}%t d|}&| ||  ||"  ||%d d d f   ||&d d d f   }'|||  ||  ||&d d d f   ||%d d d f   }(|||  ||  ||  ||%d d d f   ||%d d d f   })||	|  |
|"  }*t j||f|d}+t|$D ]R},t |'}-t |*}.t |(||.  }/|+t j|-|/||d7 }+|'|7 }'|*|
7 }*qt 	|)|+
|jj d S Nr	   r  r   r   r  r#  )r  r%  num_programs	swizzle2dr&  r   r   rM   r'  r(  r)  r%   r*  )0r,  r-  r.  r/  r0  r1  r2  r3  r4  r5  r6  	dense_ptrdense_batch_stridedense_tiled_row_stridedense_tiled_col_stridedense_row_block_stridedense_col_block_stride
output_ptroutput_batch_strideoutput_tiled_row_strideoutput_tiled_col_strideoutput_row_block_strideoutput_col_block_strider  r
  r  r   r   r7  r8  col_block_pidr  r  r9  r:  r;  r<  r=  r>  r?  dense_block_ptrsoutput_ptrsr@  output_acc_blockrA  values_blockdense_row_idxdense_blockr   r   r   "_bsr_strided_dense_rowspace_kernel  s    .    



r[  c              
      s   | d}| dd }||f}|d k	r\t|d d d d d ddt|d d    }nd }|d|d|d|	d|
di}|jtjtjfkrtj d	n
tj	 d
 fdd}t
|||| d S )Nr   rZ   r   r	   r
   )r   N)r   rZ   )r   r  TFc                    s2   t |  fft|  ddd d S )Nr   r   )r  r   r   r   )rD  rK   r  r  r   r   r   r1   is_beta_zeror+  tile_kr   r   rc     s         z)_run_sampled_addmm_kernel.<locals>.kernel)rt   r)   r4   r%   r   r&   r'   r  r  r  re   )r   r   r]  r1   r+  r^  rp   rn   ro   mat1mat2r   r  r  rP   rQ   rU   rc   r   r\  r   _run_sampled_addmm_kernel  s0    
4     ra  g      ?)r   r   r   r   r   )r   r_  r`  r   r   r   c                C   sn  d}t ||  t|| ||}	|s&t||| j t||| j |dkrh| jtjkrhtd| d| d | jtjk	rt||| j t||| j nt|||j t	||| |d k	r&t || t|||j t||| j t|j
|	j
ko| |  k| d|	j
 d|	  d|j
 d	|  	 |d krB|	j|jd
d}n
||	 | dksh| dkrl|S | j
dd  }
|d}|d}|d}|dks|dkr| | |S |}t|||\}}}}}t||
d |f}t|||
d f}t|
 }t|||dk|
|||||||| |  dd  | dd  krj| || j
 |S )Nsampled_addmmr   Fz(): having beta == z3 not equal to 0.0 with boolean mask is not allowed.z!(): Expects `out` to be of shape z and with nnz equal to z but got out.shape = z and out.nnz = T)copyr   r   rZ   r   rh   )r   rv   r   r   r%   r   boolr   r*   r$   r   r   r)  r   r   rp   rt   r  rr   rz   r`   ra  r8   r   )r   r_  r`  r   r   r   r   r   r   Zinput_broadcastedr1   r    r#   r+  r  rn   ro   rp   r^  r   r   r   rb    sv    


*




       &rb  )r   r   r   r   )rq   r  r   r   r   r   c                C   sX  d}| j dd  \}}|st||  t|| |j t|| |j t|| | |d}	|  j dd  \}
}t	||
|f t
|	d  | d|	 d n|j dd  \}}	t|| |}|d k	r|s|||	f }t
|j |kd| d|j  d	 t
| p|dd d
 |d kr.||||	f }|  dkrD| S t|| |dd|dS )Nbsr_dense_mmr   rZ   r.   z(): dense.size(-1) == z should be divisible by 16z9bsr_dense_mm(): `out` argument has wrong shape, expected z
, but got r   zbsr_dense_mm(): only row-major/col-major `out` arguments are supported, i.e. (out.is_contiguous() or out.transpose(-2, -1).is_contiguous()) should be True.r   r   )r   r   r   )r   r   r   r   r*   r%   r$   rt   rp   r6   r   rC   is_contiguousrx   r  r   r   r   )rq   r  r   r   r   r   r   r    r!   r#   	row_blockrC  r"   r  Zexpected_out_shaper   r   r   re    s>    	


re  )MAX_ROW_NNZTILEc                 C   sV  t jdd}t jdd}t jdd}| ||  ||  }t |}t || }|| }|dkrdd S t d|
}||| k }|||  ||  ||  }t j|| |td dt j}t j|dd}t|
|	|
D ]Z}||
7 }||| k }t j|| |td dt j}t j|dd}t 	||k||}qt 
|| }t j|dd}t|
|	|
D ]\}||
8 }||| k }t j|| |td dt j}t 
|| }|t j|dd7 }qXt j|| || |jj|d t|
|	|
D ]l}||
7 }||| k }t j|| |td dt j}t 
|| }t j|| || |jj|d qd S )Nr	   r  r   r   infr!  )r"  )r  r%  r&  r   r(   r)  r  r`   rM   whereexpsumr(  r%   r*  )r1  r2  r3  r,  r-  r/  Zvalues_nnz_col_block_striderg  rC  rh  ri  r7  Zrow_block_offset_pidr8  r9  r:  r;  r<  Z
row_aranger"  Zcurr_row_values_ptrsZrow_tileZmax_row_valuerA  Zcurr_max_row_valuenumdenomr   r   r   _bsr_softmax_kernelK  s^    
"""""rp  c                    s  d}t ||  t|| | j |  dks4|  dkr<|  S | jdd  \}}|  }|  jdd  \ d krt	|n
t	| 
 ddd}|  dd r|   }n|  }|dd dddd|  }|jd | f}d }	|dd df d|d	i}
 fd
d}t||
||	 |d| ddj|  j }tj| 
  |   || j| jdS )Nbsr_softmaxr   r   rh   r  rZ   .r  r  c                    s&   t |  t|  tdf  d S )Ni   )rp  rK   r7   r  rC  max_row_nnzrg  r   r   rc     s     zbsr_softmax.<locals>.kernelrs   )r   r*   r%   r   r   cloner   rp   tritonZnext_power_of_2rn   rg   rj   rx   rf  r9   r   re   r   ru   ro   r   )r   rs  r   r    r#   nnzrn   rp   rP   rQ   rU   rc   r   rr  r   rq    sB    

,  	$

rq  r   )queryr   value	attn_mask	dropout_p	is_causalscalec           
      C   sl  d}t | | d t |d k	| d |d k	s6tt |jtjk| dtj d|j d t||| j t||| j t||| j t||| j t||| j |jtj	k	rt||| j t
|| |ddd	d
d}|d kr| ddks|d	krt d
| d| d |d kr,dt| d n|}	| |	 t|}tjjj| |dd t||}|S )N_scaled_dot_product_attentionz'(): is_causal == True is not supported.z'(): attn_mask == None is not supported.z(): attn_mask.layout must be z, but got attn_mask.layout == r   r   rZ   r   F)r   r   r   z(): current value of scale == z results in division by zero.r   T)r   Zinplace)r   r5   r   r   r   r   r   r*   r%   rd  rb  rx   rt   mathsqrtrp   r  rq  nnZ
functionalZdropoutre  )
rw  r   rx  ry  rz  r{  r|  r   ZsdpaZscale_factorr   r   r   r}    sB    	
 "
r}  )r   r   r   r  r   r   r   c           ,      C   s  | | }|| }t jdd}t jdd}|| }|| }|| t d| }|| t d| }t d|} ||d d d f | | d d d f |   }!|| d d d f |	 |d d d f |
   }"t |||  }#t ||d |  }$|#|$krd S t j||f|d}%t|#|$D ]h}&t ||&|  }'t ||&|  | }(t |!|'|  })t |"|(|  }*|%t j|)|*||d7 }%q|||  |d d d f | |d d d f |   }+t |+|%|j	j
 d S Nr   r  r   r  )r$  r   )r  r%  r   r&  r   rM   r'  r(  r)  r%   r*  ),r   r   r   
blocks_ptrblocks_stride_Pblocks_stride_Mblocks_stride_K
others_ptrZothers_stride_Qothers_stride_Kothers_stride_Naccumulators_ptrZaccumulators_stride_Raccumulators_stride_Maccumulators_stride_NZpq_offsets_ptrZpq_offsets_strideZpq_ptrZpq_stride_TZpq_stride_1r  r   r   r   r   r   pid_tpidpid_mpid_nrmrnrkA_ptrB_ptrr   r   rB  r   r   r   Ar   C_ptrr   r   r   _scatter_mm2_kernel   s4    ,,
&r  )r   r   r   
pq_indicesr   c                    s&  | j \} }|j \}}|j \}	}}ttd d tdd ddd}
 fdd}tjtjtjtjtjtjtjtji|j	 }d|
kr|
j
|tjkd	 t|  || | d
| d| d||d
|d|d||d
|d|dd
||d
|dfd|i|
 d S )Nr.   r   r   r	   )r   r   r   r   c                    s0   j d d t | d t| d  dfS )Nr   r   r   r   r   ru  ZcdivZMETAr   r   r   r   r   r]   =  s    z_scatter_mm2.<locals>.gridr   r   r   r  )r   r   r`   r   r   r  r  r'   r  r%   r   r  r8   )r   r   r   r  r   r   r   r   rA  r   r   r]   r  r   r  r   r   0  sV    $                 r   )r   r   r  r   r   r   r   r   c           >      C   s  || }|| }|| }t jdd}t jdd}||  } ||  }!|| }"||" }#|#| }$t||$ |}%|$||%  }&||" |% }'|&| t d| }(|'| t d| })t d|}*||(d d d f | |*d d d f |   }+|| |	  |*d d d f |
 |)d d d f |   },t ||! }-|r||-| | }.|-| | }/t ||. }0t ||. d }1|/|1 ||/ |0  }2|1|0 }3n(t ||! }2t ||! d }4|4|2 }3||2 }5t j||f|d}6|r*|+|0| 7 }+t|3D ]N}7t |5}8t |,|8 }9t |+}:|6t j|:|9||d7 }6|+|7 }+|5d7 }5qnr||2 };t|3D ]`}7t |5}8t |,|8 }9t |;}<t |+|<|  }:|;d7 };|5d7 }5|6t j|:|9||d7 }6q:||- | |  |(d d d f | |)d d d f |   }=t |=|6	|j
j d S r  )r  r%  r7   r   r&  r   rM   r'  r(  r)  r%   r*  )>r   r   r   r   r  r  r  r  r  Zothers_stride_Br  r  r  Zaccumulators_stride_Br  r  Zc_indices_ptrZr_offsets_ptrZp_offsets_ptrZq_offsets_ptrr   r  r   r   r   r   r   r   ZBLOCKS_MZBLOCKS_NZpid_t_r  Zpid_br  Znum_pid_in_groupZgroup_idZfirst_pid_mZgroup_size_mr  r  r  r  r  r  r  r   r    r#   r   r   r   rv  r   Zq_ptrrB  rA  r   r   r  Zp_ptrr   r  r   r   r   _scatter_mm6_kernelQ  sj    ,4




&r  T)	r   r   r   r   r   r   r   r   force_contiguousc	                    s  |d }	| j \}
}|j \ }}|j \}}}||ks8t||	 | ksLt fdd}tjtjtjtjtjtjtjtji|j }d|kr|j	|tjkd |
ddkst
ddkst|
ddkst|
ddkst|r|  } | }| s| }n|}n|}t|  ||| | 
d| 
d| 
d||
d|
d|
d||
d|
d|
d|||fd	|i| |r| s|| d S )
Nr   c                    s.   j d   t| d t| d  fS )Nr   r   r   r  r  r   r   r   r   r   r   r]     s    z_scatter_mm6.<locals>.gridr   r   r   r   r	   r  )r   r5   r   r   r  r  r'   r  r%   r   r8   r9   rf  r  r   )r   r   r   r   r   r   r   r   r  r   r   r   r   r   ZB_r   r   r]   r  Zaccumulators_r   r  r   r     sv       

            	
r   )
r  r  r  r  r
  r	  r  r   r   r   c)           @      C   s  t jdd})t jdd}*t jdd}+t jdd},t jdd}-t |*|+|,|-|'\}*}+|||)  ||*  }.t |.}/t |.| }0|0|/ }1t d|"}2t d|$}3t d|#}4| r|||)  ||*  ||+  ||2d d d f   ||4d d d f   }5| ||)  ||/  ||2d d d f   ||3d d d f   }6|||)  ||+  ||3d d d f   ||4d d d f   }7|||)  ||*  ||+  ||2d d d f   ||4d d d f   }8||	|)  |
|/  }9| rt |5|%}:|r|!s|| };|:|;9 }:nt j|"|#f|%d}:t|1D ]R}<t |6}=t |9}>t |7||>  }?|:t j	|=|?|&|%d7 }:|6|7 }6|9|
7 }9q|!sr|:|9 }:t 
|8|:|jj d S rE  )r  r%  rF  rG  r&  r   r)  r   rM   r'  r(  r%   r*  )@r,  r-  r.  r/  r0  r1  r2  r3  r4  r5  r6  Z	input_ptrZinput_batch_strideZinput_tiled_row_strideZinput_tiled_col_strideZinput_row_block_strideZinput_col_block_striderH  rI  rJ  rK  rL  rM  rN  rO  rP  rQ  rR  rS  r   r   r  r  r  r  r
  r	  r  r   r   r   r7  r8  rT  r  r  r9  r:  r;  r<  r=  Zinner_block_aranger>  Z
input_ptrsr?  rU  rV  r@  rW  Z
beta_alpharA  rX  rY  rZ  r   r   r   r    s    9    




r  )N)NNNNNN)NNNNNN)r   )NN)N)r   FN)T)@r~  osr   r   	functoolsr   Ztorch.utils._tritonr   Z_triton_ops_metar   typingr   r   intgetenvr   r   r   r   r$   r*   r6   r:   rC   rF   rJ   rK   r^   re   rr   rv   rz   r}   r   r   r   r   r   r   r   ZTensorrd  r   r   ru  Ztriton.languagelanguager  ZjitZ	constexprrD  r[  ra  rb  re  rp  rq  r(   r}  r  r   r  r   r  r   r   r   r   <module>   sh  
	
" X           
e           
"C
K
 
4ez( 0X8
F
A   2  0!Z I- !