U
    Mhk                     @   s   d dl Z d dlmZ d dlmZmZmZmZmZm	Z	 d dl
Z
d dlmZmZ d dlmZmZmZmZmZmZmZmZmZ dddd	gZed
dZG dd de
jZde
jeeddd	ZG dd deZG dd deZdS )    N)
namedtuple)AnyOptionalTupleListCallableDict))sparse_semi_structured_from_dense_cutlass'sparse_semi_structured_to_dense_cutlass)	fallback_dispatchersemi_sparse_valuessemi_sparse_indicessemi_sparse_detachsemi_sparse_tsemi_sparse_viewsemi_sparse_mmsemi_sparse_addmmsemi_sparse_linearSparseSemiStructuredTensor!SparseSemiStructuredTensorCUTLASS$SparseSemiStructuredTensorCUSPARSELTto_sparse_semi_structured_SEMI_STRUCTURED_SPARSE_CONFIGz=sparse_min_rows sparse_min_cols dense_min_rows dense_min_colsc                   @   s
  e Zd ZU dZdZeed< eej	e
f ed< dZeed< dZeed< dZeed	< eed
< eeef ed< eej ed< eej ed< eej ed< eej ed< eej ed< eed< eed< dddddgZed1ejeej eej eej eej eej eeed	ddZedddZeee eejeeef f dddZeeejeeef ejdddZejjZee dddZ!ed2d dd!d"Z"eejd d#d$d%Z#eejejd&d'd(Z$d)d* Z%eejd d#d+d,Z&d d-ejeej ejd.d/d0Z'd S )3r   a  
    This class implementes semi-structured sparsity as a Tensor subclass.

    Semi-structured sparsity describes a sparsity pattern where n in every 2n elements are sparse,
    depending on the datatype. It is also referred to as 2:4 sparsity or fine-grained
    structured sparsity.

    There are two backends available for semi_structred sparsity, either cuSPARSELt or CUTLASS.
    This class is meant to serve as a base class for both implementations. SparseSemiStructuredCUTLASS
    and SparseSemiStructuredCUSPARSELT both inherit from this class and define three backend-specific items.
    Note that as such, this class cannot be insantiated directly.

    -`_DTYPE_SHAPE_CONSTRAINTS` - A dictionary holding backend specific dense/sparse min shape constraints
    - `def from_dense()` - backend specific compression routines
    - `def _mm()` - backend specifc mm op (either torch._cslt_sparse_mm or torch._sparse_semi_structured_(mm|addmm))
    r   _DEFAULT_ALG_ID_DTYPE_SHAPE_CONSTRAINTST_FORCE_CUTLASSF_FUSE_TRANSPOSE_PROTOTYPE_WARNING_SHOWNBACKENDSPARSE_DISPATCHpackedmetapacked_tmeta_tcompressed_swizzled_bitmaskfuse_transpose_cusparseltalg_id_cusparselt	shaper    r!   r"   r#   r$   r%   r&   requires_gradc
                 C   s   | j s,tdt d| _ |   tj|  |dk	r:|}
n|dk	rH|}
ntd|
j	|
j
|
j|	d}tjj| |f|}||_||_||_||_||_||_||_|S )a0  
        Create a new instance of the tensor subclass from the compressed sparse representation.

        We have the option to create the subclass with the compressed representations of both X and X', for training.
        For inference, we only need a single representation (either X or X'), while the corresponding other set will be None.

        Depending on the backend selected, certain fields will be set to None. (CUSPARSELT vs CUTLASS)

        Args:
            shape: The shape of the original dense tensor
            packed: The compressed representation of the original dense tensor
            meta: The metadata of the original dense tensor, if it is stored separately
            packed_t: The compressed representation of the transposed original dense tensor
            meta_t: The metadata of the transposed original dense tensor, if it is stored separately
            compressed_swizzled_bitmask: The masks used by the CUTLASS backend to determine which threads should
                                         participate in the computation. Used for pointwise ops.
            fuse_transpose_cusparselt: When running with cuSPARSELt, we have the option to fuse a transposition
                                       with a matmul, which is useful in the case of 2:4 sparse training.
            alg_id_cusparselt: The algorithm id to use when using cuSPARSELT, will have effect on performance

        Returns:
            torch.Tensor: A torch.Tensor wrapper subclass.

        Raises:
            ValueError: If all of the tensor arguments are None.
        zThe PyTorch API of SparseSemiStructuredTensor is in prototype stage and will change in the near future. Please open a Github issue for features requests and see our documentation on the torch.sparse module for further information about the project.TNz3At least one of packed or packed_t must be provided)devicedtypelayoutr)   )r   warningswarnUserWarning_load_dispatch_tabletorchZ_dynamoZallow_in_graph
ValueErrorr*   r+   r,   TensorZ_make_wrapper_subclassr    r!   r"   r#   r$   r%   r&   )clsr(   r    r!   r"   r#   r$   r%   r&   r)   Zprevious_tensorkwargsZtensor r6   N/var/www/html/venv/lib/python3.8/site-packages/torch/sparse/semi_structured.py__new__I   s6    '	z"SparseSemiStructuredTensor.__new__)returnc                 C   s$   t | dst| jj d| j dS )Nr(   z(shape=))hasattrAssertionError	__class____name__r(   selfr6   r6   r7   __repr__   s    z#SparseSemiStructuredTensor.__repr__c                    s4   t t fdd j} j j j jf}||fS )Nc                    s   t  | d k	S N)getattr)xr?   r6   r7   <lambda>       z?SparseSemiStructuredTensor.__tensor_flatten__.<locals>.<lambda>)listfilter	__slots__r(   r%   r&   r)   )r@   inner_tensorstensor_metar6   r?   r7   __tensor_flatten__   s    z-SparseSemiStructuredTensor.__tensor_flatten__)rK   r9   c           	      C   sN   |\}}}}| || dd | dd | dd | dd | dd |||d	S )Nr    r!   r"   r#   r$   r'   )get)	r4   rJ   rK   Z
outer_sizeZouter_strider(   r%   r&   r)   r6   r6   r7   __tensor_unflatten__   s    




z/SparseSemiStructuredTensor.__tensor_unflatten__c                 C   s:   |j | jkr$t| j d|j d| j|j  ||||S )NzI only supports a specific set of operations, can't perform requested op (r:   )Z_overloadpacketr   NotImplementedErrorr>   )r4   functypesargsr5   r6   r6   r7   __torch_dispatch__   s
    z-SparseSemiStructuredTensor.__torch_dispatch__Nc                 C   s   t | dddkrtjjjttjjjttjjjt	tjjj
t	tjjjttjjjttjjjttjjjttjjjttjjjttjjjttjjjt	i| _|dk	r| j| dS )zT
        Loads the op overload sparse dispatch table for the current class.
        r   N)rC   r1   ZopsZatenvaluesr   indicesr   Zis_same_sizer   Zdetach_detachr   tr   viewr   mmr   matmulZaddmmr   Zlinearr   Z_to_copyr   update)r4   Zcustom_dispatch_tabler6   r6   r7   r0      s8                z/SparseSemiStructuredTensor._load_dispatch_tableoriginal_tensorr9   c                 C   s   |j std|j d| dkr8td|  d| sHtd|j| jkrftd|j d|j\}}| j|j j}| j|j j	}||k s|| s||k s|| rtd	|j d
| d| ddS )z_
        Assert that the given tensor is valid for semi-structured sparse compression.
        zError original_tensor.device= z= is not supported! Only CUDA tensors are currently supported.   zError original_tensor.dim = z; is not supported! Only 2d tensors are currently supported.zXError original_tensor is not contiguous!Only contiguous tensors are currently supported.zError original_tensor.dtype zO is not a supported dtype! dtype must be one of: {cls._DTYPE_SHAPE_CONSTRAINTS}zError original_tensor.shape zS is not supported! Both dimensions must be larger or equal than and a multiple of (z, r:   N)
Zis_cudaRuntimeErrorr*   dimZis_contiguousr+   r   r(   Zsparse_min_rowsZsparse_min_cols)r4   r]   mnmin_rowsmin_colsr6   r6   r7    _validate_device_dim_dtype_shape   s.    
 z;SparseSemiStructuredTensor._validate_device_dim_dtype_shape)dense_inputr9   c                 C   s   |  dkst|j\}}| j|j j}| j|j j}||k sF|| rP| | nd}||k sd|| rn| | nd}|sz|rtjj	
|d|d|fS |S dS )z
        Calculates padding for dense tensor and pads tensor if necessary.
        If padding is not required, this function returns the original tensor.
        r^   r   N)r`   r<   r(   r   r+   Zdense_min_rowsZdense_min_colsr1   nnZ
functionalpad)r4   rf   ra   rb   rc   rd   Zto_pad_mZto_pad_nr6   r6   r7   _pad_dense_input  s    
z+SparseSemiStructuredTensor._pad_dense_inputc                 C   s&   | j d }t| tj|| j| jdS )N)r+   r*   )r(   r1   rY   eyer+   r*   )r@   colr6   r6   r7   to_dense'  s    
z#SparseSemiStructuredTensor.to_densec                 C   s   t d S rB   rO   r4   r]   r6   r6   r7   
from_dense+  s    z%SparseSemiStructuredTensor.from_densebiasBrr   r9   c                K   s   t d S rB   rn   )r@   rt   rr   r5   r6   r6   r7   _mm/  s    zSparseSemiStructuredTensor._mm)Fr   F)N)(r>   
__module____qualname____doc__r   int__annotations__r   r1   r+   r   r   boolr   r   strr   r   r3   rI   staticmethodSizer8   rA   r   r   rL   classmethodrN   Z_CZ_disabled_torch_function_implZ__torch_function__r   rS   r0   re   ri   rm   rp   ru   r6   r6   r6   r7   r   $   sp   
	   R+F)r]   
transposedr9   c                 C   s4   |rt jdtdd tjr"tjjntjj}|	| S )a	  
    This function converts a dense tensor into a sparse semi-structured tensor.
    It will return a SparseSemiStructuredTensor, a subclass of torch.Tensor.

    This function will check to ensure the dense tensor has the right dtype, size, dims, and device.
    We currently only support semi-structured sparse tensors for 2d CUDA tensors.
    Additionally, your tensor must be a positive multiple of the mininum sparse block size, given in
    `_DTYPE_TO_SHAPE_CONSTRAINTS` for each dtype (float32, float16, bfloat16, int8).

    Args:
        original_tensor (Tensor): the dense tensor to convert
        transposed (bool, optional): deprecated arg to be removed in another release. Do not use.
    Returns:
        SparseSemiStructuredTensor: A sparse semi-structured tensor created from the given original_tensor
    Raises:
        None
    Example:
        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
        >>> A = torch.Tensor([0, 0, 1, 1]).tile((128, 32)).half().cuda()
        tensor([[0., 0., 1.,  ..., 0., 1., 1.],
                [0., 0., 1.,  ..., 0., 1., 1.],
                [0., 0., 1.,  ..., 0., 1., 1.],
                ...,
                [0., 0., 1.,  ..., 0., 1., 1.],
                [0., 0., 1.,  ..., 0., 1., 1.],
                [0., 0., 1.,  ..., 0., 1., 1.]], device='cuda:0', dtype=torch.float16)
        >>> A_sparse = to_sparse_semi_structured(A)
        SparseSemiStructuredTensor(shape=torch.Size([128, 128]))
        >>> A_sparse.values()
        tensor([[1., 1., 1.,  ..., 1., 1., 1.],
                [1., 1., 1.,  ..., 1., 1., 1.],
                [1., 1., 1.,  ..., 1., 1., 1.],
                ...,
                [1., 1., 1.,  ..., 1., 1., 1.],
                [1., 1., 1.,  ..., 1., 1., 1.],
                [1., 1., 1.,  ..., 1., 1., 1.]], device='cuda:0', dtype=torch.float16),
        >>> A_sparse.indices()
        tensor([[-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                ...,
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
                [-4370, -4370, -4370,  ..., -4370, -4370, -4370]], device='cuda:0', dtype=torch.int16))
    zSetting transpose from `to_sparse_semi_structured` is deprecated and will be removed in a future release. `SparseSemiStructuredTensor` only support contiguous input tensors.r^   )
stacklevel)
r-   r.   FutureWarningr   r   r1   sparser   r   rp   )r]   r   ZSPARSE_SUBCLASSr6   r6   r7   r   9  s    1
c                       s   e Zd ZdZdZejeddddejeddddej	eddddej
eddddiZeejd d	d
dZ fddZedejdd	ddZddejeej ejdddZ  ZS )r   a  
    This class implements semi-structured sparsity for the CUTLASS backend.


    In this implementation, the specified elements and metadata are stored seprately,
    in packed and meta respectively.

    When _FORCE_CUTLASS is set, or when cuSPARSELt is not available, this subclass calls into _sparse_semi_structured_(mm|addmm) and
    sparse_semi_structured_from_dense for conversion to the compressed format.
    Zcutlass          @         r\   c              	   C   s0   |  | t|\}}| |j||d d d |jdS )Nr    r!   r"   r#   r$   r)   )re   r	   r(   r)   )r4   r]   Zsparse_tensor_cutlassZmeta_tensor_cutlassr6   r6   r7   rp     s    
z,SparseSemiStructuredTensorCUTLASS.from_densec                    s<   | j d k	r| jd k	st| j jdkr2t| j| j S t  S )Nr^   )r!   r    r<   ndimr
   superrm   r?   r=   r6   r7   rm     s    
z*SparseSemiStructuredTensorCUTLASS.to_dense r   c              	   C   s2   t j||dd\}}}}}| |j|||||ddS )a  
        This function takes in a unpruned dense tensor and runs a (branchless) static sort across a 4x4 tile.

        It greedily picks the largest values in the tile, upholding the 2:4 sparsity constraint across both rows and columns.
        The algorithm used to prune the matrix is implemented in `_sparse_semi_structured_tile`.

        Then it creates the packed and meta tensors for the compressed sparse representation of the pruned dense tensor.
        It also calculates the packed_t and meta_t tensors for the compressed sparse representation of the transposed
        pruned dense tensor.
        Since we cannot transpose the compressed representations, we store both for the fw/bw pass respectively.

        Finally, this function also computes a compressed swizzled bitmask that encodes the sparsity pattern
        This can be used in the backward pass to mask the gradients.

        [9 1 7 4]                       [9 0 7 0]
        [1 2 3 0]                       [0 2 0 0]
        [8 3 5 4] -> prune 4x4 tile  -> [8 0 0 4] -> pack to CUTLASS semi-structured -> packed
        [1 2 6 2]                       [0 0 6 2]                                    -> metadata

                                                  -> pack to transposed CUTLASS      -> packed_t
                                                     semi-structured representation  -> metadata_t

                                                  -> compute swizzled bitmask        -> compressed_swizzled_bitmask


        The equivalent PyTorch code to create the same five outputs from the dense tensor can be found below:
        ```
        from torch.sparse import SparseSemiStructuredTensorCUTLASS
        from torch.sparse._semi_structured_conversions import _sparse_semi_structured_tile, _compute_compressed_swizzled_bitmask

        pruned = _sparse_semi_structured_tile(dense)
        packed_cutlass, meta_cutlass = sparse_semi_structured_from_dense_cutlass(pruned)
        packed_t_cutlass, meta_t_cutlass = sparse_semi_structured_from_dense_cutlass(pruned.t().contiguous())
        bitmask = _compute_compressed_swizzled_bitmask(pruned)

        SparseSemiStructuredTensorCUTLASS(dense.shape, packed_cutlass, meta_cutlass, packed_t_cutlass, meta_t_cutlass, bitmask)
        ```
        T	algorithmZuse_cutlassFr   r1   Z_sparse_semi_structured_tiler(   r4   r]   r   r    r!   r"   r#   r$   r6   r6   r7   prune_dense_static_sort  s    )z9SparseSemiStructuredTensorCUTLASS.prune_dense_static_sortNrq   rs   c                K   s   t |trtd| jj}| jdks.|jdkr>td| d| jd ksR| jd krdtd| dnB|d krt	
| j| j|}nt	|| j| j|}|d | jd  S d S )NZ`SparseSemiStructuredTensor @ SparseSemiStructuredTensor` is not supported by the hardwarer^   `)` matmul: Broadcasting is not implemented$` matmul: operation is not supportedr   )
isinstancer   r2   r=   r>   r   rO   r    r!   r1   Z_sparse_semi_structured_mmZ_sparse_semi_structured_addmmr(   )r@   rt   rr   r5   cls_nameresr6   r6   r7   ru     s4    


     z%SparseSemiStructuredTensorCUTLASS._mm)r   )r>   rv   rw   rx   r   r1   int8r   float16bfloat16float32r   r   r3   rp   rm   r   r   ru   __classcell__r6   r6   r   r7   r   |  s0   
    ;c                   @   s   e Zd ZdZdZejeddddejeddddej	eddddej
eddddiZeejd ddd	ZedejddddZddejeej ejdddZdS )r   a  
    The cuSPARSELt backend expects the specified elements and the metadata to be stored in a single tensor:
    packed = [ specified elements of original tensor | metadata ]
    For an original tensor of size (m, k) we expect the first m * k // 2 elements to be the kept elements
    The rest of the tensor is metadata. Since there is only one tensor, we only use the packed and packed_t
    attributes respectively.

    cuSPARSELt also supports transposition fusion, which is necessary for performant 2:4 sparse training, as well
    as specifying alg_id, a config that affects the performance of the matmul depending on matmul sizes.
    Z
cusparseltr   r   r   r   r\   c                 C   s2   |  | | |jt|d d d d tjtj|jd	S )Nr'   )re   r(   r1   Z_cslt_compressr   r   r   r)   ro   r6   r6   r7   rp     s    
z/SparseSemiStructuredTensorCUSPARSELT.from_denser   r   c              	   C   s2   t j||dd\}}}}}| |j|||||ddS )a  
        This function does the same thing as described in SparseSemiStructuredCUTLASS, but uses the cuSPASRELt metadata
        layout and sparse matmul.

        The only functional difference is that cuSPARSELt stores `metadata` and `packed` together into a single tensor.

        [9 1 7 4]                       [9 0 7 0]
        [1 2 3 0]                       [0 2 0 0]
        [8 3 5 4] -> prune 4x4 tile  -> [8 0 0 4] -> pack to cuSPARSELT semi-structured -> packed
        [1 2 6 2]                       [0 0 6 2]

                                                  -> pack to transposed cuSPARSELt      -> packed_t
                                                     semi-structured representation

                                                  -> compute swizzled bitmask           -> compressed_swizzled_bitmask


        The equivalent PyTorch code to create the same three outputs from the dense tensor can be found below:
        ```
        from torch.sparse import SparseSemiStructuredTensorCUSPARSELT
        from torch.sparse._semi_structured_conversions import _sparse_semi_structured_tile, _compute_compressed_swizzled_bitmask

        pruned = _sparse_semi_structured_tile(dense)
        packed_cusparselt = torch._cslt_compress(pruned)
        packed_t_cusparselt = torch._cslt_compress(pruned.t().contiguous())
        bitmask = _compute_compressed_swizzled_bitmask(pruned)

        SparseSemiStructuredTensorCUSPARSELT(dense.shape, packed_cutlass, None, packed_t_cutlass, None, bitmask)
        ```
        Fr   r   r   r   r6   r6   r7   r   #  s     z<SparseSemiStructuredTensorCUSPARSELT.prune_dense_static_sortNrq   rs   c                K   s  t |trtd| jdks&|jdkr:td| jj d|j| jkrtd| jj dt| j	 dt|j	 d| j d|j d	|d k	r|j| jkrtd| jj dt| j	 dt|j	 d
| j
d krtd| jj dn.tj| j
||| j| jd}| jr| S |S d S )Nr   r^   r   r   z` matmul: trying to do `A=z @ B=z`, with A.dtype=z and B.dtype=zH. This operation is only supported when A and B have the same data type.z + C`, with A.dtype=B.dtype={self.dtype} and C.dtype={B.dtype}. This operation is only supported when A, B and C have the same data type.r   )rr   Ztranspose_resultZalg_id)r   r   r2   r   rO   r=   r>   r+   tupler(   r    r1   Z_cslt_sparse_mmr%   r&   rW   )r@   rt   rr   r5   r   r6   r6   r7   ru   R  s8    
6&
z(SparseSemiStructuredTensorCUSPARSELT._mm)r   )r>   rv   rw   rx   r   r1   r   r   r   r   r   r   r   r3   rp   r   r   ru   r6   r6   r6   r7   r     s*   
    2)F) r-   collectionsr   typingr   r   r   r   r   r   r1   Z)torch.sparse._semi_structured_conversionsr	   r
   Z!torch.sparse._semi_structured_opsr   r   r   r   r   r   r   r   r   __all__r   r3   r   r{   r   r   r   r6   r6   r6   r7   <module>   s4    ,   C 