import logging
from typing import Optional, Tuple

import torch
import torch.nn
import torch.nn.functional as F
from torch.backends.cuda import (
    can_use_efficient_attention,
    can_use_flash_attention,
    flash_sdp_enabled,
    math_sdp_enabled,
    mem_efficient_sdp_enabled,
    SDPAParams,
)
from torch.nn.attention import SDPBackend

from .nested_tensor import buffer_from_jagged, NestedTensor, ViewNestedFromBuffer

log = logging.getLogger(__name__)


def _validate_sdpa_input(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: Optional[torch.Tensor] = None,
    dropout_p=0.0,
    is_causal=False,
    scale=None,
):
    if (
        not isinstance(query, NestedTensor)
        or not isinstance(key, NestedTensor)
        or not isinstance(value, NestedTensor)
    ):
        raise ValueError(
            f"Expected query, key, and value to be nested tensors, "
            f"but got query.is_nested: {query.is_nested}, key.is_nested: {key.is_nested}, "
            f"and value.is_nested: {value.is_nested} instead."
        )
    if query.dtype != key.dtype or query.dtype != value.dtype:
        raise ValueError(
            f"Expected query, key, and value to have the same dtype, "
            f"but got query.dtype: {query.dtype}, key.dtype: {key.dtype}, "
            f"and value.dtype: {value.dtype} instead."
        )
    if query.device != key.device or query.device != value.device:
        raise ValueError(
            f"Expected query, key, and value to have the same device type, "
            f"but got query.device: {query.device}, key.device: {key.device}, "
            f"and value.device: {value.device} instead."
        )
    if query.dim() < 2 or key.dim() < 2 or value.dim() < 2:
        raise ValueError(
            f"Expected query, key, and value to all be at least 2 dimensional, "
            f"but got query.dim: {query.dim()}, key.dim: {key.dim()} "
            f"and value.dim: {value.dim()} instead."
        )
    if query._ragged_idx != key._ragged_idx or query._ragged_idx != value._ragged_idx:
        raise ValueError(
            f"Expected query, key, and value to all be ragged on the same dimension, "
            f"but got ragged dims {query._ragged_idx}, {key._ragged_idx}, "
            f"and {value._ragged_idx}, respectively."
        )
    if attn_mask is not None:
        # TODO: Figure out whether masks are actually supported for this layout or not
        raise ValueError("Masks are not yet supported!")
        if attn_mask.dtype != torch.bool and attn_mask.dtype != query.dtype:
            raise ValueError(
                f"Expected attn_mask dtype to be bool or to match query dtype, "
                f"but got attn_mask.dtype: {attn_mask.dtype}, "
                f"and query.dtype: {query.dtype} instead."
            )

def _check_batch_size_nested(params: SDPAParams, debug=False) -> bool:
    # This is expected to be called after check_tensor_shapes ensuring that the
    # size() calls won't error since the inputs are all 4 dimensional
    q_batch_size = params.query.size(0)
    k_batch_size = params.key.size(0)
    v_batch_size = params.value.size(0)

    # num_heads logic for nested input is checked in
    # check_for_seq_len_0_nested_tensor as there is handling there to make sure
    # num_heads is not ragged
    return q_batch_size == k_batch_size and q_batch_size == v_batch_size


def _check_head_dim_size_flash_nested(params: SDPAParams, debug=False) -> bool:
    max_size = 256
    query_size_last = params.query.size(-1)
    key_size_last = params.key.size(-1)
    value_size_last = params.value.size(-1)
    same_head_dim_size = (
        query_size_last == key_size_last and query_size_last == value_size_last
    )
    if not (
        same_head_dim_size
        and (query_size_last % 8 == 0)
        and (query_size_last <= max_size)
    ):
        if debug:
            log.warning(
                "For NestedTensor inputs, Flash attention requires q,k,v to have the same "
                "last dimension and to be a multiple of 8 and less than or equal to 256. "
                "Got Query.size(-1): %d, Key.size(-1): %d, Value.size(-1): %d instead.",
                query_size_last,
                key_size_last,
                value_size_last,
            )
        return False
    return True

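# A quick sketch (hypothetical, not in the original module) of the flash
# head-dim rule enforced above: the last dimension must match across q/k/v,
# be a multiple of 8, and be at most 256.
def _example_flash_head_dims():
    candidates = (32, 40, 48, 250, 256, 264)
    return [d for d in candidates if d % 8 == 0 and d <= 256]  # [32, 40, 48, 256]
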
def _check_for_seq_len_0_and_consistent_head_dim_nested_helper(
    param: torch.Tensor, param_name: str, debug=False
) -> bool:
    assert isinstance(param, NestedTensor), "param should be a jagged NT"

    if param._ragged_idx == 1:
        # num_head_dims is ragged
        if debug:
            log.warning(
                "Fused kernels do not support ragged num_head_dims, %s has a ragged num_heads.",
                param_name,
            )
        return False

    # This is being called inside sdp with shape [batch, heads, {seq_len}, dim]
    if param._min_seqlen == 0:
        if debug:
            log.warning(
                "Fused kernels do not support seq_len == 0, %s has a seq len of 0.",
                param_name,
            )
        return False

    return True

def _try_broadcast_param_size(q_size, k_size, v_size, param_name, debug=False) -> bool:
    max_size = max(q_size, k_size, v_size)
    if (
        (q_size != max_size and q_size != 1)
        or (k_size != max_size and k_size != 1)
        or (v_size != max_size and v_size != 1)
    ):
        if debug:
            log.warning(
                "Both fused kernels require query, key and value to have broadcastable %s, "
                "got Query %s %d, Key %s %d, Value %s %d instead.",
                param_name,
                param_name,
                q_size,
                param_name,
                k_size,
                param_name,
                v_size,
            )
        return False
    return True


def _check_for_seq_len_0_nested(params: SDPAParams, debug=False) -> bool:
    # When this function is called we are assured that the nt is dim==4
    q_is_safe = (
        _check_for_seq_len_0_and_consistent_head_dim_nested_helper(
            params.query, "query", debug
        )
        if params.query.is_nested
        else True
    )
    # short circuit if any is unsafe
    if not q_is_safe:
        return False

    k_is_safe = (
        _check_for_seq_len_0_and_consistent_head_dim_nested_helper(
            params.key, "key", debug
        )
        if params.key.is_nested
        else True
    )
    if not k_is_safe:
        return False

    v_is_safe = (
        _check_for_seq_len_0_and_consistent_head_dim_nested_helper(
            params.value, "value", debug
        )
        if params.value.is_nested
        else True
    )
    if not v_is_safe:
        return False

    # We now know none of the inputs have ragged num_heads, so we can safely
    # access .size(1)
    q_num_heads = params.query.size(1)
    k_num_heads = params.key.size(1)
    v_num_heads = params.value.size(1)
    same_num_heads = q_num_heads == k_num_heads and q_num_heads == v_num_heads

    if not same_num_heads:
        if (
            params.query.requires_grad
            or params.key.requires_grad
            or params.value.requires_grad
        ):
            if debug:
                log.warning(
                    "Both fused kernels do not support training with broadcasted NT inputs."
                )
            return False
        return _try_broadcast_param_size(
            q_num_heads, k_num_heads, v_num_heads, "num heads", debug
        )
    return True

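# Hypothetical sketch (not in the original module) of the broadcast rule that
# _try_broadcast_param_size encodes: each of q/k/v must either match the max
# size or be exactly 1.
def _example_broadcastable_num_heads():
    assert _try_broadcast_param_size(8, 1, 8, "num heads") is True
    assert _try_broadcast_param_size(8, 4, 8, "num heads") is False
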
def _can_use_flash_sdpa_jagged(params: SDPAParams, debug=False) -> bool:
    constraints = (
        _check_batch_size_nested,
        _check_head_dim_size_flash_nested,
        _check_for_seq_len_0_nested,
    )
    for constraint in constraints:
        if not constraint(params, debug):
            return False
    return True


def _can_use_efficient_sdpa_jagged(params: SDPAParams, debug=False) -> bool:
    constraints = (
        _check_batch_size_nested,
        _check_for_seq_len_0_nested,
    )
    for constraint in constraints:
        if not constraint(params, debug):
            return False
    return True


def _can_use_math_sdpa_jagged(params: SDPAParams, debug=False) -> bool:
    if (
        not params.query.transpose(1, 2).is_contiguous()
        or not params.key.transpose(1, 2).is_contiguous()
        or not params.value.transpose(1, 2).is_contiguous()
    ):
        if debug:
            log.warning(
                "If inputs are nested tensors they must be contiguous after transposing."
            )
        return False
    if params.is_causal:
        if debug:
            log.warning(
                "Nested tensors for query / key are not supported when is_causal=True."
            )
        return False
    return True

|rftj  S |tjkrt|rt|rtj  S |tjkr>t r>t|r>tj  S q>td t|dd t|dd td t	|dd t
|dd td t|dd tjS )Nz)Memory efficient kernel not used because:T)r'   z(Flash attention kernel not used because:z'Math attention kernel not used because:)r   r   r   r
   ERRORFLASH_ATTENTIONEFFICIENT_ATTENTIONMATHr	   r   rC   r   rD   rG   r/   r0   )	r   r   r   r   Zdropoutr   Zorderingr$   backendr!   r!   r"   _select_sdp_backend  sB    








rM   )qkvr%   c                 C   s   t | tstd|  d krJ|  jtj| jd}| j	}| 
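# Hypothetical usage sketch (not in the original module): probing which
# backend the selector would pick, then re-running the gating checks with
# debug=True so the reasons for rejection are logged via log.warning.
def _example_probe_backend(query, key, value):
    choice = _select_sdp_backend(query, key, value, None, 0.0, False)
    if choice == SDPBackend.ERROR:
        params = SDPAParams(query, key, value, None, 0.0, False)
        _can_use_flash_sdpa_jagged(params, debug=True)
        _can_use_efficient_sdpa_jagged(params, debug=True)
    return choice
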
 jd }n<|  djtj| jd}| d}| j	}t|d  }|||fS )Nz<QKV must be nested for flash cumulative_seq_len calculation.)r   r   r   r-   )r   r   r   lengthsoffsetstor   Zint32r   _max_seqlenvaluesshapeZcumsumr&   intitem)rN   Zcumulative_seqlenZ
max_seqlenZn_elemZ
batch_sizer!   r!   r"   _cumulative_and_max_seq_len_nnz0  s    

def _is_safe_to_get_storage_as_tensor(tensor: torch.Tensor):
    # The values buffer of a NestedTensor can only be reinterpreted as one
    # dense tensor when every constituent is laid out back-to-back with the
    # same stride; otherwise a copy (contiguous()) is required first.
    assert isinstance(tensor, NestedTensor)
    offsets = tensor.offsets()
    strides = tensor._strides

    n_tensors = offsets.size(0) - 1
    if n_tensors <= 1:
        return True

    prev_stride = strides[0]
    for stride in strides[1:]:
        if stride != prev_stride:
            return False
        prev_stride = stride
    return True

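# Worked sketch (hypothetical, not in the original module) of the metadata
# computed by _cumulative_and_max_seq_len_nnz: for per-sequence lengths
# (3, 5, 7), the cumulative sequence lengths are [0, 3, 8, 15], max_seqlen
# is 7, and Nnz (the total number of packed tokens) is 15.
def _example_cumulative_seqlen():
    lengths = torch.tensor([3, 5, 7])
    cumulative = torch.cat([torch.zeros(1, dtype=torch.int64), lengths.cumsum(0)])
    return cumulative.to(torch.int32), int(lengths.max()), int(lengths.sum())
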
def _view_as_dense(
    tensor: torch.Tensor, Nnz: int, num_heads: int, head_dim: int
) -> torch.Tensor:
    if tensor.is_nested:
        return buffer_from_jagged(tensor)
    return tensor.view(Nnz, num_heads, head_dim)


def _sdpa_nested_preprocessing(query, key, value):
    # Query (Batch x Num_heads x {Q_seq_len}  x Dim_per_head)
    # Key   (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
    # Value (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
    q_batch_size = query.size(0)
    k_batch_size = key.size(0)
    v_batch_size = value.size(0)

    q_num_heads = query.size(1)
    k_num_heads = key.size(1)
    v_num_heads = value.size(1)

    if not (q_batch_size == k_batch_size and q_batch_size == v_batch_size) or not (
        q_num_heads == k_num_heads and k_num_heads == v_num_heads
    ):
        raise RuntimeError(
            "This path is currently not implemented for jagged layout NT."
        )

    num_heads = query.size(1)
    head_dim_qk = query.size(3)
    head_dim_v = value.size(3)
    q_t = query.transpose(1, 2)
    k_t = key.transpose(1, 2)
    v_t = value.transpose(1, 2)

    (
        cumulative_sequence_length_q,
        max_seqlen_batch_q,
        Nnz_q,
    ) = _cumulative_and_max_seq_len_nnz(q_t)
    (
        cumulative_sequence_length_kv,
        max_seqlen_batch_kv,
        Nnz_kv,
    ) = _cumulative_and_max_seq_len_nnz(k_t)

    # If the physical layout of the NestedTensor's storage is not
    # batch, {seq_len}, num_heads, head_dim then we need to call contiguous
    if not q_t.is_contiguous() and not _is_safe_to_get_storage_as_tensor(q_t):
        q_t = q_t.contiguous()
    if not k_t.is_contiguous() and not _is_safe_to_get_storage_as_tensor(k_t):
        k_t = k_t.contiguous()
    if not v_t.is_contiguous() and not _is_safe_to_get_storage_as_tensor(v_t):
        v_t = v_t.contiguous()

    query_buffer_reshaped = _view_as_dense(q_t, Nnz_q, num_heads, head_dim_qk)
    key_buffer_reshaped = _view_as_dense(k_t, Nnz_kv, num_heads, head_dim_qk)
    value_buffer_reshaped = _view_as_dense(v_t, Nnz_kv, num_heads, head_dim_v)

    output_nt_info = {
        "offsets": q_t.offsets(),
        "_max_seqlen": q_t._max_seqlen,
        "_min_seqlen": q_t._min_seqlen,
    }

    return (
        query_buffer_reshaped,
        key_buffer_reshaped,
        value_buffer_reshaped,
        cumulative_sequence_length_q,
        cumulative_sequence_length_kv,
        max_seqlen_batch_q,
        max_seqlen_batch_kv,
        output_nt_info,
    )

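# Hypothetical sketch of the shape bookkeeping above: after transposing to
# [batch, {seq_len}, num_heads, head_dim], the jagged values buffer is one
# packed [Nnz, num_heads, head_dim] tensor, which is what the varlen fused
# kernels consume together with the cumulative sequence lengths.
def _example_packed_shape(num_heads: int = 8, head_dim: int = 16):
    nnz = 3 + 5 + 7  # total tokens across the ragged batch
    packed = torch.randn(nnz * num_heads * head_dim)
    return packed.view(nnz, num_heads, head_dim).shape  # torch.Size([15, 8, 16])
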
def _pad_last_dim(
    tensor: torch.Tensor, alignment_size: int, slice: bool
) -> torch.Tensor:
    # FlashAttentionV2 requires the head dimension to be a multiple of 8.
    # This was previously done within the kernel; however, that causes the
    # kernel to maybe alias query, key, value. So instead we pad the head
    # dimensions to be a multiple of 8 in the composite region.
    last_dim_size = tensor.size(-1)
    if last_dim_size % alignment_size == 0:
        return tensor
    pad_count = alignment_size - (last_dim_size % alignment_size)
    tensor = torch.nn.functional.pad(tensor, [0, pad_count])
    if slice:
        return tensor[..., 0:last_dim_size]
    return tensor


def _calculate_scale(query, scale):
    # TODO: Investigate why math.sqrt() isn't properly handled by Dynamo
    softmax_scale = (
        scale if scale is not None else torch.sym_sqrt(1.0 / query.size(-1))
    )
    return softmax_scale


def _post_process_flash_output(out: torch.Tensor, og_size: int):
    if not out.is_nested and out.size(-1) != og_size:
        out = out[..., 0:og_size]
    return out

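# Hypothetical sketch (not in the original module) of the padding/scale
# helpers above: a head dim of 42 is padded up to 48 (the next multiple of 8),
# while the softmax scale is still derived from the original head dim so the
# attention result is unchanged.
def _example_pad_and_scale():
    t = torch.randn(2, 4, 42)
    padded = _pad_last_dim(t, 8, False)
    assert padded.shape[-1] == 48
    assert _pad_last_dim(torch.randn(2, 4, 48), 8, False).shape[-1] == 48
    return _calculate_scale(t, None)  # sqrt(1 / 42)
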
def jagged_scaled_dot_product_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: Optional[torch.Tensor] = None,
    dropout_p=0.0,
    is_causal=False,
    scale=None,
):
    _validate_sdpa_input(query, key, value, attn_mask, dropout_p, is_causal, scale)
    # for mypy, ugh
    assert (
        isinstance(query, NestedTensor)
        and isinstance(key, NestedTensor)
        and isinstance(value, NestedTensor)
    )

    # Special path for non-ragged sequence length (e.g. for SAM where we have a
    # ragged second batch dim instead). For this case, we can just send the
    # dense buffers through vanilla SDPA.
    if query.dim() > 3 and key.dim() > 3 and value.dim() > 3 and query._ragged_idx == 1:
        from torch.nested._internal.ops import extract_kwargs

        output = F.scaled_dot_product_attention(
            query._values,
            key._values,
            value._values,
            attn_mask=(
                attn_mask._values if isinstance(attn_mask, NestedTensor) else attn_mask
            ),
            dropout_p=dropout_p,
            is_causal=is_causal,
            scale=scale,
        )

        return NestedTensor(output, **extract_kwargs(query))

    compute_logsumexp = query.requires_grad or key.requires_grad or value.requires_grad

    backend_choice = _select_sdp_backend(
        query, key, value, attn_mask, dropout_p, is_causal
    )

    if backend_choice == SDPBackend.FLASH_ATTENTION:
        og_size = query.size(-1)
        query_padded = _pad_last_dim(query, 8, False)
        key_padded = _pad_last_dim(key, 8, False)
        value_padded = _pad_last_dim(value, 8, False)
        # We need to calculate the scale based off the OG head dim size
        og_scale = _calculate_scale(query, scale)
        (
            query_buffer_reshaped,
            key_buffer_reshaped,
            value_buffer_reshaped,
            cumulative_sequence_length_q,
            cumulative_sequence_length_kv,
            max_seqlen_batch_q,
            max_seqlen_batch_kv,
            output_nt_info,
        ) = _sdpa_nested_preprocessing(query_padded, key_padded, value_padded)

        (
            attention,
            logsumexp,
            philox_seed,
            philox_offset,
            debug_attn_mask,
        ) = torch.ops.aten._flash_attention_forward(
            query_buffer_reshaped,
            key_buffer_reshaped,
            value_buffer_reshaped,
            cumulative_sequence_length_q,
            cumulative_sequence_length_kv,
            max_seqlen_batch_q,
            max_seqlen_batch_kv,
            dropout_p,
            is_causal,
            False,
            scale=og_scale,
        )

        # Reshape output to convert nnz to batch_size and seq_len
        attention = ViewNestedFromBuffer.apply(
            attention, output_nt_info["offsets"]
        ).transpose(1, 2)
        return _post_process_flash_output(attention, og_size)

    elif backend_choice == SDPBackend.EFFICIENT_ATTENTION:
        (
            query_reshaped,
            key_reshaped,
            value_reshaped,
            cumulative_sequence_length_q,
            cumulative_sequence_length_kv,
            max_seqlen_batch_q,
            max_seqlen_batch_kv,
            output_nt_info,
        ) = _sdpa_nested_preprocessing(query, key, value)
        (
            attention,
            log_sumexp,
            seed,
            offset,
            max_seqlen_q,
            max_seqlen_batch_kv,
        ) = torch.ops.aten._efficient_attention_forward(
            query_reshaped.unsqueeze(0),
            key_reshaped.unsqueeze(0),
            value_reshaped.unsqueeze(0),
            None,
            cumulative_sequence_length_q,
            cumulative_sequence_length_kv,
            max_seqlen_batch_q,
            dropout_p,
            int(is_causal),
            compute_logsumexp,
            scale=scale,
        )

        # Reshape output to convert nnz to batch_size and seq_len
        return ViewNestedFromBuffer.apply(
            attention.squeeze(0), output_nt_info["offsets"]
        ).transpose(1, 2)

    elif backend_choice == SDPBackend.MATH:
        # save the offsets and shape of the inputs, so we can reshape the final output
        # query @ key = attn: [B, D1, j0, D'] @ [B, D1, D', j1] = [B, D1, j0, j1]
        # attn @ value = out: [B, D1, j0, j1] @ [B, D1, j1, D2] = [B, D1, j0, D2]
        offsets = query.offsets()
        d1 = query._size[1]
        d2 = value._size[-1]

        # convert jagged layout Nested Tensor to strided layout Nested Tensor,
        # which supports the math implementation of SDPA
        def get_strided_layout_nested_tensor(jagged_layout_nt):
            lengths = jagged_layout_nt._offsets[1:] - jagged_layout_nt._offsets[:-1]
            transpose = torch.transpose(jagged_layout_nt, 1, 2)
            tensor_list = buffer_from_jagged(transpose).split(list(lengths), dim=0)
            strided_nt = torch.nested.as_nested_tensor(list(tensor_list))
            strided_nt = strided_nt.transpose(1, 2).contiguous()
            return strided_nt

        query = get_strided_layout_nested_tensor(query)
        key = get_strided_layout_nested_tensor(key)
        value = get_strided_layout_nested_tensor(value)

        attn_out = torch._scaled_dot_product_attention_math(
            query, key, value, attn_mask, dropout_p, is_causal, scale=scale
        )[0]

        # convert strided layout Nested Tensor back to jagged layout Nested Tensor
        attn_out = attn_out.transpose(1, 2).contiguous().values()
        attn_out = attn_out.view(-1, d1, d2)
        attn_out = ViewNestedFromBuffer.apply(attn_out, offsets)
        attn_out = attn_out.transpose(1, 2)

        return attn_out

    raise RuntimeError(
        "No viable backend for scaled_dot_product_attention was found."
    )

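# End-to-end usage sketch (hypothetical, not in the original module): in
# practice this kernel is reached through F.scaled_dot_product_attention on
# jagged-layout nested tensors rather than by calling
# jagged_scaled_dot_product_attention directly.
def _example_end_to_end(num_heads: int = 8, head_dim: int = 16):
    q = torch.nested.nested_tensor(
        [torch.randn(s, num_heads, head_dim) for s in (3, 5, 7)],
        layout=torch.jagged,
    ).transpose(1, 2)
    k, v = q.clone(), q.clone()
    out = F.scaled_dot_product_attention(q, k, v)
    return out.transpose(1, 2)  # back to [batch, {seq_len}, num_heads, head_dim]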