import torch
import torch.jit  # needed to avoid a circular import
from torch import nn
import torch.nn.functional as nnF

from torch import Tensor
from typing import Optional, Tuple

import warnings

__all__ = [
    "MultiheadAttention"
]


class MultiheadAttention(nn.MultiheadAttention):
    _FLOAT_MODULE = nn.MultiheadAttention

    r"""Quantizable implementation of the MultiheadAttention.

    Note::
        Please, follow the quantization flow to convert the quantizable MHA.
    """
    __constants__ = ['batch_first']

    def __init__(self, embed_dim: int, num_heads: int,
                 dropout: float = 0., bias: bool = True,
                 add_bias_kv: bool = False, add_zero_attn: bool = False,
                 kdim: Optional[int] = None, vdim: Optional[int] = None,
                 batch_first: bool = False,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__(embed_dim, num_heads, dropout, bias, add_bias_kv,
                         add_zero_attn, kdim, vdim, batch_first,
                         **factory_kwargs)

        # Separate Q/K/V projections (the float module fuses them into
        # in_proj_weight / in_proj_bias).
        self.linear_Q = nn.Linear(self.embed_dim, self.embed_dim, bias=bias, **factory_kwargs)
        self.linear_K = nn.Linear(self.kdim, self.embed_dim, bias=bias, **factory_kwargs)
        self.linear_V = nn.Linear(self.vdim, self.embed_dim, bias=bias, **factory_kwargs)
        # Replace the parent's out_proj with a regular Linear.
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=bias, **factory_kwargs)

        # Functional for the query scaling, so that it can be observed and quantized.
        # Note: importing torch.ao.nn.quantized at the top creates a circular import.
        self.q_scaling_product = torch.ao.nn.quantized.FloatFunctional()

        # Quant/DeQuant stubs marking the float region of the attention math.
        self.quant_attn_output = torch.ao.quantization.QuantStub()
        self.quant_attn_output_weights = torch.ao.quantization.QuantStub()
        self.dequant_q = torch.ao.quantization.DeQuantStub()
        self.dequant_k = torch.ao.quantization.DeQuantStub()
        self.dequant_v = torch.ao.quantization.DeQuantStub()

    def _get_name(self):
        return 'QuantizableMultiheadAttention'

    @classmethod
    def from_float(cls, other):
        assert type(other) == cls._FLOAT_MODULE
        assert hasattr(other, 'qconfig'), "The float module must have 'qconfig'"
        observed = cls(other.embed_dim, other.num_heads, other.dropout,
                       (other.in_proj_bias is not None),
                       (other.bias_k is not None),
                       other.add_zero_attn, other.kdim, other.vdim,
                       other.batch_first)
        observed.bias_k = other.bias_k
        observed.bias_v = other.bias_v
        observed.qconfig = other.qconfig

        # Set the linear weights.
        observed.out_proj.weight = other.out_proj.weight
        observed.out_proj.bias = other.out_proj.bias
        if other._qkv_same_embed_dim:
            # Split the fused in_proj_weight / in_proj_bias into the separate
            # Q/K/V linears.
            bias = other.in_proj_bias
            _start = 0
            _end = _start + other.embed_dim
            weight = other.in_proj_weight[_start:_end, :]
            if bias is not None:
                bias = torch.nn.Parameter(bias[_start:_end], bias.requires_grad)
            observed.linear_Q.weight = torch.nn.Parameter(weight, weight.requires_grad)
            observed.linear_Q.bias = bias

            bias = other.in_proj_bias
            _start = _end
            _end = _start + other.embed_dim
            weight = other.in_proj_weight[_start:_end, :]
            if bias is not None:
                bias = torch.nn.Parameter(bias[_start:_end], bias.requires_grad)
            observed.linear_K.weight = torch.nn.Parameter(weight, weight.requires_grad)
            observed.linear_K.bias = bias

            bias = other.in_proj_bias
            _start = _end
            weight = other.in_proj_weight[_start:, :]
            if bias is not None:
                bias = torch.nn.Parameter(bias[_start:], bias.requires_grad)
            observed.linear_V.weight = torch.nn.Parameter(weight, weight.requires_grad)
            observed.linear_V.bias = bias
        else:
            observed.linear_Q.weight = nn.Parameter(other.q_proj_weight)
            observed.linear_K.weight = nn.Parameter(other.k_proj_weight)
            observed.linear_V.weight = nn.Parameter(other.v_proj_weight)
            if other.in_proj_bias is None:
                observed.linear_Q.bias = None
                observed.linear_K.bias = None
                observed.linear_V.bias = None
            else:
                observed.linear_Q.bias = nn.Parameter(other.in_proj_bias[0:other.embed_dim])
                observed.linear_K.bias = nn.Parameter(other.in_proj_bias[other.embed_dim:(other.embed_dim * 2)])
                observed.linear_V.bias = nn.Parameter(other.in_proj_bias[(other.embed_dim * 2):])
        observed.eval()
        # Explicit prepare to attach the observers.
        observed = torch.ao.quantization.prepare(observed, inplace=True)
        return observed

    @torch.jit.unused
    def dequantize(self):
        r"""Utility to convert the quantized MHA back to float.

        The motivation for this is that it is not trivial to convert the weights
        from the format that is used in the quantized version back to the
        float one.
        """
        fp = self._FLOAT_MODULE(self.embed_dim, self.num_heads, self.dropout,
                                (self.linear_Q._weight_bias()[1] is not None),
                                (self.bias_k is not None),
                                self.add_zero_attn, self.kdim, self.vdim,
                                self.batch_first)
        assert fp._qkv_same_embed_dim == self._qkv_same_embed_dim
        if self.bias_k is not None:
            fp.bias_k = nn.Parameter(self.bias_k.dequantize())
        if self.bias_v is not None:
            fp.bias_v = nn.Parameter(self.bias_v.dequantize())

        # Set the linear weights.
        w, b = self.out_proj._weight_bias()
        fp.out_proj.weight = nn.Parameter(w.dequantize())
        if b is not None:
            fp.out_proj.bias = nn.Parameter(b)

        wQ, bQ = self.linear_Q._weight_bias()
        wQ = wQ.dequantize()
        wK, bK = self.linear_K._weight_bias()
        wK = wK.dequantize()
        wV, bV = self.linear_V._weight_bias()
        wV = wV.dequantize()
        if fp._qkv_same_embed_dim:
            # Pack the dequantized Q/K/V weights back into the fused in_proj layout.
            _start = 0
            _end = _start + fp.embed_dim
            fp.in_proj_weight[_start:_end, :] = wQ
            if fp.in_proj_bias is not None:
                assert all(bQ == 0)
                fp.in_proj_bias[_start:_end] = bQ

            _start = _end
            _end = _start + fp.embed_dim
            fp.in_proj_weight[_start:_end, :] = wK
            if fp.in_proj_bias is not None:
                assert all(bK == 0)
                fp.in_proj_bias[_start:_end] = bK

            _start = _end
            fp.in_proj_weight[_start:, :] = wV
            if fp.in_proj_bias is not None:
                assert all(bV == 0)
                fp.in_proj_bias[_start:] = bV
        else:
            fp.q_proj_weight = nn.Parameter(wQ)
            fp.k_proj_weight = nn.Parameter(wK)
            fp.v_proj_weight = nn.Parameter(wV)
            if fp.in_proj_bias is None:
                self.linear_Q.bias = None
                self.linear_K.bias = None
                self.linear_V.bias = None
            else:
                fp.in_proj_bias[0:fp.embed_dim] = bQ
                fp.in_proj_bias[fp.embed_dim:(fp.embed_dim * 2)] = bK
                fp.in_proj_bias[(fp.embed_dim * 2):] = bV

        return fp

    @classmethod
    def from_observed(cls, other):
        # The whole flow is float -> observed -> quantized.
        # This class only handles float -> observed; the observed -> quantized
        # step is done by the quantized counterpart.
        raise NotImplementedError("It looks like you are trying to prepare an "
                                  "MHA module. Please, see "
                                  "the examples on quantizable MHAs.")

    def forward(self,
                query: Tensor,
                key: Tensor,
                value: Tensor,
                key_padding_mask: Optional[Tensor] = None,
                need_weights: bool = True,
                attn_mask: Optional[Tensor] = None,
                average_attn_weights: bool = True,
                is_causal: bool = False) -> Tuple[Tensor, Optional[Tensor]]:
        r"""
    Note::
        Please, refer to :func:`~torch.nn.MultiheadAttention.forward` for more
        information.

    Args:
        query, key, value: map a query and a set of key-value pairs to an output.
            See "Attention Is All You Need" for more details.
        key_padding_mask: if provided, specified padding elements in the key will
            be ignored by the attention. When given a binary mask and a value is True,
            the corresponding value on the attention layer will be ignored.
        need_weights: output attn_output_weights.
        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
            the batches while a 3D mask allows to specify a different mask for the entries of each batch.

    Shape:
        - Inputs:
        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
          the embedding dimension. :math:`(N, L, E)` if ``batch_first`` is ``True``.
        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
          the embedding dimension. :math:`(N, S, E)` if ``batch_first`` is ``True``.
        - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
          the embedding dimension. :math:`(N, S, E)` if ``batch_first`` is ``True``.
        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
          If a BoolTensor is provided, the positions with the
          value of ``True`` will be ignored while the positions with the value of ``False`` will be unchanged.
        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
          3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
          S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
          positions. If a BoolTensor is provided, positions with ``True``
          are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
          is provided, it will be added to the attention weight.
        - is_causal: If specified, applies a causal mask as attention mask. Mutually exclusive with providing attn_mask.
          Default: ``False``.
        - average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across
          heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an
          effect when ``need_weights=True``. Default: True (i.e. average weights across heads)

        - Outputs:
        - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
          E is the embedding dimension. :math:`(N, L, E)` if ``batch_first`` is ``True``.
        - attn_output_weights: If ``average_attn_weights=True``, returns attention weights averaged
          across heads of shape :math:`(N, L, S)`, where N is the batch size, L is the target sequence length,
          S is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
          head of shape :math:`(N, num_heads, L, S)`.
        """
        return self._forward_impl(query, key, value, key_padding_mask,
                                  need_weights, attn_mask, average_attn_weights,
                                  is_causal)

    def _forward_impl(self,
                      query: Tensor,
                      key: Tensor,
                      value: Tensor,
                      key_padding_mask: Optional[Tensor] = None,
                      need_weights: bool = True,
                      attn_mask: Optional[Tensor] = None,
                      average_attn_weights: bool = True,
                      is_causal: bool = False) -> Tuple[Tensor, Optional[Tensor]]:
        # This version does not deal with the static key/value pairs.
        static_k = None
        static_v = None

        if attn_mask is not None and is_causal:
            raise AssertionError("Only allow causal mask or attn_mask")

        if is_causal:
            raise AssertionError("causal mask not supported by AO MHA module")

        if self.batch_first:
            query, key, value = (x.transpose(0, 1) for x in (query, key, value))

        tgt_len, bsz, embed_dim_to_check = query.size()
        assert self.embed_dim == embed_dim_to_check
        assert key.size(0) == value.size(0) and key.size(1) == value.size(1)

        head_dim = self.embed_dim // self.num_heads
        assert head_dim * self.num_heads == self.embed_dim, \
            "embed_dim must be divisible by num_heads"
        scaling = float(head_dim) ** -0.5

        q = self.linear_Q(query)
        k = self.linear_K(key)
        v = self.linear_V(value)

        q = self.q_scaling_product.mul_scalar(q, scaling)

        if attn_mask is not None:
            if attn_mask.dtype == torch.uint8:
                warnings.warn(
                    "Byte tensor for `attn_mask` in `nn.MultiheadAttention` is deprecated. "
                    "Use bool tensor instead.",
                    stacklevel=3,
                )
                attn_mask = attn_mask.to(torch.bool)
            assert attn_mask.is_floating_point() or attn_mask.dtype == torch.bool, \
                f"Only float and bool types are supported for attn_mask, not {attn_mask.dtype}"

            if attn_mask.dim() == 2:
                attn_mask = attn_mask.unsqueeze(0)
                if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
                    raise RuntimeError('The size of the 2D attn_mask is not correct.')
            elif attn_mask.dim() == 3:
                if list(attn_mask.size()) != [bsz * self.num_heads, query.size(0), key.size(0)]:
                    raise RuntimeError('The size of the 3D attn_mask is not correct.')
            else:
                raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
            # attn_mask's dim is 3 now.

        # Convert a ByteTensor key_padding_mask to bool.
        if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
            warnings.warn(
                "Byte tensor for `key_padding_mask` in `nn.MultiheadAttention` is deprecated. "
                "Use bool tensor instead.",
                stacklevel=3,
            )
            key_padding_mask = key_padding_mask.to(torch.bool)

        if self.bias_k is not None and self.bias_v is not None:
            if static_k is None and static_v is None:
                bias_k = self.bias_k
                assert bias_k is not None
                bias_v = self.bias_v
                assert bias_v is not None

                k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
                v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
                if attn_mask is not None:
                    attn_mask = nnF.pad(attn_mask, (0, 1))
                if key_padding_mask is not None:
                    key_padding_mask = nnF.pad(key_padding_mask, (0, 1))
            else:
                assert static_k is None, "bias cannot be added to static key."
                assert static_v is None, "bias cannot be added to static value."
        else:
            assert self.bias_k is None
            assert self.bias_v is None

        q = q.contiguous().view(tgt_len, bsz * self.num_heads, head_dim).transpose(0, 1)
        if k is not None:
            k = k.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
        if v is not None:
            v = v.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)

        if static_k is not None:
            assert static_k.size(0) == bsz * self.num_heads
            assert static_k.size(2) == head_dim
            k = static_k

        if static_v is not None:
            assert static_v.size(0) == bsz * self.num_heads
            assert static_v.size(2) == head_dim
            v = static_v

        src_len = k.size(1)

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        if self.add_zero_attn:
            src_len += 1
            k_zeros = torch.zeros((k.size(0), 1) + k.size()[2:])
            if k.is_quantized:
                k_zeros = torch.quantize_per_tensor(k_zeros, k.q_scale(),
                                                    k.q_zero_point(), k.dtype)
            k = torch.cat([k, k_zeros], dim=1)
            v_zeros = torch.zeros((v.size(0), 1) + v.size()[2:])
            if v.is_quantized:
                v_zeros = torch.quantize_per_tensor(v_zeros, v.q_scale(),
                                                    v.q_zero_point(), v.dtype)
            v = torch.cat([v, v_zeros], dim=1)

            if attn_mask is not None:
                attn_mask = nnF.pad(attn_mask, (0, 1))
            if key_padding_mask is not None:
                key_padding_mask = nnF.pad(key_padding_mask, (0, 1))

        # Leaving the quantized zone here.
        q = self.dequant_q(q)
        k = self.dequant_k(k)
        v = self.dequant_v(v)
        attn_output_weights = torch.bmm(q, k.transpose(1, 2))
        assert list(attn_output_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if attn_mask is not None:
            if attn_mask.dtype == torch.bool:
                attn_output_weights.masked_fill_(attn_mask, float('-inf'))
            else:
                attn_output_weights += attn_mask

        if key_padding_mask is not None:
            attn_output_weights = attn_output_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_output_weights = attn_output_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                float('-inf'),
            )
            attn_output_weights = attn_output_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_output_weights = nnF.softmax(attn_output_weights, dim=-1)
        attn_output_weights = nnF.dropout(attn_output_weights,
                                          p=self.dropout,
                                          training=self.training)

        attn_output = torch.bmm(attn_output_weights, v)
        assert list(attn_output.size()) == [bsz * self.num_heads, tgt_len, head_dim]
        if self.batch_first:
            attn_output = attn_output.view(bsz, tgt_len, self.embed_dim)
        else:
            attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim)

        # Reentering the quantized zone.
        attn_output = self.quant_attn_output(attn_output)
        attn_output = self.out_proj(attn_output)
        attn_output_weights = self.quant_attn_output_weights(attn_output_weights)

        if need_weights:
            # Average the attention weights over heads if requested.
            attn_output_weights = attn_output_weights.view(bsz, self.num_heads, tgt_len, src_len)
            if average_attn_weights:
                attn_output_weights = attn_output_weights.mean(dim=1)
            return attn_output, attn_output_weights
        else:
            return attn_output, None
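
# ---------------------------------------------------------------------------
# Usage sketch: a minimal, hedged example of the float -> observed stage of
# the eager-mode quantization flow this module participates in. The qconfig
# choice ("fbgemm"), the tensor shapes, and the random calibration inputs are
# illustrative assumptions, not requirements of the module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    embed_dim, num_heads = 8, 2

    # Start from a regular float MHA and attach a quantization config.
    float_mha = nn.MultiheadAttention(embed_dim, num_heads)
    float_mha.eval()
    float_mha.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")

    # float -> observed: `from_float` copies the fused projection weights into
    # the separate linear_Q/K/V layout and calls `prepare` to attach observers.
    observed_mha = MultiheadAttention.from_float(float_mha)

    # Calibrate with representative data (random tensors stand in here);
    # inputs are (L, N, E) because batch_first defaults to False.
    q = k = v = torch.randn(5, 3, embed_dim)
    with torch.no_grad():
        observed_mha(q, k, v)

    # The observed -> quantized step is performed by torch.ao.quantization.convert
    # using the observed-to-quantized custom-module mapping for MHA; see the
    # quantizable MHA examples referenced in `from_observed` above.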