U
    h(l                     @   s,  d dl mZ d dlmZmZmZmZmZ d dlZd dl	m
  mZ d dlm
Z
mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZ ddlmZmZ dddddddgZee ee ee eee ee f dddZ ej!"d ejejee edddZ#ej!"d eeeef eeeef eeeef dddZ$ej!"d eeeeef eeeef eeeef edddZ%ej!"d d:eeeeee eee e&e&ee ee e'ed"d#d$Z(ej!"d$ G d%d& d&e
j)Z*G d'd( d(e
j)Z+G d)d de
j)Z,ee eee ee ee e&ee e'ee,d*
d+d,Z-ed-d.d/Z.G d0d deZ/G d1d deZ0G d2d deZ1e ed3e/j2fd4dd!d5ee/ e'ee,d6d7dZ3e ed3e0j2fd4dd!d5ee0 e'ee,d6d8dZ4e ed3e1j2fd4dd!d5ee1 e'ee,d6d9dZ5dS );    )partial)AnyCallableListOptionalTupleN)nnTensor   )VideoClassification)_log_api_usage_once   )register_modelWeightsWeightsEnum)_KINETICS400_CATEGORIES)_ovewrite_named_paramhandle_legacy_interface)PatchMergingSwinTransformerBlockSwinTransformer3dSwin3D_T_WeightsSwin3D_S_WeightsSwin3D_B_Weightsswin3d_tswin3d_sswin3d_b)
shift_sizesize_dhwwindow_sizereturnc                 C   s:   t dD ](}|| || kr|| ||< d| |< q|| fS )Nr
   r   range)r   r   r   i r$   [/var/www/html/venv/lib/python3.8/site-packages/torchvision/models/video/swin_transformer.py_get_window_and_shift_size    s
    
r&   )relative_position_bias_tablerelative_position_indexr   r    c                 C   s^   |d |d  |d  }| |d |d |f    }|||d}|ddd d}|S )Nr      r   )flattenviewpermute
contiguous	unsqueeze)r'   r(   r   Z
window_volrelative_position_biasr$   r$   r%   _get_relative_position_bias/   s    r1   )r   
patch_sizer    c                    s.    fddt dD }|d |d |d fS )Nc                    s,   g | ]$} | |  |    |  qS r$   r$   .0r#   r2   r   r$   r%   
<listcomp>@   s     z(_compute_pad_size_3d.<locals>.<listcomp>r
   r   r)   r   r!   )r   r2   pad_sizer$   r5   r%   _compute_pad_size_3d?   s    r8   )xr   r   r   r    c              
      sj  | j | }|d d  |d d   |d d   } fddtdD }d}|d D ]Z}|d D ]L}	|d D ]>}
|||d |d |	d |	d |
d |
d f< |d7 }qvqjq^||d d  d |d d  d |d d  d }|dddddd|d d  d  }|d|d }||dktd	|dktd
}|S )Nr   r)   r   c                    s:   g | ]2}d |  f|   |  f |  dffqS )r   Nr$   r3   r   r   r$   r%   r6   P   s
   z._compute_attention_mask_3d.<locals>.<listcomp>r
         g      Y        )Z	new_zerosr"   r,   r-   reshaper/   Zmasked_fillfloat)r9   r   r   r   	attn_masknum_windowsZslicescountdhwr$   r:   r%   _compute_attention_mask_3dG   s4    
02 $rF   r=   T)input
qkv_weightproj_weightr0   r   	num_headsr   attention_dropoutdropoutqkv_bias	proj_biastrainingr    c                 C   s  | j \}}}}}t|||f|d |d |d f}t| ddd|d d|d d|d f}|j \}}}}}|||f}t|dkrtj||d  |d  |d  fdd}|d |d  |d |d   |d |d   }|||d |d  |d |d |d  |d |d |d  |d |}|ddddddd	d
	|| |d |d  |d  |}t
|||	}|	|d|dd||| ddddd}|d |d |d   }}}||| d  }||dd}|| }t|dkrt||d |d |d f|d |d |d f|d |d |d f}||d| |||d|d}||dd }|d||d|d}tj|dd}tj|||d}||dd	|d|d|}t
|||
}tj|||d}|||d |d  |d |d  |d |d  |d |d |d |}|ddddddd	d
	|||||}t|dkrtj||d |d |d fdd}|ddd|d|d|ddf  }|S )a  
    Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.
    Args:
        input (Tensor[B, T, H, W, C]): The input tensor, 5-dimensions.
        qkv_weight (Tensor[in_dim, out_dim]): The weight tensor of query, key, value.
        proj_weight (Tensor[out_dim, out_dim]): The weight tensor of projection.
        relative_position_bias (Tensor): The learned relative position bias added to attention.
        window_size (List[int]): 3-dimensions window size, T, H, W .
        num_heads (int): Number of attention heads.
        shift_size (List[int]): Shift size for shifted window attention (T, H, W).
        attention_dropout (float): Dropout ratio of attention weight. Default: 0.0.
        dropout (float): Dropout ratio of output. Default: 0.0.
        qkv_bias (Tensor[out_dim], optional): The bias tensor of query, key, value. Default: None.
        proj_bias (Tensor[out_dim], optional): The bias tensor of projection. Default: None.
        training (bool, optional): Training flag used by the dropout parameters. Default: True.
    Returns:
        Tensor[B, T, H, W, C]: The output tensor after shifted window attention.
    r   r)   r   )r)   r   r
   )ZshiftsZdimsr
   r<   r;         g      r*   )dim)prO   N)shaper8   FpadsumtorchZrollr,   r-   r>   ZlinearsizematmulZ	transposerF   r/   ZsoftmaxrL   r.   )rG   rH   rI   r0   r   rJ   r   rK   rL   rM   rN   rO   btrD   rE   cr7   r9   _tphpZwpZpadded_sizerA   qkvqkvZattnr@   r$   r$   r%   shifted_window_attention_3ds   sx    !"(
(.
  0((
&"*rf   c                       s   e Zd ZdZdeee ee eeeeedd	 fddZddd	d
Z	ddddZ
ee ejdddZeedddZ  ZS )ShiftedWindowAttention3dz2
    See :func:`shifted_window_attention_3d`.
    Tr=   N)	rS   r   r   rJ   rM   rN   rK   rL   r    c	           	         s   t    t|dks"t|dkr*td|| _|| _|| _|| _|| _t	j
||d |d| _t	j
|||d| _|   |   d S )Nr
   z.window_size and shift_size must be of length 2)bias)super__init__len
ValueErrorr   r   rJ   rK   rL   r   Linearrb   proj#define_relative_position_bias_tabledefine_relative_position_index)	selfrS   r   r   rJ   rM   rN   rK   rL   	__class__r$   r%   rj      s    
z!ShiftedWindowAttention3d.__init__)r    c                 C   s^   t td| jd  d d| jd  d  d| jd  d  | j| _t jj| jdd d S )Nr   r   r)   {Gz?std)	r   	ParameterrY   Zzerosr   rJ   r'   inittrunc_normal_rq   r$   r$   r%   ro     s    4z<ShiftedWindowAttention3d.define_relative_position_bias_tablec                    s   fddt dD }ttj|d |d |d dd}t|d}|d d d d d f |d d d d d f  }|ddd }|d d d d df   jd d 7  < |d d d d df   jd d 7  < |d d d d df   jd d 7  < |d d d d df  d jd  d d jd  d  9  < |d d d d df  d jd  d 9  < |d	} 	d
| d S )Nc                    s   g | ]}t  j| qS r$   )rY   Zaranger   r3   rz   r$   r%   r6     s     zKShiftedWindowAttention3d.define_relative_position_index.<locals>.<listcomp>r
   r   r)   r   Zij)Zindexingr*   r(   )
r"   rY   stackZmeshgridr+   r-   r.   r   rX   Zregister_buffer)rq   Z
coords_dhwZcoordsZcoords_flattenZrelative_coordsr(   r$   rz   r%   rp     s    ,(((>,
z7ShiftedWindowAttention3d.define_relative_position_index)r   r    c                 C   s   t | j| j|S )N)r1   r'   r(   )rq   r   r$   r$   r%   get_relative_position_bias#  s    z3ShiftedWindowAttention3d.get_relative_position_biasr9   r    c           
      C   s   |j \}}}}}|||g}| j | j  }}t|||\}}| |}	t|| jj| j	j|	|| j
|| j| j| jj| j	j| jdS )N)r   rK   rL   rM   rN   rO   )rU   r   copyr   r&   r|   rf   rb   weightrn   rJ   rK   rL   rh   rO   )
rq   r9   r_   r]   rD   rE   r   r   r   r0   r$   r$   r%   forward&  s&    

z ShiftedWindowAttention3d.forward)TTr=   r=   )__name__
__module____qualname____doc__intr   boolr?   rj   ro   rp   rY   r	   r|   r   __classcell__r$   r$   rr   r%   rg      s(   
    
rg   c                       sR   e Zd ZdZdee eeeedej	f  dd fddZ
eed	d
dZ  ZS )PatchEmbed3da;  Video to Patch Embedding.

    Args:
        patch_size (List[int]): Patch token size.
        in_channels (int): Number of input channels. Default: 3
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    r
   `   N.)r2   in_channels	embed_dim
norm_layerr    c                    sd   t    t|  |d |d |d f| _tj||| j| jd| _|d k	rV||| _n
t | _d S )Nr   r)   r   )Zkernel_sizeZstride)	ri   rj   r   tuple_patch_sizer   ZConv3drn   normZIdentity)rq   r2   r   r   r   rr   r$   r%   rj   K  s    
zPatchEmbed3d.__init__r}   c              
   C   s|   |  \}}}}}t|||f| j}t|d|d d|d d|d f}| |}|ddddd}| jdk	rx| |}|S )zForward function.r   r   r)   r
   r;   N)rZ   r8   r   rV   rW   rn   r-   r   )rq   r9   r_   r]   rD   rE   r7   r$   r$   r%   r   a  s    $


zPatchEmbed3d.forward)r
   r   N)r   r   r   r   r   r   r   r   r   Modulerj   r	   r   r   r$   r$   rr   r%   r   A  s      r   c                       s   e Zd ZdZdddddddedf	ee eee ee ee eeeeeee	de
jf  ee	de
jf  e	de
jf ee	de
jf  dd fd	d
ZeedddZ  ZS )r   aY  
    Implements 3D Swin Transformer from the `"Video Swin Transformer" <https://arxiv.org/abs/2106.13230>`_ paper.
    Args:
        patch_size (List[int]): Patch size.
        embed_dim (int): Patch embedding dimension.
        depths (List(int)): Depth of each Swin Transformer layer.
        num_heads (List(int)): Number of attention heads in different layers.
        window_size (List[int]): Window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
        dropout (float): Dropout rate. Default: 0.0.
        attention_dropout (float): Attention dropout rate. Default: 0.0.
        stochastic_depth_prob (float): Stochastic depth rate. Default: 0.1.
        num_classes (int): Number of classes for classification head. Default: 400.
        norm_layer (nn.Module, optional): Normalization layer. Default: None.
        block (nn.Module, optional): SwinTransformer Block. Default: None.
        downsample_layer (nn.Module): Downsample layer (patch merging). Default: PatchMerging.
        patch_embed (nn.Module, optional): Patch Embedding layer. Default: None.
    g      @r=   皙?i  N.)r2   r   depthsrJ   r   	mlp_ratiorL   rK   stochastic_depth_probnum_classesr   blockdownsample_layerpatch_embedr    c                    s  t    t|  |
| _|d kr,tttd}|d krBttjdd}|d krNt	}||||d| _
tj|d| _g }t|}d}tt|D ]}g }|d|  }t|| D ]T |	t| |d  }||||| | fd	d
|D |||||td
 |d7 }q|tj|  |t|d k r|||| qtj| | _|dt|d   | _|| j| _td| _t| j|
| _|  D ]@}t|tjrtjj|jdd |jd k	rtj |j qd S )N)
attn_layergh㈵>)eps)r2   r   r   )rT   r   r   r)   c                    s$   g | ]} d  dkrdn|d  qS )r   r   r$   )r4   rE   Zi_layerr$   r%   r6     s     z.SwinTransformer3d.__init__.<locals>.<listcomp>)r   r   r   rL   rK   r   r   r   rt   ru   )!ri   rj   r   r   r   r   rg   r   Z	LayerNormr   r   ZDropoutpos_droprX   r"   rk   r?   appendZ
SequentialfeaturesZnum_featuresr   ZAdaptiveAvgPool3davgpoolrm   headmodules
isinstancerx   ry   r   rh   Zzeros_)rq   r2   r   r   rJ   r   r   rL   rK   r   r   r   r   r   r   ZlayersZtotal_stage_blocksZstage_block_idZi_stageZstagerS   Zsd_probmrr   r   r%   rj     s^    

zSwinTransformer3d.__init__r}   c                 C   s^   |  |}| |}| |}| |}|ddddd}| |}t|d}| |}|S )Nr   r;   r)   r   r
   )	r   r   r   r   r-   r   rY   r+   r   )rq   r9   r$   r$   r%   r     s    





zSwinTransformer3d.forward)r   r   r   r   r   r   r   r?   r   r   r   r   rj   r	   r   r   r$   r$   rr   r%   r   n  s8   L)
r2   r   r   rJ   r   r   weightsprogresskwargsr    c           
   	   K   sZ   |d k	rt |dt|jd  tf | |||||d|}	|d k	rV|	|j|dd |	S )Nr   
categories)r2   r   r   rJ   r   r   T)r   Z
check_hash)r   rk   metar   Zload_state_dictZget_state_dict)
r2   r   r   rJ   r   r   r   r   r   modelr$   r$   r%   _swin_transformer3d  s    
r   )r)   r)   r)   )r   Zmin_sizeZmin_temporal_sizec                   @   sH   e Zd Zedeedddddeddd	d
dddiddddZeZdS )r   z9https://download.pytorch.org/models/swin3d_t-7615ae03.pth   r      g
ףp=
?gv/?gCl?gZd;O?gy&1?g?Z	crop_sizeZresize_sizeZmeanrv   Fhttps://github.com/SwinTransformer/Video-Swin-Transformer#kinetics-400The weights were ported from the paper. The accuracies are estimated on video-level with parameters `frame_rate=15`, `clips_per_video=12`, and `clip_len=32`ivKinetics-400g(\mS@gK7aW@zacc@1zacc@5g7A`E@gnb^@ZrecipeZ_docsZ
num_paramsZ_metricsZ_ops
_file_sizeurlZ
transformsr   N	r   r   r   r   r   r   _COMMON_METAKINETICS400_V1DEFAULTr$   r$   r$   r%   r      s.   c                   @   sH   e Zd Zedeedddddeddd	d
dddiddddZeZdS )r   z9https://download.pytorch.org/models/swin3d_s-da41c237.pthr   r   r   r   r   r   r   if$r   gMbXS@g'1W@r   gҵT@gK7Ik@r   r   Nr   r$   r$   r$   r%   r     s.   c                   @   s   e Zd Zedeedddddeddd	d
dddiddddZedeedddddeddd	d
dddiddddZeZ	dS )r   z<https://download.pytorch.org/models/swin3d_b_1k-24f7c7c6.pthr   r   r   r   r   r   r   iX?r   gSS@gbX9W@r   gMbXa@g/$v@r   r   z=https://download.pytorch.org/models/swin3d_b_22k-7c6ae6fa.pthgx&iT@g~jW@N)
r   r   r   r   r   r   r   r   ZKINETICS400_IMAGENET22K_V1r   r$   r$   r$   r%   r   >  sZ   Z
pretrained)r   )r   r   )r   r   r   r    c                 K   sF   t | } tf dddgdddddgddddgdd	d	gd
| |d|S )a  
    Constructs a swin_tiny architecture from
    `Video Swin Transformer <https://arxiv.org/abs/2106.13230>`_.

    Args:
        weights (:class:`~torchvision.models.video.Swin3D_T_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.Swin3D_T_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.swin_transformer.SwinTransformer``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/swin_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.Swin3D_T_Weights
        :members:
    r   r;   r   rP   r
            rQ   r   r2   r   r   rJ   r   r   r   r   )r   verifyr   r   r   r   r$   r$   r%   r   x  s    


	c                 K   sF   t | } tf dddgdddddgddddgd	d
d
gd| |d|S )a  
    Constructs a swin_small architecture from
    `Video Swin Transformer <https://arxiv.org/abs/2106.13230>`_.

    Args:
        weights (:class:`~torchvision.models.video.Swin3D_S_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.Swin3D_S_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.swin_transformer.SwinTransformer``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/swin_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.Swin3D_S_Weights
        :members:
    r   r;   r      r
   rP   r   r   r   rQ   r   r   )r   r   r   r   r$   r$   r%   r     s    


	c                 K   sF   t | } tf dddgdddddgddddgdddgd	| |d
|S )a  
    Constructs a swin_base architecture from
    `Video Swin Transformer <https://arxiv.org/abs/2106.13230>`_.

    Args:
        weights (:class:`~torchvision.models.video.Swin3D_B_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.Swin3D_B_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.swin_transformer.SwinTransformer``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/swin_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.Swin3D_B_Weights
        :members:
    r   r;      r   r          rQ   r   r   )r   r   r   r   r$   r$   r%   r     s    


	)r=   r=   NNT)6	functoolsr   typingr   r   r   r   r   rY   Ztorch.nn.functionalr   Z
functionalrV   r	   Ztransforms._presetsr   utilsr   Z_apir   r   r   _metar   Z_utilsr   r   Zswin_transformerr   r   __all__r   r&   Zfxwrapr1   r8   rF   r?   r   rf   r   rg   r   r   r   r   r   r   r   r   r   r   r   r$   r$   r$   r%   <module>   s       0)     p[-n:$$$$