U
    h                  	   @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlZd dlZd dlmZ ddlmZmZ ddlmZ ddlmZ d	d
lmZmZmZ d	dlmZ d	dlmZm Z  dddddgZ!eG dd dZ"ee# e#dddZ$ej%e#e#eej%e#f dddZ&ej%e#e#e#ej%dddZ'ej()d ej()d G dd dej*Z+ej%e#ej%dd d!Z,ej%ej%ee#e#e#f ee#e#e#f ej%ej%ej%ej%d"d#d$Z-ej%ej%e.d%d&d'Z/ej()d$ ej()d' G d(d) d)ej*Z0G d*d+ d+ej*Z1G d,d- d-ej*Z2G d.d dej*Z3e	e" e4e
e e.ee3d/d0d1Z5G d2d deZ6G d3d deZ7e e d4e6j8fd5dd6d7e
e6 e.ee3d8d9dZ9e e d4e7j8fd5dd6d7e
e7 e.ee3d8d:dZ:dS );    N)	dataclass)partial)AnyCallableDictListOptionalSequenceTuple   )MLPStochasticDepth)VideoClassification)_log_api_usage_once   )register_modelWeightsWeightsEnum)_KINETICS400_CATEGORIES)_ovewrite_named_paramhandle_legacy_interfaceMViTMViT_V1_B_Weights	mvit_v1_bMViT_V2_S_Weights	mvit_v2_sc                   @   sV   e Zd ZU eed< eed< eed< ee ed< ee ed< ee ed< ee ed< dS )	MSBlockConfig	num_headsinput_channelsoutput_channelskernel_q	kernel_kvstride_q	stride_kvN)__name__
__module____qualname__int__annotations__r    r)   r)   O/var/www/html/venv/lib/python3.8/site-packages/torchvision/models/video/mvit.pyr      s   
r   )sreturnc                 C   s   d}| D ]}||9 }q|S N   r)   )r+   productvr)   r)   r*   _prod&   s    
r1   )x
target_dim
expand_dimr,   c                 C   s@   |   }||d kr | |} n||kr8td| j | |fS )Nr.   zUnsupported input dimension )dim	unsqueeze
ValueErrorshaper2   r3   r4   
tensor_dimr)   r)   r*   
_unsqueeze-   s    r;   )r2   r3   r4   r:   r,   c                 C   s   ||d kr|  |} | S r-   )squeezer9   r)   r)   r*   _squeeze6   s    
r=   c                       sl   e Zd Zd	ejeej eej edd fddZej	e
eeef e
ej	e
eeef f dddZ  ZS )
PoolNF)poolnorm
activationnorm_before_poolr,   c                    sV   t    || _g }|d k	r&|| |d k	r8|| |rFtj| nd | _|| _d S )N)super__init__r?   appendnn
Sequentialnorm_actrB   )selfr?   r@   rA   rB   layers	__class__r)   r*   rD   A   s    


zPool.__init__r2   thwr,   c                 C   s   t |dd\}}tj|ddd\}}|dd}|jd d \}}}||| |f|  }| jrx| jd k	rx| |}| 	|}|jdd  \}}	}
||||ddd}tj
||fdd}| js| jd k	r| |}t|dd|}|||	|
ffS )	N   r.   r.   r   )indicesr5   r   r5   )r;   torchZtensor_split	transposer8   reshape
contiguousrB   rH   r?   catr=   )rI   r2   rN   r:   class_tokenBNCTHWr)   r)   r*   forwardR   s    


zPool.forward)NF)r$   r%   r&   rF   Moduler   boolrD   rT   Tensorr
   r'   r`   __classcell__r)   r)   rK   r*   r>   @   s     r>   )	embeddingdr,   c                 C   s@   | j d |kr| S tjj| ddd|dddddS )Nr   r.   Zlinear)sizemode)r8   rF   Z
functionalZinterpolatepermuter6   r<   )re   rf   r)   r)   r*   _interpolatel   s     rj   )attnqq_thwk_thw	rel_pos_h	rel_pos_w	rel_pos_tr,   c           %      C   s6  |\}}}	|\}
}}t dt|| d }t dt|	| d }t dt||
 d }t|| d}t|| d}t|d d d f | t|d d d f d|  |  }t||	 d}t|	| d}t|	d d d f | t|d d d f d|  |  }t|
| d}t||
 d}t|d d d f | t|
d d d f d|
  |  }t||}t||}t||}||  }||  }||  }|j\}}}}|d d d d dd f |||||	|} td| |}!td| |}"| 	dddddd	||| | |	 |} t
| |dddd}#|#||||	||
	dddddd	}#|!d d d d d d d d d d d d d d f |"d d d d d d d d d d d d d d f  |#d d d d d d d d d d d d d d f  |||| |	 |
| | }$| d d d d dd dd f  |$7  < | S )
Nr   r.         ?zbythwc,hkc->bythwkzbythwc,wkc->bythwkr   r   rO      )r'   maxrT   Zarangerj   longr8   rV   Zeinsumri   matmulrU   view)%rk   rl   rm   rn   ro   rp   rq   Zq_tZq_hZq_wZk_tZk_hZk_wZdhZdwdtZ	q_h_ratioZ	k_h_ratioZdist_hZ	q_w_ratioZ	k_w_ratioZdist_wZ	q_t_ratioZ	k_t_ratioZdist_tZRhZRwZRtrZ   Zn_head_r5   Zr_qZrel_h_qZrel_w_qZrel_q_tZrel_posr)   r)   r*   _add_rel_pos{   sP    


<<<


**$...  
 
(rz   r2   Zshortcutresidual_with_cls_embedc              	   C   sX   |r|  | nD| d d d d dd d d f  |d d d d dd d d f 7  < | S r-   )add_r{   r)   r)   r*   _add_shortcut   s    Dr~   c                       s   e Zd Zdejfee eeeee ee ee ee eeeee	dej
f dd fddZejeeeef eejeeeef f ddd	Z  ZS )
MultiscaleAttention        .N)
input_size	embed_dim
output_dimr   r    r!   r"   r#   residual_poolr|   rel_pos_embeddropout
norm_layerr,   c              
      st  t    || _|| _|| _|| | _dt| j | _|	| _	|
| _
t|d| | _t||g}|dkr|tj|dd tj| | _d | _t|dkst|dkrdd |D }ttj| j| j|||| jd	d
|| j| _d | _d | _t|dkst|dkrrdd |D }ttj| j| j|||| jd	d
|| j| _ttj| j| j|||| jd	d
|| j| _d | _d | _d | _|rpt|dd  }t|dkr||d  n|}t|dkr||d  n|}dt|| d }d|d  d }tt|| j| _tt|| j| _tt|| j| _tj j!| jdd tj j!| jdd tj j!| jdd d S )Nrr   r   r   Tinplacer.   c                 S   s   g | ]}t |d  qS r   r'   ).0rl   r)   r)   r*   
<listcomp>   s     z0MultiscaleAttention.__init__.<locals>.<listcomp>F)stridepaddinggroupsbiasc                 S   s   g | ]}t |d  qS r   r   )r   kvr)   r)   r*   r      s     r   r   {Gz?std)"rC   rD   r   r   r   head_dimmathsqrtscalerr   r|   rF   LinearqkvrE   DropoutrG   projectpool_qr1   r>   Conv3dpool_kpool_vro   rp   rq   rt   len	ParameterrT   zerosinittrunc_normal_)rI   r   r   r   r   r    r!   r"   r#   r   r|   r   r   r   rJ   Z	padding_qZ
padding_kvrg   Zq_sizeZkv_sizeZspatial_dimZtemporal_dimrK   r)   r*   rD      s    

			zMultiscaleAttention.__init__rM   c                 C   s<  |j \}}}| |||d| j| jddjdd\}}}| jd k	rZ| ||\}}	n|}	| jd k	rx| ||d }| j	d k	r| 	||\}}t
| j| |dd}
| jd k	r| jd k	r| jd k	rt|
|||	| j| j| j}
|
jdd}
t
|
|}| jrt||| j |dd|d| j}| |}||fS )Nr   r.   r   rS   r   rR   )r8   r   rV   r   r   rU   Zunbindr   r   r   rT   rv   r   ro   rp   rq   rz   Zsoftmaxr   r~   r|   r   r   )rI   r2   rN   rZ   r[   r\   rl   kr0   rn   rk   r)   r)   r*   r`      s6    2


	
zMultiscaleAttention.forward)r$   r%   r&   rF   	LayerNormr   r'   rb   floatr   ra   rD   rT   rc   r
   r`   rd   r)   r)   rK   r*   r      s&   \r   c                       s~   e Zd Zddejfee eeeeee	e	e
dejf dd
 fddZejeeeef eejeeeef f ddd	Z  ZS )
MultiscaleBlockr   .N)
r   cnfr   r|   r   proj_after_attnr   stochastic_depth_probr   r,   c
                    s  t    || _d | _t|jdkr\dd |jD }
dd |
D }ttj|
|j|dd | _|rf|j	n|j
}|	|j
| _|	|| _t| jtj| _t||j
||j|j|j|j|j|||||	d| _t|d| |j	gtj|d d| _t|d	| _d | _|j
|j	krt|j
|j	| _d S )
Nr.   c                 S   s    g | ]}|d kr|d  n|qS rP   r)   )r   r+   r)   r)   r*   r   U  s     z,MultiscaleBlock.__init__.<locals>.<listcomp>c                 S   s   g | ]}t |d  qS r   r   )r   r   r)   r)   r*   r   V  s     )r   r   )	r    r!   r"   r#   r   r   r|   r   r   rO   )Zactivation_layerr   r   row)rC   rD   r   	pool_skipr1   r"   r>   rF   Z	MaxPool3dr   r   norm1norm2
isinstanceZBatchNorm1dneeds_transposalr   r   r    r!   r#   rk   r   ZGELUmlpr   stochastic_depthr   r   )rI   r   r   r   r|   r   r   r   r   r   Zkernel_skipZpadding_skipZattn_dimrK   r)   r*   rD   D  sP    
 
zMultiscaleBlock.__init__rM   c           	      C   s   | j r | |ddddn| |}| ||\}}| jd ksJ| jsN|n| |}| jd krf|n| ||d }|| | }| j r| |ddddn| |}| jd ks| jr|n| |}|| | 	| |fS )Nr.   r   r   )
r   r   rU   rk   r   r   r   r   r   r   )	rI   r2   rN   Zx_norm1Zx_attnZthw_newZx_skipZx_norm2Zx_projr)   r)   r*   r`   ~  s    **zMultiscaleBlock.forward)r$   r%   r&   rF   r   r   r'   r   rb   r   r   ra   rD   rT   rc   r
   r`   rd   r)   r)   rK   r*   r   C  s    	:r   c                       sF   e Zd Zeeeef eedd fddZejejdddZ	  Z
S )PositionalEncodingN)
embed_sizespatial_sizetemporal_sizer   r,   c                    s   t    || _|| _tt|| _d | _	d | _
d | _|stt| jd | jd  || _	tt| j|| _
tt|| _d S )Nr   r.   )rC   rD   r   r   rF   r   rT   r   rY   spatial_postemporal_pos	class_pos)rI   r   r   r   r   rK   r)   r*   rD     s    
$zPositionalEncoding.__init__r2   r,   c                 C   s   | j |ddd}tj||fdd}| jd k	r| jd k	r| jd k	r| jj	\}}tj
| j|dd}|| jd| jddd| tj| jd|fddd}|| |S )Nr   rR   r.   rS   )rY   expandrg   r6   rT   rX   r   r   r   r8   Zrepeat_interleaver}   r   rV   )rI   r2   rY   Zhw_sizer   Zpos_embeddingr)   r)   r*   r`     s    & 
zPositionalEncoding.forward)r$   r%   r&   r'   r
   rb   rD   rT   rc   r`   rd   r)   r)   rK   r*   r     s   "r   c                       s   e Zd Zdeeef eee eeeeeeeee	e
dejf  e	e
dejf  eeeef eeeef eeeef dd	 fd
dZejejdddZ  ZS )r         ?r     Nr      r   r   rO   rO   r.   r   r   .)r   r   block_settingr   r|   r   r   r   attention_dropoutr   num_classesblockr   patch_embed_kernelpatch_embed_stridepatch_embed_paddingr,   c                    s*  t    t|  t|}|dkr*td|dkr6t}|dkrLttjdd}tj	d|d j
|||d| _dd	 t|f| | jjD }t|d j
|d
 |d f|d |d| _t | _t|D ]^\}}|
| |d  }| j||||||||	||d	 t|jdkrdd	 t||jD }q||d j| _ttj|ddt|d j|| _|  D ]}t|tjrtjj|jdd t|tjr"|j dk	r"tj!|j d ntt|tjr|jdk	rtj!|jd |j dk	r"tj!|j d n,t|tr^|" D ]}tjj|dd q
q^dS )a  
        MViT main class.

        Args:
            spatial_size (tuple of ints): The spacial size of the input as ``(H, W)``.
            temporal_size (int): The temporal size ``T`` of the input.
            block_setting (sequence of MSBlockConfig): The Network structure.
            residual_pool (bool): If True, use MViTv2 pooling residual connection.
            residual_with_cls_embed (bool): If True, the addition on the residual connection will include
                the class embedding.
            rel_pos_embed (bool): If True, use MViTv2's relative positional embeddings.
            proj_after_attn (bool): If True, apply the projection after the attention.
            dropout (float): Dropout rate. Default: 0.0.
            attention_dropout (float): Attention dropout rate. Default: 0.0.
            stochastic_depth_prob: (float): Stochastic depth rate. Default: 0.0.
            num_classes (int): The number of classes.
            block (callable, optional): Module specifying the layer which consists of the attention and mlp.
            norm_layer (callable, optional): Module specifying the normalization layer to use.
            patch_embed_kernel (tuple of ints): The kernel of the convolution that patchifies the input.
            patch_embed_stride (tuple of ints): The stride of the convolution that patchifies the input.
            patch_embed_padding (tuple of ints): The padding of the convolution that patchifies the input.
        r   z+The configuration parameter can't be empty.Ngư>)epsr   )Zin_channelsZout_channelsZkernel_sizer   r   c                 S   s   g | ]\}}|| qS r)   r)   r   rg   r   r)   r)   r*   r     s     z!MViT.__init__.<locals>.<listcomp>r.   r   )r   r   r   r   rr   )	r   r   r   r|   r   r   r   r   r   c                 S   s   g | ]\}}|| qS r)   r)   r   r)   r)   r*   r   
  s     rR   Tr   r   r   r   )#rC   rD   r   r   r7   r   r   rF   r   r   r   	conv_projzipr   r   pos_encodingZ
ModuleListblocks	enumeraterE   r"   r   r@   rG   r   r   headmodulesr   r   r   weightr   Z	constant_
parameters)rI   r   r   r   r   r|   r   r   r   r   r   r   r   r   r   r   r   Ztotal_stage_blocksr   Zstage_block_idr   Zsd_probmweightsrK   r)   r*   rD     sv    )

zMViT.__init__r   c                 C   s   t |ddd }| |}|ddd}| |}| jjf| jj }| jD ]}|||\}}qN| |}|d d df }| 	|}|S )Nrs   r   r   r.   )
r;   r   flattenrU   r   r   r   r   r@   r   )rI   r2   rN   r   r)   r)   r*   r`   !  s    




zMViT.forward)	r   r   r   r   NNr   r   r   )r$   r%   r&   r
   r'   r	   r   rb   r   r   r   rF   ra   rD   rT   rc   r`   rd   r)   r)   rK   r*   r     s:   
         
x)r   r   r   progresskwargsr,   c                 K   s   |d k	rbt |dt|jd  |jd d |jd d ks>tt |d|jd  t |d|jd  |dd	}|dd
}tf ||| |dd|dd|dd|dd|d|}|d k	r||j|dd |S )Nr   
categoriesmin_sizer   r.   r   r   min_temporal_size   r      r   Fr|   Tr   r   )r   r   r   r   r|   r   r   r   )r   Z
check_hash)r   r   metaAssertionErrorpopr   Zload_state_dictZget_state_dict)r   r   r   r   r   r   r   modelr)   r)   r*   _mvit8  s,     



	r   c                   @   sJ   e Zd Zedeedddddddedd	d
ddddiddd	dZeZdS )r   z:https://download.pytorch.org/models/mvit_v1_b-dbeb1030.pthr      ?r   r   ?r   r   Z	crop_sizeZresize_sizeZmeanr   r   zShttps://github.com/facebookresearch/pytorchvideo/blob/main/docs/source/model_zoo.mdThe weights were ported from the paper. The accuracies are estimated on video-level with parameters `frame_rate=7.5`, `clips_per_video=5`, and `clip_len=16`ip.Kinetics-400gJ+S@gh|?eW@zacc@1zacc@5guVQ@g rxa@	r   r   r   ZrecipeZ_docsZ
num_paramsZ_metricsZ_ops
_file_sizeurlZ
transformsr   N	r$   r%   r&   r   r   r   r   KINETICS400_V1DEFAULTr)   r)   r)   r*   r   Y  s2   c                   @   sJ   e Zd Zedeedddddddedd	d
ddddiddd	dZeZdS )r   z:https://download.pytorch.org/models/mvit_v2_s-ae3be167.pthr   r   r   r   r   r   zChttps://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.mdr   ir   g r0T@g(\W@r   guVP@g?5^I|`@r   r   Nr   r)   r)   r)   r*   r   z  s2   Z
pretrained)r   T)r   r   )r   r   r   r,   c                 K   sz  t | } ddddddddddddddddgddddddddddddddddgddddddddddddddddgg d	d	d	gg d	d	d	gg g g g g g g g g g d	d	d	gg gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	ggg dddgg dddgg g g g g g g g g g dddgg gdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddggd
}g }tt|d D ]Z}|t|d | |d | |d | |d | |d | |d | |d | d
 qtf dd|dd|dd| |d|S )a  
    Constructs a base MViTV1 architecture from
    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__.

    .. betastatus:: video module

    Args:
        weights (:class:`~torchvision.models.video.MViT_V1_B_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MViT_V1_B_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MViT_V1_B_Weights
        :members:
    r.   r   rO      `           r   r   r   r   r    r!   r"   r#   r   r   r   r    r!   r"   r#   r   r   Fr   皙?)r   r   r   r   r|   r   r   r   )r   verifyranger   rE   r   r   r   r   r   r   configr   ir)   r)   r*   r     s    
"""44,







	c                 K   s  t | } ddddddddddddddddgddddddddddddddddgddddddddddddddddgd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	ggd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	gd	d	d	ggdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddggdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddgdddggd
}g }tt|d D ]Z}|t|d | |d | |d | |d | |d | |d | |d | d
 qtf dd|dddd|dd| |d
|S )aC  Constructs a small MViTV2 architecture from
    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__ and
    `MViTv2: Improved Multiscale Vision Transformers for Classification
    and Detection <https://arxiv.org/abs/2112.01526>`__.

    .. betastatus:: video module

    Args:
        weights (:class:`~torchvision.models.video.MViT_V2_S_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MViT_V2_S_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MViT_V2_S_Weights
            :members:
    r.   r   rO   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r   r   TFr   r   )
r   r   r   r   r|   r   r   r   r   r   )r   r   r   r   rE   r   r   r   r  r)   r)   r*   r     s    
"""N







);r   dataclassesr   	functoolsr   typingr   r   r   r   r   r	   r
   rT   Ztorch.fxZtorch.nnrF   Zopsr   r   Ztransforms._presetsr   utilsr   Z_apir   r   r   _metar   Z_utilsr   r   __all__r   r'   r1   rc   r;   r=   Zfxwrapra   r>   rj   rz   rb   r~   r   r   r   r   r   r   r   r   r   r   r   r)   r)   r)   r*   <module>   sx   $	
 	,< H !!!$`