import math
from collections import OrderedDict
from functools import partial
from typing import Any, Callable, List, Optional, Sequence, Tuple

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn, Tensor
from torchvision.models._api import register_model, Weights, WeightsEnum
from torchvision.models._meta import _IMAGENET_CATEGORIES
from torchvision.models._utils import _ovewrite_named_param, handle_legacy_interface
from torchvision.ops.misc import Conv2dNormActivation, SqueezeExcitation
from torchvision.ops.stochastic_depth import StochasticDepth
from torchvision.transforms._presets import ImageClassification, InterpolationMode
from torchvision.utils import _log_api_usage_once

__all__ = [
    "MaxVit",
    "MaxVit_T_Weights",
    "maxvit_t",
]


def _get_conv_output_shape(input_size: Tuple[int, int], kernel_size: int, stride: int, padding: int) -> Tuple[int, int]:
    return (
        (input_size[0] - kernel_size + 2 * padding) // stride + 1,
        (input_size[1] - kernel_size + 2 * padding) // stride + 1,
    )


def _make_block_input_shapes(input_size: Tuple[int, int], n_blocks: int) -> List[Tuple[int, int]]:
    """Util function to check that the input size is correct for a MaxVit configuration."""
    shapes = []
    block_input_shape = _get_conv_output_shape(input_size, 3, 2, 1)
    for _ in range(n_blocks):
        block_input_shape = _get_conv_output_shape(block_input_shape, 3, 2, 1)
        shapes.append(block_input_shape)
    return shapes


def _get_relative_position_index(height: int, width: int) -> torch.Tensor:
    coords = torch.stack(torch.meshgrid([torch.arange(height), torch.arange(width)]))
    coords_flat = torch.flatten(coords, 1)
    relative_coords = coords_flat[:, :, None] - coords_flat[:, None, :]
    relative_coords = relative_coords.permute(1, 2, 0).contiguous()
    relative_coords[:, :, 0] += height - 1
    relative_coords[:, :, 1] += width - 1
    relative_coords[:, :, 0] *= 2 * width - 1
    return relative_coords.sum(-1)
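
# Worked example of the shape helpers above (illustrative numbers, assuming the
# maxvit_t defaults of a 224x224 input and four blocks): the stem halves the
# resolution once and every block halves it again, so
# _make_block_input_shapes((224, 224), 4) yields
# [(56, 56), (28, 28), (14, 14), (7, 7)] -- each of which must be divisible by
# the partition size (7) for window/grid partitioning to be well defined.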


class MBConv(nn.Module):
    """MBConv: Mobile Inverted Residual Bottleneck.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        expansion_ratio (float): Expansion ratio in the bottleneck.
        squeeze_ratio (float): Squeeze ratio in the SE Layer.
        stride (int): Stride of the depthwise convolution.
        activation_layer (Callable[..., nn.Module]): Activation function.
        norm_layer (Callable[..., nn.Module]): Normalization function.
        p_stochastic_dropout (float): Probability of stochastic depth.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        expansion_ratio: float,
        squeeze_ratio: float,
        stride: int,
        activation_layer: Callable[..., nn.Module],
        norm_layer: Callable[..., nn.Module],
        p_stochastic_dropout: float = 0.0,
    ) -> None:
        super().__init__()

        proj: Sequence[nn.Module]
        self.proj: nn.Module

        should_proj = stride != 1 or in_channels != out_channels
        if should_proj:
            proj = [nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=True)]
            if stride == 2:
                proj = [nn.AvgPool2d(kernel_size=3, stride=stride, padding=1)] + proj
            self.proj = nn.Sequential(*proj)
        else:
            self.proj = nn.Identity()

        mid_channels = int(out_channels * expansion_ratio)
        sqz_channels = int(out_channels * squeeze_ratio)

        if p_stochastic_dropout:
            self.stochastic_depth = StochasticDepth(p_stochastic_dropout, mode="row")
        else:
            self.stochastic_depth = nn.Identity()

        _layers = OrderedDict()
        _layers["pre_norm"] = norm_layer(in_channels)
        _layers["conv_a"] = Conv2dNormActivation(
            in_channels, mid_channels, kernel_size=1, stride=1, padding=0,
            activation_layer=activation_layer, norm_layer=norm_layer, inplace=None,
        )
        _layers["conv_b"] = Conv2dNormActivation(
            mid_channels, mid_channels, kernel_size=3, stride=stride, padding=1,
            activation_layer=activation_layer, norm_layer=norm_layer, groups=mid_channels, inplace=None,
        )
        _layers["squeeze_excitation"] = SqueezeExcitation(mid_channels, sqz_channels, activation=nn.SiLU)
        _layers["conv_c"] = nn.Conv2d(in_channels=mid_channels, out_channels=out_channels, kernel_size=1, bias=True)

        self.layers = nn.Sequential(_layers)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Input tensor with expected layout of [B, C, H, W].
        Returns:
            Tensor: Output tensor with expected layout of [B, C, H / stride, W / stride].
        """
        res = self.proj(x)
        x = self.stochastic_depth(self.layers(x))
        return res + x
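
# The residual path above is pre-normalized: the main branch computes
# conv_c(SE(conv_b(conv_a(pre_norm(x))))) and, with stride 2, the shortcut is
# AvgPool2d -> 1x1 Conv2d so that both branches agree on shape. For example
# (illustrative numbers), with in_channels=64, out_channels=64,
# expansion_ratio=4, and squeeze_ratio=0.25, the bottleneck runs at 256
# channels and the SE block squeezes to 16.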
dZ	  Z
S )$RelativePositionalMultiHeadAttentionzRelative Positional Multi-Head Attention.

    Args:
        feat_dim (int): Number of input features.
        head_dim (int): Number of features per head.
        max_seq_len (int): Maximum sequence length.
    N)feat_dimhead_dimmax_seq_lenr   c                    s   t    || dkr*td| d| || | _|| _tt|| _|| _	t
|| j| j d | _|d | _t
| j| j || _t
jtjd| j d d| j d  | jftjd| _| d	t| j| j tj
jj| jd
d d S )Nr   z
feat_dim: z  must be divisible by head_dim: r&   g      r    r!   )Zdtyperelative_position_index{Gz?Zstd)rE   rF   
ValueErrorn_headsr_   rK   mathsqrtsizer`   r
   Linearto_qkvscale_factormergeZ	parameter	Parameterr.   emptyZfloat32relative_position_bias_tableZregister_bufferr4   initZtrunc_normal_)rN   r^   r_   r`   rO   r"   r#   rF      s    


,z-RelativePositionalMultiHeadAttention.__init__r   c                 C   s@   | j d}| j| | j| jd}|ddd }|dS )Nr-   r    r   r!   )ra   viewro   r`   r1   r2   Z	unsqueeze)rN   Z
bias_indexZrelative_biasr"   r"   r#   get_relative_positional_bias   s    zARelativePositionalMultiHeadAttention.get_relative_positional_biasrQ   c                 C   s  |j \}}}}| j| j }}| |}tj|ddd\}	}
}|	|||||ddddd}	|
|||||ddddd}
||||||ddddd}|
| j }
t	d|	|
}| 
 }tj|| dd}t	d	||}|ddddd||||}| |}|S )
z
        Args:
            x (Tensor): Input tensor with expected layout of [B, G, P, D].
        Returns:
            Tensor: Output tensor with expected layout of [B, G, P, D].
        r&   r-   )dimr   r!   r       z!B G H I D, B G H J D -> B G H I Jz!B G H I J, B G H J D -> B G H I D)shapere   r_   rj   r.   chunkreshaper1   rk   Zeinsumrs   FZsoftmaxrl   )rN   rR   BGPDHZDHZqkvqkvZdot_prodZpos_biasoutr"   r"   r#   rU      s    
   

z,RelativePositionalMultiHeadAttention.forward)rV   rW   rX   rY   rK   rF   r.   r   rs   rU   r\   r"   r"   rO   r#   r]      s   
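
# Sizing of the relative-position machinery above, with illustrative numbers:
# for a 7x7 partition, max_seq_len = 49, so self.size = 7 and the bias table
# holds (2*7 - 1)**2 = 169 rows, one per possible (dy, dx) offset between two
# cells of the window; relative_position_index maps each of the 49*49
# query/key pairs to one of those rows.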


class SwapAxes(nn.Module):
    """Permute the axes of a tensor."""

    def __init__(self, a: int, b: int) -> None:
        super().__init__()
        self.a = a
        self.b = b

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        res = torch.swapaxes(x, self.a, self.b)
        return res


class WindowPartition(nn.Module):
    """
    Partition the input tensor into non-overlapping windows.
    """

    def __init__(self) -> None:
        super().__init__()

    def forward(self, x: Tensor, p: int) -> Tensor:
        """
        Args:
            x (Tensor): Input tensor with expected layout of [B, C, H, W].
            p (int): Number of partitions.
        Returns:
            Tensor: Output tensor with expected layout of [B, H/P, W/P, P*P, C].
        """
        B, C, H, W = x.shape
        P = p
        # chunk up H and W dimensions
        x = x.reshape(B, C, H // P, P, W // P, P)
        x = x.permute(0, 2, 4, 3, 5, 1)
        # collapse P * P dimension
        x = x.reshape(B, (H // P) * (W // P), P * P, C)
        return x
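
# Shape walk-through for the partition/departition pair (illustrative numbers,
# matching the second maxvit_t block): a [B, 128, 28, 28] feature map with
# p = 7 partitions into [B, 16, 49, 128] (16 windows of 7*7 = 49 tokens), and
# WindowDepartition()(x, 7, 4, 4) restores [B, 128, 28, 28]. Grid attention
# reuses the same two ops with an extra SwapAxes(-2, -3) around them.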


class WindowDepartition(nn.Module):
    """
    Departition the input tensor of non-overlapping windows into a feature volume of layout [B, C, H, W].
    """

    def __init__(self) -> None:
        super().__init__()

    def forward(self, x: Tensor, p: int, h_partitions: int, w_partitions: int) -> Tensor:
        """
        Args:
            x (Tensor): Input tensor with expected layout of [B, (H/P * W/P), P*P, C].
            p (int): Number of partitions.
            h_partitions (int): Number of vertical partitions.
            w_partitions (int): Number of horizontal partitions.
        Returns:
            Tensor: Output tensor with expected layout of [B, C, H, W].
        """
        B, G, PP, C = x.shape
        P = p
        HP, WP = h_partitions, w_partitions
        # split P * P dimension into 2 P tile dimensions
        x = x.reshape(B, HP, WP, P, P, C)
        # permute into B, C, HP, P, WP, P
        x = x.permute(0, 5, 1, 3, 2, 4)
        # reshape into B, C, H, W
        x = x.reshape(B, C, HP * P, WP * P)
        return x


class PartitionAttentionLayer(nn.Module):
    """
    Layer for partitioning the input tensor into non-overlapping windows and applying attention to each window.

    Args:
        in_channels (int): Number of input channels.
        head_dim (int): Dimension of each attention head.
        partition_size (int): Size of the partitions.
        partition_type (str): Type of partitioning to use. Can be either "grid" or "window".
        grid_size (Tuple[int, int]): Size of the grid to partition the input tensor into.
        mlp_ratio (int): Ratio of the feature size expansion in the MLP layer.
        activation_layer (Callable[..., nn.Module]): Activation function to use.
        norm_layer (Callable[..., nn.Module]): Normalization function to use.
        attention_dropout (float): Dropout probability for the attention layer.
        mlp_dropout (float): Dropout probability for the MLP layer.
        p_stochastic_dropout (float): Probability of dropping out a partition.
    """

    def __init__(
        self,
        in_channels: int,
        head_dim: int,
        partition_size: int,
        partition_type: str,
        # the grid size needs to be known at initialization time because
        # it determines how many relative offsets exist in the grid
        grid_size: Tuple[int, int],
        mlp_ratio: int,
        activation_layer: Callable[..., nn.Module],
        norm_layer: Callable[..., nn.Module],
        attention_dropout: float,
        mlp_dropout: float,
        p_stochastic_dropout: float,
    ) -> None:
        super().__init__()

        self.n_heads = in_channels // head_dim
        self.head_dim = head_dim
        self.n_partitions = grid_size[0] // partition_size
        self.partition_type = partition_type
        self.grid_size = grid_size

        if partition_type not in ["grid", "window"]:
            raise ValueError("partition_type must be either 'grid' or 'window'")

        if partition_type == "window":
            self.p, self.g = partition_size, self.n_partitions
        else:
            self.p, self.g = self.n_partitions, partition_size

        self.partition_op = WindowPartition()
        self.departition_op = WindowDepartition()
        self.partition_swap = SwapAxes(-2, -3) if partition_type == "grid" else nn.Identity()
        self.departition_swap = SwapAxes(-2, -3) if partition_type == "grid" else nn.Identity()

        self.attn_layer = nn.Sequential(
            norm_layer(in_channels),
            # the sequence length is always partition_size ** 2 because
            # of the axis swap in the case of grid partitioning
            RelativePositionalMultiHeadAttention(in_channels, head_dim, partition_size**2),
            nn.Dropout(attention_dropout),
        )

        # pre-normalization similar to transformer layers
        self.mlp_layer = nn.Sequential(
            nn.LayerNorm(in_channels),
            nn.Linear(in_channels, in_channels * mlp_ratio),
            activation_layer(),
            nn.Linear(in_channels * mlp_ratio, in_channels),
            nn.Dropout(mlp_dropout),
        )

        self.stochastic_dropout = StochasticDepth(p_stochastic_dropout, mode="row")

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Input tensor with expected layout of [B, C, H, W].
        Returns:
            Tensor: Output tensor with expected layout of [B, C, H, W].
        """
        gh, gw = self.grid_size[0] // self.p, self.grid_size[1] // self.p
        torch._assert(
            self.grid_size[0] % self.p == 0 and self.grid_size[1] % self.p == 0,
            "Grid size must be divisible by partition size. Got grid size of {} and partition size of {}".format(
                self.grid_size, self.p
            ),
        )

        x = self.partition_op(x, self.p)
        x = self.partition_swap(x)
        x = x + self.stochastic_dropout(self.attn_layer(x))
        x = x + self.stochastic_dropout(self.mlp_layer(x))
        x = self.departition_swap(x)
        x = self.departition_op(x, self.p, gh, gw)

        return x
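
# Note on the window/grid duality above: both attention types share the same
# partition ops. With partition_type="window", p is the partition size and
# tokens attend within a local p x p window; with partition_type="grid", p and
# g are swapped and the SwapAxes(-2, -3) pair makes tokens attend across a
# dilated p x p grid that spans the whole feature map, giving sparse global
# mixing at the same cost as local window attention.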


class MaxVitLayer(nn.Module):
    """
    MaxVit layer consisting of a MBConv layer followed by a PartitionAttentionLayer with `window` and a PartitionAttentionLayer with `grid`.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        expansion_ratio (float): Expansion ratio in the bottleneck.
        squeeze_ratio (float): Squeeze ratio in the SE Layer.
        stride (int): Stride of the depthwise convolution.
        activation_layer (Callable[..., nn.Module]): Activation function.
        norm_layer (Callable[..., nn.Module]): Normalization function.
        head_dim (int): Dimension of the attention heads.
        mlp_ratio (int): Ratio of the MLP layer.
        mlp_dropout (float): Dropout probability for the MLP layer.
        attention_dropout (float): Dropout probability for the attention layer.
        p_stochastic_dropout (float): Probability of stochastic depth.
        partition_size (int): Size of the partitions.
        grid_size (Tuple[int, int]): Size of the input feature grid.
    """

    def __init__(
        self,
        # conv parameters
        in_channels: int,
        out_channels: int,
        squeeze_ratio: float,
        expansion_ratio: float,
        stride: int,
        # conv + transformer parameters
        norm_layer: Callable[..., nn.Module],
        activation_layer: Callable[..., nn.Module],
        # transformer parameters
        head_dim: int,
        mlp_ratio: int,
        mlp_dropout: float,
        attention_dropout: float,
        p_stochastic_dropout: float,
        # partitioning parameters
        partition_size: int,
        grid_size: Tuple[int, int],
    ) -> None:
        super().__init__()

        layers: OrderedDict = OrderedDict()

        # convolutional layer
        layers["MBconv"] = MBConv(
            in_channels=in_channels, out_channels=out_channels, expansion_ratio=expansion_ratio,
            squeeze_ratio=squeeze_ratio, stride=stride, activation_layer=activation_layer,
            norm_layer=norm_layer, p_stochastic_dropout=p_stochastic_dropout,
        )
        # attention layers: window (block) attention followed by grid attention
        layers["window_attention"] = PartitionAttentionLayer(
            in_channels=out_channels, head_dim=head_dim, partition_size=partition_size,
            partition_type="window", grid_size=grid_size, mlp_ratio=mlp_ratio,
            activation_layer=activation_layer, norm_layer=nn.LayerNorm,
            attention_dropout=attention_dropout, mlp_dropout=mlp_dropout,
            p_stochastic_dropout=p_stochastic_dropout,
        )
        layers["grid_attention"] = PartitionAttentionLayer(
            in_channels=out_channels, head_dim=head_dim, partition_size=partition_size,
            partition_type="grid", grid_size=grid_size, mlp_ratio=mlp_ratio,
            activation_layer=activation_layer, norm_layer=nn.LayerNorm,
            attention_dropout=attention_dropout, mlp_dropout=mlp_dropout,
            p_stochastic_dropout=p_stochastic_dropout,
        )
        self.layers = nn.Sequential(layers)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Input tensor of shape (B, C, H, W).
        Returns:
            Tensor: Output tensor of shape (B, C, H, W).
        """
        x = self.layers(x)
        return x


class MaxVitBlock(nn.Module):
    """
    A MaxVit block consisting of `n_layers` MaxVit layers.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        expansion_ratio (float): Expansion ratio in the bottleneck.
        squeeze_ratio (float): Squeeze ratio in the SE Layer.
        activation_layer (Callable[..., nn.Module]): Activation function.
        norm_layer (Callable[..., nn.Module]): Normalization function.
        head_dim (int): Dimension of the attention heads.
        mlp_ratio (int): Ratio of the MLP layer.
        mlp_dropout (float): Dropout probability for the MLP layer.
        attention_dropout (float): Dropout probability for the attention layer.
        partition_size (int): Size of the partitions.
        input_grid_size (Tuple[int, int]): Size of the input feature grid.
        n_layers (int): Number of layers in the block.
        p_stochastic (List[float]): List of probabilities for stochastic depth for each layer.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        squeeze_ratio: float,
        expansion_ratio: float,
        norm_layer: Callable[..., nn.Module],
        activation_layer: Callable[..., nn.Module],
        head_dim: int,
        mlp_ratio: int,
        mlp_dropout: float,
        attention_dropout: float,
        partition_size: int,
        input_grid_size: Tuple[int, int],
        n_layers: int,
        p_stochastic: List[float],
    ) -> None:
        super().__init__()
        if not len(p_stochastic) == n_layers:
            raise ValueError(f"p_stochastic must have length n_layers={n_layers}, got p_stochastic={p_stochastic}.")

        self.layers = nn.ModuleList()
        # account for the first stride of the first layer
        self.grid_size = _get_conv_output_shape(input_grid_size, kernel_size=3, stride=2, padding=1)

        for idx, p in enumerate(p_stochastic):
            stride = 2 if idx == 0 else 1
            self.layers += [
                MaxVitLayer(
                    in_channels=in_channels if idx == 0 else out_channels, out_channels=out_channels,
                    squeeze_ratio=squeeze_ratio, expansion_ratio=expansion_ratio, stride=stride,
                    norm_layer=norm_layer, activation_layer=activation_layer, head_dim=head_dim,
                    mlp_ratio=mlp_ratio, mlp_dropout=mlp_dropout, attention_dropout=attention_dropout,
                    partition_size=partition_size, grid_size=self.grid_size, p_stochastic_dropout=p,
                ),
            ]

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Input tensor of shape (B, C, H, W).
        Returns:
            Tensor: Output tensor of shape (B, C, H, W).
        """
        for layer in self.layers:
            x = layer(x)
        return x
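
# Stochastic-depth schedule across blocks (illustrative numbers for maxvit_t):
# block_layers = [2, 2, 5, 2] gives 11 layers in total, so the MaxVit
# constructor below draws np.linspace(0, 0.2, 11) and hands each MaxVitBlock
# its contiguous slice -- the first layer keeps p = 0.0 and the last reaches
# the full stochastic_depth_prob of 0.2.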
edejf  edejf e	e	ee	e	edd fd	d
ZeedddZdd Z  ZS )r   ay  
    Implements MaxVit Transformer from the `MaxViT: Multi-Axis Vision Transformer <https://arxiv.org/abs/2204.01697>`_ paper.
    Args:
        input_size (Tuple[int, int]): Size of the input image.
        stem_channels (int): Number of channels in the stem.
        partition_size (int): Size of the partitions.
        block_channels (List[int]): Number of channels in each block.
        block_layers (List[int]): Number of layers in each block.
        stochastic_depth_prob (float): Probability of stochastic depth. Expands to a list of probabilities for each layer that scales linearly to the specified value.
        squeeze_ratio (float): Squeeze ratio in the SE Layer. Default: 0.25.
        expansion_ratio (float): Expansion ratio in the MBConv bottleneck. Default: 4.
        norm_layer (Callable[..., nn.Module]): Normalization function. Default: None (setting to None will produce a `BatchNorm2d(eps=1e-3, momentum=0.01)`).
        activation_layer (Callable[..., nn.Module]): Activation function Default: nn.GELU.
        head_dim (int): Dimension of the attention heads.
        mlp_ratio (int): Expansion ratio of the MLP layer. Default: 4.
        mlp_dropout (float): Dropout probability for the MLP layer. Default: 0.0.
        attention_dropout (float): Dropout probability for the attention layer. Default: 0.0.
        num_classes (int): Number of classes. Default: 1000.
    Ng      ?ru   r6   i  .)r   stem_channelsr   block_channelsblock_layersr_   stochastic_depth_probr<   r;   r:   r9   r   r   r   num_classesr   c                    s  t    t|  d}|d kr.ttjddd}t|t|}t|D ]J\}}|d | dksl|d | dkrDt	d| d| d	| d
| d	qDt
t||dd||	dd dt||ddd d dd| _t|dddd}|| _t | _|g|d d  }|}td|t| }d}t|||D ]X\}}}| jt|||
|||	|||||||||||  d | jd j}||7 }q t
tdt t|d t|d |d t tj|d |dd| _|   d S )Nr&   gMbP?g{Gz?)epsZmomentumr   r!   zInput size z
 of block z$ is not divisible by partition size zx. Consider changing the partition size or the input size.
Current configuration yields the following block input sizes: r   r    F)r   r<   r;   r>   rC   T)r   r<   r;   r>   r?   r-   )r7   r8   r:   r9   r<   r;   r_   r   r   r   r   r   r   r   )r>   ) rE   rF   r   r   r
   BatchNorm2dr*   r   r   rd   rH   r   stemr$   r   r   blocksnpZlinspacer3   tolistzipr(   r   r   ZAdaptiveAvgPool2dZFlattenr   ri   ZTanh
classifier_init_weights)rN   r   r   r   r   r   r_   r   r<   r;   r:   r9   r   r   r   r   Zinput_channelsZblock_input_sizesr   Zblock_input_sizer7   r8   r   Zp_idxZ
in_channelZout_channelZ
num_layersrO   r"   r#   rF   M  s    
 
      
	zMaxVit.__init__rQ   c                 C   s,   |  |}| jD ]}||}q| |}|S r   )r   r   r   )rN   rR   blockr"   r"   r#   rU     s
    



zMaxVit.forwardc                 C   s   |   D ]}t|tjrDtjj|jdd |jd k	rtj|j qt|tj	rrtj
|jd tj
|jd qt|tjrtjj|jdd |jd k	rtj|j qd S )Nrb   rc   r!   r   )modules
isinstancer
   rG   rp   Znormal_weightr>   Zzeros_r   Z	constant_ri   )rN   mr"   r"   r#   r     s    

zMaxVit._init_weights)rV   rW   rX   rY   r
   ZGELUr	   rK   r   rZ   r   r   r[   rF   r   rU   r   r\   r"   r"   rO   r#   r   8  s:   %
vF)
r   r   r   r   r   r_   weightsprogresskwargsr   c              
   K   s   |d k	rPt |dt|jd  |jd d |jd d ks>tt |d|jd  |dd}	tf | ||||||	d|}
|d k	r|
|j|d	d
 |
S )Nr   
categoriesmin_sizer   r!   r      r   )r   r   r   r   r_   r   r   T)r   Z
check_hash)r   r   metaAssertionErrorpopr   Zload_state_dictZget_state_dict)r   r   r   r   r   r_   r   r   r   r   modelr"   r"   r#   _maxvit  s&     r   c                   @   sH   e Zd Zedeeddejdedddddd	d
idddddZ	e	Z
dS )r   z9https://download.pytorch.org/models/maxvit_t-bc5ab103.pthr   )Z	crop_sizeZresize_sizeinterpolationir   zLhttps://github.com/pytorch/vision/tree/main/references/classification#maxvitzImageNet-1KgT@g|?5.X@)zacc@1zacc@5gZd;@gK7]@zThese weights reproduce closely the results of the paper using a similar training recipe.
            They were trained with a BatchNorm2D momentum of 0.99 instead of the more correct 0.01.)r   Z
num_paramsr   ZrecipeZ_metricsZ_ops
_file_sizeZ_docs)urlZ
transformsr   N)rV   rW   rX   r   r   r   r   ZBICUBICr   IMAGENET1K_V1DEFAULTr"   r"   r"   r#   r     s.      Z
pretrained)r   T)r   r   )r   r   r   r   c                 K   s:   t | } tf dddddgddddgddd	| |d
|S )a  
    Constructs a maxvit_t architecture from
    `MaxViT: Multi-Axis Vision Transformer <https://arxiv.org/abs/2204.01697>`_.

    Args:
        weights (:class:`~torchvision.models.MaxVit_T_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.MaxVit_T_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.maxvit.MaxVit``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/maxvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.MaxVit_T_Weights
        :members:
    @         i   r    r       g?   )r   r   r   r_   r   r   r   r   )r   verifyr   )r   r   r   r"   r"   r#   r     s    


	)NF)<rf   collectionsr   	functoolsr   typingr   r   r   r   r   r	   numpyr   r.   Ztorch.nn.functionalr
   Z
functionalry   r   Ztorchvision.models._apir   r   r   Ztorchvision.models._metar   Ztorchvision.models._utilsr   r   Ztorchvision.ops.miscr   r   Z torchvision.ops.stochastic_depthr   Ztorchvision.transforms._presetsr   r   Ztorchvision.utilsr   __all__rK   r$   r*   r4   r[   r5   r]   r   r   r   r   r   r   r   rZ   boolr   r   r   r   r"   r"   r"   r#   <module>   sb    &&
WIhaU .  *