import torch
import torch.nn as nn
from functools import partial
import matplotlib.pyplot as plt

Why truncated normal initialization?

GELU can be closely approximated by the input times a sigmoid:

$$ x\sigma(1.702x) $$

For values below roughly $-2$, this function and its gradient are essentially zero, so neurons initialized that far out would start dead. Truncating the normal initialization to $[-2, 2]$ keeps the initial values away from those extremes and makes sure all neurons receive updates.
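As a quick check (the tolerance below is illustrative, not from the source), the sigmoid approximation stays within about 0.02 of PyTorch's exact GELU:

x = torch.linspace(-4, 4, 101)
approx = x * torch.sigmoid(1.702 * x)    # x * sigmoid(1.702x)
exact = torch.nn.functional.gelu(x)      # exact GELU
(approx - exact).abs().max()             # roughly 0.02 at its worst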

trunc_normal_[source]

trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0)

trunc_dist = [trunc_normal_(torch.tensor([0.]),std=1.5,a=-2,b=2).item() for o in range(5000)]
plt.hist(trunc_dist, bins=30);

drop_path[source]

drop_path(x, drop_prob:float=0.0, training:bool=False)

class DropPath[source]

DropPath(drop_prob=None) :: Module

Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
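A minimal usage sketch (shapes are illustrative): during training the module zeroes the residual branch for a random subset of samples and rescales the kept ones; in eval mode it is the identity.

dp = DropPath(drop_prob=0.1)
dp.train()                               # stochastic depth is only active in training mode
x = torch.randn(8, 197, 128)             # (batch, tokens, dim)
branch = torch.randn(8, 197, 128)        # output of an attention or MLP branch
y = x + dp(branch)                       # the branch is skipped for a random subset of samples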

class Mlp[source]

Mlp(in_features, hidden_features=None, out_features=None, act_layer=GELU, drop=0.0) :: Module

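A hedged usage sketch, assuming the usual transformer feed-forward block (linear expansion, activation, dropout, projection back); with out_features left at its default the token shape is preserved:

mlp = Mlp(in_features=128, hidden_features=512, drop=0.1)
tokens = torch.randn(4, 197, 128)        # (batch, tokens, dim)
mlp(tokens).shape                        # torch.Size([4, 197, 128])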

class Attention[source]

Attention(dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.0, proj_drop=0.0) :: Module

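A hedged sketch of calling the attention module; dim must be divisible by num_heads. In the DINO reference code Attention.forward also returns the attention weights, so adjust the unpacking to match this implementation:

attn = Attention(dim=128, num_heads=4, qkv_bias=True)
tokens = torch.randn(4, 197, 128)        # (batch, tokens, dim)
out = attn(tokens)                       # attended token features (the reference code also returns the attention maps)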

class Block[source]

Block(dim, num_heads, mlp_ratio=4.0, qkv_bias=False, qk_scale=None, drop=0.0, attn_drop=0.0, drop_path=0.0, act_layer=GELU, norm_layer=LayerNorm) :: Module

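A hedged sketch of a single encoder block, assuming the DINO-style layout (pre-norm attention and MLP, each with a residual connection and optional stochastic depth); the forward pass preserves the token shape:

blk = Block(dim=128, num_heads=4, mlp_ratio=4.0, qkv_bias=True, drop_path=0.1)
tokens = torch.randn(4, 197, 128)        # (batch, tokens, dim)
blk(tokens).shape                        # torch.Size([4, 197, 128])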

class PatchEmbed[source]

PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768) :: Module

Image to Patch Embedding
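Assuming the standard conv-based patchification used in ViT/DINO, a 224×224 image with patch_size=16 yields (224/16)² = 196 patch tokens, each projected to embed_dim:

embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
imgs = torch.randn(2, 3, 224, 224)       # (batch, channels, height, width)
embed(imgs).shape                        # torch.Size([2, 196, 768]) -> (batch, patches, embed_dim)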

class VisionTransformer[source]

VisionTransformer(img_size=[224], patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0, qkv_bias=False, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0, norm_layer=LayerNorm, **kwargs) :: Module

Vision Transformer
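A hedged sketch of the backbone in its DINO configuration (num_classes=0), assuming the forward pass returns the [CLS] token embedding for each image; the small depth and width are only to keep the example light:

vit = VisionTransformer(patch_size=16, embed_dim=192, depth=4, num_heads=3,
                        qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6))
imgs = torch.randn(2, 3, 224, 224)
vit(imgs).shape                          # torch.Size([2, 192]) -- one [CLS] embedding per image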

class MultiCropWrapper[source]

MultiCropWrapper(encoder) :: Module

Performs the forward pass separately on each input resolution. Crops that share a resolution are batched together and run through a single forward pass, so the number of forward passes equals the number of distinct resolutions used. All output features are then concatenated, and the head is run once on the concatenated features.

bs = 4
x_large = [torch.randn(bs,3,224,224)]*2   # 2 global crops
x_small = [torch.randn(16,3,96,96)]*4     # 4 local crops
x = x_large + x_small; [xi.size() for xi in x]
[torch.Size([4, 3, 224, 224]),
 torch.Size([4, 3, 224, 224]),
 torch.Size([16, 3, 96, 96]),
 torch.Size([16, 3, 96, 96]),
 torch.Size([16, 3, 96, 96]),
 torch.Size([16, 3, 96, 96])]
vit_encoder = VisionTransformer(patch_size=32, embed_dim=128, depth=4, num_heads=4, mlp_ratio=4,
                                qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6))
vit = MultiCropWrapper(vit_encoder)
out = vit(x)
len(out)
72
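The 72 rows are the concatenated features from all crops: 2 global crops with batch size 4 plus 4 local crops with batch size 16, i.e. 2·4 + 4·16 = 72.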

deit_tiny[source]

deit_tiny(patch_size=16, img_size=[224], in_chans=3, num_classes=0, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0)

deit_small[source]

deit_small(patch_size=16, img_size=[224], in_chans=3, num_classes=0, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0)

vit_base[source]

vit_base(patch_size=16, img_size=[224], in_chans=3, num_classes=0, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0)
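A hedged usage sketch of the factory functions; the 192-dimensional output assumes the reference DeiT-tiny configuration (embed_dim=192):

model = deit_tiny(patch_size=16)
feats = model(torch.randn(2, 3, 224, 224))
feats.shape                              # torch.Size([2, 192]), assuming embed_dim=192 for DeiT-tiny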