import torch
import matplotlib.pyplot as plt
from torch.nn.init import trunc_normal_
Why truncated normal initialization?
Neurons whose pre-activations fall below about -2 would be dead, since GeLU can be approximated as the input times a scaled sigmoid:
$$ x\sigma(1.702x) $$
which is close to zero, with a near-zero gradient, for strongly negative inputs. Truncated normal initialization redraws any sample that falls outside $[a, b]$ (here $[-2, 2]$), which helps keep initial values out of that saturated region and makes sure all neurons are updated.
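A quick numerical check of how tight that approximation is, against torch's exact GeLU:

import torch.nn.functional as F

xs = torch.linspace(-5, 5, steps=1000)
approx = xs * torch.sigmoid(1.702 * xs)   # sigmoid approximation of GeLU
exact = F.gelu(xs)                        # exact GeLU
(approx - exact).abs().max()              # max gap is on the order of 1e-2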
# draw 5,000 samples from a normal truncated to [-2, 2]
trunc_dist = [trunc_normal_(torch.tensor([0.]), std=1.5, a=-2, b=2).item() for o in range(5000)]
plt.hist(trunc_dist, bins=30);
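For comparison, a plain normal with the same std leaves a noticeable share of samples outside [-2, 2]; truncation redraws exactly those. A quick check:

plain = torch.randn(5000) * 1.5
(plain.abs() > 2).float().mean()   # roughly 0.18 of plain-normal samples land outside [-2, 2]
max(abs(v) for v in trunc_dist)    # never exceeds 2: out-of-range draws were resampled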
bs = 4
# two global crops at 224x224 and four local crops at 96x96, as in DINO's multi-crop setup
x_large = [torch.randn(bs, 3, 224, 224)] * 2
x_small = [torch.randn(16, 3, 96, 96)] * 4
x = x_large + x_small; [xi.size() for xi in x]
from functools import partial
import torch.nn as nn

# VisionTransformer is the ViT implementation from the DINO repo (vision_transformer.py)
vit_encoder = VisionTransformer(patch_size=32, embed_dim=128, depth=4, num_heads=4, mlp_ratio=4,
                                qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6))
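With patch_size=32, a 224x224 crop is split into 7x7 = 49 patch tokens and a 96x96 crop into 3x3 = 9, plus one class token each. Assuming the encoder returns the class-token embedding, as DINO's ViT does, a single crop group maps to one 128-d feature row per image:

feats = vit_encoder(x_large[0])
feats.shape   # expected: torch.Size([4, 128])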
# MultiCropWrapper (from DINO's utils.py) batches same-resolution crops together through the backbone
vit = MultiCropWrapper(vit_encoder)
out = vit(x)
len(out)
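Under the hood the wrapper avoids one forward pass per crop: it groups consecutive crops that share a spatial size, runs each group through the backbone as a single batch, and concatenates the results. A minimal sketch of that idea, simplified from DINO's implementation (the pooling backbone below is a hypothetical stand-in for the ViT):

import torch
import torch.nn as nn

class MultiCropSketch(nn.Module):
    """Simplified multi-crop forward: one backbone pass per crop resolution."""
    def __init__(self, backbone):
        super().__init__()
        self.backbone = backbone

    def forward(self, crops):
        # count how many consecutive crops share each spatial size
        sizes = torch.tensor([c.shape[-1] for c in crops])
        counts = torch.unique_consecutive(sizes, return_counts=True)[1]
        outputs, start = [], 0
        for end in torch.cumsum(counts, 0):
            # same-size crops are concatenated along the batch dim and encoded in one pass
            outputs.append(self.backbone(torch.cat(crops[start:end])))
            start = end
        return torch.cat(outputs)

dummy = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(3, 128))  # hypothetical backbone
MultiCropSketch(dummy)(x).shape   # (4*2 + 16*4, 128) = (72, 128): one row per crop per image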