#

Batch-Channel Normalization

This is a PyTorch implementation of Batch-Channel Normalization from the paper Micro-Batch Training with Batch-Channel Normalization and Weight Standardization. We also have an annotated implementation of Weight Standardization.

Batch-Channel Normalization performs batch normalization followed by a channel normalization (similar to Group Normalization). When the batch size is small, a running mean and variance are used for batch normalization.

Here is the training code for training a VGG network that uses weight standardization to classify CIFAR-10 data.

25import torch
26from torch import nn
27
28from labml_nn.normalization.batch_norm import BatchNorm

#

Batch-Channel Normalization

This first performs a batch normalization - either normal batch norm or a batch norm with estimated mean and variance (exponential mean/variance over multiple batches). Then a channel normalization is performed.

class BatchChannelNorm(nn.Module):
    """
    ## Batch-Channel Normalization

    Applies a batch normalization followed by a channel normalization.
    The batch normalization is either `BatchNorm` or `EstimatedBatchNorm`
    (which normalizes with exponential moving averages of the mean and
    variance), selected by the `estimate` flag.
    """

    def __init__(self, channels: int, groups: int,
                 eps: float = 1e-5, momentum: float = 0.1, estimate: bool = True):
        """
        * `channels` is the number of features in the input
        * `groups` is the number of groups the features are divided into
        * `eps` is the ϵ used for numerical stability; it is passed to both
          the batch normalization and the channel normalization layers
        * `momentum` is passed to the batch normalization layer, where it is
          the momentum of the exponential moving average
        * `estimate` is whether to use running mean and variance for batch norm
        """
        super().__init__()

        # Use estimated batch norm or normal batch norm.
        if estimate:
            self.batch_norm = EstimatedBatchNorm(channels,
                                                 eps=eps, momentum=momentum)
        else:
            self.batch_norm = BatchNorm(channels,
                                        eps=eps, momentum=momentum)

        # Channel normalization
        self.channel_norm = ChannelNorm(channels, groups, eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Normalize `x` with the batch normalization layer and then with the
        channel normalization layer, and return the result.
        """
        x = self.batch_norm(x)
        return self.channel_norm(x)

#

Estimated Batch Normalization

The input $X \in R^{B \times C \times H \times W}$ is a batch of image representations, where $B$ is the batch size, $C$ is the number of channels, $H$ is the height and $W$ is the width. The scale and shift parameters are $γ \in R^{C}$ and $β \in R^{C}$. The normalized output is

$\dot{X}_{\cdot, C, \cdot, \cdot} = γ_{C} \frac{X_{\cdot, C, \cdot, \cdot} - \hat{μ}_{C}}{\hat{σ}_{C}} + β_{C}$

where,

$$\begin{aligned} \hat{μ}_{C} &\longleftarrow (1 - r) \hat{μ}_{C} + r \frac{1}{B H W} \sum_{b, h, w} X_{b, c, h, w} \\ \hat{σ}_{C}^{2} &\longleftarrow (1 - r) \hat{σ}_{C}^{2} + r \frac{1}{B H W} \sum_{b, h, w} (X_{b, c, h, w} - \hat{μ}_{C})^{2} \end{aligned}$$

are the running mean and variances. $r$ is the momentum for calculating the exponential mean.

class EstimatedBatchNorm(nn.Module):
    """
    ## Estimated Batch Normalization

    Normalizes each channel with running (exponential moving average)
    estimates of the mean and variance instead of the statistics of the
    current batch. The running estimates are updated from the current batch
    in training mode only, and no gradients flow through them.
    """

    def __init__(self, channels: int,
                 eps: float = 1e-5, momentum: float = 0.1, affine: bool = True):
        """
        * `channels` is the number of features in the input
        * `eps` is the ϵ added to the running variance before taking the
          square root, for numerical stability
        * `momentum` is the momentum in taking the exponential moving average
        * `affine` is whether to scale and shift the normalized value
        """
        super().__init__()

        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.channels = channels

        # Channel wise transformation parameters
        if self.affine:
            self.scale = nn.Parameter(torch.ones(channels))
            self.shift = nn.Parameter(torch.zeros(channels))

        # Tensors for the running mean $\hat{\mu}_C$ and
        # running variance $\hat{\sigma}^2_C$
        self.register_buffer('exp_mean', torch.zeros(channels))
        self.register_buffer('exp_var', torch.ones(channels))

    def forward(self, x: torch.Tensor):
        """
        `x` is a tensor of shape `[batch_size, channels, *]`.
        `*` denotes any number of (possibly 0) dimensions.
        For example, in an image (2D) convolution this will be
        `[batch_size, channels, height, width]`.

        Returns a tensor with the same shape as `x`.
        """
        # Keep old shape
        x_shape = x.shape
        # Get the batch size
        batch_size = x_shape[0]

        # Sanity check to make sure the number of features is correct
        assert self.channels == x.shape[1]

        # Reshape into `[batch_size, channels, n]`.
        # `reshape` (rather than `view`) also accepts non-contiguous inputs.
        x = x.reshape(batch_size, self.channels, -1)

        # Update $\hat{\mu}_C$ and $\hat{\sigma}^2_C$ in training mode only
        if self.training:
            # No backpropagation through $\hat{\mu}_C$ and $\hat{\sigma}^2_C$
            with torch.no_grad():
                # Calculate the mean across first and last dimensions;
                # $\frac{1}{B H W} \sum_{b,h,w} X_{b,c,h,w}$
                mean = x.mean(dim=[0, 2])
                # Calculate the squared mean across first and last dimensions;
                # $\frac{1}{B H W} \sum_{b,h,w} X^2_{b,c,h,w}$
                mean_x2 = (x ** 2).mean(dim=[0, 2])
                # Variance for each feature
                # $\frac{1}{B H W} \sum_{b,h,w} (X_{b,c,h,w} - \hat{\mu}_C)^2$.
                # The difference of the two means can come out slightly negative
                # because of floating point cancellation, so clamp it at zero.
                var = torch.clamp(mean_x2 - mean ** 2, min=0.)

                # Update exponential moving averages
                #
                # $\hat{\mu}_C \longleftarrow (1 - r) \hat{\mu}_C + r \cdot mean$
                # $\hat{\sigma}^2_C \longleftarrow (1 - r) \hat{\sigma}^2_C + r \cdot var$
                self.exp_mean = (1 - self.momentum) * self.exp_mean + self.momentum * mean
                self.exp_var = (1 - self.momentum) * self.exp_var + self.momentum * var

        # Normalize
        # $\frac{X_{\cdot, C, \cdot, \cdot} - \hat{\mu}_C}{\hat{\sigma}_C}$
        x_norm = (x - self.exp_mean.view(1, -1, 1)) / torch.sqrt(self.exp_var + self.eps).view(1, -1, 1)

        # Scale and shift
        # $\gamma_C \frac{X_{\cdot, C, \cdot, \cdot} - \hat{\mu}_C}{\hat{\sigma}_C} + \beta_C$
        if self.affine:
            x_norm = self.scale.view(1, -1, 1) * x_norm + self.shift.view(1, -1, 1)

        # Reshape to original and return
        return x_norm.reshape(x_shape)

#

Channel Normalization

This is similar to Group Normalization but affine transform is done group wise.

class ChannelNorm(nn.Module):
    """
    ## Channel Normalization

    Normalizes each sample over groups of features. This is similar to
    Group Normalization, but the affine transform is done group-wise: there
    is one scale and one shift per group rather than per channel.
    """

    def __init__(self, channels, groups,
                 eps: float = 1e-5, affine: bool = True):
        """
        * `groups` is the number of groups the features are divided into
        * `channels` is the number of features in the input
        * `eps` is the ϵ added to the variance before taking the square root,
          for numerical stability
        * `affine` is whether to scale and shift the normalized value

        Raises `ValueError` if `groups` is not positive or does not divide
        `channels`.
        """
        super().__init__()

        # Groups must cover whole channels; otherwise the reshape in `forward`
        # would either fail or mix parts of different channels into one group.
        if groups < 1 or channels % groups != 0:
            raise ValueError(f'channels ({channels}) must be divisible by groups ({groups})')

        self.channels = channels
        self.groups = groups
        self.eps = eps
        self.affine = affine

        # Parameters for affine transformation.
        #
        # Note that these transforms are per group, unlike in group norm where
        # they are transformed channel-wise.
        if self.affine:
            self.scale = nn.Parameter(torch.ones(groups))
            self.shift = nn.Parameter(torch.zeros(groups))

    def forward(self, x: torch.Tensor):
        """
        `x` is a tensor of shape `[batch_size, channels, *]`.
        `*` denotes any number of (possibly 0) dimensions.
        For example, in an image (2D) convolution this will be
        `[batch_size, channels, height, width]`.

        Returns a tensor with the same shape as `x`.
        """
        # Keep the original shape
        x_shape = x.shape
        # Get the batch size
        batch_size = x_shape[0]

        # Sanity check to make sure the number of features is the same
        assert self.channels == x.shape[1]

        # Reshape into `[batch_size, groups, n]`.
        # `reshape` (rather than `view`) also accepts non-contiguous inputs.
        x = x.reshape(batch_size, self.groups, -1)

        # Calculate the mean across last dimension;
        # i.e. the means for each sample and channel group $E[x_{(i_N, i_G)}]$
        mean = x.mean(dim=[-1], keepdim=True)
        # Calculate the squared mean across last dimension;
        # i.e. the means for each sample and channel group $E[x^2_{(i_N, i_G)}]$
        mean_x2 = (x ** 2).mean(dim=[-1], keepdim=True)
        # Variance for each sample and feature group
        # $Var[x_{(i_N, i_G)}] = E[x^2_{(i_N, i_G)}] - E[x_{(i_N, i_G)}]^2$.
        # Floating point cancellation can make the difference slightly negative,
        # which would turn the square root below into NaN, so clamp it at zero.
        var = torch.clamp(mean_x2 - mean ** 2, min=0.)

        # Normalize
        # $\hat{x}_{(i_N, i_G)} =
        # \frac{x_{(i_N, i_G)} - E[x_{(i_N, i_G)}]}{\sqrt{Var[x_{(i_N, i_G)}] + \epsilon}}$
        x_norm = (x - mean) / torch.sqrt(var + self.eps)

        # Scale and shift group-wise
        # $y_{i_G} = \gamma_{i_G} \hat{x}_{i_G} + \beta_{i_G}$
        if self.affine:
            x_norm = self.scale.view(1, -1, 1) * x_norm + self.shift.view(1, -1, 1)

        # Reshape to original and return
        return x_norm.reshape(x_shape)