#

用于 Stable Diffusion 的 U-Net

这里实现了 U-Net，它给出 $\epsilon_\text{cond}(x_t, c)$。

我们保持了 CompVis/stable-diffusion 的模型定义和命名不变，这样就可以直接加载其检查点。

18import math
19from typing import List
20
21import numpy as np
22import torch
23import torch.nn as nn
24import torch.nn.functional as F
25
26from labml_nn.diffusion.stable_diffusion.model.unet_attention import SpatialTransformer

#

U-Net 模型

class UNetModel(nn.Module):
    """
    U-Net model for stable diffusion; it gives eps_cond(x_t, c).

    The model definition and naming follow CompVis/stable-diffusion so that
    its checkpoints can be loaded directly.
    """

    def __init__(
            self, *,
            in_channels: int,
            out_channels: int,
            channels: int,
            n_res_blocks: int,
            attention_levels: List[int],
            channel_multipliers: List[int],
            n_heads: int,
            tf_layers: int = 1,
            d_cond: int = 768):
        """
        :param in_channels: number of channels in the input feature map
        :param out_channels: number of channels in the output feature map
        :param channels: base channel count for the model
        :param n_res_blocks: number of residual blocks at each level
        :param attention_levels: levels at which attention is performed
        :param channel_multipliers: multiplicative factors for the number of
            channels at each level
        :param n_heads: number of attention heads in the transformers
        :param tf_layers: number of transformer layers in the transformers
        :param d_cond: size of the conditional embedding in the transformers
        """
        super().__init__()
        self.channels = channels

        # Number of levels
        levels = len(channel_multipliers)

        # Size of the time embeddings
        d_time_emb = channels * 4
        self.time_embed = nn.Sequential(
            nn.Linear(channels, d_time_emb),
            nn.SiLU(),
            nn.Linear(d_time_emb, d_time_emb),
        )

        # Input half of the U-Net
        self.input_blocks = nn.ModuleList()

        # Initial $3 \times 3$ convolution that maps the input to `channels`.
        # The blocks are wrapped in `TimestepEmbedSequential` because the
        # modules have different forward signatures; e.g. a convolution only
        # accepts the feature map while a residual block accepts the feature
        # map and the time embedding. `TimestepEmbedSequential` calls them
        # accordingly.
        self.input_blocks.append(TimestepEmbedSequential(
            nn.Conv2d(in_channels, channels, 3, padding=1)))

        # Number of channels at each block in the input half of the U-Net
        # (consumed later, in reverse, by the skip connections)
        input_block_channels = [channels]

        # Number of channels at each level
        channels_list = [channels * m for m in channel_multipliers]

        # Prepare the levels
        for i in range(levels):
            # Add the residual blocks and attentions
            for _ in range(n_res_blocks):
                # The residual block maps from the previous number of
                # channels to the number of channels at the current level
                layers = [ResBlock(channels, d_time_emb, out_channels=channels_list[i])]
                channels = channels_list[i]

                # Add a transformer
                if i in attention_levels:
                    layers.append(SpatialTransformer(channels, n_heads, tf_layers, d_cond))

                # Add them to the input half of the U-Net and keep track of
                # the number of channels of its output
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                input_block_channels.append(channels)

            # Down-sample at all levels except the last
            if i != levels - 1:
                self.input_blocks.append(TimestepEmbedSequential(DownSample(channels)))
                input_block_channels.append(channels)

        # The middle of the U-Net
        self.middle_block = TimestepEmbedSequential(
            ResBlock(channels, d_time_emb),
            SpatialTransformer(channels, n_heads, tf_layers, d_cond),
            ResBlock(channels, d_time_emb),
        )

        # Second (output) half of the U-Net
        self.output_blocks = nn.ModuleList([])

        # Prepare the levels in reverse order
        for i in reversed(range(levels)):
            # Add the residual blocks and attentions
            for j in range(n_res_blocks + 1):
                # The residual block maps from the previous number of channels
                # plus the skip connection from the input half of the U-Net
                # to the number of channels at the current level
                layers = [ResBlock(channels + input_block_channels.pop(), d_time_emb, out_channels=channels_list[i])]
                channels = channels_list[i]

                # Add a transformer
                if i in attention_levels:
                    layers.append(SpatialTransformer(channels, n_heads, tf_layers, d_cond))

                # Up-sample after the last residual block of every level
                # except the last one. We iterate in reverse, so `i == 0`
                # is the last level.
                if i != 0 and j == n_res_blocks:
                    layers.append(UpSample(channels))

                # Add to the output half of the U-Net
                self.output_blocks.append(TimestepEmbedSequential(*layers))

        # Final normalization and $3 \times 3$ convolution
        self.out = nn.Sequential(
            normalization(channels),
            nn.SiLU(),
            nn.Conv2d(channels, out_channels, 3, padding=1),
        )

    def time_step_embedding(self, time_steps: torch.Tensor, max_period: int = 10000) -> torch.Tensor:
        """
        Create sinusoidal time step embeddings.

        :param time_steps: time steps of shape `[batch_size]`
        :param max_period: controls the minimum frequency of the embeddings
        :return: tensor of shape `[batch_size, 2 * (channels // 2)]` with the
            cosine terms first and the sine terms second
        """
        # $\frac{c}{2}$; half of the channels are cos and the other half sin
        half = self.channels // 2

        # $\frac{1}{10000^{\frac{2i}{c}}}$
        frequencies = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
        ).to(device=time_steps.device)

        # $\frac{t}{10000^{\frac{2i}{c}}}$
        args = time_steps[:, None].float() * frequencies[None]

        # $\cos\Big(\frac{t}{10000^{\frac{2i}{c}}}\Big)$ and
        # $\sin\Big(\frac{t}{10000^{\frac{2i}{c}}}\Big)$
        return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

    def forward(self, x: torch.Tensor, time_steps: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        """
        :param x: input feature map of shape
            `[batch_size, channels, height, width]`
        :param time_steps: time steps of shape `[batch_size]`
        :param cond: conditioning of shape `[batch_size, n_cond, d_cond]`
        :return: output of the final normalization and convolution
        """
        # Outputs of the input half, stored for the skip connections
        x_input_block = []

        # Get the time step embeddings
        t_emb = self.time_step_embedding(time_steps)
        t_emb = self.time_embed(t_emb)

        # Input half of the U-Net
        for module in self.input_blocks:
            x = module(x, t_emb, cond)
            x_input_block.append(x)

        # Middle of the U-Net
        x = self.middle_block(x, t_emb, cond)

        # Output half of the U-Net; each block consumes the most recently
        # stored skip connection, concatenated along the channel dimension
        for module in self.output_blocks:
            x = torch.cat([x, x_input_block.pop()], dim=1)
            x = module(x, t_emb, cond)

        # Final normalization and $3 \times 3$ convolution
        return self.out(x)

#

用于具有不同输入的模块的顺序模块

这个顺序模块可以由不同的模块（例如 ResBlock、nn.Conv 和 SpatialTransformer）组成，并使用与各自签名匹配的参数调用它们。

class TimestepEmbedSequential(nn.Sequential):
    """
    Sequential container for modules that take different inputs.

    It can be composed of different modules such as `ResBlock`, `nn.Conv2d`
    and `SpatialTransformer`, and calls each with its matching signature.
    """

    def forward(self, x, t_emb, cond=None):
        """
        :param x: input passed through every layer in order
        :param t_emb: time step embedding, given only to `ResBlock` layers
        :param cond: conditioning, given only to `SpatialTransformer` layers
        :return: output of the last layer (or `x` itself when empty)
        """
        for layer in self:
            if isinstance(layer, ResBlock):
                # Residual blocks take the feature map and the time embedding
                x = layer(x, t_emb)
            elif isinstance(layer, SpatialTransformer):
                # Transformers take the feature map and the conditioning
                x = layer(x, cond)
            else:
                # Everything else only takes the feature map
                x = layer(x)
        return x

#

上采样层

class UpSample(nn.Module):
    """
    Up-sampling layer: nearest-neighbour interpolation by a factor of 2
    followed by a $3 \times 3$ convolution.
    """

    def __init__(self, channels: int):
        """
        :param channels: number of channels
        """
        super().__init__()
        # $3 \times 3$ convolution mapping (keeps the number of channels)
        self.conv = nn.Conv2d(channels, channels, 3, padding=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        :param x: input feature map of shape
            `[batch_size, channels, height, width]`
        :return: the up-sampled and convolved feature map
        """
        # Up-sample by a factor of $2$
        x = F.interpolate(x, scale_factor=2, mode="nearest")
        # Apply the convolution
        return self.conv(x)

#

下采样层

class DownSample(nn.Module):
    """
    Down-sampling layer implemented as a strided $3 \times 3$ convolution.
    """

    def __init__(self, channels: int):
        """
        :param channels: number of channels
        """
        super().__init__()
        # $3 \times 3$ convolution with stride $2$ to down-sample by a
        # factor of $2$
        self.op = nn.Conv2d(channels, channels, 3, stride=2, padding=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        :param x: input feature map of shape
            `[batch_size, channels, height, width]`
        :return: the down-sampled feature map
        """
        # Apply the convolution
        return self.op(x)

#

ResNet 残差块

class ResBlock(nn.Module):
    """
    ResNet block that also injects the time step embedding.
    """

    def __init__(self, channels: int, d_t_emb: int, *, out_channels=None):
        """
        :param channels: number of input channels
        :param d_t_emb: size of the time step embeddings
        :param out_channels: number of output channels; defaults to
            `channels`
        """
        super().__init__()
        # `out_channels` not specified
        if out_channels is None:
            out_channels = channels

        # First normalization and convolution
        self.in_layers = nn.Sequential(
            normalization(channels),
            nn.SiLU(),
            nn.Conv2d(channels, out_channels, 3, padding=1),
        )

        # Time step embeddings, projected to `out_channels`
        self.emb_layers = nn.Sequential(
            nn.SiLU(),
            nn.Linear(d_t_emb, out_channels),
        )

        # Final convolution layer
        self.out_layers = nn.Sequential(
            normalization(out_channels),
            nn.SiLU(),
            nn.Dropout(0.),
            nn.Conv2d(out_channels, out_channels, 3, padding=1)
        )

        # `channels` to `out_channels` mapping layer for the residual
        # connection
        if out_channels == channels:
            self.skip_connection = nn.Identity()
        else:
            self.skip_connection = nn.Conv2d(channels, out_channels, 1)

    def forward(self, x: torch.Tensor, t_emb: torch.Tensor) -> torch.Tensor:
        """
        :param x: input feature map of shape
            `[batch_size, channels, height, width]`
        :param t_emb: time step embeddings of shape `[batch_size, d_t_emb]`
        :return: the block output added to the (possibly projected) input
        """
        # Initial convolution
        h = self.in_layers(x)
        # Time step embeddings, cast to the feature map's dtype
        t_emb = self.emb_layers(t_emb).type(h.dtype)
        # Add the time step embeddings, broadcast over the spatial dimensions
        h = h + t_emb[:, :, None, None]
        # Final convolution
        h = self.out_layers(h)
        # Add the skip connection
        return self.skip_connection(x) + h

#

使用 float32 强制转换进行分组归一化

class GroupNorm32(nn.GroupNorm):
    """
    Group normalization with float32 casting.
    """

    def forward(self, x):
        """
        Normalize `x` in float32 and cast the result back to the dtype of `x`.
        """
        return super().forward(x.float()).type(x.dtype)

#

分组归一化

这是一个辅助函数，使用固定的分组数创建归一化层。

def normalization(channels):
    """
    Group normalization helper with a fixed number of groups (32).

    :param channels: number of channels to normalize
    :return: a `GroupNorm32` layer over `channels` channels
    """
    return GroupNorm32(32, channels)

#

测试正弦时间步长嵌入

def _test_time_embeddings():
    """
    Test sinusoidal time step embeddings by plotting a few dimensions
    for time steps 0..999.
    """
    import matplotlib.pyplot as plt

    plt.figure(figsize=(15, 5))
    # Empty `channel_multipliers` gives a model with no levels; only
    # `time_step_embedding` is used from it here.
    m = UNetModel(in_channels=1, out_channels=1, channels=320, n_res_blocks=1, attention_levels=[],
                  channel_multipliers=[],
                  n_heads=1, tf_layers=1, d_cond=1)
    te = m.time_step_embedding(torch.arange(0, 1000))
    dims = [50, 100, 190, 260]
    plt.plot(np.arange(1000), te[:, dims].numpy())
    plt.legend([f"dim {p}" for p in dims])
    plt.title("Time embeddings")
    plt.show()

#

# Run the time embedding plot when this file is executed as a script
if __name__ == '__main__':
    _test_time_embeddings()