# 用于去噪扩散概率模型 (DDPM) 的 U-Net 模型

U-Net 因其模型结构图呈 U 形而得名。它先逐步降低（每次减半）特征图的分辨率，再逐步提高分辨率，以此处理给定的图像。每一级分辨率上都有跳跃连接（skip connection）。

24import math
25from typing import Optional, Tuple, Union, List
26
27import torch
28from torch import nn
29
30from labml_helpers.module import Module

### Swish 激活函数

class Swish(Module):
    """Swish activation, computed as ``x * sigmoid(x)``."""

    def forward(self, x):
        """Return ``x`` gated element-wise by its own sigmoid."""
        gate = torch.sigmoid(x)
        return x * gate

### 时间步 $t$ 的嵌入

class TimeEmbedding(nn.Module):
    """Embed time steps with sinusoidal features followed by a two-layer MLP.

    Args:
        n_channels: number of dimensions in the embedding.
    """

    def __init__(self, n_channels: int):
        super().__init__()
        self.n_channels = n_channels

        # First linear layer: takes the sinusoidal features, which are
        # ``n_channels // 4`` wide, and expands them to ``n_channels``.
        self.lin1 = nn.Linear(self.n_channels // 4, self.n_channels)
        # Activation between the two linear layers
        self.act = Swish()
        # Second linear layer
        self.lin2 = nn.Linear(self.n_channels, self.n_channels)

    def forward(self, t: torch.Tensor):
        """Return the embedding for ``t``.

        ``t`` is indexed as ``t[:, None]``, so it needs at least one
        dimension; the U-Net in this file passes a ``[batch_size]`` tensor.
        """
        # Number of sine features (and, separately, of cosine features)
        half_dim = self.n_channels // 8
        # Log-spacing between consecutive frequencies
        log_step = math.log(10_000) / (half_dim - 1)
        positions = torch.arange(half_dim, device=t.device)
        freqs = torch.exp(positions * -log_step)
        # Outer product of time steps and frequencies
        angles = t[:, None] * freqs[None, :]
        features = torch.cat((angles.sin(), angles.cos()), dim=1)

        # Transform with the MLP
        hidden = self.act(self.lin1(features))
        return self.lin2(hidden)

### 残差块

class ResidualBlock(Module):
    """Residual block: two 3x3 convolutions, each preceded by group norm and Swish.

    The time embedding is projected to ``out_channels`` and added to the
    feature map between the two convolutions.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        time_channels: number of channels in the time step embedding.
        n_groups: number of groups for group normalization.
        dropout: dropout rate.
    """

    def __init__(self, in_channels: int, out_channels: int, time_channels: int,
                 n_groups: int = 32, dropout: float = 0.1):
        super().__init__()

        # Group normalization, activation and the first convolution
        self.norm1 = nn.GroupNorm(n_groups, in_channels)
        self.act1 = Swish()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=(3, 3), padding=(1, 1))

        # Group normalization, activation and the second convolution
        self.norm2 = nn.GroupNorm(n_groups, out_channels)
        self.act2 = Swish()
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=(3, 3), padding=(1, 1))

        # The skip path needs a 1x1 convolution only when the channel count changes
        if in_channels == out_channels:
            self.shortcut = nn.Identity()
        else:
            self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1))

        # Projection of the time embedding onto the feature channels
        self.time_emb = nn.Linear(time_channels, out_channels)
        self.time_act = Swish()

        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, t: torch.Tensor):
        """Apply the block.

        Args:
            x: tensor of shape ``[batch_size, in_channels, height, width]``.
            t: tensor of shape ``[batch_size, time_channels]``.
        """
        # First convolution stage
        h = self.norm1(x)
        h = self.act1(h)
        h = self.conv1(h)

        # Add the time embedding, broadcast over the two spatial dimensions
        time_bias = self.time_emb(self.time_act(t))
        h += time_bias[:, :, None, None]

        # Second convolution stage (dropout is applied just before the convolution)
        h = self.norm2(h)
        h = self.act2(h)
        h = self.dropout(h)
        h = self.conv2(h)

        # Add the shortcut connection
        return h + self.shortcut(x)

### 注意力块

class AttentionBlock(Module):
    """Multi-head self-attention over the spatial positions of a feature map.

    Args:
        n_channels: number of channels in the input.
        n_heads: number of attention heads.
        d_k: number of dimensions in each head; defaults to ``n_channels``.
        n_groups: number of groups for group normalization.
    """

    def __init__(self, n_channels: int, n_heads: int = 1, d_k: Optional[int] = None, n_groups: int = 32):
        super().__init__()

        # Default the per-head size to the channel count
        if d_k is None:
            d_k = n_channels
        # Normalization layer. NOTE: it is constructed (and so appears in the
        # state dict) but `forward` below never applies it.
        self.norm = nn.GroupNorm(n_groups, n_channels)
        # Single projection producing query, key and value for every head
        self.projection = nn.Linear(n_channels, n_heads * d_k * 3)
        # Linear layer mapping the concatenated heads back to `n_channels`
        self.output = nn.Linear(n_heads * d_k, n_channels)
        # Scale for the dot-product attention
        self.scale = d_k ** -0.5
        # `forward` reads both of these; without `n_heads` stored here the
        # first forward pass raised AttributeError.
        self.n_heads = n_heads
        self.d_k = d_k

    def forward(self, x: torch.Tensor, t: Optional[torch.Tensor] = None):
        """Apply self-attention and add the input back as a residual.

        Args:
            x: tensor of shape ``[batch_size, in_channels, height, width]``.
            t: unused; accepted so the call signature matches
                ``ResidualBlock.forward``.
        """
        _ = t
        batch_size, n_channels, height, width = x.shape
        # Flatten the spatial dimensions: `[batch_size, seq, n_channels]`
        x = x.view(batch_size, n_channels, -1).permute(0, 2, 1)
        # Project to `[batch_size, seq, n_heads, 3 * d_k]`
        qkv = self.projection(x).view(batch_size, -1, self.n_heads, 3 * self.d_k)
        # Split into query, key and value, each `[batch_size, seq, n_heads, d_k]`
        q, k, v = torch.chunk(qkv, 3, dim=-1)
        # Scaled dot products between every query position i and key position j
        attn = torch.einsum('bihd,bjhd->bijh', q, k) * self.scale
        # Softmax along the key positions (dimension j)
        attn = attn.softmax(dim=2)
        # Weighted sum of the values
        res = torch.einsum('bijh,bjhd->bihd', attn, v)
        # Concatenate the heads: `[batch_size, seq, n_heads * d_k]`
        res = res.view(batch_size, -1, self.n_heads * self.d_k)
        # Map back to `[batch_size, seq, n_channels]`
        res = self.output(res)

        # Residual connection
        res += x

        # Restore the `[batch_size, n_channels, height, width]` layout
        res = res.permute(0, 2, 1).view(batch_size, n_channels, height, width)
        return res

### 下行块（Down block）

class DownBlock(Module):
    """A ``ResidualBlock`` optionally followed by an ``AttentionBlock``.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by the residual block.
        time_channels: channels of the time embedding.
        has_attn: whether to apply attention after the residual block.
    """

    def __init__(self, in_channels: int, out_channels: int, time_channels: int, has_attn: bool):
        super().__init__()
        self.res = ResidualBlock(in_channels, out_channels, time_channels)
        # Fall back to a no-op when attention is disabled
        self.attn = AttentionBlock(out_channels) if has_attn else nn.Identity()

    def forward(self, x: torch.Tensor, t: torch.Tensor):
        """Run the residual block, then the attention layer (or identity)."""
        return self.attn(self.res(x, t))

### 上行块（Up block）

class UpBlock(Module):
    """A ``ResidualBlock`` optionally followed by an ``AttentionBlock``, for the up path.

    Args:
        in_channels: channels of the feature map coming from the previous layer.
        out_channels: channels produced by the residual block.
        time_channels: channels of the time embedding.
        has_attn: whether to apply attention after the residual block.
    """

    def __init__(self, in_channels: int, out_channels: int, time_channels: int, has_attn: bool):
        super().__init__()
        # The residual block is sized for `in_channels + out_channels` inputs:
        # the U-Net in this file concatenates a stored skip tensor onto `x`
        # along the channel axis before calling an up block.
        self.res = ResidualBlock(in_channels + out_channels, out_channels, time_channels)
        # Fall back to a no-op when attention is disabled
        self.attn = AttentionBlock(out_channels) if has_attn else nn.Identity()

    def forward(self, x: torch.Tensor, t: torch.Tensor):
        """Run the residual block, then the attention layer (or identity)."""
        return self.attn(self.res(x, t))

### 中间块（Middle block）

class MiddleBlock(Module):
    """``ResidualBlock`` -> ``AttentionBlock`` -> ``ResidualBlock`` at a fixed channel count.

    Args:
        n_channels: channels of the feature map (unchanged by this block).
        time_channels: channels of the time embedding.
    """

    def __init__(self, n_channels: int, time_channels: int):
        super().__init__()
        self.res1 = ResidualBlock(n_channels, n_channels, time_channels)
        self.attn = AttentionBlock(n_channels)
        self.res2 = ResidualBlock(n_channels, n_channels, time_channels)

    def forward(self, x: torch.Tensor, t: torch.Tensor):
        """Apply the three layers in sequence; only the residual blocks receive ``t``."""
        hidden = self.res1(x, t)
        hidden = self.attn(hidden)
        return self.res2(hidden, t)

### 放大特征图（上采样）

class Upsample(nn.Module):
    """Upscale the feature map with a stride-2 transposed convolution.

    Args:
        n_channels: number of channels, identical for input and output.
    """

    def __init__(self, n_channels):
        super().__init__()
        self.conv = nn.ConvTranspose2d(
            n_channels,
            n_channels,
            kernel_size=(4, 4),
            stride=(2, 2),
            padding=(1, 1),
        )

    def forward(self, x: torch.Tensor, t: torch.Tensor):
        """Return the convolved ``x``.

        ``t`` is not used; it is accepted so this layer can be called with
        the same ``(x, t)`` signature as ``ResidualBlock``.
        """
        del t  # intentionally unused
        return self.conv(x)

### 缩小特征图（下采样）

class Downsample(nn.Module):
    """Downscale the feature map with a stride-2 3x3 convolution.

    Args:
        n_channels: number of channels, identical for input and output.
    """

    def __init__(self, n_channels):
        super().__init__()
        self.conv = nn.Conv2d(
            n_channels,
            n_channels,
            kernel_size=(3, 3),
            stride=(2, 2),
            padding=(1, 1),
        )

    def forward(self, x: torch.Tensor, t: torch.Tensor):
        """Return the convolved ``x``.

        ``t`` is not used; it is accepted so this layer can be called with
        the same ``(x, t)`` signature as ``ResidualBlock``.
        """
        del t  # intentionally unused
        return self.conv(x)

## U-Net

class UNet(Module):
    """U-Net built from residual/attention blocks with skip connections.

    Args:
        image_channels: number of channels in the image (e.g. 3 for RGB).
        n_channels: number of channels in the initial feature map the image
            is projected to.
        ch_mults: per-resolution channel multipliers. Each entry multiplies
            the channel count carried over from the previous resolution.
        is_attn: per-resolution booleans saying whether to use attention.
        n_blocks: number of ``DownBlock`` layers at each resolution; the up
            path uses ``n_blocks + 1`` ``UpBlock`` layers per resolution.
    """

    def __init__(self, image_channels: int = 3, n_channels: int = 64,
                 ch_mults: Union[Tuple[int, ...], List[int]] = (1, 2, 2, 4),
                 is_attn: Union[Tuple[bool, ...], List[int]] = (False, False, True, True),
                 n_blocks: int = 2):
        super().__init__()

        # Number of resolutions
        n_resolutions = len(ch_mults)
        last_level = n_resolutions - 1
        # Every block receives a time embedding of this width
        time_channels = n_channels * 4

        # Project the image into the initial feature map
        self.image_proj = nn.Conv2d(image_channels, n_channels, kernel_size=(3, 3), padding=(1, 1))

        # Time embedding layer
        self.time_emb = TimeEmbedding(time_channels)

        # #### First half of the U-Net: decreasing resolution
        down = []
        out_channels = in_channels = n_channels
        for level in range(n_resolutions):
            # Channel count at this resolution
            out_channels = in_channels * ch_mults[level]
            for _ in range(n_blocks):
                down.append(DownBlock(in_channels, out_channels, time_channels, is_attn[level]))
                in_channels = out_channels
            # Downsample at every resolution except the last
            if level < last_level:
                down.append(Downsample(in_channels))
        self.down = nn.ModuleList(down)

        # Middle block
        self.middle = MiddleBlock(out_channels, time_channels)

        # #### Second half of the U-Net: increasing resolution
        up = []
        in_channels = out_channels
        for level in reversed(range(n_resolutions)):
            # `n_blocks` blocks that keep the channel count
            out_channels = in_channels
            for _ in range(n_blocks):
                up.append(UpBlock(in_channels, out_channels, time_channels, is_attn[level]))
            # One final block that reduces the channel count
            out_channels = in_channels // ch_mults[level]
            up.append(UpBlock(in_channels, out_channels, time_channels, is_attn[level]))
            in_channels = out_channels
            # Upsample at every resolution except the first
            if level > 0:
                up.append(Upsample(in_channels))
        self.up = nn.ModuleList(up)

        # Final normalization, activation and convolution back to image channels
        self.norm = nn.GroupNorm(8, n_channels)
        self.act = Swish()
        self.final = nn.Conv2d(in_channels, image_channels, kernel_size=(3, 3), padding=(1, 1))

    def forward(self, x: torch.Tensor, t: torch.Tensor):
        """Run the U-Net.

        Args:
            x: tensor of shape ``[batch_size, in_channels, height, width]``.
            t: tensor of shape ``[batch_size]``.
        """
        # Time step embeddings
        t = self.time_emb(t)

        # Image projection
        x = self.image_proj(x)

        # Outputs kept for the skip connections, starting with the projection
        skips = [x]

        # First half of the U-Net
        for layer in self.down:
            x = layer(x, t)
            skips.append(x)

        # Middle (bottom)
        x = self.middle(x, t)

        # Second half of the U-Net
        for layer in self.up:
            if not isinstance(layer, Upsample):
                # Concatenate the most recent stored output along the channel axis
                x = torch.cat((x, skips.pop()), dim=1)
            x = layer(x, t)

        # Final normalization, activation and convolution
        return self.final(self.act(self.norm(x)))