#

深度 Q 网络 (DQN) 模型

12import torch
13from torch import nn
14
15from labml_helpers.module import Module

#

决斗网络 ⚔️ $Q$ 值模型

我们使用决斗网络（Dueling Network）来计算 $Q$ 值。决斗网络架构背后的直觉是：在大多数状态下，选择哪个动作无关紧要；而在某些状态下，动作的选择至关重要。决斗网络可以很好地体现这一点。

$$Q^{\pi}(s, a) = V^{\pi}(s) + A^{\pi}(s, a), \qquad \mathbb{E}_{a \sim \pi(s)} \big[ A^{\pi}(s, a) \big] = 0$$

因此，我们为 $V$ 和 $A$ 创建两个网络，然后由它们得到 $Q$ ： $Q (s, a) = V (s) + (A (s, a) - \frac{1}{∣ A ∣} \sum_{a^{'} \in A} A (s, a^{'}))$ 。 $V$ 和 $A$ 网络共享初始层。

18class Model(Module):

#

49    def __init__(self):
50        super().__init__()
51        self.conv = nn.Sequential(

#

第一个卷积层接收一个 $84 \times 84$ 的帧，并输出一个 $20 \times 20$ 的帧

54            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
55            nn.ReLU(),

#

第二个卷积层获取一个 $20 \times 20$ 帧并生成一个 $9 \times 9$ 帧

59            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
60            nn.ReLU(),

#

第三个卷积层获取一个 $9 \times 9$ 帧并生成一个 $7 \times 7$ 帧

64            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
65            nn.ReLU(),
66        )

#

全连接层接收第三个卷积层输出的展平帧，并输出 $512$ 个特征

71        self.lin = nn.Linear(in_features=7 * 7 * 64, out_features=512)
72        self.activation = nn.ReLU()

#

这个头给出了状态值 $V$

75        self.state_value = nn.Sequential(
76            nn.Linear(in_features=512, out_features=256),
77            nn.ReLU(),
78            nn.Linear(in_features=256, out_features=1),
79        )

#

这个头给出了动作值 $A$

81        self.action_value = nn.Sequential(
82            nn.Linear(in_features=512, out_features=256),
83            nn.ReLU(),
84            nn.Linear(in_features=256, out_features=4),
85        )

#

87    def forward(self, obs: torch.Tensor):

#

卷积

89        h = self.conv(obs)

#

调整形状（reshape）以输入线性层

91        h = h.reshape((-1, 7 * 7 * 64))

#

线性层

94        h = self.activation(self.lin(h))

#

$A$

97        action_value = self.action_value(h)

#

$V$

99        state_value = self.state_value(h)

#

$A (s, a) - \frac{1}{∣ A ∣} \sum_{a^{'} \in A} A (s, a^{'})$

102        action_score_centered = action_value - action_value.mean(dim=-1, keepdim=True)

#

$Q (s, a) = V (s) + (A (s, a) - \frac{1}{∣ A ∣} \sum_{a^{'} \in A} A (s, a^{'}))$

104        q = state_value + action_score_centered
105
106        return q

深度 Q 网络 (DQN) 模型

决斗网络 ⚔️ $Q$ 值模型

决斗网络 ⚔️ $Q$ 值模型