import copy

import torch
import torch.nn as nn

from labml import experiment
from labml.configs import option
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
from labml_nn.normalization.deep_norm import DeepNormTransformerLayer
from labml_nn.transformers import MultiHeadAttention
from labml_nn.transformers.feed_forward import FeedForward


class AutoregressiveTransformer(nn.Module):
    def __init__(self, n_tokens: int, d_model: int, n_layers: int, layer: DeepNormTransformerLayer):
        """
        * `n_tokens` is the number of tokens in the vocabulary
        * `d_model` is the embedding size
        * `n_layers` is the number of transformer layers
        * `layer` is the layer. We use `n_layers` copies of this for the transformer.
        """
        super().__init__()
        # Transformer with `n_layers` layers
        self.transformer = nn.Sequential(*[copy.deepcopy(layer) for _ in range(n_layers)])
        # Token embedding layer
        self.emb = nn.Embedding(n_tokens, d_model)
        # Readout layer
        self.readout = nn.Linear(d_model, n_tokens)

    def forward(self, x: torch.Tensor):
        """
        * `x` is the input tokens of shape `[seq_len, batch_size]`
        """
        # Get the token embeddings
        x = self.emb(x)
        # Transformer encoder
        x = self.transformer(x)
        # Get logits
        x = self.readout(x)

        # Return results
        return x, None

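
# A minimal sketch showing how `AutoregressiveTransformer` is wired together and
# what shapes it expects. The helper name `_shape_check` and the concrete numbers
# (2 layers, a 65-token vocabulary, sequence length 256, batch size 16) are
# illustrative assumptions, not part of the experiment; this function is never
# called during training.
def _shape_check():
    layer = DeepNormTransformerLayer(d_model=64,
                                     deep_norm_alpha=(2. * 2) ** (1. / 4.),
                                     deep_norm_beta=(8. * 2) ** -(1. / 4.),
                                     feed_forward=FeedForward(d_model=64, d_ff=64 * 4),
                                     self_attn=MultiHeadAttention(4, 64, dropout_prob=0.0))
    model = AutoregressiveTransformer(n_tokens=65, d_model=64, n_layers=2, layer=layer)
    # Token indexes of shape `[seq_len, batch_size]`
    tokens = torch.randint(0, 65, (256, 16))
    # Logits come back with shape `[seq_len, batch_size, n_tokens]`
    logits, _ = model(tokens)
    assert logits.shape == (256, 16, 65)
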
class Configs(NLPAutoRegressionConfigs):
    # Model
    model: AutoregressiveTransformer

    # Number of layers
    n_layers: int = 32

    # α and β for DeepNorm
    deep_norm_alpha: float
    deep_norm_beta: float

    # Number of heads in the attention
    n_heads: int = 4
    # Embedding size
    d_model: int = 64
    # Size of each attention head
    d_k: int = 16
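    # Note: with the defaults above, the usual per-head split gives
    # `d_k = d_model / n_heads = 64 / 4 = 16`, consistent with the value here
    # (assuming `MultiHeadAttention` divides `d_model` evenly across the heads).
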
@option(Configs.deep_norm_alpha)
def _deep_norm_alpha(c: Configs):
    return (2. * c.n_layers) ** (1. / 4.)


@option(Configs.deep_norm_beta)
def _deep_norm_beta(c: Configs):
    return (8. * c.n_layers) ** -(1. / 4.)

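
# Written out, the two option calculators above compute the DeepNorm constants for
# a model with N = `n_layers` layers:
#
#     alpha = (2 * N) ** (1 / 4)
#     beta = (8 * N) ** (-1 / 4)
#
# For example, with the `n_layers = 50` set in `main` below, alpha ≈ 3.16 and
# beta ≈ 0.22.
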
@option(Configs.model)
def _model(c: Configs):
    m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.n_layers,
                                  DeepNormTransformerLayer(d_model=c.d_model,
                                                           deep_norm_alpha=c.deep_norm_alpha,
                                                           deep_norm_beta=c.deep_norm_beta,
                                                           feed_forward=FeedForward(d_model=c.d_model,
                                                                                    d_ff=c.d_model * 4),
                                                           self_attn=MultiHeadAttention(c.n_heads, c.d_model,
                                                                                        dropout_prob=0.0)))

    return m.to(c.device)

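
# Note on how this calculator is used (an assumption about labml's `@option`
# mechanism, not stated in this file): option calculators run lazily, so `_model`
# is only invoked when `conf.model` is first accessed, e.g. by
# `experiment.add_pytorch_models` in `main` below, after the overrides passed to
# `experiment.configs` have been applied.
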
def main():
    # Create experiment
    experiment.create(name="deep_norm", writers={'screen', 'web_api'})
    # Create configs
    conf = Configs()
    # Override configurations
    experiment.configs(conf, {
        # Use character level tokenizer
        'tokenizer': 'character',
        # Prompt separator is blank
        'prompt_separator': '',
        # Starting prompt for sampling
        'prompt': 'It is ',
        # Use Tiny Shakespeare dataset
        'text': 'tiny_shakespeare',

        # Use a context size of 256
        'seq_len': 256,
        # Train for 32 epochs
        'epochs': 32,
        # Batch size
        'batch_size': 16,
        # Switch between training and validation 10 times per epoch
        'inner_iterations': 10,

        # Number of layers
        'n_layers': 50,

        # Adam optimizer with no warmup
        'optimizer.optimizer': 'Adam',
        'optimizer.learning_rate': 1.25e-4,
    })

    # Set model(s) for saving and loading
    experiment.add_pytorch_models({'model': conf.model})

    # Start the experiment
    with experiment.start():
        # Run training
        conf.run()


if __name__ == '__main__':
    main()