This experiment trains a feedback transformer model on an auto-regressive language modelling task. You can pick either the original feedback transformer or the updated version where the keys and values are precalculated.
Here's a Colab notebook for training a feedback transformer on the Tiny Shakespeare dataset.
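In the feedback transformer, every layer attends over a single shared memory rather than over the previous layer's outputs: the memory vector for each earlier step is a learned, softmax-weighted average of all layer outputs (including the embeddings) at that step. The following is a hypothetical sketch of that memory update, not the labml_nn implementation; here `layers` stands for any callables that attend over the memory.

import torch

def feedback_memory_step(x_t, layers, memory, layer_weights):
    # x_t: embedding of the current token, shape [batch_size, d_model]
    # memory: list of memory vectors from earlier steps
    # layer_weights: learnable tensor of shape [n_layers + 1]
    outputs = [x_t]
    for layer in layers:
        # every layer attends over the same shared memory of earlier steps
        x_t = layer(x_t, memory)
        outputs.append(x_t)
    # the memory vector for this step is a softmax-weighted sum of all layer outputs
    weights = torch.softmax(layer_weights, dim=0)
    memory.append((weights[:, None, None] * torch.stack(outputs)).sum(dim=0))
    return x_t, memory

Because the memory at each step mixes all layers of that step, tokens have to be processed one step at a time; the updated version keeps the memory's keys and values precalculated so they do not have to be recomputed from the whole memory at every step.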
import torch
from labml import experiment
from labml.configs import option
from labml.utils.pytorch import get_modules
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
from torch import nn
class AutoregressiveModel(nn.Module):
    def __init__(self, n_vocab: int, d_model: int, transformer: nn.Module):
        super().__init__()
Token embedding module
        self.src_embed = nn.Embedding(n_vocab, d_model)
The feedback transformer
        self.transformer = transformer
Final layer to generate logits over the vocabulary
        self.generator = nn.Linear(d_model, n_vocab)

    def forward(self, x: torch.Tensor):
Embed the tokens
        x = self.src_embed(x)
Run it through the transformer
        res = self.transformer(x)
Generate logits of the next token. The second returned value is the state, which this model does not use.
        return self.generator(res), None
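For reference, a hypothetical usage sketch of this model; the shapes assume labml's sequence-first batches and the sizes are examples only.

seq_len, batch_size, n_vocab = 128, 64, 65            # example sizes only
x = torch.randint(0, n_vocab, (seq_len, batch_size))  # a batch of token ids
logits, state = model(x)                              # `model` is an AutoregressiveModel
assert logits.shape == (seq_len, batch_size, n_vocab) # next-token logits at every position
assert state is None                                  # no state is returned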
class Configs(NLPAutoRegressionConfigs):
    model: AutoregressiveModel

Token embedding size
    d_model: int = 512
Number of attention heads
    heads: int = 8
Dropout probability
    dropout: float = 0.0
Size of the position-wise feed-forward hidden layer
    d_ff: int = 2048
Number of transformer layers
    n_layers: int = 6
Create the original feedback transformer.
@option(Configs.model)
def feedback_transformer(c: Configs):
    from labml_nn.transformers.feedback import FeedbackTransformer, FeedbackTransformerLayer, \
        FeedbackAttention, FeedForward

    return AutoregressiveModel(
        c.n_tokens, c.d_model,
        FeedbackTransformer(
            FeedbackTransformerLayer(d_model=c.d_model,
                                     attn=FeedbackAttention(c.heads, c.d_model, c.dropout),
                                     feed_forward=FeedForward(c.d_model, c.d_ff, c.dropout),
                                     dropout_prob=c.dropout),
            c.n_layers)).to(c.device)
Create the updated feedback transformer, with precalculated keys and values.
@option(Configs.model)
def feedback_transformer_kv(c: Configs):
    from labml_nn.transformers.feedback import FeedbackTransformerKV, FeedbackTransformerLayer, \
        FeedbackAttention, FeedForward

    return AutoregressiveModel(
        c.n_tokens, c.d_model,
        FeedbackTransformerKV(
            FeedbackTransformerLayer(d_model=c.d_model,
                                     attn=FeedbackAttention(c.heads, c.d_model, c.dropout,
                                                            is_kv_precomputed=True),
                                     feed_forward=FeedForward(c.d_model, c.d_ff, c.dropout),
                                     dropout_prob=c.dropout),
            c.n_layers, c.d_model, c.heads)).to(c.device)
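Both functions are registered with @option(Configs.model), so either implementation can be selected by its name when overriding the configurations. A minimal sketch of such an override, in the same style as main() below:

experiment.create(name="feedback_transformer")
conf = Configs()
# Select the option by name; everything else keeps its default
experiment.configs(conf, {'model': 'feedback_transformer'})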
def main():
Create experiment
    experiment.create(name="feedback_transformer")
Create configs
    conf = Configs()
Load configurations
    experiment.configs(conf,
A dictionary of configurations to override
                       {'tokenizer': 'character',
                        'text': 'tiny_shakespeare',
                        'optimizer.learning_rate': 1.0,
                        'optimizer.optimizer': 'Noam',
                        'prompt': 'It is',
                        'prompt_separator': '',
Use 'feedback_transformer' for the original feedback transformer
                        'model': 'feedback_transformer_kv',

                        'train_loader': 'shuffled_train_loader',
                        'valid_loader': 'shuffled_valid_loader',

                        'seq_len': 128,
                        'epochs': 128,
                        'batch_size': 64,
                        'inner_iterations': 25})
Set models for saving and loading
    experiment.add_pytorch_models(get_modules(conf))
Start the experiment
    with experiment.start():
Run the training loop
        conf.run()


if __name__ == '__main__':
    main()