This trains a simple transformer, introduced in Attention Is All You Need, on an NLP auto-regression task with the Tiny Shakespeare dataset.
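As a rough illustration of the character-level auto-regression task (this snippet is not part of the experiment; the actual batching is handled by NLPAutoRegressionConfigs), the model is trained to predict each character from the characters before it, so the targets are simply the inputs shifted by one position:

import torch

# Hypothetical example text and character vocabulary, for illustration only
text = "It is a truth universally acknowledged"
stoi = {ch: i for i, ch in enumerate(sorted(set(text)))}
ids = torch.tensor([stoi[ch] for ch in text])

x = ids[:-1]  # input: every character except the last
y = ids[1:]   # target: the same sequence shifted left by one character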
import torch
from torch import nn

from labml import experiment
from labml.configs import option
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
from labml_nn.transformers import TransformerConfigs, Encoder
from labml_nn.transformers.utils import subsequent_mask


class AutoregressiveTransformer(nn.Module):

encoder is the transformer Encoder, src_embed is the token embedding module (with positional encodings), and generator is the final fully connected layer that gives the logits.

    def __init__(self, encoder: Encoder, src_embed: nn.Module, generator: nn.Module):
        super().__init__()
        self.src_embed = src_embed
        self.encoder = encoder
        self.generator = generator

The mask will be initialized on the first call.

        self.mask = None

    def forward(self, x: torch.Tensor):

Create the subsequent mask if it is not initialized yet, or if its size does not match the current sequence length.
        if self.mask is None or self.mask.size(0) != len(x):

The subsequent mask will mask out tokens from seeing future tokens.

            self.mask = subsequent_mask(len(x)).to(x.device)

Get the token embeddings with positional encodings.

        x = self.src_embed(x)

Transformer encoder.

        x = self.encoder(x, self.mask)

Get logits.

        x = self.generator(x)

Return results (the second value is for state, since our trainer is also used with RNNs).

        return x, None
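The subsequent (causal) mask used above is what keeps the model auto-regressive: position i may only attend to positions up to i. As a minimal sketch of such a mask (the exact shape and dtype returned by labml_nn's subsequent_mask may differ), a lower-triangular boolean matrix captures the idea:

import torch

# Illustrative causal mask for a sequence length of 5 (chosen only for the example);
# entry [i, j] is True when token i is allowed to attend to token j, i.e. j <= i.
seq_len = 5
causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))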
class Configs(NLPAutoRegressionConfigs):

GPT model

    model: AutoregressiveTransformer

Transformer

    transformer: TransformerConfigs


@option(Configs.transformer, 'Transformer')
def _transformer_configs(c: Configs):

We use our configurable transformer implementation.

    conf = TransformerConfigs()

Set the vocabulary sizes for embeddings and generating logits.

    conf.n_src_vocab = c.n_tokens
    conf.n_tgt_vocab = c.n_tokens
    conf.d_model = c.d_model

    return conf
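For readers unfamiliar with labml's configuration system: the @option decorator registers a named calculator function for a config attribute, and when the attribute is needed the selected calculator is called with the configs object to compute its value. A minimal, hypothetical sketch (the class and names below are made up for illustration):

from labml.configs import BaseConfigs, option

class ExampleConfigs(BaseConfigs):
    name: str = 'world'
    greeting: str

@option(ExampleConfigs.greeting, 'english')
def _english_greeting(c: ExampleConfigs):
    # Computed from other config values, just like _transformer_configs above
    return f'Hello, {c.name}!'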
Create the GPT model and initialize weights.

@option(Configs.model)
def _model(c: Configs):
    m = AutoregressiveTransformer(c.transformer.encoder,
                                  c.transformer.src_embed,
                                  c.transformer.generator).to(c.device)

    return m


def main():

Create experiment.
    experiment.create(name="transformer")

Create configs.

    conf = Configs()

Override configurations.

    experiment.configs(conf, {

Use character level tokenizer.

        'tokenizer': 'character',

Prompt separator is blank.

        'prompt_separator': '',

Starting prompt for sampling.

        'prompt': 'It is ',

Use Tiny Shakespeare dataset.

        'text': 'tiny_shakespeare',

Use a context size of 512.

        'seq_len': 512,

Train for 32 epochs.

        'epochs': 32,

Batch size of 16.

        'batch_size': 16,

Switch between training and validation 10 times per epoch.

        'inner_iterations': 10,

Model size.

        'd_model': 256,
        'transformer.n_heads': 16,
        'transformer.ffn.d_ff': 1024,

Use Noam optimizer.

        'optimizer.optimizer': 'Noam',
        'optimizer.learning_rate': 1.,
    })
Set models for saving and loading.

    experiment.add_pytorch_models({'model': conf.model})

Start the experiment.

    with experiment.start():

Run training.

        conf.run()


if __name__ == '__main__':
    main()
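A note on the optimizer settings above: with the Noam scheme from Attention Is All You Need, the configured learning rate of 1. acts as a multiplicative factor on a schedule that warms up linearly and then decays with the inverse square root of the step count, scaled by d_model^-0.5. A rough sketch of that schedule (the warmup of 4000 steps is an assumption for illustration; labml_nn's Noam optimizer may use a different default):

def noam_lr(step: int, d_model: int = 256, factor: float = 1.0, warmup: int = 4000) -> float:
    # lr = factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
    step = max(step, 1)
    return factor * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)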