Attention Free Transformer (AFT) Experiment

This is an annotated PyTorch experiment to train an AFT model.

This is based on the general training loop and configurations for auto-regressive NLP tasks.

14import torch
15
16from labml import experiment
17from labml.configs import option
18from labml_helpers.module import Module
19from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
20from labml_nn.transformers import TransformerConfigs, Encoder
21from labml_nn.transformers.utils import subsequent_mask

Simple autoregressive model

This consists of a token embedding layer, transformer encoder, and a final linear layer that gives token logits.

class AutoregressiveTransformer(Module):
    """
    Simple autoregressive model.

    This consists of a token embedding layer, a transformer encoder, and a
    final linear layer that gives token logits.
    """

    def __init__(self, encoder: Encoder, src_embed: Module, generator: Module):
        """
        :param encoder: transformer encoder applied to the embedded tokens
        :param src_embed: token embedding module (with positional encodings)
        :param generator: final layer that gives the token logits
        """
        super().__init__()
        self.src_embed = src_embed
        self.encoder = encoder
        self.generator = generator

        # The mask is created lazily on the first call to `forward`
        self.mask = None

    def forward(self, x: torch.Tensor):
        """
        Run the model on a batch of token indices.

        :param x: input tokens; `len(x)` (the first dimension) is used as the
            sequence length when building the mask
        :return: a tuple `(logits, None)`; the second value is a placeholder
            for state, since the trainer is also used with RNNs
        """
        seq_len = len(x)

        # (Re)create the subsequent mask if it is not initialized, if the
        # sequence length changed, or if the input now lives on another device.
        # The mask is a plain attribute rather than a registered buffer, so it
        # does not follow the module when the module is moved between devices.
        if (self.mask is None
                or self.mask.size(0) != seq_len
                or self.mask.device != x.device):
            # Subsequent mask, will mask out tokens from seeing future tokens
            self.mask = subsequent_mask(seq_len).to(x.device)

        # Get the token embeddings with positional encodings
        x = self.src_embed(x)
        # Transformer encoder
        x = self.encoder(x, self.mask)
        # Get logits
        x = self.generator(x)

        # Return results (second value is for state, since our trainer is
        # used with RNNs also)
        return x, None

Configurations

This inherits from NLPAutoRegressionConfigs

class Configs(NLPAutoRegressionConfigs):
    """
    Configurations for the AFT experiment.

    This inherits from `NLPAutoRegressionConfigs`.
    """

    # Autoregressive AFT model
    model: AutoregressiveTransformer

    # Transformer configurations
    transformer: TransformerConfigs

    # Local window size handed to the `AFTLocal` module
    local_window_size: int = 32

Transformer configurations

@option(Configs.transformer, 'Transformer')
def _transformer_configs(c: Configs):
    """
    Build the transformer configurations for the experiment.

    :param c: experiment configurations
    :return: a `TransformerConfigs` whose encoder self-attention is replaced
        with an `AFTLocal` module
    """
    conf = TransformerConfigs()

    # Set the vocabulary sizes for embeddings and generating logits
    conf.n_src_vocab = c.n_tokens
    conf.n_tgt_vocab = c.n_tokens

    # Set the embedding size
    conf.d_model = c.d_model

    # Replace self-attention with an AFT Local Module
    from labml_nn.transformers.aft import AFTLocal
    conf.encoder_attn = AFTLocal(c.d_model, c.seq_len, c.local_window_size)

    return conf

Create an auto-regressive model

@option(Configs.model)
def _model(c: Configs):
    """
    Create an auto-regressive model.

    :param c: experiment configurations
    :return: an `AutoregressiveTransformer` built from the encoder, source
        embedding and generator of `c.transformer`, moved to `c.device`
    """
    m = AutoregressiveTransformer(c.transformer.encoder,
                                  c.transformer.src_embed,
                                  c.transformer.generator).to(c.device)

    return m
def main():
    """Create the `aft` experiment, configure it and run training."""
    # Create experiment
    experiment.create(name="aft")

    # Create configs
    conf = Configs()

    # Override configurations
    experiment.configs(conf, {
        # Use character level tokenizer
        'tokenizer': 'character',
        # Prompt separator is blank
        'prompt_separator': '',
        # Starting prompt for sampling
        'prompt': 'It is ',
        # Use Tiny Shakespeare dataset
        'text': 'tiny_shakespeare',

        # Use a context size of 256
        'seq_len': 256,
        # Train for 128 epochs
        'epochs': 128,
        # Batch size
        'batch_size': 32,
        # Switch between training and validation 10 times per epoch
        'inner_iterations': 10,

        # Embedding size
        'd_model': 128,
        # FFN hidden dimension size
        'transformer.ffn.d_ff': 256,

        # Optimizer
        'optimizer.optimizer': 'Noam',
        'optimizer.learning_rate': 1.,
    })

    # Set models for saving and loading
    experiment.add_pytorch_models({'model': conf.model})

    # Start the experiment
    with experiment.start():
        # Run training
        conf.run()

# Run the experiment when this file is executed as a script
if __name__ == '__main__':
    main()