Pay Attention to MLPs (gMLP) Experiment

This is an annotated PyTorch experiment to train a gMLP model. The paper also applies Stochastic Depth regularization, where some layers are randomly removed during training. We have not implemented that here.

This is based on the training loop and configurations of a simple transformer auto-regressive NLP task.

16from labml import experiment
17from labml.configs import option
18from labml_nn.transformers import TransformerConfigs
19from labml_nn.transformers.basic.autoregressive_experiment import Configs as BasicAutoRegressionConfigs
20from labml_nn.transformers.gmlp import GMLPBlock
class Configs(BasicAutoRegressionConfigs):
    """
    Configurations for the gMLP experiment.

    This inherits from `BasicAutoRegressionConfigs`, the training loop and
    configurations of the basic auto-regressive transformer experiment, and
    adds the gMLP block together with the size of its projection layer.
    """

    # Transformer configurations; selects the `'gMLP'` option, which is
    # registered for `Configs.transformer` further down in this file
    transformer: TransformerConfigs = 'gMLP'
    # gMLP block
    gmlp: GMLPBlock
    # `d_ffn` for the gMLP projection layer
    d_ffn: int = 2048

Create a gMLP block

@option(Configs.gmlp, 'gMLP')
def _gmlp_configs(c: Configs):
    """
    Create a gMLP block.

    Registered as the `'gMLP'` option of `Configs.gmlp`.

    :param c: experiment configurations; `d_model`, `d_ffn` and `seq_len`
        are read from it
    :return: a `GMLPBlock` constructed from those three values
    """
    return GMLPBlock(c.d_model, c.d_ffn, c.seq_len)

Transformer configurations

@option(Configs.transformer, 'gMLP')
def _transformer_configs(c: Configs):
    """
    Create the transformer configurations for the gMLP experiment.

    Registered as the `'gMLP'` option of `Configs.transformer`.

    :param c: experiment configurations; `n_tokens`, `d_model` and `gmlp`
        are read from it
    :return: a `TransformerConfigs` whose encoder layer is the gMLP block
    """
    conf = TransformerConfigs()
    # Set the vocabulary sizes for embeddings and generating logits
    conf.n_src_vocab = c.n_tokens
    conf.n_tgt_vocab = c.n_tokens
    # Set model size
    conf.d_model = c.d_model
    # Replace the encoder layer with a gMLP layer
    conf.encoder_layer = c.gmlp

    return conf
def main():
    """
    Create the gMLP experiment, apply the configuration overrides, and train.
    """
    # Create experiment
    experiment.create(name="gMLP")
    # Create configs
    conf = Configs()
    # Override configurations
    experiment.configs(conf, {
        # Use character level tokenizer
        'tokenizer': 'character',
        # Prompt separator is blank
        'prompt_separator': '',
        # Starting prompt for sampling
        'prompt': 'It is ',
        # Use Tiny Shakespeare dataset
        'text': 'tiny_shakespeare',

        # Use a context size of 256
        'seq_len': 256,
        # Train for 128 epochs
        'epochs': 128,
        # Batch size
        'batch_size': 32,
        # Switch between training and validation 10 times per epoch
        'inner_iterations': 10,

        # Model size
        'd_model': 512,
        'd_ffn': 2048,

        # Select the 'Noam' optimizer option with a learning rate of 1
        'optimizer.optimizer': 'Noam',
        'optimizer.learning_rate': 1.,
    })

    # Set models for saving and loading
    experiment.add_pytorch_models({'model': conf.model})

    # Start the experiment
    with experiment.start():
        # Run training
        # NOTE(review): `run` is expected to be provided by the inherited
        # training-loop configs (`BasicAutoRegressionConfigs`) - confirm.
        conf.run()


# Run the experiment only when this file is executed as a script
if __name__ == '__main__':
    main()