MLP Mixer Experiment

This is an annotated PyTorch experiment to train an MLP Mixer model.

12from labml import experiment
13from labml.configs import option
14from labml_nn.transformers import TransformerConfigs
15from labml_nn.transformers.configs import FeedForwardConfigs
16from labml_nn.transformers.mlm.experiment import TransformerMLM, Configs as MLMConfigs

Configurations

This inherits from MLMConfigs where we define an experiment for Masked Language Models.

class Configs(MLMConfigs):
    """
    ## Configurations

    This inherits from `MLMConfigs`, where the experiment for
    Masked Language Models is defined, and adds the configuration
    for the mixing MLP.
    """

    # Configurable feed-forward network used as the mixing MLP
    mix_mlp: FeedForwardConfigs

The mixing MLP configurations

@option(Configs.mix_mlp)
def _mix_mlp_configs(c: Configs):
    """
    ### The mixing MLP configurations

    Builds the `FeedForwardConfigs` registered as the option for
    `Configs.mix_mlp`.

    :param c: the experiment configurations; only `c.seq_len` is read here
    :return: the feed-forward configurations for the mixing MLP
    """

    conf = FeedForwardConfigs()
    # Size of the MLP is the sequence length, because it is applied across tokens
    conf.d_model = c.seq_len
    # The paper suggests GELU activation
    conf.activation = 'GELU'

    return conf

Transformer configurations

@option(Configs.transformer)
def _transformer_configs(c: Configs):
    """
    ### Transformer configurations

    Builds the `TransformerConfigs` registered as the option for
    `Configs.transformer`, with the encoder attention module replaced
    by an `MLPMixer`.

    :param c: the experiment configurations; reads `c.n_tokens`,
        `c.d_model` and `c.mix_mlp.ffn`
    :return: the transformer configurations
    """

    conf = TransformerConfigs()
    # Set the vocabulary sizes for embeddings and generating logits
    conf.n_src_vocab = c.n_tokens
    conf.n_tgt_vocab = c.n_tokens
    # Embedding size
    conf.d_model = c.d_model
    # Change attention module to MLPMixer, built around the mixing MLP's
    # feed-forward network. The import is kept function-local, as in the original.
    from labml_nn.transformers.mlp_mixer import MLPMixer
    conf.encoder_attn = MLPMixer(c.mix_mlp.ffn)

    return conf


def main():
    """
    Create the `mlp_mixer_mlm` experiment, apply the configuration
    overrides, register the model for saving and loading, and run training.
    """
    # Create experiment
    experiment.create(name="mlp_mixer_mlm")
    # Create configs
    conf = Configs()
    # Override configurations
    experiment.configs(conf, {
        # Batch size
        'batch_size': 64,
        # Sequence length of 32. We use a short sequence length to train faster.
        # Otherwise MLM models take forever to train.
        'seq_len': 32,

        # Train for 1024 epochs.
        'epochs': 1024,
        # Switch between training and validation 1 time per epoch
        'inner_iterations': 1,

        # Transformer configurations
        'd_model': 128,
        'transformer.ffn.d_ff': 256,
        'transformer.n_heads': 8,
        'transformer.n_layers': 6,
        'transformer.ffn.activation': 'GELU',

        # Mixer MLP hidden layer size
        'mix_mlp.d_ff': 128,

        # Use the Noam optimizer
        'optimizer.optimizer': 'Noam',
        'optimizer.learning_rate': 1.,
    })

    # Set models for saving and loading
    experiment.add_pytorch_models({'model': conf.model})

    # Start the experiment
    with experiment.start():
        # Run training
        conf.run()

# Run the experiment only when this file is executed as a script
if __name__ == '__main__':
    main()