Primer EZ Experiment

This is an annotated PyTorch experiment to train a Primer EZ transformer.

This is based on our vanilla transformer auto-regressive experiment; we use the same setup and add the Primer EZ modifications.

from labml import experiment
from labml.configs import option
from labml_nn.transformers import TransformerConfigs
from labml_nn.transformers.basic.autoregressive_experiment import Configs
from labml_nn.transformers.configs import FeedForwardConfigs
from labml_nn.transformers.primer_ez import SquaredReLU

Add the Squared ReLU activation option to the configurable feed forward network.

@option(FeedForwardConfigs.activation, 'SquaredReLU')
def _squared_relu():
    return SquaredReLU()
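The imported SquaredReLU module computes relu(x) squared, i.e. y = max(x, 0)^2, the feed forward activation Primer settled on. A minimal, illustrative equivalent (the class name below is only for this sketch, not the labml_nn implementation):

import torch
from torch import nn


class SquaredReLUSketch(nn.Module):
    """Squared ReLU activation: max(x, 0) ** 2."""

    def __init__(self):
        super().__init__()
        self.relu = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Apply ReLU, then square element-wise
        return self.relu(x) ** 2
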
Add the Multi-DConv-Head Attention option to the configurable transformer.

@option(TransformerConfigs.encoder_attn, 'MultiDConvHeadAttention')
def _d_conv_mha(c: TransformerConfigs):
    from labml_nn.transformers.primer_ez import MultiDConvHeadAttention
    return MultiDConvHeadAttention(c.n_heads, c.d_model, dropout_prob=c.dropout)
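MultiDConvHeadAttention adds Primer's other robust modification: a 3-wide, causal, depth-wise convolution along the sequence dimension, applied to the query, key and value of each head right after their linear projections. A minimal sketch of that spatial depth-wise convolution, assuming a [seq_len, batch_size, heads, d_k] tensor layout (the class name and layout here are illustrative, not the labml_nn implementation):

import torch
from torch import nn


class SpatialDepthWiseConvSketch(nn.Module):
    """Causal depth-wise convolution over the sequence dimension."""

    def __init__(self, d_k: int, kernel_size: int = 3):
        super().__init__()
        self.kernel_size = kernel_size
        # groups=d_k makes the convolution depth-wise: one kernel per channel
        self.conv = nn.Conv1d(d_k, d_k, kernel_size=kernel_size,
                              padding=kernel_size - 1, groups=d_k)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [seq_len, batch_size, heads, d_k] -- per-head query, key or value
        seq_len, batch_size, heads, d_k = x.shape
        # Fold heads into the batch dimension and convolve over the sequence
        x = x.permute(1, 2, 3, 0).reshape(batch_size * heads, d_k, seq_len)
        x = self.conv(x)
        # Crop the trailing positions so each position only sees the past
        x = x[:, :, :-(self.kernel_size - 1)]
        return x.reshape(batch_size, heads, d_k, seq_len).permute(3, 0, 1, 2)

The attention computation itself is unchanged; the convolution is only a pre-processing step on the projected query, key and value tensors.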

Add the Multi Depth-wise Shared Conv Head Attention option to the configurable transformer.

📝 This is a variation we tried

@option(TransformerConfigs.encoder_attn, 'MultiDSharedConvHeadAttention')
def _d_shared_conv_mha(c: TransformerConfigs):
    from labml_nn.transformers.primer_ez.variations import MultiDSharedConvHeadAttention
    return MultiDSharedConvHeadAttention(c.n_heads, c.d_model, dropout_prob=c.dropout)

Add the Multi Depth-wise Per Head Conv Head Attention option to the configurable transformer.

📝 This is a variation we tried

@option(TransformerConfigs.encoder_attn, 'MultiDPHConvHeadAttention')
def _d_per_head_conv_mha(c: TransformerConfigs):
    from labml_nn.transformers.primer_ez.variations import MultiDPHConvHeadAttention
    return MultiDPHConvHeadAttention(c.n_heads, c.d_model, dropout_prob=c.dropout)

def main():

Create experiment

    experiment.create(name="primer_ez")

Create configs

    conf = Configs()

Override configurations

    experiment.configs(conf, {

Use character level tokenizer

        'tokenizer': 'character',

Prompt separator is blank

        'prompt_separator': '',

Starting prompt for sampling

        'prompt': 'It is ',

Use Tiny Shakespeare dataset

        'text': 'tiny_shakespeare',

Use a context size of 256

        'seq_len': 256,

Train for 128 epochs

        'epochs': 128,

Batch size

        'batch_size': 32,

Switch between training and validation 10 times per epoch

        'inner_iterations': 10,

Model size

        'd_model': 512,
        'transformer.ffn.d_ff': 2048,

Use Adam optimizer

        'optimizer.optimizer': 'Adam',
        'optimizer.learning_rate': 2.5e-4,

⭐️ Use squared ReLU activation in the feed forward network.

Replace this with ReLU for the standard max(x, 0) activation.

        'transformer.ffn.activation': 'SquaredReLU',

⭐️ Use Multi-DConv-Head Attention for encoder attention.

Replace this with mha for the original multi-head attention.

        'transformer.encoder_attn': 'MultiDConvHeadAttention',
    })

Set models for saving and loading

    experiment.add_pytorch_models({'model': conf.model})

Start the experiment

    with experiment.start():

Run training

        conf.run()

if __name__ == '__main__':
    main()