This is an annotated PyTorch experiment to train a Primer EZ transformer.
It is based on our vanilla transformer autoregressive experiment: we reuse the same training setup and add the Primer EZ modifications, a squared ReLU activation in the feed forward network and Multi-DConv-Head Attention.
from labml import experiment
from labml.configs import option
from labml_nn.transformers import TransformerConfigs
from labml_nn.transformers.basic.autoregressive_experiment import Configs
from labml_nn.transformers.configs import FeedForwardConfigs
from labml_nn.transformers.primer_ez import SquaredReLU
Add the option of squared ReLU to the configurable feed forward module.
@option(FeedForwardConfigs.activation, 'SquaredReLU')
def _squared_relu():
    return SquaredReLU()
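For reference, the squared ReLU from the Primer paper simply squares the output of a standard ReLU, y = max(0, x)^2. Below is a minimal, illustrative sketch of an equivalent module; the class name is made up for illustration, and the experiment uses the SquaredReLU imported above.
import torch
from torch import nn


class SquaredReLUSketch(nn.Module):
    # Illustrative stand-in: y = relu(x) ** 2, applied element-wise
    def __init__(self):
        super().__init__()
        self.relu = nn.ReLU()

    def forward(self, x: torch.Tensor):
        x = self.relu(x)
        return x * x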
Add the option of Multi-DConv-Head Attention to the configurable transformer.
@option(TransformerConfigs.encoder_attn, 'MultiDConvHeadAttention')
def _d_conv_mha(c: TransformerConfigs):
    from labml_nn.transformers.primer_ez import MultiDConvHeadAttention
    return MultiDConvHeadAttention(c.n_heads, c.d_model, dropout_prob=c.dropout)
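The "D-Conv" part is Primer's 3-wide depth-wise convolution applied along the sequence dimension to each head's query, key and value projections. Below is a minimal, hedged sketch of that operation; the class name and the [seq_len, batch_size, heads, d_k] input layout are assumptions for illustration, and the experiment itself uses the MultiDConvHeadAttention imported above.
import torch
from torch import nn


class SpatialDepthWiseConvSketch(nn.Module):
    # Illustrative: a depth-wise convolution over the sequence dimension,
    # applied independently to every channel of every attention head.
    def __init__(self, d_k: int, kernel_size: int = 3):
        super().__init__()
        self.kernel_size = kernel_size
        # groups == channels gives one independent kernel per channel (depth-wise)
        self.conv = nn.Conv1d(in_channels=d_k, out_channels=d_k,
                              kernel_size=kernel_size,
                              padding=kernel_size - 1, groups=d_k)

    def forward(self, x: torch.Tensor):
        # Assumed input layout: [seq_len, batch_size, heads, d_k]
        seq_len, batch_size, heads, d_k = x.shape
        # Fold batch and heads together; treat d_k as channels, seq_len as length
        x = x.permute(1, 2, 3, 0).reshape(batch_size * heads, d_k, seq_len)
        # Convolve along the sequence dimension
        x = self.conv(x)
        # Both sides were padded; drop the right-most outputs so each position
        # only sees itself and earlier positions (causal)
        x = x[:, :, :-(self.kernel_size - 1)]
        # Restore the [seq_len, batch_size, heads, d_k] layout
        x = x.reshape(batch_size, heads, d_k, seq_len).permute(3, 0, 1, 2)
        return x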
Add the option of Multi Depth-wise Shared Conv Head Attention to the configurable transformer.
📝 This is a variation we tried
@option(TransformerConfigs.encoder_attn, 'MultiDSharedConvHeadAttention')
def _d_shared_conv_mha(c: TransformerConfigs):
    from labml_nn.transformers.primer_ez.variations import MultiDSharedConvHeadAttention
    return MultiDSharedConvHeadAttention(c.n_heads, c.d_model, dropout_prob=c.dropout)
Add the option of Multi Depth-wise Per Head Conv Head Attention to the configurable transformer.
📝 This is a variation we tried
@option(TransformerConfigs.encoder_attn, 'MultiDPHConvHeadAttention')
def _d_per_head_conv_mha(c: TransformerConfigs):
    from labml_nn.transformers.primer_ez.variations import MultiDPHConvHeadAttention
    return MultiDPHConvHeadAttention(c.n_heads, c.d_model, dropout_prob=c.dropout)
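The two variations above differ in how the convolution kernels are shared. Purely as an illustration of the naming, and not the code in labml_nn.transformers.primer_ez.variations: a shared depth-wise convolution could reuse one kernel for every channel of every head, while a per-head depth-wise convolution could keep separate kernels for each head's channels.
from torch import nn

d_k, heads, kernel_size = 64, 8, 3  # illustrative sizes

# A single kernel shared by every channel of every head
# (channels would be folded into the batch dimension before applying it)
shared_conv = nn.Conv1d(in_channels=1, out_channels=1,
                        kernel_size=kernel_size, padding=kernel_size - 1)

# Separate depth-wise kernels for each channel of each head
per_head_conv = nn.Conv1d(in_channels=heads * d_k, out_channels=heads * d_k,
                          kernel_size=kernel_size, padding=kernel_size - 1,
                          groups=heads * d_k)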
def main():
Create experiment
    experiment.create(name="primer_ez")
Create configs
    conf = Configs()
Override configurations
    experiment.configs(conf, {
Use a character level tokenizer
        'tokenizer': 'character',
Prompt separator is blank
        'prompt_separator': '',
Starting prompt for sampling
        'prompt': 'It is ',
Use the Tiny Shakespeare dataset
        'text': 'tiny_shakespeare',
Use a context size of 256
        'seq_len': 256,
Train for 128 epochs
        'epochs': 128,
Batch size of 32
        'batch_size': 32,
Switch between training and validation 10 times per epoch
        'inner_iterations': 10,
Model size
        'd_model': 512,
        'transformer.ffn.d_ff': 2048,
Use the Adam optimizer
        'optimizer.optimizer': 'Adam',
        'optimizer.learning_rate': 2.5e-4,
⭐️ Use the squared ReLU activation in the feed forward network. Replace this with 'ReLU' for the original transformer.
        'transformer.ffn.activation': 'SquaredReLU',
⭐️ Use Multi-DConv-Head Attention for the encoder attention. Replace this with 'mha' for the original multi-head attention.
        'transformer.encoder_attn': 'MultiDConvHeadAttention',
    })
Set models for saving and loading
    experiment.add_pytorch_models({'model': conf.model})
Start the experiment
    with experiment.start():
Run training
        conf.run()
if __name__ == '__main__':
    main()