This is an annotated PyTorch experiment to train an hourglass model.
It is based on the training loop and configurations for a simple transformer auto-regressive NLP task.
import math
from typing import List

import torch
from torch import nn

from labml import experiment
from labml.configs import option
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
from labml_nn.transformers.hour_glass import HourGlass
from labml_nn.transformers.positional_encoding import PositionalEncoding
class AutoregressiveTransformer(nn.Module):
- n_tokens is the vocabulary size
- d_model is the size of the token embeddings
- dropout is the dropout probability
- hour_glass is the hourglass model
    def __init__(self, n_tokens: int, d_model: int, dropout: float, hour_glass: HourGlass):
        super().__init__()
Token embeddings
        self.embedding = nn.Embedding(n_tokens, d_model)
Positional embeddings
        self.pos_embedding = PositionalEncoding(d_model, dropout)
Hourglass model
        self.hour_glass = hour_glass
To normalize the final embeddings
        self.norm = nn.LayerNorm([d_model])
Embedding size
        self.d_model = d_model
Final linear layer to predict the logits
        self.output = nn.Linear(d_model, n_tokens)
- x is the tensor with token indexes of shape [seq_len, batch_size]
    def __call__(self, x: torch.Tensor):
Get embeddings
        x = self.embedding(x)
Add positional embeddings, scaling the token embeddings by sqrt(d_model)
        if self.pos_embedding is not None:
            x = self.pos_embedding(x * math.sqrt(self.d_model))
Hourglass
        x = self.hour_glass(x)
Get logits
        output = self.output(self.norm(x))
Return the logits (the second return value is for state, which this model does not use)
        return output, None
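As an aside (not part of the experiment), here is a minimal sketch of how the wrapper can be exercised; the vocabulary size of 100 is an arbitrary placeholder, and it assumes HourGlass upsamples the sequence back to its input length.

# Minimal shape-check sketch; the vocabulary size 100 is hypothetical.
seq_len, batch_size, n_tokens = 256, 32, 100
hour_glass = HourGlass(8, 256, 0.1, 512, [8, 4])
model = AutoregressiveTransformer(n_tokens, 256, 0.1, hour_glass)
# Token indexes of shape [seq_len, batch_size] go in ...
tokens = torch.randint(0, n_tokens, (seq_len, batch_size))
# ... and logits of shape [seq_len, batch_size, n_tokens] come out.
logits, _ = model(tokens)
assert logits.shape == (seq_len, batch_size, n_tokens)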
This inherits from the training loop and configurations for a simple transformer auto-regressive NLP task.
class Configs(NLPAutoRegressionConfigs):
Model
    model: AutoregressiveTransformer
Number of attention heads
    n_heads: int = 8
Dropout probability
    dropout: float = 0.1
Size of feed-forward hidden layer
    d_ff: int = 512
Token embedding size
    d_model: int = 256
Shortening factors
    shortening_factors: List[int] = [8, 4]
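As an aside, the shortening factors determine how aggressively the sequence is downsampled at each level of the hourglass; a quick sanity check of the resulting lengths for the default context size of 256 used below:

# Illustration only: with factors [8, 4] a 256-token context is shortened
# to 32 tokens and then to 8 tokens at the middle of the hourglass,
# before being upsampled back to the full length.
lengths = [256]
for factor in [8, 4]:
    lengths.append(lengths[-1] // factor)
print(lengths)  # [256, 32, 8]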
Create the model
@option(Configs.model)
def _model(c: Configs):
Create hourglass model
    hour_glass = HourGlass(c.n_heads, c.d_model, c.dropout, c.d_ff, c.shortening_factors)
Create the auto-regressive wrapper
    m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.dropout, hour_glass).to(c.device)
    return m
def main():
Create experiment
    experiment.create(name="hour_glass")
Create configs
    conf = Configs()
Override configurations
    experiment.configs(conf, {
Use character level tokenizer
        'tokenizer': 'character',
Prompt separator is blank
        'prompt_separator': '',
Starting prompt for sampling
        'prompt': 'It is ',
Use Tiny Shakespeare dataset
        'text': 'tiny_shakespeare',
Use a context size of 256
        'seq_len': 256,
Train for 128 epochs
        'epochs': 128,
Batch size
        'batch_size': 32,
Switch between training and validation 10 times per epoch
        'inner_iterations': 10,
Use Noam optimizer
        'optimizer.optimizer': 'Noam',
        'optimizer.learning_rate': 1.,
    })
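As an aside, the Noam schedule (from Attention Is All You Need) sets the learning rate to factor * d_model^(-0.5) * min(step^(-0.5), step * warmup^(-1.5)), so the learning_rate of 1. above acts as a multiplicative factor rather than a fixed rate. A minimal sketch, assuming a 4000-step warmup (labml's Noam optimizer may use a different default):

# Sketch of the Noam learning rate schedule; warmup=4000 is an assumption.
def noam_lr(step: int, d_model: int = 256, warmup: int = 4000, factor: float = 1.) -> float:
    step = max(step, 1)
    return factor * d_model ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))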
Set models for saving and loading
    experiment.add_pytorch_models({'model': conf.model})
Start the experiment
    with experiment.start():
Run training
        conf.run()

if __name__ == '__main__':
    main()