import math
from typing import List

import torch
from torch import nn

from labml import experiment
from labml.configs import option
from labml_helpers.module import Module
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
from labml_nn.transformers.hour_glass import HourGlass
from labml_nn.transformers.positional_encoding import PositionalEncoding

Autoregressive language model

class AutoregressiveTransformer(Module):
  • n_tokens is the vocabulary size
  • d_model is the size of the token embeddings
  • dropout is the dropout probability
  • hour_glass is the hourglass model
    def __init__(self, n_tokens: int, d_model: int, dropout: float, hour_glass: HourGlass):
        super().__init__()

Token embeddings

        self.embedding = nn.Embedding(n_tokens, d_model)

Fixed positional embeddings

        self.pos_embedding = PositionalEncoding(d_model, dropout)

The hourglass model

        self.hour_glass = hour_glass

To normalize the final embeddings

        self.norm = nn.LayerNorm([d_model])

Embedding size

        self.d_model = d_model

Final linear layer to predict the logits

        self.output = nn.Linear(d_model, n_tokens)
  • x is the tensor with token indexes of shape [seq_len, batch_size]
    def __call__(self, x: torch.Tensor):

Get embeddings

        x = self.embedding(x)

Add positional embeddings; the token embeddings are scaled by √d_model first, as in the original Transformer

        if self.pos_embedding is not None:
            x = self.pos_embedding(x * math.sqrt(self.d_model))

Hourglass

        x = self.hour_glass(x)

Get logits

        output = self.output(self.norm(x))

Return the logits; the second value is a placeholder for recurrent state, which this model does not use

        return output, None
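
A minimal smoke test of this wrapper (not from the original file; the hyper-parameter values are made up for illustration). It assumes HourGlass takes the same positional arguments used in _model below and that it up-samples back to the full sequence length:

# Hypothetical usage sketch; values are illustrative only
hour_glass = HourGlass(4, 128, 0.1, 256, [4, 2])
model = AutoregressiveTransformer(n_tokens=65, d_model=128, dropout=0.1, hour_glass=hour_glass)
# Dummy batch of token indexes of shape [seq_len, batch_size]
tokens = torch.randint(0, 65, (64, 2))
logits, _ = model(tokens)
# Logits come out per position and token: [seq_len, batch_size, n_tokens]
assert logits.shape == (64, 2, 65)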
Configurations; this inherits from NLPAutoRegressionConfigs

class Configs(NLPAutoRegressionConfigs):

Model

    model: AutoregressiveTransformer

Number of attention heads

    n_heads: int = 8

Dropout probability

    dropout: float = 0.1

Size of feed-forward hidden layer

    d_ff: int = 512

Token embedding size

    d_model: int = 256

Shortening factors

    shortening_factors: List[int] = [8, 4]
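
With the default seq_len of 256 (set in main below), the factors [8, 4] shorten the sequence successively: 256 tokens at the outer level, 256 / 8 = 32 after the first shortening, and 32 / 4 = 8 at the innermost level, before up-sampling restores the full resolution. A small sketch of that arithmetic (a hypothetical helper, not part of HourGlass):

# Hypothetical helper showing how successive shortening factors reduce the sequence length
def shortened_lengths(seq_len: int, factors: List[int]) -> List[int]:
    lengths = [seq_len]
    for factor in factors:
        lengths.append(lengths[-1] // factor)
    return lengths

# Prints [256, 32, 8] for the defaults above
print(shortened_lengths(256, [8, 4]))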

Create the model

@option(Configs.model)
def _model(c: Configs):

Create hourglass model

    hour_glass = HourGlass(c.n_heads, c.d_model, c.dropout, c.d_ff, c.shortening_factors)

Create the auto-regressive wrapper

    m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.dropout, hour_glass).to(c.device)

    return m
def main():

Create experiment

    experiment.create(name="hour_glass")

Create configs

    conf = Configs()

Override configurations

    experiment.configs(conf, {

Use character level tokenizer

        'tokenizer': 'character',

Prompt separator is blank

        'prompt_separator': '',

Starting prompt for sampling

        'prompt': 'It is ',

Use Tiny Shakespeare dataset

        'text': 'tiny_shakespeare',

Use a context size of 256

        'seq_len': 256,

Train for 128 epochs

        'epochs': 128,

Batch size

        'batch_size': 32,

Switch between training and validation 10 times per epoch

        'inner_iterations': 10,

Use the Noam optimizer with a learning rate of 1 (a sketch of this schedule follows the configuration block)

        'optimizer.optimizer': 'Noam',
        'optimizer.learning_rate': 1.,

    })
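
The Noam schedule from the original Transformer paper scales the learning rate by the inverse square root of d_model, warming up linearly and then decaying with the inverse square root of the step count; the configured learning rate of 1 acts as a multiplier on that schedule. A rough sketch of the rate computation, assuming a warm-up of 4,000 steps (the actual optimizer setup lives in labml_nn's optimizer helpers):

# Illustrative Noam learning-rate schedule, not the actual optimizer code
def noam_rate(step: int, d_model: int = 256, warmup: int = 4000, factor: float = 1.0) -> float:
    # Steps are counted from 1; the two terms cross at step == warmup
    return factor * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)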

Set models for saving and loading

    experiment.add_pytorch_models({'model': conf.model})

Start the experiment

    with experiment.start():

Run training

        conf.run()

if __name__ == '__main__':
    main()