Fuzzy Tiling Activation Experiment


Here we train a transformer that uses Fuzzy Tiling Activation in the feed-forward network. We use it as a language model and train it on the Tiny Shakespeare dataset for demonstration.

However, this is probably not the ideal task for FTA; we believe FTA is better suited to modeling data with continuous variables.

import copy

import torch
import torch.nn as nn

from labml import experiment
from labml.configs import option
from labml_helpers.module import Module
from labml_nn.activations.fta import FTA
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
from labml_nn.transformers import MultiHeadAttention, TransformerLayer
from labml_nn.transformers.utils import subsequent_mask

FFN module with FTA activation

class FeedForwardFTA(nn.Module):
  • d_model is the number of features in a token embedding
  • d_ff is the number of features in the hidden layer of the FFN
  • activation is the FTA activation module
  • dropout is the dropout probability for the hidden layer
    def __init__(self, d_model: int, d_ff: int,
                 activation: FTA,
                 dropout: float = 0.1):
        super().__init__()

Layer one parameterized by weight and bias

        self.layer1 = nn.Linear(d_model, d_ff)

Layer two parameterized by weight and bias. Its input size is d_ff * activation.expansion_factor, since FTA expands each of the d_ff hidden features by the expansion factor.

        self.layer2 = nn.Linear(d_ff * activation.expansion_factor, d_model)

Hidden layer dropout

        self.dropout = nn.Dropout(dropout)

Activation function

        self.activation = activation
    def forward(self, x: torch.Tensor):

Apply the first linear layer and the FTA activation

        x = self.activation(self.layer1(x))

Apply dropout

        x = self.dropout(x)

        return self.layer2(x)
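
Note that the output keeps the token embedding size: FTA only widens the hidden layer from d_ff to d_ff * expansion_factor, and layer2 projects it back to d_model. Below is a minimal shape check, a sketch with hypothetical sizes that reuses the FTA import above; it is not part of the experiment.

# Sketch: FeedForwardFTA widens the hidden layer but preserves d_model
fta = FTA(-1., 1., 0.2, 0.05)
ffn = FeedForwardFTA(d_model=256, d_ff=256, activation=fta, dropout=0.1)

x = torch.randn(32, 16, 256)  # [seq_len, batch_size, d_model]
y = ffn(x)
# The hidden width inside the FFN is d_ff * fta.expansion_factor; the output is back to d_model
assert y.shape == x.shape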

Auto-Regressive model

This is an autoregressive transformer model that uses feed-forward networks with Fuzzy Tiling Activation (FTA).

class AutoregressiveTransformer(Module):
  • n_tokens is the number of tokens in the vocabulary
  • d_model is the embedding size
  • n_layers is the number of transformer layers
  • layer is a single transformer layer; we make n_layers deep copies of it for the transformer
    def __init__(self, n_tokens: int, d_model: int, n_layers: int, layer: TransformerLayer):
        super().__init__()

Transformer with n_layers layers

        self.transformer_layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(n_layers)])

Token embedding layer

        self.emb = nn.Embedding(n_tokens, d_model)

Readout layer

        self.readout = nn.Linear(d_model, n_tokens)

The mask will be initialized on the first call

        self.mask = None
  • x is the tensor of input tokens, of shape [seq_len, batch_size]
    def forward(self, x: torch.Tensor):

Create auto-regressive mask

        if self.mask is None or self.mask.size(0) != len(x):

Subsequent mask that prevents tokens from attending to future tokens

            self.mask = subsequent_mask(len(x)).to(x.device)

Get the token embeddings

        x = self.emb(x)

Run the embeddings through the transformer layers

        for layer in self.transformer_layers:
            x = layer(x=x, mask=self.mask)

Get logits

        x = self.readout(x)

Return results (the second value is a placeholder for state, which this model does not use)

        return x, None
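
As a usage sketch (hypothetical sizes; the real model is built by the _model configuration option below), the model maps token indices of shape [seq_len, batch_size] to logits of shape [seq_len, batch_size, n_tokens]:

# Sketch: a tiny model with hypothetical sizes, reusing the classes imported and defined above
fta = FTA(-1., 1., 0.2, 0.05)
layer = TransformerLayer(d_model=128,
                         feed_forward=FeedForwardFTA(d_model=128, d_ff=128, activation=fta, dropout=0.1),
                         self_attn=MultiHeadAttention(4, 128, dropout_prob=0.0),
                         dropout_prob=0.0)
model = AutoregressiveTransformer(n_tokens=65, d_model=128, n_layers=2, layer=layer)

tokens = torch.randint(0, 65, (100, 8))  # [seq_len, batch_size] of token indices
logits, _ = model(tokens)                # the second return value is always None here
assert logits.shape == (100, 8, 65)      # one logit per vocabulary token at each position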

Configurations

This inherits from NLPAutoRegressionConfigs

class Configs(NLPAutoRegressionConfigs):

Model

    model: AutoregressiveTransformer

Number of layers

    n_layers: int = 4

α and β for DeepNorm

    deep_norm_alpha: float
    deep_norm_beta: float

Number of heads in the attention

    n_heads: int = 4

Embedding size

    d_model: int = 256

Size of each attention head

    d_k: int = 16

Feed forward layer size

    d_ff: int = 256

FTA activation parameters: lower limit, upper limit, delta, and eta

    fta_lower_limit: float = -1.
    fta_upper_limit: float = +1.
    fta_delta: float = 0.2
    fta_eta: float = 0.05
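
With these defaults the activation tiles the interval [-1, 1) in steps of 0.2. Assuming the FTA module builds its tiling with torch.arange(lower_limit, upper_limit, delta), that gives an expansion factor of 10, so the second FFN layer sees d_ff * 10 = 2560 input features. A quick sanity check, as a sketch:

# Sketch: expansion factor implied by the default FTA parameters
fta = FTA(-1., 1., 0.2, 0.05)
print(fta.expansion_factor)        # expected: 10 tiles over [-1, 1) with delta = 0.2
print(256 * fta.expansion_factor)  # expected: 2560 input features for layer2 of the FFN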

Initialize the model

@option(Configs.model)
def _model(c: Configs):

Create FTA activation module

    fta = FTA(c.fta_lower_limit, c.fta_upper_limit, c.fta_delta, c.fta_eta)

Create the transformer. We re-use TransformerLayer and MultiHeadAttention implementations.

    m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.n_layers,
                                  TransformerLayer(d_model=c.d_model,
                                                   feed_forward=FeedForwardFTA(d_model=c.d_model,
                                                                               d_ff=c.d_ff,
                                                                               activation=fta,
                                                                               dropout=0.1),
                                                   self_attn=MultiHeadAttention(c.n_heads, c.d_model,
                                                                                dropout_prob=0.0),
                                                   dropout_prob=0.0))

Move to the device

    return m.to(c.device)

Create and run the experiment

def main():

Create experiment

    experiment.create(name="fta", writers={'screen', 'comet', 'labml'})

Create configs

    conf = Configs()

Override configurations

    experiment.configs(conf, {

Use character level tokenizer

        'tokenizer': 'character',

Prompt separator is blank

        'prompt_separator': '',

Starting prompt for sampling

        'prompt': 'It is ',

Use Tiny Shakespeare dataset

        'text': 'tiny_shakespeare',

Use a context size of 256

        'seq_len': 256,

Train for 32 epochs

        'epochs': 32,

Batch size

        'batch_size': 16,

Switch between training and validation 10 times per epoch

        'inner_iterations': 10,

Adam optimizer with no warmup

        'optimizer.optimizer': 'Adam',
        'optimizer.learning_rate': 3e-4,
    })

Set model(s) for saving and loading

    experiment.add_pytorch_models({'model': conf.model})

Start the experiment

    with experiment.start():

Run training

        conf.run()

if __name__ == '__main__':
    main()