#

RETRO training dataset

We pre-retrieve nearest neighbors from the key-value database and create the dataset to train the RETRO model.

15import json
16from pathlib import Path
17
18import numpy as np
19import torch
20from torch.utils.data import Dataset as PyTorchDataset
21
22from labml import lab, monit
23from labml_helpers.datasets.text import TextFileDataset, TextDataset
24from labml_nn.transformers.retro.database import RetroIndex

#

Build the dataset

chunk_len is the chunk length
chunks_per_sample is the number of chunks per training sample
skip_range is the maximum number of characters to skip between two samples. We skip a few characters between samples to make sure the samples aren't aligned perfectly with the chunks in the database

27def build_dataset(chunk_len: int = 16, chunks_per_sample: int = 32, skip_range: int = 8):

#

Load the text file

39    dataset = TextFileDataset(
40        lab.get_data_path() / 'tiny_shakespeare.txt',
41        list,
42        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')

#

Training portion of it

45    text = dataset.train

#

Load the index for retrieving neighbors

48    index = RetroIndex()

#

The input sample offsets

51    sample_offsets = []

#

Cursor for the text

53    i = 0
54    while i < len(text):

#

Skip a few characters to make sure it's not aligned with the neighbors

56        skip = np.random.randint(skip_range)
57        i += skip

#

Stop if we've reached the end of the text

60        if i + chunks_per_sample * chunk_len > len(text):
61            break

#

Collect the offset

64        sample_offsets.append(i)

#

Increment the cursor

67        i += chunks_per_sample * chunk_len

#

For samples

70    samples = []

#

Iterate through sample offsets

72    for i in monit.iterate('Gather Neighbors', sample_offsets):

#

Get the sample including an extra character (for prediction)

74        sample = text[i: i + chunks_per_sample * chunk_len + 1]

#

The input

76        src = sample[:-1]

#

Break it into chunks

78        chunks = [src[j:j + chunk_len] for j in range(0, len(src), chunk_len)]

#

The chunk offsets

80        chunk_offsets = [j + i for j in range(0, len(src), chunk_len)]

#

Retrieve nearest neighbors

83        neighbor_offsets = index(chunks, chunk_offsets)

#

Get neighbor texts. The neighbor length is twice the chunk_len

86        neighbors = [[text[j: j + chunk_len * 2] for j in n_off] for n_off in neighbor_offsets]

#

Add to list of samples

89        samples.append((sample[:-1], sample[1:], neighbors))

#

Save the samples in JSON. We don't need to use complex dataset storage mechanisms or pre-tokenize since our dataset is small.

94    with open(str(lab.get_data_path() / 'retro_train_dataset.json'), 'w') as f:
95        f.write(json.dumps(samples))

#

Dataset

This is the PyTorch dataset that loads the dataset created by build_dataset .

98class Dataset(PyTorchDataset):

#

file_path is the path of the saved JSON file
tds is the TextDataset

105    def __init__(self, file_path: Path, tds: TextDataset):

#

111        self.tds = tds

#

Load the samples

113        with open(str(file_path), 'r') as f:
114            self.samples = json.loads(f.read())

#

Number of samples

116    def __len__(self):

#

120        return len(self.samples)

#

Get a sample

122    def __getitem__(self, idx: int):

#

Get the sample

127        s = self.samples[idx]

#

Tokenize

129        src = self.tds.text_to_i(s[0])
130        tgt = self.tds.text_to_i(s[1])
131        neighbors = torch.stack([torch.stack([self.tds.text_to_i(n) for n in chunks]) for chunks in s[2]])

#

133        return src, tgt, neighbors

#

136if __name__ == '__main__':
137    build_dataset()