Text Dataset for GPT-NeoX

from pathlib import PurePath, Path
from typing import Optional, List

import torch
import torch.utils.data
from labml import lab
from labml import monit
from labml.logger import inspect
from labml.utils.download import download_file

from labml_nn.neox.tokenizer import get_tokenizer

Load text file

  • path is the location of the text file
  • url is the URL to download the file from
  • filter_subset is the number of characters to keep; use it during testing to work on a small subset of a large dataset
  • Returns the text content

def load_text(path: PurePath, url: Optional[str] = None, *, filter_subset: Optional[int] = None):
    path = Path(path)

Download if it doesn't exist

    if not path.exists():
        if not url:
            raise FileNotFoundError(str(path))
        else:
            download_file(url, path)

    with monit.section("Load data"):

Load data

        with open(str(path), 'r') as f:
            text = f.read()

Keep only the first filter_subset characters

        if filter_subset:
            text = text[:filter_subset]

    return text
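A minimal usage sketch; the file name and URL below are hypothetical placeholders, not part of the library:

text = load_text(
    lab.get_data_path() / 'my_corpus.txt',       # hypothetical file name
    url='https://example.com/my_corpus.txt',     # hypothetical download URL
    filter_subset=10_000,                        # keep only the first 10k characters for a quick test
)
print(len(text))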

Dataset for fine-tuning GPT-NeoX

This is not optimized for very large datasets.

class NeoXDataset(torch.utils.data.Dataset):
  • tokens is the list of token ids
  • seq_len is the sequence length of a single training sample
    def __init__(self, tokens: List[int], seq_len: int):
        self.seq_len = seq_len

Number of samples

        n_samples = len(tokens) // seq_len
        self.n_samples = n_samples

Truncate the tokens, keeping one extra token so that the target of the last sample can be shifted by one

        tokens = tokens[:n_samples * seq_len + 1]

Create a PyTorch tensor

        self.tokens = torch.tensor(tokens)

    def __len__(self):
        return self.n_samples

Get a sample

  • idx is the index of the sample
  • Returns the input and the target

    def __getitem__(self, idx: int):
        offset = idx * self.seq_len
        return self.tokens[offset:offset + self.seq_len], self.tokens[offset + 1:offset + 1 + self.seq_len]
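As a quick illustration (a toy sketch, not part of the library), with seq_len = 3 and token ids 0 through 9 the inputs and targets come out shifted by one token:

ds = NeoXDataset(list(range(10)), seq_len=3)
assert len(ds) == 3                    # 10 // 3 samples
x, y = ds[0]
assert x.tolist() == [0, 1, 2]         # input: tokens at the sample offset
assert y.tolist() == [1, 2, 3]         # target: the same tokens shifted by one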
DATASETS = {
    'tiny_shakespeare': {
        'file': 'tiny_shakespeare.txt',
        'url': 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    }
}
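Other corpora can be registered the same way; the entry below is a hypothetical example (the name, file, and URL are placeholders):

DATASETS['my_corpus'] = {
    'file': 'my_corpus.txt',                      # stored under lab.get_data_path()
    'url': 'https://example.com/my_corpus.txt',   # hypothetical download URL
}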

Load Dataset

  • seq_len is the sequence length of a single training sample
  • dataset_name is the name of the dataset
  • truncate limits the data to the first truncate samples; the default -1 keeps everything
  • Returns the dataset

def get_training_data(seq_len: int = 32, dataset_name: str = 'tiny_shakespeare', truncate: int = -1):
    ds = DATASETS[dataset_name]

Load the content

    text = load_text(lab.get_data_path() / ds['file'], ds['url'])

Tokenize

    tokenizer = get_tokenizer()
    tokens = tokenizer.encode_batch([text])[0]

    if truncate > 0:
        token_ids = tokens.ids[:truncate * seq_len]
    else:
        token_ids = tokens.ids

    return NeoXDataset(token_ids, seq_len)
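The dataset plugs straight into a standard PyTorch DataLoader; here's a minimal sketch (the batch size is an arbitrary choice):

from torch.utils.data import DataLoader

dataset = get_training_data(seq_len=32)
data_loader = DataLoader(dataset, batch_size=8, shuffle=True)

x, y = next(iter(data_loader))
# x and y both have shape [batch_size, seq_len]; y holds the next token for each position of x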
def _test():
    dataset = get_training_data()

    inspect(tokens=len(dataset.tokens))

if __name__ == '__main__':
    _test()