Evaluate GPT-NeoX using LLM.int8() quantization on a test suite

This code evaluates GPT-NeoX with LLM.int8() quantization on a suite of tasks.
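
As background, LLM.int8() performs matrix multiplications mostly in int8, while activation feature dimensions whose magnitude exceeds a threshold (6.0 below) are kept in float16. The following is a plain-PyTorch sketch of that decomposition, for intuition only; it is not the bitsandbytes CUDA kernel the model actually uses.

import torch

def llm_int8_matmul_sketch(x: torch.Tensor, w: torch.Tensor, threshold: float = 6.0):
    # x: [n, d] activations, w: [d, m] weights (both float16 in practice).
    # Feature dimensions where any |x| exceeds the threshold are "outliers".
    # (Assumes at least one non-outlier dimension.)
    outliers = (x.abs() > threshold).any(dim=0)
    # Outlier dimensions are multiplied in higher precision...
    fp16_part = x[:, outliers].float() @ w[outliers, :].float()
    # ...the rest use absmax int8 quantization (per row of x, per column of w).
    x_r, w_r = x[:, ~outliers].float(), w[~outliers, :].float()
    sx = x_r.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127.
    sw = w_r.abs().amax(dim=0, keepdim=True).clamp(min=1e-8) / 127.
    xq = torch.round(x_r / sx).to(torch.int8)
    wq = torch.round(w_r / sw).to(torch.int8)
    # The real kernel runs an int8 GEMM; emulate it in float for clarity.
    int8_part = (xq.float() @ wq.float()) * sx * sw
    return (fp16_part + int8_part).to(x.dtype)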

import torch
from torch import nn

from labml import monit
from labml_nn.neox.evaluation import run_eval_harness
from labml_nn.neox.model import LayerGenerator
def main():

Device

    device = torch.device('cuda:0')
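
The bitsandbytes int8 kernels run only on CUDA, so one might add a guard here to fail early when no GPU is visible. This is an optional addition, not part of the original script.

    # Optional guard (not in the original): LLM.int8() needs a CUDA device.
    assert torch.cuda.is_available(), 'LLM.int8() evaluation requires a CUDA GPU'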

Load the layers in float16 onto the CPU. We convert them to int8 later, because converting on the fly after moving the layers to the GPU causes CUDA memory fragmentation (around 3GB of memory can be lost to fragmentation).

    layer_generator = LayerGenerator(is_clone_layers=True,
                                     dtype=torch.float16,
                                     device=torch.device('cpu'),
                                     )
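
To observe the fragmentation overhead mentioned above, one can compare how much memory PyTorch's caching allocator has reserved with how much live tensors actually use. A small illustrative helper (not part of the original script):

def report_cuda_memory(device: torch.device, tag: str = ''):
    # The gap between reserved and allocated memory is a rough proxy for
    # fragmentation inside the caching allocator.
    allocated = torch.cuda.memory_allocated(device) / 2 ** 30
    reserved = torch.cuda.memory_reserved(device) / 2 ** 30
    print(f'{tag}: allocated {allocated:.2f} GiB, reserved {reserved:.2f} GiB, '
          f'idle in pool {reserved - allocated:.2f} GiB')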

Load layers

    layers = list(layer_generator.load())
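
Note that holding all of GPT-NeoX-20B in float16 on the CPU takes roughly 40 GB of RAM for the weights alone, a back-of-the-envelope estimate:

# ~20e9 parameters * 2 bytes (float16) / 2^30 ≈ 37 GiB, before any buffers.
print(f'{20e9 * 2 / 2 ** 30:.0f} GiB')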

Convert each layer to int8 and then move it to the GPU; converting before the transfer reduces CUDA memory fragmentation

    for layer in monit.iterate('Convert to int8', layers, is_children_silent=True):
        layer_generator.post_load_prepare(layer,
                                          device=device,
                                          is_llm_int8=True,
                                          llm_int8_threshold=6.0,
                                          )
        layer.to(device)
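
With is_llm_int8=True, post_load_prepare converts each layer's linear modules to 8-bit versions. Roughly, such a swap with bitsandbytes looks like the sketch below; the actual labml_nn implementation may differ in its details.

import bitsandbytes as bnb

def make_int8_linear(linear: nn.Linear, device: torch.device, threshold: float = 6.0):
    # Build an 8-bit linear layer of the same shape; activation features whose
    # magnitude exceeds `threshold` are handled in float16 (LLM.int8() outliers).
    int8_lin = bnb.nn.Linear8bitLt(linear.in_features, linear.out_features,
                                   bias=linear.bias is not None,
                                   has_fp16_weights=False,
                                   threshold=threshold)
    # Wrap the existing float16 weights; quantization happens on the move to GPU.
    int8_lin.weight = bnb.nn.Int8Params(linear.weight.data,
                                        requires_grad=False,
                                        has_fp16_weights=False)
    if linear.bias is not None:
        int8_lin.bias = linear.bias
    return int8_lin.to(device)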

Create nn.Sequential model

    model = nn.Sequential(*layers)
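
Before running the full harness, a quick forward pass can confirm the quantized stack executes on the GPU. This is an optional, illustrative check; it assumes the layer stack maps token ids directly to logits (embedding first, readout last), as in the labml NeoX model.

    # Optional sanity check (not in the original): dummy token ids through the model.
    with torch.no_grad():
        dummy_ids = torch.randint(0, 50_000, (1, 16), dtype=torch.long, device=device)
        print(model(dummy_ids).shape)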

Run the evaluation harness

    print(run_eval_harness(model, 'half_precision', [], device))

if __name__ == '__main__':
    main()