This code evaluates GPT-NeoX using LLM.int8() quantization, on a suite of tasks.
import torch
from torch import nn

from labml import monit
from labml_nn.neox.evaluation import run_eval_harness
from labml_nn.neox.model import LayerGenerator
def main():
    """
    Evaluate GPT-NeoX with LLM.int8() quantization on the eval-harness suite.

    Loads the model layers in ``float16`` on CPU, converts them to int8 one
    layer at a time while moving them to the GPU, assembles them into an
    ``nn.Sequential`` model, and prints the evaluation-harness results.
    """
    # Device to run the evaluation on
    device = torch.device('cuda:0')

    # Load layers in float16 into CPU. We convert the layers to int8 later,
    # because doing that on the fly after loading layers to GPU causes CUDA
    # memory fragmentation (about 3GB memory can get lost due to fragmentation).
    layer_generator = LayerGenerator(is_clone_layers=True,
                                     dtype=torch.float16,
                                     device=torch.device('cpu'),
                                     )
    # Load layers
    layers = list(layer_generator.load())

    # Convert layers to int8 and move them to the GPU one at a time.
    # This reduces CUDA memory fragmentation.
    for layer in monit.iterate('Convert to int8', layers, is_children_silent=True):
        layer_generator.post_load_prepare(layer,
                                          device=device,
                                          is_llm_int8=True,
                                          llm_int8_threshold=6.0,
                                          )
        layer.to(device)

    # Create `nn.Sequential` model
    model = nn.Sequential(*layers)

    # Run the evaluation harness and print results
    # NOTE(review): the run name 'half_precision' is kept from the original
    # even though the model is int8-quantized — confirm whether it should
    # be renamed, as callers/logs may depend on the existing name.
    print(run_eval_harness(model, 'half_precision', [], device))
# Script entry point
if __name__ == '__main__':
    main()