import torch
from torch import nn

from labml import monit
from labml_nn.neox.evaluation import run_eval_harness
from labml_nn.neox.model import LayerGenerator
def main():
    """
    Load GPT-NeoX layers in ``float16`` on the CPU, convert them to LLM.int8()
    on the GPU one layer at a time, assemble the model, and run the evaluation
    harness on it.
    """
    # Device to run the converted model on
    device = torch.device('cuda:0')

    # Load layers in float16 onto the CPU. We convert the layers to int8 later,
    # because doing it on the fly right after loading a layer onto the GPU
    # causes CUDA memory fragmentation (around 3GB of memory can be lost to
    # fragmentation).
    layer_generator = LayerGenerator(is_clone_layers=True,
                                     dtype=torch.float16,
                                     device=torch.device('cpu'),
                                     )
    # Load all the layers (materialize the generator)
    layers = list(layer_generator.load())

    # Convert each layer to int8 and move it to the GPU one at a time.
    # This reduces CUDA memory fragmentation.
    for layer in monit.iterate('Convert to int8', layers, is_children_silent=True):
        layer_generator.post_load_prepare(layer,
                                          device=device,
                                          is_llm_int8=True,
                                          llm_int8_threshold=6.0,
                                          )
        layer.to(device)

    # Create an nn.Sequential model from the converted layers
    model = nn.Sequential(*layers)

    # NOTE(review): the evaluation call was elided in this chunk (original
    # lines 47-52); restored so the imported run_eval_harness and the built
    # model are actually used — confirm the task list argument against the
    # upstream script.
    print(run_eval_harness(model, 'half_precision', [], device))
#
if __name__ == '__main__':
    main()