8. Saving the Converted Model
The point of converting the model is to be able to run it from C/C++. After converting the Hugging Face model into our own model format in the previous sections, it needs to be saved to disk. Following the export.py file from the https://github.com/karpathy/llama2.c project, create the script below, name it test08.py, and save it in the newsrc directory:
```python
import os
import gzip
import shutil
import struct
import argparse
import json
from pathlib import Path

import numpy as np
import torch
from torch import nn
from transformers import AutoModelForCausalLM

from model import ModelArgs, Transformer


def load_hf_model(model_path):
    # load HF model
    hf_model = AutoModelForCausalLM.from_pretrained(model_path)
    hf_dict = hf_model.state_dict()

    # convert LlamaConfig to ModelArgs
    config = ModelArgs()
    config.dim = hf_model.config.hidden_size
    config.n_layers = hf_model.config.num_hidden_layers
    config.n_heads = hf_model.config.num_attention_heads
    config.n_kv_heads = hf_model.config.num_attention_heads
    config.vocab_size = hf_model.config.vocab_size
    config.hidden_dim = hf_model.config.intermediate_size
    config.norm_eps = hf_model.config.rms_norm_eps
    config.max_seq_len = hf_model.config.max_position_embeddings

    # create a new Transformer object and set weights
    model = Transformer(config)

    model.tok_embeddings.weight = nn.Parameter(hf_dict['model.embed_tokens.weight'])
    model.norm.weight = nn.Parameter(hf_dict['model.norm.weight'])

    # huggingface permutes WQ and WK, this function reverses it
    def permute_reverse(w, n_heads=config.n_heads, dim1=config.dim, dim2=config.dim):
        return w.view(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2)

    for layer in model.layers:
        i = layer.layer_id
        layer.attention_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.input_layernorm.weight'])
        layer.attention.wq.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.q_proj.weight']))
        layer.attention.wk.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.k_proj.weight']))
        layer.attention.wv.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.v_proj.weight'])
        layer.attention.wo.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.o_proj.weight'])
        layer.ffn_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.post_attention_layernorm.weight'])
        layer.feed_forward.w1.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.gate_proj.weight'])
        layer.feed_forward.w2.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.down_proj.weight'])
        layer.feed_forward.w3.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.up_proj.weight'])

    # final classifier
    model.output.weight = nn.Parameter(hf_dict['lm_head.weight'])
    model.eval()
    return model


# legacy
def serialize_fp32(file, tensor):
    """ writes one fp32 tensor to file that is open in wb mode """
    d = tensor.detach().cpu().view(-1).to(torch.float32).numpy()
    b = struct.pack(f'{len(d)}f', *d)
    file.write(b)


def legacy_export(model, filepath):
    """ Original export of llama2.c bin files, i.e. version v0 """
    out_file = open(filepath, 'wb')

    # first write out the header
    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
    p = model.params
    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
    # legacy format uses negative/positive vocab size as a shared classifier flag
    if not shared_classifier:
        p.vocab_size = -p.vocab_size
    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                                    n_kv_heads, p.vocab_size, p.max_seq_len)
    out_file.write(header)

    # next write out the embedding weights
    serialize_fp32(out_file, model.tok_embeddings.weight)

    # now all the layers
    # attention weights
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention_norm.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention.wq.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention.wk.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention.wv.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention.wo.weight)
    # ffn weights
    for layer in model.layers:
        serialize_fp32(out_file, layer.ffn_norm.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.feed_forward.w1.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.feed_forward.w2.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.feed_forward.w3.weight)
    # final rmsnorm
    serialize_fp32(out_file, model.norm.weight)
    # freqs_cis
    serialize_fp32(out_file, model.freqs_cos[:p.max_seq_len])
    serialize_fp32(out_file, model.freqs_sin[:p.max_seq_len])

    # final classifier weights
    if not shared_classifier:
        serialize_fp32(out_file, model.output.weight)

    # flush and close the binary file
    out_file.close()
    print(f"wrote {filepath}")


# specify the model path and the output file
model_path = "meta-llama/Llama-2-7b-chat-hf"
output_path = "output/model.bin"

os.makedirs(os.path.dirname(output_path), exist_ok=True)  # make sure output/ exists
model = load_hf_model(model_path)
legacy_export(model, output_path)
```
Run test08.py, then check the contents of the output directory:
```
python newsrc/test08.py
Loading checkpoint shards: 100%|███████| 2/2 [02:01<00:00, 60.74s/it]
wrote output/model.bin

ls -l output/
total 26323988
-rwxrwxrwx 1 tony tony 26955759644 Mar 12 01:23 model.bin
```
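The size is as expected: the legacy v0 format stores every weight as a 4-byte float32, so a model with roughly 7 billion parameters (plus the RoPE frequency tables and, if not shared, the separate classifier weights) comes out at about 27 GB.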
