8. Saving the converted model
The whole point of converting the model is to be able to run it from C/C++. Once the Hugging Face weights have been loaded into our own model as described earlier, they need to be written out to disk. The script below follows the export.py file from the https://github.com/karpathy/llama2.c project; name it test08.py and save it in the newsrc directory:
```python
import os
import gzip
import shutil
import struct
import argparse
import json
from pathlib import Path

import numpy as np
import torch
from torch import nn
from transformers import AutoModelForCausalLM

from model import ModelArgs, Transformer


def load_hf_model(model_path):

    # load HF model
    hf_model = AutoModelForCausalLM.from_pretrained(model_path)
    hf_dict = hf_model.state_dict()

    # convert LlamaConfig to ModelArgs
    config = ModelArgs()
    config.dim = hf_model.config.hidden_size
    config.n_layers = hf_model.config.num_hidden_layers
    config.n_heads = hf_model.config.num_attention_heads
    config.n_kv_heads = hf_model.config.num_attention_heads
    config.vocab_size = hf_model.config.vocab_size
    config.hidden_dim = hf_model.config.intermediate_size
    config.norm_eps = hf_model.config.rms_norm_eps
    config.max_seq_len = hf_model.config.max_position_embeddings

    # create a new Transformer object and set weights
    model = Transformer(config)

    model.tok_embeddings.weight = nn.Parameter(hf_dict['model.embed_tokens.weight'])
    model.norm.weight = nn.Parameter(hf_dict['model.norm.weight'])

    # huggingface permutes WQ and WK, this function reverses it
    def permute_reverse(w, n_heads=config.n_heads, dim1=config.dim, dim2=config.dim):
        return w.view(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2)

    for layer in model.layers:
        i = layer.layer_id
        layer.attention_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.input_layernorm.weight'])
        layer.attention.wq.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.q_proj.weight']))
        layer.attention.wk.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.k_proj.weight']))
        layer.attention.wv.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.v_proj.weight'])
        layer.attention.wo.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.o_proj.weight'])
        layer.ffn_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.post_attention_layernorm.weight'])
        layer.feed_forward.w1.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.gate_proj.weight'])
        layer.feed_forward.w2.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.down_proj.weight'])
        layer.feed_forward.w3.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.up_proj.weight'])

    # final classifier
    model.output.weight = nn.Parameter(hf_dict['lm_head.weight'])
    model.eval()
    return model


# legacy
def serialize_fp32(file, tensor):
    """ writes one fp32 tensor to file that is open in wb mode """
    d = tensor.detach().cpu().view(-1).to(torch.float32).numpy()
    b = struct.pack(f'{len(d)}f', *d)
    file.write(b)


def legacy_export(model, filepath):
    """ Original export of llama2.c bin files, i.e. version v0 """
    out_file = open(filepath, 'wb')

    # first write out the header
    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
    p = model.params
    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
    # legacy format uses negative/positive vocab size as a shared classifier flag
    if not shared_classifier:
        p.vocab_size = -p.vocab_size
    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                         n_kv_heads, p.vocab_size, p.max_seq_len)
    out_file.write(header)

    # next write out the embedding weights
    serialize_fp32(out_file, model.tok_embeddings.weight)

    # now all the layers
    # attention weights
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention_norm.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention.wq.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention.wk.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention.wv.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention.wo.weight)
    # ffn weights
    for layer in model.layers:
        serialize_fp32(out_file, layer.ffn_norm.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.feed_forward.w1.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.feed_forward.w2.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.feed_forward.w3.weight)
    # final rmsnorm
    serialize_fp32(out_file, model.norm.weight)
    # freqs_cis
    serialize_fp32(out_file, model.freqs_cos[:p.max_seq_len])
    serialize_fp32(out_file, model.freqs_sin[:p.max_seq_len])

    # final classifier weights
    if not shared_classifier:
        serialize_fp32(out_file, model.output.weight)

    # write to binary file
    out_file.close()
    print(f"wrote {filepath}")


# specify the model path and where to write the exported file
model_path = "meta-llama/Llama-2-7b-chat-hf"
output_path = "output/model.bin"
model = load_hf_model(model_path)
legacy_export(model, output_path)
```
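One detail in the script that is easy to overlook is permute_reverse: when the original Llama weights were converted to the Hugging Face format, the rows of wq and wk were permuted to match the HF rotary-embedding layout, and the export has to undo that permutation. The toy round-trip below is only an illustrative sketch: the dimensions are made up, and the forward permute is reproduced from memory of Hugging Face's convert_llama_weights_to_hf.py, so treat its exact form as an assumption. It simply checks that permute_reverse inverts that forward permutation:

```python
import torch

n_heads, dim = 2, 8  # tiny, made-up dimensions just for the round-trip check

def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
    # forward permutation as applied by HF's conversion script (assumed form)
    return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)

def permute_reverse(w, n_heads=n_heads, dim1=dim, dim2=dim):
    # same expression as in test08.py above
    return w.view(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2)

w = torch.arange(dim * dim, dtype=torch.float32).reshape(dim, dim)
assert torch.equal(permute_reverse(permute(w)), w)  # round-trips back to the original
print("permute_reverse undoes the forward permutation on this toy example")
```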
Run test08.py, then check the contents of the output directory; it should look like this:
```
python newsrc/test08.py
Loading checkpoint shards: 100%|███████| 2/2 [02:01<00:00, 60.74s/it]
wrote output/model.bin

ls -l output/
total 26323988
-rwxrwxrwx 1 tony tony 26955759644 Mar 12 01:23 model.bin
```
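The roughly 25 GB file size is what one would expect for a 7B-parameter model stored as fp32 (about 4 bytes per weight). As a rough cross-check, the snippet below, a sketch that assumes the legacy v0 layout written by legacy_export above, reads the header back and recomputes the expected file size from the model dimensions; with the Llama-2-7B shapes this works out to 26955759644 bytes, the same value ls reports:

```python
import os
import struct

path = "output/model.bin"
with open(path, "rb") as f:
    dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, max_seq_len = \
        struct.unpack('iiiiiii', f.read(7 * 4))

shared_classifier = vocab_size > 0   # legacy flag: negative vocab_size = separate classifier
vocab_size = abs(vocab_size)
head_dim = dim // n_heads
kv_dim = n_kv_heads * head_dim

n_floats = vocab_size * dim                      # token embedding table
n_floats += n_layers * (2 * dim)                 # attention + ffn rmsnorm weights
n_floats += n_layers * (2 * dim * dim)           # wq, wo
n_floats += n_layers * (2 * dim * kv_dim)        # wk, wv
n_floats += n_layers * (3 * dim * hidden_dim)    # w1, w2, w3
n_floats += dim                                  # final rmsnorm
n_floats += 2 * max_seq_len * (head_dim // 2)    # freqs_cos, freqs_sin
if not shared_classifier:
    n_floats += vocab_size * dim                 # separately stored output classifier

expected = 7 * 4 + 4 * n_floats                  # int32 header + fp32 weights
print("expected:", expected, "actual:", os.path.getsize(path))
```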