11. Encoding a String into a Token Sequence
During chat, the prompt string has to be encoded into token IDs by the tokenizer. Name the following script test10.py and save it to the newsrc directory:
import os
import struct
import argparse
from typing import List
from sentencepiece import SentencePieceProcessor

class Tokenizer:
    def __init__(self, tokenizer_model=None):
        model_path = tokenizer_model
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        self.model_path = model_path

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        #print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}")
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)

    def export(self):
        # get all the tokens (postprocessed) and their scores as floats
        tokens, scores = [], []
        for i in range(self.n_words):
            # decode the token and light postprocessing
            t = self.sp_model.id_to_piece(i)
            s = self.sp_model.get_score(i)
            if i == self.bos_id:
                t = '\n<s>\n'
            elif i == self.eos_id:
                t = '\n</s>\n'
            t = t.replace('▁', ' ')  # sentencepiece uses this character as whitespace
            b = t.encode('utf-8')    # bytes of this token, utf-8 encoded

            tokens.append(b)
            scores.append(s)

        # record the max token length
        max_token_length = max(len(t) for t in tokens)

        # write to a binary file
        # the tokenizer.bin file is the same as .model file, but .bin
        tokenizer_bin = self.model_path.replace('.model', '.bin')
        with open(tokenizer_bin, 'wb') as f:
            f.write(struct.pack("I", max_token_length))
            for bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(bytes)))
                f.write(bytes)

t = Tokenizer("newsrc/tokenizer.model")
prompt = "The meaning of life is"
start_ids = t.encode(prompt, bos=True, eos=False)
print(start_ids)
Run test10.py:
python newsrc/test10.py
[1, 450, 6593, 310, 2834, 338]
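The export() method in test10.py also defines the on-disk layout of tokenizer.bin that the C program will parse later: a uint32 max_token_length header, followed by one record per token consisting of a float32 score, a uint32 byte length, and the raw UTF-8 bytes of the token string. As a quick sanity check, here is a minimal sketch that reads the file back and prints the first few records; it assumes newsrc/tokenizer.bin has already been generated (for example by calling t.export() on the Tokenizer above):

import struct

# Minimal sketch: dump the header and first few records of tokenizer.bin.
# Layout (written by Tokenizer.export() above, native byte order):
#   uint32 max_token_length, then per token: float32 score, uint32 length, raw UTF-8 bytes.
with open("newsrc/tokenizer.bin", "rb") as f:
    (max_token_length,) = struct.unpack("I", f.read(4))
    print("max_token_length =", max_token_length)
    for i in range(5):
        score, length = struct.unpack("fI", f.read(8))
        piece = f.read(length).decode("utf-8", errors="replace")
        print(i, round(score, 2), repr(piece))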
The same string can be encoded with AutoTokenizer. Name the following script test11.py and save it to the newsrc directory:
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

prompt = "The meaning of life is"
start_ids = tokenizer.encode(prompt)
print(start_ids)
Run test11.py:
python newsrc/test11.py
[1, 450, 6593, 310, 2834, 338]
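The Python results only show numeric IDs. To see which SentencePiece pieces they correspond to, the Hugging Face tokenizer can map the IDs back; a minimal sketch, assuming the same meta-llama/Llama-2-7b-chat-hf tokenizer as test11.py:

from transformers import AutoTokenizer

# Minimal sketch: map the token IDs back to their SentencePiece pieces and to text.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", trust_remote_code=True)
ids = [1, 450, 6593, 310, 2834, 338]
print(tokenizer.convert_ids_to_tokens(ids))             # expected: ['<s>', '▁The', '▁meaning', '▁of', '▁life', '▁is']
print(tokenizer.decode(ids, skip_special_tokens=True))  # expected: 'The meaning of life is'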
Next we implement the same encoding in C, following the run.c file from the https://github.com/karpathy/llama2.c project. Name the file test02.c and save it to the newsrc directory:
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

// The Byte Pair Encoding (BPE) Tokenizer that translates strings <-> tokens

typedef struct {
    // pointer to the token string associated with this id
    char *str;
    // integer id associated with the string
    int id;
} TokenIndex;

typedef struct {
    // array of pointers to the string form of each vocabulary entry,
    // e.g. if vocab[0] is "apple", then vocab[0][0] is the character 'a'
    char** vocab;
    // score/weight associated with each vocabulary entry
    float* vocab_scores;
    // array of (string, id) pairs, sorted by string for binary search
    TokenIndex *sorted_vocab;
    // vocabulary size, i.e. the number of entries in vocab and vocab_scores
    int vocab_size;
    // maximum token length (in bytes) in the vocabulary
    unsigned int max_token_length;
    // stores all single-byte strings; 512 bytes = 256 entries of (byte, '\0')
    unsigned char byte_pieces[512];
} Tokenizer;

int compare_tokens(const void *a, const void *b) {
    return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str);
}

void build_tokenizer(Tokenizer* t, char* tokenizer_path, int vocab_size) {
    // i should have written the vocab_size into the tokenizer file... sigh
    t->vocab_size = vocab_size;
    // malloc space to hold the scores and the strings
    t->vocab = (char**)malloc(vocab_size * sizeof(char*));
    t->vocab_scores = (float*)malloc(vocab_size * sizeof(float));
    t->sorted_vocab = NULL; // initialized lazily
    // initialize byte_pieces, which stores all single-byte strings
    for (int i = 0; i < 256; i++) {
        t->byte_pieces[i * 2] = (unsigned char)i;
        t->byte_pieces[i * 2 + 1] = '\0';
    }
    // read in the file
    FILE *file = fopen(tokenizer_path, "rb");
    if (!file) { fprintf(stderr, "couldn't load %s\n", tokenizer_path); exit(EXIT_FAILURE); }
    // read max_token_length from the file
    if (fread(&t->max_token_length, sizeof(int), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE); }
    int len;
    // read each vocab string and its corresponding vocab_scores value
    for (int i = 0; i < vocab_size; i++) {
        if (fread(t->vocab_scores + i, sizeof(float), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE);}
        if (fread(&len, sizeof(int), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE); }
        t->vocab[i] = (char *)malloc(len + 1);
        if (fread(t->vocab[i], len, 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE); }
        t->vocab[i][len] = '\0'; // add the string terminating token
    }
    fclose(file);
}

void free_tokenizer(Tokenizer* t) {
    for (int i = 0; i < t->vocab_size; i++) { free(t->vocab[i]); }
    if(t->vocab) free(t->vocab);
    if(t->vocab_scores) free(t->vocab_scores);
    if(t->sorted_vocab) free(t->sorted_vocab);
}

void print_tokenizer(Tokenizer* t) {
    printf("vocab = %d\n", t->vocab_size);
    printf("max_token_length = %d\n", t->max_token_length);
    for (int i = 0; i < t->vocab_size; i++) {
        printf("%5d, %12.6lf, (%s)\n", i, t->vocab_scores[i], t->vocab[i]);
    }
}

int str_lookup(char *str, TokenIndex *sorted_vocab, int vocab_size) {
    // efficiently find the perfect match for str in vocab, return its index or -1 if not found
    TokenIndex tok = { .str = str }; // acts as the key to search for
    TokenIndex *res = bsearch(&tok, sorted_vocab, vocab_size, sizeof(TokenIndex), compare_tokens);
    return res != NULL ? res->id : -1;
}

// The encode function turns the string `text` into a sequence of tokens. It does this in several
// steps, including processing the raw UTF-8 byte sequence and the byte pair encoding (BPE) merges.
//
// Parameters:
//   Tokenizer* t  - pointer to the Tokenizer struct.
//   char *text    - the input string to encode.
//   int8_t bos    - flag: prepend the BOS (beginning-of-sequence) token.
//   int8_t eos    - flag: append the EOS (end-of-sequence) token.
//   int *tokens   - pointer to the array that receives the resulting tokens.
//   int *n_tokens - pointer to the integer that receives the number of resulting tokens.
void encode(Tokenizer* t, char *text, int8_t bos, int8_t eos, int *tokens, int *n_tokens) {
    // encode the string text (input) into an upper-bound preallocated tokens[] array
    // bos != 0 means prepend the BOS token (=1), eos != 0 means append the EOS token (=2)
    if (text == NULL) { fprintf(stderr, "cannot encode NULL text\n"); exit(EXIT_FAILURE); }

    if (t->sorted_vocab == NULL) {
        // lazily malloc and sort the vocabulary
        t->sorted_vocab = malloc(t->vocab_size * sizeof(TokenIndex));
        for (int i = 0; i < t->vocab_size; i++) {
            t->sorted_vocab[i].str = t->vocab[i];
            t->sorted_vocab[i].id = i;
        }
        qsort(t->sorted_vocab, t->vocab_size, sizeof(TokenIndex), compare_tokens);
    }

    // create a temporary buffer that will store merge candidates of always two consecutive tokens
    // *2 for concat, +1 for null terminator +2 for UTF8 (in case max_token_length is 1)
    char* str_buffer = malloc((t->max_token_length*2 +1 +2) * sizeof(char));
    size_t str_len = 0;

    // start at 0 tokens
    *n_tokens = 0;

    // add optional BOS (=1) token, if desired
    if (bos) tokens[(*n_tokens)++] = 1;

    // add_dummy_prefix is true by default
    // so prepend a dummy prefix token to the input string, but only if text != ""
    // TODO: pretty sure this isn't correct in the general case but I don't have the
    // energy to read more of the sentencepiece code to figure out what it's doing
    if (text[0] != '\0') {
        int dummy_prefix = str_lookup(" ", t->sorted_vocab, t->vocab_size);
        tokens[(*n_tokens)++] = dummy_prefix;
    }

    // Okay UTF-8 time. This will get messy. Here is the reference from Wikipedia:
    // Code point ↔ UTF-8 conversion
    // First code point    Last code point    Byte 1      Byte 2      Byte 3      Byte 4
    // U+0000              U+007F             0xxxxxxx
    // U+0080              U+07FF             110xxxxx    10xxxxxx
    // U+0800              U+FFFF             1110xxxx    10xxxxxx    10xxxxxx
    // U+10000             U+10FFFF           11110xxx    10xxxxxx    10xxxxxx    10xxxxxx

    // process the raw (UTF-8) byte sequence of the input string
    for (char *c = text; *c != '\0'; c++) {

        // reset buffer if the current byte is ASCII or a leading byte
        // 0xC0 is 11000000, so (*c & 0xC0) keeps the first 2 bits and zeros the rest
        // 0x80 is 10000000
        // in UTF-8, all continuation bytes start with "10" in first two bits
        // so in English this is: "if this byte is not a continuation byte"
        if ((*c & 0xC0) != 0x80) {
            // this byte must be either a leading byte (11...) or an ASCII char (0x...)
            // => reset our location, as we're starting a new UTF-8 codepoint
            str_len = 0;
        }

        // append the current byte to the buffer
        str_buffer[str_len++] = *c; // ++ is post-increment, incremented after this line
        str_buffer[str_len] = '\0';

        // while the next character is a continuation byte, continue appending
        // but if there are too many of them, just stop to avoid overrunning str_buffer size.
        if ((*(c+1) & 0xC0) == 0x80 && str_len < 4) {
            continue;
        }

        // ok c+1 is not a continuation byte, so we've read in a full codepoint
        int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size);

        if (id != -1) {
            // we found this codepoint in vocab, add it as a token
            tokens[(*n_tokens)++] = id;
        } else {
            // byte_fallback encoding: just encode each byte as a token
            // +3 is here because the first 3 vocab elements are <unk>, <s>, </s>
            // so the individual bytes only start at index 3
            for (int i=0; i < str_len; i++) {
                tokens[(*n_tokens)++] = (unsigned char)str_buffer[i] + 3;
            }
        }
        str_len = 0; // protect against a sequence of stray UTF8 continuation bytes
    }

    // merge the best consecutive pair each iteration, according to the scores in vocab_scores
    while (1) {
        float best_score = -1e10;
        int best_id = -1;
        int best_idx = -1;

        for (int i=0; i < (*n_tokens-1); i++) {
            // check if we can merge the pair (tokens[i], tokens[i+1])
            sprintf(str_buffer, "%s%s", t->vocab[tokens[i]], t->vocab[tokens[i+1]]);
            int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size);
            if (id != -1 && t->vocab_scores[id] > best_score) {
                // this merge pair exists in vocab! record its score and position
                best_score = t->vocab_scores[id];
                best_id = id;
                best_idx = i;
            }
        }

        if (best_idx == -1) {
            break; // we couldn't find any more pairs to merge, so we're done
        }

        // merge the consecutive pair (best_idx, best_idx+1) into new token best_id
        tokens[best_idx] = best_id;
        // delete token at position best_idx+1, shift the entire sequence back 1
        for (int i = best_idx+1; i < (*n_tokens-1); i++) {
            tokens[i] = tokens[i+1];
        }
        (*n_tokens)--; // token length decreased
    }

    // add optional EOS (=2) token, if desired
    if (eos) tokens[(*n_tokens)++] = 2;

    free(str_buffer);
}

int main(int argc, char *argv[]) {
    char *tokenizer_path = "tokenizer.bin";

    // build the Tokenizer via the tokenizer .bin file
    Tokenizer tokenizer;
    int vocab_size = 32000; // taken from the model's config.json
    build_tokenizer(&tokenizer, tokenizer_path, vocab_size);

    // encode the (string) prompt into tokens sequence
    char *prompt = "The meaning of life is";
    int num_prompt_tokens = 0;
    int* prompt_tokens = (int*)malloc((strlen(prompt)+3) * sizeof(int)); // +3 for '\0', ?BOS, ?EOS
    encode(&tokenizer, prompt, 1, 0, prompt_tokens, &num_prompt_tokens);
    if (num_prompt_tokens < 1) { fprintf(stderr, "something is wrong, expected at least 1 prompt token\n"); exit(EXIT_FAILURE); }

    int i = 0;
    for(i = 0; i < num_prompt_tokens; i++) {
        int token = prompt_tokens[i];
        printf("%2d, %5d, %12.6lf, (%s)\n", i, token, tokenizer.vocab_scores[token], tokenizer.vocab[token]);
    }

    free_tokenizer(&tokenizer);
    free(prompt_tokens);
    return 0;
}
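The core of encode() is the greedy merge loop at the end: after the text has been split into single-codepoint tokens, each pass scans every adjacent pair, concatenates their strings, looks the result up in the vocabulary, and merges the pair whose merged token has the highest vocab_scores entry; it stops when no adjacent pair forms a known token. The same logic, expressed as a short Python sketch that works directly on strings rather than token IDs, with a made-up toy vocabulary purely for illustration:

# Toy illustration of the greedy merge loop in encode(); the vocabulary and scores are invented.
vocab_scores = {" ": -1.0, "T": -2.0, "h": -2.0, "e": -2.0, "Th": -5.0, "The": -4.0, " The": -3.0}

def greedy_merge(tokens):
    while True:
        best_score, best_idx, best_piece = -1e10, -1, None
        for i in range(len(tokens) - 1):
            pair = tokens[i] + tokens[i + 1]        # concatenate the adjacent pair
            score = vocab_scores.get(pair)          # is the merged string in the vocab?
            if score is not None and score > best_score:
                best_score, best_idx, best_piece = score, i, pair
        if best_idx == -1:
            return tokens                           # no more pairs can be merged
        tokens[best_idx:best_idx + 2] = [best_piece]  # merge the best pair; sequence shrinks by one

print(greedy_merge([" ", "T", "h", "e"]))  # -> [' The']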
Compile test02.c:
make test02
cc test02.c -o test02
Run test02:
./test02
0,     1,     0.000000, ( <s> )
1,   450,  -191.000000, ( The)
2,  6593, -6334.000000, ( meaning)
3,   310,   -51.000000, ( of)
4,  2834, -2575.000000, ( life)
5,   338,   -79.000000, ( is)
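The third column printed by test02 is vocab_scores, i.e. the SentencePiece scores that export() wrote into tokenizer.bin. They can be cross-checked against the original model file; a minimal sketch, assuming newsrc/tokenizer.model from test10.py:

from sentencepiece import SentencePieceProcessor

# Minimal sketch: cross-check the scores printed by test02 against the original model.
sp = SentencePieceProcessor(model_file="newsrc/tokenizer.model")
for tid in [1, 450, 6593, 310, 2834, 338]:
    # expected to match test02's output, e.g. 450 -> '▁The' with score -191.0
    print(tid, sp.id_to_piece(tid), sp.get_score(tid))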
Comparing the outputs of test10.py and test02.c, both produce the same token sequence.