11. Encoding a String into a Token Sequence
During chat, the prompt string has to be encoded into token IDs by the tokenizer. Name the following script test10.py and save it to the newsrc directory:
import os
import struct
import argparse
from typing import List
from sentencepiece import SentencePieceProcessor

class Tokenizer:
    def __init__(self, tokenizer_model=None):
        model_path = tokenizer_model
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        self.model_path = model_path

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        #print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}")
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)

    def export(self):
        # get all the tokens (postprocessed) and their scores as floats
        tokens, scores = [], []
        for i in range(self.n_words):
            # decode the token and light postprocessing
            t = self.sp_model.id_to_piece(i)
            s = self.sp_model.get_score(i)
            if i == self.bos_id:
                t = '\n<s>\n'
            elif i == self.eos_id:
                t = '\n</s>\n'
            t = t.replace('▁', ' ')  # sentencepiece uses this character as whitespace
            b = t.encode('utf-8')    # bytes of this token, utf-8 encoded

            tokens.append(b)
            scores.append(s)

        # record the max token length
        max_token_length = max(len(t) for t in tokens)

        # write to a binary file
        # the tokenizer.bin file is the same as .model file, but .bin
        tokenizer_bin = self.model_path.replace('.model', '.bin')
        with open(tokenizer_bin, 'wb') as f:
            f.write(struct.pack("I", max_token_length))
            for bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(bytes)))
                f.write(bytes)

t = Tokenizer("newsrc/tokenizer.model")
prompt = "The meaning of life is"
start_ids = t.encode(prompt, bos=True, eos=False)
print(start_ids)
Run test10.py:
python newsrc/test10.py
[1, 450, 6593, 310, 2834, 338]
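The export() method in test10.py also defines the on-disk layout of tokenizer.bin that the C program will parse later: a uint32 max_token_length header, followed by one record per token consisting of a float32 score, a uint32 byte length, and the raw UTF-8 bytes of the token string. As a quick sanity check, here is a minimal sketch that reads the file back and prints the first few records; it assumes newsrc/tokenizer.bin has already been generated (for example by calling t.export() on the Tokenizer above):

import struct

# Minimal sketch: dump the header and first few records of tokenizer.bin.
# Layout (written by Tokenizer.export() above, native byte order):
#   uint32 max_token_length, then per token: float32 score, uint32 length, raw UTF-8 bytes.
with open("newsrc/tokenizer.bin", "rb") as f:
    (max_token_length,) = struct.unpack("I", f.read(4))
    print("max_token_length =", max_token_length)
    for i in range(5):
        score, length = struct.unpack("fI", f.read(8))
        piece = f.read(length).decode("utf-8", errors="replace")
        print(i, round(score, 2), repr(piece))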
The same string can be encoded with AutoTokenizer. Name the following script test11.py and save it to the newsrc directory:
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

prompt = "The meaning of life is"
start_ids = tokenizer.encode(prompt)
print(start_ids)
Run test11.py:
python newsrc/test11.py
[1, 450, 6593, 310, 2834, 338]
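The Python results only show numeric IDs. To see which SentencePiece pieces they correspond to, the Hugging Face tokenizer can map the IDs back; a minimal sketch, assuming the same meta-llama/Llama-2-7b-chat-hf tokenizer as test11.py:

from transformers import AutoTokenizer

# Minimal sketch: map the token IDs back to their SentencePiece pieces and to text.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", trust_remote_code=True)
ids = [1, 450, 6593, 310, 2834, 338]
print(tokenizer.convert_ids_to_tokens(ids))             # expected: ['<s>', '▁The', '▁meaning', '▁of', '▁life', '▁is']
print(tokenizer.decode(ids, skip_special_tokens=True))  # expected: 'The meaning of life is'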
Next we implement the same encoding in C, following the run.c file from the https://github.com/karpathy/llama2.c project. Name the file test02.c and save it to the newsrc directory:
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

// The Byte Pair Encoding (BPE) Tokenizer that translates strings <-> tokens

typedef struct {
    // pointer to the token string associated with this id
    char *str;
    // integer id associated with the string
    int id;
} TokenIndex;

typedef struct {
    // array of pointers to the string form of each vocabulary entry,
    // e.g. if vocab[0] is "apple", then vocab[0][0] is the character 'a'
    char** vocab;
    // score/weight associated with each vocabulary entry
    float* vocab_scores;
    // array of (string, id) pairs, sorted by string for binary search
    TokenIndex *sorted_vocab;
    // vocabulary size, i.e. the number of entries in vocab and vocab_scores
    int vocab_size;
    // maximum token length (in bytes) in the vocabulary
    unsigned int max_token_length;
    // stores all single-byte strings; 512 bytes = 256 entries of (byte, '\0')
    unsigned char byte_pieces[512];
} Tokenizer;

int compare_tokens(const void *a, const void *b) {
    return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str);
}

void build_tokenizer(Tokenizer* t, char* tokenizer_path, int vocab_size) {
    // i should have written the vocab_size into the tokenizer file... sigh
    t->vocab_size = vocab_size;
    // malloc space to hold the scores and the strings
    t->vocab = (char**)malloc(vocab_size * sizeof(char*));
    t->vocab_scores = (float*)malloc(vocab_size * sizeof(float));
    t->sorted_vocab = NULL; // initialized lazily
    // initialize byte_pieces, which stores all single-byte strings
    for (int i = 0; i < 256; i++) {
        t->byte_pieces[i * 2] = (unsigned char)i;
        t->byte_pieces[i * 2 + 1] = '\0';
    }
    // read in the file
    FILE *file = fopen(tokenizer_path, "rb");
    if (!file) { fprintf(stderr, "couldn't load %s\n", tokenizer_path); exit(EXIT_FAILURE); }
    // read max_token_length from the file
    if (fread(&t->max_token_length, sizeof(int), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE); }
    int len;
    // read each vocab string and its corresponding vocab_scores value
    for (int i = 0; i < vocab_size; i++) {
        if (fread(t->vocab_scores + i, sizeof(float), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE);}
        if (fread(&len, sizeof(int), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE); }
        t->vocab[i] = (char *)malloc(len + 1);
        if (fread(t->vocab[i], len, 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE); }
        t->vocab[i][len] = '\0'; // add the string terminating token
    }
    fclose(file);
}

void free_tokenizer(Tokenizer* t) {
    for (int i = 0; i < t->vocab_size; i++) { free(t->vocab[i]); }
    if(t->vocab) free(t->vocab);
    if(t->vocab_scores) free(t->vocab_scores);
    if(t->sorted_vocab) free(t->sorted_vocab);
}

void print_tokenizer(Tokenizer* t) {
    printf("vocab = %d\n", t->vocab_size);
    printf("max_token_length = %d\n", t->max_token_length);
    for (int i = 0; i < t->vocab_size; i++) {
        printf("%5d, %12.6lf, (%s)\n", i, t->vocab_scores[i], t->vocab[i]);
    }
}

int str_lookup(char *str, TokenIndex *sorted_vocab, int vocab_size) {
    // efficiently find the perfect match for str in vocab, return its index or -1 if not found
    TokenIndex tok = { .str = str }; // acts as the key to search for
    TokenIndex *res = bsearch(&tok, sorted_vocab, vocab_size, sizeof(TokenIndex), compare_tokens);
    return res != NULL ? res->id : -1;
}

// The encode function turns the string `text` into a sequence of tokens. It does this in several
// steps, including processing the raw UTF-8 byte sequence and the byte pair encoding (BPE) merges.
//
// Parameters:
//   Tokenizer* t  - pointer to the Tokenizer struct.
//   char *text    - the input string to encode.
//   int8_t bos    - flag: prepend the BOS (beginning-of-sequence) token.
//   int8_t eos    - flag: append the EOS (end-of-sequence) token.
//   int *tokens   - pointer to the array that receives the resulting tokens.
//   int *n_tokens - pointer to the integer that receives the number of resulting tokens.
void encode(Tokenizer* t, char *text, int8_t bos, int8_t eos, int *tokens, int *n_tokens) {
    // encode the string text (input) into an upper-bound preallocated tokens[] array
    // bos != 0 means prepend the BOS token (=1), eos != 0 means append the EOS token (=2)
    if (text == NULL) { fprintf(stderr, "cannot encode NULL text\n"); exit(EXIT_FAILURE); }

    if (t->sorted_vocab == NULL) {
        // lazily malloc and sort the vocabulary
        t->sorted_vocab = malloc(t->vocab_size * sizeof(TokenIndex));
        for (int i = 0; i < t->vocab_size; i++) {
            t->sorted_vocab[i].str = t->vocab[i];
            t->sorted_vocab[i].id = i;
        }
        qsort(t->sorted_vocab, t->vocab_size, sizeof(TokenIndex), compare_tokens);
    }

    // create a temporary buffer that will store merge candidates of always two consecutive tokens
    // *2 for concat, +1 for null terminator +2 for UTF8 (in case max_token_length is 1)
    char* str_buffer = malloc((t->max_token_length*2 +1 +2) * sizeof(char));
    size_t str_len = 0;

    // start at 0 tokens
    *n_tokens = 0;

    // add optional BOS (=1) token, if desired
    if (bos) tokens[(*n_tokens)++] = 1;

    // add_dummy_prefix is true by default
    // so prepend a dummy prefix token to the input string, but only if text != ""
    // TODO: pretty sure this isn't correct in the general case but I don't have the
    // energy to read more of the sentencepiece code to figure out what it's doing
    if (text[0] != '\0') {
        int dummy_prefix = str_lookup(" ", t->sorted_vocab, t->vocab_size);
        tokens[(*n_tokens)++] = dummy_prefix;
    }

    // Okay UTF-8 time. This will get messy. Here is the reference from Wikipedia:
    // Code point ↔ UTF-8 conversion
    // First code point    Last code point    Byte 1      Byte 2      Byte 3      Byte 4
    // U+0000              U+007F             0xxxxxxx
    // U+0080              U+07FF             110xxxxx    10xxxxxx
    // U+0800              U+FFFF             1110xxxx    10xxxxxx    10xxxxxx
    // U+10000             U+10FFFF           11110xxx    10xxxxxx    10xxxxxx    10xxxxxx

    // process the raw (UTF-8) byte sequence of the input string
    for (char *c = text; *c != '\0'; c++) {

        // reset buffer if the current byte is ASCII or a leading byte
        // 0xC0 is 11000000, so (*c & 0xC0) keeps the first 2 bits and zeros the rest
        // 0x80 is 10000000
        // in UTF-8, all continuation bytes start with "10" in first two bits
        // so in English this is: "if this byte is not a continuation byte"
        if ((*c & 0xC0) != 0x80) {
            // this byte must be either a leading byte (11...) or an ASCII char (0x...)
            // => reset our location, as we're starting a new UTF-8 codepoint
            str_len = 0;
        }

        // append the current byte to the buffer
        str_buffer[str_len++] = *c; // ++ is post-increment, incremented after this line
        str_buffer[str_len] = '\0';

        // while the next character is a continuation byte, continue appending
        // but if there are too many of them, just stop to avoid overrunning str_buffer size.
        if ((*(c+1) & 0xC0) == 0x80 && str_len < 4) {
            continue;
        }

        // ok c+1 is not a continuation byte, so we've read in a full codepoint
        int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size);

        if (id != -1) {
            // we found this codepoint in vocab, add it as a token
            tokens[(*n_tokens)++] = id;
        } else {
            // byte_fallback encoding: just encode each byte as a token
            // +3 is here because the first 3 vocab elements are <unk>, <s>, </s>
            // so the individual bytes only start at index 3
            for (int i=0; i < str_len; i++) {
                tokens[(*n_tokens)++] = (unsigned char)str_buffer[i] + 3;
            }
        }
        str_len = 0; // protect against a sequence of stray UTF8 continuation bytes
    }

    // merge the best consecutive pair each iteration, according to the scores in vocab_scores
    while (1) {
        float best_score = -1e10;
        int best_id = -1;
        int best_idx = -1;

        for (int i=0; i < (*n_tokens-1); i++) {
            // check if we can merge the pair (tokens[i], tokens[i+1])
            sprintf(str_buffer, "%s%s", t->vocab[tokens[i]], t->vocab[tokens[i+1]]);
            int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size);
            if (id != -1 && t->vocab_scores[id] > best_score) {
                // this merge pair exists in vocab! record its score and position
                best_score = t->vocab_scores[id];
                best_id = id;
                best_idx = i;
            }
        }

        if (best_idx == -1) {
            break; // we couldn't find any more pairs to merge, so we're done
        }

        // merge the consecutive pair (best_idx, best_idx+1) into new token best_id
        tokens[best_idx] = best_id;
        // delete token at position best_idx+1, shift the entire sequence back 1
        for (int i = best_idx+1; i < (*n_tokens-1); i++) {
            tokens[i] = tokens[i+1];
        }
        (*n_tokens)--; // token length decreased
    }

    // add optional EOS (=2) token, if desired
    if (eos) tokens[(*n_tokens)++] = 2;

    free(str_buffer);
}

int main(int argc, char *argv[]) {
    char *tokenizer_path = "tokenizer.bin";

    // build the Tokenizer via the tokenizer .bin file
    Tokenizer tokenizer;
    int vocab_size = 32000; // taken from the model's config.json
    build_tokenizer(&tokenizer, tokenizer_path, vocab_size);

    // encode the (string) prompt into tokens sequence
    char *prompt = "The meaning of life is";
    int num_prompt_tokens = 0;
    int* prompt_tokens = (int*)malloc((strlen(prompt)+3) * sizeof(int)); // +3 for '\0', ?BOS, ?EOS
    encode(&tokenizer, prompt, 1, 0, prompt_tokens, &num_prompt_tokens);
    if (num_prompt_tokens < 1) { fprintf(stderr, "something is wrong, expected at least 1 prompt token\n"); exit(EXIT_FAILURE); }

    int i = 0;
    for(i = 0; i < num_prompt_tokens; i++) {
        int token = prompt_tokens[i];
        printf("%2d, %5d, %12.6lf, (%s)\n", i, token, tokenizer.vocab_scores[token], tokenizer.vocab[token]);
    }

    free_tokenizer(&tokenizer);
    free(prompt_tokens);
    return 0;
}
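The core of encode() is the greedy merge loop at the end: after the text has been split into single-codepoint tokens, each pass scans every adjacent pair, concatenates their strings, looks the result up in the vocabulary, and merges the pair whose merged token has the highest vocab_scores entry; it stops when no adjacent pair forms a known token. The same logic, expressed as a short Python sketch that works directly on strings rather than token IDs, with a made-up toy vocabulary purely for illustration:

# Toy illustration of the greedy merge loop in encode(); the vocabulary and scores are invented.
vocab_scores = {" ": -1.0, "T": -2.0, "h": -2.0, "e": -2.0, "Th": -5.0, "The": -4.0, " The": -3.0}

def greedy_merge(tokens):
    while True:
        best_score, best_idx, best_piece = -1e10, -1, None
        for i in range(len(tokens) - 1):
            pair = tokens[i] + tokens[i + 1]        # concatenate the adjacent pair
            score = vocab_scores.get(pair)          # is the merged string in the vocab?
            if score is not None and score > best_score:
                best_score, best_idx, best_piece = score, i, pair
        if best_idx == -1:
            return tokens                           # no more pairs can be merged
        tokens[best_idx:best_idx + 2] = [best_piece]  # merge the best pair; sequence shrinks by one

print(greedy_merge([" ", "T", "h", "e"]))  # -> [' The']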
Compile test02.c:
make test02
cc test02.c -o test02
Run test02:
./test02
0,     1,     0.000000, ( <s> )
1,   450,  -191.000000, ( The)
2,  6593, -6334.000000, ( meaning)
3,   310,   -51.000000, ( of)
4,  2834, -2575.000000, ( life)
5,   338,   -79.000000, ( is)
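The third column printed by test02 is vocab_scores, i.e. the SentencePiece scores that export() wrote into tokenizer.bin. They can be cross-checked against the original model file; a minimal sketch, assuming newsrc/tokenizer.model from test10.py:

from sentencepiece import SentencePieceProcessor

# Minimal sketch: cross-check the scores printed by test02 against the original model.
sp = SentencePieceProcessor(model_file="newsrc/tokenizer.model")
for tid in [1, 450, 6593, 310, 2834, 338]:
    # expected to match test02's output, e.g. 450 -> '▁The' with score -191.0
    print(tid, sp.id_to_piece(tid), sp.get_score(tid))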
Comparing the outputs of test10.py and test02.c, both produce the same token sequence.