使用 LoRA 微调一个 PDF 使用手册（2）

这一节介绍把 md 文件转换为 jsonl 文件，便于微调使用

pdf 文件转换 md 文件后，大概的格式如下：

这是文档的第一句话，没有任何标题标注。
# 第一部分
这里是第一部分的内容。
# 第二部分
这里是第二部分的内容。

这是文档的第一句话，没有任何标题标注。

# 第一部分

这里是第一部分的内容。

# 第二部分

这里是第二部分的内容。

为了保证数据的有效性，提供的文档需要进行第二次处理

我们微调的文档大部分是英文的，尽量保持原来英文的逻辑，但是需要分段，而分段也是一个技巧。

每一段最多可以有多长？如何分段？

这里是这样分段，按 md 的分段逻辑来。

以 # 开头的标题作为一段的起点；如果这一段内容太少，可以继续合并下一个 # 段，
但是合并后的总长度不能超过 2048 个 token（注意是 token，不是字节；这个上限可以自行修改）。
如果某一段本身超过 2048 个 token，则会按行再拆分。
使用的模型是 https://huggingface.co/Qwen/Qwen2-7B-Instruct

在微调大型语言模型（如Qwen-2）时，通常会按token来处理，而不是字节。这是因为：

Token是模型的基本处理单元：语言模型在输入文本时会将其拆分为token（通常是词、词的一部分或字符）。不同的tokenizer（如BPE、WordPiece）有不同的方式生成token，因此token数直接决定模型的输入长度。
Token与字节的关系：虽然字节数在文本编码和存储中有意义，但在语言模型中，模型实际处理的是token。即使两个文本的字节数相同，token数可能会不同，因为同一个字符序列在不同的上下文中可能被切分成不同数量的token。
最大长度限制：微调过程中，输入序列的长度限制是基于token的。例如，大多数模型在处理时会有最大token数（如2048或4096 tokens）的限制，而不是直接根据字节数。

下面是示例代码：

import re
import json
from transformers import AutoTokenizer

# Initialize the tokenizer (Qwen2-7B-Instruct, loaded by its Hugging Face
# model id). The functions below use it to measure text length in tokens.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

def split_by_headers(md_content):
    """Split Markdown text into sections that each start at a ``#`` line.

    A new section begins at every line whose first character is ``#``; any
    text before the first such line becomes its own leading section.

    Args:
        md_content: The full Markdown text.

    Returns:
        A list of sections in document order, each stripped of surrounding
        whitespace. Sections that are empty after stripping are omitted.
    """
    # The zero-width lookahead splits *before* each line starting with '#',
    # so the heading line stays attached to the text that follows it.
    pieces = re.split(r'(?=^#)', md_content, flags=re.MULTILINE)

    sections = []
    for piece in pieces:
        trimmed = piece.strip()
        if trimmed:
            sections.append(trimmed)
    return sections

def split_long_section(section, max_length=2048):
    """Split one section into line-aligned chunks of bounded token length.

    Lines are accumulated greedily: a line is appended to the current chunk
    as long as the chunk (lines joined with ``"\\n"``) stays within
    ``max_length`` tokens; otherwise the current chunk is emitted and the
    line starts a new chunk.

    Args:
        section: Text of a single section.
        max_length: Maximum number of tokens per chunk, measured with the
            module-level ``tokenizer`` without special tokens.

    Returns:
        A list of non-empty chunk strings in their original order. Lines are
        never cut, so a single line that by itself exceeds ``max_length`` is
        emitted as its own (oversized) chunk.
    """
    chunks = []
    current_chunk = ""

    for line in section.split("\n"):
        candidate = current_chunk + "\n" + line if current_chunk else line
        token_length = len(tokenizer.encode(candidate, add_special_tokens=False))

        if token_length <= max_length:
            current_chunk = candidate
            continue

        # Overflow: emit what has been collected so far. The guard matters
        # when the overflowing line is the first one of a chunk; without it
        # an empty string would be emitted as a chunk.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = line

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def merge_sections(sections, max_length=2048):
    """Greedily merge consecutive sections up to ``max_length`` tokens.

    Adjacent sections are joined with ``"\\n"`` while the joined text stays
    within ``max_length`` tokens (measured with the module-level
    ``tokenizer``, without special tokens). When adding the next section
    would exceed the limit, the accumulated text is emitted and the next
    section starts a new accumulation. Emitted text that is itself longer
    than ``max_length`` is passed through ``split_long_section``.

    Args:
        sections: Sequence of section strings in document order.
        max_length: Maximum number of tokens per merged chunk.

    Returns:
        A list of chunk strings in document order; an empty list when
        ``sections`` is empty.
    """
    # An empty or whitespace-only document yields no sections; return early
    # instead of failing on sections[0].
    if not sections:
        return []

    merged_sections = []

    def emit(text):
        # A single section can already be over the limit on its own; break
        # it up by lines before storing it.
        if len(tokenizer.encode(text, add_special_tokens=False)) > max_length:
            merged_sections.extend(split_long_section(text, max_length))
        else:
            merged_sections.append(text)

    current_section = sections[0]
    for next_section in sections[1:]:
        candidate = current_section + "\n" + next_section
        token_length = len(tokenizer.encode(candidate, add_special_tokens=False))

        if token_length <= max_length:
            current_section = candidate
        else:
            emit(current_section)
            current_section = next_section

    # Emit whatever is still being accumulated after the last section.
    emit(current_section)

    return merged_sections

def process_markdown_file(md_file, max_length=2048):
    """Read a Markdown file and return its content as merged chunks.

    The file is read as UTF-8, split with ``split_by_headers`` and the
    resulting sections are passed to ``merge_sections``.

    Args:
        md_file: Path of the Markdown file to read.
        max_length: Token limit forwarded to ``merge_sections``.

    Returns:
        The list of chunk strings returned by ``merge_sections``.
    """
    with open(md_file, 'r', encoding='utf-8') as source:
        text = source.read()

    return merge_sections(split_by_headers(text), max_length)

def save_to_jsonl(sections, output_file):
    """Write each section to ``output_file`` as one JSON object per line.

    Every record contains ``token_length`` (token count from the
    module-level ``tokenizer``, without special tokens), ``byte_length``
    (size of the UTF-8 encoding) and ``content`` (the section text).
    Non-ASCII characters are written as-is rather than escaped. The file is
    opened in write mode, so existing content is replaced.

    Args:
        sections: Iterable of section strings.
        output_file: Path of the JSONL file to write.
    """
    def build_record(text):
        return {
            "token_length": len(tokenizer.encode(text, add_special_tokens=False)),
            "byte_length": len(text.encode('utf-8')),
            "content": text,
        }

    with open(output_file, 'w', encoding='utf-8') as out:
        for text in sections:
            serialized = json.dumps(build_record(text), ensure_ascii=False)
            out.write(serialized + "\n")

# Usage example: chunk the converted manual with a 2048-token limit and
# write the resulting chunks to a JSONL file.
split_sections = process_markdown_file("output/EX_SAG_10x/EX_SAG_10x.md", max_length=2048)
save_to_jsonl(split_sections, "EX_SAG_10x.jsonl")

import re

import json

from transformers import AutoTokenizer

# Initialize the tokenizer (Qwen2-7B-Instruct, loaded by its Hugging Face
# model id). The functions below use it to measure text length in tokens.

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

def split_by_headers(md_content):
    """Split Markdown text into sections that each start at a ``#`` line.

    A new section begins at every line whose first character is ``#``; any
    text before the first such line becomes its own leading section.

    Args:
        md_content: The full Markdown text.

    Returns:
        A list of sections in document order, each stripped of surrounding
        whitespace. Sections that are empty after stripping are omitted.
    """
    # The zero-width lookahead splits *before* each line starting with '#',
    # so the heading line stays attached to the text that follows it.
    pieces = re.split(r'(?=^#)', md_content, flags=re.MULTILINE)

    sections = []
    for piece in pieces:
        trimmed = piece.strip()
        if trimmed:
            sections.append(trimmed)
    return sections

def split_long_section(section, max_length=2048):
    """Split one section into line-aligned chunks of bounded token length.

    Lines are accumulated greedily: a line is appended to the current chunk
    as long as the chunk (lines joined with ``"\\n"``) stays within
    ``max_length`` tokens; otherwise the current chunk is emitted and the
    line starts a new chunk.

    Args:
        section: Text of a single section.
        max_length: Maximum number of tokens per chunk, measured with the
            module-level ``tokenizer`` without special tokens.

    Returns:
        A list of non-empty chunk strings in their original order. Lines are
        never cut, so a single line that by itself exceeds ``max_length`` is
        emitted as its own (oversized) chunk.
    """
    chunks = []
    current_chunk = ""

    for line in section.split("\n"):
        candidate = current_chunk + "\n" + line if current_chunk else line
        token_length = len(tokenizer.encode(candidate, add_special_tokens=False))

        if token_length <= max_length:
            current_chunk = candidate
            continue

        # Overflow: emit what has been collected so far. The guard matters
        # when the overflowing line is the first one of a chunk; without it
        # an empty string would be emitted as a chunk.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = line

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def merge_sections(sections, max_length=2048):
    """Greedily merge consecutive sections up to ``max_length`` tokens.

    Adjacent sections are joined with ``"\\n"`` while the joined text stays
    within ``max_length`` tokens (measured with the module-level
    ``tokenizer``, without special tokens). When adding the next section
    would exceed the limit, the accumulated text is emitted and the next
    section starts a new accumulation. Emitted text that is itself longer
    than ``max_length`` is passed through ``split_long_section``.

    Args:
        sections: Sequence of section strings in document order.
        max_length: Maximum number of tokens per merged chunk.

    Returns:
        A list of chunk strings in document order; an empty list when
        ``sections`` is empty.
    """
    # An empty or whitespace-only document yields no sections; return early
    # instead of failing on sections[0].
    if not sections:
        return []

    merged_sections = []

    def emit(text):
        # A single section can already be over the limit on its own; break
        # it up by lines before storing it.
        if len(tokenizer.encode(text, add_special_tokens=False)) > max_length:
            merged_sections.extend(split_long_section(text, max_length))
        else:
            merged_sections.append(text)

    current_section = sections[0]
    for next_section in sections[1:]:
        candidate = current_section + "\n" + next_section
        token_length = len(tokenizer.encode(candidate, add_special_tokens=False))

        if token_length <= max_length:
            current_section = candidate
        else:
            emit(current_section)
            current_section = next_section

    # Emit whatever is still being accumulated after the last section.
    emit(current_section)

    return merged_sections

def process_markdown_file(md_file, max_length=2048):
    """Read a Markdown file and return its content as merged chunks.

    The file is read as UTF-8, split with ``split_by_headers`` and the
    resulting sections are passed to ``merge_sections``.

    Args:
        md_file: Path of the Markdown file to read.
        max_length: Token limit forwarded to ``merge_sections``.

    Returns:
        The list of chunk strings returned by ``merge_sections``.
    """
    with open(md_file, 'r', encoding='utf-8') as file:
        md_content = file.read()

    sections = split_by_headers(md_content)
    merged_sections = merge_sections(sections, max_length)

    return merged_sections

def save_to_jsonl(sections, output_file):
    """Write each section to ``output_file`` as one JSON object per line.

    Every record contains ``token_length`` (token count from the
    module-level ``tokenizer``, without special tokens), ``byte_length``
    (size of the UTF-8 encoding) and ``content`` (the section text).
    Non-ASCII characters are written as-is rather than escaped. The file is
    opened in write mode, so existing content is replaced.

    Args:
        sections: Iterable of section strings.
        output_file: Path of the JSONL file to write.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        for section in sections:
            token_length = len(tokenizer.encode(section, add_special_tokens=False))
            byte_length = len(section.encode('utf-8'))
            json_line = json.dumps({
                "token_length": token_length,
                "byte_length": byte_length,
                "content": section
            }, ensure_ascii=False)
            f.write(json_line + "\n")

# Usage example: chunk the converted manual with a 2048-token limit and
# write the resulting chunks to a JSONL file.

split_sections = process_markdown_file("output/EX_SAG_10x/EX_SAG_10x.md", max_length=2048)

save_to_jsonl(split_sections, "EX_SAG_10x.jsonl")

相关文章

发表评论 取消回复

发表评论取消回复