使用 Qwen/Qwen2-VL-2B-Instruct 进行图片自动标注

这是 Qwen-VL 模型的最新版本，代表了近一年的创新。

主要增强功能：

对各种分辨率和比例的图像的理解达到 SoTA（最先进）水平：Qwen2-VL 在视觉理解基准上达到了最先进的性能，包括 MathVista、DocVQA、RealWorldQA、MTVQA 等。
理解 20 分钟+ 的视频：Qwen2-VL 可以理解 20 分钟以上的视频，以进行高质量的基于视频的问答、对话、内容创建等。
可作为操作手机、机器人等设备的智能体（Agent）：Qwen2-VL 具有复杂的推理和决策能力，可以与手机、机器人等设备集成，根据视觉环境和文本指令进行自动操作。
多语言支持：为了服务全球用户，除了英文和中文外，Qwen2-VL 现在还支持理解图像中不同语言的文本，包括大多数欧洲语言、日语、韩语、阿拉伯语、越南语等。

主要有两个模型：

Qwen2-VL-2B-Instruct

Qwen2-VL-7B-Instruct

本文的脚本会对指定目录下的图片文件自动进行标注。图片的高和宽建议不要超过 2048，否则 GPU 显存可能不够；脚本会把任一边超过 2048 的图片按原比例缩小，并覆盖保存原文件。

脚本会为每张图片生成同名的标注文件，例如根据 test01.png 生成 test01.txt，这样的图片与标注文件可以直接用于训练 flux1。

创建环境并安装依赖库

# Create a conda environment named Qwen2-VL with Python 3.10 and switch to it.
conda create -yn Qwen2-VL python=3.10
conda activate Qwen2-VL
# Python dependencies for the captioning script.
pip install qwen-vl-utils
pip3 install torch torchvision torchaudio
# Quoted so the shell does not interpret ">" as an output redirection.
pip install 'accelerate>=0.26.0'
pip install jmespath
pip install six
pip install datasets
pip install auto-gptq
pip install optimum

# Install transformers directly from its GitHub repository instead of a PyPI release.
pip install git+https://github.com/huggingface/transformers

# Create a conda environment named Qwen2-VL with Python 3.10.
conda create -yn Qwen2-VL python=3.10

# Switch to the new environment before installing anything.
conda activate Qwen2-VL

# Python dependencies for the captioning script.
pip install qwen-vl-utils

pip3 install torch torchvision torchaudio

# Quoted so the shell does not interpret ">" as an output redirection.
pip install 'accelerate>=0.26.0'

pip install jmespath

pip install six

pip install datasets

pip install auto-gptq

pip install optimum

# Install transformers directly from its GitHub repository instead of a PyPI release.
pip install git+https://github.com/huggingface/transformers

代码支持多 GPU 运行：当单张 GPU 显存不够时，可以适量增加 GPU。下面的测试使用 3 × 24G（RTX 4090）。

python test08.py

python test08.py

下面是代码

import os
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from tqdm import tqdm
from PIL import Image, ImageOps

# Load the model and its processor once; every image below reuses them.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# Folder whose images are captioned; each caption is written next to its image.
image_folder_path = "/mnt/d/AIImages/Qwen/png/"

# Extensions are matched case-insensitively so "IMG.PNG" or "photo.JPEG" are
# picked up as well.
image_extensions = (".png", ".jpg", ".jpeg")

# Longest allowed edge in pixels; larger images are downscaled before inference.
max_size = 2048

# Sorted so the processing order is deterministic across runs.
# NOTE: images that share a stem (a.png / a.jpg) write to the same a.txt.
image_files = sorted(
    f for f in os.listdir(image_folder_path) if f.lower().endswith(image_extensions)
)

# Walk every image, showing a progress bar.
for filename in tqdm(image_files, desc="Processing Images"):
    image_path = os.path.join(image_folder_path, filename)

    # Open the image, apply the EXIF orientation, and downscale it if either
    # edge exceeds max_size.
    resized = None
    with Image.open(image_path) as img:
        upright = ImageOps.exif_transpose(img)
        if upright.width > max_size or upright.height > max_size:
            # Pin the longer edge to exactly max_size and scale the other edge
            # proportionally; max(1, ...) keeps extreme aspect ratios from
            # rounding down to a zero-sized edge.
            if upright.width > upright.height:
                new_width = max_size
                new_height = max(1, int((max_size / upright.width) * upright.height))
            else:
                new_height = max_size
                new_width = max(1, int((max_size / upright.height) * upright.width))

            resized = upright.resize((new_width, new_height), Image.LANCZOS)

    # Overwrite the original only after its file handle has been closed, so
    # the file is never written while it is still open for reading.
    if resized is not None:
        resized.save(image_path)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": f"file://{image_path}",
                },
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]

    # Build the chat prompt and the vision inputs for the processor.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device the model was actually placed on instead of assuming
    # a literal "cuda" device.
    inputs = inputs.to(model.device)

    # Inference: generate, then drop the prompt tokens from each sequence so
    # only the newly generated tokens are decoded.
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    output_text = output_text[0]  # single prompt in the batch -> single caption

    # Write the caption to a .txt file with the same stem as the image.
    # Explicit UTF-8 so non-ASCII captions do not depend on the platform's
    # default encoding.
    txt_filename = os.path.splitext(filename)[0] + ".txt"
    txt_filepath = os.path.join(image_folder_path, txt_filename)
    with open(txt_filepath, "w", encoding="utf-8") as txt_file:
        txt_file.write(output_text)

import os

import torch

from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

from qwen_vl_utils import process_vision_info

from tqdm import tqdm

from PIL import Image, ImageOps

# Load the model and its processor once; every image below reuses them.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# Folder whose images are captioned; each caption is written next to its image.
image_folder_path = "/mnt/d/AIImages/Qwen/png/"

# Extensions are matched case-insensitively so "IMG.PNG" or "photo.JPEG" are
# picked up as well.
image_extensions = (".png", ".jpg", ".jpeg")

# Longest allowed edge in pixels; larger images are downscaled before inference.
max_size = 2048

# Sorted so the processing order is deterministic across runs.
# NOTE: images that share a stem (a.png / a.jpg) write to the same a.txt.
image_files = sorted(
    f for f in os.listdir(image_folder_path) if f.lower().endswith(image_extensions)
)

# Walk every image, showing a progress bar.
for filename in tqdm(image_files, desc="Processing Images"):
    image_path = os.path.join(image_folder_path, filename)

    # Open the image, apply the EXIF orientation, and downscale it if either
    # edge exceeds max_size.
    resized = None
    with Image.open(image_path) as img:
        upright = ImageOps.exif_transpose(img)
        if upright.width > max_size or upright.height > max_size:
            # Pin the longer edge to exactly max_size and scale the other edge
            # proportionally; max(1, ...) keeps extreme aspect ratios from
            # rounding down to a zero-sized edge.
            if upright.width > upright.height:
                new_width = max_size
                new_height = max(1, int((max_size / upright.width) * upright.height))
            else:
                new_height = max_size
                new_width = max(1, int((max_size / upright.height) * upright.width))

            resized = upright.resize((new_width, new_height), Image.LANCZOS)

    # Overwrite the original only after its file handle has been closed, so
    # the file is never written while it is still open for reading.
    if resized is not None:
        resized.save(image_path)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": f"file://{image_path}",
                },
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]

    # Build the chat prompt and the vision inputs for the processor.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device the model was actually placed on instead of assuming
    # a literal "cuda" device.
    inputs = inputs.to(model.device)

    # Inference: generate, then drop the prompt tokens from each sequence so
    # only the newly generated tokens are decoded.
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    output_text = output_text[0]  # single prompt in the batch -> single caption

    # Write the caption to a .txt file with the same stem as the image.
    # Explicit UTF-8 so non-ASCII captions do not depend on the platform's
    # default encoding.
    txt_filename = os.path.splitext(filename)[0] + ".txt"
    txt_filepath = os.path.join(image_folder_path, txt_filename)
    with open(txt_filepath, "w", encoding="utf-8") as txt_file:
        txt_file.write(output_text)

主要增强功能：

相关文章

发表评论 取消回复

发表评论取消回复