Structured information can also be extracted
This time, let's read an image of a food product's nutrition facts label.
The prompt: "この成分表示の栄養素を日本語でリストにして。" (List the nutrients on this nutrition label in Japanese.)
Model: Qwen2.5-VL-32B-Instruct
Listing the nutrients on this nutrition label in Japanese gives the following:
Nutrition facts
- Energy
  - Per 100 g: 2252 kJ / 539 kcal
  - Per 15 g: 336 kJ / 80 kcal
  - % of reference intake per 15 g: 4%
- Fat (lipids)
  - Per 100 g: 30.9 g
  - Per 15 g: 4.6 g
  - % of reference intake per 15 g: 7%
- Saturated fat
  - Per 100 g: 10.6 g
  - Per 15 g: 1.6 g
  - % of reference intake per 15 g: 8%
- Carbohydrate
  - Per 100 g: 57.5 g
  - Per 15 g: 8.6 g
  - % of reference intake per 15 g: 3%
- Sugars
  - Per 100 g: 56.3 g
  - Per 15 g: 8.4 g
  - % of reference intake per 15 g: 9%
- Protein
  - Per 100 g: 6.3 g
  - Per 15 g: 0.9 g
  - % of reference intake per 15 g: 2%
- Salt (sodium)
  - Per 100 g: 0.107 g
  - Per 15 g: 0.016 g
  - % of reference intake per 15 g: 0%
Other information
- Reference value (for an adult male): 8400 kJ / 2000 kcal
- 15 g ≈ 1 teaspoon
- Storage: keep in a cool, dark, dry place; avoid high temperature and humidity.
- Manufacturer: Ferrero France Commerciale
- Address: CS 90058 - 76136 Mont Saint Aignan Cedex
The above is the list of nutrients on this nutrition label in Japanese.
Model: InternVL
Listing the nutrients on this nutrition label in Japanese gives the following:
- Energy
  - Per 100 g: 2252 kJ / 539 kcal
  - Per 15 g: 336 kJ / 80 kcal
  - % per serving: 4%
- Fats and oils
  - Per 100 g: 30.9 g
  - Per 15 g: 4.6 g
  - % per serving: 7%
- Salt equivalent
  - Per 100 g: 10.6 g
  - Per 15 g: 1.6 g
  - % per serving: 8%
- Carbohydrate
  - Per 100 g: 57.5 g
  - Per 15 g: 8.6 g
  - % per serving: 3%
- Sugars
  - Per 100 g: 56.3 g
  - Per 15 g: 8.4 g
  - % per serving: 9%
- Protein
  - Per 100 g: 6.3 g
  - Per 15 g: 0.9 g
  - % per serving: 2%
- Salt
  - Per 100 g: 0.107 g
  - Per 15 g: 0.016 g
  - % per serving: 0%
This nutrition label uses a 15 g serving as its reference amount. The per-serving percentages are based on the recommended daily intake for an adult.
Wow, that's impressive.
It can also read charts and the like.
How to use image-understanding AI
Qwen2.5
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-32B-Instruct", torch_dtype="auto", device_map="auto"
)
# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
# "Qwen/Qwen2.5-VL-32B-Instruct",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
# device_map="auto",
# )
# default processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-32B-Instruct")
# The default range for the number of visual tokens per image in the model is 4-16384.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-32B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
image = Image.open("image.jpg")
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": image,
            },
            {"type": "text", "text": "この成分表示の栄養素を日本語でリストにして。"},
        ],
    }
]
# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")
# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
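Since the goal here is structured extraction, the same Qwen2.5-VL pipeline can also be asked to return JSON that a program can parse directly. The following is a minimal sketch reusing the model, processor, and image objects from the code above; the JSON prompt wording and the fence-stripping step are illustrative assumptions, not part of the original run.
# Sketch: reuses `model`, `processor`, and `image` defined above; the prompt wording is illustrative.
import json

json_messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Extract the nutrition facts as JSON: a list of objects with keys nutrient, per_100g, per_15g."},
        ],
    }
]
json_text = processor.apply_chat_template(json_messages, tokenize=False, add_generation_prompt=True)
json_image_inputs, json_video_inputs = process_vision_info(json_messages)
json_inputs = processor(
    text=[json_text],
    images=json_image_inputs,
    videos=json_video_inputs,
    padding=True,
    return_tensors="pt",
).to("cuda")
json_ids = model.generate(**json_inputs, max_new_tokens=512)
json_answer = processor.batch_decode(
    [out[len(inp):] for inp, out in zip(json_inputs.input_ids, json_ids)],
    skip_special_tokens=True,
)[0]
# The model may wrap its answer in a Markdown code fence, so strip it before parsing.
json_answer = json_answer.strip().removeprefix("```json").removesuffix("```").strip()
nutrition = json.loads(json_answer)  # raises if the model did not return valid JSON
print(nutrition)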
InternVL
import math
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
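# Preprocessing for each tile: ensure RGB, resize to a square, and normalize with the ImageNet statistics above.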
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform
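# Choose the tiling grid (columns x rows) whose aspect ratio is closest to the original image; on ties, prefer the larger grid when the image is large enough to fill it.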
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio
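# Resize the image to the chosen grid and cut it into image_size x image_size tiles, optionally appending a thumbnail of the whole image.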
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height
    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images
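# Load an image file, tile it with dynamic_preprocess, and return the stacked, normalized pixel tensor.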
def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values
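# Build a device_map that places the vision encoder and embeddings on GPU 0 and spreads the language-model layers across all visible GPUs.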
def split_model(model_name):
    device_map = {}
    world_size = torch.cuda.device_count()
    num_layers = {
        'InternVL2_5-1B': 24, 'InternVL2_5-2B': 24, 'InternVL2_5-4B': 36, 'InternVL2_5-8B': 32,
        'InternVL2_5-26B': 48, 'InternVL2_5-38B': 64, 'InternVL2_5-78B': 80}[model_name]
    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = i
            layer_cnt += 1
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.model.rotary_emb'] = 0
    device_map['language_model.lm_head'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
    return device_map
# If you set `load_in_8bit=True`, you will need one 80GB GPU.
# If you set `load_in_8bit=False`, you will need at least two 80GB GPUs.
path = 'OpenGVLab/InternVL2_5-38B'
device_map = split_model('InternVL2_5-38B')
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    load_in_8bit=True,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map=device_map).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
pixel_values = load_image('image.jpg', max_num=1).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=128, do_sample=True)
# single-image, single-round conversation
question = '<image>\nこの成分表示の栄養素を日本語でリストにして。'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')
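InternVL's chat method can also carry the conversation history, so you can ask a follow-up question about the same image. Below is a minimal sketch reusing the objects created above; the follow-up question text is only an example.
# Multi-round conversation: keep the history from the first question and ask a follow-up.
# (Sketch reusing model, tokenizer, pixel_values, and generation_config from above; the follow-up wording is illustrative.)
question = '<image>\nこの成分表示の栄養素を日本語でリストにして。'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=None, return_history=True)
follow_up = '15gあたりのエネルギーはいくつですか?'
response, history = model.chat(tokenizer, pixel_values, follow_up, generation_config,
                               history=history, return_history=True)
print(f'User: {follow_up}\nAssistant: {response}')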
🐣
I'm a freelance engineer.
For work inquiries, please contact me at
rockyshikoku@gmail.com
I build apps that use Core ML.
I share information about machine learning.