📖 目次
🌟 はじめに
中国のAIDC-AIが開発した次世代マルチモーダルモデル「Ovis2-16B/34B」をWSL2上のUbuntu環境で動作させる方法を解説します。
🖥️ 環境要件
項目 | 推奨仕様 |
---|---|
OS | Ubuntu 24.04 LTS (WSL2可) |
GPU | NVIDIA RTX 4090 (24GB VRAM) ×4 |
CUDA | 12.2以降 |
Python | 3.10.16 |
必要VRAM | 16B: 33GB / 34B: 68GB |
🔧 セットアップ手順
1. リポジトリのクローン
git clone https://github.com/AIDC-AI/Ovis.git && cd Ovis
2. Conda環境構築
conda create -n ovis python=3.10 -y && conda activate ovis
3. 依存関係のインストール
pip install -r requirements.txt && pip install -e .
4. モデルダウンロード
# 16B model
# NOTE(review): HF_HUB_ENABLE_HF_TRANSFER=1 needs the hf_transfer package
# (pip install hf_transfer) — confirm the earlier install steps provide it.
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download AIDC-AI/Ovis2-16B
# 34B model
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download AIDC-AI/Ovis2-34B
⚙️ app.py
ファイルの作成
import logging
import os
from threading import Thread
from typing import List, Any
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, TextIteratorStreamer
# Hugging Face repo id of the checkpoint served by this app.
model_name = "AIDC-AI/Ovis2-16B"
# When True, ovis_chat runs generation on a worker thread; when False it
# runs generation to completion before reading from the streamer.
use_thread = False

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    multimodal_max_length=8192,
    device_map="auto",
    # Per-device memory limits for four visible GPUs. Device 0 is given
    # 1 GiB less than the others (presumably as headroom — confirm for your
    # setup). A uniform limit such as "22GiB" on every device is an
    # alternative.
    max_memory={0: "23GiB", **{gpu: "24GiB" for gpu in (1, 2, 3)}},
)

# Tokenizers are obtained from the loaded model itself.
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()

# One module-level streamer instance, reused by every ovis_chat call.
streamer = TextIteratorStreamer(
    text_tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
)

# Marker that prepare_inputs inserts into the conversation when an image is supplied.
image_placeholder = "<image>"
cur_dir = os.path.dirname(os.path.abspath(__file__))

# Raise the httpx logger to WARNING; everything else logs at INFO.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def initialize_gen_kwargs():
    """Build the keyword arguments that ovis_chat forwards to ``model.generate``.

    Returns:
        A new dict on every call holding the fixed sampling settings plus
        the EOS and pad token ids read from the module-level ``model`` and
        ``text_tokenizer``.
    """
    return dict(
        max_new_tokens=1536,
        do_sample=True,
        num_beams=1,
        top_p=0.75,
        top_k=30,
        temperature=0.7,
        repetition_penalty=1.05,
        eos_token_id=model.generation_config.eos_token_id,
        pad_token_id=text_tokenizer.pad_token_id,
        use_cache=True,
    )
def submit_chat(chatbot, text_input):
    """Append the user's prompt to the chat history with an empty reply slot.

    Args:
        chatbot: Chat history as a list of ``[query, response]`` pairs; it is
            mutated in place.
        text_input: The prompt text the user just submitted.

    Returns:
        A ``(chatbot, "")`` tuple: the updated history and an empty string
        that clears the prompt textbox.
    """
    # Use a mutable list (not a tuple) so the reply slot can be filled in
    # place later via ``chatbot[-1][1] = ...``.
    chatbot.append([text_input, ''])
    return chatbot, ''
def ovis_chat(chatbot: List[List[str]], image_input: Any):
    """Generate the model's reply to the last chat turn, yielding partial output.

    Args:
        chatbot: Chat history as ``[query, response]`` pairs; the response
            slot of the last pair is overwritten as text arrives.
        image_input: Image supplied by the UI, or ``None`` when no image is
            attached.

    Yields:
        The same ``chatbot`` list after each chunk read from the streamer,
        with the last response extended by that chunk.
    """
    _, model_inputs = prepare_inputs(chatbot, image_input)
    gen_kwargs = initialize_gen_kwargs()

    def run_generation():
        # torch.inference_mode() is thread-local, so it must be entered in
        # the thread that actually calls generate (worker or caller).
        with torch.inference_mode():
            model.generate(**model_inputs, **gen_kwargs, streamer=streamer)

    worker = None
    if use_thread:
        worker = Thread(target=run_generation)
        worker.start()
    else:
        # Runs to completion; the streamer is drained afterwards.
        run_generation()

    response = ""
    for new_text in streamer:
        response += new_text
        chatbot[-1][1] = response
        yield chatbot

    if worker is not None:
        worker.join()

    log_conversation(chatbot)
def prepare_inputs(chatbot: List[List[str]], image_input: Any):
    """Turn the chat history and optional image into ``model.generate`` inputs.

    Args:
        chatbot: Chat history as ``[query, response]`` pairs; the last pair
            holds the pending query.
        image_input: Image to attach, or ``None``.

    Returns:
        A ``(conversations, model_inputs)`` tuple: the list of
        ``{"from", "value"}`` turns that was built, and a dict with
        ``inputs``, ``attention_mask`` and ``pixel_values`` entries.
    """
    # Completed turns become alternating human / gpt messages.
    conversations = []
    for query, response in chatbot[:-1]:
        conversations.append({"from": "human", "value": query})
        conversations.append({"from": "gpt", "value": response})

    # The pending query has any literal placeholder text removed.
    pending_query = chatbot[-1][0].replace(image_placeholder, '')
    conversations.append({"from": "human", "value": pending_query})

    if image_input is not None:
        # Prefix the placeholder to the first human turn only.
        first_human = next(
            (turn for turn in conversations if turn["from"] == "human"),
            None,
        )
        if first_human is not None:
            first_human["value"] = f'{image_placeholder}\n{first_human["value"]}'

    logger.info(conversations)

    _prompt, input_ids, pixel_values = model.preprocess_inputs(
        conversations, [image_input], max_partition=16
    )
    # Mask is True wherever the token is not the pad token.
    attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)

    model_inputs = {
        "inputs": input_ids.unsqueeze(0).to(model.device),
        "attention_mask": attention_mask.unsqueeze(0).to(model.device),
    }
    if image_input is not None:
        model_inputs["pixel_values"] = [
            pixel_values.to(dtype=visual_tokenizer.dtype, device=model.device)
        ]
    else:
        model_inputs["pixel_values"] = [None]

    return conversations, model_inputs
def log_conversation(chatbot):
    """Print every question/answer pair between two log markers.

    Args:
        chatbot: Iterable of ``(request, answer)`` pairs; pairs are numbered
            from 1 in the printed output.
    """
    logger.info("[OVIS_CONV_START]")
    # Plain loop: the prints are side effects, so no list needs to be built.
    for i, (request, answer) in enumerate(chatbot, 1):
        print(f'Q{i}:\n {request}\nA{i}:\n {answer}')
    logger.info("[OVIS_CONV_END]")
def clear_chat():
    """Return reset values for the chat history, image input and prompt box.

    Returns:
        A ``(history, image, prompt)`` tuple of ``[]``, ``None`` and ``""``,
        in the order of the outputs wired to the Clear button.
    """
    empty_history = []
    no_image = None
    empty_prompt = ""
    return empty_history, no_image, empty_prompt
# LaTeX delimiter pairs passed to the Chatbot component. Each entry is
# (left, right, display); only the \( ... \) pair has display=False.
latex_delimiters_set = [
    {"left": left, "right": right, "display": display}
    for left, right, display in (
        ("\\(", "\\)", False),
        ("\\begin{equation}", "\\end{equation}", True),
        ("\\begin{align}", "\\end{align}", True),
        ("\\begin{alignat}", "\\end{alignat}", True),
        ("\\begin{gather}", "\\end{gather}", True),
        ("\\begin{CD}", "\\end{CD}", True),
        ("\\[", "\\]", True),
    )
]
# The prompt box is created before the layout and placed into it later
# with render().
text_input = gr.Textbox(
    label="prompt",
    placeholder="Enter your text here...",
    lines=1,
    container=False,
)

# NOTE(review): the nesting of rows/columns below was reconstructed from a
# source whose indentation was lost — confirm the layout matches the intent.
with gr.Blocks(title=model_name.split('/')[-1], theme=gr.themes.Ocean()) as demo:
    with gr.Row():
        with gr.Column(scale=3):
            image_input = gr.Image(label="image", height=350, type="pil")
        with gr.Column(scale=7):
            chatbot = gr.Chatbot(
                label="Ovis",
                layout="panel",
                height=600,
                show_copy_button=True,
                latex_delimiters=latex_delimiters_set,
            )
            text_input.render()
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

    # Send button and Enter key share the same two-step chain: record the
    # prompt and clear the textbox, then stream the model's reply.
    send_click_event = send_btn.click(
        fn=submit_chat,
        inputs=[chatbot, text_input],
        outputs=[chatbot, text_input],
    ).then(fn=ovis_chat, inputs=[chatbot, image_input], outputs=chatbot)

    submit_event = text_input.submit(
        fn=submit_chat,
        inputs=[chatbot, text_input],
        outputs=[chatbot, text_input],
    ).then(fn=ovis_chat, inputs=[chatbot, image_input], outputs=chatbot)

    # Clear resets the history, the image and the prompt box.
    clear_btn.click(clear_chat, outputs=[chatbot, image_input, text_input])

demo.launch()
🚀 起動スクリプト
# Launch app.py with three environment variables set for this one command.
# CUDA_VISIBLE_DEVICES lists physical GPUs 3, 1, 0, 2, which the process
# then sees as devices 0..3 in that order.
# NOTE(review): DISABLE_VERSION_CHECK and PYTORCH_NVML_BASED_CUDA_CHECK are
# not read by app.py itself; presumably consumed by libraries — confirm.
DISABLE_VERSION_CHECK=1 \
PYTORCH_NVML_BASED_CUDA_CHECK=1 \
CUDA_VISIBLE_DEVICES=3,1,0,2 \
python app.py
✅ アクセス方法
起動に成功すると URL( http://127.0.0.1:7860/ )が表示されます。ブラウザでこの URL を開くとアクセスできます。
本ガイドを参考に、ぜひ試してみましょう!