【完全ガイド】Ovis2-16B/34BマルチモーダルAIを動かす方法

Posted at 2025-02-15

📖 目次

はじめに
環境要件
セットアップ手順
app.pyファイルの作成
起動スクリプト
アクセス方法

🌟 はじめに

中国のAIDC-AIが開発した次世代マルチモーダルモデル「Ovis2-16B/34B」をWSL2上のUbuntu環境で動作させる方法を解説します。

🖥️ 環境要件

項目	推奨仕様
OS	Ubuntu 24.04 LTS (WSL2可)
GPU	NVIDIA RTX 4090 (24GB VRAM) ×4
CUDA	12.2以降
Python	3.10.16
必要VRAM	16B: 33GB / 34B: 68GB

🔧 セットアップ手順

1. リポジトリのクローン

git clone https://github.com/AIDC-AI/Ovis.git && cd Ovis

2. Conda環境構築

conda create -n ovis python=3.10 -y && conda activate ovis

3. 依存関係のインストール

pip install -r requirements.txt && pip install -e .

4. モデルダウンロード

# 16Bモデル（HF_HUB_ENABLE_HF_TRANSFER=1 を使うには、事前に pip install hf_transfer が必要です）
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download AIDC-AI/Ovis2-16B

# 34Bモデル
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download AIDC-AI/Ovis2-34B

⚙️ `app.py`ファイルの作成

import logging
import os
from threading import Thread
from typing import List, Any

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, TextIteratorStreamer

# Hugging Face model id handed to from_pretrained(); its last path component
# is also used as the title of the Gradio page further down.
model_name = 'AIDC-AI/Ovis2-16B'
# When True, ovis_chat() runs model.generate() on a background Thread;
# when False, generate() is called synchronously before the streamer is read.
use_thread = False

# Load the model at import time. trust_remote_code=True is set, so the custom
# model code shipped with the checkpoint is executed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    multimodal_max_length=8192,
    trust_remote_code=True,
    device_map="auto",
    # Alternative uniform per-device cap, kept for reference:
    # max_memory={i: "22GiB" for i in range(4)}
    # Per-device memory caps for device indices 0-3.
    # NOTE(review): presumably device 0 gets a slightly smaller cap to leave
    # headroom for inputs/activations -- confirm against the target GPUs.
    max_memory={0: "23GiB", 1: "24GiB", 2: "24GiB", 3: "24GiB"}
)

# Tokenizers are obtained from the model object itself.
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()
# Module-level streamer that decodes generated tokens into text, configured to
# skip the prompt and special tokens.
streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)
# Marker string stripped from / prepended to user messages in prepare_inputs().
image_placeholder = '<image>'
# Directory containing this script (not referenced elsewhere in the visible code).
cur_dir = os.path.dirname(os.path.abspath(__file__))

# Raise the "httpx" logger threshold to WARNING, then configure root logging at INFO.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def initialize_gen_kwargs():
    """Return a fresh dict of keyword arguments for ``model.generate``.

    The sampling settings are fixed constants; the EOS and PAD token ids are
    read from the module-level ``model`` and ``text_tokenizer`` on every call.
    """
    sampling_options = dict(
        max_new_tokens=1536,
        do_sample=True,
        num_beams=1,
        top_p=0.75,
        top_k=30,
        temperature=0.7,
        repetition_penalty=1.05,
    )
    special_token_ids = dict(
        eos_token_id=model.generation_config.eos_token_id,
        pad_token_id=text_tokenizer.pad_token_id,
    )
    # Merge in the same key order as the generation call expects to see them.
    return {**sampling_options, **special_token_ids, "use_cache": True}


def submit_chat(chatbot, text_input):
    """Append the user's message to the chat history with an empty reply slot.

    Args:
        chatbot: Chat history as a list of ``[user_message, reply]`` pairs.
            ``None`` is treated as an empty history.
        text_input: The text the user just submitted.

    Returns:
        A ``(history, '')`` tuple: the updated history and an empty string
        used to clear the input textbox.
    """
    if chatbot is None:
        chatbot = []
    # Append a list rather than a tuple: ovis_chat() fills the reply in place
    # via ``chatbot[-1][1] = response``, which requires a mutable row.
    chatbot.append([text_input, ''])
    return chatbot, ''


def ovis_chat(chatbot: List[List[str]], image_input: Any):
    """Generate the model's reply to the last chat turn and stream it back.

    This is a generator: after each decoded text chunk it writes the
    accumulated reply into ``chatbot[-1][1]`` and yields the whole history.

    Args:
        chatbot: Chat history as ``[user_message, reply]`` rows; the last row
            is the turn being answered and must be mutable.
        image_input: The image for this conversation, or ``None``.

    Yields:
        The ``chatbot`` list, updated in place with the reply so far.
    """
    _, model_inputs = prepare_inputs(chatbot, image_input)
    gen_kwargs = initialize_gen_kwargs()

    # Use a streamer private to this call so overlapping requests cannot read
    # each other's tokens from a shared queue.
    reply_streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)

    def run_generation():
        # inference_mode is thread-local, so it is entered inside the callable
        # that actually runs generate(); this also avoids holding the context
        # open across the yields below.
        with torch.inference_mode():
            model.generate(**model_inputs, **gen_kwargs, streamer=reply_streamer)

    worker = None
    if use_thread:
        worker = Thread(target=run_generation)
        worker.start()
    else:
        # Synchronous mode: generation finishes first, then the queued text
        # is drained by the loop below.
        run_generation()

    response = ""
    for new_text in reply_streamer:
        response += new_text
        chatbot[-1][1] = response
        yield chatbot

    if worker is not None:
        worker.join()

    log_conversation(chatbot)


def prepare_inputs(chatbot: List[List[str]], image_input: Any):
    """Convert the chat history and optional image into ``model.generate`` inputs.

    Args:
        chatbot: Chat history as ``[user_message, reply]`` rows; the last row
            holds the query that is about to be answered.
        image_input: The image for this conversation, or ``None``.

    Returns:
        A ``(conversations, model_inputs)`` tuple: the list of
        ``{"from", "value"}`` turns passed to ``model.preprocess_inputs`` and
        the keyword-argument dict for ``model.generate``.
    """
    has_image = image_input is not None

    # Completed turns become alternating human / gpt messages.
    conversations = []
    for user_text, bot_text in chatbot[:-1]:
        conversations.append({"from": "human", "value": user_text})
        conversations.append({"from": "gpt", "value": bot_text})

    # The pending query is added with any image placeholder removed.
    pending_query = chatbot[-1][0].replace(image_placeholder, '')
    conversations.append({"from": "human", "value": pending_query})

    if has_image:
        # Prefix the placeholder to the first human message only.
        first_human = next(
            (turn for turn in conversations if turn["from"] == "human"),
            None,
        )
        if first_human is not None:
            first_human["value"] = f'{image_placeholder}\n{first_human["value"]}'

    logger.info(conversations)

    prompt, input_ids, pixel_values = model.preprocess_inputs(conversations, [image_input], max_partition=16)
    attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)

    batched_ids = input_ids.unsqueeze(0).to(model.device)
    batched_mask = attention_mask.unsqueeze(0).to(model.device)
    if has_image:
        # Pixel values take the visual tokenizer's dtype on the model's device.
        pixel_batch = [pixel_values.to(
            dtype=visual_tokenizer.dtype,
            device=model.device
        )]
    else:
        pixel_batch = [None]

    model_inputs = {
        "inputs": batched_ids,
        "attention_mask": batched_mask,
        "pixel_values": pixel_batch,
    }

    return conversations, model_inputs


def log_conversation(chatbot):
    """Dump the whole conversation between start/end log markers.

    The markers go through ``logger``; each question/answer pair is written
    with ``print``, numbered from 1.

    Args:
        chatbot: Chat history as an iterable of ``(request, answer)`` pairs.
    """
    logger.info("[OVIS_CONV_START]")
    # A plain loop: this is done purely for its side effect, so there is no
    # reason to build a throwaway list of None values.
    for i, (request, answer) in enumerate(chatbot, 1):
        print(f'Q{i}:\n {request}\nA{i}:\n {answer}')
    logger.info("[OVIS_CONV_END]")


def clear_chat():
    """Return reset values for the chat history, the image and the text box.

    Returns:
        A tuple ``(history, image, text)`` equal to ``([], None, "")``.
    """
    empty_history = []
    no_image = None
    blank_text = ""
    return empty_history, no_image, blank_text


# LaTeX delimiter pairs passed to the Chatbot component. Each entry gives the
# opening and closing delimiter and whether the "display" flag is set.
latex_delimiters_set = [
    dict(left="\\(", right="\\)", display=False),
    dict(left="\\begin{equation}", right="\\end{equation}", display=True),
    dict(left="\\begin{align}", right="\\end{align}", display=True),
    dict(left="\\begin{alignat}", right="\\end{alignat}", display=True),
    dict(left="\\begin{gather}", right="\\end{gather}", display=True),
    dict(left="\\begin{CD}", right="\\end{CD}", display=True),
    dict(left="\\[", right="\\]", display=True),
]

# The prompt textbox is created outside the Blocks context and placed into the
# layout later through text_input.render().
text_input = gr.Textbox(label="prompt", placeholder="Enter your text here...", lines=1, container=False)
# Page title is the part of model_name after the last '/'.
with gr.Blocks(title=model_name.split('/')[-1], theme=gr.themes.Ocean()) as demo:
    with gr.Row():
        # Left column: image upload, delivered to callbacks as a PIL image.
        with gr.Column(scale=3):
            image_input = gr.Image(label="image", height=350, type="pil")
        # Right column: chat panel, prompt box and the Send / Clear buttons.
        with gr.Column(scale=7):
            chatbot = gr.Chatbot(
                label="Ovis", layout="panel", height=600,
                show_copy_button=True,
                latex_delimiters=latex_delimiters_set
            )
            text_input.render()
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

    # Clicking Send first records the user message (submit_chat), then runs
    # ovis_chat to fill in the reply. The returned event handle is not used
    # further in the visible code.
    send_click_event = send_btn.click(
        submit_chat,
        [chatbot, text_input],
        [chatbot, text_input]
    ).then(
        ovis_chat,
        [chatbot, image_input],
        chatbot
    )
    # Submitting the textbox is wired to the same two-step chain as the Send
    # button. The returned event handle is likewise unused in the visible code.
    submit_event = text_input.submit(
        submit_chat,
        [chatbot, text_input],
        [chatbot, text_input]
    ).then(
        ovis_chat,
        [chatbot, image_input],
        chatbot
    )
    # Clear resets the chat history, the image and the prompt box.
    clear_btn.click(clear_chat, outputs=[chatbot, image_input, text_input])

# Called at module level, so the app is launched whenever this file is executed.
demo.launch()

🚀 起動スクリプト

DISABLE_VERSION_CHECK=1 \
PYTORCH_NVML_BASED_CUDA_CHECK=1 \
CUDA_VISIBLE_DEVICES=3,1,0,2 \
python app.py

✅ アクセス方法

起動に成功すると表示されるURL（http://127.0.0.1:7860/）をブラウザで開くと、アプリにアクセスできます。

本ガイドを参考に、ぜひ試してみましょう！

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up

【完全ガイド】Ovis2-16B/34BマルチモーダルAIを動かす方法

📖 目次

🌟 はじめに

🖥️ 環境要件

🔧 セットアップ手順

1. リポジトリのクローン

2. Conda環境構築

3. 依存関係のインストール

4. モデルダウンロード

⚙️ app.pyファイルの作成

🚀 起動スクリプト

✅ アクセス方法

⚙️ `app.py`ファイルの作成