PyQt6で作るYOLO動画ラベリングツール

Last updated at 2025-09-22 / Posted at 2025-09-22

はじめに

YOLOモデルのファインチューニングにおいて、学習が不足している部分を検出し、追加の学習データを作成するためのツールを作成してみました。
動画にYOLO推論を重ねてインタラクティブにラベリングできるGUIツールです。
Positive/Negative登録やVOC形式のXML出力も可能で、動画からフレーム毎にデータ作成を行います。

機能

動画再生/停止、フレーム送り/戻し
マウスでボックス操作（ドラッグ・リサイズ）
Ctrl+クリックで新規ボックス作成
Positive/Negative画像保存
XML出力（VOC形式）
YOLOv8モデル対応（.pt形式）

実行環境

Python3.13

# requirements.txt
PyQt6==6.9.1
opencv-python==4.12.0.88
ultralytics==8.3.202
torch==2.8.0
torchvision==0.23.0

画像

実行手順

:: 仮想環境作成
python -m venv interactive-vision

:: 仮想環境有効化（Windows コマンドプロンプト）
interactive-vision\Scripts\activate

:: 依存関係インストール
pip install -r requirements.txt

:: 実行
python interactive_yolo.py

操作方法

1. 動画・モデル・出力先の選択

動画選択 ボタンをクリックして、ラベリングしたい動画ファイル（.mp4/.avi/.mov）を選択
モデル選択 ボタンをクリックして、YOLOv8の学習済みモデルファイル（.pt）を選択
出力フォルダ選択 ボタンで、Positive/Negative画像やXMLを保存するフォルダを選択

2. 再生・フレーム操作

再生/停止 ボタン：動画の再生・停止を切り替え
前フレーム / 次フレーム ボタン：1フレームずつ移動
FPS入力欄：動画再生速度を変更可能（デフォルトは30fps）。変更した値は、次に再生を開始したときに反映されます

3. ボックス操作（ラベリング）

既存ボックスをクリック：ボックス選択して移動（ドラッグ）
ボックスの角をドラッグ：ボックスサイズを変更（リサイズ）
Ctrl + クリック：既存ボックスの外側をクリックすると、クリック位置を左上として新しいボックスを作成（初期サイズは元フレーム上で50x50ピクセル）
ボックスが選択されている状態では、色が黄色に変化

4. Positive / Negative登録

ボックスを選択した状態で Positive登録 または Negative登録 ボタンをクリック
選択中のボックスが現在のフレームに描画された画像として保存される
保存パス：出力フォルダ/YYYYMMDD/positive/ または .../negative/

5. XML保存

XML保存 ボタンをクリックすると、すべての登録データがVOC形式のXMLとして保存される
保存パス：出力フォルダ/YYYYMMDD/annotations/動画名.xml

6. マウス操作まとめ

操作	内容
左クリック	ボックスを選択
左クリック+ドラッグ	ボックス移動
ボックス角をドラッグ	ボックスリサイズ
Ctrl + 左クリック	新規ボックス作成

7. 注意事項

動画とモデルを選択していない場合、再生ボタンは動作しません
フレーム送り・戻しは動画範囲外には移動できません
XML保存前に必ず Positive/Negative 登録を行ってください

コード

import sys, os
from PyQt6.QtWidgets import (
    QApplication, QWidget, QLabel, QPushButton, QVBoxLayout, QHBoxLayout,
    QFileDialog, QLineEdit, QMessageBox
)
from PyQt6.QtCore import QTimer, Qt
from PyQt6.QtGui import QPixmap, QImage
import cv2
from ultralytics import YOLO
import xml.etree.ElementTree as ET
from datetime import datetime

class InteractiveYOLO(QWidget):
    """Window for labeling video frames on top of YOLO detections.

    Each displayed frame is run through the loaded YOLO model; the resulting
    boxes can be selected, moved, resized, or extended with new boxes using
    the mouse.  A selected box can be registered as "positive" or "negative"
    (an annotated JPEG is written) and all registered boxes can be exported
    to a single VOC-style XML file.

    All box coordinates are kept in original-frame pixels; the frame is only
    scaled for display.
    """

    # Edge length (in original-frame pixels) of a box created with Ctrl+click.
    NEW_BOX_SIZE = 50
    # Timer interval used when the FPS field does not hold a positive number.
    DEFAULT_INTERVAL_MS = 30

    def __init__(self):
        """Initialise state and build the UI; no video or model is loaded."""
        super().__init__()
        self.setWindowTitle("Interactive YOLO")

        # --- Basic state ---
        self.video_path = ""
        self.model_path = ""
        self.output_dir = ""
        self.cap = None
        self.detector = None
        self.frame_idx = 0
        self.total_frames = 0
        self.current_frame = None
        self.frame_size = (640, 480)
        # Index of the frame the next cap.read() returns; None when unknown.
        self._cap_pos = None
        self.timer = QTimer(self)
        self.timer.timeout.connect(self.next_frame)

        # --- Box management ---
        self.boxes = []  # list of dicts: {"xyxy": [x1, y1, x2, y2], "cls": cls_id}
        self.selected_box = None
        self.dragging = False
        self.resizing = False
        self.resizing_corner = None
        self.start_pt = None
        self.aspect_fixed = False
        self.annotation_data = []  # list of (frame_idx, box dict snapshot)

        self.corner_size = 10

        self.init_ui()

    # ----------------------------
    # UI construction
    # ----------------------------
    def init_ui(self):
        """Create the widgets, lay them out and hook up the signal handlers."""
        self.video_label = QLabel("動画表示")
        self.video_label.setFixedSize(1280, 720)
        self.video_label.setAlignment(Qt.AlignmentFlag.AlignCenter)

        # Right-hand control panel
        self.video_input = QLineEdit()
        self.video_btn = QPushButton("動画選択")
        self.video_btn.clicked.connect(self.select_video)

        self.model_input = QLineEdit()
        self.model_btn = QPushButton("モデル選択")
        self.model_btn.clicked.connect(self.select_model)

        self.output_input = QLineEdit()
        self.output_btn = QPushButton("出力フォルダ選択")
        self.output_btn.clicked.connect(self.select_output)

        self.fps_input = QLineEdit("30")

        self.play_pause_button = QPushButton("再生/停止")
        self.play_pause_button.clicked.connect(self.toggle_playback)
        self.prev_button = QPushButton("前フレーム")
        self.prev_button.clicked.connect(lambda: self.goto_frame(self.frame_idx - 1))
        self.next_button = QPushButton("次フレーム")
        self.next_button.clicked.connect(lambda: self.goto_frame(self.frame_idx + 1))
        self.save_pos_button = QPushButton("Positive登録")
        self.save_pos_button.clicked.connect(lambda: self.save_box("positive"))
        self.save_neg_button = QPushButton("Negative登録")
        self.save_neg_button.clicked.connect(lambda: self.save_box("negative"))
        self.save_xml_button = QPushButton("XML保存")
        self.save_xml_button.clicked.connect(self.save_xml)

        right_layout = QVBoxLayout()
        path_rows = [
            ("動画:", [self.video_input, self.video_btn]),
            ("モデル:", [self.model_input, self.model_btn]),
            ("出力フォルダ:", [self.output_input, self.output_btn]),
        ]
        for caption, row_widgets in path_rows:
            right_layout.addWidget(QLabel(caption))
            row = QHBoxLayout()
            for w in row_widgets:
                row.addWidget(w)
            right_layout.addLayout(row)
        right_layout.addWidget(QLabel("FPS:"))
        right_layout.addWidget(self.fps_input)
        for button in [self.play_pause_button, self.prev_button, self.next_button,
                       self.save_pos_button, self.save_neg_button, self.save_xml_button]:
            right_layout.addWidget(button)
        right_layout.addStretch()

        main_layout = QHBoxLayout()
        main_layout.addWidget(self.video_label)
        main_layout.addLayout(right_layout)
        self.setLayout(main_layout)

        # Route the label's mouse events to this object's handlers.
        self.video_label.mousePressEvent = self.mouse_press
        self.video_label.mouseMoveEvent = self.mouse_move
        self.video_label.mouseReleaseEvent = self.mouse_release

    # ----------------------------
    # Video / model / output selection
    # ----------------------------
    def select_video(self):
        """Ask for a video file, reset per-video state and preview frame 0."""
        path, _ = QFileDialog.getOpenFileName(self, "動画を選択", "", "Video Files (*.mp4 *.avi *.mov)")
        if not path:
            return

        # Drop everything tied to the previously opened video so the next
        # playback re-opens the newly chosen file instead of the old capture.
        self.timer.stop()
        self._release_capture()
        self.video_path = path
        self.video_input.setText(path)
        self.frame_idx = 0
        self.total_frames = 0
        self.current_frame = None
        self.boxes = []
        self.selected_box = None
        self.dragging = False
        self.resizing = False
        self.resizing_corner = None
        # NOTE(review): annotation_data is intentionally kept; boxes registered
        # for the previous video would be written into the next XML export.
        # Confirm whether they should be cleared or exported first.

        preview = cv2.VideoCapture(path)
        try:
            ret, frame = preview.read()
        finally:
            preview.release()
        if ret:
            # Scale the preview so large videos are not cropped by the label.
            self.show_frame(self.resize_frame(frame))

    def select_model(self):
        """Ask for a .pt model file; it is loaded on the next playback start."""
        path, _ = QFileDialog.getOpenFileName(self, "モデルを選択", "", "PyTorchモデル (*.pt)")
        if path:
            # Stop playback first: the timer must not fire while no detector
            # is loaded.
            self.timer.stop()
            self.model_path = path
            self.model_input.setText(path)
            self.detector = None  # force a reload with the new weights

    def select_output(self):
        """Ask for the directory that receives images and XML files."""
        path = QFileDialog.getExistingDirectory(self, "出力フォルダ選択")
        if path:
            self.output_dir = path
            self.output_input.setText(path)

    def _release_capture(self):
        """Release the open video capture, if any, and forget its position."""
        if self.cap is not None:
            self.cap.release()
            self.cap = None
        self._cap_pos = None

    # ----------------------------
    # Playback
    # ----------------------------
    def toggle_playback(self):
        """Start or stop timed playback, opening video and model on demand."""
        if self.timer.isActive():
            self.timer.stop()
            return

        if self.cap is None or self.detector is None:
            if not self.video_path or not self.model_path:
                QMessageBox.warning(self,"注意","動画とモデルを選択してください")
                return

        if self.cap is None:
            cap = cv2.VideoCapture(self.video_path)
            if not cap.isOpened():
                cap.release()
                QMessageBox.warning(self, "注意", "動画を開けませんでした")
                return
            self.cap = cap
            self.total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            self.frame_idx = 0
            self._cap_pos = 0

        if self.detector is None:
            try:
                self.detector = YOLO(self.model_path)
            except Exception as exc:  # UI boundary: report instead of aborting
                QMessageBox.warning(self, "注意", f"モデルを読み込めませんでした: {exc}")
                return

        if self.current_frame is None:
            # Show (and detect on) the first frame before the timer advances,
            # otherwise playback would begin at frame 1.
            self.goto_frame(0)
            if self.current_frame is None:
                return  # the first frame could not be decoded

        self.timer.start(self._timer_interval_ms())

    def _timer_interval_ms(self):
        """Return the timer interval in ms derived from the FPS input field."""
        try:
            fps = float(self.fps_input.text())
        except ValueError:
            return self.DEFAULT_INTERVAL_MS
        if not fps > 0:  # also rejects NaN
            return self.DEFAULT_INTERVAL_MS
        return max(1, int(1000 / fps))

    def next_frame(self):
        """Timer callback: advance one frame, stopping at the end of the video."""
        if self.total_frames > 0 and self.frame_idx >= self.total_frames - 1:
            self.timer.stop()
            return
        self.goto_frame(self.frame_idx + 1)

    def goto_frame(self, idx):
        """Load frame ``idx`` (clamped to the video range), detect and display.

        Does nothing when no video/model is loaded, or when the clamped index
        is the frame already shown (so manual box edits are not discarded by
        stepping past either end of the video).
        """
        if self.cap is None or self.detector is None:
            return
        if self.total_frames > 0:
            idx = min(idx, self.total_frames - 1)
        idx = max(0, idx)
        if idx == self.frame_idx and self.current_frame is not None:
            return

        # Seek only when the requested frame is not simply the next one.
        if idx != self._cap_pos:
            self.cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = self.cap.read()
        if not ret:
            self._cap_pos = None
            self.timer.stop()
            return

        self._cap_pos = idx + 1
        self.frame_idx = idx
        self.current_frame = frame

        # The box list is rebuilt below, so any previous selection or
        # in-progress drag refers to boxes that no longer exist.
        self.selected_box = None
        self.dragging = False
        self.resizing = False
        self.resizing_corner = None

        # YOLO inference
        result = self.detector(frame)[0]
        self.boxes = [{"xyxy": b.xyxy[0].tolist(), "cls": int(b.cls[0])} for b in result.boxes]
        self.update_display()

    # ----------------------------
    # Frame drawing
    # ----------------------------
    def resize_frame(self, frame):
        """Return ``frame`` scaled to fit the video label, keeping aspect ratio."""
        h, w = frame.shape[:2]
        max_w, max_h = self.video_label.width(), self.video_label.height()
        scale = min(max_w / w, max_h / h)
        new_w, new_h = max(1, int(w * scale)), max(1, int(h * scale))
        return cv2.resize(frame, (new_w, new_h))

    def draw_boxes(self, frame):
        """Return a copy of ``frame`` with all boxes, labels and corner handles."""
        disp = frame.copy()
        for i, box in enumerate(self.boxes):
            x1, y1, x2, y2 = map(int, box["xyxy"])
            cls_id = int(box["cls"])
            # BGR: yellow for the selected box, green otherwise.
            color = (0, 255, 255) if i == self.selected_box else (0, 255, 0)
            label = f"cls{cls_id}"
            cv2.rectangle(disp, (x1, y1), (x2, y2), color, 2)
            cv2.putText(disp, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
            for cx, cy in [(x1, y1), (x2, y1), (x1, y2), (x2, y2)]:
                cv2.circle(disp, (cx, cy), 4, (255, 0, 0), -1)
        return disp

    def show_frame(self, frame):
        """Display a BGR image in the video label."""
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        h, w, ch = rgb.shape
        qt_img = QImage(rgb.data, w, h, ch * w, QImage.Format.Format_RGB888)
        self.video_label.setPixmap(QPixmap.fromImage(qt_img))

    def update_display(self):
        """Redraw the current frame with the current boxes."""
        if self.current_frame is None:
            return
        display = self.draw_boxes(self.current_frame)
        display = self.resize_frame(display)
        self.show_frame(display)

    # ----------------------------
    # Mouse handling
    # ----------------------------
    def _label_to_frame(self, pos):
        """Map a label position to original-frame pixel coordinates.

        The frame is shown scaled uniformly (see ``resize_frame``) and centered
        in the label, so both the scale and the letterbox offset must be
        undone.  Returns ``(x, y, scale)`` where ``scale`` is displayed pixels
        per frame pixel.
        """
        frame_h, frame_w = self.current_frame.shape[:2]
        label_w, label_h = self.video_label.width(), self.video_label.height()
        scale = min(label_w / frame_w, label_h / frame_h)
        disp_w, disp_h = int(frame_w * scale), int(frame_h * scale)
        off_x = (label_w - disp_w) / 2
        off_y = (label_h - disp_h) / 2
        x = int((pos.x() - off_x) / scale)
        y = int((pos.y() - off_y) / scale)
        return x, y, scale

    def mouse_press(self, e):
        """Start a resize, a drag, or (with Ctrl) create a new box."""
        if self.current_frame is None:
            return

        x, y, scale = self._label_to_frame(e.position())
        # Grab tolerance in frame pixels; grows when the frame is shown
        # smaller so the handles stay easy to hit on screen.
        grab = max(5, int(self.corner_size / scale))

        # Corner handles take priority over the box interior.
        for i, box in enumerate(self.boxes):
            x1, y1, x2, y2 = map(int, box["xyxy"])
            corners = {"tl": (x1, y1), "tr": (x2, y1), "bl": (x1, y2), "br": (x2, y2)}
            for c_name, (cx, cy) in corners.items():
                if abs(x - cx) <= grab and abs(y - cy) <= grab:
                    self.selected_box = i
                    self.resizing = True
                    self.resizing_corner = c_name
                    self.start_pt = (x, y)
                    self.update_display()  # show the selection highlight now
                    return

        # Click inside a box: select it and start dragging.
        for i, box in enumerate(self.boxes):
            x1, y1, x2, y2 = map(int, box["xyxy"])
            if x1 <= x <= x2 and y1 <= y <= y2:
                self.selected_box = i
                self.dragging = True
                self.start_pt = (x, y)
                self.update_display()  # show the selection highlight now
                return

        # Ctrl + click on empty space: create a new box at the click position.
        if e.modifiers() & Qt.KeyboardModifier.ControlModifier:
            new_box = {"xyxy": [x, y, x + self.NEW_BOX_SIZE, y + self.NEW_BOX_SIZE], "cls": 0}
            self.boxes.append(new_box)
            self.selected_box = len(self.boxes) - 1
            self.dragging = True
            self.start_pt = (x, y)

        self.update_display()

    def mouse_move(self, e):
        """Move or resize the selected box while the mouse is dragged."""
        if self.current_frame is None or self.selected_box is None:
            return
        if not 0 <= self.selected_box < len(self.boxes):
            return

        x, y, _ = self._label_to_frame(e.position())

        box = self.boxes[self.selected_box]
        x1, y1, x2, y2 = box["xyxy"]

        if self.dragging:
            dx, dy = x - self.start_pt[0], y - self.start_pt[1]
            box["xyxy"] = [x1 + dx, y1 + dy, x2 + dx, y2 + dy]
            self.start_pt = (x, y)

        elif self.resizing:
            # The min/max keep the box at least one pixel wide and tall so a
            # corner can never be dragged past the opposite one.
            if self.resizing_corner == "tl":
                box["xyxy"] = [min(x2 - 1, x), min(y2 - 1, y), x2, y2]
            elif self.resizing_corner == "tr":
                box["xyxy"] = [x1, min(y2 - 1, y), max(x1 + 1, x), y2]
            elif self.resizing_corner == "bl":
                box["xyxy"] = [min(x2 - 1, x), y1, x2, max(y1 + 1, y)]
            elif self.resizing_corner == "br":
                box["xyxy"] = [x1, y1, max(x1 + 1, x), max(y1 + 1, y)]

        self.update_display()

    def mouse_release(self, e):
        """End any drag or resize in progress."""
        self.dragging = False
        self.resizing = False

    # ----------------------------
    # Positive / Negative registration
    # ----------------------------
    def save_box(self, folder):
        """Save the current frame with the selected box drawn and register it.

        ``folder`` is the sub-directory name ("positive" or "negative") under
        ``<output_dir>/<YYYYMMDD>/``.  The box is also appended to
        ``annotation_data`` for the later XML export.
        """
        if not self.output_dir:
            QMessageBox.warning(self, "注意", "出力フォルダを選択してください")
            return
        if (self.current_frame is None or self.selected_box is None
                or not 0 <= self.selected_box < len(self.boxes)):
            QMessageBox.warning(self, "注意", "ボックスを選択してください")
            return

        box = self.boxes[self.selected_box]
        base_dir = os.path.join(self.output_dir, datetime.now().strftime("%Y%m%d"))
        img_path = os.path.join(base_dir, folder, f"frame{self.frame_idx}_box{self.selected_box}.jpg")

        # Draw only the selected box on a copy of the current frame.
        frame_with_box = self.current_frame.copy()
        x1, y1, x2, y2 = map(int, box["xyxy"])
        cls_id = int(box["cls"])
        color = (0, 255, 0)
        cv2.rectangle(frame_with_box, (x1, y1), (x2, y2), color, 2)
        cv2.putText(frame_with_box, f"cls{cls_id}", (x1, y1 - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        # Encode in memory and write through Python's file API: cv2.imwrite
        # reports failure only via its return value and cannot handle
        # non-ASCII paths on some platforms.
        ok, encoded = cv2.imencode(".jpg", frame_with_box)
        if ok:
            try:
                os.makedirs(os.path.join(base_dir, folder), exist_ok=True)
                with open(img_path, "wb") as fh:
                    fh.write(encoded.tobytes())
            except OSError:
                ok = False
        if not ok:
            QMessageBox.warning(self, "注意", f"画像を保存できませんでした: {img_path}")
            return

        # Store a snapshot so later edits of the live box do not alter what
        # was registered.
        # NOTE(review): the positive/negative label is not recorded here, so
        # the XML export cannot tell the two apart — confirm if that is wanted.
        snapshot = {"xyxy": list(box["xyxy"]), "cls": box["cls"]}
        self.annotation_data.append((self.frame_idx, snapshot))
        print(f"[LOG] Saved {img_path}")

        self.selected_box = None
        self.update_display()  # remove the selection highlight

    # ----------------------------
    # XML export
    # ----------------------------
    def save_xml(self):
        """Write every registered box to one VOC-style XML file.

        The file goes to ``<output_dir>/<YYYYMMDD>/annotations/<video>.xml``.
        Each ``<object>`` carries an extra ``<frame>`` element holding the
        frame index it was registered on.
        """
        if not self.output_dir:
            QMessageBox.warning(self, "注意", "出力フォルダを選択してください")
            return
        if self.current_frame is None:
            QMessageBox.warning(self, "注意", "動画を再生してから保存してください")
            return
        if not self.annotation_data:
            QMessageBox.warning(self, "注意", "Positive/Negative 登録を行ってください")
            return

        height, width = self.current_frame.shape[:2]
        video_name = os.path.basename(self.video_path)
        out_dir = os.path.join(self.output_dir, datetime.now().strftime("%Y%m%d"), "annotations")
        path = os.path.join(out_dir, os.path.splitext(video_name)[0] + ".xml")

        root = ET.Element("annotation")
        ET.SubElement(root, "filename").text = video_name
        size = ET.SubElement(root, "size")
        ET.SubElement(size, "width").text = str(width)
        ET.SubElement(size, "height").text = str(height)
        ET.SubElement(size, "depth").text = "3"
        for frame_idx, box in self.annotation_data:
            x1, y1, x2, y2 = map(int, box["xyxy"])
            # Boxes can be dragged partly outside the frame; keep the exported
            # coordinates inside the image.
            x1, x2 = min(max(x1, 0), width), min(max(x2, 0), width)
            y1, y2 = min(max(y1, 0), height), min(max(y2, 0), height)
            cls_id = int(box["cls"])
            obj = ET.SubElement(root, "object")
            ET.SubElement(obj, "name").text = f"cls{cls_id}"
            ET.SubElement(obj, "frame").text = str(frame_idx)
            bbox = ET.SubElement(obj, "bndbox")
            ET.SubElement(bbox, "xmin").text = str(x1)
            ET.SubElement(bbox, "ymin").text = str(y1)
            ET.SubElement(bbox, "xmax").text = str(x2)
            ET.SubElement(bbox, "ymax").text = str(y2)

        try:
            os.makedirs(out_dir, exist_ok=True)
            # Explicit UTF-8 with a declaration so non-ASCII video names are
            # stored as readable text.
            ET.ElementTree(root).write(path, encoding="utf-8", xml_declaration=True)
        except OSError as exc:
            QMessageBox.warning(self, "注意", f"XMLを保存できませんでした: {exc}")
            return
        print(f"[LOG] Saved XML: {path}")

    # ----------------------------
    # Shutdown
    # ----------------------------
    def closeEvent(self, event):
        """Stop playback and release the video capture when the window closes."""
        self.timer.stop()
        self._release_capture()
        super().closeEvent(event)

if __name__ == "__main__":
    # Script entry point: create the Qt application, show the labeling
    # window, and hand the event loop's return code back to the shell.
    qt_app = QApplication(sys.argv)
    main_window = InteractiveYOLO()
    main_window.show()
    exit_code = qt_app.exec()
    sys.exit(exit_code)

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up