Introduction
We will use YOLOv11 to perform real-time object detection on camera images. We will also learn to use its tracking and pose-estimation features.
Operation has been confirmed in the following environment:
・OS: Windows 11 Pro (64-bit)
・Anaconda Navigator: 2.6.3
・Python: 3.9.21
・UI: Jupyter Notebook 7.3.2
・OpenCV: 4.11.0
・Camera: ELECOM UCAM-310FBBK
1. Installing ultralytics
In a terminal (you can do the same in Jupyter Notebook by prefixing the command with "!"):
pip install ultralytics
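In a Jupyter Notebook cell, for example, the same installation command would look like this:

!pip install ultralytics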
2. Inference (object detection with YOLOv11n, fully automatic)
First, let's check that everything runs correctly.
import cv2
from ultralytics import YOLO

# Load the YOLO model
#model = YOLO("best.pt")
model = YOLO("yolo11n.pt")

# Open the video source (0 = default camera)
# video_path = "cat.mp4"
video_path = 0
cap = cv2.VideoCapture(video_path)

# Get the original video width, height, and frame rate
original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# Define the output image height (in pixels) and scale the width to match
#output_height = 1080
output_height = 540
output_width = int((output_height / original_height) * original_width)

# Loop through the video frames
while cap.isOpened():
    # Read a frame from the video
    success, frame = cap.read()
    if success:
        # Resize the frame to the output size
        resized_frame = cv2.resize(frame, (output_width, output_height))

        # Run YOLO inference on the resized frame
        results = model(resized_frame, conf=0.8, iou=0.3, verbose=False)

        # Visualize the results on the frame
        annotated_frame = results[0].plot()

        # Display the annotated frame
        cv2.imshow("YOLO Inference", annotated_frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        # Break the loop if no frame could be read (camera error or end of video)
        print("cannot open camera")
        break

# Release the video capture object and close the display window
cap.release()
cv2.destroyAllWindows()
3. Drawing the results yourself in the program from Section 2
To make practical use of YOLO you need to be able to, for example, control something based on the detection results, so here is a version of the program in which the results are drawn manually. (A short sketch of using the results for control follows the program.)
import cv2
from ultralytics import YOLO

# Load the YOLO model
#model = YOLO("best.pt")
model = YOLO("yolo11n.pt")

# Open the video source (0 = default camera)
# video_path = "cat.mp4"
video_path = 0
cap = cv2.VideoCapture(video_path)

# Get the original video width, height, and frame rate
original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# Define the output image height (in pixels) and scale the width to match
#output_height = 1080
output_height = 540
output_width = int((output_height / original_height) * original_width)

# Loop through the video frames
while cap.isOpened():
    # Read a frame from the video
    success, frame = cap.read()
    if success:
        # Resize the frame to the output size
        resized_frame = cv2.resize(frame, (output_width, output_height))

        # Run YOLO inference on the resized frame
        results = model(resized_frame, conf=0.8, iou=0.3, verbose=False)
        items = results[0]
        annotated_frame = resized_frame.copy()

        for item in items:
            # Get the parameters of each detection
            cls = int(item.boxes.cls)                           # class ID
            label = item.names[cls]                             # class name
            score = item.boxes.conf.cpu().numpy()[0]            # confidence score
            x1, y1, x2, y2 = item.boxes.xyxy.cpu().numpy()[0]   # bounding box coordinates

            # Get the tracking ID (None unless tracking is used)
            id_value = item.boxes.id
            track_ids = '' if id_value is None else id_value.int().cpu().tolist()[0]

            # Draw the bounding box
            cv2.rectangle(annotated_frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)

            # Draw the label (include the tracking ID if one is available)
            text = f"ID:{track_ids} {label}: {score:.2f}" if track_ids != '' else f"{label}: {score:.2f}"
            font_scale = 0.75  # font size
            cv2.putText(annotated_frame, text, (int(x1), int(y1) - 10), cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 255, 0), 2)

        cv2.imshow("YOLO Inference", annotated_frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        # Break the loop if no frame could be read (camera error or end of video)
        print("cannot open camera")
        break

# Release the video capture object and close the display window
cap.release()
cv2.destroyAllWindows()
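As a starting point for driving something from the detection results rather than only drawing them, here is a minimal sketch of turning a bounding box into a coarse decision. It assumes the variables from the loop above (x1, x2, label, output_width); the helper name decide_region is invented for this illustration and is not part of ultralytics.

def decide_region(cx, frame_width):
    # Hypothetical helper: map a box-center x coordinate to a coarse region of the frame
    if cx < frame_width / 3:
        return "left"
    elif cx < 2 * frame_width / 3:
        return "center"
    return "right"

# Example usage inside the detection loop, after x1, y1, x2, y2 and label are known:
#   cx = (x1 + x2) / 2
#   print(label, decide_region(cx, output_width))
print(decide_region(480, 960))  # prints "center"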
4. Enabling tracking
We modify the program from Section 3 slightly so that it performs tracking. Each detected object is assigned an ID, which makes it possible to detect things such as the movement of a specific object. (A sketch of using the IDs follows the program.)
import cv2
from ultralytics import YOLO

# Load the YOLO model
#model = YOLO("best.pt")
model = YOLO("yolo11n.pt")

# Open the video source (0 = default camera)
# video_path = "cat.mp4"
video_path = 0
cap = cv2.VideoCapture(video_path)

# Get the original video width, height, and frame rate
original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# Define the output image height (in pixels) and scale the width to match
#output_height = 1080
output_height = 540
output_width = int((output_height / original_height) * original_width)

# Loop through the video frames
while cap.isOpened():
    # Read a frame from the video
    success, frame = cap.read()
    if success:
        # Resize the frame to the output size
        resized_frame = cv2.resize(frame, (output_width, output_height))

        # Run YOLO tracking on the resized frame
        # persist=True keeps the tracker state between frames so IDs stay consistent
        results = model.track(resized_frame, conf=0.8, iou=0.3, persist=True, verbose=False,
                              show_labels=False, show_boxes=False, show_conf=False)
        items = results[0]
        annotated_frame = resized_frame.copy()

        for item in items:
            # Get the parameters of each detection
            cls = int(item.boxes.cls)                           # class ID
            label = item.names[cls]                             # class name
            score = item.boxes.conf.cpu().numpy()[0]            # confidence score
            x1, y1, x2, y2 = item.boxes.xyxy.cpu().numpy()[0]   # bounding box coordinates

            # Get the tracking ID (may be None before the tracker assigns one)
            id_value = item.boxes.id
            track_ids = '' if id_value is None else id_value.int().cpu().tolist()[0]

            # Draw the bounding box
            cv2.rectangle(annotated_frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)

            # Draw the label with the tracking ID
            text = "ID:" + f"{track_ids}: {label}: {score:.2f}"
            font_scale = 0.75  # font size
            cv2.putText(annotated_frame, text, (int(x1), int(y1) - 10), cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 255, 0), 2)

        # Display the annotated frame
        cv2.imshow("YOLO Inference", annotated_frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        # Break the loop if no frame could be read (camera error or end of video)
        print("cannot open camera")
        break

# Release the video capture object and close the display window
cap.release()
cv2.destroyAllWindows()
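One way to make use of the track IDs, for example to follow how a particular object moves over time, is to keep a short history of box centers per ID. The sketch below is only an illustration of that idea; track_history and update_history are names invented here and are not part of ultralytics.

from collections import defaultdict, deque

# Hypothetical per-ID history of recent box centers (at most 30 points per track ID)
track_history = defaultdict(lambda: deque(maxlen=30))

def update_history(track_id, x1, y1, x2, y2):
    # Store the box center for this ID and return the displacement since the previous frame
    cx, cy = (x1 + x2) / 2, (y1 + y2) / 2
    history = track_history[track_id]
    prev = history[-1] if history else None
    history.append((cx, cy))
    if prev is None:
        return 0.0, 0.0
    return cx - prev[0], cy - prev[1]

# Example usage inside the tracking loop, after track_ids and the box are known:
#   dx, dy = update_history(track_ids, x1, y1, x2, y2)
#   print(f"ID {track_ids} moved ({dx:.1f}, {dy:.1f}) pixels since the last frame")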
5. Pose estimation
5.1 Fully automatic
First, let's confirm that it works.
import cv2
from ultralytics import YOLO

# Load the YOLO pose model
#model = YOLO("best.pt")
model = YOLO("yolo11n-pose.pt")

# Open the video source (0 = default camera)
# video_path = "cat.mp4"
video_path = 0
cap = cv2.VideoCapture(video_path)

# Get the original video width, height, and frame rate
original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# Define the output image height (in pixels) and scale the width to match
#output_height = 1080
output_height = 540
output_width = int((output_height / original_height) * original_width)

# Loop through the video frames
while cap.isOpened():
    # Read a frame from the video
    success, frame = cap.read()
    if success:
        # Resize the frame to the output size
        resized_frame = cv2.resize(frame, (output_width, output_height))

        # Run YOLO pose inference on the resized frame
        results = model(resized_frame, conf=0.8, iou=0.3, verbose=False)

        # Visualize the results on the frame
        annotated_frame = results[0].plot()
        cv2.imshow("YOLO Inference", annotated_frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        # Break the loop if no frame could be read (camera error or end of video)
        print("cannot open camera")
        break

# Release the video capture object and close the display window
cap.release()
cv2.destroyAllWindows()
5.2 Drawing the results yourself
Here is a version of the program with the result drawing done manually.
With this you can obtain information such as the coordinates of the detected joints. (A sketch of using the keypoints follows the program.)
import cv2
from ultralytics import YOLO

# Load the YOLO pose model
#model = YOLO("best.pt")
model = YOLO("yolo11n-pose.pt")

# Open the video source (0 = default camera)
# video_path = "cat.mp4"
video_path = 0
cap = cv2.VideoCapture(video_path)

# Get the original video width, height, and frame rate
original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# Define the output image height (in pixels) and scale the width to match
#output_height = 1080
output_height = 540
output_width = int((output_height / original_height) * original_width)

# Keypoint names, indexed by keypoint position
KEYPOINTS_NAMES = [
    "nose",         # 0
    "eye(L)",       # 1
    "eye(R)",       # 2
    "ear(L)",       # 3
    "ear(R)",       # 4
    "shoulder(L)",  # 5
    "shoulder(R)",  # 6
    "elbow(L)",     # 7
    "elbow(R)",     # 8
    "wrist(L)",     # 9
    "wrist(R)",     # 10
    "hip(L)",       # 11
    "hip(R)",       # 12
    "knee(L)",      # 13
    "knee(R)",      # 14
    "ankle(L)",     # 15
    "ankle(R)",     # 16
]

# Loop through the video frames
while cap.isOpened():
    # Read a frame from the video
    success, frame = cap.read()
    if success:
        # Resize the frame to the output size
        resized_frame = cv2.resize(frame, (output_width, output_height))

        # Run YOLO pose inference on the resized frame
        results = model(resized_frame, conf=0.8, iou=0.3, verbose=False)
        items = results[0]
        annotated_frame = resized_frame.copy()
        #annotated_frame = results[0].plot()

        for item in items:
            # Get the parameters of each detection
            cls = int(item.boxes.cls)                           # class ID
            label = item.names[cls]                             # class name
            score = item.boxes.conf.cpu().numpy()[0]            # confidence score
            x1, y1, x2, y2 = item.boxes.xyxy.cpu().numpy()[0]   # bounding box coordinates

            # Get the keypoints of the pose estimation result for this person
            keypoints = item.keypoints
            confs = keypoints.conf[0].tolist()  # keypoint confidence: closer to 1 is more reliable
            xys = keypoints.xy[0].tolist()      # keypoint coordinates
            for index, keypoint in enumerate(zip(xys, confs)):
                pscore = keypoint[1]
                # Skip drawing if the keypoint confidence is below 0.5
                if pscore < 0.5:
                    continue
                x = int(keypoint[0][0])
                y = int(keypoint[0][1])
                print(
                    f"Keypoint Name={KEYPOINTS_NAMES[index]}, X={x}, Y={y}, Score={pscore:.4f}"
                )
                # Draw a small magenta square at the keypoint
                annotated_frame = cv2.rectangle(
                    annotated_frame,
                    (x, y),
                    (x + 3, y + 3),
                    (255, 0, 255),
                    cv2.FILLED,
                    cv2.LINE_AA,
                )
                # Draw the keypoint (body part) name
                annotated_frame = cv2.putText(
                    annotated_frame,
                    KEYPOINTS_NAMES[index],
                    (x + 5, y),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=0.6,
                    color=(255, 0, 255),
                    thickness=1,
                    lineType=cv2.LINE_AA,
                )

            # Get the tracking ID (None unless tracking is used)
            id_value = item.boxes.id
            track_ids = '' if id_value is None else id_value.int().cpu().tolist()[0]

            # Draw the bounding box
            cv2.rectangle(annotated_frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)

            # Draw the label (include the tracking ID if one is available)
            text = f"ID:{track_ids} {label}: {score:.2f}" if track_ids != '' else f"{label}: {score:.2f}"
            font_scale = 0.75  # font size
            cv2.putText(annotated_frame, text, (int(x1), int(y1) - 10), cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 255, 0), 2)

        cv2.imshow("YOLO Inference", annotated_frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        # Break the loop if no frame could be read (camera error or end of video)
        print("cannot open camera")
        break

# Release the video capture object and close the display window
cap.release()
cv2.destroyAllWindows()
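Because the keypoint coordinates are now available as plain numbers, further quantities can be derived from them. As one hedged example (not part of the program above), the following sketch computes the angle at a joint from three keypoint coordinates; joint_angle is a name made up for this illustration.

import math

def joint_angle(a, b, c):
    # Angle in degrees at point b, formed by the segments b->a and b->c
    v1 = (a[0] - b[0], a[1] - b[1])
    v2 = (c[0] - b[0], c[1] - b[1])
    dot = v1[0] * v2[0] + v1[1] * v2[1]
    norm = math.hypot(*v1) * math.hypot(*v2)
    if norm == 0:
        return 0.0
    return math.degrees(math.acos(max(-1.0, min(1.0, dot / norm))))

# Example usage with the xys list from the loop above (indices follow KEYPOINTS_NAMES):
#   elbow_angle = joint_angle(xys[5], xys[7], xys[9])  # shoulder(L), elbow(L), wrist(L)
print(joint_angle((0, 0), (1, 0), (1, 1)))  # prints 90.0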
6. Exercises
Exercise 1
Modify the object detection program from Section 3 so that the color of the bounding box changes depending on the class of the detected object (e.g., person → yellow, cell phone → blue).
Exercise 2
Modify the tracking program from Section 4 so that it displays the speed of each detected object. A relative value (pixels/second) is fine.
Exercise 3
Modify the pose estimation program from Section 5.2 so that it shows a specific indication when a person striking a specific pose is detected (for example, display "hand raised" when someone has a hand up).
[If you can go further]