丸が集まって大きな丸を作っているものを検出したい

Last updated at 2026-01-17 / Posted at 2026-01-10

概要

大阪・関西万博報道写真展にいったときのこと、スマホカメラで写真を撮ろうとしたら人の顔は検出するがあの方の顔を検出しないことに気づいた。「撮りたいのは人ではないのに！」と思いあの方の顔を検出するシステムを作ることにした。

作るもの

~~著作権の関係であの方を直接学習できないので~~（直接学習しても問題ないが学習データを機械的に作った方が簡単なので）丸が集まって大きな丸を作っているものを検出するAIを作る。

画像データ

以下の条件で訓練用200個、検証用200個を用意した。

丸を大小さまざまに組み合わせて大きな丸を作る
色：イラストと実写の両方に対応するため、赤のグラデーションあり・なしの2種類
背景：真っ白、kaggleの画像データの2種類
背景追加：ランダムな図形（四角形・三角形・円・線）あり・なしの2種類
カメラ撮影風（カメラで撮影したように中央を明るく四隅を暗くし、かつ僅かにノイズを加える）：あり・なしの2種類

| 丸の色 | 背景 | ランダム図形有無 | カメラ撮影風 |
|:-:|:-:|:-:|:-:|
|赤（グラデーションなし）（80%）|白（50%）|なし（50%）|なし（50%）|
|赤（グラデーションあり）（20%）|画像（50%）|あり（50%）|あり（50%）|

（例）

訓練（Google Colab使用）

ソース

※ソースの生成にCopilotを使用

# Download and unpack the STL-10 images from Kaggle; the generator below opens
# files from sample_data/stl10/train_images as photo backgrounds.
!curl -L -o sample_data/stl10.zip  https://www.kaggle.com/api/v1/datasets/download/jessicali9530/stl10
!unzip sample_data/stl10.zip -d sample_data/stl10
# Install the YOLO package used for training and inference.
!pip install ultralytics

# Create the image/label folders that the generation script writes into.
!mkdir -p data/images/train
!mkdir -p data/images/test
!mkdir -p data/labels/train
!mkdir -p data/labels/test

from PIL import Image, ImageDraw, ImageEnhance
import random
import math
from pathlib import Path
import numpy as np
import cv2

# (width, height) in pixels of every generated image; IMAGE_SIZE[0] is also
# passed to YOLO as the training input size.
IMAGE_SIZE = (640, 640)

def draw_gradient_ellipse(image_size, bounding_box, center_point, color1, color2):
    """
    Render an ellipse filled with a radial gradient on a transparent canvas.

    Args:
        image_size (tuple): Size of the returned image as (width, height).
        bounding_box (tuple): Ellipse bounds as (x0, y0, x1, y1).
        center_point (tuple): Origin (x, y) of the radial gradient.
        color1 (tuple): Colour (R, G, B) at the gradient origin.
        color2 (tuple): Colour (R, G, B) reached at the maximum distance.

    Returns:
        Image: RGBA PIL image; pixels outside the ellipse stay fully
        transparent.
    """
    canvas_w, canvas_h = image_size

    # Elliptical stencil: 255 inside the ellipse, 0 everywhere else.
    stencil = Image.new("L", image_size, 0)
    ImageDraw.Draw(stencil).ellipse(bounding_box, fill=255)

    # Distance of every pixel from the gradient origin, normalised by half of
    # the longer canvas side and clamped to the range 0..1.
    grid_x, grid_y = np.meshgrid(np.arange(canvas_w), np.arange(canvas_h))
    offsets = np.stack((grid_x, grid_y), axis=2) - center_point
    half_extent = max(canvas_w, canvas_h) / 2.0
    ratio = np.clip(np.linalg.norm(offsets, axis=2) / half_extent, 0, 1)

    # Linear blend: color1 at ratio 0, color2 at ratio 1.
    inner = np.array(color1)
    outer = np.array(color2)
    blended = inner * (1 - ratio)[:, :, np.newaxis] + outer * ratio[:, :, np.newaxis]
    gradient = Image.fromarray(blended.astype(np.uint8), 'RGB')

    # Copy the gradient onto the transparent canvas through the stencil.
    canvas = Image.new('RGBA', (canvas_w, canvas_h), (0, 0, 0, 0))
    canvas.paste(gradient, mask=stencil)
    return canvas

def apply_camera_effects(image):
    """
    Make an image look as if it was taken with a camera.

    Three effects are applied in order: a vignette (bright centre, darker
    corners), slight Gaussian noise, and a small horizontal shift of the first
    and last colour channels (chromatic aberration), followed by a
    contrast/brightness boost.

    Args:
        image: PIL image. Assumed to convert to an (H, W, 3) uint8 array in
            RGB order, which is what the generator in this file passes in.

    Returns:
        Image: A new PIL image with the effects applied.
    """
    image_np = np.array(image)
    rows, cols = image_np.shape[:2]

    # 1. Vignette: the outer product of two 1-D Gaussian kernels gives a 2-D
    #    falloff mask, scaled so that its brightest point is 1.0.
    kernel_x = cv2.getGaussianKernel(cols, cols / 2)
    kernel_y = cv2.getGaussianKernel(rows, rows / 2)
    kernel = kernel_y * kernel_x.T
    mask = kernel / kernel.max()
    img_vignette = image_np * mask[:, :, np.newaxis]  # float, all channels at once

    # 2. Film grain. The noise is added in floating point and clipped before
    #    converting back to uint8. Casting the signed noise itself to uint8
    #    makes negative samples wrap around to ~255, which turns "slight
    #    noise" into saturated white speckles.
    noise = np.random.normal(0, 1.0, (rows, cols, 3))
    img_noisy = np.clip(img_vignette + noise, 0, 255).astype(np.uint8)

    # 3. Chromatic aberration: shift channel 0 two pixels to the left and
    #    channel 2 two pixels to the right. The array comes from PIL, so for
    #    an RGB input channel 0 is red and channel 2 is blue.
    red, green, blue = cv2.split(img_noisy)
    shift_right = np.float32([[1, 0, 2], [0, 1, 0]])
    shift_left = np.float32([[1, 0, -2], [0, 1, 0]])
    blue = cv2.warpAffine(blue, shift_right, (cols, rows))
    red = cv2.warpAffine(red, shift_left, (cols, rows))
    img_aberration = cv2.merge((red, green, blue))

    # Slight contrast (x1.2) and brightness (+10) boost, saturated to uint8.
    image_final = cv2.convertScaleAbs(img_aberration, alpha=1.2, beta=10)

    return Image.fromarray(image_final)

def generate_ring_of_circles_face(
    size=(512, 512),
    bg_color=(255, 255, 255),
    circle_colors=((255, 0, 0), (0, 0, 255), (0, 255, 0), (255, 255, 0), (0, 255, 255), (255, 0, 255)),
    min_r_ratio=0.07,
    max_r_ratio=0.18,
    seed=None,
):
    """
    Generate one synthetic image of a "face" made of small circles on a ring.

    Random choices made per image:
      * gradient-filled circles (20 %) or flat circles (80 %),
      * an STL-10 photo (50 %) or a plain ``bg_color`` background (50 %),
      * extra random shapes outside the face (50 %),
      * camera-like post-processing via ``apply_camera_effects`` (50 %).

    Args:
        size: Image size as (width, height). The shorter side must be >= 40.
        bg_color: Background colour used when no photo background is chosen.
        circle_colors: Colour candidates. The ring circles use the first
            entry (gradient circles use a fixed red); the extra shapes pick
            randomly from all entries.
        min_r_ratio: Minimum small-circle radius relative to the face size.
        max_r_ratio: Maximum small-circle radius relative to the face size.
        seed: Seed for reproducibility. Seeds Python's ``random`` and, for
            integer seeds, NumPy's global generator as well.

    Returns:
        tuple: ``(img, (minx, miny, maxx, maxy))`` where ``img`` is the PIL
        image and the second item is the face bounding box in pixels, clipped
        to the image area.
    """
    if seed is not None:
        random.seed(seed)
        # The camera effect draws its noise from NumPy's global generator, so
        # it has to be seeded too for the image to be reproducible.
        if isinstance(seed, int):
            np.random.seed(seed % (2 ** 32))

    img_w, img_h = size[0], size[1]

    # Side length of the square region that holds the face. It is limited by
    # the shorter image side so the region also fits non-square images.
    side = random.randint(40, min(img_w, img_h))

    # Decide how the image is rendered (order of random calls matters for
    # seed reproducibility).
    use_gradient = random.randint(0, 9) < 2
    use_background = random.randint(0, 3) < 2
    draw_shapes = random.randint(0, 3) < 2
    camera_effect = random.randint(0, 3) < 2

    if use_background:
        bg_path = f"sample_data/stl10/train_images/train_image_png_{random.randint(1, 1999)}.png"
        # The context manager closes the file; convert() guarantees the three
        # channels that the drawing code and the camera effect expect.
        with Image.open(bg_path) as src:
            img = src.convert("RGB").resize((img_w, img_h))
        img = ImageEnhance.Brightness(img).enhance(1.5)
    else:
        img = Image.new("RGB", size, bg_color)
    draw = ImageDraw.Draw(img)

    # Face ring parameters: the centre is placed so that the square face
    # region lies inside the image (x uses the width, y uses the height).
    face_radius = int(side * 0.35)
    fc = (
        side // 2 + random.randint(0, img_w - side),
        side // 2 + random.randint(0, img_h - side),
    )
    fr = face_radius
    # NOTE(review): with fill=None and outline=None Pillow appears to fall
    # back to its default ink, so this may draw a thin outline at the face
    # radius. Kept unchanged to preserve the generated images; confirm intent.
    draw.ellipse(
        (fc[0] - fr, fc[1] - fr, fc[0] + fr, fc[1] + fr),
        fill=None,
        outline=None,
    )

    # Place small circles around the ring. A radius of at least 1 guarantees
    # the angle advances on every iteration.
    min_r = max(1, int(side * min_r_ratio))
    max_r = max(min_r, int(side * max_r_ratio))
    angle = 0
    minx, miny = img_w, img_h
    maxx, maxy = 0, 0
    while angle < math.pi * 2:
        r = random.randint(min_r, max_r)
        dist = face_radius - random.randint(0, int(face_radius * 0.1))
        cx = int(fc[0] + dist * math.cos(angle))
        cy = int(fc[1] + dist * math.sin(angle))

        if use_gradient:
            gradient_ellipse = draw_gradient_ellipse(
                (r * 2, r * 2),
                (0, 0, r * 2, r * 2),
                (r, r),
                (255, 0, 0),
                (220, 0, 0),
            )
            img.paste(gradient_ellipse, (cx - r, cy - r), gradient_ellipse)
        else:
            draw.ellipse(
                (cx - r, cy - r, cx + r, cy + r),
                fill=circle_colors[0],
                outline=None,
            )

        # Step proportional to the circle size so neighbours overlap; the
        # ratio is capped at 1.0 to keep asin inside its domain.
        angle += math.asin(min(1.0, r / face_radius)) * 1.4

        # Track the face extent for the label.
        minx = min(minx, cx - r)
        miny = min(miny, cy - r)
        maxx = max(maxx, cx + r)
        maxy = max(maxy, cy + r)

    # Circles near the border can stick out of the image; clip the box so the
    # normalised label values derived from it stay within 0..1.
    minx = max(0, minx)
    miny = max(0, miny)
    maxx = min(img_w, maxx)
    maxy = min(img_h, maxy)

    if draw_shapes:
        # Draw random distractor shapes in areas that do not overlap the face.
        max_placement_tries = 1000
        for _ in range(random.randint(0, 50)):
            item_type = random.randint(0, 3)
            item_color = random.choice(circle_colors)

            # Rejection sampling with an upper bound: when the face covers
            # (almost) the whole image there may be no free spot at all, and
            # an unbounded loop would never finish. The shape is then skipped.
            for _attempt in range(max_placement_tries):
                x1 = random.randint(0, size[0])
                y1 = random.randint(0, size[1])
                item_size = random.randint(1, size[0])
                x2 = x1 + item_size
                y2 = y1 + item_size
                overlaps_face = not (maxx < x1 or minx > x2 or maxy < y1 or miny > y2)
                if overlaps_face:
                    continue

                if item_type == 0:  # rectangle
                    rx1 = random.randint(x1, x2)
                    ry1 = random.randint(y1, y2)
                    rx2 = random.randint(rx1, x2)
                    ry2 = random.randint(ry1, y2)
                    draw.rectangle((rx1, ry1, rx2, ry2), fill=item_color)
                elif item_type == 1:  # triangle
                    tx1 = random.randint(x1, x2)
                    ty1 = random.randint(y1, y2)
                    tx2 = random.randint(x1, x2)
                    ty2 = random.randint(y1, y2)
                    tx3 = random.randint(x1, x2)
                    ty3 = random.randint(y1, y2)
                    draw.polygon([(tx1, ty1), (tx2, ty2), (tx3, ty3)], fill=item_color)
                elif item_type == 2:  # circle
                    ccx = random.randint(x1, x2)
                    ccy = random.randint(y1, y2)
                    extent = min(x2 - x1, y2 - y1)
                    cr = random.randint(extent // 10, extent // 4)
                    draw.ellipse((ccx - cr, ccy - cr, ccx + cr, ccy + cr), fill=item_color)
                else:  # line
                    lx1 = random.randint(x1, x2)
                    ly1 = random.randint(y1, y2)
                    lx2 = random.randint(x1, x2)
                    ly2 = random.randint(y1, y2)
                    draw.line((lx1, ly1, lx2, ly2), fill=item_color, width=random.randint(2, 8))
                break

    if camera_effect:
        # Camera-like post-processing (vignette, noise, colour fringing).
        img = apply_camera_effects(img)

    return img, (minx, miny, maxx, maxy)

img_train_dir = Path("data/images/train")
lbl_train_dir = Path("data/labels/train")
img_test_dir = Path("data/images/test")
lbl_test_dir = Path("data/labels/test")

NUM_IMAGES = 200


def _yolo_label_line(rect, image_size, class_id=0):
    """Format a pixel box (x0, y0, x1, y1) as one YOLO label line.

    The line is ``class cx cy w h`` with all four values normalised by the
    image size. The box is clipped to the image first so every value stays in
    0..1, and six decimals are written: two decimals would quantise to 1 % of
    the image (6.4 px at 640 px), which is coarse for the smallest faces.
    """
    img_w, img_h = image_size
    x0 = min(max(rect[0], 0), img_w)
    y0 = min(max(rect[1], 0), img_h)
    x1 = min(max(rect[2], 0), img_w)
    y1 = min(max(rect[3], 0), img_h)
    cx = (x0 + x1) / 2 / img_w
    cy = (y0 + y1) / 2 / img_h
    box_w = (x1 - x0) / img_w
    box_h = (y1 - y0) / img_h
    return f"{class_id} {cx:.6f} {cy:.6f} {box_w:.6f} {box_h:.6f}"


def _generate_split(img_dir, lbl_dir, first_index, count):
    """Generate ``count`` image/label pairs, numbered (and seeded) from ``first_index``."""
    img_dir.mkdir(parents=True, exist_ok=True)
    lbl_dir.mkdir(parents=True, exist_ok=True)
    for idx in range(first_index, first_index + count):
        img, rect = generate_ring_of_circles_face(
            size=IMAGE_SIZE,
            seed=idx,
        )
        stem = f"face_{idx:03d}"
        img.save(img_dir / f"{stem}.png")
        with open(lbl_dir / f"{stem}.txt", "w") as f:
            f.write(_yolo_label_line(rect, IMAGE_SIZE))
        print(f"saved: {stem}.png")


# Training images: seeds 0 .. NUM_IMAGES - 1
_generate_split(img_train_dir, lbl_train_dir, 0, NUM_IMAGES)

# Validation images: seeds NUM_IMAGES .. 2 * NUM_IMAGES - 1, so no seed is
# shared with the training split
_generate_split(img_test_dir, lbl_test_dir, NUM_IMAGES, NUM_IMAGES)

from ultralytics import YOLO

def train():
    """Fine-tune the pretrained ``yolov8n.pt`` checkpoint on the generated dataset."""
    # Start from the small pretrained model.
    detector = YOLO("yolov8n.pt")
    train_options = {
        "data": "train.yaml",  # train/val paths and class definition
        "epochs": 50,
        "imgsz": IMAGE_SIZE[0],
        "batch": 16,
        "augment": True,
        "name": "ring-of-circles-detector",
    }
    detector.train(**train_options)


train()

train.yaml

# Dataset root directory (the generation script writes under data/).
path: ./data
# Training images (data/images/train).
train: images/train
# Validation images; the generation script stores them in the "test" folder.
val: images/test

# Class index -> class name; there is a single class.
names:
  0: ring-of-circles

訓練ログは良好

検証

作成したモデルを使って画像を読み込ませて精度を確認した。

import yaml
from pathlib import Path
from ultralytics import YOLO
import cv2
import numpy as np
from google.colab.patches import cv2_imshow


class RingOfCirclesInfer:
    """Run a trained YOLO model on images and draw the detections.

    Settings are read from a YAML file. ``model`` (path to the weights) is
    required; ``imgsz``, ``conf``, ``iou``, ``device`` and ``names`` are
    optional and fall back to the defaults below when omitted.
    """

    # Fallbacks for optional inference settings. ``device=None`` leaves the
    # device choice to the YOLO library.
    _DEFAULTS = {"imgsz": 640, "conf": 0.5, "iou": 0.45, "device": None}

    def __init__(self, config_path: str = "infer.yaml"):
        """Load the YAML config at ``config_path`` and the model it names."""
        self.config = self._load_config(config_path)
        self.model = YOLO(self.config["model"])
        self.names = self.config.get("names") or {}

    def _load_config(self, path: str):
        """Parse the YAML file at ``path`` and return its content."""
        with open(path, "r", encoding="utf-8") as f:
            return yaml.safe_load(f)

    def _setting(self, key: str):
        """Return an inference setting from the config, or its default."""
        return self.config.get(key, self._DEFAULTS[key])

    def _label_for(self, cls_id: int) -> str:
        """Return the configured class name, or the id as text if unknown."""
        try:
            return self.names[cls_id]
        except (KeyError, IndexError, TypeError):
            return str(cls_id)

    def load_image(self, image_path: str):
        """Read an image with OpenCV.

        Raises:
            ValueError: If OpenCV cannot read the file.
        """
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError(f"Failed to load image: {image_path}")
        return img

    def predict(self, img):
        """Run detection on ``img`` and return the converted first result.

        Returns:
            dict: ``{"detections": [...], "num_detections": int}``; see
            ``_convert_results`` for the per-detection fields.
        """
        results = self.model.predict(
            img,
            imgsz=self._setting("imgsz"),
            conf=self._setting("conf"),
            iou=self._setting("iou"),
            device=self._setting("device"),
            verbose=False,
        )
        return self._convert_results(results[0])

    def _convert_results(self, result):
        """Convert one model result into plain Python data.

        Each detection becomes a dict with ``class_id``, ``label``, ``score``
        and ``bbox`` (``[x1, y1, x2, y2]``).
        """
        detections = []
        for box in result.boxes:
            cls_id = int(box.cls[0])
            score = float(box.conf[0])
            x1, y1, x2, y2 = box.xyxy[0].tolist()

            detections.append({
                "class_id": cls_id,
                "label": self._label_for(cls_id),
                "score": score,
                "bbox": [x1, y1, x2, y2],
            })

        return {
            "detections": detections,
            "num_detections": len(detections),
        }

    def draw_detections(self, img, detections):
        """
        Draw a red box and a "label score" caption for every detection.

        img: OpenCV image (numpy array); it is modified in place and returned.
        detections: the list stored under result["detections"].
        """
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.7
        thickness = 2
        for det in detections:
            x1, y1, x2, y2 = map(int, det["bbox"])
            label = det["label"]
            score = det["score"]

            # Bounding box
            cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), thickness)

            # Label and score. The caption normally sits 10 px above the box;
            # when the box touches the top edge it is moved down so the text
            # is not drawn outside the image.
            text = f"{label} {score:.2f}"
            (_, text_h), _ = cv2.getTextSize(text, font, font_scale, thickness)
            text_y = max(y1 - 10, text_h + 2)
            cv2.putText(
                img, text, (x1, text_y),
                font, font_scale,
                (0, 0, 255), thickness
            )

        return img

# Load the settings and the trained model, then run detection on one image.
infer = RingOfCirclesInfer("infer.yaml")
img = infer.load_image("data/images/image1.jpg")
result = infer.predict(img)
print(result)
# Draw the bounding boxes on a copy so the loaded image stays untouched
img_with_boxes = infer.draw_detections(img.copy(), result["detections"])

# Display the annotated image in the Colab output
cv2_imshow(img_with_boxes)

infer.yaml

# Inference settings
model: "ring-of-circles-detector.pt"      # trained model weights
imgsz: 640                                # input image size
conf: 0.5                                 # confidence threshold
iou: 0.45                                 # NMS IoU threshold
device: "cuda:0"                          # "cpu" or "cuda:0"

# Class definitions
names:
  0: ring-of-circles

手書きの画像で試した結果は以下のとおり
{'detections': [{'class_id': 0, 'label': 'ring-of-circles', 'score': 0.941784679889679, 'bbox': [195.29576110839844, 29.783838272094727, 415.94842529296875, 210.16983032226562]}, {'class_id': 0, 'label': 'ring-of-circles', 'score': 0.8607085943222046, 'bbox': [1.6221038103103638, 236.8182830810547, 225.94073486328125, 429.9375305175781]}], 'num_detections': 2}

写真やイラストで試したところ、まずまず検出してくれた。
夜間やボケてる写真はうまくいかなかったので訓練データが不十分なのだと思う。
今後改善していきたいと思う。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up