More than 5 years have passed since last update.

webカメラで Mask R-CNN + Openpose + FastStyleTransfer をさくっと試すメモ

Last updated at 2018-07-09 / Posted at 2018-06-30

https://github.com/matterport/Mask_RCNN を使って，Mask R-CNNを試す．
もともとすぐ試せるようになっていて，30分くらいで自分のカメラで実行できるようになるが，いろいろハマったのでその時のメモ．

2018/07/04 追記： OpenPoseもさくっと試せたので追記した．
2018/07/09 追記： Fast Style Transferもさくっと試せたので追記した．

環境のセットアップ

Ubuntuを使う．バージョンは何でも良い．
また，python3を使う．
環境を汚さないようにpythonの仮想環境を使う．

sudo apt install python3-dev # without this, the pip install later on fails
sudo apt install python3-tk # wanted at run time
sudo pip install virtualenv virtualenvwrapper

で仮想環境を使えるようにして，

.bashrc

# Load virtualenvwrapper only when its script exists at this location.
if [ -f /usr/local/bin/virtualenvwrapper.sh ]; then
    # WORKON_HOME must be exported before the script is sourced.
    export WORKON_HOME=$HOME/.virtualenvs
    source /usr/local/bin/virtualenvwrapper.sh
fi

を.bashrcに追記して，ターミナルを上げ直してから，

# Create the "test-maskrcnn" virtualenv with the python3 found on PATH.
mkvirtualenv -p `which python3` test-maskrcnn

Mask R-CNNのインストール

https://github.com/matterport/Mask_RCNN

を使う．README.mdを参考にしつつ

workon test-maskrcnn # activate the virtualenv
cdvirtualenv # move to the virtualenv's home directory
git clone https://github.com/matterport/Mask_RCNN.git
cd Mask_RCNN
# merge https://github.com/matterport/Mask_RCNN/pull/662
git remote add gtgalone https://github.com/gtgalone/Mask_RCNN.git
git fetch gtgalone fix-keras-engine-topology
git merge gtgalone/fix-keras-engine-topology
# install the required modules
pip3 install -r requirements.txt # an error like "Python.h not found" here means the python3-dev step at the very beginning was skipped
python3 setup.py install
# download the pretrained weights into the logs directory
wget -P logs https://github.com/matterport/Mask_RCNN/releases/download/v2.0/mask_rcnn_coco.h5
# download and install the COCO API
git clone https://github.com/waleedka/coco.git
cd coco/PythonAPI
make
python3 setup.py install

をすればOK．

Openposeのインストール

https://github.com/ildoonet/tf-pose-estimation
- https://qiita.com/kotauchisunsun/items/bdbdca2ddb9036e29ab1 で紹介されていた

を使う．README.mdに分かりやすく書いてあって，抽出すると以下になる．

workon test-maskrcnn # for simplicity, reuse the Mask R-CNN virtualenv; the two coexist without problems
cdvirtualenv
git clone https://www.github.com/ildoonet/tf-openpose
cd tf-openpose
pip3 install -r requirements.txt
cd tf_pose/pafprocess
swig -python -c++ pafprocess.i && python3 setup.py build_ext --inplace

Mask R-CNNの実行

カメラが付いていないPCであれば，webカメラなりなんなりを付ける．
その上で，

# Create an empty, executable test-maskrcnn.py in the virtualenv's home
# directory (the scripts resolve Mask_RCNN/ relative to the working directory).
workon test-maskrcnn
cdvirtualenv
touch test-maskrcnn.py
chmod +x test-maskrcnn.py

をして，test-maskrcnn.pyに以下のコードを貼り付けて，実行する．

test-maskrcnn.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# inspired by https://github.com/matterport/Mask_RCNN/blob/master/samples/demo.ipynb

import os
import sys
import numpy as np

sys.path.append(os.path.abspath('Mask_RCNN/'))
sys.path.append(os.path.abspath('Mask_RCNN/samples/coco/'))
# The OpenCV that ships with ROS does not work here, so the first sys.path
# entry containing 'ros' is moved to the end; the OpenCV installed inside the
# virtualenv is then found first.  When nothing matches, index -1 pops and
# re-appends the last entry, which leaves sys.path unchanged.
ros_index = next((pos for pos, entry in enumerate(sys.path) if 'ros' in entry), -1)
sys.path.append(sys.path.pop(ros_index))

import coco
import mrcnn.model as modellib
from mrcnn import visualize
import cv2

class InferenceConfig(coco.CocoConfig):
    # Inference-time overrides on top of the COCO configuration.
    # The processing loop passes exactly one frame to model.detect() per call,
    # so a single GPU with a single image per GPU is configured.
    # NOTE(review): presumably the effective batch size is
    # GPU_COUNT * IMAGES_PER_GPU — confirm against mrcnn.config.Config.
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1

# Human-readable labels indexed by class id: display_instances() looks up
# class_names[class_id] for every detection.  Index 0 is the 'BG' entry.
# NOTE(review): the order is presumably the one used by the pretrained COCO
# weights (taken from the Mask_RCNN demo notebook) — do not reorder.
class_names = ['BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
               'bus', 'train', 'truck', 'boat', 'traffic light',
               'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
               'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
               'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
               'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
               'kite', 'baseball bat', 'baseball glove', 'skateboard',
               'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
               'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
               'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
               'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
               'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
               'keyboard', 'cell phone', 'microwave', 'oven', 'toaster',
               'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
               'teddy bear', 'hair drier', 'toothbrush']

def display_instances(image, boxes, masks, class_ids, class_names, scores):
    """Draw detection results onto a grayscale copy of ``image``.

    The frame is first converted to gray (kept as a 3-channel image).  For
    every instance whose box is not all zeros, a rectangle and a caption
    (label plus score in percent when the score is truthy) are drawn, and the
    pixels where the instance mask equals 1 are replaced by the original
    colour pixels blended with the instance colour.

    Args:
        image: frame handed to ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.
        boxes: array whose rows unpack as ``y1, x1, y2, x2``.
        masks: array indexed as ``masks[:, :, i]`` for instance ``i``.
        class_ids: per-instance indices into ``class_names``.
        class_names: sequence of label strings.
        scores: per-instance confidence values.

    Returns:
        The annotated image as a ``np.uint8`` array.
    """
    num_instances = boxes.shape[0]
    if num_instances:
        assert boxes.shape[0] == masks.shape[-1] == class_ids.shape[0]
    else:
        print('\n*** No instances to display *** \n')

    palette = visualize.random_colors(num_instances)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    canvas = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)

    blend = 0.2  # weight of the instance colour inside the mask
    font_face = cv2.FONT_HERSHEY_PLAIN

    for idx, (box, rgb) in enumerate(zip(boxes, palette)):
        # An all-zero box carries no detection; skip it entirely.
        if not np.any(box):
            continue

        # Bounding box (colour components are scaled from 0-1 to 0-255).
        top, left, bottom, right = box
        draw_color = tuple(component * 255 for component in rgb)
        cv2.rectangle(canvas, (left, top), (right, bottom), draw_color, 1)

        # Caption anchored at the box's top-left corner.
        name = class_names[class_ids[idx]]
        confidence = scores[idx]
        if confidence:
            text = '{} {:.0f}%'.format(name, confidence * 100)
        else:
            text = name
        cv2.putText(canvas, text, (left, top), font_face,
                    fontScale=3, color=draw_color, thickness=3)

        # Mask: tinted colour pixels inside, current canvas pixels outside.
        selected = masks[:, :, idx] == 1
        for ch in range(3):
            tinted = image[:, :, ch] * (1 - blend) + blend * rgb[ch] * 255
            canvas[:, :, ch] = np.where(selected, tinted, canvas[:, :, ch])

    return canvas.astype(np.uint8)


if __name__ == '__main__':
    # Number of stale frames dropped after each detection pass.  The loop used
    # to be range(cv2.CAP_PROP_BUFFERSIZE), but that name is a property
    # identifier (value 38 in OpenCV's videoio enum), not a frame count.  The
    # same effective count is kept here as an explicit number that can be tuned.
    FRAMES_TO_DROP = 38

    # Connect to the camera.
    cap = cv2.VideoCapture(0)
    try:
        # Build the model and load the pretrained COCO weights.
        MODEL_DIR = os.path.abspath('Mask_RCNN/logs')
        COCO_MODEL_PATH = os.path.join(MODEL_DIR, 'mask_rcnn_coco.h5')
        config = InferenceConfig()
        config.display()
        model = modellib.MaskRCNN(mode='inference', model_dir=MODEL_DIR, config=config)
        model.load_weights(COCO_MODEL_PATH, by_name=True)

        # Show the result full screen.
        SCREEN_NAME = 'screen'
        cv2.namedWindow(SCREEN_NAME, cv2.WINDOW_NORMAL)
        cv2.setWindowProperty(SCREEN_NAME, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

        # Processing loop.
        while cap.isOpened():
            grabbed, frame = cap.read()  # fetch a camera frame
            if not grabbed:
                # A failed read yields no image; stop instead of crashing on
                # the slicing below.
                print('Failed to read a frame from the camera; stopping.')
                break
            frame = frame[:, ::-1]  # mirroring
            r = model.detect([frame])[0]  # detection
            camera = display_instances(frame,
                                       r['rois'], r['masks'], r['class_ids'],
                                       class_names, r['scores'])
            cv2.imshow(SCREEN_NAME, camera)  # display

            if cv2.waitKey(1) == 27:  # ESC leaves the processing loop
                break
            # Clear the capture buffer:
            # https://github.com/eiichiromomma/CVMLAB/wiki/OpenCV-VideoCapture
            for _ in range(FRAMES_TO_DROP):
                cap.grab()
    finally:
        # Always free the camera and the window, even when the loop raised.
        cap.release()
        cv2.destroyAllWindows()

https://github.com/matterport/Mask_RCNN/blob/master/samples/demo.ipynb を参考にした．
CPUで実行するのでFPSはだいぶ遅い．

GPUでやりたい場合

https://www.tensorflow.org/install/install_sources#tested_source_configurations を参考に

Ubuntu 16.04
CUDA 9.0
cuDNN v7.0.5
- https://developer.nvidia.com/rdp/cudnn-archive
tensorflow-gpu==1.8.0
Keras==2.2.0

で，トラブル無く出来た．
ハマりポイントとしては，CUDAが9.2だとNG．

GPUを使うとFPSは格段に向上する．

Openposeの実行

# Create an empty, executable test-openpose.py in the virtualenv's home
# directory (the script resolves tf-openpose/ relative to the working directory).
workon test-maskrcnn
cdvirtualenv
touch test-openpose.py
chmod +x test-openpose.py

をして，test-openpose.pyに以下のコードを貼り付けて，実行する．

test-openpose.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# inspired by https://github.com/ildoonet/tf-pose-estimation/blob/master/run_webcam.py

import os
import sys

sys.path.append(os.path.abspath('tf-openpose/'))
# The OpenCV that ships with ROS does not work here, so the first sys.path
# entry containing 'ros' is moved to the end; the OpenCV installed inside the
# virtualenv is then found first.  When nothing matches, index -1 pops and
# re-appends the last entry, which leaves sys.path unchanged.
ros_index = next((pos for pos, entry in enumerate(sys.path) if 'ros' in entry), -1)
sys.path.append(sys.path.pop(ros_index))

import cv2

from tf_pose.estimator import TfPoseEstimator
from tf_pose.networks import get_graph_path


if __name__ == '__main__':
    # Stale frames dropped after each pass so the next read is a recent frame.
    FRAMES_TO_DROP = 3

    # Connect to the camera.
    cap = cv2.VideoCapture(0)
    try:
        # Build the model.
        estimator = TfPoseEstimator(get_graph_path('mobilenet_thin'), target_size=(432, 368))

        # Show the result full screen.
        SCREEN_NAME = 'screen'
        cv2.namedWindow(SCREEN_NAME, cv2.WINDOW_NORMAL)
        cv2.setWindowProperty(SCREEN_NAME, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

        # Processing loop.
        while cap.isOpened():
            grabbed, frame = cap.read()  # fetch a camera frame
            if not grabbed:
                # A failed read yields no image; stop instead of crashing on
                # the slicing below.
                print('Failed to read a frame from the camera; stopping.')
                break
            frame = frame[:, ::-1]  # mirroring

            # Pose estimation.
            humans = estimator.inference(frame, resize_to_default=True, upsample_size=4.0)

            image = TfPoseEstimator.draw_humans(frame, humans, imgcopy=True)

            cv2.imshow(SCREEN_NAME, image)  # display

            if cv2.waitKey(1) == 27:  # ESC leaves the processing loop
                break
            # Clear the capture buffer:
            # https://github.com/eiichiromomma/CVMLAB/wiki/OpenCV-VideoCapture
            for _ in range(FRAMES_TO_DROP):
                cap.grab()
    finally:
        # Always free the camera and the window, even when the loop raised.
        cap.release()
        cv2.destroyAllWindows()

https://github.com/ildoonet/tf-pose-estimation/blob/master/run_webcam.py を参考にした．
CPUでもだいぶ早い．

Mask R-CNNはそこまでFPSが出ないので，webcamのbufferのためのfor loopはたくさん回しているが，
Openposeはだいぶ早いので，webcamのbufferのためのfor loopは3回が適切だった．

両方同時に動かす

ついでにwebカメラの読み込みを別スレッドで行うようにして，grabのパラメータチューニングをなくした．

test-maskrcnn-openpose-at-once.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# inspired by https://github.com/matterport/Mask_RCNN/blob/master/samples/demo.ipynb
# and https://github.com/ildoonet/tf-pose-estimation/blob/master/run_webcam.py
# and https://gist.github.com/allskyee/7749b9318e914ca45eb0a1000a81bf56

import tensorflow as tf
from threading import Thread, Lock

import os
import sys

# The OpenCV that ships with ROS does not work here, so the first sys.path
# entry containing 'ros' is moved to the end; the OpenCV installed inside the
# virtualenv is then found first.  When nothing matches, index -1 pops and
# re-appends the last entry, which leaves sys.path unchanged.
ros_index = next((pos for pos, entry in enumerate(sys.path) if 'ros' in entry), -1)
sys.path.append(sys.path.pop(ros_index))

import cv2

# for Openpose
sys.path.append(os.path.abspath('tf-openpose/'))
from tf_pose.estimator import TfPoseEstimator
from tf_pose.networks import get_graph_path

# for Mask-RCNN
sys.path.append(os.path.abspath('Mask_RCNN/'))
sys.path.append(os.path.abspath('Mask_RCNN/samples/coco/'))
import numpy as np
import coco
import mrcnn.model as modellib
from mrcnn import visualize

class InferenceConfig(coco.CocoConfig):
    # Inference-time overrides on top of the COCO configuration.
    # The processing loop passes exactly one frame to model.detect() per call,
    # so a single GPU with a single image per GPU is configured.
    # NOTE(review): presumably the effective batch size is
    # GPU_COUNT * IMAGES_PER_GPU — confirm against mrcnn.config.Config.
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1

# Human-readable labels indexed by class id: display_instances() looks up
# class_names[class_id] for every detection.  Index 0 is the 'BG' entry.
# NOTE(review): the order is presumably the one used by the pretrained COCO
# weights (taken from the Mask_RCNN demo notebook) — do not reorder.
class_names = ['BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
               'bus', 'train', 'truck', 'boat', 'traffic light',
               'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
               'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
               'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
               'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
               'kite', 'baseball bat', 'baseball glove', 'skateboard',
               'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
               'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
               'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
               'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
               'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
               'keyboard', 'cell phone', 'microwave', 'oven', 'toaster',
               'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
               'teddy bear', 'hair drier', 'toothbrush']

def display_instances(image, boxes, masks, class_ids, class_names, scores):
    """Draw detection results onto a grayscale copy of ``image``.

    The frame is first converted to gray (kept as a 3-channel image).  For
    every instance whose box is not all zeros, a rectangle and a caption
    (label plus score in percent when the score is truthy) are drawn, and the
    pixels where the instance mask equals 1 are replaced by the original
    colour pixels blended with the instance colour.

    Args:
        image: frame handed to ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.
        boxes: array whose rows unpack as ``y1, x1, y2, x2``.
        masks: array indexed as ``masks[:, :, i]`` for instance ``i``.
        class_ids: per-instance indices into ``class_names``.
        class_names: sequence of label strings.
        scores: per-instance confidence values.

    Returns:
        The annotated image as a ``np.uint8`` array.
    """
    num_instances = boxes.shape[0]
    if num_instances:
        assert boxes.shape[0] == masks.shape[-1] == class_ids.shape[0]
    else:
        print('\n*** No instances to display *** \n')

    palette = visualize.random_colors(num_instances)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    canvas = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)

    blend = 0.2  # weight of the instance colour inside the mask
    font_face = cv2.FONT_HERSHEY_PLAIN

    for idx, (box, rgb) in enumerate(zip(boxes, palette)):
        # An all-zero box carries no detection; skip it entirely.
        if not np.any(box):
            continue

        # Bounding box (colour components are scaled from 0-1 to 0-255).
        top, left, bottom, right = box
        draw_color = tuple(component * 255 for component in rgb)
        cv2.rectangle(canvas, (left, top), (right, bottom), draw_color, 1)

        # Caption anchored at the box's top-left corner.
        name = class_names[class_ids[idx]]
        confidence = scores[idx]
        if confidence:
            text = '{} {:.0f}%'.format(name, confidence * 100)
        else:
            text = name
        cv2.putText(canvas, text, (left, top), font_face,
                    fontScale=3, color=draw_color, thickness=3)

        # Mask: tinted colour pixels inside, current canvas pixels outside.
        selected = masks[:, :, idx] == 1
        for ch in range(3):
            tinted = image[:, :, ch] * (1 - blend) + blend * rgb[ch] * 255
            canvas[:, :, ch] = np.where(selected, tinted, canvas[:, :, ch])

    return canvas.astype(np.uint8)

# for webcam
class WebcamVideoStream:
    '''
    Read webcam frames on a background thread so that read() always hands
    out the most recently captured frame.
    Based on https://gist.github.com/allskyee/7749b9318e914ca45eb0a1000a81bf56

    Usable as a context manager: leaving the ``with`` block stops the reader
    thread and releases the capture device.
    '''
    def __init__(self, src = 0):
        # Open the device and capture one frame up front; self.frame stays
        # None when that first read fails.
        self.stream = cv2.VideoCapture(src)
        (self.grabbed, self.frame) = self.stream.read()
        self.started = False
        self.thread = None  # created by start()
        self.read_lock = Lock()  # guards self.grabbed / self.frame

    def start(self):
        '''Start the reader thread; return self, or None if already running.'''
        if self.started:
            print("already started!!")
            return None
        self.started = True
        # daemon=True: a crash in the main thread must not leave the process
        # hanging on this endless reader loop.
        self.thread = Thread(target=self.update, args=(), daemon=True)
        self.thread.start()
        return self

    def update(self):
        '''Reader loop: keep capturing until stop() clears self.started.'''
        while self.started:
            # Capture outside the lock so read() is never blocked by the device.
            (grabbed, frame) = self.stream.read()
            with self.read_lock:
                self.grabbed = grabbed
                if grabbed:
                    # Keep the last good frame when a grab fails instead of
                    # overwriting it with None.
                    self.frame = frame

    def read(self):
        '''Return a copy of the latest frame, or None if none was captured yet.'''
        with self.read_lock:
            if self.frame is None:
                return None
            return self.frame.copy()

    def stop(self):
        '''Ask the reader thread to finish and wait for it (no-op if not started).'''
        self.started = False
        if self.thread is not None and self.thread.is_alive():
            self.thread.join()

    def release(self):
        '''Stop the reader thread, then release the capture device.'''
        self.stop()
        self.stream.release()

    def is_open(self):
        '''Return whether the underlying capture device is opened.'''
        return self.stream.isOpened()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback) :
        # Stop the thread first so nothing reads from a released device.
        self.release()


if __name__ == "__main__" :
    # Show the result full screen.
    SCREEN_NAME = 'screen'
    cv2.namedWindow(SCREEN_NAME, cv2.WINDOW_NORMAL)
    cv2.setWindowProperty(SCREEN_NAME, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

    # Build the models.
    # for Openpose: run on the CPU only.  A separate name is used so the
    # TensorFlow session config is not confused with the Mask R-CNN config.
    tf_config = tf.ConfigProto(device_count={"GPU":0})
    estimator = TfPoseEstimator(get_graph_path('mobilenet_thin'), target_size=(432, 368), tf_config=tf_config)
    # for Mask-RCNN
    MODEL_DIR = os.path.abspath('Mask_RCNN/logs')
    COCO_MODEL_PATH = os.path.join(MODEL_DIR, 'mask_rcnn_coco.h5')
    config = InferenceConfig()
    config.display()
    model = modellib.MaskRCNN(mode='inference', model_dir=MODEL_DIR, config=config)
    model.load_weights(COCO_MODEL_PATH, by_name=True)
    # for webcam
    vs = WebcamVideoStream().start()

    try:
        # Processing loop.
        while vs.is_open():
            # Fetch the latest frame.
            frame = vs.read()
            if frame is None:
                # No frame available; stop instead of crashing on the slicing.
                print('No frame available from the camera; stopping.')
                break
            # Mirror it.
            frame = frame[:, ::-1]
            # for Mask-RCNN
            r = model.detect([frame])[0]
            mask_rcnn = display_instances(frame,
                                          r['rois'], r['masks'], r['class_ids'],
                                          class_names, r['scores'])
            # for Openpose: poses are drawn on top of the Mask R-CNN rendering.
            humans = estimator.inference(frame, resize_to_default=True, upsample_size=4.0)
            final_image = TfPoseEstimator.draw_humans(mask_rcnn, humans, imgcopy=True)
            # Display.
            cv2.imshow(SCREEN_NAME, final_image)
            # ESC leaves the processing loop.
            if cv2.waitKey(1) == 27:
                break
    finally:
        # Always stop the reader thread (otherwise the process never exits
        # after an error), then release the camera, which stop() does not do.
        vs.stop()
        vs.stream.release()
        cv2.destroyAllWindows()

Mask-RCNNはGPUでないと遅くて，OpenposeはCPUで十分早く，
手元のPCはGPUが1台なので，前者をGPU，後者をCPUで計算するようにした．
（これに関して，やり方が正しいかは不明）

FastStyleTransfer の実行

https://github.com/lengstrom/fast-style-transfer を参考にした．

test-fast-style-transfer.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# inspired by https://github.com/lengstrom/fast-style-transfer/blob/master/evaluate.py

import os
import sys
import numpy as np
import tensorflow as tf
from PIL import ImageFont, ImageDraw, Image

# The OpenCV that ships with ROS does not work here, so the first sys.path
# entry containing 'ros' is moved to the end; the OpenCV installed inside the
# virtualenv is then found first.  When nothing matches, index -1 pops and
# re-appends the last entry, which leaves sys.path unchanged.
ros_index = next((pos for pos, entry in enumerate(sys.path) if 'ros' in entry), -1)
sys.path.append(sys.path.pop(ros_index))
import cv2

sys.path.insert(0, 'fast-style-transfer/src')
import transform

# Available style checkpoints as [checkpoint path, caption] pairs.  The
# captions are runtime text drawn onto the output image, so they are left
# as-is.
ckpt_candidate = [["fast-style-transfer/la_muse.ckpt",       "ピカソのラ・ミューズ風"],
                  ["fast-style-transfer/rain_princess.ckpt", "レオニード・アフレモフ風"],
                  ["fast-style-transfer/scream.ckpt",        "ムンクの叫び風"],
                  ["fast-style-transfer/wave.ckpt",          "葛飾北斎の富嶽三十六景風"]]

# Pair used by the main block: element 0 is restored into the session and
# element 1 is drawn on every frame.  Change the index to switch styles.
target_ckpt = ckpt_candidate[0]

if __name__ == "__main__":
    # Stale frames dropped after each pass so the next read is a recent frame.
    FRAMES_TO_DROP = 3

    # Connect to the camera.
    cap = cv2.VideoCapture(0)
    try:
        # One frame is read up front only to learn the frame size, which fixes
        # the shape of the network's input placeholder.
        grabbed, frame = cap.read()
        if not grabbed:
            raise RuntimeError('Could not read a frame from camera 0.')
        height, width, _ = frame.shape

        # Show the result full screen.
        SCREEN_NAME = 'screen'
        cv2.namedWindow(SCREEN_NAME, cv2.WINDOW_NORMAL)
        cv2.setWindowProperty(SCREEN_NAME, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

        # Load the font used for the caption.
        font_size = 50
        font = ImageFont.truetype("/usr/share/fonts/opentype/ipafont-mincho/ipam.ttf", font_size)

        # Build the model.
        g = tf.Graph()
        soft_config = tf.ConfigProto(allow_soft_placement=True)
        soft_config.gpu_options.allow_growth = True
        with g.as_default(), g.device('/gpu:0'), tf.Session(config=soft_config) as sess:
            img_placeholder = tf.placeholder(tf.float32, shape=(1, height, width, 3), name='img_placeholder')
            preds = transform.net(img_placeholder)
            saver = tf.train.Saver()
            saver.restore(sess, target_ckpt[0])
            # Reusable single-image input batch.
            X = np.zeros((1, height, width, 3), dtype=np.float32)

            # Processing loop.
            while cap.isOpened():
                grabbed, frame = cap.read()  # fetch a camera frame
                if not grabbed:
                    # A failed read yields no image; stop instead of crashing
                    # on the slicing below.
                    print('Failed to read a frame from the camera; stopping.')
                    break
                frame = frame[:, ::-1]  # mirroring
                # The network is fed RGB, while OpenCV frames are BGR.
                X[0] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                _preds = sess.run(preds, feed_dict={img_placeholder: X})
                # Back to BGR for display (the same channel swap as before,
                # now spelled with the constant that names the direction).
                stylized = np.clip(_preds[0], 0, 255).astype(np.uint8)
                image = cv2.cvtColor(stylized, cv2.COLOR_RGB2BGR)

                # Add the caption to the image.
                img_pil = Image.fromarray(image)
                draw = ImageDraw.Draw(img_pil)
                draw.rectangle([(0, 0), (width, font_size)], fill=(255, 255, 255))  # background
                draw.text((0, 0), target_ckpt[1], font=font, fill=(0, 0, 0))  # text
                image = np.array(img_pil)

                # Display.
                cv2.imshow(SCREEN_NAME, image)

                if cv2.waitKey(1) == 27:  # ESC leaves the processing loop
                    break
                # Clear the capture buffer:
                # https://github.com/eiichiromomma/CVMLAB/wiki/OpenCV-VideoCapture
                for _ in range(FRAMES_TO_DROP):
                    cap.grab()
    finally:
        # Always free the camera and the window, even when an error occurred.
        cap.release()
        cv2.destroyAllWindows()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up