
[150 FPS ++] Plugging In Three Coral Edge TPU Accelerators for Parallel Inference and Ultra-Fast Object Detection ーTo the Extreme of Useless High Performanceー

Posted at 2019-08-11

GitHub: TPU-MobilenetSSD

Special thanks!!
GitHub: edge_tpu

1.Introduction

As usual, this is a rather silly implementation. Using Python and three Coral Edge TPU Accelerators, inference is parallelized to speed up object detection. Video capture and the processing for each TPU all run asynchronously in parallel with MultiProcess. While it runs, the white LEDs on all three TPUs flicker away busily in unison. Note that the camera's capture rate is so slow that roughly 80 FPS of potential performance is, I believe, being lost. With a USB camera capable of around 150 FPS, two TPUs are sufficient.
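Schematically, the wiring is one capture-and-draw process plus one inferencer process per TPU, joined by two multiprocessing queues. A rough sketch (the names match the full script in section 3):

# one producer, N consumers, results flowing back whenever they are ready
frameBuffer = mp.Queue(10)   # camera frames -> inferencers (oldest frame dropped when full)
results     = mp.Queue()     # detections    -> drawing loop
# camThread      : camera -> frameBuffer, draws the newest entry from results
# inferencer x3  : frameBuffer -> DetectionEngine.DetectWithImage() -> results

Because detections are drawn whenever they happen to arrive, a result may lag the frame it was computed from by a frame or two.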

IMG_20190812_011433.jpg

It is a rough implementation that pays no particular attention to the ordering of frames and results. I don't think any further explanation is needed, so please just watch the result video. The video was recorded at 60 FPS, so the actual speed is about three times faster than the video below suggests. It was too much hassle, so no hardware encoder was used while recording.


<Appendix: DeeplabV3 Semantic Segmentation, Coral Edge TPU x3 boosted, 320x240>
https://github.com/NobuoTsukamoto/edge_tpu.git

2.Environment

  • Ubuntu 16.04 x86_64, USB 3.0
  • Coral Edge TPU Accelerator x3
  • Python 3.5.3
  • mobilenet_ssd_v2_coco_quant_postprocess_edgetpu.tflite
  • edgetpu runtime 2.11.1
  • OpenCV 4.1.1-openvino
  • USB Camera (PlayStation Eye, 320x240, 150 FPS)
  • Self-powered USB3.0 HUB
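
Before benchmarking, it is worth confirming that the runtime actually sees all three accelerators. A minimal check using the same edgetpu_utils call the main script relies on (the filename check_tpus.py is just for illustration):

check_tpus.py
# List the device paths of all Edge TPUs that have no model assigned yet.
# With three accelerators plugged in, this should print three paths.
from edgetpu.basic import edgetpu_utils

paths = edgetpu_utils.ListEdgeTpuPaths(edgetpu_utils.EDGE_TPU_STATE_UNASSIGNED)
print("{} unassigned Edge TPU(s) found".format(len(paths)))
for path in paths:
    print(" ", path)

If fewer than three show up, suspect the hub first; bus-powered hubs often cannot supply enough current for multiple accelerators, which is why a self-powered one is listed above.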

3.Implementation

The Python logic automatically detects idle TPUs and executes processing in parallel, assigning a free TPU to each separate process. The implementation was mostly finished half a year ago, when the Edge TPU went on sale, but I was so absorbed in building Tensorflow that I left it untouched ever since. I have tailored it slightly to fit the latest version of the API. Since it is a very simple implementation, it can easily be adapted to other models.

The full set of resources can be downloaded from the TPU-MobilenetSSD GitHub repository linked at the top of this page.
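
The heart of the idle-TPU assignment is only a few lines: each worker asks the runtime for the paths of TPUs that have no model assigned yet, then tries to bind a DetectionEngine to each path in turn until one opens. A minimal sketch of the pattern (open_free_tpu is an illustrative name; the full script below does the same thing inline in inferencer()):

# Bind a DetectionEngine to the first unassigned Edge TPU that opens.
from edgetpu.basic import edgetpu_utils
from edgetpu.detection.engine import DetectionEngine

def open_free_tpu(model_path):
    for device in edgetpu_utils.ListEdgeTpuPaths(edgetpu_utils.EDGE_TPU_STATE_UNASSIGNED):
        try:
            # succeeds only if no other process has claimed this TPU yet
            return DetectionEngine(model_path, device)
        except Exception:
            continue  # claimed in the meantime; try the next path
    raise RuntimeError("no free Edge TPU could be opened")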

MobileNet-SSD-TPU-async.py
import argparse
import platform
import sys
import numpy as np
import cv2
import time
from PIL import Image
from time import sleep
import multiprocessing as mp
from edgetpu.detection.engine import DetectionEngine
from edgetpu.basic import edgetpu_utils

lastresults = None
processes = []
frameBuffer = None
results = None
fps = ""
detectfps = ""
framecount = 0
detectframecount = 0
time1 = 0
time2 = 0
box_color = (255, 128, 0)
box_thickness = 1
label_background_color = (125, 175, 75)
label_text_color = (255, 255, 255)
percentage = 0.0

# Function to read labels from text files.
def ReadLabelFile(file_path):
  with open(file_path, 'r') as f:
    lines = f.readlines()
  ret = {}
  for line in lines:
    pair = line.strip().split(maxsplit=1)
    ret[int(pair[0])] = pair[1].strip()
  return ret


def camThread(label, results, frameBuffer, camera_width, camera_height, vidfps, usbcamno):

    global fps
    global detectfps
    global framecount
    global detectframecount
    global time1
    global time2
    global lastresults
    global cam
    global window_name

    cam = cv2.VideoCapture(usbcamno)
    cam.set(cv2.CAP_PROP_FPS, vidfps)
    cam.set(cv2.CAP_PROP_FRAME_WIDTH, camera_width)
    cam.set(cv2.CAP_PROP_FRAME_HEIGHT, camera_height)
    window_name = "USB Camera"
    cv2.namedWindow(window_name, cv2.WINDOW_AUTOSIZE)

    while True:
        t1 = time.perf_counter()

        ret, color_image = cam.read()
        if not ret:
            continue
        if frameBuffer.full():
            frameBuffer.get()
        frames = color_image
        frameBuffer.put(color_image.copy())
        res = None

        if not results.empty():
            res = results.get(False)
            detectframecount += 1
            imdraw = overlay_on_image(frames, res, label, camera_width, camera_height)
            lastresults = res
        else:
            imdraw = overlay_on_image(frames, lastresults, label, camera_width, camera_height)

        cv2.imshow(window_name, imdraw)

        if cv2.waitKey(1)&0xFF == ord('q'):
            break

        # FPS calculation
        framecount += 1
        if framecount >= 15:
            fps       = "(Playback) {:.1f} FPS".format(time1/15)
            detectfps = "(Detection) {:.1f} FPS".format(detectframecount/time2)
            framecount = 0
            detectframecount = 0
            time1 = 0
            time2 = 0
        t2 = time.perf_counter()
        elapsedTime = t2-t1
        time1 += 1/elapsedTime
        time2 += elapsedTime



def inferencer(results, frameBuffer, model, camera_width, camera_height):

    engine = None

    # Acquisition of TPU list without model assignment
    devices = edgetpu_utils.ListEdgeTpuPaths(edgetpu_utils.EDGE_TPU_STATE_UNASSIGNED)

    devopen = False
    for device in devices:
        try:
            engine = DetectionEngine(model, device)
            devopen = True
            break
        except Exception:
            # this TPU was grabbed by another process in the meantime; try the next one
            continue

    if not devopen:
        print("TPU Devices open Error!!!")
        sys.exit(1)

    print("Loaded Graphs!!! ")

    while True:

        if frameBuffer.empty():
            continue

        # Run inference.
        color_image = frameBuffer.get()
        prepimg = color_image[:, :, ::-1].copy()
        prepimg = Image.fromarray(prepimg)

        tinf = time.perf_counter()
        ans = engine.DetectWithImage(prepimg, threshold=0.5, keep_aspect_ratio=True, relative_coord=False, top_k=10)
        print(time.perf_counter() - tinf, "sec")
        results.put(ans)



def overlay_on_image(frames, object_infos, label, camera_width, camera_height):

    color_image = frames

    if object_infos is None:
        return color_image
    img_cp = color_image.copy()

    for obj in object_infos:
        box = obj.bounding_box.flatten().tolist()
        box_left = int(box[0])
        box_top = int(box[1])
        box_right = int(box[2])
        box_bottom = int(box[3])
        cv2.rectangle(img_cp, (box_left, box_top), (box_right, box_bottom), box_color, box_thickness)

        percentage = int(obj.score * 100)
        label_text = label[obj.label_id] + " (" + str(percentage) + "%)" 

        label_size = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0]
        label_left = box_left
        label_top = box_top - label_size[1]
        if (label_top < 1):
            label_top = 1
        label_right = label_left + label_size[0]
        label_bottom = label_top + label_size[1]
        cv2.rectangle(img_cp, (label_left - 1, label_top - 1), (label_right + 1, label_bottom + 1), label_background_color, -1)
        cv2.putText(img_cp, label_text, (label_left, label_bottom), cv2.FONT_HERSHEY_SIMPLEX, 0.5, label_text_color, 1)

    cv2.putText(img_cp, fps,       (camera_width-170,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38,0,255), 1, cv2.LINE_AA)
    cv2.putText(img_cp, detectfps, (camera_width-170,30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38,0,255), 1, cv2.LINE_AA)

    return img_cp

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="mobilenet_ssd_v2_coco_quant_postprocess_edgetpu.tflite", help="Path of the detection model.")
    parser.add_argument("--label", default="coco_labels.txt", help="Path of the labels file.")
    parser.add_argument("--usbcamno", type=int, default=0, help="USB Camera number.")
    args = parser.parse_args()

    model    = args.model
    label    = ReadLabelFile(args.label)
    usbcamno = args.usbcamno

    camera_width = 320
    camera_height = 240
    vidfps = 150

    try:
        mp.set_start_method('forkserver')
        frameBuffer = mp.Queue(10)
        results = mp.Queue()

        # Start streaming
        p = mp.Process(target=camThread,
                       args=(label, results, frameBuffer, camera_width, camera_height, vidfps, usbcamno),
                       daemon=True)
        p.start()
        processes.append(p)

        # Activation of inferencer
        devices = edgetpu_utils.ListEdgeTpuPaths(edgetpu_utils.EDGE_TPU_STATE_UNASSIGNED)
        for devnum in range(len(devices)):
            p = mp.Process(target=inferencer,
                           args=(results, frameBuffer, model, camera_width, camera_height),
                           daemon=True)
            p.start()
            processes.append(p)

        while True:
            sleep(1)

    finally:
        for p in processes:
            p.terminate()
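
To run it, plug in the accelerators and launch the script; one inferencer process is spawned per unassigned TPU found at startup, so there is no flag for the TPU count. The arguments below are just the argparse defaults spelled out:

$ python3 MobileNet-SSD-TPU-async.py \
    --model mobilenet_ssd_v2_coco_quant_postprocess_edgetpu.tflite \
    --label coco_labels.txt \
    --usbcamno 0

Press q in the camera window to close it, or Ctrl-C in the terminal to terminate all of the processes.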
