1.Introduction
Vincent from France, who had seen my previous article "[150 FPS ++] Plugging in three Coral Edge TPU Accelerators for parallel inference to obtain extreme object-detection performance - to the height of pointless performance -" (Qiita, PINTO), hit me with an unreasonable request: "Have you tried how many FPS you get when detecting the skeletons of 10 people at the same time?" So I tuned Google's sample logic for maximum performance. He never said to do it with three TPUs, mind you...
Using Python and three Coral Edge TPU Accelerators, inference is parallelized to speed up skeleton (pose) detection. Video capture and the processing on each TPU all run asynchronously and in parallel with multiprocessing. While it is running, the white LEDs on all three TPUs flash busily in unison. For a USB camera of around 150 FPS, two TPUs are plenty.
This time, to escape the complicated Gstreamer implementation of the official sample and switch to an OpenCV implementation, I referred to "Blazing-fast PoseNet with Jetson Nano + EdgeTPU (performance comparison with the Raspberry Pi)" (Qiita, by rhene). I would like to take this opportunity to thank the author.
The video below is the verification result for single-person detection. Because the camera is mediocre, it cannot exceed 150 FPS.
Thanks to everyone, the needlessly fast Posenet implementation with Python + Multi-TPU x3 + USB 3.0 is now complete. The 60 FPS recording rate is too slow, so the motion looks sluggish, but despite the asynchronous processing there is almost no drift in the skeleton. It exceeded 150 FPS. https://t.co/ukgr0kdxdl
— Super PINTO (@PINTO03091) August 17, 2019
The video below shows the case with a single TPU and synchronous processing. Even the synchronous version is fast enough. Because each frame waits for the TPU inference, the camera capture rate is dragged down by the inference to 50-60 FPS. Since rendering is done fully synchronously, the skeleton does not drift by even 1 mm no matter how violently the subject moves.
Synchronous Posenet + TPU x1 + Python + USB Camera
— Super PINTO (@PINTO03091) August 18, 2019
Even the synchronous version was fast enough. Since the loop waits synchronously for TPU inference, the camera capture rate is dragged down by the inference to 50-60 FPS. https://t.co/9JtGDdpDpS
The video below is the asynchronous multi-person detection result on an MP4 file with a single TPU. The playback frame rate is 30 FPS, so there is almost no lag, but because inference runs asynchronously and the soccer players' motion and camera work are so fast, an offset of 2-3 frames is clearly visible. If inference can run at this speed, detecting synchronously should give cleaner skeletons. The video used is under a Creative Commons license.
For a very detailed analysis of the performance difference and bottlenecks between USB 2.0 and USB 3.0, the article "Analysis of the Edge TPU USB Accelerator - input/output data transfer" (Qiita, by iwatake2222) is extremely helpful. It is a tremendous piece of verification work.
Async, TPU x1, Posenet, Soccer, 640x480, 30 FPS
— Super PINTO (@PINTO03091) August 17, 2019
Multi-Person https://t.co/2uTmg8a6rY
The video below is the synchronous multi-person detection result on an MP4 file with a single TPU. There is no drift at all, so the result is clean.
Synchronous, TPU x1, Posenet, Soccer, 640x480, 30 FPS
— Super PINTO (@PINTO03091) August 18, 2019
Multi-person skeleton detection on an MP4 video file. The performance is more than sufficient. https://t.co/tUiKMCEbG6
The resources have been published to GitHub.
I can't wait to test this on the RaspberryPi4.
2.Environment
- Ubuntu 16.04 x86_64, USB 3.0
- Coral Edge TPU Accelerator x 3
- Python 3.5.3
- posenet_mobilenet_v1_075_481_641_quant_decoder_edgetpu.tflite
- edgetpu runtime 2.11.1
- OpenCV 4.1.1-openvino
- USB Camera (PlayStation Eye, 320x240, 150 FPS)
- Self-powered USB3.0 HUB
- PoseEngine from google-coral/project-posenet (a minimal usage sketch follows this list)
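The scripts in the next section are all built on top of PoseEngine. For reference, the following is a minimal usage sketch: the model path is the one listed above, "person.jpg" is a placeholder image, and pose_engine.py from google-coral/project-posenet is assumed to be on the import path.
import cv2
from pose_engine import PoseEngine

engine = PoseEngine("models/posenet_mobilenet_v1_075_481_641_quant_decoder_edgetpu.tflite")

# OpenCV reads BGR; the engine expects RGB, so reverse the channel order.
frame = cv2.resize(cv2.imread("person.jpg"), (640, 480))[:, :, ::-1].copy()

poses, inference_time = engine.DetectPosesInImage(frame)
for pose in poses:
    for label, keypoint in pose.keypoints.items():
        if keypoint.score < 0.2:  # same confidence threshold as draw_pose() in the scripts below
            continue
        # keypoint.yx holds (y, x) pixel coordinates in the resized image.
        print(label, int(keypoint.yx[1]), int(keypoint.yx[0]))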
3.Implementation
$ git clone https://github.com/PINTO0309/TPU-Posenet.git
$ cd TPU-Posenet
Asynchronous multi-TPU version for a USB camera
The Python logic automatically detects idle (unassigned) TPUs and executes the work in parallel, assigning one idle TPU to each separate process.
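Before the full script, here is a minimal sketch, under the same assumptions as the script (edgetpu runtime 2.11.1, PoseEngine from google-coral/project-posenet), of how the unassigned TPUs are enumerated and one inference process is started per device. The worker body is reduced to a print; the real worker below pulls frames from a shared queue instead.
import multiprocessing as mp
from edgetpu.basic import edgetpu_utils
from pose_engine import PoseEngine

MODEL = "models/posenet_mobilenet_v1_075_481_641_quant_decoder_edgetpu.tflite"

def worker(model_path):
    # PoseEngine() without an explicit device path binds one still-unassigned TPU.
    engine = PoseEngine(model_path)
    print("PoseEngine ready in", mp.current_process().name)

if __name__ == '__main__':
    mp.set_start_method('forkserver')
    # List only the TPUs that do not yet have a model assigned to them.
    devices = edgetpu_utils.ListEdgeTpuPaths(edgetpu_utils.EDGE_TPU_STATE_UNASSIGNED)
    print(len(devices), "unassigned Edge TPU(s) found")
    procs = [mp.Process(target=worker, args=(MODEL,), daemon=True) for _ in devices]
    for p in procs:
        p.start()
    for p in procs:
        p.join()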
import argparse
import sys
import numpy as np
import cv2
import time
from PIL import Image
from time import sleep
import multiprocessing as mp
from edgetpu.basic import edgetpu_utils
from pose_engine import PoseEngine
lastresults = None
processes = []
frameBuffer = None
results = None
fps = ""
detectfps = ""
framecount = 0
detectframecount = 0
time1 = 0
time2 = 0
EDGES = (
('nose', 'left eye'),
('nose', 'right eye'),
('nose', 'left ear'),
('nose', 'right ear'),
('left ear', 'left eye'),
('right ear', 'right eye'),
('left eye', 'right eye'),
('left shoulder', 'right shoulder'),
('left shoulder', 'left elbow'),
('left shoulder', 'left hip'),
('right shoulder', 'right elbow'),
('right shoulder', 'right hip'),
('left elbow', 'left wrist'),
('right elbow', 'right wrist'),
('left hip', 'right hip'),
('left hip', 'left knee'),
('right hip', 'right knee'),
('left knee', 'left ankle'),
('right knee', 'right ankle'),
)
def camThread(results, frameBuffer, camera_width, camera_height, model_width, model_height, vidfps, usbcamno, videofile):
global fps
global detectfps
global framecount
global detectframecount
global time1
global time2
global lastresults
global cam
global window_name
global waittime
if videofile == "":
cam = cv2.VideoCapture(usbcamno)
cam.set(cv2.CAP_PROP_FPS, vidfps)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, camera_width)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, camera_height)
waittime = 1
window_name = "USB Camera"
else:
cam = cv2.VideoCapture(videofile)
waittime = vidfps
window_name = "Movie File"
cv2.namedWindow(window_name, cv2.WINDOW_AUTOSIZE)
while True:
t1 = time.perf_counter()
ret, color_image = cam.read()
if not ret:
continue
if frameBuffer.full():
frameBuffer.get()
#frames = color_image
frames = cv2.resize(color_image, (model_width, model_height))
frameBuffer.put(frames.copy())
res = None
if not results.empty():
res = results.get(False)
detectframecount += 1
imdraw = overlay_on_image(frames, res, model_width, model_height)
lastresults = res
else:
imdraw = overlay_on_image(frames, lastresults, model_width, model_height)
cv2.imshow(window_name, imdraw)
if cv2.waitKey(waittime)&0xFF == ord('q'):
break
# FPS calculation
framecount += 1
if framecount >= 15:
fps = "(Playback) {:.1f} FPS".format(time1/15)
detectfps = "(Detection) {:.1f} FPS".format(detectframecount/time2)
framecount = 0
detectframecount = 0
time1 = 0
time2 = 0
t2 = time.perf_counter()
elapsedTime = t2-t1
time1 += 1/elapsedTime
time2 += elapsedTime
def inferencer(results, frameBuffer, model, camera_width, camera_height):
engine = None
# Acquisition of TPU list without model assignment
devices = edgetpu_utils.ListEdgeTpuPaths(edgetpu_utils.EDGE_TPU_STATE_UNASSIGNED)
devopen = False
for device in devices:
try:
engine = PoseEngine(model)
devopen = True
break
except:
continue
if devopen == False:
print("TPU Devices open Error!!!")
sys.exit(1)
print("Loaded Graphs!!! ")
while True:
if frameBuffer.empty():
continue
# Run inference.
color_image = frameBuffer.get()
prepimg = color_image[:, :, ::-1].copy()
tinf = time.perf_counter()
result, inference_time = engine.DetectPosesInImage(prepimg)
print(time.perf_counter() - tinf, "sec")
results.put(result)
def draw_pose(img, pose, threshold=0.2):
xys = {}
for label, keypoint in pose.keypoints.items():
if keypoint.score < threshold: continue
xys[label] = (int(keypoint.yx[1]), int(keypoint.yx[0]))
img = cv2.circle(img, (int(keypoint.yx[1]), int(keypoint.yx[0])), 5, (0, 255, 0), -1)
for a, b in EDGES:
if a not in xys or b not in xys: continue
ax, ay = xys[a]
bx, by = xys[b]
img = cv2.line(img, (ax, ay), (bx, by), (0, 255, 255), 2)
def overlay_on_image(frames, result, model_width, model_height):
color_image = frames
if isinstance(result, type(None)):
return color_image
img_cp = color_image.copy()
for pose in result:
draw_pose(img_cp, pose)
cv2.putText(img_cp, fps, (model_width-170,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38,0,255), 1, cv2.LINE_AA)
cv2.putText(img_cp, detectfps, (model_width-170,30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38,0,255), 1, cv2.LINE_AA)
return img_cp
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="models/posenet_mobilenet_v1_075_481_641_quant_decoder_edgetpu.tflite", help="Path of the detection model.")
parser.add_argument("--usbcamno", type=int, default=0, help="USB Camera number.")
parser.add_argument('--videofile', default="", help='Path to input video file. (Default="")')
parser.add_argument('--vidfps', type=int, default=30, help='FPS of Video. (Default=30)')
args = parser.parse_args()
model = args.model
usbcamno = args.usbcamno
vidfps = args.vidfps
videofile = args.videofile
camera_width = 320
camera_height = 240
model_width = 640
model_height = 480
try:
mp.set_start_method('forkserver')
frameBuffer = mp.Queue(3)
results = mp.Queue()
# Start streaming
p = mp.Process(target=camThread,
args=(results, frameBuffer, camera_width, camera_height, model_width, model_height, vidfps, usbcamno, videofile),
daemon=True)
p.start()
processes.append(p)
# Activation of inferencer
devices = edgetpu_utils.ListEdgeTpuPaths(edgetpu_utils.EDGE_TPU_STATE_UNASSIGNED)
for devnum in range(len(devices)):
p = mp.Process(target=inferencer,
args=(results, frameBuffer, model, model_width, model_height),
daemon=True)
sleep(5)
p.start()
processes.append(p)
while True:
sleep(1)
finally:
for p in range(len(processes)):
processes[p].terminate()
$ python3 pose_camera_multi_tpu.py -h
usage: pose_camera_multi_tpu.py [-h] [--model MODEL] [--usbcamno USBCAMNO]
[--videofile VIDEOFILE] [--vidfps VIDFPS]
optional arguments:
-h, --help show this help message and exit
--model MODEL Path of the detection model.
--usbcamno USBCAMNO USB Camera number.
--videofile VIDEOFILE
Path to input video file. (Default="")
--vidfps VIDFPS FPS of Video. (Default=30)
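As a concrete example, on the setup above (camera number 0, the 150 FPS PlayStation Eye) the asynchronous multi-TPU script would be launched as follows; adjust the numbers to your own environment.
$ python3 pose_camera_multi_tpu.py --usbcamno 0 --vidfps 150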
Synchronous single-TPU version for a USB camera
import argparse
import numpy as np
import cv2
import time
from PIL import Image
from time import sleep
from edgetpu.basic import edgetpu_utils
from pose_engine import PoseEngine
lastresults = None
processes = []
frameBuffer = None
results = None
fps = ""
detectfps = ""
framecount = 0
detectframecount = 0
time1 = 0
time2 = 0
EDGES = (
('nose', 'left eye'),
('nose', 'right eye'),
('nose', 'left ear'),
('nose', 'right ear'),
('left ear', 'left eye'),
('right ear', 'right eye'),
('left eye', 'right eye'),
('left shoulder', 'right shoulder'),
('left shoulder', 'left elbow'),
('left shoulder', 'left hip'),
('right shoulder', 'right elbow'),
('right shoulder', 'right hip'),
('left elbow', 'left wrist'),
('right elbow', 'right wrist'),
('left hip', 'right hip'),
('left hip', 'left knee'),
('right hip', 'right knee'),
('left knee', 'left ankle'),
('right knee', 'right ankle'),
)
def draw_pose(img, pose, threshold=0.2):
xys = {}
for label, keypoint in pose.keypoints.items():
if keypoint.score < threshold: continue
xys[label] = (int(keypoint.yx[1]), int(keypoint.yx[0]))
img = cv2.circle(img, (int(keypoint.yx[1]), int(keypoint.yx[0])), 5, (0, 255, 0), -1)
for a, b in EDGES:
if a not in xys or b not in xys: continue
ax, ay = xys[a]
bx, by = xys[b]
img = cv2.line(img, (ax, ay), (bx, by), (0, 255, 255), 2)
def overlay_on_image(frames, result, model_width, model_height):
color_image = frames
if isinstance(result, type(None)):
return color_image
img_cp = color_image.copy()
for pose in result:
draw_pose(img_cp, pose)
cv2.putText(img_cp, fps, (model_width-170,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38,0,255), 1, cv2.LINE_AA)
cv2.putText(img_cp, detectfps, (model_width-170,30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38,0,255), 1, cv2.LINE_AA)
return img_cp
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="models/posenet_mobilenet_v1_075_481_641_quant_decoder_edgetpu.tflite", help="Path of the detection model.")
parser.add_argument("--usbcamno", type=int, default=0, help="USB Camera number.")
parser.add_argument('--videofile', default="", help='Path to input video file. (Default="")')
parser.add_argument('--vidfps', type=int, default=30, help='FPS of Video. (Default=30)')
args = parser.parse_args()
model = args.model
usbcamno = args.usbcamno
vidfps = args.vidfps
videofile = args.videofile
camera_width = 320
camera_height = 240
model_width = 640
model_height = 480
engine = PoseEngine(model)
sleep(5)
if videofile == "":
cam = cv2.VideoCapture(usbcamno)
cam.set(cv2.CAP_PROP_FPS, vidfps)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, camera_width)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, camera_height)
waittime = 1
window_name = "USB Camera"
else:
cam = cv2.VideoCapture(videofile)
waittime = vidfps - 20
window_name = "Movie File"
cv2.namedWindow(window_name, cv2.WINDOW_AUTOSIZE)
while True:
t1 = time.perf_counter()
ret, color_image = cam.read()
if not ret:
continue
# Run inference.
color_image = cv2.resize(color_image, (model_width, model_height))
prepimg = color_image[:, :, ::-1].copy()
tinf = time.perf_counter()
res, inference_time = engine.DetectPosesInImage(prepimg)
if res:
detectframecount += 1
imdraw = overlay_on_image(color_image, res, model_width, model_height)
else:
imdraw = color_image
cv2.imshow(window_name, imdraw)
if cv2.waitKey(waittime)&0xFF == ord('q'):
break
# FPS calculation
framecount += 1
if framecount >= 15:
fps = "(Playback) {:.1f} FPS".format(time1/15)
detectfps = "(Detection) {:.1f} FPS".format(detectframecount/time2)
framecount = 0
detectframecount = 0
time1 = 0
time2 = 0
t2 = time.perf_counter()
elapsedTime = t2-t1
time1 += 1/elapsedTime
time2 += elapsedTime
$ python3 pose_camera_single_tpu.py -h
usage: pose_camera_single_tpu.py [-h] [--model MODEL] [--usbcamno USBCAMNO]
[--videofile VIDEOFILE] [--vidfps VIDFPS]
optional arguments:
-h, --help show this help message and exit
--model MODEL Path of the detection model.
--usbcamno USBCAMNO USB Camera number.
--videofile VIDEOFILE
Path to input video file. (Default="")
--vidfps VIDFPS FPS of Video. (Default=30)
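For example, to run the synchronous script against an MP4 file instead of a camera (the file name below is just a placeholder):
$ python3 pose_camera_single_tpu.py --videofile soccer.mp4 --vidfps 30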
PiCamera version (asynchronous multi-TPU). Note that Multi-TPU does not work correctly over USB 2.0.
import argparse
import sys
import numpy as np
import cv2
import time
from PIL import Image
from time import sleep
import multiprocessing as mp
from edgetpu.basic import edgetpu_utils
from pose_engine import PoseEngine
from imutils.video.pivideostream import PiVideoStream
from imutils.video.filevideostream import FileVideoStream
import imutils
lastresults = None
processes = []
frameBuffer = None
results = None
fps = ""
detectfps = ""
framecount = 0
detectframecount = 0
time1 = 0
time2 = 0
EDGES = (
('nose', 'left eye'),
('nose', 'right eye'),
('nose', 'left ear'),
('nose', 'right ear'),
('left ear', 'left eye'),
('right ear', 'right eye'),
('left eye', 'right eye'),
('left shoulder', 'right shoulder'),
('left shoulder', 'left elbow'),
('left shoulder', 'left hip'),
('right shoulder', 'right elbow'),
('right shoulder', 'right hip'),
('left elbow', 'left wrist'),
('right elbow', 'right wrist'),
('left hip', 'right hip'),
('left hip', 'left knee'),
('right hip', 'right knee'),
('left knee', 'left ankle'),
('right knee', 'right ankle'),
)
def camThread(results, frameBuffer, camera_width, camera_height, model_width, model_height, vidfps, video_file_path):
global fps
global detectfps
global framecount
global detectframecount
global time1
global time2
global lastresults
global cam
global window_name
global vs
if video_file_path != "":
vs = FileVideoStream(video_file_path).start()
window_name = "Movie File"
else:
vs = PiVideoStream((camera_width, camera_height), vidfps).start()
window_name = "PiCamera"
time.sleep(2)
cv2.namedWindow(window_name, cv2.WINDOW_AUTOSIZE)
while True:
t1 = time.perf_counter()
color_image = vs.read()
if frameBuffer.full():
frameBuffer.get()
frames = cv2.resize(color_image, (model_width, model_height))
frameBuffer.put(frames.copy())
res = None
if not results.empty():
res = results.get(False)
detectframecount += 1
imdraw = overlay_on_image(frames, res, model_width, model_height)
lastresults = res
else:
imdraw = overlay_on_image(frames, lastresults, model_width, model_height)
cv2.imshow(window_name, imdraw)
if cv2.waitKey(1)&0xFF == ord('q'):
break
# FPS calculation
framecount += 1
if framecount >= 15:
fps = "(Playback) {:.1f} FPS".format(time1/15)
detectfps = "(Detection) {:.1f} FPS".format(detectframecount/time2)
framecount = 0
detectframecount = 0
time1 = 0
time2 = 0
t2 = time.perf_counter()
elapsedTime = t2-t1
time1 += 1/elapsedTime
time2 += elapsedTime
def inferencer(results, frameBuffer, model, camera_width, camera_height):
engine = None
# Acquisition of TPU list without model assignment
devices = edgetpu_utils.ListEdgeTpuPaths(edgetpu_utils.EDGE_TPU_STATE_UNASSIGNED)
devopen = False
for device in devices:
try:
engine = PoseEngine(model)
devopen = True
break
except:
continue
if devopen == False:
print("TPU Devices open Error!!!")
sys.exit(1)
print("Loaded Graphs!!! ")
while True:
if frameBuffer.empty():
continue
# Run inference.
color_image = frameBuffer.get()
prepimg = color_image[:, :, ::-1].copy()
tinf = time.perf_counter()
result, inference_time = engine.DetectPosesInImage(prepimg)
print(time.perf_counter() - tinf, "sec")
results.put(result)
def draw_pose(img, pose, threshold=0.2):
xys = {}
for label, keypoint in pose.keypoints.items():
if keypoint.score < threshold: continue
xys[label] = (int(keypoint.yx[1]), int(keypoint.yx[0]))
img = cv2.circle(img, (int(keypoint.yx[1]), int(keypoint.yx[0])), 5, (0, 255, 0), -1)
for a, b in EDGES:
if a not in xys or b not in xys: continue
ax, ay = xys[a]
bx, by = xys[b]
img = cv2.line(img, (ax, ay), (bx, by), (0, 255, 255), 2)
def overlay_on_image(frames, result, model_width, model_height):
color_image = frames
if isinstance(result, type(None)):
return color_image
img_cp = color_image.copy()
for pose in result:
draw_pose(img_cp, pose)
cv2.putText(img_cp, fps, (model_width-170,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38,0,255), 1, cv2.LINE_AA)
cv2.putText(img_cp, detectfps, (model_width-170,30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38,0,255), 1, cv2.LINE_AA)
return img_cp
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="models/posenet_mobilenet_v1_075_481_641_quant_decoder_edgetpu.tflite", help="Path of the detection model.")
parser.add_argument('--videofile', default="", help='Path to input video file. (Default="")')
parser.add_argument('--vidfps', type=int, default=30, help='FPS of Video. (Default=30)')
args = parser.parse_args()
model = args.model
video_file_path = args.videofile
vidfps = args.vidfps
camera_width = 320
camera_height = 240
model_width = 640
model_height = 480
try:
mp.set_start_method('forkserver')
frameBuffer = mp.Queue(3)
results = mp.Queue()
# Start streaming
p = mp.Process(target=camThread,
args=(results, frameBuffer, camera_width, camera_height, model_width, model_height, vidfps, video_file_path),
daemon=True)
p.start()
processes.append(p)
# Activation of inferencer
devices = edgetpu_utils.ListEdgeTpuPaths(edgetpu_utils.EDGE_TPU_STATE_UNASSIGNED)
for devnum in range(len(devices)):
p = mp.Process(target=inferencer,
args=(results, frameBuffer, model, model_width, model_height),
daemon=True)
sleep(5)
p.start()
processes.append(p)
while True:
sleep(1)
finally:
for p in range(len(processes)):
processes[p].terminate()