#◆ 前回記事
CPU単体で無理やり tiny-YoloV3 OpenVINO [60 FPS / CPU only] 今度こそ絶対速いと感じるに違いない、というか、速すぎです 【その4】
#◆ はじめに
今回は、ようやく tiny-YoloV3 の Python + OpenVINO 実装です。
前回は C++ による実装でした。
本記事からは RaspberryPi3 上でも動作します。
コメント部の入れ替えをすることで、NCS および NCS2 にも対応します。
#◆ 環境
- Ubuntu 16.04 x86_64 or RaspberryPi3 (Raspbian Stretch)
- OpenVINO toolkit 2018 R5 (2018.5.445)
- Python 3.5
- OpenCV 4.0.1-openvino
- tiny-YoloV3 (MS-COCO)
#◆ 実装
最終目標は MultiProcess + NCS2のMultiStick + RaspberryPi3 です。
import sys, os, cv2, time
import numpy as np, math
from argparse import ArgumentParser
from openvino.inference_engine import IENetwork, IEPlugin
# Side length (pixels) of the square image fed to the network; each captured
# frame is resized to (m_input_size, m_input_size) before inference.
m_input_size = 416
# Output grid sizes that ParseYOLOV3Output compares against blob.shape[2]
# to select the anchor offset.
yolo_scale_13 = 13
yolo_scale_26 = 26
yolo_scale_52 = 52
# Number of class scores per anchor (matches the 80 entries in LABELS).
classes = 80
# Number of box entries per anchor (x, y, width, height).
coords = 4
# Number of anchors evaluated per grid cell.
num = 3
# Flat list of anchor sizes stored as (width, height) pairs; a length of 12
# selects the tiny-YoloV3 branch in ParseYOLOV3Output.
anchors = [10,14, 23,27, 37,58, 81,82, 135,169, 344,319]
# Class names indexed by class id.
LABELS = ("person", "bicycle", "car", "motorbike", "aeroplane",
"bus", "train", "truck", "boat", "traffic light",
"fire hydrant", "stop sign", "parking meter", "bench", "bird",
"cat", "dog", "horse", "sheep", "cow",
"elephant", "bear", "zebra", "giraffe", "backpack",
"umbrella", "handbag", "tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball", "kite", "baseball bat",
"baseball glove", "skateboard", "surfboard","tennis racket", "bottle",
"wine glass", "cup", "fork", "knife", "spoon",
"bowl", "banana", "apple", "sandwich", "orange",
"broccoli", "carrot", "hot dog", "pizza", "donut",
"cake", "chair", "sofa", "pottedplant", "bed",
"diningtable", "toilet", "tvmonitor", "laptop", "mouse",
"remote", "keyboard", "cell phone", "microwave", "oven",
"toaster", "sink", "refrigerator", "book", "clock",
"vase", "scissors", "teddy bear", "hair drier", "toothbrush")
# Colours passed to the OpenCV drawing calls.
label_text_color = (255, 255, 255)
# NOTE(review): not referenced anywhere in the visible code.
label_background_color = (125, 175, 75)
box_color = (255, 128, 0)
# Line thickness (pixels) of the detection rectangles.
box_thickness = 1
def build_argparser():
    """Build the command-line parser for the demo.

    Returns:
        ArgumentParser: parser with a single option, ``-d`` / ``--device``
        (string, default ``"CPU"``), naming the inference target device.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "-d", "--device",
        # Implicit concatenation replaces the backslash continuation inside
        # the literal, so source indentation can never leak into the text.
        help="Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. "
             "Sample will look for a suitable plugin for device specified (CPU by default)",
        default="CPU",
        type=str)
    return parser
def EntryIndex(side, lcoords, lclasses, location, entry):
    """Return the flat index of one entry of one anchor at one grid cell.

    Args:
        side: Grid side length; the grid holds ``side * side`` cells.
        lcoords: Number of box-coordinate entries per anchor.
        lclasses: Number of class entries per anchor.
        location: ``anchor * side * side + cell`` (non-negative integer).
        entry: Index of the wanted entry within the anchor's
            ``lcoords + lclasses + 1`` entries.

    Returns:
        int: Offset into the flattened output blob.
    """
    side_square = side * side
    # Integer floor division keeps the arithmetic exact; the callers only
    # pass non-negative integers, for which this matches truncation.
    n = location // side_square
    loc = location % side_square
    return int(n * side_square * (lcoords + lclasses + 1) + entry * side_square + loc)
class DetectionObject():
    """Axis-aligned bounding box of one detection, in output-image pixels.

    The constructor receives a box as centre/size and stores it as integer
    corner coordinates after scaling.
    """

    # Class-level defaults, kept for backward compatibility; every instance
    # overwrites them in __init__.
    xmin = 0
    ymin = 0
    xmax = 0
    ymax = 0
    class_id = 0
    confidence = 0.0

    def __init__(self, x, y, h, w, class_id, confidence, h_scale, w_scale):
        """Store the scaled corner coordinates of a centre/size box.

        Args:
            x, y: Box centre.
            h, w: Box height and width (note the order: height first).
            class_id: Index of the detected class.
            confidence: Detection score.
            h_scale, w_scale: Factors applied to vertical / horizontal
                values to map them to the output image.
        """
        self.xmin = int((x - w / 2) * w_scale)
        self.ymin = int((y - h / 2) * h_scale)
        # The far corner is derived from the already-truncated near corner.
        self.xmax = int(self.xmin + w * w_scale)
        self.ymax = int(self.ymin + h * h_scale)
        self.class_id = class_id
        self.confidence = confidence

    def __repr__(self):
        return "DetectionObject(xmin={}, ymin={}, xmax={}, ymax={}, class_id={}, confidence={})".format(
            self.xmin, self.ymin, self.xmax, self.ymax, self.class_id, self.confidence)
def IntersectionOverUnion(box_1, box_2):
    """Return the intersection-over-union ratio of two boxes.

    Args:
        box_1, box_2: Objects exposing ``xmin``, ``ymin``, ``xmax`` and
            ``ymax`` attributes.

    Returns:
        Overlap area divided by union area; ``0.0`` when the boxes do not
        overlap or when the union has no area.
    """
    width_of_overlap_area = min(box_1.xmax, box_2.xmax) - max(box_1.xmin, box_2.xmin)
    height_of_overlap_area = min(box_1.ymax, box_2.ymax) - max(box_1.ymin, box_2.ymin)
    if width_of_overlap_area < 0.0 or height_of_overlap_area < 0.0:
        # Disjoint boxes. Without this branch two negative extents would
        # multiply into a bogus positive overlap.
        area_of_overlap = 0.0
    else:
        area_of_overlap = width_of_overlap_area * height_of_overlap_area
    box_1_area = (box_1.ymax - box_1.ymin) * (box_1.xmax - box_1.xmin)
    box_2_area = (box_2.ymax - box_2.ymin) * (box_2.xmax - box_2.xmin)
    area_of_union = box_1_area + box_2_area - area_of_overlap
    if area_of_union <= 0.0:
        # Degenerate (zero-area) boxes: avoid dividing by zero.
        return 0.0
    return area_of_overlap / area_of_union
def ParseYOLOV3Output(blob, resized_im_h, resized_im_w, original_im_h, original_im_w, threshold, objects):
    """Decode one YOLO output blob into DetectionObject instances.

    Args:
        blob: Output array of one detection layer. ``blob.shape[2]`` is used
            as the grid side length.
            NOTE(review): assumes the flattened layout is
            (anchor, entry, row, col) with ``coords + 1 + classes`` entries
            per anchor, and that scores are already probabilities (no
            sigmoid is applied here) - confirm against the model.
        resized_im_h, resized_im_w: Network input size; box centres are
            expressed in this coordinate space.
        original_im_h, original_im_w: Size of the frame the boxes are drawn
            on; boxes are scaled by ``original / resized``.
        threshold: Minimum objectness score, and minimum
            ``objectness * class score``, for a detection to be kept.
        objects: List that accepted detections are appended to.

    Returns:
        The same ``objects`` list, extended in place.
    """
    side = blob.shape[2]

    # Offset into the flat (width, height) anchor list for this grid size.
    # Grid sizes not listed fall back to offset 0.
    if len(anchors) == 12:  ## tiny-YoloV3
        anchor_offsets = {yolo_scale_13: 2 * 3, yolo_scale_26: 2 * 0}
    else:  ## YoloV3 (18 anchors) and any other anchor count
        anchor_offsets = {yolo_scale_13: 2 * 6, yolo_scale_26: 2 * 3, yolo_scale_52: 2 * 0}
    anchor_offset = anchor_offsets.get(side, 0)

    side_square = side * side
    output_blob = blob.flatten()
    h_scale = original_im_h / resized_im_h
    w_scale = original_im_w / resized_im_w

    for i in range(side_square):
        row, col = divmod(i, side)
        for n in range(num):
            location = n * side_square + i
            obj_index = EntryIndex(side, coords, classes, location, coords)
            box_index = EntryIndex(side, coords, classes, location, 0)
            scale = output_blob[obj_index]
            if scale < threshold:
                # Objectness too low: skip this anchor entirely.
                continue
            x = (col + output_blob[box_index + 0 * side_square]) / side * resized_im_w
            y = (row + output_blob[box_index + 1 * side_square]) / side * resized_im_h
            height = math.exp(output_blob[box_index + 3 * side_square]) * anchors[anchor_offset + 2 * n + 1]
            width = math.exp(output_blob[box_index + 2 * side_square]) * anchors[anchor_offset + 2 * n]
            for j in range(classes):
                class_index = EntryIndex(side, coords, classes, location, coords + 1 + j)
                prob = scale * output_blob[class_index]
                if prob < threshold:
                    continue
                objects.append(DetectionObject(x, y, height, width, j, prob, h_scale, w_scale))
    return objects
def main_IE_infer():
    """Run the single-threaded camera -> tiny-YoloV3 -> display loop.

    Frames are read from camera index 1, resized to the network input size,
    run through the Inference Engine, filtered for overlapping boxes and
    shown in a window titled "Result". Press ``q`` to quit.

    Returns:
        None, which the entry point maps to exit status 0.
    """
    camera_width = 320
    camera_height = 240
    fps = ""
    # Only used by the optional video-file playback path (commented out below).
    framepos = 0
    frame_count = 0
    vidfps = 0
    skip_frame = 0
    elapsedTime = 0
    detection_threshold = 0.2
    iou_threshold = 0.4

    args = build_argparser().parse_args()
    model_xml = "lrmodels/tiny-YoloV3/FP32/frozen_tiny_yolo_v3.xml" #<--- CPU
    #model_xml = "lrmodels/tiny-YoloV3/FP16/frozen_tiny_yolo_v3.xml" #<--- MYRIAD
    model_bin = os.path.splitext(model_xml)[0] + ".bin"

    cap = cv2.VideoCapture(1)
    cap.set(cv2.CAP_PROP_FPS, 30)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, camera_width)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, camera_height)

    #cap = cv2.VideoCapture("data/input/testvideo.mp4")
    #camera_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    #camera_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    #frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    #vidfps = int(cap.get(cv2.CAP_PROP_FPS))
    #print("videosFrameCount =", str(frame_count))
    #print("videosFPS =", str(vidfps))

    plugin = IEPlugin(device=args.device)
    if "CPU" in args.device:
        # NOTE(review): the body of this branch was missing from the pasted
        # source. Loading the CPU extension library is the usual step here;
        # confirm the library path for your installation.
        plugin.add_cpu_extension("lib/libcpu_extension.so")
    net = IENetwork(model=model_xml, weights=model_bin)
    input_blob = next(iter(net.inputs))
    exec_net = plugin.load(network=net)

    try:
        while cap.isOpened():
            t1 = time.time()

            ## Uncomment only when playing video files
            #cap.set(cv2.CAP_PROP_POS_FRAMES, framepos)

            ret, image = cap.read()
            if not ret:
                # Camera unplugged or end of stream.
                break

            prepimg = cv2.resize(image, (m_input_size, m_input_size))
            prepimg = prepimg[np.newaxis, :, :, :]     # Batch size axis add
            prepimg = prepimg.transpose((0, 3, 1, 2))  # NHWC to NCHW
            outputs = exec_net.infer(inputs={input_blob: prepimg})

            #output_name = detector/yolo-v3-tiny/Conv_12/BiasAdd/YoloRegion
            #output_name = detector/yolo-v3-tiny/Conv_9/BiasAdd/YoloRegion

            objects = []
            for output in outputs.values():
                objects = ParseYOLOV3Output(output, m_input_size, m_input_size, camera_height, camera_width, detection_threshold, objects)

            # Filtering overlapping boxes: a box that overlaps an earlier,
            # still-active box too much is disabled by zeroing its confidence.
            objlen = len(objects)
            for i in range(objlen):
                if objects[i].confidence == 0.0:
                    continue
                for j in range(i + 1, objlen):
                    if IntersectionOverUnion(objects[i], objects[j]) >= iou_threshold:
                        objects[j].confidence = 0

            # Drawing boxes
            for obj in objects:
                if obj.confidence <= detection_threshold:
                    continue
                label_text = LABELS[obj.class_id] + " (" + "{:.1f}".format(obj.confidence * 100) + "%)"
                cv2.rectangle(image, (obj.xmin, obj.ymin), (obj.xmax, obj.ymax), box_color, box_thickness)
                cv2.putText(image, label_text, (obj.xmin, obj.ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, label_text_color, 1)

            cv2.putText(image, fps, (camera_width - 170, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38, 0, 255), 1, cv2.LINE_AA)
            cv2.imshow("Result", image)

            if cv2.waitKey(1)&0xFF == ord('q'):
                break

            elapsedTime = time.time() - t1
            if elapsedTime > 0:
                # Guard against a zero interval on coarse clocks.
                fps = "(Playback) {:.1f} FPS".format(1/elapsedTime)

            ## frame skip, video file only
            #skip_frame = int((vidfps - int(1/elapsedTime)) / int(1/elapsedTime))
            #framepos += skip_frame
    finally:
        # Release the camera and close the window even if the loop raises.
        cap.release()
        cv2.destroyAllWindows()

    del net
    del exec_net
    del plugin
# Script entry point; main_IE_infer() returns None, which maps to exit status 0.
if __name__ == '__main__':
    sys.exit(main_IE_infer() or 0)
#◆ 結果
<tiny-YoloV3 (tiny), Intel Core i7-8750H, USB Camera, CPU Only, 30 FPS>
#◆ 最後に
- ありきたりね。
- スピードがかなり速いですが、精度は極悪です。
- USB Cameraのフレームレートを 30 FPS にセーブしているため上限速度が 30 FPS になっていますが、USB Cameraのフレームレートを上げれば、60 FPS ぐらいは余裕で出ます。
- MobileNet-SSDの実装のほうが速度と精度のバランスがとれています。
#◆ 次回予告
前回の予告を無視してしまいましたので、次回こそ MultiStick NCS2 にチャレンジします。
連休に入りましたし、せっかくなので MobileNet-SSD と tiny-YoloV3 を同時に実装しようと思います。
Tiny-YoloV3 OpenVINO [30 FPS / CPU only] Python implementation version forcibly with CPU alone [Part 5]
#◆ Previous article
Forcibly running tiny-YoloV3 on the CPU alone with OpenVINO [60 FPS / CPU only] - this time you will definitely feel it is fast; in fact, it is too fast [Part 4]
#◆ Introduction
This time, it is finally the Python + OpenVINO
implementation of tiny-YoloV3.
Last time, the implementation was in C++.
Starting with this article, the code also runs on RaspberryPi3.
By swapping which lines are commented out, it also supports NCS and NCS2.
#◆ Environment
- Ubuntu 16.04 x86_64 or RaspberryPi3 (Raspbian Stretch)
- OpenVINO toolkit 2018 R5 (2018.5.445)
- Python 3.5
- OpenCV 4.0.1-openvino
- tiny-YoloV3 (MS-COCO)
#◆ Implementation
Because it is single-threaded, I implemented it very roughly.
The final goal is MultiProcess + NCS2 MultiStick + RaspberryPi3.
Because this is only an intermediate verification step, I omit all explanation of the tedious logic.
import sys, os, cv2, time
import numpy as np, math
from argparse import ArgumentParser
from openvino.inference_engine import IENetwork, IEPlugin
# Side length (pixels) of the square image fed to the network; each captured
# frame is resized to (m_input_size, m_input_size) before inference.
m_input_size = 416
# Output grid sizes that ParseYOLOV3Output compares against blob.shape[2]
# to select the anchor offset.
yolo_scale_13 = 13
yolo_scale_26 = 26
yolo_scale_52 = 52
# Number of class scores per anchor (matches the 80 entries in LABELS).
classes = 80
# Number of box entries per anchor (x, y, width, height).
coords = 4
# Number of anchors evaluated per grid cell.
num = 3
# Flat list of anchor sizes stored as (width, height) pairs; a length of 12
# selects the tiny-YoloV3 branch in ParseYOLOV3Output.
anchors = [10,14, 23,27, 37,58, 81,82, 135,169, 344,319]
# Class names indexed by class id.
LABELS = ("person", "bicycle", "car", "motorbike", "aeroplane",
"bus", "train", "truck", "boat", "traffic light",
"fire hydrant", "stop sign", "parking meter", "bench", "bird",
"cat", "dog", "horse", "sheep", "cow",
"elephant", "bear", "zebra", "giraffe", "backpack",
"umbrella", "handbag", "tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball", "kite", "baseball bat",
"baseball glove", "skateboard", "surfboard","tennis racket", "bottle",
"wine glass", "cup", "fork", "knife", "spoon",
"bowl", "banana", "apple", "sandwich", "orange",
"broccoli", "carrot", "hot dog", "pizza", "donut",
"cake", "chair", "sofa", "pottedplant", "bed",
"diningtable", "toilet", "tvmonitor", "laptop", "mouse",
"remote", "keyboard", "cell phone", "microwave", "oven",
"toaster", "sink", "refrigerator", "book", "clock",
"vase", "scissors", "teddy bear", "hair drier", "toothbrush")
# Colours passed to the OpenCV drawing calls.
label_text_color = (255, 255, 255)
# NOTE(review): not referenced anywhere in the visible code.
label_background_color = (125, 175, 75)
box_color = (255, 128, 0)
# Line thickness (pixels) of the detection rectangles.
box_thickness = 1
def build_argparser():
    """Build the command-line parser for the demo.

    Returns:
        ArgumentParser: parser with a single option, ``-d`` / ``--device``
        (string, default ``"CPU"``), naming the inference target device.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "-d", "--device",
        # Implicit concatenation replaces the backslash continuation inside
        # the literal, so source indentation can never leak into the text.
        help="Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. "
             "Sample will look for a suitable plugin for device specified (CPU by default)",
        default="CPU",
        type=str)
    return parser
def EntryIndex(side, lcoords, lclasses, location, entry):
    """Return the flat index of one entry of one anchor at one grid cell.

    Args:
        side: Grid side length; the grid holds ``side * side`` cells.
        lcoords: Number of box-coordinate entries per anchor.
        lclasses: Number of class entries per anchor.
        location: ``anchor * side * side + cell`` (non-negative integer).
        entry: Index of the wanted entry within the anchor's
            ``lcoords + lclasses + 1`` entries.

    Returns:
        int: Offset into the flattened output blob.
    """
    side_square = side * side
    # Integer floor division keeps the arithmetic exact; the callers only
    # pass non-negative integers, for which this matches truncation.
    n = location // side_square
    loc = location % side_square
    return int(n * side_square * (lcoords + lclasses + 1) + entry * side_square + loc)
class DetectionObject():
    """Axis-aligned bounding box of one detection, in output-image pixels.

    The constructor receives a box as centre/size and stores it as integer
    corner coordinates after scaling.
    """

    # Class-level defaults, kept for backward compatibility; every instance
    # overwrites them in __init__.
    xmin = 0
    ymin = 0
    xmax = 0
    ymax = 0
    class_id = 0
    confidence = 0.0

    def __init__(self, x, y, h, w, class_id, confidence, h_scale, w_scale):
        """Store the scaled corner coordinates of a centre/size box.

        Args:
            x, y: Box centre.
            h, w: Box height and width (note the order: height first).
            class_id: Index of the detected class.
            confidence: Detection score.
            h_scale, w_scale: Factors applied to vertical / horizontal
                values to map them to the output image.
        """
        self.xmin = int((x - w / 2) * w_scale)
        self.ymin = int((y - h / 2) * h_scale)
        # The far corner is derived from the already-truncated near corner.
        self.xmax = int(self.xmin + w * w_scale)
        self.ymax = int(self.ymin + h * h_scale)
        self.class_id = class_id
        self.confidence = confidence

    def __repr__(self):
        return "DetectionObject(xmin={}, ymin={}, xmax={}, ymax={}, class_id={}, confidence={})".format(
            self.xmin, self.ymin, self.xmax, self.ymax, self.class_id, self.confidence)
def IntersectionOverUnion(box_1, box_2):
    """Return the intersection-over-union ratio of two boxes.

    Args:
        box_1, box_2: Objects exposing ``xmin``, ``ymin``, ``xmax`` and
            ``ymax`` attributes.

    Returns:
        Overlap area divided by union area; ``0.0`` when the boxes do not
        overlap or when the union has no area.
    """
    width_of_overlap_area = min(box_1.xmax, box_2.xmax) - max(box_1.xmin, box_2.xmin)
    height_of_overlap_area = min(box_1.ymax, box_2.ymax) - max(box_1.ymin, box_2.ymin)
    if width_of_overlap_area < 0.0 or height_of_overlap_area < 0.0:
        # Disjoint boxes. Without this branch two negative extents would
        # multiply into a bogus positive overlap.
        area_of_overlap = 0.0
    else:
        area_of_overlap = width_of_overlap_area * height_of_overlap_area
    box_1_area = (box_1.ymax - box_1.ymin) * (box_1.xmax - box_1.xmin)
    box_2_area = (box_2.ymax - box_2.ymin) * (box_2.xmax - box_2.xmin)
    area_of_union = box_1_area + box_2_area - area_of_overlap
    if area_of_union <= 0.0:
        # Degenerate (zero-area) boxes: avoid dividing by zero.
        return 0.0
    return area_of_overlap / area_of_union
def ParseYOLOV3Output(blob, resized_im_h, resized_im_w, original_im_h, original_im_w, threshold, objects):
    """Decode one YOLO output blob into DetectionObject instances.

    Args:
        blob: Output array of one detection layer. ``blob.shape[2]`` is used
            as the grid side length.
            NOTE(review): assumes the flattened layout is
            (anchor, entry, row, col) with ``coords + 1 + classes`` entries
            per anchor, and that scores are already probabilities (no
            sigmoid is applied here) - confirm against the model.
        resized_im_h, resized_im_w: Network input size; box centres are
            expressed in this coordinate space.
        original_im_h, original_im_w: Size of the frame the boxes are drawn
            on; boxes are scaled by ``original / resized``.
        threshold: Minimum objectness score, and minimum
            ``objectness * class score``, for a detection to be kept.
        objects: List that accepted detections are appended to.

    Returns:
        The same ``objects`` list, extended in place.
    """
    side = blob.shape[2]

    # Offset into the flat (width, height) anchor list for this grid size.
    # Grid sizes not listed fall back to offset 0.
    if len(anchors) == 12:  ## tiny-YoloV3
        anchor_offsets = {yolo_scale_13: 2 * 3, yolo_scale_26: 2 * 0}
    else:  ## YoloV3 (18 anchors) and any other anchor count
        anchor_offsets = {yolo_scale_13: 2 * 6, yolo_scale_26: 2 * 3, yolo_scale_52: 2 * 0}
    anchor_offset = anchor_offsets.get(side, 0)

    side_square = side * side
    output_blob = blob.flatten()
    h_scale = original_im_h / resized_im_h
    w_scale = original_im_w / resized_im_w

    for i in range(side_square):
        row, col = divmod(i, side)
        for n in range(num):
            location = n * side_square + i
            obj_index = EntryIndex(side, coords, classes, location, coords)
            box_index = EntryIndex(side, coords, classes, location, 0)
            scale = output_blob[obj_index]
            if scale < threshold:
                # Objectness too low: skip this anchor entirely.
                continue
            x = (col + output_blob[box_index + 0 * side_square]) / side * resized_im_w
            y = (row + output_blob[box_index + 1 * side_square]) / side * resized_im_h
            height = math.exp(output_blob[box_index + 3 * side_square]) * anchors[anchor_offset + 2 * n + 1]
            width = math.exp(output_blob[box_index + 2 * side_square]) * anchors[anchor_offset + 2 * n]
            for j in range(classes):
                class_index = EntryIndex(side, coords, classes, location, coords + 1 + j)
                prob = scale * output_blob[class_index]
                if prob < threshold:
                    continue
                objects.append(DetectionObject(x, y, height, width, j, prob, h_scale, w_scale))
    return objects
def main_IE_infer():
    """Run the single-threaded camera -> tiny-YoloV3 -> display loop.

    Frames are read from camera index 1, resized to the network input size,
    run through the Inference Engine, filtered for overlapping boxes and
    shown in a window titled "Result". Press ``q`` to quit.

    Returns:
        None, which the entry point maps to exit status 0.
    """
    camera_width = 320
    camera_height = 240
    fps = ""
    # Only used by the optional video-file playback path (commented out below).
    framepos = 0
    frame_count = 0
    vidfps = 0
    skip_frame = 0
    elapsedTime = 0
    detection_threshold = 0.2
    iou_threshold = 0.4

    args = build_argparser().parse_args()
    model_xml = "lrmodels/tiny-YoloV3/FP32/frozen_tiny_yolo_v3.xml" #<--- CPU
    #model_xml = "lrmodels/tiny-YoloV3/FP16/frozen_tiny_yolo_v3.xml" #<--- MYRIAD
    model_bin = os.path.splitext(model_xml)[0] + ".bin"

    cap = cv2.VideoCapture(1)
    cap.set(cv2.CAP_PROP_FPS, 30)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, camera_width)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, camera_height)

    #cap = cv2.VideoCapture("data/input/testvideo.mp4")
    #camera_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    #camera_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    #frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    #vidfps = int(cap.get(cv2.CAP_PROP_FPS))
    #print("videosFrameCount =", str(frame_count))
    #print("videosFPS =", str(vidfps))

    plugin = IEPlugin(device=args.device)
    if "CPU" in args.device:
        # NOTE(review): the body of this branch was missing from the pasted
        # source. Loading the CPU extension library is the usual step here;
        # confirm the library path for your installation.
        plugin.add_cpu_extension("lib/libcpu_extension.so")
    net = IENetwork(model=model_xml, weights=model_bin)
    input_blob = next(iter(net.inputs))
    exec_net = plugin.load(network=net)

    try:
        while cap.isOpened():
            t1 = time.time()

            ## Uncomment only when playing video files
            #cap.set(cv2.CAP_PROP_POS_FRAMES, framepos)

            ret, image = cap.read()
            if not ret:
                # Camera unplugged or end of stream.
                break

            prepimg = cv2.resize(image, (m_input_size, m_input_size))
            prepimg = prepimg[np.newaxis, :, :, :]     # Batch size axis add
            prepimg = prepimg.transpose((0, 3, 1, 2))  # NHWC to NCHW
            outputs = exec_net.infer(inputs={input_blob: prepimg})

            #output_name = detector/yolo-v3-tiny/Conv_12/BiasAdd/YoloRegion
            #output_name = detector/yolo-v3-tiny/Conv_9/BiasAdd/YoloRegion

            objects = []
            for output in outputs.values():
                objects = ParseYOLOV3Output(output, m_input_size, m_input_size, camera_height, camera_width, detection_threshold, objects)

            # Filtering overlapping boxes: a box that overlaps an earlier,
            # still-active box too much is disabled by zeroing its confidence.
            objlen = len(objects)
            for i in range(objlen):
                if objects[i].confidence == 0.0:
                    continue
                for j in range(i + 1, objlen):
                    if IntersectionOverUnion(objects[i], objects[j]) >= iou_threshold:
                        objects[j].confidence = 0

            # Drawing boxes
            for obj in objects:
                if obj.confidence <= detection_threshold:
                    continue
                label_text = LABELS[obj.class_id] + " (" + "{:.1f}".format(obj.confidence * 100) + "%)"
                cv2.rectangle(image, (obj.xmin, obj.ymin), (obj.xmax, obj.ymax), box_color, box_thickness)
                cv2.putText(image, label_text, (obj.xmin, obj.ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, label_text_color, 1)

            cv2.putText(image, fps, (camera_width - 170, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38, 0, 255), 1, cv2.LINE_AA)
            cv2.imshow("Result", image)

            if cv2.waitKey(1)&0xFF == ord('q'):
                break

            elapsedTime = time.time() - t1
            if elapsedTime > 0:
                # Guard against a zero interval on coarse clocks.
                fps = "(Playback) {:.1f} FPS".format(1/elapsedTime)

            ## frame skip, video file only
            #skip_frame = int((vidfps - int(1/elapsedTime)) / int(1/elapsedTime))
            #framepos += skip_frame
    finally:
        # Release the camera and close the window even if the loop raises.
        cap.release()
        cv2.destroyAllWindows()

    del net
    del exec_net
    del plugin
# Script entry point; main_IE_infer() returns None, which maps to exit status 0.
if __name__ == '__main__':
    sys.exit(main_IE_infer() or 0)
#◆ Result
<tiny-YoloV3 (tiny), Intel Core i7-8750H, USB Camera, CPU Only, 30 FPS>
#◆ Finally
- Speed is quite fast, but accuracy is extremely poor.
- Since the frame rate of the USB camera is capped at 30 FPS, the upper limit is 30 FPS; with a higher camera frame rate, it easily reaches around 60 FPS.
- The MobileNet-SSD implementation strikes a better balance between speed and accuracy.