An English version of this article is available below.
#◆ はじめに
前回、YoloV3 に関してはかなり残念な結果を出してしまいましたが、 早速、Intel の中の人から改善策の提案を受けまして、再実装しました。
年内はオブジェクト・ディテクションの検証はしない、とか言いながら、結局やっちゃいました。
今回はかなり実装を変更して中身がカオスの様相を呈していますので、細かい実装内容の説明は省略します。
動画の再生レートは 30 FPS
推論レートは 13 FPS
です。
前回の推論レートは 4 FPS
でした。
#◆ 実装
MultiProcess + MultiThread による折衷実装です。
OpenVINOのクセが強すぎてかなりトリッキーな実装です。
ここまでやる意味があるのか、と問われれば、あまり無いかもしれません。
Github を随時アップデートしています。
MultiProcess により、映像撮影のロジックと推論のロジックを分離しています。
また、MultiProcessで分離した推論ロジック側の内部で、さらに MultiThread化して推論を 「4本×4リクエスト=16並列」 にしています。
非同期 + 並列推論 の影響により表示がカクつくため、推論対象フレームの一部を強制的にオミットする実装も組み込みました。
もうね、我ながらアホですね。
import sys, os, cv2, time, heapq, argparse
import numpy as np, math
from openvino.inference_engine import IENetwork, IEPlugin
import multiprocessing as mp
from time import sleep
import threading
# YoloV3 output grid sizes for a 416x416 input (416/32, 416/16, 416/8).
yolo_scale_13 = 13
yolo_scale_26 = 26
yolo_scale_52 = 52

classes = 80   # number of COCO object classes
coords = 4     # box coordinates per prediction (x, y, w, h)
num = 3        # anchor boxes per grid cell
# Anchor (width, height) pairs, flattened; three pairs per output scale.
anchors = [10,13,16,30,33,23,30,61,62,45,59,119,116,90,156,198,373,326]

# COCO class labels, indexed by class id.
LABELS = ("person", "bicycle", "car", "motorbike", "aeroplane",
          "bus", "train", "truck", "boat", "traffic light",
          "fire hydrant", "stop sign", "parking meter", "bench", "bird",
          "cat", "dog", "horse", "sheep", "cow",
          "elephant", "bear", "zebra", "giraffe", "backpack",
          "umbrella", "handbag", "tie", "suitcase", "frisbee",
          "skis", "snowboard", "sports ball", "kite", "baseball bat",
          "baseball glove", "skateboard", "surfboard","tennis racket", "bottle",
          "wine glass", "cup", "fork", "knife", "spoon",
          "bowl", "banana", "apple", "sandwich", "orange",
          "broccoli", "carrot", "hot dog", "pizza", "donut",
          "cake", "chair", "sofa", "pottedplant", "bed",
          "diningtable", "toilet", "tvmonitor", "laptop", "mouse",
          "remote", "keyboard", "cell phone", "microwave", "oven",
          "toaster", "sink", "refrigerator", "book", "clock",
          "vase", "scissors", "teddy bear", "hair drier", "toothbrush")

# Drawing settings for the detection overlay.
label_text_color = (255, 255, 255)
label_background_color = (125, 175, 75)
box_color = (255, 128, 0)
box_thickness = 1

# Child-process handles collected by the __main__ block.
processes = []

# Globals mutated by camThread for FPS bookkeeping and result reuse.
fps = ""
detectfps = ""
framecount = 0
detectframecount = 0
time1 = 0
time2 = 0
lastresults = None
def EntryIndex(side, lcoords, lclasses, location, entry):
    """Return the flat index into a YOLO output blob.

    The blob is laid out as [anchor][entry][cell] where each anchor owns
    (lcoords + lclasses + 1) entries of side*side cells.  *location* encodes
    anchor index and cell offset combined (n * side*side + cell).

    Fix: use integer // and % instead of int(a / b), which goes through
    float division and can lose precision for large indices.
    """
    side_square = side * side
    n = location // side_square   # anchor index
    loc = location % side_square  # cell offset within the grid
    return n * side_square * (lcoords + lclasses + 1) + entry * side_square + loc
class DetectionObject():
    """A single detected box in original-image pixel coordinates."""

    # Class-level defaults; every instance overwrites these in __init__.
    xmin = 0
    ymin = 0
    xmax = 0
    ymax = 0
    class_id = 0
    confidence = 0.0

    def __init__(self, x, y, h, w, class_id, confidence, h_scale, w_scale):
        """Convert a center-based box (x, y, w, h) from resized-image space
        into integer corner coordinates scaled by h_scale / w_scale."""
        half_w = w / 2
        half_h = h / 2
        self.xmin = int((x - half_w) * w_scale)
        self.ymin = int((y - half_h) * h_scale)
        self.xmax = int(self.xmin + w * w_scale)
        self.ymax = int(self.ymin + h * h_scale)
        self.class_id = class_id
        self.confidence = confidence
def IntersectionOverUnion(box_1, box_2):
    """Return the intersection-over-union ratio of two boxes.

    Boxes are any objects exposing xmin/ymin/xmax/ymax attributes.
    """
    overlap_w = min(box_1.xmax, box_2.xmax) - max(box_1.xmin, box_2.xmin)
    overlap_h = min(box_1.ymax, box_2.ymax) - max(box_1.ymin, box_2.ymin)
    # Negative extent in either axis means the boxes do not intersect.
    area_of_overlap = overlap_w * overlap_h if overlap_w >= 0.0 and overlap_h >= 0.0 else 0.0
    box_1_area = (box_1.xmax - box_1.xmin) * (box_1.ymax - box_1.ymin)
    box_2_area = (box_2.xmax - box_2.xmin) * (box_2.ymax - box_2.ymin)
    return area_of_overlap / (box_1_area + box_2_area - area_of_overlap)
def ParseYOLOV3Output(blob, resized_im_h, resized_im_w, original_im_h, original_im_w, threshold, objects):
    """Decode one YoloV3 output blob into DetectionObject instances.

    Appends every detection whose class probability exceeds *threshold*
    to *objects* and returns the same list.
    """
    side = blob.shape[2]
    # Each output scale uses its own triple of anchor (w, h) pairs.
    anchor_offset = {yolo_scale_13: 12, yolo_scale_26: 6, yolo_scale_52: 0}.get(side, 0)

    side_square = side * side
    flat = blob.flatten()
    h_scale = original_im_h / resized_im_h
    w_scale = original_im_w / resized_im_w

    for cell in range(side_square):
        row, col = divmod(cell, side)
        for n in range(num):
            obj_index = EntryIndex(side, coords, classes, n * side_square + cell, coords)
            box_index = EntryIndex(side, coords, classes, n * side_square + cell, 0)
            scale = flat[obj_index]
            if scale < threshold:
                continue
            # Box center in resized-image pixels; size from anchor priors.
            x = (col + flat[box_index + 0 * side_square]) / side * resized_im_w
            y = (row + flat[box_index + 1 * side_square]) / side * resized_im_h
            width = math.exp(flat[box_index + 2 * side_square]) * anchors[anchor_offset + 2 * n]
            height = math.exp(flat[box_index + 3 * side_square]) * anchors[anchor_offset + 2 * n + 1]
            for j in range(classes):
                class_index = EntryIndex(side, coords, classes, n * side_square + cell, coords + 1 + j)
                prob = scale * flat[class_index]
                if prob < threshold:
                    continue
                objects.append(DetectionObject(x, y, height, width, j, prob, h_scale, w_scale))
    return objects
def camThread(LABELS, results, frameBuffer, camera_width, camera_height, vidfps):
    """Capture/display process.

    Reads frames from a movie file, feeds copies to the inference process
    via *frameBuffer*, overlays the newest detections pulled from *results*
    (re-drawing the previous ones when no new result is ready), and shows
    the annotated frames until 'q' is pressed.

    NOTE(review): the camera_width / camera_height arguments are
    overwritten below from the movie file's own properties.
    """
    global fps
    global detectfps
    global lastresults
    global framecount
    global detectframecount
    global time1
    global time2
    global cam
    global window_name

    # USB-camera variant kept for reference; the movie-file input below is active.
    #cam = cv2.VideoCapture(0)
    #if cam.isOpened() != True:
    #    print("USB Camera Open Error!!!")
    #    sys.exit(0)
    #cam.set(cv2.CAP_PROP_FPS, vidfps)
    #cam.set(cv2.CAP_PROP_FRAME_WIDTH, camera_width)
    #cam.set(cv2.CAP_PROP_FRAME_HEIGHT, camera_height)
    #window_name = "USB Camera"
    #wait_key_time = 1

    cam = cv2.VideoCapture("data/input/testvideo4.mp4")
    camera_width = int(cam.get(cv2.CAP_PROP_FRAME_WIDTH))
    camera_height = int(cam.get(cv2.CAP_PROP_FRAME_HEIGHT))
    frame_count = int(cam.get(cv2.CAP_PROP_FRAME_COUNT))  # NOTE(review): unused
    window_name = "Movie File"
    wait_key_time = int(1000 / vidfps)  # ms to wait per frame -> playback rate

    cv2.namedWindow(window_name, cv2.WINDOW_AUTOSIZE)

    while True:
        t1 = time.perf_counter()

        # USB Camera Stream Read
        s, color_image = cam.read()
        if not s:
            continue
        # Drop the oldest queued frame so this producer never blocks.
        if frameBuffer.full():
            frameBuffer.get()
        height = color_image.shape[0]
        width = color_image.shape[1]
        frameBuffer.put(color_image.copy())

        if not results.empty():
            # Fresh detections available: draw them and remember for reuse.
            objects = results.get(False)
            detectframecount += 1
            for obj in objects:
                if obj.confidence < 0.2:
                    continue
                label = obj.class_id
                confidence = obj.confidence
                if confidence > 0.2:
                    label_text = LABELS[label] + " (" + "{:.1f}".format(confidence * 100) + "%)"
                    cv2.rectangle(color_image, (obj.xmin, obj.ymin), (obj.xmax, obj.ymax), box_color, box_thickness)
                    cv2.putText(color_image, label_text, (obj.xmin, obj.ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, label_text_color, 1)
            lastresults = objects
        else:
            # No new result this frame: re-draw the previous detections.
            if not isinstance(lastresults, type(None)):
                for obj in lastresults:
                    if obj.confidence < 0.2:
                        continue
                    label = obj.class_id
                    confidence = obj.confidence
                    if confidence > 0.2:
                        label_text = LABELS[label] + " (" + "{:.1f}".format(confidence * 100) + "%)"
                        cv2.rectangle(color_image, (obj.xmin, obj.ymin), (obj.xmax, obj.ymax), box_color, box_thickness)
                        cv2.putText(color_image, label_text, (obj.xmin, obj.ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, label_text_color, 1)

        # Overlay playback / detection FPS counters in the top-right corner.
        cv2.putText(color_image, fps, (width-170,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38,0,255), 1, cv2.LINE_AA)
        cv2.putText(color_image, detectfps, (width-170,30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38,0,255), 1, cv2.LINE_AA)
        cv2.imshow(window_name, cv2.resize(color_image, (width, height)))

        if cv2.waitKey(wait_key_time)&0xFF == ord('q'):
            sys.exit(0)

        ## Print FPS
        # Refresh both FPS strings every 15 displayed frames.
        framecount += 1
        if framecount >= 15:
            fps = "(Playback) {:.1f} FPS".format(time1/15)
            detectfps = "(Detection) {:.1f} FPS".format(detectframecount/time2)
            framecount = 0
            detectframecount = 0
            time1 = 0
            time2 = 0
        t2 = time.perf_counter()
        elapsedTime = t2-t1
        time1 += 1/elapsedTime   # accumulated instantaneous FPS
        time2 += elapsedTime     # accumulated wall time
def searchlist(l, x, notfoundvalue=-1):
    """Return the index of the first occurrence of *x* in *l*.

    Falls back to *notfoundvalue* (default -1) when *x* is absent.

    Fix: the original scanned the list twice (`in` then `index`);
    EAFP with list.index does a single scan.
    """
    try:
        return l.index(x)
    except ValueError:
        return notfoundvalue
def async_infer(ncsworker):
    """Thread entry point: drive one NcsWorker forever.

    Computes the worker's frame-skip budget once, then repeatedly runs
    one scheduler step of the asynchronous inference pipeline.
    """
    ncsworker.skip_frame_measurement()
    while True:
        ncsworker.predict_async()
class NcsWorker(object):
    """Runs asynchronous YoloV3 inference on one NCS (MYRIAD) device.

    Pulls frames from *frameBuffer*, keeps up to num_requests inference
    requests in flight, and pushes lists of DetectionObject to *results*
    in submission order.
    """

    def __init__(self, devid, frameBuffer, results, camera_width, camera_height, number_of_ncs, vidfps):
        self.devid = devid
        self.frameBuffer = frameBuffer
        self.model_xml = "./lrmodels/YoloV3/FP16/frozen_yolo_v3.xml"
        self.model_bin = "./lrmodels/YoloV3/FP16/frozen_yolo_v3.bin"
        self.camera_width = camera_width
        self.camera_height = camera_height
        self.m_input_size = 416                        # YoloV3 network input resolution
        self.threshould = 0.7                          # detection threshold (NOTE(review): sic, "threshold")
        self.num_requests = 4                          # in-flight async requests per device
        self.inferred_request = [0] * self.num_requests  # 1 while the request slot is busy
        self.heap_request = []                         # min-heap of (submit order, request id)
        self.inferred_cnt = 0                          # monotonically increasing submit counter
        self.plugin = IEPlugin(device="MYRIAD")
        self.net = IENetwork(model=self.model_xml, weights=self.model_bin)
        self.input_blob = next(iter(self.net.inputs))
        self.exec_net = self.plugin.load(network=self.net, num_requests=self.num_requests)
        self.results = results
        self.number_of_ncs = number_of_ncs
        self.predict_async_time = 800                  # assumed inference time (ms) per second of video — feeds skip_frame_measurement
        self.skip_frame = 0                            # frames to drop between inferences
        self.roop_frame = 0                            # frame-skip counter (NOTE(review): sic, "loop")
        self.vidfps = vidfps

    def image_preprocessing(self, color_image):
        """Resize to the network input size and reorder to NCHW with a batch axis."""
        prepimg = cv2.resize(color_image, (self.m_input_size, self.m_input_size))
        prepimg = prepimg[np.newaxis, :, :, :]  # Batch size axis add
        prepimg = prepimg.transpose((0, 3, 1, 2))  # NHWC to NCHW
        return prepimg

    def skip_frame_measurement(self):
        """Derive how many input frames to drop per request from the spare time budget."""
        surplustime_per_second = (1000 - self.predict_async_time)
        if surplustime_per_second > 0.0:
            frame_per_millisecond = (1000 / self.vidfps)
            total_skip_frame = surplustime_per_second / frame_per_millisecond
            self.skip_frame = int(total_skip_frame / self.num_requests)
        else:
            self.skip_frame = 0

    def predict_async(self):
        """One scheduler step: submit a frame when a request slot is free and
        collect the oldest finished request's detections (submission order)."""
        try:
            if self.frameBuffer.empty():
                return

            # Frame skipping: discard frames to keep the display smooth.
            self.roop_frame += 1
            if self.roop_frame <= self.skip_frame:
                self.frameBuffer.get()
                return
            self.roop_frame = 0

            prepimg = self.image_preprocessing(self.frameBuffer.get())
            reqnum = searchlist(self.inferred_request, 0)  # first free slot, -1 if all busy

            if reqnum > -1:
                self.exec_net.start_async(request_id=reqnum, inputs={self.input_blob: prepimg})
                self.inferred_request[reqnum] = 1
                self.inferred_cnt += 1
                if self.inferred_cnt == sys.maxsize:
                    # Reset bookkeeping before the submit counter could wrap.
                    self.inferred_request = [0] * self.num_requests
                    self.heap_request = []
                    self.inferred_cnt = 0
                heapq.heappush(self.heap_request, (self.inferred_cnt, reqnum))

            # Peek at the oldest outstanding request.
            # NOTE(review): heappop raises IndexError when the heap is empty;
            # that case is swallowed by the bare except below.
            cnt, dev = heapq.heappop(self.heap_request)

            if self.exec_net.requests[dev].wait(0) == 0:
                self.exec_net.requests[dev].wait(-1)

                objects = []
                outputs = self.exec_net.requests[dev].outputs
                for output in outputs.values():
                    objects = ParseYOLOV3Output(output, self.m_input_size, self.m_input_size, self.camera_height, self.camera_width, self.threshould, objects)

                # Greedy non-maximum suppression: zero out confidence of any
                # later box overlapping an earlier one at IoU >= 0.4.
                objlen = len(objects)
                for i in range(objlen):
                    if (objects[i].confidence == 0.0):
                        continue
                    for j in range(i + 1, objlen):
                        if (IntersectionOverUnion(objects[i], objects[j]) >= 0.4):
                            objects[j].confidence = 0

                self.results.put(objects)
                self.inferred_request[dev] = 0
            else:
                # Oldest request not finished yet: put it back and retry later.
                heapq.heappush(self.heap_request, (cnt, dev))
        except:
            # NOTE(review): bare except keeps the worker thread alive on any
            # error; consider narrowing to `except Exception`.
            import traceback
            traceback.print_exc()
def inferencer(results, frameBuffer, number_of_ncs, camera_width, camera_height, vidfps):
    """Spawn one inference thread per NCS device and block until they all finish.

    Each thread owns a dedicated NcsWorker bound to device id 0..number_of_ncs-1.
    """
    workers = []
    for devid in range(number_of_ncs):
        worker = threading.Thread(
            target=async_infer,
            args=(NcsWorker(devid, frameBuffer, results, camera_width, camera_height, number_of_ncs, vidfps),),
        )
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-numncs','--numberofncs',dest='number_of_ncs',type=int,default=1,help='Number of NCS. (Default=1)')
    args = parser.parse_args()
    number_of_ncs = args.number_of_ncs
    camera_width = 320
    camera_height = 240
    vidfps = 30

    try:
        # NOTE(review): 'forkserver' presumably avoids fork-related issues
        # with the device plugin state — confirm on the target platform.
        mp.set_start_method('forkserver')
        frameBuffer = mp.Queue(10)  # bounded: camThread drops frames when full
        results = mp.Queue()

        # Start detection MultiStick
        # Activation of inferencer
        p = mp.Process(target=inferencer, args=(results, frameBuffer, number_of_ncs, camera_width, camera_height, vidfps), daemon=True)
        p.start()
        processes.append(p)

        # Fixed wait (~7 s per stick) for device/model initialisation.
        sleep(number_of_ncs * 7)

        # Start streaming
        p = mp.Process(target=camThread, args=(LABELS, results, frameBuffer, camera_width, camera_height, vidfps), daemon=True)
        p.start()
        processes.append(p)

        # Parent just idles; children do all the work.
        while True:
            sleep(1)

    except:
        import traceback
        traceback.print_exc()

    finally:
        # Terminate every child process before exiting.
        for p in range(len(processes)):
            processes[p].terminate()
        print("\n\nFinished\n\n")
#◆ おわりに
- 面倒だったため RaspberryPi3 では動作検証していませんが、おそらく動作します。
- 今回は Core i7機 のみで検証していますので、ARM CPU では多少の性能劣化が発生します。
- 一応次回、 MobileNet-SSDのほうも数倍の性能改善を実施します。
- 大勢の方々は 「それでも全然遅い!」 と揶揄するでしょうが、私個人的な感覚では、ショボクレたデバイスで120MBを超えるモデルを10FPS以上のパフォーマンスで推論できるのは超速の域だと感じています。
- NCS2 より、 コチラ https://www.gyrfalcontech.ai/solutions/2801s/ が気になっています。 すさまじい推論性能です。
- TX2 の実に
2.5倍
の性能です。 - 中国から調達できますが、 https://ja.aliexpress.com/store/product/Orange-Pi-AI-Stick-2801-Neural-Network-Computing-Stick-Artificial-Intelligence/1553371_32954041998.html ドキュメントやチュートリアルが一切無いようです。 リスク込み込みでチャレンジ精神旺盛なあなたは挑戦してみてはいかがでしょうか?
#[13 FPS] NCS2 x 4 + Full size YoloV3 performance has been tripled
#◆ Previous article
[24 FPS] Boost RaspberryPi3 with four Neural Compute Stick 2 (NCS2) MobileNet-SSD / YoloV3 [48 FPS for Core i7]
#◆ Introduction
Last time, I got quite disappointing results with YoloV3, but immediately received suggestions for improvement from people in Intel and reimplemented it.
While talking about not verifying the object detection within the year, I eventually did it.
I changed the implementation considerably this time and the contents are showing the aspect of chaos so I omit explanation of detailed implementation contents.
The video playback rate is 30 FPS
The inference rate is 13 FPS
The inference rate of the previous article is 4 FPS
#◆ Implementation
MultiProcess + MultiThread
It is an eclectic implementation.
It's a pretty tricky implementation, as the OpenVINO API is very hard to handle.
If you ask whether it is worth going this far, the answer may well be "not really."
I am updating the GitHub repository from time to time.
MultiProcess separates the video-capture logic from the inference logic.
In addition, inside the inference process separated by MultiProcess, the work is further multi-threaded so that inference runs as "4 threads x 4 requests = 16 parallel streams".
Because asynchronous + parallel inference makes the display stutter, I also incorporated an implementation that forcibly omits some of the frames targeted for inference.
import sys, os, cv2, time, heapq, argparse
import numpy as np, math
from openvino.inference_engine import IENetwork, IEPlugin
import multiprocessing as mp
from time import sleep
import threading
# YoloV3 output grid sizes for a 416x416 input (416/32, 416/16, 416/8).
yolo_scale_13 = 13
yolo_scale_26 = 26
yolo_scale_52 = 52

classes = 80   # number of COCO object classes
coords = 4     # box coordinates per prediction (x, y, w, h)
num = 3        # anchor boxes per grid cell
# Anchor (width, height) pairs, flattened; three pairs per output scale.
anchors = [10,13,16,30,33,23,30,61,62,45,59,119,116,90,156,198,373,326]

# COCO class labels, indexed by class id.
LABELS = ("person", "bicycle", "car", "motorbike", "aeroplane",
          "bus", "train", "truck", "boat", "traffic light",
          "fire hydrant", "stop sign", "parking meter", "bench", "bird",
          "cat", "dog", "horse", "sheep", "cow",
          "elephant", "bear", "zebra", "giraffe", "backpack",
          "umbrella", "handbag", "tie", "suitcase", "frisbee",
          "skis", "snowboard", "sports ball", "kite", "baseball bat",
          "baseball glove", "skateboard", "surfboard","tennis racket", "bottle",
          "wine glass", "cup", "fork", "knife", "spoon",
          "bowl", "banana", "apple", "sandwich", "orange",
          "broccoli", "carrot", "hot dog", "pizza", "donut",
          "cake", "chair", "sofa", "pottedplant", "bed",
          "diningtable", "toilet", "tvmonitor", "laptop", "mouse",
          "remote", "keyboard", "cell phone", "microwave", "oven",
          "toaster", "sink", "refrigerator", "book", "clock",
          "vase", "scissors", "teddy bear", "hair drier", "toothbrush")

# Drawing settings for the detection overlay.
label_text_color = (255, 255, 255)
label_background_color = (125, 175, 75)
box_color = (255, 128, 0)
box_thickness = 1

# Child-process handles collected by the __main__ block.
processes = []

# Globals mutated by camThread for FPS bookkeeping and result reuse.
fps = ""
detectfps = ""
framecount = 0
detectframecount = 0
time1 = 0
time2 = 0
lastresults = None
def EntryIndex(side, lcoords, lclasses, location, entry):
    """Return the flat index into a YOLO output blob.

    The blob is laid out as [anchor][entry][cell] where each anchor owns
    (lcoords + lclasses + 1) entries of side*side cells.  *location* encodes
    anchor index and cell offset combined (n * side*side + cell).

    Fix: use integer // and % instead of int(a / b), which goes through
    float division and can lose precision for large indices.
    """
    side_square = side * side
    n = location // side_square   # anchor index
    loc = location % side_square  # cell offset within the grid
    return n * side_square * (lcoords + lclasses + 1) + entry * side_square + loc
class DetectionObject():
    """A single detected box in original-image pixel coordinates."""

    # Class-level defaults; every instance overwrites these in __init__.
    xmin = 0
    ymin = 0
    xmax = 0
    ymax = 0
    class_id = 0
    confidence = 0.0

    def __init__(self, x, y, h, w, class_id, confidence, h_scale, w_scale):
        """Convert a center-based box (x, y, w, h) from resized-image space
        into integer corner coordinates scaled by h_scale / w_scale."""
        half_w = w / 2
        half_h = h / 2
        self.xmin = int((x - half_w) * w_scale)
        self.ymin = int((y - half_h) * h_scale)
        self.xmax = int(self.xmin + w * w_scale)
        self.ymax = int(self.ymin + h * h_scale)
        self.class_id = class_id
        self.confidence = confidence
def IntersectionOverUnion(box_1, box_2):
    """Return the intersection-over-union ratio of two boxes.

    Boxes are any objects exposing xmin/ymin/xmax/ymax attributes.
    """
    overlap_w = min(box_1.xmax, box_2.xmax) - max(box_1.xmin, box_2.xmin)
    overlap_h = min(box_1.ymax, box_2.ymax) - max(box_1.ymin, box_2.ymin)
    # Negative extent in either axis means the boxes do not intersect.
    area_of_overlap = overlap_w * overlap_h if overlap_w >= 0.0 and overlap_h >= 0.0 else 0.0
    box_1_area = (box_1.xmax - box_1.xmin) * (box_1.ymax - box_1.ymin)
    box_2_area = (box_2.xmax - box_2.xmin) * (box_2.ymax - box_2.ymin)
    return area_of_overlap / (box_1_area + box_2_area - area_of_overlap)
def ParseYOLOV3Output(blob, resized_im_h, resized_im_w, original_im_h, original_im_w, threshold, objects):
    """Decode one YoloV3 output blob into DetectionObject instances.

    Appends every detection whose class probability exceeds *threshold*
    to *objects* and returns the same list.
    """
    side = blob.shape[2]
    # Each output scale uses its own triple of anchor (w, h) pairs.
    anchor_offset = {yolo_scale_13: 12, yolo_scale_26: 6, yolo_scale_52: 0}.get(side, 0)

    side_square = side * side
    flat = blob.flatten()
    h_scale = original_im_h / resized_im_h
    w_scale = original_im_w / resized_im_w

    for cell in range(side_square):
        row, col = divmod(cell, side)
        for n in range(num):
            obj_index = EntryIndex(side, coords, classes, n * side_square + cell, coords)
            box_index = EntryIndex(side, coords, classes, n * side_square + cell, 0)
            scale = flat[obj_index]
            if scale < threshold:
                continue
            # Box center in resized-image pixels; size from anchor priors.
            x = (col + flat[box_index + 0 * side_square]) / side * resized_im_w
            y = (row + flat[box_index + 1 * side_square]) / side * resized_im_h
            width = math.exp(flat[box_index + 2 * side_square]) * anchors[anchor_offset + 2 * n]
            height = math.exp(flat[box_index + 3 * side_square]) * anchors[anchor_offset + 2 * n + 1]
            for j in range(classes):
                class_index = EntryIndex(side, coords, classes, n * side_square + cell, coords + 1 + j)
                prob = scale * flat[class_index]
                if prob < threshold:
                    continue
                objects.append(DetectionObject(x, y, height, width, j, prob, h_scale, w_scale))
    return objects
def camThread(LABELS, results, frameBuffer, camera_width, camera_height, vidfps):
    """Capture/display process.

    Reads frames from a movie file, feeds copies to the inference process
    via *frameBuffer*, overlays the newest detections pulled from *results*
    (re-drawing the previous ones when no new result is ready), and shows
    the annotated frames until 'q' is pressed.

    NOTE(review): the camera_width / camera_height arguments are
    overwritten below from the movie file's own properties.
    """
    global fps
    global detectfps
    global lastresults
    global framecount
    global detectframecount
    global time1
    global time2
    global cam
    global window_name

    # USB-camera variant kept for reference; the movie-file input below is active.
    #cam = cv2.VideoCapture(0)
    #if cam.isOpened() != True:
    #    print("USB Camera Open Error!!!")
    #    sys.exit(0)
    #cam.set(cv2.CAP_PROP_FPS, vidfps)
    #cam.set(cv2.CAP_PROP_FRAME_WIDTH, camera_width)
    #cam.set(cv2.CAP_PROP_FRAME_HEIGHT, camera_height)
    #window_name = "USB Camera"
    #wait_key_time = 1

    cam = cv2.VideoCapture("data/input/testvideo4.mp4")
    camera_width = int(cam.get(cv2.CAP_PROP_FRAME_WIDTH))
    camera_height = int(cam.get(cv2.CAP_PROP_FRAME_HEIGHT))
    frame_count = int(cam.get(cv2.CAP_PROP_FRAME_COUNT))  # NOTE(review): unused
    window_name = "Movie File"
    wait_key_time = int(1000 / vidfps)  # ms to wait per frame -> playback rate

    cv2.namedWindow(window_name, cv2.WINDOW_AUTOSIZE)

    while True:
        t1 = time.perf_counter()

        # USB Camera Stream Read
        s, color_image = cam.read()
        if not s:
            continue
        # Drop the oldest queued frame so this producer never blocks.
        if frameBuffer.full():
            frameBuffer.get()
        height = color_image.shape[0]
        width = color_image.shape[1]
        frameBuffer.put(color_image.copy())

        if not results.empty():
            # Fresh detections available: draw them and remember for reuse.
            objects = results.get(False)
            detectframecount += 1
            for obj in objects:
                if obj.confidence < 0.2:
                    continue
                label = obj.class_id
                confidence = obj.confidence
                if confidence > 0.2:
                    label_text = LABELS[label] + " (" + "{:.1f}".format(confidence * 100) + "%)"
                    cv2.rectangle(color_image, (obj.xmin, obj.ymin), (obj.xmax, obj.ymax), box_color, box_thickness)
                    cv2.putText(color_image, label_text, (obj.xmin, obj.ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, label_text_color, 1)
            lastresults = objects
        else:
            # No new result this frame: re-draw the previous detections.
            if not isinstance(lastresults, type(None)):
                for obj in lastresults:
                    if obj.confidence < 0.2:
                        continue
                    label = obj.class_id
                    confidence = obj.confidence
                    if confidence > 0.2:
                        label_text = LABELS[label] + " (" + "{:.1f}".format(confidence * 100) + "%)"
                        cv2.rectangle(color_image, (obj.xmin, obj.ymin), (obj.xmax, obj.ymax), box_color, box_thickness)
                        cv2.putText(color_image, label_text, (obj.xmin, obj.ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, label_text_color, 1)

        # Overlay playback / detection FPS counters in the top-right corner.
        cv2.putText(color_image, fps, (width-170,15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38,0,255), 1, cv2.LINE_AA)
        cv2.putText(color_image, detectfps, (width-170,30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (38,0,255), 1, cv2.LINE_AA)
        cv2.imshow(window_name, cv2.resize(color_image, (width, height)))

        if cv2.waitKey(wait_key_time)&0xFF == ord('q'):
            sys.exit(0)

        ## Print FPS
        # Refresh both FPS strings every 15 displayed frames.
        framecount += 1
        if framecount >= 15:
            fps = "(Playback) {:.1f} FPS".format(time1/15)
            detectfps = "(Detection) {:.1f} FPS".format(detectframecount/time2)
            framecount = 0
            detectframecount = 0
            time1 = 0
            time2 = 0
        t2 = time.perf_counter()
        elapsedTime = t2-t1
        time1 += 1/elapsedTime   # accumulated instantaneous FPS
        time2 += elapsedTime     # accumulated wall time
def searchlist(l, x, notfoundvalue=-1):
    """Return the index of the first occurrence of *x* in *l*.

    Falls back to *notfoundvalue* (default -1) when *x* is absent.

    Fix: the original scanned the list twice (`in` then `index`);
    EAFP with list.index does a single scan.
    """
    try:
        return l.index(x)
    except ValueError:
        return notfoundvalue
def async_infer(ncsworker):
    """Thread entry point: drive one NcsWorker forever.

    Computes the worker's frame-skip budget once, then repeatedly runs
    one scheduler step of the asynchronous inference pipeline.
    """
    ncsworker.skip_frame_measurement()
    while True:
        ncsworker.predict_async()
class NcsWorker(object):
    """Runs asynchronous YoloV3 inference on one NCS (MYRIAD) device.

    Pulls frames from *frameBuffer*, keeps up to num_requests inference
    requests in flight, and pushes lists of DetectionObject to *results*
    in submission order.
    """

    def __init__(self, devid, frameBuffer, results, camera_width, camera_height, number_of_ncs, vidfps):
        self.devid = devid
        self.frameBuffer = frameBuffer
        self.model_xml = "./lrmodels/YoloV3/FP16/frozen_yolo_v3.xml"
        self.model_bin = "./lrmodels/YoloV3/FP16/frozen_yolo_v3.bin"
        self.camera_width = camera_width
        self.camera_height = camera_height
        self.m_input_size = 416                        # YoloV3 network input resolution
        self.threshould = 0.7                          # detection threshold (NOTE(review): sic, "threshold")
        self.num_requests = 4                          # in-flight async requests per device
        self.inferred_request = [0] * self.num_requests  # 1 while the request slot is busy
        self.heap_request = []                         # min-heap of (submit order, request id)
        self.inferred_cnt = 0                          # monotonically increasing submit counter
        self.plugin = IEPlugin(device="MYRIAD")
        self.net = IENetwork(model=self.model_xml, weights=self.model_bin)
        self.input_blob = next(iter(self.net.inputs))
        self.exec_net = self.plugin.load(network=self.net, num_requests=self.num_requests)
        self.results = results
        self.number_of_ncs = number_of_ncs
        self.predict_async_time = 800                  # assumed inference time (ms) per second of video — feeds skip_frame_measurement
        self.skip_frame = 0                            # frames to drop between inferences
        self.roop_frame = 0                            # frame-skip counter (NOTE(review): sic, "loop")
        self.vidfps = vidfps

    def image_preprocessing(self, color_image):
        """Resize to the network input size and reorder to NCHW with a batch axis."""
        prepimg = cv2.resize(color_image, (self.m_input_size, self.m_input_size))
        prepimg = prepimg[np.newaxis, :, :, :]  # Batch size axis add
        prepimg = prepimg.transpose((0, 3, 1, 2))  # NHWC to NCHW
        return prepimg

    def skip_frame_measurement(self):
        """Derive how many input frames to drop per request from the spare time budget."""
        surplustime_per_second = (1000 - self.predict_async_time)
        if surplustime_per_second > 0.0:
            frame_per_millisecond = (1000 / self.vidfps)
            total_skip_frame = surplustime_per_second / frame_per_millisecond
            self.skip_frame = int(total_skip_frame / self.num_requests)
        else:
            self.skip_frame = 0

    def predict_async(self):
        """One scheduler step: submit a frame when a request slot is free and
        collect the oldest finished request's detections (submission order)."""
        try:
            if self.frameBuffer.empty():
                return

            # Frame skipping: discard frames to keep the display smooth.
            self.roop_frame += 1
            if self.roop_frame <= self.skip_frame:
                self.frameBuffer.get()
                return
            self.roop_frame = 0

            prepimg = self.image_preprocessing(self.frameBuffer.get())
            reqnum = searchlist(self.inferred_request, 0)  # first free slot, -1 if all busy

            if reqnum > -1:
                self.exec_net.start_async(request_id=reqnum, inputs={self.input_blob: prepimg})
                self.inferred_request[reqnum] = 1
                self.inferred_cnt += 1
                if self.inferred_cnt == sys.maxsize:
                    # Reset bookkeeping before the submit counter could wrap.
                    self.inferred_request = [0] * self.num_requests
                    self.heap_request = []
                    self.inferred_cnt = 0
                heapq.heappush(self.heap_request, (self.inferred_cnt, reqnum))

            # Peek at the oldest outstanding request.
            # NOTE(review): heappop raises IndexError when the heap is empty;
            # that case is swallowed by the bare except below.
            cnt, dev = heapq.heappop(self.heap_request)

            if self.exec_net.requests[dev].wait(0) == 0:
                self.exec_net.requests[dev].wait(-1)

                objects = []
                outputs = self.exec_net.requests[dev].outputs
                for output in outputs.values():
                    objects = ParseYOLOV3Output(output, self.m_input_size, self.m_input_size, self.camera_height, self.camera_width, self.threshould, objects)

                # Greedy non-maximum suppression: zero out confidence of any
                # later box overlapping an earlier one at IoU >= 0.4.
                objlen = len(objects)
                for i in range(objlen):
                    if (objects[i].confidence == 0.0):
                        continue
                    for j in range(i + 1, objlen):
                        if (IntersectionOverUnion(objects[i], objects[j]) >= 0.4):
                            objects[j].confidence = 0

                self.results.put(objects)
                self.inferred_request[dev] = 0
            else:
                # Oldest request not finished yet: put it back and retry later.
                heapq.heappush(self.heap_request, (cnt, dev))
        except:
            # NOTE(review): bare except keeps the worker thread alive on any
            # error; consider narrowing to `except Exception`.
            import traceback
            traceback.print_exc()
def inferencer(results, frameBuffer, number_of_ncs, camera_width, camera_height, vidfps):
    """Spawn one inference thread per NCS device and block until they all finish.

    Each thread owns a dedicated NcsWorker bound to device id 0..number_of_ncs-1.
    """
    workers = []
    for devid in range(number_of_ncs):
        worker = threading.Thread(
            target=async_infer,
            args=(NcsWorker(devid, frameBuffer, results, camera_width, camera_height, number_of_ncs, vidfps),),
        )
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-numncs','--numberofncs',dest='number_of_ncs',type=int,default=1,help='Number of NCS. (Default=1)')
    args = parser.parse_args()
    number_of_ncs = args.number_of_ncs
    camera_width = 320
    camera_height = 240
    vidfps = 30

    try:
        # NOTE(review): 'forkserver' presumably avoids fork-related issues
        # with the device plugin state — confirm on the target platform.
        mp.set_start_method('forkserver')
        frameBuffer = mp.Queue(10)  # bounded: camThread drops frames when full
        results = mp.Queue()

        # Start detection MultiStick
        # Activation of inferencer
        p = mp.Process(target=inferencer, args=(results, frameBuffer, number_of_ncs, camera_width, camera_height, vidfps), daemon=True)
        p.start()
        processes.append(p)

        # Fixed wait (~7 s per stick) for device/model initialisation.
        sleep(number_of_ncs * 7)

        # Start streaming
        p = mp.Process(target=camThread, args=(LABELS, results, frameBuffer, camera_width, camera_height, vidfps), daemon=True)
        p.start()
        processes.append(p)

        # Parent just idles; children do all the work.
        while True:
            sleep(1)

    except:
        import traceback
        traceback.print_exc()

    finally:
        # Terminate every child process before exiting.
        for p in range(len(processes)):
            processes[p].terminate()
        print("\n\nFinished\n\n")
#◆ Finally
- I did not verify operation on the RaspberryPi3 because it was troublesome, but it probably works.
- Since this time we are verifying with only Core i7 machines, some performance degradation will occur with ARM CPU.
- Next time, I will also improve the performance of MobileNet-SSD by several times.
- Rather than the NCS2, I am interested in this device: https://www.gyrfalcontech.ai/solutions/2801s/ Its inference performance is tremendous.
- It delivers 2.5 times the performance of the TX2. - You can procure it from China, but there seem to be no documents or tutorials at all. If you have a challenging spirit and accept the risk, why not give it a try?
https://ja.aliexpress.com/store/product/Orange-Pi-AI-Stick-2801-Neural-Network-Computing-Stick-Artificial-Intelligence/1553371_32954041998.html