More than 3 years have passed since last update.

Yolov3をDarknetとOpenCVのDNNで実行し速度比較

Last updated at 2020-05-30Posted at 2020-05-29

はじめに

YOLOv3を利用するためには、Darknetの実装を利用する方法と、OpenCVでcv.dnn.readNetFromDarknet(CFG, MODEL)して利用する方法などがあります。
ここでは、XavierNX上で
Darknetのサンプルのdarknet_video.pyでの実行と,
OpenCVで
net = cv.dnn.readNetFromDarknet(CFG, MODEL)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CUDA_FP16)
をした場合の認識率、速度を比較してみました。

条件

測定環境

XavierNXでnvpmodel -m 2で実施
OpenCV 4.3.0とDarknetをDocker上で実施（記事「Jetson Xavier NX/JetsonNANOでDNN_BACKEND_CUDAが使えるOpenCV4.3をDockerでビルド+Darknetをビルド済みなXavierNX/JetsonNANOのDockerイメージを作成」で作ったイメージから起動したコンテナ上で実施）
Youtubeの野鳥のビデオを利用
見つけた鳥の数、平均フレームレートを比較

手順

Youtubeの野鳥のビデオをダウンロード
mabiki.py(後述）を使ってフレームを間引き
dv.py(後述)を使ってフレームレートと鳥の検出数を算出
dnn_bench.py(後述)を使ってフレームレートと鳥の検出数を比較

結果

Darknetでの実装の方が倍近く速かったです。

IMPLEMENTATION	AVG FPS	Number of frames	Number of Recognized birds
Darknet	9.66	1017	1260
OpenCV-DNN	~~4.90~~ 4.76	1017	~~1469~~ 1399

みつけた鳥の数については検証してません。どっちが正解に近いかはわかりません。
フレーム数より多いのは1フレームに複数の鳥がいるフレームがそれなりにあるからです。

5/30アップデート
すみません、判定の閾値が違っていました。Darknetが0.25、OpenCVが0.1になっていたので、OpenCVを0.25にして取り直しました。

XavierNX上だとFP16の実装の方が断然はやく、DarknetはCUDNN_HALFをつけてコンパイルしていますが、OpenCV上では
DNN_TARGET_CUDA_FP16
しか指定できません。ちゃんと理解できてない私には詳細がわかりませんが、このあたりが認識数に効いている気がします。
一方速度については
OpenCVのDNN上では
outs = net.forward(getOutputsNames(net))
が実際の判別処理で、
found = postprocess(frame, outs)
が判別結果の確認プロセスのはずですが、
postprocessで結構な時間がかかります。
出力レイヤーから値を取り出したりといろいろ処理しているので仕方ないですが、実装に依存しているかもしれません。

mabiki.py

ビデオだと変化がすくなく、ベンチを行うにあたって時間がかかりすぎるので、フレームを1/30に間引きました
コードは以下

mabiki.py

import cv2
import numpy as np
import time

def mabiki():
    """Decimate ``test.mp4`` into ``output.avi`` for faster benchmarking.

    Reads a frame, resizes it to 960x540, shows it, writes it to an MJPG
    AVI (10 fps), then seeks forward so that only about one frame in
    ``step`` is kept.  Progress is printed as a percentage.

    The loop ends when the frame counter reaches the reported frame count
    or when the capture stops delivering frames, whichever comes first.
    The capture and the writer are always released, even on error.
    """
    step = 30
    size = (960, 540)

    # Use cv2.VideoCapture(0) instead to decimate a live camera stream.
    cap = cv2.VideoCapture("test.mp4")
    totalFrame = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    out = cv2.VideoWriter(
        "output.avi", cv2.VideoWriter_fourcc(*"MJPG"), 10.0, size)

    try:
        fnum = 1
        while fnum < totalFrame - step:
            ret, frame_read = cap.read()
            if not ret:
                # Missing file, or the container reported more frames than
                # it can deliver: stop instead of resizing a None frame.
                break
            outimage = cv2.resize(frame_read, size,
                                  interpolation=cv2.INTER_LINEAR)
            out.write(outimage)
            cv2.imshow('read', outimage)
            cv2.waitKey(1)
            print(str(fnum / totalFrame * 100) + "%")
            # NOTE(review): every kept frame is written twice.  This looks
            # accidental, but it is preserved so output.avi stays identical
            # to the file the published numbers were measured on - confirm.
            out.write(outimage)

            fnum = fnum + step
            # NOTE(review): fnum has already been advanced, so the first
            # seek lands on frame 1 + 2*step; later seeks are `step` apart.
            cap.set(cv2.CAP_PROP_POS_FRAMES, fnum + step)
    finally:
        cap.release()
        out.release()

# Run the decimation only when this file is executed as a script.
if __name__ == "__main__":
    mabiki()
~

python3:dv.py

Darknetでの実装でyolov3のパフォーマンスを測定するためのスクリプトです。darknetのサンプルコードのdarknet_video.pyにパフォーマンス測定用の数行を足してあります。imshowとwaitKeyは速度に影響するのでコメントアウトしました。

dv.py

from ctypes import *
import math
import random
import os
import cv2
import numpy as np
import time
import darknet

def convertBack(x, y, w, h):
    """Convert a centre/size box into integer corner coordinates.

    Args:
        x, y: centre of the box.
        w, h: width and height of the box.

    Returns:
        ``(xmin, ymin, xmax, ymax)``, each edge passed through ``round()``
        and then ``int()``.
    """
    half_w = w / 2
    half_h = h / 2
    edges = (x - half_w, y - half_h, x + half_w, y + half_h)
    return tuple(int(round(edge)) for edge in edges)


def cvDrawBoxes(detections, img):
    """Draw every detection on ``img`` and count the ones labelled bird.

    Args:
        detections: iterable of detections; element 0 is the class name as
            bytes, element 1 the score, element 2 the box as
            ``(x, y, w, h)`` (centre and size).
        img: image the boxes and captions are drawn onto.

    Returns:
        ``(img, found)`` where ``found`` is the number of detections whose
        class name equals ``b'bird'``.
    """
    bird_count = 0
    for det in detections:
        name = det[0]
        score = det[1]
        box = det[2]
        left, top, right, bottom = convertBack(
            float(box[0]), float(box[1]), float(box[2]), float(box[3]))
        cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0), 1)
        # Caption: class name followed by the score as a percentage.
        caption = "{} [{}]".format(name.decode(), str(round(score * 100, 2)))
        cv2.putText(img, caption, (left, top - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, [0, 255, 0], 2)
        if name == b'bird':
            bird_count += 1

    return img, bird_count


# Module-level cache for the loaded network, its metadata and the optional
# class-name list. YOLO() fills each one only while it is still None.
netMain = None
metaMain = None
altNames = None


def YOLO():
    """Benchmark YOLOv3 through the Darknet Python bindings on ``output.avi``.

    On first use the network, its metadata and (best effort) the class-name
    list are loaded into the module-level globals.  Every frame of the video
    is then resized to the network input size, run through
    ``darknet.detect_image`` with ``thresh=0.25`` and annotated.  After each
    frame the running mean of the per-frame FPS values, the frame count and
    the accumulated bird count are printed.  The loop stops when the capture
    no longer returns a frame, and the capture is always released.

    Raises:
        ValueError: if the config, weights or data file does not exist.
    """
    global metaMain, netMain, altNames
    configPath = "./cfg/yolov3.cfg"
    weightPath = "./yolov3.weights"
    metaPath = "./cfg/coco.data"
    if not os.path.exists(configPath):
        raise ValueError("Invalid config path `" +
                         os.path.abspath(configPath)+"`")
    if not os.path.exists(weightPath):
        raise ValueError("Invalid weight path `" +
                         os.path.abspath(weightPath)+"`")
    if not os.path.exists(metaPath):
        raise ValueError("Invalid data file path `" +
                         os.path.abspath(metaPath)+"`")
    if netMain is None:
        netMain = darknet.load_net_custom(configPath.encode(
            "ascii"), weightPath.encode("ascii"), 0, 1)  # batch size = 1
    if metaMain is None:
        metaMain = darknet.load_meta(metaPath.encode("ascii"))
    if altNames is None:
        # Best effort: read the "names = <file>" entry of the data file and
        # load that file as the class-name list.  A failure here must not
        # abort the benchmark, so it is reported and ignored.
        try:
            import re
            with open(metaPath) as metaFH:
                metaContents = metaFH.read()
            match = re.search("names *= *(.*)$", metaContents,
                              re.IGNORECASE | re.MULTILINE)
            result = match.group(1) if match else None
            if result is not None and os.path.exists(result):
                with open(result) as namesFH:
                    namesList = namesFH.read().strip().split("\n")
                    altNames = [x.strip() for x in namesList]
        except Exception as exc:
            print("Could not load class names: " + str(exc))

    # Use cv2.VideoCapture(0) instead to benchmark on a live camera.
    cap = cv2.VideoCapture("output.avi")
    print("Starting the YOLO loop...")

    # The network input size is constant, so query it once.
    net_width = darknet.network_width(netMain)
    net_height = darknet.network_height(netMain)

    # Create an image we reuse for each detect
    darknet_image = darknet.make_image(net_width, net_height, 3)
    fpsgt = 0       # sum of the per-frame FPS values
    frameTotal = 0  # frames processed so far
    foundTotal = 0  # birds counted so far
    try:
        while True:
            prev_time = time.time()
            ret, frame_read = cap.read()
            if not ret:
                # End of the video (or the file could not be opened).
                break
            frame_rgb = cv2.cvtColor(frame_read, cv2.COLOR_BGR2RGB)
            frame_resized = cv2.resize(frame_rgb,
                                       (net_width, net_height),
                                       interpolation=cv2.INTER_LINEAR)

            darknet.copy_image_from_bytes(darknet_image,
                                          frame_resized.tobytes())

            detections = darknet.detect_image(netMain, metaMain,
                                              darknet_image, thresh=0.25)
            image, found = cvDrawBoxes(detections, frame_resized)
            # Kept inside the timed region although the display calls are
            # disabled, so the measured work matches the published numbers.
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            fps = 1/(time.time()-prev_time)
            fpsgt = fpsgt + fps
            frameTotal = frameTotal+1
            foundTotal = foundTotal+found
            print("AVG FPS="+str(fpsgt / frameTotal) + "  FRAME="+str(frameTotal)+"  BIRD="+str(foundTotal))
            # Display is disabled because it slows the benchmark down:
            # cv2.imshow('Demo', image)
            # cv2.waitKey(3)
    finally:
        cap.release()

# Run the Darknet benchmark only when this file is executed as a script.
if __name__ == "__main__":
    YOLO()
                                                                                                                126,10        Bot

python3:dnn_bench.py

OpenCVのDNNでの実装でyolov3のパフォーマンスを測定するためのスクリプトです。
MIKI-IE.COM（みきいえMIKIIE）様にあったサンプルにコードを追加しました。
imshowとwaitKeyはコメントアウトしました。

5/30 閾値が合っていなかったので直して測定しなおしました。confThreshold = 0.1 → confThreshold = 0.25

dnn_bench.py

import cv2 as cv
import numpy as np
import time

# Paths of the Darknet weights and network definition handed to
# cv.dnn.readNetFromDarknet below.
MODEL = "./yolov3.weights"
CFG = "./cfg/yolov3.cfg"
# Pixel scale factor given to cv.dnn.blobFromImage (approximately 1/255).
SCALE = 0.00392   ##1/255
# Text file with one class name per line.
CLASS_NAME="./data/coco.names"
# Network input size; the main loop reads it as (width, height).
INP_SHAPE = (416, 416) #input size
# Mean value and fifth positional argument given to cv.dnn.blobFromImage
# (presumably the swap-R/B flag - confirm against the OpenCV docs).
MEAN = 0
RGB = True

# Load the network and ask OpenCV to run it on CUDA with the FP16 target.
net = cv.dnn.readNetFromDarknet(CFG, MODEL)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CUDA_FP16)


# Detections whose best class score is not above this value are dropped.
confThreshold = 0.25 # Confidence threshold
# NOTE(review): 0.8 keeps heavily overlapping boxes.  Darknet's Python
# wrapper is believed to default to nms=0.45; if so, the two benchmarks are
# not filtering alike and this may inflate the bird count - confirm.
nmsThreshold = 0.8  # Non-maximum suppression threshold

# class_names = ['active', 'goo', 'choki', 'pa', 'won', 'lose', 'draw']
# Read the class names (one per line) and print them once at start-up.
with open(CLASS_NAME) as f:
    class_names = f.read().splitlines()
    print(class_names)

def getOutputsNames(net):
    """Return the names of the network's unconnected output layers.

    ``net.getUnconnectedOutLayers()`` yields 1-based layer ids.  Depending
    on the OpenCV release they arrive either as an N x 1 array (each entry
    itself a one-element array) or as a flat array of integers; flattening
    first makes both layouts work.

    Args:
        net: object providing ``getLayerNames()`` and
            ``getUnconnectedOutLayers()``.

    Returns:
        List of layer names, in the order the ids were reported.
    """
    layersNames = net.getLayerNames()
    out_ids = np.asarray(net.getUnconnectedOutLayers()).flatten()
    return [layersNames[int(layer_id) - 1] for layer_id in out_ids]

def postprocess(frame, outs):
    """Draw the detections that survive NMS on ``frame`` and count the birds.

    Args:
        frame: image the boxes and labels are drawn onto (modified in
            place); its shape supplies the frame height and width.
        outs: outputs of ``net.forward``.  Each one is indexed as a 2-D
            array whose rows hold ``cx, cy, w, h`` in columns 0-3 (scaled
            by the frame size here) and the class scores from column 5 on;
            column 4 is not used.

    Returns:
        Number of kept detections whose class name is ``'bird'``.

    Raises:
        SystemExit: if the network's last layer is not of type ``Region``.
    """
    found = 0
    frameHeight = frame.shape[0]
    frameWidth = frame.shape[1]

    def drawPred(classId, conf, left, top, right, bottom):
        """Draw one box plus a '<class>_<conf>' label on ``frame``."""
        left = int(left)
        top = int(top)
        right = int(right)
        bottom = int(bottom)
        # Draw a bounding box.
        cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0))

        label = class_names[classId] + '_%.2f' % conf

        labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        # Keep the label inside the image when the box touches the top edge.
        top = max(top, labelSize[1])
        cv.rectangle(frame, (left, top - labelSize[1]), (left + labelSize[0], top + baseLine), (255, 255, 255), cv.FILLED)
        cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

    layerNames = net.getLayerNames()
    lastLayer = net.getLayer(net.getLayerId(layerNames[-1]))
    if lastLayer.type != 'Region':
        print('Unknown output layer type: ' + lastLayer.type)
        raise SystemExit

    classIds = []
    confidences = []
    boxes = []
    for out in outs:
        out = np.asarray(out)
        if out.size == 0:
            continue
        # Pick the best class per row with NumPy and only walk the rows that
        # pass the confidence threshold; looping over every candidate row in
        # Python is what makes this step slow.
        scores = out[:, 5:]
        best_ids = np.argmax(scores, axis=1)
        best_conf = scores[np.arange(scores.shape[0]), best_ids]
        for row in np.flatnonzero(best_conf > confThreshold):
            detection = out[row]
            center_x = int(detection[0] * frameWidth)
            center_y = int(detection[1] * frameHeight)
            width = int(detection[2] * frameWidth)
            height = int(detection[3] * frameHeight)
            left = center_x - width / 2
            top = center_y - height / 2
            classIds.append(best_ids[row])
            confidences.append(float(best_conf[row]))
            boxes.append([left, top, width, height])

    indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
    # NMSBoxes returns the kept indices as N x 1 or as a flat sequence
    # depending on the OpenCV release (and an empty tuple when nothing is
    # kept); flattening covers all of these.
    for i in np.asarray(indices).flatten():
        i = int(i)
        left, top, width, height = boxes[i]
        drawPred(classIds[i], confidences[i], left, top, left + width, top + height)
        if class_names[classIds[i]] == 'bird':
            found = found + 1
    return found


# print( "STARTCAPTURE")

# Benchmark loop: run every frame of output.avi through the OpenCV DNN
# network and print the running mean of the per-frame FPS values, the frame
# count and the accumulated bird count.
c = cv.VideoCapture("output.avi")
# c = cv.VideoCapture(0)
print( "VCAPTURE")
# The first frame is read and discarded before the timed loop starts.
c.read()
print( "VCAPTUREREAD")

# Neither the output layer names nor the input size change between frames.
outNames = getOutputsNames(net)
inpWidth = INP_SHAPE[0]
inpHeight = INP_SHAPE[1]

fpsgt = 0       # sum of the per-frame FPS values
frameTotal = 0  # frames processed so far
foundTotal = 0  # birds counted so far
try:
    while True:
        timestart = time.time()
        r, frame = c.read()
        if not r:
            # End of the video (or the file could not be opened).
            break
        frameTotal = frameTotal + 1

        # Create a 4D blob from a frame.
        blob = cv.dnn.blobFromImage(frame, SCALE, (inpWidth, inpHeight), MEAN, RGB, crop=False)

        # Run a model
        net.setInput(blob)
        outs = net.forward(outNames)
        found = postprocess(frame, outs)

        # Put efficiency information.
        t, _ = net.getPerfProfile()
        label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
        cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
        # Display is disabled because it slows the benchmark down:
        # cv.namedWindow('image', cv.WINDOW_NORMAL)
        # cv.imshow('image', frame)
        timeend = time.time()
        fps = 1 / (timeend - timestart)
        fpsgt = fpsgt + fps
        foundTotal = foundTotal + found
        print( "AVG FPS="+str(fpsgt / frameTotal) + "  FRAME="+str(frameTotal)+"  BIRD="+str(foundTotal))
        # cv.waitKey(5)
finally:
    c.release()
    cv.destroyAllWindows()
# The former trailing `cv.mwrite(target_filepath, frame)` was dropped: the
# function name was a typo, `target_filepath` was never defined, and at this
# point `frame` is the failed final read.

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up