More than 5 years have passed since last update.

Raspberry PIで（そこそこ）リアルタイムにYOLOを動かしてみる (darknet-nnpack)

Posted at 2018-11-24

# やったこと
Raspberry Pi 3上で、カメラモジュールの画像をtiny-YOLOv2に入力し、1FPS弱（1フレームあたり約1.3秒）で認識結果を表示できるようにした。

# 背景
Deep Learningの物体認識で有名なYOLO。
Raspberry Piでサンプル画像を認識させている例はそこそこ見つかる。
http://yagitsawa.github.io/2018/04/07/darknet-with-nnpack/

これをカメラモジュールで連続処理できたら面白かろうと試してみたが、なかなか大変だった。

使ったハードは以下。
Raspberry Pi3 コンプリートスターターキット
 HiLetgo OV5647 5MP Raspberry Pi 3 カメラ OV5647 HDカメラモジュール

# カメラモジュールからの画像取得
OpenCV標準のVideoCaptureだとなぜか遅すぎて使えなかったので、RaspiCamライブラリを使う。
https://github.com/cedricve/raspicam
別の記事にサンプルコードあり。

# darknet-nnpackのインストール、C++用に改変
背景のリンク先にあるdarknet-nnpackはC言語のコードとしてgccでコンパイルするのだが、今回はOpenCVを使うためC++でコンパイルしたい。
extern "C" なども試したが、どうもdarknetの中でCとC++が共存しているらしく、うまくコンパイルできなかった。
そこで、darknet-nnpackを改変してC++でコンパイルできるようにしたのがこちら。
https://github.com/jonatechout/darknet-nnpack

具体的には、classやnewといったC++の予約語を変数名に使わないようにした。
インストール方法は上記gitリンク参照。

# コード
メインのコードはこちら。
https://github.com/jonatechout/raspi_yolo_test
インストール方法はgit参照。

#include <darknet.h>
#include <nnpack.h>
#include <unistd.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <sys/time.h>
#include <raspicam/raspicam_cv.h>

using namespace cv;

void convertCvImageToDnImage(const Mat& mat, image* out_img)
{
    for(int y = 0; y < out_img->h; ++y){
      for(int x = 0; x < out_img->w; ++x){
        for(int k= 0; k < out_img->c; ++k){
          out_img->data[k * out_img->h * out_img->w + y * out_img->w + x]
            = static_cast<float>(mat.data[y * mat.step + x * mat.elemSize() + k]) / 255.0;
        }
      }
    }
}

void convertDnImageToCvImage(const image& dnetImg, Mat* out_mat)
{
  for(int y = 0; y < dnetImg.h; ++y){
    for(int x = 0; x < dnetImg.w; ++x){
      for(int k= 0; k < dnetImg.c; ++k){
        out_mat->data[y * out_mat->step + x * out_mat->elemSize() + k]
          =  static_cast<unsigned char>(dnetImg.data[k * dnetImg.h * dnetImg.w + y * dnetImg.w + x] * 255);
      }
    }
  }
}

int main(int argc, char* argv[])
{
  if (argc < 4)
  {
    printf("4 arguments are necessary.\n");
    printf("%s <data config path> <network config path> <network weights path>\n", argv[0]);
    return -1;
  }

  const unsigned int IMAGE_WIDTH = 320;
  const unsigned int IMAGE_HEIGHT = 240;

  raspicam::RaspiCam_Cv camera;
  camera.set( CV_CAP_PROP_FORMAT, CV_8UC3 );
  camera.set( CV_CAP_PROP_FRAME_WIDTH, IMAGE_WIDTH );
  camera.set( CV_CAP_PROP_FRAME_HEIGHT, IMAGE_HEIGHT );

  if (!camera.open())
  {
    printf("camera open error\n");
    return -1;
  }

  char *datacfg = argv[1];
  char *cfgfile = argv[2];
  char *weightfile = argv[3];
  float thresh = 0.24;
  float hier_thresh = 0.5;

  list *options = read_data_cfg(datacfg);
  char *name_list = option_find_str(options, const_cast<char*>("names"), const_cast<char*>("data/names.list"));
  char **names = get_labels(name_list);

  image **alphabet = load_alphabet();
  network *net = load_network(cfgfile, weightfile, 0);
  set_batch_network(net, 1);
  srand(2222222);

  float nms=.3;
  nnp_initialize();
  net->threadpool = pthreadpool_create(0);

  namedWindow("prediction", CV_WINDOW_AUTOSIZE);
  image dnetImg = make_image(IMAGE_WIDTH, IMAGE_HEIGHT, 3);

  layer l = net->layers[net->n-1];
  box *boxes = static_cast<box*>(calloc(l.w*l.h*l.n, sizeof(box)));
  float **probs = static_cast<float**>(calloc(l.w*l.h*l.n, sizeof(float *)));

  for(int j = 0; j < l.w*l.h*l.n; ++j) {
    probs[j] = static_cast<float*>(calloc(l.classes + 1, sizeof(float *)));
  }

  printf("ready\n");

  while(1){
    Mat mat;
    camera.grab();
    camera.retrieve(mat);

    convertCvImageToDnImage(mat, &dnetImg);

    image sized = letterbox_image_thread(dnetImg, net->w, net->h, net->threadpool);
    float *X = sized.data;

    struct timeval start, stop;
    gettimeofday(&start, 0);

    network_predict(net, X);

    gettimeofday(&stop, 0);
    printf("Predicted in %ld ms.\n", (stop.tv_sec * 1000 + stop.tv_usec / 1000) - (start.tv_sec * 1000 + start.tv_usec / 1000));

    get_region_boxes(l, dnetImg.w, dnetImg.h, net->w, net->h, thresh, probs, boxes, NULL, 0, 0, hier_thresh, 1);

    if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms);
    draw_detections(dnetImg, l.w*l.h*l.n, thresh, boxes, probs, NULL, names, alphabet, l.classes);

    Mat disp(dnetImg.h, dnetImg.w, CV_8UC3);
    convertDnImageToCvImage(dnetImg, &disp);
    imshow("prediction", disp);
    waitKey(1);

    free_image(sized);
  }

  free_image(dnetImg);
  free(boxes);
  free_ptrs((void **)probs, l.w*l.h*l.n);

  pthreadpool_destroy(net->threadpool);
  nnp_deinitialize();

  return 0;
}

最終的に以下のコマンドで開始する。

./raspiyolotest cfg/voc.data cfg/yolov2-tiny-voc.cfg yolov2-tiny-voc.weights

結果

layer     filters    size              input                output
    0 conv     16  3 x 3 / 1   416 x 416 x   3   ->   416 x 416 x  16
    1 max          2 x 2 / 2   416 x 416 x  16   ->   208 x 208 x  16
    2 conv     32  3 x 3 / 1   208 x 208 x  16   ->   208 x 208 x  32
    3 max          2 x 2 / 2   208 x 208 x  32   ->   104 x 104 x  32
    4 conv     64  3 x 3 / 1   104 x 104 x  32   ->   104 x 104 x  64
    5 max          2 x 2 / 2   104 x 104 x  64   ->    52 x  52 x  64
    6 conv    128  3 x 3 / 1    52 x  52 x  64   ->    52 x  52 x 128
    7 max          2 x 2 / 2    52 x  52 x 128   ->    26 x  26 x 128
    8 conv    256  3 x 3 / 1    26 x  26 x 128   ->    26 x  26 x 256
    9 max          2 x 2 / 2    26 x  26 x 256   ->    13 x  13 x 256
   10 conv    512  3 x 3 / 1    13 x  13 x 256   ->    13 x  13 x 512
   11 max          2 x 2 / 1    13 x  13 x 512   ->    13 x  13 x 512
   12 conv   1024  3 x 3 / 1    13 x  13 x 512   ->    13 x  13 x1024
   13 conv   1024  3 x 3 / 1    13 x  13 x1024   ->    13 x  13 x1024
   14 conv    125  1 x 1 / 1    13 x  13 x1024   ->    13 x  13 x 125
   15 detection
mask_scale: Using default '1.000000'
Loading weights from yolov2-tiny-voc.weights...Done!
ready
Predicted in 1318 ms.
Predicted in 1240 ms.
tvmonitor: 36%
Predicted in 1258 ms.
tvmonitor: 45%
Predicted in 1263 ms.
bottle: 29%
Predicted in 1260 ms.
tvmonitor: 37%
Predicted in 1245 ms.
tvmonitor: 32%
Predicted in 1257 ms.
tvmonitor: 43%
Predicted in 1258 ms.
tvmonitor: 53%
Predicted in 1272 ms.
Predicted in 1301 ms.
person: 55%
person: 27%
Predicted in 1270 ms.
person: 82%
person: 30%

1フレーム1.3秒ほど。リアルタイムと言っていいかわからないが、それなりに使い道はある気がする。
認識結果を外部で扱いたい場合は、boxesとprobsを参照すればいいと思う。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up