
Understanding Part of Detectron's Internals 2

Posted at 2019-07-10

The parent article is Understanding VideoPose3D (notes).
I hopped over here for a bit in order to understand infer_simple.py, which uses Detectron's functionality inside VideoPose3D.
In this post I want to get a rough understanding of the functions called inside infer_simple.py.

What this article mainly looks at is cls_boxes, one of the return values of this function.
I expect it to hold the four coordinate values [x0, y0, x1, y1] describing each box obtained by detecting a person, together with a score representing the probability. Let's check.
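Before reading the code, here is a toy sketch of the structure I expect cls_boxes to have (my assumption at this point, not Detectron output): a per-class list where, for COCO models, index 1 is the person class and each row is [x0, y0, x1, y1, score].

example_cls_boxes.py

import numpy as np

# Toy illustration of the expected cls_boxes layout (assumption, not Detectron output).
cls_boxes = [[] for _ in range(81)]      # 81 = COCO's 80 classes + background
cls_boxes[1] = np.array([                # index 1 = person class (assumption)
    [ 10.0,  20.0, 110.0, 220.0, 0.98],  # [x0, y0, x1, y1, score]
    [300.0,  40.0, 380.0, 260.0, 0.87],
])
print(cls_boxes[1][:, :4])  # the two corner points of each box
print(cls_boxes[1][:, 4])   # the detection scores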

Understanding im_detect_all

The conditionals inside this function are driven mainly by the True/False boolean variables set by this code's input config, args.cfg (= e2e_keypoint_rcnn_R-101-FPN_s1x.yaml).

I went through the code and commented out the branches the conditionals do not take.
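One quick way to check which branches apply is to load the config and print the relevant flags. A minimal sketch, assuming Detectron's config API (detectron.core.config) and that the yaml path below points at your local copy:

example_cfg_flags.py

from detectron.core.config import cfg, merge_cfg_from_file, assert_and_infer_cfg

# Hypothetical local path; adjust to where the config lives in your checkout.
merge_cfg_from_file('configs/12_2017_baselines/e2e_keypoint_rcnn_R-101-FPN_s1x.yaml')
assert_and_infer_cfg(cache_urls=False)

# These booleans decide which branches im_detect_all takes.
print(cfg.RETINANET.RETINANET_ON)  # expect False: RetinaNet branch skipped
print(cfg.TEST.BBOX_AUG.ENABLED)   # expect False: plain im_detect_bbox is used
print(cfg.MODEL.MASK_ON)           # gates the mask branch
print(cfg.MODEL.KEYPOINTS_ON)      # gates the keypoint branch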

test.py

def im_detect_all(model, im, box_proposals, timers=None):
    if timers is None:
        timers = defaultdict(Timer)

    # Handle RetinaNet testing separately for now
    #if cfg.RETINANET.RETINANET_ON:
        #cls_boxes = test_retinanet.im_detect_bbox(model, im, timers)
        #return cls_boxes, None, None

    timers['im_detect_bbox'].tic()
    #if cfg.TEST.BBOX_AUG.ENABLED:
        #scores, boxes, im_scale = im_detect_bbox_aug(model, im, box_proposals)
    #else:
    scores, boxes, im_scale = im_detect_bbox(
        model, im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE, boxes=box_proposals
    )
    timers['im_detect_bbox'].toc()

Note that here cfg.TEST.SCALE = 800 and cfg.TEST.MAX_SIZE = 1333.
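To see what these two numbers do, here is a small self-contained paraphrase (my simplification of how Detectron resizes images at test time) of the rule: scale the short side to cfg.TEST.SCALE, but cap the long side at cfg.TEST.MAX_SIZE.

example_im_scale.py

def compute_im_scale(h, w, target_size=800, max_size=1333):
    # Paraphrase of Detectron's test-time resize rule (simplified sketch).
    im_size_min, im_size_max = min(h, w), max(h, w)
    im_scale = float(target_size) / im_size_min
    # If scaling the short side to 800 pushes the long side past 1333, cap it.
    if round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / im_size_max
    return im_scale

print(compute_im_scale(1080, 1920))  # ~0.694: capped by the 1333 long-side limit
print(compute_im_scale(800, 800))    # 1.0: short side already at the target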
Next, let's jump briefly into im_detect_bbox as defined in detectron/core/test_retinanet.py.
(It's long.)

test_retinanet.py
def im_detect_bbox(model, im, timers=None):
    """Generate RetinaNet detections on a single image."""
    if timers is None:
        timers = defaultdict(Timer)
    # Although anchors are input independent and could be precomputed,
    # recomputing them per image only brings a small overhead
    anchors = _create_cell_anchors()
    timers['im_detect_bbox'].tic()
    k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL
    A = cfg.RETINANET.SCALES_PER_OCTAVE * len(cfg.RETINANET.ASPECT_RATIOS)
    inputs = {}
    inputs['data'], im_scale, inputs['im_info'] = \
        blob_utils.get_image_blob(im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE)
    cls_probs, box_preds = [], []
    for lvl in range(k_min, k_max + 1):
        suffix = 'fpn{}'.format(lvl)
        cls_probs.append(core.ScopedName('retnet_cls_prob_{}'.format(suffix)))
        box_preds.append(core.ScopedName('retnet_bbox_pred_{}'.format(suffix)))
    for k, v in inputs.items():
        workspace.FeedBlob(core.ScopedName(k), v.astype(np.float32, copy=False))

    workspace.RunNet(model.net.Proto().name)
    cls_probs = workspace.FetchBlobs(cls_probs)
    box_preds = workspace.FetchBlobs(box_preds)

    # here the boxes_all are [x0, y0, x1, y1, score]
    boxes_all = defaultdict(list)

As its comment says, each entry of boxes_all is in the format [x0, y0, x1, y1, score].
Just a little further to our target, cls_boxes! From here, let's focus on detections.
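A short aside on defaultdict(list), which boxes_all uses: indexing a missing class id silently creates an empty list, so per-class rows can be appended without any initialization. A toy illustration (made-up numbers):

example_defaultdict.py

from collections import defaultdict

boxes_all = defaultdict(list)
# Extending an unseen class id works immediately; no KeyError.
boxes_all[1].extend([[10, 20, 110, 220, 0.98]])  # class 1: one [x0,y0,x1,y1,score] row
boxes_all[1].extend([[30, 40, 130, 240, 0.75]])
boxes_all[3].extend([[50, 60, 150, 260, 0.60]])  # class 3
print({cls: len(rows) for cls, rows in boxes_all.items()})  # {1: 2, 3: 1}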

test_retinanet.py
    cnt = 0
    for lvl in range(k_min, k_max + 1):
        # create cell anchors array
        stride = 2. ** lvl
        cell_anchors = anchors[lvl]

        # fetch per level probability
        cls_prob = cls_probs[cnt]
        box_pred = box_preds[cnt]
        cls_prob = cls_prob.reshape((
            cls_prob.shape[0], A, int(cls_prob.shape[1] / A),
            cls_prob.shape[2], cls_prob.shape[3]))
        box_pred = box_pred.reshape((
            box_pred.shape[0], A, 4, box_pred.shape[2], box_pred.shape[3]))
        cnt += 1

        #if cfg.RETINANET.SOFTMAX:
            #cls_prob = cls_prob[:, :, 1::, :, :]

        cls_prob_ravel = cls_prob.ravel()
        # In some cases [especially for very small img sizes], it's possible that
        # candidate_ind is empty if we impose threshold 0.05 at all levels. This
        # will lead to errors since no detections are found for this image. Hence,
        # for lvl 7 which has small spatial resolution, we take the threshold 0.0
        th = cfg.RETINANET.INFERENCE_TH if lvl < k_max else 0.0
        candidate_inds = np.where(cls_prob_ravel > th)[0]
        if (len(candidate_inds) == 0):
            continue

        pre_nms_topn = min(cfg.RETINANET.PRE_NMS_TOP_N, len(candidate_inds))
        inds = np.argpartition(
            cls_prob_ravel[candidate_inds], -pre_nms_topn)[-pre_nms_topn:]
        inds = candidate_inds[inds]

        inds_5d = np.array(np.unravel_index(inds, cls_prob.shape)).transpose()
        classes = inds_5d[:, 2]
        anchor_ids, y, x = inds_5d[:, 1], inds_5d[:, 3], inds_5d[:, 4]
        scores = cls_prob[:, anchor_ids, classes, y, x]

        boxes = np.column_stack((x, y, x, y)).astype(dtype=np.float32)
        boxes *= stride
        boxes += cell_anchors[anchor_ids, :]

        if not cfg.RETINANET.CLASS_SPECIFIC_BBOX:
            box_deltas = box_pred[0, anchor_ids, :, y, x]
        else:
            box_cls_inds = classes * 4
            box_deltas = np.vstack(
                [box_pred[0, ind:ind + 4, yi, xi]
                 for ind, yi, xi in zip(box_cls_inds, y, x)]
            )
        pred_boxes = (
            box_utils.bbox_transform(boxes, box_deltas)
            if cfg.TEST.BBOX_REG else boxes)
        pred_boxes /= im_scale
        pred_boxes = box_utils.clip_tiled_boxes(pred_boxes, im.shape)
        box_scores = np.zeros((pred_boxes.shape[0], 5))
        box_scores[:, 0:4] = pred_boxes
        box_scores[:, 4] = scores

        for cls in range(1, cfg.MODEL.NUM_CLASSES):
            inds = np.where(classes == cls - 1)[0]
            if len(inds) > 0:
                boxes_all[cls].extend(box_scores[inds, :])
    timers['im_detect_bbox'].toc()

    # Combine predictions across all levels and retain the top scoring by class
    timers['misc_bbox'].tic()
    # define detections
    detections = []
    for cls, boxes in boxes_all.items():
        cls_dets = np.vstack(boxes).astype(dtype=np.float32)

A question comes up here! What does the np module's vstack function do?

What is np.vstack?

A function that concatenates arrays vertically (stacking their rows).

example_vstack.py
In [1]: import numpy as np

In [2]: a = np.arange(12).reshape(-1, 1)  # column vector with 12 elements

In [3]: b = np.arange(2).reshape(-1, 1)   # column vector with 2 elements

In [4]: np.vstack((a, b))  # a's 12 rows followed by b's 2 rows
Out[4]:
array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11],
       [ 0],
       [ 1]])

That's the idea.
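In the Detectron code above, boxes_all[cls] is a Python list of [x0, y0, x1, y1, score] rows, so np.vstack turns it into a single (N, 5) float32 array. A toy version of that exact line (made-up numbers):

example_vstack_boxes.py

import numpy as np

boxes = [
    np.array([10.0, 20.0, 110.0, 220.0, 0.98]),
    np.array([30.0, 40.0, 130.0, 240.0, 0.75]),
]
cls_dets = np.vstack(boxes).astype(dtype=np.float32)
print(cls_dets.shape)  # (2, 5): one detection per row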

test_retinanet.py
        # do class specific nms here
        #if cfg.TEST.SOFT_NMS.ENABLED:
            #cls_dets, keep = box_utils.soft_nms(
                #cls_dets,
                #sigma=cfg.TEST.SOFT_NMS.SIGMA,
                #overlap_thresh=cfg.TEST.NMS,
                #score_thresh=0.0001,
                #method=cfg.TEST.SOFT_NMS.METHOD
            #)
        #else:
        keep = box_utils.nms(cls_dets, cfg.TEST.NMS)
        cls_dets = cls_dets[keep, :]
        out = np.zeros((len(keep), 6))
        out[:, 0:5] = cls_dets
        out[:, 5].fill(cls)
        detections.append(out)

    # detections (N, 6) format:
    #   detections[:, :4] - boxes
    #   detections[:, 4] - scores
    #   detections[:, 5] - classes
    detections = np.vstack(detections)
    # sort all again
    inds = np.argsort(-detections[:, 4])
    detections = detections[inds[0:cfg.TEST.DETECTIONS_PER_IM], :]

    # Convert the detections to image cls_ format (see core/test_engine.py)
    num_classes = cfg.MODEL.NUM_CLASSES
    # make a list of empty lists
    cls_boxes = [[] for _ in range(cfg.MODEL.NUM_CLASSES)]
    # leave index 0 (the background class) empty
    for c in range(1, num_classes):
        inds = np.where(detections[:, 5] == c)[0]
        cls_boxes[c] = detections[inds, :5]
    timers['misc_bbox'].toc()

    return cls_boxes
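So cls_boxes is a per-class list: index 0 (background) stays an empty list, and cls_boxes[c] is an (N_c, 5) array of [x0, y0, x1, y1, score] rows for class c. Here is a toy reconstruction of that final conversion step (made-up detections array, small num_classes):

example_detections_to_cls_boxes.py

import numpy as np

num_classes = 3                 # toy value; the real code uses cfg.MODEL.NUM_CLASSES
detections = np.array([         # (N, 6) rows: [x0, y0, x1, y1, score, class]
    [10., 20., 110., 220., 0.98, 1.],
    [30., 40., 130., 240., 0.75, 2.],
    [50., 60., 150., 260., 0.60, 1.],
])

cls_boxes = [[] for _ in range(num_classes)]  # index 0 left empty for background
for c in range(1, num_classes):
    inds = np.where(detections[:, 5] == c)[0]
    cls_boxes[c] = detections[inds, :5]       # drop the class column

print(cls_boxes[1].shape)  # (2, 5): both class-1 detections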

Now let's go back to test.py.

test.py
    # score and boxes are from the whole image after score thresholding and nms
    # (they are not separated by class)
    # cls_boxes boxes and scores are separated by class and in the format used
    # for evaluating results
    timers['misc_bbox'].tic()
    scores, boxes, cls_boxes = box_results_with_nms_and_limit(scores, boxes)
    timers['misc_bbox'].toc()

    if cfg.MODEL.MASK_ON and boxes.shape[0] > 0:
        timers['im_detect_mask'].tic()
        if cfg.TEST.MASK_AUG.ENABLED:
            masks = im_detect_mask_aug(model, im, boxes)
        else:
            masks = im_detect_mask(model, im_scale, boxes)
        timers['im_detect_mask'].toc()

        timers['misc_mask'].tic()
        cls_segms = segm_results(
            cls_boxes, masks, boxes, im.shape[0], im.shape[1]
        )
        timers['misc_mask'].toc()
    else:
        cls_segms = None

    if cfg.MODEL.KEYPOINTS_ON and boxes.shape[0] > 0:
        timers['im_detect_keypoints'].tic()
        if cfg.TEST.KPS_AUG.ENABLED:
            heatmaps = im_detect_keypoints_aug(model, im, boxes)
        else:
            heatmaps = im_detect_keypoints(model, im_scale, boxes)
        timers['im_detect_keypoints'].toc()

        timers['misc_keypoints'].tic()
        cls_keyps = keypoint_results(cls_boxes, heatmaps, boxes)
        timers['misc_keypoints'].toc()
    else:
        cls_keyps = None

    return cls_boxes, cls_segms, cls_keyps

box: each box's position information plus the probability that it is a person
seg: empty (cls_segms is None here, since the mask branch is not taken)
key: the 2D coordinates of the joints
(segmentation: assigning a class label to each pixel)
with: https://www.sejuku.net/blog/24672
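To make the three return values concrete, here is a toy, self-contained sketch of their shapes for a keypoint model. That index 1 is the person class on COCO, and that each box gets a 4 x 17 keypoint array with rows x / y / logit / probability, are my assumptions about the layout:

example_return_values.py

import numpy as np

# Toy stand-ins for cls_boxes, cls_segms, cls_keyps (shapes follow the code above).
cls_boxes = [[] for _ in range(2)]
cls_boxes[1] = np.array([[10., 20., 110., 220., 0.98]])  # one person box
cls_segms = None                                         # mask branch not taken here
cls_keyps = [[], [np.zeros((4, 17))]]                    # 17 COCO joints (assumption)

person_boxes = cls_boxes[1]    # rows of [x0, y0, x1, y1, score]
person_kps = cls_keyps[1][0]   # 4 x 17: (x, y, logit, prob) for each joint
print(person_boxes.shape, person_kps.shape)  # (1, 5) (4, 17)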
