The parent article is "Trying to understand VideoPose3D (notes)".
I've hopped over here for a moment to understand infer_simple.py, the script in VideoPose3D that uses Detectron's functionality.
In this post I want to get a rough understanding of the functions called from infer_simple.py.
What this article mainly looks at is cls_boxes, one of the return values of im_detect_all.
I assume it holds the corner coordinates of each box obtained by detecting a person, plus a score giving the confidence. Let's verify that.
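Concretely, I expect something shaped like the following (a hypothetical illustration, not output from the actual code; in COCO ordering, class 1 is "person"):

```python
import numpy as np

# Expected shape of cls_boxes: a list indexed by class id, where each entry
# is an (N, 5) array whose rows are [x1, y1, x2, y2, score].
num_classes = 81  # COCO: background + 80 object classes
cls_boxes = [np.empty((0, 5), dtype=np.float32) for _ in range(num_classes)]

# One made-up person detection:
cls_boxes[1] = np.array([[50., 30., 210., 400., 0.98]], dtype=np.float32)

print(cls_boxes[1][:, :4])  # the box corners
print(cls_boxes[1][:, 4])   # the confidence score
```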
# Understanding im_detect_all
The branches inside this function are driven mainly by the boolean flags of args.cfg (= e2e_keypoint_rcnn_R-101-FPN_s1x.yaml), the config given to this code.
I went through the code and commented out the branches that this config never takes.
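Before reading the function, here is roughly how those flags get set (a sketch: the import names follow the Detectron repo, and the yaml path is the one from Detectron's configs directory, which may differ from your local layout):

```python
# Sketch: load the yaml on top of Detectron's defaults, then check the flags
# that decide which branches below are taken.
from detectron.core.config import cfg, merge_cfg_from_file

merge_cfg_from_file('configs/12_2017_baselines/e2e_keypoint_rcnn_R-101-FPN_s1x.yaml')
print(cfg.MODEL.KEYPOINTS_ON)      # True for this keypoint model
print(cfg.RETINANET.RETINANET_ON)  # False, so the RetinaNet branch is skipped
print(cfg.TEST.BBOX_AUG.ENABLED)   # False, so the plain im_detect_bbox is used
```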
```python
def im_detect_all(model, im, box_proposals, timers=None):
    if timers is None:
        timers = defaultdict(Timer)

    # Handle RetinaNet testing separately for now
    # if cfg.RETINANET.RETINANET_ON:
    #     cls_boxes = test_retinanet.im_detect_bbox(model, im, timers)
    #     return cls_boxes, None, None

    timers['im_detect_bbox'].tic()
    # if cfg.TEST.BBOX_AUG.ENABLED:
    #     scores, boxes, im_scale = im_detect_bbox_aug(model, im, box_proposals)
    # else:
    scores, boxes, im_scale = im_detect_bbox(
        model, im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE, boxes=box_proposals
    )
    timers['im_detect_bbox'].toc()
```
Note that here cfg.TEST.SCALE = 800 and cfg.TEST.MAX_SIZE = 1333.
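These two values implement the usual Detectron resize rule: scale the shorter image side to 800 pixels, but never let the longer side exceed 1333. A minimal sketch of that rule (compute_im_scale is a hypothetical helper mirroring what blob_utils.get_image_blob does internally):

```python
def compute_im_scale(height, width, target_size=800, max_size=1333):
    # Scale so the shorter side hits target_size...
    im_size_min = min(height, width)
    im_size_max = max(height, width)
    im_scale = float(target_size) / float(im_size_min)
    # ...but cap the longer side at max_size.
    if round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    return im_scale

print(compute_im_scale(480, 640))    # 1.666... (shorter side 480 -> 800)
print(compute_im_scale(1080, 1920))  # 0.694... (longer side capped at 1333)
```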
Let's take a short detour into im_detect_bbox as defined in detectron/core/test_retinanet.py. (Strictly speaking, since RETINANET.RETINANET_ON is False for this config, the call above goes to the im_detect_bbox in detectron/core/test.py; the RetinaNet variant below is still a convenient place to watch cls_boxes being built.)
(It's long.)
```python
def im_detect_bbox(model, im, timers=None):
    """Generate RetinaNet detections on a single image."""
    if timers is None:
        timers = defaultdict(Timer)
    # Although anchors are input independent and could be precomputed,
    # recomputing them per image only brings a small overhead
    anchors = _create_cell_anchors()
    timers['im_detect_bbox'].tic()
    k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL
    A = cfg.RETINANET.SCALES_PER_OCTAVE * len(cfg.RETINANET.ASPECT_RATIOS)
    inputs = {}
    inputs['data'], im_scale, inputs['im_info'] = \
        blob_utils.get_image_blob(im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE)
    cls_probs, box_preds = [], []
    for lvl in range(k_min, k_max + 1):
        suffix = 'fpn{}'.format(lvl)
        cls_probs.append(core.ScopedName('retnet_cls_prob_{}'.format(suffix)))
        box_preds.append(core.ScopedName('retnet_bbox_pred_{}'.format(suffix)))
    for k, v in inputs.items():
        workspace.FeedBlob(core.ScopedName(k), v.astype(np.float32, copy=False))

    workspace.RunNet(model.net.Proto().name)
    cls_probs = workspace.FetchBlobs(cls_probs)
    box_preds = workspace.FetchBlobs(box_preds)

    # here the boxes_all are [x0, y0, x1, y1, score]
    boxes_all = defaultdict(list)
```
On boxes_all, the comment says **here the boxes_all are [x0, y0, x1, y1, score]**.
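As a toy illustration of how this defaultdict will accumulate such rows per class across FPN levels (made-up numbers; class 1 = person):

```python
from collections import defaultdict
import numpy as np

boxes_all = defaultdict(list)  # class id -> list of [x0, y0, x1, y1, score] rows
boxes_all[1].extend(np.array([[10., 10., 50., 80., 0.9]]))  # from one FPN level
boxes_all[1].extend(np.array([[12., 11., 52., 82., 0.7]]))  # from another level

for cls, boxes in boxes_all.items():
    cls_dets = np.vstack(boxes).astype(np.float32)
    print(cls, cls_dets.shape)  # -> 1 (2, 5)
```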
Just a little further to our target cls_boxes! From here, let's keep an eye on detections.
```python
    cnt = 0
    for lvl in range(k_min, k_max + 1):
        # create cell anchors array
        stride = 2. ** lvl
        cell_anchors = anchors[lvl]

        # fetch per level probability
        cls_prob = cls_probs[cnt]
        box_pred = box_preds[cnt]
        cls_prob = cls_prob.reshape((
            cls_prob.shape[0], A, int(cls_prob.shape[1] / A),
            cls_prob.shape[2], cls_prob.shape[3]))
        box_pred = box_pred.reshape((
            box_pred.shape[0], A, 4, box_pred.shape[2], box_pred.shape[3]))
        cnt += 1

        # if cfg.RETINANET.SOFTMAX:
        #     cls_prob = cls_prob[:, :, 1::, :, :]

        cls_prob_ravel = cls_prob.ravel()
        # In some cases [especially for very small img sizes], it's possible that
        # candidate_ind is empty if we impose threshold 0.05 at all levels. This
        # will lead to errors since no detections are found for this image. Hence,
        # for lvl 7 which has small spatial resolution, we take the threshold 0.0
        th = cfg.RETINANET.INFERENCE_TH if lvl < k_max else 0.0
        candidate_inds = np.where(cls_prob_ravel > th)[0]
        if (len(candidate_inds) == 0):
            continue

        pre_nms_topn = min(cfg.RETINANET.PRE_NMS_TOP_N, len(candidate_inds))
        inds = np.argpartition(
            cls_prob_ravel[candidate_inds], -pre_nms_topn)[-pre_nms_topn:]
        inds = candidate_inds[inds]

        inds_5d = np.array(np.unravel_index(inds, cls_prob.shape)).transpose()
        classes = inds_5d[:, 2]
        anchor_ids, y, x = inds_5d[:, 1], inds_5d[:, 3], inds_5d[:, 4]
        scores = cls_prob[:, anchor_ids, classes, y, x]

        boxes = np.column_stack((x, y, x, y)).astype(dtype=np.float32)
        boxes *= stride
        boxes += cell_anchors[anchor_ids, :]

        if not cfg.RETINANET.CLASS_SPECIFIC_BBOX:
            box_deltas = box_pred[0, anchor_ids, :, y, x]
        else:
            box_cls_inds = classes * 4
            box_deltas = np.vstack(
                [box_pred[0, ind:ind + 4, yi, xi]
                 for ind, yi, xi in zip(box_cls_inds, y, x)]
            )
        pred_boxes = (
            box_utils.bbox_transform(boxes, box_deltas)
            if cfg.TEST.BBOX_REG else boxes)
        pred_boxes /= im_scale
        pred_boxes = box_utils.clip_tiled_boxes(pred_boxes, im.shape)
        box_scores = np.zeros((pred_boxes.shape[0], 5))
        box_scores[:, 0:4] = pred_boxes
        box_scores[:, 4] = scores

        for cls in range(1, cfg.MODEL.NUM_CLASSES):
            inds = np.where(classes == cls - 1)[0]
            if len(inds) > 0:
                boxes_all[cls].extend(box_scores[inds, :])
    timers['im_detect_bbox'].toc()

    # Combine predictions across all levels and retain the top scoring by class
    timers['misc_bbox'].tic()
    # detections is defined here
    detections = []
    for cls, boxes in boxes_all.items():
        cls_dets = np.vstack(boxes).astype(dtype=np.float32)
```
A question arises here: what does numpy's vstack function actually do?

### What is np.vstack?

It is a function that concatenates arrays vertically (row-wise).

```
In [1]: import numpy as np
In [2]: a = np.arange(12).reshape(-1, 1)  # column vector with 12 elements
In [3]: b = np.arange(2).reshape(-1, 1)   # column vector with 2 elements
In [4]: a
Out[4]:
array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11]])
In [5]: np.vstack((a, b))  # b's rows are appended below a's
Out[5]:
array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11],
       [ 0],
       [ 1]])
```

That's all there is to it.
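While we are on numpy: the candidate selection earlier also relied on np.argpartition (top-k without a full sort) and np.unravel_index (flat index back to multi-dimensional coordinates). A quick demo:

```python
import numpy as np

scores = np.array([[0.1, 0.9],
                   [0.7, 0.3]])
flat = scores.ravel()  # [0.1, 0.9, 0.7, 0.3]

# Indices of the top-2 values, in no particular order, without sorting everything:
top2 = np.argpartition(flat, -2)[-2:]
print(flat[top2])  # 0.7 and 0.9, possibly swapped

# Map the flat indices back to (row, column) pairs, like inds_5d above:
print(np.array(np.unravel_index(top2, scores.shape)).transpose())
# 0.7 lives at (1, 0) and 0.9 at (0, 1)
```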
```python
        # do class specific nms here
        # if cfg.TEST.SOFT_NMS.ENABLED:
        #     cls_dets, keep = box_utils.soft_nms(
        #         cls_dets,
        #         sigma=cfg.TEST.SOFT_NMS.SIGMA,
        #         overlap_thresh=cfg.TEST.NMS,
        #         score_thresh=0.0001,
        #         method=cfg.TEST.SOFT_NMS.METHOD
        #     )
        # else:
        keep = box_utils.nms(cls_dets, cfg.TEST.NMS)
        cls_dets = cls_dets[keep, :]
        out = np.zeros((len(keep), 6))
        out[:, 0:5] = cls_dets
        out[:, 5].fill(cls)
        detections.append(out)

    # detections (N, 6) format:
    #   detections[:, :4] - boxes
    #   detections[:, 4]  - scores
    #   detections[:, 5]  - classes
    detections = np.vstack(detections)
    # sort all again
    inds = np.argsort(-detections[:, 4])
    detections = detections[inds[0:cfg.TEST.DETECTIONS_PER_IM], :]

    # Convert the detections to image cls_ format (see core/test_engine.py)
    num_classes = cfg.MODEL.NUM_CLASSES
    # make a list of empty lists
    cls_boxes = [[] for _ in range(cfg.MODEL.NUM_CLASSES)]
    # index 0 (background) is left empty
    for c in range(1, num_classes):
        inds = np.where(detections[:, 5] == c)[0]
        cls_boxes[c] = detections[inds, :5]
    timers['misc_bbox'].toc()
    return cls_boxes
```
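Before heading back, a note on box_utils.nms: it keeps the highest-scoring box and discards any remaining box whose IoU with it exceeds cfg.TEST.NMS, repeating until no boxes are left. A minimal pure-numpy sketch of that greedy procedure (not Detectron's actual Cython implementation):

```python
import numpy as np

def greedy_nms(dets, thresh):
    """dets: (N, 5) array of [x1, y1, x2, y2, score]; returns kept indices."""
    x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = dets[:, 4].argsort()[::-1]  # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(int(i))
        # Intersection of the kept box with all remaining boxes
        w = np.maximum(0.0, np.minimum(x2[i], x2[order[1:]]) -
                            np.maximum(x1[i], x1[order[1:]]) + 1)
        h = np.maximum(0.0, np.minimum(y2[i], y2[order[1:]]) -
                            np.maximum(y1[i], y1[order[1:]]) + 1)
        iou = w * h / (areas[i] + areas[order[1:]] - w * h)
        order = order[1:][iou <= thresh]  # drop heavy overlaps
    return keep

dets = np.array([[0., 0., 10., 10., 0.9],
                 [1., 1., 11., 11., 0.8],    # overlaps the first box heavily
                 [20., 20., 30., 30., 0.7]])
print(greedy_nms(dets, 0.5))  # -> [0, 2]
```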
Now let's head back to im_detect_all.
```python
    # score and boxes are from the whole image after score thresholding and nms
    # (they are not separated by class)
    # cls_boxes boxes and scores are separated by class and in the format used
    # for evaluating results
    timers['misc_bbox'].tic()
    scores, boxes, cls_boxes = box_results_with_nms_and_limit(scores, boxes)
    timers['misc_bbox'].toc()

    if cfg.MODEL.MASK_ON and boxes.shape[0] > 0:
        timers['im_detect_mask'].tic()
        if cfg.TEST.MASK_AUG.ENABLED:
            masks = im_detect_mask_aug(model, im, boxes)
        else:
            masks = im_detect_mask(model, im_scale, boxes)
        timers['im_detect_mask'].toc()

        timers['misc_mask'].tic()
        cls_segms = segm_results(
            cls_boxes, masks, boxes, im.shape[0], im.shape[1]
        )
        timers['misc_mask'].toc()
    else:
        cls_segms = None

    if cfg.MODEL.KEYPOINTS_ON and boxes.shape[0] > 0:
        timers['im_detect_keypoints'].tic()
        if cfg.TEST.KPS_AUG.ENABLED:
            heatmaps = im_detect_keypoints_aug(model, im, boxes)
        else:
            heatmaps = im_detect_keypoints(model, im_scale, boxes)
        timers['im_detect_keypoints'].toc()

        timers['misc_keypoints'].tic()
        cls_keyps = keypoint_results(cls_boxes, heatmaps, boxes)
        timers['misc_keypoints'].toc()
    else:
        cls_keyps = None

    return cls_boxes, cls_segms, cls_keyps
```
box: the box coordinates and the confidence score for the person class
seg: nothing in it (cls_segms is None, since MASK_ON is False for this config)
key: the 2D coordinates of the joints
(segmentation: assigning a class label to every pixel)
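Putting it together, downstream code (like infer_simple.py or VideoPose3D's data preparation) can pick out the person detections and their joints roughly like this (a sketch with dummy data shaped like the real outputs; the 4 x 17 keypoint layout (x, y, logit, prob) follows Detectron's COCO keypoint output, and the 0.9 threshold is my own choice):

```python
import numpy as np

num_classes = 81  # COCO ordering; class 1 is "person"
cls_boxes = [np.empty((0, 5), np.float32) for _ in range(num_classes)]
cls_boxes[1] = np.array([[50., 30., 210., 400., 0.98],
                         [300., 40., 420., 390., 0.35]], np.float32)
cls_keyps = [[] for _ in range(num_classes)]
cls_keyps[1] = [np.zeros((4, 17), np.float32) for _ in range(2)]  # one per box

keep = cls_boxes[1][:, 4] > 0.9       # keep only confident person boxes
print(cls_boxes[1][keep, :4])         # [x1, y1, x2, y2] of each kept person
print(cls_keyps[1][0][:2].T.shape)    # (17, 2): 2D joint coordinates
```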
Reference: https://www.sejuku.net/blog/24672