はじめに
posenet-pytorchやりまーす
https://github.com/rwightman/posenet-pytorch
開発環境
- Windows 10
- Python 3.9
導入
1.Anacondaで仮想環境を作成
$ conda create -n posenet-pytorch python=3.9
$ conda activate posenet-pytorch
2.ライブラリのインストール
$ pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
$ pip install requests opencv-python==4.6.0.66
PyTorchのバージョン
- torch 1.12.0+cu113
- torchvision 0.13.0+cu113
- torchaudio 0.12.0+cu116
3.実行(画像デモ)
(事前に上記のGitHubリポジトリをZIPでダウンロードし、展開しておく)
$ cd posenet-pytorch-master
imagesフォルダに画像を入れておく
$ python image_demo.py --model 101 --image_dir ./images --output_dir ./output
GPUで実行
import cv2
import time
import argparse
import os
import torch
import posenet
def _build_parser():
    """Create the command-line parser for the image demo.

    The options are declared as (flag, keyword-arguments) pairs and are
    registered in the listed order.
    """
    option_specs = (
        # Model id handed to posenet.load_model().
        ('--model', dict(type=int, default=101)),
        # Scale factor handed to posenet.read_imgfile().
        ('--scale_factor', dict(type=float, default=1.0)),
        # When set, the per-pose / per-keypoint text report is not printed.
        ('--notxt', dict(action='store_true')),
        # Folder that is scanned for input images.
        ('--image_dir', dict(type=str, default='./images')),
        # Folder that receives the annotated images.
        ('--output_dir', dict(type=str, default='./output')),
    )
    cli = argparse.ArgumentParser()
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli


parser = _build_parser()
args = parser.parse_args()
def main():
    """Run PoseNet on every image in ``args.image_dir``.

    For each image the model output is decoded into poses. When
    ``args.output_dir`` is set, the image with the skeleton overlay is
    written there under the same relative file name. Unless ``--notxt`` was
    given, the score and coordinates of every keypoint are printed. At the
    end the average number of processed images per second is printed.

    The model and the input tensors are moved to the GPU with ``.cuda()``.
    """
    model = posenet.load_model(args.model)
    model = model.cuda()
    output_stride = model.output_stride

    if args.output_dir:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(args.output_dir, exist_ok=True)

    # Compare the extension case-insensitively so that files such as
    # "IMG_0001.JPG" are not silently skipped.
    filenames = [
        f.path for f in os.scandir(args.image_dir)
        if f.is_file() and f.path.lower().endswith(('.png', '.jpg', '.jpeg'))]

    # perf_counter() is a high-resolution clock intended for measuring
    # durations.
    # NOTE(review): on GPU the first forward pass presumably includes one-time
    # warm-up cost, so the average over only a few images may be far below the
    # steady-state rate - confirm before comparing numbers.
    start = time.perf_counter()
    for f in filenames:
        input_image, draw_image, output_scale = posenet.read_imgfile(
            f, scale_factor=args.scale_factor, output_stride=output_stride)

        with torch.no_grad():
            input_image = torch.Tensor(input_image).cuda()
            # CPU variant of the line above:
            # input_image = torch.Tensor(input_image)

            heatmaps_result, offsets_result, displacement_fwd_result, displacement_bwd_result = model(input_image)

            pose_scores, keypoint_scores, keypoint_coords = posenet.decode_multiple_poses(
                heatmaps_result.squeeze(0),
                offsets_result.squeeze(0),
                displacement_fwd_result.squeeze(0),
                displacement_bwd_result.squeeze(0),
                output_stride=output_stride,
                max_pose_detections=10,
                min_pose_score=0.25)

        # Apply the scale returned by read_imgfile() to the decoded
        # coordinates (presumably mapping them back to the source image -
        # confirm against posenet.read_imgfile).
        keypoint_coords *= output_scale

        if args.output_dir:
            draw_image = posenet.draw_skel_and_kp(
                draw_image, pose_scores, keypoint_scores, keypoint_coords,
                min_pose_score=0.25, min_part_score=0.25)

            cv2.imwrite(os.path.join(args.output_dir, os.path.relpath(f, args.image_dir)), draw_image)

        if not args.notxt:
            print()
            print("Results for image: %s" % f)
            for pi, pose_score in enumerate(pose_scores):
                # A score of exactly 0 ends the report for this image.
                if pose_score == 0.:
                    break
                print('Pose #%d, score = %f' % (pi, pose_score))
                for ki, (s, c) in enumerate(zip(keypoint_scores[pi, :], keypoint_coords[pi, :, :])):
                    print('Keypoint %s, score = %f, coord = %s' % (posenet.PART_NAMES[ki], s, c))

    elapsed = time.perf_counter() - start
    # Guard the division: with no images the elapsed time can be (close to)
    # zero.
    print('Average FPS:', len(filenames) / elapsed if elapsed > 0 else 0.0)


if __name__ == "__main__":
    main()
Pose #0, score = 0.379194
Keypoint nose, score = 0.674959, coord = [101.60153786 249.57170269]
Keypoint leftEye, score = 0.782584, coord = [ 90.09467647 256.01897441]
Keypoint rightEye, score = 0.764670, coord = [ 93.39451825 240.7931246 ]
Keypoint leftEar, score = 0.314097, coord = [ 96.12480439 261.91337455]
Keypoint rightEar, score = 0.218671, coord = [ 93.97518815 227.00891266]
Keypoint leftShoulder, score = 0.442732, coord = [131.89524645 266.37186226]
Keypoint rightShoulder, score = 0.792673, coord = [124.28714146 212.55187703]
Keypoint leftElbow, score = 0.935440, coord = [188.27424116 275.69031666]
Keypoint rightElbow, score = 0.283601, coord = [160.10646086 169.00513171]
Keypoint leftWrist, score = 0.099409, coord = [148.67073327 219.6195196 ]
Keypoint rightWrist, score = 0.018433, coord = [178.70592929 142.70996855]
Keypoint leftHip, score = 0.377787, coord = [215.07498418 249.12158127]
Keypoint rightHip, score = 0.494253, coord = [208.91409478 212.95368927]
Keypoint leftKnee, score = 0.179067, coord = [257.96114013 202.6191909 ]
Keypoint rightKnee, score = 0.053346, coord = [255.18849156 166.31186308]
Keypoint leftAnkle, score = 0.007786, coord = [344.95081244 157.16637931]
Keypoint rightAnkle, score = 0.006790, coord = [306.63231759 151.46420959]
Average FPS: 0.125297575109018
4.CPUで実行するには、`model = model.cuda()` をコメントアウトし、`input_image = torch.Tensor(input_image).cuda()` を `input_image = torch.Tensor(input_image)` に変更する
Pose #0, score = 0.379194
Keypoint nose, score = 0.674959, coord = [101.60154994 249.57166841]
Keypoint leftEye, score = 0.782583, coord = [ 90.09468179 256.01896967]
Keypoint rightEye, score = 0.764670, coord = [ 93.39452115 240.79311639]
Keypoint leftEar, score = 0.314096, coord = [ 96.12480789 261.91335979]
Keypoint rightEar, score = 0.218672, coord = [ 93.97518428 227.00892123]
Keypoint leftShoulder, score = 0.442730, coord = [131.89524065 266.37183322]
Keypoint rightShoulder, score = 0.792672, coord = [124.28711632 212.55187607]
Keypoint leftElbow, score = 0.935439, coord = [188.274202 275.69030571]
Keypoint rightElbow, score = 0.283601, coord = [160.10642533 169.00512886]
Keypoint leftWrist, score = 0.099409, coord = [148.67067502 219.61951674]
Keypoint rightWrist, score = 0.018433, coord = [178.70592736 142.70997474]
Keypoint leftHip, score = 0.377784, coord = [215.07496968 249.12155937]
Keypoint rightHip, score = 0.494253, coord = [208.91408318 212.95369784]
Keypoint leftKnee, score = 0.179067, coord = [257.9611121 202.61920137]
Keypoint rightKnee, score = 0.053347, coord = [255.18835813 166.31186499]
Keypoint leftAnkle, score = 0.007786, coord = [344.95083612 157.16632932]
Keypoint rightAnkle, score = 0.006790, coord = [306.63236774 151.46421483]
Average FPS: 1.3642639493522004
5.実行(Webカメラのデモ)
python webcam_demo.py
GPUで実行
import torch
import cv2
import time
import argparse
import posenet
def _build_parser():
    """Create the command-line parser for the webcam demo.

    The options are declared as (flag, keyword-arguments) pairs and are
    registered in the listed order.
    """
    option_specs = (
        # Model id handed to posenet.load_model().
        ('--model', dict(type=int, default=101)),
        # Camera id handed to cv2.VideoCapture().
        ('--cam_id', dict(type=int, default=0)),
        # Capture width and height requested from the camera.
        ('--cam_width', dict(type=int, default=1280)),
        ('--cam_height', dict(type=int, default=720)),
        # Scale factor handed to posenet.read_cap().
        ('--scale_factor', dict(type=float, default=0.7125)),
    )
    cli = argparse.ArgumentParser()
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli


parser = _build_parser()
args = parser.parse_args()
def main():
    """Run PoseNet on webcam frames and show the result in a window.

    Frames are read from camera ``args.cam_id``, passed through the model,
    decoded into poses and displayed with the skeleton overlay in a window
    titled 'posenet'. Pressing 'q' in that window stops the loop, after which
    the average number of processed frames per second is printed.

    The model and the input tensors are moved to the GPU with ``.cuda()``.
    The capture device and the window are released even when an exception
    interrupts the loop.
    """
    model = posenet.load_model(args.model)
    model = model.cuda()
    output_stride = model.output_stride

    cap = cv2.VideoCapture(args.cam_id)
    try:
        # Named property ids instead of the bare numbers 3 and 4.
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, args.cam_width)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, args.cam_height)

        # perf_counter() is a high-resolution clock intended for measuring
        # durations.
        start = time.perf_counter()
        frame_count = 0
        while True:
            input_image, display_image, output_scale = posenet.read_cap(
                cap, scale_factor=args.scale_factor, output_stride=output_stride)

            with torch.no_grad():
                input_image = torch.Tensor(input_image).cuda()

                heatmaps_result, offsets_result, displacement_fwd_result, displacement_bwd_result = model(input_image)

                pose_scores, keypoint_scores, keypoint_coords = posenet.decode_multiple_poses(
                    heatmaps_result.squeeze(0),
                    offsets_result.squeeze(0),
                    displacement_fwd_result.squeeze(0),
                    displacement_bwd_result.squeeze(0),
                    output_stride=output_stride,
                    max_pose_detections=10,
                    min_pose_score=0.15)

            # Apply the scale returned by read_cap() to the decoded
            # coordinates (presumably mapping them back to the displayed
            # frame - confirm against posenet.read_cap).
            keypoint_coords *= output_scale

            # TODO this isn't particularly fast, use GL for drawing and display someday...
            overlay_image = posenet.draw_skel_and_kp(
                display_image, pose_scores, keypoint_scores, keypoint_coords,
                min_pose_score=0.15, min_part_score=0.1)

            cv2.imshow('posenet', overlay_image)
            frame_count += 1
            # waitKey() also lets the window process its events; 'q' quits.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

        elapsed = time.perf_counter() - start
        print('Average FPS: ', frame_count / elapsed if elapsed > 0 else 0.0)
    finally:
        # Free the camera and close the preview window on every exit path.
        cap.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    main()
Average FPS: 14.213040895239454
6.CPUで実行(画像デモと同様に編集)
Average FPS: 1.140740726339934
お疲れ様でした
TensorFlowでやる場合
conda create -n posenet-tf python=3.7
conda activate posenet-tf
pip install tensorflow-gpu==1.13.1
pip install opencv-python==3.4.5.20
pip install -U protobuf~=3.20.0
pip install scipy
pip install pyyaml==5.4.1
python webcam_demo.py
python get_test_images.py
python image_demo.py --model 101 --image_dir ./images --output_dir ./output