More than 3 years have passed since last update.

ドローンのカメラ画像の内容説明文と、検出した物体に関する情報、認識した人物検出人数を、リアルタイムにモニタ表示するコード

Last updated at 2021-08-11Posted at 2021-08-11

（飛行中のTelloから受信した画像 @Macbookウィンドウ（左右2分割））

今回、やったこと

(1) Telloの電源をオンにする。
(2) 次のGitHubリポジトリをgit cloneしたノートPCを、TelloにWi-Fi接続する。

（GitHub） electronicsdiy/tello-drone-image-caption-and-object-detection

(3) examplesディレクトリに移動して、Terminalで以下を実行する。

Terminal

python3 tello_camera_image_captioned_description_window.py

すると、次の画面がPCの画面のなかで立ち上がります。

ウィンドウ画面（左右2分割）の内容

左画面 ： Telloから受信した画像の内容を説明する画像キャプション文（英語）を表示
右画面 ： Telloから受信した画像に、以下の3つを重ねた表示

① Telloの現在飛行高度
② 検出された物体の輪郭と推定物体名
③ 検出された人物の人数

PCのキーボード操作によるTelloの遠隔操縦 & フレーム画像の撮影（画像ファイル保存）

【キーボード操作一覧】

i : 離陸
w: 前進
s : 後進
a : 左移動
d : 右移動
e : 時計回り30度回転
q : 反時計回り30度回転
r : 上昇
f : 降下
g : 着地
p : フレーム画像のファイル保存（※）

※ 画像ファイルは、カレントディレクトリ（examplesディレクトリ直下）に出力されます。

※ ファイル名は、frame_img_shot_XXXX年XX月XX日XX:XX:XX.jpgです。末尾は「時:分:秒」です（macOSのFinder上では「:」が「/」として表示されます）。

今回作成したスクリプト資源（一式）

GitHubリポジトリを立てました。

git cloneするか、手でzipファイルをダウンロードすると、資源をまとめて取得できます。

（GitHub） electronicsdiy/tello-drone-image-caption-and-object-detection

同梱されている他の欠かせない資源とフォルダ構成について

画面の左側に表示させるフレーム画像の説明文は、次のリポジトリの資材を利用しています。

https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning

又、画面の右側に表示されるTelloカメラ画像の受信と、MacbookのキーボードによるTelloの遠隔操作は、DJITelloPyライブラリを使いました。

https://github.com/damiafuentes/DJITelloPy

そのため、コアとなる次のソースコード以外に、前記の2つのリポジトリの資源を、所定のフォルダ構成で配置する必要があります。
必要な資材を求められるフォルダ構成で格納した資源一式は、冒頭のGitHubからgit cloneして、取得してください。

ソースコードのコア部分

common.py

import sys, cv2, math, time, datetime
import numpy as np
import matplotlib.pyplot as plt
import cvlib as cvl
from timeout_decorator import timeout, TimeoutError
from djitellopy import Tello
from cvlib.object_detection import draw_bbox
from pprint import pprint
from create_caption_text import *

tello_camera_image_captioned_description_window.py

from common import *

# Seconds to wait for a keyboard command before giving up.
TIMEOUT_SEC = 0.1


@timeout(TIMEOUT_SEC)
def input_with_timeout(msg=None):
    """Read one line from standard input, bounded by ``TIMEOUT_SEC``.

    Args:
        msg: Optional prompt text. When ``None`` no prompt is printed.

    Returns:
        The line typed by the user, as returned by ``input``.

    The ``timeout`` decorator interrupts the call once ``TIMEOUT_SEC``
    seconds have elapsed; callers are expected to catch the resulting
    ``TimeoutError``.
    """
    # input(None) would print the literal text "None" as the prompt,
    # so the no-prompt case has to call input() without an argument.
    if msg is None:
        return input()
    return input(msg)


# Distance (cm) flown and angle (degrees) turned per keyboard command.
MOVE_DISTANCE_CM = 30
ROTATE_ANGLE_DEG = 30
# Object label whose detections are counted and overlaid on the camera image.
TARGET_LABEL = "person"
# Enlargement factor applied to the merged two-pane image before display.
DISPLAY_SCALE = 1.7
WINDOW_NAME = "Video"

tello = Tello()
tello.connect()

try:
    tello.streamon()
    frame_read = tello.get_frame_read()

    # Keyboard command -> drone action. "p" (save the frame) is handled
    # separately because it needs the image of the current iteration.
    flight_commands = {
        "i": tello.takeoff,
        "w": lambda: tello.move_forward(MOVE_DISTANCE_CM),
        "s": lambda: tello.move_back(MOVE_DISTANCE_CM),
        "a": lambda: tello.move_left(MOVE_DISTANCE_CM),
        "d": lambda: tello.move_right(MOVE_DISTANCE_CM),
        "e": lambda: tello.rotate_clockwise(ROTATE_ANGLE_DEG),
        "q": lambda: tello.rotate_counter_clockwise(ROTATE_ANGLE_DEG),
        "r": lambda: tello.move_up(MOVE_DISTANCE_CM),
        "f": lambda: tello.move_down(MOVE_DISTANCE_CM),
        "g": tello.land,
    }

    # The window only needs to be created once, not on every frame.
    cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)

    while True:
        frame = frame_read.frame

        # Work on a copy so the caption model below still sees the raw frame.
        annotated_img = frame.copy()

        # Draw detection boxes and overlay the number of detected persons.
        bbox, label, conf = cvl.detect_common_objects(annotated_img)
        annotated_img = draw_bbox(annotated_img, bbox, label, conf)
        input_text_0 = "Num of detected {0}(s) is {1}".format(TARGET_LABEL, str(label.count(TARGET_LABEL)))
        cv2.putText(annotated_img, input_text_0, (0, 50), cv2.FONT_HERSHEY_TRIPLEX, 1, (0, 0, 255), 1, cv2.LINE_AA)

        # Telemetry read from the drone: ToF sensor distance and height.
        tof_distance_cm = tello.get_distance_tof()
        input_text_1 = "ToF Distane {0} cm".format(tof_distance_cm)

        drone_height_cm = tello.get_height()
        input_text_2 = "Height {0} cm".format(drone_height_cm)

        # Pixel size of the camera pane; kept separate from the drone height.
        frame_height = annotated_img.shape[0]
        frame_width = annotated_img.shape[1]
        print("height {0}".format(frame_height))
        print("width {0}".format(frame_width))
        # Echo the overlay texts to the terminal.
        print(input_text_0)
        print(input_text_1)
        print(input_text_2)

        # Overlay the telemetry texts on the camera image.
        cv2.putText(annotated_img, input_text_1, (0, 100), cv2.FONT_HERSHEY_TRIPLEX, 1, (255, 255, 255), 1, cv2.LINE_AA)
        cv2.putText(annotated_img, input_text_2, (0, 150), cv2.FONT_HERSHEY_TRIPLEX, 1, (255, 255, 255), 1, cv2.LINE_AA)

        caption_text_list = create_caption_text_str_list(frame)

        print("\n現在のフレーム画像の状況説明文")
        # Each list element is one wrapped line of the caption sentence.
        pprint(caption_text_list)
        print("====================================")

        # Black pane of the same size as the camera pane for the caption text.
        # np.zeros takes the shape as (height, width, channels).
        # References:
        #   https://qiita.com/daxanya1/items/85f5e17ecc1203f756ad
        #   https://qiita.com/tifa2chan/items/78d4af969cfa837fa988
        caption_panel = np.zeros((frame_height, frame_width, 3), np.uint8)

        for i, caption_sentence in enumerate(caption_text_list):
            cv2.putText(caption_panel, str(caption_sentence), (50, 80+i*70), cv2.FONT_HERSHEY_TRIPLEX, 1, (255, 255, 255), 1, cv2.LINE_AA)

        # Caption pane on the left, annotated camera pane on the right.
        merged_image_group = cv2.hconcat((caption_panel, annotated_img))

        merged_height = merged_image_group.shape[0]
        merged_width = merged_image_group.shape[1]
        resized_output_img = cv2.resize(
            merged_image_group,
            (int(DISPLAY_SCALE*merged_width), int(DISPLAY_SCALE*merged_height)),
        )

        cv2.imshow(WINDOW_NAME, resized_output_img)

        # Original author's note: without this call the images stop arriving.
        # The returned key code is not used; commands are read from the
        # terminal below.
        cv2.waitKey(1)

        # Only the prompt can time out, so only the prompt is guarded.
        try:
            msg = input_with_timeout('\n{}秒以内に操作コマンドを入力して下さい :'.format(TIMEOUT_SEC))
        except TimeoutError:
            print('\n操作コマンド入力時間切れ。次のフレーム画像を読み込みます。\n')
            continue

        print('\n操作コマンド：　{} を受信しました。\n'.format(msg))
        if msg == "p":
            # Save the currently displayed two-pane image with a timestamp.
            dt_now = datetime.datetime.now()
            timestamp_str = dt_now.strftime('%Y年%m月%d日%H:%M:%S')
            file_name = "frame_img_shot_{0}.jpg".format(timestamp_str)
            cv2.imwrite(file_name, resized_output_img)
            print("フレーム画像を保存しました。")
        elif msg in flight_commands:
            flight_commands[msg]()
except KeyboardInterrupt:
    # Ctrl-C is the only way out of the loop; fall through to the cleanup.
    print('\nCtrl+Cを受信しました。終了処理を行います。\n')
finally:
    # Runs on Ctrl-C and on any error. The original trailing tello.land()
    # sat after an endless loop and could never execute.
    cv2.destroyAllWindows()
    # Tello.end() is DJITelloPy's shutdown call (lands if still flying and
    # stops the video stream, per its documentation).
    tello.end()

create_caption_text.py

import torch
# import matplotlib.pyplot as plt
import numpy as np 
import argparse
import pickle 
import os
import cv2
import textwrap
import datetime
import locale
import numpy as np
from torchvision import transforms 
from build_vocab import Vocabulary
from model import EncoderCNN, DecoderRNN
from PIL import Image

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Lazily initialised (transform, vocab, encoder, decoder) shared by all calls.
_caption_resources = None


def _load_caption_resources():
    """Build the preprocessing pipeline, vocabulary and models exactly once.

    Returns:
        A ``(transform, vocab, encoder, decoder)`` tuple. The first call reads
        the vocabulary and checkpoint files from paths relative to the current
        working directory; later calls return the cached tuple.
    """
    global _caption_resources
    if _caption_resources is not None:
        return _caption_resources

    vocab_path = 'data/vocab.pkl'
    encoder_path = 'models/encoder-5-3000.pkl'
    decoder_path = 'models/decoder-5-3000.pkl'
    embed_size = 256
    hidden_size = 512
    num_layers = 1

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    # NOTE: pickle.load can execute arbitrary code; only use a trusted file.
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters. map_location lets checkpoints that
    # were saved on another device be loaded on the device selected above.
    encoder.load_state_dict(torch.load(encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(decoder_path, map_location=device))

    _caption_resources = (transform, vocab, encoder, decoder)
    return _caption_resources


def create_caption_text_str_list(image_data):
    """Generate a caption for one image and return it as wrapped text lines.

    Args:
        image_data: Array passed to ``PIL.Image.fromarray``.
            NOTE(review): ``Image.fromarray`` treats a 3-channel array as RGB;
            if the caller passes an OpenCV BGR frame the colour channels are
            swapped for the model - confirm against the frame source.

    Returns:
        list[str]: The caption sentence wrapped at 40 characters per line.
    """
    # The models are loaded once and reused; this function is called per frame.
    transform, vocab, encoder, decoder = _load_caption_resources()

    # Prepare an image
    image = Image.fromarray(image_data)
    image = image.resize([224, 224], Image.LANCZOS)
    image_tensor = transform(image).unsqueeze(0).to(device)

    # Generate a caption from the image (inference only, no gradients needed)
    with torch.no_grad():
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words, stopping at the end-of-sentence token
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    sentence = sentence.replace("<start>", "")
    sentence = sentence.replace("<end>", "")

    # Split the sentence into display lines of at most 40 characters
    caption_text_wrap_list = textwrap.wrap(str(sentence), 40)

    return caption_text_wrap_list

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up