More than 1 year has passed since last update.

VisionFrameworkを使ったiOSアプリケーション開発(Hand Tracking編)

Last updated at 2022-09-19 / Posted at 2022-09-12

はじめに

この記事は、

の続きの記事である。
この記事では、AppleのVisionフレームワークを使い、手を認識して人差し指の位置を描画することを目的とする。

Hand Tracking

まず、vision フレームワークをインポートする。

import Vision
import UIKit
import AVFoundation

// The AVCaptureVideoDataOutputSampleBufferDelegate conformance is declared once, in the
// extension that implements captureOutput(_:didOutput:from:). Declaring it on the class
// as well would be rejected by the compiler as a redundant conformance.
class CameraViewController: UIViewController {
    // (omitted)
}

次に、HandTrackingを実装するためのリクエストクラスのインスタンスを生成する。

import Vision
import UIKit
import AVFoundation

// The AVCaptureVideoDataOutputSampleBufferDelegate conformance is declared once, in the
// extension that implements captureOutput(_:didOutput:from:), so it is not repeated here.
class CameraViewController: UIViewController {
    /// Vision request used to detect hand poses.
    /// It is a reference type, so `let` still allows configuring it (e.g. `maximumHandCount`).
    private let handPoseRequest = VNDetectHumanHandPoseRequest()
    // (omitted)
}

次に、リクエストハンドラーを使って指の座標を取得するメソッドを定義する。

    /// Entry point for hand-pose detection on a single camera frame.
    /// The body is intentionally empty at this step.
    ///
    /// - Parameters:
    ///   - sampleBuffer: The frame to analyse.
    ///   - completion: Callback typed to receive hand-pose observations.
    private func getHandObservations(
        sampleBuffer: CMSampleBuffer,
        completion: @escaping (([VNHumanHandPoseObservation]) -> ())
    ) {
    }

ハンドラーを実装する。

    /// Performs the hand-pose request on a single camera frame.
    ///
    /// - Parameters:
    ///   - sampleBuffer: The frame to analyse.
    ///   - completion: Callback typed to receive hand-pose observations (not invoked at this step).
    private func getHandObservations(sampleBuffer: CMSampleBuffer, completion: @escaping (([VNHumanHandPoseObservation]) -> ())) {
        // One handler per frame; the buffer is handed to Vision with an `.up` orientation.
        let handler = VNImageRequestHandler(cmSampleBuffer: sampleBuffer, orientation: .up, options: [:])
        do {
            try handler.perform([handPoseRequest])
        } catch {
            // Surface the failure instead of discarding it silently.
            print("Hand pose request failed: \(error)")
        }
    }

次に、指を認識出来たら指の座標を取得する。

    /// Performs the hand-pose request on a single camera frame and prints the index-finger
    /// tip when it is recognised with sufficient confidence.
    ///
    /// - Parameters:
    ///   - sampleBuffer: The frame to analyse.
    ///   - completion: Callback typed to receive hand-pose observations (not invoked at this step).
    private func getHandObservations(sampleBuffer: CMSampleBuffer, completion: @escaping (([VNHumanHandPoseObservation]) -> ())) {
        // One handler per frame; the buffer is handed to Vision with an `.up` orientation.
        let handler = VNImageRequestHandler(cmSampleBuffer: sampleBuffer, orientation: .up, options: [:])
        do {
            try handler.perform([handPoseRequest])
            // No hand in this frame: nothing to report.
            guard let observation = handPoseRequest.results?.first else { return }

            let allPoints = try observation.recognizedPoints(.all)
            // Skip frames where the index tip is missing or its confidence is 0.3 or lower.
            guard let indexTipPoint = allPoints[.indexTip], indexTipPoint.confidence > 0.3 else { return }

            // Debug output to confirm the result.
            print(indexTipPoint)
        } catch {
            // Surface the failure instead of discarding it silently.
            print("Hand pose detection failed: \(error)")
        }
    }

取得できるポイントの一覧は

で紹介されていたので、活用させてもらった。
ここで得られるポイントの座標はVision座標であることに注意しなければいけない。
描画するための座標変換は後の記事で紹介しようと思う。

上で記述したメソッドを1フレームごとに呼び出す。

extension CameraViewController: AVCaptureVideoDataOutputSampleBufferDelegate {
    /// Sample-buffer delegate callback; forwards each received frame to `getHandObservations`.
    ///
    /// The argument label must be exactly `didOutput`: with any other spelling the method
    /// does not match the delegate requirement and the capture output never calls it.
    public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        getHandObservations(sampleBuffer: sampleBuffer) { _ in
            // The observations are not used yet.
        }
    }
}

「captureOutput～」は、新しいフレームが出力されるたびに呼び出されるデリゲートメソッドである。

今回、カメラで認識する手の個数は１つを想定しているので、「maximumHandCount」を１に指定する。

    /// Configures the hand-pose request once the view has loaded.
    override func viewDidLoad() {
        super.viewDidLoad()
        // Only a single hand is expected in front of the camera.
        handPoseRequest.maximumHandCount = 1
    }

ここまでの「CameraViewController.swift」は以下のようになっている。

CameraViewController.swift

import UIKit
import AVFoundation
import Vision

/// Shows the camera preview and runs Vision hand-pose detection on the frames it receives.
class CameraViewController: UIViewController {
    /// Root view force-cast to `CameraView`; this traps if the view is of any other type.
    private var cameraView: CameraView { view as! CameraView }
    /// Queue handed to the video data output for its sample-buffer delegate callbacks.
    private let videoDataOutputQueue = DispatchQueue(label: "CameraFeedDataOutput", qos: .userInteractive)
    /// Capture session; `nil` until `setupAVSession()` has succeeded.
    private var cameraFeedSession: AVCaptureSession?
    /// Hand-pose request performed by `getHandObservations`. It is a reference type, so `let`
    /// still allows configuring it.
    private let handPoseRequest = VNDetectHumanHandPoseRequest()

    override func viewDidLoad() {
        super.viewDidLoad()
        // Only a single hand is expected in front of the camera.
        handPoseRequest.maximumHandCount = 1
    }

    /// Builds the capture session the first time the view appears, then starts it.
    /// Setup failures are shown to the user through `AppError.display`.
    override func viewDidAppear(_ animated: Bool) {
        super.viewDidAppear(animated)
        do {
            if cameraFeedSession == nil {
                cameraView.previewLayer.videoGravity = .resizeAspectFill
                try setupAVSession()
                cameraView.previewLayer.session = cameraFeedSession
            }
            // NOTE(review): startRunning() is called synchronously from viewDidAppear; Apple
            // documents it as a blocking call — consider moving it to a background queue.
            cameraFeedSession?.startRunning()
        } catch {
            AppError.display(error, inViewController: self)
        }
    }

    /// Stops the camera feed when the view is about to disappear.
    override func viewWillDisappear(_ animated: Bool) {
        cameraFeedSession?.stopRunning()
        super.viewWillDisappear(animated)
    }

    /// Creates and configures the capture session: back ultra-wide camera as input, a video
    /// data output delivering frames to this controller on `videoDataOutputQueue`.
    ///
    /// - Throws: `AppError.captureSessionSetup` when the camera is unavailable, or when the
    ///   input or output cannot be created or added to the session.
    func setupAVSession() throws {
        guard let videoDevice = AVCaptureDevice.default(.builtInUltraWideCamera, for: .video, position: .back) else {
            throw AppError.captureSessionSetup(reason: "Could not find a ultrawide back camera.")
        }

        guard let deviceInput = try? AVCaptureDeviceInput(device: videoDevice) else {
            throw AppError.captureSessionSetup(reason: "Could not create video device input.")
        }

        let session = AVCaptureSession()
        session.beginConfiguration()
        session.sessionPreset = AVCaptureSession.Preset.high

        guard session.canAddInput(deviceInput) else {
            throw AppError.captureSessionSetup(reason: "Could not add video device input to the session")
        }
        session.addInput(deviceInput)

        let dataOutput = AVCaptureVideoDataOutput()

        guard session.canAddOutput(dataOutput) else {
            throw AppError.captureSessionSetup(reason: "Could not add video data out put to the session")
        }
        session.addOutput(dataOutput)

        // Drop frames that arrive while an earlier one is still being processed.
        dataOutput.alwaysDiscardsLateVideoFrames = true
        dataOutput.videoSettings = [kCVPixelBufferPixelFormatTypeKey as String: Int(kCVPixelFormatType_420YpCbCr8BiPlanarFullRange)]
        dataOutput.setSampleBufferDelegate(self, queue: videoDataOutputQueue)

        session.commitConfiguration()
        // Publish the session only after it has been fully configured.
        cameraFeedSession = session
    }

    /// Performs the hand-pose request on a single camera frame and prints the index-finger
    /// tip when it is recognised with sufficient confidence.
    ///
    /// - Parameters:
    ///   - sampleBuffer: The frame to analyse.
    ///   - completion: Callback typed to receive hand-pose observations.
    ///     NOTE(review): it is never invoked in this listing — confirm the intended use.
    private func getHandObservations(sampleBuffer: CMSampleBuffer, completion: @escaping (([VNHumanHandPoseObservation]) -> ())) {
        // One handler per frame; the buffer is handed to Vision with an `.up` orientation.
        let handler = VNImageRequestHandler(cmSampleBuffer: sampleBuffer, orientation: .up, options: [:])
        do {
            try handler.perform([handPoseRequest])
            // No hand in this frame: nothing to report.
            guard let observation = handPoseRequest.results?.first else { return }

            let allPoints = try observation.recognizedPoints(.all)
            // Skip frames where the index tip is missing or its confidence is 0.3 or lower.
            guard let indexTipPoint = allPoints[.indexTip], indexTipPoint.confidence > 0.3 else { return }

            // Debug output to confirm the result.
            print(indexTipPoint)
        } catch {
            // Surface the failure instead of discarding it silently.
            print("Hand pose detection failed: \(error)")
        }
    }
}

// MARK: - AVCaptureVideoDataOutputSampleBufferDelegate

extension CameraViewController: AVCaptureVideoDataOutputSampleBufferDelegate {
    /// Sample-buffer delegate callback; forwards each received frame to `getHandObservations`.
    public func captureOutput(
        _ output: AVCaptureOutput,
        didOutput sampleBuffer: CMSampleBuffer,
        from connection: AVCaptureConnection
    ) {
        // The completion argument is ignored for now, so it is left unnamed.
        getHandObservations(sampleBuffer: sampleBuffer) { _ in }
    }
}

おまけ

指の認識にエラーが発生した場合の処理を記述する。

/// Variant of `getHandObservations` (a method of `CameraViewController`) that reports
/// Vision failures to the user.
///
/// - Parameters:
///   - sampleBuffer: The frame to analyse.
///   - completion: Callback typed to receive hand-pose observations.
private func getHandObservations(sampleBuffer: CMSampleBuffer, completion: @escaping (([VNHumanHandPoseObservation]) -> ())) {
    let handler = VNImageRequestHandler(cmSampleBuffer: sampleBuffer, orientation: .up, options: [:])
    do {
        // (omitted: the detection steps are unchanged)
    } catch {
        // Stop the camera feed — presumably so the same failure is not raised again for
        // every following frame.
        cameraFeedSession?.stopRunning()
        let visionError = AppError.visionError(error: error)
        // The alert is presented from the main queue.
        DispatchQueue.main.async {
            visionError.displayInViewController(self)
        }
    }
}

AppDelegate.swift に、エラー型 AppError（visionError ケースを含む）を追加する。

AppDelegate.swift

import UIKit
import Vision

@UIApplicationMain
class AppDelegate: UIResponder, UIApplicationDelegate {

    /// Returns the scene configuration named "Default Configuration" for a connecting
    /// scene session, using that session's role.
    func application(
        _ application: UIApplication,
        configurationForConnecting connectingSceneSession: UISceneSession,
        options: UIScene.ConnectionOptions
    ) -> UISceneConfiguration {
        let configuration = UISceneConfiguration(
            name: "Default Configuration",
            sessionRole: connectingSceneSession.role
        )
        return configuration
    }
}

// MARK: - Errors

/// Errors the app can show to the user, together with the helpers that present them.
enum AppError: Error {
    /// Capture-session setup failed; `reason` is used as the alert message.
    case captureSessionSetup(reason: String)
    /// Wraps an error reported as a Vision failure.
    case visionError(error: Error)
    /// Wraps any error that is not already an `AppError`.
    case otherError(error: Error)

    /// Presents `error` as an alert on `viewController`.
    /// Values that are not an `AppError` are wrapped in `.otherError` first.
    static func display(_ error: Error, inViewController viewController: UIViewController) {
        let appError = (error as? AppError) ?? .otherError(error: error)
        appError.displayInViewController(viewController)
    }

    /// Presents an alert with a single "OK" button describing this error.
    func displayInViewController(_ viewController: UIViewController) {
        // Title and message depend on the case; wrapped errors use their localized description.
        let content: (title: String?, message: String?)
        switch self {
        case let .captureSessionSetup(reason):
            content = ("AVSession Setup Error", reason)
        case let .visionError(underlying):
            content = ("Vision Error", underlying.localizedDescription)
        case let .otherError(underlying):
            content = ("Error", underlying.localizedDescription)
        }

        let alert = UIAlertController(title: content.title, message: content.message, preferredStyle: .alert)
        let dismiss = UIAlertAction(title: "OK", style: .default, handler: nil)
        alert.addAction(dismiss)

        viewController.present(alert, animated: true, completion: nil)
    }
}

ここまででカメラ画像からVisionFrameworkを使って指の座標を得ることができた。次の記事では座標変換を行い、画面に描画するところまで書こうと思う。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up

VisionFrameworkを使ったiOSアプリケーション開発(Hand Tracking編)

はじめに

Hand Tracking

おまけ

次へ