More than 3 years have passed since last update.

Vision.frameworkでカメラ画像のテキスト認識を行う

Last updated at 2020-05-23 / Posted at 2019-12-22

前回の記事では、 Vision.framework をつかって顔認識を行いました。
今度はテキスト認識をやってみます。
ちなみに、テキストの文字認識はiOS13からの機能みたいです。

概要

カメラ画像からテキストを検出し、テキスト部分に矩形を表示。
さらにその部分に検出したテキストを出力します。

現在のところ、対応言語が英語のみのようです。
また、今回のサンプルでは端末を横にしないと、文字をうまく認識しません。

試した環境

Xcode 11.3
iOS 13.2
swift 5

実行サンプル

https://www.youtube.com/watch?v=1sll_scS0UI

動画なので速度を出すため、認識精度を落として確認しています。
画面をアップにするとそこそこの精度は出ていそうです。

検証に使ったサイトのURLは以下です。
https://en.wikipedia.org/wiki/Apple

コード説明

手順的には、前回の顔認識とほぼ同じで、リクエストを VNDetectFaceRectanglesRequest から VNRecognizeTextRequest に変更します。
VNRecognizeTextRequest で画像から検出した文字情報を [VNRecognizedTextObservation] として受け取ります。
ちなみに、 VNDetectTextRectanglesRequest でも文字の矩形取得はできるのですが、こちらの場合文字情報を取得することができません。

また、リクエストにプロパティを設定することで、文字取得条件を変更できます。

recognitionLevel = 文字の認識精度の設定。 fast と accurate があり、動画で検出する場合は fast が良いみたい
recognitionLanguages = 認識する言語。現在は英語のみ。
usesLanguageCorrection = 認識した文字を自動修正する機能。スペルミス防止などにつかえそう？

    /// Runs a Vision text-recognition request on `pixelBuffer` and passes the result to `completion`.
    ///
    /// The request is performed with a plain (blocking) call to `VNImageRequestHandler.perform`;
    /// nothing in this method dispatches work to another queue.
    ///
    /// - Parameters:
    ///   - pixelBuffer: The image to analyze.
    ///   - completion: Receives the recognized-text observations, or an empty array when the
    ///     request's results cannot be read as `[VNRecognizedTextObservation]`.
    private func getTextObservations(pixelBuffer: CVPixelBuffer, completion: @escaping (([VNRecognizedTextObservation])->())) {
        let request = VNRecognizeTextRequest { (request, _) in
            guard let results = request.results as? [VNRecognizedTextObservation] else {
                completion([])
                return
            }
            completion(results)
        }

        request.recognitionLevel = recognitionLevel
        request.recognitionLanguages = supportedRecognitionLanguages
        request.usesLanguageCorrection = true

        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
        do {
            try handler.perform([request])
        } catch {
            // Previously swallowed with `try?`; surface the failure so it can be diagnosed.
            // `completion` is deliberately not called here to avoid invoking it twice if
            // Vision has already reported the failure through the request's handler.
            print("Text recognition request failed: \(error)")
        }
    }

サポートしている言語一覧は以下手続きで取得。

    /// Languages Vision reports as supported for `recognitionLevel` at request revision 1.
    ///
    /// Evaluated once on first access. Falls back to an empty array when the query throws.
    /// (At the time the article was written this list reportedly contained English only.)
    private lazy var supportedRecognitionLanguages : [String] = {
        let queried = try? VNRecognizeTextRequest.supportedRecognitionLanguages(
            for: recognitionLevel,
            revision: VNRecognizeTextRequestRevision1
        )
        return queried ?? []
    }()

文字取得とは直接関係ありませんが、画面上に取得した文字を表示するため、
CGContext に文字を書き込んでいます。
普通に書き込むと座標系の関係で文字列が逆転してしまうみたいで、
何気にここが一番面倒くさい処理となりました・・・

   /// Outlines `rect` in green, shades it with translucent black, then draws `text` inside it.
    ///
    /// - Parameters:
    ///   - rect: The rectangle to draw, in the coordinate space of `context`.
    ///   - text: The string handed to `drawText` for rendering inside `rect`.
    ///   - context: The graphics context to draw into.
    private func drawRect(_ rect: CGRect, text: String, context: CGContext) {
        let outlineColor = UIColor.green.cgColor
        let shadeColor = UIColor.black.withAlphaComponent(0.6).cgColor

        // Configure all drawing state up front.
        context.setStrokeColor(outlineColor)
        context.setLineWidth(4.0)
        context.setFillColor(shadeColor)

        // The outline is stroked before the fill is painted.
        context.stroke(rect)
        context.fill(rect)

        drawText(text, rect: rect, context: context)
    }

    /// Draws `text` inside `rect` using Core Text, in bold 20-pt white system font.
    ///
    /// No coordinate flip is applied: the previous code concatenated
    /// `CGAffineTransform(scaleX: 1, y: 1)`, which is the identity transform and had no effect,
    /// so it has been removed. Only the text matrix is reset to identity before drawing.
    ///
    /// - Parameters:
    ///   - text: The string to render.
    ///   - rect: The rectangle used as the Core Text frame path.
    ///   - context: The graphics context to draw into. Its graphics state is saved on entry
    ///     and restored on exit.
    private func drawText(_ text: String, rect: CGRect, context: CGContext) {

        context.saveGState()
        defer {
            context.restoreGState()
        }

        guard let textStyle = NSMutableParagraphStyle.default.mutableCopy() as? NSMutableParagraphStyle else {
            return
        }
        let textFontAttributes: [NSAttributedString.Key: Any] = [
            .font: UIFont.boldSystemFont(ofSize: 20),
            .foregroundColor: UIColor.white,
            .paragraphStyle: textStyle
        ]

        let attributedText = NSAttributedString(string: text, attributes: textFontAttributes)
        let framesetter = CTFramesetterCreateWithAttributedString(attributedText)
        let framePath = CGPath(rect: rect, transform: nil)
        // A zero-length range asks the framesetter to lay out as much of the string as fits.
        let textFrame = CTFramesetterCreateFrame(framesetter, CFRange(), framePath, nil)

        context.textMatrix = .identity
        CTFrameDraw(textFrame, context)
    }

VNRecognizedTextObservation から文字や検出範囲を取得する手続きは以下部分です。
topCandidates に検出文字候補が評価が高い順に入ってるみたいなので、一番最初の物を決め打ちで取るようにしています。

        for observation in textObservations {
            // Ask for a single candidate and take it; fall back to an empty string if there is none.
            let recognizedString = observation.topCandidates(1).first?.string ?? ""
            // boundingBox is normalized, so scale it up to the image's pixel dimensions.
            let pixelRect = getUnfoldRect(normalizedRect: observation.boundingBox, targetSize: imageSize)
            drawRect(pixelRect, text: recognizedString, context: newContext)
        }

コード全体

import UIKit
import AVFoundation
import Vision

/// Shows camera frames in `previewImageView` with every piece of recognized text boxed and
/// overlaid with the recognized string.
///
/// Frames arrive through `AVCaptureVideoDataOutput`, are analyzed with `VNRecognizeTextRequest`,
/// and the overlay is drawn directly into each frame's pixel buffer.
class TextObservationViewController: UIViewController {

    @IBOutlet weak var previewImageView: UIImageView!

    private let avCaptureSession = AVCaptureSession()

    /// Serial queue on which sample buffers are delivered. A serial queue keeps frames in order
    /// and ensures the lazily initialized `supportedRecognitionLanguages` is not raced by
    /// concurrent frame callbacks.
    private let videoOutputQueue = DispatchQueue(label: "TextObservationViewController.videoOutput")

    /// Recognition level used for every request. `.fast` is chosen because frames are
    /// processed as they arrive from the camera.
    private let recognitionLevel: VNRequestTextRecognitionLevel = .fast

    /// Languages Vision reports as supported for `recognitionLevel` at request revision 1.
    ///
    /// Evaluated once on first access. Falls back to an empty array when the query throws.
    /// (At the time the article was written this list reportedly contained English only.)
    private lazy var supportedRecognitionLanguages: [String] = {
        let queried = try? VNRecognizeTextRequest.supportedRecognitionLanguages(
            for: recognitionLevel,
            revision: VNRecognizeTextRequestRevision1
        )
        return queried ?? []
    }()

    override func viewDidLoad() {
        super.viewDidLoad()
        setupCamera()
    }

    override func viewDidDisappear(_ animated: Bool) {
        super.viewDidDisappear(animated)
        avCaptureSession.stopRunning()
    }

    /// Configures the capture session with the default video device and a BGRA video data
    /// output, then starts it.
    ///
    /// Returns without starting the session (after logging) when no video device exists,
    /// the device input cannot be created, or the session refuses the input or output.
    private func setupCamera() {
        avCaptureSession.sessionPreset = .photo

        // Avoid crashing where no camera is available (e.g. the simulator).
        guard let device = AVCaptureDevice.default(for: .video) else {
            print("TextObservationViewController: no video capture device available")
            return
        }

        let input: AVCaptureDeviceInput
        do {
            input = try AVCaptureDeviceInput(device: device)
        } catch {
            print("TextObservationViewController: failed to create camera input: \(error)")
            return
        }

        guard avCaptureSession.canAddInput(input) else {
            print("TextObservationViewController: capture session rejected the camera input")
            return
        }
        avCaptureSession.addInput(input)

        let videoDataOutput = AVCaptureVideoDataOutput()
        // 32BGRA matches the bitmap layout used when wrapping the buffer in a CGContext below.
        videoDataOutput.videoSettings = [kCVPixelBufferPixelFormatTypeKey as String : Int(kCVPixelFormatType_32BGRA)]
        videoDataOutput.alwaysDiscardsLateVideoFrames = true
        videoDataOutput.setSampleBufferDelegate(self, queue: videoOutputQueue)

        guard avCaptureSession.canAddOutput(videoDataOutput) else {
            print("TextObservationViewController: capture session rejected the video data output")
            return
        }
        avCaptureSession.addOutput(videoDataOutput)

        // NOTE(review): startRunning() blocks until the session has started; consider moving
        // it off the main thread.
        avCaptureSession.startRunning()
    }

    /// Outlines `rect` in green, shades it with translucent black, then draws `text` inside it.
    ///
    /// - Parameters:
    ///   - rect: The rectangle to draw, in the coordinate space of `context`.
    ///   - text: The string rendered inside `rect`.
    ///   - context: The graphics context to draw into.
    private func drawRect(_ rect: CGRect, text: String, context: CGContext) {
        context.setLineWidth(4.0)
        context.setStrokeColor(UIColor.green.cgColor)
        context.stroke(rect)
        context.setFillColor(UIColor.black.withAlphaComponent(0.6).cgColor)
        context.fill(rect)

        drawText(text, rect: rect, context: context)
    }

    /// Draws `text` inside `rect` using Core Text, in bold 20-pt white system font.
    ///
    /// No coordinate flip is applied: the previous code concatenated
    /// `CGAffineTransform(scaleX: 1, y: 1)`, which is the identity transform and had no effect,
    /// so it has been removed. Only the text matrix is reset to identity before drawing.
    ///
    /// - Parameters:
    ///   - text: The string to render.
    ///   - rect: The rectangle used as the Core Text frame path.
    ///   - context: The graphics context to draw into. Its graphics state is saved on entry
    ///     and restored on exit.
    private func drawText(_ text: String, rect: CGRect, context: CGContext) {
        context.saveGState()
        defer {
            context.restoreGState()
        }

        guard let textStyle = NSMutableParagraphStyle.default.mutableCopy() as? NSMutableParagraphStyle else {
            return
        }
        let textFontAttributes: [NSAttributedString.Key: Any] = [
            .font: UIFont.boldSystemFont(ofSize: 20),
            .foregroundColor: UIColor.white,
            .paragraphStyle: textStyle
        ]

        let attributedText = NSAttributedString(string: text, attributes: textFontAttributes)
        let framesetter = CTFramesetterCreateWithAttributedString(attributedText)
        let framePath = CGPath(rect: rect, transform: nil)
        // A zero-length range asks the framesetter to lay out as much of the string as fits.
        let textFrame = CTFramesetterCreateFrame(framesetter, CFRange(), framePath, nil)

        context.textMatrix = .identity
        CTFrameDraw(textFrame, context)
    }

    /// Runs a Vision text-recognition request on `pixelBuffer` and passes the result to `completion`.
    ///
    /// The request is performed with a plain (blocking) call to `VNImageRequestHandler.perform`;
    /// nothing in this method dispatches work to another queue.
    ///
    /// - Parameters:
    ///   - pixelBuffer: The image to analyze.
    ///   - completion: Receives the recognized-text observations, or an empty array when the
    ///     request's results cannot be read as `[VNRecognizedTextObservation]`.
    private func getTextObservations(pixelBuffer: CVPixelBuffer, completion: @escaping (([VNRecognizedTextObservation])->())) {
        let request = VNRecognizeTextRequest { (request, _) in
            guard let results = request.results as? [VNRecognizedTextObservation] else {
                completion([])
                return
            }
            completion(results)
        }

        request.recognitionLevel = recognitionLevel
        request.recognitionLanguages = supportedRecognitionLanguages
        request.usesLanguageCorrection = true

        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
        do {
            try handler.perform([request])
        } catch {
            // Previously swallowed with `try?`; surface the failure so it can be diagnosed.
            // `completion` is deliberately not called here to avoid invoking it twice if
            // Vision has already reported the failure through the request's handler.
            print("Text recognition request failed: \(error)")
        }
    }

    /// Scales a normalized rectangle up to the given target size.
    ///
    /// - Parameters:
    ///   - normalizedRect: Rectangle whose origin and size are fractions of the target size.
    ///   - targetSize: The size to scale to.
    /// - Returns: `normalizedRect` with x/width multiplied by `targetSize.width` and
    ///   y/height multiplied by `targetSize.height`.
    private func getUnfoldRect(normalizedRect: CGRect, targetSize: CGSize) -> CGRect {
        return CGRect(
            x: normalizedRect.minX * targetSize.width,
            y: normalizedRect.minY * targetSize.height,
            width: normalizedRect.width * targetSize.width,
            height: normalizedRect.height * targetSize.height
        )
    }

    /// Draws a box and the recognized string for each observation into the frame and returns
    /// the annotated frame as an image.
    ///
    /// The overlay is drawn directly into the sample buffer's pixel memory.
    ///
    /// - Parameters:
    ///   - sampleBuffer: The captured frame.
    ///   - textObservations: The observations to draw.
    /// - Returns: The annotated image tagged with `.right` orientation, or `nil` when the
    ///   image buffer, its base address, the bitmap context, or the final image is unavailable.
    private func getTextRectsImage(sampleBuffer: CMSampleBuffer, textObservations: [VNRecognizedTextObservation]) -> UIImage? {

        guard let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
            return nil
        }

        // No flags (i.e. not read-only): the overlay is written into the buffer.
        let lockFlags = CVPixelBufferLockFlags(rawValue: 0)
        CVPixelBufferLockBaseAddress(imageBuffer, lockFlags)
        // Unlock on every exit path, and only after makeImage() has read the pixel data.
        defer {
            CVPixelBufferUnlockBaseAddress(imageBuffer, lockFlags)
        }

        guard let pixelBufferBaseAddress = CVPixelBufferGetBaseAddressOfPlane(imageBuffer, 0) else {
            return nil
        }

        let width = CVPixelBufferGetWidth(imageBuffer)
        let height = CVPixelBufferGetHeight(imageBuffer)
        let bitmapInfo = CGBitmapInfo(rawValue:
            (CGBitmapInfo.byteOrder32Little.rawValue | CGImageAlphaInfo.premultipliedFirst.rawValue)
        )

        guard let newContext = CGContext(
            data: pixelBufferBaseAddress,
            width: width,
            height: height,
            bitsPerComponent: 8,
            bytesPerRow: CVPixelBufferGetBytesPerRow(imageBuffer),
            space: CGColorSpaceCreateDeviceRGB(),
            bitmapInfo: bitmapInfo.rawValue
        ) else {
            return nil
        }

        let imageSize = CGSize(width: width, height: height)

        for observation in textObservations {
            let rect = getUnfoldRect(normalizedRect: observation.boundingBox, targetSize: imageSize)
            // Ask for a single candidate and take it; fall back to an empty string if there is none.
            let text = observation.topCandidates(1).first?.string ?? ""
            drawRect(rect, text: text, context: newContext)
        }

        guard let imageRef = newContext.makeImage() else {
            return nil
        }
        return UIImage(cgImage: imageRef, scale: 1.0, orientation: .right)
    }
}


extension TextObservationViewController: AVCaptureVideoDataOutputSampleBufferDelegate {

    /// Receives a captured video frame, runs text recognition on it, and shows the annotated
    /// result in `previewImageView`.
    ///
    /// Frames without an image buffer are ignored. The image view is updated on the main queue.
    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        if let frame = CMSampleBufferGetImageBuffer(sampleBuffer) {
            getTextObservations(pixelBuffer: frame) { [weak self] observations in
                guard let controller = self else { return }
                let annotated = controller.getTextRectsImage(
                    sampleBuffer: sampleBuffer,
                    textObservations: observations
                )
                // UIKit views must be touched on the main queue; hold the controller weakly
                // so a pending update does not keep it alive.
                DispatchQueue.main.async { [weak controller] in
                    controller?.previewImageView.image = annotated
                }
            }
        }
    }
}

github

becky3/text_observation: Vision.frameworkでカメラ画像のテキスト認識を行う
https://github.com/becky3/text_observation

参考サイト

iOS13から標準サポートされる文字認識
https://qiita.com/KenNagami/items/a75b2bc282ad05a6dcde
Core Text で縦書き - 錯綜
https://hrt1ro.hatenablog.com/entry/2018/09/27/132803
【Swift】Vision.frameworkでカメラ画像の顔認識を行う【iOS】 - Qiita
https://qiita.com/beckyJPN/items/4bc46a8e6a000b711de6

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up