HoloLens2 × Azure Cognitive Services（Read APIで文字認識し読み上げ）

Last updated at 2020-12-13Posted at 2020-12-13

はじめに

HoloLensアドベントカレンダー2020の13日目の記事です。
前回は「何が見える？」と質問したら、画像をキャプチャして説明文を生成し、読み上げを行いました。今回は、「文字を読んで」と言うと、画像からテキストを抽出し、読み上げてみます。「ヨンシル、文字を読んで」

開発環境

Azure
- Computer Vision API (Read API)
- Speech SDK 1.14.0
Unity 2019.4.1f1
MRTK 2.5.1
Windows 10 PC
HoloLens2

導入

1．前回の記事まで終わらせてください。

2．Unityプロジェクトはこんな感じ。前回のMySpeechRecognizerのActionワードをリストにして、「文字を読んで」を追加します。新しく「TapToCaptureReadAPI.cs」スクリプトをAdd Componentし、「文字を読んで」と言うと画像キャプチャし、テキスト抽出、読み上げという流れになります。

3．前回の記事のMySpeechRecognizer.csは、Actionワードが一つしか認識できませんでしたが、リストにして複数のActionワードを認識できるようにします。あとはUpdate関数を下記のように編集して、Actionワードが「文字を読んで」のときにTapToCaptureReadAPI.csのAirTap関数を実行します。

MySpeechRecognizer.cs


    // public string ActionWord = "";
    public List<string> ActionWords;

    async void Update()
    {
        if (recognizedString != "")
        {
            // Debug.Log(recognizedString);
            if (action){
                foreach(string ActionWord in ActionWords){
                    if (recognizedString.ToLower().Contains(ActionWord.ToLower()))
                    {
                        Debug.Log("Action");
                        if(ActionWord == "何が見える"){
                            Debug.Log("Analyze Image");
                            this.GetComponent<TapToCaptureAnalyzeAPI>().AirTap();
                        }else if(ActionWord == "文字を読んで"){
                            Debug.Log("Read");
                            this.GetComponent<TapToCaptureReadAPI>().AirTap();
                        }
                        action = false;
                    }
                }
            }else if (recognizedString.ToLower().Contains(WakeWord.ToLower()))
            {
                Debug.Log("Wake");
                await this.GetComponent<TapToCaptureAnalyzeAPI>().SynthesizeAudioAsync("はい");
                action = true;
            }
        }
    }

4．「TapToCaptureReadAPI.cs」スクリプトは以下のようになります。

TapToCaptureReadAPI.cs

using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System;
using UnityEngine;
using Microsoft.MixedReality.Toolkit.Utilities;
using System.Threading.Tasks;

// SpeechSDK ここから
using System.IO;
using System.Text;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
// SpeechSDK ここまで

public class TapToCaptureReadAPI : MonoBehaviour
{
    // Read ここから
    private string read_endpoint = "https://<Insert Your Endpoint>/vision/v3.1/read/analyze";
    private string read_subscription_key = "<Insert Your API Key>";

    // {"status":"succeeded","createdDateTime":"2020-12-13T03:05:57Z","lastUpdatedDateTime":"2020-12-13T03:05:58Z","analyzeResult":{"version":"3.0.0","readResults":[{"page":1,"angle":-1.4203,"width":1280,"height":720,"unit":"pixel","lines":[{"boundingBox":[417,351,481,348,482,365,417,367],"text":"BANANA","words":[{"boundingBox":[420,352,481,348,482,365,421,367],"text":"BANANA","confidence":0.595}]},{"boundingBox":[541,348,604,347,605,360,542,362],"text":"ALMOND","words":[{"boundingBox":[542,349,605,347,605,360,543,363],"text":"ALMOND","confidence":0.916}]},{"boundingBox":[644,345,724,344,724,358,645,359],"text":"STRAWBERRY","words":[{"boundingBox":[647,346,724,344,724,358,647,359],"text":"STRAWBERRY","confidence":0.638}]},{"boundingBox":[757,334,843,337,843,356,757,353],"text":"PINEAPPLE","words":[{"boundingBox":[763,335,842,338,843,355,763,353],"text":"PINEAPPLE","confidence":0.623}]},{"boundingBox":[385,382,748,374,750,465,387,475],"text":"LOOK","words":[{"boundingBox":[385,384,746,374,746,467,387,475],"text":"LOOK","confidence":0.983}]},{"boundingBox":[479,474,668,468,670,501,480,508],"text":"A La Mode","words":[{"boundingBox":[482,475,512,474,513,508,483,509],"text":"A","confidence":0.987},{"boundingBox":[519,474,567,472,567,506,520,508],"text":"La","confidence":0.987},{"boundingBox":[573,472,669,468,670,501,574,505],"text":"Mode","confidence":0.984}]},{"boundingBox":[489,508,666,501,666,518,490,526],"text":"CHOCOLATE","words":[{"boundingBox":[489,509,666,501,666,519,490,527],"text":"CHOCOLATE","confidence":0.981}]}]}]}}
    [System.Serializable]
    public class Read
    {
        public string status;
        public string createdDateTime;
        public string lastUpdatedDateTime;
        public AnalyzeResult analyzeResult;
    }

    [System.Serializable]
    public class AnalyzeResult
    {
        public ReadResults[] readResults;
    }
    
    [System.Serializable]
    public class ReadResults
    {
        public Lines[] lines;
    }
    
    [System.Serializable]
    public class Lines
    {
        public string text;
    }
    // Read ここまで

    // SpeechSDK ここから
    public AudioSource audioSource;

    public async Task SynthesizeAudioAsync(string text) 
    {
        var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
        var synthesizer = new SpeechSynthesizer(config, null); // nullを省略するとPCのスピーカーから出力されるが、HoloLensでは出力されない。
        
        string ssml = "<speak version=\"1.0\" xmlns=\"https://www.w3.org/2001/10/synthesis\" xml:lang=\"ja-JP\"> <voice name=\"ja-JP-Ichiro\">" + text + "</voice> </speak>";

        // Starts speech synthesis, and returns after a single utterance is synthesized.
        // using (var result = synthesizer.SpeakTextAsync(text).Result)
        using (var result = synthesizer.SpeakSsmlAsync(ssml).Result)
        {
            // Checks result.
            if (result.Reason == ResultReason.SynthesizingAudioCompleted)
            {
                // Native playback is not supported on Unity yet (currently only supported on Windows/Linux Desktop).
                // Use the Unity API to play audio here as a short term solution.
                // Native playback support will be added in the future release.
                var sampleCount = result.AudioData.Length / 2;
                var audioData = new float[sampleCount];
                for (var i = 0; i < sampleCount; ++i)
                {
                    audioData[i] = (short)(result.AudioData[i * 2 + 1] << 8 | result.AudioData[i * 2]) / 32768.0F;
                }

                // The output audio format is 16K 16bit mono
                var audioClip = AudioClip.Create("SynthesizedAudio", sampleCount, 1, 16000, false);
                audioClip.SetData(audioData, 0);
                audioSource.clip = audioClip;
                audioSource.Play();

                // newMessage = "Speech synthesis succeeded!";
            }
            else if (result.Reason == ResultReason.Canceled)
            {
                var cancellation = SpeechSynthesisCancellationDetails.FromResult(result);
                // newMessage = $"CANCELED:\nReason=[{cancellation.Reason}]\nErrorDetails=[{cancellation.ErrorDetails}]\nDid you update the subscription info?";
            }
        }
    }
    // SpeechSDK ここまで

    public GameObject quad;
    UnityEngine.Windows.WebCam.PhotoCapture photoCaptureObject = null;
    Texture2D targetTexture = null;
    private bool waitingForCapture;

    void Start(){
        waitingForCapture = false;
    }

    public void AirTap()
    {
        if (waitingForCapture) return;
        waitingForCapture = true;

        Resolution cameraResolution = UnityEngine.Windows.WebCam.PhotoCapture.SupportedResolutions.OrderByDescending((res) => res.width * res.height).First();
        targetTexture = new Texture2D(cameraResolution.width, cameraResolution.height);

        // PhotoCapture オブジェクトを作成します
        UnityEngine.Windows.WebCam.PhotoCapture.CreateAsync(false, delegate (UnityEngine.Windows.WebCam.PhotoCapture captureObject) {
            photoCaptureObject = captureObject;
            UnityEngine.Windows.WebCam.CameraParameters cameraParameters = new UnityEngine.Windows.WebCam.CameraParameters();
            cameraParameters.hologramOpacity = 0.0f;
            cameraParameters.cameraResolutionWidth = cameraResolution.width;
            cameraParameters.cameraResolutionHeight = cameraResolution.height;
            cameraParameters.pixelFormat = UnityEngine.Windows.WebCam.CapturePixelFormat.BGRA32;

            // カメラをアクティベートします
            photoCaptureObject.StartPhotoModeAsync(cameraParameters, delegate (UnityEngine.Windows.WebCam.PhotoCapture.PhotoCaptureResult result) {
                // 写真を撮ります
                photoCaptureObject.TakePhotoAsync(OnCapturedPhotoToMemoryAsync);
            });
        });
    }

    async void OnCapturedPhotoToMemoryAsync(UnityEngine.Windows.WebCam.PhotoCapture.PhotoCaptureResult result, UnityEngine.Windows.WebCam.PhotoCaptureFrame photoCaptureFrame)
    {
        // ターゲットテクスチャに RAW 画像データをコピーします
        photoCaptureFrame.UploadImageDataToTexture(targetTexture);
        byte[] bodyData = targetTexture.EncodeToJPG();

        Response response = new Response();
        Dictionary<string, string> headers = new Dictionary<string, string>();
        headers.Add("Ocp-Apim-Subscription-Key", read_subscription_key);

        try
        {
            string query = read_endpoint;
            // headers.Add("Content-Type": "application/octet-stream");
            response = await Rest.PostAsync(query, bodyData, headers, -1, true);
        }
        catch (Exception e)
        {
            photoCaptureObject.StopPhotoModeAsync(OnStoppedPhotoMode);
            return;
        }

        if (!response.Successful)
        {
            photoCaptureObject.StopPhotoModeAsync(OnStoppedPhotoMode);
            return;
        }

        Debug.Log(response.ResponseCode);
        Debug.Log(response.ResponseBody);

        string operation_url = response.ResponseBody;
        bool poll = true;
        int i = 0;
        Read read;

        do {
            System.Threading.Thread.Sleep(1000);
            // https://docs.microsoft.com/ja-jp/azure/cognitive-services/computer-vision/concept-recognizing-text
            // https://github.com/Azure-Samples/cognitive-services-quickstart-code/blob/master/dotnet/ComputerVision/REST/CSharp-hand-text.md
            // {"status":"running","createdDateTime":"2020-12-12T19:39:44Z","lastUpdatedDateTime":"2020-12-12T19:39:44Z"}
            response = await Rest.GetAsync(operation_url, headers, -1, null, true);
            Debug.Log(response.ResponseCode);
            Debug.Log(response.ResponseBody);
            read = JsonUtility.FromJson<Read>(response.ResponseBody);
            if (read.status == "succeeded"){
                poll = false;
            }
            ++i;
        } while (i < 60 && poll);

        if (i == 60 && poll == true)
        {
            photoCaptureObject.StopPhotoModeAsync(OnStoppedPhotoMode);
            return;
        }

        string read_text = "";
        foreach (ReadResults readResult in read.analyzeResult.readResults){
            foreach (Lines line in readResult.lines){
                read_text = read_text + line.text + "\n";
                // Debug.Log(line.text);
            }
        }
        read_text += "以上です。";

        // SpeechSDK 追加分ここから
        Debug.Log(read_text);
        await SynthesizeAudioAsync(read_text); // jp 
        // SpeechSDK 追加分ここまで

        // カメラを非アクティブにします
        photoCaptureObject.StopPhotoModeAsync(OnStoppedPhotoMode);
    }

    void OnStoppedPhotoMode(UnityEngine.Windows.WebCam.PhotoCapture.PhotoCaptureResult result)
    {
        // photo capture のリソースをシャットダウンします
        photoCaptureObject.Dispose();
        photoCaptureObject = null;
        waitingForCapture = false;
    }
}

5．Computer Vision API (Read API) のエンドポイントとキーは、画像説明文生成のときと同じものを用います。

6．TapToCaptureReadAPI.csのAirTap関数で画像をキャプチャしたあと、Read APIに画像をPOSTします。すると、ResponseのHeadersに{"Operation-Location":URL}が返ってくるので、そのURLに対してGETする必要があります。しかし、MRTKのRest.csはResponseHeadersを返してくれないので、ProcessRequestAsync関数の275行目からを次のように編集します。

Rest.cs

・・・
            if (readResponseData)
            {
                // For Read API  
                Dictionary<string, string> tmp = webRequest.GetResponseHeaders();
                if (tmp == null)
                {
                    return new Response(true, webRequest.downloadHandler?.text, webRequest.downloadHandler?.data, webRequest.responseCode);
                }else{
                    if(tmp.ContainsKey("Operation-Location")){
                        string responseHeaders = tmp["Operation-Location"];
                        string downloadHandlerText = webRequest.downloadHandler?.text;
                        return new Response(true, responseHeaders, webRequest.downloadHandler?.data, webRequest.responseCode);
                    }else{
                        return new Response(true, webRequest.downloadHandler?.text, webRequest.downloadHandler?.data, webRequest.responseCode);
                    }
                }
            }
            else // This option can be used only if action will be triggered in the same scope as the webrequest
            {
                return new Response(true, () => webRequest.downloadHandler?.text, () => webRequest.downloadHandler?.data, webRequest.responseCode);
            }
・・・

7．これでResponseBodyにResponseHeadersのURLが返ってくるので、そのURLに対してGETします。

8．テキスト抽出が実行中の場合は下記のようなjsonが返ってくるので、"status"が"succeeded"になるまで、1秒おきにGETします。

{"status":"running","createdDateTime":"2020-12-12T19:39:44Z","lastUpdatedDateTime":"2020-12-12T19:39:44Z"}

9．"status"が"succeeded"になったら、テキスト抽出結果が次のようなjsonで返っくるので、仕様に合わせてReadクラス、AnalyzeResultクラス、ReadResultsクラス、Linesクラスを作成しました。

{"status":"succeeded","createdDateTime":"2020-12-13T03:05:57Z","lastUpdatedDateTime":"2020-12-13T03:05:58Z","analyzeResult":{"version":"3.0.0","readResults":[{"page":1,"angle":-1.4203,"width":1280,"height":720,"unit":"pixel","lines":[{"boundingBox":[417,351,481,348,482,365,417,367],"text":"BANANA","words":[{"boundingBox":[420,352,481,348,482,365,421,367],"text":"BANANA","confidence":0.595}]},{"boundingBox":[541,348,604,347,605,360,542,362],"text":"ALMOND","words":[{"boundingBox":[542,349,605,347,605,360,543,363],"text":"ALMOND","confidence":0.916}]},{"boundingBox":[644,345,724,344,724,358,645,359],"text":"STRAWBERRY","words":[{"boundingBox":[647,346,724,344,724,358,647,359],"text":"STRAWBERRY","confidence":0.638}]},{"boundingBox":[757,334,843,337,843,356,757,353],"text":"PINEAPPLE","words":[{"boundingBox":[763,335,842,338,843,355,763,353],"text":"PINEAPPLE","confidence":0.623}]},{"boundingBox":[385,382,748,374,750,465,387,475],"text":"LOOK","words":[{"boundingBox":[385,384,746,374,746,467,387,475],"text":"LOOK","confidence":0.983}]},{"boundingBox":[479,474,668,468,670,501,480,508],"text":"A La Mode","words":[{"boundingBox":[482,475,512,474,513,508,483,509],"text":"A","confidence":0.987},{"boundingBox":[519,474,567,472,567,506,520,508],"text":"La","confidence":0.987},{"boundingBox":[573,472,669,468,670,501,574,505],"text":"Mode","confidence":0.984}]},{"boundingBox":[489,508,666,501,666,518,490,526],"text":"CHOCOLATE","words":[{"boundingBox":[489,509,666,501,666,519,490,527],"text":"CHOCOLATE","confidence":0.981}]}]}]}}

10．テキスト抽出結果を音声合成に投げて読み上げます。

実行

実行動画を見てください。こんな感じで文字を読めるようになりました！割と小さい文字もいけます。名刺とかも読めるし、便利かもしれないです。日本語対応はまだなので待つしかないですね。（現在の対応言語：Dutch, English, French, German, Italian, Portuguese and Spanish）

画像から文字を読めるようになった！#Azure #CognitiveServices #OCR #Read #TTS #HoloLens2 #MRTK #Unity #AI pic.twitter.com/e37gEAQhO0
— 藤本賢志(ガチ本) (@sotongshi) December 13, 2020

ちなみにWakeワードの「ヨンシル」は「4種類」や「キャンセル」などに誤認しやすいので、変えた方がいいです。「ガチモト」は「が地元」や「合致もっと」になるので、「藤本」がいいです。。

参考

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up