HoloLensでテキスト認識（Azure RecognizeText API）をやーる

Last updated at 2018-12-31 / Posted at 2018-12-30

大晦日ハッカソン2018の進捗です。

HoloLensでAzure Computer Vision APIのRecognizeTextAPIを叩いてみました！
タップすると、画像をキャプチャし、RecognizeTextAPIへ画像を送信、テキスト認識結果を表示します。もちろん、実機なしでもできます！

開発環境

HoloLens RS5
Visual Studio 2017 (15.9.2)
Unity 2017.4.11.f1
HoloToolkit-Unity-2017.4.3.0.unitypackage
HoloToolkit-Unity-Examples-2017.4.3.0.unitypackage
Azure Computer Vision API

Azureの設定

Azure Portalを開き、Computer Vision APIを作成します。

リソースに移動し、EndpointとKeyをメモります。

Unityプロジェクトの作成

プロジェクトを作成、HoloToolkitをインポートします。

いつもの設定をします。
MainCameraを削除し、ProjectビューからMixedRealityCameraParent、InputManager、DefaultCursorをHierarchyにD&Dします。
MixedRealityCameraParent->MixedRealityCameraのInspectorビューからCameraのClear FlagsをSolid Colorにします。そして、MixedRealityCameraManagerのClearFlagsをColor、Transparent Display SettingsのNear Clipを0.2にします。

File->Build SettingsからUniversal Windows Platformを選び、Switch Platformをクリックします。Player SettingsのOther Settings->Configuration->Scripting Backendを.NETにします。Publishing Settings->Capabilities->WebCam、Internet Clientにチェックを入れます。XR Settings->Virtual Reality Supportedにチェックを入れ、Virtual Reality SDKs->Windows Mixed Realityが追加されていることを確認します。

Ctrl+Sでシーンを保存します。名前はプロジェクト名と一緒にしました。Build Settings->Add Open Scenesからシーンを読み込み、Buildを選択、Appフォルダを作成し、ビルドします。

RecognizeTextManager.cs

空のGameObjectを作成し、名前をRecognizeTextManagerとします。
ProjectビューにScriptsフォルダを作成、RecognizeTextManager.csファイルを作成します。

using System;
using System.IO;
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.Networking;

/// <summary>
/// Sends a captured image file to the Azure Computer Vision RecognizeText endpoint,
/// polls the returned operation URL until the service reports a final status, and
/// shows the recognized lines in <see cref="DebugText"/>.
/// </summary>
public class RecognizeTextManager : MonoBehaviour
{
    // The nested types below are deserialized with JsonUtility, which maps JSON keys
    // to public fields by name. Field names must therefore stay exactly as they are.

    /// <summary>One recognized word inside a line.</summary>
    [Serializable]
    public class Words
    {
        public int[] boundingBox;
        public string text;
    }

    /// <summary>One recognized line of text and the words it contains.</summary>
    [Serializable]
    public class Lines
    {
        public int[] boundingBox;
        public string text;
        public Words[] words;
    }

    /// <summary>Container for the recognized lines.</summary>
    [Serializable]
    public class RecognitionResultData
    {
        public Lines[] lines;
    }

    /// <summary>Top-level JSON object returned by the operation URL.</summary>
    [Serializable]
    public class RecognizedTextObject
    {
        public string status;
        public RecognitionResultData recognitionResult;
    }

    private string authorizationKey = "<insert your key>";
    private const string ocpApimSubscriptionKeyHeader = "Ocp-Apim-Subscription-Key";
    private string visionAnalysisEndpoint = "https://westus.api.cognitive.microsoft.com/vision/v2.0/recognizeText";
    private string requestParameters = "mode=Printed"; //"mode=Handwritten"
    private string operationLocation;

    // Pause between two polls of the operation URL, and the maximum number of polls.
    // Without these the loop would issue requests back-to-back and could run forever.
    private const float pollIntervalSeconds = 1.0f;
    private const int maxPollAttempts = 30;

    private string imageFilePath;
    internal byte[] imageBytes;
    internal string imagePath;

    public TextMesh DebugText;

    public static RecognizeTextManager instance;

    private void Awake()
    {
        instance = this;
    }

    /// <summary>
    /// Coroutine that uploads the file at <see cref="imagePath"/>, waits for the
    /// recognition operation to finish, and writes the recognized text (one line per
    /// recognized line) to <see cref="DebugText"/>.
    /// Failures are logged and end the coroutine; they are not thrown.
    /// </summary>
    public IEnumerator RecognizeText()
    {
        // Read the image before the first yield so that a later change of imagePath
        // (for example by a new capture) cannot affect this run.
        byte[] payload = null;
        try
        {
            payload = GetImageAsByteArray(imagePath);
        }
        catch (Exception exception)
        {
            Debug.LogError("Could not read image '" + imagePath + "': " + exception.Message);
        }
        if (payload == null)
        {
            yield break;
        }
        imageBytes = payload;

        string uri = visionAnalysisEndpoint + "?" + requestParameters;
        string pollUrl = null;

        // Build the POST request directly instead of via an empty WWWForm, so no
        // throw-away upload handler is created and then replaced.
        using (UnityWebRequest unityWebRequest = new UnityWebRequest(uri, UnityWebRequest.kHttpVerbPOST))
        {
            unityWebRequest.SetRequestHeader("Content-Type", "application/octet-stream");
            unityWebRequest.SetRequestHeader(ocpApimSubscriptionKeyHeader, authorizationKey);
            unityWebRequest.downloadHandler = new DownloadHandlerBuffer();
            unityWebRequest.uploadHandler = new UploadHandlerRaw(payload);
            unityWebRequest.uploadHandler.contentType = "application/octet-stream";

            yield return unityWebRequest.SendWebRequest();

            if (unityWebRequest.isNetworkError)
            {
                Debug.LogError("RecognizeText request failed: " + unityWebRequest.error);
                yield break;
            }

            long responseCode = unityWebRequest.responseCode;
            if (responseCode != 202)
            {
                Debug.LogError("RecognizeText request returned HTTP " + responseCode + ": " + unityWebRequest.downloadHandler.text);
                yield break;
            }

            // GetResponseHeader returns null when the header is absent.
            pollUrl = unityWebRequest.GetResponseHeader("Operation-Location");
        }

        if (string.IsNullOrEmpty(pollUrl))
        {
            Debug.LogError("RecognizeText response did not contain an Operation-Location header.");
            yield break;
        }
        operationLocation = pollUrl;

        for (int attempt = 0; attempt < maxPollAttempts; attempt++)
        {
            using (UnityWebRequest operationLocationRequest = UnityWebRequest.Get(pollUrl))
            {
                operationLocationRequest.SetRequestHeader(ocpApimSubscriptionKeyHeader, authorizationKey);
                yield return operationLocationRequest.SendWebRequest();

                if (operationLocationRequest.isNetworkError || operationLocationRequest.isHttpError)
                {
                    Debug.LogError("Polling the recognition result failed (HTTP " + operationLocationRequest.responseCode + "): " + operationLocationRequest.error);
                    yield break;
                }

                RecognizedTextObject recognizedTextObject = ParseRecognitionResponse(operationLocationRequest.downloadHandler.text);
                if (recognizedTextObject != null)
                {
                    if (recognizedTextObject.status == "Succeeded")
                    {
                        string result = JoinLineTexts(recognizedTextObject);
                        if (DebugText != null)
                        {
                            DebugText.text = result;
                        }
                        else
                        {
                            Debug.Log(result);
                        }
                        yield break;
                    }
                    if (recognizedTextObject.status == "Failed")
                    {
                        Debug.LogError("Text recognition operation reported status 'Failed'.");
                        yield break;
                    }
                }
            }

            // Any other status (or an unparsable body) means "not finished yet".
            yield return new WaitForSeconds(pollIntervalSeconds);
        }

        Debug.LogError("Text recognition did not finish after " + maxPollAttempts + " polls.");
    }

    /// <summary>
    /// Deserializes the polling response. Returns null when the body is empty or is
    /// not valid JSON; the problem is logged.
    /// </summary>
    private static RecognizedTextObject ParseRecognitionResponse(string jsonResponse)
    {
        if (string.IsNullOrEmpty(jsonResponse))
        {
            return null;
        }
        try
        {
            return JsonUtility.FromJson<RecognizedTextObject>(jsonResponse);
        }
        catch (Exception exception)
        {
            Debug.Log("Json exception.Message: " + exception.Message);
            return null;
        }
    }

    /// <summary>
    /// Concatenates the text of every recognized line, each followed by a newline.
    /// Returns an empty string when the response carries no lines.
    /// </summary>
    private static string JoinLineTexts(RecognizedTextObject recognizedTextObject)
    {
        string result = string.Empty;
        if (recognizedTextObject.recognitionResult == null || recognizedTextObject.recognitionResult.lines == null)
        {
            return result;
        }
        foreach (Lines line in recognizedTextObject.recognitionResult.lines)
        {
            result = result + line.text + "\n";
        }
        return result;
    }

    /// <summary>
    /// Reads the whole file at <paramref name="imageFilePath"/> into a byte array.
    /// The stream and reader are disposed before returning.
    /// </summary>
    private static byte[] GetImageAsByteArray(string imageFilePath)
    {
        using (FileStream fileStream = new FileStream(imageFilePath, FileMode.Open, FileAccess.Read))
        using (BinaryReader binaryReader = new BinaryReader(fileStream))
        {
            return binaryReader.ReadBytes((int)fileStream.Length);
        }
    }
}

authorizationKeyにメモったKeyを入れ、visionAnalysisEndpointのリージョンも自分のに合わせます。requestParametersはPrinted（ロゴなど）もしくはHandwritten（手書き）にします。

MixedRealityCameraParent->MixedRealityCameraの子オブジェクトに3DTextPrefabを作成し、名前をDebugTextとします。
RecognizeTextManager.csをRecognizeTextManagerにAdd Componentし、DebugTextをアタッチします。

ImageCapture.cs

タップしたら、画像をキャプチャし、RecognizeTextManager.csのRecognizeText()を呼びます。

using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using UnityEngine;
using UnityEngine.XR.WSA.Input;
using UnityEngine.XR.WSA.WebCam;
using HoloToolkit.Unity.InputModule;

/// <summary>
/// On an air-tap, captures a photo to a JPG file in
/// <c>Application.persistentDataPath</c> and then starts
/// <c>RecognizeTextManager.RecognizeText()</c> on that file.
/// </summary>
public class ImageCapture : MonoBehaviour, IInputClickHandler
{

    public static ImageCapture instance;
    public int tapsCount;
    private PhotoCapture photoCaptureObject = null;
    private bool currentlyCapturing = false;

    // Outcome of the last TakePhotoAsync call, carried from OnCapturedPhotoToDisk
    // to OnStoppedPhotoMode so recognition only runs when a file was written.
    private bool photoSavedToDisk = false;

    private void Awake()
    {
        instance = this;
    }

    void Start()
    {
        // Register as fallback handler so taps are received even when no object is focused.
        InputManager.Instance.PushFallbackInputHandler(gameObject);
    }

    /// <summary>
    /// Tap handler. Starts a capture unless one is already in progress.
    /// </summary>
    public void OnInputClicked(InputClickedEventData eventData)
    {
        if (currentlyCapturing)
        {
            return;
        }

        currentlyCapturing = true;
        tapsCount++;
        ExecuteImageCaptureAndAnalysis();
    }

    /// <summary>
    /// Called when TakePhotoAsync has finished. Records whether the photo was saved
    /// and leaves photo mode.
    /// </summary>
    void OnCapturedPhotoToDisk(PhotoCapture.PhotoCaptureResult result)
    {
        photoSavedToDisk = result.success;
        if (!photoSavedToDisk)
        {
            Debug.LogError("Saving the captured photo to disk failed.");
        }
        photoCaptureObject.StopPhotoModeAsync(OnStoppedPhotoMode);
    }

    /// <summary>
    /// Called when photo mode has stopped. Releases the capture object, allows the
    /// next tap, and starts text recognition if a photo was saved.
    /// </summary>
    void OnStoppedPhotoMode(PhotoCapture.PhotoCaptureResult result)
    {
        photoCaptureObject.Dispose();
        photoCaptureObject = null;

        // The capture is only finished here, so this is where re-entry becomes safe.
        currentlyCapturing = false;

        if (!photoSavedToDisk)
        {
            return;
        }
        if (RecognizeTextManager.instance == null)
        {
            Debug.LogError("RecognizeTextManager instance is not available.");
            return;
        }
        StartCoroutine(RecognizeTextManager.instance.RecognizeText());
    }

    /// <summary>
    /// Picks the largest supported camera resolution, enters photo mode, and takes a
    /// photo that is written to <c>CapturedImage{tapsCount}.jpg</c>.
    /// </summary>
    private void ExecuteImageCaptureAndAnalysis()
    {
        List<Resolution> resolutions = PhotoCapture.SupportedResolutions
            .OrderByDescending((res) => res.width * res.height)
            .ToList();
        if (resolutions.Count == 0)
        {
            Debug.LogError("No supported camera resolution was reported; cannot capture.");
            currentlyCapturing = false;
            return;
        }
        Resolution cameraResolution = resolutions[0];

        PhotoCapture.CreateAsync(false, delegate (PhotoCapture captureObject)
        {
            if (captureObject == null)
            {
                Debug.LogError("PhotoCapture could not be created.");
                currentlyCapturing = false;
                return;
            }

            photoCaptureObject = captureObject;
            photoSavedToDisk = false;

            CameraParameters camParameters = new CameraParameters();
            camParameters.hologramOpacity = 0.0f; // for MR 0.9f
            camParameters.cameraResolutionWidth = cameraResolution.width;
            camParameters.cameraResolutionHeight = cameraResolution.height;
            camParameters.pixelFormat = CapturePixelFormat.BGRA32;

            captureObject.StartPhotoModeAsync(camParameters, delegate (PhotoCapture.PhotoCaptureResult result)
            {
                if (!result.success)
                {
                    Debug.LogError("Starting photo mode failed.");
                    photoCaptureObject.Dispose();
                    photoCaptureObject = null;
                    currentlyCapturing = false;
                    return;
                }

                string filename = string.Format(@"CapturedImage{0}.jpg", tapsCount);
                string filePath = Path.Combine(Application.persistentDataPath, filename);
                RecognizeTextManager.instance.imagePath = filePath;
                photoCaptureObject.TakePhotoAsync(filePath, PhotoCaptureFileOutputFormat.JPG, OnCapturedPhotoToDisk);
            });
        });
    }
}

実行

ビルドしたら、Appフォルダの中に生成されたMR_Azure_RecognizeText.slnをVisual Studioで開き、x86/ReleaseにしてHoloLensへビルドします。Unity Editor上でプレイボタンで開始してもOKです。

タップすると、テキスト認識の結果が表示されます。

ソースコードはこちら。

参考文献

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up