概要

Unityの音声認識(UnityEngine.Windows.Speech.DictationRecognizer)で認識した内容を、VOICEROID+に発話させる(SendMessageで外部アプリを操作する)というのをやってみました。

音声認識精度

後述のサンプルコードに会話(独り言)内容を入力した際の記録です。

感触としてはWebSpeechAPI(ゆかりねっと)よりも素直な認識です。というのもWebSpeechAPIは文の区切りまでの内容による推定処理の影響が強く、話し言葉だと内容によっては総崩れになって訳の分からない結果になることがあります。これに対してUnityEngine.Windows.Speech.DictationRecognizerは推定はしていそうなものの、WebSpeechAPIほどではない印象で、文法的に厳しい話し言葉でも総崩れにはならないように思います。ただし、間違え方も素直なので滑舌や録音環境の影響が強く、単語単位の誤認識は多く感じます。

サンプルコード

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using UnityEngine;
using UnityEngine.Windows.Speech;

/// <summary>
/// Unity component that feeds dictation results from
/// <see cref="DictationRecognizer"/> into an external VOICEROID+ window:
/// the recognized text is written into the application's edit control with
/// WM_SETTEXT and the play button is then "clicked" with synthesized mouse
/// messages.
/// </summary>
public class WinSpeech : MonoBehaviour
{
    public const int WM_SETTEXT = 0x000C;
    public const int WM_LBUTTONDOWN = 0x201;
    public const int WM_LBUTTONUP = 0x202;
    public const int MK_LBUTTON = 0x0001;
    public const int GWL_STYLE = -16;

    // Position of the edit control relative to the main window entry in the
    // flattened window list built by GetAllChildWindows.
    // NOTE(review): this offset depends on the target application's window
    // layout - confirm against the "check all window" log output.
    private const int EditOffsetFromMainWindow = 10;

    // lParam for the mouse messages: low word = x, high word = y (10, 10),
    // in client coordinates of the button.
    private const int ClickPosition = 0x000A000A;

    // wParam/lParam are pointer-sized (WPARAM/LPARAM), so they are declared as
    // IntPtr to stay correct in 64-bit processes.
    [System.Runtime.InteropServices.DllImport("user32.dll", EntryPoint = "SendMessageA")]
    static extern IntPtr SendMessage(IntPtr hWnd, uint Msg, IntPtr wParam, IntPtr lParam);

    // ANSI entry point on purpose: the payload is a NUL-terminated Shift_JIS byte string.
    [System.Runtime.InteropServices.DllImport("user32.dll", EntryPoint = "SendMessageA")]
    static extern IntPtr SendMessage(IntPtr hWnd, UInt32 Msg, IntPtr ptr, byte[] lParam);

    [System.Runtime.InteropServices.DllImport("user32.dll")]
    public static extern IntPtr FindWindowEx(IntPtr hWnd, IntPtr hwndChildAfter, string lpszClass, string lpszWindow);

    [System.Runtime.InteropServices.DllImport("user32.dll")]
    public static extern int GetWindowLong(IntPtr hWnd, int nIndex);

    [System.Runtime.InteropServices.DllImport("user32.dll")]
    public static extern int GetClassName(IntPtr hWnd, StringBuilder lpClassName, int nMaxCount);

    [System.Runtime.InteropServices.DllImport("user32.dll")]
    public static extern int GetWindowTextLength(IntPtr hWnd);

    [System.Runtime.InteropServices.DllImport("user32.dll")]
    public static extern int GetWindowText(IntPtr hWnd, byte[] lpString, int nMaxCount);

    // Text box of the target application
    IntPtr edit;

    // Play button of the target application
    IntPtr play;

    // Speech recognizer; stays null when the target window could not be located
    DictationRecognizer recognizer;

    // Seconds left until the next recognizer status poll
    private float timeleft;

    /// <summary>
    /// Enumerates every window once, logs the list, locates the target edit
    /// control and play button, and creates the recognizer. If the target
    /// cannot be found the component logs an error and disables itself.
    /// </summary>
    void Start()
    {
        // Collect every window once at startup and dump it for inspection
        var all = GetAllChildWindows(GetWindow(IntPtr.Zero), new List<Window>());
        var log = new StringBuilder();
        foreach (Window w in all)
        {
            log.Append(w.Title).Append(" - ").Append(w.ClassName)
               .Append("(").Append(w.hWnd).Append(") [").Append(w.Style).Append("]\r\n");
        }
        Debug.Log("check all window\r\n" + log);

        // Resolve the controls starting from the target main window
        Window target = FindTarget("VOICEROID＋ 琴葉葵", all);
        if (target == null)
        {
            Debug.LogError("WinSpeech: target window not found; speech output is disabled.");
            enabled = false;
            return;
        }

        int index = all.IndexOf(target);
        int editIndex = index + EditOffsetFromMainWindow;
        Window playButton = FindTarget(" 再生", all, index);
        if (editIndex >= all.Count || playButton == null)
        {
            Debug.LogError("WinSpeech: edit box or play button not found; speech output is disabled.");
            enabled = false;
            return;
        }

        edit = all[editIndex].hWnd;
        play = playButton.hWnd;

        // Initialize speech recognition
        recognizer = new DictationRecognizer();
        recognizer.InitialSilenceTimeoutSeconds = 10;
        recognizer.AutoSilenceTimeoutSeconds = 10;
        recognizer.DictationResult += OnResult;
    }

    /// <summary>
    /// Roughly once per second, restarts the recognizer if it has stopped.
    /// </summary>
    void Update()
    {
        if (recognizer == null)
        {
            return;
        }

        // Run the check about once per second
        timeleft -= Time.deltaTime;
        if (timeleft > 0f)
        {
            return;
        }
        timeleft = 1.0f;

        if (recognizer.Status == SpeechSystemStatus.Stopped)
        {
            recognizer.Start();
        }
    }

    /// <summary>
    /// Unsubscribes from the recognizer and releases it.
    /// </summary>
    void OnDestroy()
    {
        // Start() may have bailed out before the recognizer was created
        if (recognizer == null)
        {
            return;
        }

        recognizer.DictationResult -= OnResult;
        if (recognizer.Status == SpeechSystemStatus.Running)
        {
            recognizer.Stop();
        }
        recognizer.Dispose();
        recognizer = null;
    }

    /// <summary>
    /// Dictation callback: logs the result, writes the text into the target
    /// edit control and presses the play button.
    /// </summary>
    /// <param name="text">Recognized phrase.</param>
    /// <param name="confidence">Confidence reported by the recognizer.</param>
    void OnResult(string text, ConfidenceLevel confidence)
    {
        Debug.Log(confidence + ": " + text);

        // Convert to Shift_JIS and send. WM_SETTEXT expects a NUL-terminated
        // string, so the terminator is encoded together with the text.
        byte[] bytes = ShiftJis().GetBytes(text + "\0");
        SendMessage(edit, WM_SETTEXT, IntPtr.Zero, bytes);

        // Press and release the play button
        SendMessage(play, WM_LBUTTONDOWN, new IntPtr(MK_LBUTTON), new IntPtr(ClickPosition));
        SendMessage(play, WM_LBUTTONUP, IntPtr.Zero, new IntPtr(ClickPosition));
    }

    /// <summary>
    /// Returns the first window in <paramref name="all"/> whose title equals
    /// <paramref name="title"/>, searching from <paramref name="index"/>.
    /// </summary>
    /// <param name="title">Exact window title to look for.</param>
    /// <param name="all">Flattened window list to search.</param>
    /// <param name="index">Position to start from; negative values are treated as 0.</param>
    /// <returns>The matching window, or null when there is none.</returns>
    public static Window FindTarget(string title, List<Window> all, int index = 0)
    {
        if (all == null)
        {
            return null;
        }

        for (int i = Math.Max(index, 0); i < all.Count; i++)
        {
            Window candidate = all[i];
            if (candidate.Title == title)
            {
                Debug.Log(candidate.Title + " : " + candidate.ClassName + "(" + candidate.Style + ")");
                return candidate;
            }
        }
        return null;
    }

    /// <summary>
    /// Appends <paramref name="parent"/> and all of its descendant windows to
    /// <paramref name="dest"/> in depth-first order.
    /// </summary>
    /// <returns><paramref name="dest"/>, for chaining.</returns>
    public static List<Window> GetAllChildWindows(Window parent, List<Window> dest)
    {
        dest.Add(parent);

        // Snapshot the direct children before recursing into each of them
        foreach (Window child in EnumChildWindows(parent.hWnd).ToList())
        {
            GetAllChildWindows(child, dest);
        }
        return dest;
    }

    /// <summary>
    /// Lazily enumerates the direct children of the given parent window
    /// (grandchildren are not returned).
    /// </summary>
    public static IEnumerable<Window> EnumChildWindows(IntPtr hParentWindow)
    {
        IntPtr hWnd = IntPtr.Zero;
        while ((hWnd = FindWindowEx(hParentWindow, hWnd, null, null)) != IntPtr.Zero)
        {
            yield return GetWindow(hWnd);
        }
    }

    /// <summary>
    /// Reads the window text, class name and style of a window handle and
    /// returns them packed into a <see cref="Window"/>.
    /// </summary>
    /// <param name="hWnd">Handle of the window to inspect.</param>
    /// <returns>A new <see cref="Window"/>; its Title is null when no text was read.</returns>
    public static Window GetWindow(IntPtr hWnd)
    {
        string windowText = null;
        int textLen = GetWindowTextLength(hWnd);
        if (0 < textLen)
        {
            // Read the window title
            byte[] windowTextBuffer = new byte[textLen + 1];
            GetWindowText(hWnd, windowTextBuffer, windowTextBuffer.Length);

            // GetWindowTextLength may report more than the real length, so cut
            // at the first NUL byte instead of assuming exactly one terminator.
            int actualLen = Array.IndexOf(windowTextBuffer, (byte)0);
            if (actualLen < 0)
            {
                actualLen = windowTextBuffer.Length;
            }
            if (actualLen > 0)
            {
                windowText = ShiftJis().GetString(windowTextBuffer, 0, actualLen);
            }
        }

        // Read the window class name
        StringBuilder classNameBuffer = new StringBuilder(256);
        GetClassName(hWnd, classNameBuffer, classNameBuffer.Capacity);

        // Read the window style
        int style = GetWindowLong(hWnd, GWL_STYLE);
        return new Window() { hWnd = hWnd, Title = windowText, ClassName = classNameBuffer.ToString(), Style = style };
    }

    // Encoding used for all text exchanged with the target application.
    private static Encoding ShiftJis()
    {
        return Encoding.GetEncoding("Shift_JIS");
    }
}

/// <summary>
/// Plain data holder describing a single native window: its handle plus the
/// text, class name and style value read for it.
/// </summary>
public class Window
{
    /// <summary>Native window handle.</summary>
    public IntPtr hWnd;

    /// <summary>Window text (title or control label).</summary>
    public string Title;

    /// <summary>Window class name.</summary>
    public string ClassName;

    /// <summary>Window style value.</summary>
    public int Style;
}

参考

http://tips.hecomi.com/entry/2017/02/12/211458
https://docs.unity3d.com/jp/540/ScriptReference/Windows.Speech.DictationRecognizer.html
http://tech.sanwasystem.com/entry/2015/11/25/171004
https://qa.atmarkit.co.jp/q/3173
http://qiita.com/okuhiiro/items/4c76fd8862e2bbb08ac7
http://dobon.net/vb/dotnet/string/getencodingobject.html

Unityで音声認識してVOICEROIDで発話

概要

音声認識精度

サンプルコード

参考