More than 3 years have passed since last update.

【Unity】GoogleSTT(Streaming)を2つの方法で試した

Last updated at 2021-06-21Posted at 2021-06-21

#1. はじめに

UnityでGoogleSpeechToTextのStreamingを2つの方法で試してみました. Unityでマイクからの音声を取得するには, いくつかの方法があり, その内の2つを利用しました. 具体的には, OnAudioFilterRead()とAudioClip.GetData()を使いました.
UnityでGoogleSTTをしたい方や, いくつかの方法を比較してみたい方に参考になると思います.

補足
マイク音声取得の別の方法として, Platform依存の音声系ライブラリをUnityに入れてやる, みたいな方法もあるそうです.

#2. 準備

Assets/Plugins/ にGoogleStreamingSpeechToText用のdllを入れる

dllを入れる所が１番の正念場かもしれません. 僕は, こちらを参考に頑張りました.

行き詰まった場合は, 僕が使った Plugins/ を使ってもらって結構です.

GCPから SpeechToTextを有効化したサービスアカウントキー(credentials.json)を取得
Assets/StreamingAssets/ に, credentials.jsonを配置
AudioMixerで MicMixerを作成し, Masterの子要素として MuteMicを作成.(名前は何でも良い)
ハウリングしないように, MuteMicのdBを最下限-80dBに設定.

GameObjectを作成し, AudioSourceをアタッチ
アタッチしたAudioSourceのOutputに, 先程作ったMuteMicを設定
Start用とStop用のButtonを作成
以下で説明しているスクリプトを作成し, GameObjectにアタッチし, スクリプトにButtonを割り当てる

#3. スクリプト

1つ目の方法( OnAudioFilterRead()を使用 )

GoogleSTT1.cs

using System;
using System.IO;
using System.Collections;
using System.Threading;
using System.Threading.Tasks;
using UnityEngine.UI;
using UnityEngine;
using Google.Protobuf;
using Google.Cloud.Speech.V1;
using Google.Api.Gax.Grpc;

/// <summary>
/// Streams microphone audio to Google Cloud Speech-to-Text and logs interim and final transcripts.
/// The microphone clip is played through the attached <see cref="AudioSource"/> and the samples
/// are captured in <see cref="OnAudioFilterRead"/>.
/// </summary>
[RequireComponent(typeof(AudioSource))]
public class GoogleSTT1 : MonoBehaviour
{
    [SerializeField] Button startButton;
    [SerializeField] Button stopButton;
    private AudioSource audioSource;
    private AudioConfiguration audioConfig;
    private SpeechClient.StreamingRecognizeStream response;
    private Task responseHandler;
    private CancellationTokenSource cancellationTokenSource;
    // Maximum time, in seconds, to wait for the microphone to deliver its first sample.
    private float MicInitializationTimeout = 2;
    // Read in OnAudioFilterRead and written from StartSTT/StopSTT, hence volatile.
    private volatile bool IsListening = false;
    // True while StartSTT is between its first await and the point where IsListening becomes true.
    private bool isStarting = false;
    private string micName;
    private string langCode = "ja-JP";
    private const string CredentialFileName = "credentials.json";
    private const double NormalizedFloatTo16BitConversionFactor = 0x7FFF + 0.4999999999999999;
    // Delay in milliseconds (about 5 minutes) after which the stream is restarted.
    // Streaming appears to keep working for at least this long.
    private const int StreamingLimit = 290000;

    /// <summary>
    /// Points the Google client at the credentials file, reads the audio configuration,
    /// requests microphone permission, picks the first microphone and wires up the buttons.
    /// </summary>
    void Awake()
    {
        // Tell the Google client library where the service-account key lives.
        string credentialsPath = Path.Combine(Application.streamingAssetsPath, CredentialFileName);
        Environment.SetEnvironmentVariable("GOOGLE_APPLICATION_CREDENTIALS", credentialsPath);

        // Audio settings (changeable under ProjectSettings -> Audio).
        audioConfig = AudioSettings.GetConfiguration();

        // sampleRate    = samples per second
        // dspBufferSize = latency related; smaller means less delay but needs more performance
        // speakerMode   = Mono, Stereo, ...
        Debug.Log(String.Format("sampleRate, dspBufferSize, speakerMode -> {0}, {1}, {2}\n", audioConfig.sampleRate, audioConfig.dspBufferSize, audioConfig.speakerMode));

        // Ask for microphone permission.
        StartCoroutine(nameof(RequestMicAuthorization));

        audioSource = gameObject.GetComponent<AudioSource>();

        // List the available devices and use the first one.
        string[] devices = Microphone.devices;
        foreach (string device in devices) { Debug.Log(device); }
        if (devices.Length == 0)
        {
            // Without this guard devices[0] throws and the buttons are never wired.
            Debug.LogError("No microphone device found; speech-to-text is unavailable.");
            return;
        }
        micName = devices[0];

        startButton.onClick.AddListener(async () => await StartSTT());
        stopButton.onClick.AddListener(async () => await StopSTT());
    }

    /// <summary>Keeps requesting microphone authorization until the user has granted it.</summary>
    private IEnumerator RequestMicAuthorization()
    {
        while (!Application.HasUserAuthorization(UserAuthorization.Microphone))
        {
            yield return Application.RequestUserAuthorization(UserAuthorization.Microphone);
        }
    }

    /// <summary>
    /// Opens a streaming recognition call, starts the microphone and begins listening.
    /// Does nothing when a session is already running or being started.
    /// The returned task completes only when the restart timer ends (see
    /// <see cref="RestartAfterStreamingLimit"/>).
    /// </summary>
    private async Task StartSTT()
    {
        // A second click must not open a second stream on top of the first one.
        if (IsListening || isStarting) return;

        isStarting = true;
        try
        {
            SpeechClient speechClient = SpeechClient.Create();
            response = speechClient.StreamingRecognize();

            // Initial configuration request.
            StreamingRecognizeRequest request = new StreamingRecognizeRequest
            {
                StreamingConfig = new StreamingRecognitionConfig()
                {
                    Config = new RecognitionConfig()
                    {
                        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
                        SampleRateHertz = audioConfig.sampleRate,
                        LanguageCode = langCode,
                    },
                    InterimResults = true,
                }
            };
            await response.WriteAsync(request);

            // Turn the microphone on; it records into a looping 20-second clip.
            AudioClip audioClip = Microphone.Start(micName, true, 20, audioConfig.sampleRate);
            audioSource.clip = audioClip;

            // Wait until the microphone delivers samples. Without this wait the microphone
            // is sometimes not initialised on restart.
            float timerStartTime = Time.realtimeSinceStartup;
            while (!(Microphone.GetPosition(micName) > 0))
            {
                if (Time.realtimeSinceStartup - timerStartTime >= MicInitializationTimeout)
                {
                    Debug.LogError("マイクが初期化できませんでした");
                    // Release what was opened above instead of leaving it dangling.
                    Microphone.End(micName);
                    await response.WriteCompleteAsync();
                    return;
                }

                // Yield instead of spinning so the main thread is not blocked while waiting.
                await Task.Yield();
            }

            // Play the microphone clip through the AudioSource
            // (the AudioMixer group is set to -80 dB, so there is no feedback).
            audioSource.loop = true;
            audioSource.Play();

            // Lets StopSTT cancel the pending restart delay.
            cancellationTokenSource = new CancellationTokenSource();

            // Start consuming responses in the background.
            responseHandler = ResponseHandlerTask();

            IsListening = true;
            Debug.Log("喋ってください");
        }
        finally
        {
            isStarting = false;
        }

        await RestartAfterStreamingLimit();
    }

    /// <summary>
    /// Stops listening: cancels the pending restart, stops the microphone and playback,
    /// completes the request stream and waits for the response handler to finish.
    /// Does nothing when no session is running.
    /// </summary>
    private async Task StopSTT()
    {
        if (!IsListening) return;

        IsListening = false;

        // Cancel the restart delay first so a restart cannot fire while shutdown is awaited below.
        CancellationTokenSource pendingRestart = cancellationTokenSource;
        cancellationTokenSource = null;
        pendingRestart?.Cancel();
        pendingRestart?.Dispose();

        Microphone.End(micName);
        audioSource.Stop();

        await response.WriteCompleteAsync(); // tell the service no more audio is coming
        await responseHandler;               // wait for the response loop to drain

        Debug.Log("STTを終了");
    }

    /// <summary>
    /// Waits <see cref="StreamingLimit"/> milliseconds and then restarts the session.
    /// The wait is cancelled when <see cref="StopSTT"/> runs first.
    /// </summary>
    private async Task RestartAfterStreamingLimit()
    {
        try
        {
            await Task.Delay(StreamingLimit, cancellationTokenSource.Token);
            Debug.Log("Streaming limit reached, restarting");
            await StopSTT();
            await StartSTT();
        }
        catch (TaskCanceledException e)
        {
            // Reached when the session is stopped before the limit.
            Debug.Log(e.Message);
        }
    }

    /// <summary>
    /// Reads recognition responses until the stream ends and logs the first alternative
    /// of the first result as an interim or final transcript.
    /// </summary>
    private async Task ResponseHandlerTask()
    {
        AsyncResponseStream<StreamingRecognizeResponse> responseStream = response.GetResponseStream();
        while (await responseStream.MoveNextAsync())
        {
            StreamingRecognizeResponse responseItem = responseStream.Current;
            if (responseItem.Results.Count <= 0) continue;

            StreamingRecognitionResult result = responseItem.Results[0];
            // Guard against a result that carries no alternatives.
            if (result.Alternatives.Count <= 0) continue;

            string transcript = result.Alternatives[0].Transcript.Trim();
            if (result.IsFinal)
            {
                Debug.Log("Final Result: " + transcript);
            }
            else
            {
                Debug.Log("Interim Result: " + transcript);
            }
        }

        // The loop ends once response.WriteCompleteAsync() has been called.
        Debug.Log("ループ抜けた！");
    }

    /// <summary>
    /// Converts the first channel of each audio buffer to 16-bit little-endian PCM
    /// and writes it to the recognition stream while listening.
    /// </summary>
    /// <param name="data">Interleaved float samples.</param>
    /// <param name="channels">Number of interleaved channels (1 for mono, 2 for stereo).</param>
    private async void OnAudioFilterRead(float[] data, int channels)
    {
        if (!IsListening) return;

        // Size the buffer from the data actually received rather than from dspBufferSize,
        // so a differing callback length can neither overflow nor pad the chunk.
        int frameCount = data.Length / channels;
        byte[] buffer = new byte[frameCount * 2];

        // Convert the first channel from floating point to 16-bit samples packed into a byte array.
        // reference: https://github.com/naudio/NAudio/blob/ec5266ca90e33809b2c0ceccd5fdbbf54e819568/Docs/RawSourceWaveStream.md#playing-from-a-byte-array
        for (int i = 0; i < frameCount; i++)
        {
            // Clamp first: a sample outside [-1, 1] would not fit in a short after scaling.
            float clamped = Math.Max(-1f, Math.Min(1f, data[i * channels]));
            short sample = (short)(clamped * NormalizedFloatTo16BitConversionFactor);

            // Write the bytes directly (little-endian) to avoid allocating an array per sample.
            buffer[i * 2] = (byte)(sample & 0xFF);
            buffer[i * 2 + 1] = (byte)((sample >> 8) & 0xFF);
        }

        try
        {
            ByteString chunk = ByteString.CopyFrom(buffer, 0, buffer.Length);
            await response.WriteAsync(new StreamingRecognizeRequest() { AudioContent = chunk });
        }
        catch (Exception e)
        {
            // This is an async void callback, so an exception escaping here cannot be observed
            // by any caller. A write can fail, for example, when it races with StopSTT
            // completing the stream.
            Debug.LogWarning("Audio chunk was not sent: " + e.Message);
        }
    }

    /// <summary>Stops the session when the application quits.</summary>
    async void OnApplicationQuit()
    {
        await StopSTT();
    }
}

2つ目の方法 ( AudioClip.GetData()を使用 )

先程のスクリプトにおいて, メンバ変数に

GoogleSTT2.cs

// Microphone sample position recorded at the end of the previous Update(); the next read starts here.
private int lastSample = 0;

を追加し, OnAudioFilterRead()を削除してください. さらに, 以下のコードを追加して下さい.

GoogleSTT2.cs

    /// <summary>
    /// Each frame, sends the microphone samples recorded since the previous frame
    /// to the recognition stream as 16-bit PCM.
    /// </summary>
    async void Update()
    {
        if (!(Microphone.IsRecording(micName) && IsListening))
        {
            // Forget the old position while idle so a stale offset from the previous
            // session is not mistaken for a buffer wrap when recording starts again.
            // NOTE(review): assumes a newly started recording reports positions from 0 — confirm.
            lastSample = 0;
            return;
        }

        AudioClip clip = audioSource.clip;
        int pos = Microphone.GetPosition(micName);
        int diff = pos - lastSample;

        // Debug.Log(String.Format("pos, lastSample, diff -> {0}, {1}, {2}\n", pos, lastSample, diff));

        // The microphone records into a looping clip, so the position wraps back to the start.
        // A negative difference means the write head wrapped; the unread span then runs from
        // lastSample to the end of the clip plus from the start of the clip to pos.
        if (diff < 0)
        {
            diff += clip.samples;
        }
        if (diff <= 0) return;

        // Relies on AudioClip.GetData wrapping around to the clip start when the read
        // runs past the end of the clip.
        float[] samples = new float[diff * clip.channels];
        clip.GetData(samples, lastSample);

        // Advance before awaiting: Update runs again on later frames while the write is
        // still pending, and must not resend the same samples.
        lastSample = pos;

        byte[] bytearray = ConvertAudioClipDataToInt16ByteArray(samples);
        if (bytearray.Length == 0) return;

        ByteString chunk = ByteString.CopyFrom(bytearray, 0, bytearray.Length);
        await response.WriteAsync(new StreamingRecognizeRequest() { AudioContent = chunk });
    }

    /// <summary>
    /// Converts normalized float samples to 16-bit signed little-endian PCM bytes.
    /// </summary>
    /// <param name="data">Float samples, nominally in the range [-1, 1].</param>
    /// <returns>
    /// Two bytes per input sample, low byte first. An empty array when
    /// <paramref name="data"/> is null or empty.
    /// </returns>
    private byte[] ConvertAudioClipDataToInt16ByteArray(float[] data)
    {
        if (data == null || data.Length == 0)
        {
            return new byte[0];
        }

        // Allocate the output once instead of growing a stream and allocating per sample.
        byte[] bytes = new byte[data.Length * sizeof(Int16)];
        for (int i = 0; i < data.Length; i++)
        {
            float value = data[i];
            if (float.IsNaN(value))
            {
                // Convert.ToInt16 throws on NaN; treat it as silence.
                value = 0f;
            }

            // Clamp before scaling: a value even slightly outside [-1, 1] would make
            // Convert.ToInt16 throw OverflowException.
            float clamped = Math.Max(-1f, Math.Min(1f, value));
            Int16 sample = Convert.ToInt16(clamped * Int16.MaxValue);

            // Emit little-endian explicitly, independent of the platform byte order.
            bytes[i * 2] = (byte)(sample & 0xFF);
            bytes[i * 2 + 1] = (byte)((sample >> 8) & 0xFF);
        }
        return bytes;
    }

補足
もし文字起こしが不安定だったり,うまくいかない場合は ProjectSettings -> Audio で DefaultSpeakerModeやDSPBufferSizeなどをいじってみてください.

#4. おわりに

STTの体感としては, どちらの方法も低遅延で文字起こしが返ってくる印象です.
ご自身のPC性能によると思いますが, 僕の場合, 長時間STTを実行していると, 若干重くなると感じました.
また, Profilerで両者を比較しても, そこまで差異は見られなかったです.

参考

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up