Qiita Engineer Festa 2024：2024年7月17日まで開催中！

Unity Google Cloud Speech to Text APIとVADアルゴリズム結合

Posted at 2024-07-01

🎤 Unity Google Cloud Speech to Text APIとVADアルゴリズム結合

Google Cloud Speech to Text APIは公式にUnityを直接サポートしていません。
しかし、REST APIを通じてGoogle Cloud Speech-to-TextをUnityで使用することができます。

📜 ソースコード

以下のコードは、音声活動検出（Voice Activity Detection, VAD）を独自に実装した例です。MicrophoneInputがマイク入力のサンプルを分析し、一定の音量（しきい値）を超えた場合を音声活動とみなす、簡易的な音声活動検出を行います。

より精巧なVADアルゴリズムを使用するには、GCP Speech-to-Text API自体のVAD機能を使ったストリーミング認識を利用することをお勧めします。

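Google.Cloud.Speech.V1（NuGetパッケージマネージャコンソールからインストール）は、公式クライアントライブラリを利用する場合に必要です。以下のコードはUnityWebRequestでREST APIを直接呼び出すため、このパッケージがなくても動作します。代わりに、コード中で使用しているUniTask、UniRx、Newtonsoft.Json（Json.NET）、TextMeshProをプロジェクトに導入しておく必要があります。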

GoogleSTTServiceは、受け取った音声データをGoogle STT APIに送信し、変換されたテキストをTranscriptViewに返します。
MicrophoneInputは音声データを収集し、それをGoogleSTTServiceに渡します。
TranscriptViewは、MicrophoneInputおよびGoogleSTTServiceと相互作用します。

using Cysharp.Threading.Tasks;
using System;
using System.Text;
using UnityEngine;
using UnityEngine.Networking;
using Newtonsoft.Json.Linq;
using UniRx;

/// <summary>
/// Sends 16 kHz LINEAR16 audio to the Google Cloud Speech-to-Text REST endpoint
/// (<c>speech:recognize</c>) and publishes the recognized text through
/// <see cref="OnRecognizeSpeechCommand"/>.
/// </summary>
public class GoogleSTTService : MonoBehaviour
{
    // NOTE(review): the API key is compiled into the client; restrict it in the
    // Google Cloud console or proxy the request through your own backend.
    private const string API_KEY = "YOUR_API_KEY";
    private const string URL = "https://speech.googleapis.com/v1/speech:recognize?key=";
    private const string Locale =
        // "en-US"
        // "ko-KR"
        "ja-JP"
        ;

    /// <summary>
    /// Fires with the transcript on success, or with <see cref="string.Empty"/>
    /// when the request or response handling failed. Does not fire when the
    /// request succeeded but contained no transcript.
    /// </summary>
    public ReactiveCommand<string> OnRecognizeSpeechCommand = new();

    /// <summary>
    /// Recognizes speech in <paramref name="audioData"/> and publishes the result.
    /// </summary>
    /// <param name="audioData">Raw audio bytes; sent as LINEAR16 at 16000 Hz.</param>
    /// <remarks>
    /// Kept as <c>async void</c> so existing fire-and-forget callers still compile;
    /// because nothing can await it, every failure is caught and logged here.
    /// </remarks>
    public async void RecognizeSpeech(byte[] audioData)
    {
        if (audioData == null || audioData.Length == 0)
        {
            Debug.LogWarning("GoogleSTTService.RecognizeSpeech() called with no audio data.");
            return;
        }

        string transcript;
        try
        {
            transcript = await RequestTranscriptAsync(audioData);
        }
        catch (Exception exception)
        {
            // UniTask's UnityWebRequest awaiter reports failed requests by throwing,
            // and JSON parsing can throw as well; an unhandled exception in an
            // async void method cannot be observed by the caller.
            Debug.LogError($"GoogleSTTService.RecognizeSpeech() request.error is [{exception.Message}]");
            OnRecognizeSpeechCommand.Execute(string.Empty);
            return;
        }

        // Published outside the try block so that an exception thrown by a
        // subscriber is not mistaken for a request failure.
        if (!string.IsNullOrEmpty(transcript))
        {
            OnRecognizeSpeechCommand.Execute(transcript);
        }
    }

    /// <summary>
    /// Posts the audio to the recognize endpoint and extracts the first transcript.
    /// </summary>
    /// <returns>The first alternative's transcript, or null when the response has none.</returns>
    /// <exception cref="InvalidOperationException">The request finished without success.</exception>
    private static async UniTask<string> RequestTranscriptAsync(byte[] audioData)
    {
        string audioContent = Convert.ToBase64String(audioData);
        string requestJson = $"{{\"config\": {{\"encoding\":\"LINEAR16\",\"sampleRateHertz\":16000,\"languageCode\":\"{Locale}\"}},\"audio\":{{\"content\":\"{audioContent}\"}}}}";

        using var request = new UnityWebRequest(URL + API_KEY, "POST");
        request.uploadHandler = new UploadHandlerRaw(Encoding.UTF8.GetBytes(requestJson));
        request.downloadHandler = new DownloadHandlerBuffer();
        request.SetRequestHeader("Content-Type", "application/json");

        await request.SendWebRequest();

        // Covers ConnectionError, ProtocolError and DataProcessingError alike.
        if (request.result != UnityWebRequest.Result.Success)
        {
            throw new InvalidOperationException(request.error);
        }

        var json = JObject.Parse(request.downloadHandler.text);

        // SelectToken returns null for a missing property or an empty array,
        // whereas indexing an empty "results" array with [0] would throw.
        return json.SelectToken("results[0].alternatives[0].transcript")?.ToString();
    }
}

using System;
using UnityEngine;
using UniRx;

/// <summary>
/// Records the default microphone into a looping 10 second, 16 kHz clip and runs a
/// simple amplitude-threshold voice activity detector on it. When voice has been
/// heard and is then followed by <see cref="VADTimeout"/> seconds below the
/// threshold, the clip contents are published as 16-bit little-endian PCM.
/// </summary>
public class MicrophoneInput : MonoBehaviour
{
    private const int SampleWindow = 128;
    private const float VoiceThreshold = 0.25f;
    private const float VADTimeout = 1.0f; // 1 second timeout for VAD

    // Reused every physics tick so the level check does not allocate.
    private readonly float[] levelSamples = new float[SampleWindow];

    private AudioClip microphoneClip;
    private float lastVoiceDetectedTime;

    // True once the threshold was exceeded since the last publish; prevents
    // uploading the buffer over and over while the room stays silent.
    private bool voiceDetectedSinceLastSend;

    /// <summary>Fires with the recorded audio as 16-bit little-endian PCM bytes.</summary>
    public ReactiveCommand<byte[]> OnMaxLevelChangeCommand = new();

    private void Start()
    {
        microphoneClip = Microphone.Start(null, true, 10, 16000);
        if (microphoneClip == null)
        {
            Debug.LogError("MicrophoneInput.Start() could not start recording from the default microphone.");
            enabled = false; // stops FixedUpdate from touching a missing clip
            return;
        }

        lastVoiceDetectedTime = Time.time;
    }

    private void OnDestroy()
    {
        // Release the device; otherwise recording continues after this component is gone.
        if (Microphone.IsRecording(null))
        {
            Microphone.End(null);
        }
    }

    private void FixedUpdate()
    {
        CheckMaxLevel();

        if (Time.time - lastVoiceDetectedTime <= VADTimeout)
        {
            return;
        }

        // Silence has lasted for the timeout: publish only if there was voice to send.
        if (voiceDetectedSinceLastSend)
        {
            var microphoneData = GetMicrophoneData();
            if (microphoneData != null)
            {
                OnMaxLevelChangeCommand.Execute(microphoneData);
                voiceDetectedSinceLastSend = false;
            }
        }

        lastVoiceDetectedTime = Time.time; // Restart the silence timer
    }

    /// <summary>
    /// Inspects the most recent <see cref="SampleWindow"/> samples and records the
    /// current time as "voice detected" when their peak exceeds the threshold.
    /// </summary>
    private void CheckMaxLevel()
    {
        int startPosition = Microphone.GetPosition(null) - SampleWindow + 1;
        if (startPosition <= 0)
        {
            return; // not enough samples before the current position yet
        }

        microphoneClip.GetData(levelSamples, startPosition);

        float maxLevel = 0f;
        foreach (var sample in levelSamples)
        {
            float absSample = Mathf.Abs(sample);
            if (absSample > maxLevel)
            {
                maxLevel = absSample;
            }
        }

        if (maxLevel > VoiceThreshold)
        {
            lastVoiceDetectedTime = Time.time;
            voiceDetectedSinceLastSend = true;
        }
    }

    /// <summary>
    /// Converts the whole clip to 16-bit little-endian PCM, oldest sample first.
    /// </summary>
    /// <returns>The PCM bytes, or null when the recording position is not positive.</returns>
    /// <remarks>
    /// The entire 10 second buffer is returned, so audio from an earlier utterance
    /// that is still inside the window is included again.
    /// </remarks>
    private byte[] GetMicrophoneData()
    {
        int position = Microphone.GetPosition(null);
        if (position <= 0)
        {
            return null;
        }

        float[] samples = new float[microphoneClip.samples * microphoneClip.channels];

        // Start reading at the current recording position: GetData wraps around the
        // end of the clip, so the looping buffer comes out in chronological order
        // instead of being spliced at index 0.
        microphoneClip.GetData(samples, position);

        byte[] audioData = new byte[samples.Length * 2];
        for (int i = 0; i < samples.Length; i++)
        {
            // Clamp first so an out-of-range float cannot overflow the short cast.
            short pcm = (short)(Mathf.Clamp(samples[i], -1f, 1f) * short.MaxValue);

            // Explicit little-endian byte order, with no per-sample allocation.
            audioData[i * 2] = (byte)(pcm & 0xFF);
            audioData[i * 2 + 1] = (byte)((pcm >> 8) & 0xFF);
        }

        return audioData;
    }
}

using Cysharp.Threading.Tasks;
using UnityEngine;
using TMPro;
using UniRx;

/// <summary>
/// Wires the microphone capture to the speech service and shows each recognized
/// transcript in a TextMeshPro label.
/// </summary>
public class TranscriptView : MonoBehaviour
{
    [SerializeField] private MicrophoneInput microphoneInput;
    [SerializeField] private GoogleSTTService googleSTTService;

    public TMP_Text transcriptText;

    private void Awake()
    {
        // Fall back to components on this GameObject when nothing was assigned in the inspector.
        if (microphoneInput == null)
        {
            microphoneInput = GetComponent<MicrophoneInput>();
        }

        if (googleSTTService == null)
        {
            googleSTTService = GetComponent<GoogleSTTService>();
        }

        // Both subscriptions are disposed together with this component.
        microphoneInput.OnMaxLevelChangeCommand
            .Subscribe(ForwardAudioToRecognizer)
            .AddTo(this);

        googleSTTService.OnRecognizeSpeechCommand
            .Subscribe(ShowTranscript)
            .AddTo(this);
    }

    /// <summary>Hands captured audio bytes to the speech service.</summary>
    private void ForwardAudioToRecognizer(byte[] pcmBytes) => googleSTTService.RecognizeSpeech(pcmBytes);

    /// <summary>Replaces the label text with the latest transcript.</summary>
    private void ShowTranscript(string recognizedText) => transcriptText.text = recognizedText;
}

🪅 音声活動検出（Voice Activity Detection, VAD）アルゴリズム

音声活動検出（Voice Activity Detection, VAD）アルゴリズムは、オーディオ信号から音声と非音声の区間を区別する技術です。このアルゴリズムはさまざまな応用分野で使用されます。例えば、音声認識システムでは、VADが音声区間を識別することで不要なノイズを除去し、音声認識の精度を向上させます。また、通信システムでは、送信するデータを減らすことで帯域幅を節約することができます。

VADアルゴリズムの基本原理

エネルギー基準方法：音声信号は一般的に非音声区間よりも高いエネルギーを持つため、信号のエネルギーを測定して音声区間を検出します。
周波数ドメイン方法：音声信号と非音声信号は周波数スペクトルで異なる特性を持つため、周波数分析を通じて区別することができます。
統計的手法：信号の統計的特性を利用して音声区間と非音声区間を区別します。例えば、信号の自己相関関数やクロスエントロピーを利用することができます。
機械学習方法：音声データと非音声データを学習して分類モデルを生成します。最近では、ディープラーニングを活用したVADモデルも多く使用されています。

追加資料

韓国音響学会誌 - Signal Subspace-based Voice Activity Detection Using Generalized Gaussian Distribution（一般化ガウス分布を利用した信号部分空間ベースの音声検出手法）

C#でのVAD実装例

理解を助けるための例であり、今回の記事の主題で使用された例ではありません。

C#でVADアルゴリズムを実装するために、NAudioライブラリを使用できます。NAudioは、WAVファイルのロード、再生、処理など、オーディオ処理のためのライブラリです。NAudioライブラリのインストールは、NuGetパッケージマネージャを使用して行えます。次のコマンドを使用してください。

Install-Package NAudio

この例では、エネルギー基準のVADを実装していますが、他の方法（周波数ドメイン方法、統計的手法、機械学習方法）も同様に実装できます。

以下に、簡単なエネルギー基準のVAD実装例を示します。

using System;
using NAudio.Wave;

/// <summary>
/// Minimal energy-based voice activity detector: reads an audio file in chunks of
/// one second of samples and prints, per chunk, whether its RMS level exceeds a threshold.
/// </summary>
class Program
{
    private const string DefaultFilePath = "path_to_your_wav_file.wav";
    private const float DefaultThreshold = 0.1f;  // energy threshold

    /// <summary>
    /// Entry point.
    /// </summary>
    /// <param name="args">
    /// Optional: <c>args[0]</c> is the audio file path and <c>args[1]</c> the RMS
    /// threshold (invariant-culture decimal). Missing or unparsable values fall
    /// back to the built-in defaults.
    /// </param>
    static void Main(string[] args)
    {
        string filePath = args != null && args.Length > 0 ? args[0] : DefaultFilePath;

        float threshold = DefaultThreshold;
        if (args != null && args.Length > 1 &&
            !float.TryParse(
                args[1],
                System.Globalization.NumberStyles.Float,
                System.Globalization.CultureInfo.InvariantCulture,
                out threshold))
        {
            threshold = DefaultThreshold; // TryParse zeroes the out value on failure
        }

        using (var reader = new AudioFileReader(filePath))
        {
            int sampleRate = reader.WaveFormat.SampleRate;
            int channels = reader.WaveFormat.Channels;

            // One second of interleaved samples per analysis chunk.
            float[] buffer = new float[sampleRate * channels];

            int samplesRead;
            while ((samplesRead = reader.Read(buffer, 0, buffer.Length)) > 0)
            {
                float rms = ComputeRms(buffer, samplesRead);

                if (rms > threshold)
                {
                    Console.WriteLine("音声活動検出");
                }
                else
                {
                    Console.WriteLine("無音");
                }
            }
        }
    }

    /// <summary>
    /// Root-mean-square of the first <paramref name="count"/> samples.
    /// </summary>
    private static float ComputeRms(float[] samples, int count)
    {
        // Accumulate in double: summing tens of thousands of small squares in a
        // float loses low-order bits once the running total grows.
        double sumOfSquares = 0.0;
        for (int i = 0; i < count; i++)
        {
            sumOfSquares += (double)samples[i] * samples[i];
        }

        return (float)Math.Sqrt(sumOfSquares / count);
    }
}

このコードは、指定されたWAVファイルをNAudioライブラリで読み込み、1秒分のサンプルごとにRMS（二乗平均平方根）を計算します。RMSが一定のエネルギーしきい値を超えた区間を音声活動、それ以外を無音として出力する、簡単なVADの実装です。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up