Environment
Chip: MacBook M1
OS: Ventura 13.2.1 (22D68)
Unity: 2021.3.16f1
Making the synthesized voice speak
Here is the full code; the details are explained afterwards.
```c#
using System;
using UnityEngine;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using System.Threading.Tasks;

public class TextToSpeechService
{
    static string speechKey = "YOUR_API_KEY";
    static string serviceRegion = "YOUR_INSTANCE";
    static Uri endpoint = new Uri("YOUR_ENDPOINT");

    SpeechConfig config;
    private SpeechSynthesizer synthesizer;

    private void Init()
    {
        config = SpeechConfig.FromEndpoint(endpoint, speechKey);
        config.SpeechRecognitionLanguage = "ja-JP";
        config.SpeechSynthesisLanguage = "ja-JP";
        config.SpeechSynthesisVoiceName = "ja-JP-NanamiNeural";
        config.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm);

        synthesizer = new SpeechSynthesizer(config, null);
        synthesizer.SynthesisCanceled += (s, e) =>
        {
            var cancellation = SpeechSynthesisCancellationDetails.FromResult(e.Result);
            Debug.LogError(
                $"CANCELED:\nReason=[{cancellation.Reason}]\nErrorDetails=[{cancellation.ErrorDetails}]\nDid you update the subscription info?");
        };
    }

    public async void Speak(string text)
    {
        using var result = await synthesizer.SpeakTextAsync(text);
        if (result.Reason == ResultReason.SynthesizingAudioCompleted)
        {
            Debug.Log("Synthesis completed.");
            var audioDataStream = AudioDataStream.FromResult(result);
            // var filePath = Application.dataPath + "/Audio/" + text + ".wav";
            // await audioDataStream.SaveToWaveFileAsync(filePath);
            // Debug.Log($"Audio file was saved to {filePath}.");
        }
        else if (result.Reason == ResultReason.Canceled)
        {
            var cancellation = SpeechSynthesisCancellationDetails.FromResult(result);
            Debug.Log($"CANCELED: Reason={cancellation.Reason}");
            if (cancellation.Reason == CancellationReason.Error)
            {
                Debug.Log($" ErrorCode={cancellation.ErrorCode}");
                Debug.Log($" ErrorDetails={cancellation.ErrorDetails}");
            }
        }
        else
        {
            Debug.Log("Unknown error occurred.");
        }
    }
}
```
Code for Unity
```c#
using System;
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Microsoft.CognitiveServices.Speech;
using System.Threading.Tasks;
using System.Threading;
using UnityEngine.Events;

public class TextToSpeechService : MonoBehaviour
{
    public AudioSource audioSource;

    static string speechKey = "YOUR_API_KEY";
    static string serviceRegion = "YOUR_INSTANCE";
    static Uri endpoint = new Uri("YOUR_ENDPOINT");

    private const int SampleRate = 24000;

    SpeechConfig config;
    private SpeechSynthesizer synthesizer;
    // Set to true once the stream has been fully read, so playback can be stopped later
    private bool audioSourceNeedStop;

    private void Start()
    {
        audioSource = audioSource == null ? GetComponent<AudioSource>() : audioSource;

        config = SpeechConfig.FromEndpoint(endpoint, speechKey);
        config.SpeechRecognitionLanguage = "ja-JP";
        config.SpeechSynthesisLanguage = "ja-JP";
        config.SpeechSynthesisVoiceName = "ja-JP-NanamiNeural";
        config.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm);

        synthesizer = new SpeechSynthesizer(config, null);
        synthesizer.SynthesisCanceled += (s, e) =>
        {
            var cancellation = SpeechSynthesisCancellationDetails.FromResult(e.Result);
            Debug.LogError(
                $"CANCELED:\nReason=[{cancellation.Reason}]\nErrorDetails=[{cancellation.ErrorDetails}]\nDid you update the subscription info?");
        };
    }

    public async void Speak(string text)
    {
        try
        {
            using var result = await synthesizer.SpeakTextAsync(text);
            if (result.Reason == ResultReason.SynthesizingAudioCompleted)
            {
                Debug.Log("Synthesis completed.");
                var audioDataStream = AudioDataStream.FromResult(result);

                var filePath = Application.dataPath + "/Audio/" + text + ".wav";
                await audioDataStream.SaveToWaveFileAsync(filePath);
                Debug.Log($"Audio file was saved to {filePath}.");

                if (!audioDataStream.CanReadData(4092 * 2)) // audio clip requires 4096 samples before it's ready to play
                {
                    return;
                }

                // Convert to an AudioClip (streamed through the PCM reader callback)
                var audioClip = AudioClip.Create(
                    "Speech",
                    SampleRate * 600, // Can speak 10mins audio as maximum
                    1,
                    SampleRate,
                    true,
                    (float[] audioChunk) =>
                    {
                        var chunkSize = audioChunk.Length;
                        var audioChunkBytes = new byte[chunkSize * 2];
                        var readBytes = audioDataStream.ReadData(audioChunkBytes);

                        // Convert 16-bit little-endian PCM into floats in [-1, 1]
                        for (int i = 0; i < chunkSize; ++i)
                        {
                            if (i < readBytes / 2)
                            {
                                audioChunk[i] = (short)(audioChunkBytes[i * 2 + 1] << 8 | audioChunkBytes[i * 2]) / 32768.0F;
                            }
                            else
                            {
                                audioChunk[i] = 0.0f;
                            }
                        }

                        if (readBytes == 0)
                        {
                            Thread.Sleep(200); // Leave some time for the audioSource to finish playback
                            audioSourceNeedStop = true;
                        }
                    });

                audioClip.name = "DynamicClip";
                audioSource.clip = audioClip;
                audioSource.Play();
            }
            else if (result.Reason == ResultReason.Canceled)
            {
                var cancellation = SpeechSynthesisCancellationDetails.FromResult(result);
                Debug.Log($"CANCELED: Reason={cancellation.Reason}");
                if (cancellation.Reason == CancellationReason.Error)
                {
                    Debug.Log($" ErrorCode={cancellation.ErrorCode}");
                    Debug.Log($" ErrorDetails={cancellation.ErrorDetails}");
                }
            }
            else
            {
                Debug.Log("Unknown error occurred.");
            }
        }
        catch (Exception e)
        {
            Debug.LogError(e.ToString());
            throw;
        }
    }

    private void OnDestroy()
    {
        synthesizer.Dispose();
    }
}
```
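For reference, a minimal way to trigger this from another script might look like the sketch below. The `SpeechTrigger` class and the Space-key check are only an illustration and are not part of the code above.

```c#
using UnityEngine;

// Hypothetical caller: attach next to TextToSpeechService and press Space to speak.
public class SpeechTrigger : MonoBehaviour
{
    [SerializeField] private TextToSpeechService textToSpeech;

    private void Update()
    {
        if (Input.GetKeyDown(KeyCode.Space))
        {
            textToSpeech.Speak("こんにちは、Unityから喋っています。");
        }
    }
}
```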
Details
Getting an API key
For how to obtain an API key, see this article.
Going to the service's Overview page shows your KEY and endpoint.
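Hard-coding the key as in the sample above is fine for a quick test, but one option is to read it from environment variables instead. A minimal sketch; the variable names SPEECH_KEY and SPEECH_ENDPOINT are arbitrary choices for this example.

```c#
using System;

// Example only: read the credentials from environment variables instead of hard-coding them.
// SPEECH_KEY and SPEECH_ENDPOINT are arbitrary names chosen for this sketch.
public static class SpeechCredentials
{
    public static string Key => Environment.GetEnvironmentVariable("SPEECH_KEY");
    public static Uri Endpoint => new Uri(Environment.GetEnvironmentVariable("SPEECH_ENDPOINT"));
}
```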
Setting up the config
First, set up the config.
```c#
SpeechConfig config;

// Set the endpoint and subscription key
config = SpeechConfig.FromEndpoint(endpoint, speechKey);

// Language to speak in
config.SpeechSynthesisLanguage = "ja-JP";

// Voice to use
config.SpeechSynthesisVoiceName = "ja-JP-NanamiNeural";

// Output format
config.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm);

// Create the synthesizer
synthesizer = new SpeechSynthesizer(config, null);

// Error handling
synthesizer.SynthesisCanceled += (s, e) =>
{
    var cancellation = SpeechSynthesisCancellationDetails.FromResult(e.Result);
    Debug.LogError(
        $"CANCELED:\nReason=[{cancellation.Reason}]\nErrorDetails=[{cancellation.ErrorDetails}]\nDid you update the subscription info?");
};
```
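If you only have the key and the region (the unused serviceRegion field in the code above), the SDK can also build the config from those two values instead of an endpoint URI. The region string "japaneast" below is just an example.

```c#
// Alternative: build the config from the subscription key and region
// instead of an endpoint URI (e.g. serviceRegion = "japaneast").
config = SpeechConfig.FromSubscription(speechKey, serviceRegion);
```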
Voice types
For Japanese there are seven voice types. You can listen to samples by following the link.
ja-JP-NanamiNeural (Female)
ja-JP-KeitaNeural (Male)
ja-JP-AoiNeural (Female)
ja-JP-DaichiNeural (Male)
ja-JP-MayuNeural (Female)
ja-JP-NaokiNeural (Male)
ja-JP-ShioriNeural (Female)
For other languages, check the official reference. According to it, using a dedicated markup language (SSML) lets you tune the voice and add emotional expression.
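As a rough sketch, you can pass SSML by calling SpeakSsmlAsync instead of SpeakTextAsync. The prosody values below are arbitrary examples.

```c#
// Rough SSML example: adjust speaking rate and pitch via <prosody>.
// The rate/pitch values are arbitrary; see the SSML reference for all options.
var ssml = @"
<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='ja-JP'>
  <voice name='ja-JP-NanamiNeural'>
    <prosody rate='-10%' pitch='+5%'>こんにちは、少しゆっくり話しています。</prosody>
  </voice>
</speak>";
using var result = await synthesizer.SpeakSsmlAsync(ssml);
```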
Making it speak
SpeakTextAsync starts the speech synthesis.
result.AudioData holds the synthesized audio as a byte array, so convert it into whatever form you need. Here, AudioDataStream.FromResult wraps that data and saves it to a WAV file.
The result also contains the status of the request, which is used for error handling.
```c#
// Start speech synthesis
using var result = await synthesizer.SpeakTextAsync(text);
if (result.Reason == ResultReason.SynthesizingAudioCompleted)
{
    Debug.Log("Synthesis completed.");
    var audioDataStream = AudioDataStream.FromResult(result);
    var filePath = Application.dataPath + "/Audio/" + text + ".wav";
    await audioDataStream.SaveToWaveFileAsync(filePath);
    Debug.Log($"Audio file was saved to {filePath}.");
}
else if (result.Reason == ResultReason.Canceled)
{
    var cancellation = SpeechSynthesisCancellationDetails.FromResult(result);
    Debug.Log($"CANCELED: Reason={cancellation.Reason}");
    if (cancellation.Reason == CancellationReason.Error)
    {
        Debug.Log($" ErrorCode={cancellation.ErrorCode}");
        Debug.Log($" ErrorDetails={cancellation.ErrorDetails}");
    }
}
else
{
    Debug.Log("Unknown error occurred.");
}
```
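If you don't need streamed playback, another option is to decode result.AudioData into an AudioClip in one go. A minimal sketch, assuming the Riff24Khz16BitMonoPcm format above (16-bit mono PCM at 24 kHz behind a standard 44-byte WAV header); the ToAudioClip helper is not part of the code above.

```c#
// Sketch: convert result.AudioData (WAV bytes) into an AudioClip all at once.
// Assumes 16-bit mono PCM at 24 kHz with a standard 44-byte RIFF header.
private AudioClip ToAudioClip(byte[] wavBytes)
{
    const int headerSize = 44;
    var sampleCount = (wavBytes.Length - headerSize) / 2;
    var samples = new float[sampleCount];
    for (int i = 0; i < sampleCount; i++)
    {
        // little-endian 16-bit PCM -> float in [-1, 1]
        short sample = (short)(wavBytes[headerSize + i * 2 + 1] << 8 | wavBytes[headerSize + i * 2]);
        samples[i] = sample / 32768.0f;
    }

    var clip = AudioClip.Create("Speech", sampleCount, 1, 24000, false);
    clip.SetData(samples, 0);
    return clip;
}
```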
Errors I ran into
The SDK's dylib was blocked by macOS and would not run.
Open Settings → Security & Privacy; in the Security section there is a prompt asking whether to allow the unknown app, so choose Allow and it will run.
Official references
.NET namespaces
Note that usage examples are not included, so it is hard to tell how to implement things from this alone.
A small number of tutorials are available here.
The official GitHub repository may be the most useful reference of all.