More than 1 year has passed since last update.

【Unity×C#×VoiceVox】VoiceVoxのAPIを使ってテキストから音声を再生する

Posted at 2023-12-29

概要

Unity×C#の環境でVoiceVoxのAPIを叩いて、テキストから音声を再生するまでのデモです。

開発環境

Windows 10
Unity 2019.4.31f1
Api Compatibility Level .NET Standard 2.0
VoiceVox 0.14.10

使用したパッケージ

UniTask

リンク先からunitypackageをダウンロードして、事前にプロジェクトにインポートしておきます。
https://github.com/Cysharp/UniTask

System.Text.Json

Jsonの解析に使いました。リンク先からパッケージをダウンロードし、UnityのAssetsの中にPluginsフォルダを作って、その中にdllファイルを入れておきます。
https://www.nuget.org/packages/System.Text.Json/

また、同様にして以下の依存パッケージのdllも入れておきます。
入れ忘れがあったら、Unity側のエラーで知らせてくれます。
https://www.nuget.org/packages/System.Threading.Tasks.Extensions
https://www.nuget.org/packages/System.Text.Encodings.Web
https://www.nuget.org/packages/System.Runtime.CompilerServices.Unsafe
https://www.nuget.org/packages/System.Memory
https://www.nuget.org/packages/System.Buffers
https://www.nuget.org/packages/Microsoft.Bcl.AsyncInterfaces

UnityNugetやNugetForUnityというパッケージ管理ツールでも入れられるようですが、今回は使用していません。

実装

詳細はスクリプト中にコメントで記載しています。

VoiceVoxRequestModel.cs

VoiceVox APIのクエリに渡す構造を定義します。

namespace VoiceVox.QueryJson
{
    using System;
    using System.Collections.Generic;
    using System.Text.Json;
    using System.Text.Json.Serialization;

    /// <summary>
    /// Root object of the VOICEVOX audio query JSON, mapped with System.Text.Json attributes.
    /// The query is deserialized, a few scale values are changed, and the object is
    /// serialized again, so every property that should survive that round trip has to be
    /// represented here.
    /// </summary>
    [Serializable]
    public class QueryJson
    {
        /// <summary>Maps to the "accent_phrases" JSON property.</summary>
        [JsonPropertyName("accent_phrases")]
        public AccentPhrase[] AccentPhrases { get; set; }

        /// <summary>Speech speed; maps to "speedScale".</summary>
        [JsonPropertyName("speedScale")]
        public float SpeedScale { get; set; }

        /// <summary>Pitch; maps to "pitchScale".</summary>
        [JsonPropertyName("pitchScale")]
        public float PitchScale { get; set; }

        /// <summary>Intonation; maps to "intonationScale".</summary>
        [JsonPropertyName("intonationScale")]
        public float IntonationScale { get; set; }

        /// <summary>Volume; maps to "volumeScale".</summary>
        [JsonPropertyName("volumeScale")]
        public float VolumeScale { get; set; }

        /// <summary>Maps to the "prePhonemeLength" JSON property.</summary>
        [JsonPropertyName("prePhonemeLength")]
        public float PrePhonemeLength { get; set; }

        /// <summary>Maps to the "postPhonemeLength" JSON property.</summary>
        [JsonPropertyName("postPhonemeLength")]
        public float PostPhonemeLength { get; set; }

        /// <summary>Maps to the "outputSamplingRate" JSON property.</summary>
        [JsonPropertyName("outputSamplingRate")]
        public int OutputSamplingRate { get; set; }

        /// <summary>Maps to the "outputStereo" JSON property.</summary>
        [JsonPropertyName("outputStereo")]
        public bool OutputStereo { get; set; }

        /// <summary>Maps to the "kana" JSON property.</summary>
        [JsonPropertyName("kana")]
        public string Kana { get; set; }

        /// <summary>
        /// Holds JSON properties that are not declared on this class so that they are
        /// written back unchanged when the query is serialized again (for example
        /// properties added by an engine version newer than the one this class was written for).
        /// </summary>
        [JsonExtensionData]
        public Dictionary<string, JsonElement> ExtensionData { get; set; }
    }

    /// <summary>
    /// One element of the "accent_phrases" array of the audio query.
    /// </summary>
    [Serializable]
    public class AccentPhrase
    {
        /// <summary>Maps to the "moras" JSON property.</summary>
        [JsonPropertyName("moras")]
        public Mora[] Moras { get; set; }

        /// <summary>Maps to the "accent" JSON property.</summary>
        [JsonPropertyName("accent")]
        public int Accent { get; set; }

        /// <summary>Maps to the "pause_mora" JSON property; null when the JSON value is null.</summary>
        [JsonPropertyName("pause_mora")]
        public Mora PauseMora { get; set; }

        /// <summary>Maps to the "is_interrogative" JSON property.</summary>
        [JsonPropertyName("is_interrogative")]
        public bool IsInterrogative { get; set; }

        /// <summary>Undeclared JSON properties, preserved across a deserialize/serialize round trip.</summary>
        [JsonExtensionData]
        public Dictionary<string, JsonElement> ExtensionData { get; set; }
    }

    /// <summary>
    /// One element of a "moras" array (also used for "pause_mora").
    /// </summary>
    [Serializable]
    public class Mora
    {
        /// <summary>Maps to the "text" JSON property.</summary>
        [JsonPropertyName("text")]
        public string Text { get; set; }

        /// <summary>Maps to the "consonant" JSON property.</summary>
        [JsonPropertyName("consonant")]
        public string Consonant { get; set; }

        /// <summary>Maps to "consonant_length"; nullable because the JSON value can be null.</summary>
        [JsonPropertyName("consonant_length")]
        public float? ConsonantLength { get; set; }

        /// <summary>Maps to the "vowel" JSON property.</summary>
        [JsonPropertyName("vowel")]
        public string Vowel { get; set; }

        /// <summary>Maps to "vowel_length"; nullable because the JSON value can be null.</summary>
        [JsonPropertyName("vowel_length")]
        public float? VowelLength { get; set; }

        /// <summary>Maps to the "pitch" JSON property.</summary>
        [JsonPropertyName("pitch")]
        public float Pitch { get; set; }

        /// <summary>Undeclared JSON properties, preserved across a deserialize/serialize round trip.</summary>
        [JsonExtensionData]
        public Dictionary<string, JsonElement> ExtensionData { get; set; }
    }
}

VoiceVoxSpeaker.cs

VoiceVox APIを処理するメインのスクリプトです。
VoiceVoxにクエリを投げる処理は非同期処理にしたいため、UniTaskを利用しています。UniTaskの説明は末尾の参考資料が詳しかったです。

using UnityEngine;
using UnityEngine.Networking;
using System.Text;
using System.Text.Json;
using Cysharp.Threading.Tasks;
using VoiceVox.QueryJson;

/// <summary>
/// Converts text to an <see cref="AudioClip"/> through the HTTP API of a locally running
/// VOICEVOX Engine. Start the engine before running this code.
/// </summary>
/// <remarks>
/// Written for Unity 2019.4.31f1: errors are detected with <c>isNetworkError</c> and
/// <c>isHttpError</c>. On Unity 2020.1 or later UnityWebRequest has a <c>result</c> property;
/// use that instead.
/// The <c>speaker</c> query parameter is reportedly scheduled for removal. If requests stop
/// working, check the engine version and the API documentation:
/// https://voicevox.github.io/voicevox_engine/api/
/// Asynchronous processing uses UniTask: https://github.com/Cysharp/UniTask
/// WAV decoding uses WavUtility: https://github.com/deadlyfingers/UnityWav/tree/master
/// </remarks>
public class VoiceVoxSpeaker
{
    private readonly string _voiceVoxUrl = "http://127.0.0.1:50021";
    private readonly int _speaker = 3;

    /// <summary>
    /// Creates a speaker bound to the given VOICEVOX speaker ID.
    /// </summary>
    /// <param name="speaker">Speaker ID sent with every request.</param>
    public VoiceVoxSpeaker(int speaker)
    {
        _speaker = speaker;
    }

    /// <summary>
    /// Builds an audio query for <paramref name="text"/>, applies the given scales and
    /// synthesizes the result into an audio clip.
    /// </summary>
    /// <param name="text">Text to read aloud.</param>
    /// <param name="speedScale">Speech speed (0.50 to 2.00, default 1.00).</param>
    /// <param name="pitchScale">Pitch (-0.15 to 0.15, default 0.00).</param>
    /// <param name="intonationScale">Intonation (0.00 to 2.00, default 1.00).</param>
    /// <param name="volumeScale">Volume (0.00 to 2.00, default 1.00).</param>
    /// <returns>The synthesized clip, or null when either request failed (the error is logged).</returns>
    public async UniTask<AudioClip> TextToSpeech(string text, float speedScale, float pitchScale, float intonationScale, float volumeScale)
    {
        // Step 1: create the query.
        string queryJson = await SendAudioQuery(text);
        if (string.IsNullOrEmpty(queryJson))
        {
            // The request error has already been logged; there is nothing to deserialize.
            return null;
        }

        // Step 2: overwrite the scale values in the query.
        string newQueryJson = ChangeQueryJson(queryJson, speedScale, pitchScale, intonationScale, volumeScale);

        // Step 3: synthesize and convert the WAV bytes into an AudioClip.
        return await GetAudioClip(newQueryJson);
    }

    /// <summary>
    /// Requests an audio query for <paramref name="text"/>.
    /// </summary>
    /// <returns>The response body (JSON), or null when the request failed.</returns>
    private async UniTask<string> SendAudioQuery(string text)
    {
        // The text travels in the query string, so it must be percent-encoded; otherwise
        // characters such as '&', '#', '+', '%' or line breaks corrupt the request.
        string escapedText = System.Uri.EscapeDataString(text ?? string.Empty);
        string url = $"{_voiceVoxUrl}/audio_query?text={escapedText}&speaker={_speaker}";

        using (var request = UnityWebRequest.Post(url, new WWWForm()))
        {
            await request.SendWebRequest();

            if (request.isNetworkError || request.isHttpError)
            {
                Debug.LogError(request.error);
                return null;
            }

            return request.downloadHandler.text;
        }
    }

    /// <summary>
    /// Returns a copy of the query with the four scale values replaced.
    /// System.Text.Json is used because Unity's JsonUtility did not handle some fields
    /// (such as null values) correctly.
    /// </summary>
    /// <param name="queryJson">Original JSON returned by the audio query request.</param>
    /// <param name="speedScale">Speech speed (0.50 to 2.00, default 1.00).</param>
    /// <param name="pitchScale">Pitch (-0.15 to 0.15, default 0.00).</param>
    /// <param name="intonationScale">Intonation (0.00 to 2.00, default 1.00).</param>
    /// <param name="volumeScale">Volume (0.00 to 2.00, default 1.00).</param>
    /// <returns>The modified JSON.</returns>
    public static string ChangeQueryJson(string queryJson, float speedScale, float pitchScale, float intonationScale, float volumeScale)
    {
        var queryObject = JsonSerializer.Deserialize<QueryJson>(queryJson);

        queryObject.SpeedScale = speedScale;
        queryObject.PitchScale = pitchScale;
        queryObject.IntonationScale = intonationScale;
        queryObject.VolumeScale = volumeScale;

        return JsonSerializer.Serialize(queryObject);
    }

    /// <summary>
    /// Posts the query to the synthesis endpoint and converts the returned WAV data
    /// into an audio clip.
    /// </summary>
    /// <param name="queryJson">Audio query JSON sent as the request body.</param>
    /// <returns>The decoded clip, or null when the request failed.</returns>
    private async UniTask<AudioClip> GetAudioClip(string queryJson)
    {
        var url = $"{_voiceVoxUrl}/synthesis?speaker={_speaker}";
        using (var req = new UnityWebRequest(url, "POST"))
        {
            // The body is raw JSON, not a form.
            req.SetRequestHeader("Content-Type", "application/json");

            byte[] bodyRaw = Encoding.UTF8.GetBytes(queryJson);
            req.uploadHandler = new UploadHandlerRaw(bodyRaw);
            req.downloadHandler = new DownloadHandlerBuffer();

            // Run the synthesis.
            await req.SendWebRequest();

            if (req.isNetworkError || req.isHttpError)
            {
                Debug.LogError(req.error);
                return null;
            }

            // Decode the WAV bytes into an AudioClip.
            return WavUtility.ToAudioClip(req.downloadHandler.data);
        }
    }
}

AudioPlayer.cs

AudioClipを再生する処理です。

using UnityEngine;
/// <summary>
/// Plays audio clips through an <see cref="AudioSource"/> on the same GameObject.
/// </summary>
public class AudioPlayer : MonoBehaviour
{
    private AudioSource _audioSource;

    void Start()
    {
        EnsureAudioSource();
    }

    /// <summary>
    /// Assigns <paramref name="clip"/> to the audio source and starts playback.
    /// </summary>
    /// <param name="clip">Clip to play. A null clip is ignored with a warning.</param>
    public void PlayAudioClip(AudioClip clip)
    {
        if (clip == null)
        {
            Debug.LogWarning("AudioPlayer.PlayAudioClip was called with a null clip.");
            return;
        }

        // This may be called before Start has run (for example from another
        // component's Start), so the audio source is resolved on demand.
        EnsureAudioSource();

        _audioSource.clip = clip;
        _audioSource.Play();
    }

    /// <summary>
    /// Gets the AudioSource component, adding one when the GameObject has none.
    /// </summary>
    private void EnsureAudioSource()
    {
        if (_audioSource != null)
        {
            return;
        }

        _audioSource = GetComponent<AudioSource>();
        if (_audioSource == null)
        {
            _audioSource = gameObject.AddComponent<AudioSource>();
        }
    }
}

WavUtility.cs

ToAudioClipメソッドをそのまま使わせてもらいました。

DemoVoiceVox.cs

実際に、テキストを音声合成して、再生するデモスクリプトです。

using UnityEngine;
using Cysharp.Threading.Tasks;

/// <summary>
/// Demo component: synthesizes a fixed text with VOICEVOX when the scene starts and
/// plays it through the <see cref="AudioPlayer"/> on the same GameObject.
/// </summary>
public class DemoVoiceVox : MonoBehaviour
{
    private string _text;
    private float _speedScale;
    private float _pitchScale;
    private float _intonationScale;
    private float _volumeScale;
    private VoiceVoxSpeaker _voiceVoxSpeaker;

    void Start()
    {
        _text = @"
        私はキューブオブジェクトです。名前はまだありません。どこで生成されたのか、まったく覚えがありません。
        何でもクリックやドラッグという操作とともにポップアップし、三次元空間に配置されたことだけは記憶しています。
        ";
        // Alternative sample text:
        // _text = @"
        // 私は「がにまた」。名前はまだありません。どこで生成されたのか、まったく覚えがありません。
        // 何でも「ひまた工房」というところで作られ、三次元空間に産み落とされたことだけは記憶しています。
        // ";
        _speedScale = 1.0f;
        _pitchScale = 0.0f;
        _intonationScale = 1.0f;
        _volumeScale = 1.0f;

        // Speaker ID 2 -> VOICEVOX: Shikoku Metan (normal)
        // Speaker ID 3 -> VOICEVOX: Zundamon (normal)
        _voiceVoxSpeaker = new VoiceVoxSpeaker(3);

        // Fire and forget: Forget() suppresses the "call is not awaited" warning.
        StartVoiceVoxPlayer().Forget();
    }

    /// <summary>
    /// Requests the speech audio and plays it once it has arrived.
    /// </summary>
    private async UniTask StartVoiceVoxPlayer()
    {
        // Fetch the audio from VOICEVOX as an AudioClip.
        AudioClip clip = await _voiceVoxSpeaker.TextToSpeech(_text, _speedScale, _pitchScale, _intonationScale, _volumeScale);

        // The component may have been destroyed while the request was in flight;
        // calling GetComponent on a destroyed object would throw.
        if (this == null)
        {
            return;
        }

        if (clip == null)
        {
            Debug.LogWarning("DemoVoiceVox: no audio clip was returned. Is the VoiceVox Engine running?");
            return;
        }

        // Classes derived from MonoBehaviour cannot be created with new,
        // so the player is looked up on this GameObject.
        AudioPlayer audioPlayer = GetComponent<AudioPlayer>();
        if (audioPlayer == null)
        {
            Debug.LogWarning("DemoVoiceVox: no AudioPlayer component was found on this GameObject.");
            return;
        }

        audioPlayer.PlayAudioClip(clip);
    }
}

利用方法

Unityを起動します。
Cube等のGameObjectを用意します。
用意したGameObjectのコンポーネントにDemoVoiceVox.cs、AudioPlayer.csを追加します。
用意したGameObjectにAudioSourceコンポーネントを追加します。
VoiceVoxを起動します。
Console画面を開き、Playモードに入ります。
数秒して、読み上げが始まったら成功です。

上の画像はゲームオブジェクトに各種コンポーネントをセットした状態の参考画像です。
なお、画像中のゲームオブジェクトはCubeではなく、「がにまた」（製作者：ひまた）です。

参考資料

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up