More than 1 year has passed since last update.

非同期処理対話システム in Unity

Posted at 2022-10-29

# 非同期の対話システム in Unity です。

このシステムを構成する音声認識、応答生成、音声合成と、これらを管理する対話マネージャーを作ります。

必要なpackage

Azureの設定が必要です。

公式サイト

unity package

実装

対話マネージャー

スクリプト

Chat.cs

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Cysharp.Threading.Tasks;
using System.Threading.Tasks;
using System.Text;
using System.IO;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using UnityEngine.Networking;


/// <summary>
/// Dialogue manager: chains speech recognition (<see cref="STT"/>), response generation
/// (<see cref="ChatAPI"/>) and speech synthesis (<see cref="TTS"/>) into one conversational
/// turn, and starts the next turn as soon as the previous one has finished.
/// </summary>
/// <remarks>
/// Progress is published through plain public fields (<see cref="isSpeaking"/>,
/// <see cref="isGenerating"/>, the emotion / dialogue-act values) so that other components,
/// e.g. animation, can poll them instead of being driven from inside the awaits.
/// </remarks>
public class Chat : MonoBehaviour
{
    // Pipeline stages, created in Start().
    private STT stt;
    private TTS tts;
    private ChatAPI chatAPI;

    // Azure Speech SDK settings.
    // NOTE(review): these are placeholders; do not commit a real subscription key to source control.
    static string subscriptionKey = "your subscription key";
    static string serviceRegion = "your region";
    static string language = "ja-JP";

    static string voiceName = "ja-JP-NanamiNeural";

    // Seconds to wait after synthesis before trusting AudioSource.isPlaying in Update().
    private const float PlaybackGraceSeconds = 1.0f;

    SpeechRecognizer speechRecognizer;

    SpeechSynthesizer speechSynthesizer;
    AudioConfig audioConfig; // not used by the code in this class

    // Endpoint of the external dialogue server that ChatAPI posts to.
    [SerializeField] string chatAPIurl = "http://127.0.0.1:5000";

    // True while a turn (recognition -> generation -> synthesis/playback) is in flight;
    // Update() does not start a new turn while it is set.
    private bool tasking;

    // Animation is not driven from inside the awaits, so the state is exposed through
    // these fields and read elsewhere.
    public bool isSpeaking = false;

    public bool isGenerating = false;

    public float agentEmotionRate;

    public string agentEmotion;

    public float agentDialogueActRate;

    public string agentDialogueAct;

    // Optional. When assigned, synthesized audio is played through this source
    // (see TTS.TextToSpeech); otherwise the Speech SDK synthesizer is created with its default output.
    [SerializeField] AudioSource characterAudioSource;

    // Time.time at which the last synthesis finished.
    private float bufferTimer;

    // Set in OnDestroy() so that a turn that is still awaiting stops touching this component.
    private bool destroyed;

    /// <summary>Creates the pipeline stages and configures the Azure Speech SDK objects.</summary>
    private void Start(){
        stt = new STT();
        chatAPI = new ChatAPI();
        tts = new TTS();
        // HTTP client and target URL for the dialogue server.
        chatAPI.httpClient = new System.Net.Http.HttpClient();
        chatAPI.url = chatAPIurl;

        // Without the SSML template the synthesis step cannot work, so fail loudly once
        // instead of throwing on every turn.
        var ssmlAsset = Resources.Load("Cyber") as TextAsset;
        if(ssmlAsset == null){
            Debug.LogError("SSML template 'Cyber' was not found in a Resources folder. Chat is disabled.");
            enabled = false;
            return;
        }
        tts.ssml = ssmlAsset.text;

        // Azure Speech SDK configuration shared by recognizer and synthesizer.
        var speechConfig = SpeechConfig.FromSubscription(subscriptionKey, serviceRegion);
        speechConfig.SpeechRecognitionLanguage = language;
        speechConfig.SpeechSynthesisVoiceName = voiceName;
        speechRecognizer = new SpeechRecognizer(speechConfig);

        if(characterAudioSource != null){
            // Raw 16 kHz / 16-bit mono PCM is the format TTS converts into an AudioClip.
            // Passing null as the audio config keeps the SDK from playing the audio itself.
            speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm);
            speechSynthesizer = new SpeechSynthesizer(speechConfig, null);
            tts.audioSource = characterAudioSource;
        }
        else{
            speechSynthesizer = new SpeechSynthesizer(speechConfig);
        }
    }

    /// <summary>
    /// Finishes the current turn once AudioSource playback has ended, then starts a new turn
    /// if none is running.
    /// </summary>
    void Update(){
        if(isSpeaking && characterAudioSource != null && !isGenerating){
            if(Time.time - bufferTimer < PlaybackGraceSeconds){
                return;
            }
            if(characterAudioSource.isPlaying){
                return;
            }
            isSpeaking = false;
            tasking = false;
            Debug.Log("Finish Synthesizing!");
        }

        if(tasking){
            return;
        }
        tasking = true;
        // ChatAsync handles its own exceptions, so the task can be started without awaiting it.
        _ = ChatAsync();
    }

    /// <summary>
    /// Runs one conversational turn: recognize speech, ask the dialogue server for a reply,
    /// publish the returned emotion / dialogue-act values, and synthesize the reply.
    /// </summary>
    /// <remarks>
    /// Never throws: any failure is logged and the state flags are reset so that
    /// <see cref="Update"/> can start a fresh turn instead of staying blocked.
    /// </remarks>
    async Task ChatAsync()
    {
        try
        {
            var response = await stt.SpeechToText(speechRecognizer);
            if(destroyed){
                return;
            }
            if(string.IsNullOrEmpty(response)){
                // Nothing was recognized; release the turn so recognition restarts.
                Debug.Log("No Response!");
                tasking = false;
                return;
            }
            Debug.Log("response : " + response);

            isGenerating = true;
            var responseJson = await chatAPI.ChatText(response);
            if(destroyed){
                return;
            }
            Debug.Log("response : " + responseJson.response);
            agentEmotionRate = responseJson.agentEmotionRate;
            agentEmotion = responseJson.agentEmotion;
            agentDialogueAct = responseJson.agentDialogueAct;
            agentDialogueActRate = responseJson.agentDialogueActRate;

            await tts.TextToSpeech(speechSynthesizer, responseJson.response);
            if(destroyed){
                return;
            }

            // Generation and synthesis are over in both branches below.
            isGenerating = false;

            if(characterAudioSource != null){
                // Playback has just been started on the AudioSource. isSpeaking and tasking stay
                // set until Update() sees that playback has ended.
                isSpeaking = true;
                bufferTimer = Time.time;
                return;
            }

            isSpeaking = false;
            tasking = false;
            Debug.Log("Finish speaking!");
        }
        catch(System.Exception e)
        {
            Debug.LogError("Chat turn failed : " + e);
            isGenerating = false;
            isSpeaking = false;
            tasking = false;
        }
    }

    /// <summary>Releases the Speech SDK objects and the HTTP client created in Start().</summary>
    private void OnDestroy(){
        destroyed = true;
        DisposeQuietly(speechSynthesizer);
        DisposeQuietly(speechRecognizer);
        DisposeQuietly(chatAPI != null ? chatAPI.httpClient : null);
    }

    // Disposes a resource without letting a failure escape from OnDestroy().
    // NOTE(review): the Speech SDK may refuse to dispose while an async operation is still
    // running; in that case the failure is only logged here. Confirm against the SDK docs.
    private static void DisposeQuietly(System.IDisposable resource){
        if(resource == null){
            return;
        }
        try
        {
            resource.Dispose();
        }
        catch(System.Exception e)
        {
            Debug.LogWarning("Failed to dispose " + resource.GetType().Name + " : " + e.Message);
        }
    }

}

解説

音声認識・応答生成・音声合成は非同期（別Thread）で実行します。そのため、Unity側とは単純な変数しか同期できないので、処理の状態はbool値に書き出して管理しています。

chatAPIurl については, 対話システムでHTTP通信を行うURLについて記述します。別サーバーに対話システムを立てておくことで、HTTP通信が可能となります。

taskingが SpeechToText, Chat, TextToSpeech を行っているときに, trueになり新しく音声認識を行わないようになっています。

isSpeakingが音声合成の結果を再生している間にtrueになる変数です。

isGeneratingが応答生成と音声合成をしているときにtrueになる変数です。

SpeechToText

スクリプト

STT.cs

using System;
using System.IO;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using Cysharp.Threading.Tasks;

/// <summary>
/// Speech-to-text step: runs one single-shot recognition on an Azure Speech SDK
/// <see cref="SpeechRecognizer"/> and returns the recognized text.
/// </summary>
class STT
{
    /// <summary>
    /// Logs the outcome of a recognition. Uses UnityEngine.Debug, like the rest of this class,
    /// rather than System.Console.
    /// </summary>
    static void OutputSpeechRecognitionResult(SpeechRecognitionResult speechRecognitionResult)
    {
        switch (speechRecognitionResult.Reason)
        {
            case ResultReason.RecognizedSpeech:
                UnityEngine.Debug.Log($"RECOGNIZED: Text={speechRecognitionResult.Text}");
                break;
            case ResultReason.NoMatch:
                UnityEngine.Debug.Log("NOMATCH: Speech could not be recognized.");
                break;
            case ResultReason.Canceled:
                var cancellation = CancellationDetails.FromResult(speechRecognitionResult);
                if (cancellation.Reason == CancellationReason.Error)
                {
                    UnityEngine.Debug.LogError(
                        $"CANCELED: Reason={cancellation.Reason}\n" +
                        $"CANCELED: ErrorCode={cancellation.ErrorCode}\n" +
                        $"CANCELED: ErrorDetails={cancellation.ErrorDetails}\n" +
                        "CANCELED: Did you set the speech resource key and region values?");
                }
                else
                {
                    UnityEngine.Debug.Log($"CANCELED: Reason={cancellation.Reason}");
                }
                break;
        }
    }


    /// <summary>Recognizes a single utterance.</summary>
    /// <param name="speechRecognizer">Configured recognizer to listen with.</param>
    /// <returns>
    /// The recognized text. Never null: an empty string is returned when the result carries
    /// no text, so callers can test for "nothing recognized" with a plain empty-string check.
    /// </returns>
    public async UniTask<string> SpeechToText(SpeechRecognizer speechRecognizer)
    {
        UnityEngine.Debug.Log("Start Recognizing!");
        var speechRecognitionResult = await speechRecognizer.RecognizeOnceAsync();
        OutputSpeechRecognitionResult(speechRecognitionResult);
        return speechRecognitionResult.Text ?? string.Empty;
    }
}

解説

基本的には公式リファレンスと同様です

ChatAPI

スクリプト

ChatAPI.cs

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Cysharp.Threading.Tasks;
using System.Text;
using System.IO;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using System.Net.Http;

/// <summary>
/// Response-generation step: posts the recognized user text as JSON to an external dialogue
/// server and returns the parsed reply.
/// </summary>
public class ChatAPI
{
    // Endpoint of the dialogue server.
    public string url = "http://127.0.0.1:5000";

    // Must be assigned by the owner before ChatText / jsonPost is called.
    public HttpClient httpClient;

    /// <summary>JSON body sent to the server.</summary>
    [System.Serializable]
    public class RequestTextJson{
        public string request;
    }

    /// <summary>
    /// JSON body expected from the server. The initializers are the values used when a reply
    /// object has to be created locally (see the fallbacks in ChatText and jsonPost).
    /// </summary>
    [System.Serializable]
    public class ResponseTextJson{
        public string response = "もう一度、言ってください。";

        public float userEmotionRate = 0.0f;
        public float agentEmotionRate = 0.0f;
        public string userEmotion = "none";
        public string agentEmotion = "excited";

        public float userDialogueActRate = 0.0f;
        public float agentDialogueActRate = 0.0f;
        public string userDialogueAct = "none";
        public string agentDialogueAct = "none";
    }

    /// <summary>Sends <paramref name="inputText"/> to the server and returns its reply.</summary>
    /// <param name="inputText">Recognized user utterance.</param>
    /// <returns>
    /// A non-null reply whose <c>response</c> is never null or empty. If the server body is empty
    /// or is not valid JSON, a default <see cref="ResponseTextJson"/> is returned instead of
    /// letting the parse failure propagate.
    /// </returns>
    public async UniTask<ResponseTextJson> ChatText(string inputText){
        var data = new RequestTextJson();
        data.request = inputText;
        var jsonRequest = JsonUtility.ToJson(data);
        var response = await jsonPost(url, jsonRequest);

        ResponseTextJson jsonResponse = null;
        try{
            jsonResponse = JsonUtility.FromJson<ResponseTextJson>(response);
        }catch(System.ArgumentException e){
            // e.g. the server answered with an HTML error page instead of JSON.
            Debug.LogWarning("Could not parse the chat server reply as JSON : " + e.Message);
        }
        if (jsonResponse == null){
            jsonResponse = new ResponseTextJson();
        }
        if (string.IsNullOrEmpty(jsonResponse.response)){
            jsonResponse.response = "もう一度、言ってください。";
        }
        return jsonResponse;
    }

    /// <summary>POSTs <paramref name="jsonData"/> as UTF-8 "application/json".</summary>
    /// <param name="url">Target URL.</param>
    /// <param name="jsonData">Serialized request body.</param>
    /// <returns>
    /// The response body as a string. When the request fails with an exception, a serialized
    /// <see cref="ResponseTextJson"/> describing the error is returned instead, so the caller
    /// always receives parseable JSON in that case.
    /// </returns>
    public async UniTask<string> jsonPost(string url, string jsonData){
        try{
            using (var content = new StringContent(jsonData, Encoding.UTF8, "application/json"))
            using (var result = await httpClient.PostAsync(url, content)){
                if (!result.IsSuccessStatusCode){
                    // The body is still returned; the caller decides whether it is usable.
                    Debug.LogWarning("Chat server returned HTTP " + (int)result.StatusCode);
                }
                return await result.Content.ReadAsStringAsync();
            }
        }catch(System.Exception e){
            Debug.Log("Error : " + e);
            var response = new ResponseTextJson();
            response.response = "エラー : " + e;
            response.agentEmotion = "sad";
            response.userEmotion = "error";
            response.agentEmotionRate = 1.0f;
            return JsonUtility.ToJson(response);
        }
    }
}

解説

HttpClientを用いて通信を行います。

UnityでHTTP通信について調べると、WWW・UnityWebRequest・HttpWebRequest の名前が挙がりますが、いずれも非同期処理ではとても使いづらいので、.NET標準の System.Net.Http.HttpClient クラスを使っています。

[System.Serializable]により、Jsonのシリアライズを行い、jsonベースのHTTP通信を行うことができます。

JSON化するためには、JsonUtilityを使います。
JsonUtility.ToJson(request);

JSONからオブジェクトに戻すためにも、同様にJsonUtilityを使います。
JsonUtility.FromJson<ResponseTextJson>(response);

ちなみに音声をやり取りする場合は Stream形式で通信を行う必要があります。そのため、テキストのみで通信を行う形式にしています。

TextToSpeech

スクリプト

TTS.cs


using System;
using System.IO;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using Cysharp.Threading.Tasks;
using UnityEngine;

/// <summary>
/// Text-to-speech step: synthesizes a reply with the Azure Speech SDK and, when an
/// <see cref="AudioSource"/> is assigned, plays the result through it as an AudioClip.
/// </summary>
class TTS
{
    // SSML template; the literal token "TEXT" inside it is replaced with the sentence to speak.
    public string ssml;

    // Not used by the code in this class.
    public string wavPath;

    // Optional playback target. When null, no AudioClip is created here.
    public AudioSource audioSource;

    // Sample rate of the PCM data converted below. The synthesizer is configured for
    // Raw16Khz16BitMonoPcm where it is created (Chat.Start).
    private const int SampleRateHz = 16000;

    // Clip created by the previous call, kept so it can be destroyed when it is replaced.
    private AudioClip synthesizedClip;

    /// <summary>
    /// Logs the outcome of a synthesis. Uses UnityEngine.Debug, like the rest of this class,
    /// rather than System.Console.
    /// </summary>
    static void OutputSpeechSynthesisResult(SpeechSynthesisResult speechSynthesisResult, string text)
    {
        switch (speechSynthesisResult.Reason)
        {
            case ResultReason.SynthesizingAudioCompleted:
                UnityEngine.Debug.Log($"Speech synthesized for text: [{text}]");
                break;
            case ResultReason.Canceled:
                var cancellation = SpeechSynthesisCancellationDetails.FromResult(speechSynthesisResult);
                if (cancellation.Reason == CancellationReason.Error)
                {
                    UnityEngine.Debug.LogError(
                        $"CANCELED: Reason={cancellation.Reason}\n" +
                        $"CANCELED: ErrorCode={cancellation.ErrorCode}\n" +
                        $"CANCELED: ErrorDetails=[{cancellation.ErrorDetails}]\n" +
                        "CANCELED: Did you set the speech resource key and region values?");
                }
                else
                {
                    UnityEngine.Debug.Log($"CANCELED: Reason={cancellation.Reason}");
                }
                break;
            default:
                break;
        }
    }

    /// <summary>Synthesizes <paramref name="text"/> and starts playback if possible.</summary>
    /// <param name="speechSynthesizer">Configured synthesizer.</param>
    /// <param name="text">Plain text to speak; XML-escaped before it is placed into the SSML template.</param>
    /// <remarks>
    /// When <see cref="ssml"/> is null or empty the text is synthesized without a template.
    /// When <see cref="audioSource"/> is set, playback is started but not awaited. Nothing is
    /// played if synthesis did not complete or produced no audio.
    /// </remarks>
    public async UniTask TextToSpeech(SpeechSynthesizer speechSynthesizer, string text)
    {
        UnityEngine.Debug.Log("Start Synthesising!");
        var plainText = text ?? string.Empty;

        SpeechSynthesisResult speechSynthesisResult;
        if (string.IsNullOrEmpty(ssml))
        {
            speechSynthesisResult = await speechSynthesizer.SpeakTextAsync(plainText);
        }
        else
        {
            // Characters such as '<' or '&' in the reply would otherwise produce malformed SSML.
            var ssml_text = ssml.Replace("TEXT", System.Security.SecurityElement.Escape(plainText));
            speechSynthesisResult = await speechSynthesizer.SpeakSsmlAsync(ssml_text);
        }

        using (speechSynthesisResult)
        {
            OutputSpeechSynthesisResult(speechSynthesisResult, text);
            if (audioSource == null)
            {
                return;
            }

            var pcm = speechSynthesisResult.AudioData;
            if (speechSynthesisResult.Reason != ResultReason.SynthesizingAudioCompleted || pcm == null || pcm.Length < 2)
            {
                UnityEngine.Debug.LogWarning("Speech synthesis produced no audio; nothing to play.");
                return;
            }

            // 16-bit little-endian signed PCM -> floats in [-1, 1).
            var sampleCount = pcm.Length / 2;
            var audioData = new float[sampleCount];
            for (var i = 0; i < sampleCount; ++i)
            {
                audioData[i] = (short)(pcm[i * 2 + 1] << 8 | pcm[i * 2]) / 32768.0F;
            }

            var previousClip = synthesizedClip;
            synthesizedClip = AudioClip.Create("SynthesizedAudio", sampleCount, 1, SampleRateHz, false);
            synthesizedClip.SetData(audioData, 0);
            audioSource.clip = synthesizedClip;
            audioSource.Play();

            // Clips created at runtime are not released automatically; drop the one replaced above.
            if (previousClip != null)
            {
                UnityEngine.Object.Destroy(previousClip);
            }
        }
    }
}

cyber.xml

<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts"  xml:lang="ja">
    <voice name="ja-JP-NanamiNeural">
<!-- Alternative speaking style, currently disabled: -->
<!--        <mstts:express-as style="cheerful">-->
         <mstts:express-as style="chat">
<!-- The upper-case placeholder word inside the prosody element is substituted with the sentence to speak before synthesis. -->
            <prosody pitch="13%" rate="4%">
                TEXT
            </prosody>
        </mstts:express-as>
    </voice>
</speak>

解説

基本的に公式ドキュメントと同様にして、Azure Speech SDKを使って音声合成を行います。

音声合成時に口パクを行いたかったため、AudioClip を作成して、それをAudioSourceにつけて再生を行います。

特にspeechSynthesisResultについては、Chat.csにおいて、

Chat.cs

speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm);
speechSynthesizer = new SpeechSynthesizer(speechConfig, null);

のように出力フォーマットを設定し、出力デバイスを null にすることで、SDK側では音声が再生されないようにしています。

また、ssmlのTEXT部分を発話内容に置換して合成することで、既存の Azure Speech Synthesis の NanamiNeural の声を、ssmlで指定したpitchやrateによってかわいらしく変換しています。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up