# Asynchronous Dialogue System in Unity


We will build the speech recognition, response generation, and speech synthesis components of this system, along with a dialogue manager that coordinates them.

Required packages

An Azure Speech resource must be configured. The scripts below use the Azure Speech SDK for Unity and UniTask (Cysharp.Threading.Tasks).

Official site

Unity package

Implementation

Dialogue Manager

Script
Chat.cs
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Cysharp.Threading.Tasks;
using System.Threading.Tasks;
using System.Text;
using System.IO;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using UnityEngine.Networking;


public class Chat : MonoBehaviour
{
    //Instance variables
    private STT stt; 
    private TTS tts;
    private ChatAPI chatAPI;
    //Azure Speech SDK settings.
    static string subscriptionKey = "your subscription key";
    static string serviceRegion = "your region";
    static string language = "ja-JP";

    static string voiceName = "ja-JP-NanamiNeural";

    SpeechRecognizer speechRecognizer;

    SpeechSynthesizer speechSynthesizer;
    AudioConfig audioConfig;

    //Other settings: HTTP endpoint and animation-related state.
    [SerializeField] string chatAPIurl = "http://127.0.0.1:5000";

    private bool tasking;

    //Unity animations cannot be managed from inside the await, so the state is exposed through these flags instead.
    public bool isSpeaking = false;

    public bool isGenerating = false;

    public float agentEmotionRate;

    public string agentEmotion;

    public float agentDialogueActRate;

    public string agentDialogueAct;

    [SerializeField] AudioSource characterAudioSource;

    private float bufferTimer;

    private void Start(){
        //Create the component instances
        stt = new STT();
        chatAPI = new ChatAPI();
        tts = new TTS();
        //HTTP client setup
        chatAPI.httpClient = new System.Net.Http.HttpClient();
        //URL of the dialogue server
        chatAPI.url = chatAPIurl;
        var ssml_asset = Resources.Load("Cyber") as TextAsset;
        tts.ssml = ssml_asset.text;

        //Azure Speech SDK configuration.
        var speechConfig = SpeechConfig.FromSubscription(subscriptionKey, serviceRegion);        
        speechConfig.SpeechRecognitionLanguage = language;
        speechConfig.SpeechSynthesisVoiceName = voiceName;
        speechRecognizer = new SpeechRecognizer(speechConfig);

        if(characterAudioSource != null){
            speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm);
            speechSynthesizer = new SpeechSynthesizer(speechConfig, null);
            tts.audioSource = characterAudioSource;
        }
        else{
            speechSynthesizer = new SpeechSynthesizer(speechConfig);
        }
    }
    async void Update(){
        if(isSpeaking && characterAudioSource != null && !isGenerating){
            if(Time.time - bufferTimer < 1.0f){
                return;
            }
            if(characterAudioSource.isPlaying){
                return;
            }
            isSpeaking = false;
            tasking = false;
            Debug.Log("Finish Synthesizing!");
        }
        //----------------------------------------//
        if(tasking){
            return;
        }
        tasking = true;
        await ChatAsync();
    }
    //Dialogue loop: speech recognition -> response generation -> speech synthesis
    async Task ChatAsync()
    {   
        var response = await stt.SpeechToText(speechRecognizer);
        if (response == ""){
            //Nothing recognized; wait for the next attempt?
            Debug.Log("No Response!");
            tasking = false;
            return;
        }
        Debug.Log("response : " + response);
        isGenerating = true;
        var responseJson = await chatAPI.ChatText(response);
        Debug.Log("response : " +  responseJson.response);
        agentEmotionRate = responseJson.agentEmotionRate;
        agentEmotion = responseJson.agentEmotion;
        agentDialogueAct = responseJson.agentDialogueAct;
        agentDialogueActRate = responseJson.agentDialogueActRate;
        await tts.TextToSpeech(speechSynthesizer, responseJson.response);
        isSpeaking = true;
        //Known issue: isSpeaking is still treated as true while the audio is playing.
        if(characterAudioSource != null){
            isGenerating = false;
            bufferTimer = Time.time;
            return;
        }else{
            isSpeaking = false;
            tasking = false;
            Debug.Log("Finish speaking!");
        }
        //--------------------------------------------------//
    }

}

Explanation

The async processing runs on a separate thread. Since nothing other than simple variables can be synchronized with Unity from there, the state is managed with bool values.

chatAPIurl is the URL the dialogue system is reached at over HTTP. By standing up the dialogue system on a separate server, it can be accessed via HTTP.

tasking becomes true while SpeechToText, Chat, and TextToSpeech are running, which prevents a new speech recognition from starting.

isSpeaking is a flag that is true while the synthesized audio is being played back.

isGenerating is a flag that is true while response generation and speech synthesis are running.
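
Since these flags are plain public fields, other components can poll them from Unity's main thread. A minimal sketch of an animation controller that reads them (the Animator parameter names here are assumptions, not part of this article's code):

AgentAnimationController.cs
using UnityEngine;

public class AgentAnimationController : MonoBehaviour
{
    [SerializeField] Chat chat;          //The dialogue manager shown above
    [SerializeField] Animator animator;  //Animator with "IsSpeaking" / "IsThinking" bool parameters (assumed names)

    void Update()
    {
        //Update() runs on the main thread, so reading the flags that Chat exposes is safe here.
        animator.SetBool("IsSpeaking", chat.isSpeaking);
        animator.SetBool("IsThinking", chat.isGenerating);
        //agentEmotion / agentEmotionRate could be mapped to facial expressions in the same way.
    }
}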

SpeechToText

Script
STT.cs
using System;
using System.IO;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using Cysharp.Threading.Tasks;

class STT
{
    //Log output based on the speech recognition result
    static void OutputSpeechRecognitionResult(SpeechRecognitionResult speechRecognitionResult)
    {
        switch (speechRecognitionResult.Reason)
        {
            case ResultReason.RecognizedSpeech:
                Console.WriteLine($"RECOGNIZED: Text={speechRecognitionResult.Text}");
                break;
            case ResultReason.NoMatch:
                Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                break;
            case ResultReason.Canceled:
                var cancellation = CancellationDetails.FromResult(speechRecognitionResult);
                Console.WriteLine($"CANCELED: Reason={cancellation.Reason}");

                if (cancellation.Reason == CancellationReason.Error)
                {
                    Console.WriteLine($"CANCELED: ErrorCode={cancellation.ErrorCode}");
                    Console.WriteLine($"CANCELED: ErrorDetails={cancellation.ErrorDetails}");
                    Console.WriteLine($"CANCELED: Did you set the speech resource key and region values?");
                }
                break;
        }
    }


    //Speech recognition
    public async UniTask<string> SpeechToText(SpeechRecognizer speechRecognizer)
    {
        UnityEngine.Debug.Log("Start Recognizing!");
        var speechRecognitionResult = await speechRecognizer.RecognizeOnceAsync();
        OutputSpeechRecognitionResult(speechRecognitionResult);
        return speechRecognitionResult.Text;
    }
}
Explanation

This is basically the same as the official reference.

ChatAPI

Script
ChatAPI.cs
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Cysharp.Threading.Tasks;
using System.Text;
using System.IO;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using System.Net.Http;

public class ChatAPI
{
    public string url = "http://127.0.0.1:5000";

    public HttpClient httpClient;
    //JSON classes for the request and response payloads.
    [System.Serializable]
    public class RequestTextJson{
        public string request;
    }
    [System.Serializable]
    public class ResponseTextJson{
        public string response = "もう一度、言ってください。";

        public float userEmotionRate = 0.0f;
        public float agentEmotionRate = 0.0f;
        public string userEmotion = "none";
        public string agentEmotion = "excited";

        public float userDialogueActRate = 0.0f;
        public float agentDialogueActRate = 0.0f;
        public string userDialogueAct = "none";
        public string agentDialogueAct = "none";
    }
    //Send the user's utterance to the dialogue server and parse the JSON response.
    public async UniTask<ResponseTextJson> ChatText(string inputText){
        var data = new RequestTextJson();
        data.request = inputText;
        var jsonRequest = JsonUtility.ToJson(data);
        var response = await jsonPost(url, jsonRequest);
        var jsonResponse = JsonUtility.FromJson<ResponseTextJson>(response);
        if (jsonResponse.response == ""){
            jsonResponse.response = "もう一度、言ってください。";
        }
        return jsonResponse;
    }
    //POST the JSON to the server; on failure, return a fallback ResponseTextJson describing the error.
    public async UniTask<string> jsonPost(string url, string jsonData){
        var content = new StringContent(jsonData, Encoding.UTF8, "application/json");
        try{
            var result = await httpClient.PostAsync(url, content);
            var response = await result.Content.ReadAsStringAsync();
            return response;
        }catch(System.Exception e){
            Debug.Log("Error : " + e);
            var response = new ResponseTextJson();
            response.response = "エラー : " + e;
            response.agentEmotion = "sad";
            response.userEmotion = "error";
            response.agentEmotionRate = 1.0f;
            return JsonUtility.ToJson(response);
        }
    }
}
Explanation

Communication is handled with HttpClient.

If you look up HTTP in Unity, WWW, UnityWebRequest, and HttpWebRequest come up, but they are quite awkward to use asynchronously, so the plain .NET System.Net.Http.HttpClient class is used here instead.

Marking the classes with [System.Serializable] allows them to be serialized to JSON, enabling JSON-based HTTP communication.

To serialize to JSON, use JsonUtility:
JsonUtility.ToJson(request);

To deserialize from JSON, JsonUtility is likewise used:
JsonUtility.FromJson<ResponseTextJson>(response);
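
For reference, the JSON exchanged with the server then looks like this (field names follow RequestTextJson and ResponseTextJson; the values are only illustrative):

Request body
{"request": "こんにちは"}

Response body
{"response": "こんにちは!", "userEmotionRate": 0.4, "agentEmotionRate": 0.8, "userEmotion": "happy", "agentEmotion": "excited", "userDialogueActRate": 0.6, "agentDialogueActRate": 0.7, "userDialogueAct": "greeting", "agentDialogueAct": "greeting"}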

Incidentally, if you want to exchange audio you would have to communicate in a Stream format, so this setup exchanges text only.

TextToSpeech

Script
TTS.cs

using System;
using System.IO;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using Cysharp.Threading.Tasks;
using UnityEngine;

class TTS
{
    public string ssml;    //SSML template; the TEXT placeholder is replaced with the response text.

    public string wavPath;

    public AudioSource audioSource;
    //Log output based on the speech synthesis result
    static void OutputSpeechSynthesisResult(SpeechSynthesisResult speechSynthesisResult, string text)
    {
        switch (speechSynthesisResult.Reason)
        {
            case ResultReason.SynthesizingAudioCompleted:
                Console.WriteLine($"Speech synthesized for text: [{text}]");
                break;
            case ResultReason.Canceled:
                var cancellation = SpeechSynthesisCancellationDetails.FromResult(speechSynthesisResult);
                Console.WriteLine($"CANCELED: Reason={cancellation.Reason}");

                if (cancellation.Reason == CancellationReason.Error)
                {
                    Console.WriteLine($"CANCELED: ErrorCode={cancellation.ErrorCode}");
                    Console.WriteLine($"CANCELED: ErrorDetails=[{cancellation.ErrorDetails}]");
                    Console.WriteLine($"CANCELED: Did you set the speech resource key and region values?");
                }
                break;
            default:
                break;
        }
    }

    //Speech synthesis
    public async UniTask TextToSpeech(SpeechSynthesizer speechSynthesizer, string text)
    {
        UnityEngine.Debug.Log("Start Synthesising!");
        var ssml_text = ssml.Replace("TEXT", text);
        var speechSynthesisResult = await speechSynthesizer.SpeakSsmlAsync(ssml_text);
        OutputSpeechSynthesisResult(speechSynthesisResult, text);
        if(audioSource != null){
            //Convert the raw 16 kHz 16-bit mono PCM bytes into float samples and play them through the AudioSource.
            var sampleCount = speechSynthesisResult.AudioData.Length / 2;
            var audioData = new float[sampleCount];
            for (var i = 0; i < sampleCount; ++i)
            {
                audioData[i] = (short)(speechSynthesisResult.AudioData[i * 2 + 1] << 8 | speechSynthesisResult.AudioData[i * 2]) / 32768.0F;
            }
            var audioClip = AudioClip.Create("SynthesizedAudio", sampleCount, 1, 16000, false);
            audioClip.SetData(audioData, 0);
            audioSource.clip = audioClip;
            audioSource.Play();
        }

    }
}


cyber.xml
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts"  xml:lang="ja">
    <voice name="ja-JP-NanamiNeural">
<!--        <mstts:express-as style="cheerful">-->
         <mstts:express-as style="chat">
            <prosody pitch="13%" rate="4%">
                TEXT
            </prosody>
        </mstts:express-as>
    </voice>
</speak>
Explanation

Speech synthesis uses the Azure Speech SDK, basically in the same way as the official documentation.

Since I wanted to do lip syncing while the synthesized audio plays, an AudioClip is created, attached to an AudioSource, and played from there.

In particular, for speechSynthesisResult, Chat.cs does the following:

Chat.cs
speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm);
speechSynthesizer = new SpeechSynthesizer(speechConfig, null);

This sets the output format and passes null as the output device, so the SDK itself does not play the audio.

Also, by replacing the TEXT part of the SSML, the standard Azure Speech Synthesis ja-JP-NanamiNeural voice is given a cuter delivery.
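
As a follow-up to the lip-sync motivation above, here is a minimal sketch of amplitude-based lip sync driven by the same characterAudioSource. This is not part of the article's code; the blend shape index, gain, and smoothing values are assumptions and need tuning for your own model:

SimpleLipSync.cs
using UnityEngine;

public class SimpleLipSync : MonoBehaviour
{
    [SerializeField] AudioSource characterAudioSource;   //Same AudioSource that TTS plays into
    [SerializeField] SkinnedMeshRenderer faceRenderer;   //Mesh with a mouth-open blend shape
    [SerializeField] int mouthBlendShapeIndex = 0;       //Assumed index of the mouth-open blend shape
    [SerializeField] float gain = 400f;                  //Amplitude-to-weight scaling (tune per voice)
    [SerializeField] float smoothing = 10f;              //Higher values = snappier mouth movement

    readonly float[] samples = new float[256];
    float currentWeight;

    void Update()
    {
        float target = 0f;
        if (characterAudioSource != null && characterAudioSource.isPlaying)
        {
            //Read the samples currently being played and compute their RMS amplitude.
            characterAudioSource.GetOutputData(samples, 0);
            float sum = 0f;
            foreach (var s in samples) sum += s * s;
            float rms = Mathf.Sqrt(sum / samples.Length);
            target = Mathf.Clamp(rms * gain, 0f, 100f);
        }
        //Smooth toward the target so the mouth does not flicker from frame to frame.
        currentWeight = Mathf.Lerp(currentWeight, target, Time.deltaTime * smoothing);
        faceRenderer.SetBlendShapeWeight(mouthBlendShapeIndex, currentWeight);
    }
}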
