AmiVoiceAPIでの音声認識精度が上がらない
Q&A
Closed
解決したいこと
WebSocket インタフェースでのAmiVoice APIの音声認識の精度を高めたい
TypeScriptでwebアプリを作成しており、webアプリ内の機能として音声入力のシステムを実装しています。AmiVoiceを採用して実装しているのですが、精度が非常に低いです。リリースが差し迫っており、至急、お伺いしたいです。
発生している問題・エラー
エラーはなく、プログラム自体は正常に動作します。
該当するソースコード
import { SpeechRecognitionError } from '../../types/speech';
/**
 * Callbacks for AmiVoice recognition protocol events.
 * Each method corresponds to a one-letter server command dispatched in
 * connect(): 'S', 'E', 'C', 'U', 'A' respectively.
 */
interface WrpListener {
/** 'S' event: a speech utterance started (time in ms from the payload). */
utteranceStarted(startTime: number): void;
/** 'E' event: a speech utterance ended (time in ms from the payload). */
utteranceEnded(endTime: number): void;
/** 'C' event: a recognition result entry was created on the server. */
resultCreated(): void;
/** 'U' event: interim recognition result; `result` is a JSON string. */
resultUpdated(result: string): void;
/** 'A' event: finalized recognition result; `result` is a JSON string. */
resultFinalized(result: string): void;
}
/**
 * Shape of the JSON payload carried by 'U' (interim) and 'A' (final)
 * result messages. Only the fields actually read by the listener
 * (results[].tokens[].written and results[].text) are required here.
 */
interface RecognitionResult {
results: {
// Surface forms; the listener concatenates `written` in order.
tokens: { written: string }[];
// Fallback plain text used when no tokens are present.
text: string;
confidence?: number;
// NOTE(review): presumably ms offsets within the audio — confirm against
// the AmiVoice API reference; they are not read by this file.
starttime?: number;
endtime?: number;
}[];
text: string;
code?: string;
message?: string;
}
/**
 * Error narrowed from getUserMedia failures so `name` can be branched on
 * ('NotAllowedError' / 'NotFoundError' in setupAudioProcessing).
 * `name` already exists on Error; re-declared here for explicitness.
 */
interface MediaError extends Error {
name: string;
}
/**
 * Tunable settings for the AmiVoice session. All fields are optional;
 * missing values fall back to DEFAULT_CONFIG via the constructor merge.
 */
interface SpeechServiceConfig {
// Audio codec token sent as the first field of the 's' start command.
codec?: string;
// Sample rate used for both the AudioContext and the mic constraints.
sampleRate?: number;
// ScriptProcessorNode buffer size (frames per onaudioprocess callback).
bufferSize?: number;
// API key; when absent, VITE_AMIVOICE_AUTH_KEY from the env is used.
authorization?: string;
// Interval (ms) between interim 'U' results, passed to the server.
resultUpdatedInterval?: number;
// Recognition extension mode passed to the server.
extension?: string;
// Whether the engine should keep filler tokens in results.
keepFillerToken?: boolean;
// Segmenter tuning string; URL-encoded before being sent.
segmenterProperties?: string;
}
/** Baseline session settings; constructor callers may override any field. */
const DEFAULT_CONFIG: SpeechServiceConfig = {
codec: 'LSB16K', // 16-bit little-endian PCM at 16 kHz (matches the Int16Array frames sent)
sampleRate: 16000,
bufferSize: 4096, // frames per ScriptProcessor callback (~256 ms at 16 kHz)
resultUpdatedInterval: 200, // ms between interim results
extension: 'progressive',
keepFillerToken: false,
segmenterProperties: 'continuous'
};
/**
 * Speech-to-text client for the AmiVoice WebSocket API.
 *
 * Lifecycle: startRecording() -> connect() opens the socket and installs
 * the protocol dispatcher -> setupAudioProcessing() sends the 's' start
 * command, then streams mic audio as 'p'-prefixed 16-bit PCM frames ->
 * results arrive as 'U' (interim) / 'A' (final) messages and are forwarded
 * to the transcript callback -> stopRecording() -> reset() sends 'e' and
 * tears everything down.
 *
 * Server messages are a single command letter optionally followed by a
 * space and a payload (e.g. "U {json}") — see the onmessage switch below.
 */
export class AmiVoiceSpeechService {
// Effective settings: DEFAULT_CONFIG merged with constructor overrides.
private config: SpeechServiceConfig = DEFAULT_CONFIG;
// Web Audio graph pieces; all created in setupAudioProcessing() and
// released in reset().
private audioContext: AudioContext | null = null;
private mediaStream: MediaStream | null = null;
private mediaStreamSource: MediaStreamAudioSourceNode | null = null;
// NOTE(review): ScriptProcessorNode is deprecated; AudioWorklet is the
// modern replacement — worth migrating, but kept as-is here.
private processor: ScriptProcessorNode | null = null;
private webSocket: WebSocket | null = null;
private isRecording = false;
// Epoch ms when recording started; set in startRecording(), only logged.
private recordingStartTime = 0;
// (text, isFinal): isFinal=false for interim 'U' results, true for 'A'.
private onTranscriptCallback: ((text: string, isFinal: boolean) => void) | null = null;
private onErrorCallback: ((error: SpeechRecognitionError) => void) | null = null;
// Translates raw protocol events into transcript callbacks.
private listener: WrpListener;
// Max wait (ms) for the WebSocket to open.
private readonly CONNECTION_TIMEOUT = 15000;
// Max wait (ms) for the server to acknowledge the 's' start command.
private readonly SESSION_TIMEOUT = 8000;
/**
 * @param config Partial overrides merged over DEFAULT_CONFIG.
 */
constructor(config: Partial<SpeechServiceConfig> = {}) {
this.config = { ...DEFAULT_CONFIG, ...config };
// Bound so the methods can be handed out as event handlers directly.
this.startRecording = this.startRecording.bind(this);
this.stopRecording = this.stopRecording.bind(this);
this.listener = {
utteranceStarted: (startTime: number) => {
console.log('Utterance started:', startTime);
},
utteranceEnded: (endTime: number) => {
console.log('Utterance ended:', endTime);
},
resultCreated: () => {
console.log('Result created');
},
// Interim result: extract text and notify with isFinal=false.
resultUpdated: (result: string) => {
console.log('Result updated:', result);
try {
const jsonResult = JSON.parse(result) as RecognitionResult;
if (jsonResult.results && jsonResult.results.length > 0) {
// Prefer concatenating token 'written' forms; otherwise fall back
// to the plain text field with runs of '.' stripped.
const text = jsonResult.results
.map(r => {
if (r.tokens && r.tokens.length > 0) {
return r.tokens.map(t => t.written).join('');
}
if (r.text) {
return r.text.replace(/\.+/g, '').trim();
}
return '';
})
.join('')
.trim();
if (text && this.onTranscriptCallback) {
this.onTranscriptCallback(text, false);
}
}
} catch (error) {
console.error('Result parse error:', error);
}
},
// Final result: same extraction as resultUpdated, but isFinal=true.
resultFinalized: (result: string) => {
console.log('Result finalized:', result);
try {
const jsonResult = JSON.parse(result) as RecognitionResult;
if (jsonResult.results && jsonResult.results.length > 0) {
const text = jsonResult.results
.map(r => {
if (r.tokens && r.tokens.length > 0) {
return r.tokens.map(t => t.written).join('');
}
if (r.text) {
return r.text.replace(/\.+/g, '').trim();
}
return '';
})
.join('')
.trim();
if (text && this.onTranscriptCallback) {
this.onTranscriptCallback(text, true);
}
}
} catch (error) {
console.error('Result parse error:', error);
}
}
};
}
/**
 * Opens the WebSocket (URL from VITE_AMIVOICE_WEBSOCKET_URL) and installs
 * the protocol message dispatcher.
 * @returns true once the socket is open; false on timeout or failure
 *          (the error callback is also invoked — this never rejects).
 */
private async connect(): Promise<boolean> {
try {
// Reuse an already-open socket.
if (this.webSocket?.readyState === WebSocket.OPEN) {
return true;
}
this.webSocket = new WebSocket(import.meta.env.VITE_AMIVOICE_WEBSOCKET_URL);
return new Promise((resolve) => {
// Give up if the socket does not open within CONNECTION_TIMEOUT.
const timeout = setTimeout(() => {
if (this.onErrorCallback) {
this.onErrorCallback({
error: 'CONNECTION_ERROR',
message: '接続がタイムアウトしました'
});
}
resolve(false);
}, this.CONNECTION_TIMEOUT);
if (!this.webSocket) {
resolve(false);
return;
}
this.webSocket.onopen = () => {
console.log('WebSocket接続確立');
clearTimeout(timeout);
resolve(true);
};
this.webSocket.onerror = () => {
console.error('WebSocket接続エラー');
clearTimeout(timeout);
if (this.onErrorCallback) {
this.onErrorCallback({
error: 'CONNECTION_ERROR',
message: 'WebSocket接続に失敗しました'
});
}
resolve(false);
};
this.webSocket.onclose = () => {
console.log('WebSocket接続終了');
clearTimeout(timeout);
// Closure is only an error while a recording is still in progress.
if (this.isRecording && this.onErrorCallback) {
this.onErrorCallback({
error: 'CONNECTION_ERROR',
message: 'WebSocket接続が終了しました'
});
}
};
// Protocol dispatcher: first char is the command letter; the payload
// follows a single separator char (hence substring(2)).
this.webSocket.onmessage = (event) => {
try {
const message = event.data.trim();
const command = message.charAt(0);
const payload = message.length > 2 ? message.substring(2) : '';
switch (command) {
case 's':
// Start-command ack; a non-empty payload is an error message.
if (payload && this.onErrorCallback) {
this.onErrorCallback({
error: 'SESSION_ERROR',
message: `セッション開始エラー: ${payload}`
});
}
break;
case 'S':
// Utterance start; payload is a time in ms.
this.listener.utteranceStarted(parseInt(payload, 10));
break;
case 'E':
// Utterance end; payload is a time in ms.
this.listener.utteranceEnded(parseInt(payload, 10));
break;
case 'C':
this.listener.resultCreated();
break;
case 'U':
// Interim result; payload is a RecognitionResult JSON string.
this.listener.resultUpdated(payload);
break;
case 'A':
// Final result; payload is a RecognitionResult JSON string.
this.listener.resultFinalized(payload);
break;
case 'p':
// Audio-data ('p' command) ack; payload present means an error.
if (payload && this.onErrorCallback) {
this.onErrorCallback({
error: 'PROCESSING_ERROR',
message: `音声データ処理エラー: ${payload}`
});
}
break;
case 'e':
// End-command ack; payload present means an error.
if (payload && this.onErrorCallback) {
this.onErrorCallback({
error: 'SESSION_ERROR',
message: `セッション終了エラー: ${payload}`
});
}
break;
case '?':
// Server-level error; everything after '?' is the message text.
if (this.onErrorCallback) {
const errorMessage = message.substring(1).trim();
this.onErrorCallback({
error: 'SERVER_ERROR',
message: errorMessage
});
}
break;
default:
console.warn('不明なメッセージタイプ:', message);
}
} catch (error) {
console.error('メッセージ処理エラー:', error);
if (this.onErrorCallback) {
this.onErrorCallback({
error: 'MESSAGE_ERROR',
message: 'メッセージの処理に失敗しました'
});
}
}
};
});
} catch (error) {
console.error('接続エラー:', error);
if (this.onErrorCallback) {
this.onErrorCallback({
error: 'CONNECTION_ERROR',
message: '接続処理でエラーが発生しました'
});
}
return false;
}
}
/**
 * Starts a recognition session and wires the microphone to the socket:
 * 1. connect(), 2. send the 's' start command and await its ack,
 * 3. create/resume the AudioContext, 4. getUserMedia, 5. pump PCM frames
 * from a ScriptProcessorNode over the socket.
 * Failures are reported through notifyError rather than rethrown.
 */
private async setupAudioProcessing(): Promise<void> {
try {
const connected = await this.connect();
if (!connected) {
throw new Error('WebSocket接続に失敗しました');
}
// Start-command fields: codec, grammar (connection-engine name), then
// key=value options, space-separated.
// NOTE(review): the '\u0001' appended to the grammar name looks like a
// protocol separator — confirm against the AmiVoice WebSocket spec;
// a wrong separator here could silently degrade recognition.
const params = [
this.config.codec,
'-a-general-input\u0001',
'authorization=' + (this.config.authorization || import.meta.env.VITE_AMIVOICE_AUTH_KEY),
`resultUpdatedInterval=${this.config.resultUpdatedInterval}`,
`extension=${this.config.extension}`,
`keepFillerToken=${this.config.keepFillerToken}`,
'segmenterProperties=' + encodeURIComponent(this.config.segmenterProperties || '')
].join(' ');
const startCommand = `s ${params}`;
console.log('Sending start command:', startCommand);
if (!this.webSocket || this.webSocket.readyState !== WebSocket.OPEN) {
throw new Error('WebSocket接続が確立されていません');
}
// Wait for the server's 's' ack: bare 's' = success, 's <msg>' = error.
// Uses a temporary listener alongside the main onmessage handler.
await new Promise<void>((resolve, reject) => {
const timeout = setTimeout(() => {
reject(new Error('セッション開始がタイムアウトしました'));
}, this.SESSION_TIMEOUT);
const messageHandler = (event: MessageEvent) => {
if (event.data === 's') {
clearTimeout(timeout);
this.webSocket?.removeEventListener('message', messageHandler);
resolve();
} else if (event.data.startsWith('s ')) {
clearTimeout(timeout);
this.webSocket?.removeEventListener('message', messageHandler);
reject(new Error(event.data.substring(2)));
}
};
this.webSocket?.addEventListener('message', messageHandler);
this.webSocket?.send(startCommand);
});
// Create (or revive) the AudioContext at the configured sample rate so
// captured PCM matches the codec declared in the start command.
if (!this.audioContext || this.audioContext.state === 'closed') {
this.audioContext = new AudioContext({
sampleRate: this.config.sampleRate,
latencyHint: 'interactive'
});
} else if (this.audioContext.state === 'suspended') {
await this.audioContext.resume();
}
try {
this.mediaStream = await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
sampleRate: this.config.sampleRate,
echoCancellation: false,
noiseSuppression: false,
autoGainControl: true
}
});
} catch (error) {
// Translate the common getUserMedia failures into user-facing text.
if (error instanceof Error) {
const mediaError = error as MediaError;
if (mediaError.name === 'NotAllowedError') {
throw new Error('マイクの使用が許可されていません');
} else if (mediaError.name === 'NotFoundError') {
throw new Error('マイクが見つかりません');
}
}
throw error;
}
if (!this.audioContext) {
throw new Error('AudioContextが初期化されていません');
}
this.mediaStreamSource = this.audioContext.createMediaStreamSource(this.mediaStream);
this.processor = this.audioContext.createScriptProcessor(this.config.bufferSize, 1, 1);
// Per audio buffer: convert Float32 [-1, 1] samples to clamped 16-bit
// PCM and send as a 'p ' command frame.
this.processor.onaudioprocess = (e) => {
if (this.webSocket?.readyState === WebSocket.OPEN) {
try {
const inputData = e.inputBuffer.getChannelData(0);
const pcmData = new Int16Array(inputData.length);
for (let i = 0; i < inputData.length; i++) {
let s = inputData[i] * 32767.0;
s = Math.max(-32768, Math.min(32767, s));
pcmData[i] = s;
}
// 0x70 0x20 = ASCII "p " — audio-data command prefix.
const commandArray = new Uint8Array([0x70, 0x20]);
const finalArray = new Uint8Array(commandArray.length + pcmData.byteLength);
finalArray.set(commandArray);
finalArray.set(new Uint8Array(pcmData.buffer), commandArray.length);
this.webSocket.send(finalArray);
} catch (error) {
console.error('音声データ送信エラー:', error);
this.notifyError('DATA_SEND_ERROR', '音声データの送信に失敗しました');
}
}
};
this.mediaStreamSource.connect(this.processor);
// Connecting to destination keeps the processor active; the node never
// writes its output buffer, so nothing audible is played back.
this.processor.connect(this.audioContext.destination);
console.log('音声処理を開始しました', {
sampleRate: this.audioContext.sampleRate,
bufferSize: this.processor.bufferSize
});
} catch (error) {
console.error('Audio Setup Error:', error);
this.notifyError('AUDIO_SETUP_ERROR', 'オーディオの設定に失敗しました。');
}
}
/** Logs an error and forwards it to the registered error callback. */
private notifyError(error: string, message: string): void {
console.error(`[${error}] ${message}`);
if (this.onErrorCallback) {
this.onErrorCallback({ error, message });
}
}
/**
 * Tears down the whole session: sends the 'e' end command, waits briefly
 * for the socket to close (forced after 500 ms, abandoned after 1 s),
 * releases the audio graph and mic tracks, closes the AudioContext, and
 * clears recording state and both callbacks.
 */
private async reset(): Promise<void> {
const cleanup = async () => {
if (this.webSocket?.readyState === WebSocket.OPEN) {
try {
// 'e' = end-session command.
this.webSocket.send('e');
await new Promise<void>((resolve) => {
const closeTimeout = setTimeout(() => {
console.warn('WebSocket終了待機がタイムアウトしました');
resolve();
}, 1000);
const closeHandler = () => {
clearTimeout(closeTimeout);
resolve();
};
this.webSocket?.addEventListener('close', closeHandler, { once: true });
// Force the close after 500 ms if the server has not closed first.
setTimeout(() => {
this.webSocket?.close();
this.webSocket?.removeEventListener('close', closeHandler);
resolve();
}, 500);
});
} catch (error) {
console.error('WebSocket終了エラー:', error);
}
}
if (this.webSocket) {
this.webSocket.close();
this.webSocket = null;
}
if (this.processor) {
this.processor.disconnect();
this.processor.onaudioprocess = null;
this.processor = null;
}
if (this.mediaStreamSource) {
this.mediaStreamSource.disconnect();
this.mediaStreamSource = null;
}
if (this.mediaStream) {
// Stop every mic track so the browser's recording indicator clears.
const tracks = this.mediaStream.getTracks();
tracks.forEach(track => {
track.enabled = false;
track.stop();
});
this.mediaStream = null;
}
if (this.audioContext?.state !== 'closed') {
try {
await this.audioContext?.close();
} catch (error) {
console.error('AudioContext終了エラー:', error);
}
this.audioContext = null;
}
};
try {
await cleanup();
} catch (error) {
console.error('リセット中にエラーが発生:', error);
} finally {
// Callbacks are dropped here, so each recording must re-register them
// through startRecording().
this.isRecording = false;
this.recordingStartTime = 0;
this.onTranscriptCallback = null;
this.onErrorCallback = null;
}
}
/**
 * Begins a recording/recognition session. No-op when already recording.
 * @param onTranscript Receives (text, isFinal) for interim/final results.
 * @param onError Optional receiver for all error notifications.
 */
public async startRecording(
onTranscript: (text: string, isFinal: boolean) => void,
onError?: (error: SpeechRecognitionError) => void
): Promise<void> {
if (this.isRecording) {
return;
}
this.onTranscriptCallback = onTranscript;
this.onErrorCallback = onError || null;
try {
await this.setupAudioProcessing();
this.recordingStartTime = Date.now();
this.isRecording = true;
console.log('Recording started:', new Date());
} catch (error) {
console.error('Recording Start Error:', error);
this.notifyError('RECORDING_ERROR', '録音の開始に失敗しました。');
}
}
/** Stops the session and releases all resources. No-op when idle. */
public async stopRecording(): Promise<void> {
if (!this.isRecording) {
return;
}
try {
this.isRecording = false;
await this.reset();
console.log('Recording stopped');
} catch (error) {
console.error('Recording Stop Error:', error);
this.notifyError('RECORDING_ERROR', '録音の停止に失敗しました。');
}
}
/** @returns whether a recording session is currently active. */
public getIsRecording(): boolean {
return this.isRecording;
}
}
/** Shared singleton instance (default configuration) for app-wide use. */
export const amiVoiceSpeechService = new AmiVoiceSpeechService();
例)
def greet
puts "Hello World"
end
自分で試したこと
- 設定パラメータの最適化
// Before
const DEFAULT_CONFIG: SpeechServiceConfig = {
codec: 'LSB16K',
resultUpdatedInterval: 100,
extension: 'advanced',
segmenterProperties: 'threading=true'
};
// After
const DEFAULT_CONFIG: SpeechServiceConfig = {
codec: 'LSB16K',
resultUpdatedInterval: 200, // 認識結果の更新間隔を延長し安定性向上
extension: 'progressive', // より高精度な認識モードに変更
segmenterProperties: 'continuous' // 連続音声認識モードに最適化
};
- 音声入力パラメータの最適化
// Before
this.mediaStream = await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
sampleRate: 16000,
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true
}
});
// After
this.mediaStream = await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
sampleRate: this.config.sampleRate, // 設定から動的に取得
echoCancellation: false, // エコーキャンセルをエンジン側に任せる
noiseSuppression: false, // ノイズ抑制をエンジン側に任せる
autoGainControl: true // 音量の自動調整は維持
}
});
- エラーハンドリングの強化
private readonly CONNECTION_TIMEOUT = 15000; // 接続タイムアウト: 15秒
private readonly SESSION_TIMEOUT = 8000; // セッションタイムアウト: 8秒
private async connect(): Promise<boolean> {
try {
if (this.webSocket?.readyState === WebSocket.OPEN) {
return true;
}
this.webSocket = new WebSocket(import.meta.env.VITE_AMIVOICE_WEBSOCKET_URL);
return new Promise((resolve) => {
const timeout = setTimeout(() => {
if (this.onErrorCallback) {
this.onErrorCallback({
error: 'CONNECTION_ERROR',
message: '接続タイムアウト'
});
}
resolve(false);
}, this.CONNECTION_TIMEOUT);
// WebSocket接続イベントハンドリングを追加
// ...
});
} catch (error) {
// エラーハンドリングの強化
}
}
- 認識結果の処理改善
resultUpdated: (result: string) => {
try {
const jsonResult = JSON.parse(result) as RecognitionResult;
if (jsonResult.results?.length > 0) {
const text = jsonResult.results
.map(r => {
if (r.tokens?.length > 0) {
return r.tokens.map(t => t.written).join('');
}
if (r.text) {
return r.text.replace(/\.+/g, '').trim(); // 不要な句点を除去
}
return '';
})
.join('')
.trim();
if (text && this.onTranscriptCallback) {
this.onTranscriptCallback(text, false); // 中間結果として通知
}
}
} catch (error) {
console.error('Result parse error:', error);
}
}
- 音声処理の最適化
private async setupAudioProcessing(): Promise<void> {
// AudioContextの設定を最適化
this.audioContext = new AudioContext({
sampleRate: this.config.sampleRate
});
// バッファサイズを最適化
this.processor = this.audioContext.createScriptProcessor(
this.config.bufferSize, // 4096に設定
1, // 入力チャンネル数
1 // 出力チャンネル数
);
}