# Introduction
I got as far as capturing audio in the browser, sending it to a server over a WebSocket, forwarding the audio from the server to Google Speech-to-Text, and receiving the recognition results back on the server.
### Note
This article is a snapshot of work that is still in the testing stage.
It includes code copied from the referenced articles.
This is my first article, so if you spot any mistakes, please let me know in the comments.
### Note 2
If you want to run speech recognition in Chrome on a host with an SSL certificate, or on localhost, see index2.html at the bottom of this article.
# Registering for the Google Cloud Speech-to-Text API
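The registration steps themselves are not covered here. Assuming you have enabled the Speech-to-Text API in the Google Cloud console, created a service-account key, and pointed the GOOGLE_APPLICATION_CREDENTIALS environment variable at it, the following minimal check should run without raising:

```python
# Minimal credentials check (assumes GOOGLE_APPLICATION_CREDENTIALS is set
# and the Speech-to-Text API is enabled for the project).
from google.cloud import speech

client = speech.SpeechClient()  # raises if credentials are missing or invalid
print("Speech-to-Text client created successfully")
```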
# Front end
index.html
<!DOCTYPE html>
<html lang="ja">
<head></head>
<body>
    <p>Your HTML content goes here</p>
    <input id="button1" type="button" value="open" onclick="myfunc(this)">
    <input id="button2" type="button" value="close" onclick="myfunc(this)">
</body>
<script>
    var connection = null;
    var context = null; // AudioContext, shared so the close button can reach it
    var myfunc = function (button) {
        if (button.value == "open") {
            var handleSuccess = function (stream) {
                context = new AudioContext();
                var input = context.createMediaStreamSource(stream);
                var processor = context.createScriptProcessor(1024, 1, 1);
                // WebSocket connection to the server
                connection = new WebSocket('ws://hogehoge.com:8000/websocket');
                input.connect(processor);
                processor.connect(context.destination);
                processor.onaudioprocess = function (e) {
                    var voice = e.inputBuffer.getChannelData(0);
                    if (connection.readyState === WebSocket.OPEN) {
                        // Downsample to 16 kHz and send over the WebSocket
                        connection.send(downsampleBuffer(voice, context.sampleRate, 16000));
                    }
                };
            };
            navigator.mediaDevices.getUserMedia({ audio: true, video: false })
                .then(handleSuccess);
            // Stop automatically after 3 minutes
            var time1 = function () {
                connection.close();
                context.close();
            };
            setTimeout(time1, 1000 * 60 * 3);
        }
        if (button.value == "close") {
            connection.close();
            context.close();
        }
    };
    // Average-and-decimate the Float32 samples down to outSampleRate
    const downsampleBuffer = (buffer, sampleRate, outSampleRate) => {
        if (outSampleRate > sampleRate) {
            console.error('downsampling rate should be smaller than original sample rate');
        }
        const sampleRateRatio = sampleRate / outSampleRate;
        const newLength = Math.round(buffer.length / sampleRateRatio);
        const result = new Int16Array(newLength);
        let offsetResult = 0;
        let offsetBuffer = 0;
        // Reduce the data rate (sum each group of samples and divide by the count)
        while (offsetResult < result.length) {
            const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
            // Accumulate samples up to the next offset
            let accum = 0;
            let count = 0;
            for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i += 1) {
                accum += buffer[i];
                count += 1;
            }
            // Clamp and scale [-1, 1] floats to 16-bit signed integers (for LINEAR16 streaming)
            result[offsetResult] = Math.max(-1, Math.min(1, accum / count)) * 0x7FFF;
            offsetResult += 1;
            offsetBuffer = nextOffsetBuffer;
        }
        return result.buffer;
    };
</script>
</html>
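For reference, here is a rough Python equivalent of downsampleBuffer() (an illustrative sketch, not part of the running system; it assumes numpy and float samples in [-1, 1] as returned by getChannelData()):

```python
# Sketch of the browser-side averaging-and-scaling step in Python.
import numpy as np

def downsample_to_linear16(buffer, sample_rate, out_rate=16000):
    ratio = sample_rate / out_rate
    n_out = round(len(buffer) / ratio)
    out = np.empty(n_out, dtype=np.int16)
    start = 0
    for i in range(n_out):
        end = min(round((i + 1) * ratio), len(buffer))
        # Average the input samples that fall into this output slot
        mean = buffer[start:end].mean() if end > start else 0.0
        # Clamp and scale to a 16-bit signed integer (LINEAR16)
        out[i] = int(np.clip(mean, -1.0, 1.0) * 0x7FFF)
        start = end
    return out.tobytes()
```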
# Server side
stream.py
import sys
import threading

import tornado.ioloop
import tornado.web
import tornado.websocket
from google.cloud.speech import RecognitionConfig, StreamingRecognitionConfig

from SpeechClientBridge import SpeechClientBridge

RATE = 16000
CHUNK = int(RATE / 10)  # 100 ms

config = RecognitionConfig(
    encoding=RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=RATE,
    language_code="ja-JP",
)
streaming_config = StreamingRecognitionConfig(config=config, interim_results=True)

num_chars_printed = 0  # length of the last interim transcript we printed


def on_transcription_response(response):
    global num_chars_printed
    if not response.results:
        return
    result = response.results[0]
    if not result.alternatives:
        return
    transcript = result.alternatives[0].transcript
    # Pad with spaces to overwrite any leftover characters from a
    # longer previous interim result.
    overwrite_chars = " " * (num_chars_printed - len(transcript))
    if not result.is_final:
        sys.stdout.write(transcript + overwrite_chars + "\r")
        sys.stdout.flush()
        num_chars_printed = len(transcript)
    else:
        print("==>" + transcript + overwrite_chars)
        num_chars_printed = 0


class WebSocketHandler(tornado.websocket.WebSocketHandler):
    def check_origin(self, origin):
        return True

    def open(self):
        print("opened")
        # One bridge (and one background recognition thread) per connection
        self.bridge = SpeechClientBridge(streaming_config, on_transcription_response)
        self.t = threading.Thread(target=self.bridge.start)
        self.t.start()

    def on_message(self, message):
        if message is None:
            self.bridge.add_request(None)
            self.bridge.terminate()
            return
        if isinstance(message, str):
            # Text frames are just logged; the audio arrives as binary frames
            print(message)
        else:
            self.bridge.add_request(message)

    def on_close(self):
        self.bridge.add_request(None)  # sentinel: unblocks the audio generator
        self.bridge.terminate()
        print("closed")


app = tornado.web.Application([
    (r"/websocket", WebSocketHandler),
])

if __name__ == "__main__":
    app.listen(8000)
    tornado.ioloop.IOLoop.instance().start()
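As a quick smoke test without a browser, you can stream audio to the server with a small Tornado client (a sketch: test_16k_mono.wav is a hypothetical 16 kHz mono LINEAR16 file, and stream.py is assumed to be listening on localhost:8000):

```python
# Hypothetical smoke test: stream a WAV file to the WebSocket server.
import wave

import tornado.ioloop
from tornado.websocket import websocket_connect

async def send_wav(path="test_16k_mono.wav"):
    conn = await websocket_connect("ws://localhost:8000/websocket")
    with wave.open(path, "rb") as wav:
        chunk_frames = wav.getframerate() // 10  # ~100 ms of audio per message
        while True:
            data = wav.readframes(chunk_frames)
            if not data:
                break
            await conn.write_message(data, binary=True)
    conn.close()

if __name__ == "__main__":
    tornado.ioloop.IOLoop.current().run_sync(send_wav)
```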
SpeechClientBridge.py
import queue

from google.cloud import speech


class SpeechClientBridge:
    def __init__(self, streaming_config, on_response):
        self._on_response = on_response
        self._queue = queue.Queue()
        self._ended = False
        self.streaming_config = streaming_config

    def start(self):
        client = speech.SpeechClient()
        stream = self.generator()
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in stream
        )
        responses = client.streaming_recognize(self.streaming_config, requests)
        self.process_responses_loop(responses)

    def terminate(self):
        self._ended = True

    def add_request(self, buffer):
        # None is the end-of-stream sentinel and must be queued as-is
        self._queue.put(None if buffer is None else bytes(buffer), block=False)

    def process_responses_loop(self, responses):
        for response in responses:
            self._on_response(response)
            if self._ended:
                break

    def generator(self):
        while not self._ended:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._queue.get()
            if chunk is None:
                return
            data = [chunk]
            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._queue.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break
            yield b"".join(data)
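To see how the queue/generator pattern fits together, here is a minimal standalone use of SpeechClientBridge (a sketch: pcm_chunks stands in for any iterable of LINEAR16 byte chunks, e.g. the 100 ms messages produced by the test client above):

```python
import threading

from SpeechClientBridge import SpeechClientBridge
from stream import streaming_config  # reuse the config defined in stream.py

def run_bridge(pcm_chunks):
    bridge = SpeechClientBridge(streaming_config, print)  # print raw responses
    t = threading.Thread(target=bridge.start)
    t.start()
    for chunk in pcm_chunks:
        bridge.add_request(chunk)  # buffered in the queue, batched by generator()
    bridge.add_request(None)       # sentinel: ends the request generator
    t.join()                       # wait for the response loop to drain
    bridge.terminate()
```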
# Running it
On the server side, run

python stream.py

then open index.html in a browser.
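Note that Chrome only exposes getUserMedia in a secure context (https or localhost). If you just want to try index.html locally, serving it from Python's built-in HTTP server is enough (a sketch; the port 8080 is arbitrary):

```python
# Serve the directory containing index.html at http://localhost:8080.
# localhost counts as a secure context, so getUserMedia will work there.
from http.server import HTTPServer, SimpleHTTPRequestHandler

HTTPServer(("localhost", 8080), SimpleHTTPRequestHandler).serve_forever()
```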
# For https environments
index2.html
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Web Speech API</title>
    <script>
        var flg = 0;
        var flag_speech = 0;
        var text = "";
        var recognition = null;
        function vr_function() {
            if (flg != 0) {
                flg = 0;
                return;
            }
            window.SpeechRecognition = window.SpeechRecognition || webkitSpeechRecognition;
            recognition = new window.SpeechRecognition();
            recognition.lang = 'ja';
            recognition.interimResults = true;
            recognition.continuous = true;
            recognition.onsoundstart = function () {
                document.getElementById('status').innerHTML = "Recognizing";
            };
            recognition.onnomatch = function () {
                document.getElementById('status').innerHTML = "Please try again";
            };
            recognition.onerror = function () {
                document.getElementById('status').innerHTML = "Error";
                if (flag_speech == 0)
                    vr_function();
            };
            recognition.onsoundend = function () {
                document.getElementById('status').innerHTML = "Stopped";
                vr_function();
            };
            recognition.onresult = function (event) {
                var results = event.results;
                for (var i = event.resultIndex; i < results.length; i++) {
                    if (results[i].isFinal) {
                        text = text + results[i][0].transcript + "<br />";
                        document.getElementById('result_text').innerHTML = "[interim] " + results[i][0].transcript;
                        document.getElementById('result').innerHTML = text;
                        vr_function();
                    }
                    else {
                        document.getElementById('result_text').innerHTML = "[interim] " + results[i][0].transcript;
                        flag_speech = 1;
                    }
                }
            };
            flag_speech = 0;
            document.getElementById('status').innerHTML = "start";
            recognition.start();
        }
        function stop() {
            document.getElementById('status').innerHTML = "stop";
            flg = 1;
            if (recognition) recognition.stop();
        }
        function download() {
            stop();
            let blob = new Blob([document.getElementById('result').innerHTML], { type: "text/plain" });
            let link = document.createElement('a');
            link.href = URL.createObjectURL(blob);
            link.download = 'transcript.txt';
            link.click();
        }
    </script>
    <style>
        .border {
            border: 2px solid;
            text-align: left;
            padding: 2px;
            width: 25%;
            font-size: 20px;
        }
    </style>
</head>
<body>
    <div id="result_text" class="border">
        [interim]
    </div>
    <br>
    <div id="status" class="border">
        Status
    </div>
    <br>
    <input type="button" onclick="vr_function()" value="Start recognition">
    <input type="button" onclick="stop()" value="Stop recognition">
    <input type="button" onclick="download()" value="Save results">
    <div id="result" class="border">
        Recognition results
    </div>
</body>
</html>
# References and citations