やりたいこと
- こちらを参考にして、SenseVoice で日本語音声の文字起こし（ASR）を実装する
導入
$ git clone https://github.com/FunAudioLLM/SenseVoice.git
$ cd SenseVoice
$ pip install "torch<=2.3" torchaudio modelscope huggingface huggingface_hub "funasr>=1.1.3" "numpy<=1.26.4" "fastapi>=0.111.1"
$ touch sample.ipynb
実装
sample.ipynb
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
import os
import requests
from IPython.display import Audio
def download_audio(url):
    """Download the sample audio at ``url`` into the current directory, once.

    The local file name is the last ``/``-separated component of ``url``.
    When a file with that name already exists it is reused and no request
    is made.

    Args:
        url: HTTP(S) URL of the audio file.

    Returns:
        The local file name, relative to the current working directory.

    Raises:
        ValueError: If ``url`` ends with ``/`` so no file name can be derived.
        requests.HTTPError: If the server answers with an error status.
    """
    audio_file = url.split("/")[-1]
    if not audio_file:
        raise ValueError(f"cannot derive a file name from URL: {url!r}")
    if not os.path.exists(audio_file):
        # A timeout keeps a stalled server from hanging the notebook forever.
        r = requests.get(url, timeout=60)
        # Fail loudly instead of caching an HTML error page as "audio".
        r.raise_for_status()
        # Write under a temporary name first so an interrupted download never
        # leaves a truncated file that later calls would treat as cached.
        tmp_file = audio_file + ".part"
        with open(tmp_file, "wb") as f:
            f.write(r.content)
        os.replace(tmp_file, audio_file)
    return audio_file
# Model identifier handed to AutoModel below.
model_dir = "iic/SenseVoiceSmall"

# Build the recogniser with the "fsmn-vad" voice-activity detector attached.
model = AutoModel(
    model=model_dir,
    device="cuda:0",
    vad_model="fsmn-vad",
    # NOTE(review): presumably the longest single VAD segment, in
    # milliseconds - confirm against the FunASR documentation.
    vad_kwargs=dict(max_single_segment_time=30000),
    # The relative path means the notebook has to run from the directory that
    # contains model.py (presumably the cloned SenseVoice repository).
    remote_code="./model.py",
    trust_remote_code=True,
)
# Fetch the sample clip; download_audio reuses the file if it already exists.
wav_url = "http://sython.org/Corpus/STUDIES/ITA-Emotion100-Teacher-Angry-001.wav"
wav_path = download_audio(wav_url)
# Alternative input left by the author: f"{model.model_path}/example/{lang}.mp3"
# (print(model.model_path) shows where that directory is).

for lang in ["サンプル音声"]:
    print(f"===== {lang} =====")
    # Inline audio player; `display` is the notebook-provided builtin.
    display(Audio(wav_path))

    for file in [wav_path]:
        print(f"##### {file} #####")
        res = model.generate(
            input=file,
            language="ja",
            use_itn=True,
            batch_size_s=60,
            merge_vad=True,
            merge_length_s=15,
            ban_emo_unk=True,  # keep the Unknown tag out of the output
        )
        # Only the first result entry is used; its "text" field is
        # post-processed before printing.
        text = rich_transcription_postprocess(res[0]["text"])
        print("\nASR結果: ", text)

