MacBookの画面録画から音声のみを抽出した後にテキストを要約する方法
m4a
↓
wav変換(16kHz mono)
↓
無音除去
↓
5分ごとに分割
↓
Whisperで文字起こし
↓
1つのテキストに保存
フォルダ構成
project/
├── auto_whisper.py
├── input/
│ ├── a.m4a
│ ├── b.m4a
│ └── c.m4a
└── output/
import os
import subprocess
import glob
from faster_whisper import WhisperModel
# ----- OpenMP workaround -----
# Sets KMP_DUPLICATE_LIB_OK for this process; presumably this suppresses the
# abort raised when more than one OpenMP runtime gets loaded -- confirm it is
# still required in your environment.
# NOTE(review): this assignment runs after `faster_whisper` has already been
# imported; if the abort ever occurs at import time, move it above the import.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# ----- Settings -----
INPUT_DIR = "./input"    # directory scanned for source recordings
OUTPUT_DIR = "./output"  # directory receiving WAVs, chunks and transcripts
CHUNK_SEC = 300          # length of one chunk in seconds (5 minutes)

# Make sure the output directory exists before anything is written to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ----- Model -----
# One shared faster-whisper model instance ("small", int8 on CPU), created
# once at start-up and reused for every transcription.
model = WhisperModel(
    "small",
    device="cpu",
    compute_type="int8",
    cpu_threads=8,
    num_workers=1,
)
# ===== Per-file pipeline: m4a -> wav -> silence removal -> chunks -> text =====


def _run_ffmpeg(*args):
    """Run ``ffmpeg -y`` with *args*.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
        FileNotFoundError: if the ``ffmpeg`` executable cannot be found.
    """
    subprocess.run(["ffmpeg", "-y", *args], check=True)


def _list_chunks(base):
    """Return the chunk WAV paths belonging to *base*, in playback order.

    Only files named ``<base>_chunk_<digits>.wav`` inside OUTPUT_DIR are
    returned. They are ordered by the numeric index so that ordering stays
    correct even if the index grows beyond the three digits of ``%03d``.
    """
    name_prefix = f"{base}_chunk_"
    # glob.escape keeps characters such as "[" or "*" in the file name from
    # being interpreted as wildcard syntax.
    pattern = glob.escape(f"{OUTPUT_DIR}/{name_prefix}") + "*.wav"
    numbered = []
    for path in glob.glob(pattern):
        index = os.path.basename(path)[len(name_prefix):-len(".wav")]
        if index.isdecimal():
            numbered.append((int(index), path))
    return [path for _, path in sorted(numbered)]


# Sorted so the inputs are processed in a deterministic order.
files = sorted(glob.glob(f"{INPUT_DIR}/*.m4a"))
if not files:
    print(f"m4aファイルが見つかりません: {INPUT_DIR}")

for input_file in files:
    print(f"\n=== 処理開始: {input_file} ===")
    base = os.path.splitext(os.path.basename(input_file))[0]
    wav_file = f"{OUTPUT_DIR}/{base}.wav"
    cleaned_wav = f"{OUTPUT_DIR}/{base}_clean.wav"
    chunk_pattern = f"{OUTPUT_DIR}/{base}_chunk_%03d.wav"

    try:
        # (1) Convert to 16 kHz mono WAV.
        _run_ffmpeg(
            "-i", input_file,
            "-ar", "16000",
            "-ac", "1",
            wav_file,
        )

        # (2) Silence removal (filter parameters kept exactly as before).
        _run_ffmpeg(
            "-i", wav_file,
            "-af", "silenceremove=stop_periods=-1:stop_duration=1:stop_threshold=-40dB",
            cleaned_wav,
        )

        # Delete chunks left over from an earlier run; otherwise a shorter
        # recording would still pick up the old trailing chunks below.
        for stale in _list_chunks(base):
            os.remove(stale)

        # (3) Split the cleaned audio into CHUNK_SEC-second pieces.
        _run_ffmpeg(
            "-i", cleaned_wav,
            "-f", "segment",
            "-segment_time", str(CHUNK_SEC),
            "-c", "copy",
            chunk_pattern,
        )
    except subprocess.CalledProcessError as err:
        # A failed ffmpeg step leaves nothing valid to transcribe for this
        # file, so report it and move on to the next input.
        print(f"スキップ: {input_file} ({err})")
        continue

    # (4) Transcribe each chunk and shift its timestamps by the chunk's start.
    # The timestamps refer to the silence-removed audio (cleaned_wav), not to
    # the original recording, because the chunks are cut from cleaned_wav.
    chunk_files = _list_chunks(base)
    output_txt = f"{OUTPUT_DIR}/{base}.txt"
    offset = 0.0
    with open(output_txt, "w", encoding="utf-8") as f:
        for chunk in chunk_files:
            print(f"処理中: {chunk}")
            segments, info = model.transcribe(
                chunk,
                beam_size=1,
                vad_filter=True,
            )
            for s in segments:
                start = s.start + offset
                end = s.end + offset
                f.write(f"[{start:.2f}-{end:.2f}] {s.text}\n")
            # Advance by the real chunk length reported by faster-whisper;
            # fall back to the nominal length if the attribute is unavailable.
            offset += getattr(info, "duration", CHUNK_SEC)
    print(f"完了: {output_txt}")