ローカルで音声データを素早く文字起こしする手順

#faster_whisper

Last updated at 2026-03-17 / Posted at 2026-03-17

MacBookの画面録画から音声のみを抽出した後、要約に使うテキストへ文字起こしする方法

m4a
↓
wav変換（16kHz mono）
↓
無音除去
↓
5分ごとに分割
↓
Whisperで文字起こし
↓
1つのテキストに保存

フォルダ構成
project/
├── auto_whisper.py
├── input/
│   ├── a.m4a
│   ├── b.m4a
│   └── c.m4a
└── output/

import os
import subprocess
import glob
from faster_whisper import WhisperModel

# ===== OpenMP workaround =====
# Sets KMP_DUPLICATE_LIB_OK=TRUE for this process (original note: "avoid OpenMP").
# NOTE(review): presumably this suppresses the abort raised when more than one
# OpenMP runtime gets loaded. It runs after faster_whisper has already been
# imported -- confirm that is early enough on the target machine.
os.environ.update(KMP_DUPLICATE_LIB_OK="TRUE")

# ===== Settings =====
INPUT_DIR = "./input"    # directory that is scanned for *.m4a recordings
OUTPUT_DIR = "./output"  # directory that receives every generated file
CHUNK_SEC = 5 * 60       # length of one audio chunk in seconds (5 minutes)

# Create the output directory up front; an existing one is left as is.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ===== Model =====
# Keyword options handed to WhisperModel alongside the "small" model name.
_MODEL_OPTIONS = {
    "device": "cpu",
    "compute_type": "int8",
    "cpu_threads": 8,
    "num_workers": 1,
}
model = WhisperModel("small", **_MODEL_OPTIONS)

# ===== Helpers =====
def _ffmpeg(*args):
    """Run ``ffmpeg -y`` followed by *args*.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status,
            so a failed step is never silently followed by the next one.
    """
    subprocess.run(["ffmpeg", "-y", *args], check=True)


def _numbered_chunks(directory, prefix):
    """Return the ``<prefix><digits>.wav`` files in *directory*, numerically sorted.

    Only names whose part between *prefix* and ``.wav`` consists of ASCII
    digits are returned, so files of another recording whose name merely
    starts with the same prefix are ignored. Glob metacharacters in
    *directory* or *prefix* are matched literally.
    """
    pattern = os.path.join(
        glob.escape(directory), f"{glob.escape(prefix)}[0-9]*.wav"
    )
    numbered = []
    for path in glob.glob(pattern):
        digits = os.path.basename(path)[len(prefix):-len(".wav")]
        if digits.isascii() and digits.isdigit():
            numbered.append((int(digits), path))
    # Numeric order keeps chunk 1000 after chunk 999 (plain string sort would not).
    return [path for _, path in sorted(numbered)]


# ===== Collect the m4a files =====
# Sorted so recordings are processed in the same order on every run.
files = sorted(glob.glob(os.path.join(glob.escape(INPUT_DIR), "*.m4a")))

if not files:
    print(f"入力ファイルが見つかりません: {INPUT_DIR}/*.m4a")

for input_file in files:
    print(f"\n=== 処理開始: {input_file} ===")

    base = os.path.splitext(os.path.basename(input_file))[0]
    wav_file = os.path.join(OUTPUT_DIR, f"{base}.wav")
    cleaned_wav = os.path.join(OUTPUT_DIR, f"{base}_clean.wav")
    chunk_prefix = f"{base}_chunk_"
    chunk_pattern = os.path.join(OUTPUT_DIR, f"{chunk_prefix}%03d.wav")

    try:
        # ===== (1) Convert to 16 kHz mono WAV =====
        _ffmpeg("-i", input_file, "-ar", "16000", "-ac", "1", wav_file)

        # ===== (2) Strip silence =====
        _ffmpeg(
            "-i", wav_file,
            "-af", "silenceremove=stop_periods=-1:stop_duration=1:stop_threshold=-40dB",
            cleaned_wav,
        )

        # ===== (3) Split into CHUNK_SEC-second pieces =====
        # Remove chunks left by an earlier run first: if that run produced more
        # chunks than this one, the extras would otherwise be transcribed too.
        for stale in _numbered_chunks(OUTPUT_DIR, chunk_prefix):
            os.remove(stale)

        _ffmpeg(
            "-i", cleaned_wav,
            "-f", "segment",
            "-segment_time", str(CHUNK_SEC),
            "-c", "copy",
            chunk_pattern,
        )
    except subprocess.CalledProcessError as err:
        # One broken recording must not abort the whole batch, but it must not
        # be transcribed from missing or outdated intermediate files either.
        print(f"スキップ（ffmpeg失敗, 終了コード {err.returncode}）: {input_file}")
        continue

    # ===== (4) Transcribe, shifting timestamps by the chunk position =====
    chunk_files = _numbered_chunks(OUTPUT_DIR, chunk_prefix)
    output_txt = os.path.join(OUTPUT_DIR, f"{base}.txt")

    # Timestamps are relative to the silence-stripped audio, and every chunk is
    # taken to be exactly CHUNK_SEC long when computing its start offset.
    offset = 0

    with open(output_txt, "w", encoding="utf-8") as f:
        for chunk in chunk_files:
            print(f"処理中: {chunk}")

            segments, _ = model.transcribe(
                chunk,
                beam_size=1,
                vad_filter=True,
            )

            for s in segments:
                start = s.start + offset
                end = s.end + offset
                f.write(f"[{start:.2f}-{end:.2f}] {s.text}\n")

            offset += CHUNK_SEC

    print(f"完了: {output_txt}")

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up