GPT-5 is out. The latest technical information always circulates in English first, so I tried translating it into Japanese.
Here is the code that produces the Japanese translation. All of it was written by GPT-5.
First, download the YouTube video.
# dl_youtube.py
# Usage:
#   python dl_youtube.py "https://www.youtube.com/watch?v=XXXXXXXX" -o downloads --prefix ai_
#   python dl_youtube.py "https://youtu.be/XXXXXXXX" -o out --prefix gpt5_

import argparse
import re
import subprocess
from pathlib import Path
from typing import Tuple

from yt_dlp import YoutubeDL


def sanitize_prefix(p: str) -> str:
    """Keep only safe characters (alphanumerics, -, _); strip path separators and the like."""
    p = re.sub(r"[^\w\-]", "_", p)  # replace disallowed characters with _
    return p


def download_mp4(url: str, out_dir: Path, prefix: str = "") -> Tuple[Path, str]:
    out_dir.mkdir(parents=True, exist_ok=True)
    safe_prefix = sanitize_prefix(prefix) if prefix else ""
    outtmpl = f"{safe_prefix}%(id)s.%(ext)s" if safe_prefix else "%(id)s.%(ext)s"
    ydl_opts = {
        "paths": {"home": str(out_dir)},
        "outtmpl": outtmpl,
        "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
        "merge_output_format": "mp4",
        "quiet": True,
        "noprogress": True,
    }
    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        vid_id = info.get("id")
    # Usually this ends up as an mp4, but probe a few extensions just in case
    for ext in (".mp4", ".mkv", ".webm", ".mov"):
        cand = out_dir / f"{safe_prefix}{vid_id}{ext}"
        if cand.exists():
            return cand.resolve(), vid_id
    # Still not found: fall back to a prefix glob
    matches = list(out_dir.glob(f"{safe_prefix}{vid_id}.*"))
    if matches:
        return matches[0].resolve(), vid_id
    raise FileNotFoundError("Downloaded video file not found.")


def to_wav_16k_mono(video_path: Path, out_dir: Path, base_name: str) -> Path:
    out_dir.mkdir(parents=True, exist_ok=True)
    wav_path = out_dir / f"{base_name}.wav"
    cmd = [
        "ffmpeg", "-y",
        "-i", str(video_path),
        "-vn",
        "-ac", "1",
        "-ar", "16000",
        "-c:a", "pcm_s16le",
        str(wav_path),
    ]
    subprocess.run(cmd, check=True)
    return wav_path.resolve()


def main():
    ap = argparse.ArgumentParser(description="Download YouTube MP4 and convert to WAV (16k mono).")
    ap.add_argument("url", help="YouTube video URL")
    ap.add_argument("-o", "--out", default="downloads", help="Output directory (default: downloads)")
    ap.add_argument("--prefix", default="", help="Prefix for output filenames, e.g., 'ai_' or 'gpt5_'")
    args = ap.parse_args()
    out_dir = Path(args.out)
    prefix = args.prefix or ""
    safe_prefix = sanitize_prefix(prefix)
    print("Downloading MP4...")
    mp4, vid_id = download_mp4(args.url, out_dir, safe_prefix)
    print(f"MP4: {mp4.name}")
    base_name = f"{safe_prefix}{vid_id}"
    print("Converting to WAV (16k/mono)...")
    wav = to_wav_16k_mono(mp4, out_dir, base_name)
    print(f"WAV: {wav.name}")
    print("\nDone. Feed the WAV to your transcriber.")


if __name__ == "__main__":
    main()
Next, convert the audio into text.
# stt.py
# Requirements:
#   pip install openai
#   ffmpeg/ffprobe must be on PATH
#
# Usage examples:
#   python stt.py input.wav --out result.txt
#   python stt.py input.m4a --language en --max-seconds 1200 --out result.txt
#   python stt.py input.mp3 --split-mode fixed --chunk-seconds 900 --out result.txt

import argparse
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import List, Optional

from openai import OpenAI

# ===== Your OpenAI API key (the OPENAI_API_KEY environment variable takes precedence) =====
API_KEY = "YOUR_OPENAI_API_KEY_HERE"

DEFAULT_MODEL = "gpt-4o-mini-transcribe"
DEFAULT_MAX_SECONDS = 1200    # keep chunks within 20 minutes (safety margin below the model limit)
DEFAULT_MIN_SECONDS = 60      # lower bound to avoid overly short chunks
DEFAULT_CHUNK_SECONDS = 1200  # default for split-mode=fixed
DEFAULT_SILENCE_DB = -35.0    # silence threshold (dB)
DEFAULT_SILENCE_DUR = 0.6     # minimum duration (seconds) to count as silence


# ---------- Utils ----------
def run(cmd: List[str]) -> str:
    """Run a command and return combined stdout (text). Raises on error."""
    res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, check=True)
    return res.stdout


def ffprobe_duration_sec(path: Path) -> Optional[float]:
    try:
        out = run([
            "ffprobe", "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=nw=1:nk=1",
            str(path)
        ])
        return float(out.strip())
    except Exception:
        return None


def preprocess_to_flac_16k_mono(src: Path, dst: Path) -> Path:
    """
    Convert the input to FLAC (16 kHz/mono). All later splitting and transcription use this FLAC.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    cmd = [
        "ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
        "-i", str(src),
        "-ac", "1", "-ar", "16000",
        "-c:a", "flac", "-compression_level", "5",
        str(dst),
    ]
    subprocess.run(cmd, check=True)
    return dst


# ---------- Silence detection ----------
_sil_start_pat = re.compile(r"silence_start:\s*([0-9]+(?:\.[0-9]+)?)")
_sil_end_pat = re.compile(r"silence_end:\s*([0-9]+(?:\.[0-9]+)?)")


def detect_silences(audio: Path, noise_db: float, min_dur: float) -> List[float]:
    """
    Detect silence start/end with silencedetect and return candidate cut points
    (the timestamps where a silence *ends*).
    """
    out = run([
        "ffmpeg", "-hide_banner", "-loglevel", "info",
        "-i", str(audio),
        "-af", f"silencedetect=noise={noise_db}dB:d={min_dur}",
        "-f", "null", "-"
    ])
    # silencedetect often logs to stderr, but run() merges stderr into stdout
    ends = [float(m.group(1)) for m in _sil_end_pat.finditer(out)]
    return sorted(set(ends))


def choose_cut_times_by_silence(
    duration: float,
    silence_points: List[float],
    max_seconds: int,
    min_seconds: int,
) -> List[float]:
    """
    Partition [0, duration] into pieces no longer than max_seconds.
    Prefer a silence point at or before each target boundary; failing that,
    accept one slightly past it. If none exist at all, return an empty list
    (= fall back to fixed-length splitting).
    Returns the segment times (the leading 0 is not included).
    """
    if duration <= max_seconds:
        return []
    cut_times: List[float] = []
    cur = 0.0
    tol_over = min(120.0, max_seconds * 0.2)  # allowed overshoot past the target (at most +120 s)
    while cur + max_seconds < duration:
        target = cur + max_seconds
        candidates_le = [t for t in silence_points if (t - cur) >= min_seconds and t <= target]
        pick = max(candidates_le) if candidates_le else None
        if pick is None:
            candidates_ge = [t for t in silence_points if t > target and (t - target) <= tol_over]
            if candidates_ge:
                pick = min(candidates_ge)
        if pick is None:
            pick = target  # no usable silence found -> final fallback
        cut_times.append(round(pick, 3))
        if len(cut_times) > 1 and abs(cut_times[-1] - cut_times[-2]) < 1e-3:
            break  # guard against an infinite loop
        cur = pick
    cut_times = [t for t in cut_times if 0.0 < t < duration]
    return sorted(set(cut_times))


# ---------- Segmentation (re-encode with -ss/-t to guarantee length) ----------
def _cut_flac_ss_t(src_flac: Path, start: float, end: float, out_path: Path):
    """Cut [start, end) precisely and save as FLAC (lossless)."""
    dur = max(0.0, end - start)
    if dur <= 0.01:
        return
    out_path.parent.mkdir(parents=True, exist_ok=True)
    cmd = [
        "ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
        "-ss", f"{start:.3f}",
        "-i", str(src_flac),
        "-t", f"{dur:.3f}",
        "-c:a", "flac", "-compression_level", "5",
        str(out_path),
    ]
    subprocess.run(cmd, check=True)


def segment_by_times_flac(src_flac: Path, times: List[float], out_dir: Path, total_duration: float) -> List[Path]:
    """
    Use the silence-snapped cut times to produce [0, t1], [t1, t2], ..., [tn, duration].
    """
    cut_points = [0.0] + sorted([t for t in times if 0.0 < t < total_duration]) + [total_duration]
    paths: List[Path] = []
    for i in range(len(cut_points) - 1):
        start, end = cut_points[i], cut_points[i + 1]
        out_path = out_dir / f"chunk_{i:04d}.flac"
        _cut_flac_ss_t(src_flac, start, end, out_path)
        paths.append(out_path)
    if not paths:
        raise RuntimeError("Failed to create chunks from times.")
    return paths


def segment_fixed_flac_encode(src_flac: Path, segment_seconds: int, out_dir: Path, total_duration: float) -> List[Path]:
    """
    Fixed-length splitting: [0, seg], [seg, 2*seg], ... (each chunk written as FLAC).
    """
    paths: List[Path] = []
    start = 0.0
    idx = 0
    while start < total_duration - 0.01:
        end = min(total_duration, start + segment_seconds)
        out_path = out_dir / f"chunk_{idx:04d}.flac"
        _cut_flac_ss_t(src_flac, start, end, out_path)
        paths.append(out_path)
        idx += 1
        start = end
    if not paths:
        raise RuntimeError("Failed to create fixed chunks.")
    return paths


# ---------- Transcription ----------
def transcribe_one(file_path: Path, model: str, language: Optional[str], prompt: Optional[str]) -> str:
    client = OpenAI(api_key=API_KEY)
    with open(file_path, "rb") as f:
        resp = client.audio.transcriptions.create(
            model=model,
            file=f,
            language=language or None,
            prompt=prompt or None,
        )
    return getattr(resp, "text", None) or (resp.get("text", "") if isinstance(resp, dict) else "")


# ---------- Main ----------
def main():
    ap = argparse.ArgumentParser(description="Transcribe audio with FLAC preprocessing and silence-aware splitting.")
    ap.add_argument("audio", help="Input audio path (wav/m4a/mp3/flac etc.)")
    ap.add_argument("--model", default=DEFAULT_MODEL, help=f"Transcription model (default: {DEFAULT_MODEL})")
    ap.add_argument("--language", default=None, help="ISO 639-1 (en/ja/es/...) or omit for auto-detect")
    ap.add_argument("--prompt", default=None, help="Terminology hint (optional)")
    ap.add_argument("--out", default=None, help="Output text file (stdout if omitted)")
    # Split mode and its parameters
    ap.add_argument("--split-mode", choices=["silence", "fixed"], default="silence",
                    help="silence: silence-based splitting (default), fixed: fixed-length splitting")
    ap.add_argument("--max-seconds", type=int, default=DEFAULT_MAX_SECONDS,
                    help=f"Maximum chunk seconds (target upper bound in silence mode, default {DEFAULT_MAX_SECONDS})")
    ap.add_argument("--min-seconds", type=int, default=DEFAULT_MIN_SECONDS,
                    help=f"Minimum chunk seconds (lower bound in silence mode, default {DEFAULT_MIN_SECONDS})")
    ap.add_argument("--chunk-seconds", type=int, default=DEFAULT_CHUNK_SECONDS,
                    help=f"Chunk length for split-mode=fixed (default {DEFAULT_CHUNK_SECONDS})")
    ap.add_argument("--silence-threshold", type=float, default=DEFAULT_SILENCE_DB,
                    help=f"silencedetect noise threshold in dB (default {DEFAULT_SILENCE_DB})")
    ap.add_argument("--silence-duration", type=float, default=DEFAULT_SILENCE_DUR,
                    help=f"Minimum silence duration in seconds for silencedetect (default {DEFAULT_SILENCE_DUR})")
    args = ap.parse_args()

    src = Path(args.audio)
    if not src.exists():
        print(f"File not found: {src}", file=sys.stderr)
        sys.exit(1)

    # Prefer OPENAI_API_KEY if it is set
    if os.getenv("OPENAI_API_KEY"):
        global API_KEY
        API_KEY = os.getenv("OPENAI_API_KEY")  # type: ignore[assignment]

    try:
        with tempfile.TemporaryDirectory() as td:
            tdir = Path(td)
            # 1) Preprocess: convert to FLAC (16k/mono) -- lighter to upload, no quality loss
            pre_flac = preprocess_to_flac_16k_mono(src, tdir / "preprocessed.flac")
            duration = ffprobe_duration_sec(pre_flac) or 0.0
            # 2) Split
            if args.split_mode == "silence":
                sil_points = detect_silences(pre_flac, args.silence_threshold, args.silence_duration)
                cut_times = choose_cut_times_by_silence(duration, sil_points, args.max_seconds, args.min_seconds)
                if not cut_times:
                    # Little silence, or everything fits in one piece -> fall back to fixed-length
                    seg_len = min(args.chunk_seconds, args.max_seconds)
                    chunk_paths = segment_fixed_flac_encode(pre_flac, seg_len, tdir, duration)
                else:
                    chunk_paths = segment_by_times_flac(pre_flac, cut_times, tdir, duration)
            else:
                # Fixed-length splitting
                chunk_paths = segment_fixed_flac_encode(pre_flac, args.chunk_seconds, tdir, duration)
            # 3) Transcribe chunks sequentially
            texts: List[str] = []
            for i, cp in enumerate(chunk_paths, 1):
                print(f"[{i}/{len(chunk_paths)}] Transcribing {cp.name} ...", file=sys.stderr)
                txt = transcribe_one(cp, args.model, args.language, args.prompt).strip()
                texts.append(txt)
            # 4) Output
            joined = "\n\n".join(t for t in texts if t)
            if args.out:
                Path(args.out).parent.mkdir(parents=True, exist_ok=True)
                with open(args.out, "w", encoding="utf-8") as wf:
                    wf.write(joined)
                print(f"Saved: {args.out}")
            else:
                print(joined)
            # Diagnostics
            print(f"[Info] duration={duration:.1f}s chunks={len(chunk_paths)} mode={args.split_mode}", file=sys.stderr)
    except subprocess.CalledProcessError as e:
        print(f"ffmpeg error: {e}", file=sys.stderr)
        sys.exit(2)
    except Exception as e:
        print(f"Transcription failed: {e}", file=sys.stderr)
        sys.exit(3)


if __name__ == "__main__":
    main()
Next, translate the English text into Japanese.
# translate_to_ja.py
# Requirements:
#   pip install openai
#
# Usage:
#   python translate_to_ja.py result.txt
#   python translate_to_ja.py in.txt --out out_ja.txt --model gpt-4o-mini --chunk-chars 12000

import argparse
import os
import time
from pathlib import Path
from typing import List

from openai import OpenAI

# ===== Your OpenAI API key (the environment variable takes precedence) =====
API_KEY = "YOUR_OPENAI_API_KEY_HERE"

DEFAULT_MODEL = "gpt-4o-mini"  # cost-effective; gpt-4o also works if you prioritize quality
DEFAULT_CHUNK_CHARS = 12000    # split by character count (with a safety margin)

SYSTEM_PROMPT = (
    "You are a professional Japanese translator. "
    "Translate the user's text to natural, clear Japanese. "
    "Preserve meaning, formatting, lists, code blocks and line breaks. "
    "Do not summarize or omit content."
)


def split_into_chunks(text: str, chunk_chars: int) -> List[str]:
    """Split text into ~chunk_chars pieces, preferring paragraph/newline boundaries."""
    chunks = []
    n = len(text)
    i = 0
    while i < n:
        end = min(n, i + chunk_chars)
        # Prefer a paragraph boundary (blank line)
        cut = text.rfind("\n\n", i, end)
        if cut == -1 or cut <= i + int(chunk_chars * 0.4):
            # Next, try a single newline
            c2 = text.rfind("\n", i, end)
            cut = c2 if c2 != -1 and c2 > i else end
        chunks.append(text[i:cut])
        i = cut
    # Keep whitespace-only chunks here (the API call is skipped for them later)
    return chunks


def translate_chunk(client: OpenAI, model: str, chunk: str, temperature: float = 0.1) -> str:
    resp = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": chunk},
        ],
    )
    return resp.choices[0].message.content.strip()


def main():
    ap = argparse.ArgumentParser(description="Translate a text file to Japanese using OpenAI.")
    ap.add_argument("input", help="Input text file path (e.g., result.txt)")
    ap.add_argument("--out", default=None, help="Output file path (default: <input>_ja.txt)")
    ap.add_argument("--model", default=DEFAULT_MODEL, help=f"Model to use (default: {DEFAULT_MODEL})")
    ap.add_argument("--chunk-chars", type=int, default=DEFAULT_CHUNK_CHARS, help="Max characters per chunk")
    ap.add_argument("--sleep", type=float, default=0.0, help="Seconds to sleep between requests")
    args = ap.parse_args()

    in_path = Path(args.input)
    if not in_path.exists():
        raise FileNotFoundError(f"Input not found: {in_path}")
    out_path = Path(args.out) if args.out else in_path.with_name(in_path.stem + "_ja.txt")

    # API key: the environment variable takes precedence
    api_key = os.getenv("OPENAI_API_KEY") or API_KEY
    client = OpenAI(api_key=api_key) if api_key and api_key != "YOUR_OPENAI_API_KEY_HERE" else None

    text = in_path.read_text(encoding="utf-8", errors="ignore")
    # If the whole file is empty/whitespace-only, just write it back empty
    if text.strip() == "":
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text("", encoding="utf-8")
        print(f"Saved: {out_path}")
        return

    chunks = split_into_chunks(text, args.chunk_chars)
    outputs: List[str] = []
    for idx, ch in enumerate(chunks, 1):
        # Whitespace-only chunk: append an empty string without calling the API
        if ch.strip() == "":
            outputs.append("")
            continue
        if client is None:
            raise RuntimeError("No API key configured. Set OPENAI_API_KEY or fill in API_KEY in this script.")
        print(f"[{idx}/{len(chunks)}] Translating... ({len(ch)} chars)")
        try:
            ja = translate_chunk(client, args.model, ch)
        except Exception as e:
            raise RuntimeError(f"Translation failed at chunk {idx}: {e}") from e
        outputs.append(ja)
        if args.sleep > 0 and idx < len(chunks):
            time.sleep(args.sleep)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Join chunks with blank lines (keeps the original paragraph feel)
    out_path.write_text("\n\n".join(outputs), encoding="utf-8")
    print(f"Saved: {out_path}")


if __name__ == "__main__":
    main()
Next, convert the Japanese text into speech.
# ja_text_to_speech.py
# Requirements:
#   pip install openai
#   ffmpeg must be on PATH
#
# Usage:
#   python ja_text_to_speech.py out_ja.txt --out voice.mp3 --voice alloy --model gpt-4o-mini-tts
#   python ja_text_to_speech.py out_ja.txt --format wav --chunk-chars 3000

import argparse
import os
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import List

from openai import OpenAI
from openai import BadRequestError

# ===== Your OpenAI API key (the environment variable takes precedence) =====
API_KEY = "YOUR_OPENAI_API_KEY_HERE"

DEFAULT_MODEL = "gpt-4o-mini-tts"
DEFAULT_VOICE = "alloy"
DEFAULT_FORMAT = "mp3"
# Initial per-chunk character limit (Japanese tokenization varies, so leave headroom)
DEFAULT_CHUNK_CHARS = 2000


# ---------- helpers ----------
def run(cmd: List[str]) -> None:
    subprocess.run(cmd, check=True)


def split_into_chunks(text: str, chunk_chars: int) -> List[str]:
    """
    Split into ~chunk_chars pieces at Japanese sentence enders (。!?) and newlines.
    """
    text = text.replace("\r\n", "\n")
    seps = "。!?\n"
    chunks, cur = [], []
    cur_len = 0

    def flush():
        nonlocal cur, cur_len
        s = "".join(cur).strip()
        chunks.append(s)  # whitespace-only chunks are skipped downstream
        cur, cur_len = [], 0

    for ch in text:
        cur.append(ch)
        cur_len += 1
        if ch in seps and cur_len >= chunk_chars:
            flush()
        elif cur_len >= int(chunk_chars * 1.2):  # force a cut if no sentence ender shows up
            flush()
    flush()
    # Keep leading/trailing empty chunks; they are ignored downstream (preserves paragraph positions)
    return [c for c in chunks]


def synth_chunk(client: OpenAI, model: str, voice: str, fmt: str, text: str, out_path: Path) -> None:
    """
    Synthesize one chunk and save it to out_path.
    """
    if fmt not in ("mp3", "wav", "opus"):
        raise ValueError("format must be mp3|wav|opus")
    with client.audio.speech.with_streaming_response.create(
        model=model,
        voice=voice,
        input=text,
        response_format=fmt,  # important: the parameter is 'response_format', not 'format'
    ) as resp:
        resp.stream_to_file(out_path)


def is_token_limit_error(e: Exception) -> bool:
    msg = str(e)
    return ("maximum input limit" in msg) or ("over the maximum input limit" in msg)


def concat_with_ffmpeg(parts: List[Path], out_path: Path, fmt: str) -> None:
    """
    Concatenate the generated chunk audio with ffmpeg
    (re-encode to avoid parameter mismatches between parts).
    """
    if not parts:
        # No input -> emit 0.3 seconds of silence
        if fmt == "mp3":
            run(["ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
                 "-f", "lavfi", "-i", "anullsrc=r=24000:cl=mono",
                 "-t", "0.3", "-c:a", "libmp3lame", "-b:a", "128k", str(out_path)])
        elif fmt == "wav":
            run(["ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
                 "-f", "lavfi", "-i", "anullsrc=r=16000:cl=mono",
                 "-t", "0.3", "-c:a", "pcm_s16le", str(out_path)])
        else:  # opus
            run(["ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
                 "-f", "lavfi", "-i", "anullsrc=r=48000:cl=mono",
                 "-t", "0.3", "-c:a", "libopus", "-b:a", "96k", str(out_path)])
        return
    with tempfile.TemporaryDirectory() as td:
        list_path = Path(td) / "list.txt"
        list_path.write_text("\n".join(f"file '{p.as_posix()}'" for p in parts), encoding="utf-8")
        if fmt == "mp3":
            codec = ["-c:a", "libmp3lame", "-b:a", "192k"]
        elif fmt == "wav":
            codec = ["-c:a", "pcm_s16le"]
        else:  # opus
            codec = ["-c:a", "libopus", "-b:a", "96k"]
        run([
            "ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
            "-f", "concat", "-safe", "0", "-i", str(list_path),
            *codec, str(out_path)
        ])


def synth_with_token_backoff(
    client: OpenAI,
    model: str,
    voice: str,
    fmt: str,
    text: str,
    tdir: Path,
    parts: List[Path],
    base_label: str,
    max_chars: int,
    depth: int = 0,
) -> None:
    """
    On a token-limit error, shrink the chunk size and retry (recursively).
    """
    if text.strip() == "":
        return
    # Output file name (sequential index across the whole run)
    idx = len(parts) + 1
    out_path = tdir / f"{base_label}_{idx:04d}.{fmt}"
    try:
        print(f" {' ' * depth}TTS try (len={len(text)} chars) -> {out_path.name}")
        synth_chunk(client, model, voice, fmt, text, out_path)
        parts.append(out_path)
        return
    except BadRequestError as e:
        if not is_token_limit_error(e):
            raise  # other errors propagate as-is
        # Over the token limit -> split finer and recurse
        smaller = max(300, int(max_chars * 0.6))
        if smaller >= max_chars:
            smaller = max_chars - 200
        if smaller < 300:
            # cannot split any finer and it still fails -> give up with the exception
            raise
        sub_chunks = [c for c in split_into_chunks(text, smaller) if c.strip() != ""]
        if not sub_chunks:
            # one huge sentence that refuses to split -> slice it by force
            step = max(300, int(max_chars * 0.5))
            sub_chunks = [text[i:i + step] for i in range(0, len(text), step)]
        for sub in sub_chunks:
            synth_with_token_backoff(
                client, model, voice, fmt, sub, tdir, parts, base_label, smaller, depth + 1
            )


# ---------- main ----------
def main():
    ap = argparse.ArgumentParser(description="Japanese TTS: Convert a text file to speech using OpenAI gpt-4o-mini-tts.")
    ap.add_argument("input", help="Input text file path (e.g., out_ja.txt)")
    ap.add_argument("--out", default=None, help="Output audio path (default: <input>.mp3)")
    ap.add_argument("--model", default=DEFAULT_MODEL, help=f"TTS model (default: {DEFAULT_MODEL})")
    ap.add_argument("--voice", default=DEFAULT_VOICE, help=f"Voice name (default: {DEFAULT_VOICE})")
    ap.add_argument("--format", default=DEFAULT_FORMAT, choices=["mp3", "wav", "opus"], help="Audio format")
    ap.add_argument("--chunk-chars", type=int, default=DEFAULT_CHUNK_CHARS, help="Initial max characters per chunk")
    args = ap.parse_args()

    in_path = Path(args.input)
    if not in_path.exists():
        print(f"Input not found: {in_path}", file=sys.stderr)
        sys.exit(1)
    out_path = Path(args.out) if args.out else in_path.with_suffix(f".{args.format}")

    # API key
    api_key = os.getenv("OPENAI_API_KEY") or API_KEY
    if not api_key or api_key == "YOUR_OPENAI_API_KEY_HERE":
        raise RuntimeError("No API key configured. Set OPENAI_API_KEY or fill in API_KEY in this script.")
    client = OpenAI(api_key=api_key)

    text = in_path.read_text(encoding="utf-8", errors="ignore")
    # Empty input -> write a silent file and exit
    if text.strip() == "":
        concat_with_ffmpeg([], out_path, args.format)
        print(f"Saved (silence): {out_path}")
        return

    # First split roughly on paragraph/sentence boundaries
    top_chunks = [c for c in split_into_chunks(text, args.chunk_chars) if c is not None]
    parts: List[Path] = []
    with tempfile.TemporaryDirectory() as td:
        tdir = Path(td)
        total = len(top_chunks)
        for i, ch in enumerate(top_chunks, 1):
            if ch.strip() == "":
                continue
            print(f"[{i}/{total}] TTS generating... ({len(ch)} chars)")
            try:
                synth_with_token_backoff(
                    client, args.model, args.voice, args.format,
                    ch, tdir, parts, base_label="part", max_chars=args.chunk_chars
                )
            except Exception as e:
                print(f" -> Failed to synthesize chunk {i}: {e}", file=sys.stderr)
                raise
        concat_with_ffmpeg(parts, out_path, args.format)
    print(f"Saved: {out_path}")


if __name__ == "__main__":
    main()
Finally, turn the audio into a video.
ffmpeg -loop 1 -i screen.png -i out/gpt5_0Uu_VJeVVfo_ja.mp3 … out/gpt5_0Uu_VJeVVfo_ja.mp4
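The "…" above elides the exact encoding options. For reference, here is a minimal sketch of a typical invocation for muxing a still image with an audio track; the libx264/aac settings, -tune stillimage, -pix_fmt yuv420p, and -shortest are my assumptions, not necessarily the options used for the actual video:

ffmpeg -loop 1 -i screen.png -i out/gpt5_0Uu_VJeVVfo_ja.mp3 -c:v libx264 -tune stillimage -pix_fmt yuv420p -c:a aac -b:a 192k -shortest out/gpt5_0Uu_VJeVVfo_ja.mp4

-loop 1 repeats the single image as the video stream, and -shortest ends the output when the audio runs out. Putting the whole pipeline together, one possible end-to-end run looks like this (XXXXXXXX is a placeholder video ID; the options follow each script's argparse defaults):

python dl_youtube.py "https://youtu.be/XXXXXXXX" -o out --prefix gpt5_
python stt.py out/gpt5_XXXXXXXX.wav --out out/result.txt
python translate_to_ja.py out/result.txt --out out/result_ja.txt
python ja_text_to_speech.py out/result_ja.txt --out out/result_ja.mp3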