#!/usr/bin/env python3 """Generate an English .srt next to each video using whisper.cpp.""" import argparse import os import re import shutil import subprocess import sys import tempfile from pathlib import Path WHISPER_DIR = Path( os.environ.get("WHISPER_DIR", Path.home() / "repos/whisper.cpp") ) DEFAULT_MODEL = Path( os.environ.get("WHISPER_MODEL", WHISPER_DIR / "models/ggml-large-v3.bin") ) DEFAULT_BIN = Path( os.environ.get("WHISPER_BIN", WHISPER_DIR / "build/bin/whisper-cli") ) MIN_WORDS_TO_WRAP = 2 SRT_HEADER_LINES = 2 def wrap_line(text: str, limit: int) -> str: text = " ".join(text.split()) if len(text) <= limit: return text words = text.split(" ") if len(words) < MIN_WORDS_TO_WRAP: return text best = 0 best_score = float("inf") cum = 0 for i, w in enumerate(words[:-1]): cum += len(w) + (1 if i > 0 else 0) top, bot = cum, len(text) - cum - 1 score = abs(bot - top) if top > limit: score += (top - limit) * 100 if bot > limit: score += (bot - limit) * 100 if w.rstrip(",.!?:;") != w: score -= 8 if score < best_score: best_score, best = score, i return " ".join(words[: best + 1]) + "\n" + " ".join(words[best + 1 :]) def wrap_srt(path: Path, limit: int) -> None: blocks = re.split(r"\n\n+", path.read_text(encoding="utf-8").strip()) out: list[str] = [] for b in blocks: lines = b.split("\n") if len(lines) <= SRT_HEADER_LINES: out.append(b) continue head, body = lines[:SRT_HEADER_LINES], " ".join(lines[SRT_HEADER_LINES:]) out.append("\n".join([*head, wrap_line(body, limit)])) path.write_text("\n\n".join(out) + "\n", encoding="utf-8") def output_stem(video: Path, override: str | None) -> Path: if override: stem = override[:-4] if override.endswith(".srt") else override return Path(stem) return video.with_suffix("") def process(video: Path, args: argparse.Namespace) -> None: out_stem = output_stem(video, args.output) srt = out_stem.with_suffix(".srt") if srt.exists() and not args.force: print(f"skip: {srt} exists (use --force to overwrite)") return out_stem.parent.mkdir(parents=True, exist_ok=True) print(f">> {video}") with tempfile.TemporaryDirectory() as tmpdir: wav_path = Path(tmpdir) / "audio.wav" subprocess.run( [ "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-i", str(video), "-vn", "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", str(wav_path), ], check=True, ) cmd = [ str(args.bin), "-m", str(args.model), "-f", str(wav_path), "-of", str(out_stem), "--output-srt", "-l", args.lang, "-mc", "0", ] if args.translate: cmd.append("--translate") if args.max_len > 0: cmd += ["--max-len", str(args.max_len), "--split-on-word"] subprocess.run(cmd, check=True) if args.line_len > 0: wrap_srt(srt, args.line_len) print(f"<< {srt}") def main() -> int: p = argparse.ArgumentParser( prog="video2srt", description="Generate an English .srt next to each video using whisper.cpp.", ) p.add_argument("videos", nargs="+", type=Path, metavar="VIDEO") p.add_argument( "-t", "--transcribe", dest="translate", action="store_false", help="Transcribe in source language (default: translate to English)", ) p.add_argument( "-l", "--lang", default=os.environ.get("SRC_LANG", "auto"), help="Force source language (default: auto-detect)", ) p.add_argument( "-m", "--model", type=Path, default=DEFAULT_MODEL, help=f"Path to ggml model (default: {DEFAULT_MODEL})", ) p.add_argument( "-o", "--output", help="Output .srt path (single input only; default: