feat(video2srt): convert to python and add --vad-max-speech

2026-05-25 18:06:07 +02:00
parent 673c5a234a
commit 7c5b21b680
1 changed files with 213 additions and 142 deletions
@@ -1,137 +1,41 @@
-#!/usr/bin/env bash
-set -euo pipefail
+#!/usr/bin/env python3
+"""Generate an English .srt next to each video using whisper.cpp."""

-WHISPER_DIR="${WHISPER_DIR:-$HOME/repos/whisper.cpp}"
-WHISPER_MODEL="${WHISPER_MODEL:-$WHISPER_DIR/models/ggml-large-v3.bin}"
-WHISPER_BIN="${WHISPER_BIN:-$WHISPER_DIR/build/bin/whisper-cli}"
-VAD_MODEL="${VAD_MODEL:-$WHISPER_DIR/models/ggml-silero-v6.2.0.bin}"
-TRANSLATE="${TRANSLATE:-1}"
-SRC_LANG="${SRC_LANG:-auto}"
-FORCE="${FORCE:-0}"
-USE_VAD="${USE_VAD:-1}"
-MAX_LEN="${MAX_LEN:-84}"
-LINE_LEN="${LINE_LEN:-42}"
-OUTPUT=""
+import argparse
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+from pathlib import Path

-usage() {
-    cat <<EOF
-Usage: video2srt [options] <video> [<video>...]
+# Locations of the whisper.cpp checkout, the ggml model, the CLI binary and
+# the VAD model. Each one can be overridden through the environment.
+# `or` is used instead of a `.get()` default so that an EMPTY variable also
+# falls back to the default, matching the `${VAR:-default}` expansions of the
+# bash script this file replaces; otherwise an empty value would turn into
+# Path("") and silently resolve to the current directory.
+WHISPER_DIR = Path(
+    os.environ.get("WHISPER_DIR") or Path.home() / "repos/whisper.cpp"
+)
+DEFAULT_MODEL = Path(
+    os.environ.get("WHISPER_MODEL") or WHISPER_DIR / "models/ggml-large-v3.bin"
+)
+DEFAULT_BIN = Path(
+    os.environ.get("WHISPER_BIN") or WHISPER_DIR / "build/bin/whisper-cli"
+)
+DEFAULT_VAD = Path(
+    os.environ.get("VAD_MODEL") or WHISPER_DIR / "models/ggml-silero-v6.2.0.bin"
+)

-Generates an English .srt next to each video using whisper.cpp.
+# An entry needs at least this many words before wrap_line can split it
+# across two lines.
+MIN_WORDS_TO_WRAP = 2
+# Number of leading lines in each SRT block that wrap_srt keeps verbatim
+# (presumably the index line and the timestamp line).
+SRT_HEADER_LINES = 2

-Options:
-  -t, --transcribe   Transcribe in source language (default: translate to English)
-  -l, --lang CODE    Force source language (default: auto-detect)
-  -m, --model PATH   Path to ggml model (default: $WHISPER_MODEL)
-  -o, --output PATH  Output .srt path (single input only; default: <video>.srt)
-  -f, --force        Overwrite existing .srt
-      --no-vad       Disable Silero VAD pre-filtering (VAD reduces hallucination loops)
-      --max-len N    Max characters per SRT entry, 0 to disable (default: $MAX_LEN)
-      --line-len N   Max characters per visible line; longer entries wrap to 2 lines, 0 to disable (default: $LINE_LEN)
-  -h, --help         Show this help

-Env overrides: WHISPER_DIR, WHISPER_MODEL, WHISPER_BIN
-EOF
-}
-
-args=()
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        -t|--transcribe) TRANSLATE=0; shift ;;
-        -l|--lang) SRC_LANG="$2"; shift 2 ;;
-        -m|--model) WHISPER_MODEL="$2"; shift 2 ;;
-        -o|--output) OUTPUT="$2"; shift 2 ;;
-        -f|--force) FORCE=1; shift ;;
-        --no-vad) USE_VAD=0; shift ;;
-        --max-len) MAX_LEN="$2"; shift 2 ;;
-        --line-len) LINE_LEN="$2"; shift 2 ;;
-        -h|--help) usage; exit 0 ;;
-        --) shift; args+=("$@"); break ;;
-        -*) echo "Unknown option: $1" >&2; usage >&2; exit 2 ;;
-        *) args+=("$1"); shift ;;
-    esac
-done
-
-if [[ ${#args[@]} -eq 0 ]]; then
-    usage >&2; exit 2
-fi
-
-if [[ -n "$OUTPUT" && ${#args[@]} -gt 1 ]]; then
-    echo "--output cannot be combined with multiple input files" >&2; exit 2
-fi
-
-[[ -x "$WHISPER_BIN" ]] || { echo "whisper-cli not found at $WHISPER_BIN" >&2; exit 1; }
-[[ -f "$WHISPER_MODEL" ]] || { echo "model not found at $WHISPER_MODEL" >&2; exit 1; }
-command -v ffmpeg >/dev/null || { echo "ffmpeg not installed" >&2; exit 1; }
-
-for video in "${args[@]}"; do
-    if [[ ! -f "$video" ]]; then
-        echo "skip: $video (not a file)" >&2
-        continue
-    fi
-
-    if [[ -n "$OUTPUT" ]]; then
-        out_stem="${OUTPUT%.srt}"
-        out_dir=$(dirname -- "$out_stem")
-    else
-        out_dir=$(dirname -- "$video")
-        base=$(basename -- "$video")
-        out_stem="$out_dir/${base%.*}"
-    fi
-    srt="$out_stem.srt"
-
-    if [[ -f "$srt" && "$FORCE" != "1" ]]; then
-        echo "skip: $srt exists (use --force to overwrite)"
-        continue
-    fi
-
-    echo ">> $video"
-    tmpwav=$(mktemp --suffix=.wav)
-    trap 'rm -f "$tmpwav"' EXIT
-
-    ffmpeg -hide_banner -loglevel error -y \
-        -i "$video" -vn -ar 16000 -ac 1 -c:a pcm_s16le "$tmpwav"
-
-    mkdir -p -- "$out_dir"
-    whisper_args=(
-        -m "$WHISPER_MODEL"
-        -f "$tmpwav"
-        -of "$out_stem"
-        --output-srt
-        -l "$SRC_LANG"
-        -mc 0
-    )
-    [[ "$TRANSLATE" == "1" ]] && whisper_args+=(--translate)
-    if [[ "$MAX_LEN" -gt 0 ]]; then
-        whisper_args+=(--max-len "$MAX_LEN" --split-on-word)
-    fi
-    if [[ "$USE_VAD" == "1" ]]; then
-        if [[ -f "$VAD_MODEL" ]]; then
-            whisper_args+=(--vad --vad-model "$VAD_MODEL")
-        else
-            echo "warn: VAD model not found at $VAD_MODEL, running without VAD" >&2
-            echo "      download with: sh $WHISPER_DIR/models/download-vad-model.sh silero-v6.2.0" >&2
-        fi
-    fi
-
-    "$WHISPER_BIN" "${whisper_args[@]}"
-
-    if [[ "$LINE_LEN" -gt 0 ]]; then
-        python3 - "$srt" "$LINE_LEN" <<'PYEOF'
-import re, sys, pathlib
-
-path = pathlib.Path(sys.argv[1])
-max_line = int(sys.argv[2])
-
-def wrap(text, limit):
+def wrap_line(text: str, limit: int) -> str:
    text = " ".join(text.split())
    if len(text) <= limit:
        return text
    words = text.split(" ")
-    if len(words) < 2:
+    if len(words) < MIN_WORDS_TO_WRAP:
        return text
-    best = None
-    best_score = None
+    best = 0
+    best_score = float("inf")
    cum = 0
    for i, w in enumerate(words[:-1]):
        cum += len(w) + (1 if i > 0 else 0)
@@ -143,24 +47,191 @@ def wrap(text, limit):
            score += (bot - limit) * 100
        if w.rstrip(",.!?:;") != w:
            score -= 8
-        if best_score is None or score < best_score:
+        if score < best_score:
            best_score, best = score, i
-    return " ".join(words[:best+1]) + "\n" + " ".join(words[best+1:])
+    return " ".join(words[: best + 1]) + "\n" + " ".join(words[best + 1 :])

-blocks = re.split(r"\n\n+", path.read_text(encoding="utf-8").strip())
-out = []
-for b in blocks:
+
+def wrap_srt(path: Path, limit: int) -> None:
+    """Re-wrap the text of every entry in the SRT file at *path*, in place.
+
+    The file is read as UTF-8 and split into blocks on runs of blank lines.
+    For each block the first ``SRT_HEADER_LINES`` lines are kept as they are,
+    the remaining lines are joined with spaces and passed to ``wrap_line``
+    with *limit*. The result is written back to the same path.
+    """
+    blocks = re.split(r"\n\n+", path.read_text(encoding="utf-8").strip())
+    out: list[str] = []
+    for b in blocks:
        lines = b.split("\n")
-    if len(lines) < 3:
+        # A block with no text beyond the header lines is copied unchanged.
+        if len(lines) <= SRT_HEADER_LINES:
            out.append(b)
            continue
-    head, body = lines[:2], " ".join(lines[2:])
-    out.append("\n".join(head + [wrap(body, max_line)]))
-path.write_text("\n\n".join(out) + "\n", encoding="utf-8")
-PYEOF
-    fi
+        head, body = lines[:SRT_HEADER_LINES], " ".join(lines[SRT_HEADER_LINES:])
+        out.append("\n".join([*head, wrap_line(body, limit)]))
+    # Blocks are separated by one blank line; the file ends with a newline.
+    path.write_text("\n\n".join(out) + "\n", encoding="utf-8")

-    rm -f "$tmpwav"
-    trap - EXIT
-    echo "<< $srt"
-done
+
+def output_stem(video: Path, override: str | None) -> Path:
+    """Return the output path without its ``.srt`` extension.
+
+    With no (or an empty) *override* this is the video path with its final
+    suffix removed. Otherwise it is *override* with one trailing ``.srt``
+    stripped, if present.
+    """
+    if not override:
+        return video.with_suffix("")
+    return Path(override.removesuffix(".srt"))
+
+
+def process(video: Path, args: argparse.Namespace) -> None:
+    """Extract audio from *video*, run whisper-cli on it and wrap the SRT.
+
+    Args:
+        video: Input media file.
+        args: Parsed command-line options; reads ``output``, ``force``,
+            ``bin``, ``model``, ``lang``, ``translate``, ``max_len``, ``vad``,
+            ``vad_model``, ``vad_max_speech`` and ``line_len``.
+
+    Raises:
+        subprocess.CalledProcessError: If ffmpeg or whisper-cli exits with a
+            non-zero status.
+    """
+    out_stem = output_stem(video, args.output)
+    # The stem is handed to whisper-cli via ``-of``, which appends ".srt" to
+    # it. Build the path the same way: Path.with_suffix() would instead
+    # REPLACE the last dotted component ("my.talk" -> "my.srt") and point at a
+    # file whisper never writes.
+    srt = Path(f"{out_stem}.srt")
+
+    if srt.exists() and not args.force:
+        print(f"skip: {srt} exists (use --force to overwrite)")
+        return
+
+    out_stem.parent.mkdir(parents=True, exist_ok=True)
+    print(f">> {video}")
+
+    # The temporary directory (and the WAV inside it) is removed on exit from
+    # the ``with`` block, including when a subprocess fails.
+    with tempfile.TemporaryDirectory() as tmpdir:
+        wav_path = Path(tmpdir) / "audio.wav"
+
+        # Decode the audio track to 16 kHz mono 16-bit PCM.
+        subprocess.run(
+            [
+                "ffmpeg",
+                "-hide_banner",
+                "-loglevel",
+                "error",
+                "-y",
+                "-i",
+                str(video),
+                "-vn",
+                "-ar",
+                "16000",
+                "-ac",
+                "1",
+                "-c:a",
+                "pcm_s16le",
+                str(wav_path),
+            ],
+            check=True,
+        )
+
+        cmd = [
+            str(args.bin),
+            "-m",
+            str(args.model),
+            "-f",
+            str(wav_path),
+            "-of",
+            str(out_stem),
+            "--output-srt",
+            "-l",
+            args.lang,
+            "-mc",
+            "0",
+        ]
+        if args.translate:
+            cmd.append("--translate")
+        if args.max_len > 0:
+            cmd += ["--max-len", str(args.max_len), "--split-on-word"]
+        if args.vad:
+            if args.vad_model.exists():
+                cmd += ["--vad", "--vad-model", str(args.vad_model)]
+                if args.vad_max_speech > 0:
+                    cmd += ["-vmsd", str(args.vad_max_speech)]
+            else:
+                # A missing VAD model is not fatal: warn and run without VAD.
+                print(
+                    f"warn: VAD model not found at {args.vad_model}, running without VAD",
+                    file=sys.stderr,
+                )
+                print(
+                    f"      download with: sh {WHISPER_DIR}/models/download-vad-model.sh silero-v6.2.0",
+                    file=sys.stderr,
+                )
+
+        subprocess.run(cmd, check=True)
+
+        if args.line_len > 0:
+            wrap_srt(srt, args.line_len)
+
+        print(f"<< {srt}")
+
+
+def main() -> int:
+    """Parse the command line, check the toolchain and process each video.
+
+    Exits through argparse when ``--output`` is combined with several inputs,
+    and through ``sys.exit`` with a message when the whisper-cli binary, the
+    model file or ffmpeg is missing. Inputs that are not regular files are
+    reported on stderr and skipped.
+
+    Returns:
+        0 once every input has been handled.
+    """
+    parser = argparse.ArgumentParser(
+        prog="video2srt",
+        description="Generate an English .srt next to each video using whisper.cpp.",
+    )
+    add = parser.add_argument
+
+    add("videos", nargs="+", type=Path, metavar="VIDEO")
+    add(
+        "-t", "--transcribe", dest="translate", action="store_false",
+        help="Transcribe in source language (default: translate to English)",
+    )
+    add(
+        "-l", "--lang", default=os.environ.get("SRC_LANG", "auto"),
+        help="Force source language (default: auto-detect)",
+    )
+    add(
+        "-m", "--model", type=Path, default=DEFAULT_MODEL,
+        help=f"Path to ggml model (default: {DEFAULT_MODEL})",
+    )
+    add(
+        "-o", "--output",
+        help="Output .srt path (single input only; default: <video>.srt)",
+    )
+    add("-f", "--force", action="store_true", help="Overwrite existing .srt")
+    add(
+        "--no-vad", dest="vad", action="store_false",
+        help="Disable Silero VAD pre-filtering (VAD reduces hallucination loops)",
+    )
+    add(
+        "--vad-max-speech", type=float, default=15.0,
+        help="Max seconds of speech per VAD chunk; shorter values give tighter timestamps (default: 15)",
+    )
+    add(
+        "--max-len", type=int, default=84,
+        help="Max characters per SRT entry, 0 to disable (default: 84)",
+    )
+    add(
+        "--line-len", type=int, default=42,
+        help="Max characters per visible line, 0 to disable (default: 42)",
+    )
+    # Hidden options: kept out of --help, defaults come from the environment.
+    add("--bin", type=Path, default=DEFAULT_BIN, help=argparse.SUPPRESS)
+    add("--vad-model", type=Path, default=DEFAULT_VAD, help=argparse.SUPPRESS)
+
+    args = parser.parse_args()
+
+    # Fail early, before any video is touched.
+    if args.output and len(args.videos) > 1:
+        parser.error("--output cannot be combined with multiple input files")
+    if not (args.bin.is_file() and os.access(args.bin, os.X_OK)):
+        sys.exit(f"whisper-cli not found at {args.bin}")
+    if not args.model.is_file():
+        sys.exit(f"model not found at {args.model}")
+    if shutil.which("ffmpeg") is None:
+        sys.exit("ffmpeg not installed")
+
+    for video in args.videos:
+        if video.is_file():
+            process(video, args)
+        else:
+            print(f"skip: {video} (not a file)", file=sys.stderr)
+
+    return 0
+
+
+if __name__ == "__main__":
+    # Use main()'s return value as the process exit status.
+    raise SystemExit(main())