dotfiles/.local/bin/video2srt

#!/usr/bin/env bash
#
# video2srt - extract the audio of each given video with ffmpeg and hand it to
# whisper.cpp's whisper-cli to produce an .srt subtitle file.
#
# Every setting below keeps a value already present in the environment and
# only falls back to the default shown when the variable is unset or empty.
set -o errexit -o nounset -o pipefail

# whisper.cpp checkout; the model, binary and VAD model defaults derive from it.
: "${WHISPER_DIR:=$HOME/repos/whisper.cpp}"
: "${WHISPER_MODEL:=$WHISPER_DIR/models/ggml-large-v3.bin}"
: "${WHISPER_BIN:=$WHISPER_DIR/build/bin/whisper-cli}"
: "${VAD_MODEL:=$WHISPER_DIR/models/ggml-silero-v6.2.0.bin}"

# Behaviour switches: "1" enables, anything else disables.
: "${TRANSLATE:=1}"
: "${FORCE:=0}"
: "${USE_VAD:=1}"

# Source language handed to whisper-cli via -l.
: "${SRC_LANG:=auto}"

# Subtitle shaping: characters per entry / per visible line (0 disables).
: "${MAX_LEN:=84}"
: "${LINE_LEN:=42}"

# Explicit output path; only ever set by -o/--output, never from the environment.
OUTPUT=

#######################################
# Print the command-line help.
# Globals:   WHISPER_MODEL, MAX_LEN, LINE_LEN (read) - the heredoc is left
#            unquoted on purpose so their current values show up as the
#            documented defaults.
# Outputs:   help text on stdout (callers redirect to stderr for errors)
#######################################
usage() {
    cat <<EOF
Usage: video2srt [options] <video> [<video>...]

Generates an English .srt next to each video using whisper.cpp.

Options:
  -t, --transcribe   Transcribe in source language (default: translate to English)
  -l, --lang CODE    Force source language (default: auto-detect)
  -m, --model PATH   Path to ggml model (default: $WHISPER_MODEL)
  -o, --output PATH  Output .srt path (single input only; default: <video>.srt)
  -f, --force        Overwrite existing .srt
      --no-vad       Disable Silero VAD pre-filtering (VAD reduces hallucination loops)
      --max-len N    Max characters per SRT entry, 0 to disable (default: $MAX_LEN)
      --line-len N   Max characters per visible line; longer entries wrap to 2 lines, 0 to disable (default: $LINE_LEN)
  -h, --help         Show this help

Env overrides: WHISPER_DIR, WHISPER_MODEL, WHISPER_BIN, VAD_MODEL,
               TRANSLATE, SRC_LANG, FORCE, USE_VAD, MAX_LEN, LINE_LEN
EOF
}

#######################################
# Abort with a usage error unless an option is followed by a value.
# Without this check a trailing "--lang" would read an unset $2 and die under
# `set -u` with an "unbound variable" message instead of a usage error.
# Arguments: $1 - the option as typed (used in the message)
#            $2 - number of positional parameters left, option included
# Outputs:   error message and usage text on stderr when the value is missing
# Returns:   0 when a value is present; otherwise exits the script with 2
#######################################
require_value() {
    if (( $2 < 2 )); then
        echo "Option $1 requires an argument" >&2
        usage >&2
        exit 2
    fi
}

#######################################
# Parse the command line.
# Globals:   TRANSLATE, SRC_LANG, WHISPER_MODEL, OUTPUT, FORCE, USE_VAD,
#            MAX_LEN, LINE_LEN (written when the matching option is given)
#            args (reset, then filled with every non-option word in order)
# Arguments: the script's positional parameters
# Returns:   0 on success; exits 0 after --help, exits 2 on an unknown option
#            or an option that is missing its value
#######################################
parse_args() {
    args=()
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -t|--transcribe) TRANSLATE=0; shift ;;
            -l|--lang) require_value "$1" "$#"; SRC_LANG="$2"; shift 2 ;;
            -m|--model) require_value "$1" "$#"; WHISPER_MODEL="$2"; shift 2 ;;
            -o|--output) require_value "$1" "$#"; OUTPUT="$2"; shift 2 ;;
            -f|--force) FORCE=1; shift ;;
            --no-vad) USE_VAD=0; shift ;;
            --max-len) require_value "$1" "$#"; MAX_LEN="$2"; shift 2 ;;
            --line-len) require_value "$1" "$#"; LINE_LEN="$2"; shift 2 ;;
            -h|--help) usage; exit 0 ;;
            # Everything after "--" is a file name, even if it starts with "-".
            --) shift; args+=("$@"); break ;;
            -*) echo "Unknown option: $1" >&2; usage >&2; exit 2 ;;
            *) args+=("$1"); shift ;;
        esac
    done
}

parse_args "$@"

# Pre-flight checks. Everything that can be rejected cheaply is rejected here,
# before any audio is extracted or transcribed. Usage mistakes exit with 2,
# missing tools or files with 1.

# At least one input is required.
if [[ ${#args[@]} -eq 0 ]]; then
    usage >&2; exit 2
fi

# A single explicit output path cannot serve several inputs.
if [[ -n "$OUTPUT" && ${#args[@]} -gt 1 ]]; then
    echo "--output cannot be combined with multiple input files" >&2; exit 2
fi

# MAX_LEN and LINE_LEN are compared numerically later on; LINE_LEN only after
# the whole transcription has finished. Reject non-integers now so a typo does
# not surface as an arithmetic error at the very end of a long run. A leading
# minus stays accepted because values <= 0 simply disable the feature.
[[ "$MAX_LEN" =~ ^-?[0-9]+$ ]] || { echo "--max-len / MAX_LEN must be an integer, got '$MAX_LEN'" >&2; exit 2; }
[[ "$LINE_LEN" =~ ^-?[0-9]+$ ]] || { echo "--line-len / LINE_LEN must be an integer, got '$LINE_LEN'" >&2; exit 2; }

[[ -x "$WHISPER_BIN" ]] || { echo "whisper-cli not found at $WHISPER_BIN" >&2; exit 1; }
[[ -f "$WHISPER_MODEL" ]] || { echo "model not found at $WHISPER_MODEL" >&2; exit 1; }
command -v ffmpeg >/dev/null || { echo "ffmpeg not installed" >&2; exit 1; }

# Line wrapping is done by a python3 helper, so it is only required when
# wrapping is enabled.
if [[ "$LINE_LEN" -gt 0 ]]; then
    command -v python3 >/dev/null || { echo "python3 not installed (needed for line wrapping; use --line-len 0 to skip)" >&2; exit 1; }
fi

#######################################
# Re-wrap the text of every entry in an SRT file, rewriting the file in place.
# Entries are separated by blank lines; the first two lines of each entry are
# kept verbatim and the remaining lines are joined and wrapped. Text longer
# than the limit is split into two lines at the best-scoring word boundary.
# Arguments: $1 - path to the .srt file
#            $2 - maximum characters per visible line
# Returns:   python3's exit status
#######################################
wrap_srt_lines() {
    python3 - "$1" "$2" <<'PYEOF'
import re, sys, pathlib

path = pathlib.Path(sys.argv[1])
max_line = int(sys.argv[2])

def wrap(text, limit):
    """Return text unchanged if it fits in limit, else split it in two lines.

    Each gap between words is scored; the lowest score wins and ties keep the
    earliest gap. The score is the length difference between the two halves,
    plus 100 per character that a half exceeds limit, minus 8 when the break
    falls right after punctuation.
    """
    # Collapse all runs of whitespace so lengths are measured on clean text.
    text = " ".join(text.split())
    if len(text) <= limit:
        return text
    words = text.split(" ")
    if len(words) < 2:
        # A single over-long word has no gap to break at.
        return text
    best = None
    best_score = None
    cum = 0
    # The last word is excluded: breaking after it would leave line 2 empty.
    for i, w in enumerate(words[:-1]):
        cum += len(w) + (1 if i > 0 else 0)
        # top/bot: lengths of the two lines if we break after word i
        # (the -1 drops the space that becomes the newline).
        top, bot = cum, len(text) - cum - 1
        score = abs(bot - top)
        if top > limit:
            score += (top - limit) * 100
        if bot > limit:
            score += (bot - limit) * 100
        if w.rstrip(",.!?:;") != w:
            score -= 8
        if best_score is None or score < best_score:
            best_score, best = score, i
    return " ".join(words[:best+1]) + "\n" + " ".join(words[best+1:])

blocks = re.split(r"\n\n+", path.read_text(encoding="utf-8").strip())
out = []
for b in blocks:
    lines = b.split("\n")
    if len(lines) < 3:
        # Fewer than three lines means there is no text to wrap.
        out.append(b)
        continue
    head, body = lines[:2], " ".join(lines[2:])
    out.append("\n".join(head + [wrap(body, max_line)]))
path.write_text("\n\n".join(out) + "\n", encoding="utf-8")
PYEOF
}

# Main loop: one subtitle file per input video. Inputs that are not regular
# files, or whose .srt already exists without --force, are skipped; any other
# failure aborts the script through `set -e`.
for video in "${args[@]}"; do
    if [[ ! -f "$video" ]]; then
        echo "skip: $video (not a file)" >&2
        continue
    fi

    # whisper-cli receives the output path without its extension (-of), and
    # the script then expects the result at "$out_stem.srt".
    if [[ -n "$OUTPUT" ]]; then
        out_stem="${OUTPUT%.srt}"
        out_dir=$(dirname -- "$out_stem")
    else
        out_dir=$(dirname -- "$video")
        base=$(basename -- "$video")
        out_stem="$out_dir/${base%.*}"
    fi
    srt="$out_stem.srt"

    if [[ -f "$srt" && "$FORCE" != "1" ]]; then
        echo "skip: $srt exists (use --force to overwrite)"
        continue
    fi

    echo ">> $video"
    # Use a private scratch directory rather than `mktemp --suffix=.wav`:
    # --suffix is a GNU extension and fails on BSD/macOS mktemp, while
    # `mktemp -d` works on both and lets us pick the .wav name ourselves.
    tmpdir=$(mktemp -d)
    trap 'rm -rf -- "$tmpdir"' EXIT
    tmpwav="$tmpdir/audio.wav"

    # Extract 16 kHz mono 16-bit PCM audio. -nostdin keeps ffmpeg from reading
    # the caller's stdin, so the script is safe inside `while read` loops.
    ffmpeg -hide_banner -loglevel error -nostdin -y \
        -i "$video" -vn -ar 16000 -ac 1 -c:a pcm_s16le "$tmpwav"

    mkdir -p -- "$out_dir"
    # NOTE(review): -mc is taken to be whisper-cli's max-context setting;
    # confirm against `whisper-cli --help`.
    whisper_args=(
        -m "$WHISPER_MODEL"
        -f "$tmpwav"
        -of "$out_stem"
        --output-srt
        -l "$SRC_LANG"
        -mc 0
    )
    if [[ "$TRANSLATE" == "1" ]]; then
        whisper_args+=(--translate)
    fi
    if [[ "$MAX_LEN" -gt 0 ]]; then
        whisper_args+=(--max-len "$MAX_LEN" --split-on-word)
    fi
    if [[ "$USE_VAD" == "1" ]]; then
        if [[ -f "$VAD_MODEL" ]]; then
            whisper_args+=(--vad --vad-model "$VAD_MODEL")
        else
            # A missing VAD model only costs quality, so warn and carry on.
            echo "warn: VAD model not found at $VAD_MODEL, running without VAD" >&2
            echo "      download with: sh $WHISPER_DIR/models/download-vad-model.sh silero-v6.2.0" >&2
        fi
    fi

    "$WHISPER_BIN" "${whisper_args[@]}"

    if [[ "$LINE_LEN" -gt 0 ]]; then
        wrap_srt_lines "$srt" "$LINE_LEN"
    fi

    # Clean up now and disarm the trap so it cannot fire for a directory that
    # is already gone; the next iteration arms it again.
    rm -rf -- "$tmpdir"
    trap - EXIT
    echo "<< $srt"
done