feat(video2srt): convert to python and add --vad-max-speech
This commit is contained in:
+210
-139
@@ -1,137 +1,41 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
#!/usr/bin/env python3
|
||||
"""Generate an English .srt next to each video using whisper.cpp."""
|
||||
|
||||
WHISPER_DIR="${WHISPER_DIR:-$HOME/repos/whisper.cpp}"
|
||||
WHISPER_MODEL="${WHISPER_MODEL:-$WHISPER_DIR/models/ggml-large-v3.bin}"
|
||||
WHISPER_BIN="${WHISPER_BIN:-$WHISPER_DIR/build/bin/whisper-cli}"
|
||||
VAD_MODEL="${VAD_MODEL:-$WHISPER_DIR/models/ggml-silero-v6.2.0.bin}"
|
||||
TRANSLATE="${TRANSLATE:-1}"
|
||||
SRC_LANG="${SRC_LANG:-auto}"
|
||||
FORCE="${FORCE:-0}"
|
||||
USE_VAD="${USE_VAD:-1}"
|
||||
MAX_LEN="${MAX_LEN:-84}"
|
||||
LINE_LEN="${LINE_LEN:-42}"
|
||||
OUTPUT=""
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: video2srt [options] <video> [<video>...]
|
||||
WHISPER_DIR = Path(
|
||||
os.environ.get("WHISPER_DIR", Path.home() / "repos/whisper.cpp")
|
||||
)
|
||||
DEFAULT_MODEL = Path(
|
||||
os.environ.get("WHISPER_MODEL", WHISPER_DIR / "models/ggml-large-v3.bin")
|
||||
)
|
||||
DEFAULT_BIN = Path(
|
||||
os.environ.get("WHISPER_BIN", WHISPER_DIR / "build/bin/whisper-cli")
|
||||
)
|
||||
DEFAULT_VAD = Path(
|
||||
os.environ.get("VAD_MODEL", WHISPER_DIR / "models/ggml-silero-v6.2.0.bin")
|
||||
)
|
||||
|
||||
Generates an English .srt next to each video using whisper.cpp.
|
||||
MIN_WORDS_TO_WRAP = 2
|
||||
SRT_HEADER_LINES = 2
|
||||
|
||||
Options:
|
||||
-t, --transcribe Transcribe in source language (default: translate to English)
|
||||
-l, --lang CODE Force source language (default: auto-detect)
|
||||
-m, --model PATH Path to ggml model (default: $WHISPER_MODEL)
|
||||
-o, --output PATH Output .srt path (single input only; default: <video>.srt)
|
||||
-f, --force Overwrite existing .srt
|
||||
--no-vad Disable Silero VAD pre-filtering (VAD reduces hallucination loops)
|
||||
--max-len N Max characters per SRT entry, 0 to disable (default: $MAX_LEN)
|
||||
--line-len N Max characters per visible line; longer entries wrap to 2 lines, 0 to disable (default: $LINE_LEN)
|
||||
-h, --help Show this help
|
||||
|
||||
Env overrides: WHISPER_DIR, WHISPER_MODEL, WHISPER_BIN
|
||||
EOF
|
||||
}
|
||||
|
||||
args=()
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
-t|--transcribe) TRANSLATE=0; shift ;;
|
||||
-l|--lang) SRC_LANG="$2"; shift 2 ;;
|
||||
-m|--model) WHISPER_MODEL="$2"; shift 2 ;;
|
||||
-o|--output) OUTPUT="$2"; shift 2 ;;
|
||||
-f|--force) FORCE=1; shift ;;
|
||||
--no-vad) USE_VAD=0; shift ;;
|
||||
--max-len) MAX_LEN="$2"; shift 2 ;;
|
||||
--line-len) LINE_LEN="$2"; shift 2 ;;
|
||||
-h|--help) usage; exit 0 ;;
|
||||
--) shift; args+=("$@"); break ;;
|
||||
-*) echo "Unknown option: $1" >&2; usage >&2; exit 2 ;;
|
||||
*) args+=("$1"); shift ;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [[ ${#args[@]} -eq 0 ]]; then
|
||||
usage >&2; exit 2
|
||||
fi
|
||||
|
||||
if [[ -n "$OUTPUT" && ${#args[@]} -gt 1 ]]; then
|
||||
echo "--output cannot be combined with multiple input files" >&2; exit 2
|
||||
fi
|
||||
|
||||
[[ -x "$WHISPER_BIN" ]] || { echo "whisper-cli not found at $WHISPER_BIN" >&2; exit 1; }
|
||||
[[ -f "$WHISPER_MODEL" ]] || { echo "model not found at $WHISPER_MODEL" >&2; exit 1; }
|
||||
command -v ffmpeg >/dev/null || { echo "ffmpeg not installed" >&2; exit 1; }
|
||||
|
||||
for video in "${args[@]}"; do
|
||||
if [[ ! -f "$video" ]]; then
|
||||
echo "skip: $video (not a file)" >&2
|
||||
continue
|
||||
fi
|
||||
|
||||
if [[ -n "$OUTPUT" ]]; then
|
||||
out_stem="${OUTPUT%.srt}"
|
||||
out_dir=$(dirname -- "$out_stem")
|
||||
else
|
||||
out_dir=$(dirname -- "$video")
|
||||
base=$(basename -- "$video")
|
||||
out_stem="$out_dir/${base%.*}"
|
||||
fi
|
||||
srt="$out_stem.srt"
|
||||
|
||||
if [[ -f "$srt" && "$FORCE" != "1" ]]; then
|
||||
echo "skip: $srt exists (use --force to overwrite)"
|
||||
continue
|
||||
fi
|
||||
|
||||
echo ">> $video"
|
||||
tmpwav=$(mktemp --suffix=.wav)
|
||||
trap 'rm -f "$tmpwav"' EXIT
|
||||
|
||||
ffmpeg -hide_banner -loglevel error -y \
|
||||
-i "$video" -vn -ar 16000 -ac 1 -c:a pcm_s16le "$tmpwav"
|
||||
|
||||
mkdir -p -- "$out_dir"
|
||||
whisper_args=(
|
||||
-m "$WHISPER_MODEL"
|
||||
-f "$tmpwav"
|
||||
-of "$out_stem"
|
||||
--output-srt
|
||||
-l "$SRC_LANG"
|
||||
-mc 0
|
||||
)
|
||||
[[ "$TRANSLATE" == "1" ]] && whisper_args+=(--translate)
|
||||
if [[ "$MAX_LEN" -gt 0 ]]; then
|
||||
whisper_args+=(--max-len "$MAX_LEN" --split-on-word)
|
||||
fi
|
||||
if [[ "$USE_VAD" == "1" ]]; then
|
||||
if [[ -f "$VAD_MODEL" ]]; then
|
||||
whisper_args+=(--vad --vad-model "$VAD_MODEL")
|
||||
else
|
||||
echo "warn: VAD model not found at $VAD_MODEL, running without VAD" >&2
|
||||
echo " download with: sh $WHISPER_DIR/models/download-vad-model.sh silero-v6.2.0" >&2
|
||||
fi
|
||||
fi
|
||||
|
||||
"$WHISPER_BIN" "${whisper_args[@]}"
|
||||
|
||||
if [[ "$LINE_LEN" -gt 0 ]]; then
|
||||
python3 - "$srt" "$LINE_LEN" <<'PYEOF'
|
||||
import re, sys, pathlib
|
||||
|
||||
path = pathlib.Path(sys.argv[1])
|
||||
max_line = int(sys.argv[2])
|
||||
|
||||
def wrap(text, limit):
|
||||
def wrap_line(text: str, limit: int) -> str:
|
||||
text = " ".join(text.split())
|
||||
if len(text) <= limit:
|
||||
return text
|
||||
words = text.split(" ")
|
||||
if len(words) < 2:
|
||||
if len(words) < MIN_WORDS_TO_WRAP:
|
||||
return text
|
||||
best = None
|
||||
best_score = None
|
||||
best = 0
|
||||
best_score = float("inf")
|
||||
cum = 0
|
||||
for i, w in enumerate(words[:-1]):
|
||||
cum += len(w) + (1 if i > 0 else 0)
|
||||
@@ -143,24 +47,191 @@ def wrap(text, limit):
|
||||
score += (bot - limit) * 100
|
||||
if w.rstrip(",.!?:;") != w:
|
||||
score -= 8
|
||||
if best_score is None or score < best_score:
|
||||
if score < best_score:
|
||||
best_score, best = score, i
|
||||
return " ".join(words[:best+1]) + "\n" + " ".join(words[best+1:])
|
||||
return " ".join(words[: best + 1]) + "\n" + " ".join(words[best + 1 :])
|
||||
|
||||
blocks = re.split(r"\n\n+", path.read_text(encoding="utf-8").strip())
|
||||
out = []
|
||||
for b in blocks:
|
||||
|
||||
def wrap_srt(path: Path, limit: int) -> None:
|
||||
blocks = re.split(r"\n\n+", path.read_text(encoding="utf-8").strip())
|
||||
out: list[str] = []
|
||||
for b in blocks:
|
||||
lines = b.split("\n")
|
||||
if len(lines) < 3:
|
||||
if len(lines) <= SRT_HEADER_LINES:
|
||||
out.append(b)
|
||||
continue
|
||||
head, body = lines[:2], " ".join(lines[2:])
|
||||
out.append("\n".join(head + [wrap(body, max_line)]))
|
||||
path.write_text("\n\n".join(out) + "\n", encoding="utf-8")
|
||||
PYEOF
|
||||
fi
|
||||
head, body = lines[:SRT_HEADER_LINES], " ".join(lines[SRT_HEADER_LINES:])
|
||||
out.append("\n".join([*head, wrap_line(body, limit)]))
|
||||
path.write_text("\n\n".join(out) + "\n", encoding="utf-8")
|
||||
|
||||
rm -f "$tmpwav"
|
||||
trap - EXIT
|
||||
echo "<< $srt"
|
||||
done
|
||||
|
||||
def output_stem(video: Path, override: str | None) -> Path:
|
||||
if override:
|
||||
stem = override[:-4] if override.endswith(".srt") else override
|
||||
return Path(stem)
|
||||
return video.with_suffix("")
|
||||
|
||||
|
||||
def process(video: Path, args: argparse.Namespace) -> None:
    """Produce an .srt for one video: extract audio, run whisper-cli, wrap lines.

    Args:
        video: Input video file.
        args: Parsed command-line options. This function reads ``output``,
            ``force``, ``bin``, ``model``, ``lang``, ``translate``,
            ``max_len``, ``vad``, ``vad_model``, ``vad_max_speech`` and
            ``line_len``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg or whisper-cli exits with a
            non-zero status (both are run with ``check=True``).
    """
    out_stem = output_stem(video, args.output)
    # whisper-cli is handed the stem via -of and *appends* ".srt" to it, so
    # the expected file name must be built by appending as well.
    # Path.with_suffix() would replace the last dotted component instead
    # ("talk.part1" -> "talk.srt"), pointing the skip check and the
    # post-processing step at the wrong file.
    srt = out_stem.with_name(f"{out_stem.name}.srt")

    if srt.exists() and not args.force:
        print(f"skip: {srt} exists (use --force to overwrite)")
        return

    out_stem.parent.mkdir(parents=True, exist_ok=True)
    # Flush so the progress line is not emitted after the child processes'
    # output when stdout is redirected (and therefore block-buffered).
    print(f">> {video}", flush=True)

    # The intermediate WAV lives in a temp dir that is removed on exit from
    # the with-block, including when a subprocess fails.
    with tempfile.TemporaryDirectory() as tmpdir:
        wav_path = Path(tmpdir) / "audio.wav"

        # Convert the input to 16 kHz mono 16-bit PCM, dropping video (-vn).
        subprocess.run(
            [
                "ffmpeg",
                "-hide_banner",
                "-loglevel",
                "error",
                "-y",
                "-i",
                str(video),
                "-vn",
                "-ar",
                "16000",
                "-ac",
                "1",
                "-c:a",
                "pcm_s16le",
                str(wav_path),
            ],
            check=True,
        )

        cmd = [
            str(args.bin),
            "-m",
            str(args.model),
            "-f",
            str(wav_path),
            "-of",
            str(out_stem),
            "--output-srt",
            "-l",
            args.lang,
            # NOTE(review): presumably "max context = 0", i.e. no text
            # context carried between segments; confirm with whisper-cli --help.
            "-mc",
            "0",
        ]
        if args.translate:
            cmd.append("--translate")
        if args.max_len > 0:
            cmd += ["--max-len", str(args.max_len), "--split-on-word"]
        if args.vad:
            if args.vad_model.exists():
                cmd += ["--vad", "--vad-model", str(args.vad_model)]
                if args.vad_max_speech > 0:
                    cmd += ["-vmsd", str(args.vad_max_speech)]
            else:
                # A missing VAD model is not fatal: warn and run without VAD.
                print(
                    f"warn: VAD model not found at {args.vad_model}, running without VAD",
                    file=sys.stderr,
                )
                print(
                    f"      download with: sh {WHISPER_DIR}/models/download-vad-model.sh silero-v6.2.0",
                    file=sys.stderr,
                )

        subprocess.run(cmd, check=True)

    if args.line_len > 0:
        wrap_srt(srt, args.line_len)

    print(f"<< {srt}")
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
    """Create the ``video2srt`` command-line parser."""
    parser = argparse.ArgumentParser(
        prog="video2srt",
        description="Generate an English .srt next to each video using whisper.cpp.",
    )
    parser.add_argument("videos", nargs="+", type=Path, metavar="VIDEO")
    parser.add_argument(
        "-t",
        "--transcribe",
        dest="translate",
        action="store_false",
        help="Transcribe in source language (default: translate to English)",
    )
    parser.add_argument(
        "-l",
        "--lang",
        default=os.environ.get("SRC_LANG", "auto"),
        help="Force source language (default: auto-detect)",
    )
    parser.add_argument(
        "-m",
        "--model",
        type=Path,
        default=DEFAULT_MODEL,
        help=f"Path to ggml model (default: {DEFAULT_MODEL})",
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Output .srt path (single input only; default: <video>.srt)",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        help="Overwrite existing .srt",
    )
    parser.add_argument(
        "--no-vad",
        dest="vad",
        action="store_false",
        help="Disable Silero VAD pre-filtering (VAD reduces hallucination loops)",
    )
    parser.add_argument(
        "--vad-max-speech",
        type=float,
        default=15.0,
        help="Max seconds of speech per VAD chunk; shorter values give tighter timestamps (default: 15)",
    )
    parser.add_argument(
        "--max-len",
        type=int,
        default=84,
        help="Max characters per SRT entry, 0 to disable (default: 84)",
    )
    parser.add_argument(
        "--line-len",
        type=int,
        default=42,
        help="Max characters per visible line, 0 to disable (default: 42)",
    )
    # Hidden overrides: accepted on the command line but omitted from --help.
    parser.add_argument(
        "--bin",
        type=Path,
        default=DEFAULT_BIN,
        help=argparse.SUPPRESS,
    )
    parser.add_argument(
        "--vad-model",
        type=Path,
        default=DEFAULT_VAD,
        help=argparse.SUPPRESS,
    )
    return parser


def main() -> int:
    """Parse the command line, validate prerequisites and process each video.

    Returns:
        0 once every input has been processed or skipped.

    Raises:
        SystemExit: On a usage error (raised by argparse), or with an
            explanatory message when whisper-cli, the model file or ffmpeg
            is missing.
    """
    parser = _build_parser()
    opts = parser.parse_args()

    if opts.output and len(opts.videos) > 1:
        parser.error("--output cannot be combined with multiple input files")

    # Fail fast on missing prerequisites before touching any input.
    whisper_runnable = opts.bin.is_file() and os.access(opts.bin, os.X_OK)
    if not whisper_runnable:
        sys.exit(f"whisper-cli not found at {opts.bin}")
    if not opts.model.is_file():
        sys.exit(f"model not found at {opts.model}")
    if shutil.which("ffmpeg") is None:
        sys.exit("ffmpeg not installed")

    for candidate in opts.videos:
        if candidate.is_file():
            process(candidate, opts)
        else:
            # Inputs that are not regular files are reported and skipped
            # without aborting the run.
            print(f"skip: {candidate} (not a file)", file=sys.stderr)

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
||||
Reference in New Issue
Block a user