feat: initial commit

2026-05-26 00:45:02 +02:00
commit 9d8e0b06d3
6 changed files with 283 additions and 0 deletions
@@ -0,0 +1,6 @@
+.venv/
+__pycache__/
+*.pyc
+*.egg-info/
+dist/
+build/
@@ -0,0 +1,48 @@
+# Build and install the video2srt command inside a private virtualenv.
+#
+# Every variable assigned with ?= can be overridden from the environment or
+# the command line, e.g. `make BACKEND=cuda PREFIX=/usr/local install`.
+PREFIX ?= $(HOME)/.local
+BINDIR ?= $(PREFIX)/bin
+PROG ?= video2srt
+PYTHON ?= python3
+VENV ?= .venv
+BACKEND ?= rocm
+
+# Always use the virtualenv's pip, never the system one.
+PIP := $(VENV)/bin/pip
+
+# Pick the PyTorch wheel index matching the requested backend. The cpu
+# backend passes no --index-url, so pip falls back to its default index.
+# Any other value aborts while the makefile is being parsed.
+ifeq ($(BACKEND),rocm)
+TORCH_INDEX := --index-url https://download.pytorch.org/whl/rocm6.3
+else ifeq ($(BACKEND),cuda)
+TORCH_INDEX := --index-url https://download.pytorch.org/whl/cu124
+else ifeq ($(BACKEND),cpu)
+TORCH_INDEX :=
+else
+$(error Unknown BACKEND=$(BACKEND), expected: rocm | cuda | cpu)
+endif
+
+.PHONY: build install clean uninstall
+
+# Default goal: a virtualenv with torch and the project installed.
+build: $(VENV)/.installed
+
+# Per-backend stamp file recording that torch was installed for BACKEND.
+# $(VENV) is an order-only prerequisite (after the |): it must exist, but its
+# timestamp never forces torch to be reinstalled.
+$(VENV)/.torch-$(BACKEND): | $(VENV)
+	$(PIP) install torch torchaudio $(TORCH_INDEX)
+	touch $@
+
+# Stamp file for the editable project install; redone whenever pyproject.toml
+# or the torch stamp is newer.
+# NOTE(review): on rocm the triton package is removed and pytorch-triton-rocm
+# is force-reinstalled afterwards, presumably because installing the project
+# pulls in the generic triton wheel - confirm.
+$(VENV)/.installed: pyproject.toml $(VENV)/.torch-$(BACKEND)
+	$(PIP) install -e .
+ifeq ($(BACKEND),rocm)
+	$(PIP) uninstall -y triton || true
+	$(PIP) install --force-reinstall --no-deps pytorch-triton-rocm $(TORCH_INDEX)
+endif
+	touch $@
+
+# Create the virtualenv itself.
+$(VENV):
+	$(PYTHON) -m venv $(VENV)
+
+# Copy the entry-point script generated in the virtualenv into BINDIR with
+# mode 755; -D asks install to create missing parent directories.
+$(BINDIR)/$(PROG): $(VENV)/.installed
+	install -D -m 755 $(VENV)/bin/$(PROG) $@
+
+install: $(BINDIR)/$(PROG)
+
+# Remove the virtualenv (and with it both stamp files).
+clean:
+	rm -rf $(VENV)
+
+# Remove only the copied script; the virtualenv is left in place.
+uninstall:
+	rm -f $(BINDIR)/$(PROG)
@@ -0,0 +1,27 @@
+# video2srt
+
+Generate an `.srt` subtitle file next to each video (translated to English by default) using [stable-ts](https://github.com/jianfch/stable-ts), with Demucs vocal isolation to reduce hallucinations on noisy or musical audio.
+
+## Install
+
+Requires Python 3.11 and `make`.
+
+```sh
+make              # AMD ROCm (default)
+make BACKEND=cuda # NVIDIA CUDA
+make BACKEND=cpu  # CPU-only
+
+make install      # install .venv/bin/video2srt to ~/.local/bin
+make clean        # remove .venv
+```
+
+## Usage
+
+```sh
+video2srt movie.mkv                    # translate to English (default)
+video2srt -t movie.mkv                 # transcribe in source language
+video2srt -l ja movie.mkv              # force Japanese source language
+video2srt --no-denoise movie.mkv       # skip Demucs vocal isolation
+video2srt -o english.srt movie.mkv     # custom output path
+video2srt -f *.mkv                     # batch overwrite
+```
@@ -0,0 +1,68 @@
+[project]
+name = "video2srt"
+version = "0.1.0"
+description = "Generate an English .srt next to each video using stable-ts."
+requires-python = "==3.11.*"
+dependencies = [
+    "stable-ts~=2.19.1",
+    "demucs~=4.0.1",
+]
+
+[project.scripts]
+video2srt = "video2srt.cli:main"
+
+[build-system]
+requires = ["setuptools>=75.8"]
+build-backend = "setuptools.build_meta"
+
+[tool.ruff]
+line-length = 80
+preview = true
+
+[tool.ruff.lint]
+select = [
+    "YTT",
+    "ANN",
+    "ASYNC",
+    "B",
+    "A",
+    "COM",
+    "C4",
+    "DTZ",
+    "T10",
+    "FIX",
+    "FA",
+    "ISC",
+    "PIE",
+    "PYI",
+    "PT",
+    "RET",
+    "SIM",
+    "TC",
+    "I",
+    "C90",
+    "DOC",
+    "D",
+    "F",
+    "PL",
+    "UP",
+    "RUF"
+]
+ignore = [
+    "A001",
+    "D100",
+    "D101",
+    "D103",
+    "D104",
+    "D202",
+    "D203",
+    "D212",
+    "D301",
+    "D413",
+    "TC006",
+    "COM812",
+    "PLR0913",
+    "PLR0917",
+    "PYI011",
+    "UP031"
+]
@@ -0,0 +1,134 @@
+"""Generate an English .srt next to each video using stable-ts."""
+
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+import warnings
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+# Suppress three known warning messages, matched by regex on their text.
+# NOTE(review): the filters are registered ahead of the stable_whisper/torch
+# imports below, presumably so warnings raised at import time are covered
+# as well - confirm before moving them.
+warnings.filterwarnings("ignore", message=r".*FP16 is not supported.*")
+warnings.filterwarnings("ignore", message=r".*Word-level timestamps.*")
+warnings.filterwarnings("ignore", message=r".*non-tuple sequence.*")
+
+import stable_whisper
+import torch
+
+if TYPE_CHECKING:
+    from whisper.model import (
+        Whisper,
+    )
+
+# Model name taken from the WHISPER_MODEL environment variable, falling back
+# to "large-v3" when the variable is not set.
+DEFAULT_MODEL = os.environ.get("WHISPER_MODEL", "large-v3")
+# "cuda" whenever torch reports CUDA as available at import time, else "cpu".
+DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def output_stem(video: Path, override: str | None) -> Path:
+    """
+    Return the output location without its ``.srt`` extension.
+
+    Args:
+        video: Input video path, used when no override is given.
+        override: User-supplied output path, or a falsy value for none.
+
+    Returns:
+        The override with a trailing ``.srt`` suffix removed (any other
+        override is returned as given), or ``video`` with its last suffix
+        removed when there is no override.
+    """
+    if not override:
+        return video.with_suffix("")
+
+    target = Path(override)
+    if target.suffix != ".srt":
+        return target
+    return target.with_suffix("")
+
+
+def process(video: Path, model: Whisper, args: argparse.Namespace) -> None:
+    """
+    Transcribe or translate one video and write the result as an ``.srt``.
+
+    Nothing is written when the target file already exists and ``args.force``
+    is false; a "skip" line is printed instead.
+
+    Args:
+        video: Path of the media file handed to the model.
+        model: Loaded model whose ``transcribe`` method is called.
+        args: Parsed CLI options; reads ``output``, ``force``, ``translate``,
+            ``lang`` and ``denoiser``.
+    """
+    out_stem = output_stem(video, args.output)
+    # Append the extension instead of calling ``with_suffix``: the stem can
+    # still contain dots ("My.Movie.2020"), and ``with_suffix`` would replace
+    # everything after the last one, producing "My.Movie.srt".
+    srt = out_stem.with_name(f"{out_stem.name}.srt")
+
+    if srt.exists() and not args.force:
+        print(f"skip: {srt} exists (use --force to overwrite)")
+        return
+
+    # A custom --output may point into a directory that does not exist yet.
+    out_stem.parent.mkdir(parents=True, exist_ok=True)
+    print(f">> {video}")
+
+    result: stable_whisper.WhisperResult = model.transcribe(  # type: ignore
+        str(video),
+        task="translate" if args.translate else "transcribe",
+        # ``None`` is passed for "auto" instead of a language code.
+        language=None if args.lang == "auto" else args.lang,
+        suppress_silence=True,
+        denoiser="demucs" if args.denoiser else None,
+    )
+    # Segment-level cues only; no per-word entries.
+    result.to_srt_vtt(
+        str(srt), word_level=False, segment_level=True
+    )
+
+    print(f"<< {srt}")
+
+
+def main() -> int:
+    """
+    Run the ``video2srt`` command line.
+
+    Parses the arguments, drops inputs that are not regular files (reporting
+    each on stderr), loads the model once and processes the remaining videos
+    in the order given.
+
+    Returns:
+        ``1`` when no given path is a regular file, otherwise ``0``. Invalid
+        argument combinations are reported through ``parser.error``.
+    """
+    parser = argparse.ArgumentParser(
+        prog="video2srt",
+        description="Generate an English .srt next to each video using stable-ts.",
+    )
+    parser.add_argument("videos", nargs="+", type=Path, metavar="VIDEO")
+    # -t and --no-denoise both store False, so translation and denoising are
+    # enabled unless the flag is given.
+    parser.add_argument(
+        "-t", "--transcribe", dest="translate", action="store_false",
+        help="Transcribe in source language (default: translate to English)",
+    )
+    parser.add_argument(
+        "-l", "--lang", default="auto",
+        help="Source language code or 'auto' (default: auto)",
+    )
+    parser.add_argument(
+        "-m", "--model", default=DEFAULT_MODEL,
+        help=f"Whisper model name or path (default: {DEFAULT_MODEL})",
+    )
+    parser.add_argument(
+        "-o", "--output",
+        help="Output .srt path (single input only; default: <video>.srt)",
+    )
+    parser.add_argument(
+        "-f", "--force", action="store_true",
+        help="Overwrite existing .srt",
+    )
+    parser.add_argument(
+        "--no-denoise", dest="denoiser", action="store_false",
+        help="Disable Demucs denoiser (default: on; isolates vocals from music/ambient)",
+    )
+    parser.add_argument(
+        "--device", default=DEFAULT_DEVICE,
+        help=f"Compute device: cuda, cpu, mps (default: {DEFAULT_DEVICE})",
+    )
+
+    opts = parser.parse_args()
+
+    # One explicit output path cannot serve several inputs.
+    if len(opts.videos) > 1 and opts.output:
+        parser.error("--output cannot be combined with multiple input files")
+
+    inputs: list[Path] = []
+    for candidate in opts.videos:
+        if not candidate.is_file():
+            print(f"skip: {candidate} (not a file)", file=sys.stderr)
+            continue
+        inputs.append(candidate)
+
+    if not inputs:
+        print("no valid input files", file=sys.stderr)
+        return 1
+
+    # Load the model once and reuse it for every input.
+    print(f"loading model {opts.model} on {opts.device}...")
+    whisper_model = stable_whisper.load_model(opts.model, device=opts.device)
+
+    for current in inputs:
+        process(current, whisper_model, opts)
+
+    return 0
+
+
+# When run as a script, use main()'s return value as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())