feat: initial commit

This commit is contained in:
2026-05-26 00:45:02 +02:00
commit 9d8e0b06d3
6 changed files with 283 additions and 0 deletions
+6
View File
@@ -0,0 +1,6 @@
# Local virtual environment (the Makefile's default VENV directory)
.venv/
# Python bytecode caches
__pycache__/
*.pyc
# Packaging metadata and build output
*.egg-info/
dist/
build/
+48
View File
@@ -0,0 +1,48 @@
# Installation layout: the install rule copies the entry point to
# BINDIR/PROG. All three can be overridden on the command line.
PREFIX ?= $(HOME)/.local
BINDIR ?= $(PREFIX)/bin
PROG ?= video2srt
# Interpreter used to create the virtual environment, and the directory
# that environment lives in.
PYTHON ?= python3
VENV ?= .venv
# Which PyTorch build to install: rocm | cuda | cpu.
BACKEND ?= rocm
# pip from inside the virtual environment; every install goes through it.
PIP := $(VENV)/bin/pip
# Select the wheel index that matches BACKEND. For cpu no index option is
# passed, so pip uses its default index. Any other value aborts parsing.
ifeq ($(BACKEND),rocm)
TORCH_INDEX := --index-url https://download.pytorch.org/whl/rocm6.3
else ifeq ($(BACKEND),cuda)
TORCH_INDEX := --index-url https://download.pytorch.org/whl/cu124
else ifeq ($(BACKEND),cpu)
TORCH_INDEX :=
else
$(error Unknown BACKEND=$(BACKEND), expected: rocm | cuda | cpu)
endif
# These targets are commands, not files.
.PHONY: build install clean uninstall
# First rule in the file, hence the default goal: the environment counts as
# built once the .installed stamp inside the venv is up to date.
build: $(VENV)/.installed
# Install the PyTorch build for the selected BACKEND and record it with a
# per-backend stamp file. $(VENV) is an order-only prerequisite: it has to
# exist, but its changing mtime must not trigger a reinstall.
#
# --force-reinstall makes pip really replace a torch build left over from a
# different BACKEND; without it an unpinned requirement is reported as
# "already satisfied" and the old build stays in place.
# The stamps of all other backends are removed before this one is written,
# so switching back to a previously used BACKEND is not mistaken for
# "up to date".
$(VENV)/.torch-$(BACKEND): | $(VENV)
	$(PIP) install --force-reinstall torch torchaudio $(TORCH_INDEX)
	rm -f $(VENV)/.torch-*
	touch $@
# Create the virtual environment on first use.
$(VENV):
	$(PYTHON) -m venv $@

# Editable install of this project on top of the backend's torch build.
# The stamp is refreshed whenever pyproject.toml or the torch stamp of the
# current BACKEND is newer than it.
# For rocm only: any installed `triton` package is removed (a failure of
# that step is ignored) and pytorch-triton-rocm is force-reinstalled without
# dependencies from the ROCm index.
# NOTE(review): presumably a dependency pulls in the generic triton wheel,
# which would shadow the ROCm one - confirm.
$(VENV)/.installed: pyproject.toml $(VENV)/.torch-$(BACKEND)
	$(PIP) install -e .
ifeq ($(BACKEND),rocm)
	$(PIP) uninstall -y triton || true
	$(PIP) install --force-reinstall --no-deps pytorch-triton-rocm $(TORCH_INDEX)
endif
	touch $@
# `make install` is just a name for the installed executable's file target.
install: $(BINDIR)/$(PROG)

# Copy the console script generated inside the venv to BINDIR with mode 755.
# It is copied again whenever the .installed stamp is newer than the copy.
$(BINDIR)/$(PROG): $(VENV)/.installed
	install -D -m 755 $(VENV)/bin/$(PROG) $@

# Remove only the installed executable; the virtual environment is kept.
uninstall:
	rm -f $(BINDIR)/$(PROG)

# Delete the whole virtual environment, including the stamp files in it.
clean:
	rm -rf $(VENV)
+27
View File
@@ -0,0 +1,27 @@
# video2srt
Generate an English `.srt` next to each video using [stable-ts](https://github.com/jianfch/stable-ts), with Demucs vocal isolation to fight hallucinations on noisy or musical audio.
## Install
Requires Python 3.11 and GNU `make`.
```sh
make # AMD ROCm (default)
make BACKEND=cuda # NVIDIA CUDA
make BACKEND=cpu # CPU-only
make install # install .venv/bin/video2srt to ~/.local/bin
make clean # remove .venv
```
## Usage
```sh
video2srt movie.mkv # translate to English (default)
video2srt -t movie.mkv # transcribe in source language
video2srt -l ja movie.mkv # force Japanese source language
video2srt --no-denoise movie.mkv # skip Demucs vocal isolation
video2srt -o english.srt movie.mkv # custom output path
video2srt -f *.mkv # batch overwrite
```
+68
View File
@@ -0,0 +1,68 @@
# Package metadata.
[project]
name = "video2srt"
version = "0.1.0"
description = "Generate an English .srt next to each video using stable-ts."
# Only the Python 3.11 series is accepted.
requires-python = "==3.11.*"
# torch and torchaudio are not listed here; the Makefile installs them
# separately, from a backend-specific index, before installing this package.
dependencies = [
"stable-ts~=2.19.1",
"demucs~=4.0.1",
]
# Console entry point: the `video2srt` command calls `main` in the
# `video2srt.cli` module.
[project.scripts]
video2srt = "video2srt.cli:main"
# Build backend used when pip builds or installs the project.
[build-system]
requires = ["setuptools>=75.8"]
build-backend = "setuptools.build_meta"
# Ruff: 80-column line limit, preview behaviour enabled.
[tool.ruff]
line-length = 80
preview = true
[tool.ruff.lint]
# Rule families switched on, selected by code prefix.
select = [
"YTT",
"ANN",
"ASYNC",
"B",
"A",
"COM",
"C4",
"DTZ",
"T10",
"FIX",
"FA",
"ISC",
"PIE",
"PYI",
"PT",
"RET",
"SIM",
"TC",
"I",
"C90",
"DOC",
"D",
"F",
"PL",
"UP",
"RUF"
]
# Individual rules switched off again within the families selected above.
ignore = [
"A001",
"D100",
"D101",
"D103",
"D104",
"D202",
"D203",
"D212",
"D301",
"D413",
"TC006",
"COM812",
"PLR0913",
"PLR0917",
"PYI011",
"UP031"
]
View File
+134
View File
@@ -0,0 +1,134 @@
"""Generate an English .srt next to each video using stable-ts."""
from __future__ import annotations
import argparse
import os
import sys
import warnings
from pathlib import Path
from typing import TYPE_CHECKING
# Silence three known warnings, matched by message pattern. The filters are
# registered before stable_whisper and torch are imported below.
warnings.filterwarnings("ignore", message=r".*FP16 is not supported.*")
warnings.filterwarnings("ignore", message=r".*Word-level timestamps.*")
warnings.filterwarnings("ignore", message=r".*non-tuple sequence.*")
import stable_whisper
import torch
if TYPE_CHECKING:
from whisper.model import (
Whisper,
)
# Default for -m/--model: the WHISPER_MODEL environment variable if set,
# otherwise "large-v3".
DEFAULT_MODEL = os.environ.get("WHISPER_MODEL", "large-v3")
# Default for --device: "cuda" when torch reports CUDA as available,
# otherwise "cpu". Evaluated once, at import time.
DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def output_stem(video: Path, override: str | None) -> Path:
    """Return the subtitle output path without its ``.srt`` extension.

    Args:
        video: Input video path; used when no override is given.
        override: Value of ``--output``. ``None`` or an empty string means
            "derive the path from ``video``".

    Returns:
        ``override`` with a trailing ``.srt`` (any letter case) removed, or
        ``video`` with its final extension removed.
    """
    if not override:
        return video.with_suffix("")
    path = Path(override)
    # Compare case-insensitively so "out.SRT" is recognised as already
    # carrying the subtitle extension.
    if path.suffix.lower() == ".srt":
        return path.with_suffix("")
    return path


def process(video: Path, model: Whisper, args: argparse.Namespace) -> None:
    """Transcribe or translate one video and write the result as ``.srt``.

    Args:
        video: Path of the media file handed to ``model.transcribe``.
        model: Loaded model object providing ``transcribe``.
        args: Parsed command line; the attributes read here are ``output``,
            ``force``, ``translate``, ``lang`` and ``denoiser``.

    The output file is skipped (with a message) when it already exists and
    ``args.force`` is false. Missing parent directories of the output path
    are created.
    """
    out_stem = output_stem(video, args.output)
    # Append the extension rather than calling with_suffix(): with_suffix()
    # replaces whatever follows the last dot of the stem, which would turn
    # "Show.S01E01.1080p" into "Show.S01E01.srt".
    srt = out_stem.with_name(f"{out_stem.name}.srt")
    if srt.exists() and not args.force:
        print(f"skip: {srt} exists (use --force to overwrite)")
        return
    out_stem.parent.mkdir(parents=True, exist_ok=True)
    print(f">> {video}")
    result: stable_whisper.WhisperResult = model.transcribe(  # type: ignore
        str(video),
        task="translate" if args.translate else "transcribe",
        # "auto" is this CLI's spelling for "no language given".
        language=None if args.lang == "auto" else args.lang,
        suppress_silence=True,
        denoiser="demucs" if args.denoiser else None,
    )
    # One subtitle cue per segment, no per-word cues.
    result.to_srt_vtt(str(srt), word_level=False, segment_level=True)
    print(f"<< {srt}")
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for ``video2srt``."""
    parser = argparse.ArgumentParser(
        prog="video2srt",
        description="Generate an English .srt next to each video using stable-ts.",
    )
    parser.add_argument("videos", nargs="+", type=Path, metavar="VIDEO")
    # (flags, keyword arguments) in the order they appear in --help.
    options = (
        (
            ("-t", "--transcribe"),
            {
                "dest": "translate",
                "action": "store_false",
                "help": "Transcribe in source language (default: translate to English)",
            },
        ),
        (
            ("-l", "--lang"),
            {
                "default": "auto",
                "help": "Source language code or 'auto' (default: auto)",
            },
        ),
        (
            ("-m", "--model"),
            {
                "default": DEFAULT_MODEL,
                "help": f"Whisper model name or path (default: {DEFAULT_MODEL})",
            },
        ),
        (
            ("-o", "--output"),
            {
                "help": "Output .srt path (single input only; default: <video>.srt)",
            },
        ),
        (
            ("-f", "--force"),
            {
                "action": "store_true",
                "help": "Overwrite existing .srt",
            },
        ),
        (
            ("--no-denoise",),
            {
                "dest": "denoiser",
                "action": "store_false",
                "help": "Disable Demucs denoiser (default: on; isolates vocals from music/ambient)",
            },
        ),
        (
            ("--device",),
            {
                "default": DEFAULT_DEVICE,
                "help": f"Compute device: cuda, cpu, mps (default: {DEFAULT_DEVICE})",
            },
        ),
    )
    for flags, settings in options:
        parser.add_argument(*flags, **settings)
    return parser


def main() -> int:
    """Run the command-line interface.

    Parses ``sys.argv``, drops inputs that are not regular files (reporting
    each on stderr), loads the model once and processes the remaining videos
    in the order given.

    Returns:
        ``1`` when no input is a regular file, otherwise ``0``.
    """
    parser = _build_parser()
    args = parser.parse_args()
    if args.output and len(args.videos) > 1:
        parser.error("--output cannot be combined with multiple input files")

    videos: list[Path] = []
    for candidate in args.videos:
        if not candidate.is_file():
            print(f"skip: {candidate} (not a file)", file=sys.stderr)
            continue
        videos.append(candidate)

    if not videos:
        print("no valid input files", file=sys.stderr)
        return 1

    # The model is loaded only after at least one usable input is known.
    print(f"loading model {args.model} on {args.device}...")
    model = stable_whisper.load_model(args.model, device=args.device)
    for video in videos:
        process(video, model, args)
    return 0
# Executed as a script: use main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())