commit 9d8e0b06d37715ce5d94e90f4f7092de99761a61 Author: Oscar Wallberg Date: Tue May 26 00:45:02 2026 +0200 feat: initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a18610e --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.venv/ +__pycache__/ +*.pyc +*.egg-info/ +dist/ +build/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..778431f --- /dev/null +++ b/Makefile @@ -0,0 +1,48 @@ +PREFIX ?= $(HOME)/.local +BINDIR ?= $(PREFIX)/bin +PROG ?= video2srt +PYTHON ?= python3 +VENV ?= .venv +BACKEND ?= rocm + +PIP := $(VENV)/bin/pip + +ifeq ($(BACKEND),rocm) +TORCH_INDEX := --index-url https://download.pytorch.org/whl/rocm6.3 +else ifeq ($(BACKEND),cuda) +TORCH_INDEX := --index-url https://download.pytorch.org/whl/cu124 +else ifeq ($(BACKEND),cpu) +TORCH_INDEX := +else +$(error Unknown BACKEND=$(BACKEND), expected: rocm | cuda | cpu) +endif + +.PHONY: build install clean uninstall + +build: $(VENV)/.installed + +$(VENV)/.torch-$(BACKEND): | $(VENV) + $(PIP) install torch torchaudio $(TORCH_INDEX) + touch $@ + +$(VENV)/.installed: pyproject.toml $(VENV)/.torch-$(BACKEND) + $(PIP) install -e . +ifeq ($(BACKEND),rocm) + $(PIP) uninstall -y triton || true + $(PIP) install --force-reinstall --no-deps pytorch-triton-rocm $(TORCH_INDEX) +endif + touch $@ + +$(VENV): + $(PYTHON) -m venv $(VENV) + +$(BINDIR)/$(PROG): $(VENV)/.installed + install -D -m 755 $(VENV)/bin/$(PROG) $@ + +install: $(BINDIR)/$(PROG) + +clean: + rm -rf $(VENV) + +uninstall: + rm -f $(BINDIR)/$(PROG) diff --git a/README.md b/README.md new file mode 100644 index 0000000..04a6045 --- /dev/null +++ b/README.md @@ -0,0 +1,27 @@ +# video2srt + +Generate an English `.srt` next to each video using [stable-ts](https://github.com/jianfch/stable-ts), with Demucs vocal isolation to fight hallucinations on noisy or musical audio. + +## Install + +Requires Python 3.11 and `make`. 
+ +```sh +make # AMD ROCm (default) +make BACKEND=cuda # NVIDIA CUDA +make BACKEND=cpu # CPU-only + +make install # install .venv/bin/video2srt to ~/.local/bin +make clean # remove .venv +``` + +## Usage + +```sh +video2srt movie.mkv # translate to English (default) +video2srt -t movie.mkv # transcribe in source language +video2srt -l ja movie.mkv # force Japanese source language +video2srt --no-denoise movie.mkv # skip Demucs vocal isolation +video2srt -o english.srt movie.mkv # custom output path +video2srt -f *.mkv # batch overwrite +``` diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..66615de --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,68 @@ +[project] +name = "video2srt" +version = "0.1.0" +description = "Generate an English .srt next to each video using stable-ts." +requires-python = "==3.11.*" +dependencies = [ + "stable-ts~=2.19.1", + "demucs~=4.0.1", +] + +[project.scripts] +video2srt = "video2srt.cli:main" + +[build-system] +requires = ["setuptools>=75.8"] +build-backend = "setuptools.build_meta" + +[tool.ruff] +line-length = 80 +preview = true + +[tool.ruff.lint] +select = [ + "YTT", + "ANN", + "ASYNC", + "B", + "A", + "COM", + "C4", + "DTZ", + "T10", + "FIX", + "FA", + "ISC", + "PIE", + "PYI", + "PT", + "RET", + "SIM", + "TC", + "I", + "C90", + "DOC", + "D", + "F", + "PL", + "UP", + "RUF" +] +ignore = [ + "A001", + "D100", + "D101", + "D103", + "D104", + "D202", + "D203", + "D212", + "D301", + "D413", + "TC006", + "COM812", + "PLR0913", + "PLR0917", + "PYI011", + "UP031" +] diff --git a/src/video2srt/__init__.py b/src/video2srt/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/video2srt/cli.py b/src/video2srt/cli.py new file mode 100644 index 0000000..7d41576 --- /dev/null +++ b/src/video2srt/cli.py @@ -0,0 +1,134 @@ +"""Generate an English .srt next to each video using stable-ts.""" + +from __future__ import annotations + +import argparse +import os +import sys +import warnings +from pathlib 
def output_stem(video: Path, override: str | None) -> Path:
    """
    Return the output path without its ``.srt`` extension.

    Args:
        video: Input video path; used when no override is given.
        override: Explicit output path. A trailing ``.srt`` is stripped;
            any other name is kept unchanged. ``None`` or an empty
            string falls back to ``video`` with its last suffix removed.

    Returns:
        The stem to which the caller appends ``.srt``.
    """
    if override:
        path = Path(override)
        return path.with_suffix("") if path.suffix == ".srt" else path
    return video.with_suffix("")


def process(video: Path, model: Whisper, args: argparse.Namespace) -> None:
    """
    Run the model on one video and write the subtitles next to it.

    An existing subtitle file is left alone unless ``args.force`` is set.

    Args:
        video: Path of the media file handed to ``model.transcribe``.
        model: Object whose ``transcribe`` method returns a result that
            provides ``to_srt_vtt``.
        args: Parsed options; reads ``output``, ``force``, ``translate``,
            ``lang`` and ``denoiser``.
    """
    out_stem = output_stem(video, args.output)
    # Append the extension rather than calling ``with_suffix``: the stem may
    # still contain dots ("Show.S01E02"), and ``with_suffix`` would replace
    # everything after the last one, collapsing the name to "Show.srt".
    srt = out_stem.with_name(f"{out_stem.name}.srt")

    if srt.exists() and not args.force:
        print(f"skip: {srt} exists (use --force to overwrite)")
        return

    # A custom output path may point into a directory that does not exist.
    out_stem.parent.mkdir(parents=True, exist_ok=True)
    print(f">> {video}")

    result: stable_whisper.WhisperResult = model.transcribe(  # type: ignore
        str(video),
        task="translate" if args.translate else "transcribe",
        # ``None`` is passed when the user asked for "auto".
        language=None if args.lang == "auto" else args.lang,
        suppress_silence=True,
        denoiser="demucs" if args.denoiser else None,
    )
    # Segment-level cues only; word-level output is disabled.
    result.to_srt_vtt(str(srt), word_level=False, segment_level=True)

    print(f"<< {srt}")
help=f"Whisper model name or path (default: {DEFAULT_MODEL})", + ) + p.add_argument( + "-o", + "--output", + help="Output .srt path (single input only; default: