submaster/whisper_project/srt_to_kokoro.py
Cesar Mendivil c22767d3d4 Refactor SRT to Kokoro synthesis script for improved CLI functionality and compatibility
- Updated `srt_to_kokoro.py` to provide a CLI entrypoint with argument parsing.
- Enhanced error handling and logging for better user feedback.
- Introduced a compatibility layer for legacy scripts.
- Added configuration handling via `config.toml` for endpoint and API key.
- Improved documentation and comments for clarity.

Enhance PipelineOrchestrator with in-process transcriber fallback

- Implemented `InProcessTranscriber` to handle transcription using multiple strategies.
- Added support for `srt_only` flag to return translated SRT without TTS synthesis.
- Improved error handling and logging for transcriber initialization.

Add installation and usage documentation

- Created `INSTALLATION.md` for detailed setup instructions for CPU and GPU environments.
- Added `USAGE.md` with practical examples for common use cases and command-line options.
- Included a script for automated installation and environment setup.

Implement SRT burning utility

- Added `burn_srt.py` to facilitate embedding SRT subtitles into video files using ffmpeg.
- Provided command-line options for style and codec customization.

Update project configuration management

- Introduced `config.py` to centralize configuration loading from `config.toml`.
- Ensured that environment variables are not read to avoid implicit overrides.

Enhance package management with `pyproject.toml`

- Added `pyproject.toml` for modern packaging and dependency management.
- Defined optional dependencies for CPU and TTS support.

Add smoke test fixture for SRT

- Created `smoke_test.srt` as a sample subtitle file for testing purposes.

Update requirements and setup configurations

- Revised `requirements.txt` and `setup.cfg` for better dependency management and clarity.
- Included installation instructions for editable mode and local TTS support.
2025-10-25 00:00:02 -07:00

138 lines
4.4 KiB
Python

"""Small CLI shim for SRT -> Kokoro synthesis.

This file provides thin wrappers (`parse_srt_file`, `synth_chunk`), a
legacy `synthesize_from_srt` stub that raises `NotImplementedError`, and a
CLI entrypoint that resolves its settings from CLI flags and
`whisper_project.config` (config.toml). It intentionally does NOT read
environment variables.
"""
from __future__ import annotations
from typing import Any
import argparse
import logging
import sys
from whisper_project.infra.kokoro_utils import (
parse_srt_file as _parse_srt_file,
synth_chunk as _synth_chunk,
)
from whisper_project.infra.kokoro_adapter import KokoroHttpClient
from whisper_project import config
def parse_srt_file(path: str):
    """Parse the ``.srt`` file at *path* and return its subtitles.

    This is a thin public wrapper: the work is done by
    ``whisper_project.infra.kokoro_utils.parse_srt_file`` and its result is
    returned unchanged.
    """
    subtitles = _parse_srt_file(path)
    return subtitles
def synth_chunk(
    endpoint: str,
    text: str,
    headers: dict,
    payload_template: Any,
    timeout: int = 60,
) -> bytes:
    """Synthesize *text* through *endpoint* and return the audio bytes.

    This is a thin public wrapper: all arguments are forwarded as-is to
    ``whisper_project.infra.kokoro_utils.synth_chunk`` (``timeout`` is
    passed by keyword) and its result is returned unchanged.
    """
    audio = _synth_chunk(
        endpoint,
        text,
        headers,
        payload_template,
        timeout=timeout,
    )
    return audio
def synthesize_from_srt(srt_path: str, out_wav: str, endpoint: str = "", api_key: str = ""):
    """Legacy entry point kept only so historical scripts can still import it.

    No synthesis happens here: the function always raises. The canonical
    implementation is the ``synthesize_from_srt`` method of
    ``KokoroHttpClient``; programmatic callers should use that instead.

    Raises:
        NotImplementedError: always, pointing the caller at the adapter.
    """
    message = "Use KokoroHttpClient.synthesize_from_srt or the infra adapter"
    raise NotImplementedError(message)
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the SRT -> Kokoro command line.

    ``--srt`` and ``--out`` are mandatory; every other option is optional.
    Options are registered in the order of the table below, which is also
    the order in which ``--help`` lists them.
    """
    # (flag, keyword arguments for add_argument), in registration order.
    option_specs = [
        ("--srt", {"required": True, "help": "Path to input .srt file"}),
        (
            "--endpoint",
            {"required": False, "help": "Direct synthesis endpoint (optional)"},
        ),
        (
            "--api-key",
            {
                "required": False,
                "help": (
                    "API key for Authorization header; if omitted the value from"
                    " config.toml is used"
                ),
            },
        ),
        ("--voice", {"default": "em_alex"}),
        ("--model", {"default": "model"}),
        ("--out", {"required": True, "help": "Output WAV path"}),
        (
            "--video",
            {
                "required": False,
                "help": "Optional original video path to mix or align with",
            },
        ),
        (
            "--align",
            {"action": "store_true", "help": "Align segments using SRT timestamps"},
        ),
        ("--keep-chunks", {"action": "store_true"}),
        ("--mix-with-original", {"action": "store_true"}),
        ("--mix-background-volume", {"type": float, "default": 0.2}),
        ("--replace-original", {"action": "store_true"}),
        (
            "--config-mode",
            {
                "choices": ["defaults", "override-env", "force"],
                "default": "override-env",
                "help": (
                    "Configuration precedence: 'defaults' = CLI > TOML; "
                    "'override-env' = CLI > TOML; 'force' = TOML > CLI"
                ),
            },
        ),
    ]

    parser = argparse.ArgumentParser()
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser
def main() -> None:
    """Run the SRT -> Kokoro synthesis command line.

    Parses the CLI flags, resolves the endpoint and API key from the flags
    and ``whisper_project.config`` (config.toml), then delegates the work to
    ``KokoroHttpClient.synthesize_from_srt``.

    Precedence is selected with ``--config-mode``: ``force`` prefers the
    config.toml value over the CLI flag, while ``defaults`` and
    ``override-env`` both prefer the CLI flag. Environment variables are
    never consulted.

    Raises:
        SystemExit: with code 2 when no endpoint can be resolved, and with
            code 1 when synthesis raises an exception (argparse itself also
            exits with code 2 on invalid arguments).
    """
    # With no handler configured, logging's last-resort handler only emits
    # WARNING and above, so the success message below would be dropped.
    # basicConfig is a no-op when the root logger already has handlers.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    args = _build_arg_parser().parse_args()

    # Resolve configuration: only CLI flags and config.toml are used.
    if args.config_mode == "force":
        # force: TOML > CLI
        endpoint = config.KOKORO_ENDPOINT or args.endpoint
        api_key = config.KOKORO_API_KEY or args.api_key
    else:
        # defaults / override-env: CLI > TOML
        endpoint = args.endpoint or config.KOKORO_ENDPOINT
        api_key = args.api_key or config.KOKORO_API_KEY

    if not endpoint:
        logger.error(
            "Please provide --endpoint or set kokoro.endpoint in config.toml"
        )
        sys.exit(2)

    if args.replace_original:
        # The parser accepts this flag but nothing in this entrypoint
        # consumes it; say so instead of silently dropping the request.
        logger.warning(
            "--replace-original is not handled by this entrypoint and will be ignored"
        )

    client = KokoroHttpClient(
        endpoint,
        api_key=api_key,
        voice=args.voice,
        model=args.model,
    )

    try:
        client.synthesize_from_srt(
            srt_path=args.srt,
            out_wav=args.out,
            video=args.video,
            align=args.align,
            keep_chunks=args.keep_chunks,
            mix_with_original=args.mix_with_original,
            mix_background_volume=args.mix_background_volume,
        )
    except Exception:
        # Top-level CLI boundary: log the traceback and turn any failure
        # into a non-zero exit status.
        logger.exception("Error synthesizing from SRT")
        sys.exit(1)

    logger.info("Output written to: %s", args.out)
# Script entrypoint: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()