"""Small CLI shim for SRT -> Kokoro synthesis. This file provides: parse_srt_file, synth_chunk (thin wrappers) and a CLI entrypoint that uses `whisper_project.config` (config.toml) and CLI flags. It intentionally does NOT read environment variables. """ from __future__ import annotations from typing import Any import argparse import logging import sys from whisper_project.infra.kokoro_utils import ( parse_srt_file as _parse_srt_file, synth_chunk as _synth_chunk, ) from whisper_project.infra.kokoro_adapter import KokoroHttpClient from whisper_project import config def parse_srt_file(path: str): """Parse a .srt and return the list of subtitles. Delegates to `whisper_project.infra.kokoro_utils.parse_srt_file`. """ return _parse_srt_file(path) def synth_chunk( endpoint: str, text: str, headers: dict, payload_template: Any, timeout: int = 60, ) -> bytes: """Send text to the endpoint and return audio bytes. Delegates to `whisper_project.infra.kokoro_utils.synth_chunk`. """ return _synth_chunk(endpoint, text, headers, payload_template, timeout=timeout) def synthesize_from_srt(srt_path: str, out_wav: str, endpoint: str = "", api_key: str = ""): """Compatibility layer name used historically by scripts. The canonical implementation lives in `KokoroHttpClient`. Call that class method instead when integrating programmatically. """ raise NotImplementedError( "Use KokoroHttpClient.synthesize_from_srt or the infra adapter" ) def _build_arg_parser() -> argparse.ArgumentParser: p = argparse.ArgumentParser() p.add_argument("--srt", required=True, help="Path to input .srt file") p.add_argument("--endpoint", required=False, help="Direct synthesis endpoint (optional)") p.add_argument( "--api-key", required=False, help=( "API key for Authorization header; if omitted the value from" " config.toml is used" ), ) p.add_argument("--voice", default="em_alex") p.add_argument("--model", default="model") p.add_argument("--out", required=True, help="Output WAV path") p.add_argument("--video", required=False, help="Optional original video path to mix or align with") p.add_argument("--align", action="store_true", help="Align segments using SRT timestamps") p.add_argument("--keep-chunks", action="store_true") p.add_argument("--mix-with-original", action="store_true") p.add_argument("--mix-background-volume", type=float, default=0.2) p.add_argument("--replace-original", action="store_true") p.add_argument( "--config-mode", choices=["defaults", "override-env", "force"], default="override-env", help=( "Configuration precedence: 'defaults' = CLI > TOML; " "'override-env' = CLI > TOML; 'force' = TOML > CLI" ), ) return p def main() -> None: p = _build_arg_parser() args = p.parse_args() # Resolve configuration: only CLI flags and config.toml are used. kokoro_ep = getattr(args, "endpoint", None) kokoro_key = getattr(args, "api_key", None) mode = getattr(args, "config_mode", "defaults") if mode in ("defaults", "override-env"): # CLI > TOML endpoint = kokoro_ep or args.endpoint or config.KOKORO_ENDPOINT api_key = kokoro_key or args.api_key or config.KOKORO_API_KEY else: # force: TOML > CLI endpoint = config.KOKORO_ENDPOINT or kokoro_ep or args.endpoint api_key = config.KOKORO_API_KEY or kokoro_key or args.api_key if not endpoint: logging.getLogger(__name__).error( "Please provide --endpoint or set kokoro.endpoint in config.toml" ) sys.exit(2) client = KokoroHttpClient( endpoint, api_key=api_key, voice=args.voice, model=args.model, ) try: client.synthesize_from_srt( srt_path=args.srt, out_wav=args.out, video=args.video, align=args.align, keep_chunks=args.keep_chunks, mix_with_original=args.mix_with_original, mix_background_volume=args.mix_background_volume, ) logging.getLogger(__name__).info("Output written to: %s", args.out) except Exception: logging.getLogger(__name__).exception("Error synthesizing from SRT") sys.exit(1) if __name__ == "__main__": main()