# submaster/whisper_project/srt_to_kokoro.py

"""Small CLI shim for SRT -> Kokoro synthesis.

This file provides: parse_srt_file, synth_chunk (thin wrappers) and a
CLI entrypoint that uses `whisper_project.config` (config.toml) and CLI
flags. It intentionally does NOT read environment variables.
"""

from __future__ import annotations

from typing import Any
import argparse
import logging
import sys

from whisper_project.infra.kokoro_utils import (
    parse_srt_file as _parse_srt_file,
    synth_chunk as _synth_chunk,
)
from whisper_project.infra.kokoro_adapter import KokoroHttpClient
from whisper_project import config


def parse_srt_file(path: str):
    """Parse the .srt file at ``path`` and return its subtitles.

    This is a thin pass-through kept for import compatibility: the actual
    parsing is done by `whisper_project.infra.kokoro_utils.parse_srt_file`,
    and its result is returned unchanged.
    """
    subtitles = _parse_srt_file(path)
    return subtitles


def synth_chunk(
    endpoint: str,
    text: str,
    headers: dict,
    payload_template: Any,
    timeout: int = 60,
) -> bytes:
    """Synthesize ``text`` through ``endpoint`` and return the audio bytes.

    Thin pass-through to `whisper_project.infra.kokoro_utils.synth_chunk`:
    the first four arguments are forwarded positionally, ``timeout`` is
    forwarded by keyword, and the helper's result is returned unchanged.
    """
    audio = _synth_chunk(
        endpoint,
        text,
        headers,
        payload_template,
        timeout=timeout,
    )
    return audio


def synthesize_from_srt(srt_path: str, out_wav: str, endpoint: str = "", api_key: str = ""):
    """Legacy entry-point name retained for scripts that still import it.

    No synthesis happens here and none of the arguments are used; the
    function exists only to point callers at the canonical implementation,
    `KokoroHttpClient.synthesize_from_srt`.

    Raises:
        NotImplementedError: always.
    """
    message = "Use KokoroHttpClient.synthesize_from_srt or the infra adapter"
    raise NotImplementedError(message)


def _build_arg_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser()
    p.add_argument("--srt", required=True, help="Path to input .srt file")
    p.add_argument("--endpoint", required=False, help="Direct synthesis endpoint (optional)")
    p.add_argument(
        "--api-key",
        required=False,
        help=(
            "API key for Authorization header; if omitted the value from"
            " config.toml is used"
        ),
    )
    p.add_argument("--voice", default="em_alex")
    p.add_argument("--model", default="model")
    p.add_argument("--out", required=True, help="Output WAV path")
    p.add_argument("--video", required=False, help="Optional original video path to mix or align with")
    p.add_argument("--align", action="store_true", help="Align segments using SRT timestamps")
    p.add_argument("--keep-chunks", action="store_true")
    p.add_argument("--mix-with-original", action="store_true")
    p.add_argument("--mix-background-volume", type=float, default=0.2)
    p.add_argument("--replace-original", action="store_true")
    p.add_argument(
        "--config-mode",
        choices=["defaults", "override-env", "force"],
        default="override-env",
        help=(
            "Configuration precedence: 'defaults' = CLI > TOML; "
            "'override-env' = CLI > TOML; 'force' = TOML > CLI"
        ),
    )
    return p


def main() -> None:
    """CLI entry point: resolve settings and synthesize audio from an SRT.

    The endpoint and API key come only from CLI flags and
    ``whisper_project.config``. ``--config-mode force`` prefers the config
    value over the CLI flag; every other mode prefers the CLI flag.

    Exits with status 2 when no endpoint can be resolved and with status 1
    when synthesis raises.
    """
    args = _build_arg_parser().parse_args()

    # Without a configured handler the INFO-level success message below is
    # dropped (only WARNING and above reach logging's last-resort handler).
    # basicConfig does nothing if the root logger already has handlers.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    if args.config_mode == "force":
        # force: TOML > CLI
        endpoint = config.KOKORO_ENDPOINT or args.endpoint
        api_key = config.KOKORO_API_KEY or args.api_key
    else:
        # defaults / override-env: CLI > TOML
        endpoint = args.endpoint or config.KOKORO_ENDPOINT
        api_key = args.api_key or config.KOKORO_API_KEY

    if not endpoint:
        logger.error(
            "Please provide --endpoint or set kokoro.endpoint in config.toml"
        )
        sys.exit(2)

    if args.replace_original:
        # The flag is parsed but not forwarded to the client below; say so
        # instead of ignoring it silently.
        # NOTE(review): forward it if KokoroHttpClient.synthesize_from_srt
        # supports a matching option - confirm against the adapter.
        logger.warning(
            "--replace-original is not forwarded by this CLI and will be ignored"
        )

    client = KokoroHttpClient(
        endpoint,
        api_key=api_key,
        voice=args.voice,
        model=args.model,
    )

    try:
        client.synthesize_from_srt(
            srt_path=args.srt,
            out_wav=args.out,
            video=args.video,
            align=args.align,
            keep_chunks=args.keep_chunks,
            mix_with_original=args.mix_with_original,
            mix_background_volume=args.mix_background_volume,
        )
    except Exception:
        # Top-level boundary: log the traceback and report failure via the
        # process exit status.
        logger.exception("Error synthesizing from SRT")
        sys.exit(1)
    else:
        logger.info("Output written to: %s", args.out)


if __name__ == "__main__":
    main()