submaster/whisper_project/srt_to_kokoro.py
2025-10-23 21:54:13 -07:00

#!/usr/bin/env python3
"""
srt_to_kokoro.py
Read an .srt file and synthesize each subtitle through an OpenAPI-compatible API (e.g. Kokoro).
- Tries to auto-detect a synthesis endpoint from `--openapi` (a JSON URL) by looking for POST paths containing 'synth'|'tts'|'text'.
- Alternatively, use `--endpoint` together with a `--payload-template` that contains {text} as a placeholder.
- Saves temporary audio chunks and concatenates them with ffmpeg into a single output WAV.
Dependencies: requests, srt (pip install requests srt)
Requires ffmpeg on PATH.
Examples:
python srt_to_kokoro.py --srt subs.srt --openapi "https://kokoro.../openapi.json" --voice "alloy" --out out.wav --api-key "TOKEN"
python srt_to_kokoro.py --srt subs.srt --endpoint "https://kokoro.../v1/synthesize" --payload-template '{"text": "{text}", "voice": "alloy"}' --out out.wav
"""
import argparse
import base64
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
from typing import Optional

try:
    import requests
except Exception:
    print("This script requires the 'requests' library. Install it with: pip install requests")
    raise
try:
    import srt
except Exception:
    print("This script requires the 'srt' library. Install it with: pip install srt")
    raise


def find_synthesis_endpoint(openapi_url: str) -> Optional[str]:
    """Heuristic attempt: fetch openapi.json and look for POST paths containing 'synth'|'tts'|'text'."""
    try:
        r = requests.get(openapi_url, timeout=20)
        r.raise_for_status()
        spec = r.json()
    except Exception as e:
        print(f"Could not read openapi.json from {openapi_url}: {e}")
        return None
    paths = spec.get("paths", {})
    candidate = None
    for path, methods in paths.items():
        lname = path.lower()
        if any(k in lname for k in ("synth", "tts", "text", "synthesize")):
            for method, op in methods.items():
                if method.lower() == "post":
                    # candidate found
                    candidate = path
                    break
        if candidate:
            break
    if not candidate:
        # fallback: scan operationId / summary metadata for the same keywords
        for path, methods in paths.items():
            for method, op in methods.items():
                meta = json.dumps(op).lower()
                if any(k in meta for k in ("synth", "tts", "text", "synthesize")) and method.lower() == "post":
                    candidate = path
                    break
            if candidate:
                break
    if not candidate:
        return None
    # Build the base URL from openapi_url
    from urllib.parse import urlparse, urljoin
    p = urlparse(openapi_url)
    base = f"{p.scheme}://{p.netloc}"
    return urljoin(base, candidate)


def parse_srt_file(path: str):
    with open(path, "r", encoding="utf-8") as f:
        raw = f.read()
    subs = list(srt.parse(raw))
    return subs


def synth_chunk(endpoint: str, text: str, headers: dict, payload_template: Optional[str], timeout=60):
    """Send the request and return audio bytes. Handles audio/* responses or JSON with a base64 field."""
    # Build the payload
    if payload_template:
        # JSON-escape the text before substituting so quotes/newlines don't break the template
        body = payload_template.replace("{text}", json.dumps(text)[1:-1])
        try:
            json_body = json.loads(body)
        except Exception:
            # fall back to a plain {"text": ...} body
            json_body = {"text": text}
    else:
        json_body = {"text": text}
    # Perform the POST
    r = requests.post(endpoint, json=json_body, headers=headers, timeout=timeout)
    r.raise_for_status()
    ctype = r.headers.get("Content-Type", "")
    if ctype.startswith("audio/"):
        return r.content
    # If the response is JSON with a base64-encoded field
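    # Example of a JSON response shape this fallback can handle (hypothetical; the actual
    # field name depends on the server):
    #   {"audio": "<base64-encoded WAV bytes>"}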
    try:
        j = r.json()
        # look for fields named 'audio', 'wav', 'data' or 'base64'
        for k in ("audio", "wav", "data", "base64"):
            if k in j:
                val = j[k]
                try:
                    # assume base64-encoded audio
                    return base64.b64decode(val)
                except Exception:
                    # maybe it is already raw bytes, hex, or something else
                    pass
    except Exception:
        pass
    # Fallback: return the raw response bytes
    return r.content


def ensure_ffmpeg():
    if shutil.which("ffmpeg") is None:
        print("ffmpeg is not available on PATH. Install it to be able to concatenate/convert audio.")
        sys.exit(1)


def convert_and_save(raw_bytes: bytes, target_path: str):
    """Save bytes to a temporary file and convert them to WAV PCM 22050 Hz mono 16-bit using ffmpeg."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".bin") as tmp:
        tmp.write(raw_bytes)
        tmp.flush()
        tmp_path = tmp.name
    # Convert with ffmpeg to WAV, 22050 Hz, mono, 16-bit
    cmd = [
        "ffmpeg", "-y", "-i", tmp_path,
        "-ar", "22050", "-ac", "1", "-sample_fmt", "s16", target_path
    ]
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError as e:
        print(f"ffmpeg failed to convert chunk: {e}")
        # as a fallback, write the raw bytes as-is
        with open(target_path, "wb") as out:
            out.write(raw_bytes)
    finally:
        try:
            os.remove(tmp_path)
        except Exception:
            pass


def create_silence(duration: float, out_path: str, sr: int = 22050):
    """Create a silent WAV of the given duration (seconds) at sample rate sr and save it to out_path."""
    # use ffmpeg's anullsrc source
    cmd = [
        "ffmpeg",
        "-y",
        "-f",
        "lavfi",
        "-i",
        f"anullsrc=channel_layout=mono:sample_rate={sr}",
        "-t",
        f"{duration}",
        "-c:a",
        "pcm_s16le",
        out_path,
    ]
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        # fallback: write the silence ourselves as a minimal valid 16-bit mono WAV
        try:
            import wave
            with wave.open(out_path, "wb") as fh:
                fh.setnchannels(1)
                fh.setsampwidth(2)  # 16-bit samples
                fh.setframerate(sr)
                fh.writeframes(b"\x00\x00" * int(sr * duration))
        except Exception:
            pass


def pad_or_trim_wav(in_path: str, out_path: str, target_duration: float, sr: int = 22050):
    """Pad with silence or trim the input WAV to match target_duration (seconds)."""
    # get the current duration with ffprobe
    try:
        p = subprocess.run([
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            in_path,
        ], capture_output=True, text=True)
        cur = float(p.stdout.strip())
    except Exception:
        cur = 0.0
    if cur == 0.0:
        shutil.copy(in_path, out_path)
        return
    if abs(cur - target_duration) < 0.02:
        shutil.copy(in_path, out_path)
        return
    if cur > target_duration:
        cmd = ["ffmpeg", "-y", "-i", in_path, "-t", f"{target_duration}", out_path]
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return
    # pad: create silence for the missing duration and concatenate it after the input
    pad = target_duration - cur
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as sil:
        sil_path = sil.name
    try:
        create_silence(pad, sil_path, sr=sr)
        # concat in_path + sil_path via the concat demuxer
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as listf:
            listf.write(f"file '{os.path.abspath(in_path)}'\n")
            listf.write(f"file '{os.path.abspath(sil_path)}'\n")
            listname = listf.name
        cmd2 = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listname, "-c", "copy", out_path]
        subprocess.run(cmd2, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    finally:
        try:
            os.remove(sil_path)
        except Exception:
            pass
        try:
            os.remove(listname)
        except Exception:
            pass
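

# Note: the ffmpeg concat demuxer with `-c copy` (used below and in pad_or_trim_wav) only
# works when all inputs share the same codec, sample rate and channel layout, which is why
# every chunk and every generated silence is normalized to PCM s16 / 22050 Hz / mono first.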
def concat_chunks(chunks: list, out_path: str):
    # Build the file list for the ffmpeg concat demuxer
    ensure_ffmpeg()
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as listf:
        for c in chunks:
            listf.write(f"file '{os.path.abspath(c)}'\n")
        listname = listf.name
    cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listname, "-c", "copy", out_path]
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError:
        # fallback: concatenate through the concat protocol instead
        tmp_concat = out_path + ".tmp.wav"
        cmd2 = ["ffmpeg", "-y", "-i", f"concat:{'|'.join(chunks)}", "-c", "copy", tmp_concat]
        subprocess.run(cmd2)
        shutil.move(tmp_concat, out_path)
    finally:
        try:
            os.remove(listname)
        except Exception:
            pass


def main():
    p = argparse.ArgumentParser()
    p.add_argument("--srt", required=True, help="Path to the translated .srt file")
    p.add_argument("--openapi", required=False, help="URL of Kokoro's openapi.json (the script tries to auto-detect the endpoint)")
    p.add_argument("--endpoint", required=False, help="Direct URL of the synthesis endpoint (use this if auto-detection fails)")
    p.add_argument(
        "--payload-template",
        required=False,
        help='JSON template for the payload with {text} as placeholder, e.g.: "{\"text\": \"{text}\", \"voice\": \"alloy\"}"',
    )
    p.add_argument("--api-key", required=False, help="Authorization value (sent as header Authorization: Bearer <key>)")
    p.add_argument("--voice", required=False, help="Voice name, if applicable (include it directly in --payload-template; the script does not inject it)")
    p.add_argument("--out", required=True, help="Path of the final output WAV")
    p.add_argument(
        "--video",
        required=False,
        help="Path to the original video (needed if you want to mix the audio with the original track).",
    )
    p.add_argument(
        "--mix-with-original",
        action="store_true",
        help="Mix the generated WAV with the video's original audio track (requires --video).",
    )
    p.add_argument(
        "--mix-background-volume",
        type=float,
        default=0.2,
        help="Volume of the original track when mixing (0.0-1.0), default 0.2",
    )
    p.add_argument(
        "--replace-original",
        action="store_true",
        help="Replace the video's audio track with the generated WAV (requires --video).",
    )
    p.add_argument(
        "--align",
        action="store_true",
        help="Generate silences to align segments with the SRT timestamps (inserts gaps between segments).",
    )
    p.add_argument(
        "--keep-chunks",
        action="store_true",
        help="Keep the per-segment WAV files in the temporary directory (useful for debugging).",
    )
    args = p.parse_args()

    headers = {"Accept": "*/*"}
    if args.api_key:
        headers["Authorization"] = f"Bearer {args.api_key}"
    endpoint = args.endpoint
    if not endpoint and args.openapi:
        print("Trying to detect the endpoint from openapi.json...")
        endpoint = find_synthesis_endpoint(args.openapi)
        if endpoint:
            print(f"Using detected endpoint: {endpoint}")
        else:
            print("Could not detect an endpoint automatically. Pass --endpoint or --payload-template.")
            sys.exit(1)
    if not endpoint:
        print("You must provide --endpoint or --openapi for the script to work.")
        sys.exit(1)

    subs = parse_srt_file(args.srt)
    tmpdir = tempfile.mkdtemp(prefix="srt_kokoro_")
    chunk_files = []
    print(f"Synthesizing {len(subs)} segments...")
    prev_end = 0.0
    for i, sub in enumerate(subs, start=1):
        text = re.sub(r"\s+", " ", sub.content.strip())
        if not text:
            prev_end = sub.end.total_seconds()
            continue
        start_sec = sub.start.total_seconds()
        end_sec = sub.end.total_seconds()
        duration = end_sec - start_sec
        # if alignment was requested, insert silence for the gap between the previous end and the current start
        if args.align:
            gap = start_sec - prev_end
            if gap > 0.01:
                sil_target = os.path.join(tmpdir, f"sil_{i:04d}.wav")
                create_silence(gap, sil_target)
                chunk_files.append(sil_target)
        try:
            raw = synth_chunk(endpoint, text, headers, args.payload_template)
        except Exception as e:
            print(f"Error synthesizing segment {i}: {e}")
            prev_end = end_sec
            continue
        target = os.path.join(tmpdir, f"chunk_{i:04d}.wav")
        convert_and_save(raw, target)
        # If aligning: pad or trim to the subtitle duration, otherwise keep the raw chunk
        if args.align:
            aligned = os.path.join(tmpdir, f"chunk_{i:04d}.aligned.wav")
            pad_or_trim_wav(target, aligned, duration)
            # use the aligned file in the list instead of the raw chunk
            chunk_files.append(aligned)
            # remove the original raw chunk unless --keep-chunks was given
            if not args.keep_chunks:
                try:
                    os.remove(target)
                except Exception:
                    pass
        else:
            chunk_files.append(target)
        prev_end = end_sec
        print(f" - Segment {i}/{len(subs)} -> {os.path.basename(chunk_files[-1])}")

    if not chunk_files:
        print("No audio chunks were generated. Aborting.")
        shutil.rmtree(tmpdir, ignore_errors=True)
        sys.exit(1)
    print("Concatenating chunks...")
    concat_chunks(chunk_files, args.out)
    print(f"Final file written to: {args.out}")

    # If the user asked to mix with the video's original audio track
    if args.mix_with_original:
        if not args.video:
            print("--mix-with-original requires --video with the path to the original video.")
        else:
            # extract the original video's audio to a temporary WAV (mono, 22050 Hz)
            orig_tmp = os.path.join(tempfile.gettempdir(), f"orig_audio_{os.getpid()}.wav")
            mixed_tmp = os.path.join(tempfile.gettempdir(), f"mixed_audio_{os.getpid()}.wav")
            try:
                cmd_ext = [
                    "ffmpeg",
                    "-y",
                    "-i",
                    args.video,
                    "-vn",
                    "-ar",
                    "22050",
                    "-ac",
                    "1",
                    "-sample_fmt",
                    "s16",
                    orig_tmp,
                ]
                subprocess.run(cmd_ext, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                # Mix: the new audio (args.out) in the foreground, the original at reduced volume
                vol = float(args.mix_background_volume)
                # filter graph: [0:a]volume=1[a1];[1:a]volume=<vol>[a0];[a1][a0]amix=inputs=2:duration=first[mix]
                cmd_mix = [
                    "ffmpeg",
                    "-y",
                    "-i",
                    args.out,
                    "-i",
                    orig_tmp,
                    "-filter_complex",
                    f"[0:a]volume=1[a1];[1:a]volume={vol}[a0];[a1][a0]amix=inputs=2:duration=first:dropout_transition=0[mix]",
                    "-map",
                    "[mix]",
                    "-c:a",
                    "pcm_s16le",
                    mixed_tmp,
                ]
                subprocess.run(cmd_mix, check=True)
                # replace args.out with the mixed file
                shutil.move(mixed_tmp, args.out)
                print(f"Mixed file written to: {args.out}")
            except subprocess.CalledProcessError as e:
                print(f"Error mixing audio with the original track: {e}")
            finally:
                try:
                    if os.path.exists(orig_tmp):
                        os.remove(orig_tmp)
                except Exception:
                    pass

    # If replacing the original audio track in the video was requested
    if args.replace_original:
        if not args.video:
            print("--replace-original requires --video with the path to the original video.")
        else:
            out_video = os.path.splitext(args.video)[0] + ".replaced_audio.mp4"
            try:
                cmd_rep = [
                    "ffmpeg",
                    "-y",
                    "-i",
                    args.video,
                    "-i",
                    args.out,
                    "-map",
                    "0:v:0",
                    "-map",
                    "1:a:0",
                    "-c:v",
                    "copy",
                    "-c:a",
                    "aac",
                    "-b:a",
                    "192k",
                    out_video,
                ]
                subprocess.run(cmd_rep, check=True)
                print(f"Video with replaced audio written to: {out_video}")
            except subprocess.CalledProcessError as e:
                print(f"Error replacing audio in the video: {e}")

    # cleanup
    shutil.rmtree(tmpdir, ignore_errors=True)


if __name__ == '__main__':
    main()