#!/usr/bin/env python3
"""
srt_to_kokoro.py

Read an .srt file and synthesize each subtitle using an OpenAPI-compatible API (e.g. Kokoro).

- Tries to auto-detect a synthesis endpoint from `--openapi` (JSON URL) by looking for paths
  containing 'synth'|'tts'|'text' that accept POST.
- Alternatively, use `--endpoint` together with a `--payload-template` that contains {text}
  as a placeholder.
- Saves temporary audio fragments and concatenates them with ffmpeg into a single output WAV.

Dependencies: requests, srt (pip install requests srt)
Requires ffmpeg on PATH.

Examples:
    python srt_to_kokoro.py --srt subs.srt --openapi "https://kokoro.../openapi.json" --voice "alloy" --out out.wav --api-key "TOKEN"
    python srt_to_kokoro.py --srt subs.srt --endpoint "https://kokoro.../v1/synthesize" --payload-template '{"text": "{text}", "voice": "alloy"}' --out out.wav
"""

import argparse
import base64
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
from typing import Optional
from urllib.parse import urljoin, urlparse

try:
    import requests
except Exception:
    print("This script requires the 'requests' library. Install it with: pip install requests")
    raise

try:
    import srt
except Exception:
    print("This script requires the 'srt' library. Install it with: pip install srt")
    raise


def find_synthesis_endpoint(openapi_url: str) -> Optional[str]:
    """Heuristic: download openapi.json and look for POST paths containing 'synth'|'tts'|'text'."""
    try:
        r = requests.get(openapi_url, timeout=20)
        r.raise_for_status()
        spec = r.json()
    except Exception as e:
        print(f"Could not read openapi.json from {openapi_url}: {e}")
        return None

    paths = spec.get("paths", {})
    candidate = None

    # First pass: match on the path name itself.
    for path, methods in paths.items():
        lname = path.lower()
        if any(k in lname for k in ("synth", "tts", "text", "synthesize")):
            for method in methods:
                if method.lower() == "post":
                    candidate = path
                    break
        if candidate:
            break

    if not candidate:
        # Fallback: scan the whole operation object (operationId, summary, etc.).
        for path, methods in paths.items():
            for method, op in methods.items():
                meta = json.dumps(op).lower()
                if method.lower() == "post" and any(k in meta for k in ("synth", "tts", "text", "synthesize")):
                    candidate = path
                    break
            if candidate:
                break

    if not candidate:
        return None

    # Build the base URL from openapi_url and join it with the detected path.
    p = urlparse(openapi_url)
    base = f"{p.scheme}://{p.netloc}"
    return urljoin(base, candidate)


def parse_srt_file(path: str):
    with open(path, "r", encoding="utf-8") as f:
        raw = f.read()
    return list(srt.parse(raw))

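
# NOTE: the default payload built in synth_chunk() below is just {"text": "..."} (plus "voice"
# when --voice is given). Many OpenAI-compatible speech servers instead expect fields such as
# "input", "voice" and "response_format"; those names are an assumption and must be checked
# against the actual API's documentation or its openapi.json. A hypothetical template for such
# a server could look like:
#
#   --payload-template '{"input": "{text}", "voice": "alloy", "response_format": "wav"}'
#
# The literal {text} placeholder is substituted with each subtitle's text before sending.
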
def synth_chunk(endpoint: str, text: str, headers: dict, payload_template: Optional[str],
                voice: Optional[str] = None, timeout: int = 60) -> bytes:
    """Send the synthesis request and return audio bytes.

    Handles audio/* responses as well as JSON responses carrying a base64 field."""
    # Build the payload.
    if payload_template:
        body = payload_template.replace("{text}", text)
        try:
            json_body = json.loads(body)
        except Exception:
            # The template is not valid JSON after substitution; fall back to a plain text payload.
            json_body = {"text": text}
    else:
        json_body = {"text": text}

    # Add the voice name if one was given and the template did not already set it.
    if voice and isinstance(json_body, dict) and "voice" not in json_body:
        json_body["voice"] = voice

    # POST the request.
    r = requests.post(endpoint, json=json_body, headers=headers, timeout=timeout)
    r.raise_for_status()

    ctype = r.headers.get("Content-Type", "")
    if ctype.startswith("audio/"):
        return r.content

    # JSON response with base64-encoded audio.
    try:
        j = r.json()
        # Look for common field names that may carry the audio payload.
        for k in ("audio", "wav", "data", "base64"):
            if isinstance(j, dict) and k in j:
                val = j[k]
                try:
                    return base64.b64decode(val)
                except Exception:
                    # Maybe it is already raw bytes, hex, or something else; keep looking.
                    pass
    except Exception:
        pass

    # Fallback: return the raw response bytes.
    return r.content


def ensure_ffmpeg():
    if shutil.which("ffmpeg") is None:
        print("ffmpeg is not available on PATH. Install it to be able to convert/concatenate audio.")
        sys.exit(1)


def convert_and_save(raw_bytes: bytes, target_path: str):
    """Save bytes to a temporary file and convert them to 22050 Hz mono 16-bit PCM WAV with ffmpeg."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".bin") as tmp:
        tmp.write(raw_bytes)
        tmp.flush()
        tmp_path = tmp.name
    cmd = [
        "ffmpeg", "-y", "-i", tmp_path,
        "-ar", "22050", "-ac", "1", "-sample_fmt", "s16",
        target_path,
    ]
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError as e:
        print(f"ffmpeg failed to convert chunk: {e}")
        # As a fallback, write the raw bytes as-is.
        with open(target_path, "wb") as out:
            out.write(raw_bytes)
    finally:
        try:
            os.remove(tmp_path)
        except Exception:
            pass


def create_silence(duration: float, out_path: str, sr: int = 22050):
    """Create a silent WAV of the given duration (seconds) at sample rate sr and save it to out_path."""
    # Use ffmpeg's anullsrc source.
    cmd = [
        "ffmpeg", "-y",
        "-f", "lavfi", "-i", f"anullsrc=channel_layout=mono:sample_rate={sr}",
        "-t", f"{duration}",
        "-c:a", "pcm_s16le",
        out_path,
    ]
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        # Fallback: write a tiny block of zero bytes so later steps do not fail outright.
        try:
            with open(out_path, "wb") as fh:
                fh.write(b"\x00" * 1024)
        except Exception:
            pass


def pad_or_trim_wav(in_path: str, out_path: str, target_duration: float, sr: int = 22050):
    """Pad with silence or trim the input WAV so it matches target_duration (seconds)."""
    # Measure the current duration with ffprobe.
    try:
        p = subprocess.run(
            [
                "ffprobe", "-v", "error",
                "-show_entries", "format=duration",
                "-of", "default=noprint_wrappers=1:nokey=1",
                in_path,
            ],
            capture_output=True, text=True,
        )
        cur = float(p.stdout.strip())
    except Exception:
        cur = 0.0

    if cur == 0.0:
        shutil.copy(in_path, out_path)
        return
    if abs(cur - target_duration) < 0.02:
        # Close enough; keep the chunk unchanged.
        shutil.copy(in_path, out_path)
        return
    if cur > target_duration:
        # Too long: trim to the subtitle's duration.
        cmd = ["ffmpeg", "-y", "-i", in_path, "-t", f"{target_duration}", out_path]
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return

    # Too short: create the missing silence and concatenate it after the chunk.
    pad = target_duration - cur
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as sil:
        sil_path = sil.name
    listname = None
    try:
        create_silence(pad, sil_path, sr=sr)
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as listf:
            listf.write(f"file '{os.path.abspath(in_path)}'\n")
            listf.write(f"file '{os.path.abspath(sil_path)}'\n")
            listname = listf.name
        cmd2 = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listname, "-c", "copy", out_path]
        subprocess.run(cmd2, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    finally:
        for tmp_file in (sil_path, listname):
            if tmp_file:
                try:
                    os.remove(tmp_file)
                except Exception:
                    pass

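
# The ffmpeg concat demuxer used below reads a list file of the form:
#
#   file '/abs/path/chunk_0001.wav'
#   file '/abs/path/sil_0002.wav'
#
# Stream-copying ("-c copy") only works when every listed file shares the same codec, sample
# rate and channel layout, which is why each chunk is normalized to 22050 Hz mono 16-bit PCM
# in convert_and_save() / create_silence() first.
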
def concat_chunks(chunks: list, out_path: str):
    ensure_ffmpeg()
    # Write the list file for the ffmpeg concat demuxer.
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as listf:
        for c in chunks:
            listf.write(f"file '{os.path.abspath(c)}'\n")
        listname = listf.name
    cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listname, "-c", "copy", out_path]
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError:
        # Fallback: re-encode instead of stream-copying, which tolerates small format mismatches.
        cmd2 = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listname, "-c:a", "pcm_s16le", out_path]
        subprocess.run(cmd2, check=True)
    finally:
        try:
            os.remove(listname)
        except Exception:
            pass


def main():
    p = argparse.ArgumentParser()
    p.add_argument("--srt", required=True, help="Path to the translated .srt file")
    p.add_argument("--openapi", required=False, help="URL of Kokoro's openapi.json (the script tries to auto-detect the endpoint)")
    p.add_argument("--endpoint", required=False, help="Direct URL of the synthesis endpoint (use this if auto-detection fails)")
    p.add_argument(
        "--payload-template",
        required=False,
        help='JSON template for the payload with {text} as a placeholder, e.g.: "{\"text\": \"{text}\", \"voice\": \"alloy\"}"',
    )
    p.add_argument("--api-key", required=False, help="Authorization value (sent as an 'Authorization: Bearer' header)")
    p.add_argument("--voice", required=False, help="Voice name, if applicable (added to the payload unless the template already sets one)")
    p.add_argument("--out", required=True, help="Path of the final output WAV")
    p.add_argument(
        "--video",
        required=False,
        help="Path to the original video (needed if you want to mix with or replace its audio track).",
    )
    p.add_argument(
        "--mix-with-original",
        action="store_true",
        help="Mix the generated WAV with the video's original audio track (requires --video).",
    )
    p.add_argument(
        "--mix-background-volume",
        type=float,
        default=0.2,
        help="Volume of the original track when mixing (0.0-1.0), default 0.2",
    )
    p.add_argument(
        "--replace-original",
        action="store_true",
        help="Replace the original video's audio track with the generated WAV (requires --video).",
    )
    p.add_argument(
        "--align",
        action="store_true",
        help="Insert silences so segments line up with the SRT timestamps (adds gaps between segments).",
    )
    p.add_argument(
        "--keep-chunks",
        action="store_true",
        help="Keep the per-segment WAVs in the temporary directory (useful for debugging).",
    )
    args = p.parse_args()

    headers = {"Accept": "*/*"}
    if args.api_key:
        headers["Authorization"] = f"Bearer {args.api_key}"

    endpoint = args.endpoint
    if not endpoint and args.openapi:
        print("Trying to detect the endpoint from openapi.json...")
        endpoint = find_synthesis_endpoint(args.openapi)
        if endpoint:
            print(f"Using detected endpoint: {endpoint}")
        else:
            print("Could not auto-detect an endpoint. Pass --endpoint explicitly.")
            sys.exit(1)

    if not endpoint:
        print("You must provide --endpoint or --openapi for the script to work.")
        sys.exit(1)

    subs = parse_srt_file(args.srt)
    tmpdir = tempfile.mkdtemp(prefix="srt_kokoro_")
    chunk_files = []
    print(f"Synthesizing {len(subs)} segments...")

    prev_end = 0.0
    for i, sub in enumerate(subs, start=1):
        text = re.sub(r"\s+", " ", sub.content.strip())
        if not text:
            prev_end = sub.end.total_seconds()
            continue
        start_sec = sub.start.total_seconds()
        end_sec = sub.end.total_seconds()
        duration = end_sec - start_sec

        # If alignment is requested, insert silence for the gap between the previous end and this start.
        if args.align:
            gap = start_sec - prev_end
            if gap > 0.01:
                sil_target = os.path.join(tmpdir, f"sil_{i:04d}.wav")
                create_silence(gap, sil_target)
                chunk_files.append(sil_target)

        try:
            raw = synth_chunk(endpoint, text, headers, args.payload_template, voice=args.voice)
        except Exception as e:
            print(f"Error synthesizing segment {i}: {e}")
            prev_end = end_sec
            continue

        target = os.path.join(tmpdir, f"chunk_{i:04d}.wav")
        convert_and_save(raw, target)

        # If aligning, pad or trim the chunk to the subtitle's duration; otherwise keep it as-is.
        if args.align:
            aligned = os.path.join(tmpdir, f"chunk_{i:04d}.aligned.wav")
            pad_or_trim_wav(target, aligned, duration)
            chunk_files.append(aligned)
            # Remove the unaligned chunk unless --keep-chunks was given.
            if not args.keep_chunks:
                try:
                    os.remove(target)
                except Exception:
                    pass
        else:
            chunk_files.append(target)

        prev_end = end_sec
        print(f" - Segment {i}/{len(subs)} -> {os.path.basename(chunk_files[-1])}")

    if not chunk_files:
        print("No audio fragments were generated. Aborting.")
        shutil.rmtree(tmpdir, ignore_errors=True)
        sys.exit(1)

    print("Concatenating fragments...")
    concat_chunks(chunk_files, args.out)
    print(f"Final file written to: {args.out}")

    # Mix the generated audio with the video's original track, if requested.
    if args.mix_with_original:
        if not args.video:
            print("--mix-with-original requires --video with the path to the original video.")
        else:
            # Extract the original audio to a temporary WAV (mono, 22050 Hz).
            orig_tmp = os.path.join(tempfile.gettempdir(), f"orig_audio_{os.getpid()}.wav")
            mixed_tmp = os.path.join(tempfile.gettempdir(), f"mixed_audio_{os.getpid()}.wav")
            try:
                cmd_ext = [
                    "ffmpeg", "-y", "-i", args.video,
                    "-vn", "-ar", "22050", "-ac", "1", "-sample_fmt", "s16",
                    orig_tmp,
                ]
                subprocess.run(cmd_ext, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

                # Mix: the new audio (args.out) in the foreground, the original at reduced volume.
                vol = float(args.mix_background_volume)
                filter_complex = (
                    f"[0:a]volume=1[a1];[1:a]volume={vol}[a0];"
                    f"[a1][a0]amix=inputs=2:duration=first:dropout_transition=0[mix]"
                )
                cmd_mix = [
                    "ffmpeg", "-y",
                    "-i", args.out,
                    "-i", orig_tmp,
                    "-filter_complex", filter_complex,
                    "-map", "[mix]",
                    "-c:a", "pcm_s16le",
                    mixed_tmp,
                ]
                subprocess.run(cmd_mix, check=True)

                # Replace args.out with the mixed file.
                shutil.move(mixed_tmp, args.out)
                print(f"Mixed file written to: {args.out}")
            except subprocess.CalledProcessError as e:
                print(f"Error mixing audio with the original track: {e}")
            finally:
                try:
                    if os.path.exists(orig_tmp):
                        os.remove(orig_tmp)
                except Exception:
                    pass

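    # Note (assumption): if the generated WAV is shorter or longer than the video, the remuxed
    # file's audio will simply end early or run past the last frame. Adding "-shortest" to the
    # ffmpeg command below is one way to clamp the output to the shorter stream if that matters.
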
    # Replace the original audio track in the video, if requested.
    if args.replace_original:
        if not args.video:
            print("--replace-original requires --video with the path to the original video.")
        else:
            out_video = os.path.splitext(args.video)[0] + ".replaced_audio.mp4"
            try:
                cmd_rep = [
                    "ffmpeg", "-y",
                    "-i", args.video,
                    "-i", args.out,
                    "-map", "0:v:0",
                    "-map", "1:a:0",
                    "-c:v", "copy",
                    "-c:a", "aac",
                    "-b:a", "192k",
                    out_video,
                ]
                subprocess.run(cmd_rep, check=True)
                print(f"Video with replaced audio written to: {out_video}")
            except subprocess.CalledProcessError as e:
                print(f"Error replacing the audio in the video: {e}")

    # Cleanup (skipped with --keep-chunks so the per-segment WAVs can be inspected).
    if args.keep_chunks:
        print(f"Keeping segment WAVs in: {tmpdir}")
    else:
        shutil.rmtree(tmpdir, ignore_errors=True)


if __name__ == "__main__":
    main()
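
# Example end-to-end run (paths, URLs and the voice name are placeholders, not verified values):
#
#   python srt_to_kokoro.py --srt subs_en.srt \
#       --openapi "https://kokoro.example/openapi.json" \
#       --voice "alloy" --align --out dub.wav \
#       --video original.mp4 --mix-with-original --mix-background-volume 0.15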