493 lines
17 KiB
Python
493 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
srt_to_kokoro.py
|
|
|
|
Leer un archivo .srt y sintetizar cada subtítulo usando una API OpenAPI-compatible (p. ej. Kokoro).
|
|
- Intenta autodetectar un endpoint de síntesis en `--openapi` (URL JSON) buscando paths que contengan 'synth'|'tts'|'text' y que acepten POST.
|
|
- Alternativamente usa `--endpoint` y un `--payload-template` con {text} como placeholder.
|
|
- Guarda fragmentos temporales y los concatena con ffmpeg en un único WAV de salida.
|
|
|
|
Dependencias: requests, srt (pip install requests srt)
|
|
Requiere ffmpeg en PATH.
|
|
|
|
Ejemplos:
|
|
python srt_to_kokoro.py --srt subs.srt --openapi "https://kokoro.../openapi.json" --voice "alloy" --out out.wav --api-key "TOKEN"
|
|
python srt_to_kokoro.py --srt subs.srt --endpoint "https://kokoro.../v1/synthesize" --payload-template '{"text": "{text}", "voice": "alloy"}' --out out.wav
|
|
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
from typing import Optional
|
|
|
|
try:
|
|
import requests
|
|
except Exception as e:
|
|
print("Este script requiere la librería 'requests'. Instálala con: pip install requests")
|
|
raise
|
|
|
|
try:
|
|
import srt
|
|
except Exception:
|
|
print("Este script requiere la librería 'srt'. Instálala con: pip install srt")
|
|
raise
|
|
|
|
|
|
def find_synthesis_endpoint(openapi_url: str) -> Optional[str]:
    """Guess the speech-synthesis endpoint described by an OpenAPI document.

    The JSON document at ``openapi_url`` is downloaded and its ``paths`` are
    scanned, in document order, for one that accepts POST and whose name
    contains 'synth', 'tts' or 'text'. If no path name matches, the
    serialized POST operation objects (operationId, summary, ...) are searched
    for the same keywords instead.

    Args:
        openapi_url: URL of the ``openapi.json`` document.

    Returns:
        The absolute URL of the first matching endpoint, or ``None`` when the
        document cannot be fetched or parsed, has no usable ``paths`` object,
        or contains no candidate.
    """
    from urllib.parse import urljoin, urlparse

    # "synthesize" is already covered by the "synth" substring.
    keywords = ("synth", "tts", "text")

    try:
        r = requests.get(openapi_url, timeout=20)
        r.raise_for_status()
        spec = r.json()
    except Exception as e:
        print(f"No pude leer openapi.json desde {openapi_url}: {e}")
        return None

    # A JSON document that is not an object (or lacks a "paths" object)
    # cannot describe any endpoint.
    paths = spec.get("paths") if isinstance(spec, dict) else None
    if not isinstance(paths, dict):
        return None

    # Collect (path, POST operation) pairs once, preserving document order.
    # Path items may also hold non-operation keys, so only "post" is matched.
    post_ops = []
    for path, item in paths.items():
        if not isinstance(item, dict):
            continue
        for method, op in item.items():
            if method.lower() == "post":
                post_ops.append((path, op))
                break

    def mentions_keyword(haystack: str) -> bool:
        haystack = haystack.lower()
        return any(k in haystack for k in keywords)

    # First pass: keyword in the path itself.
    candidate = next((path for path, _ in post_ops if mentions_keyword(path)), None)
    if candidate is None:
        # Fallback: keyword anywhere in the operation metadata.
        candidate = next(
            (path for path, op in post_ops if mentions_keyword(json.dumps(op))),
            None,
        )
    if candidate is None:
        return None

    # Default base is the origin of the OpenAPI document.
    parsed = urlparse(openapi_url)
    base = f"{parsed.scheme}://{parsed.netloc}"

    # Prefer the first declared server when it has no template variables;
    # relative server URLs are resolved against the document location.
    servers = spec.get("servers")
    if isinstance(servers, list) and servers and isinstance(servers[0], dict):
        server_url = servers[0].get("url")
        if isinstance(server_url, str) and server_url and "{" not in server_url:
            base = urljoin(openapi_url, server_url)

    # Plain concatenation keeps any path prefix carried by the base URL.
    return base.rstrip("/") + "/" + candidate.lstrip("/")
|
|
|
|
|
|
def parse_srt_file(path: str) -> list:
    """Read an SRT file and return its subtitles as a list.

    Args:
        path: Path to the ``.srt`` file, expected to be UTF-8 encoded
            (with or without a byte-order mark).

    Returns:
        The list of subtitle objects produced by ``srt.parse``.
    """
    # "utf-8-sig" decodes plain UTF-8 as well, but also strips a leading BOM
    # so that it does not end up in front of the first subtitle index.
    with open(path, "r", encoding="utf-8-sig") as f:
        raw = f.read()
    return list(srt.parse(raw))
|
|
|
|
|
|
def synth_chunk(endpoint: str, text: str, headers: dict, payload_template: Optional[str], timeout=60):
    """POST ``text`` to the synthesis endpoint and return the audio bytes.

    Args:
        endpoint: URL that receives the POST request.
        text: Text to synthesize.
        headers: HTTP headers sent with the request.
        payload_template: Optional JSON template in which ``{text}`` is
            replaced by ``text``. When it is empty, or is not valid JSON after
            substitution, the payload ``{"text": text}`` is sent instead.
        timeout: Request timeout in seconds.

    Returns:
        The response body when its Content-Type is ``audio/*``. Otherwise, if
        the body is a JSON object, the decoded base64 value of the first of
        the keys 'audio', 'wav', 'data', 'base64' that holds valid base64.
        Otherwise the raw response body.

    Raises:
        requests.RequestException: On connection errors or non-2xx responses
            (via ``raise_for_status``).
    """
    import base64
    import binascii

    json_body = {"text": text}
    if payload_template:
        # JSON-escape the text (dropping the surrounding quotes added by
        # json.dumps) so quotes or backslashes in a subtitle cannot break the
        # template and silently discard its other fields.
        escaped = json.dumps(text, ensure_ascii=False)[1:-1]
        try:
            json_body = json.loads(payload_template.replace("{text}", escaped))
        except ValueError:
            # Template is not valid JSON: fall back to the minimal payload.
            json_body = {"text": text}

    r = requests.post(endpoint, json=json_body, headers=headers, timeout=timeout)
    r.raise_for_status()

    ctype = r.headers.get("Content-Type", "")
    if ctype.lower().startswith("audio/"):
        return r.content

    # Not declared as audio: it may be JSON wrapping base64-encoded audio.
    try:
        j = r.json()
    except ValueError:
        return r.content

    if isinstance(j, dict):
        for k in ("audio", "wav", "data", "base64"):
            val = j.get(k)
            if not isinstance(val, str) or not val:
                continue
            # Accept "data:<mime>;base64,<payload>" URIs.
            if val.startswith("data:") and "," in val:
                val = val.split(",", 1)[1]
            # Line-wrapped base64 is legal; strip whitespace, then validate
            # strictly so arbitrary strings are not decoded into garbage.
            compact = "".join(val.split())
            try:
                return base64.b64decode(compact, validate=True)
            except (binascii.Error, ValueError):
                continue

    # Fallback: hand back the raw body and let the caller try to convert it.
    return r.content
|
|
|
|
|
|
def ensure_ffmpeg():
    """Terminate the program with exit status 1 if ``ffmpeg`` is not on PATH."""
    ffmpeg_location = shutil.which("ffmpeg")
    if ffmpeg_location is not None:
        return
    print("ffmpeg no está disponible en PATH. Instálalo para poder concatenar/convertir audios.")
    sys.exit(1)
|
|
|
|
|
|
def convert_and_save(raw_bytes: bytes, target_path: str, sr: int = 22050):
    """Convert an audio payload to a mono 16-bit WAV file using ffmpeg.

    The bytes are written to a temporary file which ffmpeg converts to
    ``target_path``. If ffmpeg fails or cannot be started, the bytes are
    written to ``target_path`` unchanged as a best-effort fallback.

    Args:
        raw_bytes: Audio data in any container/codec ffmpeg can read.
        target_path: Destination path of the WAV file.
        sr: Output sample rate in Hz (22050 by default).
    """
    # The context manager closes (and therefore flushes) the file before
    # ffmpeg opens it; delete=False keeps it on disk until removed below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".bin") as tmp:
        tmp.write(raw_bytes)
        tmp_path = tmp.name

    cmd = [
        "ffmpeg", "-y", "-i", tmp_path,
        "-ar", str(sr), "-ac", "1", "-sample_fmt", "s16", target_path,
    ]
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing/non-executable ffmpeg binary.
        print(f"ffmpeg falló al convertir chunk: {e}")
        # Fallback: store the bytes exactly as received.
        with open(target_path, "wb") as out:
            out.write(raw_bytes)
    finally:
        try:
            os.remove(tmp_path)
        except OSError:
            pass
|
|
|
|
|
|
def create_silence(duration: float, out_path: str, sr: int = 22050):
    """Write a silent mono 16-bit PCM WAV file.

    ffmpeg's ``anullsrc`` source is used when available. If ffmpeg fails,
    cannot be started, or ``duration`` is not positive, the file is written
    with the standard-library ``wave`` module instead, so the result is a
    valid WAV file in every case.

    Args:
        duration: Length of the silence in seconds; values <= 0 yield an
            empty (zero-frame) WAV file.
        out_path: Destination path.
        sr: Sample rate in Hz.
    """
    import wave

    duration = max(0.0, float(duration))

    if duration > 0:
        cmd = [
            "ffmpeg",
            "-y",
            "-f",
            "lavfi",
            "-i",
            f"anullsrc=channel_layout=mono:sample_rate={sr}",
            "-t",
            # Fixed-point formatting avoids scientific notation for tiny values.
            f"{duration:.6f}",
            "-c:a",
            "pcm_s16le",
            out_path,
        ]
        try:
            subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            return
        except (subprocess.CalledProcessError, OSError):
            # Fall through to the pure-Python writer below.
            pass

    frames_left = int(round(duration * sr))
    try:
        with wave.open(out_path, "wb") as wav:
            wav.setnchannels(1)
            wav.setsampwidth(2)  # 16-bit samples
            wav.setframerate(sr)
            # Write at most one second per call to keep memory use bounded.
            while frames_left > 0:
                n = min(frames_left, max(int(sr), 1))
                wav.writeframes(b"\x00\x00" * n)
                frames_left -= n
    except OSError as e:
        # Stay best-effort like the ffmpeg path, but do not hide the problem.
        print(f"No se pudo crear el silencio {out_path}: {e}")
|
|
|
|
|
|
def pad_or_trim_wav(in_path: str, out_path: str, target_duration: float, sr: int = 22050):
    """Write a copy of ``in_path`` whose length matches ``target_duration``.

    The current duration is read with ffprobe. The input is copied unchanged
    when the duration cannot be determined or is already within 20 ms of the
    target. Longer input is cut with ffmpeg ``-t``; shorter input is followed
    by generated silence and joined with the ffmpeg concat demuxer.

    Args:
        in_path: Source audio file.
        out_path: Destination file.
        target_duration: Desired duration in seconds.
        sr: Sample rate used for the generated silence.

    Raises:
        subprocess.CalledProcessError: If the ffmpeg trim or concat step fails.
    """
    try:
        probe = subprocess.run([
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            in_path,
        ], capture_output=True, text=True)
        cur = float(probe.stdout.strip())
    except (OSError, ValueError):
        # ffprobe missing, or its output is not a number (e.g. empty).
        cur = 0.0

    # Unknown duration, or close enough to the target: plain copy.
    if cur == 0.0 or abs(cur - target_duration) < 0.02:
        shutil.copy(in_path, out_path)
        return

    if cur > target_duration:
        cmd = ["ffmpeg", "-y", "-i", in_path, "-t", f"{target_duration}", out_path]
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return

    # Too short: append the missing amount of silence.
    pad = target_duration - cur
    # Pre-bind both names so the cleanup below never meets an unbound variable.
    sil_path = None
    listname = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as sil:
            sil_path = sil.name
        create_silence(pad, sil_path, sr=sr)

        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as listf:
            listname = listf.name
            for part in (in_path, sil_path):
                # Inside a single-quoted concat entry a quote is written '\''.
                escaped = os.path.abspath(part).replace("'", "'\\''")
                listf.write(f"file '{escaped}'\n")

        cmd2 = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listname, "-c", "copy", out_path]
        subprocess.run(cmd2, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    finally:
        for leftover in (sil_path, listname):
            if leftover is None:
                continue
            try:
                os.remove(leftover)
            except OSError:
                pass
|
|
|
|
|
|
def concat_chunks(chunks: list, out_path: str):
    """Join audio files into ``out_path`` with the ffmpeg concat demuxer.

    A stream copy is attempted first. If that fails, the same file list is
    decoded and re-encoded to 16-bit PCM.

    Args:
        chunks: Paths of the audio files, in playback order.
        out_path: Destination file.

    Raises:
        ValueError: If ``chunks`` is empty.
        subprocess.CalledProcessError: If both ffmpeg attempts fail.
    """
    ensure_ffmpeg()
    if not chunks:
        raise ValueError("concat_chunks requires at least one chunk")

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as listf:
        listname = listf.name
        for c in chunks:
            # Inside a single-quoted concat entry a quote is written '\''.
            escaped = os.path.abspath(c).replace("'", "'\\''")
            listf.write(f"file '{escaped}'\n")

    base_cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listname]
    try:
        try:
            subprocess.run(base_cmd + ["-c", "copy", out_path], check=True)
        except subprocess.CalledProcessError:
            # Fallback: re-encode instead of copying. The "concat:" protocol
            # is not used because it joins files byte-wise, which does not
            # produce a valid result for header-based formats such as WAV.
            subprocess.run(base_cmd + ["-c:a", "pcm_s16le", out_path], check=True)
    finally:
        try:
            os.remove(listname)
        except OSError:
            pass
|
|
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser of the script."""
    p = argparse.ArgumentParser()
    p.add_argument("--srt", required=True, help="Ruta al archivo .srt traducido")
    p.add_argument("--openapi", required=False, help="URL al openapi.json de Kokoro (intenta autodetectar endpoint)")
    p.add_argument("--endpoint", required=False, help="URL directa del endpoint de síntesis (usa esto si autodetección falla)")
    p.add_argument(
        "--payload-template",
        required=False,
        help='Plantilla JSON para el payload con {text} como placeholder, ejemplo: "{\"text\": \"{text}\", \"voice\": \"alloy\"}"',
    )
    p.add_argument("--api-key", required=False, help="Valor para autorización (se envía como header Authorization: Bearer <key>)")
    p.add_argument("--voice", required=False, help="Nombre de voz si aplica (se añade al payload si se usa template)")
    p.add_argument("--out", required=True, help="Ruta de salida WAV final")
    p.add_argument(
        "--video",
        required=False,
        help="Ruta al vídeo original (necesario si quieres mezclar el audio con la pista original).",
    )
    p.add_argument(
        "--mix-with-original",
        action="store_true",
        help="Mezclar el WAV generado con la pista de audio original del vídeo (usa --video).",
    )
    p.add_argument(
        "--mix-background-volume",
        type=float,
        default=0.2,
        help="Volumen de la pista original al mezclar (0.0-1.0), por defecto 0.2",
    )
    p.add_argument(
        "--replace-original",
        action="store_true",
        help="Reemplazar la pista de audio del vídeo original por el WAV generado (usa --video).",
    )
    p.add_argument(
        "--align",
        action="store_true",
        help="Generar silencios para alinear segmentos con los timestamps del SRT (inserta gaps entre segmentos).",
    )
    p.add_argument(
        "--keep-chunks",
        action="store_true",
        help="Conservar los WAV de cada segmento en el directorio temporal (útil para debugging).",
    )
    return p


def _apply_voice(payload_template, voice):
    """Return a payload template that carries ``voice``.

    Without a template, one with "text" and "voice" fields is generated. A
    template that parses as a JSON object gets a "voice" key unless it already
    has one. Any other template is returned unchanged.
    """
    if not voice:
        return payload_template
    if not payload_template:
        return json.dumps({"text": "{text}", "voice": voice}, ensure_ascii=False)
    try:
        parsed = json.loads(payload_template)
    except ValueError:
        return payload_template
    if isinstance(parsed, dict) and "voice" not in parsed:
        parsed["voice"] = voice
        return json.dumps(parsed, ensure_ascii=False)
    return payload_template


def _synthesize_segments(subs, endpoint, headers, payload_template, tmpdir, align, keep_chunks):
    """Synthesize each subtitle into ``tmpdir`` and return the files to join, in order."""
    chunk_files = []
    # Seconds of audio laid out so far; only meaningful when aligning. It is
    # advanced only by what is actually emitted, so skipped subtitles (empty
    # text, failed requests) are absorbed by the next silence gap instead of
    # shifting everything after them.
    timeline = 0.0
    total = len(subs)

    for i, sub in enumerate(subs, start=1):
        text = re.sub(r"\s+", " ", sub.content.strip())
        if not text:
            continue

        start_sec = sub.start.total_seconds()
        end_sec = sub.end.total_seconds()
        duration = end_sec - start_sec
        if align and duration <= 0:
            # There is no time slot to fit the audio into.
            print(f"Segmento {i} omitido: duración no positiva en el SRT.")
            continue

        try:
            raw = synth_chunk(endpoint, text, headers, payload_template)
        except Exception as e:
            print(f"Error al sintetizar segmento {i}: {e}")
            continue

        if align:
            gap = start_sec - timeline
            if gap > 0.01:
                sil_target = os.path.join(tmpdir, f"sil_{i:04d}.wav")
                create_silence(gap, sil_target)
                chunk_files.append(sil_target)
                timeline += gap

        target = os.path.join(tmpdir, f"chunk_{i:04d}.wav")
        convert_and_save(raw, target)

        if align:
            # Pad or trim the chunk to the subtitle's own duration.
            aligned = os.path.join(tmpdir, f"chunk_{i:04d}.aligned.wav")
            pad_or_trim_wav(target, aligned, duration)
            chunk_files.append(aligned)
            timeline += duration
            if not keep_chunks:
                try:
                    os.remove(target)
                except OSError:
                    pass
        else:
            chunk_files.append(target)

        print(f" - Segmento {i}/{total} -> {os.path.basename(chunk_files[-1])}")

    return chunk_files


def _mix_with_original(out_wav, video, volume, workdir):
    """Mix ``out_wav`` over the video's own audio track; return True on success."""
    orig_tmp = os.path.join(workdir, "orig_audio.wav")
    mixed_tmp = os.path.join(workdir, "mixed_audio.wav")
    try:
        # Extract the original audio as mono 22050 Hz 16-bit WAV.
        cmd_ext = [
            "ffmpeg",
            "-y",
            "-i",
            video,
            "-vn",
            "-ar",
            "22050",
            "-ac",
            "1",
            "-sample_fmt",
            "s16",
            orig_tmp,
        ]
        subprocess.run(cmd_ext, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        # New audio in the foreground, original track at reduced volume.
        vol = float(volume)
        cmd_mix = [
            "ffmpeg",
            "-y",
            "-i",
            out_wav,
            "-i",
            orig_tmp,
            "-filter_complex",
            f"[0:a]volume=1[a1];[1:a]volume={vol}[a0];[a1][a0]amix=inputs=2:duration=first:dropout_transition=0[mix]",
            "-map",
            "[mix]",
            "-c:a",
            "pcm_s16le",
            mixed_tmp,
        ]
        subprocess.run(cmd_mix, check=True)

        shutil.move(mixed_tmp, out_wav)
        print(f"Archivo mezclado generado en: {out_wav}")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error al mezclar audio con la pista original: {e}")
        return False
    finally:
        for leftover in (orig_tmp, mixed_tmp):
            try:
                os.remove(leftover)
            except OSError:
                pass


def _replace_video_audio(video, audio):
    """Write ``<video>.replaced_audio.mp4`` with ``audio`` as its track; return True on success."""
    out_video = os.path.splitext(video)[0] + ".replaced_audio.mp4"
    cmd_rep = [
        "ffmpeg",
        "-y",
        "-i",
        video,
        "-i",
        audio,
        "-map",
        "0:v:0",
        "-map",
        "1:a:0",
        "-c:v",
        "copy",
        "-c:a",
        "aac",
        "-b:a",
        "192k",
        out_video,
    ]
    try:
        subprocess.run(cmd_rep, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error al reemplazar audio en el vídeo: {e}")
        return False
    print(f"Vídeo con audio reemplazado generado: {out_video}")
    return True


def main():
    """Command-line entry point: synthesize an SRT file into a single WAV.

    Steps: parse arguments, resolve the synthesis endpoint (given directly or
    detected from an OpenAPI document), synthesize each subtitle, concatenate
    the pieces into ``--out`` and optionally mix the result with, or substitute
    it for, the audio track of ``--video``.

    Exits with status 1 when no endpoint is available, no audio was produced,
    or a requested mix/replace step fails; with status 2 (argparse) when
    ``--mix-with-original`` or ``--replace-original`` is given without
    ``--video``.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    # Reject unusable flag combinations before doing any expensive work.
    if args.mix_with_original and not args.video:
        parser.error("--mix-with-original requiere que pases --video con la ruta del vídeo original.")
    if args.replace_original and not args.video:
        parser.error("--replace-original requiere que pases --video con la ruta del vídeo original.")

    # Every later step shells out to ffmpeg, so check for it up front.
    ensure_ffmpeg()

    headers = {"Accept": "*/*"}
    if args.api_key:
        headers["Authorization"] = f"Bearer {args.api_key}"

    endpoint = args.endpoint
    if not endpoint and args.openapi:
        print("Intentando detectar endpoint desde openapi.json...")
        endpoint = find_synthesis_endpoint(args.openapi)
        if endpoint:
            print(f"Usando endpoint detectado: {endpoint}")
        else:
            print("No se detectó endpoint automáticamente. Pasa --endpoint o --payload-template.")
            sys.exit(1)

    if not endpoint:
        print("Debes proporcionar --endpoint o --openapi para que el script funcione.")
        sys.exit(1)

    payload_template = _apply_voice(args.payload_template, args.voice)

    subs = parse_srt_file(args.srt)
    tmpdir = tempfile.mkdtemp(prefix="srt_kokoro_")
    succeeded = True
    try:
        print(f"Sintetizando {len(subs)} segmentos...")
        chunk_files = _synthesize_segments(
            subs, endpoint, headers, payload_template, tmpdir, args.align, args.keep_chunks
        )

        if not chunk_files:
            print("No se generaron fragmentos de audio. Abortando.")
            sys.exit(1)

        print("Concatenando fragments...")
        concat_chunks(chunk_files, args.out)
        print(f"Archivo final generado en: {args.out}")

        if args.mix_with_original:
            if not _mix_with_original(args.out, args.video, args.mix_background_volume, tmpdir):
                succeeded = False

        if args.replace_original:
            if not _replace_video_audio(args.video, args.out):
                succeeded = False
    finally:
        # Runs on normal completion, on sys.exit and on errors alike.
        if args.keep_chunks:
            print(f"Fragmentos conservados en: {tmpdir}")
        else:
            shutil.rmtree(tmpdir, ignore_errors=True)

    if not succeeded:
        sys.exit(1)
|
|
|
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|