#!/usr/bin/env python3
# run_full_pipeline.py
# Orchestrates: transcription -> translation -> per-segment synthesis -> replace/mix audio -> burn subtitles

import argparse
import os
import shlex
import shutil
import subprocess
import sys
import tempfile


def run(cmd, dry_run=False, env=None):
    # Run a command. Accepts a str (executed via the shell) or a list (no shell).
    # Prints the command in a copy/paste-safe form. If dry_run=True, nothing is executed.
    if isinstance(cmd, (list, tuple)):
        printable = " ".join(shlex.quote(str(x)) for x in cmd)
    else:
        printable = cmd
    print("+", printable)
    if dry_run:
        return 0
    if isinstance(cmd, (list, tuple)):
        return subprocess.run(cmd, shell=False, check=True, env=env)
    return subprocess.run(cmd, shell=True, check=True, env=env)


def json_payload_template(model, voice):
    # JSON payload with {text} as the placeholder that srt_to_kokoro substitutes per segment
    return (
        '{"model":"' + model + '","voice":"' + voice
        + '","input":"{text}","response_format":"wav"}'
    )


def main():
    p = argparse.ArgumentParser()
    p.add_argument("--video", required=True, help="Input video")
    p.add_argument(
        "--srt",
        help="Input SRT (if one already exists). If omitted, the audio is transcribed",
    )
    p.add_argument("--kokoro-endpoint", required=True, help="TTS endpoint URL")
    p.add_argument("--kokoro-key", required=True, help="API key for Kokoro")
    p.add_argument("--voice", default="em_alex", help="Voice name (e.g. em_alex)")
    p.add_argument("--kokoro-model", default="model", help="Kokoro model ID")
    p.add_argument("--whisper-model", default="base", help="Whisper model used for transcription")
    p.add_argument("--out", default=None, help="Final output video (optional)")
    p.add_argument(
        "--translate-method",
        choices=["local", "gemini", "none"],
        default="local",
        help=(
            "Method used to translate the SRT: 'local' (MarianMT), 'gemini' (API)"
            " or 'none' (use the provided SRT as-is)"
        ),
    )
    p.add_argument("--gemini-key", default=None, help="API key for Gemini (if applicable)")
    p.add_argument(
        "--mix",
        action="store_true",
        help="Mix the synthesized audio with the original track instead of replacing it",
    )
    p.add_argument(
        "--mix-background-volume",
        type=float,
        default=0.2,
        help="Volume of the original track when mixing (0.0-1.0)",
    )
    p.add_argument(
        "--keep-chunks",
        action="store_true",
        help="Keep the chunk files produced by synthesis (debug)",
    )
    p.add_argument(
        "--keep-temp",
        action="store_true",
        help="Do not delete the temporary working directory on exit",
    )
    p.add_argument("--dry-run", action="store_true", help="Only print commands without running them")
    args = p.parse_args()

    video = os.path.abspath(args.video)
    if not os.path.exists(video):
        print("Video not found:", video, file=sys.stderr)
        sys.exit(2)

    workdir = tempfile.mkdtemp(prefix="full_pipeline_")
    try:
        # 1) obtain the SRT: if none was provided, extract the audio and transcribe it
        if args.srt:
            srt_in = os.path.abspath(args.srt)
            print("Using provided SRT:", srt_in)
        else:
            audio_tmp = os.path.join(workdir, "extracted_audio.wav")
            cmd_extract = [
                "ffmpeg", "-y",
                "-i", video,
                "-vn",
                "-acodec", "pcm_s16le",
                "-ar", "16000",
                "-ac", "1",
                audio_tmp,
            ]
            run(cmd_extract, dry_run=args.dry_run)
            # call transcribe.py to generate the SRT
            srt_in = os.path.join(workdir, "transcribed.srt")
            cmd_trans = [
                sys.executable, "whisper_project/transcribe.py",
                "--file", audio_tmp,
                "--backend", "faster-whisper",
                "--model", args.whisper_model,
                "--srt",
                "--srt-file", srt_in,
            ]
            run(cmd_trans, dry_run=args.dry_run)

        # 2) translate the SRT with the chosen method
        srt_translated = os.path.join(workdir, "translated.srt")
        if args.translate_method == "local":
            cmd_translate = [
                sys.executable, "whisper_project/translate_srt_local.py",
                "--in", srt_in,
                "--out", srt_translated,
            ]
            run(cmd_translate, dry_run=args.dry_run)
        elif args.translate_method == "gemini":
            gem_key = args.gemini_key or os.environ.get("GEMINI_API_KEY")
            if not gem_key:
                print(
                    "--translate-method=gemini requires --gemini-key or the GEMINI_API_KEY environment variable",
                    file=sys.stderr,
                )
                sys.exit(4)
            cmd_translate = [
                sys.executable, "whisper_project/translate_srt_with_gemini.py",
                "--in", srt_in,
                "--out", srt_translated,
                "--gemini-api-key", gem_key,
            ]
            run(cmd_translate, dry_run=args.dry_run)
        else:
            # none: use the SRT as-is
            srt_translated = srt_in

        # 3) synthesize each segment with Kokoro, align, concatenate, and
        #    replace or mix the audio in the video
        dub_wav = os.path.join(workdir, "dub_final.wav")
        payload = json_payload_template(args.kokoro_model, args.voice)
        synth_cmd = [
            sys.executable, "whisper_project/srt_to_kokoro.py",
            "--srt", srt_translated,
            "--endpoint", args.kokoro_endpoint,
            "--payload-template", payload,
            "--api-key", args.kokoro_key,
            "--out", dub_wav,
            "--video", video,
            "--align",
        ]
        if args.keep_chunks:
            synth_cmd.append("--keep-chunks")
        if args.mix:
            synth_cmd += ["--mix-with-original", "--mix-background-volume", str(args.mix_background_volume)]
        else:
            synth_cmd.append("--replace-original")
        run(synth_cmd, dry_run=args.dry_run)

        # 4) burn the SRT into the resulting video
        # (srt_to_kokoro.py is expected to have written <video>.replaced_audio.mp4 next to the input)
        out_video = args.out if args.out else os.path.splitext(video)[0] + ".replaced_audio.subs.mp4"
        replaced_src = os.path.splitext(video)[0] + ".replaced_audio.mp4"
        # build the subtitles filter string
        vf = f"subtitles={srt_translated}:force_style='FontName=Arial,FontSize=24'"
        cmd_burn = [
            "ffmpeg", "-y",
            "-i", replaced_src,
            "-vf", vf,
            "-c:a", "copy",
            out_video,
        ]
        run(cmd_burn, dry_run=args.dry_run)
        print("Pipeline finished. Final video:", out_video)
    finally:
        if args.dry_run:
            print("(dry-run) leaving workdir:", workdir)
        else:
            if not args.keep_temp:
                try:
                    shutil.rmtree(workdir)
                except Exception:
                    pass


if __name__ == '__main__':
    main()
em_alex)") p.add_argument("--kokoro-model", default="model", help="ID del modelo Kokoro") p.add_argument("--whisper-model", default="base", help="Modelo de Whisper para transcribir") p.add_argument("--out", default=None, help="Vídeo de salida final (opcional)") p.add_argument( "--translate-method", choices=["local", "gemini", "none"], default="local", help=( "Método para traducir el SRT: 'local' (MarianMT), 'gemini' (API)" " o 'none' (usar SRT proporcionado)" ), ) p.add_argument("--gemini-key", default=None, help="API key para Gemini (si aplica)") p.add_argument( "--mix", action="store_true", help="Mezclar el audio sintetizado con la pista original en lugar de reemplazarla", ) p.add_argument( "--mix-background-volume", type=float, default=0.2, help="Volumen de la pista original al mezclar (0.0-1.0)", ) p.add_argument( "--keep-chunks", action="store_true", help="Conservar los archivos de chunks generados por la síntesis (debug)", ) p.add_argument( "--keep-temp", action="store_true", help="No borrar el directorio temporal de trabajo al terminar", ) p.add_argument("--dry-run", action="store_true", help="Solo mostrar comandos sin ejecutar") args = p.parse_args() video = os.path.abspath(args.video) if not os.path.exists(video): print("Vídeo no encontrado:", video, file=sys.stderr) sys.exit(2) workdir = tempfile.mkdtemp(prefix="full_pipeline_") try: # 1) obtener SRT: si no se pasa, extraer audio y transcribir if args.srt: srt_in = os.path.abspath(args.srt) print("Usando SRT proporcionado:", srt_in) else: audio_tmp = os.path.join(workdir, "extracted_audio.wav") cmd_extract = [ "ffmpeg", "-y", "-i", video, "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", audio_tmp, ] run(cmd_extract, dry_run=args.dry_run) # llamar al script transcribe.py para generar SRT srt_in = os.path.join(workdir, "transcribed.srt") cmd_trans = [ sys.executable, "whisper_project/transcribe.py", "--file", audio_tmp, "--backend", "faster-whisper", "--model", args.whisper_model, "--srt", "--srt-file", srt_in, ] run(cmd_trans, dry_run=args.dry_run) # 2) traducir SRT según método elegido srt_translated = os.path.join(workdir, "translated.srt") if args.translate_method == "local": cmd_translate = [ sys.executable, "whisper_project/translate_srt_local.py", "--in", srt_in, "--out", srt_translated, ] run(cmd_translate, dry_run=args.dry_run) elif args.translate_method == "gemini": gem_key = args.gemini_key or os.environ.get("GEMINI_API_KEY") if not gem_key: print( "--translate-method=gemini requiere --gemini-key o la var de entorno GEMINI_API_KEY", file=sys.stderr, ) sys.exit(4) cmd_translate = [ sys.executable, "whisper_project/translate_srt_with_gemini.py", "--in", srt_in, "--out", srt_translated, "--gemini-api-key", gem_key, ] run(cmd_translate, dry_run=args.dry_run) else: # none: usar SRT tal cual srt_translated = srt_in # 3) sintetizar por segmento con Kokoro, alinear, concatenar y # reemplazar o mezclar audio en el vídeo dub_wav = os.path.join(workdir, "dub_final.wav") payload = json_payload_template(args.kokoro_model, args.voice) synth_cmd = [ sys.executable, "whisper_project/srt_to_kokoro.py", "--srt", srt_translated, "--endpoint", args.kokoro_endpoint, "--payload-template", payload, "--api-key", args.kokoro_key, "--out", dub_wav, "--video", video, "--align", ] if args.keep_chunks: synth_cmd.append("--keep-chunks") if args.mix: synth_cmd += ["--mix-with-original", "--mix-background-volume", str(args.mix_background_volume)] else: synth_cmd.append("--replace-original") run(synth_cmd, dry_run=args.dry_run) # 4) quemar SRT 
en vídeo resultante out_video = args.out if args.out else os.path.splitext(video)[0] + ".replaced_audio.subs.mp4" replaced_src = os.path.splitext(video)[0] + ".replaced_audio.mp4" # build filter string vf = f"subtitles={srt_translated}:force_style='FontName=Arial,FontSize=24'" cmd_burn = [ "ffmpeg", "-y", "-i", replaced_src, "-vf", vf, "-c:a", "copy", out_video, ] run(cmd_burn, dry_run=args.dry_run) print("Flujo completado. Vídeo final:", out_video) finally: if args.dry_run: print("(dry-run) leaving workdir:", workdir) else: if not args.keep_temp: try: shutil.rmtree(workdir) except Exception: pass if __name__ == '__main__': main()
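# Example invocation (a sketch only; the script path, input file, endpoint URL,
# and KOKORO_API_KEY variable below are illustrative placeholders, not values
# defined by this repository):
#
#   python3 run_full_pipeline.py \
#       --video input.mp4 \
#       --kokoro-endpoint https://kokoro.example.com/v1/audio/speech \
#       --kokoro-key "$KOKORO_API_KEY" \
#       --voice em_alex \
#       --whisper-model base \
#       --translate-method local \
#       --mix --mix-background-volume 0.2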