# submaster/whisper_project/run_full_pipeline.py
#
# 449 lines
# 15 KiB
# Python

#!/usr/bin/env python3
# Orquesta: transcripción -> traducción -> síntesis por segmento -> reemplazo/mezcla -> quemado de subtítulos
import argparse
import os
import shlex
import shutil
import subprocess
import sys
import tempfile
def run(cmd, dry_run=False, env=None):
    """Echo a command and, unless ``dry_run`` is set, execute it.

    A list/tuple command is executed directly (no shell); any other value is
    handed to the shell as a single string. For a list/tuple the echoed line
    is shell-quoted so it can be copied and pasted.

    Args:
        cmd: Argument vector (list/tuple) or a shell command string.
        dry_run: When true, only print the command.
        env: Optional environment mapping forwarded to ``subprocess.run``.

    Returns:
        ``0`` when ``dry_run`` is true, otherwise the
        ``subprocess.CompletedProcess`` returned by ``subprocess.run``.

    Raises:
        subprocess.CalledProcessError: The command exited with a non-zero
            status (``check=True``).
    """
    is_argv = isinstance(cmd, (list, tuple))
    shown = " ".join(map(shlex.quote, map(str, cmd))) if is_argv else cmd
    print("+", shown)
    if dry_run:
        return 0
    # Only string commands go through the shell.
    return subprocess.run(cmd, shell=not is_argv, check=True, env=env)
def json_payload_template(model, voice):
    """Build the JSON payload template handed to ``srt_to_kokoro.py``.

    The result is a JSON object whose ``input`` field holds the literal
    placeholder ``{text}``, which srt_to_kokoro accepts as the substitution
    point for each segment's text.

    ``model`` and ``voice`` are serialized with ``json.dumps`` so that quotes
    or backslashes in them cannot produce malformed JSON. For values without
    characters that need escaping the output is byte-identical to plain
    concatenation.

    Args:
        model: Kokoro model identifier.
        voice: Voice name.

    Returns:
        The payload template as a string.
    """
    import json  # local import keeps this block self-contained

    # ensure_ascii=False keeps non-ASCII names verbatim instead of \uXXXX.
    model_json = json.dumps(model, ensure_ascii=False)
    voice_json = json.dumps(voice, ensure_ascii=False)
    return (
        '{"model":' + model_json
        + ',"voice":' + voice_json
        + ',"input":"{text}","response_format":"wav"}'
    )
def _escape_ffmpeg_filter_value(value):
    """Escape ``value`` for use as an option value inside an ffmpeg ``-vf`` string.

    Two rounds of backslash escaping are applied, following ffmpeg's
    "Notes on filtergraph escaping": one for the filter's option parser and
    one for the filtergraph parser. Values containing none of the special
    characters are returned unchanged.
    """
    # Level 1: the option parser treats backslash, quote and colon as special.
    for ch in ("\\", "'", ":"):
        value = value.replace(ch, "\\" + ch)
    # Level 2: the filtergraph parser additionally treats [ ] , ; as special.
    for ch in ("\\", "'", "[", "]", ",", ";"):
        value = value.replace(ch, "\\" + ch)
    return value


def main(argv=None):
    """Run the pipeline: transcribe -> translate -> synthesize -> burn subtitles.

    Steps:
      1. Use the SRT given with ``--srt``, or extract the audio with ffmpeg
         and call ``whisper_project/transcribe.py`` to create one.
      2. Translate the SRT (``local``, ``gemini``) or use it as is (``none``).
      3. Call ``whisper_project/srt_to_kokoro.py`` to synthesize the dub and
         replace or mix the audio track of the video.
      4. Burn the subtitles into the resulting video with ffmpeg.

    Args:
        argv: Optional argument list; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Exits with status 2 when the input video does not exist and with status 4
    when the Gemini method is selected without an API key. A failing
    sub-command propagates ``subprocess.CalledProcessError`` from ``run``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--video", required=True, help="Vídeo de entrada")
    p.add_argument(
        "--srt",
        help=("SRT de entrada (si ya existe). Si no, se transcribe del audio"),
    )
    p.add_argument("--kokoro-endpoint", required=True, help="URL del endpoint TTS")
    p.add_argument("--kokoro-key", required=True, help="API key para Kokoro")
    p.add_argument("--voice", default="em_alex", help="Nombre de voz (p.ej. em_alex)")
    p.add_argument("--kokoro-model", default="model", help="ID del modelo Kokoro")
    p.add_argument("--whisper-model", default="base", help="Modelo de Whisper para transcribir")
    p.add_argument("--out", default=None, help="Vídeo de salida final (opcional)")
    p.add_argument(
        "--translate-method",
        choices=["local", "gemini", "none"],
        default="local",
        help=(
            "Método para traducir el SRT: 'local' (MarianMT), 'gemini' (API)"
            " o 'none' (usar SRT proporcionado)"
        ),
    )
    p.add_argument("--gemini-key", default=None, help="API key para Gemini (si aplica)")
    p.add_argument(
        "--mix",
        action="store_true",
        help="Mezclar el audio sintetizado con la pista original en lugar de reemplazarla",
    )
    p.add_argument(
        "--mix-background-volume",
        type=float,
        default=0.2,
        help="Volumen de la pista original al mezclar (0.0-1.0)",
    )
    p.add_argument(
        "--keep-chunks",
        action="store_true",
        help="Conservar los archivos de chunks generados por la síntesis (debug)",
    )
    p.add_argument(
        "--keep-temp",
        action="store_true",
        help="No borrar el directorio temporal de trabajo al terminar",
    )
    p.add_argument("--dry-run", action="store_true", help="Solo mostrar comandos sin ejecutar")
    args = p.parse_args(argv)

    video = os.path.abspath(args.video)
    if not os.path.exists(video):
        print("Vídeo no encontrado:", video, file=sys.stderr)
        sys.exit(2)

    # Validate the Gemini key before any expensive work (audio extraction and
    # transcription), so a missing key fails fast.
    gem_key = None
    if args.translate_method == "gemini":
        gem_key = args.gemini_key or os.environ.get("GEMINI_API_KEY")
        if not gem_key:
            print(
                "--translate-method=gemini requiere --gemini-key o la var de entorno GEMINI_API_KEY",
                file=sys.stderr,
            )
            sys.exit(4)

    workdir = tempfile.mkdtemp(prefix="full_pipeline_")
    try:
        # NOTE(review): the helper scripts are addressed relative to the
        # current working directory, so this must be launched from the
        # directory that contains whisper_project/ - confirm that is intended.

        # 1) Obtain the SRT: if none was given, extract the audio and transcribe.
        if args.srt:
            srt_in = os.path.abspath(args.srt)
            print("Usando SRT proporcionado:", srt_in)
        else:
            audio_tmp = os.path.join(workdir, "extracted_audio.wav")
            cmd_extract = [
                "ffmpeg",
                "-y",
                "-i",
                video,
                "-vn",
                "-acodec",
                "pcm_s16le",
                "-ar",
                "16000",
                "-ac",
                "1",
                audio_tmp,
            ]
            run(cmd_extract, dry_run=args.dry_run)
            # Call transcribe.py to generate the SRT.
            srt_in = os.path.join(workdir, "transcribed.srt")
            cmd_trans = [
                sys.executable,
                "whisper_project/transcribe.py",
                "--file",
                audio_tmp,
                "--backend",
                "faster-whisper",
                "--model",
                args.whisper_model,
                "--srt",
                "--srt-file",
                srt_in,
            ]
            run(cmd_trans, dry_run=args.dry_run)

        # 2) Translate the SRT with the selected method.
        srt_translated = os.path.join(workdir, "translated.srt")
        if args.translate_method == "local":
            cmd_translate = [
                sys.executable,
                "whisper_project/translate_srt_local.py",
                "--in",
                srt_in,
                "--out",
                srt_translated,
            ]
            run(cmd_translate, dry_run=args.dry_run)
        elif args.translate_method == "gemini":
            cmd_translate = [
                sys.executable,
                "whisper_project/translate_srt_with_gemini.py",
                "--in",
                srt_in,
                "--out",
                srt_translated,
                "--gemini-api-key",
                gem_key,
            ]
            run(cmd_translate, dry_run=args.dry_run)
        else:
            # none: use the SRT as is.
            srt_translated = srt_in

        # 3) Synthesize per segment with Kokoro, align, concatenate and
        #    replace or mix the audio in the video.
        dub_wav = os.path.join(workdir, "dub_final.wav")
        payload = json_payload_template(args.kokoro_model, args.voice)
        # NOTE(review): API keys travel as command-line arguments, so run()
        # echoes them to stdout.
        synth_cmd = [
            sys.executable,
            "whisper_project/srt_to_kokoro.py",
            "--srt",
            srt_translated,
            "--endpoint",
            args.kokoro_endpoint,
            "--payload-template",
            payload,
            "--api-key",
            args.kokoro_key,
            "--out",
            dub_wav,
            "--video",
            video,
            "--align",
        ]
        if args.keep_chunks:
            synth_cmd.append("--keep-chunks")
        if args.mix:
            synth_cmd += ["--mix-with-original", "--mix-background-volume", str(args.mix_background_volume)]
        else:
            synth_cmd.append("--replace-original")
        run(synth_cmd, dry_run=args.dry_run)

        # 4) Burn the SRT into the resulting video.
        video_stem = os.path.splitext(video)[0]
        out_video = args.out if args.out else video_stem + ".replaced_audio.subs.mp4"
        # Presumably srt_to_kokoro.py writes "<video stem>.replaced_audio.mp4"
        # next to the input video - TODO confirm, in particular for --mix.
        replaced_src = video_stem + ".replaced_audio.mp4"
        # Build the filter string; the path is escaped because ':' ',' "'" and
        # similar characters are syntax inside an ffmpeg filtergraph.
        srt_for_filter = _escape_ffmpeg_filter_value(srt_translated)
        vf = f"subtitles={srt_for_filter}:force_style='FontName=Arial,FontSize=24'"
        cmd_burn = [
            "ffmpeg",
            "-y",
            "-i",
            replaced_src,
            "-vf",
            vf,
            "-c:a",
            "copy",
            out_video,
        ]
        run(cmd_burn, dry_run=args.dry_run)
        print("Flujo completado. Vídeo final:", out_video)
    finally:
        if args.dry_run:
            print("(dry-run) leaving workdir:", workdir)
        elif not args.keep_temp:
            # Best-effort cleanup: report a failure, never fail the run over it.
            try:
                shutil.rmtree(workdir)
            except OSError as exc:
                print(
                    "Aviso: no se pudo borrar el directorio temporal:",
                    workdir,
                    exc,
                    file=sys.stderr,
                )
if __name__ == "__main__":
    # Exit explicitly once the pipeline finishes. The module's definitions and
    # entry-point guard are repeated further down in this file; without exiting
    # here, a successful run would fall through and run the pipeline again.
    sys.exit(main())
#!/usr/bin/env python3
# run_full_pipeline.py
# Orquesta: transcripción -> traducción -> síntesis por segmento -> reemplazo/mezcla -> quemado de subtítulos
import argparse
import os
import shlex
import shutil
import subprocess
import sys
import tempfile
def run(cmd, dry_run=False, env=None):
    """Print a command and run it unless ``dry_run`` is true.

    Sequences (list/tuple) are executed without a shell and echoed with
    shell quoting so the line is safe to copy and paste; any other value is
    echoed verbatim and executed through the shell.

    Args:
        cmd: Argument vector (list/tuple) or a shell command string.
        dry_run: When true, nothing is executed.
        env: Optional environment mapping passed to ``subprocess.run``.

    Returns:
        ``0`` in dry-run mode, otherwise the ``subprocess.CompletedProcess``
        from ``subprocess.run``.

    Raises:
        subprocess.CalledProcessError: Non-zero exit status (``check=True``).
    """
    use_shell = not isinstance(cmd, (list, tuple))
    if use_shell:
        echoed = cmd
    else:
        echoed = " ".join([shlex.quote(str(part)) for part in cmd])
    print("+", echoed)
    if not dry_run:
        return subprocess.run(cmd, shell=use_shell, check=True, env=env)
    return 0
def json_payload_template(model, voice):
    """Build the JSON payload template handed to ``srt_to_kokoro.py``.

    The result is a JSON object whose ``input`` field holds the literal
    placeholder ``{text}``, which srt_to_kokoro accepts as the substitution
    point for each segment's text.

    ``model`` and ``voice`` are serialized with ``json.dumps`` so that quotes
    or backslashes in them cannot produce malformed JSON. For values without
    characters that need escaping the output is byte-identical to plain
    concatenation.

    Args:
        model: Kokoro model identifier.
        voice: Voice name.

    Returns:
        The payload template as a string.
    """
    import json  # local import keeps this block self-contained

    # ensure_ascii=False keeps non-ASCII names verbatim instead of \uXXXX.
    model_json = json.dumps(model, ensure_ascii=False)
    voice_json = json.dumps(voice, ensure_ascii=False)
    return (
        '{"model":' + model_json
        + ',"voice":' + voice_json
        + ',"input":"{text}","response_format":"wav"}'
    )
def _escape_ffmpeg_filter_value(value):
    """Escape ``value`` for use as an option value inside an ffmpeg ``-vf`` string.

    Two rounds of backslash escaping are applied, following ffmpeg's
    "Notes on filtergraph escaping": one for the filter's option parser and
    one for the filtergraph parser. Values containing none of the special
    characters are returned unchanged.
    """
    # Level 1: the option parser treats backslash, quote and colon as special.
    for ch in ("\\", "'", ":"):
        value = value.replace(ch, "\\" + ch)
    # Level 2: the filtergraph parser additionally treats [ ] , ; as special.
    for ch in ("\\", "'", "[", "]", ",", ";"):
        value = value.replace(ch, "\\" + ch)
    return value


def main(argv=None):
    """Run the pipeline: transcribe -> translate -> synthesize -> burn subtitles.

    Steps:
      1. Use the SRT given with ``--srt``, or extract the audio with ffmpeg
         and call ``whisper_project/transcribe.py`` to create one.
      2. Translate the SRT (``local``, ``gemini``) or use it as is (``none``).
      3. Call ``whisper_project/srt_to_kokoro.py`` to synthesize the dub and
         replace or mix the audio track of the video.
      4. Burn the subtitles into the resulting video with ffmpeg.

    Args:
        argv: Optional argument list; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Exits with status 2 when the input video does not exist and with status 4
    when the Gemini method is selected without an API key. A failing
    sub-command propagates ``subprocess.CalledProcessError`` from ``run``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--video", required=True, help="Vídeo de entrada")
    p.add_argument(
        "--srt",
        help=("SRT de entrada (si ya existe). Si no, se transcribe del audio"),
    )
    p.add_argument("--kokoro-endpoint", required=True, help="URL del endpoint TTS")
    p.add_argument("--kokoro-key", required=True, help="API key para Kokoro")
    p.add_argument("--voice", default="em_alex", help="Nombre de voz (p.ej. em_alex)")
    p.add_argument("--kokoro-model", default="model", help="ID del modelo Kokoro")
    p.add_argument("--whisper-model", default="base", help="Modelo de Whisper para transcribir")
    p.add_argument("--out", default=None, help="Vídeo de salida final (opcional)")
    p.add_argument(
        "--translate-method",
        choices=["local", "gemini", "none"],
        default="local",
        help=(
            "Método para traducir el SRT: 'local' (MarianMT), 'gemini' (API)"
            " o 'none' (usar SRT proporcionado)"
        ),
    )
    p.add_argument("--gemini-key", default=None, help="API key para Gemini (si aplica)")
    p.add_argument(
        "--mix",
        action="store_true",
        help="Mezclar el audio sintetizado con la pista original en lugar de reemplazarla",
    )
    p.add_argument(
        "--mix-background-volume",
        type=float,
        default=0.2,
        help="Volumen de la pista original al mezclar (0.0-1.0)",
    )
    p.add_argument(
        "--keep-chunks",
        action="store_true",
        help="Conservar los archivos de chunks generados por la síntesis (debug)",
    )
    p.add_argument(
        "--keep-temp",
        action="store_true",
        help="No borrar el directorio temporal de trabajo al terminar",
    )
    p.add_argument("--dry-run", action="store_true", help="Solo mostrar comandos sin ejecutar")
    args = p.parse_args(argv)

    video = os.path.abspath(args.video)
    if not os.path.exists(video):
        print("Vídeo no encontrado:", video, file=sys.stderr)
        sys.exit(2)

    # Validate the Gemini key before any expensive work (audio extraction and
    # transcription), so a missing key fails fast.
    gem_key = None
    if args.translate_method == "gemini":
        gem_key = args.gemini_key or os.environ.get("GEMINI_API_KEY")
        if not gem_key:
            print(
                "--translate-method=gemini requiere --gemini-key o la var de entorno GEMINI_API_KEY",
                file=sys.stderr,
            )
            sys.exit(4)

    workdir = tempfile.mkdtemp(prefix="full_pipeline_")
    try:
        # NOTE(review): the helper scripts are addressed relative to the
        # current working directory, so this must be launched from the
        # directory that contains whisper_project/ - confirm that is intended.

        # 1) Obtain the SRT: if none was given, extract the audio and transcribe.
        if args.srt:
            srt_in = os.path.abspath(args.srt)
            print("Usando SRT proporcionado:", srt_in)
        else:
            audio_tmp = os.path.join(workdir, "extracted_audio.wav")
            cmd_extract = [
                "ffmpeg",
                "-y",
                "-i",
                video,
                "-vn",
                "-acodec",
                "pcm_s16le",
                "-ar",
                "16000",
                "-ac",
                "1",
                audio_tmp,
            ]
            run(cmd_extract, dry_run=args.dry_run)
            # Call transcribe.py to generate the SRT.
            srt_in = os.path.join(workdir, "transcribed.srt")
            cmd_trans = [
                sys.executable,
                "whisper_project/transcribe.py",
                "--file",
                audio_tmp,
                "--backend",
                "faster-whisper",
                "--model",
                args.whisper_model,
                "--srt",
                "--srt-file",
                srt_in,
            ]
            run(cmd_trans, dry_run=args.dry_run)

        # 2) Translate the SRT with the selected method.
        srt_translated = os.path.join(workdir, "translated.srt")
        if args.translate_method == "local":
            cmd_translate = [
                sys.executable,
                "whisper_project/translate_srt_local.py",
                "--in",
                srt_in,
                "--out",
                srt_translated,
            ]
            run(cmd_translate, dry_run=args.dry_run)
        elif args.translate_method == "gemini":
            cmd_translate = [
                sys.executable,
                "whisper_project/translate_srt_with_gemini.py",
                "--in",
                srt_in,
                "--out",
                srt_translated,
                "--gemini-api-key",
                gem_key,
            ]
            run(cmd_translate, dry_run=args.dry_run)
        else:
            # none: use the SRT as is.
            srt_translated = srt_in

        # 3) Synthesize per segment with Kokoro, align, concatenate and
        #    replace or mix the audio in the video.
        dub_wav = os.path.join(workdir, "dub_final.wav")
        payload = json_payload_template(args.kokoro_model, args.voice)
        # NOTE(review): API keys travel as command-line arguments, so run()
        # echoes them to stdout.
        synth_cmd = [
            sys.executable,
            "whisper_project/srt_to_kokoro.py",
            "--srt",
            srt_translated,
            "--endpoint",
            args.kokoro_endpoint,
            "--payload-template",
            payload,
            "--api-key",
            args.kokoro_key,
            "--out",
            dub_wav,
            "--video",
            video,
            "--align",
        ]
        if args.keep_chunks:
            synth_cmd.append("--keep-chunks")
        if args.mix:
            synth_cmd += ["--mix-with-original", "--mix-background-volume", str(args.mix_background_volume)]
        else:
            synth_cmd.append("--replace-original")
        run(synth_cmd, dry_run=args.dry_run)

        # 4) Burn the SRT into the resulting video.
        video_stem = os.path.splitext(video)[0]
        out_video = args.out if args.out else video_stem + ".replaced_audio.subs.mp4"
        # Presumably srt_to_kokoro.py writes "<video stem>.replaced_audio.mp4"
        # next to the input video - TODO confirm, in particular for --mix.
        replaced_src = video_stem + ".replaced_audio.mp4"
        # Build the filter string; the path is escaped because ':' ',' "'" and
        # similar characters are syntax inside an ffmpeg filtergraph.
        srt_for_filter = _escape_ffmpeg_filter_value(srt_translated)
        vf = f"subtitles={srt_for_filter}:force_style='FontName=Arial,FontSize=24'"
        cmd_burn = [
            "ffmpeg",
            "-y",
            "-i",
            replaced_src,
            "-vf",
            vf,
            "-c:a",
            "copy",
            out_video,
        ]
        run(cmd_burn, dry_run=args.dry_run)
        print("Flujo completado. Vídeo final:", out_video)
    finally:
        if args.dry_run:
            print("(dry-run) leaving workdir:", workdir)
        elif not args.keep_temp:
            # Best-effort cleanup: report a failure, never fail the run over it.
            try:
                shutil.rmtree(workdir)
            except OSError as exc:
                print(
                    "Aviso: no se pudo borrar el directorio temporal:",
                    workdir,
                    exc,
                    file=sys.stderr,
                )
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()