449 lines
15 KiB
Python
449 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
# Orquesta: transcripción -> traducción -> síntesis por segmento -> reemplazo/mezcla -> quemado de subtítulos
|
|
|
|
import argparse
|
|
import os
|
|
import shlex
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
|
|
|
|
def run(cmd, dry_run=False, env=None):
    """Echo a command and, unless this is a dry run, execute it.

    Args:
        cmd: Either a string (executed through the shell) or a list/tuple of
            arguments (executed directly, without a shell).
        dry_run: When true, the command is only printed and nothing runs.
        env: Optional environment mapping forwarded to ``subprocess.run``.

    Returns:
        ``0`` for a dry run, otherwise the ``subprocess.CompletedProcess``
        returned by ``subprocess.run``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True`` is always passed).
    """
    is_argv = isinstance(cmd, (list, tuple))

    # Argument lists are shell-quoted so the echoed line can be copy/pasted;
    # plain strings are echoed verbatim.
    shown = " ".join(map(shlex.quote, map(str, cmd))) if is_argv else cmd
    print("+", shown)

    if dry_run:
        return 0

    # A shell is only involved when the caller handed us a single string.
    return subprocess.run(cmd, shell=not is_argv, check=True, env=env)
|
|
|
|
|
|
def json_payload_template(model, voice):
    """Build the JSON request template handed to the synthesis script.

    The ``input`` field is the literal placeholder ``{text}``; the original
    comment states that srt_to_kokoro accepts this placeholder (how it is
    substituted is not visible from this file).

    The template is serialised with ``json.dumps`` rather than by string
    concatenation, so quotes or backslashes in *model* or *voice* are escaped
    instead of producing invalid JSON. For ordinary values the output is
    byte-identical to the previous hand-built string (compact separators,
    same key order, non-ASCII characters left as-is).

    Args:
        model: Model identifier placed in the ``model`` field.
        voice: Voice name placed in the ``voice`` field.

    Returns:
        The JSON template as a string.
    """
    import json  # local import: the file's import block does not provide json

    return json.dumps(
        {
            "model": model,
            "voice": voice,
            "input": "{text}",
            "response_format": "wav",
        },
        separators=(",", ":"),
        ensure_ascii=False,
    )
|
|
|
|
|
|
def _ffmpeg_filter_escape(value):
    """Escape *value* for use as an option value inside an ffmpeg ``-vf`` string.

    Applies the two escaping levels described in the FFmpeg filter
    documentation: first the option-value level (backslash, single quote,
    colon), then the filtergraph level (backslash, single quote, brackets,
    comma, semicolon). Strings without any of those characters are returned
    unchanged.
    """
    # The backslash comes first in each pass so the escapes added by that
    # same pass are not escaped again.
    for specials in ("\\':", "\\'[],;"):
        for ch in specials:
            value = value.replace(ch, "\\" + ch)
    return value


def main():
    """Command-line entry point for the dubbing pipeline.

    Steps, each executed through ``run`` (so ``--dry-run`` only prints them):

    1. Obtain an SRT: use ``--srt`` if given, otherwise extract 16 kHz mono
       PCM audio with ffmpeg and call ``whisper_project/transcribe.py``.
    2. Translate the SRT with the script selected by ``--translate-method``
       (``none`` uses the SRT from step 1 as-is).
    3. Call ``whisper_project/srt_to_kokoro.py`` to synthesise the audio and
       replace or mix it into the video.
    4. Burn the subtitles into the resulting video with ffmpeg.

    Exits with status 2 when the input video does not exist and 4 when the
    Gemini method is selected without an API key. A failing command raises
    ``subprocess.CalledProcessError`` from ``run``. The temporary working
    directory is removed at the end unless ``--keep-temp`` or ``--dry-run``
    is given.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--video", required=True, help="Vídeo de entrada")
    p.add_argument(
        "--srt",
        help=("SRT de entrada (si ya existe). Si no, se transcribe del audio"),
    )
    p.add_argument("--kokoro-endpoint", required=True, help="URL del endpoint TTS")
    p.add_argument("--kokoro-key", required=True, help="API key para Kokoro")
    p.add_argument("--voice", default="em_alex", help="Nombre de voz (p.ej. em_alex)")
    p.add_argument("--kokoro-model", default="model", help="ID del modelo Kokoro")
    p.add_argument("--whisper-model", default="base", help="Modelo de Whisper para transcribir")
    p.add_argument("--out", default=None, help="Vídeo de salida final (opcional)")
    p.add_argument(
        "--translate-method",
        choices=["local", "gemini", "none"],
        default="local",
        help=(
            "Método para traducir el SRT: 'local' (MarianMT), 'gemini' (API)"
            " o 'none' (usar SRT proporcionado)"
        ),
    )
    p.add_argument("--gemini-key", default=None, help="API key para Gemini (si aplica)")
    p.add_argument(
        "--mix",
        action="store_true",
        help="Mezclar el audio sintetizado con la pista original en lugar de reemplazarla",
    )
    p.add_argument(
        "--mix-background-volume",
        type=float,
        default=0.2,
        help="Volumen de la pista original al mezclar (0.0-1.0)",
    )
    p.add_argument(
        "--keep-chunks",
        action="store_true",
        help="Conservar los archivos de chunks generados por la síntesis (debug)",
    )
    p.add_argument(
        "--keep-temp",
        action="store_true",
        help="No borrar el directorio temporal de trabajo al terminar",
    )
    p.add_argument("--dry-run", action="store_true", help="Solo mostrar comandos sin ejecutar")
    args = p.parse_args()

    video = os.path.abspath(args.video)
    if not os.path.exists(video):
        print("Vídeo no encontrado:", video, file=sys.stderr)
        sys.exit(2)

    # Validate the Gemini key up front so a missing key is reported before
    # the (potentially slow) extraction and transcription steps run.
    gem_key = None
    if args.translate_method == "gemini":
        gem_key = args.gemini_key or os.environ.get("GEMINI_API_KEY")
        if not gem_key:
            print(
                "--translate-method=gemini requiere --gemini-key o la var de entorno GEMINI_API_KEY",
                file=sys.stderr,
            )
            sys.exit(4)

    workdir = tempfile.mkdtemp(prefix="full_pipeline_")
    try:
        # 1) Obtain the SRT: use the given one, or extract audio and transcribe.
        if args.srt:
            srt_in = os.path.abspath(args.srt)
            print("Usando SRT proporcionado:", srt_in)
        else:
            audio_tmp = os.path.join(workdir, "extracted_audio.wav")
            cmd_extract = [
                "ffmpeg",
                "-y",
                "-i",
                video,
                "-vn",
                "-acodec",
                "pcm_s16le",
                "-ar",
                "16000",
                "-ac",
                "1",
                audio_tmp,
            ]
            run(cmd_extract, dry_run=args.dry_run)

            # Call the transcribe.py script to generate the SRT.
            srt_in = os.path.join(workdir, "transcribed.srt")
            cmd_trans = [
                sys.executable,
                "whisper_project/transcribe.py",
                "--file",
                audio_tmp,
                "--backend",
                "faster-whisper",
                "--model",
                args.whisper_model,
                "--srt",
                "--srt-file",
                srt_in,
            ]
            run(cmd_trans, dry_run=args.dry_run)

        # 2) Translate the SRT with the chosen method.
        srt_translated = os.path.join(workdir, "translated.srt")
        if args.translate_method == "local":
            cmd_translate = [
                sys.executable,
                "whisper_project/translate_srt_local.py",
                "--in",
                srt_in,
                "--out",
                srt_translated,
            ]
            run(cmd_translate, dry_run=args.dry_run)
        elif args.translate_method == "gemini":
            # NOTE(review): run() echoes the full command line, so the API key
            # passed here ends up in the printed log.
            cmd_translate = [
                sys.executable,
                "whisper_project/translate_srt_with_gemini.py",
                "--in",
                srt_in,
                "--out",
                srt_translated,
                "--gemini-api-key",
                gem_key,
            ]
            run(cmd_translate, dry_run=args.dry_run)
        else:
            # none: use the SRT as-is.
            srt_translated = srt_in

        # 3) Synthesise per segment with Kokoro, align, concatenate and
        #    replace or mix the audio in the video.
        dub_wav = os.path.join(workdir, "dub_final.wav")
        payload = json_payload_template(args.kokoro_model, args.voice)
        # NOTE(review): as above, --api-key is visible in the echoed command.
        synth_cmd = [
            sys.executable,
            "whisper_project/srt_to_kokoro.py",
            "--srt",
            srt_translated,
            "--endpoint",
            args.kokoro_endpoint,
            "--payload-template",
            payload,
            "--api-key",
            args.kokoro_key,
            "--out",
            dub_wav,
            "--video",
            video,
            "--align",
        ]
        if args.keep_chunks:
            synth_cmd.append("--keep-chunks")
        if args.mix:
            synth_cmd += ["--mix-with-original", "--mix-background-volume", str(args.mix_background_volume)]
        else:
            synth_cmd.append("--replace-original")

        run(synth_cmd, dry_run=args.dry_run)

        # 4) Burn the SRT into the resulting video.
        out_video = args.out if args.out else os.path.splitext(video)[0] + ".replaced_audio.subs.mp4"
        # NOTE(review): assumes srt_to_kokoro.py writes "<video stem>.replaced_audio.mp4"
        # (also when --mix is used) - confirm against that script.
        replaced_src = os.path.splitext(video)[0] + ".replaced_audio.mp4"
        # The SRT path is escaped because ':', quotes, commas, brackets and
        # backslashes are special inside an ffmpeg filtergraph string.
        vf = f"subtitles={_ffmpeg_filter_escape(srt_translated)}:force_style='FontName=Arial,FontSize=24'"
        cmd_burn = [
            "ffmpeg",
            "-y",
            "-i",
            replaced_src,
            "-vf",
            vf,
            "-c:a",
            "copy",
            out_video,
        ]
        run(cmd_burn, dry_run=args.dry_run)

        print("Flujo completado. Vídeo final:", out_video)

    finally:
        if args.dry_run:
            print("(dry-run) leaving workdir:", workdir)
        elif not args.keep_temp:
            # Cleanup stays best-effort, but a failure is reported instead of
            # being silently swallowed.
            try:
                shutil.rmtree(workdir)
            except OSError as exc:
                print("Aviso: no se pudo borrar el directorio temporal:", workdir, exc, file=sys.stderr)
|
|
|
|
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|
|
#!/usr/bin/env python3
|
|
# run_full_pipeline.py
|
|
# Orquesta: transcripción -> traducción -> síntesis por segmento -> reemplazo/mezcla -> quemado de subtítulos
|
|
|
|
import argparse
|
|
import os
|
|
import shlex
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
|
|
|
|
def run(cmd, dry_run=False, env=None):
    """Echo a command and, unless this is a dry run, execute it.

    Args:
        cmd: Either a string (executed through the shell) or a list/tuple of
            arguments (executed directly, without a shell).
        dry_run: When true, the command is only printed and nothing runs.
        env: Optional environment mapping forwarded to ``subprocess.run``.

    Returns:
        ``0`` for a dry run, otherwise the ``subprocess.CompletedProcess``
        returned by ``subprocess.run``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True`` is always passed).
    """
    is_argv = isinstance(cmd, (list, tuple))

    # Argument lists are shell-quoted so the echoed line can be copy/pasted;
    # plain strings are echoed verbatim.
    shown = " ".join(map(shlex.quote, map(str, cmd))) if is_argv else cmd
    print("+", shown)

    if dry_run:
        return 0

    # A shell is only involved when the caller handed us a single string.
    return subprocess.run(cmd, shell=not is_argv, check=True, env=env)
|
|
|
|
|
|
def json_payload_template(model, voice):
    """Build the JSON request template handed to the synthesis script.

    The ``input`` field is the literal placeholder ``{text}``; the original
    comment states that srt_to_kokoro accepts this placeholder (how it is
    substituted is not visible from this file).

    The template is serialised with ``json.dumps`` rather than by string
    concatenation, so quotes or backslashes in *model* or *voice* are escaped
    instead of producing invalid JSON. For ordinary values the output is
    byte-identical to the previous hand-built string (compact separators,
    same key order, non-ASCII characters left as-is).

    Args:
        model: Model identifier placed in the ``model`` field.
        voice: Voice name placed in the ``voice`` field.

    Returns:
        The JSON template as a string.
    """
    import json  # local import: the file's import block does not provide json

    return json.dumps(
        {
            "model": model,
            "voice": voice,
            "input": "{text}",
            "response_format": "wav",
        },
        separators=(",", ":"),
        ensure_ascii=False,
    )
|
|
|
|
|
|
def _ffmpeg_filter_escape(value):
    """Escape *value* for use as an option value inside an ffmpeg ``-vf`` string.

    Applies the two escaping levels described in the FFmpeg filter
    documentation: first the option-value level (backslash, single quote,
    colon), then the filtergraph level (backslash, single quote, brackets,
    comma, semicolon). Strings without any of those characters are returned
    unchanged.
    """
    # The backslash comes first in each pass so the escapes added by that
    # same pass are not escaped again.
    for specials in ("\\':", "\\'[],;"):
        for ch in specials:
            value = value.replace(ch, "\\" + ch)
    return value


def main():
    """Command-line entry point for the dubbing pipeline.

    Steps, each executed through ``run`` (so ``--dry-run`` only prints them):

    1. Obtain an SRT: use ``--srt`` if given, otherwise extract 16 kHz mono
       PCM audio with ffmpeg and call ``whisper_project/transcribe.py``.
    2. Translate the SRT with the script selected by ``--translate-method``
       (``none`` uses the SRT from step 1 as-is).
    3. Call ``whisper_project/srt_to_kokoro.py`` to synthesise the audio and
       replace or mix it into the video.
    4. Burn the subtitles into the resulting video with ffmpeg.

    Exits with status 2 when the input video does not exist and 4 when the
    Gemini method is selected without an API key. A failing command raises
    ``subprocess.CalledProcessError`` from ``run``. The temporary working
    directory is removed at the end unless ``--keep-temp`` or ``--dry-run``
    is given.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--video", required=True, help="Vídeo de entrada")
    p.add_argument(
        "--srt",
        help=("SRT de entrada (si ya existe). Si no, se transcribe del audio"),
    )
    p.add_argument("--kokoro-endpoint", required=True, help="URL del endpoint TTS")
    p.add_argument("--kokoro-key", required=True, help="API key para Kokoro")
    p.add_argument("--voice", default="em_alex", help="Nombre de voz (p.ej. em_alex)")
    p.add_argument("--kokoro-model", default="model", help="ID del modelo Kokoro")
    p.add_argument("--whisper-model", default="base", help="Modelo de Whisper para transcribir")
    p.add_argument("--out", default=None, help="Vídeo de salida final (opcional)")
    p.add_argument(
        "--translate-method",
        choices=["local", "gemini", "none"],
        default="local",
        help=(
            "Método para traducir el SRT: 'local' (MarianMT), 'gemini' (API)"
            " o 'none' (usar SRT proporcionado)"
        ),
    )
    p.add_argument("--gemini-key", default=None, help="API key para Gemini (si aplica)")
    p.add_argument(
        "--mix",
        action="store_true",
        help="Mezclar el audio sintetizado con la pista original en lugar de reemplazarla",
    )
    p.add_argument(
        "--mix-background-volume",
        type=float,
        default=0.2,
        help="Volumen de la pista original al mezclar (0.0-1.0)",
    )
    p.add_argument(
        "--keep-chunks",
        action="store_true",
        help="Conservar los archivos de chunks generados por la síntesis (debug)",
    )
    p.add_argument(
        "--keep-temp",
        action="store_true",
        help="No borrar el directorio temporal de trabajo al terminar",
    )
    p.add_argument("--dry-run", action="store_true", help="Solo mostrar comandos sin ejecutar")
    args = p.parse_args()

    video = os.path.abspath(args.video)
    if not os.path.exists(video):
        print("Vídeo no encontrado:", video, file=sys.stderr)
        sys.exit(2)

    # Validate the Gemini key up front so a missing key is reported before
    # the (potentially slow) extraction and transcription steps run.
    gem_key = None
    if args.translate_method == "gemini":
        gem_key = args.gemini_key or os.environ.get("GEMINI_API_KEY")
        if not gem_key:
            print(
                "--translate-method=gemini requiere --gemini-key o la var de entorno GEMINI_API_KEY",
                file=sys.stderr,
            )
            sys.exit(4)

    workdir = tempfile.mkdtemp(prefix="full_pipeline_")
    try:
        # 1) Obtain the SRT: use the given one, or extract audio and transcribe.
        if args.srt:
            srt_in = os.path.abspath(args.srt)
            print("Usando SRT proporcionado:", srt_in)
        else:
            audio_tmp = os.path.join(workdir, "extracted_audio.wav")
            cmd_extract = [
                "ffmpeg",
                "-y",
                "-i",
                video,
                "-vn",
                "-acodec",
                "pcm_s16le",
                "-ar",
                "16000",
                "-ac",
                "1",
                audio_tmp,
            ]
            run(cmd_extract, dry_run=args.dry_run)

            # Call the transcribe.py script to generate the SRT.
            srt_in = os.path.join(workdir, "transcribed.srt")
            cmd_trans = [
                sys.executable,
                "whisper_project/transcribe.py",
                "--file",
                audio_tmp,
                "--backend",
                "faster-whisper",
                "--model",
                args.whisper_model,
                "--srt",
                "--srt-file",
                srt_in,
            ]
            run(cmd_trans, dry_run=args.dry_run)

        # 2) Translate the SRT with the chosen method.
        srt_translated = os.path.join(workdir, "translated.srt")
        if args.translate_method == "local":
            cmd_translate = [
                sys.executable,
                "whisper_project/translate_srt_local.py",
                "--in",
                srt_in,
                "--out",
                srt_translated,
            ]
            run(cmd_translate, dry_run=args.dry_run)
        elif args.translate_method == "gemini":
            # NOTE(review): run() echoes the full command line, so the API key
            # passed here ends up in the printed log.
            cmd_translate = [
                sys.executable,
                "whisper_project/translate_srt_with_gemini.py",
                "--in",
                srt_in,
                "--out",
                srt_translated,
                "--gemini-api-key",
                gem_key,
            ]
            run(cmd_translate, dry_run=args.dry_run)
        else:
            # none: use the SRT as-is.
            srt_translated = srt_in

        # 3) Synthesise per segment with Kokoro, align, concatenate and
        #    replace or mix the audio in the video.
        dub_wav = os.path.join(workdir, "dub_final.wav")
        payload = json_payload_template(args.kokoro_model, args.voice)
        # NOTE(review): as above, --api-key is visible in the echoed command.
        synth_cmd = [
            sys.executable,
            "whisper_project/srt_to_kokoro.py",
            "--srt",
            srt_translated,
            "--endpoint",
            args.kokoro_endpoint,
            "--payload-template",
            payload,
            "--api-key",
            args.kokoro_key,
            "--out",
            dub_wav,
            "--video",
            video,
            "--align",
        ]
        if args.keep_chunks:
            synth_cmd.append("--keep-chunks")
        if args.mix:
            synth_cmd += ["--mix-with-original", "--mix-background-volume", str(args.mix_background_volume)]
        else:
            synth_cmd.append("--replace-original")

        run(synth_cmd, dry_run=args.dry_run)

        # 4) Burn the SRT into the resulting video.
        out_video = args.out if args.out else os.path.splitext(video)[0] + ".replaced_audio.subs.mp4"
        # NOTE(review): assumes srt_to_kokoro.py writes "<video stem>.replaced_audio.mp4"
        # (also when --mix is used) - confirm against that script.
        replaced_src = os.path.splitext(video)[0] + ".replaced_audio.mp4"
        # The SRT path is escaped because ':', quotes, commas, brackets and
        # backslashes are special inside an ffmpeg filtergraph string.
        vf = f"subtitles={_ffmpeg_filter_escape(srt_translated)}:force_style='FontName=Arial,FontSize=24'"
        cmd_burn = [
            "ffmpeg",
            "-y",
            "-i",
            replaced_src,
            "-vf",
            vf,
            "-c:a",
            "copy",
            out_video,
        ]
        run(cmd_burn, dry_run=args.dry_run)

        print("Flujo completado. Vídeo final:", out_video)

    finally:
        if args.dry_run:
            print("(dry-run) leaving workdir:", workdir)
        elif not args.keep_temp:
            # Cleanup stays best-effort, but a failure is reported instead of
            # being silently swallowed.
            try:
                shutil.rmtree(workdir)
            except OSError as exc:
                print("Aviso: no se pudo borrar el directorio temporal:", workdir, exc, file=sys.stderr)
|
|
|
|
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()