#!/usr/bin/env python3
"""
dub_and_burn.py

Automated pipeline:
- Extract the audio from the video
- Transcribe and translate with Whisper (via the process_video helpers)
- Synthesize each segment with Kokoro (/api/v1/audio/speech) using voice=em_alex
- Fit each chunk to its segment's duration (pad/trim)
- Concatenate the chunks and replace the video's audio track
- Generate the translated SRT and burn it into the final video

Requirements:
- ffmpeg / ffprobe on PATH
- The project's Python venv with requests and srt installed (the venv already exists)

Example usage:
    python3 dub_and_burn.py --video input.mp4 --out out_dubbed.mp4 \
        --kokoro-endpoint "https://kokoro.bfzqqk.easypanel.host/api/v1/audio/speech" \
        --api-key "048665fa9596db326c17c6f5f84d7d03" \
        --voice em_alex --model model_q8f16
"""

import argparse
import base64
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Dict, List

import requests

# Translation/transcription helpers from process_video
from whisper_project.process_video import (
    extract_audio,
    transcribe_and_translate_faster,
    transcribe_and_translate_openai,
    burn_subtitles,
)

# SRT writer from the transcribe module
from whisper_project.transcribe import write_srt


def ensure_ffmpeg():
    if shutil.which("ffmpeg") is None or shutil.which("ffprobe") is None:
        print("ffmpeg/ffprobe not found in PATH. Install them.", file=sys.stderr)
        sys.exit(1)


def get_duration(path: str) -> float:
    """Return the duration of a media file in seconds via ffprobe (0.0 on failure)."""
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        path,
    ]
    p = subprocess.run(cmd, capture_output=True, text=True)
    if p.returncode != 0:
        return 0.0
    try:
        return float(p.stdout.strip())
    except ValueError:
        return 0.0


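# Silence generation is needed in several places (padding in pad_or_trim(),
# gap filling and TTS fallbacks in main()), so the ffmpeg anullsrc invocation
# lives in one helper instead of being repeated verbatim.
def make_silence(path: str, duration: float, sr: int = 22050) -> None:
    """Write `duration` seconds of mono s16 PCM silence at `sr` Hz to `path`."""
    cmd = [
        "ffmpeg", "-y",
        "-f", "lavfi",
        "-i", f"anullsrc=channel_layout=mono:sample_rate={sr}",
        "-t", f"{duration}",
        "-c:a", "pcm_s16le",
        path,
    ]
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

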
def pad_or_trim(in_path: str, out_path: str, target_duration: float, sr: int = 22050) -> bool:
    """Fit the audio at `in_path` to `target_duration` seconds by padding or trimming."""
    cur = get_duration(in_path)
    if cur == 0.0:
        # Duration unknown: copy as-is.
        shutil.copy(in_path, out_path)
        return True
    if abs(cur - target_duration) < 0.02:
        # Close enough: copy as-is.
        shutil.copy(in_path, out_path)
        return True
    if cur > target_duration:
        # Trim.
        cmd = ["ffmpeg", "-y", "-i", in_path, "-t", f"{target_duration}", out_path]
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return True
    # Pad: generate the missing silence and concatenate it after the input.
    pad = target_duration - cur
    sil_path = None
    listname = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as sil:
            sil_path = sil.name
        make_silence(sil_path, pad, sr=sr)

        # Concatenate in_path + sil_path with ffmpeg's concat demuxer.
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as listf:
            listf.write(f"file '{os.path.abspath(in_path)}'\n")
            listf.write(f"file '{os.path.abspath(sil_path)}'\n")
            listname = listf.name
        cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listname, "-c", "copy", out_path]
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    finally:
        for tmp in (sil_path, listname):
            if tmp:
                try:
                    os.remove(tmp)
                except OSError:
                    pass
    return True


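# Example (illustrative; placeholder paths): fit a synthesized chunk into a
# 2.5 s subtitle slot. A 1.8 s input gains 0.7 s of trailing silence; a 3 s
# input is trimmed to 2.5 s.
#
#   pad_or_trim("chunk.wav", "chunk_fit.wav", target_duration=2.5)

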
def synthesize_segment_kokoro(endpoint: str, api_key: str, model: str, voice: str, text: str) -> bytes:
    """Request TTS audio for `text` from a Kokoro /api/v1/audio/speech endpoint."""
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json", "Accept": "*/*"}
    payload = {"model": model, "voice": voice, "input": text, "response_format": "wav"}
    r = requests.post(endpoint, json=payload, headers=headers, timeout=120)
    r.raise_for_status()
    # Raw audio response.
    ctype = r.headers.get("Content-Type", "")
    if ctype.startswith("audio/"):
        return r.content
    # Otherwise try a JSON body carrying base64-encoded audio.
    try:
        j = r.json()
        for k in ("audio", "wav", "data", "base64"):
            if k in j:
                return base64.b64decode(j[k])
    except Exception:
        pass
    # Fallback: return the body as-is and let ffmpeg try to decode it.
    return r.content


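# Example (illustrative; the endpoint and token below are placeholders, the
# voice/model values follow the module docstring):
#
#   wav_bytes = synthesize_segment_kokoro(
#       "https://kokoro.example.net/api/v1/audio/speech",
#       api_key="<token>", model="model_q8f16", voice="em_alex",
#       text="Hola, esto es una prueba.",
#   )
#   Path("probe.wav").write_bytes(wav_bytes)

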
def translate_with_gemini(text: str, target_lang: str, api_key: str, model: str = "gemini-2.5-flash") -> str:
    """Translate `text` into `target_lang` over HTTP; returns the input unchanged on failure.

    Notes:
    - Google API keys (they start with 'AIza') are routed to the Google
      Generative Language API (v1beta, generateContent) with the key as a
      query parameter.
    - Any other key is sent as a Bearer token to an OpenAI-style Responses
      endpoint; the URL defaults to https://api.openai.com/v1/responses and
      can be overridden with the GEMINI_ENDPOINT environment variable.
    - `model` defaults to 'gemini-2.5-flash' as requested.
    """
    prompt = (
        f"Translate the following text into the language '{target_lang}'. "
        f"Return only the translated text, nothing else:\n\n{text}"
    )
    try:
        if api_key and api_key.startswith("AIza"):
            # Format: https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key=API_KEY
            gl_endpoint = (
                f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={api_key}"
            )
            body = {
                "contents": [{"parts": [{"text": prompt}]}],
                "generationConfig": {"temperature": 0.0, "maxOutputTokens": 1024, "candidateCount": 1},
            }
            r = requests.post(gl_endpoint, json=body, timeout=20)
            r.raise_for_status()
            j = r.json()
            # The response normally carries candidates[0].content.parts[*].text.
            if isinstance(j, dict):
                candidates = j.get("candidates")
                if isinstance(candidates, list) and candidates:
                    first = candidates[0]
                    if isinstance(first, dict):
                        content = first.get("content")
                        if isinstance(content, dict):
                            parts = [
                                p.get("text")
                                for p in content.get("parts", [])
                                if isinstance(p, dict) and isinstance(p.get("text"), str)
                            ]
                            if parts:
                                return "\n".join(parts).strip()
                        # Tolerate alternative response shapes.
                        if isinstance(content, str):
                            return content.strip()
                        if isinstance(first.get("output"), str):
                            return first["output"].strip()
                # Fallback: common top-level text fields.
                for key in ("output_text", "text", "response", "translated_text"):
                    if isinstance(j.get(key), str):
                        return j[key].strip()
            return text

        # Not a Google API key: try an OpenAI-like Responses API.
        gemini_endpoint = os.environ.get("GEMINI_ENDPOINT", "https://api.openai.com/v1/responses")
        headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
        payload = {"model": model, "input": prompt, "max_output_tokens": 1024}
        r = requests.post(gemini_endpoint, json=payload, headers=headers, timeout=20)
        r.raise_for_status()
        j = r.json()
        if isinstance(j, dict):
            if isinstance(j.get("output"), list):
                for item in j["output"]:
                    if isinstance(item, dict) and "content" in item:
                        cont = item["content"]
                        if isinstance(cont, list):
                            texts = [c.get("text") for c in cont if isinstance(c, dict) and c.get("text")]
                            if texts:
                                return "\n".join(texts).strip()
                        elif isinstance(cont, str):
                            return cont.strip()
            for key in ("output_text", "text", "response", "translated_text"):
                if isinstance(j.get(key), str):
                    return j[key].strip()
        if isinstance(j, list) and j and isinstance(j[0], str):
            return j[0]
        if isinstance(j, str):
            return j
    except Exception as e:
        print(f"Warning: Gemini translation failed: {e}")

    return text


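# Example (illustrative; the key is a placeholder). Google AI Studio keys
# start with "AIza" and are routed to the Generative Language API; any other
# key goes to the OpenAI-style endpoint:
#
#   translated = translate_with_gemini("Good morning", "es", "AIza<key>")
#   # -> "Buenos días" (or the original text unchanged if the request fails)

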
def concat_chunks(chunk_files: List[str], out_path: str):
    """Concatenate audio chunks into one file with ffmpeg's concat demuxer."""
    listname = None
    try:
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as listf:
            for c in chunk_files:
                listf.write(f"file '{os.path.abspath(c)}'\n")
            listname = listf.name
        cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listname, "-c", "copy", out_path]
        subprocess.run(cmd, check=True)
    finally:
        if listname:
            try:
                os.remove(listname)
            except OSError:
                pass


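# Note: the concat demuxer with "-c copy" only works when every chunk shares
# the same codec, sample rate, and channel layout; main() converts each TTS
# chunk to 22050 Hz mono s16 PCM (and make_silence() matches that) before
# calling this.

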
def replace_audio_in_video(video_path: str, audio_path: str, out_video: str):
    """Replace the video's audio track with `audio_path`, re-encoding audio to AAC."""
    cmd = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-i", audio_path,
        "-map", "0:v:0",  # video stream from the original file
        "-map", "1:a:0",  # audio stream from the dub
        "-c:v", "copy",
        "-c:a", "aac",
        "-b:a", "192k",
        "-shortest",      # stop at the end of the shorter stream
        out_video,
    ]
    subprocess.run(cmd, check=True)


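# Variation (untested sketch): to keep the original audio as a second track
# instead of discarding it, map both audio streams, e.g.:
#
#   ffmpeg -y -i video.mp4 -i dub.wav -map 0:v:0 -map 1:a:0 -map 0:a:0 \
#          -c:v copy -c:a aac -b:a 192k out.mp4

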
def normalize_segments(segments) -> List[Dict]:
    """Normalize Whisper output (dicts or faster-whisper Segment objects) to plain dicts."""
    out = []
    for s in segments:
        if isinstance(s, dict):
            start = s.get("start")
            end = s.get("end")
            text = s.get("text", "")
        else:
            # faster-whisper Segment object
            start = getattr(s, "start", None)
            end = getattr(s, "end", None)
            text = getattr(s, "text", "")
        if start is None or end is None:
            continue
        out.append({"start": float(start), "end": float(end), "text": str(text).strip()})
    return out


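# Example (illustrative): both input shapes normalize to the same dict form:
#
#   normalize_segments([{"start": 0, "end": 1.5, "text": " Hola "}])
#   # -> [{"start": 0.0, "end": 1.5, "text": "Hola"}]

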
def main():
    parser = argparse.ArgumentParser(description="Dub a video with Kokoro and burn in the translated SRT")
    parser.add_argument("--video", "-v", required=True)
    parser.add_argument("--out", "-o", default=None, help="Final output video (audio replaced, SRT burned in)")
    parser.add_argument("--temp-dub", default=None, help="Path for the generated dub audio (if you want to keep it)")
    parser.add_argument("--kokoro-endpoint", required=True, help="URL of the /api/v1/audio/speech endpoint")
    parser.add_argument("--api-key", required=True, help="Token for 'Authorization: Bearer <token>'")
    parser.add_argument("--model", default="model", help="Kokoro model to use ('model' is the fp32 326MB build)")
    parser.add_argument("--voice", default="em_alex", help="Voice id to use (em_alex)")
    parser.add_argument(
        "--whisper-backend",
        choices=["faster-whisper", "openai-whisper"],
        default="faster-whisper",
    )
    parser.add_argument("--whisper-model", default="base")

    # Gemini options
    parser.add_argument(
        "--use-gemini",
        action="store_true",
        help="Translate segments over HTTP with Gemini instead of Whisper's translate task",
    )
    parser.add_argument("--gemini-api-key", default=None, help="API key for Gemini (Bearer or Google 'AIza' key)")
    parser.add_argument(
        "--gemini-model",
        default="gemini-2.5-flash",
        help="Gemini model to use (default: gemini-2.5-flash)",
    )

    args = parser.parse_args()

    ensure_ffmpeg()

    video = Path(args.video)
    if not video.exists():
        print("Video not found", file=sys.stderr)
        sys.exit(2)

    out_video = args.out if args.out else str(video.with_name(video.stem + "_dubbed.mp4"))
    tmpdir = tempfile.mkdtemp(prefix="dub_and_burn_")

    try:
        audio_wav = os.path.join(tmpdir, "extracted_audio.wav")
        print("Extracting audio...")
        extract_audio(str(video), audio_wav)

        print("Transcribing (and translating, unless Gemini is used)...")

        # With --use-gemini we transcribe only, then translate each segment with Gemini.
        if args.use_gemini:
            # The key may also come from the GEMINI_API_KEY environment variable.
            if not args.gemini_api_key:
                args.gemini_api_key = os.environ.get("GEMINI_API_KEY")
            if not args.gemini_api_key:
                print("--use-gemini requires --gemini-api-key or the GEMINI_API_KEY environment variable", file=sys.stderr)
                sys.exit(4)
            # Transcribe without translating.
            from faster_whisper import WhisperModel

            wm = WhisperModel(args.whisper_model, device="cpu", compute_type="int8")
            segments, _info = wm.transcribe(audio_wav, beam_size=5, task="transcribe")
        else:
            if args.whisper_backend == "faster-whisper":
                segments = transcribe_and_translate_faster(audio_wav, args.whisper_model, "es")
            else:
                segments = transcribe_and_translate_openai(audio_wav, args.whisper_model, "es")

        segs = normalize_segments(segments)
        # Note: faster-whisper returns a generator, so emptiness can only be
        # checked after normalize_segments() has consumed it.
        if not segs:
            print("No segments produced; aborting", file=sys.stderr)
            sys.exit(3)

        # If Gemini is enabled, translate each segment now.
        if args.use_gemini:
            print(f"Translating {len(segs)} segments with Gemini (model={args.gemini_model})...")
            for s in segs:
                try:
                    src = s.get("text", "")
                    if src:
                        s["text"] = translate_with_gemini(src, "es", args.gemini_api_key, model=args.gemini_model)
                except Exception as e:
                    print(f"Warning: Gemini failed on a segment: {e}")

        # Write the translated SRT.
        srt_out = os.path.join(tmpdir, "translated.srt")
        write_srt(segs, srt_out)
        print(f"Translated SRT saved to: {srt_out}")

        # Synthesize each segment.
        chunk_files = []
        print(f"Synthesizing {len(segs)} segments with Kokoro (voice={args.voice})...")
        prev_end = 0.0
        for i, s in enumerate(segs, start=1):
            # Fill any gap before this segment with silence so the
            # concatenated track stays aligned with the video timeline.
            gap = s["start"] - prev_end
            if gap > 0.05:
                gap_wav = os.path.join(tmpdir, f"gap_{i:04d}.wav")
                make_silence(gap_wav, gap)
                chunk_files.append(gap_wav)
            prev_end = s["end"]

            target_dur = s["end"] - s["start"]
            text = s.get("text", "")
            raw = None
            if text:
                try:
                    raw = synthesize_segment_kokoro(args.kokoro_endpoint, args.api_key, args.model, args.voice, text)
                except Exception as e:
                    print(f"Error synthesizing segment {i}: {e}")

            if raw is None:
                # Empty text or TTS failure: substitute silence of the segment's duration.
                silent = os.path.join(tmpdir, f"chunk_{i:04d}.wav")
                make_silence(silent, target_dur)
                chunk_files.append(silent)
                print(f"  - Segment {i}: silence {target_dur:.2f}s")
                continue

            # Save the raw response, then convert to a standard WAV (22050 Hz mono s16).
            tmp_chunk = os.path.join(tmpdir, f"raw_chunk_{i:04d}.bin")
            with open(tmp_chunk, "wb") as f:
                f.write(raw)
            tmp_wav = os.path.join(tmpdir, f"tmp_chunk_{i:04d}.wav")
            cmdc = ["ffmpeg", "-y", "-i", tmp_chunk, "-ar", "22050", "-ac", "1", "-sample_fmt", "s16", tmp_wav]
            subprocess.run(cmdc, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

            # Fit the chunk to the segment's duration.
            final_chunk = os.path.join(tmpdir, f"chunk_{i:04d}.wav")
            pad_or_trim(tmp_wav, final_chunk, target_dur, sr=22050)
            chunk_files.append(final_chunk)
            print(f"  - Segment {i}/{len(segs)} -> {os.path.basename(final_chunk)}")

        # Concatenate the chunks.
        dub_wav = args.temp_dub if args.temp_dub else os.path.join(tmpdir, "dub_final.wav")
        print("Concatenating chunks...")
        concat_chunks(chunk_files, dub_wav)
        print(f"Dub track written to: {dub_wav}")

        # Replace the audio track in the video.
        replaced = os.path.join(tmpdir, "video_replaced.mp4")
        print("Replacing the video's audio track...")
        replace_audio_in_video(str(video), dub_wav, replaced)

        # Burn in the translated SRT.
        print("Burning the translated SRT into the video...")
        burn_subtitles(replaced, srt_out, out_video)

        print(f"Final video written to: {out_video}")

    finally:
        try:
            shutil.rmtree(tmpdir)
        except OSError:
            pass


if __name__ == "__main__":
    main()