#!/usr/bin/env python3
"""
dub_and_burn.py

Automated pipeline:
 - Extract the audio track from the video
 - Transcribe and translate with Whisper (via the process_video helpers)
 - Synthesize each segment with Kokoro (/api/v1/audio/speech) using voice=em_alex
 - Fit each chunk to the segment duration (pad/trim)
 - Concatenate the chunks and replace the video's audio track
 - Generate a translated SRT and burn it into the final video

Requirements:
 - ffmpeg / ffprobe on PATH
 - The project's Python venv with requests and srt installed (the venv already exists)

Example usage:
python3 dub_and_burn.py --video input.mp4 --out out_dubbed.mp4 \
    --kokoro-endpoint "https://kokoro.bfzqqk.easypanel.host/api/v1/audio/speech" \
    --api-key "048665fa9596db326c17c6f5f84d7d03" \
    --voice em_alex --model model_q8f16
"""

import argparse
import base64
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Dict, List

import requests

# Transcription/translation helpers from process_video
from whisper_project.process_video import (
    extract_audio,
    transcribe_and_translate_faster,
    transcribe_and_translate_openai,
    burn_subtitles,
)

# Reuse write_srt from the transcribe module
from whisper_project.transcribe import write_srt


def ensure_ffmpeg():
    if shutil.which("ffmpeg") is None or shutil.which("ffprobe") is None:
        print("ffmpeg/ffprobe not found on PATH. Install them.")
        sys.exit(1)


def get_duration(path: str) -> float:
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        path,
    ]
    p = subprocess.run(cmd, capture_output=True, text=True)
    if p.returncode != 0:
        return 0.0
    try:
        return float(p.stdout.strip())
    except Exception:
        return 0.0


def pad_or_trim(in_path: str, out_path: str, target_duration: float, sr: int = 22050):
    cur = get_duration(in_path)
    if cur == 0.0:
        # Unknown duration: copy as-is
        shutil.copy(in_path, out_path)
        return True
    if abs(cur - target_duration) < 0.02:
        # Close enough; copy unchanged
        shutil.copy(in_path, out_path)
        return True
    if cur > target_duration:
        # Trim to the target duration
        cmd = ["ffmpeg", "-y", "-i", in_path, "-t", f"{target_duration}", out_path]
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return True
    # Pad: generate silence for the missing duration and concatenate
    pad = target_duration - cur
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as sil:
        sil_path = sil.name
    listname = None
    try:
        cmd1 = [
            "ffmpeg", "-y",
            "-f", "lavfi",
            "-i", f"anullsrc=channel_layout=mono:sample_rate={sr}",
            "-t", f"{pad}",
            "-c:a", "pcm_s16le",
            sil_path,
        ]
        subprocess.run(cmd1, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        # Concatenate in_path + sil_path via the concat demuxer
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as listf:
            listf.write(f"file '{os.path.abspath(in_path)}'\n")
            listf.write(f"file '{os.path.abspath(sil_path)}'\n")
            listname = listf.name
        cmd2 = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listname, "-c", "copy", out_path]
        subprocess.run(cmd2, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    finally:
        try:
            os.remove(sil_path)
        except Exception:
            pass
        if listname:
            try:
                os.remove(listname)
            except Exception:
                pass
    return True


def synthesize_segment_kokoro(endpoint: str, api_key: str, model: str, voice: str, text: str) -> bytes:
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json", "Accept": "*/*"}
    payload = {"model": model, "voice": voice, "input": text, "response_format": "wav"}
    r = requests.post(endpoint, json=payload, headers=headers, timeout=120)
    r.raise_for_status()
    # If the response is raw audio, return it directly
    ctype = r.headers.get("Content-Type", "")
    if ctype.startswith("audio/"):
        return r.content
    # Otherwise try JSON carrying base64-encoded audio
    try:
        j = r.json()
        for k in ("audio", "wav", "data", "base64"):
            if k in j:
                return base64.b64decode(j[k])
    except Exception:
        pass
    # Fallback: return the raw body
    return r.content
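

# Hedged sketch, not part of the original flow: a thin retry wrapper around
# synthesize_segment_kokoro for flaky TTS endpoints. The attempt count and the
# linear backoff are illustrative assumptions, not something the Kokoro API requires.
def synthesize_segment_with_retries(endpoint: str, api_key: str, model: str, voice: str,
                                    text: str, attempts: int = 3, backoff_s: float = 2.0) -> bytes:
    import time

    last_exc: Exception = RuntimeError("no attempts made")
    for attempt in range(1, attempts + 1):
        try:
            return synthesize_segment_kokoro(endpoint, api_key, model, voice, text)
        except Exception as exc:  # network errors, or HTTP errors via raise_for_status
            last_exc = exc
            time.sleep(backoff_s * attempt)  # wait a bit longer after each failure
    raise last_exc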


def translate_with_gemini(text: str, target_lang: str, api_key: str, model: str = "gemini-2.5-flash") -> str:
    """Translate a text into the target language via the Gemini HTTP API.

    Notes:
    - Assumes an endpoint compatible with the Google Gemini HTTP API (or an OpenAI-like one).
    - The `model` parameter defaults to 'gemini-2.5-flash' as requested.
    """
    # If the API key looks like a Google key (starts with 'AIza'), use Google's
    # Generative Language API with the key passed as a query parameter.
    # Otherwise fall back to an OpenAI-like Responses endpoint: the default is
    # https://api.openai.com/v1/responses, which many installations expose; if
    # your Gemini instance needs a different URL, set the GEMINI_ENDPOINT
    # environment variable (or modify this function).
    try:
        if api_key and api_key.startswith("AIza"):
            # Format: https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key=API_KEY
            gl_endpoint = (
                f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={api_key}"
            )
            body = {
                "contents": [
                    {
                        "parts": [
                            {
                                "text": (
                                    f"Translate the following text into {target_lang}; "
                                    f"return only the translated text:\n\n{text}"
                                )
                            }
                        ]
                    }
                ],
                "generationConfig": {
                    "temperature": 0.0,
                    "maxOutputTokens": 1024,
                    "candidateCount": 1,
                },
            }
            r = requests.post(gl_endpoint, json=body, timeout=20)
            r.raise_for_status()
            j = r.json()
            # The response normally carries 'candidates' -> 'content' -> 'parts' -> 'text'
            if isinstance(j, dict):
                candidates = j.get("candidates")
                if isinstance(candidates, list) and candidates:
                    first = candidates[0]
                    if isinstance(first, dict):
                        content = first.get("content")
                        # Several shapes are possible depending on the API version
                        if isinstance(content, str):
                            return content.strip()
                        if isinstance(content, dict):
                            parts = [
                                p.get("text")
                                for p in content.get("parts", [])
                                if isinstance(p, dict) and isinstance(p.get("text"), str)
                            ]
                            if parts:
                                return "\n".join(parts).strip()
                        if isinstance(first.get("output"), str):
                            return first["output"].strip()
                # Fallback: look for common flat fields
                for key in ("output_text", "text", "response", "translated_text"):
                    if key in j and isinstance(j[key], str):
                        return j[key].strip()
            return text

        # Not a Google API key: try an OpenAI-like Responses API
        gemini_endpoint = os.environ.get("GEMINI_ENDPOINT", "https://api.openai.com/v1/responses")
        headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
        prompt = (
            f"Translate the following text into {target_lang}. "
            f"Return only the translated text, with no additions:\n\n{text}"
        )
        payload = {"model": model, "input": prompt, "max_output_tokens": 1024}
        r = requests.post(gemini_endpoint, json=payload, headers=headers, timeout=20)
        r.raise_for_status()
        j = r.json()
        if isinstance(j, dict):
            if "output" in j and isinstance(j["output"], list):
                for item in j["output"]:
                    if isinstance(item, dict) and "content" in item:
                        cont = item["content"]
                        if isinstance(cont, list):
                            texts = [c.get("text") for c in cont if isinstance(c, dict) and c.get("text")]
                            if texts:
                                return "\n".join(texts).strip()
                        elif isinstance(cont, str):
                            return cont.strip()
            for key in ("output_text", "text", "response", "translated_text"):
                if key in j and isinstance(j[key], str):
                    return j[key].strip()
        if isinstance(j, list) and j and isinstance(j[0], str):
            return j[0]
        if isinstance(j, str):
            return j
    except Exception as e:
        print(f"Warning: Gemini translation failed: {e}")
    return text
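

# Hedged sketch, assumed helper (not in the original flow): memoize translations
# so repeated segment texts hit the Gemini endpoint only once per run.
_translation_cache: Dict[str, str] = {}


def translate_with_gemini_cached(text: str, target_lang: str, api_key: str,
                                 model: str = "gemini-2.5-flash") -> str:
    cache_key = f"{model}|{target_lang}|{text}"
    if cache_key not in _translation_cache:
        _translation_cache[cache_key] = translate_with_gemini(text, target_lang, api_key, model=model)
    return _translation_cache[cache_key]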


def concat_chunks(chunk_files: List[str], out_path: str):
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as listf:
        for c in chunk_files:
            listf.write(f"file '{os.path.abspath(c)}'\n")
        listname = listf.name
    cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listname, "-c", "copy", out_path]
    subprocess.run(cmd, check=True)
    try:
        os.remove(listname)
    except Exception:
        pass


def replace_audio_in_video(video_path: str, audio_path: str, out_video: str):
    cmd = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-i", audio_path,
        "-map", "0:v:0",
        "-map", "1:a:0",
        "-c:v", "copy",
        "-c:a", "aac",
        "-b:a", "192k",
        "-shortest",
        out_video,
    ]
    subprocess.run(cmd, check=True)


def normalize_segments(segments) -> List[Dict]:
    out = []
    for s in segments:
        if isinstance(s, dict):
            start = s.get("start")
            end = s.get("end")
            text = s.get("text", "")
        else:
            # faster-whisper Segment object
            start = getattr(s, "start", None)
            end = getattr(s, "end", None)
            text = getattr(s, "text", "")
        if start is None or end is None:
            continue
        out.append({"start": float(start), "end": float(end), "text": str(text).strip()})
    return out
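

# Illustrative example (assumed data, not from a real run): normalize_segments
# accepts both plain dicts and faster-whisper Segment objects and returns a
# uniform list of {"start", "end", "text"} dicts with the text stripped, e.g.
#
#   normalize_segments([{"start": 0.0, "end": 1.5, "text": " Hola "}])
#   -> [{"start": 0.0, "end": 1.5, "text": "Hola"}]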


def write_silence_wav(path: str, duration: float, sr: int = 22050):
    """Write a mono 16-bit PCM WAV containing `duration` seconds of silence."""
    cmd = [
        "ffmpeg", "-y",
        "-f", "lavfi",
        "-i", f"anullsrc=channel_layout=mono:sample_rate={sr}",
        "-t", f"{duration}",
        "-c:a", "pcm_s16le",
        path,
    ]
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


def main():
    parser = argparse.ArgumentParser(description="Dub a video with Kokoro and burn in the translated SRT")
    parser.add_argument("--video", "-v", required=True)
    parser.add_argument("--out", "-o", default=None, help="Final output video (audio replaced, SRT burned in)")
    parser.add_argument("--temp-dub", default=None, help="Temporary generated audio file (if you want to keep it)")
    parser.add_argument("--kokoro-endpoint", required=True, help="URL of the /api/v1/audio/speech endpoint")
    parser.add_argument("--api-key", required=True, help="Token for the Authorization: Bearer header")
    parser.add_argument("--model", default="model", help="Kokoro model to use ('model' is the fp32, 326 MB build)")
    parser.add_argument("--voice", default="em_alex", help="Voice id to use (em_alex)")
    parser.add_argument(
        "--whisper-backend",
        choices=["faster-whisper", "openai-whisper"],
        default="faster-whisper",
    )
    parser.add_argument("--whisper-model", default="base")
    # Gemini options
    parser.add_argument(
        "--use-gemini",
        action="store_true",
        help="Use Gemini (HTTP) to translate segments instead of Whisper translate",
    )
    parser.add_argument("--gemini-api-key", default=None, help="API key for Gemini (Bearer)")
    parser.add_argument(
        "--gemini-model",
        default="gemini-2.5-flash",
        help="Gemini model to use (default: gemini-2.5-flash)",
    )
    args = parser.parse_args()

    ensure_ffmpeg()

    video = Path(args.video)
    if not video.exists():
        print("Video not found", file=sys.stderr)
        sys.exit(2)

    out_video = args.out if args.out else str(video.with_name(video.stem + "_dubbed.mp4"))

    tmpdir = tempfile.mkdtemp(prefix="dub_and_burn_")
    try:
        audio_wav = os.path.join(tmpdir, "extracted_audio.wav")
        print("Extracting audio...")
        extract_audio(str(video), audio_wav)

        print("Transcribing (and translating, unless Gemini is used)...")
        # If Gemini was requested, transcribe only, then translate per segment with Gemini
        if args.use_gemini:
            # Allow passing the key via the GEMINI_API_KEY environment variable
            if not args.gemini_api_key:
                args.gemini_api_key = os.environ.get("GEMINI_API_KEY")
            if not args.gemini_api_key:
                print("--use-gemini requires --gemini-api-key or the GEMINI_API_KEY environment variable", file=sys.stderr)
                sys.exit(4)
            # Transcribe without translating
            from faster_whisper import WhisperModel

            wm = WhisperModel(args.whisper_model, device="cpu", compute_type="int8")
            segments, _info = wm.transcribe(audio_wav, beam_size=5, task="transcribe")
        else:
            if args.whisper_backend == "faster-whisper":
                segments = transcribe_and_translate_faster(audio_wav, args.whisper_model, "es")
            else:
                segments = transcribe_and_translate_openai(audio_wav, args.whisper_model, "es")

        # Normalize first: faster-whisper returns a generator, which is always truthy
        segs = normalize_segments(segments)
        if not segs:
            print("No segments produced; aborting", file=sys.stderr)
            sys.exit(3)

        # When using Gemini, translate segment by segment now
        if args.use_gemini:
            print(f"Translating {len(segs)} segments with Gemini (model={args.gemini_model})...")
            for s in segs:
                try:
                    src = s.get("text", "")
                    if src:
                        s["text"] = translate_with_gemini(src, "es", args.gemini_api_key, model=args.gemini_model)
                except Exception as e:
                    print(f"Warning: Gemini failed on a segment: {e}")

        # Write the translated SRT
        srt_out = os.path.join(tmpdir, "translated.srt")
        write_srt(segs, srt_out)
        print(f"Translated SRT saved to: {srt_out}")

        # Synthesize segment by segment
        chunk_files = []
        print(f"Synthesizing {len(segs)} segments with Kokoro (voice={args.voice})...")
        for i, s in enumerate(segs, start=1):
            text = s.get("text", "")
            target_dur = s["end"] - s["start"]
            if not text:
                # No text: emit silence matching the segment duration
                silent = os.path.join(tmpdir, f"chunk_{i:04d}.wav")
                write_silence_wav(silent, target_dur)
                chunk_files.append(silent)
                print(f" - Segment {i}: silence {target_dur}s")
                continue
            try:
                raw = synthesize_segment_kokoro(args.kokoro_endpoint, args.api_key, args.model, args.voice, text)
            except Exception as e:
                print(f"Error synthesizing segment {i}: {e}")
                # Fallback: emit silence instead
                silent = os.path.join(tmpdir, f"chunk_{i:04d}.wav")
                write_silence_wav(silent, target_dur)
                chunk_files.append(silent)
                continue
            # Save the raw response to a temp file
            tmp_chunk = os.path.join(tmpdir, f"raw_chunk_{i:04d}.bin")
            with open(tmp_chunk, "wb") as f:
                f.write(raw)
            # Convert to a standard WAV (22050 Hz, mono, 16-bit)
            tmp_wav = os.path.join(tmpdir, f"tmp_chunk_{i:04d}.wav")
            cmdc = ["ffmpeg", "-y", "-i", tmp_chunk, "-ar", "22050", "-ac", "1", "-sample_fmt", "s16", tmp_wav]
            subprocess.run(cmdc, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
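            # The synthesized speech rarely matches the segment window exactly,
            # so each chunk is fitted to its segment to keep the concatenated dub
            # aligned with the video timeline: longer chunks are trimmed (which
            # can clip speech), shorter ones get trailing silence appended.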
s["start"] final_chunk = os.path.join(tmpdir, f"chunk_{i:04d}.wav") pad_or_trim(tmp_wav, final_chunk, target_dur, sr=22050) chunk_files.append(final_chunk) print(f" - Segmento {i}/{len(segs)} -> {os.path.basename(final_chunk)}") # concatenar chunks dub_wav = args.temp_dub if args.temp_dub else os.path.join(tmpdir, "dub_final.wav") print("Concatenando chunks...") concat_chunks(chunk_files, dub_wav) print(f"Archivo dub generado en: {dub_wav}") # reemplazar audio en el vídeo replaced = os.path.join(tmpdir, "video_replaced.mp4") print("Reemplazando pista de audio en el vídeo...") replace_audio_in_video(str(video), dub_wav, replaced) # quemar SRT traducido print("Quemando SRT traducido en el vídeo...") burn_subtitles(replaced, srt_out, out_video) print(f"Vídeo final generado: {out_video}") finally: try: shutil.rmtree(tmpdir) except Exception: pass if __name__ == '__main__': main()