submaster/whisper_project/dub_and_burn.py
#!/usr/bin/env python3
"""
dub_and_burn.py
Flujo automatizado:
- Extrae audio del vídeo
- Transcribe y traduce con Whisper (usando process_video helpers)
- Sintetiza cada segmento con Kokoro (/api/v1/audio/speech) usando voice=em_alex
- Ajusta cada chunk a la duración del segmento (pad/trim)
- Concatena los chunks y reemplaza la pista de audio en el vídeo
- Genera SRT traducido y lo quema en el vídeo final
Requisitos:
- ffmpeg / ffprobe en PATH
- Python venv del proyecto con requests y srt instalados (el venv se creó ya)
Uso ejemplo:
python3 dub_and_burn.py --video input.mp4 --out out_dubbed.mp4 \
--kokoro-endpoint "https://kokoro.bfzqqk.easypanel.host/api/v1/audio/speech" \
--api-key "048665fa9596db326c17c6f5f84d7d03" \
--voice em_alex --model model_q8f16
"""
import argparse
import base64
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Dict, List

import requests

# Translation/transcription helpers from process_video
from whisper_project.process_video import (
    extract_audio,
    transcribe_and_translate_faster,
    transcribe_and_translate_openai,
    burn_subtitles,
)
# write_srt lives in the transcribe module
from whisper_project.transcribe import write_srt

def ensure_ffmpeg():
    """Exit early if ffmpeg/ffprobe are not available on PATH."""
    if shutil.which("ffmpeg") is None or shutil.which("ffprobe") is None:
        print("ffmpeg/ffprobe not found on PATH. Install them.")
        sys.exit(1)

def get_duration(path: str) -> float:
    """Return the media duration in seconds via ffprobe, or 0.0 on failure."""
    cmd = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        path,
    ]
    p = subprocess.run(cmd, capture_output=True, text=True)
    if p.returncode != 0:
        return 0.0
    try:
        return float(p.stdout.strip())
    except ValueError:
        return 0.0

def pad_or_trim(in_path: str, out_path: str, target_duration: float, sr: int = 22050):
    """Fit an audio file to `target_duration` seconds.

    Shorter inputs are padded with trailing silence; longer ones are trimmed.
    Inputs within 20 ms of the target are copied unchanged.
    """
    cur = get_duration(in_path)
    if cur == 0.0:
        # Duration unknown: copy as-is
        shutil.copy(in_path, out_path)
        return True
    if abs(cur - target_duration) < 0.02:
        # Close enough: copy unchanged
        shutil.copy(in_path, out_path)
        return True
    if cur > target_duration:
        # Trim the excess
        cmd = ["ffmpeg", "-y", "-i", in_path, "-t", f"{target_duration}", out_path]
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return True
    # Pad: generate silence for the missing duration and concatenate
    pad = target_duration - cur
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as sil:
        sil_path = sil.name
    listname = None
    try:
        cmd1 = [
            "ffmpeg",
            "-y",
            "-f",
            "lavfi",
            "-i",
            f"anullsrc=channel_layout=mono:sample_rate={sr}",
            "-t",
            f"{pad}",
            "-c:a",
            "pcm_s16le",
            sil_path,
        ]
        subprocess.run(cmd1, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        # Concatenate in_path + sil_path via the concat demuxer
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as listf:
            listf.write(f"file '{os.path.abspath(in_path)}'\n")
            listf.write(f"file '{os.path.abspath(sil_path)}'\n")
            listname = listf.name
        cmd2 = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listname, "-c", "copy", out_path]
        subprocess.run(cmd2, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    finally:
        try:
            os.remove(sil_path)
        except OSError:
            pass
        if listname is not None:
            try:
                os.remove(listname)
            except OSError:
                pass
    return True

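# Usage sketch (hypothetical filenames): fit a synthesized chunk to a 3.25 s
# subtitle segment. The result is safe to feed to concat_chunks() because
# every chunk then shares the same codec/sample-rate/channel layout:
#
#   pad_or_trim("tts_chunk.wav", "chunk_0001.wav", target_duration=3.25)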
def synthesize_segment_kokoro(endpoint: str, api_key: str, model: str, voice: str, text: str) -> bytes:
    """POST one segment to Kokoro's speech endpoint and return the audio bytes."""
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json", "Accept": "*/*"}
    payload = {"model": model, "voice": voice, "input": text, "response_format": "wav"}
    r = requests.post(endpoint, json=payload, headers=headers, timeout=120)
    r.raise_for_status()
    # Raw audio response
    ctype = r.headers.get("Content-Type", "")
    if ctype.startswith("audio/"):
        return r.content
    # Otherwise, try base64-encoded audio inside a JSON body
    try:
        j = r.json()
        for k in ("audio", "wav", "data", "base64"):
            if k in j:
                return base64.b64decode(j[k])
    except Exception:
        pass
    # Fallback: assume the body is the audio itself
    return r.content

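# Example call (hypothetical endpoint and token, for illustration only):
#
#   wav_bytes = synthesize_segment_kokoro(
#       "https://kokoro.example.com/api/v1/audio/speech",
#       api_key="<token>", model="model_q8f16", voice="em_alex",
#       text="Hola mundo",
#   )
#   Path("hello.wav").write_bytes(wav_bytes)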
def translate_with_gemini(text: str, target_lang: str, api_key: str, model: str = "gemini-2.5-flash") -> str:
    """Translate `text` into `target_lang` via an HTTP Gemini API.

    Notes:
    - If the API key looks like a Google key (it starts with 'AIza'), the Google
      Generative Language API is called with the key as a query parameter.
    - Otherwise an OpenAI-compatible Responses endpoint is assumed; override it
      with the GEMINI_ENDPOINT environment variable.
    - The default `model` is 'gemini-2.5-flash', as requested.
    - On any failure the original text is returned unchanged.
    """
    prompt = (
        f"Translate the following text into {target_lang}. "
        f"Return only the translated text, with no additions:\n\n{text}"
    )
    try:
        if api_key and api_key.startswith("AIza"):
            # Google Generative Language REST shape:
            # https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key=API_KEY
            gl_endpoint = (
                f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={api_key}"
            )
            body = {
                "contents": [{"parts": [{"text": prompt}]}],
                "generationConfig": {
                    "temperature": 0.0,
                    "maxOutputTokens": 1024,
                    "candidateCount": 1,
                },
            }
            r = requests.post(gl_endpoint, json=body, timeout=20)
            r.raise_for_status()
            j = r.json()
            # Typical response: candidates[0].content.parts[*].text
            if isinstance(j, dict):
                candidates = j.get("candidates")
                if isinstance(candidates, list) and candidates:
                    first = candidates[0]
                    if isinstance(first, dict):
                        content = first.get("content")
                        if isinstance(content, str):
                            return content.strip()
                        if isinstance(content, dict):
                            parts = [
                                p.get("text")
                                for p in content.get("parts", [])
                                if isinstance(p, dict) and isinstance(p.get("text"), str)
                            ]
                            if parts:
                                return "\n".join(parts).strip()
                # Fallback: common top-level text fields
                for key in ("output_text", "text", "response", "translated_text"):
                    if isinstance(j.get(key), str):
                        return j[key].strip()
            return text
        # Not a Google API key: try an OpenAI-like Responses endpoint
        gemini_endpoint = os.environ.get("GEMINI_ENDPOINT", "https://api.openai.com/v1/responses")
        headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
        payload = {"model": model, "input": prompt, "max_output_tokens": 1024}
        r = requests.post(gemini_endpoint, json=payload, headers=headers, timeout=20)
        r.raise_for_status()
        j = r.json()
        if isinstance(j, dict):
            if isinstance(j.get("output"), list):
                for item in j["output"]:
                    if isinstance(item, dict) and "content" in item:
                        cont = item["content"]
                        if isinstance(cont, list):
                            texts = [c.get("text") for c in cont if isinstance(c, dict) and c.get("text")]
                            if texts:
                                return "\n".join(texts).strip()
                        elif isinstance(cont, str):
                            return cont.strip()
            for key in ("output_text", "text", "response", "translated_text"):
                if isinstance(j.get(key), str):
                    return j[key].strip()
        if isinstance(j, list) and j and isinstance(j[0], str):
            return j[0]
        if isinstance(j, str):
            return j
    except Exception as e:
        print(f"Warning: Gemini translation failed: {e}")
    return text

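# Usage sketch (hypothetical keys, for illustration only): a Google-style key
# routes to the Generative Language API, anything else to the Responses path.
#
#   translate_with_gemini("Hello world", "es", "AIza...")   # Google path
#   translate_with_gemini("Hello world", "es", "sk-...")    # OpenAI-like path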
def concat_chunks(chunk_files: List[str], out_path: str):
    """Concatenate WAV chunks losslessly with ffmpeg's concat demuxer.

    `-c copy` requires every chunk to share codec, sample rate and channel
    layout, which the synthesis/pad pipeline guarantees (22050 Hz mono s16).
    """
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as listf:
        for c in chunk_files:
            listf.write(f"file '{os.path.abspath(c)}'\n")
        listname = listf.name
    cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listname, "-c", "copy", out_path]
    subprocess.run(cmd, check=True)
    try:
        os.remove(listname)
    except OSError:
        pass

def replace_audio_in_video(video_path: str, audio_path: str, out_video: str):
    """Swap the video's audio track for `audio_path` (video copied, audio re-encoded to AAC)."""
    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        video_path,
        "-i",
        audio_path,
        "-map",
        "0:v:0",
        "-map",
        "1:a:0",
        "-c:v",
        "copy",
        "-c:a",
        "aac",
        "-b:a",
        "192k",
        "-shortest",
        out_video,
    ]
    subprocess.run(cmd, check=True)
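# For reference, with input.mp4 and dub.wav this composes:
#   ffmpeg -y -i input.mp4 -i dub.wav -map 0:v:0 -map 1:a:0 \
#          -c:v copy -c:a aac -b:a 192k -shortest out.mp4
# `-shortest` stops muxing at the shorter stream, so a dub that runs slightly
# long cannot extend the video.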
def normalize_segments(segments) -> List[Dict]:
    """Normalize segments (dicts or faster-whisper Segment objects) into dicts."""
    out = []
    for s in segments:
        if isinstance(s, dict):
            start = s.get("start")
            end = s.get("end")
            text = s.get("text", "")
        else:
            # faster-whisper Segment object
            start = getattr(s, "start", None)
            end = getattr(s, "end", None)
            text = getattr(s, "text", "")
        if start is None or end is None:
            continue
        out.append({"start": float(start), "end": float(end), "text": str(text).strip()})
    return out

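# Shape sketch: both input styles normalize to the same dict form, e.g.
#   normalize_segments([{"start": 0.0, "end": 1.5, "text": " Hi "}])
#   -> [{"start": 0.0, "end": 1.5, "text": "Hi"}]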
def main():
    parser = argparse.ArgumentParser(description="Dub a video with Kokoro and burn in the translated SRT")
    parser.add_argument("--video", "-v", required=True)
    parser.add_argument("--out", "-o", default=None, help="Final output video (audio replaced, SRT burned in)")
    parser.add_argument("--temp-dub", default=None, help="Generated dub audio file (if you want to keep it)")
    parser.add_argument("--kokoro-endpoint", required=True, help="URL of the /api/v1/audio/speech endpoint")
    parser.add_argument("--api-key", required=True, help="Token for Authorization: Bearer <token>")
    parser.add_argument("--model", default="model", help="Kokoro model to use ('model' is the fp32 326 MB build)")
    parser.add_argument("--voice", default="em_alex", help="Voice id to use (em_alex)")
    parser.add_argument(
        "--whisper-backend",
        choices=["faster-whisper", "openai-whisper"],
        default="faster-whisper",
    )
    parser.add_argument("--whisper-model", default="base")
    # Gemini options
    parser.add_argument(
        "--use-gemini",
        action="store_true",
        help="Use Gemini (HTTP) to translate segments instead of Whisper translate",
    )
    parser.add_argument("--gemini-api-key", default=None, help="API key for Gemini (Bearer)")
    parser.add_argument(
        "--gemini-model",
        default="gemini-2.5-flash",
        help="Gemini model to use (default: gemini-2.5-flash)",
    )
    args = parser.parse_args()
    ensure_ffmpeg()

    video = Path(args.video)
    if not video.exists():
        print("Video not found", file=sys.stderr)
        sys.exit(2)
    out_video = args.out if args.out else str(video.with_name(video.stem + "_dubbed.mp4"))
    tmpdir = tempfile.mkdtemp(prefix="dub_and_burn_")

    def write_silence(path: str, duration: float):
        # Generate a silent mono 22050 Hz WAV of the requested duration
        cmd = [
            "ffmpeg",
            "-y",
            "-f",
            "lavfi",
            "-i",
            "anullsrc=channel_layout=mono:sample_rate=22050",
            "-t",
            f"{duration}",
            "-c:a",
            "pcm_s16le",
            path,
        ]
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    try:
        audio_wav = os.path.join(tmpdir, "extracted_audio.wav")
        print("Extracting audio...")
        extract_audio(str(video), audio_wav)

        print("Transcribing (and translating unless Gemini is used)...")
        # With Gemini requested, transcribe only and translate per segment afterwards
        if args.use_gemini:
            # Allow passing the key via the GEMINI_API_KEY environment variable
            if not args.gemini_api_key:
                args.gemini_api_key = os.environ.get("GEMINI_API_KEY")
            if not args.gemini_api_key:
                print("--use-gemini requires --gemini-api-key or the GEMINI_API_KEY environment variable", file=sys.stderr)
                sys.exit(4)
            # Transcribe without translating
            from faster_whisper import WhisperModel
            wm = WhisperModel(args.whisper_model, device="cpu", compute_type="int8")
            segments, _info = wm.transcribe(audio_wav, beam_size=5, task="transcribe")
        else:
            if args.whisper_backend == "faster-whisper":
                segments = transcribe_and_translate_faster(audio_wav, args.whisper_model, "es")
            else:
                segments = transcribe_and_translate_openai(audio_wav, args.whisper_model, "es")
        if not segments:
            print("No segments produced; aborting", file=sys.stderr)
            sys.exit(3)
        segs = normalize_segments(segments)

        # When using Gemini, translate each segment now
        if args.use_gemini:
            print(f"Translating {len(segs)} segments with Gemini (model={args.gemini_model})...")
            for s in segs:
                try:
                    src = s.get("text", "")
                    if src:
                        s["text"] = translate_with_gemini(src, "es", args.gemini_api_key, model=args.gemini_model)
                except Exception as e:
                    print(f"Warning: Gemini failed on a segment: {e}")

        # Write the translated SRT
        srt_out = os.path.join(tmpdir, "translated.srt")
        write_srt(segs, srt_out)
        print(f"Translated SRT saved to: {srt_out}")

        # Synthesize each segment
        chunk_files = []
        print(f"Synthesizing {len(segs)} segments with Kokoro (voice={args.voice})...")
        for i, s in enumerate(segs, start=1):
            text = s.get("text", "")
            target_dur = s["end"] - s["start"]
            if not text:
                # No text: emit silence matching the segment duration
                silent = os.path.join(tmpdir, f"chunk_{i:04d}.wav")
                write_silence(silent, target_dur)
                chunk_files.append(silent)
                print(f" - Segment {i}: silence {target_dur}s")
                continue
            try:
                raw = synthesize_segment_kokoro(args.kokoro_endpoint, args.api_key, args.model, args.voice, text)
            except Exception as e:
                print(f"Error synthesizing segment {i}: {e}")
                # Fallback: emit silence
                silent = os.path.join(tmpdir, f"chunk_{i:04d}.wav")
                write_silence(silent, target_dur)
                chunk_files.append(silent)
                continue
            # Save the raw bytes to a temp file
            tmp_chunk = os.path.join(tmpdir, f"raw_chunk_{i:04d}.bin")
            with open(tmp_chunk, "wb") as f:
                f.write(raw)
            # Convert to a standard WAV (22050 Hz mono s16)
            tmp_wav = os.path.join(tmpdir, f"tmp_chunk_{i:04d}.wav")
            cmdc = ["ffmpeg", "-y", "-i", tmp_chunk, "-ar", "22050", "-ac", "1", "-sample_fmt", "s16", tmp_wav]
            subprocess.run(cmdc, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            # Fit to the segment duration
            final_chunk = os.path.join(tmpdir, f"chunk_{i:04d}.wav")
            pad_or_trim(tmp_wav, final_chunk, target_dur, sr=22050)
            chunk_files.append(final_chunk)
            print(f" - Segment {i}/{len(segs)} -> {os.path.basename(final_chunk)}")

        # Concatenate the chunks
        dub_wav = args.temp_dub if args.temp_dub else os.path.join(tmpdir, "dub_final.wav")
        print("Concatenating chunks...")
        concat_chunks(chunk_files, dub_wav)
        print(f"Dub audio written to: {dub_wav}")

        # Replace the audio track in the video
        replaced = os.path.join(tmpdir, "video_replaced.mp4")
        print("Replacing the video's audio track...")
        replace_audio_in_video(str(video), dub_wav, replaced)

        # Burn in the translated SRT
        print("Burning the translated SRT into the video...")
        burn_subtitles(replaced, srt_out, out_video)
        print(f"Final video written to: {out_video}")
    finally:
        try:
            shutil.rmtree(tmpdir)
        except OSError:
            pass

if __name__ == "__main__":
    main()