"""Orquestador que compone los adaptadores infra para ejecutar el pipeline.
|
|
|
|
Proporciona una clase `Orchestrator` con método `run` y soporta modo dry-run
|
|
para inspección sin ejecutar los pasos pesados.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
from whisper_project.infra import process_video, transcribe
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class Orchestrator:
|
|
"""Orquesta: extracción audio -> transcripción -> TTS por segmento -> reemplazo audio -> quemar subtítulos.
|
|
|
|
Nota: los pasos concretos se delegan a los adaptadores en `whisper_project.infra`.
|
|
"""
|
|
|
|
def __init__(self, dry_run: bool = False, tts_model: str = "kokoro", verbose: bool = False):
|
|
self.dry_run = dry_run
|
|
self.tts_model = tts_model
|
|
# No configurar basicConfig aquí: usar configure_logging desde el CLI/main
|
|
if verbose:
|
|
logger.setLevel(logging.DEBUG)
|
|
|
|
def run(self, src_video: str, out_dir: str, translate: bool = False) -> dict:
|
|
"""Ejecuta el pipeline.
|
|
|
|
Args:
|
|
src_video: ruta al vídeo de entrada.
|
|
out_dir: carpeta donde escribir resultados intermedios/finales.
|
|
translate: si True, intentará traducir SRT (delegado a futuras implementaciones).
|
|
|
|
Returns:
|
|
diccionario con resultados y rutas generadas.
|
|
"""
|
|
src = Path(src_video)
|
|
out = Path(out_dir)
|
|
out.mkdir(parents=True, exist_ok=True)
|
|
|
|
result = {
|
|
"input_video": str(src.resolve()),
|
|
"out_dir": str(out.resolve()),
|
|
"steps": [],
|
|
}
|
|
|
|
# 1) Extraer audio
|
|
audio_wav = out / f"{src.stem}.wav"
|
|
step = {"name": "extract_audio", "out": str(audio_wav)}
|
|
result["steps"].append(step)
|
|
if self.dry_run:
|
|
logger.info("[dry-run] extraer audio: %s -> %s", src, audio_wav)
|
|
else:
|
|
logger.info("extraer audio: %s -> %s", src, audio_wav)
|
|
process_video.extract_audio(str(src), str(audio_wav))
|
|
|
|
# 2) Transcribir (segmentado si es necesario)
|
|
srt_path = out / f"{src.stem}.srt"
|
|
step = {"name": "transcribe", "out": str(srt_path)}
|
|
result["steps"].append(step)
|
|
if self.dry_run:
|
|
logger.info("[dry-run] transcribir audio -> %s", srt_path)
|
|
segments = []
|
|
else:
|
|
logger.info("transcribir audio -> %s", srt_path)
|
|
# usamos la función delegante que el proyecto expone
|
|
segments = transcribe.transcribe_segmented_with_tempfiles(str(audio_wav), [])
|
|
transcribe.write_srt(segments, str(srt_path))
|
|
|
|
# 3) (Opcional) traducir SRT — placeholder
|
|
if translate:
|
|
step = {"name": "translate", "out": str(srt_path)}
|
|
result["steps"].append(step)
|
|
if self.dry_run:
|
|
logger.info("[dry-run] traducir SRT: %s", srt_path)
|
|
else:
|
|
logger.info("traducir SRT: %s (funcionalidad no implementada en orquestador)", srt_path)
|
|
|
|
# 4) Generar TTS segmentado en un WAV final (dub)
|
|
dubbed_wav = out / f"{src.stem}.dub.wav"
|
|
step = {"name": "tts_and_stitch", "out": str(dubbed_wav)}
|
|
result["steps"].append(step)
|
|
if self.dry_run:
|
|
logger.info("[dry-run] synthesize TTS por segmento -> %s (modelo=%s)", dubbed_wav, self.tts_model)
|
|
else:
|
|
logger.info("synthesize TTS por segmento -> %s (modelo=%s)", dubbed_wav, self.tts_model)
|
|
# por ahora usamos la función helper de transcribe para síntesis (si existe)
|
|
try:
|
|
# `segments` viene de la transcripción previa
|
|
transcribe.tts_synthesize(" ".join([s.get("text", "") for s in segments]), str(dubbed_wav), model=self.tts_model)
|
|
except Exception:
|
|
# Fallback simple: crear un silencio (no romper)
|
|
logger.exception("TTS falló, creando archivo vacío como fallback")
|
|
try:
|
|
process_video.pad_or_trim_wav(0.0, str(dubbed_wav))
|
|
except Exception:
|
|
logger.exception("No se pudo crear WAV de fallback")
|
|
|
|
# 5) Reemplazar audio en el vídeo
|
|
dubbed_video = out / f"{src.stem}.dub.mp4"
|
|
step = {"name": "replace_audio_in_video", "out": str(dubbed_video)}
|
|
result["steps"].append(step)
|
|
if self.dry_run:
|
|
logger.info("[dry-run] reemplazar audio en video: %s -> %s", src, dubbed_video)
|
|
else:
|
|
logger.info("reemplazar audio en video: %s -> %s", src, dubbed_video)
|
|
process_video.replace_audio_in_video(str(src), str(dubbed_wav), str(dubbed_video))
|
|
|
|
# 6) Quemar subtítulos en vídeo final
|
|
burned = out / f"{src.stem}.burned.mp4"
|
|
step = {"name": "burn_subtitles", "out": str(burned)}
|
|
result["steps"].append(step)
|
|
if self.dry_run:
|
|
logger.info("[dry-run] quemar subtítulos: %s + %s -> %s", dubbed_video, srt_path, burned)
|
|
else:
|
|
logger.info("quemar subtítulos: %s + %s -> %s", dubbed_video, srt_path, burned)
|
|
process_video.burn_subtitles(str(dubbed_video), str(srt_path), str(burned))
|
|
|
|
return result
|
|
|
|
|
|
__all__ = ["Orchestrator"]
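
# Usage sketch (hedged): the file names below are illustrative, not part of
# the project; only the `Orchestrator` API defined above is real. A dry run
# records the planned steps without touching ffmpeg or the TTS model:
#
#     orch = Orchestrator(dry_run=True, tts_model="kokoro", verbose=True)
#     plan = orch.run("talk.mp4", "out", translate=False)
#     for step in plan["steps"]:
#         print(step["name"], "->", step["out"])
#
# With dry_run=False the same call performs extraction, transcription, TTS,
# audio replacement and subtitle burn-in, writing everything under `out/`.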


import os
import subprocess
import sys
from typing import Optional

from ..core.models import PipelineResult


class PipelineOrchestrator:
    """Use case class that coordinates the high-level steps of the pipeline.

    This class keeps the orchestration logic in small, testable methods and
    depends on infra adapters for the I/O operations.
    """

    def __init__(
        self,
        kokoro_endpoint: str,
        kokoro_key: Optional[str] = None,
        voice: Optional[str] = None,
        kokoro_model: Optional[str] = None,
        transcriber=None,
        translator=None,
        tts_client=None,
        audio_processor=None,
    ):
        # If no adapters are injected, create default implementations.
        # Heavy adapters are only imported when no implementation is injected.
        if transcriber is None:
            # Define an in-process fallback transcriber that tries several
            # strategies before failing (faster-whisper -> openai -> segmented).
            try:
                from ..infra.transcribe_adapter import TranscribeService

                class InProcessTranscriber:
                    def __init__(self, model: str = "base") -> None:
                        self._svc = TranscribeService(model=model)

                    def transcribe(self, file: str, *args, **kwargs):
                        # Compatibility with positional calls: a second
                        # positional argument is treated as srt_out.
                        srt_out = None
                        if args:
                            srt_out = args[0]
                        srt_flag = kwargs.get("srt", bool(srt_out))
                        srt_file = kwargs.get("srt_file", srt_out)

                        try:
                            return self._svc.transcribe_faster(file)
                        except Exception:
                            try:
                                return self._svc.transcribe_openai(file)
                            except Exception:
                                try:
                                    # Last resort: uniform segmentation,
                                    # transcribed chunk by chunk via tempfiles.
                                    duration = self._svc.get_audio_duration(file) or 0
                                    segs = self._svc.make_uniform_segments(duration, seg_seconds=max(30, int(duration) or 30))
                                    results = self._svc.transcribe_segmented_with_tempfiles(file, segs, backend="openai-whisper", model=self._svc.model)
                                    if srt_flag and srt_file:
                                        self._svc.write_srt(results, srt_file)
                                    return results
                                except Exception:
                                    return []

                # Try the fast adapter if it exists, but wrap its call so that
                # runtime errors are caught and we fall back to the
                # InProcessTranscriber.
                try:
                    from ..infra.faster_whisper_adapter import FasterWhisperTranscriber

                    fw = FasterWhisperTranscriber()

                    class TranscriberProxy:
                        def __init__(self, fast_impl, fallback):
                            self._fast = fast_impl
                            self._fallback = fallback

                        def transcribe(self, *args, **kwargs):
                            try:
                                return self._fast.transcribe(*args, **kwargs)
                            except Exception:
                                return self._fallback.transcribe(*args, **kwargs)

                    self.transcriber = TranscriberProxy(fw, InProcessTranscriber())
                except Exception:
                    # If the fast adapter does not exist, use the in-process one directly
                    self.transcriber = InProcessTranscriber()
            except Exception:
                # Last resort: no in-memory transcriber available
                self.transcriber = None
        else:
            self.transcriber = transcriber

        if translator is None:
            try:
                from ..infra.marian_adapter import MarianTranslator

                self.translator = MarianTranslator()
            except Exception:
                self.translator = None
        else:
            self.translator = translator

        if tts_client is None:
            try:
                from ..infra.kokoro_adapter import KokoroHttpClient

                self.tts_client = KokoroHttpClient(kokoro_endpoint, api_key=kokoro_key, voice=voice, model=kokoro_model)
            except Exception:
                self.tts_client = None
        else:
            self.tts_client = tts_client

        if audio_processor is None:
            try:
                from ..infra.ffmpeg_adapter import FFmpegAudioProcessor

                self.audio_processor = FFmpegAudioProcessor()
            except Exception:
                self.audio_processor = None
        else:
            self.audio_processor = audio_processor
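
    # Construction sketch (hedged): `StubTranscriber` below is a hypothetical
    # test double, not part of the project. Because every adapter can be
    # injected, the orchestrator can be exercised without any heavy imports:
    #
    #     class StubTranscriber:
    #         def transcribe(self, file, *args, **kwargs):
    #             return [{"start": 0.0, "end": 1.0, "text": "hello"}]
    #
    #     orch = PipelineOrchestrator(
    #         kokoro_endpoint="http://localhost:8000",  # placeholder endpoint
    #         transcriber=StubTranscriber(),
    #         translator=None,  # falls back to MarianTranslator if importable
    #     )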

    def run(
        self,
        video: str,
        srt: Optional[str],
        workdir: str,
        translate_method: str = "local",
        gemini_api_key: Optional[str] = None,
        whisper_model: str = "base",
        mix: bool = False,
        mix_background_volume: float = 0.2,
        keep_chunks: bool = False,
        dry_run: bool = False,
        srt_only: bool = False,
    ) -> PipelineResult:
        """Run the pipeline.

        When dry_run=True the orchestrator only logs planned actions
        instead of executing subprocesses or ffmpeg commands.
        """
        # 0) prepare paths
        if dry_run:
            logger.info("[dry-run] workdir: %s", workdir)

        # 1) extract audio
        audio_tmp = os.path.join(workdir, "extracted_audio.wav")
        if dry_run:
            logger.info("[dry-run] ffmpeg extract audio -> %s", audio_tmp)
        else:
            self.audio_processor.extract_audio(video, audio_tmp, sr=16000)

        # 2) transcribe if necessary
        if srt:
            srt_in = srt
        else:
            srt_in = os.path.join(workdir, "transcribed.srt")
            cmd_trans = [
                sys.executable,
                "whisper_project/transcribe.py",
                "--file",
                audio_tmp,
                "--backend",
                "faster-whisper",
                "--model",
                whisper_model,
                "--srt",
                "--srt-file",
                srt_in,
            ]
            if dry_run:
                logger.info("[dry-run] %s", " ".join(cmd_trans))
            else:
                # Use the injected transcriber when possible
                try:
                    self.transcriber.transcribe(audio_tmp, srt_in)
                except Exception:
                    # Fall back to a subprocess if the adapter is not available
                    subprocess.run(cmd_trans, check=True)

        # 3) translate
        srt_translated = os.path.join(workdir, "translated.srt")
        if translate_method == "local":
            cmd_translate = [
                sys.executable,
                "whisper_project/translate_srt_local.py",
                "--in",
                srt_in,
                "--out",
                srt_translated,
            ]
            if dry_run:
                logger.info("[dry-run] %s", " ".join(cmd_translate))
            else:
                try:
                    self.translator.translate_srt(srt_in, srt_translated)
                except Exception:
                    subprocess.run(cmd_translate, check=True)
        elif translate_method == "gemini":
            # Prefer an injected adapter that supports Gemini; otherwise use the local wrapper
            cmd_translate = [
                sys.executable,
                "whisper_project/translate_srt_with_gemini.py",
                "--in",
                srt_in,
                "--out",
                srt_translated,
            ]
            if gemini_api_key:
                cmd_translate += ["--gemini-api-key", gemini_api_key]

            if dry_run:
                logger.info("[dry-run] %s", " ".join(cmd_translate))
            else:
                try:
                    # Try the Gemini adapter if it is available
                    if self.translator and type(self.translator).__name__ == "GeminiTranslator":
                        self.translator.translate_srt(srt_in, srt_translated)
                    else:
                        # Try to import the local adapter
                        from ..infra.gemini_adapter import GeminiTranslator

                        gem = GeminiTranslator(api_key=gemini_api_key)
                        gem.translate_srt(srt_in, srt_translated)
                except Exception:
                    subprocess.run(cmd_translate, check=True)
        elif translate_method == "argos":
            cmd_translate = [
                sys.executable,
                "whisper_project/translate_srt_argos.py",
                "--in",
                srt_in,
                "--out",
                srt_translated,
            ]
            if dry_run:
                logger.info("[dry-run] %s", " ".join(cmd_translate))
            else:
                try:
                    if self.translator and type(self.translator).__name__ == "ArgosTranslator":
                        self.translator.translate_srt(srt_in, srt_translated)
                    else:
                        from ..infra.argos_adapter import ArgosTranslator

                        a = ArgosTranslator()
                        a.translate_srt(srt_in, srt_translated)
                except Exception:
                    subprocess.run(cmd_translate, check=True)
        elif translate_method == "none":
            srt_translated = srt_in
        else:
            raise ValueError("translate_method not supported in this orchestrator")

        # If the user requested only the translated SRT, return immediately
        # after translation and skip TTS synthesis / audio replacement / burn-in.
        if srt_only:
            if dry_run:
                logger.info("[dry-run] srt-only mode: returning %s", srt_translated)
            return PipelineResult(
                workdir=workdir,
                srt_translated=srt_translated,
                srt_original=srt_in,
            )

        # 4) synthesize per segment
        dub_wav = os.path.join(workdir, "dub_final.wav")
        if dry_run:
            logger.info("[dry-run] synthesize from srt %s -> %s", srt_translated, dub_wav)
        else:
            # Use the injected tts_client
            self.tts_client.synthesize_from_srt(
                srt_translated,
                dub_wav,
                video=video,
                align=True,
                keep_chunks=keep_chunks,
                mix_with_original=mix,
                mix_background_volume=mix_background_volume,
            )

        # 5) replace audio in the video
        replaced = os.path.splitext(video)[0] + ".replaced_audio.mp4"
        if dry_run:
            logger.info("[dry-run] replace audio in video -> %s", replaced)
        else:
            self.audio_processor.replace_audio_in_video(video, dub_wav, replaced)

        # 6) burn subtitles
        burned = os.path.splitext(video)[0] + ".replaced_audio.subs.mp4"
        if dry_run:
            logger.info("[dry-run] burn subtitles %s -> %s", srt_translated, burned)
        else:
            self.audio_processor.burn_subtitles(replaced, srt_translated, burned)

        return PipelineResult(
            workdir=workdir,
            dub_wav=dub_wav,
            replaced_video=replaced,
            burned_video=burned,
            srt_translated=srt_translated,
            srt_original=srt_in,
        )
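

# Usage sketch (hedged): endpoint and paths below are placeholders, and this
# assumes PipelineResult exposes its fields as attributes. `srt_only=True`
# stops the pipeline right after translation, before any TTS work:
#
#     orch = PipelineOrchestrator(kokoro_endpoint="http://localhost:8880/v1")
#     res = orch.run(
#         video="talk.mp4",
#         srt=None,  # transcribe from the video's audio
#         workdir="work",
#         translate_method="local",
#         srt_only=True,
#     )
#     print(res.srt_translated)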