import json
import os
import shutil
import subprocess
from typing import Optional

# Heavy helpers (SRT parsing / HTTP synthesis) are imported lazily inside
# `synthesize_from_srt` so that importing this package does not fail when
# optional dependencies (e.g. 'srt') are not installed.
from .ffmpeg_adapter import FFmpegAudioProcessor


class KokoroHttpClient:
    """HTTP client that synthesizes the segments of an .srt file via a compatible endpoint.

    Replaces the former subprocess invocation of `srt_to_kokoro.py`. It reuses
    `parse_srt_file` and `synth_chunk` from `whisper_project.srt_to_kokoro`
    for parsing and HTTP synthesis, and uses FFmpegAudioProcessor for the WAV
    operations (silence, padding/trimming, concatenation, muxing).
    """

    def __init__(
        self,
        endpoint: str,
        api_key: Optional[str] = None,
        voice: Optional[str] = None,
        model: Optional[str] = None,
    ):
        """Store the connection settings.

        Args:
            endpoint: URL handed to `synth_chunk` for every segment.
            api_key: Optional token; when set it is sent as a Bearer
                `Authorization` header.
            voice: Voice identifier; falls back to "em_alex" when falsy.
            model: Model identifier; falls back to "model" when falsy.
        """
        self.endpoint = endpoint
        self.api_key = api_key
        self.voice = voice or "em_alex"
        self.model = model or "model"
        self._processor = FFmpegAudioProcessor()

    def synthesize_from_srt(
        self,
        srt_path: str,
        out_wav: str,
        video: Optional[str] = None,
        align: bool = True,
        keep_chunks: bool = False,
        mix_with_original: bool = False,
        mix_background_volume: float = 0.2,
    ) -> None:
        """Synthesize every subtitle of the SRT and concatenate the result into `out_wav`.

        The key parameters match the previous CLI adapter for compatibility.

        Args:
            srt_path: Path of the .srt file to read.
            out_wav: Path of the WAV file to write. Temporary files live in a
                `.kokoro_tmp_<pid>` directory next to it.
            video: Optional source video. When given, the final audio is also
                muxed into `<video stem>.replaced_audio.mp4`; a failure of that
                step is printed, not raised.
            align: When true, silence is inserted for the gaps between
                subtitles and each chunk is padded/trimmed to its subtitle
                duration.
            keep_chunks: When true, intermediate files and the temporary
                directory are kept.
            mix_with_original: When true (and `video` is given), the
                synthesized audio is mixed with the video's original audio.
            mix_background_volume: Volume factor applied to the original audio
                in that mix.

        Raises:
            RuntimeError: If the lazy import of the SRT helpers fails, or if no
                audio chunk could be generated.
            subprocess.CalledProcessError: If the ffmpeg mix command fails.
        """
        headers = {"Accept": "*/*"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        # Import the utilities only when they are actually needed.
        try:
            from whisper_project.srt_to_kokoro import parse_srt_file, synth_chunk
        except ModuleNotFoundError as e:
            raise RuntimeError("Módulo requerido no encontrado para síntesis por SRT: instale 'srt' y 'requests' (pip install srt requests)") from e

        subs = parse_srt_file(srt_path)

        tmpdir = os.path.join(os.path.dirname(out_wav), f".kokoro_tmp_{os.getpid()}")
        os.makedirs(tmpdir, exist_ok=True)

        try:
            chunk_files = self._synthesize_segments(
                subs, tmpdir, headers, synth_chunk, align, keep_chunks
            )
            if not chunk_files:
                raise RuntimeError("No se generaron fragmentos de audio desde el SRT")

            self._processor.concat_wavs(chunk_files, out_wav)

            # Optional post-processing: mix with / replace the original track.
            if mix_with_original and video:
                self._mix_with_original(video, out_wav, tmpdir, mix_background_volume)

            if video:
                out_video = os.path.splitext(video)[0] + ".replaced_audio.mp4"
                try:
                    self._processor.replace_audio_in_video(video, out_wav, out_video)
                except Exception as e:
                    # Best effort: the WAV is already written, so only report.
                    print(f"Error al reemplazar audio en el vídeo: {e}")
        finally:
            # Clean up on failure paths too, so an aborted run does not leave
            # the temporary directory behind. It is kept only on request.
            if not keep_chunks:
                shutil.rmtree(tmpdir, ignore_errors=True)

    def _synthesize_segments(self, subs, tmpdir, headers, synth_chunk, align, keep_chunks):
        """Synthesize each subtitle into `tmpdir` and return the ordered list of WAV paths."""
        # Build the payload through json.dumps so quotes or backslashes in the
        # model/voice names cannot produce invalid JSON. `{text}` is left as a
        # placeholder for synth_chunk to fill in. Compact separators keep the
        # output identical to the old hand-written template for plain values.
        payload_template = json.dumps(
            {
                "model": self.model,
                "voice": self.voice,
                "input": "{text}",
                "response_format": "wav",
            },
            separators=(",", ":"),
            ensure_ascii=False,
        )

        total = len(subs)
        chunk_files = []
        # Timeline position (seconds) up to which audio has been emitted.
        # Skipped subtitles must NOT advance it; otherwise their time span is
        # never filled with silence and all later audio plays early.
        covered_until = 0.0

        for i, sub in enumerate(subs, start=1):
            text = "\n".join(line.strip() for line in sub.content.splitlines()).strip()
            if not text:
                # Nothing to say: the next gap silence will span this subtitle.
                continue

            start_sec = sub.start.total_seconds()
            end_sec = sub.end.total_seconds()
            duration = end_sec - start_sec

            if align:
                # Fill the gap since the last emitted audio with silence.
                gap = start_sec - covered_until
                if gap > 0.01:
                    sil_target = os.path.join(tmpdir, f"sil_{i:04d}.wav")
                    self._processor.create_silence(gap, sil_target)
                    chunk_files.append(sil_target)
                    covered_until = start_sec

            try:
                raw = synth_chunk(self.endpoint, text, headers, payload_template)
            except Exception as e:
                # Skip the segment, report it and carry on. The timeline is
                # left where it is, so the next gap covers the missing span.
                print(f"Error al sintetizar segmento {i}: {e}")
                continue

            target = os.path.join(tmpdir, f"chunk_{i:04d}.wav")
            # Convert/normalize the returned bytes into a WAV file.
            self._processor.save_bytes_as_wav(raw, target)

            if align:
                # Presumably pad_or_trim_wav yields exactly `duration` seconds;
                # verify against FFmpegAudioProcessor.
                aligned = os.path.join(tmpdir, f"chunk_{i:04d}.aligned.wav")
                self._processor.pad_or_trim_wav(target, aligned, duration)
                chunk_files.append(aligned)
                if not keep_chunks:
                    try:
                        os.remove(target)
                    except OSError:
                        # Best effort; the whole tmpdir is removed later.
                        pass
            else:
                chunk_files.append(target)

            covered_until = end_sec
            print(f"  - Segmento {i}/{total} -> {os.path.basename(chunk_files[-1])}")

        return chunk_files

    def _mix_with_original(self, video, out_wav, tmpdir, mix_background_volume):
        """Mix `out_wav` with the audio extracted from `video`, overwriting `out_wav`."""
        orig_tmp = os.path.join(tmpdir, f"orig_{os.getpid()}.wav")
        try:
            self._processor.extract_audio(video, orig_tmp, sr=22050)
            mixed_tmp = os.path.join(tmpdir, f"mixed_{os.getpid()}.wav")
            vol = float(mix_background_volume)
            # Synthesized track at full volume, original track attenuated;
            # the output length follows the first (synthesized) input.
            cmd = [
                "ffmpeg",
                "-y",
                "-i",
                out_wav,
                "-i",
                orig_tmp,
                "-filter_complex",
                f"[0:a]volume=1[a1];[1:a]volume={vol}[a0];[a1][a0]amix=inputs=2:duration=first:dropout_transition=0[mix]",
                "-map",
                "[mix]",
                "-c:a",
                "pcm_s16le",
                mixed_tmp,
            ]
            subprocess.run(cmd, check=True)
            shutil.move(mixed_tmp, out_wav)
        finally:
            try:
                if os.path.exists(orig_tmp):
                    os.remove(orig_tmp)
            except OSError:
                pass