import os
import subprocess
import shutil
from typing import Optional

# Heavy helpers (parsing/synthesis) are imported lazily inside
# `synthesize_from_srt` to avoid package import failures when optional
# dependencies (e.g. 'srt') are not installed.

from .ffmpeg_adapter import FFmpegAudioProcessor


class KokoroHttpClient:
    """HTTP client that synthesizes segments from an .srt file against a compatible endpoint.

    Replaces the previous subprocess invocation of `srt_to_kokoro.py`. It reuses the
    parsing and HTTP synthesis helpers from `srt_to_kokoro.py` (synth_chunk) and uses
    FFmpegAudioProcessor for WAV operations where needed.
    """

    def __init__(self, endpoint: str, api_key: Optional[str] = None, voice: Optional[str] = None, model: Optional[str] = None):
        self.endpoint = endpoint
        self.api_key = api_key
        self.voice = voice or "em_alex"
        self.model = model or "model"
        self._processor = FFmpegAudioProcessor()

    def synthesize_from_srt(self, srt_path: str, out_wav: str, video: Optional[str] = None, align: bool = True, keep_chunks: bool = False, mix_with_original: bool = False, mix_background_volume: float = 0.2):
        """Synthesize every subtitle in the SRT and concatenate the result into out_wav.

        The key parameters match those of the previous CLI adapter for compatibility.
        """
        headers = {"Accept": "*/*"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        # Import the utilities only when they are actually used.
        try:
            from whisper_project.srt_to_kokoro import parse_srt_file, synth_chunk
        except ModuleNotFoundError as e:
            raise RuntimeError("Required module for SRT synthesis not found: install 'srt' and 'requests' (pip install srt requests)") from e

        subs = parse_srt_file(srt_path)
        tmpdir = os.path.join(os.path.dirname(out_wav), f".kokoro_tmp_{os.getpid()}")
        os.makedirs(tmpdir, exist_ok=True)
        chunk_files = []
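        # For each subtitle: synthesize its text to a WAV chunk and, when align=True,
        # insert silence for the gap since the previous cue and pad/trim the chunk to
        # the cue's duration so the concatenated track follows the SRT timeline.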
        prev_end = 0.0
        for i, sub in enumerate(subs, start=1):
            text = "\n".join(line.strip() for line in sub.content.splitlines()).strip()
            if not text:
                prev_end = sub.end.total_seconds()
                continue

            start_sec = sub.start.total_seconds()
            end_sec = sub.end.total_seconds()
            duration = end_sec - start_sec

            # align: insert silence to cover the gap before this cue
            if align:
                gap = start_sec - prev_end
                if gap > 0.01:
                    sil_target = os.path.join(tmpdir, f"sil_{i:04d}.wav")
                    self._processor.create_silence(gap, sil_target)
                    chunk_files.append(sil_target)

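            # Note: the JSON body below appears to follow an OpenAI-style speech request
            # (model/voice/input/response_format); synth_chunk is expected to substitute
            # the segment text for the {text} placeholder.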
            # Build a simple payload_template whose {text} placeholder gets replaced.
            payload_template = '{"model":"%s","voice":"%s","input":"{text}","response_format":"wav"}' % (self.model, self.voice)

            try:
                raw = synth_chunk(self.endpoint, text, headers, payload_template)
            except Exception as e:
                # Log the error, skip this segment and continue.
                print(f"Error synthesizing segment {i}: {e}")
                prev_end = end_sec
                continue
            target = os.path.join(tmpdir, f"chunk_{i:04d}.wav")
            # Convert/normalize the raw response bytes to WAV.
            self._processor.save_bytes_as_wav(raw, target)

            if align:
                aligned = os.path.join(tmpdir, f"chunk_{i:04d}.aligned.wav")
                self._processor.pad_or_trim_wav(target, aligned, duration)
                chunk_files.append(aligned)
                if not keep_chunks:
                    try:
                        os.remove(target)
                    except Exception:
                        pass
            else:
                chunk_files.append(target)

            prev_end = end_sec
            print(f" - Segment {i}/{len(subs)} -> {os.path.basename(chunk_files[-1])}")

        if not chunk_files:
            raise RuntimeError("No audio chunks were generated from the SRT")

        # Concatenate all chunks into the final track.
        self._processor.concat_wavs(chunk_files, out_wav)
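        # At this point out_wav holds the full synthesized track (aligned to the SRT
        # timeline when align=True).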
        # Optional post-processing: mix with the original audio and/or mux into the original video.
        if mix_with_original and video:
            # No need to delegate to the original srt_to_kokoro here; replicate the
            # previous strategy directly: extract the original audio and mix with ffmpeg.
            orig_tmp = os.path.join(tmpdir, f"orig_{os.getpid()}.wav")
            try:
                self._processor.extract_audio(video, orig_tmp, sr=22050)
                # Mix using an ffmpeg filter_complex graph.
                mixed_tmp = os.path.join(tmpdir, f"mixed_{os.getpid()}.wav")
                vol = float(mix_background_volume)
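                # The filter graph keeps the synthesized track at full volume, lowers the
                # original audio to mix_background_volume, and amix trims the output to the
                # length of the synthesized track (duration=first).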
                cmd = [
                    "ffmpeg",
                    "-y",
                    "-i", out_wav,
                    "-i", orig_tmp,
                    "-filter_complex",
                    f"[0:a]volume=1[a1];[1:a]volume={vol}[a0];[a1][a0]amix=inputs=2:duration=first:dropout_transition=0[mix]",
                    "-map", "[mix]",
                    "-c:a", "pcm_s16le",
                    mixed_tmp,
                ]
                subprocess.run(cmd, check=True)
                shutil.move(mixed_tmp, out_wav)
            finally:
                try:
                    if os.path.exists(orig_tmp):
                        os.remove(orig_tmp)
                except Exception:
                    pass

        if video:
            # A video was provided: mux the (possibly mixed) track into a copy of it.
            out_video = os.path.splitext(video)[0] + ".replaced_audio.mp4"
            try:
                self._processor.replace_audio_in_video(video, out_wav, out_video)
            except Exception as e:
                print(f"Error replacing audio in the video: {e}")

        # Cleanup: keep tmpdir only when keep_chunks was requested.
        # (shutil is already imported at module level, so no local re-import is needed.)
        if not keep_chunks:
            shutil.rmtree(tmpdir, ignore_errors=True)
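

# Usage sketch (illustrative only): the endpoint URL, file names and the import path
# below are assumptions, not values defined by this module.
#
#     from whisper_project.adapters.kokoro_http_client import KokoroHttpClient
#
#     client = KokoroHttpClient(
#         endpoint="http://localhost:8880/v1/audio/speech",
#         voice="em_alex",
#     )
#     client.synthesize_from_srt(
#         "subtitles.srt",
#         "dub.wav",
#         video="movie.mp4",
#         mix_with_original=True,
#         mix_background_volume=0.2,
#     )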