# File: submaster/whisper_project/process_video.py
# Snapshot metadata: 2025-10-23 21:54:13 -07:00 — 180 lines, 4.4 KiB, Python
#!/usr/bin/env python3
"""Procesamiento de vídeo: extrae audio, transcribe/traduce y
quema subtítulos.
Flujo:
- Extrae audio con ffmpeg (WAV 16k mono)
- Transcribe con faster-whisper o openai-whisper
(opción task='translate')
- Escribe SRT y lo incrusta en el vídeo con ffmpeg
Nota: requiere ffmpeg instalado y, para modelos, faster-whisper
o openai-whisper.
"""
import argparse
import subprocess
import tempfile
from pathlib import Path
import sys
from transcribe import write_srt
def extract_audio(video_path: str, out_audio: str):
    """Extract the audio track of *video_path* to *out_audio* via ffmpeg.

    Output format is 16 kHz mono 16-bit PCM WAV — the sample rate and
    channel layout Whisper models expect.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero code.
    """
    command = [
        "ffmpeg",
        "-y",                    # overwrite the output file if it exists
        "-i", video_path,
        "-vn",                   # audio only: drop the video stream
        "-acodec", "pcm_s16le",  # uncompressed 16-bit PCM
        "-ar", "16000",          # 16 kHz sample rate
        "-ac", "1",              # mono
        out_audio,
    ]
    subprocess.run(command, check=True)
def burn_subtitles(video_path: str, srt_path: str, out_video: str):
# Usar filtro subtitles de ffmpeg
cmd = [
"ffmpeg",
"-y",
"-i",
video_path,
"-vf",
f"subtitles={srt_path}",
"-c:a",
"copy",
out_video,
]
subprocess.run(cmd, check=True)
def transcribe_and_translate_faster(audio_path: str, model: str, target: str):
    """Transcribe *audio_path* with faster-whisper using the translate task.

    Returns the (lazy) iterable of segment objects produced by
    ``WhisperModel.transcribe``.

    NOTE(review): Whisper's ``task="translate"`` always produces English
    output; the ``language`` argument of ``transcribe`` names the *source*
    language of the audio, not the translation target. The original code
    passed ``target`` there, mislabeling the audio language whenever the
    input was not already in the target language. We now let the model
    auto-detect the source language; ``target`` is kept only for interface
    compatibility with the openai-whisper backend.
    """
    from faster_whisper import WhisperModel

    wm = WhisperModel(model, device="cpu", compute_type="int8")
    # language omitted -> auto-detect the spoken (source) language
    segments, _info = wm.transcribe(audio_path, beam_size=5, task="translate")
    return segments
def transcribe_and_translate_openai(audio_path: str, model: str, target: str):
    """Transcribe *audio_path* with openai-whisper using the translate task.

    Returns the list of segment dicts from the result, or ``None`` if the
    result carries no ``"segments"`` key.

    NOTE(review): Whisper's ``task="translate"`` always produces English
    output; ``language`` names the *source* language of the audio. The
    original code passed ``target`` there, mislabeling the audio language.
    We now let the model auto-detect the source language; ``target`` is
    kept only for interface compatibility.
    """
    import whisper

    m = whisper.load_model(model, device="cpu")
    # fp16=False because inference runs on CPU; language auto-detected
    result = m.transcribe(audio_path, fp16=False, task="translate")
    return result.get("segments")
def main():
    """CLI entry point: extract audio, transcribe/translate, burn subtitles.

    Exit codes: 2 if the input video does not exist, 3 if transcription
    yields no segments.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Extraer, transcribir/traducir y quemar subtítulos en vídeo"
            " (offline)"
        )
    )
    parser.add_argument(
        "--video", "-v", required=True, help="Ruta del archivo de vídeo"
    )
    parser.add_argument(
        "--backend",
        "-b",
        choices=["faster-whisper", "openai-whisper"],
        default="faster-whisper",
    )
    parser.add_argument(
        "--model",
        "-m",
        default="base",
        help="Modelo de whisper a usar (tiny, base, etc.)",
    )
    parser.add_argument(
        "--to", "-t", default="es", help="Idioma de destino para traducción"
    )
    parser.add_argument(
        "--out",
        "-o",
        default=None,
        help=(
            "Ruta del vídeo de salida (si no se especifica,"
            " se usa input_burned.mp4)"
        ),
    )
    parser.add_argument(
        "--srt",
        default=None,
        help=(
            "Ruta SRT a escribir (si no se especifica,"
            " se usa input.srt)"
        ),
    )
    args = parser.parse_args()

    video = Path(args.video)
    if not video.exists():
        print("Vídeo no encontrado", file=sys.stderr)
        sys.exit(2)

    # Default output paths are derived from the input video's name.
    out_video = (
        args.out
        if args.out
        else str(video.with_name(video.stem + "_burned.mp4"))
    )
    srt_path = args.srt if args.srt else str(video.with_suffix(".srt"))

    # delete=False so ffmpeg can open the path by name after the handle
    # is closed; the file is removed explicitly in the finally block.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_path = tmp.name
    try:
        print("Extrayendo audio con ffmpeg...")
        extract_audio(str(video), audio_path)
        print(
            f"Transcribiendo y traduciendo a '{args.to}'"
            f" usando {args.backend}..."
        )
        if args.backend == "faster-whisper":
            segments = transcribe_and_translate_faster(
                audio_path, args.model, args.to
            )
        else:
            segments = transcribe_and_translate_openai(
                audio_path, args.model, args.to
            )
        # BUG FIX: faster-whisper returns a lazy generator, which is always
        # truthy — the original emptiness check could never fire for that
        # backend. Materialize the segments first so the check is real.
        segments = list(segments) if segments is not None else []
        if not segments:
            print(
                "No se obtuvieron segmentos de la transcripción",
                file=sys.stderr,
            )
            sys.exit(3)
        print(f"Escribiendo SRT en {srt_path}...")
        write_srt(segments, srt_path)
        print(
            f"Quemando subtítulos en el vídeo -> {out_video}"
            " (esto puede tardar)..."  # no placeholders: plain literal
        )
        burn_subtitles(str(video), srt_path, out_video)
        print("Proceso completado.")
    finally:
        # Best-effort removal of the temporary WAV file.
        try:
            Path(audio_path).unlink()
        except OSError:
            pass
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()