#!/usr/bin/env python3
"""translate_srt_local.py

Translate an .srt file locally using MarianMT (Helsinki-NLP/opus-mt-en-es).

Usage:
    source .venv/bin/activate
    python3 whisper_project/translate_srt_local.py --in path/to/in.srt --out path/to/out.srt

Requirements: transformers, sentencepiece, srt
"""

import argparse

import srt
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


def translate_srt(
    in_path: str,
    out_path: str,
    model_name: str = "Helsinki-NLP/opus-mt-en-es",
    batch_size: int = 8,
) -> None:
    """Translate the text of every subtitle in an SRT file and write the result.

    The input file is read as UTF-8 and parsed with ``srt.parse``. Each
    subtitle's stripped content is translated in batches with the given
    seq2seq model, the subtitle contents are overwritten with the stripped
    translations, and the composed SRT is written to ``out_path`` as UTF-8.
    A confirmation message is printed once the file has been written.

    Args:
        in_path: Path of the source .srt file.
        out_path: Path where the translated .srt file is written.
        model_name: Model identifier passed to ``from_pretrained`` for both
            the tokenizer and the model.
        batch_size: Number of subtitle texts sent to the model per call.
            Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A zero step makes range() raise an unhelpful error, and a negative step
    # yields no batches at all, which would silently write the untranslated
    # subtitles back out. Reject both before any expensive work happens.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    with open(in_path, "r", encoding="utf-8") as f:
        subs = list(srt.parse(f.read()))

    # Load the tokenizer and the model.
    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    texts = [sub.content.strip() for sub in subs]
    translated = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Tokenize the batch; padding lets texts of different lengths share
        # one tensor, and truncation is delegated to the tokenizer.
        enc = tok(batch, return_tensors="pt", padding=True, truncation=True)
        outs = model.generate(**enc, max_length=512)
        translated.extend(tok.batch_decode(outs, skip_special_tokens=True))

    # Assign the translations back onto the subtitles, keeping their order.
    for sub, text in zip(subs, translated):
        sub.content = text.strip()

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(srt.compose(subs))

    print(f"SRT traducido guardado en: {out_path}")


def main() -> None:
    """Parse the command-line arguments and run the translation."""
    p = argparse.ArgumentParser()
    p.add_argument("--in", dest="in_srt", required=True)
    p.add_argument("--out", dest="out_srt", required=True)
    p.add_argument("--model", default="Helsinki-NLP/opus-mt-en-es")
    p.add_argument("--batch-size", dest="batch_size", type=int, default=8)
    args = p.parse_args()

    # Report an invalid batch size as a usage error instead of a traceback.
    if args.batch_size < 1:
        p.error("--batch-size must be a positive integer")

    translate_srt(
        args.in_srt,
        args.out_srt,
        model_name=args.model,
        batch_size=args.batch_size,
    )


if __name__ == '__main__':
    main()