submaster/whisper_project/translate_srt_local.py

#!/usr/bin/env python3
"""translate_srt_local.py

Translate an .srt file locally using MarianMT (Helsinki-NLP/opus-mt-en-es).

Usage:
    source .venv/bin/activate
    python3 whisper_project/translate_srt_local.py --in path/to/in.srt --out path/to/out.srt

Requirements: transformers, sentencepiece, srt
"""
import argparse

import srt
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


def translate_srt(in_path: str, out_path: str, model_name: str = "Helsinki-NLP/opus-mt-en-es", batch_size: int = 8):
    # Read and parse the subtitle file
    with open(in_path, "r", encoding="utf-8") as f:
        subs = list(srt.parse(f.read()))

    # Load model and tokenizer
    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Translate cue texts in batches to keep memory use bounded
    texts = [sub.content.strip() for sub in subs]
    translated = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        # Tokenize the batch and generate translations
        enc = tok(batch, return_tensors="pt", padding=True, truncation=True)
        outs = model.generate(**enc, max_length=512)
        outs_decoded = tok.batch_decode(outs, skip_special_tokens=True)
        translated.extend(outs_decoded)

    # Write the translated text back into each cue, preserving timestamps
    for sub, t in zip(subs, translated):
        sub.content = t.strip()
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(srt.compose(subs))
    print(f"Translated SRT saved to: {out_path}")


def main():
    p = argparse.ArgumentParser()
    p.add_argument("--in", dest="in_srt", required=True)
    p.add_argument("--out", dest="out_srt", required=True)
    p.add_argument("--model", default="Helsinki-NLP/opus-mt-en-es")
    p.add_argument("--batch-size", dest="batch_size", type=int, default=8)
    args = p.parse_args()
    translate_srt(args.in_srt, args.out_srt, model_name=args.model, batch_size=args.batch_size)


if __name__ == "__main__":
    main()