58 lines
1.8 KiB
Python
58 lines
1.8 KiB
Python
#!/usr/bin/env python3
|
|
"""translate_srt_local.py
|
|
Translate an .srt file locally using MarianMT (Helsinki-NLP/opus-mt-en-es).

Usage:
    source .venv/bin/activate
    python3 whisper_project/translate_srt_local.py --in path/to/in.srt --out path/to/out.srt

Requirements: transformers, sentencepiece, srt
|
|
"""
|
|
import argparse
|
|
import srt
|
|
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
|
|
|
|
|
def translate_srt(in_path: str, out_path: str, model_name: str = "Helsinki-NLP/opus-mt-en-es", batch_size: int = 8) -> None:
    """Translate the text of every subtitle in an SRT file and write a new SRT file.

    The input is parsed with ``srt.parse``, each subtitle's stripped content is
    translated in batches with the seq2seq model named by ``model_name``, and
    the result is serialized with ``srt.compose``. Only each subtitle's
    ``content`` is replaced by this function.

    Args:
        in_path: Path of the UTF-8 encoded source ``.srt`` file.
        out_path: Path the translated ``.srt`` file is written to (UTF-8).
        model_name: Model identifier passed to ``from_pretrained`` for both
            the tokenizer and the model.
        batch_size: Number of subtitles sent to the model per call; must be
            at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Validate before any I/O or model loading. A step of 0 makes range() raise
    # only after the model has been loaded, and a negative step produces no
    # batches at all, which would silently write the untranslated input.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    with open(in_path, "r", encoding="utf-8") as f:
        subs = list(srt.parse(f.read()))

    # Load the tokenizer and the model.
    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    texts = [sub.content.strip() for sub in subs]

    # Only subtitles that actually contain text are sent to the model; a blank
    # subtitle has nothing to translate and is kept blank.
    pending = [idx for idx, text in enumerate(texts) if text]

    for start in range(0, len(pending), batch_size):
        batch_idx = pending[start:start + batch_size]
        batch = [texts[idx] for idx in batch_idx]
        # Tokenize the whole batch at once, padded to a common length.
        enc = tok(batch, return_tensors="pt", padding=True, truncation=True)
        outs = model.generate(**enc, max_length=512)
        decoded = tok.batch_decode(outs, skip_special_tokens=True)
        # Write each translation back to the position of its source subtitle.
        for idx, text in zip(batch_idx, decoded):
            texts[idx] = text.strip()

    # Assign the translated texts.
    for sub, text in zip(subs, texts):
        sub.content = text

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(srt.compose(subs))

    print(f"SRT traducido guardado en: {out_path}")
|
|
|
|
|
|
def main():
    """Parse the command-line options and run the SRT translation.

    Options: ``--in`` and ``--out`` (both required), ``--model`` and
    ``--batch-size`` (integer); the parsed values are forwarded to
    ``translate_srt``.
    """
    # Each entry is (flag, keyword arguments for add_argument).
    option_specs = (
        ("--in", {"dest": "in_srt", "required": True}),
        ("--out", {"dest": "out_srt", "required": True}),
        ("--model", {"default": "Helsinki-NLP/opus-mt-en-es"}),
        ("--batch-size", {"dest": "batch_size", "type": int, "default": 8}),
    )

    parser = argparse.ArgumentParser()
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    cli = parser.parse_args()

    translate_srt(
        cli.in_srt,
        cli.out_srt,
        model_name=cli.model,
        batch_size=cli.batch_size,
    )
|
|
|
|
|
|
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
|