121 lines
4.1 KiB
Python
Executable File
121 lines
4.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
Script to fetch the transcript of a YouTube video using the project's functions.
Handles HTTP 429 and saves the result as JSON.

Usage:
    python3 fetch_transcript.py VIDEO_ID [LANG] [BROWSER]

Examples:
    python3 fetch_transcript.py K08TM4OVLyo es
    python3 fetch_transcript.py K08TM4OVLyo es chrome
    python3 fetch_transcript.py K08TM4OVLyo es "chrome:Profile 1"
"""
|
|
import sys
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import tempfile
|
|
import glob
|
|
from main import parse_subtitle_format, get_transcript_data
|
|
|
|
def fetch_with_browser_cookies(video_id, lang="es", browser="chrome"):
    """Fetch a video's subtitles with yt-dlp, using cookies read from a browser.

    Runs ``yt-dlp`` inside a temporary directory with
    ``--cookies-from-browser``, ``--write-sub`` and ``--write-auto-sub``,
    requesting VTT output, then parses the first ``.vtt`` file produced.

    Args:
        video_id: YouTube video ID; used to build the watch URL and to locate
            the files yt-dlp writes.
        lang: Value passed to yt-dlp's ``--sub-lang`` option.
        browser: Value passed to ``--cookies-from-browser`` (for example
            ``"chrome"`` or ``"chrome:Profile 1"``).

    Returns:
        A ``(segments, error)`` tuple. On success ``segments`` is the result
        of ``parse_subtitle_format(..., 'vtt')`` and ``error`` is ``None``.
        On failure ``segments`` is ``None`` and ``error`` is a message string;
        errors raised while running yt-dlp or parsing its output are returned
        this way instead of being propagated.
    """
    print(f"🔑 Usando cookies desde navegador: {browser}")

    with tempfile.TemporaryDirectory() as tmpdir:
        cmd = [
            "yt-dlp",
            "--cookies-from-browser", browser,
            "--skip-download",
            "--write-auto-sub",
            "--write-sub",
            "--sub-lang", lang,
            "--sub-format", "vtt",
            "-o", os.path.join(tmpdir, "%(id)s.%(ext)s"),
            f"https://www.youtube.com/watch?v={video_id}",
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=180)

            # Escape both path parts so glob metacharacters in the temp dir or
            # the video ID are matched literally, and sort the matches so the
            # chosen file is deterministic when yt-dlp writes more than one.
            pattern = os.path.join(
                glob.escape(tmpdir), f"{glob.escape(video_id)}*.vtt"
            )
            files = sorted(glob.glob(pattern))

            if not files:
                stderr = result.stderr or ''
                return None, f"No se generaron archivos. Error: {stderr[:500]}"

            with open(files[0], 'r', encoding='utf-8') as f:
                vtt_content = f.read()
            segments = parse_subtitle_format(vtt_content, 'vtt')
            return segments, None

        except subprocess.TimeoutExpired:
            return None, "Timeout al ejecutar yt-dlp"
        except FileNotFoundError:
            # subprocess.run raises this when the yt-dlp executable is missing.
            return None, "yt-dlp no está instalado. Ejecuta: pip install yt-dlp"
        except Exception as e:
            # Deliberate best-effort boundary: callers expect an error string.
            return None, f"Error: {str(e)[:200]}"
|
|
|
|
def main():
    """Command-line entry point: fetch a transcript and save it to disk.

    Reads ``VIDEO_ID [LANG] [BROWSER]`` from ``sys.argv``. When a browser is
    given the transcript is fetched with ``fetch_with_browser_cookies``;
    otherwise ``get_transcript_data`` is used. On success the segments are
    dumped to ``<VIDEO_ID>_transcript.json``, their text is written to
    ``<VIDEO_ID>_transcript.txt`` (both paths relative to the working
    directory), and the first ten segments are printed.

    Exits with status 1 when no arguments are given, when the fetch reports
    an error, or when no segments come back.
    """
    cli_args = sys.argv[1:]

    if not cli_args:
        usage_lines = (
            "Uso: python3 fetch_transcript.py VIDEO_ID [LANG] [BROWSER]",
            "",
            "Ejemplos:",
            " python3 fetch_transcript.py K08TM4OVLyo",
            " python3 fetch_transcript.py K08TM4OVLyo es",
            " python3 fetch_transcript.py K08TM4OVLyo es chrome",
            " python3 fetch_transcript.py K08TM4OVLyo es 'chrome:Profile 1'",
            " python3 fetch_transcript.py K08TM4OVLyo es firefox",
            "",
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    # Positional arguments: the language defaults to Spanish, the browser is optional.
    video_id = cli_args[0]
    lang = cli_args[1] if len(cli_args) > 1 else "es"
    browser = cli_args[2] if len(cli_args) > 2 else None

    print(f"🔍 Intentando obtener transcript para: {video_id}")
    print(f" Idioma: {lang}")

    if not browser:
        # No browser given: go through the project's own transcript API.
        cookies_path = os.getenv('API_COOKIES_PATH', './data/cookies.txt')
        print(" Método: API del proyecto")
        print(" Cookies: {}".format(cookies_path))
        segments, error = get_transcript_data(video_id, lang)
    else:
        print(" Método: Cookies desde {}".format(browser))
        segments, error = fetch_with_browser_cookies(video_id, lang, browser)

    print("")

    if error:
        print(f"❌ ERROR: {error}")
        sys.exit(1)

    if not segments:
        print("❌ No se obtuvieron segmentos")
        sys.exit(1)

    print(f"✅ Éxito: {len(segments)} segmentos obtenidos")

    # Full segment list as JSON.
    output_file = f"{video_id}_transcript.json"
    with open(output_file, 'w', encoding='utf-8') as json_handle:
        json.dump(segments, json_handle, ensure_ascii=False, indent=2)
    print(f"💾 Guardado en: {output_file}")

    # Plain text: one segment per line.
    text_file = f"{video_id}_transcript.txt"
    combined_text = "\n".join(seg.get('text', '') for seg in segments)
    with open(text_file, 'w', encoding='utf-8') as text_handle:
        text_handle.write(combined_text)
    print(f"📄 Texto guardado en: {text_file}")

    # Short preview of the first ten segments.
    print("\n📝 Primeros 10 segmentos:")
    for seg in segments[:10]:
        print(f" [{seg.get('start', 0):.1f}s] {seg.get('text', '')}")
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|