#!/usr/bin/env python3
"""
Fetch the transcript of a YouTube video using the project's functions.

The resulting segments are saved as JSON and as plain concatenated text.
NOTE(review): the original header says HTTP 429 is handled; no such handling
is visible in this file, so it presumably lives in ``main.get_transcript_data``
-- confirm.

Usage:
    python3 fetch_transcript.py VIDEO_ID [LANG] [BROWSER]

Examples:
    python3 fetch_transcript.py K08TM4OVLyo es
    python3 fetch_transcript.py K08TM4OVLyo es chrome
    python3 fetch_transcript.py K08TM4OVLyo es "chrome:Profile 1"
"""

import sys
import json
import os
import subprocess
import tempfile
import glob
from main import parse_subtitle_format


def fetch_with_browser_cookies(video_id, lang="es", browser="chrome"):
    """Try to fetch a transcript with yt-dlp using cookies read from a browser.

    Runs ``yt-dlp`` with ``--cookies-from-browser`` to download VTT subtitles
    (manual or automatic) into a temporary directory, then parses the first
    generated file with ``parse_subtitle_format``.

    Args:
        video_id: YouTube video identifier.
        lang: Subtitle language passed to ``--sub-lang``.
        browser: Value passed to ``--cookies-from-browser``
            (for example ``"chrome"`` or ``"chrome:Profile 1"``).

    Returns:
        A ``(segments, error)`` tuple. On success ``segments`` is whatever
        ``parse_subtitle_format`` returns and ``error`` is ``None``; on
        failure ``segments`` is ``None`` and ``error`` is a message string.
    """
    print(f"🔑 Usando cookies desde navegador: {browser}")

    with tempfile.TemporaryDirectory() as tmpdir:
        cmd = [
            "yt-dlp",
            "--cookies-from-browser", browser,
            "--skip-download",
            "--write-auto-sub",
            "--write-sub",
            "--sub-lang", lang,
            "--sub-format", "vtt",
            "-o", os.path.join(tmpdir, "%(id)s.%(ext)s"),
            f"https://www.youtube.com/watch?v={video_id}",
        ]

        try:
            result = subprocess.run(
                cmd, capture_output=True, text=True, timeout=180
            )

            # Look for the generated VTT files. glob order is unspecified,
            # so sort to make the choice of file deterministic.
            files = sorted(glob.glob(os.path.join(tmpdir, f"{video_id}*.vtt")))
            if not files:
                stderr = result.stderr or ''
                return None, f"No se generaron archivos. Error: {stderr[:500]}"

            with open(files[0], 'r', encoding='utf-8') as f:
                vtt_content = f.read()
            segments = parse_subtitle_format(vtt_content, 'vtt')
            return segments, None

        except subprocess.TimeoutExpired:
            return None, "Timeout al ejecutar yt-dlp"
        except FileNotFoundError:
            # Raised by subprocess.run when the yt-dlp executable is missing.
            return None, "yt-dlp no está instalado. Ejecuta: pip install yt-dlp"
        except Exception as e:
            # Boundary handler: report any other failure through the error
            # slot of the returned tuple instead of crashing the script.
            return None, f"Error: {str(e)[:200]}"


def main():
    """Command-line entry point.

    Reads ``VIDEO_ID [LANG] [BROWSER]`` from ``sys.argv``, fetches the
    transcript (through browser cookies when BROWSER is given, otherwise
    through the project's ``get_transcript_data``), writes
    ``<VIDEO_ID>_transcript.json`` and ``<VIDEO_ID>_transcript.txt`` in the
    current directory and prints the first 10 segments. Exits with status 1
    on usage errors, fetch errors, or an empty result.
    """
    if len(sys.argv) < 2:
        print("Uso: python3 fetch_transcript.py VIDEO_ID [LANG] [BROWSER]")
        print("")
        print("Ejemplos:")
        print(" python3 fetch_transcript.py K08TM4OVLyo")
        print(" python3 fetch_transcript.py K08TM4OVLyo es")
        print(" python3 fetch_transcript.py K08TM4OVLyo es chrome")
        print(" python3 fetch_transcript.py K08TM4OVLyo es 'chrome:Profile 1'")
        print(" python3 fetch_transcript.py K08TM4OVLyo es firefox")
        print("")
        sys.exit(1)

    video_id = sys.argv[1]
    lang = sys.argv[2] if len(sys.argv) > 2 else "es"
    browser = sys.argv[3] if len(sys.argv) > 3 else None

    print(f"🔍 Intentando obtener transcript para: {video_id}")
    print(f" Idioma: {lang}")

    # Fetch exactly once, via the method selected by the arguments. A second
    # unconditional get_transcript_data() call used to follow this dispatch;
    # it discarded the browser result and raised UnboundLocalError on the
    # browser path, because the name is only imported in the else branch.
    if browser:
        print(f" Método: Cookies desde {browser}")
        segments, error = fetch_with_browser_cookies(video_id, lang, browser)
    else:
        print(" Método: API del proyecto")
        print(f" Cookies: {os.getenv('API_COOKIES_PATH', './cookies.txt')}")
        # Imported lazily so the browser path does not need this name.
        from main import get_transcript_data
        segments, error = get_transcript_data(video_id, lang)

    print("")

    if error:
        print(f"❌ ERROR: {error}")
        sys.exit(1)

    if not segments:
        print("❌ No se obtuvieron segmentos")
        sys.exit(1)

    print(f"✅ Éxito: {len(segments)} segmentos obtenidos")

    # Save the raw segments as JSON.
    output_file = f"{video_id}_transcript.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(segments, f, ensure_ascii=False, indent=2)
    print(f"💾 Guardado en: {output_file}")

    # Save the concatenated text, one segment per line.
    text_file = f"{video_id}_transcript.txt"
    combined_text = "\n".join(seg.get('text', '') for seg in segments)
    with open(text_file, 'w', encoding='utf-8') as f:
        f.write(combined_text)
    print(f"📄 Texto guardado en: {text_file}")

    # Show the first 10 segments as a quick preview.
    print("\n📝 Primeros 10 segmentos:")
    for seg in segments[:10]:
        print(f" [{seg.get('start', 0):.1f}s] {seg.get('text', '')}")


if __name__ == "__main__":
    main()