# TubeScript-API/fetch_transcript.py

#!/usr/bin/env python3
"""
Script para obtener transcript de un video de YouTube usando las funciones del proyecto.
Maneja HTTP 429 y guarda el resultado en JSON.

Uso:
    python3 fetch_transcript.py VIDEO_ID [LANG] [BROWSER]

Ejemplos:
    python3 fetch_transcript.py K08TM4OVLyo es
    python3 fetch_transcript.py K08TM4OVLyo es chrome
    python3 fetch_transcript.py K08TM4OVLyo es "chrome:Profile 1"
"""
import sys
import json
import os
import subprocess
import tempfile
import glob
from main import parse_subtitle_format, get_transcript_data

def fetch_with_browser_cookies(video_id, lang="es", browser="chrome"):
    """Fetch a video's subtitles with yt-dlp, using cookies read from a browser.

    Runs ``yt-dlp`` inside a temporary directory, requesting both manual and
    automatic subtitles in VTT format without downloading the media, then
    parses one of the produced ``.vtt`` files with ``parse_subtitle_format``.

    Args:
        video_id: YouTube video ID; it is inserted into the watch URL and used
            as the prefix of the expected subtitle file name.
        lang: Subtitle language passed to ``--sub-lang``.
        browser: Value passed to ``--cookies-from-browser`` (for example
            ``"chrome"`` or ``"chrome:Profile 1"``).

    Returns:
        A ``(segments, error)`` tuple. On success ``segments`` is the value
        returned by ``parse_subtitle_format`` and ``error`` is ``None``. On
        failure ``segments`` is ``None`` and ``error`` is a message string.
        No exception derived from ``Exception`` escapes this function.
    """
    print(f"🔑 Usando cookies desde navegador: {browser}")

    with tempfile.TemporaryDirectory() as tmpdir:
        cmd = [
            "yt-dlp",
            "--cookies-from-browser", browser,
            "--skip-download",
            "--write-auto-sub",
            "--write-sub",
            "--sub-lang", lang,
            "--sub-format", "vtt",
            "-o", os.path.join(tmpdir, "%(id)s.%(ext)s"),
            f"https://www.youtube.com/watch?v={video_id}"
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=180)

            # Escape the directory and the ID so that glob metacharacters in
            # either one (e.g. a TMPDIR containing "[") are matched literally;
            # only the trailing "*" is meant to act as a wildcard.
            pattern = os.path.join(
                glob.escape(tmpdir), f"{glob.escape(video_id)}*.vtt"
            )
            # Sort so the chosen file does not depend on directory listing order.
            files = sorted(glob.glob(pattern))
            if files:
                with open(files[0], 'r', encoding='utf-8') as f:
                    vtt_content = f.read()
                segments = parse_subtitle_format(vtt_content, 'vtt')
                return segments, None

            # The exit code is not inspected: the absence of a subtitle file is
            # what counts as failure, and stderr is reported for diagnosis.
            stderr = result.stderr or ''
            return None, f"No se generaron archivos. Error: {stderr[:500]}"

        except subprocess.TimeoutExpired:
            return None, "Timeout al ejecutar yt-dlp"
        except FileNotFoundError:
            # Raised by subprocess.run when the yt-dlp executable is not on PATH.
            return None, "yt-dlp no está instalado. Ejecuta: pip install yt-dlp"
        except Exception as e:
            # Best-effort boundary: report any other failure as an error string.
            return None, f"Error: {str(e)[:200]}"

def main():
    """Command-line entry point: fetch a transcript and save it to disk.

    Reads ``VIDEO_ID [LANG] [BROWSER]`` from ``sys.argv``; ``LANG`` defaults
    to ``"es"``. When a non-empty browser argument is given the transcript is
    fetched through ``fetch_with_browser_cookies``; otherwise through
    ``get_transcript_data``. On success the segments are written to
    ``<VIDEO_ID>_transcript.json``, their ``'text'`` values (one per line) to
    ``<VIDEO_ID>_transcript.txt``, and the first ten segments are printed.

    Exits with status 1 when no arguments are given, when the fetcher
    reports an error, or when no segments come back.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        usage_lines = (
            "Uso: python3 fetch_transcript.py VIDEO_ID [LANG] [BROWSER]",
            "",
            "Ejemplos:",
            "  python3 fetch_transcript.py K08TM4OVLyo",
            "  python3 fetch_transcript.py K08TM4OVLyo es",
            "  python3 fetch_transcript.py K08TM4OVLyo es chrome",
            "  python3 fetch_transcript.py K08TM4OVLyo es 'chrome:Profile 1'",
            "  python3 fetch_transcript.py K08TM4OVLyo es firefox",
            "",
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    # Positional arguments; anything past the third one is ignored.
    video_id = cli_args[0]
    lang = cli_args[1] if len(cli_args) >= 2 else "es"
    browser = cli_args[2] if len(cli_args) >= 3 else None

    print(f"🔍 Intentando obtener transcript para: {video_id}")
    print(f"   Idioma: {lang}")

    if not browser:
        # No browser given: use the project's own fetcher.
        cookies_path = os.environ.get('API_COOKIES_PATH', './data/cookies.txt')
        print("   Método: API del proyecto")
        print(f"   Cookies: {cookies_path}")
        segments, error = get_transcript_data(video_id, lang)
    else:
        print(f"   Método: Cookies desde {browser}")
        segments, error = fetch_with_browser_cookies(video_id, lang, browser)

    print("")

    # A reported error takes precedence over an empty result.
    failure_message = None
    if error:
        failure_message = f"❌ ERROR: {error}"
    elif not segments:
        failure_message = "❌ No se obtuvieron segmentos"
    if failure_message is not None:
        print(failure_message)
        sys.exit(1)

    print(f"✅ Éxito: {len(segments)} segmentos obtenidos")

    # Full segment data as pretty-printed JSON.
    json_path = f"{video_id}_transcript.json"
    with open(json_path, 'w', encoding='utf-8') as json_out:
        json.dump(segments, json_out, ensure_ascii=False, indent=2)
    print(f"💾 Guardado en: {json_path}")

    # Plain-text version: one segment text per line.
    txt_path = f"{video_id}_transcript.txt"
    plain_text = "\n".join(segment.get('text', '') for segment in segments)
    with open(txt_path, 'w', encoding='utf-8') as txt_out:
        txt_out.write(plain_text)
    print(f"📄 Texto guardado en: {txt_path}")

    # Preview of the first ten segments.
    print("\n📝 Primeros 10 segmentos:")
    for segment in segments[:10]:
        start_seconds = segment.get('start', 0)
        caption = segment.get('text', '')
        print(f"   [{start_seconds:.1f}s] {caption}")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()