TubeScript-API/fetch_transcript.py

124 lines
4.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Fetch the transcript of a YouTube video using this project's functions.
Handles HTTP 429 and saves the result as JSON (plus a plain-text copy).

Usage:
    python3 fetch_transcript.py VIDEO_ID [LANG] [BROWSER]

Examples:
    python3 fetch_transcript.py K08TM4OVLyo es
    python3 fetch_transcript.py K08TM4OVLyo es chrome
    python3 fetch_transcript.py K08TM4OVLyo es "chrome:Profile 1"
"""
import sys
import json
import os
import subprocess
import tempfile
import glob
from main import parse_subtitle_format
def fetch_with_browser_cookies(video_id, lang="es", browser="chrome"):
    """Fetch a transcript by running yt-dlp with cookies read from a browser.

    yt-dlp is invoked with ``--cookies-from-browser`` and asked to write the
    manual and automatic subtitles for ``lang`` as VTT into a temporary
    directory. The first generated VTT file (in sorted order) is read and
    handed to ``parse_subtitle_format``.

    Args:
        video_id: YouTube video ID; it is interpolated into the watch URL.
        lang: Subtitle language passed to ``--sub-lang``.
        browser: Value passed to ``--cookies-from-browser``
            (e.g. ``"chrome"`` or ``"chrome:Profile 1"``).

    Returns:
        A ``(segments, error)`` tuple. On success ``segments`` is whatever
        ``parse_subtitle_format`` returns and ``error`` is ``None``; on
        failure ``segments`` is ``None`` and ``error`` is a message string.
        This function does not raise for ordinary failures.
    """
    print(f"🔑 Usando cookies desde navegador: {browser}")

    with tempfile.TemporaryDirectory() as tmpdir:
        cmd = [
            "yt-dlp",
            "--cookies-from-browser", browser,
            "--skip-download",
            "--write-auto-sub",
            "--write-sub",
            "--sub-lang", lang,
            "--sub-format", "vtt",
            "-o", os.path.join(tmpdir, "%(id)s.%(ext)s"),
            f"https://www.youtube.com/watch?v={video_id}",
        ]

        # Only the subprocess call is guarded by the yt-dlp specific handlers,
        # so a FileNotFoundError here really means the executable is missing.
        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=180)
        except subprocess.TimeoutExpired:
            return None, "Timeout al ejecutar yt-dlp"
        except FileNotFoundError:
            return None, "yt-dlp no está instalado. Ejecuta: pip install yt-dlp"
        except Exception as e:
            return None, f"Error: {str(e)[:200]}"

        # Escape both path parts so glob metacharacters in the temp directory
        # or the video ID are matched literally; sort for a deterministic pick
        # when yt-dlp writes more than one subtitle file.
        pattern = os.path.join(glob.escape(tmpdir), f"{glob.escape(video_id)}*.vtt")
        files = sorted(glob.glob(pattern))
        if not files:
            stderr = result.stderr or ''
            return None, f"No se generaron archivos. Error: {stderr[:500]}"

        # Reading/parsing failures keep the generic error contract instead of
        # being mistaken for a missing yt-dlp installation.
        try:
            with open(files[0], 'r', encoding='utf-8') as f:
                vtt_content = f.read()
            segments = parse_subtitle_format(vtt_content, 'vtt')
        except Exception as e:
            return None, f"Error: {str(e)[:200]}"
        return segments, None
def main():
    """Command-line entry point: fetch a transcript and save it to disk.

    Reads ``VIDEO_ID [LANG] [BROWSER]`` from ``sys.argv``. When BROWSER is
    given the transcript is fetched through ``fetch_with_browser_cookies``;
    otherwise the project's ``get_transcript_data`` is used. The transcript
    is fetched exactly once, by the selected method.

    On success the segments are written to ``<VIDEO_ID>_transcript.json`` and
    their texts, one per line, to ``<VIDEO_ID>_transcript.txt`` in the current
    directory, and the first 10 segments are printed.

    Exits with status 1 when VIDEO_ID is missing, when the fetch reports an
    error, or when no segments were returned.
    """
    if len(sys.argv) < 2:
        print("Uso: python3 fetch_transcript.py VIDEO_ID [LANG] [BROWSER]")
        print("")
        print("Ejemplos:")
        print(" python3 fetch_transcript.py K08TM4OVLyo")
        print(" python3 fetch_transcript.py K08TM4OVLyo es")
        print(" python3 fetch_transcript.py K08TM4OVLyo es chrome")
        print(" python3 fetch_transcript.py K08TM4OVLyo es 'chrome:Profile 1'")
        print(" python3 fetch_transcript.py K08TM4OVLyo es firefox")
        print("")
        sys.exit(1)

    video_id = sys.argv[1]
    lang = sys.argv[2] if len(sys.argv) > 2 else "es"
    browser = sys.argv[3] if len(sys.argv) > 3 else None

    print(f"🔍 Intentando obtener transcript para: {video_id}")
    print(f" Idioma: {lang}")

    if browser:
        print(f" Método: Cookies desde {browser}")
        segments, error = fetch_with_browser_cookies(video_id, lang, browser)
    else:
        print(" Método: API del proyecto")
        print(f" Cookies: {os.getenv('API_COOKIES_PATH', './cookies.txt')}")
        # Imported lazily so the browser-cookie path does not need it.
        from main import get_transcript_data
        segments, error = get_transcript_data(video_id, lang)
    print("")

    # NOTE: the result of the selected method is used as-is. A second,
    # unconditional get_transcript_data() call used to live here; it discarded
    # the browser result and failed when the API import had not run.
    if error:
        print(f"❌ ERROR: {error}")
        sys.exit(1)
    if not segments:
        print("❌ No se obtuvieron segmentos")
        sys.exit(1)

    print(f"✅ Éxito: {len(segments)} segmentos obtenidos")

    # Save the raw segments as JSON.
    output_file = f"{video_id}_transcript.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(segments, f, ensure_ascii=False, indent=2)
    print(f"💾 Guardado en: {output_file}")

    # Save the concatenated text, one segment per line.
    text_file = f"{video_id}_transcript.txt"
    combined_text = "\n".join(seg.get('text', '') for seg in segments)
    with open(text_file, 'w', encoding='utf-8') as f:
        f.write(combined_text)
    print(f"📄 Texto guardado en: {text_file}")

    # Show a preview of the first 10 segments.
    print("\n📝 Primeros 10 segmentos:")
    for seg in segments[:10]:
        print(f" [{seg.get('start', 0):.1f}s] {seg.get('text', '')}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()