import os
import json
import subprocess
import requests
import time
import re
import tempfile
import glob
import random
from fastapi import FastAPI, HTTPException, UploadFile, File
from typing import List, Dict, Any, cast
from fastapi.responses import JSONResponse

# Optional fallback: youtube_transcript_api may not be installed.
try:
    from youtube_transcript_api import YouTubeTranscriptApi
    from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
    YOUTUBE_TRANSCRIPT_API_AVAILABLE = True
except Exception:
    # Define placeholders so later references never raise NameError.
    YouTubeTranscriptApi = None

    class TranscriptsDisabled(Exception):
        pass

    class NoTranscriptFound(Exception):
        pass

    YOUTUBE_TRANSCRIPT_API_AVAILABLE = False

# CookieManager (from yt_wrap) provides a per-request cookiefile path.
from yt_wrap import CookieManager

app = FastAPI(title="TubeScript API Pro - JSON Cleaner")

# Cookie file path, overridable via the API_COOKIES_PATH env var.
# Defaults to ./data/cookies.txt to group configuration under ./data.
DEFAULT_COOKIES_PATH = './data/cookies.txt'

# Optional proxy for requests/yt-dlp (e.g. socks5h://127.0.0.1:9050).
DEFAULT_PROXY = os.getenv('API_PROXY', '')

# Proxy selection:
# - If API_PROXY is set it is used directly.
# - If API_PROXIES (comma-separated list) is set, one is picked at random.
#   e.g. API_PROXIES="socks5h://127.0.0.1:9050,http://10.0.0.1:3128"
# - Otherwise a whitelist file (one proxy per line) is consulted.
PROXY_WHITELIST_FILE = os.getenv('PROXY_WHITELIST_FILE', 'tools/whitelist.txt')

# TTL cache for the whitelist file: 'ts' is the last-read epoch time.
_proxy_whitelist_cache = {
    'ts': 0,
    'proxies': []
}


def _load_whitelist_file(path: str, ttl: int = 30):
    """Load proxies from *path*, caching the result for *ttl* seconds.

    Blank lines and lines starting with '#' are skipped. Read errors
    yield an empty list. Returns the (possibly empty) list of proxies.
    """
    now = time.time()
    # Serve from cache only while it holds entries and is still fresh.
    if _proxy_whitelist_cache['proxies'] and (now - _proxy_whitelist_cache['ts'] < ttl):
        return _proxy_whitelist_cache['proxies']
    proxies = []
    try:
        if os.path.exists(path):
            with open(path, 'r', encoding='utf-8') as fh:
                for line in fh:
                    p = line.strip()
                    if p and not p.startswith('#'):
                        proxies.append(p)
    except Exception:
        proxies = []
    _proxy_whitelist_cache['proxies'] = proxies
    _proxy_whitelist_cache['ts'] = now
    return proxies


def _get_proxy_choice() -> str | None:
    """Return a proxy URL or None.

    Priority: API_PROXY (single) -> API_PROXIES (comma list) ->
    PROXY_WHITELIST_FILE -> None.
    """
    # 1) Legacy single proxy has priority.
    single = os.getenv('API_PROXY', '') or DEFAULT_PROXY or ''
    if single:
        return single
    # 2) Comma-separated list from env; pick one at random.
    lst = os.getenv('API_PROXIES', '') or ''
    if lst:
        proxies = [p.strip() for p in lst.split(',') if p.strip()]
        if proxies:
            return random.choice(proxies)
    # 3) Whitelist file.
    wl_file = os.getenv('PROXY_WHITELIST_FILE', PROXY_WHITELIST_FILE)
    proxies = _load_whitelist_file(wl_file)
    if proxies:
        return random.choice(proxies)
    return None


def clean_youtube_json(raw_json: Dict) -> List[Dict]:
    """Convert YouTube's complex 'json3' caption payload to a simple list:
    [{'start': 0.0, 'duration': 2.0, 'text': 'Hola'}]
    """
    clean_data = []
    # YouTube stores the text events under the 'events' key.
    events = raw_json.get('events', [])
    for event in events:
        # Only process events that carry text segments.
        if 'segs' in event:
            # Fix: segments may lack the 'utf8' key — use .get() instead of
            # indexing so a malformed segment cannot raise KeyError.
            text = "".join(seg.get('utf8', '') for seg in event['segs']).strip()
            if text and text != '\n':
                clean_data.append({
                    "start": event.get('tStartMs', 0) / 1000.0,  # ms -> seconds
                    "duration": event.get('dDurationMs', 0) / 1000.0,
                    "text": text.replace('\n', ' ')
                })
    return clean_data
def parse_subtitle_format(content: str, format_type: str) -> List[Dict]:
    """Parse subtitle payloads (json3, srv3, vtt) into the standard
    [{'start', 'duration', 'text'}] list.

    Returns [] whenever the payload cannot be parsed.
    """
    try:
        if format_type == 'json3':
            # YouTube's JSON3 format.
            data = json.loads(content) if isinstance(content, str) else content
            return clean_youtube_json(data)
        elif format_type in ['srv3', 'vtt']:
            # Some srv3/vtt payloads are actually JSON with a json3-like shape.
            try:
                data = json.loads(content) if isinstance(content, str) else content
                if 'events' in data:
                    return clean_youtube_json(data)
            except Exception:  # fix: was a bare except
                pass
            # Not JSON — treat it as plain VTT text.
            clean_data = []
            lines = content.split('\n') if isinstance(content, str) else []
            current_time = 0.0
            current_text = ""
            for line in lines:
                line = line.strip()
                if not line or line.startswith('WEBVTT') or '-->' in line:
                    if '-->' in line:
                        # Extract the cue start time from the last two ':' fields
                        # (NOTE: hours, if present, are ignored — existing behavior).
                        try:
                            time_parts = line.split('-->')[0].strip().split(':')
                            if len(time_parts) >= 2:
                                current_time = float(time_parts[-2]) * 60 + float(time_parts[-1])
                        except Exception:  # fix: was a bare except
                            pass
                    continue
                if line and not line.isdigit():
                    current_text = line
                    if current_text:
                        clean_data.append({
                            "start": current_time,
                            "duration": 2.0,  # approximate per-line duration
                            "text": current_text
                        })
                        current_time += 2.0
            return clean_data if clean_data else []
        else:
            # Unknown format: last-ditch attempt as JSON.
            data = json.loads(content) if isinstance(content, str) else content
            if 'events' in data:
                return clean_youtube_json(data)
            return []
    except Exception as e:
        print(f"Error parsing subtitle format {format_type}: {e}")
        return []


def extract_video_id(video_id_or_url: str) -> str:
    """Normalize the input and extract the video id from a full URL.

    Accepts https://www.youtube.com/watch?v=ID, youtu.be/ID, or a bare ID.
    Unrecognized input is returned unchanged (it will fail validation later).
    """
    if not video_id_or_url:
        return ""
    s = video_id_or_url.strip()
    # Already looks like an id (8-20 chars of [A-Za-z0-9_-]).
    if re.match(r'^[A-Za-z0-9_-]{8,20}$', s):
        return s
    # watch?v=
    m = re.search(r'[?&]v=([A-Za-z0-9_-]{8,20})', s)
    if m:
        return m.group(1)
    # youtu.be/
    m = re.search(r'youtu\.be/([A-Za-z0-9_-]{8,20})', s)
    if m:
        return m.group(1)
    # /v/ or /embed/
    m = re.search(r'(?:/v/|/embed/)([A-Za-z0-9_-]{8,20})', s)
    if m:
        return m.group(1)
    return s


def format_segments_text(segments: List[Dict]) -> List[str]:
    """Return a 'format_text' list of cleaned sentences extracted from segments.

    - strips 'Kind: captions'-style prefixes
    - removes bracketed/parenthesized content and HTML tags
    - removes decorative symbols and normalizes whitespace
    - splits on newlines to yield independent phrases
    """
    def _clean_text(t: str) -> str:
        if not t:
            return ''
        s = str(t).strip()
        s = re.sub(r'^\s*Kind\s*:\s*.*$', '', s, flags=re.IGNORECASE).strip()
        # Drop bracketed content (non-greedy), parentheses, and HTML tags.
        s = re.sub(r'\[[^\]]*\]', '', s)
        s = re.sub(r'\([^\)]*\)', '', s)
        s = re.sub(r'<[^>]+>', '', s)
        s = re.sub(r'[♪★■◆►▶◀•–—]', '', s)
        s = re.sub(r'\s+', ' ', s).strip()
        return s

    output: List[str] = []
    for seg in segments or []:
        raw = seg.get('text', '')
        cleaned = _clean_text(raw)
        if not cleaned:
            continue
        parts = [p.strip() for p in re.split(r'[\n\r]+', cleaned) if p.strip()]
        output.extend(parts)
    return output


# Node.js binary used by yt-dlp's JS runtime (n-challenge/signature solving).
NODE_PATH = "/usr/bin/node"
- Sin cookies → tv_embedded (sin PO Token, sin n-challenge, funciona para metadata) - Con cookies → web + Node.js (Node resuelve n-challenge/signature) - for_stream → tv_embedded (más fiable para HLS/lives sin cookies) Diagnóstico: - android → requiere GVS PO Token (2026) → NO usar - mweb → requiere Visitor Data PO Token → NO usar sin cookies - tv_embedded → sin PO Token requerido → ✅ funciona para metadata/stream - web + Node.js → ✅ funciona con cookies """ if for_stream or not has_cookies: return ["--extractor-args", "youtube:player_client=tv_embedded"] else: return [ "--extractor-args", "youtube:player_client=web", "--js-runtimes", f"node:{NODE_PATH}", ] def _yt_subs_args(has_cookies: bool) -> list: """Devuelve --extractor-args para descarga de subtítulos. Estrategia actualizada 2026-03-07: - android → requiere GVS PO Token desde 2026 → subtítulos HTTP 403 → NO usar. - tv_embedded → sin PO Token, obtiene auto-subs sin bot-check → ✅ preferido. - mweb → fallback útil si tv_embedded no trae subs en ciertos idiomas. - web + Node → sólo con cookies (resuelve n-challenge). """ if has_cookies: return [ "--extractor-args", "youtube:player_client=web", "--js-runtimes", f"node:{NODE_PATH}", ] return ["--extractor-args", "youtube:player_client=tv_embedded,mweb"] # Nuevo helper: obtener thumbnails para un video — usa URLs estáticas directas (sin yt-dlp) def get_video_thumbnails(video_id: str) -> List[str]: """Devuelve URLs de thumbnail sin llamar yt-dlp (rápido, sin bloquear el transcript). YouTube siempre tiene estas URLs disponibles para cualquier video público. 
""" return [ f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg", f"https://img.youtube.com/vi/{video_id}/sddefault.jpg", f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg", f"https://img.youtube.com/vi/{video_id}/mqdefault.jpg", f"https://img.youtube.com/vi/{video_id}/default.jpg", ] def get_transcript_data(video_id: str, lang: str = "es"): video_id = extract_video_id(video_id) if not video_id: return None, [], "video_id inválido o vacío" url = f"https://www.youtube.com/watch?v={video_id}" # Use CookieManager to get a cookiefile path per request (may be None) cookie_mgr = CookieManager() cookiefile_path = cookie_mgr.get_cookiefile_path() # cookies_path: prefer the temporary cookiefile if present, otherwise fall back to env path cookies_path = cookiefile_path or os.getenv('API_COOKIES_PATH', DEFAULT_COOKIES_PATH) # proxy support proxy = _get_proxy_choice() proxies = {'http': proxy, 'https': proxy} if proxy else None def load_cookies_from_file(path: str) -> dict: """Parsea un cookies.txt en formato Netscape a un dict usable por requests.""" cookies = {} try: if not path or not os.path.exists(path): return cookies with open(path, 'r', encoding='utf-8', errors='ignore') as fh: for line in fh: line = line.strip() if not line or line.startswith('#'): continue parts = line.split('\t') # formato Netscape: domain, flag, path, secure, expiration, name, value if len(parts) >= 7: name = parts[5].strip() value = parts[6].strip() if name: cookies[name] = value else: # fallback: intento simple name=value if '=' in line: k, v = line.split('=', 1) cookies[k.strip()] = v.strip() except Exception: return {} return cookies cookies_for_requests = load_cookies_from_file(cookies_path) if cookies_path else {} _has_ck_subs = bool(cookies_path and os.path.exists(cookies_path)) # Intento rápido y fiable: usar yt-dlp para descargar subtítulos (auto o manual) al tmpdir try: with tempfile.TemporaryDirectory() as tmpdl: # Construir lista amplia de variantes de idioma # yt-dlp usa 
códigos exactos; cubrimos las variantes más comunes sub_langs = [lang] if lang == "en": sub_langs = ["en", "en-US", "en-en", "en-GB", "en-CA", "en-AU"] elif lang == "es": sub_langs = ["es", "es-419", "es-MX", "es-ES", "es-LA", "es-en"] elif len(lang) == 2: sub_langs = [lang, f"{lang}-{lang.upper()}", f"{lang}-419", f"{lang}-en"] # tv_embedded/mweb para subtítulos sin cookies (no requieren PO Token) # web + Node.js cuando hay cookies (resuelve n-challenge) ytdlp_cmd = [ "yt-dlp", url, "--skip-download", "--write-auto-sub", "--write-sub", "--sub-format", "vtt/json3/srv3/best", "-o", os.path.join(tmpdl, "%(id)s.%(ext)s"), "--no-warnings", "--sub-lang", ",".join(sub_langs), ] + _yt_subs_args(_has_ck_subs) # Pasar cookies solo cuando se usa cliente web (con cookies) if _has_ck_subs: ytdlp_cmd.extend(["--cookies", cookies_path]) # attach proxy if configured if proxy: ytdlp_cmd.extend(['--proxy', proxy]) try: result = subprocess.run(ytdlp_cmd, capture_output=True, text=True, timeout=120) stderr = (result.stderr or "").lower() # Error: YouTube pide autenticación if result.returncode != 0 and ('sign in' in stderr or 'confirm you' in stderr or 'bot' in stderr): return None, get_video_thumbnails(video_id), "YouTube requiere autenticación para este video. Sube un cookies.txt válido con /upload_cookies." # Si yt-dlp falló por rate limiting, devolver mensaje claro stderr = (result.stderr or "").lower() if result.returncode != 0 and ('sign in' in stderr or 'confirm you' in stderr or 'bot' in stderr): return None, get_video_thumbnails(video_id), "YouTube requiere autenticación para este video. Sube un cookies.txt válido con /upload_cookies." if result.returncode != 0 and ('http error 429' in stderr or 'too many requests' in stderr): return None, get_video_thumbnails(video_id), "YouTube está limitando las peticiones al descargar subtítulos (HTTP 429). Agrega un cookies.txt válido exportado desde tu navegador y monta en el contenedor, o espera unos minutos." 
if result.returncode != 0 and ('http error 403' in stderr or 'forbidden' in stderr): return None, get_video_thumbnails(video_id), "Acceso denegado al descargar subtítulos (HTTP 403). El video puede tener restricciones. Usa cookies.txt con una cuenta autorizada." except subprocess.TimeoutExpired: pass # revisar archivos creados — yt-dlp genera nombres con doble extensión: ID.lang.vtt # glob "ID.*" no hace match; usar "ID*" para cubrir ID.en.vtt, ID.en-en.vtt, etc. files = glob.glob(os.path.join(tmpdl, f"{video_id}*")) # filtrar solo archivos de texto (vtt, json3, srv3, ttml, srt) files = [f for f in files if os.path.isfile(f) and any(f.endswith(ext) for ext in ('.vtt', '.json3', '.srv3', '.srt', '.ttml'))] if files: combined = [] seen_content = set() for fpath in files: try: with open(fpath, 'r', encoding='utf-8') as fh: content = fh.read() # desduplicar archivos con mismo contenido (en.vtt vs en-en.vtt) content_hash = hash(content[:500]) if content_hash not in seen_content: seen_content.add(content_hash) combined.append(content) except Exception: continue if combined: vtt_combined = "\n".join(combined) parsed = parse_subtitle_format(vtt_combined, 'vtt') # filtrar segmentos de ruido del header VTT _noise = {'kind: captions', 'language:', 'webvtt', 'position:', 'align:'} parsed = [s for s in parsed if s.get('text') and not any(s['text'].lower().startswith(n) for n in _noise)] if parsed: return parsed, get_video_thumbnails(video_id), None finally: # cleanup any temp cookiefile created for this request try: cookie_mgr.cleanup() except Exception: pass # ...existing code continues... 
# 1) Intento principal: obtener metadata con yt-dlp _has_ck = os.path.exists(cookies_path) command = [ "yt-dlp", "--skip-download", "--dump-json", "--no-warnings", ] + _yt_client_args(_has_ck) + [url] if _has_ck: command.extend(["--cookies", cookies_path]) if proxy: command.extend(['--proxy', proxy]) try: result = subprocess.run(command, capture_output=True, text=True, timeout=60) if result.returncode != 0: error_msg = result.stderr if result.stderr else "Error desconocido from yt-dlp" # Si yt-dlp reporta algo, enviar mensaje útil # No abortar inmediatamente: intentaremos fallback descargando subs con yt-dlp video_metadata = None else: if not result.stdout.strip(): video_metadata = None else: try: video_metadata = json.loads(result.stdout) except Exception: video_metadata = None except subprocess.TimeoutExpired: video_metadata = None except FileNotFoundError: return None, get_video_thumbnails(video_id), "yt-dlp no está instalado en el contenedor/entorno. Instala yt-dlp y vuelve a intentar." 
except Exception as e: video_metadata = None requested_subs = {} if video_metadata: requested_subs = video_metadata.get('requested_subtitles', {}) or {} # Buscar en automatic_captions y subtitles si requested_subs está vacío if not requested_subs: automatic_captions = video_metadata.get('automatic_captions', {}) or {} for lang_key, formats in automatic_captions.items(): if lang in lang_key or lang_key.startswith(lang): if formats: requested_subs = {lang_key: formats[0]} break if not requested_subs: subtitles = video_metadata.get('subtitles', {}) or {} for lang_key, formats in subtitles.items(): if lang in lang_key or lang_key.startswith(lang): if formats: requested_subs = {lang_key: formats[0]} break # Si requested_subs está disponible, intentar descargar vía requests la URL proporcionada if requested_subs: lang_key = next(iter(requested_subs)) sub_url = requested_subs[lang_key].get('url') if sub_url: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'application/json, text/plain, */*', 'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8', 'Referer': 'https://www.youtube.com/', } max_retries = 3 response = None rate_limited = False for attempt in range(max_retries): try: response = requests.get(sub_url, headers=headers, timeout=30, cookies=cookies_for_requests, proxies=proxies) if response.status_code == 200: break elif response.status_code == 429: rate_limited = True if attempt < max_retries - 1: time.sleep(2 * (attempt + 1)) continue else: # salir del loop y usar fallback con yt-dlp más abajo break elif response.status_code == 403: return None, get_video_thumbnails(video_id), "Acceso denegado (HTTP 403). El video puede tener restricciones de edad o región. Intenta con cookies.txt." 
elif response.status_code == 404: # No encontramos la URL esperada; intentar fallback response = None break else: return None, get_video_thumbnails(video_id), f"Error al descargar subtítulos desde YouTube (HTTP {response.status_code})." except requests.exceptions.Timeout: if attempt < max_retries - 1: continue return None, get_video_thumbnails(video_id), "Timeout al descargar subtítulos. Intenta nuevamente." except requests.exceptions.RequestException as e: return None, get_video_thumbnails(video_id), f"Error de conexión al descargar subtítulos: {str(e)[:100]}" # Si obtuvimos un 200, procesarlo; si hubo rate limiting, intentar fallback con yt-dlp if response and response.status_code == 200: subtitle_format = requested_subs[lang_key].get('ext', 'json3') try: # Si la respuesta parece ser una playlist M3U8 o contiene enlaces a timedtext, # extraer las URLs y concatenar su contenido (VTT) antes de parsear. text_body = response.text if isinstance(response.text, str) else None if text_body and ('#EXTM3U' in text_body or 'timedtext' in text_body or text_body.strip().lower().startswith('#extm3u')): # Extraer URLs (líneas que empiecen con http) urls = re.findall(r'^(https?://\S+)', text_body, flags=re.M) # Intento 1: descargar cada URL con requests (usa cookies montadas si aplican) combined = [] for idx, u in enumerate(urls): try: r2 = requests.get(u, headers=headers, timeout=20, cookies=cookies_for_requests, proxies=proxies) if r2.status_code == 200 and r2.text: combined.append(r2.text) continue # Si recibimos 429, 403, o falló, intentaremos con yt-dlp (fallback) if r2.status_code == 429: # fallback a yt-dlp raise Exception('rate_limited') except Exception: # fallthrough al fallback con yt-dlp pass # Intento 2 (fallback): usar yt-dlp para descargar ese timedtext/url a un archivo temporal try: with tempfile.TemporaryDirectory() as tdir: out_template = os.path.join(tdir, f"timedtext_{idx}.%(ext)s") ytdlp_cmd = [ "yt-dlp", u, "-o", out_template, "--no-warnings", ] if 
os.path.exists(cookies_path): ytdlp_cmd.extend(["--cookies", cookies_path]) # pasar proxy a yt-dlp si está configurado if proxy: ytdlp_cmd.extend(['--proxy', proxy]) try: res2 = subprocess.run(ytdlp_cmd, capture_output=True, text=True, timeout=60) stderr2 = (res2.stderr or "").lower() if res2.returncode != 0 and ('http error 429' in stderr2 or 'too many requests' in stderr2): # rate limit cuando intentamos descargar timedtext return None, get_video_thumbnails(video_id), "YouTube está limitando las peticiones al descargar subtítulos (HTTP 429). Agrega cookies.txt válido o intenta más tarde." if res2.returncode != 0 and ('http error 403' in stderr2 or 'forbidden' in stderr2): return None, get_video_thumbnails(video_id), "Acceso denegado al descargar subtítulos (HTTP 403). Intenta con cookies.txt o una cuenta con permisos." except Exception: pass # leer cualquier archivo creado en el tempdir for fpath in glob.glob(os.path.join(tdir, "timedtext_*.*")): try: with open(fpath, 'r', encoding='utf-8') as fh: txt = fh.read() if txt: combined.append(txt) except Exception: continue except Exception: continue if combined: vtt_combined = "\n".join(combined) formatted_transcript = parse_subtitle_format(vtt_combined, 'vtt') if formatted_transcript: return formatted_transcript, get_video_thumbnails(video_id) except Exception as e: return None, get_video_thumbnails(video_id), f"Error al procesar los subtítulos: {str(e)[:200]}" if not formatted_transcript: return None, get_video_thumbnails(video_id), "Los subtítulos están vacíos o no se pudieron procesar." 
return formatted_transcript, get_video_thumbnails(video_id), None # Si hubo rate limiting, intentar fallback con yt-dlp para descargar la URL de subtítulos if rate_limited and (not response or response.status_code != 200): # Intentar descargar la URL de subtítulos directamente con yt-dlp (usa cookies si existen) try: with tempfile.TemporaryDirectory() as tdir: out_template = os.path.join(tdir, "sub.%(ext)s") ytdlp_cmd = [ "yt-dlp", sub_url, "-o", out_template, "--no-warnings", ] if os.path.exists(cookies_path): ytdlp_cmd.extend(["--cookies", cookies_path]) if proxy: ytdlp_cmd.extend(['--proxy', proxy]) res = subprocess.run(ytdlp_cmd, capture_output=True, text=True, timeout=90) stderr = (res.stderr or "").lower() if res.returncode != 0 and ('http error 429' in stderr or 'too many requests' in stderr): return None, get_video_thumbnails(video_id), "YouTube está limitando las peticiones al descargar subtítulos (HTTP 429). Agrega cookies.txt válido o intenta más tarde." # Leer archivos generados combined = [] for fpath in glob.glob(os.path.join(tdir, "*.*")): try: with open(fpath, 'r', encoding='utf-8') as fh: txt = fh.read() if txt: combined.append(txt) except Exception: continue if combined: vtt_combined = "\n".join(combined) formatted_transcript = parse_subtitle_format(vtt_combined, 'vtt') if formatted_transcript: return formatted_transcript, get_video_thumbnails(video_id) except FileNotFoundError: return None, get_video_thumbnails(video_id), "yt-dlp no está instalado en el contenedor/entorno. Instala yt-dlp y vuelve a intentar." 
except Exception: # seguir con otros fallbacks pass # si no logró con yt-dlp, continuar y dejar que los fallbacks posteriores manejen el caso # Fallback: intentarlo descargando subtítulos con yt-dlp a un directorio temporal # (esto cubre casos en que la metadata no incluye requested_subs) try: with tempfile.TemporaryDirectory() as tmpdir: # Intentar con auto-sub primero, luego con sub (manual) ytdlp_variants = [ ("--write-auto-sub", "auto"), ("--write-sub", "manual") ] downloaded = None for flag, label in ytdlp_variants: cmd = [ "yt-dlp", url, "--skip-download", flag, "--sub-lang", lang, "--sub-format", "json3/vtt/srv3/best", "-o", os.path.join(tmpdir, "%(id)s.%(ext)s"), "--no-warnings", ] + _yt_subs_args(_has_ck_subs) # Pasar cookies sólo con cliente web if _has_ck_subs: cmd.extend(["--cookies", cookies_path]) # añadir proxy a la llamada de yt-dlp si está configurado if proxy: cmd.extend(['--proxy', proxy]) r = subprocess.run(cmd, capture_output=True, text=True, timeout=120) # Revisar si se creó algún archivo en tmpdir (doble ext: ID.en.vtt) files = glob.glob(os.path.join(tmpdir, f"{video_id}*")) files = [f for f in files if os.path.isfile(f) and any(f.endswith(e) for e in ('.vtt', '.json3', '.srv3', '.srt', '.ttml'))] if files: # Tomar el primero válido downloaded = files[0] break if downloaded: ext = os.path.splitext(downloaded)[1].lstrip('.') try: with open(downloaded, 'r', encoding='utf-8') as fh: content = fh.read() except Exception as e: return None, get_video_thumbnails(video_id), f"Error leyendo archivo de subtítulos descargado: {str(e)[:200]}" # Intentar parsear según extensión conocida fmt = 'json3' if ext in ('json', 'json3') else ('vtt' if ext == 'vtt' else 'srv3') formatted_transcript = parse_subtitle_format(content, fmt) if formatted_transcript: return formatted_transcript, get_video_thumbnails(video_id), None else: return None, get_video_thumbnails(video_id), "Se descargaron subtítulos pero no se pudieron procesar." 
except FileNotFoundError: return None, get_video_thumbnails(video_id), "yt-dlp no está instalado. Instala yt-dlp en el contenedor/entorno y vuelve a intentar." except Exception as e: # No hacer crash, retornar mensaje general return None, get_video_thumbnails(video_id), f"Error al intentar descargar subtítulos con yt-dlp: {str(e)[:200]}" return None, get_video_thumbnails(video_id), ( f"No se encontraron subtítulos para este video en idioma '{lang}'. " "Puede que el video no tenga subtítulos, estén en otro idioma, o requiera autenticación. " "Prueba: ?lang=en | /debug/fetch_subs/{video_id} | sube cookies con /upload_cookies" ) # ── Clientes Innertube (sincronizados con NewPipeExtractor + yt-dlp 2026-03) ── _NP_IOS = { "clientName": "IOS", "clientVersion": "21.03.2", "clientScreen": "WATCH", "platform": "MOBILE", "deviceMake": "Apple", "deviceModel": "iPhone16,2", "osName": "iOS", "osVersion": "18.7.2.22H124", "userAgent": "com.google.ios.youtube/21.03.2 (iPhone16,2; U; CPU iOS 18_7_2 like Mac OS X;)", } _NP_ANDROID = { "clientName": "ANDROID", "clientVersion": "21.03.36", "clientScreen": "WATCH", "platform": "MOBILE", "osName": "Android", "osVersion": "16", "androidSdkVersion": 36, "userAgent": "com.google.android.youtube/21.03.36 (Linux; U; Android 16) gzip", } # tv_embedded: NO requiere PO Token, siempre devuelve videoDetails + hlsManifestUrl en lives # Es el cliente más fiable para obtener title/description sin autenticación. 
_NP_TV_EMBEDDED = { "clientName": "TVHTML5_SIMPLY_EMBEDDED_PLAYER", "clientVersion": "2.0", "clientScreen": "EMBED", "platform": "TV", "userAgent": "Mozilla/5.0 (SMART-TV; LINUX; Tizen 6.0) AppleWebKit/538.1 (KHTML, like Gecko) Version/6.0 TV Safari/538.1", } # GAPIS: youtubei.googleapis.com — usado por NewPipe para iOS/Android/TV _GAPIS_BASE = "https://youtubei.googleapis.com/youtubei/v1" def _np_build_ctx(client: dict, visitor_data: str = "") -> dict: """context.client igual que prepareJsonBuilder de YoutubeParsingHelper.java.""" ctx = { "clientName": client["clientName"], "clientVersion": client["clientVersion"], "clientScreen": client.get("clientScreen", "WATCH"), "platform": client.get("platform", "MOBILE"), "hl": "en", "gl": "US", "utcOffsetMinutes": 0, } if visitor_data: ctx["visitorData"] = visitor_data for k in ("deviceMake", "deviceModel", "osName", "osVersion", "androidSdkVersion"): if client.get(k): ctx[k] = client[k] return ctx def _np_get_visitor_data(client: dict, proxies: dict = None) -> str: """POST /visitor_id → responseContext.visitorData (getVisitorDataFromInnertube).""" try: ctx = _np_build_ctx(client) payload = { "context": { "client": ctx, "request": {"internalExperimentFlags": [], "useSsl": True}, "user": {"lockedSafetyMode": False}, } } headers = { "User-Agent": client["userAgent"], "X-Goog-Api-Format-Version": "2", "Content-Type": "application/json", } r = requests.post( f"{_GAPIS_BASE}/visitor_id?prettyPrint=false", json=payload, headers=headers, timeout=8, proxies=proxies, ) if r.status_code == 200: return r.json().get("responseContext", {}).get("visitorData", "") except Exception: pass return "" def _np_call_player(video_id: str, client: dict, visitor_data: str = "", proxies: dict = None) -> dict: """POST /player igual que getIosPlayerResponse/getAndroidPlayerResponse de NewPipe.""" import string as _str n = int(time.time()) chars = _str.digits + _str.ascii_lowercase t = "" while n: t = chars[n % 36] + t n //= 36 url = 
f"{_GAPIS_BASE}/player?prettyPrint=false&t={t or '0'}&id={video_id}" ctx = _np_build_ctx(client, visitor_data) payload = { "context": { "client": ctx, "request": {"internalExperimentFlags": [], "useSsl": True}, "user": {"lockedSafetyMode": False}, }, "videoId": video_id, "contentCheckOk": True, "racyCheckOk": True, } headers = { "User-Agent": client["userAgent"], "X-Goog-Api-Format-Version": "2", "Content-Type": "application/json", } try: r = requests.post(url, json=payload, headers=headers, timeout=15, proxies=proxies) if r.status_code == 200: return r.json() except Exception: pass return {} def innertube_get_stream(video_id: str, proxy: str = None) -> dict: """ Obtiene URL de stream replicando NewPipeExtractor + fallback tv_embedded. Orden de intentos: 1. iOS → hlsManifestUrl (prioritario para lives, trae videoDetails) 2. Android → formats directas + videoDetails 3. tv_embedded → sin PO Token, siempre trae videoDetails y hlsManifestUrl en lives Sin cookies | Sin firma JS | Sin bot-check desde servidores """ result = { "title": None, "description": None, "is_live": False, "hls_url": None, "formats": [], "error": None, } proxies = {"http": proxy, "https": proxy} if proxy else None vd_ios = _np_get_visitor_data(_NP_IOS, proxies) vd_android = _np_get_visitor_data(_NP_ANDROID, proxies) # ── iOS — preferido para hlsManifestUrl en lives ────────────────────────── ios = _np_call_player(video_id, _NP_IOS, vd_ios, proxies) ps = ios.get("playabilityStatus") or {} if ps.get("status") == "LOGIN_REQUIRED": result["error"] = f"Login requerido: {ps.get('reason','')}" return result vd_meta = ios.get("videoDetails") or {} result["title"] = vd_meta.get("title") or None result["description"] = vd_meta.get("shortDescription") or None result["is_live"] = bool(vd_meta.get("isLive") or vd_meta.get("isLiveContent")) ios_sd = ios.get("streamingData") or {} hls = ios_sd.get("hlsManifestUrl") if hls: result["hls_url"] = hls result["formats"] = [ {"itag": f.get("itag"), "mimeType": 
f.get("mimeType"), "quality": f.get("quality")} for f in (ios_sd.get("formats", []) + ios_sd.get("adaptiveFormats", []))[:8] ] # Intentar completar videoDetails si iOS no los trajo if not result["title"]: vd_android_resp = _np_call_player(video_id, _NP_ANDROID, vd_android, proxies) vd2 = vd_android_resp.get("videoDetails") or {} result["title"] = vd2.get("title") or result["title"] result["description"] = vd2.get("shortDescription") or result["description"] if not result["title"]: # último intento: tv_embedded tv = _np_call_player(video_id, _NP_TV_EMBEDDED, "", proxies) vd3 = tv.get("videoDetails") or {} result["title"] = vd3.get("title") or result["title"] result["description"] = vd3.get("shortDescription") or result["description"] return result # ── Android — para videos normales o si iOS no dio HLS ─────────────────── android = _np_call_player(video_id, _NP_ANDROID, vd_android, proxies) if not result["title"]: vd2 = android.get("videoDetails") or {} result["title"] = vd2.get("title") or None result["description"] = vd2.get("shortDescription") or None result["is_live"] = result["is_live"] or bool( vd2.get("isLive") or vd2.get("isLiveContent")) android_sd = android.get("streamingData") or {} hls = android_sd.get("hlsManifestUrl") if hls: result["hls_url"] = hls if not result["title"]: tv = _np_call_player(video_id, _NP_TV_EMBEDDED, "", proxies) vd3 = tv.get("videoDetails") or {} result["title"] = vd3.get("title") or result["title"] result["description"] = vd3.get("shortDescription") or result["description"] return result all_fmts = android_sd.get("formats", []) + android_sd.get("adaptiveFormats", []) best = sorted([f for f in all_fmts if f.get("url")], key=lambda x: x.get("bitrate", 0), reverse=True) if best: result["hls_url"] = best[0]["url"] result["formats"] = [ {"itag": f.get("itag"), "mimeType": f.get("mimeType"), "quality": f.get("quality")} for f in best[:8] ] if not result["title"]: tv = _np_call_player(video_id, _NP_TV_EMBEDDED, "", proxies) vd3 = 
tv.get("videoDetails") or {} result["title"] = vd3.get("title") or result["title"] result["description"] = vd3.get("shortDescription") or result["description"] return result # ── tv_embedded — sin PO Token, último recurso para streamingData ───────── tv = _np_call_player(video_id, _NP_TV_EMBEDDED, "", proxies) vd3 = tv.get("videoDetails") or {} if not result["title"]: result["title"] = vd3.get("title") or None result["description"] = vd3.get("shortDescription") or None result["is_live"] = result["is_live"] or bool( vd3.get("isLive") or vd3.get("isLiveContent")) tv_sd = tv.get("streamingData") or {} hls = tv_sd.get("hlsManifestUrl") if hls: result["hls_url"] = hls return result all_fmts_tv = tv_sd.get("formats", []) + tv_sd.get("adaptiveFormats", []) best_tv = sorted([f for f in all_fmts_tv if f.get("url")], key=lambda x: x.get("bitrate", 0), reverse=True) if best_tv: result["hls_url"] = best_tv[0]["url"] result["formats"] = [ {"itag": f.get("itag"), "mimeType": f.get("mimeType"), "quality": f.get("quality")} for f in best_tv[:8] ] return result result["error"] = ( "Innertube no devolvió streamingData (iOS + Android + tv_embedded). " "Puede ser DRM, región bloqueada, privado, o YouTube actualizó su API." ) return result def _fetch_metadata_ytdlp(video_id: str, proxy: str = None) -> dict: """Obtiene title, description, is_live usando yt-dlp. Prueba clientes en orden hasta obtener título: 1. tv_embedded — sin PO Token, devuelve videoDetails completo 2. ios — HLS nativo, suele traer title 3. mweb — fallback adicional 4. 
    --print title (rápido, último recurso)
    """
    url = f"https://www.youtube.com/watch?v={video_id}"
    # yt-dlp proxy flag, only when a proxy was supplied
    proxy_args = ["--proxy", proxy] if proxy else []
    # First pass: --dump-json per client, stop at the first client that yields a title
    for client in ("tv_embedded", "ios", "mweb"):
        cmd = [
            "yt-dlp", "--skip-download", "--dump-json", "--no-warnings",
            "--extractor-args", f"youtube:player_client={client}",
            url,
        ] + proxy_args
        try:
            res = subprocess.run(cmd, capture_output=True, text=True, timeout=25)
            if res.returncode == 0 and res.stdout.strip():
                d = json.loads(res.stdout.strip())
                title = d.get("title") or d.get("fulltitle")
                if title:
                    return {
                        "title": title,
                        "description": d.get("description") or None,
                        "is_live": bool(d.get("is_live") or d.get("was_live")),
                    }
        except Exception:
            # best-effort: any failure just moves on to the next client
            continue
    # Last resort: --print (very fast, title/is_live/description only)
    for client in ("tv_embedded", "ios", "mweb"):
        cmd = [
            "yt-dlp", "--skip-download", "--no-warnings",
            "--print", "%(title)s\n%(is_live)s\n%(description)s",
            "--extractor-args", f"youtube:player_client={client}",
            url,
        ] + proxy_args
        try:
            res = subprocess.run(cmd, capture_output=True, text=True, timeout=20)
            if res.returncode == 0 and res.stdout.strip():
                lines = res.stdout.strip().splitlines()
                title = lines[0].strip() if lines else None
                # yt-dlp prints the literal "NA"/"none" for missing fields
                if title and title.lower() not in ("none", "na", ""):
                    is_live = lines[1].strip().lower() in ("true", "1") if len(lines) > 1 else False
                    desc = "\n".join(lines[2:]).strip() if len(lines) > 2 else None
                    return {
                        "title": title,
                        "description": desc or None,
                        "is_live": is_live,
                    }
        except Exception:
            continue
    # Nothing worked: neutral metadata so callers can still respond
    return {"title": None, "description": None, "is_live": False}


def get_stream_url(video_id: str):
    """
    Obtiene la URL de transmisión m3u8/HLS.
    Devuelve: (stream_url, title, description, is_live, error)
    Estrategia:
    1. innertube_get_stream() — iOS + Android + tv_embedded, sin cookies
    2. Fallback yt-dlp con tv_embedded/ios/web
    3. 
    title/description siempre se completan con _fetch_metadata_ytdlp si faltan
    """
    video_id = extract_video_id(video_id)
    # NOTE(review): this reads API_PROXY directly instead of _get_proxy_choice()
    # used by the other endpoints — confirm whether rotation is wanted here too.
    proxy = os.getenv('API_PROXY', DEFAULT_PROXY) or None

    # ── 1. Direct Innertube (NewPipe-style) ──────────────────────────────────
    it = innertube_get_stream(video_id, proxy=proxy)
    title = it.get("title")
    description = it.get("description")
    is_live = it.get("is_live", False)
    if it.get("hls_url"):
        # Fill in metadata with yt-dlp only if Innertube did not provide it
        if not title:
            meta = _fetch_metadata_ytdlp(video_id, proxy=proxy)
            title = meta["title"] or title
            description = meta["description"] or description
            is_live = is_live or meta["is_live"]
        return it["hls_url"], title, description, is_live, None

    # ── 2. yt-dlp fallback ───────────────────────────────────────────────────
    cookie_mgr = CookieManager()
    cookiefile_path = cookie_mgr.get_cookiefile_path()
    cookies_path_env = os.getenv('API_COOKIES_PATH', DEFAULT_COOKIES_PATH)
    # Prefer the per-request cookiefile; otherwise the configured file, if present
    effective_cookie = cookiefile_path or (
        cookies_path_env if os.path.exists(cookies_path_env) else None)
    has_ck = bool(effective_cookie)
    yt_url = f"https://www.youtube.com/watch?v={video_id}"
    # Substrings of YouTube's "confirm you're not a bot" wall in yt-dlp stderr
    BOT_MARKERS = ("sign in to confirm", "not a bot", "sign in to")

    def _is_bot(s: str) -> bool:
        # True when stderr looks like YouTube's bot/sign-in challenge
        return any(m in s.lower() for m in BOT_MARKERS)

    def _build_args(client: str) -> list:
        args = ["--no-warnings", "--no-check-certificate", "--no-playlist",
                "--extractor-args", f"youtube:player_client={client}"]
        if client == "web":
            # NOTE(review): NODE_PATH must be defined elsewhere in this module
            args += ["--js-runtimes", f"node:{NODE_PATH}"]
        if effective_cookie and client == "web":
            args += ["--cookies", effective_cookie]
        if proxy:
            args += ["--proxy", proxy]
        return args

    def _ytdlp_url(fmt: str, client: str):
        # Returns (direct_url_or_None, bot_wall_detected)
        cmd = ["yt-dlp", "-g", "-f", fmt] + _build_args(client) + [yt_url]
        try:
            res = subprocess.run(cmd, capture_output=True, text=True,
                                 check=False, timeout=90)
            if res.returncode == 0 and res.stdout.strip():
                for line in res.stdout.strip().splitlines():
                    line = line.strip()
                    if line.startswith("http"):
                        return line, False
            return None, _is_bot(res.stderr or "")
        except Exception:
            return None, False

    # tv_embedded needs no PO Token; ios yields native HLS; web+cookies solves the n-challenge
    clients = ["tv_embedded", "ios"] + (["web"] if has_ck else [])
    # 91-96 are YouTube's live HLS itags; VOD prefers m3u8 then mp4
    fmts = (["91", "92", "93", "94", "95", "96", "best[protocol=m3u8_native]",
             "best[protocol=m3u8]", "best"]
            if is_live else
            ["best[ext=m3u8]", "best[protocol=m3u8_native]",
             "best[protocol=m3u8]", "best", "best[ext=mp4]"])
    got_bot = False
    try:
        for client in clients:
            for fmt in fmts:
                u, is_b = _ytdlp_url(fmt, client)
                if u:
                    # Fill in metadata if still missing
                    if not title:
                        meta = _fetch_metadata_ytdlp(video_id, proxy=proxy)
                        title = meta["title"] or title
                        description = meta["description"] or description
                        is_live = is_live or meta["is_live"]
                    return u, title, description, is_live, None
                if is_b:
                    got_bot = True
    finally:
        # Always release the per-request cookiefile, even on early return
        try:
            cookie_mgr.cleanup()
        except Exception:
            pass
    # Last metadata attempt even when no stream URL was found
    if not title:
        meta = _fetch_metadata_ytdlp(video_id, proxy=proxy)
        title = meta["title"] or title
        description = meta["description"] or description
    if got_bot:
        # Try the Playwright fallback; return its m3u8 on success, otherwise
        # surface its error (or the generic bot message) as the detail.
        try:
            pw_m3u8, pw_cookies, pw_err = _attempt_playwright_fallback(video_id)
            if pw_m3u8:
                # Playwright found the m3u8 — success
                return pw_m3u8, title, description, is_live, None
            # Playwright did not succeed: include its error in the response
            detail = pw_err or 'YouTube detectó actividad de bot. Sube cookies.txt con /upload_cookies.'
        except Exception as e:
            detail = f'YouTube detectó actividad de bot. Además, Playwright fallback falló: {str(e)[:200]}'
        return None, title, description, is_live, detail
    return None, title, description, is_live, (
        "YouTube detectó actividad de bot. "
        "Sube cookies.txt: curl -X POST http://localhost:8282/upload_cookies -F 'file=@cookies.txt'"
    )


@app.get("/debug/stream/{video_id}")
def debug_stream(video_id: str):
    """Diagnóstico completo del endpoint /stream: muestra qué devuelve cada cliente
    Innertube (iOS, Android, tv_embedded) y yt-dlp por separado.
    """
    video_id = extract_video_id(video_id)
    proxy = _get_proxy_choice()
    proxies = {"http": proxy, "https": proxy} if proxy else None

    def _call(client_dict, label):
        # Query one Innertube client and summarize its player response
        try:
            vd_data = _np_get_visitor_data(client_dict, proxies)
            resp = _np_call_player(video_id, client_dict, vd_data, proxies)
            ps = resp.get("playabilityStatus") or {}
            vd = resp.get("videoDetails") or {}
            sd = resp.get("streamingData") or {}
            return {
                "client": label,
                "status": ps.get("status"),
                "reason": ps.get("reason", ""),
                "title": vd.get("title"),
                "description_preview": str(vd.get("shortDescription", "") or "")[:120],
                "isLive": vd.get("isLive"),
                "isLiveContent": vd.get("isLiveContent"),
                "hlsManifestUrl": (sd.get("hlsManifestUrl") or "")[:100],
                "formats_count": len(sd.get("formats", [])),
                "adaptiveFormats_count": len(sd.get("adaptiveFormats", [])),
                "streamingData_keys": list(sd.keys()),
            }
        except Exception as e:
            return {"client": label, "error": str(e)}

    results = [
        _call(_NP_IOS, "iOS"),
        _call(_NP_ANDROID, "Android"),
        _call(_NP_TV_EMBEDDED, "tv_embedded"),
    ]
    # yt-dlp --dump-json with the tv_embedded client, for comparison
    ytdlp_meta = {}
    try:
        url = f"https://www.youtube.com/watch?v={video_id}"
        cmd = ["yt-dlp", "--skip-download", "--dump-json", "--no-warnings",
               "--extractor-args", "youtube:player_client=tv_embedded", url]
        if proxy:
            cmd.extend(["--proxy", proxy])
        res = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        if res.returncode == 0 and res.stdout.strip():
            d = json.loads(res.stdout.strip())
            ytdlp_meta = {
                "title": d.get("title"),
                "description_preview": str(d.get("description") or "")[:120],
                "is_live": d.get("is_live"),
                "was_live": d.get("was_live"),
            }
        else:
            ytdlp_meta = {"error": res.stderr[:500]}
    except Exception as 
e:
        ytdlp_meta = {"error": str(e)}
    return {
        "video_id": video_id,
        "innertube_clients": results,
        "ytdlp_tv_embedded": ytdlp_meta,
    }


@app.get("/transcript/{video_id}")
def transcript_endpoint(video_id: str, lang: str = "es"):
    # Transcript endpoint: segments + plain text + thumbnails for one video.
    data, thumbnails, error = get_transcript_data(video_id, lang)
    # Automatic fallback to 'en' when the requested language has no subtitles
    # (but not when the failure was an authentication problem)
    if (error and lang != "en" and "No se encontraron" in (error or "")
            and "autenticación" not in (error or "")):
        data_en, thumbnails_en, error_en = get_transcript_data(video_id, "en")
        if data_en and not error_en:
            data, thumbnails, error = data_en, thumbnails_en, None
    if error:
        raise HTTPException(status_code=400, detail=error)
    # Concatenate segment text so callers also get a plain-text transcript
    try:
        combined_text = "\n".join([seg.get('text', '') for seg in data if seg.get('text')])
    except Exception:
        combined_text = ""
    # format_text array: each segment as one clean-text element
    try:
        format_text_list = format_segments_text(data)
    except Exception:
        format_text_list = []
    format_text = format_text_list
    return {
        "video_id": video_id,
        "count": len(data),
        "segments": data,
        "text": combined_text,
        "format_text": format_text,
        "thumbnails": thumbnails
    }


@app.get('/transcript_vtt/{video_id}')
def transcript_vtt(video_id: str, lang: str = 'es'):
    """Descarga (con yt-dlp) y devuelve subtítulos en VTT, además de segmentos parseados y texto concatenado."""
    vtt_text, error = fetch_vtt_subtitles(video_id, lang)
    if error:
        raise HTTPException(status_code=400, detail=error)
    # Parse the VTT into segments with parse_subtitle_format
    segments = parse_subtitle_format(vtt_text, 'vtt') if vtt_text else []
    combined_text = '\n'.join([s.get('text','') for s in segments])
    # format_text: clean text ready for downstream agent processing
    format_text = format_segments_text(segments)
    thumbnails = get_video_thumbnails(video_id)
    return {
        'video_id': video_id,
        'vtt': vtt_text,
        'count': len(segments),
        'segments': segments,
        'text': 
combined_text,
        'format_text': format_text,
        'thumbnails': thumbnails
    }


@app.get("/stream/{video_id}")
def stream_endpoint(video_id: str):
    """
    Obtiene la URL de transmisión (m3u8/HLS) de un video/live de YouTube.
    - Para lives en vivo (🔴): devuelve URL HLS directa usable con FFmpeg/VLC.
    - Para videos normales: devuelve la mejor URL de video disponible.
    Ejemplo FFmpeg:
    ffmpeg -re -i "URL_M3U8" -c copy -f flv rtmp://destino/stream_key
    """
    stream_url, title, description, is_live, error = get_stream_url(video_id)
    if error:
        raise HTTPException(status_code=400, detail=error)
    thumbnails = get_video_thumbnails(video_id)
    # Classify the URL so clients know whether they got HLS or a direct file
    url_type = "m3u8/hls" if stream_url and "m3u8" in stream_url.lower() else "direct/mp4"
    return {
        "video_id": video_id,
        "title": title,
        "description": description,
        "is_live": is_live,
        "stream_url": stream_url,
        "url_type": url_type,
        "youtube_url": f"https://www.youtube.com/watch?v={video_id}",
        "ffmpeg_example": f'ffmpeg -re -i "{stream_url}" -c copy -f flv rtmp://destino/stream_key',
        "thumbnails": thumbnails,
        "usage": {
            "description": "Usa stream_url con FFmpeg para retransmitir",
            "command_template": "ffmpeg -re -i \"{stream_url}\" -c copy -f flv {rtmp_url}/{stream_key}",
            "platforms": {
                "youtube": "rtmp://a.rtmp.youtube.com/live2/YOUR_STREAM_KEY",
                "facebook": "rtmps://live-api-s.facebook.com:443/rtmp/YOUR_STREAM_KEY",
                "twitch": "rtmp://live.twitch.tv/app/YOUR_STREAM_KEY",
                "twitter": "rtmps://fa.contribute.live-video.net/app/YOUR_STREAM_KEY"
            }
        }
    }


@app.post('/upload_cookies')
async def upload_cookies(file: UploadFile = File(...)):
    """Endpoint para subir cookies.txt y guardarlo en el servidor en /app/cookies.txt"""
    try:
        content = await file.read()
        if not content:
            raise HTTPException(status_code=400, detail='Archivo vacío')
        # Resolve the target path from the environment variable
        target = os.getenv('API_COOKIES_PATH', DEFAULT_COOKIES_PATH)
        target_dir = os.path.dirname(target) or '.'
        
# Crear directorio si no existe try: os.makedirs(target_dir, exist_ok=True) except Exception: # Si no se puede crear el directorio, intentamos escribir en el working dir como fallback target = os.path.basename(target) # Guardar con permisos de escritura with open(target, 'wb') as fh: fh.write(content) return {"detail": "cookies.txt guardado correctamente", "path": os.path.abspath(target)} except Exception as e: raise HTTPException(status_code=500, detail=f'Error al guardar cookies: {str(e)[:200]}') # ── Rutas conocidas de perfiles de navegador en Linux/Mac/Windows ──────────── _BROWSER_PROFILES = { "chrome": [ # Linux os.path.expanduser("~/.config/google-chrome/Default"), os.path.expanduser("~/.config/google-chrome/Profile 1"), # Montaje desde docker-compose (host path mapeado) "/host-chrome/Default", "/host-chrome", # macOS os.path.expanduser("~/Library/Application Support/Google/Chrome/Default"), ], "chromium": [ os.path.expanduser("~/.config/chromium/Default"), "/host-chromium/Default", "/host-chromium", os.path.expanduser("~/Library/Application Support/Chromium/Default"), ], "brave": [ os.path.expanduser("~/.config/BraveSoftware/Brave-Browser/Default"), "/host-brave/Default", "/host-brave", os.path.expanduser("~/Library/Application Support/BraveSoftware/Brave-Browser/Default"), ], "firefox": [ # Firefox usa --cookies-from-browser firefox directamente, yt-dlp detecta el perfil os.path.expanduser("~/.mozilla/firefox"), "/host-firefox", ], "edge": [ os.path.expanduser("~/.config/microsoft-edge/Default"), "/host-edge/Default", ], } def _find_browser_profile(browser: str) -> str | None: """Devuelve la primera ruta de perfil existente para el navegador dado.""" for path in _BROWSER_PROFILES.get(browser, []): if os.path.exists(path): return path return None def _extract_cookies_from_browser(browser: str, profile_path: str | None, target: str, proxy: str | None = None) -> dict: """ Usa yt-dlp --cookies-from-browser para extraer cookies de YouTube del perfil del 
navegador indicado y guardarlas en target (Netscape format).
    """
    cmd = [
        "yt-dlp",
        "--cookies-from-browser", browser if not profile_path else f"{browser}:{profile_path}",
        "--cookies", target,  # export to a Netscape-format file
        "--skip-download", "--no-warnings",
        "--extractor-args", "youtube:player_client=tv_embedded",
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # short video to force the extraction
    ]
    if proxy:
        cmd.extend(["--proxy", proxy])
    try:
        res = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
        stderr = res.stderr or ""
        stdout = res.stdout or ""
        # Verify the file was created and is not (near) empty
        if os.path.exists(target) and os.path.getsize(target) > 100:
            # Count youtube.com cookie lines for the report
            yt_cookies = 0
            with open(target, "r", errors="ignore") as fh:
                for line in fh:
                    if ".youtube.com" in line or "youtube.com" in line:
                        yt_cookies += 1
            return {
                "success": True,
                "browser": browser,
                "profile_path": profile_path,
                "cookies_file": target,
                "youtube_cookie_lines": yt_cookies,
                "stderr_preview": stderr[:300] if stderr else "",
            }
        else:
            return {
                "success": False,
                "browser": browser,
                "error": "No se generó el archivo de cookies o está vacío",
                "stderr": stderr[:500],
                "stdout": stdout[:200],
                "returncode": res.returncode,
            }
    except subprocess.TimeoutExpired:
        return {"success": False, "browser": browser, "error": "Timeout al extraer cookies (60s)"}
    except FileNotFoundError:
        return {"success": False, "browser": browser, "error": "yt-dlp no encontrado"}
    except Exception as e:
        return {"success": False, "browser": browser, "error": str(e)[:200]}


@app.post("/extract_chrome_cookies")
def extract_chrome_cookies(browser: str = "chrome", profile_path: str = ""):
    """
    Extrae cookies de YouTube directamente desde el perfil del navegador
    instalado en el HOST (montado como volumen) y las guarda en /app/data/cookies.txt.
    
    Parámetros:
    - browser: chrome | chromium | brave | firefox | edge (default: chrome)
    - profile_path: ruta manual al perfil (opcional, se auto-detecta si está vacío)
    Requisito en docker-compose.yml (ya incluido):
    volumes: - ~/.config/google-chrome:/host-chrome:ro
    Ejemplo:
    curl -X POST "http://localhost:8282/extract_chrome_cookies?browser=chrome"
    curl -X POST "http://localhost:8282/extract_chrome_cookies?browser=brave"
    """
    proxy = _get_proxy_choice()
    target = os.getenv('API_COOKIES_PATH', DEFAULT_COOKIES_PATH)
    # Ensure the destination directory exists
    target_dir = os.path.dirname(target) or "."
    os.makedirs(target_dir, exist_ok=True)
    browser = browser.lower().strip()
    valid_browsers = list(_BROWSER_PROFILES.keys())
    if browser not in valid_browsers:
        raise HTTPException(
            status_code=400,
            detail=f"Navegador '{browser}' no soportado. Usa: {', '.join(valid_browsers)}"
        )
    # Auto-detect the profile when none was given
    resolved_profile = profile_path.strip() or _find_browser_profile(browser)
    if not resolved_profile and browser != "firefox":
        # For Firefox yt-dlp detects the profile itself; the others need the path
        available = {b: _find_browser_profile(b) for b in valid_browsers}
        found = {b: p for b, p in available.items() if p}
        raise HTTPException(
            status_code=404,
            detail=(
                f"No se encontró el perfil de '{browser}' en las rutas conocidas. "
                f"Agrega el volumen en docker-compose.yml o pasa profile_path manualmente. "
                f"Perfiles encontrados: {found if found else 'ninguno'}"
            )
        )
    result = _extract_cookies_from_browser(browser, resolved_profile, target, proxy)
    if not result["success"]:
        raise HTTPException(status_code=500, detail=result)
    return {
        "detail": f"Cookies extraídas de {browser} y guardadas en {target}",
        **result,
        "next_step": "Los endpoints /transcript y /stream usarán estas cookies automáticamente.",
    }


@app.get("/cookies/status")
def cookies_status():
    """Muestra el estado actual de las cookies configuradas y qué navegadores están disponibles."""
    target = os.getenv('API_COOKIES_PATH', DEFAULT_COOKIES_PATH)
    proxy = _get_proxy_choice()
    # State of the currently configured cookies file
    cookies_info = {"path": target, "exists": False, "size_bytes": 0, "youtube_lines": 0}
    if os.path.exists(target):
        cookies_info["exists"] = True
        cookies_info["size_bytes"] = os.path.getsize(target)
        yt_lines = 0
        try:
            with open(target, "r", errors="ignore") as fh:
                for line in fh:
                    # Count non-comment youtube.com cookie lines
                    if "youtube.com" in line and not line.startswith("#"):
                        yt_lines += 1
        except Exception:
            pass
        cookies_info["youtube_lines"] = yt_lines
    # Detect which browser profiles are reachable (container / mounted host)
    available_browsers = {}
    for browser in _BROWSER_PROFILES:
        path = _find_browser_profile(browser)
        available_browsers[browser] = {
            "found": bool(path),
            "profile_path": path,
        }
    return {
        "cookies_file": cookies_info,
        "available_browsers": available_browsers,
        "extract_endpoint": "POST /extract_chrome_cookies?browser=chrome",
        "upload_endpoint": "POST /upload_cookies",
        "proxy": proxy or "no configurado",
        "note": (
            "Para usar cookies de Chrome del host, agrega en docker-compose.yml: "
            "volumes: - ~/.config/google-chrome:/host-chrome:ro"
        ),
    }


# NOTE(review): no @app.get decorator is visible here even though the docstring
# calls it an endpoint — confirm whether it should be routed or is internal-only.
def debug_metadata(video_id: str):
    """Endpoint de depuración: obtiene --dump-json de yt-dlp para un video.
    Devuelve la metadata (automatic_captions, subtitles, requested_subtitles) para inspección.
    
    """
    # Try to use a dynamic cookiefile per request
    cookie_mgr = CookieManager()
    cookiefile_path = cookie_mgr.get_cookiefile_path()
    cookies_path = cookiefile_path or os.getenv('API_COOKIES_PATH', DEFAULT_COOKIES_PATH)
    proxy = _get_proxy_choice()
    url = f"https://www.youtube.com/watch?v={video_id}"
    cmd = [
        "yt-dlp", "--skip-download", "--dump-json", "--no-warnings", url
    ] + _yt_client_args(os.path.exists(cookies_path))
    if os.path.exists(cookies_path):
        cmd.extend(["--cookies", cookies_path])
    if proxy:
        cmd.extend(['--proxy', proxy])
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    except FileNotFoundError:
        # yt-dlp binary missing — clean up the temp cookiefile before raising
        try:
            cookie_mgr.cleanup()
        except Exception:
            pass
        raise HTTPException(status_code=500, detail="yt-dlp no está instalado en el contenedor/entorno.")
    except subprocess.TimeoutExpired:
        try:
            cookie_mgr.cleanup()
        except Exception:
            pass
        raise HTTPException(status_code=504, detail="yt-dlp demoró demasiado en responder.")
    except Exception as e:
        try:
            cookie_mgr.cleanup()
        except Exception:
            pass
        raise HTTPException(status_code=500, detail=str(e)[:300])
    if proc.returncode != 0:
        stderr = proc.stderr or ''
        try:
            cookie_mgr.cleanup()
        except Exception:
            pass
        raise HTTPException(status_code=500, detail=f"yt-dlp error: {stderr[:1000]}")
    try:
        metadata = json.loads(proc.stdout)
    except Exception:
        try:
            cookie_mgr.cleanup()
        except Exception:
            pass
        raise HTTPException(status_code=500, detail="No se pudo parsear la salida JSON de yt-dlp.")
    try:
        cookie_mgr.cleanup()
    except Exception:
        pass
    # Return only the parts useful for debugging
    debug_info = {
        'id': metadata.get('id'),
        'title': metadata.get('title'),
        'uploader': metadata.get('uploader'),
        'is_live': metadata.get('is_live'),
        'automatic_captions': metadata.get('automatic_captions'),
        'subtitles': metadata.get('subtitles'),
        'requested_subtitles': metadata.get('requested_subtitles'),
        'formats_sample': metadata.get('formats')[:5] if metadata.get('formats') else None,
    }
    return debug_info
@app.get('/debug/fetch_subs/{video_id}')
def debug_fetch_subs(video_id: str, lang: str = 'es'):
    """Intenta descargar subtítulos con yt-dlp dentro del entorno y devuelve el log y el contenido (parcial) si existe. Usa cookies definidas en API_COOKIES_PATH. """
    cookie_mgr = CookieManager()
    cookiefile_path = cookie_mgr.get_cookiefile_path()
    out_dir = tempfile.mkdtemp(prefix='subs_')
    out_template = os.path.join(out_dir, '%(id)s.%(ext)s')
    url = f"https://www.youtube.com/watch?v={video_id}"
    cmd = [
        'yt-dlp', '--verbose', '--skip-download',
        '--write-auto-sub', '--write-sub',
        '--sub-lang', lang,
        '--sub-format', 'json3/vtt/srv3/best',
        '--output', out_template, url
    ] + _yt_subs_args(bool(cookiefile_path))
    if cookiefile_path:
        cmd.extend(['--cookies', cookiefile_path])
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=240)
    except FileNotFoundError:
        try:
            cookie_mgr.cleanup()
        except Exception:
            pass
        raise HTTPException(status_code=500, detail='yt-dlp no está instalado en el contenedor.')
    except subprocess.TimeoutExpired:
        try:
            cookie_mgr.cleanup()
        except Exception:
            pass
        raise HTTPException(status_code=504, detail='La ejecución de yt-dlp demoró demasiado.')
    except Exception as e:
        try:
            cookie_mgr.cleanup()
        except Exception:
            pass
        raise HTTPException(status_code=500, detail=str(e)[:300])
    stdout = proc.stdout or ''
    stderr = proc.stderr or ''
    rc = proc.returncode
    # Look for generated files (yt-dlp uses a double extension: ID.lang.vtt)
    generated = []
    for f in glob.glob(os.path.join(out_dir, f"{video_id}*")):
        size = None
        try:
            size = os.path.getsize(f)
            # Take only the first 200 lines to avoid returning huge files;
            # next(fh) raises StopIteration on shorter files (handled below)
            with open(f, 'r', encoding='utf-8', errors='ignore') as fh:
                sample = ''.join([next(fh) for _ in range(200)]) if size > 0 else ''
            generated.append({ 'path': f, 'size': size, 'sample': sample })
        except StopIteration:
            # Fewer than 200 lines: re-read the whole file as the sample
            try:
                with open(f, 'r', encoding='utf-8', errors='ignore') as fh:
                    sample = fh.read()
            except Exception:
                sample = None
            if size is None:
                try:
                    size = os.path.getsize(f)
                except Exception:
                    size = 0
            generated.append({'path': f, 'size': size, 'sample': sample})
        except Exception:
            # Any other read error: report the file without a sample
            if size is None:
                try:
                    size = os.path.getsize(f)
                except Exception:
                    size = 0
            generated.append({'path': f, 'size': size, 'sample': None})
    try:
        cookie_mgr.cleanup()
    except Exception:
        pass
    return {
        'video_id': video_id,
        'rc': rc,
        'stdout_tail': stdout[-2000:],
        'stderr_tail': stderr[-2000:],
        'generated': generated,
        'out_dir': out_dir
    }


# Nuevo helper para descargar VTT directamente y retornarlo como texto
def fetch_vtt_subtitles(video_id: str, lang: str = 'es'):
    """Descarga subtítulos en formato VTT usando yt-dlp y devuelve su contenido. Retorna (vtt_text, None) en caso de éxito o (None, error_message) en caso de error. """
    url = f"https://www.youtube.com/watch?v={video_id}"
    cookie_mgr = CookieManager()
    cookiefile_path = cookie_mgr.get_cookiefile_path()
    with tempfile.TemporaryDirectory() as tmpdir:
        out_template = os.path.join(tmpdir, '%(id)s.%(ext)s')
        cmd = [
            'yt-dlp', '--skip-download',
            '--write-auto-sub', '--write-sub',
            '--sub-lang', lang,
            '--sub-format', 'vtt',
            '--output', out_template, url
        ] + _yt_subs_args(bool(cookiefile_path))
        if cookiefile_path:
            cmd.extend(['--cookies', cookiefile_path])
        try:
            proc = subprocess.run(cmd, capture_output=True, text=True, timeout=180)
        except FileNotFoundError:
            try:
                cookie_mgr.cleanup()
            except Exception:
                pass
            return None, 'yt-dlp no está instalado en el entorno.'
        except subprocess.TimeoutExpired:
            try:
                cookie_mgr.cleanup()
            except Exception:
                pass
            return None, 'La descarga de subtítulos tardó demasiado.'
        except Exception as e:
            try:
                cookie_mgr.cleanup()
            except Exception:
                pass
            return None, f'Error ejecutando yt-dlp: {str(e)[:200]}'
        stderr = (proc.stderr or '').lower()
        if proc.returncode != 0:
            try:
                cookie_mgr.cleanup()
            except Exception:
                pass
            # Map well-known HTTP failures to actionable messages
            if 'http error 429' in stderr or 'too many requests' in stderr:
                return None, 'YouTube está limitando las peticiones al descargar subtítulos (HTTP 429). 
Revisa cookies.txt o prueba desde otra IP.'
            if 'http error 403' in stderr or 'forbidden' in stderr:
                return None, 'Acceso denegado al descargar subtítulos (HTTP 403). Usa cookies.txt con una cuenta autorizada.'
            return None, f'yt-dlp error: {proc.stderr[:1000]}'
        # Look for generated files (double extension: ID.lang.vtt)
        files = glob.glob(os.path.join(tmpdir, f"{video_id}*"))
        files = [f for f in files if os.path.isfile(f) and any(f.endswith(e) for e in ('.vtt', '.json3', '.srv3', '.srt', '.ttml'))]
        if not files:
            try:
                cookie_mgr.cleanup()
            except Exception:
                pass
            return None, 'No se generaron archivos de subtítulos.'
        # Prefer a .vtt file when one exists
        vtt_path = None
        for f in files:
            if f.lower().endswith('.vtt'):
                vtt_path = f
                break
        if not vtt_path:
            vtt_path = files[0]
        try:
            with open(vtt_path, 'r', encoding='utf-8', errors='ignore') as fh:
                content = fh.read()
            try:
                cookie_mgr.cleanup()
            except Exception:
                pass
            return content, None
        except Exception as e:
            try:
                cookie_mgr.cleanup()
            except Exception:
                pass
            return None, f'Error leyendo archivo de subtítulos: {str(e)[:200]}'


@app.post('/upload_vtt/{video_id}')
async def upload_vtt(video_id: str, file: UploadFile = File(...)):
    """Permite subir un archivo VTT para un video y devuelve segmentos parseados y texto.
    Guarda el archivo en /app/data/{video_id}.vtt (sobrescribe si existe).
    
""" try: content = await file.read() if not content: raise HTTPException(status_code=400, detail='Archivo vacío') target_dir = os.path.join(os.getcwd(), 'data') os.makedirs(target_dir, exist_ok=True) target_path = os.path.join(target_dir, f"{video_id}.vtt") with open(target_path, 'wb') as fh: fh.write(content) # Leer como texto para parsear text = content.decode('utf-8', errors='ignore') segments = parse_subtitle_format(text, 'vtt') if text else [] combined_text = '\n'.join([s.get('text','') for s in segments]) format_text = format_segments_text(segments) return { 'video_id': video_id, 'path': target_path, 'count': len(segments), 'segments': segments, 'text': combined_text, 'format_text': format_text } except Exception as e: raise HTTPException(status_code=500, detail=f'Error al guardar/parsear VTT: {str(e)[:200]}') @app.get('/transcript_alt/{video_id}') def transcript_alt(video_id: str, lang: str = 'es'): """Intento alternativo de obtener transcript usando youtube-transcript-api (si está disponible). Retorna segmentos en el mismo formato que get_transcript_data para mantener consistencia. 
""" if not YOUTUBE_TRANSCRIPT_API_AVAILABLE: raise HTTPException(status_code=501, detail='youtube-transcript-api no está instalado en el entorno.') vid = extract_video_id(video_id) if not vid: raise HTTPException(status_code=400, detail='video_id inválido') # preparar idiomas a probar langs = [lang] if len(lang) == 2: langs.append(f"{lang}-419") try: # get_transcript puede lanzar excepciones si no hay transcript # Usar cast para silenciar el analizador estático que no infiere la comprobación previa transcript_api = cast(Any, YouTubeTranscriptApi) transcript_list = transcript_api.get_transcript(vid, languages=langs) except NoTranscriptFound: raise HTTPException(status_code=404, detail='No se encontró transcript con youtube-transcript-api') except TranscriptsDisabled: raise HTTPException(status_code=403, detail='Los transcripts están deshabilitados para este video') except Exception as e: raise HTTPException(status_code=500, detail=f'Error youtube-transcript-api: {str(e)[:300]}') # transcript_list tiene items con keys: text, start, duration segments = [] for item in transcript_list: segments.append({ 'start': float(item.get('start', 0)), 'duration': float(item.get('duration', 0)), 'text': item.get('text', '').strip() }) combined_text = '\n'.join([s['text'] for s in segments if s.get('text')]) format_text = format_segments_text(segments) thumbnails = get_video_thumbnails(vid) return { 'video_id': vid, 'count': len(segments), 'segments': segments, 'text': combined_text, 'format_text': format_text, 'source': 'youtube-transcript-api' } @app.get('/playwright/stream/{video_id}') def playwright_stream(video_id: str, profile: str = '', headless: bool = True, timeout: int = 60): """Usa Playwright (script tools/playwright_extract_m3u8.py) para abrir el video en un navegador real (o con perfil persistente) y extraer las URLs m3u8 y cookies. Parámetros: - profile: ruta al user-data-dir de Chrome (opcional). 
    Si el contenedor tiene el perfil montado en /host-chrome, pásalo como `/host-chrome/Default`.
    - headless: true/false para ejecutar sin UI.
    - timeout: segundos máximos a esperar por la ejecución del script.
    Uso (ejemplo):
    curl 'http://localhost:8282/playwright/stream/cmqVmX2UVBM?headless=false&profile=/host-chrome'
    Nota: el script genera `./data/cookies.txt` si logra extraer cookies.
    """
    vid = extract_video_id(video_id)
    if not vid:
        raise HTTPException(status_code=400, detail='video_id inválido')
    script = os.path.join(os.getcwd(), 'tools', 'playwright_extract_m3u8.py')
    if not os.path.exists(script):
        raise HTTPException(status_code=500, detail='Script Playwright no encontrado en tools/playwright_extract_m3u8.py')
    cmd = ['python3', script, '--video', f'https://www.youtube.com/watch?v={vid}', '--timeout', str(timeout)]
    if headless:
        cmd.append('--headless')
    # profile can be provided via env PLAYWRIGHT_PROFILE or param
    profile_path = profile or os.getenv('PLAYWRIGHT_PROFILE', '')
    if profile_path:
        cmd.extend(['--profile', profile_path])
    try:
        # Give the child a small grace period beyond its own --timeout
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout + 10)
    except subprocess.TimeoutExpired:
        raise HTTPException(status_code=504, detail='Playwright timed out')
    except Exception as e:
        raise HTTPException(status_code=500, detail=f'Error ejecutando Playwright: {str(e)[:300]}')
    if proc.returncode != 0:
        # Include stderr for diagnostics
        detail = (proc.stderr or proc.stdout or 'Error desconocido')[:2000]
        return JSONResponse(status_code=500, content={"error": "Playwright error", "detail": detail})
    try:
        out = json.loads(proc.stdout or '{}')
    except Exception:
        return JSONResponse(status_code=500, content={"error": "No se pudo parsear salida Playwright", "raw": proc.stdout[:2000]})
    return out


def _attempt_playwright_fallback(video_id: str, headless: bool = True, profile: str | None = None, timeout: int = 60):
    """Ejecuta el script Playwright para intentar extraer m3u8 y cookies.
    Retorna (m3u8_url or None, cookies_saved_path or None, error_message or None)
    """
    script = os.path.join(os.getcwd(), 'tools', 'playwright_extract_m3u8.py')
    if not os.path.exists(script):
        return None, None, 'Playwright extractor script no disponible'
    cmd = ['python3', script, '--video', f'https://www.youtube.com/watch?v={video_id}', '--timeout', str(timeout)]
    if headless:
        cmd.append('--headless')
    # profile can be provided via env PLAYWRIGHT_PROFILE or param
    profile_path = profile or os.getenv('PLAYWRIGHT_PROFILE', '')
    if profile_path:
        cmd.extend(['--profile', profile_path])
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout + 10)
    except subprocess.TimeoutExpired:
        return None, None, 'Playwright timed out'
    except Exception as e:
        return None, None, f'Error ejecutando Playwright: {str(e)[:200]}'
    if proc.returncode != 0:
        # Include stderr for diagnostics
        detail = (proc.stderr or proc.stdout or 'Error desconocido')[:2000]
        return None, None, f'Playwright error: {detail}'
    try:
        data = json.loads(proc.stdout or '{}')
    except Exception:
        return None, None, 'No se pudo parsear la salida de Playwright'
    urls = data.get('m3u8_urls') or []
    cookies_file = data.get('cookies_file')
    if not urls:
        return None, cookies_file, 'No se encontró m3u8 via Playwright'
    # Take the first candidate URL
    m3u8 = urls[0]
    # If Playwright returned cookies, copy them to API_COOKIES_PATH so the rest
    # of the system picks them up automatically
    if cookies_file and os.path.exists(cookies_file):
        target = os.getenv('API_COOKIES_PATH', DEFAULT_COOKIES_PATH)
        try:
            target_dir = os.path.dirname(target) or '.'
            os.makedirs(target_dir, exist_ok=True)
            # Copy the content
            with open(cookies_file, 'rb') as src, open(target, 'wb') as dst:
                dst.write(src.read())
            return m3u8, target, None
        except Exception as e:
            return m3u8, None, f'm3u8 encontrado pero no se pudo guardar cookies: {str(e)[:200]}'
    return m3u8, None, None