From 2923510c5132a60a3f3b63d67fd3cc625a85cade Mon Sep 17 00:00:00 2001 From: Cesar Mendivil Date: Tue, 17 Mar 2026 00:29:51 -0700 Subject: [PATCH] Add Playwright tools for extracting M3U8 URLs and proxy management - Introduced `playwright_extract_m3u8.py` to extract M3U8 URLs from YouTube videos using Playwright. - Added `README_PLAYWRIGHT.md` for usage instructions and requirements. - Created `expand_and_test_proxies.py` to expand user-provided proxies and test their validity. - Implemented `generate_proxy_whitelist.py` to generate a whitelist of working proxies based on testing results. - Added sample proxy files: `user_proxies.txt` for user-defined proxies and `proxies_sample.txt` as a template. - Generated `expanded_proxies.txt`, `whitelist.json`, and `whitelist.txt` for storing expanded and valid proxies. - Included error handling and logging for proxy testing results. --- Dockerfile.api | 11 +- docker-compose.yml | 16 +- docker-rebuild.sh | 178 +++----- export-chrome-cookies.sh | 141 ++++++ main.py | 703 +++++++++++++++++++++++++++--- tools/README_PLAYWRIGHT.md | 27 ++ tools/expand_and_test_proxies.py | 113 +++++ tools/expanded_proxies.txt | 30 ++ tools/generate_proxy_whitelist.py | 242 ++++++++++ tools/playwright_extract_m3u8.py | 177 ++++++++ tools/proxies_sample.txt | 0 tools/user_proxies.txt | 10 + tools/whitelist.json | 256 +++++++++++ tools/whitelist.txt | 0 14 files changed, 1728 insertions(+), 176 deletions(-) create mode 100755 export-chrome-cookies.sh create mode 100644 tools/README_PLAYWRIGHT.md create mode 100644 tools/expand_and_test_proxies.py create mode 100644 tools/expanded_proxies.txt create mode 100644 tools/generate_proxy_whitelist.py create mode 100755 tools/playwright_extract_m3u8.py create mode 100644 tools/proxies_sample.txt create mode 100644 tools/user_proxies.txt create mode 100644 tools/whitelist.json create mode 100644 tools/whitelist.txt diff --git a/Dockerfile.api b/Dockerfile.api index 7cc5a5f..c6c2f02 100644 --- 
a/Dockerfile.api +++ b/Dockerfile.api @@ -3,8 +3,8 @@ FROM python:3.11-slim ENV PYTHONUNBUFFERED=1 -# Instalar ffmpeg, Node.js (LTS via NodeSource) y herramientas necesarias -# Node.js + yt-dlp-utils son requeridos para resolver el n-challenge y signature de YouTube +# Instalar ffmpeg, Node.js 20 LTS y herramientas necesarias +# Node.js es requerido por yt-dlp --js-runtimes para resolver n-challenge/signature de YouTube RUN apt-get update \ && apt-get install -y --no-install-recommends \ ffmpeg \ @@ -13,8 +13,7 @@ RUN apt-get update \ gnupg \ && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \ && apt-get install -y --no-install-recommends nodejs \ - && rm -rf /var/lib/apt/lists/* \ - && npm install -g yt-dlp-utils 2>/dev/null || true + && rm -rf /var/lib/apt/lists/* WORKDIR /app @@ -22,7 +21,7 @@ WORKDIR /app COPY requirements.txt /app/requirements.txt RUN pip install --no-cache-dir -r /app/requirements.txt -# Instalar yt-dlp desde la última versión del binario oficial (no pip) para tener siempre la más reciente +# Instalar yt-dlp desde el binario oficial más reciente (no pip) para siempre tener la última versión RUN curl -L https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o /usr/local/bin/yt-dlp \ && chmod a+rx /usr/local/bin/yt-dlp @@ -42,5 +41,5 @@ USER appuser EXPOSE 8000 -# Comando por defecto para ejecutar la API +# Comando para ejecutar la API CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/docker-compose.yml b/docker-compose.yml index a691032..6d4b0b4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,23 +1,31 @@ services: - # Servicio FastAPI - Backend API tubescript-api: build: context: . dockerfile: Dockerfile.api args: - # Invalida solo la capa COPY . /app para que siempre tome el código más reciente - # sin necesidad de --no-cache (que descarga todo desde cero) + # Invalida la capa COPY . 
/app sin necesidad de --no-cache completo CACHEBUST: "${CACHEBUST:-1}" image: tubescript-api:latest container_name: tubescript_api ports: - "8282:8000" volumes: + # Datos persistentes: cookies.txt, config, etc. - ./data:/app/data:rw + # ── Perfiles de navegador del HOST (read-only) ────────────────────────── + # yt-dlp puede leer cookies directamente del navegador con + # POST /extract_chrome_cookies?browser=chrome + # Descomenta el navegador que tengas instalado en el host: + - ${HOME}/.config/google-chrome:/host-chrome:ro + # - ${HOME}/.config/chromium:/host-chromium:ro + # - ${HOME}/.config/BraveSoftware/Brave-Browser:/host-brave:ro + # - ${HOME}/.mozilla/firefox:/host-firefox:ro + # - ${HOME}/.config/microsoft-edge:/host-edge:ro environment: - PYTHONUNBUFFERED=1 - API_COOKIES_PATH=/app/data/cookies.txt - # Optional: set API_PROXY when you want the container to use a SOCKS/HTTP proxy + # Proxy opcional: socks5h://127.0.0.1:9050 - API_PROXY=${API_PROXY:-} restart: unless-stopped networks: diff --git a/docker-rebuild.sh b/docker-rebuild.sh index 011de5e..da3e63c 100755 --- a/docker-rebuild.sh +++ b/docker-rebuild.sh @@ -1,125 +1,83 @@ #!/bin/bash - -# Script para reconstruir las imágenes Docker de TubeScript - +# Script para reconstruir TubeScript-API desde cero set -e +GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; NC='\033[0m' +ok() { echo -e "${GREEN}✅ $1${NC}"; } +warn() { echo -e "${YELLOW}⚠️ $1${NC}"; } +err() { echo -e "${RED}❌ $1${NC}"; } + echo "════════════════════════════════════════════════════════════" -echo " 🔨 TubeScript-API - Rebuild de Docker" +echo " 🔨 TubeScript-API — Rebuild completo" echo "════════════════════════════════════════════════════════════" echo "" -# Colores -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -RED='\033[0;31m' -NC='\033[0m' - -print_success() { - echo -e "${GREEN}✅ $1${NC}" -} - -print_warning() { - echo -e "${YELLOW}⚠️ $1${NC}" -} - -print_error() { - echo -e "${RED}❌ $1${NC}" -} - -# Verificar Docker -echo "🔍 
Verificando Docker..." -if ! command -v docker &> /dev/null; then - print_error "Docker no está instalado" +# ── Verificar Docker (plugin compose, no docker-compose legacy) ────────────── +if ! docker compose version &>/dev/null; then + err "docker compose no está disponible. Instala Docker Desktop o el plugin compose." exit 1 fi - -if ! command -v docker-compose &> /dev/null; then - print_error "Docker Compose no está instalado" - exit 1 -fi - -print_success "Docker encontrado" +ok "Docker compose disponible: $(docker compose version --short 2>/dev/null || echo 'ok')" echo "" -# Asegurar carpeta data para montajes de configuración -echo "📁 Asegurando carpeta './data' para montaje de configuración..." -if [ ! -d "./data" ]; then - mkdir -p ./data - chmod 755 ./data || true - print_success "Carpeta ./data creada" +# ── Carpeta data ────────────────────────────────────────────────────────────── +mkdir -p ./data +chmod 777 ./data 2>/dev/null || true +ok "Carpeta ./data lista (permisos 777)" +echo " Coloca cookies.txt en ./data/cookies.txt para autenticación" +echo "" + +# ── Detener contenedores existentes ────────────────────────────────────────── +echo "🛑 Deteniendo contenedores..." +docker compose down --remove-orphans 2>/dev/null || true +ok "Contenedores detenidos" +echo "" + +# ── Eliminar imagen anterior para forzar build limpio ───────────────────────── +echo "🧹 Eliminando imagen anterior (tubescript-api:latest)..." +docker rmi tubescript-api:latest 2>/dev/null && ok "Imagen anterior eliminada" || warn "No había imagen previa" +echo "" + +# ── Build sin caché ─────────────────────────────────────────────────────────── +echo "🔨 Construyendo imagen desde cero (--no-cache)..." +echo " Esto puede tardar 3-5 minutos la primera vez..." +echo "" +CACHEBUST=$(date +%s) docker compose build --no-cache + +ok "Imagen construida exitosamente" +echo "" + +# ── Iniciar servicios ───────────────────────────────────────────────────────── +echo "🚀 Iniciando servicios..." 
+docker compose up -d + +ok "Servicios iniciados" +echo "" + +# ── Esperar y mostrar estado ────────────────────────────────────────────────── +echo "⏳ Esperando que la API arranque (15s)..." +sleep 15 + +echo "" +echo "📊 Estado de contenedores:" +docker compose ps +echo "" + +# ── Health check ────────────────────────────────────────────────────────────── +echo "🩺 Verificando API..." +if curl -sf http://localhost:8282/docs -o /dev/null; then + ok "API respondiendo en http://localhost:8282" else - print_success "Carpeta ./data ya existe" -fi -echo "Nota: coloca aquí archivos persistentes como stream_config.json, streams_state.json y cookies.txt (ej: ./data/cookies.txt)" -echo "" - -# Detener contenedores -echo "🛑 Deteniendo contenedores existentes..." -docker compose down 2>/dev/null || true -print_success "Contenedores detenidos" -echo "" - -# Limpiar imágenes antiguas (opcional) -echo "🧹 ¿Deseas eliminar las imágenes antiguas? (s/N)" -read -p "> " clean_images -if [ "$clean_images" = "s" ] || [ "$clean_images" = "S" ]; then - echo "Eliminando imágenes antiguas..." - docker compose down --rmi all 2>/dev/null || true - print_success "Imágenes antiguas eliminadas" -fi -echo "" - -# Reconstruir con CACHEBUST para invalidar solo la capa COPY . /app -# CACHEBUST=$(date +%s) se exporta para que docker-compose.yml lo tome via ${CACHEBUST:-1} -echo "🔨 Reconstruyendo imagen con código actualizado..." -echo "Usando CACHEBUST=$(date +%s) para forzar copia fresca del código..." -echo "" - -export CACHEBUST="$(date +%s)" -docker compose build - -if [ $? -eq 0 ]; then - print_success "Imagen reconstruida exitosamente" -else - print_error "Error al reconstruir imagen" - exit 1 -fi -echo "" - -# Preguntar si desea iniciar -echo "🚀 ¿Deseas iniciar los servicios ahora? (S/n)" -read -p "> " start_services -if [ "$start_services" != "n" ] && [ "$start_services" != "N" ]; then - echo "" - echo "🚀 Iniciando servicios..." - docker compose up -d - - if [ $? 
-eq 0 ]; then - print_success "Servicios iniciados" - echo "" - echo "📊 Estado de los servicios:" - sleep 3 - docker compose ps - echo "" - echo "════════════════════════════════════════════════════════════" - print_success "¡Rebuild completado!" - echo "════════════════════════════════════════════════════════════" - echo "" - echo "🌐 Servicios disponibles:" - echo " API: http://localhost:8282" - echo " Docs API: http://localhost:8282/docs" - echo "" - else - print_error "Error al iniciar servicios" - exit 1 - fi -else - echo "" - print_success "Rebuild completado (servicios no iniciados)" - echo "" - echo "Para iniciar los servicios:" - echo " CACHEBUST=\$(date +%s) docker compose up -d --build" + warn "API aún no responde (puede necesitar más tiempo). Revisa: docker compose logs -f" fi +echo "" echo "════════════════════════════════════════════════════════════" +ok "¡Rebuild completado!" +echo "════════════════════════════════════════════════════════════" +echo "" +echo " API: http://localhost:8282" +echo " Docs: http://localhost:8282/docs" +echo " Logs: docker compose logs -f" +echo " Cookies: curl -X POST http://localhost:8282/upload_cookies -F 'file=@cookies.txt'" +echo "" diff --git a/export-chrome-cookies.sh b/export-chrome-cookies.sh new file mode 100755 index 0000000..4bdf73a --- /dev/null +++ b/export-chrome-cookies.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# ───────────────────────────────────────────────────────────────────────────── +# export-chrome-cookies.sh +# Exporta cookies de YouTube desde el perfil del navegador del HOST +# usando yt-dlp, y las copia a ./data/cookies.txt para que la API las use. 
+# +# Uso: +# ./export-chrome-cookies.sh # Chrome (default) +# ./export-chrome-cookies.sh chromium # Chromium +# ./export-chrome-cookies.sh brave # Brave +# ./export-chrome-cookies.sh firefox # Firefox +# ./export-chrome-cookies.sh edge # Edge +# +# IMPORTANTE: +# - Cierra el navegador antes de ejecutar (Chrome bloquea el archivo de cookies) +# - En Linux no requiere contraseña ni keychain especial +# ───────────────────────────────────────────────────────────────────────────── + +set -e + +BROWSER="${1:-chrome}" +OUTPUT="./data/cookies.txt" +GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; NC='\033[0m' + +ok() { echo -e "${GREEN}✅ $1${NC}"; } +warn() { echo -e "${YELLOW}⚠️ $1${NC}"; } +err() { echo -e "${RED}❌ $1${NC}"; exit 1; } + +echo "" +echo "🍪 Exportando cookies de YouTube desde: $BROWSER" +echo "" + +# Verificar yt-dlp +if ! command -v yt-dlp &>/dev/null; then + err "yt-dlp no está instalado. Instala con: pip install yt-dlp" +fi + +# Verificar que el navegador no esté corriendo (puede causar errores de bloqueo) +BROWSER_PROC="" +case "$BROWSER" in + chrome) BROWSER_PROC="google-chrome\|chrome" ;; + chromium) BROWSER_PROC="chromium" ;; + brave) BROWSER_PROC="brave" ;; + firefox) BROWSER_PROC="firefox" ;; + edge) BROWSER_PROC="msedge\|microsoft-edge" ;; +esac + +if [ -n "$BROWSER_PROC" ] && pgrep -f "$BROWSER_PROC" &>/dev/null; then + warn "El navegador '$BROWSER' parece estar corriendo." + warn "Ciérralo antes de exportar para evitar errores de bloqueo del DB." + echo "" + read -p "¿Continuar de todos modos? 
(s/N): " confirm + [[ "$confirm" =~ ^[sS]$ ]] || { echo "Cancelado."; exit 0; } + echo "" +fi + +# Crear directorio de destino +mkdir -p "$(dirname "$OUTPUT")" + +# Detectar ruta del perfil +PROFILE_PATH="" +case "$BROWSER" in + chrome) + for p in "$HOME/.config/google-chrome/Default" "$HOME/.config/google-chrome/Profile 1"; do + [ -d "$p" ] && PROFILE_PATH="$p" && break + done + ;; + chromium) + PROFILE_PATH="$HOME/.config/chromium/Default" + ;; + brave) + PROFILE_PATH="$HOME/.config/BraveSoftware/Brave-Browser/Default" + ;; + firefox) + # Firefox: yt-dlp detecta el perfil automáticamente + PROFILE_PATH="" + ;; + edge) + PROFILE_PATH="$HOME/.config/microsoft-edge/Default" + ;; + *) + err "Navegador '$BROWSER' no soportado. Usa: chrome, chromium, brave, firefox, edge" + ;; +esac + +if [ -n "$PROFILE_PATH" ] && [ ! -d "$PROFILE_PATH" ]; then + err "No se encontró el perfil de $BROWSER en: $PROFILE_PATH" +fi + +# Construir argumento --cookies-from-browser +if [ -n "$PROFILE_PATH" ]; then + BROWSER_ARG="${BROWSER}:${PROFILE_PATH}" + echo " Perfil: $PROFILE_PATH" +else + BROWSER_ARG="$BROWSER" + echo " Perfil: detectado automáticamente" +fi +echo " Destino: $OUTPUT" +echo "" + +# Exportar cookies con yt-dlp +echo "⏳ Extrayendo cookies..." +yt-dlp \ + --cookies-from-browser "$BROWSER_ARG" \ + --cookies "$OUTPUT" \ + --skip-download \ + --no-warnings \ + --extractor-args "youtube:player_client=tv_embedded" \ + "https://www.youtube.com/watch?v=dQw4w9WgXcQ" 2>&1 || { + err "Error al extraer cookies. Asegúrate de que el navegador está cerrado y tienes sesión en YouTube." + } + +# Verificar resultado +if [ ! -f "$OUTPUT" ] || [ ! -s "$OUTPUT" ]; then + err "No se generó el archivo de cookies o está vacío." 
+fi + +YT_LINES=$(grep -c "youtube.com" "$OUTPUT" 2>/dev/null || true) +FILE_SIZE=$(du -h "$OUTPUT" | cut -f1) + +echo "" +ok "Cookies exportadas exitosamente" +echo " Archivo: $OUTPUT" +echo " Tamaño: $FILE_SIZE" +echo " Líneas youtube.com: $YT_LINES" +echo "" + +if [ "${YT_LINES:-0}" -lt 3 ]; then + warn "Pocas cookies de YouTube encontradas ($YT_LINES)." + warn "Verifica que estás logueado en YouTube en $BROWSER." +else + ok "Cookies de YouTube encontradas: $YT_LINES líneas" +fi + +echo "" +echo "📋 Próximos pasos:" +echo " 1. Si el contenedor está corriendo, las cookies ya están disponibles en /app/data/" +echo " 2. Si no está corriendo: docker compose up -d" +echo " 3. Prueba: curl http://localhost:8282/cookies/status" +echo "" + diff --git a/main.py b/main.py index bea8a5e..432f7b1 100644 --- a/main.py +++ b/main.py @@ -6,8 +6,10 @@ import time import re import tempfile import glob +import random from fastapi import FastAPI, HTTPException, UploadFile, File from typing import List, Dict, Any, cast +from fastapi.responses import JSONResponse # Intentar importar youtube_transcript_api como fallback try: @@ -34,6 +36,62 @@ DEFAULT_COOKIES_PATH = './data/cookies.txt' # Proxy opcional para requests/yt-dlp (ej. socks5h://127.0.0.1:9050) DEFAULT_PROXY = os.getenv('API_PROXY', '') +# Nuevo: rotador/simple selector de proxies +# - Si se define API_PROXY se usa directamente. +# - Si se define API_PROXIES (lista separada por comas) se elige uno al azar. +# Ej: API_PROXIES="socks5h://127.0.0.1:9050,http://10.0.0.1:3128" + +# Nuevo: ruta por defecto del archivo whitelist +PROXY_WHITELIST_FILE = os.getenv('PROXY_WHITELIST_FILE', 'tools/whitelist.txt') +_proxy_whitelist_cache = { 'ts': 0, 'proxies': [] } + + +def _load_whitelist_file(path: str, ttl: int = 30): + """Carga proxies desde archivo path con TTL en segundos para cache. + Retorna lista de proxies (puede ser vacía). 
+ """ + now = time.time() + if _proxy_whitelist_cache.get('path') == path and (now - _proxy_whitelist_cache['ts'] < ttl): + return _proxy_whitelist_cache['proxies'] + proxies = [] + try: + if os.path.exists(path): + with open(path, 'r', encoding='utf-8') as fh: + for line in fh: + p = line.strip() + if p and not p.startswith('#'): + proxies.append(p) + except Exception: + proxies = [] + # Cache is keyed by path; empty results are cached too so an empty/missing file is not re-read on every call + _proxy_whitelist_cache.update(path=path, proxies=proxies, ts=now) + return proxies + + +def _get_proxy_choice() -> str | None: + """Devuelve una URL de proxy elegida: + - Prioridad: API_PROXY (single) -> API_PROXIES (comma list) -> PROXY_WHITELIST_FILE -> None + """ + # 1) Legacy single proxy has priority + single = os.getenv('API_PROXY', '') or DEFAULT_PROXY or '' + if single: + return single + + # 2) comma-separated list from env + lst = os.getenv('API_PROXIES', '') or '' + if lst: + proxies = [p.strip() for p in lst.split(',') if p.strip()] + if proxies: + return random.choice(proxies) + + # 3) whitelist file + wl_file = os.getenv('PROXY_WHITELIST_FILE', PROXY_WHITELIST_FILE) + proxies = _load_whitelist_file(wl_file) + if proxies: + return random.choice(proxies) + + return None + def clean_youtube_json(raw_json: Dict) -> List[Dict]: """ Transforma el formato complejo 'json3' de YouTube a un formato @@ -186,18 +244,21 @@ NODE_PATH = "/usr/bin/node" def _yt_client_args(has_cookies: bool, for_stream: bool = False) -> list: """Devuelve --extractor-args y --js-runtimes para metadata/streams. - Estrategia (basada en pruebas reales 2026-03-05): - - Sin cookies → android (sin n-challenge, sin Node.js) - - Con cookies → web + Node.js (web acepta cookies; Node resuelve n-challenge/signature) - - for_stream → android (mejor compatibilidad HLS en lives) + Estrategia actualizada 2026-03-07: + - android → REQUIERE GVS PO Token desde 2026 → formatos HTTPS omitidos → HTTP 403. + YA NO SE USA para metadata ni streams. 
+ - Sin cookies → tv_embedded (sin PO Token, sin n-challenge, funciona para metadata) + - Con cookies → web + Node.js (Node resuelve n-challenge/signature) + - for_stream → tv_embedded (más fiable para HLS/lives sin cookies) Diagnóstico: - - mweb con cookies → requiere GVS PO Token (no disponible) - - android con cookies → yt-dlp lo salta (no soporta cookies) - - web con cookies + --js-runtimes node → ✅ funciona + - android → requiere GVS PO Token (2026) → NO usar + - mweb → requiere Visitor Data PO Token → NO usar sin cookies + - tv_embedded → sin PO Token requerido → ✅ funciona para metadata/stream + - web + Node.js → ✅ funciona con cookies """ if for_stream or not has_cookies: - return ["--extractor-args", "youtube:player_client=android"] + return ["--extractor-args", "youtube:player_client=tv_embedded"] else: return [ "--extractor-args", "youtube:player_client=web", @@ -208,13 +269,18 @@ def _yt_client_args(has_cookies: bool, for_stream: bool = False) -> list: def _yt_subs_args(has_cookies: bool) -> list: """Devuelve --extractor-args para descarga de subtítulos. - Para subtítulos siempre usamos android: - - android sin cookies → ✅ funciona, obtiene auto-subs sin n-challenge - - android con cookies → yt-dlp lo salta pero descarga igual sin cookies - - web con cookies → falla en sub-langs no exactos (ej: en vs en-US) - Resultado: android es siempre el cliente más fiable para subtítulos. + Estrategia actualizada 2026-03-07: + - android → requiere GVS PO Token desde 2026 → subtítulos HTTP 403 → NO usar. + - tv_embedded → sin PO Token, obtiene auto-subs sin bot-check → ✅ preferido. + - mweb → fallback útil si tv_embedded no trae subs en ciertos idiomas. + - web + Node → sólo con cookies (resuelve n-challenge). 
""" - return ["--extractor-args", "youtube:player_client=android"] + if has_cookies: + return [ + "--extractor-args", "youtube:player_client=web", + "--js-runtimes", f"node:{NODE_PATH}", + ] + return ["--extractor-args", "youtube:player_client=tv_embedded,mweb"] @@ -245,7 +311,7 @@ def get_transcript_data(video_id: str, lang: str = "es"): # cookies_path: prefer the temporary cookiefile if present, otherwise fall back to env path cookies_path = cookiefile_path or os.getenv('API_COOKIES_PATH', DEFAULT_COOKIES_PATH) # proxy support - proxy = os.getenv('API_PROXY', DEFAULT_PROXY) or None + proxy = _get_proxy_choice() proxies = {'http': proxy, 'https': proxy} if proxy else None def load_cookies_from_file(path: str) -> dict: @@ -276,6 +342,7 @@ def get_transcript_data(video_id: str, lang: str = "es"): return cookies cookies_for_requests = load_cookies_from_file(cookies_path) if cookies_path else {} + _has_ck_subs = bool(cookies_path and os.path.exists(cookies_path)) # Intento rápido y fiable: usar yt-dlp para descargar subtítulos (auto o manual) al tmpdir try: @@ -290,8 +357,8 @@ def get_transcript_data(video_id: str, lang: str = "es"): elif len(lang) == 2: sub_langs = [lang, f"{lang}-{lang.upper()}", f"{lang}-419", f"{lang}-en"] - # siempre android para subtítulos — NO pasar --cookies porque android no las soporta - # (yt-dlp salta el cliente android si recibe cookies → no descarga nada) + # tv_embedded/mweb para subtítulos sin cookies (no requieren PO Token) + # web + Node.js cuando hay cookies (resuelve n-challenge) ytdlp_cmd = [ "yt-dlp", url, @@ -302,8 +369,10 @@ def get_transcript_data(video_id: str, lang: str = "es"): "-o", os.path.join(tmpdl, "%(id)s.%(ext)s"), "--no-warnings", "--sub-lang", ",".join(sub_langs), - ] + _yt_subs_args(False) - # NO se pasan cookies con android (android no las soporta en yt-dlp) + ] + _yt_subs_args(_has_ck_subs) + # Pasar cookies solo cuando se usa cliente web (con cookies) + if _has_ck_subs: + ytdlp_cmd.extend(["--cookies", 
cookies_path]) # attach proxy if configured if proxy: @@ -610,8 +679,10 @@ def get_transcript_data(video_id: str, lang: str = "es"): "--sub-format", "json3/vtt/srv3/best", "-o", os.path.join(tmpdir, "%(id)s.%(ext)s"), "--no-warnings", - ] + _yt_subs_args(False) - # NO cookies con android (android no las soporta, yt-dlp lo saltaría) + ] + _yt_subs_args(_has_ck_subs) + # Pasar cookies sólo con cliente web + if _has_ck_subs: + cmd.extend(["--cookies", cookies_path]) # añadir proxy a la llamada de yt-dlp si está configurado if proxy: @@ -654,7 +725,7 @@ def get_transcript_data(video_id: str, lang: str = "es"): "Prueba: ?lang=en | /debug/fetch_subs/{video_id} | sube cookies con /upload_cookies" ) -# ── Clientes exactos de NewPipeExtractor (ClientsConstants.java dev 2026-03-05) ── +# ── Clientes Innertube (sincronizados con NewPipeExtractor + yt-dlp 2026-03) ── _NP_IOS = { "clientName": "IOS", "clientVersion": "21.03.2", "clientScreen": "WATCH", "platform": "MOBILE", @@ -668,7 +739,16 @@ _NP_ANDROID = { "osName": "Android", "osVersion": "16", "androidSdkVersion": 36, "userAgent": "com.google.android.youtube/21.03.36 (Linux; U; Android 16) gzip", } -# GAPIS: youtubei.googleapis.com — NewPipe lo usa para iOS y Android (YoutubeStreamHelper.java) +# tv_embedded: NO requiere PO Token, siempre devuelve videoDetails + hlsManifestUrl en lives +# Es el cliente más fiable para obtener title/description sin autenticación. 
+_NP_TV_EMBEDDED = { + "clientName": "TVHTML5_SIMPLY_EMBEDDED_PLAYER", + "clientVersion": "2.0", + "clientScreen": "EMBED", + "platform": "TV", + "userAgent": "Mozilla/5.0 (SMART-TV; LINUX; Tizen 6.0) AppleWebKit/538.1 (KHTML, like Gecko) Version/6.0 TV Safari/538.1", +} +# GAPIS: youtubei.googleapis.com — usado por NewPipe para iOS/Android/TV _GAPIS_BASE = "https://youtubei.googleapis.com/youtubei/v1" @@ -754,12 +834,14 @@ def _np_call_player(video_id: str, client: dict, def innertube_get_stream(video_id: str, proxy: str = None) -> dict: """ - Obtiene URL de stream replicando exactamente NewPipeExtractor: - 1. visitorData via /visitor_id (para ambos clientes) - 2. iOS /player → iosStreamingData.hlsManifestUrl (prioritario para lives) - 3. Android /player → formats directas (videos normales) + Obtiene URL de stream replicando NewPipeExtractor + fallback tv_embedded. - Sin cookies | Sin firma JS | Sin PO Token | Sin bot-check desde servidores + Orden de intentos: + 1. iOS → hlsManifestUrl (prioritario para lives, trae videoDetails) + 2. Android → formats directas + videoDetails + 3. 
tv_embedded → sin PO Token, siempre trae videoDetails y hlsManifestUrl en lives + + Sin cookies | Sin firma JS | Sin bot-check desde servidores """ result = { "title": None, "description": None, @@ -771,7 +853,7 @@ def innertube_get_stream(video_id: str, proxy: str = None) -> dict: vd_ios = _np_get_visitor_data(_NP_IOS, proxies) vd_android = _np_get_visitor_data(_NP_ANDROID, proxies) - # iOS — preferido para hlsManifestUrl en lives (como hace NewPipe) + # ── iOS — preferido para hlsManifestUrl en lives ────────────────────────── ios = _np_call_player(video_id, _NP_IOS, vd_ios, proxies) ps = ios.get("playabilityStatus") or {} if ps.get("status") == "LOGIN_REQUIRED": @@ -779,8 +861,8 @@ def innertube_get_stream(video_id: str, proxy: str = None) -> dict: return result vd_meta = ios.get("videoDetails") or {} - result["title"] = vd_meta.get("title") - result["description"] = vd_meta.get("shortDescription") + result["title"] = vd_meta.get("title") or None + result["description"] = vd_meta.get("shortDescription") or None result["is_live"] = bool(vd_meta.get("isLive") or vd_meta.get("isLiveContent")) ios_sd = ios.get("streamingData") or {} @@ -791,20 +873,38 @@ def innertube_get_stream(video_id: str, proxy: str = None) -> dict: {"itag": f.get("itag"), "mimeType": f.get("mimeType"), "quality": f.get("quality")} for f in (ios_sd.get("formats", []) + ios_sd.get("adaptiveFormats", []))[:8] ] + # Intentar completar videoDetails si iOS no los trajo + if not result["title"]: + vd_android_resp = _np_call_player(video_id, _NP_ANDROID, vd_android, proxies) + vd2 = vd_android_resp.get("videoDetails") or {} + result["title"] = vd2.get("title") or result["title"] + result["description"] = vd2.get("shortDescription") or result["description"] + if not result["title"]: + # último intento: tv_embedded + tv = _np_call_player(video_id, _NP_TV_EMBEDDED, "", proxies) + vd3 = tv.get("videoDetails") or {} + result["title"] = vd3.get("title") or result["title"] + result["description"] = 
vd3.get("shortDescription") or result["description"] return result - # Android — para videos normales o si iOS no dio HLS + # ── Android — para videos normales o si iOS no dio HLS ─────────────────── android = _np_call_player(video_id, _NP_ANDROID, vd_android, proxies) if not result["title"]: vd2 = android.get("videoDetails") or {} - result["title"] = vd2.get("title") - result["description"] = vd2.get("shortDescription") - result["is_live"] = bool(vd2.get("isLive") or vd2.get("isLiveContent")) + result["title"] = vd2.get("title") or None + result["description"] = vd2.get("shortDescription") or None + result["is_live"] = result["is_live"] or bool( + vd2.get("isLive") or vd2.get("isLiveContent")) android_sd = android.get("streamingData") or {} hls = android_sd.get("hlsManifestUrl") if hls: result["hls_url"] = hls + if not result["title"]: + tv = _np_call_player(video_id, _NP_TV_EMBEDDED, "", proxies) + vd3 = tv.get("videoDetails") or {} + result["title"] = vd3.get("title") or result["title"] + result["description"] = vd3.get("shortDescription") or result["description"] return result all_fmts = android_sd.get("formats", []) + android_sd.get("adaptiveFormats", []) @@ -816,37 +916,136 @@ def innertube_get_stream(video_id: str, proxy: str = None) -> dict: {"itag": f.get("itag"), "mimeType": f.get("mimeType"), "quality": f.get("quality")} for f in best[:8] ] + if not result["title"]: + tv = _np_call_player(video_id, _NP_TV_EMBEDDED, "", proxies) + vd3 = tv.get("videoDetails") or {} + result["title"] = vd3.get("title") or result["title"] + result["description"] = vd3.get("shortDescription") or result["description"] + return result + + # ── tv_embedded — sin PO Token, último recurso para streamingData ───────── + tv = _np_call_player(video_id, _NP_TV_EMBEDDED, "", proxies) + vd3 = tv.get("videoDetails") or {} + if not result["title"]: + result["title"] = vd3.get("title") or None + result["description"] = vd3.get("shortDescription") or None + result["is_live"] = 
result["is_live"] or bool( + vd3.get("isLive") or vd3.get("isLiveContent")) + + tv_sd = tv.get("streamingData") or {} + hls = tv_sd.get("hlsManifestUrl") + if hls: + result["hls_url"] = hls + return result + + all_fmts_tv = tv_sd.get("formats", []) + tv_sd.get("adaptiveFormats", []) + best_tv = sorted([f for f in all_fmts_tv if f.get("url")], + key=lambda x: x.get("bitrate", 0), reverse=True) + if best_tv: + result["hls_url"] = best_tv[0]["url"] + result["formats"] = [ + {"itag": f.get("itag"), "mimeType": f.get("mimeType"), "quality": f.get("quality")} + for f in best_tv[:8] + ] return result result["error"] = ( - "Innertube no devolvió streamingData. " + "Innertube no devolvió streamingData (iOS + Android + tv_embedded). " "Puede ser DRM, región bloqueada, privado, o YouTube actualizó su API." ) return result +def _fetch_metadata_ytdlp(video_id: str, proxy: str = None) -> dict: + """Obtiene title, description, is_live usando yt-dlp. + + Prueba clientes en orden hasta obtener título: + 1. tv_embedded — sin PO Token, devuelve videoDetails completo + 2. ios — HLS nativo, suele traer title + 3. mweb — fallback adicional + 4. 
--print title (rápido, último recurso) + """ + url = f"https://www.youtube.com/watch?v={video_id}" + proxy_args = ["--proxy", proxy] if proxy else [] + + # Intentar con --dump-json para cada cliente + for client in ("tv_embedded", "ios", "mweb"): + cmd = [ + "yt-dlp", "--skip-download", "--dump-json", "--no-warnings", + "--extractor-args", f"youtube:player_client={client}", + url, + ] + proxy_args + try: + res = subprocess.run(cmd, capture_output=True, text=True, timeout=25) + if res.returncode == 0 and res.stdout.strip(): + d = json.loads(res.stdout.strip()) + title = d.get("title") or d.get("fulltitle") + if title: + return { + "title": title, + "description": d.get("description") or None, + "is_live": bool(d.get("is_live") or d.get("was_live")), + } + except Exception: + continue + + # Último recurso: --print title (muy rápido, sólo el título) + for client in ("tv_embedded", "ios", "mweb"): + cmd = [ + "yt-dlp", "--skip-download", "--no-warnings", + "--print", "%(title)s\n%(is_live)s\n%(description)s", + "--extractor-args", f"youtube:player_client={client}", + url, + ] + proxy_args + try: + res = subprocess.run(cmd, capture_output=True, text=True, timeout=20) + if res.returncode == 0 and res.stdout.strip(): + lines = res.stdout.strip().splitlines() + title = lines[0].strip() if lines else None + if title and title.lower() not in ("none", "na", ""): + is_live = lines[1].strip().lower() in ("true", "1") if len(lines) > 1 else False + desc = "\n".join(lines[2:]).strip() if len(lines) > 2 else None + return { + "title": title, + "description": desc or None, + "is_live": is_live, + } + except Exception: + continue + + return {"title": None, "description": None, "is_live": False} + + + def get_stream_url(video_id: str): """ Obtiene la URL de transmisión m3u8/HLS. Devuelve: (stream_url, title, description, is_live, error) Estrategia: - 1. innertube_get_stream() — técnica NewPipe, sin cookies, sin bot-check - 2. Fallback yt-dlp si Innertube falla + 1. 
innertube_get_stream() — iOS + Android + tv_embedded, sin cookies + 2. Fallback yt-dlp con tv_embedded/ios/web + 3. title/description siempre se completan con _fetch_metadata_ytdlp si faltan """ video_id = extract_video_id(video_id) - proxy = os.getenv('API_PROXY', DEFAULT_PROXY) or None + proxy = _get_proxy_choice() # ── 1. Innertube directo (NewPipe) ──────────────────────────────────────── it = innertube_get_stream(video_id, proxy=proxy) - if it.get("hls_url"): - return (it["hls_url"], it.get("title"), it.get("description"), - it.get("is_live", False), None) title = it.get("title") description = it.get("description") is_live = it.get("is_live", False) + if it.get("hls_url"): + # Completar metadatos con yt-dlp si Innertube no los trajo + if not title: + meta = _fetch_metadata_ytdlp(video_id, proxy=proxy) + title = meta["title"] or title + description = meta["description"] or description + is_live = is_live or meta["is_live"] + return it["hls_url"], title, description, is_live, None + # ── 2. Fallback yt-dlp ──────────────────────────────────────────────────── cookie_mgr = CookieManager() cookiefile_path = cookie_mgr.get_cookiefile_path() @@ -884,7 +1083,8 @@ def get_stream_url(video_id: str): except Exception: return None, False - clients = ["android", "ios"] + (["web"] if has_ck else []) + # tv_embedded no requiere PO Token; ios da HLS nativo; web+cookies resuelve n-challenge + clients = ["tv_embedded", "ios"] + (["web"] if has_ck else []) fmts = (["91", "92", "93", "94", "95", "96", "best[protocol=m3u8_native]", "best[protocol=m3u8]", "best"] if is_live else @@ -896,6 +1096,12 @@ def get_stream_url(video_id: str): for fmt in fmts: u, is_b = _ytdlp_url(fmt, client) if u: + # Completar metadatos si todavía faltan + if not title: + meta = _fetch_metadata_ytdlp(video_id, proxy=proxy) + title = meta["title"] or title + description = meta["description"] or description + is_live = is_live or meta["is_live"] return u, title, description, is_live, None if is_b: got_bot = True @@ -905,18 +1111,96 @@ 
def get_stream_url(video_id: str): except Exception: pass + # Último intento de metadatos aunque no haya stream + if not title: + meta = _fetch_metadata_ytdlp(video_id, proxy=proxy) + title = meta["title"] or title + description = meta["description"] or description + if got_bot: - return None, title, description, is_live, ( - "YouTube detectó actividad de bot. " - "Sube cookies.txt: curl -X POST http://localhost:8282/upload_cookies -F 'file=@cookies.txt'" - ) + # Intentar fallback con Playwright usando _attempt_playwright_fallback y devolver m3u8/cookies si encuentra; si falla, devolver mensaje anterior con detalle. + try: + pw_m3u8, pw_cookies, pw_err = _attempt_playwright_fallback(video_id) + if pw_m3u8: + # si Playwright encontró el m3u8, retornar exitoso + return pw_m3u8, title, description, is_live, None + # si Playwright no tuvo éxito, incluir su error en la respuesta + detail = pw_err or 'YouTube detectó actividad de bot. Sube cookies.txt con /upload_cookies.' + except Exception as e: + detail = f'YouTube detectó actividad de bot. Además, Playwright fallback falló: {str(e)[:200]}' + return None, title, description, is_live, detail return None, title, description, is_live, ( - it.get("error") or - "No se pudo obtener la URL del stream. " - "Si es un live, verifica que esté EN VIVO (🔴) ahora mismo." + it.get("error") or "No se pudo obtener la URL del stream. " + "Si es un live, verifica que esté EN VIVO (🔴) ahora mismo." ) -# ...existing code (old get_stream_url body — reemplazado arriba) — ELIMINAR... + +@app.get("/debug/stream/{video_id}") +def debug_stream(video_id: str): + """Diagnóstico completo del endpoint /stream: muestra qué devuelve cada cliente + Innertube (iOS, Android, tv_embedded) y yt-dlp por separado. 
+ """ + video_id = extract_video_id(video_id) + proxy = _get_proxy_choice() + proxies = {"http": proxy, "https": proxy} if proxy else None + + def _call(client_dict, label): + try: + vd_data = _np_get_visitor_data(client_dict, proxies) + resp = _np_call_player(video_id, client_dict, vd_data, proxies) + ps = resp.get("playabilityStatus") or {} + vd = resp.get("videoDetails") or {} + sd = resp.get("streamingData") or {} + return { + "client": label, + "status": ps.get("status"), + "reason": ps.get("reason", ""), + "title": vd.get("title"), + "description_preview": str(vd.get("shortDescription", "") or "")[:120], + "isLive": vd.get("isLive"), + "isLiveContent": vd.get("isLiveContent"), + "hlsManifestUrl": (sd.get("hlsManifestUrl") or "")[:100], + "formats_count": len(sd.get("formats", [])), + "adaptiveFormats_count": len(sd.get("adaptiveFormats", [])), + "streamingData_keys": list(sd.keys()), + } + except Exception as e: + return {"client": label, "error": str(e)} + + results = [ + _call(_NP_IOS, "iOS"), + _call(_NP_ANDROID, "Android"), + _call(_NP_TV_EMBEDDED, "tv_embedded"), + ] + + # yt-dlp dump-json con tv_embedded + ytdlp_meta = {} + try: + url = f"https://www.youtube.com/watch?v={video_id}" + cmd = ["yt-dlp", "--skip-download", "--dump-json", "--no-warnings", + "--extractor-args", "youtube:player_client=tv_embedded", url] + if proxy: + cmd.extend(["--proxy", proxy]) + res = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + if res.returncode == 0 and res.stdout.strip(): + d = json.loads(res.stdout.strip()) + ytdlp_meta = { + "title": d.get("title"), + "description_preview": str(d.get("description") or "")[:120], + "is_live": d.get("is_live"), + "was_live": d.get("was_live"), + } + else: + ytdlp_meta = {"error": res.stderr[:500]} + except Exception as e: + ytdlp_meta = {"error": str(e)} + + return { + "video_id": video_id, + "innertube_clients": results, + "ytdlp_tv_embedded": ytdlp_meta, + } + @app.get("/transcript/{video_id}") def 
transcript_endpoint(video_id: str, lang: str = "es"): @@ -1048,7 +1332,211 @@ async def upload_cookies(file: UploadFile = File(...)): except Exception as e: raise HTTPException(status_code=500, detail=f'Error al guardar cookies: {str(e)[:200]}') -@app.get("/debug/metadata/{video_id}") + +# ── Rutas conocidas de perfiles de navegador en Linux/Mac/Windows ──────────── +_BROWSER_PROFILES = { + "chrome": [ + # Linux + os.path.expanduser("~/.config/google-chrome/Default"), + os.path.expanduser("~/.config/google-chrome/Profile 1"), + # Montaje desde docker-compose (host path mapeado) + "/host-chrome/Default", + "/host-chrome", + # macOS + os.path.expanduser("~/Library/Application Support/Google/Chrome/Default"), + ], + "chromium": [ + os.path.expanduser("~/.config/chromium/Default"), + "/host-chromium/Default", + "/host-chromium", + os.path.expanduser("~/Library/Application Support/Chromium/Default"), + ], + "brave": [ + os.path.expanduser("~/.config/BraveSoftware/Brave-Browser/Default"), + "/host-brave/Default", + "/host-brave", + os.path.expanduser("~/Library/Application Support/BraveSoftware/Brave-Browser/Default"), + ], + "firefox": [ + # Firefox usa --cookies-from-browser firefox directamente, yt-dlp detecta el perfil + os.path.expanduser("~/.mozilla/firefox"), + "/host-firefox", + ], + "edge": [ + os.path.expanduser("~/.config/microsoft-edge/Default"), + "/host-edge/Default", + ], +} + + +def _find_browser_profile(browser: str) -> str | None: + """Devuelve la primera ruta de perfil existente para el navegador dado.""" + for path in _BROWSER_PROFILES.get(browser, []): + if os.path.exists(path): + return path + return None + + +def _extract_cookies_from_browser(browser: str, profile_path: str | None, + target: str, proxy: str | None = None) -> dict: + """ + Usa yt-dlp --cookies-from-browser para extraer cookies de YouTube + del perfil del navegador indicado y guardarlas en target (Netscape format). 
+ """ + cmd = [ + "yt-dlp", + "--cookies-from-browser", browser if not profile_path else f"{browser}:{profile_path}", + "--cookies", target, # exportar a archivo Netscape + "--skip-download", + "--no-warnings", + "--extractor-args", "youtube:player_client=tv_embedded", + "https://www.youtube.com/watch?v=dQw4w9WgXcQ", # video corto para forzar extracción + ] + if proxy: + cmd.extend(["--proxy", proxy]) + + try: + res = subprocess.run(cmd, capture_output=True, text=True, timeout=60) + stderr = res.stderr or "" + stdout = res.stdout or "" + + # Verificar que el archivo fue creado y no está vacío + if os.path.exists(target) and os.path.getsize(target) > 100: + # Contar cookies de youtube.com + yt_cookies = 0 + with open(target, "r", errors="ignore") as fh: + for line in fh: + if ".youtube.com" in line or "youtube.com" in line: + yt_cookies += 1 + return { + "success": True, + "browser": browser, + "profile_path": profile_path, + "cookies_file": target, + "youtube_cookie_lines": yt_cookies, + "stderr_preview": stderr[:300] if stderr else "", + } + else: + return { + "success": False, + "browser": browser, + "error": "No se generó el archivo de cookies o está vacío", + "stderr": stderr[:500], + "stdout": stdout[:200], + "returncode": res.returncode, + } + except subprocess.TimeoutExpired: + return {"success": False, "browser": browser, "error": "Timeout al extraer cookies (60s)"} + except FileNotFoundError: + return {"success": False, "browser": browser, "error": "yt-dlp no encontrado"} + except Exception as e: + return {"success": False, "browser": browser, "error": str(e)[:200]} + + +@app.post("/extract_chrome_cookies") +def extract_chrome_cookies(browser: str = "chrome", profile_path: str = ""): + """ + Extrae cookies de YouTube directamente desde el perfil del navegador instalado + en el HOST (montado como volumen) y las guarda en /app/data/cookies.txt. 
+ + Parámetros: + - browser: chrome | chromium | brave | firefox | edge (default: chrome) + - profile_path: ruta manual al perfil (opcional, se auto-detecta si está vacío) + + Requisito en docker-compose.yml (ya incluido): + volumes: + - ~/.config/google-chrome:/host-chrome:ro + + Ejemplo: + curl -X POST "http://localhost:8282/extract_chrome_cookies?browser=chrome" + curl -X POST "http://localhost:8282/extract_chrome_cookies?browser=brave" + """ + proxy = _get_proxy_choice() + target = os.getenv('API_COOKIES_PATH', DEFAULT_COOKIES_PATH) + + # Asegurar directorio destino + target_dir = os.path.dirname(target) or "." + os.makedirs(target_dir, exist_ok=True) + + browser = browser.lower().strip() + valid_browsers = list(_BROWSER_PROFILES.keys()) + if browser not in valid_browsers: + raise HTTPException( + status_code=400, + detail=f"Navegador '{browser}' no soportado. Usa: {', '.join(valid_browsers)}" + ) + + # Auto-detectar perfil si no se indicó + resolved_profile = profile_path.strip() or _find_browser_profile(browser) + + if not resolved_profile and browser != "firefox": + # Para Firefox yt-dlp lo detecta solo; para el resto necesitamos la ruta + available = {b: _find_browser_profile(b) for b in valid_browsers} + found = {b: p for b, p in available.items() if p} + raise HTTPException( + status_code=404, + detail=( + f"No se encontró el perfil de '{browser}' en las rutas conocidas. " + f"Agrega el volumen en docker-compose.yml o pasa profile_path manualmente. 
" + f"Perfiles encontrados: {found if found else 'ninguno'}" + ) + ) + + result = _extract_cookies_from_browser(browser, resolved_profile, target, proxy) + + if not result["success"]: + raise HTTPException(status_code=500, detail=result) + + return { + "detail": f"Cookies extraídas de {browser} y guardadas en {target}", + **result, + "next_step": "Los endpoints /transcript y /stream usarán estas cookies automáticamente.", + } + + +@app.get("/cookies/status") +def cookies_status(): + """Muestra el estado actual de las cookies configuradas y qué navegadores están disponibles.""" + target = os.getenv('API_COOKIES_PATH', DEFAULT_COOKIES_PATH) + proxy = _get_proxy_choice() + + # Estado del archivo de cookies actual + cookies_info = {"path": target, "exists": False, "size_bytes": 0, "youtube_lines": 0} + if os.path.exists(target): + cookies_info["exists"] = True + cookies_info["size_bytes"] = os.path.getsize(target) + yt_lines = 0 + try: + with open(target, "r", errors="ignore") as fh: + for line in fh: + if "youtube.com" in line and not line.startswith("#"): + yt_lines += 1 + except Exception: + pass + cookies_info["youtube_lines"] = yt_lines + + # Detectar perfiles de navegador disponibles (en el contenedor / host montado) + available_browsers = {} + for browser in _BROWSER_PROFILES: + path = _find_browser_profile(browser) + available_browsers[browser] = { + "found": bool(path), + "profile_path": path, + } + + return { + "cookies_file": cookies_info, + "available_browsers": available_browsers, + "extract_endpoint": "POST /extract_chrome_cookies?browser=chrome", + "upload_endpoint": "POST /upload_cookies", + "proxy": proxy or "no configurado", + "note": ( + "Para usar cookies de Chrome del host, agrega en docker-compose.yml: " + "volumes: - ~/.config/google-chrome:/host-chrome:ro" + ), + } + + def debug_metadata(video_id: str): """Endpoint de depuración: obtiene --dump-json de yt-dlp para un video. 
Devuelve la metadata (automatic_captions, subtitles, requested_subtitles) para inspección. @@ -1057,7 +1545,7 @@ def debug_metadata(video_id: str): cookie_mgr = CookieManager() cookiefile_path = cookie_mgr.get_cookiefile_path() cookies_path = cookiefile_path or os.getenv('API_COOKIES_PATH', DEFAULT_COOKIES_PATH) - proxy = os.getenv('API_PROXY', DEFAULT_PROXY) or None + proxy = _get_proxy_choice() url = f"https://www.youtube.com/watch?v={video_id}" @@ -1321,7 +1809,6 @@ def fetch_vtt_subtitles(video_id: str, lang: str = 'es'): pass return None, f'Error leyendo archivo de subtítulos: {str(e)[:200]}' - @app.post('/upload_vtt/{video_id}') async def upload_vtt(video_id: str, file: UploadFile = File(...)): """Permite subir un archivo VTT para un video y devuelve segmentos parseados y texto. @@ -1409,6 +1896,110 @@ def transcript_alt(video_id: str, lang: str = 'es'): 'source': 'youtube-transcript-api' } -if __name__ == "__main__": - import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8000) +@app.get('/playwright/stream/{video_id}') +def playwright_stream(video_id: str, profile: str = '', headless: bool = True, timeout: int = 60): + """Usa Playwright (script tools/playwright_extract_m3u8.py) para abrir el video + en un navegador real (o con perfil persistente) y extraer las URLs m3u8 y cookies. + + Parámetros: + - profile: ruta al user-data-dir de Chrome (opcional). Si el contenedor tiene el + perfil montado en /host-chrome, pásalo como `/host-chrome/Default`. + - headless: true/false para ejecutar sin UI. + - timeout: segundos máximos a esperar por la ejecución del script. + + Uso (ejemplo): + curl 'http://localhost:8282/playwright/stream/cmqVmX2UVBM?headless=false&profile=/host-chrome' + + Nota: el script genera `./data/cookies.txt` si logra extraer cookies. 
+ """ + vid = extract_video_id(video_id) + if not vid: + raise HTTPException(status_code=400, detail='video_id inválido') + + script = os.path.join(os.getcwd(), 'tools', 'playwright_extract_m3u8.py') + if not os.path.exists(script): + raise HTTPException(status_code=500, detail='Script Playwright no encontrado en tools/playwright_extract_m3u8.py') + + cmd = ['python3', script, '--video', f'https://www.youtube.com/watch?v={vid}', '--timeout', str(timeout)] + if headless: + cmd.append('--headless') + # profile can be provided via env PLAYWRIGHT_PROFILE or param + profile_path = profile or os.getenv('PLAYWRIGHT_PROFILE', '') + if profile_path: + cmd.extend(['--profile', profile_path]) + + try: + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout + 10) + except subprocess.TimeoutExpired: + raise HTTPException(status_code=504, detail='Playwright timed out') + except Exception as e: + raise HTTPException(status_code=500, detail=f'Error ejecutando Playwright: {str(e)[:300]}') + + if proc.returncode != 0: + # incluir stderr para diagnóstico + detail = (proc.stderr or proc.stdout or 'Error desconocido')[:2000] + return JSONResponse(status_code=500, content={"error": "Playwright error", "detail": detail}) + + try: + out = json.loads(proc.stdout or '{}') + except Exception: + return JSONResponse(status_code=500, content={"error": "No se pudo parsear salida Playwright", "raw": proc.stdout[:2000]}) + + return out + +def _attempt_playwright_fallback(video_id: str, headless: bool = True, profile: str | None = None, timeout: int = 60): + """Ejecuta el script Playwright para intentar extraer m3u8 y cookies. 
+ Retorna (m3u8_url or None, cookies_saved_path or None, error_message or None) + """ + script = os.path.join(os.getcwd(), 'tools', 'playwright_extract_m3u8.py') + if not os.path.exists(script): + return None, None, 'Playwright extractor script no disponible' + + cmd = ['python3', script, '--video', f'https://www.youtube.com/watch?v={video_id}', '--timeout', str(timeout)] + if headless: + cmd.append('--headless') + # profile can be provided via env PLAYWRIGHT_PROFILE or param + profile_path = profile or os.getenv('PLAYWRIGHT_PROFILE', '') + if profile_path: + cmd.extend(['--profile', profile_path]) + + try: + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout + 10) + except subprocess.TimeoutExpired: + return None, None, 'Playwright timed out' + except Exception as e: + return None, None, f'Error ejecutando Playwright: {str(e)[:200]}' + + if proc.returncode != 0: + # incluir stderr para diagnóstico + detail = (proc.stderr or proc.stdout or 'Error desconocido')[:2000] + return None, None, f'Playwright error: {detail}' + + try: + data = json.loads(proc.stdout or '{}') + except Exception: + return None, None, 'No se pudo parsear la salida de Playwright' + + urls = data.get('m3u8_urls') or [] + cookies_file = data.get('cookies_file') + + if not urls: + return None, cookies_file, 'No se encontró m3u8 via Playwright' + + # tomar la primera URL válida + m3u8 = urls[0] + + # Si Playwright devolvió cookies, moverlas a API_COOKIES_PATH para que el resto del sistema las use + if cookies_file and os.path.exists(cookies_file): + target = os.getenv('API_COOKIES_PATH', DEFAULT_COOKIES_PATH) + try: + target_dir = os.path.dirname(target) or '.' 
+ os.makedirs(target_dir, exist_ok=True) + # copiar contenido + with open(cookies_file, 'rb') as src, open(target, 'wb') as dst: + dst.write(src.read()) + return m3u8, target, None + except Exception as e: + return m3u8, None, f'm3u8 encontrado pero no se pudo guardar cookies: {str(e)[:200]}' + + return m3u8, None, None diff --git a/tools/README_PLAYWRIGHT.md b/tools/README_PLAYWRIGHT.md new file mode 100644 index 0000000..a0eace3 --- /dev/null +++ b/tools/README_PLAYWRIGHT.md @@ -0,0 +1,27 @@ +Playwright extractor +===================== + +Este script abre un video de YouTube con Playwright, captura peticiones de red y busca +URLs M3U8/HLS. Opcionalmente exporta cookies al formato Netscape en `./data/cookies.txt`. + +Requisitos (host): + pip install playwright + python -m playwright install + +Uso ejemplo (headful, usando tu perfil de Chrome): + python3 tools/playwright_extract_m3u8.py --video https://www.youtube.com/watch?v=cmqVmX2UVBM --profile ~/.config/google-chrome + +Si no usas perfil, quita `--profile` y el script abrirá un contexto temporal. + +Salida JSON: + { + "m3u8_urls": [ ... ], + "cookies_file": "./data/cookies.txt", + "errors": [] + } + +Consejos: + - Ejecuta en el host (no en contenedor) si quieres usar tu perfil real de Chrome. + - Si Playwright no encuentra el ejecutable del navegador, corre `python -m playwright install`. + - Para usar las cookies exportadas desde la API: `curl -s http://localhost:8282/cookies/status` para comprobarlas. + diff --git a/tools/expand_and_test_proxies.py b/tools/expand_and_test_proxies.py new file mode 100644 index 0000000..062cf2f --- /dev/null +++ b/tools/expand_and_test_proxies.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +""" +expand_and_test_proxies.py + +Lee tools/user_proxies.txt, genera variantes (intenta también SOCKS5/SOCKS5H en puertos comunes) +y ejecuta tools/generate_proxy_whitelist.py con la lista expandida. 
+ +Uso: + python3 tools/expand_and_test_proxies.py + +Salida: + - tools/expanded_proxies.txt (lista expandida) + - llama a generate_proxy_whitelist.py y produce tools/whitelist.json y tools/whitelist.txt + +""" +import os +import re +import subprocess +from pathlib import Path + +BASE = Path(__file__).resolve().parent +USER_FILE = BASE / 'user_proxies.txt' +EXPANDED_FILE = BASE / 'expanded_proxies.txt' +GEN_SCRIPT = BASE / 'generate_proxy_whitelist.py' + +COMMON_SOCKS_PORTS = [1080, 10808, 9050] + + +def normalize_line(line: str) -> str | None: + s = line.strip() + if not s or s.startswith('#'): + return None + return s + + +def parse_host_port(s: str): + # remove scheme if present + m = re.match(r'^(?:(?P<scheme>[a-zA-Z0-9+.-]+)://)?(?P<host>[^:/@]+)(?::(?P<port>\d+))?(?:@.*)?$', s) + if not m: + return None, None, None + scheme = m.group('scheme') + host = m.group('host') + port = m.group('port') + port = int(port) if port else None + return scheme, host, port + + +def build_variants(s: str): + scheme, host, port = parse_host_port(s) + variants = [] + # keep original if it has scheme + if scheme: + variants.append(s) + else: + # assume http by default if none + if port: + variants.append(f'http://{host}:{port}') + else: + variants.append(f'http://{host}:80') + + # Try socks5h on same port if port present + if port: + variants.append(f'socks5h://{host}:{port}') + # Try socks5h on common ports + for p in COMMON_SOCKS_PORTS: + variants.append(f'socks5h://{host}:{p}') + + # Deduplicate preserving order + seen = set() + out = [] + for v in variants: + if v in seen: + continue + seen.add(v) + out.append(v) + return out + + +def main(): + if not USER_FILE.exists(): + print(f'No se encontró {USER_FILE}. 
Crea el archivo con proxies (uno por línea).') + return + + all_variants = [] + with USER_FILE.open('r', encoding='utf-8') as fh: + for line in fh: + s = normalize_line(line) + if not s: + continue + vars = build_variants(s) + all_variants.extend(vars) + + # write expanded file + with EXPANDED_FILE.open('w', encoding='utf-8') as fh: + for v in all_variants: + fh.write(v + '\n') + + print(f'Wrote expanded proxies to {EXPANDED_FILE} ({len(all_variants)} entries)') + + # Call generator + cmd = [ 'python3', str(GEN_SCRIPT), '--input', str(EXPANDED_FILE), '--out-json', str(BASE / 'whitelist.json'), '--out-txt', str(BASE / 'whitelist.txt'), '--test-url', 'https://www.youtube.com/watch?v=dQw4w9WgXcQ', '--concurrency', '6'] + print('Running generator...') + try: + res = subprocess.run(cmd, capture_output=True, text=True, timeout=600) + print('Generator exit code:', res.returncode) + print('stdout:\n', res.stdout) + print('stderr:\n', res.stderr) + except Exception as e: + print('Error running generator:', e) + +if __name__ == '__main__': + main() + diff --git a/tools/expanded_proxies.txt b/tools/expanded_proxies.txt new file mode 100644 index 0000000..6157f9e --- /dev/null +++ b/tools/expanded_proxies.txt @@ -0,0 +1,30 @@ +http://48.210.225.96:80 +socks5h://48.210.225.96:80 +socks5h://48.210.225.96:1080 +socks5h://48.210.225.96:10808 +socks5h://48.210.225.96:9050 +http://107.174.231.218:8888 +socks5h://107.174.231.218:8888 +socks5h://107.174.231.218:1080 +socks5h://107.174.231.218:10808 +socks5h://107.174.231.218:9050 +http://188.239.43.6:80 +socks5h://188.239.43.6:80 +socks5h://188.239.43.6:1080 +socks5h://188.239.43.6:10808 +socks5h://188.239.43.6:9050 +http://52.229.30.3:80 +socks5h://52.229.30.3:80 +socks5h://52.229.30.3:1080 +socks5h://52.229.30.3:10808 +socks5h://52.229.30.3:9050 +http://142.93.202.130:3128 +socks5h://142.93.202.130:3128 +socks5h://142.93.202.130:1080 +socks5h://142.93.202.130:10808 +socks5h://142.93.202.130:9050 +http://154.219.101.86:8888 
+socks5h://154.219.101.86:8888 +socks5h://154.219.101.86:1080 +socks5h://154.219.101.86:10808 +socks5h://154.219.101.86:9050 diff --git a/tools/generate_proxy_whitelist.py b/tools/generate_proxy_whitelist.py new file mode 100644 index 0000000..086802c --- /dev/null +++ b/tools/generate_proxy_whitelist.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 +""" +generate_proxy_whitelist.py + +Lee una lista de proxies desde un archivo (proxies.txt), prueba cada proxy con yt-dlp +intentando descargar metadata mínimo de YouTube, mide latencia y genera: + - whitelist.json : lista estructurada de proxies con estado y métricas + - whitelist.txt : solo proxies válidos, ordenados por latencia + +Formato de proxies.txt: una URL por línea, ejemplos: + socks5h://127.0.0.1:1080 + http://10.0.0.1:3128 + +Uso: + python3 tools/generate_proxy_whitelist.py --input tools/proxies.txt --out-json tools/whitelist.json --test-url https://www.youtube.com/watch?v=dQw4w9WgXcQ + +Notas: + - Requiere tener `yt-dlp` instalado en el entorno donde se ejecuta este script. + - Este script intenta usar yt-dlp porque valida directamente que el proxy funciona + para las llamadas a YouTube (incluye manejo de JS/firma en yt-dlp cuando aplique). + - Ajusta timeouts y pruebas por concurrencia según tus necesidades. +""" + +import argparse +import json +import subprocess +import time +import os +from concurrent.futures import ThreadPoolExecutor, as_completed +from urllib.parse import urlparse + +import requests + +# Mensajes que indican bloqueo/bot-check de yt-dlp +BOT_MARKERS = ("sign in to confirm", "not a bot", "sign in to", "HTTP Error 403", "HTTP Error 429") + + +def test_proxy(proxy: str, test_url: str, timeout: int = 25) -> dict: + """Prueba un proxy ejecutando yt-dlp --dump-json sobre test_url. 
+ Retorna dict con info: proxy, ok, rc, stderr, elapsed_ms, stdout_preview + """ + proxy = proxy.strip() + if not proxy: + return {"proxy": proxy, "ok": False, "error": "empty"} + + cmd = [ + "yt-dlp", + "--skip-download", + "--dump-json", + "--no-warnings", + "--extractor-args", "youtube:player_client=tv_embedded", + "--socket-timeout", "10", + test_url, + "--proxy", proxy, + ] + + start = time.perf_counter() + try: + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) + elapsed = (time.perf_counter() - start) * 1000.0 + stdout = proc.stdout or "" + stderr = proc.stderr or "" + rc = proc.returncode + + # heurística de éxito: rc == 0 y stdout no vacío y no markers de bot en stderr + stderr_low = stderr.lower() + bot_hit = any(m.lower() in stderr_low for m in BOT_MARKERS) + ok = (rc == 0 and stdout.strip() != "" and not bot_hit) + + return { + "proxy": proxy, + "ok": ok, + "rc": rc, + "elapsed_ms": int(elapsed), + "bot_detected": bool(bot_hit), + "stderr_preview": stderr[:1000], + "stdout_preview": stdout[:2000], + } + + except subprocess.TimeoutExpired: + elapsed = (time.perf_counter() - start) * 1000.0 + return {"proxy": proxy, "ok": False, "error": "timeout", "elapsed_ms": int(elapsed)} + except FileNotFoundError: + return {"proxy": proxy, "ok": False, "error": "yt-dlp-not-found"} + except Exception as e: + elapsed = (time.perf_counter() - start) * 1000.0 + return {"proxy": proxy, "ok": False, "error": str(e), "elapsed_ms": int(elapsed)} + + +def generate_whitelist(input_file: str, out_json: str, out_txt: str, test_url: str, concurrency: int = 6): + proxies = [] + with open(input_file, 'r', encoding='utf-8') as fh: + for line in fh: + line = line.strip() + if not line or line.startswith('#'): + continue + proxies.append(line) + + results = [] + with ThreadPoolExecutor(max_workers=concurrency) as ex: + futures = {ex.submit(test_proxy, p, test_url): p for p in proxies} + for fut in as_completed(futures): + try: + r = fut.result() + except 
Exception as e: + r = {"proxy": futures[fut], "ok": False, "error": str(e)} + results.append(r) + print(f"Tested: {r.get('proxy')} ok={r.get('ok')} rc={r.get('rc', '-') } elapsed={r.get('elapsed_ms','-')}ms") + + # Ordenar proxies válidos por elapsed asc + valid = [r for r in results if r.get('ok')] + valid_sorted = sorted(valid, key=lambda x: x.get('elapsed_ms', 999999)) + + # Guardar JSON completo + out = {"tested_at": int(time.time()), "test_url": test_url, "results": results, "valid_count": len(valid_sorted)} + with open(out_json, 'w', encoding='utf-8') as fh: + json.dump(out, fh, indent=2, ensure_ascii=False) + + # Guardar lista TXT (whitelist) con orden preferido + with open(out_txt, 'w', encoding='utf-8') as fh: + for r in valid_sorted: + fh.write(r['proxy'] + '\n') + + return out, valid_sorted + + +def _extract_proxies_from_json(obj): + """Dado un objeto JSON (parsed), intenta extraer una lista de proxies en forma de URLs. + Soporta varias estructuras comunes: + - lista simple de strings: ["socks5h://1.2.3.4:1080", ...] 
+ - lista de objetos con keys como ip, port, protocol + - objetos anidados con 'proxy' o 'url' o 'address' + """ + proxies = [] + if isinstance(obj, list): + for item in obj: + if isinstance(item, str): + proxies.append(item.strip()) + elif isinstance(item, dict): + # intentar keys comunes + # ejemplos: {"ip":"1.2.3.4","port":1080, "protocol":"socks5"} + ip = item.get('ip') or item.get('host') or item.get('address') or item.get('ip_address') + port = item.get('port') or item.get('p') + proto = item.get('protocol') or item.get('proto') or item.get('type') or item.get('scheme') + if ip and port: + proto = proto or 'http' + proxies.append(f"{proto}://{ip}:{port}") + continue + # buscar valores en keys que puedan contener url + for k in ('proxy','url','address','connect'): + v = item.get(k) + if isinstance(v, str) and v.strip(): + proxies.append(v.strip()) + break + elif isinstance(obj, dict): + # encontrar listas dentro del dict + for v in obj.values(): + if isinstance(v, (list, dict)): + proxies.extend(_extract_proxies_from_json(v)) + # si el dict mismo tiene un campo 'proxy' o similar + for k in ('proxies','list','data'): + if k in obj and isinstance(obj[k], (list,dict)): + proxies.extend(_extract_proxies_from_json(obj[k])) + return [p for p in proxies if p] + + +def download_and_write_proxies(url: str, out_file: str) -> int: + """Descarga JSON desde `url`, extrae proxies y las escribe en `out_file`. + Retorna número de proxies escritas. 
+ """ + try: + r = requests.get(url, timeout=30) + r.raise_for_status() + data = r.json() + except Exception as e: + raise RuntimeError(f"Error descargando/parsing JSON desde {url}: {e}") + + proxies = _extract_proxies_from_json(data) + # normalizar: si la entrada es 'ip:port' convertir a http://ip:port + normalized = [] + for p in proxies: + p = p.strip() + if not p: + continue + # si es 'ip:port' o 'ip port' + if ':' in p and not p.lower().startswith(('http://','https://','socks5://','socks5h://','socks4://')): + # asumir http + normalized.append('http://' + p) + else: + normalized.append(p) + + # dedup preserving order + seen = set() + out = [] + for p in normalized: + if p in seen: + continue + seen.add(p) + out.append(p) + + if not out: + # como fallback, si JSON es una estructura plana de objetos con 'ip' y 'port' + # ya manejado, si nada, error + raise RuntimeError(f"No se extrajeron proxies del JSON: {url}") + + with open(out_file, 'w', encoding='utf-8') as fh: + for p in out: + fh.write(p + '\n') + return len(out) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Test a list of proxies with yt-dlp and generate a whitelist') + parser.add_argument('--input', default='tools/proxies.txt', help='Input file with proxies (one per line)') + parser.add_argument('--out-json', default='tools/whitelist.json', help='Output JSON results') + parser.add_argument('--out-txt', default='tools/whitelist.txt', help='Output whitelist (one proxy per line)') + parser.add_argument('--test-url', default='https://www.youtube.com/watch?v=dQw4w9WgXcQ', help='YouTube test URL to use') + parser.add_argument('--concurrency', type=int, default=6, help='Concurrent workers') + parser.add_argument('--from-url', default='', help='Download a JSON of proxies from a URL and use it as input') + args = parser.parse_args() + + # If from-url provided, download and write to temporary input file + input_file = args.input + temp_written = False + try: + if args.from_url: + 
print(f"Downloading proxies JSON from: {args.from_url}") + written = download_and_write_proxies(args.from_url, input_file) + print(f"Wrote {written} proxies to {input_file}") + temp_written = True + + if not os.path.exists(input_file): + print(f"Input file {input_file} not found. Create it with one proxy per line or use --from-url.") + raise SystemExit(1) + + out, valid_sorted = generate_whitelist(input_file, args.out_json, args.out_txt, args.test_url, args.concurrency) + print('\nSummary:') + print(f" Tested: {len(out['results'])}, Valid: {len(valid_sorted)}") + print(f" JSON: {args.out_json}, TXT whitelist: {args.out_txt}") + finally: + # optionally remove temp file? keep it for inspection + pass diff --git a/tools/playwright_extract_m3u8.py b/tools/playwright_extract_m3u8.py new file mode 100755 index 0000000..33a84f6 --- /dev/null +++ b/tools/playwright_extract_m3u8.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +"""playwright_extract_m3u8.py + +Abre una página de YouTube con Playwright y captura la primera URL m3u8/HLS +visible en las peticiones de red. También puede exportar cookies al formato +Netscape para usarlas con yt-dlp/tu API. + +Uso: + python3 tools/playwright_extract_m3u8.py --video https://www.youtube.com/watch?v=ID [--profile /path/to/profile] [--headless] + +Requisitos (host): + pip install playwright + python -m playwright install + +Notas: + - Recomiendo ejecutarlo en el host (no en el contenedor) para usar el perfil de Chrome + y para que Playwright pueda manejar la interfaz gráfica si necesitas login/manual. + - Si pasas --profile, se lanzará una sesión persistente usando ese directorio (útil + para usar tu sesión de Chrome ya logueada). Si dejas vacío, se usa un contexto limpio. +""" +import argparse +import os +import json +import time +from pathlib import Path + +try: + from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout +except Exception as e: + print("playwright no está instalado. 
Instala con: pip install playwright && python -m playwright install") + raise + + +def write_netscape_cookie_file(cookies, target_path): + # cookies: list of dicts like Playwright provides + lines = ["# Netscape HTTP Cookie File"] + for c in cookies: + domain = c.get("domain", "") + flag = "TRUE" if domain.startswith('.') else "FALSE" + path = c.get("path", "/") + secure = "TRUE" if c.get("secure") else "FALSE" + expires = str(int(c.get("expires", 0))) if c.get("expires") else "0" + name = c.get("name", "") + value = c.get("value", "") + lines.append("\t".join([domain, flag, path, secure, expires, name, value])) + Path(target_path).parent.mkdir(parents=True, exist_ok=True) + with open(target_path, "w", encoding="utf-8") as fh: + fh.write("\n".join(lines) + "\n") + + +def extract_m3u8(video_url: str, profile: str | None, headless: bool, timeout: int = 45, save_cookies: bool = True): + result = {"m3u8_urls": [], "cookies_file": None, "errors": []} + data_dir = Path.cwd() / "data" + data_dir.mkdir(exist_ok=True) + target_cookies = str(data_dir / "cookies.txt") + + with sync_playwright() as p: + # Usar Chromium para mejor compatibilidad con Chrome profile + browser_type = p.chromium + # establecer User-Agent a uno real para simular navegador + ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + extra_headers = {"Accept-Language": "en-US,en;q=0.9", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"} + + launch_args = ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'] + if profile: + # persistent context uses a profile dir (user data dir) + user_data_dir = profile + # avoid passing user_agent due to some Playwright builds missing API; set headers only + context = browser_type.launch_persistent_context(user_data_dir=user_data_dir, headless=headless, extra_http_headers=extra_headers, args=launch_args) + else: + # pass common args to help in container 
environments + browser = browser_type.launch(headless=headless, args=launch_args) + # do not pass user_agent param; rely on browser default and headers + context = browser.new_context(extra_http_headers=extra_headers) + + # debug info + try: + print(f"[playwright] started browser headless={headless} profile={'yes' if profile else 'no'}") + except Exception: + pass + + page = context.new_page() + + collected = set() + + def on_response(resp): + try: + url = resp.url + # heurística: m3u8 en URL o content-type de respuesta + if ".m3u8" in url.lower(): + collected.add(url) + else: + ct = resp.headers.get("content-type", "") + if "application/vnd.apple.mpegurl" in ct or "vnd.apple.mpegurl" in ct or "application/x-mpegURL" in ct: + collected.add(url) + except Exception: + pass + + page.on("response", on_response) + + try: + page.goto(video_url, timeout=timeout * 1000) + # esperar un poco para que las peticiones de manifest se disparen + wait_seconds = 6 + for i in range(wait_seconds): + time.sleep(1) + # si encontramos algo temprano, romper + if collected: + break + + # Si no encontramos m3u8, intentar forzar la apertura del player y realizar scroll + if not collected: + try: + # click play + page.evaluate("() => { const v = document.querySelector('video'); if (v) v.play(); }") + except Exception: + pass + # esperar más + time.sleep(3) + + # recopilar URLs + result_urls = list(collected) + # desduplicar y ordenar + result_urls = sorted(set(result_urls)) + result['m3u8_urls'] = result_urls + + # guardar cookies si se pidió + if save_cookies: + try: + cookies = context.cookies() + write_netscape_cookie_file(cookies, target_cookies) + result['cookies_file'] = target_cookies + except Exception as e: + result['errors'].append(f"cookie_export_error:{e}") + + except PWTimeout as e: + result['errors'].append(f"page_timeout: {e}") + except Exception as e: + import traceback + result['errors'].append(traceback.format_exc()) + finally: + # intentar cerrar context y browser si 
existen + try: + if 'context' in locals() and context: + try: + context.close() + except Exception: + pass + except Exception: + pass + try: + if 'browser' in locals() and browser: + try: + browser.close() + except Exception: + pass + except Exception: + pass + + return result + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Playwright m3u8 extractor for YouTube') + parser.add_argument('--video', required=True, help='Video URL or ID (e.g. https://www.youtube.com/watch?v=ID)') + parser.add_argument('--profile', default='', help='Path to browser profile (user data dir) to reuse logged session') + parser.add_argument('--headless', action='store_true', help='Run headless') + parser.add_argument('--timeout', type=int, default=45, help='Timeout for page load (seconds)') + parser.add_argument('--no-cookies', dest='save_cookies', action='store_false', help='Don\'t save cookies to ./data/cookies.txt') + args = parser.parse_args() + + video = args.video + if len(video) == 11 and not video.startswith('http'): + video = f'https://www.youtube.com/watch?v={video}' + + res = extract_m3u8(video, profile=args.profile or None, headless=args.headless, timeout=args.timeout, save_cookies=args.save_cookies) + print(json.dumps(res, indent=2, ensure_ascii=False)) diff --git a/tools/proxies_sample.txt b/tools/proxies_sample.txt new file mode 100644 index 0000000..e69de29 diff --git a/tools/user_proxies.txt b/tools/user_proxies.txt new file mode 100644 index 0000000..344b276 --- /dev/null +++ b/tools/user_proxies.txt @@ -0,0 +1,10 @@ +# Proxies proporcionados por el usuario (formato: esquema://ip:port) +# Fuente: lista JSON proporcionada por el usuario — comprobadas por Google (campo "google": true) + +http://48.210.225.96:80 +http://107.174.231.218:8888 +http://188.239.43.6:80 +http://52.229.30.3:80 +http://142.93.202.130:3128 +http://154.219.101.86:8888 + diff --git a/tools/whitelist.json b/tools/whitelist.json new file mode 100644 index 0000000..63d57e6 --- 
/dev/null +++ b/tools/whitelist.json @@ -0,0 +1,256 @@ +{ + "tested_at": 1772912928, + "test_url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", + "results": [ + { + "proxy": "http://107.174.231.218:8888", + "ok": false, + "rc": 1, + "elapsed_ms": 2714, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: ('Unable to connect to proxy', OSError('Tunnel connection failed: 400 Bad Request')) (caused by ProxyError(\"('Unable to connect to proxy', OSError('Tunnel connection failed: 400 Bad Request'))\")); please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://107.174.231.218:8888", + "ok": false, + "rc": 1, + "elapsed_ms": 1473, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: ('[Errno 0] Invalid response version from server. Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. Expected 05 got 48')) (caused by ProxyError(\"('[Errno 0] Invalid response version from server. Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. Expected 05 got 48'))\")); please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://48.210.225.96:9050", + "ok": false, + "rc": 1, + "elapsed_ms": 4559, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: ('[Errno 0] Invalid response version from server. Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. Expected 05 got 48')) (caused by ProxyError(\"('[Errno 0] Invalid response version from server. 
Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. Expected 05 got 48'))\")); please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://48.210.225.96:80", + "ok": false, + "rc": 1, + "elapsed_ms": 4850, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: ('[Errno 0] Invalid response version from server. Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. Expected 05 got 48')) (caused by ProxyError(\"('[Errno 0] Invalid response version from server. Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. Expected 05 got 48'))\")); please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U\n", + "stdout_preview": "" + }, + { + "proxy": "http://48.210.225.96:80", + "ok": false, + "rc": 1, + "elapsed_ms": 5159, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: ('Unable to connect to proxy', OSError('Tunnel connection failed: 400 Bad Request')) (caused by ProxyError(\"('Unable to connect to proxy', OSError('Tunnel connection failed: 400 Bad Request'))\")); please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. 
Confirm you are on the latest version using yt-dlp -U\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://107.174.231.218:1080", + "ok": false, + "rc": 1, + "elapsed_ms": 1057, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused (caused by TransportError(\"SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused\"))\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://107.174.231.218:10808", + "ok": false, + "rc": 1, + "elapsed_ms": 1208, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused (caused by TransportError(\"SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused\"))\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://107.174.231.218:9050", + "ok": false, + "rc": 1, + "elapsed_ms": 1123, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused (caused by TransportError(\"SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused\"))\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://188.239.43.6:80", + "ok": false, + "rc": 1, + "elapsed_ms": 7075, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 104] Connection reset by peer (caused by 
TransportError(\"SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 104] Connection reset by peer\"))\n", + "stdout_preview": "" + }, + { + "proxy": "http://188.239.43.6:80", + "ok": false, + "rc": 1, + "elapsed_ms": 7192, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')) (caused by TransportError(\"('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))\"))\n", + "stdout_preview": "" + }, + { + "proxy": "http://52.229.30.3:80", + "ok": false, + "rc": 1, + "elapsed_ms": 2332, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: ('Unable to connect to proxy', OSError('Tunnel connection failed: 400 Bad Request')) (caused by ProxyError(\"('Unable to connect to proxy', OSError('Tunnel connection failed: 400 Bad Request'))\")); please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://52.229.30.3:80", + "ok": false, + "rc": 1, + "elapsed_ms": 2265, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: ('[Errno 0] Invalid response version from server. Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. Expected 05 got 48')) (caused by ProxyError(\"('[Errno 0] Invalid response version from server. Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. Expected 05 got 48'))\")); please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. 
Confirm you are on the latest version using yt-dlp -U\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://48.210.225.96:1080", + "ok": false, + "error": "timeout", + "elapsed_ms": 25022 + }, + { + "proxy": "socks5h://48.210.225.96:10808", + "ok": false, + "error": "timeout", + "elapsed_ms": 25036 + }, + { + "proxy": "socks5h://52.229.30.3:9050", + "ok": false, + "rc": 1, + "elapsed_ms": 2430, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: ('[Errno 0] Invalid response version from server. Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. Expected 05 got 48')) (caused by ProxyError(\"('[Errno 0] Invalid response version from server. Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. Expected 05 got 48'))\")); please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U\n", + "stdout_preview": "" + }, + { + "proxy": "http://142.93.202.130:3128", + "ok": false, + "rc": 1, + "elapsed_ms": 1668, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: ('Unable to connect to proxy', OSError('Tunnel connection failed: 400 Bad Request')) (caused by ProxyError(\"('Unable to connect to proxy', OSError('Tunnel connection failed: 400 Bad Request'))\")); please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://142.93.202.130:3128", + "ok": false, + "rc": 1, + "elapsed_ms": 1652, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: ('[Errno 0] Invalid response version from server. Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. 
Expected 05 got 48')) (caused by ProxyError(\"('[Errno 0] Invalid response version from server. Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. Expected 05 got 48'))\")); please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://188.239.43.6:1080", + "ok": false, + "error": "timeout", + "elapsed_ms": 25031 + }, + { + "proxy": "socks5h://188.239.43.6:10808", + "ok": false, + "error": "timeout", + "elapsed_ms": 25030 + }, + { + "proxy": "socks5h://142.93.202.130:1080", + "ok": false, + "rc": 1, + "elapsed_ms": 1364, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused (caused by TransportError(\"SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused\"))\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://142.93.202.130:10808", + "ok": false, + "rc": 1, + "elapsed_ms": 1405, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused (caused by TransportError(\"SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused\"))\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://142.93.202.130:9050", + "ok": false, + "rc": 1, + "elapsed_ms": 1322, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused (caused by 
TransportError(\"SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused\"))\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://154.219.101.86:1080", + "ok": false, + "rc": 1, + "elapsed_ms": 2199, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused (caused by TransportError(\"SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused\"))\n", + "stdout_preview": "" + }, + { + "proxy": "http://154.219.101.86:8888", + "ok": false, + "rc": 1, + "elapsed_ms": 3651, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: ('Unable to connect to proxy', OSError('Tunnel connection failed: 400 Bad Request')) (caused by ProxyError(\"('Unable to connect to proxy', OSError('Tunnel connection failed: 400 Bad Request'))\")); please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://154.219.101.86:8888", + "ok": false, + "rc": 1, + "elapsed_ms": 3628, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: ('[Errno 0] Invalid response version from server. Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. Expected 05 got 48')) (caused by ProxyError(\"('[Errno 0] Invalid response version from server. Expected 05 got 48', InvalidVersionError(0, 'Invalid response version from server. Expected 05 got 48'))\")); please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. 
Confirm you are on the latest version using yt-dlp -U\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://154.219.101.86:10808", + "ok": false, + "rc": 1, + "elapsed_ms": 1981, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused (caused by TransportError(\"SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused\"))\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://188.239.43.6:9050", + "ok": false, + "error": "timeout", + "elapsed_ms": 25023 + }, + { + "proxy": "socks5h://154.219.101.86:9050", + "ok": false, + "rc": 1, + "elapsed_ms": 1962, + "bot_detected": false, + "stderr_preview": "ERROR: [youtube] dQw4w9WgXcQ: Unable to download API page: SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused (caused by TransportError(\"SocksHTTPSConnection(host='www.youtube.com', port=443): Failed to establish a new connection: [Errno 111] Connection refused\"))\n", + "stdout_preview": "" + }, + { + "proxy": "socks5h://52.229.30.3:1080", + "ok": false, + "error": "timeout", + "elapsed_ms": 25026 + }, + { + "proxy": "socks5h://52.229.30.3:10808", + "ok": false, + "error": "timeout", + "elapsed_ms": 25028 + } + ], + "valid_count": 0 +} \ No newline at end of file diff --git a/tools/whitelist.txt b/tools/whitelist.txt new file mode 100644 index 0000000..e69de29