TubeScript-API/tools/playwright_extract_m3u8.py
Cesar Mendivil 2923510c51 Add Playwright tools for extracting M3U8 URLs and proxy management
- Introduced `playwright_extract_m3u8.py` to extract M3U8 URLs from YouTube videos using Playwright.
- Added `README_PLAYWRIGHT.md` for usage instructions and requirements.
- Created `expand_and_test_proxies.py` to expand user-provided proxies and test their validity.
- Implemented `generate_proxy_whitelist.py` to generate a whitelist of working proxies based on testing results.
- Added sample proxy files: `user_proxies.txt` for user-defined proxies and `proxies_sample.txt` as a template.
- Generated `expanded_proxies.txt`, `whitelist.json`, and `whitelist.txt` for storing expanded and valid proxies.
- Included error handling and logging for proxy testing results.
2026-03-17 00:29:51 -07:00

178 lines
7.3 KiB
Python
Executable File

#!/usr/bin/env python3
"""playwright_extract_m3u8.py
Opens a YouTube page with Playwright and captures the m3u8/HLS URLs
seen in the network responses. It can also export cookies in Netscape
format for use with yt-dlp or your API.
Usage:
python3 tools/playwright_extract_m3u8.py --video https://www.youtube.com/watch?v=ID [--profile /path/to/profile] [--headless] [--timeout SECONDS] [--no-cookies]
Requirements (host):
pip install playwright
python -m playwright install
Notes:
- Running on the host (not in the container) is recommended so that a Chrome profile can be used
and so that Playwright can drive the graphical interface if you need a manual login.
- If you pass --profile, a persistent session is launched using that directory (useful
for reusing an already logged-in Chrome session). If omitted, a clean context is used.
"""
import argparse
import os
import json
import time
from pathlib import Path
try:
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
except Exception as e:
print("playwright no está instalado. Instala con: pip install playwright && python -m playwright install")
raise
def write_netscape_cookie_file(cookies, target_path):
    """Write cookies to ``target_path`` in the Netscape cookie-file format.

    Each cookie becomes one tab-separated line with the fields: domain,
    include-subdomains flag, path, secure flag, expiry, name, value.

    Args:
        cookies: Iterable of cookie dicts such as Playwright provides. The
            keys read here are ``domain``, ``path``, ``secure``, ``expires``,
            ``name`` and ``value``; missing keys fall back to defaults.
        target_path: Destination file path. Missing parent directories are
            created.
    """
    lines = ["# Netscape HTTP Cookie File"]
    for c in cookies:
        domain = c.get("domain") or ""
        # A leading dot means the cookie also applies to subdomains.
        flag = "TRUE" if domain.startswith(".") else "FALSE"
        path = c.get("path") or "/"
        secure = "TRUE" if c.get("secure") else "FALSE"
        # Playwright marks session cookies with expires == -1. Writing a
        # negative timestamp makes cookie-jar loaders treat the cookie as
        # already expired, so clamp to 0 (the conventional "session" value).
        expires = str(max(int(c.get("expires") or 0), 0))
        name = c.get("name") or ""
        value = c.get("value") or ""
        lines.append("\t".join([domain, flag, path, secure, expires, name, value]))

    Path(target_path).parent.mkdir(parents=True, exist_ok=True)
    with open(target_path, "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines) + "\n")
def extract_m3u8(video_url: str, profile: str | None, headless: bool, timeout: int = 45, save_cookies: bool = True):
    """Open ``video_url`` in Chromium and collect HLS (m3u8) manifest URLs.

    Network responses are observed while the page loads. A response URL is
    collected when it contains ``.m3u8`` or when the response content-type
    names an HLS playlist type.

    Args:
        video_url: Page URL to navigate to.
        profile: User-data directory for a persistent browser session, or
            ``None`` to use a fresh, non-persistent context.
        headless: Whether to launch the browser headless.
        timeout: Navigation timeout for ``page.goto``, in seconds.
        save_cookies: When true, export the context's cookies to
            ``data/cookies.txt`` under the current working directory.

    Returns:
        A dict with keys ``m3u8_urls`` (sorted list of unique URLs),
        ``cookies_file`` (path written, or ``None``) and ``errors`` (list of
        error strings). Navigation and extraction failures are recorded in
        ``errors`` instead of being raised; browser launch failures still
        propagate to the caller.
    """
    # Local imports: the file does not import these at module level.
    import sys
    import traceback

    result = {"m3u8_urls": [], "cookies_file": None, "errors": []}
    data_dir = Path.cwd() / "data"
    data_dir.mkdir(exist_ok=True)
    target_cookies = str(data_dir / "cookies.txt")

    # No user_agent is passed (some Playwright builds lack that API); only
    # extra headers are set and the browser's default UA is used.
    extra_headers = {"Accept-Language": "en-US,en;q=0.9", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"}
    # Common flags that help when running inside containers.
    launch_args = ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
    # Compared against the lowercased content-type header.
    hls_content_types = ("vnd.apple.mpegurl", "application/x-mpegurl")

    collected = set()

    def on_response(resp):
        # Heuristic: ".m3u8" in the URL, or an HLS content-type on the response.
        try:
            url = resp.url
            if ".m3u8" in url.lower():
                collected.add(url)
                return
            content_type = resp.headers.get("content-type", "").lower()
            if any(marker in content_type for marker in hls_content_types):
                collected.add(url)
        except Exception:
            # Best-effort: a failure inspecting one response must not abort
            # the whole capture.
            pass

    with sync_playwright() as p:
        # Chromium gives the best compatibility with Chrome profiles.
        browser_type = p.chromium
        browser = None
        context = None
        try:
            if profile:
                # A persistent context uses the profile as its user-data dir.
                context = browser_type.launch_persistent_context(
                    user_data_dir=profile,
                    headless=headless,
                    extra_http_headers=extra_headers,
                    args=launch_args,
                )
            else:
                browser = browser_type.launch(headless=headless, args=launch_args)
                context = browser.new_context(extra_http_headers=extra_headers)

            # Debug info goes to stderr so stdout stays clean for the JSON
            # result printed by the command-line entry point.
            print(f"[playwright] started browser headless={headless} profile={'yes' if profile else 'no'}", file=sys.stderr)

            page = context.new_page()
            page.on("response", on_response)

            try:
                page.goto(video_url, timeout=timeout * 1000)

                # Give manifest requests a moment to fire. wait_for_timeout is
                # used instead of time.sleep because sleeping blocks
                # Playwright's event dispatch, so the response handler would
                # not run during the wait.
                wait_seconds = 6
                for _ in range(wait_seconds):
                    page.wait_for_timeout(1000)
                    if collected:
                        break

                # Nothing captured yet: try to start playback and wait longer.
                if not collected:
                    try:
                        page.evaluate("() => { const v = document.querySelector('video'); if (v) v.play(); }")
                    except Exception:
                        pass
                    page.wait_for_timeout(3000)

                if save_cookies:
                    try:
                        cookies = context.cookies()
                        write_netscape_cookie_file(cookies, target_cookies)
                        result['cookies_file'] = target_cookies
                    except Exception as e:
                        result['errors'].append(f"cookie_export_error:{e}")
            except PWTimeout as e:
                result['errors'].append(f"page_timeout: {e}")
            except Exception:
                result['errors'].append(traceback.format_exc())

            # Filled in after the handlers so URLs captured before a
            # navigation timeout or other error are still returned.
            result['m3u8_urls'] = sorted(collected)
        finally:
            # Best-effort cleanup; close errors are deliberately ignored.
            for handle in (context, browser):
                if handle is None:
                    continue
                try:
                    handle.close()
                except Exception:
                    pass
    return result
if __name__ == '__main__':
    # Command-line entry point: parse options, normalise the video argument,
    # run the extraction and print the result dict as JSON.
    cli = argparse.ArgumentParser(description='Playwright m3u8 extractor for YouTube')
    option_table = (
        ('--video', {'required': True, 'help': 'Video URL or ID (e.g. https://www.youtube.com/watch?v=ID)'}),
        ('--profile', {'default': '', 'help': 'Path to browser profile (user data dir) to reuse logged session'}),
        ('--headless', {'action': 'store_true', 'help': 'Run headless'}),
        ('--timeout', {'type': int, 'default': 45, 'help': 'Timeout for page load (seconds)'}),
        ('--no-cookies', {'dest': 'save_cookies', 'action': 'store_false', 'help': "Don't save cookies to ./data/cookies.txt"}),
    )
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()

    # An 11-character value that is not a URL is treated as a bare video ID.
    target = opts.video
    looks_like_id = not target.startswith('http') and len(target) == 11
    if looks_like_id:
        target = 'https://www.youtube.com/watch?v=' + target

    outcome = extract_m3u8(
        target,
        profile=opts.profile if opts.profile else None,
        headless=opts.headless,
        timeout=opts.timeout,
        save_cookies=opts.save_cookies,
    )
    print(json.dumps(outcome, indent=2, ensure_ascii=False))