- Introduced `playwright_extract_m3u8.py` to extract M3U8 URLs from YouTube videos using Playwright. - Added `README_PLAYWRIGHT.md` for usage instructions and requirements. - Created `expand_and_test_proxies.py` to expand user-provided proxies and test their validity. - Implemented `generate_proxy_whitelist.py` to generate a whitelist of working proxies based on testing results. - Added sample proxy files: `user_proxies.txt` for user-defined proxies and `proxies_sample.txt` as a template. - Generated `expanded_proxies.txt`, `whitelist.json`, and `whitelist.txt` for storing expanded and valid proxies. - Included error handling and logging for proxy testing results.
178 lines
7.3 KiB
Python
Executable File
178 lines
7.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""playwright_extract_m3u8.py
|
|
|
|
Abre una página de YouTube con Playwright y captura la primera URL m3u8/HLS
|
|
visible en las peticiones de red. También puede exportar cookies al formato
|
|
Netscape para usarlas con yt-dlp/tu API.
|
|
|
|
Uso:
|
|
python3 tools/playwright_extract_m3u8.py --video https://www.youtube.com/watch?v=ID [--profile /path/to/profile] [--headless]
|
|
|
|
Requisitos (host):
|
|
pip install playwright
|
|
python -m playwright install
|
|
|
|
Notas:
|
|
- Recomiendo ejecutarlo en el host (no en el contenedor) para usar el perfil de Chrome
|
|
y para que Playwright pueda manejar la interfaz gráfica si necesitas login/manual.
|
|
- Si pasas --profile, se lanzará una sesión persistente usando ese directorio (útil
|
|
para usar tu sesión de Chrome ya logueada). Si dejas vacío, se usa un contexto limpio.
|
|
"""
|
|
import argparse
|
|
import os
|
|
import json
|
|
import time
|
|
from pathlib import Path
|
|
|
|
try:
|
|
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
|
|
except Exception as e:
|
|
print("playwright no está instalado. Instala con: pip install playwright && python -m playwright install")
|
|
raise
|
|
|
|
|
|
def write_netscape_cookie_file(cookies, target_path):
    """Write cookies to ``target_path`` in the Netscape cookie-file format.

    Args:
        cookies: Iterable of cookie dicts shaped like the ones Playwright
            returns. Keys read here: ``domain``, ``path``, ``secure``,
            ``expires``, ``name`` and ``value``; missing keys fall back to
            an empty string, ``"/"``, ``False`` or ``0`` respectively.
        target_path: Destination file. Missing parent directories are
            created and an existing file is overwritten.

    Each cookie becomes one tab-separated line with the fields
    ``domain, include_subdomains, path, secure, expires, name, value``.
    """
    lines = ["# Netscape HTTP Cookie File"]
    for c in cookies:
        domain = c.get("domain", "")
        # A leading dot means the cookie also applies to subdomains.
        flag = "TRUE" if domain.startswith(".") else "FALSE"
        path = c.get("path", "/")
        secure = "TRUE" if c.get("secure") else "FALSE"
        # Session cookies arrive with a non-positive expiry (Playwright uses
        # -1). Netscape-format readers expect a non-negative integer, and a
        # literal "-1" makes them treat the cookie as expired or invalid, so
        # session cookies are written with the conventional value 0.
        expires_at = int(c.get("expires") or 0)
        expires = str(expires_at) if expires_at > 0 else "0"
        name = c.get("name", "")
        value = c.get("value", "")
        lines.append("\t".join([domain, flag, path, secure, expires, name, value]))

    Path(target_path).parent.mkdir(parents=True, exist_ok=True)
    with open(target_path, "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines) + "\n")
|
|
|
|
|
|
def extract_m3u8(video_url: str, profile: str | None, headless: bool, timeout: int = 45, save_cookies: bool = True):
    """Load ``video_url`` in Chromium and collect HLS (m3u8) manifest URLs.

    Every network response of the page is inspected; a response counts as an
    HLS manifest when its URL contains ``.m3u8`` or its ``content-type``
    header names an mpegurl media type.

    Args:
        video_url: Page URL to open.
        profile: User-data directory for a persistent browser context, or
            ``None`` to launch a fresh browser with a clean context.
        headless: Whether to run the browser headless.
        timeout: Navigation timeout in seconds.
        save_cookies: When true, the context cookies are exported to
            ``./data/cookies.txt`` (relative to the current working
            directory) after a successful navigation.

    Returns:
        A dict with the keys ``"m3u8_urls"`` (sorted list of URLs),
        ``"cookies_file"`` (path of the exported cookie file, or ``None``)
        and ``"errors"`` (list of error strings gathered along the way).
    """
    result = {"m3u8_urls": [], "cookies_file": None, "errors": []}
    data_dir = Path.cwd() / "data"
    data_dir.mkdir(exist_ok=True)
    target_cookies = str(data_dir / "cookies.txt")

    # Compared against the lower-cased content-type header value.
    hls_content_types = ("vnd.apple.mpegurl", "application/x-mpegurl")

    with sync_playwright() as p:
        # Chromium is used for better compatibility with Chrome profiles.
        browser_type = p.chromium
        extra_headers = {"Accept-Language": "en-US,en;q=0.9", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"}
        # Sandbox/shm flags help when running inside containers.
        launch_args = ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']

        browser = None
        if profile:
            # A persistent context owns its browser, so there is no separate
            # browser object to close. No user_agent is passed; only the
            # extra headers are set.
            context = browser_type.launch_persistent_context(user_data_dir=profile, headless=headless, extra_http_headers=extra_headers, args=launch_args)
        else:
            browser = browser_type.launch(headless=headless, args=launch_args)
            context = browser.new_context(extra_http_headers=extra_headers)

        # Debug info; failing to print must never abort the extraction.
        try:
            print(f"[playwright] started browser headless={headless} profile={'yes' if profile else 'no'}")
        except Exception:
            pass

        page = context.new_page()
        collected = set()

        def on_response(resp):
            # Best-effort heuristic: a failure while inspecting one response
            # must not break the page load, so errors are ignored here.
            try:
                url = resp.url
                if ".m3u8" in url.lower():
                    collected.add(url)
                    return
                content_type = resp.headers.get("content-type", "").lower()
                if any(kind in content_type for kind in hls_content_types):
                    collected.add(url)
            except Exception:
                pass

        page.on("response", on_response)

        try:
            page.goto(video_url, timeout=timeout * 1000)

            # Give manifest requests up to 6 seconds to fire, stopping early
            # once something is found. page.wait_for_timeout is used instead
            # of time.sleep so that Playwright keeps dispatching "response"
            # events to on_response while waiting.
            for _ in range(6):
                page.wait_for_timeout(1000)
                if collected:
                    break

            # Nothing yet: try to start playback and wait a little longer.
            if not collected:
                try:
                    page.evaluate("() => { const v = document.querySelector('video'); if (v) v.play(); }")
                except Exception:
                    pass
                page.wait_for_timeout(3000)

            if save_cookies:
                try:
                    cookies = context.cookies()
                    write_netscape_cookie_file(cookies, target_cookies)
                    result['cookies_file'] = target_cookies
                except Exception as e:
                    result['errors'].append(f"cookie_export_error:{e}")

        except PWTimeout as e:
            result['errors'].append(f"page_timeout: {e}")
        except Exception:
            import traceback
            result['errors'].append(traceback.format_exc())
        finally:
            # Report whatever was captured, even if navigation timed out or
            # another error interrupted the flow above.
            result['m3u8_urls'] = sorted(collected)
            # Close the context first, then the browser (if one was launched
            # separately); cleanup failures are deliberately ignored.
            for closable in (context, browser):
                if closable is None:
                    continue
                try:
                    closable.close()
                except Exception:
                    pass

    return result
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: parse the options, normalise the video
    # argument to a full URL, run the extraction and print the result as JSON.
    cli = argparse.ArgumentParser(description='Playwright m3u8 extractor for YouTube')
    cli.add_argument(
        '--video',
        required=True,
        help='Video URL or ID (e.g. https://www.youtube.com/watch?v=ID)',
    )
    cli.add_argument(
        '--profile',
        default='',
        help='Path to browser profile (user data dir) to reuse logged session',
    )
    cli.add_argument('--headless', action='store_true', help='Run headless')
    cli.add_argument(
        '--timeout',
        type=int,
        default=45,
        help='Timeout for page load (seconds)',
    )
    cli.add_argument(
        '--no-cookies',
        dest='save_cookies',
        action='store_false',
        help="Don't save cookies to ./data/cookies.txt",
    )
    opts = cli.parse_args()

    # An 11-character value that is not a URL is treated as a bare video ID.
    looks_like_id = len(opts.video) == 11 and not opts.video.startswith('http')
    target_url = f'https://www.youtube.com/watch?v={opts.video}' if looks_like_id else opts.video

    outcome = extract_m3u8(
        target_url,
        profile=opts.profile or None,  # empty string means "no profile"
        headless=opts.headless,
        timeout=opts.timeout,
        save_cookies=opts.save_cookies,
    )
    print(json.dumps(outcome, indent=2, ensure_ascii=False))
|