# TubeScript-API/tools/playwright_extract_m3u8.py

#!/usr/bin/env python3
"""playwright_extract_m3u8.py

Abre una página de YouTube con Playwright y captura la primera URL m3u8/HLS
visible en las peticiones de red. También puede exportar cookies al formato
Netscape para usarlas con yt-dlp/tu API.

Uso:
  python3 tools/playwright_extract_m3u8.py --video https://www.youtube.com/watch?v=ID [--profile /path/to/profile] [--headless]

Requisitos (host):
  pip install playwright
  python -m playwright install

Notas:
 - Recomiendo ejecutarlo en el host (no en el contenedor) para usar el perfil de Chrome
   y para que Playwright pueda manejar la interfaz gráfica si necesitas login/manual.
 - Si pasas --profile, se lanzará una sesión persistente usando ese directorio (útil
   para usar tu sesión de Chrome ya logueada). Si dejas vacío, se usa un contexto limpio.
"""
import argparse
import os
import json
import time
from pathlib import Path

try:
    from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
except Exception as e:
    print("playwright no está instalado. Instala con: pip install playwright && python -m playwright install")
    raise


def write_netscape_cookie_file(cookies, target_path):
    """Write cookies to ``target_path`` in the Netscape cookie-file format.

    Args:
        cookies: iterable of dicts with the keys this function reads:
            ``domain``, ``path``, ``secure``, ``expires``, ``name`` and
            ``value`` (the shape Playwright's ``context.cookies()`` returns).
            Missing keys fall back to empty / default values.
        target_path: destination file; missing parent directories are created.

    Each cookie becomes one tab-separated line:
    ``domain, include-subdomains flag, path, secure, expiry, name, value``.
    """
    lines = ["# Netscape HTTP Cookie File"]
    for c in cookies:
        domain = c.get("domain", "")
        # A leading dot means the cookie also applies to subdomains.
        flag = "TRUE" if domain.startswith('.') else "FALSE"
        path = c.get("path", "/")
        secure = "TRUE" if c.get("secure") else "FALSE"
        # Session cookies can arrive with a negative expiry (e.g. -1). A
        # negative number in this column is rejected by strict cookie-file
        # parsers, so anything missing, unparsable or negative is written as 0.
        try:
            expires_at = int(c.get("expires") or 0)
        except (TypeError, ValueError, OverflowError):
            expires_at = 0
        expires = str(max(expires_at, 0))
        name = c.get("name", "")
        value = c.get("value", "")
        lines.append("\t".join([domain, flag, path, secure, expires, name, value]))
    Path(target_path).parent.mkdir(parents=True, exist_ok=True)
    with open(target_path, "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines) + "\n")


def _has_hls_content_type(content_type: str) -> bool:
    """Return True when a Content-Type header value names an HLS playlist."""
    lowered = content_type.lower()
    # "application/vnd.apple.mpegurl" is covered by the shorter marker.
    return "vnd.apple.mpegurl" in lowered or "application/x-mpegurl" in lowered


def extract_m3u8(video_url: str, profile: str | None, headless: bool, timeout: int = 45, save_cookies: bool = True):
    """Open ``video_url`` in Chromium and collect HLS (m3u8) URLs from network responses.

    Args:
        video_url: page to navigate to.
        profile: user-data directory for a persistent browser context; when
            falsy a fresh, non-persistent context is used instead.
        headless: whether to launch the browser headless.
        timeout: navigation timeout in seconds (passed to ``page.goto`` in ms).
        save_cookies: when True, export the context's cookies to
            ``./data/cookies.txt`` (relative to the current working directory)
            in Netscape format.

    Returns:
        dict with ``m3u8_urls`` (sorted list of unique URLs), ``cookies_file``
        (path of the exported file, or None) and ``errors`` (list of strings).
        Navigation and extraction failures are recorded in ``errors`` rather
        than raised; URLs captured before a failure are still returned.

    Raises:
        Whatever Playwright raises while launching the browser or creating the
        context/page: those steps run outside the error-collecting ``try``.
    """
    result = {"m3u8_urls": [], "cookies_file": None, "errors": []}
    data_dir = Path.cwd() / "data"
    target_cookies = str(data_dir / "cookies.txt")
    if save_cookies:
        # Only touch the filesystem when a cookie file will actually be written.
        data_dir.mkdir(exist_ok=True)

    with sync_playwright() as p:
        # Chromium gives the best compatibility with an existing Chrome profile.
        browser_type = p.chromium
        # No user_agent is passed (some Playwright builds lack that parameter
        # on these calls); only extra headers are set.
        extra_headers = {"Accept-Language": "en-US,en;q=0.9", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"}

        # Common flags that help when running inside containers.
        launch_args = ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
        browser = None
        if profile:
            # A persistent context reuses the given user-data directory.
            context = browser_type.launch_persistent_context(user_data_dir=profile, headless=headless, extra_http_headers=extra_headers, args=launch_args)
        else:
            browser = browser_type.launch(headless=headless, args=launch_args)
            context = browser.new_context(extra_http_headers=extra_headers)

        # Debug output is best effort; a broken stdout must not abort the run.
        try:
            print(f"[playwright] started browser headless={headless} profile={'yes' if profile else 'no'}")
        except OSError:
            pass

        page = context.new_page()

        collected = set()

        def on_response(resp):
            # Heuristic: ".m3u8" in the URL, or an HLS content-type header.
            # The handler must never raise into Playwright's event dispatch,
            # so any failure while inspecting a response is ignored.
            try:
                url = resp.url
                if ".m3u8" in url.lower() or _has_hls_content_type(resp.headers.get("content-type", "")):
                    collected.add(url)
            except Exception:
                pass

        page.on("response", on_response)

        try:
            page.goto(video_url, timeout=timeout * 1000)
            # Give manifest requests a few seconds to fire. wait_for_timeout is
            # used instead of time.sleep so Playwright keeps dispatching
            # response events (and thus calling on_response) while waiting.
            wait_seconds = 6
            for _ in range(wait_seconds):
                page.wait_for_timeout(1000)
                # Stop early as soon as something was captured.
                if collected:
                    break

            # Nothing yet: try to start playback and wait a little longer.
            if not collected:
                try:
                    page.evaluate("() => { const v = document.querySelector('video'); if (v) v.play(); }")
                except Exception:
                    # Starting playback is optional; carry on without it.
                    pass
                page.wait_for_timeout(3000)

            # Export cookies when requested; failures are reported, not raised.
            if save_cookies:
                try:
                    cookies = context.cookies()
                    write_netscape_cookie_file(cookies, target_cookies)
                    result['cookies_file'] = target_cookies
                except Exception as e:
                    result['errors'].append(f"cookie_export_error:{e}")

        except PWTimeout as e:
            result['errors'].append(f"page_timeout: {e}")
        except Exception:
            import traceback
            result['errors'].append(traceback.format_exc())
        finally:
            # Keep whatever was captured, even if navigation timed out or failed.
            result['m3u8_urls'] = sorted(collected)
            # Best-effort shutdown: close the context first, then the browser
            # (which only exists in the non-persistent case).
            for handle in (context, browser):
                if handle is None:
                    continue
                try:
                    handle.close()
                except Exception:
                    pass

    return result


if __name__ == '__main__':
    # Command-line entry point: parse options, run the extraction and print
    # the result dict as JSON on stdout.
    cli = argparse.ArgumentParser(description='Playwright m3u8 extractor for YouTube')
    cli.add_argument(
        '--video',
        required=True,
        help='Video URL or ID (e.g. https://www.youtube.com/watch?v=ID)',
    )
    cli.add_argument(
        '--profile',
        default='',
        help='Path to browser profile (user data dir) to reuse logged session',
    )
    cli.add_argument('--headless', action='store_true', help='Run headless')
    cli.add_argument(
        '--timeout',
        type=int,
        default=45,
        help='Timeout for page load (seconds)',
    )
    cli.add_argument(
        '--no-cookies',
        dest='save_cookies',
        action='store_false',
        help="Don't save cookies to ./data/cookies.txt",
    )
    opts = cli.parse_args()

    # An 11-character value that is not a URL is treated as a bare video ID
    # and expanded into a full watch URL.
    target_url = opts.video
    looks_like_bare_id = len(target_url) == 11 and not target_url.startswith('http')
    if looks_like_bare_id:
        target_url = 'https://www.youtube.com/watch?v=' + target_url

    # An empty --profile means "no persistent profile".
    profile_dir = opts.profile if opts.profile else None
    outcome = extract_m3u8(
        target_url,
        profile=profile_dir,
        headless=opts.headless,
        timeout=opts.timeout,
        save_cookies=opts.save_cookies,
    )
    print(json.dumps(outcome, indent=2, ensure_ascii=False))