Add get_video_thumbnails function and update transcript handling to include thumbnail URLs

2026-02-22 22:54:12 -07:00 · 2026-02-22 22:54:12 -07:00 · 344fd5809a
commit 344fd5809a
parent f8924a2965
1 changed files with 139 additions and 31 deletions
--- a/main.py
+++ b/main.py
@@ -161,8 +161,8 @@ def format_segments_text(segments: List[Dict]) -> List[str]:
            return ''
        s = str(t).strip()
        s = re.sub(r'^\s*Kind\s*:\s*.*$', '', s, flags=re.IGNORECASE).strip()
-        # eliminar contenido entre corchetes, patrón seguro para corchetes
-        s = re.sub(r'\[[^\]]*\]', '', s)
+        # eliminar contenido entre corchetes (no-greedy)
+        s = re.sub(r'\[.*?\]', '', s)
        s = re.sub(r'\(.*?\)', '', s)
        s = re.sub(r'<[^>]+>', '', s)
        s = re.sub(r'[♪★■◆►▶◀•–—]', '', s)
@@ -180,10 +180,83 @@ def format_segments_text(segments: List[Dict]) -> List[str]:
    return output


+# Nuevo helper: obtener thumbnails para un video (intenta yt-dlp --dump-json, fallback a URLs estándar)
+def get_video_thumbnails(video_id: str) -> List[str]:
+    """Devuelve una lista de URLs de thumbnail para el video.
+    Primero intenta obtener metadata con yt-dlp y extraer 'thumbnails' o 'thumbnail'.
+    Si falla, construye una lista de URLs por defecto (maxresdefault, sddefault, hqdefault, mqdefault, default).
+    """
+    thumbs: List[str] = []
+    url = f"https://www.youtube.com/watch?v={video_id}"
+
+    cookie_mgr = CookieManager()
+    cookiefile_path = cookie_mgr.get_cookiefile_path()
+    cookies_path = cookiefile_path or os.getenv('API_COOKIES_PATH', DEFAULT_COOKIES_PATH)
+    proxy = os.getenv('API_PROXY', DEFAULT_PROXY) or None
+
+    cmd = [
+        "yt-dlp",
+        "--skip-download",
+        "--dump-json",
+        "--no-warnings",
+        url
+    ]
+    if os.path.exists(cookies_path):
+        cmd.extend(["--cookies", cookies_path])
+    if proxy:
+        cmd.extend(['--proxy', proxy])
+
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+        if proc.returncode == 0 and proc.stdout:
+            try:
+                meta = json.loads(proc.stdout)
+                # thumbnails puede ser lista de dicts con 'url'
+                t = meta.get('thumbnails') or meta.get('thumbnail')
+                if isinstance(t, list):
+                    for item in t:
+                        if isinstance(item, dict) and item.get('url'):
+                            thumbs.append(item.get('url'))
+                        elif isinstance(item, str):
+                            thumbs.append(item)
+                elif isinstance(t, dict) and t.get('url'):
+                    thumbs.append(t.get('url'))
+                elif isinstance(t, str):
+                    thumbs.append(t)
+            except Exception:
+                pass
+    except Exception:
+        pass
+    finally:
+        try:
+            cookie_mgr.cleanup()
+        except Exception:
+            pass
+
+    # Si no obtuvimos thumbnails desde metadata, construir URLs estándar
+    if not thumbs:
+        thumbs = [
+            f"https://i.ytimg.com/vi/{video_id}/maxresdefault.jpg",
+            f"https://i.ytimg.com/vi/{video_id}/sddefault.jpg",
+            f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg",
+            f"https://i.ytimg.com/vi/{video_id}/mqdefault.jpg",
+            f"https://i.ytimg.com/vi/{video_id}/default.jpg",
+        ]
+
+    # deduplicate while preserving order
+    seen = set()
+    unique_thumbs = []
+    for t in thumbs:
+        if t and t not in seen:
+            seen.add(t)
+            unique_thumbs.append(t)
+
+    return unique_thumbs
+
 def get_transcript_data(video_id: str, lang: str = "es"):
    video_id = extract_video_id(video_id)
    if not video_id:
-        return None, "video_id inválido o vacío"
+        return None, [], "video_id inválido o vacío"

    url = f"https://www.youtube.com/watch?v={video_id}"

@@ -262,9 +335,9 @@ def get_transcript_data(video_id: str, lang: str = "es"):
                # Si yt-dlp falló por rate limiting, devolver mensaje claro
                stderr = (result.stderr or "").lower()
                if result.returncode != 0 and ('http error 429' in stderr or 'too many requests' in stderr):
-                    return None, "YouTube está limitando las peticiones al descargar subtítulos (HTTP 429). Agrega un cookies.txt válido exportado desde tu navegador y monta en el contenedor, o espera unos minutos."
+                    return None, get_video_thumbnails(video_id), "YouTube está limitando las peticiones al descargar subtítulos (HTTP 429). Agrega un cookies.txt válido exportado desde tu navegador y monta en el contenedor, o espera unos minutos."
                if result.returncode != 0 and ('http error 403' in stderr or 'forbidden' in stderr):
-                    return None, "Acceso denegado al descargar subtítulos (HTTP 403). El video puede tener restricciones. Usa cookies.txt con una cuenta autorizada."
+                    return None, get_video_thumbnails(video_id), "Acceso denegado al descargar subtítulos (HTTP 403). El video puede tener restricciones. Usa cookies.txt con una cuenta autorizada."
            except subprocess.TimeoutExpired:
                pass

@@ -282,7 +355,7 @@ def get_transcript_data(video_id: str, lang: str = "es"):
                    vtt_combined = "\n".join(combined)
                    parsed = parse_subtitle_format(vtt_combined, 'vtt')
                    if parsed:
-                        return parsed, None
+                        return parsed, get_video_thumbnails(video_id), None
    finally:
        # cleanup any temp cookiefile created for this request
        try:
@@ -325,7 +398,7 @@ def get_transcript_data(video_id: str, lang: str = "es"):
    except subprocess.TimeoutExpired:
        video_metadata = None
    except FileNotFoundError:
-        return None, "yt-dlp no está instalado en el contenedor/entorno. Instala yt-dlp y vuelve a intentar."
+        return None, get_video_thumbnails(video_id), "yt-dlp no está instalado en el contenedor/entorno. Instala yt-dlp y vuelve a intentar."
    except Exception as e:
        video_metadata = None

@@ -380,19 +453,19 @@ def get_transcript_data(video_id: str, lang: str = "es"):
                            # salir del loop y usar fallback con yt-dlp más abajo
                            break
                    elif response.status_code == 403:
-                        return None, "Acceso denegado (HTTP 403). El video puede tener restricciones de edad o región. Intenta con cookies.txt."
+                        return None, get_video_thumbnails(video_id), "Acceso denegado (HTTP 403). El video puede tener restricciones de edad o región. Intenta con cookies.txt."
                    elif response.status_code == 404:
                        # No encontramos la URL esperada; intentar fallback
                        response = None
                        break
                    else:
-                        return None, f"Error al descargar subtítulos desde YouTube (HTTP {response.status_code})."
+                        return None, get_video_thumbnails(video_id), f"Error al descargar subtítulos desde YouTube (HTTP {response.status_code})."
                except requests.exceptions.Timeout:
                    if attempt < max_retries - 1:
                        continue
-                    return None, "Timeout al descargar subtítulos. Intenta nuevamente."
+                    return None, get_video_thumbnails(video_id), "Timeout al descargar subtítulos. Intenta nuevamente."
                except requests.exceptions.RequestException as e:
-                    return None, f"Error de conexión al descargar subtítulos: {str(e)[:100]}"
+                    return None, get_video_thumbnails(video_id), f"Error de conexión al descargar subtítulos: {str(e)[:100]}"

            # Si obtuvimos un 200, procesarlo; si hubo rate limiting, intentar fallback con yt-dlp
            if response and response.status_code == 200:
@@ -443,9 +516,9 @@ def get_transcript_data(video_id: str, lang: str = "es"):
                                        stderr2 = (res2.stderr or "").lower()
                                        if res2.returncode != 0 and ('http error 429' in stderr2 or 'too many requests' in stderr2):
                                            # rate limit cuando intentamos descargar timedtext
-                                            return None, "YouTube está limitando las peticiones al descargar subtítulos (HTTP 429). Agrega cookies.txt válido o intenta más tarde."
+                                            return None, get_video_thumbnails(video_id), "YouTube está limitando las peticiones al descargar subtítulos (HTTP 429). Agrega cookies.txt válido o intenta más tarde."
                                        if res2.returncode != 0 and ('http error 403' in stderr2 or 'forbidden' in stderr2):
-                                            return None, "Acceso denegado al descargar subtítulos (HTTP 403). Intenta con cookies.txt o una cuenta con permisos."
+                                            return None, get_video_thumbnails(video_id), "Acceso denegado al descargar subtítulos (HTTP 403). Intenta con cookies.txt o una cuenta con permisos."
                                    except Exception:
                                        pass

@@ -465,7 +538,7 @@ def get_transcript_data(video_id: str, lang: str = "es"):
                            vtt_combined = "\n".join(combined)
                            formatted_transcript = parse_subtitle_format(vtt_combined, 'vtt')
                            if formatted_transcript:
-                                return formatted_transcript, None
+                                return formatted_transcript, get_video_thumbnails(video_id), None

                    try:
                        subtitle_data = response.json()
@@ -473,12 +546,12 @@ def get_transcript_data(video_id: str, lang: str = "es"):
                    except json.JSONDecodeError:
                        formatted_transcript = parse_subtitle_format(response.text, subtitle_format)
                except Exception as e:
-                    return None, f"Error al procesar los subtítulos: {str(e)[:200]}"
+                    return None, get_video_thumbnails(video_id), f"Error al procesar los subtítulos: {str(e)[:200]}"

                if not formatted_transcript:
-                    return None, "Los subtítulos están vacíos o no se pudieron procesar."
+                    return None, get_video_thumbnails(video_id), "Los subtítulos están vacíos o no se pudieron procesar."

-                return formatted_transcript, None
+                return formatted_transcript, get_video_thumbnails(video_id), None
            # Si hubo rate limiting, intentar fallback con yt-dlp para descargar la URL de subtítulos
            if rate_limited and (not response or response.status_code != 200):
                 # Intentar descargar la URL de subtítulos directamente con yt-dlp (usa cookies si existen)
@@ -499,7 +572,7 @@ def get_transcript_data(video_id: str, lang: str = "es"):
                         res = subprocess.run(ytdlp_cmd, capture_output=True, text=True, timeout=90)
                         stderr = (res.stderr or "").lower()
                         if res.returncode != 0 and ('http error 429' in stderr or 'too many requests' in stderr):
-                             return None, "YouTube está limitando las peticiones al descargar subtítulos (HTTP 429). Agrega cookies.txt válido o intenta más tarde."
+                             return None, get_video_thumbnails(video_id), "YouTube está limitando las peticiones al descargar subtítulos (HTTP 429). Agrega cookies.txt válido o intenta más tarde."
                         # Leer archivos generados
                         combined = []
                         for fpath in glob.glob(os.path.join(tdir, "*.*")):
@@ -514,9 +587,9 @@ def get_transcript_data(video_id: str, lang: str = "es"):
                             vtt_combined = "\n".join(combined)
                             formatted_transcript = parse_subtitle_format(vtt_combined, 'vtt')
                             if formatted_transcript:
-                                 return formatted_transcript, None
+                                 return formatted_transcript, get_video_thumbnails(video_id), None
                 except FileNotFoundError:
-                     return None, "yt-dlp no está instalado en el contenedor/entorno. Instala yt-dlp y vuelve a intentar."
+                     return None, get_video_thumbnails(video_id), "yt-dlp no está instalado en el contenedor/entorno. Instala yt-dlp y vuelve a intentar."
                 except Exception:
                     # seguir con otros fallbacks
                     pass
@@ -567,22 +640,22 @@ def get_transcript_data(video_id: str, lang: str = "es"):
                    with open(downloaded, 'r', encoding='utf-8') as fh:
                        content = fh.read()
                except Exception as e:
-                    return None, f"Error leyendo archivo de subtítulos descargado: {str(e)[:200]}"
+                    return None, get_video_thumbnails(video_id), f"Error leyendo archivo de subtítulos descargado: {str(e)[:200]}"

                # Intentar parsear según extensión conocida
                fmt = 'json3' if ext in ('json', 'json3') else ('vtt' if ext == 'vtt' else 'srv3')
                formatted_transcript = parse_subtitle_format(content, fmt)
                if formatted_transcript:
-                    return formatted_transcript, None
+                    return formatted_transcript, get_video_thumbnails(video_id), None
                else:
-                    return None, "Se descargaron subtítulos pero no se pudieron procesar."
+                    return None, get_video_thumbnails(video_id), "Se descargaron subtítulos pero no se pudieron procesar."
    except FileNotFoundError:
-        return None, "yt-dlp no está instalado. Instala yt-dlp en el contenedor/entorno y vuelve a intentar."
+        return None, get_video_thumbnails(video_id), "yt-dlp no está instalado. Instala yt-dlp en el contenedor/entorno y vuelve a intentar."
    except Exception as e:
        # No hacer crash, retornar mensaje general
-        return None, f"Error al intentar descargar subtítulos con yt-dlp: {str(e)[:200]}"
+        return None, get_video_thumbnails(video_id), f"Error al intentar descargar subtítulos con yt-dlp: {str(e)[:200]}"

-    return None, "No se encontraron subtítulos para este video (o el video no tiene subtítulos disponibles). Intenta con otro video en vivo o agrega cookies.txt si hay restricciones."
+    return None, get_video_thumbnails(video_id), "No se encontraron subtítulos para este video (o el video no tiene subtítulos disponibles). Intenta con otro video en vivo o agrega cookies.txt si hay restricciones."

 def get_stream_url(video_id: str):
    """
@@ -664,8 +737,8 @@ def get_stream_url(video_id: str):

 @app.get("/transcript/{video_id}")
 def transcript_endpoint(video_id: str, lang: str = "es"):
-    data, error = get_transcript_data(video_id, lang)
-    
+    data, thumbnails, error = get_transcript_data(video_id, lang)
+
    if error:
        raise HTTPException(status_code=400, detail=error)
    
@@ -688,7 +761,34 @@ def transcript_endpoint(video_id: str, lang: str = "es"):
        "count": len(data),
        "segments": data,
        "text": combined_text,
-        "format_text": format_text
+        "format_text": format_text,
+        "thumbnails": thumbnails
+    }
+
+@app.get('/transcript_vtt/{video_id}')
+def transcript_vtt(video_id: str, lang: str = 'es'):
+    """Descarga (con yt-dlp) y devuelve subtítulos en VTT, además de segmentos parseados y texto concatenado."""
+    vtt_text, error = fetch_vtt_subtitles(video_id, lang)
+    if error:
+        raise HTTPException(status_code=400, detail=error)
+
+    # parsear VTT a segmentos usando parse_subtitle_format
+    segments = parse_subtitle_format(vtt_text, 'vtt') if vtt_text else []
+
+    combined_text = '\n'.join([s.get('text','') for s in segments])
+    # format_text con texto limpio listo para procesamiento por agentes
+    format_text = format_segments_text(segments)
+
+    thumbnails = get_video_thumbnails(video_id)
+
+    return {
+        'video_id': video_id,
+        'vtt': vtt_text,
+        'count': len(segments),
+        'segments': segments,
+        'text': combined_text,
+        'format_text': format_text,
+        'thumbnails': thumbnails
    }

 @app.get("/stream/{video_id}")
@@ -707,7 +807,9 @@ def stream_endpoint(video_id: str):
    
    if error:
        raise HTTPException(status_code=400, detail=error)
-    
+
+    thumbnails = get_video_thumbnails(video_id)
+
    # Determinar el tipo de URL obtenida
    url_type = "unknown"
    if "m3u8" in stream_url.lower():
@@ -721,6 +823,7 @@ def stream_endpoint(video_id: str):
        "url_type": url_type,
        "youtube_url": f"https://www.youtube.com/watch?v={video_id}",
        "ffmpeg_example": f'ffmpeg -re -i "{stream_url}" -c copy -f flv rtmp://destino/stream_key',
+        "thumbnails": thumbnails,
        "usage": {
            "description": "Usa stream_url con FFmpeg para retransmitir",
            "command_template": "ffmpeg -re -i \"{stream_url}\" -c copy -f flv {rtmp_url}/{stream_key}",
@@ -1034,13 +1137,16 @@ def transcript_vtt(video_id: str, lang: str = 'es'):
    # format_text con texto limpio listo para procesamiento por agentes
    format_text = format_segments_text(segments)

+    thumbnails = get_video_thumbnails(video_id)
+
    return {
        'video_id': video_id,
        'vtt': vtt_text,
        'count': len(segments),
        'segments': segments,
        'text': combined_text,
-        'format_text': format_text
+        'format_text': format_text,
+        'thumbnails': thumbnails
    }

 @app.post('/upload_vtt/{video_id}')
@@ -1119,6 +1225,8 @@ def transcript_alt(video_id: str, lang: str = 'es'):
    combined_text = '\n'.join([s['text'] for s in segments if s.get('text')])
    format_text = format_segments_text(segments)

+    thumbnails = get_video_thumbnails(vid)
+
    return {
        'video_id': vid,
        'count': len(segments),