From f27d22eb81fdadc96c16627e048b0451133c2037 Mon Sep 17 00:00:00 2001 From: Kevin Watt Date: Fri, 30 May 2025 12:03:04 +0800 Subject: [PATCH] Revert "Revert "feat: add transcript download functionality"" --- README.md | 8 +++++ src/__tests__/subtitle.test.ts | 59 ++++++++++++++++++++++++++++++- src/index.mts | 19 +++++++++- src/modules/subtitle.ts | 63 +++++++++++++++++++++++++++++++++- src/modules/utils.ts | 35 +++++++++++++++++++ 5 files changed, 181 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f7230cb..a465b20 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,12 @@ pip install yt-dlp * Inputs: * `url` (string, required): URL of the video +* **download_transcript** + * Download and clean video subtitles to produce a plain text transcript without timestamps or formatting + * Inputs: + * `url` (string, required): URL of the video + * `language` (string, optional): Language code (e.g., 'en', 'zh-Hant', 'ja'). Defaults to 'en' + ## Usage Examples Ask your LLM to: @@ -80,6 +86,8 @@ Ask your LLM to: "Download Chinese subtitles from this video: https://youtube.com/watch?v=..." "Download this video in 1080p: https://youtube.com/watch?v=..." "Download audio from this YouTube video: https://youtube.com/watch?v=..." +"Get a clean transcript of this video: https://youtube.com/watch?v=..." +"Download Spanish transcript from this video: https://youtube.com/watch?v=..." 
``` ## Manual Start diff --git a/src/__tests__/subtitle.test.ts b/src/__tests__/subtitle.test.ts index 19e360b..1e55ea2 100644 --- a/src/__tests__/subtitle.test.ts +++ b/src/__tests__/subtitle.test.ts @@ -3,7 +3,8 @@ import { describe, test, expect } from '@jest/globals'; import * as os from 'os'; import * as path from 'path'; -import { listSubtitles, downloadSubtitles } from '../modules/subtitle.js'; +import { listSubtitles, downloadSubtitles, downloadTranscript } from '../modules/subtitle.js'; +import { cleanSubtitleToTranscript } from '../modules/utils.js'; import { CONFIG } from '../config.js'; import * as fs from 'fs'; @@ -51,4 +52,60 @@ describe('Subtitle Functions', () => { .toThrow(); }); }); + + describe('downloadTranscript', () => { + test('downloads and cleans transcript successfully', async () => { + const result = await downloadTranscript(testUrl, 'en', testConfig); + expect(typeof result).toBe('string'); + expect(result.length).toBeGreaterThan(0); + expect(result).not.toContain('WEBVTT'); + expect(result).not.toContain('-->'); + expect(result).not.toMatch(/^\d+$/m); + }, 30000); + + test('handles invalid URL', async () => { + await expect(downloadTranscript('invalid-url', 'en', testConfig)) + .rejects + .toThrow(); + }); + }); + + describe('cleanSubtitleToTranscript', () => { + test('cleans SRT content correctly', () => { + const srtContent = `1 +00:00:01,000 --> 00:00:03,000 +Hello world + +2 +00:00:04,000 --> 00:00:06,000 +This is a test + +3 +00:00:07,000 --> 00:00:09,000 +Bold text here`; + + const result = cleanSubtitleToTranscript(srtContent); + expect(result).toBe('Hello world This is a test Bold text here'); + }); + + test('handles empty content', () => { + const result = cleanSubtitleToTranscript(''); + expect(result).toBe(''); + }); + + test('removes timestamps and sequence numbers', () => { + const srtContent = `1 +00:00:01,000 --> 00:00:03,000 +First line + +2 +00:00:04,000 --> 00:00:06,000 +Second line`; + + const result = 
cleanSubtitleToTranscript(srtContent); + expect(result).not.toContain('00:00'); + expect(result).not.toMatch(/^\d+$/); + expect(result).toBe('First line Second line'); + }); + }); }); \ No newline at end of file diff --git a/src/index.mts b/src/index.mts index 2310dc1..6dbe2c6 100644 --- a/src/index.mts +++ b/src/index.mts @@ -15,7 +15,7 @@ import { CONFIG } from "./config.js"; import { _spawnPromise, safeCleanup } from "./modules/utils.js"; import { downloadVideo } from "./modules/video.js"; import { downloadAudio } from "./modules/audio.js"; -import { listSubtitles, downloadSubtitles } from "./modules/subtitle.js"; +import { listSubtitles, downloadSubtitles, downloadTranscript } from "./modules/subtitle.js"; const VERSION = '0.6.26'; @@ -148,6 +148,18 @@ server.setRequestHandler(ListToolsRequestSchema, async () => { required: ["url"], }, }, + { + name: "download_transcript", + description: "Download and clean video subtitles to produce a plain text transcript without timestamps or formatting.", + inputSchema: { + type: "object", + properties: { + url: { type: "string", description: "URL of the video" }, + language: { type: "string", description: "Language code (e.g., 'en', 'zh-Hant', 'ja'). 
Defaults to 'en'" }, + }, + required: ["url"], + }, + }, ], }; }); @@ -211,6 +223,11 @@ server.setRequestHandler( () => downloadAudio(args.url, CONFIG), "Error downloading audio" ); + } else if (toolName === "download_transcript") { + return handleToolExecution( + () => downloadTranscript(args.url, args.language || CONFIG.download.defaultSubtitleLanguage, CONFIG), + "Error downloading transcript" + ); } else { return { content: [{ type: "text", text: `Unknown tool: ${toolName}` }], diff --git a/src/modules/subtitle.ts b/src/modules/subtitle.ts index 206378b..4a71cb8 100644 --- a/src/modules/subtitle.ts +++ b/src/modules/subtitle.ts @@ -2,7 +2,7 @@ import * as fs from "fs"; import * as path from "path"; import * as os from "os"; import type { Config } from '../config.js'; -import { _spawnPromise, validateUrl } from "./utils.js"; +import { _spawnPromise, validateUrl, cleanSubtitleToTranscript } from "./utils.js"; /** * Lists all available subtitles for a video. @@ -105,4 +105,65 @@ export async function downloadSubtitles( } finally { fs.rmSync(tempDir, { recursive: true, force: true }); } +} + +/** + * Downloads and cleans subtitles to produce a plain text transcript. 
/**
 * Downloads and cleans subtitles to produce a plain text transcript.
 *
 * Runs `yt-dlp` with `--skip-download` so only subtitle files are written,
 * into a freshly created temporary directory. Every resulting `.srt` file is
 * read, passed through `cleanSubtitleToTranscript`, and the pieces are joined
 * with single spaces. The temporary directory is removed whether the call
 * succeeds or throws.
 *
 * @param url - The URL of the video
 * @param language - Language code (e.g., 'en', 'zh-Hant', 'ja')
 * @param config - Configuration object; `config.file.tempDirPrefix` names the temp directory
 * @returns Promise resolving to the cleaned transcript text
 * @throws {Error} When the URL fails validation, when no `.srt` file was produced,
 *   or when the `yt-dlp` invocation rejects
 *
 * @example
 * ```typescript
 * try {
 *   const transcript = await downloadTranscript('https://youtube.com/watch?v=...', 'en', config);
 *   console.log('Transcript:', transcript);
 * } catch (error) {
 *   console.error('Failed to download transcript:', error);
 * }
 * ```
 */
export async function downloadTranscript(
  url: string,
  language: string,
  config: Config
): Promise<string> {
  if (!validateUrl(url)) {
    throw new Error('Invalid or unsupported URL format');
  }

  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), config.file.tempDirPrefix));

  try {
    await _spawnPromise('yt-dlp', [
      '--skip-download',
      '--write-subs',
      '--write-auto-subs',
      '--sub-lang', language,
      '--sub-format', 'ttml',
      '--convert-subs', 'srt',
      '--output', path.join(tempDir, 'transcript.%(ext)s'),
      url
    ]);

    // Directory listing order is not guaranteed, so sort to keep the
    // concatenated transcript stable when more than one file is produced.
    const srtFiles = fs.readdirSync(tempDir)
      .filter(file => file.endsWith('.srt'))
      .sort();

    if (srtFiles.length === 0) {
      throw new Error('No subtitle files found for transcript generation');
    }

    return srtFiles
      .map(file => cleanSubtitleToTranscript(fs.readFileSync(path.join(tempDir, file), 'utf8')))
      .join(' ')
      .trim();
  } finally {
    // Always discard the scratch directory, including on failure.
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}
/**
 * Cleans SRT subtitle content to produce a plain text transcript.
 *
 * Drops blank lines, cue timestamp lines, and cue sequence numbers, strips
 * HTML-style tags, and joins the remaining text with single spaces.
 *
 * A digits-only line is treated as a sequence number only when the line
 * right after it is a timestamp line; a caption whose text happens to be
 * purely numeric (e.g. "2023") is kept.
 *
 * @param srtContent - Raw SRT subtitle content (LF or CRLF line endings)
 * @returns Cleaned transcript text; an empty string for empty input
 *
 * @example
 * ```typescript
 * const cleanedText = cleanSubtitleToTranscript(srtContent);
 * console.log(cleanedText); // 'Hello world this is a transcript...'
 * ```
 */
export function cleanSubtitleToTranscript(srtContent: string): string {
  // "HH:MM:SS,mmm --> HH:MM:SS,mmm" ('.' also accepted as the millisecond separator).
  const timestampLine =
    /^\d{2,}:\d{2}:\d{2}[.,]\d{3}\s*-->\s*\d{2,}:\d{2}:\d{2}[.,]\d{3}$/;
  const digitsOnly = /^\d+$/;
  const htmlTag = /<[^>]*>/g;

  // Trimming up front also removes the '\r' left behind by CRLF input.
  const lines = srtContent.split('\n').map(line => line.trim());

  const textLines: string[] = [];
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    if (!line) continue;
    if (timestampLine.test(line)) continue;
    // Only a number that introduces a cue is a sequence number; numeric
    // caption text is followed by a blank line or more text instead.
    if (digitsOnly.test(line) && timestampLine.test(lines[i + 1] ?? '')) continue;
    textLines.push(line.replace(htmlTag, ''));
  }

  return textLines.join(' ').replace(/\s+/g, ' ').trim();
}