From c79766c241f6a7276c44242f4dfcabe5e05b04d3 Mon Sep 17 00:00:00 2001 From: Kevin Watt Date: Fri, 30 May 2025 11:57:52 +0800 Subject: [PATCH] Revert "feat: add transcript download functionality" --- README.md | 8 ----- src/__tests__/subtitle.test.ts | 59 +------------------------------ src/index.mts | 19 +--------- src/modules/subtitle.ts | 63 +--------------------------------- src/modules/utils.ts | 35 ------------------- 5 files changed, 3 insertions(+), 181 deletions(-) diff --git a/README.md b/README.md index a465b20..f7230cb 100644 --- a/README.md +++ b/README.md @@ -71,12 +71,6 @@ pip install yt-dlp * Inputs: * `url` (string, required): URL of the video -* **download_transcript** - * Download and clean video subtitles to produce a plain text transcript without timestamps or formatting - * Inputs: - * `url` (string, required): URL of the video - * `language` (string, optional): Language code (e.g., 'en', 'zh-Hant', 'ja'). Defaults to 'en' - ## Usage Examples Ask your LLM to: @@ -86,8 +80,6 @@ Ask your LLM to: "Download Chinese subtitles from this video: https://youtube.com/watch?v=..." "Download this video in 1080p: https://youtube.com/watch?v=..." "Download audio from this YouTube video: https://youtube.com/watch?v=..." -"Get a clean transcript of this video: https://youtube.com/watch?v=..." -"Download Spanish transcript from this video: https://youtube.com/watch?v=..." ``` ## Manual Start diff --git a/src/__tests__/subtitle.test.ts b/src/__tests__/subtitle.test.ts index 1e55ea2..19e360b 100644 --- a/src/__tests__/subtitle.test.ts +++ b/src/__tests__/subtitle.test.ts @@ -3,8 +3,7 @@ import { describe, test, expect } from '@jest/globals'; import * as os from 'os'; import * as path from 'path'; -import { listSubtitles, downloadSubtitles, downloadTranscript } from '../modules/subtitle.js'; -import { cleanSubtitleToTranscript } from '../modules/utils.js'; +import { listSubtitles, downloadSubtitles } from '../modules/subtitle.js'; import { CONFIG } from '../config.js'; import * as fs from 'fs'; @@ -52,60 +51,4 @@ describe('Subtitle Functions', () => { .toThrow(); }); }); - - describe('downloadTranscript', () => { - test('downloads and cleans transcript successfully', async () => { - const result = await downloadTranscript(testUrl, 'en', testConfig); - expect(typeof result).toBe('string'); - expect(result.length).toBeGreaterThan(0); - expect(result).not.toContain('WEBVTT'); - expect(result).not.toContain('-->'); - expect(result).not.toMatch(/^\d+$/m); - }, 30000); - - test('handles invalid URL', async () => { - await expect(downloadTranscript('invalid-url', 'en', testConfig)) - .rejects - .toThrow(); - }); - }); - - describe('cleanSubtitleToTranscript', () => { - test('cleans SRT content correctly', () => { - const srtContent = `1 -00:00:01,000 --> 00:00:03,000 -Hello world - -2 -00:00:04,000 --> 00:00:06,000 -This is a test - -3 -00:00:07,000 --> 00:00:09,000 -Bold text here`; - - const result = cleanSubtitleToTranscript(srtContent); - expect(result).toBe('Hello world This is a test Bold text here'); - }); - - test('handles empty content', () => { - const result = cleanSubtitleToTranscript(''); - expect(result).toBe(''); - }); - - test('removes timestamps and sequence numbers', () => { - const srtContent = `1 -00:00:01,000 --> 00:00:03,000 -First line - -2 -00:00:04,000 --> 00:00:06,000 -Second line`; - - const result = cleanSubtitleToTranscript(srtContent); - expect(result).not.toContain('00:00'); - expect(result).not.toMatch(/^\d+$/); - expect(result).toBe('First line Second line'); - }); - }); }); \ No newline at end of file diff --git a/src/index.mts b/src/index.mts index 6dbe2c6..2310dc1 100644 --- a/src/index.mts +++ b/src/index.mts @@ -15,7 +15,7 @@ import { CONFIG } from "./config.js"; import { _spawnPromise, safeCleanup } from "./modules/utils.js"; import { downloadVideo } from "./modules/video.js"; import { downloadAudio } from "./modules/audio.js"; -import { listSubtitles, downloadSubtitles, downloadTranscript } from "./modules/subtitle.js"; +import { listSubtitles, downloadSubtitles } from "./modules/subtitle.js"; const VERSION = '0.6.26'; @@ -148,18 +148,6 @@ server.setRequestHandler(ListToolsRequestSchema, async () => { required: ["url"], }, }, - { - name: "download_transcript", - description: "Download and clean video subtitles to produce a plain text transcript without timestamps or formatting.", - inputSchema: { - type: "object", - properties: { - url: { type: "string", description: "URL of the video" }, - language: { type: "string", description: "Language code (e.g., 'en', 'zh-Hant', 'ja'). Defaults to 'en'" }, - }, - required: ["url"], - }, - }, ], }; }); @@ -223,11 +211,6 @@ server.setRequestHandler( () => downloadAudio(args.url, CONFIG), "Error downloading audio" ); - } else if (toolName === "download_transcript") { - return handleToolExecution( - () => downloadTranscript(args.url, args.language || CONFIG.download.defaultSubtitleLanguage, CONFIG), - "Error downloading transcript" - ); } else { return { content: [{ type: "text", text: `Unknown tool: ${toolName}` }], diff --git a/src/modules/subtitle.ts b/src/modules/subtitle.ts index 4a71cb8..206378b 100644 --- a/src/modules/subtitle.ts +++ b/src/modules/subtitle.ts @@ -2,7 +2,7 @@ import * as fs from "fs"; import * as path from "path"; import * as os from "os"; import type { Config } from '../config.js'; -import { _spawnPromise, validateUrl, cleanSubtitleToTranscript } from "./utils.js"; +import { _spawnPromise, validateUrl } from "./utils.js"; /** * Lists all available subtitles for a video. @@ -105,65 +105,4 @@ export async function downloadSubtitles( } finally { fs.rmSync(tempDir, { recursive: true, force: true }); } -} - -/** - * Downloads and cleans subtitles to produce a plain text transcript. - * - * @param url - The URL of the video - * @param language - Language code (e.g., 'en', 'zh-Hant', 'ja') - * @param config - Configuration object - * @returns Promise resolving to the cleaned transcript text - * @throws {Error} When URL is invalid, language is not available, or download fails - * - * @example - * ```typescript - * try { - * const transcript = await downloadTranscript('https://youtube.com/watch?v=...', 'en', config); - * console.log('Transcript:', transcript); - * } catch (error) { - * console.error('Failed to download transcript:', error); - * } - * ``` - */ -export async function downloadTranscript( - url: string, - language: string, - config: Config -): Promise { - if (!validateUrl(url)) { - throw new Error('Invalid or unsupported URL format'); - } - - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), config.file.tempDirPrefix)); - - try { - await _spawnPromise('yt-dlp', [ - '--skip-download', - '--write-subs', - '--write-auto-subs', - '--sub-lang', language, - '--sub-format', 'ttml', - '--convert-subs', 'srt', - '--output', path.join(tempDir, 'transcript.%(ext)s'), - url - ]); - - const srtFiles = fs.readdirSync(tempDir) - .filter(file => file.endsWith('.srt')); - - if (srtFiles.length === 0) { - throw new Error('No subtitle files found for transcript generation'); - } - - let transcriptContent = ''; - for (const file of srtFiles) { - const srtContent = fs.readFileSync(path.join(tempDir, file), 'utf8'); - transcriptContent += cleanSubtitleToTranscript(srtContent) + ' '; - } - - return transcriptContent.trim(); - } finally { - fs.rmSync(tempDir, { recursive: true, force: true }); - } } \ No newline at end of file diff --git a/src/modules/utils.ts b/src/modules/utils.ts index a1efb49..70d897f 100644 --- a/src/modules/utils.ts +++ b/src/modules/utils.ts @@ -145,39 +145,4 @@ export function generateRandomFilename(extension: string = 'mp4'): string { const timestamp = getFormattedTimestamp(); const randomId = randomBytes(4).toString('hex'); return `${timestamp}_${randomId}.${extension}`; -} - -/** - * Cleans SRT subtitle content to produce a plain text transcript. - * Removes timestamps, sequence numbers, and HTML tags. - * - * @param srtContent - Raw SRT subtitle content - * @returns Cleaned transcript text - * - * @example - * ```typescript - * const cleanedText = cleanSubtitleToTranscript(srtContent); - * console.log(cleanedText); // 'Hello world this is a transcript...' - * ``` - */ -export function cleanSubtitleToTranscript(srtContent: string): string { - return srtContent - .split('\n') - .filter(line => { - const trimmed = line.trim(); - // Remove empty lines - if (!trimmed) return false; - // Remove sequence numbers (lines that are just digits) - if (/^\d+$/.test(trimmed)) return false; - // Remove timestamp lines - if (/^\d{2}:\d{2}:\d{2}[.,]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[.,]\d{3}$/.test(trimmed)) return false; - return true; - }) - .map(line => { - // Remove HTML tags - return line.replace(/<[^>]*>/g, ''); - }) - .join(' ') - .replace(/\s+/g, ' ') - .trim(); } \ No newline at end of file