diff --git a/README.md b/README.md
index f7230cb..a465b20 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,12 @@ pip install yt-dlp
* Inputs:
* `url` (string, required): URL of the video
+* **download_transcript**
+ * Download and clean video subtitles to produce a plain text transcript without timestamps or formatting
+ * Inputs:
+ * `url` (string, required): URL of the video
+ * `language` (string, optional): Language code (e.g., 'en', 'zh-Hant', 'ja'). Defaults to 'en'
+
## Usage Examples
Ask your LLM to:
@@ -80,6 +86,8 @@ Ask your LLM to:
"Download Chinese subtitles from this video: https://youtube.com/watch?v=..."
"Download this video in 1080p: https://youtube.com/watch?v=..."
"Download audio from this YouTube video: https://youtube.com/watch?v=..."
+"Get a clean transcript of this video: https://youtube.com/watch?v=..."
+"Download Spanish transcript from this video: https://youtube.com/watch?v=..."
```
## Manual Start
diff --git a/src/__tests__/subtitle.test.ts b/src/__tests__/subtitle.test.ts
index 19e360b..1e55ea2 100644
--- a/src/__tests__/subtitle.test.ts
+++ b/src/__tests__/subtitle.test.ts
@@ -3,7 +3,8 @@
import { describe, test, expect } from '@jest/globals';
import * as os from 'os';
import * as path from 'path';
-import { listSubtitles, downloadSubtitles } from '../modules/subtitle.js';
+import { listSubtitles, downloadSubtitles, downloadTranscript } from '../modules/subtitle.js';
+import { cleanSubtitleToTranscript } from '../modules/utils.js';
import { CONFIG } from '../config.js';
import * as fs from 'fs';
@@ -51,4 +52,60 @@ describe('Subtitle Functions', () => {
.toThrow();
});
});
+
+ describe('downloadTranscript', () => {
+   // Calls the real downloadTranscript (which spawns yt-dlp), so this case
+   // is given a 30 second timeout.
+   test('downloads and cleans transcript successfully', async () => {
+     const transcript = await downloadTranscript(testUrl, 'en', testConfig);
+
+     expect(typeof transcript).toBe('string');
+     expect(transcript.length).toBeGreaterThan(0);
+
+     // None of the subtitle scaffolding may survive the cleanup.
+     for (const marker of ['WEBVTT', '-->']) {
+       expect(transcript).not.toContain(marker);
+     }
+     expect(transcript).not.toMatch(/^\d+$/m);
+   }, 30000);
+
+   // An unusable URL must surface as a rejected promise.
+   test('handles invalid URL', async () => {
+     const attempt = downloadTranscript('invalid-url', 'en', testConfig);
+     await expect(attempt).rejects.toThrow();
+   });
+ });
+
+ describe('cleanSubtitleToTranscript', () => {
+   test('cleans SRT content correctly', () => {
+     // The third cue carries inline markup so that tag stripping is
+     // exercised alongside index and timestamp removal.
+     const srtContent = `1
+00:00:01,000 --> 00:00:03,000
+Hello world
+
+2
+00:00:04,000 --> 00:00:06,000
+This is a test
+
+3
+00:00:07,000 --> 00:00:09,000
+<b>Bold text</b> here`;
+
+     const result = cleanSubtitleToTranscript(srtContent);
+     expect(result).not.toContain('<');
+     expect(result).toBe('Hello world This is a test Bold text here');
+   });
+
+   test('handles empty content', () => {
+     const result = cleanSubtitleToTranscript('');
+     expect(result).toBe('');
+   });
+
+   test('removes timestamps and sequence numbers', () => {
+     const srtContent = `1
+00:00:01,000 --> 00:00:03,000
+First line
+
+2
+00:00:04,000 --> 00:00:06,000
+Second line`;
+
+     const result = cleanSubtitleToTranscript(srtContent);
+     expect(result).not.toContain('00:00');
+     expect(result).not.toContain('-->');
+     // The caption text in this fixture has no digits, so any digit left in
+     // the output is a leaked cue index or timestamp fragment.
+     expect(result).not.toMatch(/\d/);
+     expect(result).toBe('First line Second line');
+   });
+ });
});
\ No newline at end of file
diff --git a/src/index.mts b/src/index.mts
index 2310dc1..6dbe2c6 100644
--- a/src/index.mts
+++ b/src/index.mts
@@ -15,7 +15,7 @@ import { CONFIG } from "./config.js";
import { _spawnPromise, safeCleanup } from "./modules/utils.js";
import { downloadVideo } from "./modules/video.js";
import { downloadAudio } from "./modules/audio.js";
-import { listSubtitles, downloadSubtitles } from "./modules/subtitle.js";
+import { listSubtitles, downloadSubtitles, downloadTranscript } from "./modules/subtitle.js";
const VERSION = '0.6.26';
@@ -148,6 +148,18 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
required: ["url"],
},
},
+ {
+ name: "download_transcript",
+ description: "Download and clean video subtitles to produce a plain text transcript without timestamps or formatting.",
+ inputSchema: {
+ type: "object",
+ properties: {
+ url: { type: "string", description: "URL of the video" },
+ language: { type: "string", description: "Language code (e.g., 'en', 'zh-Hant', 'ja'). Defaults to 'en'" },
+ },
+ required: ["url"],
+ },
+ },
],
};
});
@@ -211,6 +223,11 @@ server.setRequestHandler(
() => downloadAudio(args.url, CONFIG),
"Error downloading audio"
);
+ } else if (toolName === "download_transcript") {
+ return handleToolExecution(
+ () => downloadTranscript(args.url, args.language || CONFIG.download.defaultSubtitleLanguage, CONFIG),
+ "Error downloading transcript"
+ );
} else {
return {
content: [{ type: "text", text: `Unknown tool: ${toolName}` }],
diff --git a/src/modules/subtitle.ts b/src/modules/subtitle.ts
index 206378b..4a71cb8 100644
--- a/src/modules/subtitle.ts
+++ b/src/modules/subtitle.ts
@@ -2,7 +2,7 @@ import * as fs from "fs";
import * as path from "path";
import * as os from "os";
import type { Config } from '../config.js';
-import { _spawnPromise, validateUrl } from "./utils.js";
+import { _spawnPromise, validateUrl, cleanSubtitleToTranscript } from "./utils.js";
/**
* Lists all available subtitles for a video.
@@ -105,4 +105,65 @@ export async function downloadSubtitles(
} finally {
fs.rmSync(tempDir, { recursive: true, force: true });
}
+}
+
+/**
+ * Downloads subtitles for a video and reduces them to a plain text transcript.
+ *
+ * yt-dlp is invoked with `--skip-download`, requesting both uploaded and
+ * auto-generated subtitles for `language`, converted to SRT and written into
+ * a freshly created temporary directory. Every `.srt` file found there is
+ * passed through `cleanSubtitleToTranscript`, and the cleaned pieces are
+ * joined with single spaces. The temporary directory is removed on both
+ * success and failure.
+ *
+ * @param url - The URL of the video
+ * @param language - Language code (e.g., 'en', 'zh-Hant', 'ja')
+ * @param config - Configuration object; `config.file.tempDirPrefix` is used
+ *   as the prefix of the temporary directory name
+ * @returns Promise resolving to the cleaned transcript text
+ * @throws {Error} When `validateUrl` rejects the URL, when yt-dlp produces no
+ *   `.srt` file, or when the yt-dlp invocation itself rejects
+ *
+ * @example
+ * ```typescript
+ * try {
+ *   const transcript = await downloadTranscript('https://youtube.com/watch?v=...', 'en', config);
+ *   console.log('Transcript:', transcript);
+ * } catch (error) {
+ *   console.error('Failed to download transcript:', error);
+ * }
+ * ```
+ */
+export async function downloadTranscript(
+  url: string,
+  language: string,
+  config: Config
+): Promise<string> {
+  // Validate before creating anything on disk so a bad URL leaves no temp dir.
+  if (!validateUrl(url)) {
+    throw new Error('Invalid or unsupported URL format');
+  }
+
+  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), config.file.tempDirPrefix));
+
+  try {
+    await _spawnPromise('yt-dlp', [
+      '--skip-download',
+      '--write-subs',
+      '--write-auto-subs',
+      '--sub-lang', language,
+      '--sub-format', 'ttml',
+      '--convert-subs', 'srt',
+      '--output', path.join(tempDir, 'transcript.%(ext)s'),
+      url
+    ]);
+
+    // Directory listing order is not guaranteed, so sort the names to keep
+    // the concatenation order stable when several files are produced.
+    const srtFiles = fs.readdirSync(tempDir)
+      .filter(file => file.endsWith('.srt'))
+      .sort();
+
+    if (srtFiles.length === 0) {
+      throw new Error('No subtitle files found for transcript generation');
+    }
+
+    return srtFiles
+      .map(file => cleanSubtitleToTranscript(fs.readFileSync(path.join(tempDir, file), 'utf8')))
+      .join(' ')
+      .trim();
+  } finally {
+    // Always discard the temporary directory, including on failure.
+    fs.rmSync(tempDir, { recursive: true, force: true });
+  }
 }
\ No newline at end of file
diff --git a/src/modules/utils.ts b/src/modules/utils.ts
index 70d897f..a1efb49 100644
--- a/src/modules/utils.ts
+++ b/src/modules/utils.ts
@@ -145,4 +145,39 @@ export function generateRandomFilename(extension: string = 'mp4'): string {
const timestamp = getFormattedTimestamp();
const randomId = randomBytes(4).toString('hex');
return `${timestamp}_${randomId}.${extension}`;
+}
+
+/**
+ * Cleans SRT subtitle content to produce a plain text transcript.
+ *
+ * Blank lines, timestamp lines (`HH:MM:SS,mmm --> HH:MM:SS,mmm`, with `,` or
+ * `.` before the milliseconds) and cue sequence numbers are dropped, HTML-like
+ * tags are stripped, and the remaining text is joined into a single line with
+ * runs of whitespace collapsed to one space.
+ *
+ * A digit-only line is treated as a sequence number only when the line
+ * directly after it is a timestamp line, so caption text that happens to be
+ * purely numeric (e.g. "2024") is preserved.
+ *
+ * @param srtContent - Raw SRT subtitle content
+ * @returns Cleaned transcript text; an empty string when no caption text remains
+ *
+ * @example
+ * ```typescript
+ * const cleanedText = cleanSubtitleToTranscript(srtContent);
+ * console.log(cleanedText); // 'Hello world this is a transcript...'
+ * ```
+ */
+export function cleanSubtitleToTranscript(srtContent: string): string {
+  const timestampLine = /^\d{2}:\d{2}:\d{2}[.,]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[.,]\d{3}$/;
+  const digitsOnly = /^\d+$/;
+
+  // Trimming also removes the trailing '\r' of CRLF-terminated files.
+  const lines = srtContent.split('\n').map(line => line.trim());
+  const textLines: string[] = [];
+
+  lines.forEach((line, index) => {
+    // Remove empty lines
+    if (!line) return;
+    // Remove timestamp lines
+    if (timestampLine.test(line)) return;
+    // Remove sequence numbers: a digit-only line is a cue index only when
+    // the timestamp line follows it immediately.
+    const next = lines[index + 1];
+    if (digitsOnly.test(line) && next !== undefined && timestampLine.test(next)) return;
+    // Remove HTML tags
+    textLines.push(line.replace(/<[^>]*>/g, ''));
+  });
+
+  return textLines
+    .join(' ')
+    .replace(/\s+/g, ' ')
+    .trim();
 }
\ No newline at end of file