backend: Adds live captions functionality to rooms

Adds support for live captions in meet rooms.
This includes schema definitions, API configurations,
and LiveKit integration for dispatching captions agents.
Captions are disabled by default and can be enabled per room.
This commit is contained in:
CSantosM 2026-01-22 18:24:50 +01:00
parent f677b18879
commit 9ae27bf32a
12 changed files with 112 additions and 43 deletions

View File

@ -26,6 +26,8 @@ content:
enabled: true
e2ee:
enabled: false
captions:
enabled: false
roles:
moderator:
permissions:

View File

@ -35,6 +35,8 @@ content:
enabled: true
e2ee:
enabled: false
captions:
enabled: false
roles:
moderator:
permissions:

View File

@ -13,6 +13,9 @@ MeetRoomConfig:
e2ee:
$ref: '#/MeetE2EEConfig'
description: Config for End-to-End Encryption (E2EE) in the room.
captions:
$ref: '#/MeetCaptionsConfig'
description: Config for live captions in the room.
MeetChatConfig:
type: object
properties:
@ -80,3 +83,13 @@ MeetE2EEConfig:
If true, the room will have End-to-End Encryption (E2EE) enabled.<br/>
This ensures that the media streams are encrypted from the sender to the receiver, providing enhanced privacy and security for the participants.<br/>
**Enabling E2EE will disable the recording feature for the room**.
MeetCaptionsConfig:
type: object
properties:
enabled:
type: boolean
default: false
example: false
description: >
If true, the room will have live captions enabled.<br/>
This allows participants to see real-time captions of all participants' speech during the meeting.<br/>

View File

@ -49,6 +49,8 @@ export const INTERNAL_CONFIG = {
PARTICIPANT_MAX_CONCURRENT_NAME_REQUESTS: '20', // Maximum number of request by the same name at the same time allowed
PARTICIPANT_NAME_RESERVATION_TTL: '12h' as StringValue, // Time-to-live for participant name reservations
CAPTIONS_AGENT_NAME: 'agent-meet-captions',
// MongoDB Schema Versions
// These define the current schema version for each collection
// Increment when making breaking changes to the schema structure

View File

@ -85,7 +85,7 @@ export const MEET_ENV = {
ENABLED_MODULES: process.env.ENABLED_MODULES ?? '',
// Agent Speech Processing configuration
AGENT_SPEECH_PROCESSING_NAME: process.env.MEET_AGENT_SPEECH_PROCESSING_NAME || '',
CAPTIONS_ENABLED: process.env.MEET_CAPTIONS || 'true',
};
export function checkModuleEnabled() {

View File

@ -105,6 +105,20 @@ const MeetE2EEConfigSchema = new Schema(
{ _id: false }
);
/**
 * Mongoose sub-schema for the live captions configuration of a MeetRoom.
 *
 * Holds a single required boolean flag, `enabled`, whose default is `false`,
 * so captions stay off unless a room explicitly turns them on.
 */
const MeetCaptionsConfigSchema = new Schema(
{
enabled: {
type: Boolean,
required: true,
default: false
}
},
// Mongoose schema option: do not generate an automatic `_id` for this embedded sub-document.
{ _id: false }
);
/**
* Sub-schema for room theme configuration.
*/
@ -181,6 +195,11 @@ const MeetRoomConfigSchema = new Schema(
type: MeetE2EEConfigSchema,
required: true,
default: { enabled: false }
},
captions: {
type: MeetCaptionsConfigSchema,
required: true,
default: { enabled: false }
}
},
{ _id: false }

View File

@ -7,6 +7,7 @@ import {
MeetRecordingConfig,
MeetRecordingLayout,
MeetRoomAutoDeletionPolicy,
MeetRoomCaptionsConfig,
MeetRoomConfig,
MeetRoomDeletionPolicyWithMeeting,
MeetRoomDeletionPolicyWithRecordings,
@ -55,6 +56,10 @@ const E2EEConfigSchema: z.ZodType<MeetE2EEConfig> = z.object({
enabled: z.boolean()
});
/**
 * Zod validator for a room's live captions config:
 * an object whose `enabled` property must be a boolean.
 */
const CaptionsConfigSchema: z.ZodType<MeetRoomCaptionsConfig> = z.object({
enabled: z.boolean()
});
const ThemeModeSchema: z.ZodType<MeetRoomThemeMode> = z.nativeEnum(MeetRoomThemeMode);
const hexColorSchema = z
@ -92,7 +97,8 @@ const UpdateRoomConfigSchema: z.ZodType<Partial<MeetRoomConfig>> = z
recording: RecordingConfigSchema.optional(),
chat: ChatConfigSchema.optional(),
virtualBackground: VirtualBackgroundConfigSchema.optional(),
e2ee: E2EEConfigSchema.optional()
e2ee: E2EEConfigSchema.optional(),
captions: CaptionsConfigSchema.optional()
// appearance: AppearanceConfigSchema,
})
.transform((data: Partial<MeetRoomConfig>) => {
@ -123,7 +129,8 @@ const CreateRoomConfigSchema = z
})),
chat: ChatConfigSchema.optional().default(() => ({ enabled: true })),
virtualBackground: VirtualBackgroundConfigSchema.optional().default(() => ({ enabled: true })),
e2ee: E2EEConfigSchema.optional().default(() => ({ enabled: false }))
e2ee: E2EEConfigSchema.optional().default(() => ({ enabled: false })),
captions: CaptionsConfigSchema.optional().default(() => ({ enabled: false }))
// appearance: AppearanceConfigSchema,
})
.transform((data) => {
@ -207,7 +214,8 @@ export const RoomOptionsSchema: z.ZodType<MeetRoomOptions> = z.object({
},
chat: { enabled: true },
virtualBackground: { enabled: true },
e2ee: { enabled: false }
e2ee: { enabled: false },
captions: { enabled: false }
})
// maxParticipants: z
// .number()

View File

@ -163,8 +163,8 @@ export class LivekitWebhookService {
* @param participant - Information about the newly joined participant.
*/
async handleParticipantJoined(room: Room, participant: ParticipantInfo) {
// Skip if the participant is an egress participant
if (this.livekitService.isEgressParticipant(participant)) return;
// Skip if the participant is not a standard participant
if (!this.livekitService.isStandardParticipant(participant)) return;
try {
const { recordings } = await this.recordingService.getAllRecordings({ roomId: room.name });
@ -185,8 +185,8 @@ export class LivekitWebhookService {
* @param participant - Information about the participant who left.
*/
async handleParticipantLeft(room: Room, participant: ParticipantInfo) {
// Skip if the participant is an egress participant
if (this.livekitService.isEgressParticipant(participant)) return;
// Skip if the participant is not a standard participant
if (!this.livekitService.isStandardParticipant(participant)) return;
try {
// Release the participant's reserved name

View File

@ -1,3 +1,4 @@
import { ParticipantInfo_Kind } from '@livekit/protocol';
import { inject, injectable } from 'inversify';
import {
CreateOptions,
@ -400,8 +401,11 @@ export class LiveKitService {
}
}
isEgressParticipant(participant: ParticipantInfo): boolean {
// TODO: Remove deprecated warning by using ParticipantInfo_Kind: participant.kind === ParticipantInfo_Kind.EGRESS;
return participant.identity.startsWith('EG_') && participant.permission?.recorder === true;
/**
 * Checks if a participant is a standard participant (web clients).
 *
 * @param participant - LiveKit participant info to inspect.
 * @returns `true` only when `participant.kind` equals `ParticipantInfo_Kind.STANDARD`;
 * `false` for every other participant kind.
 */
isStandardParticipant(participant: ParticipantInfo): boolean {
return participant.kind === ParticipantInfo_Kind.STANDARD;
}
}

View File

@ -133,9 +133,10 @@ export class RoomMemberService {
// Get participant permissions (with join meeting)
const permissions = await this.getRoomMemberPermissions(roomId, role, true);
// Captions flag for this room, falling back to `false` when it is absent.
// The optional chain guards the `captions` object itself, not only `enabled`:
// a room whose stored config has no `captions` entry (presumably rooms persisted
// before this field existed — verify against the repository layer) would otherwise
// throw a TypeError here instead of simply getting captions disabled.
const withCaptions = room.config.captions?.enabled ?? false;
// Generate token with participant name
return this.tokenService.generateRoomMemberToken(role, permissions, participantName, participantIdentity);
return this.tokenService.generateRoomMemberToken(role, permissions, participantName, participantIdentity, withCaptions);
}
/**

View File

@ -42,7 +42,8 @@ export class TokenService {
role: MeetRoomMemberRole,
permissions: MeetRoomMemberPermissions,
participantName?: string,
participantIdentity?: string
participantIdentity?: string,
roomWithCaptions = false
): Promise<string> {
const metadata: MeetRoomMemberTokenMetadata = {
livekitUrl: MEET_ENV.LIVEKIT_URL,
@ -56,23 +57,36 @@ export class TokenService {
ttl: INTERNAL_CONFIG.ROOM_MEMBER_TOKEN_EXPIRATION,
metadata: JSON.stringify(metadata)
};
return await this.generateJwtToken(tokenOptions, permissions.livekit as VideoGrant);
return await this.generateJwtToken(tokenOptions, permissions.livekit as VideoGrant, roomWithCaptions);
}
private async generateJwtToken(tokenOptions: AccessTokenOptions, grants?: VideoGrant): Promise<string> {
private async generateJwtToken(
tokenOptions: AccessTokenOptions,
grants?: VideoGrant,
roomWithCaptions = false
): Promise<string> {
const at = new AccessToken(MEET_ENV.LIVEKIT_API_KEY, MEET_ENV.LIVEKIT_API_SECRET, tokenOptions);
if (grants) {
at.addGrant(grants);
}
if (MEET_ENV.AGENT_SPEECH_PROCESSING_NAME) {
const captionsEnabledInEnv = MEET_ENV.CAPTIONS_ENABLED === 'true';
const captionsEnabledInRoom = Boolean(roomWithCaptions);
this.logger.debug('Adding speech processing agent dispatch to token', MEET_ENV.AGENT_SPEECH_PROCESSING_NAME);
// Warn if configuration is inconsistent: the room asks for captions but the
// environment switch is off, so no captions agent will be dispatched.
// The hint names MEET_CAPTIONS because that is the variable MEET_ENV.CAPTIONS_ENABLED is read from.
if (!captionsEnabledInEnv && captionsEnabledInRoom) {
this.logger.warn(
`Captions feature is disabled in environment but Room is created with captions enabled. Please enable captions in environment by setting MEET_CAPTIONS=true to ensure proper functionality.`
);
}
if (captionsEnabledInEnv && captionsEnabledInRoom) {
this.logger.debug('Activating Captions Agent. Configuring Room Agent Dispatch.');
at.roomConfig = new RoomConfiguration({
agents: [
new RoomAgentDispatch({
agentName: MEET_ENV.AGENT_SPEECH_PROCESSING_NAME
agentName: INTERNAL_CONFIG.CAPTIONS_AGENT_NAME
})
]
});

View File

@ -4,56 +4,60 @@ import { MeetRecordingLayout } from './recording.model';
* Interface representing the config for a room.
*/
export interface MeetRoomConfig {
chat: MeetChatConfig;
recording: MeetRecordingConfig;
virtualBackground: MeetVirtualBackgroundConfig;
e2ee: MeetE2EEConfig;
// appearance: MeetAppearanceConfig;
chat: MeetChatConfig;
recording: MeetRecordingConfig;
virtualBackground: MeetVirtualBackgroundConfig;
e2ee: MeetE2EEConfig;
captions: MeetRoomCaptionsConfig;
// appearance: MeetAppearanceConfig;
}
/**
* Interface representing the config for recordings in a room.
*/
export interface MeetRecordingConfig {
enabled: boolean;
layout?: MeetRecordingLayout;
allowAccessTo?: MeetRecordingAccess;
enabled: boolean;
layout?: MeetRecordingLayout;
allowAccessTo?: MeetRecordingAccess;
}
export enum MeetRecordingAccess {
ADMIN = 'admin', // Only admins can access the recording
ADMIN_MODERATOR = 'admin_moderator', // Admins and moderators can access
ADMIN_MODERATOR_SPEAKER = 'admin_moderator_speaker' // Admins, moderators and speakers can access
ADMIN = 'admin', // Only admins can access the recording
ADMIN_MODERATOR = 'admin_moderator', // Admins and moderators can access
ADMIN_MODERATOR_SPEAKER = 'admin_moderator_speaker', // Admins, moderators and speakers can access
}
export interface MeetChatConfig {
enabled: boolean;
enabled: boolean;
}
export interface MeetVirtualBackgroundConfig {
enabled: boolean;
enabled: boolean;
}
export interface MeetE2EEConfig {
enabled: boolean;
enabled: boolean;
}
/**
 * Interface representing the config for live captions in a room.
 */
export interface MeetRoomCaptionsConfig {
// Whether live captions are enabled for the room.
enabled: boolean;
}
export interface MeetAppearanceConfig {
themes: MeetRoomTheme[];
themes: MeetRoomTheme[];
}
export interface MeetRoomTheme {
name: string;
enabled: boolean;
baseTheme: MeetRoomThemeMode;
backgroundColor?: string;
primaryColor?: string;
secondaryColor?: string;
accentColor?: string;
surfaceColor?: string;
name: string;
enabled: boolean;
baseTheme: MeetRoomThemeMode;
backgroundColor?: string;
primaryColor?: string;
secondaryColor?: string;
accentColor?: string;
surfaceColor?: string;
}
export enum MeetRoomThemeMode {
LIGHT = 'light',
DARK = 'dark'
LIGHT = 'light',
DARK = 'dark',
}