Add nvidia and vosk live_captions providers to YAML

This commit is contained in:
pabloFuente 2026-01-21 11:48:59 +01:00
parent d51a1b2cdf
commit 2ea399dc42
2 changed files with 46 additions and 2 deletions

View File

@@ -16,7 +16,7 @@ live_captions:
# - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API.
processing: automatic
# Which speech-to-text AI provider to use [aws, azure, google, openai, azure_openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam, mistralai, cartesia, soniox]
# Which speech-to-text AI provider to use [aws, azure, google, openai, azure_openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam, mistralai, cartesia, soniox, nvidia, vosk]
# The custom configuration for the selected provider must be set below
provider:
@@ -320,6 +320,28 @@ live_captions:
# Set context to improve recognition of difficult and rare words. Context is a string and can include words, phrases, sentences, or summaries (limit: 10K chars). See https://soniox.com/docs/stt/concepts/context
context:
nvidia:
# API key for NVIDIA. See https://build.nvidia.com/explore/speech?integrate_nim=true&hosted_api=true&modal=integrate-nim
# Required when using NVIDIA's cloud services. To use a self-hosted NVIDIA Riva server, set "server" and "use_ssl" instead.
api_key:
# The NVIDIA Riva ASR model to use. Default is "parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer"
# See available models: https://build.nvidia.com/search/models?filters=usecase%3Ausecase_speech_to_text
model:
# The NVIDIA function ID for the model. Default is "1598d209-5e27-4d3c-8079-4751568b1081"
function_id:
# Whether to add punctuation to transcription results. Default is true.
punctuate:
# The language code for transcription. Default is "en-US"
language_code:
# Audio sample rate in Hz. Default is 16000.
sample_rate:
# The NVIDIA Riva server address. Default is "grpc.nvcf.nvidia.com:443"
# For self-hosted NIM, use your server address (e.g., "localhost:50051")
server:
# Whether to use SSL for the connection. Default is true.
# Set to false for locally hosted Riva NIM services without SSL.
use_ssl:
vosk:
# Vosk language model. This provider requires docker_image "docker.io/openvidu/agent-speech-processing-vosk"
# Below is the list of pre-installed models in the container (available at https://alphacephei.com/vosk/models):

View File

@@ -16,7 +16,7 @@ live_captions:
# - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API.
processing: automatic
# Which speech-to-text AI provider to use [aws, azure, google, openai, azure_openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam, mistralai, cartesia, soniox]
# Which speech-to-text AI provider to use [aws, azure, google, openai, azure_openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam, mistralai, cartesia, soniox, nvidia, vosk]
# The custom configuration for the selected provider must be set below
provider:
@@ -320,6 +320,28 @@ live_captions:
# Set context to improve recognition of difficult and rare words. Context is a string and can include words, phrases, sentences, or summaries (limit: 10K chars). See https://soniox.com/docs/stt/concepts/context
context:
nvidia:
# API key for NVIDIA. See https://build.nvidia.com/explore/speech?integrate_nim=true&hosted_api=true&modal=integrate-nim
# Required when using NVIDIA's cloud services. To use a self-hosted NVIDIA Riva server, set "server" and "use_ssl" instead.
api_key:
# The NVIDIA Riva ASR model to use. Default is "parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer"
# See available models: https://build.nvidia.com/search/models?filters=usecase%3Ausecase_speech_to_text
model:
# The NVIDIA function ID for the model. Default is "1598d209-5e27-4d3c-8079-4751568b1081"
function_id:
# Whether to add punctuation to transcription results. Default is true.
punctuate:
# The language code for transcription. Default is "en-US"
language_code:
# Audio sample rate in Hz. Default is 16000.
sample_rate:
# The NVIDIA Riva server address. Default is "grpc.nvcf.nvidia.com:443"
# For self-hosted NIM, use your server address (e.g., "localhost:50051")
server:
# Whether to use SSL for the connection. Default is true.
# Set to false for locally hosted Riva NIM services without SSL.
use_ssl:
vosk:
# Vosk language model. This provider requires docker_image "docker.io/openvidu/agent-speech-processing-vosk"
# Below is the list of pre-installed models in the container (available at https://alphacephei.com/vosk/models):