Add nvidia and vosk live_captions providers to YAML

2026-01-21 11:48:59 +01:00 · 2026-01-21 11:48:59 +01:00 · 2ea399dc42
commit 2ea399dc42
parent d51a1b2cdf
2 changed files with 46 additions and 2 deletions
--- a/community/agent-speech-processing.yaml
+++ b/community/agent-speech-processing.yaml
@ -16,7 +16,7 @@ live_captions:
  # - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API.
  processing: automatic

-  # Which speech-to-text AI provider to use [aws, azure, google, openai, azure_openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam, mistralai, cartesia, soniox]
+  # Which speech-to-text AI provider to use [aws, azure, google, openai, azure_openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam, mistralai, cartesia, soniox, nvidia, vosk]
  # The custom configuration for the selected provider must be set below
  provider:

@ -320,6 +320,28 @@ live_captions:
    # Set context to improve recognition of difficult and rare words. Context is a string and can include words, phrases, sentences, or summaries (limit: 10K chars). See https://soniox.com/docs/stt/concepts/context
    context:

+  nvidia:
+    # API key for NVIDIA. See https://build.nvidia.com/explore/speech?integrate_nim=true&hosted_api=true&modal=integrate-nim
+    # Required when using NVIDIA's cloud services. To use a self-hosted NVIDIA Riva server setup "server" and "use_ssl" instead.
+    api_key:
+    # The NVIDIA Riva ASR model to use. Default is "parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer"
+    # See available models: https://build.nvidia.com/search/models?filters=usecase%3Ausecase_speech_to_text
+    model:
+    # The NVIDIA function ID for the model. Default is "1598d209-5e27-4d3c-8079-4751568b1081"
+    function_id:
+    # Whether to add punctuation to transcription results. Default is true.
+    punctuate:
+    # The language code for transcription. Default is "en-US"
+    language_code:
+    # Audio sample rate in Hz. Default is 16000.
+    sample_rate:
+    # The NVIDIA Riva server address. Default is "grpc.nvcf.nvidia.com:443"
+    # For self-hosted NIM, use your server address (e.g., "localhost:50051")
+    server:
+    # Whether to use SSL for the connection. Default is true.
+    # Set to false for locally hosted Riva NIM services without SSL.
+    use_ssl:
+
  vosk:
    # Vosk language model. This provider requires docker_image "docker.io/openvidu/agent-speech-processing-vosk"
    # Below is the list of pre-installed models in the container (available at https://alphacephei.com/vosk/models):
--- a/pro/agent-speech-processing.yaml
+++ b/pro/agent-speech-processing.yaml
@ -16,7 +16,7 @@ live_captions:
  # - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API.
  processing: automatic

-  # Which speech-to-text AI provider to use [aws, azure, google, openai, azure_openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam, mistralai, cartesia, soniox]
+  # Which speech-to-text AI provider to use [aws, azure, google, openai, azure_openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam, mistralai, cartesia, soniox, nvidia, vosk]
  # The custom configuration for the selected provider must be set below
  provider:

@ -320,6 +320,28 @@ live_captions:
    # Set context to improve recognition of difficult and rare words. Context is a string and can include words, phrases, sentences, or summaries (limit: 10K chars). See https://soniox.com/docs/stt/concepts/context
    context:

+  nvidia:
+    # API key for NVIDIA. See https://build.nvidia.com/explore/speech?integrate_nim=true&hosted_api=true&modal=integrate-nim
+    # Required when using NVIDIA's cloud services. To use a self-hosted NVIDIA Riva server setup "server" and "use_ssl" instead.
+    api_key:
+    # The NVIDIA Riva ASR model to use. Default is "parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer"
+    # See available models: https://build.nvidia.com/search/models?filters=usecase%3Ausecase_speech_to_text
+    model:
+    # The NVIDIA function ID for the model. Default is "1598d209-5e27-4d3c-8079-4751568b1081"
+    function_id:
+    # Whether to add punctuation to transcription results. Default is true.
+    punctuate:
+    # The language code for transcription. Default is "en-US"
+    language_code:
+    # Audio sample rate in Hz. Default is 16000.
+    sample_rate:
+    # The NVIDIA Riva server address. Default is "grpc.nvcf.nvidia.com:443"
+    # For self-hosted NIM, use your server address (e.g., "localhost:50051")
+    server:
+    # Whether to use SSL for the connection. Default is true.
+    # Set to false for locally hosted Riva NIM services without SSL.
+    use_ssl:
+
  vosk:
    # Vosk language model. This provider requires docker_image "docker.io/openvidu/agent-speech-processing-vosk"
    # Below is the list of pre-installed models in the container (available at https://alphacephei.com/vosk/models):