diff --git a/community/agent-speech-processing.yaml b/community/agent-speech-processing.yaml index 942253a..8573d5f 100644 --- a/community/agent-speech-processing.yaml +++ b/community/agent-speech-processing.yaml @@ -16,7 +16,7 @@ live_captions: # - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API. processing: automatic - # Which speech-to-text AI provider to use [aws, azure, google, openai, azure_openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam, mistralai, cartesia, soniox] + # Which speech-to-text AI provider to use [aws, azure, google, openai, azure_openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam, mistralai, cartesia, soniox, nvidia, vosk] # The custom configuration for the selected provider must be set below provider: @@ -320,6 +320,28 @@ live_captions: # Set context to improve recognition of difficult and rare words. Context is a string and can include words, phrases, sentences, or summaries (limit: 10K chars). See https://soniox.com/docs/stt/concepts/context context: + nvidia: + # API key for NVIDIA. See https://build.nvidia.com/explore/speech?integrate_nim=true&hosted_api=true&modal=integrate-nim + # Required when using NVIDIA's cloud services. To use a self-hosted NVIDIA Riva server, set up "server" and "use_ssl" instead. + api_key: + # The NVIDIA Riva ASR model to use. Default is "parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer" + # See available models: https://build.nvidia.com/search/models?filters=usecase%3Ausecase_speech_to_text + model: + # The NVIDIA function ID for the model. Default is "1598d209-5e27-4d3c-8079-4751568b1081" + function_id: + # Whether to add punctuation to transcription results. Default is true. + punctuate: + # The language code for transcription. Default is "en-US" + language_code: + # Audio sample rate in Hz. Default is 16000. + sample_rate: + # The NVIDIA Riva server address. 
Default is "grpc.nvcf.nvidia.com:443" + # For self-hosted NIM, use your server address (e.g., "localhost:50051") + server: + # Whether to use SSL for the connection. Default is true. + # Set to false for locally hosted Riva NIM services without SSL. + use_ssl: + vosk: # Vosk language model. This provider requires docker_image "docker.io/openvidu/agent-speech-processing-vosk" # Below is the list of pre-installed models in the container (available at https://alphacephei.com/vosk/models): diff --git a/pro/agent-speech-processing.yaml b/pro/agent-speech-processing.yaml index 942253a..8573d5f 100644 --- a/pro/agent-speech-processing.yaml +++ b/pro/agent-speech-processing.yaml @@ -16,7 +16,7 @@ live_captions: # - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API. processing: automatic - # Which speech-to-text AI provider to use [aws, azure, google, openai, azure_openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam, mistralai, cartesia, soniox] + # Which speech-to-text AI provider to use [aws, azure, google, openai, azure_openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam, mistralai, cartesia, soniox, nvidia, vosk] # The custom configuration for the selected provider must be set below provider: @@ -320,6 +320,28 @@ live_captions: # Set context to improve recognition of difficult and rare words. Context is a string and can include words, phrases, sentences, or summaries (limit: 10K chars). See https://soniox.com/docs/stt/concepts/context context: + nvidia: + # API key for NVIDIA. See https://build.nvidia.com/explore/speech?integrate_nim=true&hosted_api=true&modal=integrate-nim + # Required when using NVIDIA's cloud services. To use a self-hosted NVIDIA Riva server, set up "server" and "use_ssl" instead. + api_key: + # The NVIDIA Riva ASR model to use. 
Default is "parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer" + # See available models: https://build.nvidia.com/search/models?filters=usecase%3Ausecase_speech_to_text + model: + # The NVIDIA function ID for the model. Default is "1598d209-5e27-4d3c-8079-4751568b1081" + function_id: + # Whether to add punctuation to transcription results. Default is true. + punctuate: + # The language code for transcription. Default is "en-US" + language_code: + # Audio sample rate in Hz. Default is 16000. + sample_rate: + # The NVIDIA Riva server address. Default is "grpc.nvcf.nvidia.com:443" + # For self-hosted NIM, use your server address (e.g., "localhost:50051") + server: + # Whether to use SSL for the connection. Default is true. + # Set to false for locally hosted Riva NIM services without SSL. + use_ssl: + vosk: # Vosk language model. This provider requires docker_image "docker.io/openvidu/agent-speech-processing-vosk" # Below is the list of pre-installed models in the container (available at https://alphacephei.com/vosk/models):