Updated agent-speech-processing.yaml files with new providers

This commit is contained in:
pabloFuente 2026-02-04 16:39:06 +01:00
parent 0ee45ec06f
commit 803dfbbfa8
2 changed files with 64 additions and 0 deletions

View File

@ -342,6 +342,12 @@ live_captions:
# Set to false for locally hosted Riva NIM services without SSL.
use_ssl:
spitch:
# API key for Spitch. See https://docs.spitch.app/keys
api_key:
# Language short code of the input speech to transcribe. For supported values, see https://docs.spitch.app/concepts/languages
language:
vosk:
# Vosk language model. This provider requires docker_image "docker.io/openvidu/agent-speech-processing-vosk"
# Below is the list of pre-installed models in the container (available at https://alphacephei.com/vosk/models):
@ -365,3 +371,29 @@ live_captions:
sample_rate:
# Whether to return interim/partial results during recognition. Default is true.
partial_results:
# Whether to override Vosk's built-in Voice Activity Detection (VAD) with Silero's VAD. Default is false.
use_silero_vad: false
sherpa:
# sherpa streaming model. This provider requires docker_image "docker.io/openvidu/agent-speech-processing-sherpa"
# Below is the list of pre-installed models in the container (available at https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models):
# - sherpa-streaming-zipformer-en-kroko-2025-08-06 (English)
# - sherpa-streaming-zipformer-es-kroko-2025-08-06 (Spanish)
# - sherpa-streaming-zipformer-de-kroko-2025-08-06 (German)
# - sherpa-streaming-zipformer-fr-kroko-2025-08-06 (French)
# - sherpa-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10 (Multilingual: Arabic, English, Indonesian, Japanese, Russian, Thai, Vietnamese, Chinese)
model: sherpa-streaming-zipformer-en-kroko-2025-08-06
# Language code for reference. Auto-detected from model name if not set.
language:
# Audio sample rate in Hz. Default is 16000.
sample_rate:
# Whether to return interim/partial results during recognition. Default is true.
partial_results:
# Number of threads for ONNX Runtime. Default is 2.
num_threads:
# Recognizer type ("transducer", "paraformer", "zipformer_ctc", "nemo_ctc", "t_one_ctc"). Auto-detected from model name if not set.
recognizer_type:
# Decoding method ("greedy_search", "modified_beam_search"). Default is "greedy_search".
decoding_method:
# Whether to override sherpa's built-in Voice Activity Detection (VAD) with Silero's VAD. Default is false.
use_silero_vad: false

View File

@ -342,6 +342,12 @@ live_captions:
# Set to false for locally hosted Riva NIM services without SSL.
use_ssl:
spitch:
# API key for Spitch. See https://docs.spitch.app/keys
api_key:
# Language short code of the input speech to transcribe. For supported values, see https://docs.spitch.app/concepts/languages
language:
vosk:
# Vosk language model. This provider requires docker_image "docker.io/openvidu/agent-speech-processing-vosk"
# Below is the list of pre-installed models in the container (available at https://alphacephei.com/vosk/models):
@ -365,3 +371,29 @@ live_captions:
sample_rate:
# Whether to return interim/partial results during recognition. Default is true.
partial_results:
# Whether to override Vosk's built-in Voice Activity Detection (VAD) with Silero's VAD. Default is false.
use_silero_vad: false
sherpa:
# sherpa streaming model. This provider requires docker_image "docker.io/openvidu/agent-speech-processing-sherpa"
# Below is the list of pre-installed models in the container (available at https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models):
# - sherpa-streaming-zipformer-en-kroko-2025-08-06 (English)
# - sherpa-streaming-zipformer-es-kroko-2025-08-06 (Spanish)
# - sherpa-streaming-zipformer-de-kroko-2025-08-06 (German)
# - sherpa-streaming-zipformer-fr-kroko-2025-08-06 (French)
# - sherpa-streaming-zipformer-ar_en_id_ja_ru_th_vi_zh-2025-02-10 (Multilingual: Arabic, English, Indonesian, Japanese, Russian, Thai, Vietnamese, Chinese)
model: sherpa-streaming-zipformer-en-kroko-2025-08-06
# Language code for reference. Auto-detected from model name if not set.
language:
# Audio sample rate in Hz. Default is 16000.
sample_rate:
# Whether to return interim/partial results during recognition. Default is true.
partial_results:
# Number of threads for ONNX Runtime. Default is 2.
num_threads:
# Recognizer type ("transducer", "paraformer", "zipformer_ctc", "nemo_ctc", "t_one_ctc"). Auto-detected from model name if not set.
recognizer_type:
# Decoding method ("greedy_search", "modified_beam_search"). Default is "greedy_search".
decoding_method:
# Whether to override sherpa's built-in Voice Activity Detection (VAD) with Silero's VAD. Default is false.
use_silero_vad: false