From 7970659f69a4b800386ff2c71578ab59f92ff112 Mon Sep 17 00:00:00 2001 From: cruizba Date: Wed, 4 Jun 2025 17:21:57 +0200 Subject: [PATCH 1/8] Revert "Bump to version 3.2.0" This reverts commit 9edcb4f442fec3b1ec83d65ebc19b10d65138f31. --- community/docker-compose.yaml | 10 +++++----- pro/docker-compose.yaml | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/community/docker-compose.yaml b/community/docker-compose.yaml index 5d79a55..8e6cdb5 100644 --- a/community/docker-compose.yaml +++ b/community/docker-compose.yaml @@ -1,6 +1,6 @@ services: caddy-proxy: - image: docker.io/openvidu/openvidu-caddy-local:3.2.0 + image: docker.io/openvidu/openvidu-caddy-local:main platform: linux/amd64 container_name: caddy-proxy restart: unless-stopped @@ -87,7 +87,7 @@ services: condition: service_completed_successfully dashboard: - image: docker.io/openvidu/openvidu-dashboard:3.2.0 + image: docker.io/openvidu/openvidu-dashboard:main platform: linux/amd64 container_name: dashboard restart: unless-stopped @@ -101,7 +101,7 @@ services: condition: service_completed_successfully openvidu: - image: docker.io/openvidu/openvidu-server:3.2.0 + image: docker.io/openvidu/openvidu-server:main platform: linux/amd64 restart: unless-stopped container_name: openvidu @@ -123,7 +123,7 @@ services: condition: service_completed_successfully ingress: - image: docker.io/openvidu/ingress:3.2.0 + image: docker.io/openvidu/ingress:main platform: linux/amd64 container_name: ingress restart: unless-stopped @@ -158,7 +158,7 @@ services: condition: service_completed_successfully default-app: - image: docker.io/openvidu/openvidu-call:3.2.0-demo + image: docker.io/openvidu/openvidu-call:main-demo platform: linux/amd64 container_name: openvidu-call restart: on-failure diff --git a/pro/docker-compose.yaml b/pro/docker-compose.yaml index 23a39c1..2e5d062 100644 --- a/pro/docker-compose.yaml +++ b/pro/docker-compose.yaml @@ -1,6 +1,6 @@ services: caddy-proxy: - image: 
docker.io/openvidu/openvidu-caddy-local:3.2.0 + image: docker.io/openvidu/openvidu-caddy-local:main platform: linux/amd64 container_name: caddy-proxy restart: unless-stopped @@ -87,7 +87,7 @@ services: condition: service_completed_successfully dashboard: - image: docker.io/openvidu/openvidu-dashboard:3.2.0 + image: docker.io/openvidu/openvidu-dashboard:main platform: linux/amd64 container_name: dashboard restart: unless-stopped @@ -101,7 +101,7 @@ services: condition: service_completed_successfully openvidu: - image: docker.io/openvidu/openvidu-server-pro:3.2.0 + image: docker.io/openvidu/openvidu-server-pro:main platform: linux/amd64 restart: unless-stopped container_name: openvidu @@ -125,7 +125,7 @@ services: condition: service_completed_successfully ingress: - image: docker.io/openvidu/ingress:3.2.0 + image: docker.io/openvidu/ingress:main platform: linux/amd64 container_name: ingress restart: unless-stopped @@ -160,7 +160,7 @@ services: condition: service_completed_successfully default-app: - image: docker.io/openvidu/openvidu-call:3.2.0-demo + image: docker.io/openvidu/openvidu-call:main-demo platform: linux/amd64 container_name: openvidu-call restart: on-failure @@ -196,7 +196,7 @@ services: condition: service_completed_successfully openvidu-v2compatibility: - image: docker.io/openvidu/openvidu-v2compatibility:3.2.0 + image: docker.io/openvidu/openvidu-v2compatibility:main platform: linux/amd64 restart: unless-stopped container_name: openvidu-v2compatibility From f16c0a8a647a9be340034cb6b73f26ef753fa5bb Mon Sep 17 00:00:00 2001 From: cruizba Date: Tue, 10 Jun 2025 11:01:21 +0200 Subject: [PATCH 2/8] Add operator service to docker-compose for agent management --- community/docker-compose.yaml | 25 +++++++++++++++++++++++++ pro/docker-compose.yaml | 24 ++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/community/docker-compose.yaml b/community/docker-compose.yaml index 8e6cdb5..ce6bfd1 100644 --- a/community/docker-compose.yaml +++ 
b/community/docker-compose.yaml @@ -193,6 +193,31 @@ services: setup: condition: service_completed_successfully + operator: + image: docker.io/openvidu/openvidu-operator:main + platform: linux/amd64 + container_name: operator + restart: unless-stopped + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - agents-config:/agents-config + - ./:/deployment + environment: + - PLATFORM=linux/amd64 + - MODE=agent-manager-local + - DEPLOYMENT_FILES_DIR=/deployment + - AGENTS_CONFIG_DIR=/agents-config + - NETWORK_NAME=openvidu-community + - AGENTS_CONFIG_VOLUME=openvidu-agents-config + - LIVEKIT_URL=ws://openvidu:7880/ + - LIVEKIT_API_KEY=${LIVEKIT_API_KEY:-} + - LIVEKIT_API_SECRET=${LIVEKIT_API_SECRET:-} + - REDIS_ADDRESS=redis:6379 + - REDIS_PASSWORD=${REDIS_PASSWORD:-} + depends_on: + setup: + condition: service_completed_successfully + ready-check: image: docker.io/curlimages/curl:8.13.0 platform: linux/amd64 diff --git a/pro/docker-compose.yaml b/pro/docker-compose.yaml index 2e5d062..fb4a076 100644 --- a/pro/docker-compose.yaml +++ b/pro/docker-compose.yaml @@ -266,6 +266,30 @@ services: - ./scripts/utils.sh:/scripts/utils.sh command: /bin/sh /scripts/ready-check.sh + operator: + image: docker.io/openvidu/openvidu-operator:main + platform: linux/amd64 + container_name: operator + restart: unless-stopped + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - agents-config:/agents-config + - ./:/deployment + environment: + - PLATFORM=linux/amd64 + - MODE=agent-manager-local + - DEPLOYMENT_FILES_DIR=/deployment + - AGENTS_CONFIG_DIR=/agents-config + - NETWORK_NAME=openvidu-pro + - AGENTS_CONFIG_VOLUME=openvidu-pro-agents-config + - LIVEKIT_URL=ws://openvidu:7880/ + - LIVEKIT_API_KEY=${LIVEKIT_API_KEY:-} + - LIVEKIT_API_SECRET=${LIVEKIT_API_SECRET:-} + - REDIS_ADDRESS=redis:6379 + - REDIS_PASSWORD=${REDIS_PASSWORD:-} + depends_on: + setup: + condition: service_completed_successfully setup: image: docker.io/busybox:1.37.0 platform: linux/amd64 From 
b88e1420fda39af8b1550680ec261ace4758aeac Mon Sep 17 00:00:00 2001 From: pabloFuente Date: Mon, 16 Jun 2025 18:37:02 +0200 Subject: [PATCH 3/8] Added agent-speech-processing.yaml to community and pro local deployments --- community/agent-speech-processing.yaml | 226 +++++++++++++++++++++++++ pro/agent-speech-processing.yaml | 226 +++++++++++++++++++++++++ 2 files changed, 452 insertions(+) create mode 100644 community/agent-speech-processing.yaml create mode 100644 pro/agent-speech-processing.yaml diff --git a/community/agent-speech-processing.yaml b/community/agent-speech-processing.yaml new file mode 100644 index 0000000..7d4345c --- /dev/null +++ b/community/agent-speech-processing.yaml @@ -0,0 +1,226 @@ +############################## +# Agent common configuration # +############################## + +# Docker image of the agent. +docker_image: docker.io/openvidu/agent-speech-processing:main + +# Whether this agent should run or not, and how it will connect to Rooms (automatic|manual|disabled) +# automatic: the agent will run and will automatically connect to new Rooms. +# manual: the agent will run and will connect to new Rooms only when your application dictates it by using the Agent Dispatch API. +# disabled: the agent will not run. +processing: disabled + +################################ +# Agent specific configuration # +################################ +speech_processing: + # Whether or not the agent should be hidden to the Participants of the Room. + hidden: true + + # Which Speech-To-Text provider to use: aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia + # The custom configuration for the selected provider must be set below + provider: + + aws: + # Credentials for AWS Transcribe. 
See https://docs.aws.amazon.com/transcribe/latest/dg/what-is.html + aws_access_key_id: + aws_secret_access_key: + aws_default_region: + # See https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html + language: + # The name of the custom vocabulary you want to use. + # See https://docs.aws.amazon.com/transcribe/latest/dg/custom-vocabulary.html + vocabulary_name: + # The name of the custom language model you want to use. + # See https://docs.aws.amazon.com/transcribe/latest/dg/custom-language-models-using.html + language_model_name: + # Whether or not to enable partial result stabilization. Partial result stabilization can reduce latency in your output, but may impact accuracy. + # See https://docs.aws.amazon.com/transcribe/latest/dg/streaming-partial-results.html#streaming-partial-result-stabilization + enable_partial_results_stabilization: + # Specify the level of stability to use when you enable partial results stabilization (enable_partial_results_stabilization: true). Valid values: high | medium | low + # See https://docs.aws.amazon.com/transcribe/latest/dg/streaming-partial-results.html#streaming-partial-result-stabilization + partial_results_stability: + # The name of the custom vocabulary filter you want to use to mask or remove words. + # See https://docs.aws.amazon.com/transcribe/latest/dg/vocabulary-filtering.html + vocab_filter_name: + # The method used to filter the vocabulary. Valid values: mask | remove | tag + # See https://docs.aws.amazon.com/transcribe/latest/dg/vocabulary-filtering.html + vocab_filter_method: + + azure: + # Credentials for Azure Speech Service. 
+ # One of these combinations must be set: + # - speech_host + # - speech_key + speech_region + # - speech_auth_token + speech_region + # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/get-started-speech-to-text?tabs=macos%2Cterminal&pivots=programming-language-python#prerequisites + speech_host: + speech_key: + speech_auth_token: + speech_region: + # Azure handles multiple languages and can auto-detect the language used. It requires the candidate set to be set. + # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt#supported-languages + languages: + # Removes profanity (swearing), or replaces letters of profane words with stars. Valid values: Masked | Removed | Raw + # See https://learn.microsoft.com/en-us/azure/ai-services/translator/profanity-filtering + profanity: + + google: + # Credentials for Google Cloud. This is the content of a Google Cloud credential JSON file. + # Below is a dummy example for a credential type of "Service Account" (https://cloud.google.com/iam/docs/service-account-creds#key-types) + credentials_info: | + { + "type": "service_account", + "project_id": "my-project", + "private_key_id": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + "private_key": "-----BEGIN PRIVATE KEY-----\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n-----END PRIVATE KEY-----\n", + "client_email": "my-email@my-project.iam.gserviceaccount.com", + "client_id": "xxxxxxxxxxxxxxxxxxxxx", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/my-email%40my-project.iam.gserviceaccount.com", + "universe_domain": "googleapis.com" + } + # Which model to use for recognition. If not set, uses the default model for the selected language. 
+ # See https://cloud.google.com/speech-to-text/docs/transcription-model + model: + # The location to use for recognition. Default is "us-central1". Latency will be best if the location is close to your users. + # Check supported languages and locations at https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages + location: + # List of language codes to recognize. Default is ["en-US"]. + # See https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages + languages: + # Whether to detect the language of the audio. Default is true. + detect_language: + # If 'true', adds punctuation to recognition result hypotheses. This feature is only available in select languages. Setting this + # for requests in other languages has no effect at all. The default 'false' value does not add punctuation to result hypotheses. + # See https://cloud.google.com/speech-to-text/docs/automatic-punctuation + punctuate: + # The spoken punctuation behavior for the call. If not set, uses default behavior based on model of choice. + # e.g. command_and_search will enable spoken punctuation by default. If 'true', replaces spoken punctuation + # with the corresponding symbols in the request. For example, "how are you question mark" becomes "how are you?". + # See https://cloud.google.com/speech-to-text/docs/spoken-punctuation for support. If 'false', spoken punctuation is not replaced. + spoken_punctuation: + # Whether to return interim (non-final) transcription results. Defaults to true. + interim_results: + + openai: + # API key for OpenAI. See https://platform.openai.com/api-keys + api_key: + # See https://platform.openai.com/docs/guides/speech-to-text + model: + # The language of the input audio. Supplying the input language in ISO-639-1 format + # (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) will improve accuracy and latency. + language: + + groq: + # API key for Groq. 
See https://console.groq.com/keys + api_key: + # See https://console.groq.com/docs/speech-to-text + model: + # The language of the input audio. Supplying the input language in ISO-639-1 format + # (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) will improve accuracy and latency. + language: + # Prompt to guide the model's style or specify how to spell unfamiliar words. 224 tokens max. + prompt: + + deepgram: + # See https://console.deepgram.com/ + api_key: + # See https://developers.deepgram.com/reference/speech-to-text-api/listen-streaming#request.query.model + model: + # See https://developers.deepgram.com/reference/speech-to-text-api/listen-streaming#request.query.language + language: + # Whether to return interim (non-final) transcription results. Defaults to true + interim_results: true + # Whether to apply smart formatting to numbers, dates, etc. Defaults to true + smart_format: true + # Whether to add punctuations to the transcription. Defaults to true. Turn detector will work better with punctuations + punctuate: true + # Whether to include filler words (um, uh, etc.) in transcription. Defaults to true + filler_words: true + # Whether to filter profanity from the transcription. Defaults to false + profanity_filter: false + # List of tuples containing keywords and their boost values for improved recognition. Each tuple should be (keyword: str, boost: float). Defaults to None. keywords does not work with Nova-3 models. Use keyterms instead. + # keywords: + # - [OpenVidu, 1.5] + # - [WebRTC, 1] + # List of key terms to improve recognition accuracy. Defaults to None. keyterms is supported by Nova-3 models. + # Commented below is an example + keyterms: + # - "OpenVidu" + # - "WebRTC" + + assemblyai: + # API key for AssemblyAI. See https://assemblyai.com/app/ + api_key: + # Whether to return formatted final transcripts. If enabled, formatted final transcripts are emitted shortly following an end-of-turn detection. 
+ format_turns: + + fal: + # API key for fal. See https://fal.ai/dashboard/keys + api_key: + # See https://fal.ai/models/fal-ai/wizper/api#schema + task: + # See https://fal.ai/models/fal-ai/wizper/api#schema + language: + # See https://fal.ai/models/fal-ai/wizper/api#schema + chunk_level: + # See https://fal.ai/models/fal-ai/wizper/api#schema + version: + + clova: + # Secret key issued when registering the app + api_key: + # API Gateway's unique invoke URL created in CLOVA Speech Domain. + # See https://guide.ncloud-docs.com/docs/en/clovaspeech-domain#create-domain + invoke_url: + # See https://api.ncloud-docs.com/docs/en/ai-application-service-clovaspeech-longsentence + language: + # Value between 0 and 1 indicating the threshold for the confidence score of the transcribed text. Default is 0.5. + # If the confidence score is lower than the threshold, the transcription event is not sent to the client. + # For a definition of the confidence score see https://api.ncloud-docs.com/docs/en/ai-application-service-clovaspeech-grpc + threshold: + + speechmatics: + # API key for Speechmatics. 
See https://portal.speechmatics.com/manage-access/ + api_key: + # See https://docs.speechmatics.com/rt-api-ref#transcription-config + language: + # See https://docs.speechmatics.com/features/accuracy-language-packs#output-locale + output_locale: + # See https://docs.speechmatics.com/features/realtime-latency#partial-transcripts + enable_partials: + # See https://docs.speechmatics.com/features/realtime-latency#configuration-example + max_delay: + # See https://docs.speechmatics.com/features/realtime-latency#configuration-example + max_delay_mode: + # See https://docs.speechmatics.com/features/punctuation-settings + # Commented below is an example of punctuation settings + punctuation_overrides: + # permitted_marks: [ ".", "," ] + # sensitivity: 0.4 + # See https://docs.speechmatics.com/features/custom-dictionary + # Commented below is an example of a custom dictionary + additional_vocab: + # - content: financial crisis + # - content: gnocchi + # sounds_like: + # - nyohki + # - nokey + # - nochi + # - content: CEO + # sounds_like: + # - C.E.O. + + gladia: + # API key for Gladia. See https://app.gladia.io/account + api_key: + # Whether to return interim (non-final) transcription results. Defaults to True + interim_results: + # List of language codes to use for recognition. Defaults to None (auto-detect). See https://docs.gladia.io/chapters/limits-and-specifications/languages + languages: + # Whether to allow switching between languages during recognition. Defaults to True + code_switching: diff --git a/pro/agent-speech-processing.yaml b/pro/agent-speech-processing.yaml new file mode 100644 index 0000000..7d4345c --- /dev/null +++ b/pro/agent-speech-processing.yaml @@ -0,0 +1,226 @@ +############################## +# Agent common configuration # +############################## + +# Docker image of the agent. 
+docker_image: docker.io/openvidu/agent-speech-processing:main + +# Whether this agent should run or not, and how it will connect to Rooms (automatic|manual|disabled) +# automatic: the agent will run and will automatically connect to new Rooms. +# manual: the agent will run and will connect to new Rooms only when your application dictates it by using the Agent Dispatch API. +# disabled: the agent will not run. +processing: disabled + +################################ +# Agent specific configuration # +################################ +speech_processing: + # Whether or not the agent should be hidden to the Participants of the Room. + hidden: true + + # Which Speech-To-Text provider to use: aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia + # The custom configuration for the selected provider must be set below + provider: + + aws: + # Credentials for AWS Transcribe. See https://docs.aws.amazon.com/transcribe/latest/dg/what-is.html + aws_access_key_id: + aws_secret_access_key: + aws_default_region: + # See https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html + language: + # The name of the custom vocabulary you want to use. + # See https://docs.aws.amazon.com/transcribe/latest/dg/custom-vocabulary.html + vocabulary_name: + # The name of the custom language model you want to use. + # See https://docs.aws.amazon.com/transcribe/latest/dg/custom-language-models-using.html + language_model_name: + # Whether or not to enable partial result stabilization. Partial result stabilization can reduce latency in your output, but may impact accuracy. + # See https://docs.aws.amazon.com/transcribe/latest/dg/streaming-partial-results.html#streaming-partial-result-stabilization + enable_partial_results_stabilization: + # Specify the level of stability to use when you enable partial results stabilization (enable_partial_results_stabilization: true). 
Valid values: high | medium | low + # See https://docs.aws.amazon.com/transcribe/latest/dg/streaming-partial-results.html#streaming-partial-result-stabilization + partial_results_stability: + # The name of the custom vocabulary filter you want to use to mask or remove words. + # See https://docs.aws.amazon.com/transcribe/latest/dg/vocabulary-filtering.html + vocab_filter_name: + # The method used to filter the vocabulary. Valid values: mask | remove | tag + # See https://docs.aws.amazon.com/transcribe/latest/dg/vocabulary-filtering.html + vocab_filter_method: + + azure: + # Credentials for Azure Speech Service. + # One of these combinations must be set: + # - speech_host + # - speech_key + speech_region + # - speech_auth_token + speech_region + # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/get-started-speech-to-text?tabs=macos%2Cterminal&pivots=programming-language-python#prerequisites + speech_host: + speech_key: + speech_auth_token: + speech_region: + # Azure handles multiple languages and can auto-detect the language used. It requires the candidate set to be set. + # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt#supported-languages + languages: + # Removes profanity (swearing), or replaces letters of profane words with stars. Valid values: Masked | Removed | Raw + # See https://learn.microsoft.com/en-us/azure/ai-services/translator/profanity-filtering + profanity: + + google: + # Credentials for Google Cloud. This is the content of a Google Cloud credential JSON file. 
+ # Below is a dummy example for a credential type of "Service Account" (https://cloud.google.com/iam/docs/service-account-creds#key-types) + credentials_info: | + { + "type": "service_account", + "project_id": "my-project", + "private_key_id": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", + "private_key": "-----BEGIN PRIVATE KEY-----\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n-----END PRIVATE KEY-----\n", + "client_email": "my-email@my-project.iam.gserviceaccount.com", + "client_id": "xxxxxxxxxxxxxxxxxxxxx", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/my-email%40my-project.iam.gserviceaccount.com", + "universe_domain": "googleapis.com" + } + # Which model to use for recognition. If not set, uses the default model for the selected language. + # See https://cloud.google.com/speech-to-text/docs/transcription-model + model: + # The location to use for recognition. Default is "us-central1". Latency will be best if the location is close to your users. + # Check supported languages and locations at https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages + location: + # List of language codes to recognize. Default is ["en-US"]. + # See https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages + languages: + # Whether to detect the language of the audio. Default is true. + detect_language: + # If 'true', adds punctuation to recognition result hypotheses. This feature is only available in select languages. Setting this + # for requests in other languages has no effect at all. The default 'false' value does not add punctuation to result hypotheses. 
+ # See https://cloud.google.com/speech-to-text/docs/automatic-punctuation + punctuate: + # The spoken punctuation behavior for the call. If not set, uses default behavior based on model of choice. + # e.g. command_and_search will enable spoken punctuation by default. If 'true', replaces spoken punctuation + # with the corresponding symbols in the request. For example, "how are you question mark" becomes "how are you?". + # See https://cloud.google.com/speech-to-text/docs/spoken-punctuation for support. If 'false', spoken punctuation is not replaced. + spoken_punctuation: + # Whether to return interim (non-final) transcription results. Defaults to true. + interim_results: + + openai: + # API key for OpenAI. See https://platform.openai.com/api-keys + api_key: + # See https://platform.openai.com/docs/guides/speech-to-text + model: + # The language of the input audio. Supplying the input language in ISO-639-1 format + # (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) will improve accuracy and latency. + language: + + groq: + # API key for Groq. See https://console.groq.com/keys + api_key: + # See https://console.groq.com/docs/speech-to-text + model: + # The language of the input audio. Supplying the input language in ISO-639-1 format + # (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) will improve accuracy and latency. + language: + # Prompt to guide the model's style or specify how to spell unfamiliar words. 224 tokens max. + prompt: + + deepgram: + # See https://console.deepgram.com/ + api_key: + # See https://developers.deepgram.com/reference/speech-to-text-api/listen-streaming#request.query.model + model: + # See https://developers.deepgram.com/reference/speech-to-text-api/listen-streaming#request.query.language + language: + # Whether to return interim (non-final) transcription results. Defaults to true + interim_results: true + # Whether to apply smart formatting to numbers, dates, etc. 
Defaults to true + smart_format: true + # Whether to add punctuations to the transcription. Defaults to true. Turn detector will work better with punctuations + punctuate: true + # Whether to include filler words (um, uh, etc.) in transcription. Defaults to true + filler_words: true + # Whether to filter profanity from the transcription. Defaults to false + profanity_filter: false + # List of tuples containing keywords and their boost values for improved recognition. Each tuple should be (keyword: str, boost: float). Defaults to None. keywords does not work with Nova-3 models. Use keyterms instead. + # keywords: + # - [OpenVidu, 1.5] + # - [WebRTC, 1] + # List of key terms to improve recognition accuracy. Defaults to None. keyterms is supported by Nova-3 models. + # Commented below is an example + keyterms: + # - "OpenVidu" + # - "WebRTC" + + assemblyai: + # API key for AssemblyAI. See https://assemblyai.com/app/ + api_key: + # Whether to return formatted final transcripts. If enabled, formatted final transcripts are emitted shortly following an end-of-turn detection. + format_turns: + + fal: + # API key for fal. See https://fal.ai/dashboard/keys + api_key: + # See https://fal.ai/models/fal-ai/wizper/api#schema + task: + # See https://fal.ai/models/fal-ai/wizper/api#schema + language: + # See https://fal.ai/models/fal-ai/wizper/api#schema + chunk_level: + # See https://fal.ai/models/fal-ai/wizper/api#schema + version: + + clova: + # Secret key issued when registering the app + api_key: + # API Gateway's unique invoke URL created in CLOVA Speech Domain. + # See https://guide.ncloud-docs.com/docs/en/clovaspeech-domain#create-domain + invoke_url: + # See https://api.ncloud-docs.com/docs/en/ai-application-service-clovaspeech-longsentence + language: + # Value between 0 and 1 indicating the threshold for the confidence score of the transcribed text. Default is 0.5. 
+ # If the confidence score is lower than the threshold, the transcription event is not sent to the client. + # For a definition of the confidence score see https://api.ncloud-docs.com/docs/en/ai-application-service-clovaspeech-grpc + threshold: + + speechmatics: + # API key for Speechmatics. See https://portal.speechmatics.com/manage-access/ + api_key: + # See https://docs.speechmatics.com/rt-api-ref#transcription-config + language: + # See https://docs.speechmatics.com/features/accuracy-language-packs#output-locale + output_locale: + # See https://docs.speechmatics.com/features/realtime-latency#partial-transcripts + enable_partials: + # See https://docs.speechmatics.com/features/realtime-latency#configuration-example + max_delay: + # See https://docs.speechmatics.com/features/realtime-latency#configuration-example + max_delay_mode: + # See https://docs.speechmatics.com/features/punctuation-settings + # Commented below is an example of punctuation settings + punctuation_overrides: + # permitted_marks: [ ".", "," ] + # sensitivity: 0.4 + # See https://docs.speechmatics.com/features/custom-dictionary + # Commented below is an example of a custom dictionary + additional_vocab: + # - content: financial crisis + # - content: gnocchi + # sounds_like: + # - nyohki + # - nokey + # - nochi + # - content: CEO + # sounds_like: + # - C.E.O. + + gladia: + # API key for Gladia. See https://app.gladia.io/account + api_key: + # Whether to return interim (non-final) transcription results. Defaults to True + interim_results: + # List of language codes to use for recognition. Defaults to None (auto-detect). See https://docs.gladia.io/chapters/limits-and-specifications/languages + languages: + # Whether to allow switching between languages during recognition. 
Defaults to True + code_switching: From 236b4779f9a8b4626a88d5f12be0d350315d4517 Mon Sep 17 00:00:00 2001 From: pabloFuente Date: Wed, 18 Jun 2025 19:16:33 +0200 Subject: [PATCH 4/8] Updated agent-speech-processing.yaml --- community/agent-speech-processing.yaml | 30 ++++++++++---------------- pro/agent-speech-processing.yaml | 30 ++++++++++---------------- 2 files changed, 22 insertions(+), 38 deletions(-) diff --git a/community/agent-speech-processing.yaml b/community/agent-speech-processing.yaml index 7d4345c..f22e621 100644 --- a/community/agent-speech-processing.yaml +++ b/community/agent-speech-processing.yaml @@ -1,24 +1,16 @@ -############################## -# Agent common configuration # -############################## - # Docker image of the agent. docker_image: docker.io/openvidu/agent-speech-processing:main -# Whether this agent should run or not, and how it will connect to Rooms (automatic|manual|disabled) -# automatic: the agent will run and will automatically connect to new Rooms. -# manual: the agent will run and will connect to new Rooms only when your application dictates it by using the Agent Dispatch API. -# disabled: the agent will not run. -processing: disabled +# Whether to run the agent or not. +enabled: false -################################ -# Agent specific configuration # -################################ -speech_processing: - # Whether or not the agent should be hidden to the Participants of the Room. - hidden: true +live_captions: + # How this agent will connect to Rooms [automatic, manual] + # - automatic: the agent will automatically connect to new Rooms. + # - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API. 
+ processing: automatic - # Which Speech-To-Text provider to use: aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia + # Which speech-to-text AI provider to use [aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia] # The custom configuration for the selected provider must be set below provider: @@ -154,10 +146,10 @@ speech_processing: # - "WebRTC" assemblyai: - # API key for AssemblyAI. See https://assemblyai.com/app/ + # API key for AssemblyAI. See https://www.assemblyai.com/dashboard/api-keys api_key: - # Whether to return formatted final transcripts. If enabled, formatted final transcripts are emitted shortly following an end-of-turn detection. - format_turns: + # Whether to return formatted final transcripts (proper punctuation, letter casing...). If enabled, formatted final transcripts are emitted shortly following an end-of-turn detection. + format_turns: true fal: # API key for fal. See https://fal.ai/dashboard/keys diff --git a/pro/agent-speech-processing.yaml b/pro/agent-speech-processing.yaml index 7d4345c..f22e621 100644 --- a/pro/agent-speech-processing.yaml +++ b/pro/agent-speech-processing.yaml @@ -1,24 +1,16 @@ -############################## -# Agent common configuration # -############################## - # Docker image of the agent. docker_image: docker.io/openvidu/agent-speech-processing:main -# Whether this agent should run or not, and how it will connect to Rooms (automatic|manual|disabled) -# automatic: the agent will run and will automatically connect to new Rooms. -# manual: the agent will run and will connect to new Rooms only when your application dictates it by using the Agent Dispatch API. -# disabled: the agent will not run. -processing: disabled +# Whether to run the agent or not. 
+enabled: false -################################ -# Agent specific configuration # -################################ -speech_processing: - # Whether or not the agent should be hidden to the Participants of the Room. - hidden: true +live_captions: + # How this agent will connect to Rooms [automatic, manual] + # - automatic: the agent will automatically connect to new Rooms. + # - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API. + processing: automatic - # Which Speech-To-Text provider to use: aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia + # Which speech-to-text AI provider to use [aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia] # The custom configuration for the selected provider must be set below provider: @@ -154,10 +146,10 @@ speech_processing: # - "WebRTC" assemblyai: - # API key for AssemblyAI. See https://assemblyai.com/app/ + # API key for AssemblyAI. See https://www.assemblyai.com/dashboard/api-keys api_key: - # Whether to return formatted final transcripts. If enabled, formatted final transcripts are emitted shortly following an end-of-turn detection. - format_turns: + # Whether to return formatted final transcripts (proper punctuation, letter casing...). If enabled, formatted final transcripts are emitted shortly following an end-of-turn detection. + format_turns: true fal: # API key for fal. 
See https://fal.ai/dashboard/keys From 32e533f892fc00c497a3e2deaf76c251e4ee63a8 Mon Sep 17 00:00:00 2001 From: pabloFuente Date: Wed, 18 Jun 2025 20:25:05 +0200 Subject: [PATCH 5/8] Added sarvam STT AI provider to agent-speech-processing.yaml --- community/agent-speech-processing.yaml | 10 +++++++++- pro/agent-speech-processing.yaml | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/community/agent-speech-processing.yaml b/community/agent-speech-processing.yaml index f22e621..b105214 100644 --- a/community/agent-speech-processing.yaml +++ b/community/agent-speech-processing.yaml @@ -10,7 +10,7 @@ live_captions: # - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API. processing: automatic - # Which speech-to-text AI provider to use [aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia] + # Which speech-to-text AI provider to use [aws, azure, google, openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam] # The custom configuration for the selected provider must be set below provider: @@ -216,3 +216,11 @@ live_captions: languages: # Whether to allow switching between languages during recognition. Defaults to True code_switching: + + sarvam: + # API key for Sarvam. See https://dashboard.sarvam.ai/key-management + api_key: + # BCP-47 language code for supported Indian languages. See https://docs.sarvam.ai/api-reference-docs/speech-to-text/transcribe#request.body.language_code.language_code + language: + # The Sarvam STT model to use. 
See https://docs.sarvam.ai/api-reference-docs/speech-to-text/transcribe#request.body.model.model + model: diff --git a/pro/agent-speech-processing.yaml b/pro/agent-speech-processing.yaml index f22e621..b105214 100644 --- a/pro/agent-speech-processing.yaml +++ b/pro/agent-speech-processing.yaml @@ -10,7 +10,7 @@ live_captions: # - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API. processing: automatic - # Which speech-to-text AI provider to use [aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia] + # Which speech-to-text AI provider to use [aws, azure, google, openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam] # The custom configuration for the selected provider must be set below provider: @@ -216,3 +216,11 @@ live_captions: languages: # Whether to allow switching between languages during recognition. Defaults to True code_switching: + + sarvam: + # API key for Sarvam. See https://dashboard.sarvam.ai/key-management + api_key: + # BCP-47 language code for supported Indian languages. See https://docs.sarvam.ai/api-reference-docs/speech-to-text/transcribe#request.body.language_code.language_code + language: + # The Sarvam STT model to use. See https://docs.sarvam.ai/api-reference-docs/speech-to-text/transcribe#request.body.model.model + model: From c692d9b86df3dba51589b7f9a1535a5f0afc3d7d Mon Sep 17 00:00:00 2001 From: pabloFuente Date: Thu, 19 Jun 2025 13:38:18 +0200 Subject: [PATCH 6/8] Added azure_openai to agent-speech-processing.yaml. 
Fixed other providers --- community/agent-speech-processing.yaml | 53 +++++++++++++++++++------- pro/agent-speech-processing.yaml | 53 +++++++++++++++++++------- 2 files changed, 80 insertions(+), 26 deletions(-) diff --git a/community/agent-speech-processing.yaml b/community/agent-speech-processing.yaml index b105214..2e86075 100644 --- a/community/agent-speech-processing.yaml +++ b/community/agent-speech-processing.yaml @@ -58,6 +58,29 @@ live_captions: # See https://learn.microsoft.com/en-us/azure/ai-services/translator/profanity-filtering profanity: + azure_openai: + # Credentials for Azure OpenAI APIs. See https://learn.microsoft.com/en-us/azure/api-management/api-management-authenticate-authorize-azure-openai + # Azure OpenAI API key. Mandatory value. + azure_api_key: + # Azure Active Directory token. Mandatory value. + azure_ad_token: + # Azure OpenAI endpoint in the following format: https://{your-resource-name}.openai.azure.com. Mandatory value. + azure_endpoint: + # Name of your model deployment. If given with `azure_endpoint`, sets the base client URL to include `/deployments/{azure_deployment}`. + azure_deployment: + # OpenAI REST API version used for the request. Mandatory value. + api_version: + # OpenAI organization ID. + organization: + # OpenAI project ID. + project: + # The language code to use for transcription (e.g., "en" for English). + language: + # ID of the model to use for speech-to-text. + model: + # Initial prompt to guide the transcription. + prompt: + google: # Credentials for Google Cloud. This is the content of a Google Cloud credential JSON file. # Below is a dummy example for a credential type of "Service Account" (https://cloud.google.com/iam/docs/service-account-creds#key-types) @@ -155,13 +178,7 @@ live_captions: # API key for fal. 
See https://fal.ai/dashboard/keys api_key: # See https://fal.ai/models/fal-ai/wizper/api#schema - task: - # See https://fal.ai/models/fal-ai/wizper/api#schema language: - # See https://fal.ai/models/fal-ai/wizper/api#schema - chunk_level: - # See https://fal.ai/models/fal-ai/wizper/api#schema - version: clova: # Secret key issued when registering the app @@ -179,18 +196,28 @@ live_captions: speechmatics: # API key for Speechmatics. See https://portal.speechmatics.com/manage-access/ api_key: - # See https://docs.speechmatics.com/rt-api-ref#transcription-config + # ISO 639-1 language code. All languages are global and can understand different dialects/accents. To see the list of all supported languages, see https://docs.speechmatics.com/introduction/supported-languages language: - # See https://docs.speechmatics.com/features/accuracy-language-packs#output-locale - output_locale: - # See https://docs.speechmatics.com/features/realtime-latency#partial-transcripts + # Operating point to use for the transcription per required accuracy & complexity. To learn more, see https://docs.speechmatics.com/features/accuracy-language-packs#accuracy + operating_point: + # Partial transcripts allow you to receive preliminary transcriptions and update as more context is available until the higher-accuracy final transcript is returned. Partials are returned faster but without any post-processing such as formatting. See https://docs.speechmatics.com/features/realtime-latency#partial-transcripts enable_partials: - # See https://docs.speechmatics.com/features/realtime-latency#configuration-example + # RFC-5646 language code to make spelling rules more consistent in the transcription output. See https://docs.speechmatics.com/features/word-tagging#output-locale + output_locale: + # The delay in seconds between the end of a spoken word and returning the final transcript results. 
See https://docs.speechmatics.com/features/realtime-latency#configuration-example max_delay: # See https://docs.speechmatics.com/features/realtime-latency#configuration-example max_delay_mode: - # See https://docs.speechmatics.com/features/punctuation-settings - # Commented below is an example of punctuation settings + # Configuration for speaker diarization. See https://docs.speechmatics.com/features/diarization + speaker_diarization_config: + # See https://docs.speechmatics.com/features/diarization#max-speakers + max_speakers: + # See https://docs.speechmatics.com/features/diarization#speaker-sensitivity + speaker_sensitivity: + # See https://docs.speechmatics.com/features/diarization#prefer-current-speaker + prefer_current_speaker: + # Permitted punctuation marks for advanced punctuation. See https://docs.speechmatics.com/features/punctuation-settings + # Commented is an example of punctuation settings punctuation_overrides: # permitted_marks: [ ".", "," ] # sensitivity: 0.4 diff --git a/pro/agent-speech-processing.yaml b/pro/agent-speech-processing.yaml index b105214..2e86075 100644 --- a/pro/agent-speech-processing.yaml +++ b/pro/agent-speech-processing.yaml @@ -58,6 +58,29 @@ live_captions: # See https://learn.microsoft.com/en-us/azure/ai-services/translator/profanity-filtering profanity: + azure_openai: + # Credentials for Azure OpenAI APIs. See https://learn.microsoft.com/en-us/azure/api-management/api-management-authenticate-authorize-azure-openai + # Azure OpenAI API key. Mandatory value. + azure_api_key: + # Azure Active Directory token. Mandatory value. + azure_ad_token: + # Azure OpenAI endpoint in the following format: https://{your-resource-name}.openai.azure.com. Mandatory value. + azure_endpoint: + # Name of your model deployment. If given with `azure_endpoint`, sets the base client URL to include `/deployments/{azure_deployment}`. + azure_deployment: + # OpenAI REST API version used for the request. Mandatory value. 
+ api_version: + # OpenAI organization ID. + organization: + # OpenAI project ID. + project: + # The language code to use for transcription (e.g., "en" for English). + language: + # ID of the model to use for speech-to-text. + model: + # Initial prompt to guide the transcription. + prompt: + google: # Credentials for Google Cloud. This is the content of a Google Cloud credential JSON file. # Below is a dummy example for a credential type of "Service Account" (https://cloud.google.com/iam/docs/service-account-creds#key-types) @@ -155,13 +178,7 @@ live_captions: # API key for fal. See https://fal.ai/dashboard/keys api_key: # See https://fal.ai/models/fal-ai/wizper/api#schema - task: - # See https://fal.ai/models/fal-ai/wizper/api#schema language: - # See https://fal.ai/models/fal-ai/wizper/api#schema - chunk_level: - # See https://fal.ai/models/fal-ai/wizper/api#schema - version: clova: # Secret key issued when registering the app @@ -179,18 +196,28 @@ live_captions: speechmatics: # API key for Speechmatics. See https://portal.speechmatics.com/manage-access/ api_key: - # See https://docs.speechmatics.com/rt-api-ref#transcription-config + # ISO 639-1 language code. All languages are global and can understand different dialects/accents. To see the list of all supported languages, see https://docs.speechmatics.com/introduction/supported-languages language: - # See https://docs.speechmatics.com/features/accuracy-language-packs#output-locale - output_locale: - # See https://docs.speechmatics.com/features/realtime-latency#partial-transcripts + # Operating point to use for the transcription per required accuracy & complexity. To learn more, see https://docs.speechmatics.com/features/accuracy-language-packs#accuracy + operating_point: + # Partial transcripts allow you to receive preliminary transcriptions and update as more context is available until the higher-accuracy final transcript is returned. 
Partials are returned faster but without any post-processing such as formatting. See https://docs.speechmatics.com/features/realtime-latency#partial-transcripts enable_partials: - # See https://docs.speechmatics.com/features/realtime-latency#configuration-example + # RFC-5646 language code to make spelling rules more consistent in the transcription output. See https://docs.speechmatics.com/features/word-tagging#output-locale + output_locale: + # The delay in seconds between the end of a spoken word and returning the final transcript results. See https://docs.speechmatics.com/features/realtime-latency#configuration-example max_delay: # See https://docs.speechmatics.com/features/realtime-latency#configuration-example max_delay_mode: - # See https://docs.speechmatics.com/features/punctuation-settings - # Commented below is an example of punctuation settings + # Configuration for speaker diarization. See https://docs.speechmatics.com/features/diarization + speaker_diarization_config: + # See https://docs.speechmatics.com/features/diarization#max-speakers + max_speakers: + # See https://docs.speechmatics.com/features/diarization#speaker-sensitivity + speaker_sensitivity: + # See https://docs.speechmatics.com/features/diarization#prefer-current-speaker + prefer_current_speaker: + # Permitted punctuation marks for advanced punctuation. 
See https://docs.speechmatics.com/features/punctuation-settings + # Commented is an example of punctuation settings punctuation_overrides: # permitted_marks: [ ".", "," ] # sensitivity: 0.4 From 8a268d8e658f3caf39a62bac4084db1a155afdaa Mon Sep 17 00:00:00 2001 From: pabloFuente Date: Tue, 24 Jun 2025 19:12:24 +0200 Subject: [PATCH 7/8] Updated agent-speech-processing.yaml --- community/agent-speech-processing.yaml | 26 ++++++++++++++++---------- pro/agent-speech-processing.yaml | 26 ++++++++++++++++---------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/community/agent-speech-processing.yaml b/community/agent-speech-processing.yaml index 2e86075..756390f 100644 --- a/community/agent-speech-processing.yaml +++ b/community/agent-speech-processing.yaml @@ -51,18 +51,18 @@ live_captions: speech_key: speech_auth_token: speech_region: - # Azure handles multiple languages and can auto-detect the language used. It requires the candidate set to be set. + # Azure handles multiple languages and can auto-detect the language used. It requires the candidate set to be set. E.g. ["en-US", "es-ES"] # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt#supported-languages - languages: + language: # Removes profanity (swearing), or replaces letters of profane words with stars. Valid values: Masked | Removed | Raw # See https://learn.microsoft.com/en-us/azure/ai-services/translator/profanity-filtering profanity: azure_openai: # Credentials for Azure OpenAI APIs. See https://learn.microsoft.com/en-us/azure/api-management/api-management-authenticate-authorize-azure-openai - # Azure OpenAI API key. Mandatory value. + # Azure OpenAI API key azure_api_key: - # Azure Active Directory token. Mandatory value. + # Azure Active Directory token azure_ad_token: # Azure OpenAI endpoint in the following format: https://{your-resource-name}.openai.azure.com. Mandatory value. 
azure_endpoint: @@ -124,11 +124,13 @@ live_captions: openai: # API key for OpenAI. See https://platform.openai.com/api-keys api_key: - # See https://platform.openai.com/docs/guides/speech-to-text + # The OpenAI model to use for transcription. See https://platform.openai.com/docs/guides/speech-to-text model: # The language of the input audio. Supplying the input language in ISO-639-1 format # (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) will improve accuracy and latency. language: + # Optional text prompt to guide the transcription. Only supported for whisper-1. + prompt: groq: # API key for Groq. See https://console.groq.com/keys @@ -148,15 +150,19 @@ live_captions: model: # See https://developers.deepgram.com/reference/speech-to-text-api/listen-streaming#request.query.language language: - # Whether to return interim (non-final) transcription results. Defaults to true + # Whether to enable automatic language detection. Defaults to false. See https://developers.deepgram.com/docs/language-detection + detect_language: false + # Whether to return interim (non-final) transcription results. Defaults to true. See https://developers.deepgram.com/docs/interim-results interim_results: true - # Whether to apply smart formatting to numbers, dates, etc. Defaults to true + # Whether to apply smart formatting to numbers, dates, etc. Defaults to true. See https://developers.deepgram.com/docs/smart-format smart_format: true - # Whether to add punctuations to the transcription. Defaults to true. Turn detector will work better with punctuations + # When smart_format is used, ensures it does not wait for sequence to be complete before returning results. Defaults to true. See https://developers.deepgram.com/docs/smart-format#using-no-delay + no_delay: true + # Whether to add punctuations to the transcription. Defaults to true. Turn detector will work better with punctuations. 
See https://developers.deepgram.com/docs/punctuation punctuate: true - # Whether to include filler words (um, uh, etc.) in transcription. Defaults to true + # Whether to include filler words (um, uh, etc.) in transcription. Defaults to true. See https://developers.deepgram.com/docs/filler-words filler_words: true - # Whether to filter profanity from the transcription. Defaults to false + # Whether to filter profanity from the transcription. Defaults to false. See https://developers.deepgram.com/docs/profanity-filter profanity_filter: false # List of tuples containing keywords and their boost values for improved recognition. Each tuple should be (keyword: str, boost: float). Defaults to None. keywords does not work with Nova-3 models. Use keyterms instead. # keywords: diff --git a/pro/agent-speech-processing.yaml b/pro/agent-speech-processing.yaml index 2e86075..756390f 100644 --- a/pro/agent-speech-processing.yaml +++ b/pro/agent-speech-processing.yaml @@ -51,18 +51,18 @@ live_captions: speech_key: speech_auth_token: speech_region: - # Azure handles multiple languages and can auto-detect the language used. It requires the candidate set to be set. + # Azure handles multiple languages and can auto-detect the language used. It requires the candidate set to be set. E.g. ["en-US", "es-ES"] # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt#supported-languages - languages: + language: # Removes profanity (swearing), or replaces letters of profane words with stars. Valid values: Masked | Removed | Raw # See https://learn.microsoft.com/en-us/azure/ai-services/translator/profanity-filtering profanity: azure_openai: # Credentials for Azure OpenAI APIs. See https://learn.microsoft.com/en-us/azure/api-management/api-management-authenticate-authorize-azure-openai - # Azure OpenAI API key. Mandatory value. + # Azure OpenAI API key azure_api_key: - # Azure Active Directory token. Mandatory value. 
+ # Azure Active Directory token azure_ad_token: # Azure OpenAI endpoint in the following format: https://{your-resource-name}.openai.azure.com. Mandatory value. azure_endpoint: @@ -124,11 +124,13 @@ live_captions: openai: # API key for OpenAI. See https://platform.openai.com/api-keys api_key: - # See https://platform.openai.com/docs/guides/speech-to-text + # The OpenAI model to use for transcription. See https://platform.openai.com/docs/guides/speech-to-text model: # The language of the input audio. Supplying the input language in ISO-639-1 format # (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) will improve accuracy and latency. language: + # Optional text prompt to guide the transcription. Only supported for whisper-1. + prompt: groq: # API key for Groq. See https://console.groq.com/keys @@ -148,15 +150,19 @@ live_captions: model: # See https://developers.deepgram.com/reference/speech-to-text-api/listen-streaming#request.query.language language: - # Whether to return interim (non-final) transcription results. Defaults to true + # Whether to enable automatic language detection. Defaults to false. See https://developers.deepgram.com/docs/language-detection + detect_language: false + # Whether to return interim (non-final) transcription results. Defaults to true. See https://developers.deepgram.com/docs/interim-results interim_results: true - # Whether to apply smart formatting to numbers, dates, etc. Defaults to true + # Whether to apply smart formatting to numbers, dates, etc. Defaults to true. See https://developers.deepgram.com/docs/smart-format smart_format: true - # Whether to add punctuations to the transcription. Defaults to true. Turn detector will work better with punctuations + # When smart_format is used, ensures it does not wait for sequence to be complete before returning results. Defaults to true. See https://developers.deepgram.com/docs/smart-format#using-no-delay + no_delay: true + # Whether to add punctuations to the transcription. 
Defaults to true. Turn detector will work better with punctuations. See https://developers.deepgram.com/docs/punctuation punctuate: true - # Whether to include filler words (um, uh, etc.) in transcription. Defaults to true + # Whether to include filler words (um, uh, etc.) in transcription. Defaults to true. See https://developers.deepgram.com/docs/filler-words filler_words: true - # Whether to filter profanity from the transcription. Defaults to false + # Whether to filter profanity from the transcription. Defaults to false. See https://developers.deepgram.com/docs/profanity-filter profanity_filter: false # List of tuples containing keywords and their boost values for improved recognition. Each tuple should be (keyword: str, boost: float). Defaults to None. keywords does not work with Nova-3 models. Use keyterms instead. # keywords: From 4bf87d6485edc518420ac55684c83a58328b3f1e Mon Sep 17 00:00:00 2001 From: cruizba Date: Thu, 26 Jun 2025 22:20:38 +0200 Subject: [PATCH 8/8] Bump to version 3.3.0 --- community/agent-speech-processing.yaml | 2 +- community/docker-compose.yaml | 12 ++++++------ pro/agent-speech-processing.yaml | 2 +- pro/docker-compose.yaml | 14 +++++++------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/community/agent-speech-processing.yaml b/community/agent-speech-processing.yaml index 756390f..dc51bd8 100644 --- a/community/agent-speech-processing.yaml +++ b/community/agent-speech-processing.yaml @@ -1,5 +1,5 @@ # Docker image of the agent. -docker_image: docker.io/openvidu/agent-speech-processing:main +docker_image: docker.io/openvidu/agent-speech-processing:3.3.0 # Whether to run the agent or not. 
enabled: false diff --git a/community/docker-compose.yaml b/community/docker-compose.yaml index ce6bfd1..84a77e4 100644 --- a/community/docker-compose.yaml +++ b/community/docker-compose.yaml @@ -1,6 +1,6 @@ services: caddy-proxy: - image: docker.io/openvidu/openvidu-caddy-local:main + image: docker.io/openvidu/openvidu-caddy-local:3.3.0 platform: linux/amd64 container_name: caddy-proxy restart: unless-stopped @@ -87,7 +87,7 @@ services: condition: service_completed_successfully dashboard: - image: docker.io/openvidu/openvidu-dashboard:main + image: docker.io/openvidu/openvidu-dashboard:3.3.0 platform: linux/amd64 container_name: dashboard restart: unless-stopped @@ -101,7 +101,7 @@ services: condition: service_completed_successfully openvidu: - image: docker.io/openvidu/openvidu-server:main + image: docker.io/openvidu/openvidu-server:3.3.0 platform: linux/amd64 restart: unless-stopped container_name: openvidu @@ -123,7 +123,7 @@ services: condition: service_completed_successfully ingress: - image: docker.io/openvidu/ingress:main + image: docker.io/openvidu/ingress:3.3.0 platform: linux/amd64 container_name: ingress restart: unless-stopped @@ -158,7 +158,7 @@ services: condition: service_completed_successfully default-app: - image: docker.io/openvidu/openvidu-call:main-demo + image: docker.io/openvidu/openvidu-call:3.3.0-demo platform: linux/amd64 container_name: openvidu-call restart: on-failure @@ -194,7 +194,7 @@ services: condition: service_completed_successfully operator: - image: docker.io/openvidu/openvidu-operator:main + image: docker.io/openvidu/openvidu-operator:3.3.0 platform: linux/amd64 container_name: operator restart: unless-stopped diff --git a/pro/agent-speech-processing.yaml b/pro/agent-speech-processing.yaml index 756390f..dc51bd8 100644 --- a/pro/agent-speech-processing.yaml +++ b/pro/agent-speech-processing.yaml @@ -1,5 +1,5 @@ # Docker image of the agent. 
-docker_image: docker.io/openvidu/agent-speech-processing:main +docker_image: docker.io/openvidu/agent-speech-processing:3.3.0 # Whether to run the agent or not. enabled: false diff --git a/pro/docker-compose.yaml b/pro/docker-compose.yaml index fb4a076..274f8e9 100644 --- a/pro/docker-compose.yaml +++ b/pro/docker-compose.yaml @@ -1,6 +1,6 @@ services: caddy-proxy: - image: docker.io/openvidu/openvidu-caddy-local:main + image: docker.io/openvidu/openvidu-caddy-local:3.3.0 platform: linux/amd64 container_name: caddy-proxy restart: unless-stopped @@ -87,7 +87,7 @@ services: condition: service_completed_successfully dashboard: - image: docker.io/openvidu/openvidu-dashboard:main + image: docker.io/openvidu/openvidu-dashboard:3.3.0 platform: linux/amd64 container_name: dashboard restart: unless-stopped @@ -101,7 +101,7 @@ services: condition: service_completed_successfully openvidu: - image: docker.io/openvidu/openvidu-server-pro:main + image: docker.io/openvidu/openvidu-server-pro:3.3.0 platform: linux/amd64 restart: unless-stopped container_name: openvidu @@ -125,7 +125,7 @@ services: condition: service_completed_successfully ingress: - image: docker.io/openvidu/ingress:main + image: docker.io/openvidu/ingress:3.3.0 platform: linux/amd64 container_name: ingress restart: unless-stopped @@ -160,7 +160,7 @@ services: condition: service_completed_successfully default-app: - image: docker.io/openvidu/openvidu-call:main-demo + image: docker.io/openvidu/openvidu-call:3.3.0-demo platform: linux/amd64 container_name: openvidu-call restart: on-failure @@ -196,7 +196,7 @@ services: condition: service_completed_successfully openvidu-v2compatibility: - image: docker.io/openvidu/openvidu-v2compatibility:main + image: docker.io/openvidu/openvidu-v2compatibility:3.3.0 platform: linux/amd64 restart: unless-stopped container_name: openvidu-v2compatibility @@ -267,7 +267,7 @@ services: command: /bin/sh /scripts/ready-check.sh operator: - image: docker.io/openvidu/openvidu-operator:main 
+ image: docker.io/openvidu/openvidu-operator:3.3.0 platform: linux/amd64 container_name: operator restart: unless-stopped