From 7970659f69a4b800386ff2c71578ab59f92ff112 Mon Sep 17 00:00:00 2001
From: cruizba <carlos.ruizbal@gmail.com>
Date: Wed, 4 Jun 2025 17:21:57 +0200
Subject: [PATCH 1/8] Revert "Bump to version 3.2.0"

This reverts commit 9edcb4f442fec3b1ec83d65ebc19b10d65138f31.
---
 community/docker-compose.yaml | 10 +++++-----
 pro/docker-compose.yaml       | 12 ++++++------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/community/docker-compose.yaml b/community/docker-compose.yaml
index 5d79a55..8e6cdb5 100644
--- a/community/docker-compose.yaml
+++ b/community/docker-compose.yaml
@@ -1,6 +1,6 @@
 services:
   caddy-proxy:
-    image: docker.io/openvidu/openvidu-caddy-local:3.2.0
+    image: docker.io/openvidu/openvidu-caddy-local:main
     platform: linux/amd64
     container_name: caddy-proxy
     restart: unless-stopped
@@ -87,7 +87,7 @@ services:
         condition: service_completed_successfully
 
   dashboard:
-    image: docker.io/openvidu/openvidu-dashboard:3.2.0
+    image: docker.io/openvidu/openvidu-dashboard:main
     platform: linux/amd64
     container_name: dashboard
     restart: unless-stopped
@@ -101,7 +101,7 @@ services:
         condition: service_completed_successfully
 
   openvidu:
-    image: docker.io/openvidu/openvidu-server:3.2.0
+    image: docker.io/openvidu/openvidu-server:main
     platform: linux/amd64
     restart: unless-stopped
     container_name: openvidu
@@ -123,7 +123,7 @@ services:
         condition: service_completed_successfully
 
   ingress:
-    image: docker.io/openvidu/ingress:3.2.0
+    image: docker.io/openvidu/ingress:main
     platform: linux/amd64
     container_name: ingress
     restart: unless-stopped
@@ -158,7 +158,7 @@ services:
         condition: service_completed_successfully
 
   default-app:
-    image: docker.io/openvidu/openvidu-call:3.2.0-demo
+    image: docker.io/openvidu/openvidu-call:main-demo
     platform: linux/amd64
     container_name: openvidu-call
     restart: on-failure
diff --git a/pro/docker-compose.yaml b/pro/docker-compose.yaml
index 23a39c1..2e5d062 100644
--- a/pro/docker-compose.yaml
+++ b/pro/docker-compose.yaml
@@ -1,6 +1,6 @@
 services:
   caddy-proxy:
-    image: docker.io/openvidu/openvidu-caddy-local:3.2.0
+    image: docker.io/openvidu/openvidu-caddy-local:main
     platform: linux/amd64
     container_name: caddy-proxy
     restart: unless-stopped
@@ -87,7 +87,7 @@ services:
         condition: service_completed_successfully
 
   dashboard:
-    image: docker.io/openvidu/openvidu-dashboard:3.2.0
+    image: docker.io/openvidu/openvidu-dashboard:main
     platform: linux/amd64
     container_name: dashboard
     restart: unless-stopped
@@ -101,7 +101,7 @@ services:
         condition: service_completed_successfully
 
   openvidu:
-    image: docker.io/openvidu/openvidu-server-pro:3.2.0
+    image: docker.io/openvidu/openvidu-server-pro:main
     platform: linux/amd64
     restart: unless-stopped
     container_name: openvidu
@@ -125,7 +125,7 @@ services:
         condition: service_completed_successfully
 
   ingress:
-    image: docker.io/openvidu/ingress:3.2.0
+    image: docker.io/openvidu/ingress:main
     platform: linux/amd64
     container_name: ingress
     restart: unless-stopped
@@ -160,7 +160,7 @@ services:
         condition: service_completed_successfully
 
   default-app:
-    image: docker.io/openvidu/openvidu-call:3.2.0-demo
+    image: docker.io/openvidu/openvidu-call:main-demo
     platform: linux/amd64
     container_name: openvidu-call
     restart: on-failure
@@ -196,7 +196,7 @@ services:
         condition: service_completed_successfully
 
   openvidu-v2compatibility:
-    image: docker.io/openvidu/openvidu-v2compatibility:3.2.0
+    image: docker.io/openvidu/openvidu-v2compatibility:main
     platform: linux/amd64
     restart: unless-stopped
     container_name: openvidu-v2compatibility

From f16c0a8a647a9be340034cb6b73f26ef753fa5bb Mon Sep 17 00:00:00 2001
From: cruizba <carlos.ruizbal@gmail.com>
Date: Tue, 10 Jun 2025 11:01:21 +0200
Subject: [PATCH 2/8] Add operator service to docker-compose for agent
 management

---
 community/docker-compose.yaml | 25 +++++++++++++++++++++++++
 pro/docker-compose.yaml       | 24 ++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/community/docker-compose.yaml b/community/docker-compose.yaml
index 8e6cdb5..ce6bfd1 100644
--- a/community/docker-compose.yaml
+++ b/community/docker-compose.yaml
@@ -193,6 +193,31 @@ services:
       setup:
         condition: service_completed_successfully
 
+  operator:
+    image: docker.io/openvidu/openvidu-operator:main
+    platform: linux/amd64
+    container_name: operator
+    restart: unless-stopped
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+      - agents-config:/agents-config
+      - ./:/deployment
+    environment:
+      - PLATFORM=linux/amd64
+      - MODE=agent-manager-local
+      - DEPLOYMENT_FILES_DIR=/deployment
+      - AGENTS_CONFIG_DIR=/agents-config
+      - NETWORK_NAME=openvidu-community
+      - AGENTS_CONFIG_VOLUME=openvidu-agents-config
+      - LIVEKIT_URL=ws://openvidu:7880/
+      - LIVEKIT_API_KEY=${LIVEKIT_API_KEY:-}
+      - LIVEKIT_API_SECRET=${LIVEKIT_API_SECRET:-}
+      - REDIS_ADDRESS=redis:6379
+      - REDIS_PASSWORD=${REDIS_PASSWORD:-}
+    depends_on:
+      setup:
+        condition: service_completed_successfully
+
   ready-check:
     image: docker.io/curlimages/curl:8.13.0
     platform: linux/amd64
diff --git a/pro/docker-compose.yaml b/pro/docker-compose.yaml
index 2e5d062..fb4a076 100644
--- a/pro/docker-compose.yaml
+++ b/pro/docker-compose.yaml
@@ -266,6 +266,30 @@ services:
       - ./scripts/utils.sh:/scripts/utils.sh
     command: /bin/sh /scripts/ready-check.sh
 
+  operator:
+    image: docker.io/openvidu/openvidu-operator:main
+    platform: linux/amd64
+    container_name: operator
+    restart: unless-stopped
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+      - agents-config:/agents-config
+      - ./:/deployment
+    environment:
+      - PLATFORM=linux/amd64
+      - MODE=agent-manager-local
+      - DEPLOYMENT_FILES_DIR=/deployment
+      - AGENTS_CONFIG_DIR=/agents-config
+      - NETWORK_NAME=openvidu-pro
+      - AGENTS_CONFIG_VOLUME=openvidu-pro-agents-config
+      - LIVEKIT_URL=ws://openvidu:7880/
+      - LIVEKIT_API_KEY=${LIVEKIT_API_KEY:-}
+      - LIVEKIT_API_SECRET=${LIVEKIT_API_SECRET:-}
+      - REDIS_ADDRESS=redis:6379
+      - REDIS_PASSWORD=${REDIS_PASSWORD:-}
+    depends_on:
+      setup:
+        condition: service_completed_successfully
   setup:
     image: docker.io/busybox:1.37.0
     platform: linux/amd64

From b88e1420fda39af8b1550680ec261ace4758aeac Mon Sep 17 00:00:00 2001
From: pabloFuente <pablofuenteperez@gmail.com>
Date: Mon, 16 Jun 2025 18:37:02 +0200
Subject: [PATCH 3/8] Added agent-speech-processing.yaml to community and pro
 local deployments

---
 community/agent-speech-processing.yaml | 226 +++++++++++++++++++++++++
 pro/agent-speech-processing.yaml       | 226 +++++++++++++++++++++++++
 2 files changed, 452 insertions(+)
 create mode 100644 community/agent-speech-processing.yaml
 create mode 100644 pro/agent-speech-processing.yaml

diff --git a/community/agent-speech-processing.yaml b/community/agent-speech-processing.yaml
new file mode 100644
index 0000000..7d4345c
--- /dev/null
+++ b/community/agent-speech-processing.yaml
@@ -0,0 +1,226 @@
+##############################
+# Agent common configuration #
+##############################
+
+# Docker image of the agent.
+docker_image: docker.io/openvidu/agent-speech-processing:main
+
+# Whether this agent should run or not, and how it will connect to Rooms (automatic|manual|disabled)
+# automatic: the agent will run and will automatically connect to new Rooms.
+# manual: the agent will run and will connect to new Rooms only when your application dictates it by using the Agent Dispatch API.
+# disabled: the agent will not run.
+processing: disabled
+
+################################
+# Agent specific configuration #
+################################
+speech_processing:
+  # Whether or not the agent should be hidden to the Participants of the Room.
+  hidden: true
+
+  # Which Speech-To-Text provider to use: aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia
+  # The custom configuration for the selected provider must be set below
+  provider:
+
+  aws:
+    # Credentials for AWS Transcribe. See https://docs.aws.amazon.com/transcribe/latest/dg/what-is.html
+    aws_access_key_id:
+    aws_secret_access_key:
+    aws_default_region:
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html
+    language:
+    # The name of the custom vocabulary you want to use.
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/custom-vocabulary.html
+    vocabulary_name:
+    # The name of the custom language model you want to use.
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/custom-language-models-using.html
+    language_model_name:
+    # Whether or not to enable partial result stabilization. Partial result stabilization can reduce latency in your output, but may impact accuracy.
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/streaming-partial-results.html#streaming-partial-result-stabilization
+    enable_partial_results_stabilization:
+    # Specify the level of stability to use when you enable partial results stabilization (enable_partial_results_stabilization: true). Valid values: high | medium | low
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/streaming-partial-results.html#streaming-partial-result-stabilization
+    partial_results_stability:
+    # The name of the custom vocabulary filter you want to use to mask or remove words.
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/vocabulary-filtering.html
+    vocab_filter_name:
+    # The method used to filter the vocabulary. Valid values: mask | remove | tag
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/vocabulary-filtering.html
+    vocab_filter_method:
+
+  azure:
+    # Credentials for Azure Speech Service.
+    # One of these combinations must be set:
+    #  - speech_host
+    #  - speech_key + speech_region
+    #  - speech_auth_token + speech_region
+    # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/get-started-speech-to-text?tabs=macos%2Cterminal&pivots=programming-language-python#prerequisites
+    speech_host:
+    speech_key:
+    speech_auth_token:
+    speech_region:
+    # Azure handles multiple languages and can auto-detect the language used. It requires the candidate set to be set.
+    # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt#supported-languages
+    languages:
+    # Removes profanity (swearing), or replaces letters of profane words with stars. Valid values: Masked | Removed | Raw
+    # See https://learn.microsoft.com/en-us/azure/ai-services/translator/profanity-filtering
+    profanity:
+
+  google:
+    # Credentials for Google Cloud. This is the content of a Google Cloud credential JSON file.
+    # Below is a dummy example for a credential type of "Service Account" (https://cloud.google.com/iam/docs/service-account-creds#key-types)
+    credentials_info: |
+      {
+        "type": "service_account",
+        "project_id": "my-project",
+        "private_key_id": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
+        "private_key": "-----BEGIN PRIVATE KEY-----\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n-----END PRIVATE KEY-----\n",
+        "client_email": "my-email@my-project.iam.gserviceaccount.com",
+        "client_id": "xxxxxxxxxxxxxxxxxxxxx",
+        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+        "token_uri": "https://oauth2.googleapis.com/token",
+        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+        "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/my-email%40my-project.iam.gserviceaccount.com",
+        "universe_domain": "googleapis.com"
+      }
+    # Which model to use for recognition. If not set, uses the default model for the selected language.
+    # See https://cloud.google.com/speech-to-text/docs/transcription-model
+    model:
+    # The location to use for recognition. Default is "us-central1". Latency will be best if the location is close to your users.
+    # Check supported languages and locations at https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
+    location:
+    # List of language codes to recognize. Default is ["en-US"].
+    # See https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
+    languages:
+    # Whether to detect the language of the audio. Default is true.
+    detect_language:
+    # If 'true', adds punctuation to recognition result hypotheses. This feature is only available in select languages. Setting this
+    # for requests in other languages has no effect at all. The default 'false' value does not add punctuation to result hypotheses.
+    # See https://cloud.google.com/speech-to-text/docs/automatic-punctuation
+    punctuate:
+    # The spoken punctuation behavior for the call. If not set, uses default behavior based on model of choice.
+    # e.g. command_and_search will enable spoken punctuation by default. If 'true', replaces spoken punctuation
+    # with the corresponding symbols in the request. For example, "how are you question mark" becomes "how are you?".
+    # See https://cloud.google.com/speech-to-text/docs/spoken-punctuation for support. If 'false', spoken punctuation is not replaced.
+    spoken_punctuation:
+    # Whether to return interim (non-final) transcription results. Defaults to true.
+    interim_results:
+
+  openai:
+    # API key for OpenAI. See https://platform.openai.com/api-keys
+    api_key:
+    # See https://platform.openai.com/docs/guides/speech-to-text
+    model:
+    # The language of the input audio. Supplying the input language in ISO-639-1 format
+    # (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) will improve accuracy and latency.
+    language:
+
+  groq:
+    # API key for Groq. See https://console.groq.com/keys
+    api_key:
+    # See https://console.groq.com/docs/speech-to-text
+    model:
+    # The language of the input audio. Supplying the input language in ISO-639-1 format
+    # (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) will improve accuracy and latency.
+    language:
+    # Prompt to guide the model's style or specify how to spell unfamiliar words. 224 tokens max.
+    prompt:
+
+  deepgram:
+    # See https://console.deepgram.com/
+    api_key:
+    # See https://developers.deepgram.com/reference/speech-to-text-api/listen-streaming#request.query.model
+    model:
+    # See https://developers.deepgram.com/reference/speech-to-text-api/listen-streaming#request.query.language
+    language:
+    # Whether to return interim (non-final) transcription results. Defaults to true
+    interim_results: true
+    # Whether to apply smart formatting to numbers, dates, etc. Defaults to true
+    smart_format: true
+    # Whether to add punctuations to the transcription. Defaults to true. Turn detector will work better with punctuations
+    punctuate: true
+    # Whether to include filler words (um, uh, etc.) in transcription. Defaults to true
+    filler_words: true
+    # Whether to filter profanity from the transcription. Defaults to false
+    profanity_filter: false
+    # List of tuples containing keywords and their boost values for improved recognition. Each tuple should be (keyword: str, boost: float). Defaults to None. keywords does not work with Nova-3 models. Use keyterms instead.
+    # keywords:
+    #   - [OpenVidu, 1.5]
+    #   - [WebRTC, 1]
+    # List of key terms to improve recognition accuracy. Defaults to None. keyterms is supported by Nova-3 models.
+    # Commented below is an example
+    keyterms:
+      # - "OpenVidu"
+      # - "WebRTC"
+
+  assemblyai:
+    # API key for AssemblyAI. See https://assemblyai.com/app/
+    api_key:
+    # Whether to return formatted final transcripts. If enabled, formatted final transcripts are emitted shortly following an end-of-turn detection.
+    format_turns:
+
+  fal:
+    # API key for fal. See https://fal.ai/dashboard/keys
+    api_key:
+    # See https://fal.ai/models/fal-ai/wizper/api#schema
+    task:
+    # See https://fal.ai/models/fal-ai/wizper/api#schema
+    language:
+    # See https://fal.ai/models/fal-ai/wizper/api#schema
+    chunk_level:
+    # See https://fal.ai/models/fal-ai/wizper/api#schema
+    version:
+
+  clova:
+    # Secret key issued when registering the app
+    api_key:
+    # API Gateway's unique invoke URL created in CLOVA Speech Domain.
+    # See https://guide.ncloud-docs.com/docs/en/clovaspeech-domain#create-domain
+    invoke_url:
+    # See https://api.ncloud-docs.com/docs/en/ai-application-service-clovaspeech-longsentence
+    language:
+    # Value between 0 and 1 indicating the threshold for the confidence score of the transcribed text. Default is 0.5.
+    # If the confidence score is lower than the threshold, the transcription event is not sent to the client.
+    # For a definition of the confidence score see https://api.ncloud-docs.com/docs/en/ai-application-service-clovaspeech-grpc
+    threshold:
+
+  speechmatics:
+    # API key for Speechmatics. See https://portal.speechmatics.com/manage-access/
+    api_key:
+    # See https://docs.speechmatics.com/rt-api-ref#transcription-config
+    language:
+    # See https://docs.speechmatics.com/features/accuracy-language-packs#output-locale
+    output_locale:
+    # See https://docs.speechmatics.com/features/realtime-latency#partial-transcripts
+    enable_partials:
+    # See https://docs.speechmatics.com/features/realtime-latency#configuration-example
+    max_delay:
+    # See https://docs.speechmatics.com/features/realtime-latency#configuration-example
+    max_delay_mode:
+    # See https://docs.speechmatics.com/features/punctuation-settings
+    # Commented below is an example of punctuation settings
+    punctuation_overrides:
+      # permitted_marks: [ ".", "," ]
+      # sensitivity: 0.4
+    # See https://docs.speechmatics.com/features/custom-dictionary
+    # Commented below is an example of a custom dictionary
+    additional_vocab:
+      # - content: financial crisis
+      # - content: gnocchi
+      #   sounds_like:
+      #     - nyohki
+      #     - nokey
+      #     - nochi
+      # - content: CEO
+      #   sounds_like:
+      #     - C.E.O.
+
+  gladia:
+    # API key for Gladia. See https://app.gladia.io/account
+    api_key:
+    # Whether to return interim (non-final) transcription results. Defaults to True
+    interim_results:
+    # List of language codes to use for recognition. Defaults to None (auto-detect). See https://docs.gladia.io/chapters/limits-and-specifications/languages
+    languages:
+    # Whether to allow switching between languages during recognition. Defaults to True
+    code_switching:
diff --git a/pro/agent-speech-processing.yaml b/pro/agent-speech-processing.yaml
new file mode 100644
index 0000000..7d4345c
--- /dev/null
+++ b/pro/agent-speech-processing.yaml
@@ -0,0 +1,226 @@
+##############################
+# Agent common configuration #
+##############################
+
+# Docker image of the agent.
+docker_image: docker.io/openvidu/agent-speech-processing:main
+
+# Whether this agent should run or not, and how it will connect to Rooms (automatic|manual|disabled)
+# automatic: the agent will run and will automatically connect to new Rooms.
+# manual: the agent will run and will connect to new Rooms only when your application dictates it by using the Agent Dispatch API.
+# disabled: the agent will not run.
+processing: disabled
+
+################################
+# Agent specific configuration #
+################################
+speech_processing:
+  # Whether or not the agent should be hidden to the Participants of the Room.
+  hidden: true
+
+  # Which Speech-To-Text provider to use: aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia
+  # The custom configuration for the selected provider must be set below
+  provider:
+
+  aws:
+    # Credentials for AWS Transcribe. See https://docs.aws.amazon.com/transcribe/latest/dg/what-is.html
+    aws_access_key_id:
+    aws_secret_access_key:
+    aws_default_region:
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html
+    language:
+    # The name of the custom vocabulary you want to use.
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/custom-vocabulary.html
+    vocabulary_name:
+    # The name of the custom language model you want to use.
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/custom-language-models-using.html
+    language_model_name:
+    # Whether or not to enable partial result stabilization. Partial result stabilization can reduce latency in your output, but may impact accuracy.
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/streaming-partial-results.html#streaming-partial-result-stabilization
+    enable_partial_results_stabilization:
+    # Specify the level of stability to use when you enable partial results stabilization (enable_partial_results_stabilization: true). Valid values: high | medium | low
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/streaming-partial-results.html#streaming-partial-result-stabilization
+    partial_results_stability:
+    # The name of the custom vocabulary filter you want to use to mask or remove words.
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/vocabulary-filtering.html
+    vocab_filter_name:
+    # The method used to filter the vocabulary. Valid values: mask | remove | tag
+    # See https://docs.aws.amazon.com/transcribe/latest/dg/vocabulary-filtering.html
+    vocab_filter_method:
+
+  azure:
+    # Credentials for Azure Speech Service.
+    # One of these combinations must be set:
+    #  - speech_host
+    #  - speech_key + speech_region
+    #  - speech_auth_token + speech_region
+    # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/get-started-speech-to-text?tabs=macos%2Cterminal&pivots=programming-language-python#prerequisites
+    speech_host:
+    speech_key:
+    speech_auth_token:
+    speech_region:
+    # Azure handles multiple languages and can auto-detect the language used. It requires the candidate set to be set.
+    # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt#supported-languages
+    languages:
+    # Removes profanity (swearing), or replaces letters of profane words with stars. Valid values: Masked | Removed | Raw
+    # See https://learn.microsoft.com/en-us/azure/ai-services/translator/profanity-filtering
+    profanity:
+
+  google:
+    # Credentials for Google Cloud. This is the content of a Google Cloud credential JSON file.
+    # Below is a dummy example for a credential type of "Service Account" (https://cloud.google.com/iam/docs/service-account-creds#key-types)
+    credentials_info: |
+      {
+        "type": "service_account",
+        "project_id": "my-project",
+        "private_key_id": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
+        "private_key": "-----BEGIN PRIVATE KEY-----\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n-----END PRIVATE KEY-----\n",
+        "client_email": "my-email@my-project.iam.gserviceaccount.com",
+        "client_id": "xxxxxxxxxxxxxxxxxxxxx",
+        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+        "token_uri": "https://oauth2.googleapis.com/token",
+        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+        "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/my-email%40my-project.iam.gserviceaccount.com",
+        "universe_domain": "googleapis.com"
+      }
+    # Which model to use for recognition. If not set, uses the default model for the selected language.
+    # See https://cloud.google.com/speech-to-text/docs/transcription-model
+    model:
+    # The location to use for recognition. Default is "us-central1". Latency will be best if the location is close to your users.
+    # Check supported languages and locations at https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
+    location:
+    # List of language codes to recognize. Default is ["en-US"].
+    # See https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
+    languages:
+    # Whether to detect the language of the audio. Default is true.
+    detect_language:
+    # If 'true', adds punctuation to recognition result hypotheses. This feature is only available in select languages. Setting this
+    # for requests in other languages has no effect at all. The default 'false' value does not add punctuation to result hypotheses.
+    # See https://cloud.google.com/speech-to-text/docs/automatic-punctuation
+    punctuate:
+    # The spoken punctuation behavior for the call. If not set, uses default behavior based on model of choice.
+    # e.g. command_and_search will enable spoken punctuation by default. If 'true', replaces spoken punctuation
+    # with the corresponding symbols in the request. For example, "how are you question mark" becomes "how are you?".
+    # See https://cloud.google.com/speech-to-text/docs/spoken-punctuation for support. If 'false', spoken punctuation is not replaced.
+    spoken_punctuation:
+    # Whether to return interim (non-final) transcription results. Defaults to true.
+    interim_results:
+
+  openai:
+    # API key for OpenAI. See https://platform.openai.com/api-keys
+    api_key:
+    # See https://platform.openai.com/docs/guides/speech-to-text
+    model:
+    # The language of the input audio. Supplying the input language in ISO-639-1 format
+    # (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) will improve accuracy and latency.
+    language:
+
+  groq:
+    # API key for Groq. See https://console.groq.com/keys
+    api_key:
+    # See https://console.groq.com/docs/speech-to-text
+    model:
+    # The language of the input audio. Supplying the input language in ISO-639-1 format
+    # (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) will improve accuracy and latency.
+    language:
+    # Prompt to guide the model's style or specify how to spell unfamiliar words. 224 tokens max.
+    prompt:
+
+  deepgram:
+    # See https://console.deepgram.com/
+    api_key:
+    # See https://developers.deepgram.com/reference/speech-to-text-api/listen-streaming#request.query.model
+    model:
+    # See https://developers.deepgram.com/reference/speech-to-text-api/listen-streaming#request.query.language
+    language:
+    # Whether to return interim (non-final) transcription results. Defaults to true
+    interim_results: true
+    # Whether to apply smart formatting to numbers, dates, etc. Defaults to true
+    smart_format: true
+    # Whether to add punctuations to the transcription. Defaults to true. Turn detector will work better with punctuations
+    punctuate: true
+    # Whether to include filler words (um, uh, etc.) in transcription. Defaults to true
+    filler_words: true
+    # Whether to filter profanity from the transcription. Defaults to false
+    profanity_filter: false
+    # List of tuples containing keywords and their boost values for improved recognition. Each tuple should be (keyword: str, boost: float). Defaults to None. keywords does not work with Nova-3 models. Use keyterms instead.
+    # keywords:
+    #   - [OpenVidu, 1.5]
+    #   - [WebRTC, 1]
+    # List of key terms to improve recognition accuracy. Defaults to None. keyterms is supported by Nova-3 models.
+    # Commented below is an example
+    keyterms:
+      # - "OpenVidu"
+      # - "WebRTC"
+
+  assemblyai:
+    # API key for AssemblyAI. See https://assemblyai.com/app/
+    api_key:
+    # Whether to return formatted final transcripts. If enabled, formatted final transcripts are emitted shortly following an end-of-turn detection.
+    format_turns:
+
+  fal:
+    # API key for fal. See https://fal.ai/dashboard/keys
+    api_key:
+    # See https://fal.ai/models/fal-ai/wizper/api#schema
+    task:
+    # See https://fal.ai/models/fal-ai/wizper/api#schema
+    language:
+    # See https://fal.ai/models/fal-ai/wizper/api#schema
+    chunk_level:
+    # See https://fal.ai/models/fal-ai/wizper/api#schema
+    version:
+
+  clova:
+    # Secret key issued when registering the app
+    api_key:
+    # API Gateway's unique invoke URL created in CLOVA Speech Domain.
+    # See https://guide.ncloud-docs.com/docs/en/clovaspeech-domain#create-domain
+    invoke_url:
+    # See https://api.ncloud-docs.com/docs/en/ai-application-service-clovaspeech-longsentence
+    language:
+    # Value between 0 and 1 indicating the threshold for the confidence score of the transcribed text. Default is 0.5.
+    # If the confidence score is lower than the threshold, the transcription event is not sent to the client.
+    # For a definition of the confidence score see https://api.ncloud-docs.com/docs/en/ai-application-service-clovaspeech-grpc
+    threshold:
+
+  speechmatics:
+    # API key for Speechmatics. See https://portal.speechmatics.com/manage-access/
+    api_key:
+    # See https://docs.speechmatics.com/rt-api-ref#transcription-config
+    language:
+    # See https://docs.speechmatics.com/features/accuracy-language-packs#output-locale
+    output_locale:
+    # See https://docs.speechmatics.com/features/realtime-latency#partial-transcripts
+    enable_partials:
+    # See https://docs.speechmatics.com/features/realtime-latency#configuration-example
+    max_delay:
+    # See https://docs.speechmatics.com/features/realtime-latency#configuration-example
+    max_delay_mode:
+    # See https://docs.speechmatics.com/features/punctuation-settings
+    # Commented below is an example of punctuation settings
+    punctuation_overrides:
+      # permitted_marks: [ ".", "," ]
+      # sensitivity: 0.4
+    # See https://docs.speechmatics.com/features/custom-dictionary
+    # Commented below is an example of a custom dictionary
+    additional_vocab:
+      # - content: financial crisis
+      # - content: gnocchi
+      #   sounds_like:
+      #     - nyohki
+      #     - nokey
+      #     - nochi
+      # - content: CEO
+      #   sounds_like:
+      #     - C.E.O.
+
+  gladia:
+    # API key for Gladia. See https://app.gladia.io/account
+    api_key:
+    # Whether to return interim (non-final) transcription results. Defaults to True
+    interim_results:
+    # List of language codes to use for recognition. Defaults to None (auto-detect). See https://docs.gladia.io/chapters/limits-and-specifications/languages
+    languages:
+    # Whether to allow switching between languages during recognition. Defaults to True
+    code_switching:

From 236b4779f9a8b4626a88d5f12be0d350315d4517 Mon Sep 17 00:00:00 2001
From: pabloFuente <pablofuenteperez@gmail.com>
Date: Wed, 18 Jun 2025 19:16:33 +0200
Subject: [PATCH 4/8] Updated agent-speech-processing.yaml

---
 community/agent-speech-processing.yaml | 30 ++++++++++----------------
 pro/agent-speech-processing.yaml       | 30 ++++++++++----------------
 2 files changed, 22 insertions(+), 38 deletions(-)

diff --git a/community/agent-speech-processing.yaml b/community/agent-speech-processing.yaml
index 7d4345c..f22e621 100644
--- a/community/agent-speech-processing.yaml
+++ b/community/agent-speech-processing.yaml
@@ -1,24 +1,16 @@
-##############################
-# Agent common configuration #
-##############################
-
 # Docker image of the agent.
 docker_image: docker.io/openvidu/agent-speech-processing:main
 
-# Whether this agent should run or not, and how it will connect to Rooms (automatic|manual|disabled)
-# automatic: the agent will run and will automatically connect to new Rooms.
-# manual: the agent will run and will connect to new Rooms only when your application dictates it by using the Agent Dispatch API.
-# disabled: the agent will not run.
-processing: disabled
+# Whether to run the agent or not.
+enabled: false
 
-################################
-# Agent specific configuration #
-################################
-speech_processing:
-  # Whether or not the agent should be hidden to the Participants of the Room.
-  hidden: true
+live_captions:
+  # How this agent will connect to Rooms [automatic, manual]
+  # - automatic: the agent will automatically connect to new Rooms.
+  # - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API.
+  processing: automatic
 
-  # Which Speech-To-Text provider to use: aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia
+  # Which speech-to-text AI provider to use [aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia]
   # The custom configuration for the selected provider must be set below
   provider:
 
@@ -154,10 +146,10 @@ speech_processing:
       # - "WebRTC"
 
   assemblyai:
-    # API key for AssemblyAI. See https://assemblyai.com/app/
+    # API key for AssemblyAI. See https://www.assemblyai.com/dashboard/api-keys
     api_key:
-    # Whether to return formatted final transcripts. If enabled, formatted final transcripts are emitted shortly following an end-of-turn detection.
-    format_turns:
+    # Whether to return formatted final transcripts (proper punctuation, letter casing...). If enabled, formatted final transcripts are emitted shortly following an end-of-turn detection.
+    format_turns: true
 
   fal:
     # API key for fal. See https://fal.ai/dashboard/keys
diff --git a/pro/agent-speech-processing.yaml b/pro/agent-speech-processing.yaml
index 7d4345c..f22e621 100644
--- a/pro/agent-speech-processing.yaml
+++ b/pro/agent-speech-processing.yaml
@@ -1,24 +1,16 @@
-##############################
-# Agent common configuration #
-##############################
-
 # Docker image of the agent.
 docker_image: docker.io/openvidu/agent-speech-processing:main
 
-# Whether this agent should run or not, and how it will connect to Rooms (automatic|manual|disabled)
-# automatic: the agent will run and will automatically connect to new Rooms.
-# manual: the agent will run and will connect to new Rooms only when your application dictates it by using the Agent Dispatch API.
-# disabled: the agent will not run.
-processing: disabled
+# Whether to run the agent or not.
+enabled: false
 
-################################
-# Agent specific configuration #
-################################
-speech_processing:
-  # Whether or not the agent should be hidden to the Participants of the Room.
-  hidden: true
+live_captions:
+  # How this agent will connect to Rooms [automatic, manual]
+  # - automatic: the agent will automatically connect to new Rooms.
+  # - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API.
+  processing: automatic
 
-  # Which Speech-To-Text provider to use: aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia
+  # Which speech-to-text AI provider to use [aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia]
   # The custom configuration for the selected provider must be set below
   provider:
 
@@ -154,10 +146,10 @@ speech_processing:
       # - "WebRTC"
 
   assemblyai:
-    # API key for AssemblyAI. See https://assemblyai.com/app/
+    # API key for AssemblyAI. See https://www.assemblyai.com/dashboard/api-keys
     api_key:
-    # Whether to return formatted final transcripts. If enabled, formatted final transcripts are emitted shortly following an end-of-turn detection.
-    format_turns:
+    # Whether to return formatted final transcripts (proper punctuation, letter casing...). If enabled, formatted final transcripts are emitted shortly following an end-of-turn detection.
+    format_turns: true
 
   fal:
     # API key for fal. See https://fal.ai/dashboard/keys

From 32e533f892fc00c497a3e2deaf76c251e4ee63a8 Mon Sep 17 00:00:00 2001
From: pabloFuente <pablofuenteperez@gmail.com>
Date: Wed, 18 Jun 2025 20:25:05 +0200
Subject: [PATCH 5/8] Added sarvam STT AI provider to
 agent-speech-processing.yaml

---
 community/agent-speech-processing.yaml | 10 +++++++++-
 pro/agent-speech-processing.yaml       | 10 +++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/community/agent-speech-processing.yaml b/community/agent-speech-processing.yaml
index f22e621..b105214 100644
--- a/community/agent-speech-processing.yaml
+++ b/community/agent-speech-processing.yaml
@@ -10,7 +10,7 @@ live_captions:
   # - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API.
   processing: automatic
 
-  # Which speech-to-text AI provider to use [aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia]
+  # Which speech-to-text AI provider to use [aws, azure, google, openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam]
   # The custom configuration for the selected provider must be set below
   provider:
 
@@ -216,3 +216,11 @@ live_captions:
     languages:
     # Whether to allow switching between languages during recognition. Defaults to True
     code_switching:
+
+  sarvam:
+    # API key for Sarvam. See https://dashboard.sarvam.ai/key-management
+    api_key:
+    # BCP-47 language code for supported Indian languages. See https://docs.sarvam.ai/api-reference-docs/speech-to-text/transcribe#request.body.language_code.language_code
+    language:
+    # The Sarvam STT model to use. See https://docs.sarvam.ai/api-reference-docs/speech-to-text/transcribe#request.body.model.model
+    model:
diff --git a/pro/agent-speech-processing.yaml b/pro/agent-speech-processing.yaml
index f22e621..b105214 100644
--- a/pro/agent-speech-processing.yaml
+++ b/pro/agent-speech-processing.yaml
@@ -10,7 +10,7 @@ live_captions:
   # - manual: the agent will connect to new Rooms only when your application dictates it by using the Agent Dispatch API.
   processing: automatic
 
-  # Which speech-to-text AI provider to use [aws, azure, google, opeanai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia]
+  # Which speech-to-text AI provider to use [aws, azure, google, openai, groq, deepgram, assemblyai, fal, clova, speechmatics, gladia, sarvam]
   # The custom configuration for the selected provider must be set below
   provider:
 
@@ -216,3 +216,11 @@ live_captions:
     languages:
     # Whether to allow switching between languages during recognition. Defaults to True
     code_switching:
+
+  sarvam:
+    # API key for Sarvam. See https://dashboard.sarvam.ai/key-management
+    api_key:
+    # BCP-47 language code for supported Indian languages. See https://docs.sarvam.ai/api-reference-docs/speech-to-text/transcribe#request.body.language_code.language_code
+    language:
+    # The Sarvam STT model to use. See https://docs.sarvam.ai/api-reference-docs/speech-to-text/transcribe#request.body.model.model
+    model:

From c692d9b86df3dba51589b7f9a1535a5f0afc3d7d Mon Sep 17 00:00:00 2001
From: pabloFuente <pablofuenteperez@gmail.com>
Date: Thu, 19 Jun 2025 13:38:18 +0200
Subject: [PATCH 6/8] Added azure_openai to agent-speech-processing.yaml. Fixed
 other providers

---
 community/agent-speech-processing.yaml | 53 +++++++++++++++++++-------
 pro/agent-speech-processing.yaml       | 53 +++++++++++++++++++-------
 2 files changed, 80 insertions(+), 26 deletions(-)

diff --git a/community/agent-speech-processing.yaml b/community/agent-speech-processing.yaml
index b105214..2e86075 100644
--- a/community/agent-speech-processing.yaml
+++ b/community/agent-speech-processing.yaml
@@ -58,6 +58,29 @@ live_captions:
     # See https://learn.microsoft.com/en-us/azure/ai-services/translator/profanity-filtering
     profanity:
 
+  azure_openai:
+    # Credentials for Azure OpenAI APIs. See https://learn.microsoft.com/en-us/azure/api-management/api-management-authenticate-authorize-azure-openai
+    # Azure OpenAI API key. Mandatory value.
+    azure_api_key:
+    # Azure Active Directory token. Mandatory value.
+    azure_ad_token:
+    # Azure OpenAI endpoint in the following format: https://{your-resource-name}.openai.azure.com. Mandatory value.
+    azure_endpoint:
+    # Name of your model deployment. If given with `azure_endpoint`, sets the base client URL to include `/deployments/{azure_deployment}`.
+    azure_deployment:
+    # OpenAI REST API version used for the request. Mandatory value.
+    api_version:
+    # OpenAI organization ID.
+    organization:
+    # OpenAI project ID.
+    project:
+    # The language code to use for transcription (e.g., "en" for English).
+    language:
+    # ID of the model to use for speech-to-text.
+    model:
+    # Initial prompt to guide the transcription.
+    prompt:
+
   google:
     # Credentials for Google Cloud. This is the content of a Google Cloud credential JSON file.
     # Below is a dummy example for a credential type of "Service Account" (https://cloud.google.com/iam/docs/service-account-creds#key-types)
@@ -155,13 +178,7 @@ live_captions:
     # API key for fal. See https://fal.ai/dashboard/keys
     api_key:
     # See https://fal.ai/models/fal-ai/wizper/api#schema
-    task:
-    # See https://fal.ai/models/fal-ai/wizper/api#schema
     language:
-    # See https://fal.ai/models/fal-ai/wizper/api#schema
-    chunk_level:
-    # See https://fal.ai/models/fal-ai/wizper/api#schema
-    version:
 
   clova:
     # Secret key issued when registering the app
@@ -179,18 +196,28 @@ live_captions:
   speechmatics:
     # API key for Speechmatics. See https://portal.speechmatics.com/manage-access/
     api_key:
-    # See https://docs.speechmatics.com/rt-api-ref#transcription-config
+    # ISO 639-1 language code. All languages are global and can understand different dialects/accents. To see the list of all supported languages, see https://docs.speechmatics.com/introduction/supported-languages
     language:
-    # See https://docs.speechmatics.com/features/accuracy-language-packs#output-locale
-    output_locale:
-    # See https://docs.speechmatics.com/features/realtime-latency#partial-transcripts
+    # Operating point to use for the transcription per required accuracy & complexity. To learn more, see https://docs.speechmatics.com/features/accuracy-language-packs#accuracy
+    operating_point:
+    # Partial transcripts allow you to receive preliminary transcriptions and update as more context is available until the higher-accuracy final transcript is returned. Partials are returned faster but without any post-processing such as formatting. See https://docs.speechmatics.com/features/realtime-latency#partial-transcripts
     enable_partials:
-    # See https://docs.speechmatics.com/features/realtime-latency#configuration-example
+    # RFC-5646 language code to make spelling rules more consistent in the transcription output. See https://docs.speechmatics.com/features/word-tagging#output-locale
+    output_locale:
+    # The delay in seconds between the end of a spoken word and returning the final transcript results. See https://docs.speechmatics.com/features/realtime-latency#configuration-example
     max_delay:
     # See https://docs.speechmatics.com/features/realtime-latency#configuration-example
     max_delay_mode:
-    # See https://docs.speechmatics.com/features/punctuation-settings
-    # Commented below is an example of punctuation settings
+    # Configuration for speaker diarization. See https://docs.speechmatics.com/features/diarization
+    speaker_diarization_config:
+      # See https://docs.speechmatics.com/features/diarization#max-speakers
+      max_speakers:
+      # See https://docs.speechmatics.com/features/diarization#speaker-sensitivity
+      speaker_sensitivity:
+      # See https://docs.speechmatics.com/features/diarization#prefer-current-speaker
+      prefer_current_speaker:
+    # Permitted punctuation marks for advanced punctuation. See https://docs.speechmatics.com/features/punctuation-settings
+    # Commented below is an example of punctuation settings
     punctuation_overrides:
       # permitted_marks: [ ".", "," ]
       # sensitivity: 0.4
diff --git a/pro/agent-speech-processing.yaml b/pro/agent-speech-processing.yaml
index b105214..2e86075 100644
--- a/pro/agent-speech-processing.yaml
+++ b/pro/agent-speech-processing.yaml
@@ -58,6 +58,29 @@ live_captions:
     # See https://learn.microsoft.com/en-us/azure/ai-services/translator/profanity-filtering
     profanity:
 
+  azure_openai:
+    # Credentials for Azure OpenAI APIs. See https://learn.microsoft.com/en-us/azure/api-management/api-management-authenticate-authorize-azure-openai
+    # Azure OpenAI API key. Mandatory value.
+    azure_api_key:
+    # Azure Active Directory token. Mandatory value.
+    azure_ad_token:
+    # Azure OpenAI endpoint in the following format: https://{your-resource-name}.openai.azure.com. Mandatory value.
+    azure_endpoint:
+    # Name of your model deployment. If given with `azure_endpoint`, sets the base client URL to include `/deployments/{azure_deployment}`.
+    azure_deployment:
+    # OpenAI REST API version used for the request. Mandatory value.
+    api_version:
+    # OpenAI organization ID.
+    organization:
+    # OpenAI project ID.
+    project:
+    # The language code to use for transcription (e.g., "en" for English).
+    language:
+    # ID of the model to use for speech-to-text.
+    model:
+    # Initial prompt to guide the transcription.
+    prompt:
+
   google:
     # Credentials for Google Cloud. This is the content of a Google Cloud credential JSON file.
     # Below is a dummy example for a credential type of "Service Account" (https://cloud.google.com/iam/docs/service-account-creds#key-types)
@@ -155,13 +178,7 @@ live_captions:
     # API key for fal. See https://fal.ai/dashboard/keys
     api_key:
     # See https://fal.ai/models/fal-ai/wizper/api#schema
-    task:
-    # See https://fal.ai/models/fal-ai/wizper/api#schema
     language:
-    # See https://fal.ai/models/fal-ai/wizper/api#schema
-    chunk_level:
-    # See https://fal.ai/models/fal-ai/wizper/api#schema
-    version:
 
   clova:
     # Secret key issued when registering the app
@@ -179,18 +196,28 @@ live_captions:
   speechmatics:
     # API key for Speechmatics. See https://portal.speechmatics.com/manage-access/
     api_key:
-    # See https://docs.speechmatics.com/rt-api-ref#transcription-config
+    # ISO 639-1 language code. All languages are global and can understand different dialects/accents. To see the list of all supported languages, see https://docs.speechmatics.com/introduction/supported-languages
     language:
-    # See https://docs.speechmatics.com/features/accuracy-language-packs#output-locale
-    output_locale:
-    # See https://docs.speechmatics.com/features/realtime-latency#partial-transcripts
+    # Operating point to use for the transcription per required accuracy & complexity. To learn more, see https://docs.speechmatics.com/features/accuracy-language-packs#accuracy
+    operating_point:
+    # Partial transcripts allow you to receive preliminary transcriptions and update as more context is available until the higher-accuracy final transcript is returned. Partials are returned faster but without any post-processing such as formatting. See https://docs.speechmatics.com/features/realtime-latency#partial-transcripts
     enable_partials:
-    # See https://docs.speechmatics.com/features/realtime-latency#configuration-example
+    # RFC-5646 language code to make spelling rules more consistent in the transcription output. See https://docs.speechmatics.com/features/word-tagging#output-locale
+    output_locale:
+    # The delay in seconds between the end of a spoken word and returning the final transcript results. See https://docs.speechmatics.com/features/realtime-latency#configuration-example
     max_delay:
     # See https://docs.speechmatics.com/features/realtime-latency#configuration-example
     max_delay_mode:
-    # See https://docs.speechmatics.com/features/punctuation-settings
-    # Commented below is an example of punctuation settings
+    # Configuration for speaker diarization. See https://docs.speechmatics.com/features/diarization
+    speaker_diarization_config:
+      # See https://docs.speechmatics.com/features/diarization#max-speakers
+      max_speakers:
+      # See https://docs.speechmatics.com/features/diarization#speaker-sensitivity
+      speaker_sensitivity:
+      # See https://docs.speechmatics.com/features/diarization#prefer-current-speaker
+      prefer_current_speaker:
+    # Permitted punctuation marks for advanced punctuation. See https://docs.speechmatics.com/features/punctuation-settings
+    # Commented below is an example of punctuation settings
     punctuation_overrides:
       # permitted_marks: [ ".", "," ]
       # sensitivity: 0.4

From 8a268d8e658f3caf39a62bac4084db1a155afdaa Mon Sep 17 00:00:00 2001
From: pabloFuente <pablofuenteperez@gmail.com>
Date: Tue, 24 Jun 2025 19:12:24 +0200
Subject: [PATCH 7/8] Updated agent-speech-processing.yaml

---
 community/agent-speech-processing.yaml | 26 ++++++++++++++++----------
 pro/agent-speech-processing.yaml       | 26 ++++++++++++++++----------
 2 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/community/agent-speech-processing.yaml b/community/agent-speech-processing.yaml
index 2e86075..756390f 100644
--- a/community/agent-speech-processing.yaml
+++ b/community/agent-speech-processing.yaml
@@ -51,18 +51,18 @@ live_captions:
     speech_key:
     speech_auth_token:
     speech_region:
-    # Azure handles multiple languages and can auto-detect the language used. It requires the candidate set to be set.
+    # Azure handles multiple languages and can auto-detect the language used. This requires specifying the set of candidate languages, e.g. ["en-US", "es-ES"]
     # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt#supported-languages
-    languages:
+    language:
     # Removes profanity (swearing), or replaces letters of profane words with stars. Valid values: Masked | Removed | Raw
     # See https://learn.microsoft.com/en-us/azure/ai-services/translator/profanity-filtering
     profanity:
 
   azure_openai:
     # Credentials for Azure OpenAI APIs. See https://learn.microsoft.com/en-us/azure/api-management/api-management-authenticate-authorize-azure-openai
-    # Azure OpenAI API key. Mandatory value.
+    # Azure OpenAI API key
     azure_api_key:
-    # Azure Active Directory token. Mandatory value.
+    # Azure Active Directory token
     azure_ad_token:
     # Azure OpenAI endpoint in the following format: https://{your-resource-name}.openai.azure.com. Mandatory value.
     azure_endpoint:
@@ -124,11 +124,13 @@ live_captions:
   openai:
     # API key for OpenAI. See https://platform.openai.com/api-keys
     api_key:
-    # See https://platform.openai.com/docs/guides/speech-to-text
+    # The OpenAI model to use for transcription. See https://platform.openai.com/docs/guides/speech-to-text
     model:
     # The language of the input audio. Supplying the input language in ISO-639-1 format
     # (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) will improve accuracy and latency.
     language:
+    # Optional text prompt to guide the transcription. Only supported for whisper-1.
+    prompt:
 
   groq:
     # API key for Groq. See https://console.groq.com/keys
@@ -148,15 +150,19 @@ live_captions:
     model:
     # See https://developers.deepgram.com/reference/speech-to-text-api/listen-streaming#request.query.language
     language:
-    # Whether to return interim (non-final) transcription results. Defaults to true
+    # Whether to enable automatic language detection. Defaults to false. See https://developers.deepgram.com/docs/language-detection
+    detect_language: false
+    # Whether to return interim (non-final) transcription results. Defaults to true. See https://developers.deepgram.com/docs/interim-results
     interim_results: true
-    # Whether to apply smart formatting to numbers, dates, etc. Defaults to true
+    # Whether to apply smart formatting to numbers, dates, etc. Defaults to true. See https://developers.deepgram.com/docs/smart-format
     smart_format: true
-    # Whether to add punctuations to the transcription. Defaults to true. Turn detector will work better with punctuations
+    # When smart_format is used, ensures it does not wait for sequence to be complete before returning results. Defaults to true. See https://developers.deepgram.com/docs/smart-format#using-no-delay
+    no_delay: true
+    # Whether to add punctuations to the transcription. Defaults to true. Turn detector will work better with punctuations. See https://developers.deepgram.com/docs/punctuation
     punctuate: true
-    # Whether to include filler words (um, uh, etc.) in transcription. Defaults to true
+    # Whether to include filler words (um, uh, etc.) in transcription. Defaults to true. See https://developers.deepgram.com/docs/filler-words
     filler_words: true
-    # Whether to filter profanity from the transcription. Defaults to false
+    # Whether to filter profanity from the transcription. Defaults to false. See https://developers.deepgram.com/docs/profanity-filter
     profanity_filter: false
     # List of tuples containing keywords and their boost values for improved recognition. Each tuple should be (keyword: str, boost: float). Defaults to None. keywords does not work with Nova-3 models. Use keyterms instead.
     # keywords:
diff --git a/pro/agent-speech-processing.yaml b/pro/agent-speech-processing.yaml
index 2e86075..756390f 100644
--- a/pro/agent-speech-processing.yaml
+++ b/pro/agent-speech-processing.yaml
@@ -51,18 +51,18 @@ live_captions:
     speech_key:
     speech_auth_token:
     speech_region:
-    # Azure handles multiple languages and can auto-detect the language used. It requires the candidate set to be set.
+    # Azure handles multiple languages and can auto-detect the language used. This requires specifying the set of candidate languages, e.g. ["en-US", "es-ES"]
     # See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt#supported-languages
-    languages:
+    language:
     # Removes profanity (swearing), or replaces letters of profane words with stars. Valid values: Masked | Removed | Raw
     # See https://learn.microsoft.com/en-us/azure/ai-services/translator/profanity-filtering
     profanity:
 
   azure_openai:
     # Credentials for Azure OpenAI APIs. See https://learn.microsoft.com/en-us/azure/api-management/api-management-authenticate-authorize-azure-openai
-    # Azure OpenAI API key. Mandatory value.
+    # Azure OpenAI API key
     azure_api_key:
-    # Azure Active Directory token. Mandatory value.
+    # Azure Active Directory token
     azure_ad_token:
     # Azure OpenAI endpoint in the following format: https://{your-resource-name}.openai.azure.com. Mandatory value.
     azure_endpoint:
@@ -124,11 +124,13 @@ live_captions:
   openai:
     # API key for OpenAI. See https://platform.openai.com/api-keys
     api_key:
-    # See https://platform.openai.com/docs/guides/speech-to-text
+    # The OpenAI model to use for transcription. See https://platform.openai.com/docs/guides/speech-to-text
     model:
     # The language of the input audio. Supplying the input language in ISO-639-1 format
     # (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) will improve accuracy and latency.
     language:
+    # Optional text prompt to guide the transcription. Only supported for whisper-1.
+    prompt:
 
   groq:
     # API key for Groq. See https://console.groq.com/keys
@@ -148,15 +150,19 @@ live_captions:
     model:
     # See https://developers.deepgram.com/reference/speech-to-text-api/listen-streaming#request.query.language
     language:
-    # Whether to return interim (non-final) transcription results. Defaults to true
+    # Whether to enable automatic language detection. Defaults to false. See https://developers.deepgram.com/docs/language-detection
+    detect_language: false
+    # Whether to return interim (non-final) transcription results. Defaults to true. See https://developers.deepgram.com/docs/interim-results
     interim_results: true
-    # Whether to apply smart formatting to numbers, dates, etc. Defaults to true
+    # Whether to apply smart formatting to numbers, dates, etc. Defaults to true. See https://developers.deepgram.com/docs/smart-format
     smart_format: true
-    # Whether to add punctuations to the transcription. Defaults to true. Turn detector will work better with punctuations
+    # When smart_format is used, ensures it does not wait for sequence to be complete before returning results. Defaults to true. See https://developers.deepgram.com/docs/smart-format#using-no-delay
+    no_delay: true
+    # Whether to add punctuations to the transcription. Defaults to true. Turn detector will work better with punctuations. See https://developers.deepgram.com/docs/punctuation
     punctuate: true
-    # Whether to include filler words (um, uh, etc.) in transcription. Defaults to true
+    # Whether to include filler words (um, uh, etc.) in transcription. Defaults to true. See https://developers.deepgram.com/docs/filler-words
     filler_words: true
-    # Whether to filter profanity from the transcription. Defaults to false
+    # Whether to filter profanity from the transcription. Defaults to false. See https://developers.deepgram.com/docs/profanity-filter
     profanity_filter: false
     # List of tuples containing keywords and their boost values for improved recognition. Each tuple should be (keyword: str, boost: float). Defaults to None. keywords does not work with Nova-3 models. Use keyterms instead.
     # keywords:

From 4bf87d6485edc518420ac55684c83a58328b3f1e Mon Sep 17 00:00:00 2001
From: cruizba <carlos.ruizbal@gmail.com>
Date: Thu, 26 Jun 2025 22:20:38 +0200
Subject: [PATCH 8/8] Bump to version 3.3.0

---
 community/agent-speech-processing.yaml |  2 +-
 community/docker-compose.yaml          | 12 ++++++------
 pro/agent-speech-processing.yaml       |  2 +-
 pro/docker-compose.yaml                | 14 +++++++-------
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/community/agent-speech-processing.yaml b/community/agent-speech-processing.yaml
index 756390f..dc51bd8 100644
--- a/community/agent-speech-processing.yaml
+++ b/community/agent-speech-processing.yaml
@@ -1,5 +1,5 @@
 # Docker image of the agent.
-docker_image: docker.io/openvidu/agent-speech-processing:main
+docker_image: docker.io/openvidu/agent-speech-processing:3.3.0
 
 # Whether to run the agent or not.
 enabled: false
diff --git a/community/docker-compose.yaml b/community/docker-compose.yaml
index ce6bfd1..84a77e4 100644
--- a/community/docker-compose.yaml
+++ b/community/docker-compose.yaml
@@ -1,6 +1,6 @@
 services:
   caddy-proxy:
-    image: docker.io/openvidu/openvidu-caddy-local:main
+    image: docker.io/openvidu/openvidu-caddy-local:3.3.0
     platform: linux/amd64
     container_name: caddy-proxy
     restart: unless-stopped
@@ -87,7 +87,7 @@ services:
         condition: service_completed_successfully
 
   dashboard:
-    image: docker.io/openvidu/openvidu-dashboard:main
+    image: docker.io/openvidu/openvidu-dashboard:3.3.0
     platform: linux/amd64
     container_name: dashboard
     restart: unless-stopped
@@ -101,7 +101,7 @@ services:
         condition: service_completed_successfully
 
   openvidu:
-    image: docker.io/openvidu/openvidu-server:main
+    image: docker.io/openvidu/openvidu-server:3.3.0
     platform: linux/amd64
     restart: unless-stopped
     container_name: openvidu
@@ -123,7 +123,7 @@ services:
         condition: service_completed_successfully
 
   ingress:
-    image: docker.io/openvidu/ingress:main
+    image: docker.io/openvidu/ingress:3.3.0
     platform: linux/amd64
     container_name: ingress
     restart: unless-stopped
@@ -158,7 +158,7 @@ services:
         condition: service_completed_successfully
 
   default-app:
-    image: docker.io/openvidu/openvidu-call:main-demo
+    image: docker.io/openvidu/openvidu-call:3.3.0-demo
     platform: linux/amd64
     container_name: openvidu-call
     restart: on-failure
@@ -194,7 +194,7 @@ services:
         condition: service_completed_successfully
 
   operator:
-    image: docker.io/openvidu/openvidu-operator:main
+    image: docker.io/openvidu/openvidu-operator:3.3.0
     platform: linux/amd64
     container_name: operator
     restart: unless-stopped
diff --git a/pro/agent-speech-processing.yaml b/pro/agent-speech-processing.yaml
index 756390f..dc51bd8 100644
--- a/pro/agent-speech-processing.yaml
+++ b/pro/agent-speech-processing.yaml
@@ -1,5 +1,5 @@
 # Docker image of the agent.
-docker_image: docker.io/openvidu/agent-speech-processing:main
+docker_image: docker.io/openvidu/agent-speech-processing:3.3.0
 
 # Whether to run the agent or not.
 enabled: false
diff --git a/pro/docker-compose.yaml b/pro/docker-compose.yaml
index fb4a076..274f8e9 100644
--- a/pro/docker-compose.yaml
+++ b/pro/docker-compose.yaml
@@ -1,6 +1,6 @@
 services:
   caddy-proxy:
-    image: docker.io/openvidu/openvidu-caddy-local:main
+    image: docker.io/openvidu/openvidu-caddy-local:3.3.0
     platform: linux/amd64
     container_name: caddy-proxy
     restart: unless-stopped
@@ -87,7 +87,7 @@ services:
         condition: service_completed_successfully
 
   dashboard:
-    image: docker.io/openvidu/openvidu-dashboard:main
+    image: docker.io/openvidu/openvidu-dashboard:3.3.0
     platform: linux/amd64
     container_name: dashboard
     restart: unless-stopped
@@ -101,7 +101,7 @@ services:
         condition: service_completed_successfully
 
   openvidu:
-    image: docker.io/openvidu/openvidu-server-pro:main
+    image: docker.io/openvidu/openvidu-server-pro:3.3.0
     platform: linux/amd64
     restart: unless-stopped
     container_name: openvidu
@@ -125,7 +125,7 @@ services:
         condition: service_completed_successfully
 
   ingress:
-    image: docker.io/openvidu/ingress:main
+    image: docker.io/openvidu/ingress:3.3.0
     platform: linux/amd64
     container_name: ingress
     restart: unless-stopped
@@ -160,7 +160,7 @@ services:
         condition: service_completed_successfully
 
   default-app:
-    image: docker.io/openvidu/openvidu-call:main-demo
+    image: docker.io/openvidu/openvidu-call:3.3.0-demo
     platform: linux/amd64
     container_name: openvidu-call
     restart: on-failure
@@ -196,7 +196,7 @@ services:
         condition: service_completed_successfully
 
   openvidu-v2compatibility:
-    image: docker.io/openvidu/openvidu-v2compatibility:main
+    image: docker.io/openvidu/openvidu-v2compatibility:3.3.0
     platform: linux/amd64
     restart: unless-stopped
     container_name: openvidu-v2compatibility
@@ -267,7 +267,7 @@ services:
     command: /bin/sh /scripts/ready-check.sh
 
   operator:
-    image: docker.io/openvidu/openvidu-operator:main
+    image: docker.io/openvidu/openvidu-operator:3.3.0
     platform: linux/amd64
     container_name: operator
     restart: unless-stopped