Carlos Santos 1663b008ed frontend (test): add Smart Mosaic layout helper functions and fake participant management
- Implemented helper functions for configuring Smart Mosaic layout, including setting participant count and waiting for participant visibility.
- Created a new file for managing fake participants, allowing for joining and disconnecting from LiveKit rooms using both CLI and browser-based methods.
- Introduced interfaces for browser-based fake participant options to streamline participant creation with audio and video assets.
2025-12-02 21:02:40 +01:00

331 lines
16 KiB
Bash
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# =============================================================================
# Audio Generation Script for Smart Mosaic Layout Tests
# =============================================================================
# This script generates test audio files from a base audio file (base.wav)
# for testing the Smart Mosaic layout speaker detection functionality.
#
# Requirements:
# - ffmpeg 7.0+ (optimized for this version)
# - base.wav file with continuous speech audio in the same directory
#
# IMPORTANT: This script generates WAV files for best compatibility with
# Chrome's fake audio capture (--use-file-for-fake-audio-capture).
# WAV format ensures proper audio device simulation and VAD detection.
#
# Usage:
# chmod +x generate-test-audio.sh
# ./generate-test-audio.sh
# =============================================================================
# Abort on the first failing command so a broken step is never followed by
# later steps that assume its output exists.
set -e

# Inputs and outputs live next to this script, independent of the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BASE_AUDIO="$SCRIPT_DIR/base.wav"
OUTPUT_DIR="$SCRIPT_DIR"

# Audio settings
SAMPLE_RATE=48000
CHANNELS=1

# WAV encoding settings for Chrome fake audio capture compatibility
# PCM 16-bit is the most compatible format for Chrome's fake devices.
# Kept as a single string: the ffmpeg invocations below expand it unquoted so
# that it splits into separate arguments.
WAV_OPTS="-c:a pcm_s16le -ar ${SAMPLE_RATE} -ac ${CHANNELS}"

# Fail early with a readable message when ffmpeg is not installed.
if ! command -v ffmpeg >/dev/null 2>&1; then
  echo "❌ Error: ffmpeg not found in PATH" >&2
  echo "Please install ffmpeg 7.0+ and run this script again." >&2
  exit 1
fi

# Check ffmpeg version
# sed (instead of grep -P) works without PCRE support and never fails on a
# non-matching banner, so an unusual version string cannot abort the script
# under `set -e`. An optional leading "n" (e.g. "n7.0.1") is tolerated.
FFMPEG_VERSION=$(
  ffmpeg -version 2>/dev/null \
    | sed -n '1s/^ffmpeg version n\{0,1\}\([0-9][0-9]*\).*/\1/p'
) || true
echo "🔧 Detected ffmpeg major version: ${FFMPEG_VERSION:-unknown}"
if [ -z "$FFMPEG_VERSION" ]; then
  echo "Warning: could not determine the ffmpeg version; continuing anyway." >&2
elif [ "$FFMPEG_VERSION" -lt 7 ]; then
  # The header of this script documents ffmpeg 7.0+ as the requirement.
  echo "Warning: this script targets ffmpeg 7.0+; older versions are untested." >&2
fi

# Check if base audio exists
if [ ! -f "$BASE_AUDIO" ]; then
  echo "❌ Error: base.wav not found in $SCRIPT_DIR" >&2
  echo "Please provide a base.wav file with continuous speech audio." >&2
  exit 1
fi

echo ""
echo "🎵 Generating test audio files from base.wav..."
echo " Output directory: $OUTPUT_DIR"
echo " Sample rate: ${SAMPLE_RATE}Hz, Channels: ${CHANNELS}"
echo " Codec: PCM 16-bit (WAV) for Chrome fake audio compatibility"
echo ""
# -----------------------------------------------------------------------------
# 1. continuous_speech.wav (30s)
#    Up to the first 30 seconds of the base recording, resampled to the
#    target rate. Fixture for participants who speak constantly.
# -----------------------------------------------------------------------------
printf '%s\n' "1⃣ Generating continuous_speech.wav (30s of continuous speech)..."
# $WAV_OPTS is intentionally unquoted: it carries several ffmpeg arguments.
ffmpeg -y \
  -i "$BASE_AUDIO" \
  -t 30 \
  -af "aresample=${SAMPLE_RATE}" \
  $WAV_OPTS \
  "$OUTPUT_DIR/continuous_speech.wav" \
  2>/dev/null
printf '%s\n' " ✅ continuous_speech.wav created"
# -----------------------------------------------------------------------------
# 2. complete_silence.wav (30s)
#    Pure digital silence: the lavfi source evaluates the constant expression 0
#    for every sample, so there is nothing for a VAD to react to.
# -----------------------------------------------------------------------------
printf '%s\n' "2⃣ Generating complete_silence.wav (30s of TRUE digital silence)..."
# 30 seconds of mono zero-valued samples at the target sample rate.
zero_signal="aevalsrc=0:c=mono:s=${SAMPLE_RATE}:d=30"
# $WAV_OPTS is intentionally unquoted: it carries several ffmpeg arguments.
ffmpeg -y -f lavfi -i "$zero_signal" $WAV_OPTS "$OUTPUT_DIR/complete_silence.wav" 2>/dev/null
printf '%s\n' " ✅ complete_silence.wav created"
# -----------------------------------------------------------------------------
# 3. speech_5s_then_silence.wav (30s)
#    Seconds 0-5 carry speech from the base recording; the remaining 25s are
#    TRUE silence. The speech is mixed onto a 30s zero-valued bed so the
#    output length is set by the silence input (duration=first).
# -----------------------------------------------------------------------------
printf '%s\n' "3⃣ Generating speech_5s_then_silence.wav (5s speech + 25s TRUE silence)..."
# Chain 1: cut the first 5s of speech and rebase its timestamps to zero.
# Chain 2: mix it over the silence bed; volume=2 follows the two-input amix.
lead_speech_graph="[0:a]atrim=0:5,asetpts=PTS-STARTPTS,aresample=${SAMPLE_RATE}[speech];"
lead_speech_graph+="[1:a][speech]amix=inputs=2:duration=first:dropout_transition=0,volume=2[out]"
# $WAV_OPTS is intentionally unquoted: it carries several ffmpeg arguments.
ffmpeg -y \
  -i "$BASE_AUDIO" \
  -f lavfi -i "aevalsrc=0:c=mono:s=${SAMPLE_RATE}:d=30" \
  -filter_complex "$lead_speech_graph" \
  -map "[out]" \
  -t 30 \
  $WAV_OPTS \
  "$OUTPUT_DIR/speech_5s_then_silence.wav" \
  2>/dev/null
printf '%s\n' " ✅ speech_5s_then_silence.wav created"
# -----------------------------------------------------------------------------
# 4. silence_5s_then_speech.wav (30s)
#    Mirror image of the previous fixture: 5s of TRUE silence first, then 25s
#    of speech taken from the start of the base recording.
# -----------------------------------------------------------------------------
printf '%s\n' "4⃣ Generating silence_5s_then_speech.wav (5s TRUE silence + 25s speech)..."
# Chain 1: take 25s of speech, rebase timestamps, then push it 5s later on
#          every channel (adelay ... all=1).
# Chain 2: mix it over the 30s silence bed, whose length bounds the output.
delayed_speech_graph="[0:a]atrim=0:25,asetpts=PTS-STARTPTS,aresample=${SAMPLE_RATE},adelay=5s:all=1[speech];"
delayed_speech_graph+="[1:a][speech]amix=inputs=2:duration=first:dropout_transition=0,volume=2[out]"
# $WAV_OPTS is intentionally unquoted: it carries several ffmpeg arguments.
ffmpeg -y \
  -i "$BASE_AUDIO" \
  -f lavfi -i "aevalsrc=0:c=mono:s=${SAMPLE_RATE}:d=30" \
  -filter_complex "$delayed_speech_graph" \
  -map "[out]" \
  -t 30 \
  $WAV_OPTS \
  "$OUTPUT_DIR/silence_5s_then_speech.wav" \
  2>/dev/null
printf '%s\n' " ✅ silence_5s_then_speech.wav created"
# -----------------------------------------------------------------------------
# 5. speech_gap_speech.wav (30s)
# 5s speech, 10s TRUE silence, 15s speech - for testing speaker re-activation
#
# The three inputs (silence bed, first segment, delayed second segment) are
# summed with normalize=0 instead of "amix + volume=3". With normalisation on,
# amix rescales as soon as the first segment reaches its end, so a fixed
# volume=3 would play the second segment at 3/2 of its original level and could
# clip. The speech segments never overlap and the bed is all zeros, so a plain
# sum reproduces both segments at their original level.
# -----------------------------------------------------------------------------
echo "5⃣ Generating speech_gap_speech.wav (5s speech + 10s TRUE gap + 15s speech)..."
# [s1]: base 0-5s, placed at 0-5s.
# [s2]: base 5-20s, delayed by 15s so it occupies 15-30s.
# $WAV_OPTS is intentionally unquoted: it carries several ffmpeg arguments.
ffmpeg -y \
  -i "$BASE_AUDIO" \
  -f lavfi -i "aevalsrc=0:c=mono:s=${SAMPLE_RATE}:d=30" \
  -filter_complex "
    [0:a]atrim=0:5,asetpts=PTS-STARTPTS,aresample=${SAMPLE_RATE}[s1];
    [0:a]atrim=5:20,asetpts=PTS-STARTPTS,aresample=${SAMPLE_RATE},adelay=15s:all=1[s2];
    [1:a][s1][s2]amix=inputs=3:duration=first:normalize=0[out]
  " \
  -map "[out]" -t 30 $WAV_OPTS "$OUTPUT_DIR/speech_gap_speech.wav" 2>/dev/null
echo " ✅ speech_gap_speech.wav created"
# -----------------------------------------------------------------------------
# 6-11. Sequential speaker audio files (for rotation tests)
#       Speakers A..F each get the same 3s speech snippet (base 0-3s), placed
#       at 0s, 5s, 10s, 15s, 20s and 25s respectively, with TRUE silence
#       everywhere else in the 30s clip.
# -----------------------------------------------------------------------------
printf '%s\n' "6⃣ Generating sequential speaker audio files (A through F)..."
seq_offset=0
for seq_speaker in A B C D E F; do
  seq_end=$((seq_offset + 3))
  printf '%s\n' " → speaker_seq_${seq_speaker}.wav (speaks at ${seq_offset}-${seq_end}s)"

  # Speech chain: cut 3s, rebase timestamps, resample. Only speakers that do
  # not start at 0s get an adelay stage; speaker A's chain has none.
  seq_speech_chain="atrim=0:3,asetpts=PTS-STARTPTS,aresample=${SAMPLE_RATE}"
  if [ "$seq_offset" -gt 0 ]; then
    seq_speech_chain="${seq_speech_chain},adelay=${seq_offset}s:all=1"
  fi

  # $WAV_OPTS is intentionally unquoted: it carries several ffmpeg arguments.
  ffmpeg -y \
    -i "$BASE_AUDIO" \
    -f lavfi -i "aevalsrc=0:c=mono:s=${SAMPLE_RATE}:d=30" \
    -filter_complex "[0:a]${seq_speech_chain}[speech];[1:a][speech]amix=inputs=2:duration=first:dropout_transition=0,volume=2[out]" \
    -map "[out]" -t 30 $WAV_OPTS "$OUTPUT_DIR/speaker_seq_${seq_speaker}.wav" 2>/dev/null

  # Consecutive speakers start 5s apart.
  seq_offset=$((seq_offset + 5))
done
printf '%s\n' " ✅ Sequential speaker files created (A-F)"
# -----------------------------------------------------------------------------
# 12. simultaneous_then_solo.wav (30s)
#     Speech for the first 15s, TRUE silence for the last 15s.
#     Fixture for the "simultaneous speech" test (the participant that keeps
#     speaking).
# -----------------------------------------------------------------------------
printf '%s\n' "7⃣ Generating simultaneous_then_solo.wav (15s speech + 15s TRUE silence)..."
# Chain 1: first 15s of the base recording, timestamps rebased to zero.
# Chain 2: mixed over the 30s silence bed, whose length bounds the output.
solo_graph="[0:a]atrim=0:15,asetpts=PTS-STARTPTS,aresample=${SAMPLE_RATE}[speech];"
solo_graph+="[1:a][speech]amix=inputs=2:duration=first:dropout_transition=0,volume=2[out]"
# $WAV_OPTS is intentionally unquoted: it carries several ffmpeg arguments.
ffmpeg -y \
  -i "$BASE_AUDIO" \
  -f lavfi -i "aevalsrc=0:c=mono:s=${SAMPLE_RATE}:d=30" \
  -filter_complex "$solo_graph" \
  -map "[out]" \
  -t 30 \
  $WAV_OPTS \
  "$OUTPUT_DIR/simultaneous_then_solo.wav" \
  2>/dev/null
printf '%s\n' " ✅ simultaneous_then_solo.wav created"
# -----------------------------------------------------------------------------
# 13. simultaneous_then_stop.wav (30s)
#     Speech for the first 5s, TRUE silence for the remaining 25s.
#     Fixture for participants who go quiet after the simultaneous period.
# -----------------------------------------------------------------------------
printf '%s\n' "8⃣ Generating simultaneous_then_stop.wav (5s speech + 25s TRUE silence)..."
# Chain 1: first 5s of the base recording, timestamps rebased to zero.
# Chain 2: mixed over the 30s silence bed, whose length bounds the output.
stop_graph="[0:a]atrim=0:5,asetpts=PTS-STARTPTS,aresample=${SAMPLE_RATE}[speech];"
stop_graph+="[1:a][speech]amix=inputs=2:duration=first:dropout_transition=0,volume=2[out]"
# $WAV_OPTS is intentionally unquoted: it carries several ffmpeg arguments.
ffmpeg -y \
  -i "$BASE_AUDIO" \
  -f lavfi -i "aevalsrc=0:c=mono:s=${SAMPLE_RATE}:d=30" \
  -filter_complex "$stop_graph" \
  -map "[out]" \
  -t 30 \
  $WAV_OPTS \
  "$OUTPUT_DIR/simultaneous_then_stop.wav" \
  2>/dev/null
printf '%s\n' " ✅ simultaneous_then_stop.wav created"
# -----------------------------------------------------------------------------
# 14. ambient_pink_noise.wav (30s)
# Continuous low-level pink noise (anoisesrc, amplitude 0.02) with no speech.
# Used to test that participants with low audio levels are filtered out.
# NOTE(review): the noise is expected to stay below the audioLevel threshold
# (0.15) mentioned for this test - confirm against the detector's scale.
# -----------------------------------------------------------------------------
echo "9⃣ Generating ambient_pink_noise.wav (30s pink noise at amplitude 0.02)..."
# anoisesrc takes the sample rate as "r"; its "s" option is the PRNG seed.
# The seed is pinned to 48000 - the value the previous "s=${SAMPLE_RATE}"
# spelling effectively selected - so the generated noise stays reproducible.
# $WAV_OPTS is intentionally unquoted: it carries several ffmpeg arguments.
ffmpeg -y \
  -f lavfi -i "anoisesrc=color=pink:amplitude=0.02:r=${SAMPLE_RATE}:seed=48000:d=30" \
  $WAV_OPTS "$OUTPUT_DIR/ambient_pink_noise.wav" 2>/dev/null
echo " ✅ ambient_pink_noise.wav created"
# -----------------------------------------------------------------------------
# 15. brief_sound_1s_at_5s.wav (30s)
# 5s of TRUE silence, then only 1 second of speech, then silence again
# Used to test minimum speaking duration filter (should be filtered out)
# -----------------------------------------------------------------------------
echo "🔟 Generating brief_sound_1s_at_5s.wav (5s silence + 1s speech + 24s silence)..."
# The 1s snippet is delayed by 5s on every channel (all=1), matching the
# adelay form used by the other fixtures in this script.
# $WAV_OPTS is intentionally unquoted: it carries several ffmpeg arguments.
ffmpeg -y \
  -i "$BASE_AUDIO" \
  -f lavfi -i "aevalsrc=0:c=mono:s=${SAMPLE_RATE}:d=30" \
  -filter_complex "
    [0:a]atrim=0:1,asetpts=PTS-STARTPTS,aresample=${SAMPLE_RATE},adelay=5s:all=1[speech];
    [1:a][speech]amix=inputs=2:duration=first:dropout_transition=0,volume=2[out]
  " \
  -map "[out]" -t 30 $WAV_OPTS "$OUTPUT_DIR/brief_sound_1s_at_5s.wav" 2>/dev/null
echo " ✅ brief_sound_1s_at_5s.wav created"
# -----------------------------------------------------------------------------
# 16. brief_cough_at_5s.wav (30s)
# 5s of TRUE silence, then only 0.5 seconds of sound (simulating a cough),
# then silence again
# Used to test that very brief sounds are filtered out
# -----------------------------------------------------------------------------
echo "1⃣1⃣ Generating brief_cough_at_5s.wav (5s silence + 0.5s sound + 24.5s silence)..."
# The 0.5s snippet is delayed by 5s on every channel (all=1), matching the
# adelay form used by the other fixtures in this script.
# $WAV_OPTS is intentionally unquoted: it carries several ffmpeg arguments.
ffmpeg -y \
  -i "$BASE_AUDIO" \
  -f lavfi -i "aevalsrc=0:c=mono:s=${SAMPLE_RATE}:d=30" \
  -filter_complex "
    [0:a]atrim=0:0.5,asetpts=PTS-STARTPTS,aresample=${SAMPLE_RATE},adelay=5s:all=1[speech];
    [1:a][speech]amix=inputs=2:duration=first:dropout_transition=0,volume=2[out]
  " \
  -map "[out]" -t 30 $WAV_OPTS "$OUTPUT_DIR/brief_cough_at_5s.wav" 2>/dev/null
echo " ✅ brief_cough_at_5s.wav created"
# -----------------------------------------------------------------------------
# Verify silence in generated files
# -----------------------------------------------------------------------------
echo ""
echo "🔍 Verifying silence quality in generated files..."

#######################################
# Report the RMS level of a one-second window of an audio file.
# Arguments:
#   $1 - path of the audio file to inspect
#   $2 - start of the window in seconds (a position expected to be silent)
# Outputs:
#   stdout: "<file>: RMS at <start>s = <value>dB" when a level was measured
#   stderr: a short warning when no level could be extracted
# Returns:
#   0 in both cases; the check is informational and never aborts the script.
#######################################
verify_silence() {
  local file=$1
  local expected_silence_start=$2
  # Declared separately from the assignment so `local` does not mask the
  # pipeline's exit status.
  local rms

  # The window is given as start + duration: atrim parses its arguments as
  # time durations, so an arithmetic expression such as "15+1" is rejected.
  # astats reports on stderr; the first "RMS level dB:" line is taken and only
  # the text after the label is kept, so digits in the log prefix (for example
  # "Parsed_astats_1") can never be mistaken for the measurement.
  rms=$(
    ffmpeg -i "$file" \
      -af "atrim=start=${expected_silence_start}:duration=1,astats=metadata=1:reset=1" \
      -f null - 2>&1 \
      | awk -F'RMS level dB:[[:space:]]*' 'NF > 1 { print $2; exit }'
  ) || true

  if [ -n "$rms" ]; then
    echo " $file: RMS at ${expected_silence_start}s = ${rms}dB"
  else
    echo " $file: could not measure RMS at ${expected_silence_start}s" >&2
  fi
}
# Verify a few key files
# Each entry is "<file name>:<second at which the clip should be silent>".
for silence_probe in \
  "complete_silence.wav:15" \
  "speech_5s_then_silence.wav:10" \
  "speaker_seq_B.wav:2"; do
  verify_silence "$OUTPUT_DIR/${silence_probe%%:*}" "${silence_probe##*:}"
done
# -----------------------------------------------------------------------------
# Summary
# Lists the files by the names the generation steps actually write
# (ambient_pink_noise.wav, brief_sound_1s_at_5s.wav, brief_cough_at_5s.wav).
# -----------------------------------------------------------------------------
echo ""
echo "============================================================================="
echo "✅ Audio generation complete! (WAV format for Chrome fake audio capture)"
echo "============================================================================="
echo ""
echo "Generated files:"
echo " 📁 $OUTPUT_DIR/"
echo " ├── continuous_speech.wav (30s continuous speech)"
echo " ├── complete_silence.wav (30s TRUE digital silence - aevalsrc=0)"
echo " ├── speech_5s_then_silence.wav (5s speech + 25s TRUE silence)"
echo " ├── silence_5s_then_speech.wav (5s TRUE silence + 25s speech)"
echo " ├── speech_gap_speech.wav (5s speech + 10s gap + 15s speech)"
echo " ├── speaker_seq_A.wav (speaks at 0-3s)"
echo " ├── speaker_seq_B.wav (speaks at 5-8s)"
echo " ├── speaker_seq_C.wav (speaks at 10-13s)"
echo " ├── speaker_seq_D.wav (speaks at 15-18s)"
echo " ├── speaker_seq_E.wav (speaks at 20-23s)"
echo " ├── speaker_seq_F.wav (speaks at 25-28s)"
echo " ├── simultaneous_then_solo.wav (15s speech + 15s silence)"
echo " ├── simultaneous_then_stop.wav (5s speech + 25s silence)"
echo " ├── ambient_pink_noise.wav (30s low-level pink noise - below threshold)"
echo " ├── brief_sound_1s_at_5s.wav (5s silence + 1s speech + 24s silence - too short)"
echo " └── brief_cough_at_5s.wav (5s silence + 0.5s sound + 24.5s silence - simulates cough)"
echo ""
echo "Key features of this version:"
echo " • WAV format (PCM 16-bit) for Chrome fake audio capture compatibility"
echo " • Uses aevalsrc=0 for TRUE digital silence (samples = 0.0)"
echo " • amix filter for clean speech/silence transitions"
echo " • adelay for precise speech timing"
echo " • 48kHz sample rate, mono channel"
echo ""
echo "Usage in tests:"
echo " await joinBrowserFakeParticipant(browser, roomId, 'speaker1', {"
echo " audioFile: 'continuous_speech.wav'"
echo " });"
echo ""