---
# Integration Test
# (CI run page artifacts removed; this file is the workflow definition itself.)
name: Integration Test

on:
  workflow_dispatch:
    inputs:
      text:
        description: "Text to synthesize"
        default: "Hello! This is a test of the Voxtral text to speech system running on CI."
        required: false
      voice:
        description: "Voice preset"
        default: "neutral_female"
        required: false

jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        include:
          - os: ubuntu-24.04-arm
            name: Linux ARM64 (tch)
            backend: tch
            libtorch-url: "https://github.com/second-state/libtorch-releases/releases/download/v2.7.1/libtorch-cxx11-abi-aarch64-2.7.1.tar.gz"
          - os: macos-latest
            name: macOS ARM64 (MLX)
            backend: mlx
    runs-on: ${{ matrix.os }}
    name: Integration (${{ matrix.name }})
    steps:
      - uses: actions/checkout@v4

      - name: Init MLX submodule
        if: matrix.backend == 'mlx'
        run: git submodule update --init --recursive

      - name: Delete Cargo.lock
        run: rm -f Cargo.lock

      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable

      # ---------------------------------------------------------------
      # Build
      # ---------------------------------------------------------------
      - name: Download libtorch
        if: matrix.backend == 'tch'
        run: |
          curl -Lo libtorch.tar.gz "${{ matrix.libtorch-url }}"
          tar xzf libtorch.tar.gz

      - name: Set linker rpath-link (Linux only)
        if: runner.os == 'Linux' && matrix.backend == 'tch'
        run: echo "RUSTFLAGS=-C link-arg=-Wl,-rpath-link,${{ github.workspace }}/libtorch/lib" >> "$GITHUB_ENV"

      - name: Build (tch)
        if: matrix.backend == 'tch'
        env:
          LIBTORCH: ${{ github.workspace }}/libtorch
          LIBTORCH_BYPASS_VERSION_CHECK: "1"
        run: cargo build --release

      - name: Build (MLX)
        if: matrix.backend == 'mlx'
        run: cargo build --release --no-default-features --features mlx

      # ---------------------------------------------------------------
      # Download model
      # ---------------------------------------------------------------
      - name: Download model
        run: bash scripts/download_model.sh

      # Convert each .pt voice embedding to .safetensors alongside it.
      # weights_only=true keeps torch.load from unpickling arbitrary objects.
      - name: Convert voice embeddings to safetensors
        run: |
          python3 -m venv .venv
          .venv/bin/pip install torch safetensors numpy packaging --quiet
          .venv/bin/python3 -c "
          import torch, os
          from safetensors.torch import save_file
          d = 'models/voxtral-4b-tts/voice_embedding'
          for f in sorted(os.listdir(d)):
              if f.endswith('.pt'):
                  t = torch.load(os.path.join(d, f), map_location='cpu', weights_only=True)
                  save_file({'embedding': t}, os.path.join(d, f.replace('.pt', '.safetensors')))
                  print(f'Converted {f}')
          "

      # ---------------------------------------------------------------
      # CLI tests
      # ---------------------------------------------------------------
      # Note: macOS CI runners (M1) are ~60x slower than local M4 Max
      # for MLX inference, so we use shorter text on macOS to stay
      # within the 6h GitHub Actions timeout.
      # User-supplied inputs are passed via env, never interpolated into
      # the shell script, to prevent script injection.
      - name: "CLI: Generate speech (English, neutral_female)"
        env:
          TTS_TEXT: ${{ inputs.text }}
        run: |
          if [ "${{ matrix.backend }}" = "mlx" ]; then
            TEXT="Hello."
          else
            TEXT="$TTS_TEXT"
          fi
          ./target/release/voxtral-tts models/voxtral-4b-tts \
            --text "$TEXT" \
            --voice neutral_female \
            --output neutral_female_english.wav
          file neutral_female_english.wav
          ls -lh neutral_female_english.wav

      - name: "CLI: Generate speech (French, fr_female)"
        if: matrix.backend != 'mlx'
        run: |
          ./target/release/voxtral-tts models/voxtral-4b-tts \
            --text "Bonjour! Ceci est un test du système Voxtral." \
            --voice fr_female \
            --output fr_female_french.wav
          file fr_female_french.wav
          ls -lh fr_female_french.wav

      - name: "CLI: Generate speech (custom voice via input)"
        if: inputs.voice != 'neutral_female' && matrix.backend != 'mlx'
        env:
          TTS_TEXT: ${{ inputs.text }}
          TTS_VOICE: ${{ inputs.voice }}
        run: |
          ./target/release/voxtral-tts models/voxtral-4b-tts \
            --text "$TTS_TEXT" \
            --voice "$TTS_VOICE" \
            --output custom_voice.wav
          file custom_voice.wav
          ls -lh custom_voice.wav

      - name: "CLI: List voices"
        run: ./target/release/voxtral-tts models/voxtral-4b-tts --list-voices --text ""

      # ---------------------------------------------------------------
      # API server tests
      # ---------------------------------------------------------------
      - name: "Server: Start in background"
        run: |
          ./target/release/voxtral-tts-server models/voxtral-4b-tts --port 8090 &
          SERVER_PID=$!
          echo "SERVER_PID=$SERVER_PID" >> "$GITHUB_ENV"
          # Wait for server to be ready
          for i in $(seq 1 60); do
            if curl -sf http://127.0.0.1:8090/health > /dev/null 2>&1; then
              echo "Server ready after ${i}s"
              break
            fi
            sleep 1
          done
          curl -sf http://127.0.0.1:8090/health || (echo "Server failed to start"; kill $SERVER_PID 2>/dev/null; exit 1)

      - name: "Server: GET /health"
        run: curl -sf http://127.0.0.1:8090/health | tee /dev/stderr | grep -q ok

      - name: "Server: GET /v1/models"
        run: curl -sf http://127.0.0.1:8090/v1/models | tee /dev/stderr | grep -q voxtral

      - name: "Server: POST /v1/audio/speech (alloy)"
        run: |
          if [ "${{ matrix.backend }}" = "mlx" ]; then
            INPUT="Hello."
          else
            INPUT="Hello from the API server."
          fi
          curl -sf -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d "{\"input\":\"$INPUT\",\"voice\":\"alloy\",\"model\":\"voxtral-4b-tts\"}" \
            -o api_alloy.wav
          file api_alloy.wav
          ls -lh api_alloy.wav

      - name: "Server: POST /v1/audio/speech (es_male)"
        if: matrix.backend != 'mlx'
        run: |
          curl -sf -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hola, esta es una prueba del servidor.","voice":"es_male","model":"voxtral-4b-tts"}' \
            -o api_es_male.wav
          file api_es_male.wav
          ls -lh api_es_male.wav

      - name: "Server: POST /v1/audio/speech (mp3)"
        run: |
          curl -sf -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","response_format":"mp3"}' \
            -o api_alloy.mp3
          file api_alloy.mp3
          ls -lh api_alloy.mp3

      - name: "Server: POST /v1/audio/speech (flac)"
        run: |
          curl -sf -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","response_format":"flac"}' \
            -o api_alloy.flac
          file api_alloy.flac
          ls -lh api_alloy.flac

      - name: "Server: POST /v1/audio/speech (ogg/opus)"
        run: |
          curl -sf -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","response_format":"ogg"}' \
            -o api_alloy.ogg
          file api_alloy.ogg
          ls -lh api_alloy.ogg

      # Raw PCM has no container header, so `file` can't identify it;
      # just assert the output is non-empty.
      - name: "Server: POST /v1/audio/speech (pcm)"
        run: |
          curl -sf -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","response_format":"pcm"}' \
            -o api_alloy.pcm
          test -s api_alloy.pcm
          ls -lh api_alloy.pcm

      - name: "Server: Validation errors"
        run: |
          # Empty input -> 400
          STATUS=$(curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"","voice":"alloy"}')
          echo "Empty input: $STATUS"
          [ "$STATUS" = "400" ] || (echo "Expected 400, got $STATUS"; exit 1)
          # Invalid format -> 400
          STATUS=$(curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","response_format":"aac"}')
          echo "Invalid format: $STATUS"
          [ "$STATUS" = "400" ] || (echo "Expected 400, got $STATUS"; exit 1)
          # Speed out of range -> 400
          STATUS=$(curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","speed":10.0}')
          echo "Invalid speed: $STATUS"
          [ "$STATUS" = "400" ] || (echo "Expected 400, got $STATUS"; exit 1)

      - name: "Server: Streaming SSE"
        if: matrix.backend != 'mlx'
        run: |
          curl -sN -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","stream":true}' \
            --max-time 300 \
            -o sse_output.txt || true
          echo "--- SSE output (first 500 chars) ---"
          head -c 500 sse_output.txt
          echo ""
          grep -q "speech.audio.delta" sse_output.txt || (echo "Missing speech.audio.delta"; exit 1)
          grep -q "speech.audio.done" sse_output.txt || (echo "Missing speech.audio.done"; exit 1)
          echo "Streaming test passed"

      - name: "Server: Stop"
        if: always()
        run: kill ${{ env.SERVER_PID }} 2>/dev/null || true

      # ---------------------------------------------------------------
      # Upload audio artifacts
      # ---------------------------------------------------------------
      - name: Upload generated audio
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: audio-${{ matrix.os }}-${{ matrix.backend }}
          path: |
            *.wav
            *.mp3
            *.flac
            *.ogg
            *.pcm

      - name: Upload binaries
        uses: actions/upload-artifact@v4
        with:
          name: binaries-${{ matrix.os }}-${{ matrix.backend }}
          path: |
            target/release/voxtral-tts
            target/release/voxtral-tts-server