---
# Integration Test
# (CI run page artifacts removed; this file is the workflow definition itself.)
name: Integration Test

on:
  workflow_dispatch:
    inputs:
      text:
        description: "Text to synthesize"
        default: "Hello! This is a test of the Voxtral text to speech system running on CI."
        required: false
      voice:
        description: "Voice preset"
        default: "neutral_female"
        required: false

jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        include:
          - os: ubuntu-24.04-arm
            name: Linux ARM64 (tch)
            backend: tch
            libtorch-url: "https://github.com/second-state/libtorch-releases/releases/download/v2.7.1/libtorch-cxx11-abi-aarch64-2.7.1.tar.gz"
          - os: macos-latest
            name: macOS ARM64 (MLX)
            backend: mlx
    runs-on: ${{ matrix.os }}
    name: Integration (${{ matrix.name }})
    steps:
      - uses: actions/checkout@v4

      - name: Init MLX submodule
        if: matrix.backend == 'mlx'
        run: git submodule update --init --recursive

      - name: Delete Cargo.lock
        run: rm -f Cargo.lock

      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable

      # ---------------------------------------------------------------
      # Build
      # ---------------------------------------------------------------
      - name: Download libtorch
        if: matrix.backend == 'tch'
        run: |
          curl -Lo libtorch.tar.gz "${{ matrix.libtorch-url }}"
          tar xzf libtorch.tar.gz

      - name: Set linker rpath-link (Linux only)
        if: runner.os == 'Linux' && matrix.backend == 'tch'
        run: echo "RUSTFLAGS=-C link-arg=-Wl,-rpath-link,${{ github.workspace }}/libtorch/lib" >> "$GITHUB_ENV"

      - name: Build (tch)
        if: matrix.backend == 'tch'
        env:
          LIBTORCH: ${{ github.workspace }}/libtorch
          LIBTORCH_BYPASS_VERSION_CHECK: "1"
        run: cargo build --release

      - name: Build (MLX)
        if: matrix.backend == 'mlx'
        run: cargo build --release --no-default-features --features mlx

      # ---------------------------------------------------------------
      # Download model
      # ---------------------------------------------------------------
      - name: Download model
        run: bash scripts/download_model.sh

      # Convert each .pt voice embedding to .safetensors alongside it.
      # weights_only=true keeps torch.load from unpickling arbitrary objects.
      - name: Convert voice embeddings to safetensors
        run: |
          python3 -m venv .venv
          .venv/bin/pip install torch safetensors numpy packaging --quiet
          .venv/bin/python3 -c "
          import torch, os
          from safetensors.torch import save_file
          d = 'models/voxtral-4b-tts/voice_embedding'
          for f in sorted(os.listdir(d)):
              if f.endswith('.pt'):
                  t = torch.load(os.path.join(d, f), map_location='cpu', weights_only=True)
                  save_file({'embedding': t}, os.path.join(d, f.replace('.pt', '.safetensors')))
                  print(f'Converted {f}')
          "

      # ---------------------------------------------------------------
      # CLI tests
      # ---------------------------------------------------------------
      # Note: macOS CI runners (M1) are ~60x slower than local M4 Max
      # for MLX inference, so we use shorter text on macOS to stay
      # within the 6h GitHub Actions timeout.
      # User-supplied inputs are passed via env, never interpolated into
      # the shell script, to prevent script injection.
      - name: "CLI: Generate speech (English, neutral_female)"
        env:
          TTS_TEXT: ${{ inputs.text }}
        run: |
          if [ "${{ matrix.backend }}" = "mlx" ]; then
            TEXT="Hello."
          else
            TEXT="$TTS_TEXT"
          fi
          ./target/release/voxtral-tts models/voxtral-4b-tts \
            --text "$TEXT" \
            --voice neutral_female \
            --output neutral_female_english.wav
          file neutral_female_english.wav
          ls -lh neutral_female_english.wav

      - name: "CLI: Generate speech (French, fr_female)"
        if: matrix.backend != 'mlx'
        run: |
          ./target/release/voxtral-tts models/voxtral-4b-tts \
            --text "Bonjour! Ceci est un test du système Voxtral." \
            --voice fr_female \
            --output fr_female_french.wav
          file fr_female_french.wav
          ls -lh fr_female_french.wav

      - name: "CLI: Generate speech (custom voice via input)"
        if: inputs.voice != 'neutral_female' && matrix.backend != 'mlx'
        env:
          TTS_TEXT: ${{ inputs.text }}
          TTS_VOICE: ${{ inputs.voice }}
        run: |
          ./target/release/voxtral-tts models/voxtral-4b-tts \
            --text "$TTS_TEXT" \
            --voice "$TTS_VOICE" \
            --output custom_voice.wav
          file custom_voice.wav
          ls -lh custom_voice.wav

      - name: "CLI: List voices"
        run: ./target/release/voxtral-tts models/voxtral-4b-tts --list-voices --text ""

      # ---------------------------------------------------------------
      # API server tests
      # ---------------------------------------------------------------
      - name: "Server: Start in background"
        run: |
          ./target/release/voxtral-tts-server models/voxtral-4b-tts --port 8090 &
          SERVER_PID=$!
          echo "SERVER_PID=$SERVER_PID" >> "$GITHUB_ENV"
          # Wait for server to be ready
          for i in $(seq 1 60); do
            if curl -sf http://127.0.0.1:8090/health > /dev/null 2>&1; then
              echo "Server ready after ${i}s"
              break
            fi
            sleep 1
          done
          curl -sf http://127.0.0.1:8090/health || (echo "Server failed to start"; kill $SERVER_PID 2>/dev/null; exit 1)

      - name: "Server: GET /health"
        run: curl -sf http://127.0.0.1:8090/health | tee /dev/stderr | grep -q ok

      - name: "Server: GET /v1/models"
        run: curl -sf http://127.0.0.1:8090/v1/models | tee /dev/stderr | grep -q voxtral

      - name: "Server: POST /v1/audio/speech (alloy)"
        run: |
          if [ "${{ matrix.backend }}" = "mlx" ]; then
            INPUT="Hello."
          else
            INPUT="Hello from the API server."
          fi
          curl -sf -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d "{\"input\":\"$INPUT\",\"voice\":\"alloy\",\"model\":\"voxtral-4b-tts\"}" \
            -o api_alloy.wav
          file api_alloy.wav
          ls -lh api_alloy.wav

      - name: "Server: POST /v1/audio/speech (es_male)"
        if: matrix.backend != 'mlx'
        run: |
          curl -sf -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hola, esta es una prueba del servidor.","voice":"es_male","model":"voxtral-4b-tts"}' \
            -o api_es_male.wav
          file api_es_male.wav
          ls -lh api_es_male.wav

      - name: "Server: POST /v1/audio/speech (mp3)"
        run: |
          curl -sf -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","response_format":"mp3"}' \
            -o api_alloy.mp3
          file api_alloy.mp3
          ls -lh api_alloy.mp3

      - name: "Server: POST /v1/audio/speech (flac)"
        run: |
          curl -sf -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","response_format":"flac"}' \
            -o api_alloy.flac
          file api_alloy.flac
          ls -lh api_alloy.flac

      - name: "Server: POST /v1/audio/speech (ogg/opus)"
        run: |
          curl -sf -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","response_format":"ogg"}' \
            -o api_alloy.ogg
          file api_alloy.ogg
          ls -lh api_alloy.ogg

      # Raw PCM has no container header, so `file` can't identify it;
      # just assert the output is non-empty.
      - name: "Server: POST /v1/audio/speech (pcm)"
        run: |
          curl -sf -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","response_format":"pcm"}' \
            -o api_alloy.pcm
          test -s api_alloy.pcm
          ls -lh api_alloy.pcm

      - name: "Server: Validation errors"
        run: |
          # Empty input -> 400
          STATUS=$(curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"","voice":"alloy"}')
          echo "Empty input: $STATUS"
          [ "$STATUS" = "400" ] || (echo "Expected 400, got $STATUS"; exit 1)
          # Invalid format -> 400
          STATUS=$(curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","response_format":"aac"}')
          echo "Invalid format: $STATUS"
          [ "$STATUS" = "400" ] || (echo "Expected 400, got $STATUS"; exit 1)
          # Speed out of range -> 400
          STATUS=$(curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","speed":10.0}')
          echo "Invalid speed: $STATUS"
          [ "$STATUS" = "400" ] || (echo "Expected 400, got $STATUS"; exit 1)

      - name: "Server: Streaming SSE"
        if: matrix.backend != 'mlx'
        run: |
          curl -sN -X POST http://127.0.0.1:8090/v1/audio/speech \
            -H "Content-Type: application/json" \
            -d '{"input":"Hello.","voice":"alloy","stream":true}' \
            --max-time 300 \
            -o sse_output.txt || true
          echo "--- SSE output (first 500 chars) ---"
          head -c 500 sse_output.txt
          echo ""
          grep -q "speech.audio.delta" sse_output.txt || (echo "Missing speech.audio.delta"; exit 1)
          grep -q "speech.audio.done" sse_output.txt || (echo "Missing speech.audio.done"; exit 1)
          echo "Streaming test passed"

      - name: "Server: Stop"
        if: always()
        run: kill ${{ env.SERVER_PID }} 2>/dev/null || true

      # ---------------------------------------------------------------
      # Upload audio artifacts
      # ---------------------------------------------------------------
      - name: Upload generated audio
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: audio-${{ matrix.os }}-${{ matrix.backend }}
          path: |
            *.wav
            *.mp3
            *.flac
            *.ogg
            *.pcm

      - name: Upload binaries
        uses: actions/upload-artifact@v4
        with:
          name: binaries-${{ matrix.os }}-${{ matrix.backend }}
          path: |
            target/release/voxtral-tts
            target/release/voxtral-tts-server