-
-
Notifications
You must be signed in to change notification settings - Fork 6.4k
feat: Add ModelsLab text-to-speech provider #5165
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,114 @@ | ||
// Voice choices offered by the ModelsLab TTS settings form. Each entry is
// rendered as an <option> of the "TTSModelsLabVoiceId" select: `value` is the
// option value, `label` is the text shown to the user.
const MODELSLAB_VOICES = [
  ["en_us_001", "English (US) - Voice 1"],
  ["en_us_006", "English (US) - Voice 2"],
  ["en_us_007", "English (US) - Voice 3"],
  ["en_us_009", "English (US) - Voice 4"],
  ["en_us_010", "English (US) - Voice 5"],
  ["en_uk_001", "English (UK) - Voice 1"],
  ["en_uk_003", "English (UK) - Voice 2"],
  ["en_au_001", "English (AU) - Voice 1"],
  ["en_au_002", "English (AU) - Voice 2"],
].map(([value, label]) => ({ value, label }));
|
|
||
// Language choices for the "TTSModelsLabLanguage" select: `value` is the
// option value, `label` is the text shown to the user.
const MODELSLAB_LANGUAGES = [
  ["english", "English"],
  ["spanish", "Spanish"],
  ["french", "French"],
  ["german", "German"],
  ["italian", "Italian"],
  ["portuguese", "Portuguese"],
  ["polish", "Polish"],
  ["hindi", "Hindi"],
].map(([value, label]) => ({ value, label }));
|
|
||
// Playback-speed choices for the "TTSModelsLabSpeed" select. Values are kept
// as strings because they are used directly as <option> values.
const MODELSLAB_SPEEDS = [
  ["0.5", "0.5x (Slow)"],
  ["0.75", "0.75x"],
  ["1", "1x (Normal)"],
  ["1.25", "1.25x"],
  ["1.5", "1.5x (Fast)"],
  ["2", "2x (Very Fast)"],
].map(([value, label]) => ({ value, label }));
|
|
||
| export default function ModelsLabTextToSpeechOptions({ settings }) { | ||
| return ( | ||
| <div className="flex flex-col gap-y-4"> | ||
| <div className="flex gap-x-4"> | ||
| <div className="flex flex-col w-60"> | ||
| <label className="text-white text-sm font-semibold block mb-3"> | ||
| API Key | ||
| </label> | ||
| <input | ||
| type="password" | ||
| name="TTSModelsLabApiKey" | ||
| className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5" | ||
| placeholder="ModelsLab API Key" | ||
| defaultValue={settings?.TTSModelsLabApiKey ? "*".repeat(20) : ""} | ||
| required={true} | ||
| autoComplete="off" | ||
| spellCheck={false} | ||
| /> | ||
| <p className="text-xs text-white/60 mt-1"> | ||
| Get your API key at{" "} | ||
| <a | ||
| href="https://modelslab.com/dashboard/api-keys" | ||
| target="_blank" | ||
| rel="noreferrer" | ||
| className="underline hover:text-white" | ||
| > | ||
| modelslab.com | ||
| </a> | ||
| </p> | ||
| </div> | ||
| <div className="flex flex-col w-60"> | ||
| <label className="text-white text-sm font-semibold block mb-3"> | ||
| Voice | ||
| </label> | ||
| <select | ||
| name="TTSModelsLabVoiceId" | ||
| defaultValue={settings?.TTSModelsLabVoiceId ?? "en_us_001"} | ||
| className="border-none bg-theme-settings-input-bg border-gray-500 text-white text-sm rounded-lg block w-full p-2.5" | ||
| > | ||
| {MODELSLAB_VOICES.map((voice) => ( | ||
| <option key={voice.value} value={voice.value}> | ||
| {voice.label} | ||
| </option> | ||
| ))} | ||
| </select> | ||
| </div> | ||
| <div className="flex flex-col w-60"> | ||
| <label className="text-white text-sm font-semibold block mb-3"> | ||
| Language | ||
| </label> | ||
| <select | ||
| name="TTSModelsLabLanguage" | ||
| defaultValue={settings?.TTSModelsLabLanguage ?? "english"} | ||
| className="border-none bg-theme-settings-input-bg border-gray-500 text-white text-sm rounded-lg block w-full p-2.5" | ||
| > | ||
| {MODELSLAB_LANGUAGES.map((lang) => ( | ||
| <option key={lang.value} value={lang.value}> | ||
| {lang.label} | ||
| </option> | ||
| ))} | ||
| </select> | ||
| </div> | ||
| <div className="flex flex-col w-40"> | ||
| <label className="text-white text-sm font-semibold block mb-3"> | ||
| Speed | ||
| </label> | ||
| <select | ||
| name="TTSModelsLabSpeed" | ||
| defaultValue={settings?.TTSModelsLabSpeed ?? "1"} | ||
| className="border-none bg-theme-settings-input-bg border-gray-500 text-white text-sm rounded-lg block w-full p-2.5" | ||
| > | ||
| {MODELSLAB_SPEEDS.map((speed) => ( | ||
| <option key={speed.value} value={speed.value}> | ||
| {speed.label} | ||
| </option> | ||
| ))} | ||
| </select> | ||
| </div> | ||
| </div> | ||
| </div> | ||
| ); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,125 @@ | ||
/**
 * Text-to-speech provider backed by the ModelsLab v6 voice HTTP API.
 *
 * Configuration is read from environment variables when an instance is
 * constructed:
 *   - TTS_MODELSLAB_API_KEY  (required)
 *   - TTS_MODELSLAB_VOICE_ID (optional, falls back to DEFAULT_VOICE)
 *   - TTS_MODELSLAB_LANGUAGE (optional, falls back to DEFAULT_LANGUAGE)
 *   - TTS_MODELSLAB_SPEED    (optional, falls back to DEFAULT_SPEED)
 */
class ModelsLabTTS {
  /**
   * Voice IDs known when this provider was written. The list is not enforced
   * by this class: whatever voice ID is configured is sent to the API as-is.
   * NOTE(review): the list is static and may go stale; fetching the available
   * voices from the provider would be preferable if such an endpoint exists.
   */
  static VOICES = [
    "en_us_001",
    "en_us_006",
    "en_us_007",
    "en_us_009",
    "en_us_010",
    "en_uk_001",
    "en_uk_003",
    "en_au_001",
    "en_au_002",
  ];

  static DEFAULT_VOICE = "en_us_001";
  static DEFAULT_LANGUAGE = "english";
  static DEFAULT_SPEED = 1;

  /**
   * @throws {Error} When TTS_MODELSLAB_API_KEY is not set.
   */
  constructor() {
    if (!process.env.TTS_MODELSLAB_API_KEY)
      throw new Error("No ModelsLab API key was set for TTS.");
    this.apiKey = process.env.TTS_MODELSLAB_API_KEY;

    // `||` rather than `??`: an env var that is present but blank (e.g.
    // `TTS_MODELSLAB_VOICE_ID=`) must fall back to the default as well.
    this.voice =
      process.env.TTS_MODELSLAB_VOICE_ID?.trim() || ModelsLabTTS.DEFAULT_VOICE;
    this.language =
      process.env.TTS_MODELSLAB_LANGUAGE?.trim() ||
      ModelsLabTTS.DEFAULT_LANGUAGE;
    this.speed = ModelsLabTTS.#parseSpeed(process.env.TTS_MODELSLAB_SPEED);

    this.#log(`Initialized with voice: ${this.voice}`);
  }

  /**
   * Converts the configured speed into a usable number.
   * A missing, non-numeric, or non-positive value yields DEFAULT_SPEED, so the
   * request never carries NaN (which JSON.stringify would serialize as null).
   * @param {string|undefined} raw
   * @returns {number}
   */
  static #parseSpeed(raw) {
    const speed = Number.parseFloat(raw);
    return Number.isFinite(speed) && speed > 0
      ? speed
      : ModelsLabTTS.DEFAULT_SPEED;
  }

  #log(text, ...args) {
    console.log(`\x1b[32m[ModelsLabTTS]\x1b[0m ${text}`, ...args);
  }

  /**
   * POSTs a JSON payload and returns the parsed JSON reply.
   * The body is parsed regardless of HTTP status so that a JSON error payload
   * from the API is still available to the caller.
   * @param {string} url
   * @param {object} payload
   * @returns {Promise<any>}
   * @throws {Error} When the reply body is not valid JSON; the message carries
   *   the HTTP status and the original parse error is attached as `cause`.
   */
  async #postJson(url, payload) {
    const response = await fetch(url, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload),
    });

    try {
      return await response.json();
    } catch (e) {
      throw new Error(
        `ModelsLab returned a non-JSON response (HTTP ${response.status} ${response.statusText})`,
        { cause: e }
      );
    }
  }

  /**
   * Fetches a URL and returns the response body as a Buffer.
   * @param {string} url
   * @returns {Promise<Buffer>}
   * @throws {Error} When the response status is not OK.
   */
  async #fetchUrl(url) {
    const response = await fetch(url);
    if (!response.ok)
      throw new Error(`Failed to fetch audio: ${response.statusText}`);
    const arrayBuffer = await response.arrayBuffer();
    return Buffer.from(arrayBuffer);
  }

  /**
   * Polls the ModelsLab fetch endpoint until the audio is ready.
   *
   * The wait before each attempt grows linearly (1s, 2s, 3s, 4s) and is then
   * capped at 5s, so the default 20 attempts wait roughly 90 seconds in total
   * before giving up.
   *
   * NOTE(review): the result has to be polled, so a job the provider never
   * finishes is retried until maxAttempts is exhausted. Confirm against the
   * provider's docs whether the schedule fits its typical latency.
   *
   * @param {string|number} requestId
   * @param {number} maxAttempts
   * @returns {Promise<Buffer|null>} The audio, or null on an API error or
   *   timeout.
   */
  async #pollForResult(requestId, maxAttempts = 20) {
    const fetchUrl = "https://modelslab.com/api/v6/voice/fetch";
    const stepMs = 1000;
    const maxDelayMs = 5000;
    let delayMs = stepMs;

    for (let attempt = 0; attempt < maxAttempts; attempt++) {
      await new Promise((resolve) => setTimeout(resolve, delayMs));

      // NOTE(review): the API key travels in the JSON body (`key`) rather than
      // in an Authorization header; confirm this is what the API expects.
      const data = await this.#postJson(fetchUrl, {
        key: this.apiKey,
        request_id: String(requestId),
      });

      if (data?.status === "success" && data.output?.length > 0) {
        return await this.#fetchUrl(data.output[0]);
      }

      if (data?.status === "error") {
        // `messege` (sic) is checked as an alternate spelling of the field;
        // presumably some API replies use it. TODO confirm.
        this.#log(
          "Poll error:",
          data.message || data.messege || "Unknown error"
        );
        return null;
      }

      this.#log(`Polling attempt ${attempt + 1}/${maxAttempts}...`);

      // Linear backoff: add one second per attempt, capped at five seconds.
      delayMs = Math.min(delayMs + stepMs, maxDelayMs);
    }

    this.#log("Timed out waiting for audio generation.");
    return null;
  }

  /**
   * Generates a buffer from the given text input using the ModelsLab TTS API.
   * Never throws: any failure is logged and reported as null.
   * @param {string} textInput - The text to be converted to audio.
   * @returns {Promise<Buffer|null>} A buffer containing the audio data, or
   *   null when the input is blank or the request fails.
   */
  async ttsBuffer(textInput) {
    // Skip the network round-trip when there is nothing to synthesize.
    if (!textInput || String(textInput).trim() === "") {
      this.#log("No text was provided to convert to speech.");
      return null;
    }

    try {
      const data = await this.#postJson(
        "https://modelslab.com/api/v6/voice/text_to_speech",
        {
          key: this.apiKey,
          prompt: textInput,
          voice_id: this.voice,
          language: this.language,
          speed: this.speed,
        }
      );

      if (data?.status === "success" && data.output?.length > 0) {
        return await this.#fetchUrl(data.output[0]);
      }

      if (data?.status === "processing" && data.id) {
        this.#log(`Processing... polling for request ID: ${data.id}`);
        return await this.#pollForResult(data.id);
      }

      this.#log("Unexpected response:", JSON.stringify(data));
      return null;
    } catch (e) {
      console.error("[ModelsLabTTS] Error:", e);
      return null;
    }
  }
}
|
|
||
| module.exports = { ModelsLabTTS }; | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it possible to have language=French while the voice is English (UK) - Voice 2? I am not sure whether that kind of combination is valid.
Additionally, do we have any insight into how often voices are updated or added? This list will not be actively maintained by the team, so it can go out of date quickly.
If there is a way to pull the voices from something like GET /voice/models and render that dynamic list to the user, that would be best, so it's always current.