-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathf5tts_thai_example.py
More file actions
95 lines (83 loc) · 2.85 KB
/
f5tts_thai_example.py
File metadata and controls
95 lines (83 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
'''
Demonstration of F5-TTS Engine for Thai language
'''
import logging
from pathlib import Path
import torch
from cached_path import cached_path
from flowtts.inference import FlowTTSPipeline, ModelConfig, AudioConfig
from transformers import pipeline
logging.basicConfig(level=logging.INFO)
def thonburian_whisper(audio_file, model_name="biodatlab/whisper-th-medium-combined", lang="th"):
    '''
    Transcribe a reference audio file with a Whisper ASR model.

    Args:
        audio_file (str): path to the reference audio file
        model_name (str): Hugging Face Whisper ASR model id
        lang (str): language code passed to Whisper (e.g. "th")

    Returns:
        str: transcribed text of ``audio_file``
    '''
    # transformers' pipeline accepts a CUDA device index or the string "cpu"
    device = 0 if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        task="automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,  # long-form audio is processed in 30 s chunks
        device=device,
    )
    # Force the target language and the transcription task via generate_kwargs
    return pipe(str(audio_file), generate_kwargs={"language": f"<|{lang}|>", "task": "transcribe"}, batch_size=16)["text"]
def main():
    '''
    F5-TTS demonstration for Thai text-to-speech generation.

    Builds the model/audio configuration, transcribes a reference voice
    sample with Whisper, and synthesizes speech into ``outputs_f5/``.
    '''
    # Configure model settings for F5
    model_config = ModelConfig(
        language="th",
        model_type="F5",
        checkpoint="hf://ThuraAung1601/E2-F5-TTS/F5_Thai/mega_f5_last.safetensors",
        vocab_file="hf://ThuraAung1601/E2-F5-TTS/F5_Thai/mega_vocab.txt",
        ode_method="euler",
        use_ema=True,
        vocoder="vocos",
        device="cuda" if torch.cuda.is_available() else "cpu"
    )
    # Update audio config with F5-specific parameters
    audio_config = AudioConfig(
        silence_threshold=-45,
        max_audio_length=20000,
        cfg_strength=2.5,
        nfe_step=32,
        target_rms=0.1,
        cross_fade_duration=0.15,
        speed=1.0,
        min_silence_len=500,
        keep_silence=200,
        seek_step=10
    )
    # Initialize pipeline.
    # NOTE: named `tts_pipeline` (not `pipeline`) so it does not shadow
    # the `pipeline` function imported from transformers at module level.
    tts_pipeline = FlowTTSPipeline(
        model_config=model_config,
        audio_config=audio_config,
        temp_dir="temp_f5"
    )
    # Text for speech generation
    test_text = "ยินดีที่ได้รู้จักคุณวันนี้อากาศดีมาก"
    # Create output directory
    output_dir = Path("outputs_f5")
    output_dir.mkdir(parents=True, exist_ok=True)
    # Reference data for Flow Matching based models to mimic
    ref_voice = str(cached_path("hf://ThuraAung1601/E2-F5-TTS/ref_samples/ref_sample.wav"))
    ref_text = thonburian_whisper(ref_voice)
    # Generate speech
    try:
        output_path = tts_pipeline(
            text=test_text,
            ref_voice=ref_voice,
            ref_text=ref_text,
            output_file=str(output_dir / "f5_output.wav"),
            speed=1.0,
            check_duration=True
        )
        print(f"Generated F5 audio saved to: {output_path}")
    except Exception as e:
        # Top-level boundary: log the failure rather than crashing the demo
        logging.error(f"Error during speech synthesis: {e}")
if __name__ == "__main__":
    main()