-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwhisper_align.py
More file actions
88 lines (69 loc) · 2.24 KB
/
whisper_align.py
File metadata and controls
88 lines (69 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
import whisperx
import torch
# Model name handed to whisperx.load_model when the ASR model is loaded.
MODEL_NAME = "large-v3"
def get_model_cache_dir():
    """Return the per-user model cache directory, creating it if missing.

    The directory is ``~/Library/Application Support/LyricVision/models``
    with ``~`` expanded to the current user's home directory.
    """
    cache_dir = os.path.expanduser(
        "~/Library/Application Support/LyricVision/models"
    )
    # exist_ok keeps repeated calls harmless once the directory exists.
    os.makedirs(cache_dir, exist_ok=True)
    return cache_dir
def load_whisperx_model(device=None):
    """Load the WhisperX model named by ``MODEL_NAME`` on the CPU.

    Args:
        device: Accepted for call compatibility but ignored; the model is
            always loaded on ``"cpu"`` to avoid MPS errors on macOS.

    Returns:
        A ``(model, device)`` tuple, where ``device`` is always ``"cpu"``.
    """
    # FORCE CPU regardless of what the caller passed in.
    device = "cpu"
    loader = whisperx.load_model
    cache_dir = get_model_cache_dir()
    model = loader(
        MODEL_NAME,
        device,
        compute_type="int8",  # best for CPU
        download_root=cache_dir,
    )
    return model, device
def transcribe_with_word_timestamps(audio_path, lyrics=None):
    """Return word-level timestamps for an audio file.

    If ``lyrics`` contains text, that text is force-aligned against the audio
    as a single segment. If ``lyrics`` is ``None``, empty, or whitespace-only,
    the model's own transcription is aligned instead.

    Args:
        audio_path: Path to the audio file; passed to ``whisperx.load_audio``.
        lyrics: Optional known text to align in place of the transcript.

    Returns:
        A list of ``{"word": ..., "start": ..., "end": ...}`` dicts in the
        order the aligner produced them. Words for which the aligner did not
        report both ``start`` and ``end`` are omitted.
    """
    model, device = load_whisperx_model()
    audio = whisperx.load_audio(audio_path)

    # Always transcribe first: the detected language selects the alignment
    # model, and the transcript bounds the forced-alignment window.
    result = model.transcribe(audio)
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"],
        device=device,
    )

    # Whitespace-only lyrics carry nothing to align, so treat them the same
    # as missing lyrics instead of force-aligning an empty string.
    lyrics_text = lyrics.strip() if lyrics else ""

    if lyrics_text:
        # FORCED ALIGNMENT MODE: one segment holding the provided lyrics.
        transcript_segments = result["segments"]
        if transcript_segments:
            window_end = transcript_segments[-1]["end"]
        else:
            # No transcript segments: an end of 0 would give the aligner a
            # zero-length window, so span the whole clip instead.
            # whisperx.load_audio resamples to 16 kHz
            # (whisperx.audio.SAMPLE_RATE), hence samples / 16000 = seconds.
            sample_rate = 16000
            window_end = len(audio) / sample_rate
        segments = [{
            "text": lyrics_text,
            "start": 0,
            "end": window_end,
        }]
    else:
        # NORMAL TRANSCRIPTION MODE: align the model's own segments.
        segments = result["segments"]

    result_aligned = whisperx.align(
        segments,
        model_a,
        metadata,
        audio,
        device,
    )

    words = []
    for segment in result_aligned["segments"]:
        # Keep only words that carry both timestamps.
        words.extend(
            {
                "word": word["word"],
                "start": word["start"],
                "end": word["end"],
            }
            for word in segment.get("words", [])
            if "start" in word and "end" in word
        )
    return words