# LyricVision/whisper_align.py
import os
import whisperx
import torch

# Name of the Whisper model passed to whisperx.load_model().
MODEL_NAME = "large-v3"


def get_model_cache_dir():
    """Return the per-user model cache directory, creating it if missing.

    The location is ``~/Library/Application Support/LyricVision/models``
    with ``~`` expanded to the current user's home directory.
    """
    cache_dir = os.path.expanduser(
        "~/Library/Application Support/LyricVision/models"
    )
    # exist_ok=True keeps repeated calls from failing once the folder exists.
    os.makedirs(cache_dir, exist_ok=True)
    return cache_dir


def load_whisperx_model(device=None):
    """Load the WhisperX model on the CPU.

    Args:
        device: Accepted for interface compatibility but ignored; the
            device is always overridden to ``"cpu"``.

    Returns:
        A ``(model, device)`` tuple where ``device`` is always ``"cpu"``.
    """
    # FORCE CPU — avoids MPS errors on macOS
    device = "cpu"

    load_options = {
        "compute_type": "int8",  # best for CPU
        "download_root": get_model_cache_dir(),
    }
    return whisperx.load_model(MODEL_NAME, device, **load_options), device


def transcribe_with_word_timestamps(audio_path, lyrics=None):
    """
    Produce word-level timestamps for an audio file.

    If lyrics is provided (and not blank) -> forced alignment using provided text.
    If lyrics is None, empty or whitespace-only -> normal transcription + alignment.

    Args:
        audio_path: Path of the audio file, passed to ``whisperx.load_audio``.
        lyrics: Optional lyrics text to align against the audio.

    Returns:
        A list of ``{"word", "start", "end"}`` dicts, in the order returned
        by the aligner. Words that come back without both a ``start`` and an
        ``end`` timestamp are skipped.
    """
    model, device = load_whisperx_model()
    audio = whisperx.load_audio(audio_path)

    # Always transcribe first (needed for language detection)
    result = model.transcribe(audio)
    segments = result["segments"]

    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"],
        device=device,
    )

    # Whitespace-only lyrics carry no text to align, so treat them the same
    # as missing lyrics instead of forcing alignment of an empty string.
    lyrics_text = lyrics.strip() if lyrics else ""

    # -------------------------------------------------------
    # FORCED ALIGNMENT MODE
    # -------------------------------------------------------
    if lyrics_text:
        if segments:
            window_end = segments[-1]["end"]
        else:
            # No transcribed segments: a 0..0 window would leave the aligner
            # with no audio, so span the whole clip instead.
            # NOTE(review): assumes whisperx.load_audio returns mono samples
            # at its default 16 kHz rate — confirm if a custom rate is used.
            sample_rate = 16000
            window_end = len(audio) / sample_rate

        # Replace transcript segments with provided lyrics
        segments = [{
            "text": lyrics_text,
            "start": 0,
            "end": window_end,
        }]

    # -------------------------------------------------------
    # NORMAL TRANSCRIPTION MODE uses the transcript segments unchanged
    # -------------------------------------------------------
    result_aligned = whisperx.align(
        segments,
        model_a,
        metadata,
        audio,
        device
    )

    words = []
    for segment in result_aligned["segments"]:
        # Keep only the words that carry both timestamps.
        words.extend(
            {
                "word": word["word"],
                "start": word["start"],
                "end": word["end"],
            }
            for word in segment.get("words", [])
            if "start" in word and "end" in word
        )

    return words