11import logging
22from typing import Optional
33
4- from transformers import pipeline
5-
6- from whisperplus .model .load_model import load_model_whisper
4+ import torch
5+ from transformers import AutoModelForSpeechSeq2Seq , AutoProcessor , pipeline
76
87logging .basicConfig (level = logging .INFO , format = '%(asctime)s - %(levelname)s - %(message)s' )
98
@@ -28,6 +27,75 @@ def __init__(
2827 else :
2928 logging .info ("Model already loaded." )
3029
30+ def compile_model (self , model ):
31+ model .model .encoder .forward = torch .compile (
32+ model .model .encoder .forward , mode = "reduce-overhead" , fullgraph = True )
33+ model .model .decoder .forward = torch .compile (
34+ model .model .decoder .forward , mode = "reduce-overhead" , fullgraph = True )
35+ return model
36+
    def hqq_compile_model(self, model_id, quant_config, device):
        """Load a Whisper seq2seq model, HQQ-quantize its encoder/decoder, and compile it.

        Args:
            model_id: Hugging Face model identifier passed to ``from_pretrained``.
            quant_config: HQQ quantization config forwarded to
                ``AutoHQQHFModel.quantize_model`` for both encoder and decoder.
            device: Target device for quantization (forwarded to HQQ).

        Returns:
            Tuple of ``(model, processor)`` where ``model`` has been quantized,
            patched for inference, and compiled via ``self.compile_model``.
        """
        # hqq is an optional dependency; imported lazily so the rest of the
        # class works without it.
        import hqq.models.base as hqq_base
        import torch._dynamo
        from hqq.core.quantize import HQQBackend, HQQLinear
        from hqq.models.hf.base import AutoHQQHFModel
        from hqq.utils.patching import prepare_for_inference

        # Let torch.compile fall back to eager on graph breaks instead of raising.
        torch._dynamo.config.suppress_errors = True

        # NOTE(review): flash_attention_2 is hard-coded here, unlike
        # load_model_whisper which makes it optional — presumably required by
        # the HQQ/torchao path; confirm on hardware without FA2 support.
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2")

        processor = AutoProcessor.from_pretrained(model_id)
        HQQLinear.set_backend(HQQBackend.PYTORCH)

        # Quantize encoder and decoder separately with the same config.
        AutoHQQHFModel.quantize_model(
            model.model.encoder, quant_config=quant_config, compute_dtype=torch.bfloat16, device=device)

        AutoHQQHFModel.quantize_model(
            model.model.decoder, quant_config=quant_config, compute_dtype=torch.bfloat16, device=device)

        # Register HQQLinear as a quantizable layer type, then patch each
        # submodule for inference. Order matters: tags must be set before
        # prepare_for_inference.
        hqq_base._QUANT_LAYERS = [torch.nn.Linear, HQQLinear]
        AutoHQQHFModel.set_auto_linear_tags(model.model.encoder)
        prepare_for_inference(model.model.encoder)

        AutoHQQHFModel.set_auto_linear_tags(model.model.decoder)
        # Decoder uses the torchao int4 backend; encoder keeps the default.
        prepare_for_inference(model.model.decoder, backend="torchao_int4")

        # Finally compile encoder/decoder forward passes (torch.compile).
        model = self.compile_model(model)

        return model, processor
68+
69+ def load_model_whisper (
70+ self ,
71+ model_id : str = "distil-whisper/distil-large-v3" ,
72+ quant_config = None ,
73+ hqq_compile : bool = False ,
74+ flash_attention_2 : bool = False ,
75+ device = None ):
76+
77+ if hqq_compile :
78+ return self .hqq_compile_model (model_id , quant_config , device )
79+ else :
80+ if flash_attention_2 :
81+ attn_implementation = "flash_attention_2"
82+ else :
83+ attn_implementation = "sdpa"
84+
85+ model = AutoModelForSpeechSeq2Seq .from_pretrained (
86+ model_id ,
87+ quantization_config = quant_config ,
88+ low_cpu_mem_usage = True ,
89+ use_safetensors = True ,
90+ attn_implementation = attn_implementation ,
91+ torch_dtype = torch .bfloat16 ,
92+ device_map = device ,
93+ )
94+
95+ processor = AutoProcessor .from_pretrained (model_id )
96+
97+ return model , processor
98+
3199 def load_plus_model (
32100 self ,
33101 model_id : str = "distil-whisper/distil-large-v3" ,
@@ -36,7 +104,7 @@ def load_plus_model(
36104 flash_attention_2 : bool = True ,
37105 ):
38106
39- model , processor = load_model_whisper (
107+ model , processor = self . load_model_whisper (
40108 model_id = model_id ,
41109 quant_config = quant_config ,
42110 hqq_compile = hqq_compile ,
0 commit comments