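fix(quant): extend GPU quantization restriction to CUDA backend

Previously maybe_quantize_linear skipped runtime quantization on Metal by
recording a fallback and returning Ok(None). It now returns an error when
quantization is enabled for the layer group and the weight is on a Metal or
CUDA device. The same guard is added to the ImageProjector weight path.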

TimmyOVO · TimmyOVO · commit a1ece5cd52a8 · 2025-11-11T21:37:26.000+08:00
diff --git a/crates/infer-deepseek/src/model/mod.rs b/crates/infer-deepseek/src/model/mod.rs
@@ -240,6 +240,19 @@ impl ImageProjector {
         let quant = QuantizationState::global();
         let config = quant.config();
         let mut qmatmul: Option<std::sync::Arc<QMatMul>> = None;
+        // GPU fast-fail: disallow runtime quantization on Metal/CUDA for projector as well.
+        if (weight.device().is_metal() || weight.device().is_cuda())
+            && config.kind.is_enabled()
+            && quant.enabled_for(LinearLayerGroup::Projector)
+        {
+            anyhow::bail!(
+                "GPU backend: runtime quantization is disabled on Metal/CUDA. Refusing to fallback.\n\
+                 Disable quantization (DEEPSEEK_OCR_QUANT=none) or run on CPU.\n\
+                 Context: module=projector, in_dim={}, backend={}",
+                input_dim,
+                crate::quantization::backend_label(&weight.device())
+            );
+        }
         if quant.enabled_for(LinearLayerGroup::Projector) {
             match config.kind {
                 QuantizationKind::Q8_0 => {
diff --git a/crates/infer-deepseek/src/transformer/weights.rs b/crates/infer-deepseek/src/transformer/weights.rs
@@ -444,18 +444,19 @@ fn maybe_quantize_linear(
 ) -> Result<Option<Arc<QMatMul>>> {
     let quant = QuantizationState::global();
     let config = quant.config();
-    // Disable runtime quantization entirely on Metal to avoid MPS kernel issues.
-    if weight.device().is_metal() {
-        tracing::trace!(
-            tensor = tensor_name,
-            ?group,
-            action = "fallback",
-            reason = "metal_disabled",
-            backend = crate::quantization::backend_label(&weight.device()),
-            "quant-linear"
+    // GPU fast-fail: if quant is requested for this group on Metal/CUDA, error out (awaiting upstream kernel fixes).
+    if (weight.device().is_metal() || weight.device().is_cuda())
+        && config.kind.is_enabled()
+        && quant.enabled_for(group)
+    {
+        anyhow::bail!(
+            "GPU backend: runtime quantization is disabled on Metal/CUDA. Refusing to fallback.\n\
+             Disable quantization (DEEPSEEK_OCR_QUANT=none) or run on CPU.\n\
+             Context: tensor={}, group={:?}, backend={}",
+            tensor_name,
+            group,
+            crate::quantization::backend_label(&weight.device())
         );
-        quant.record_attempt(module, QuantizationOutcome::Fallback);
-        return Ok(None);
     }
     if !quant.enabled_for(group) {
         trace!(