Address review feedback: defer IP/L2 distinction to Similarity objects

mulugetam · mulugetam · commit d9ac5f2ec90a · 2026-05-04T15:38:11.000Z
Move the metric-specific query pre-adjustment and raw-decode distance
accumulation out of DCTemplate and into the Similarity classes, where
the IP/L2 distinction is already managed.

- Add a static adjust_query_for_raw_decode() method to each
  SimilarityL2 and SimilarityIP specialization (AVX512, AVX2, NEON).
- Replace the if constexpr (Sim::metric_type == METRIC_L2) branches
  in DCTemplate::set_query() with a single call to
  Sim::adjust_query_for_raw_decode().
- Replace the hand-written SIMD loops in query_to_code_predecoded()
  with calls to the existing Similarity accumulator interface
  (begin_N / add_N_components / result_N).
- Fix bench_scalar_quantizer.py: filter out QT_count, since it is not
  a valid quantizer type and caused an error.

Signed-off-by: Mulugeta Mammo <mulugeta.mammo@intel.com>
diff --git a/benchs/bench_scalar_quantizer.py b/benchs/bench_scalar_quantizer.py
@@ -17,7 +17,7 @@
 
 variants = [(name, getattr(faiss.ScalarQuantizer, name))
             for name in dir(faiss.ScalarQuantizer)
-            if name.startswith('QT_')]
+            if name.startswith('QT_') and name != 'QT_count']
 
 quantizer = faiss.IndexFlatL2(d)
 # quantizer.add(np.zeros((1, d), dtype='float32'))
diff --git a/faiss/impl/scalar_quantizer/sq-avx2.cpp b/faiss/impl/scalar_quantizer/sq-avx2.cpp
@@ -405,6 +405,22 @@ struct SimilarityL2<SIMDLevel::AVX2> {
         const __m128 v3 = _mm_add_ps(v1, v2);
         return _mm_cvtss_f32(v3);
     }
+
+    static void adjust_query_for_raw_decode(
+            const float* x,
+            float* q_adj,
+            size_t d,
+            float vmin,
+            float vdiff,
+            float& scale_factor,
+            float& bias) {
+        float inv_vdiff = (vdiff != 0) ? 1.0f / vdiff : 0.0f;
+        for (size_t i = 0; i < d; i++) {
+            q_adj[i] = (x[i] - vmin) * inv_vdiff;
+        }
+        scale_factor = vdiff * vdiff;
+        bias = 0;
+    }
 };
 
 template <>
@@ -448,6 +464,23 @@ struct SimilarityIP<SIMDLevel::AVX2> {
         const __m128 v3 = _mm_add_ps(v1, v2);
         return _mm_cvtss_f32(v3);
     }
+
+    static void adjust_query_for_raw_decode(
+            const float* x,
+            float* q_adj,
+            size_t d,
+            float vmin,
+            float vdiff,
+            float& scale_factor,
+            float& bias) {
+        float sum_q = 0;
+        for (size_t i = 0; i < d; i++) {
+            q_adj[i] = x[i];
+            sum_q += x[i];
+        }
+        scale_factor = vdiff;
+        bias = vmin * sum_q;
+    }
 };
 
 /**********************************************************
@@ -506,66 +539,25 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::AVX2> : SQDistanceComputer {
     void set_query(const float* x) final {
         q = x;
         if constexpr (has_decode_raw()) {
-            if constexpr (Sim::metric_type == METRIC_L2) {
-                float inv_vdiff =
-                        (quant.vdiff != 0) ? 1.0f / quant.vdiff : 0.0f;
-                for (size_t i = 0; i < quant.d; i++) {
-                    q_adj[i] = (x[i] - quant.vmin) * inv_vdiff;
-                }
-                scale_factor = quant.vdiff * quant.vdiff;
-                bias = 0;
-            } else {
-                float sum_q = 0;
-                for (size_t i = 0; i < quant.d; i++) {
-                    q_adj[i] = x[i];
-                    sum_q += x[i];
-                }
-                scale_factor = quant.vdiff;
-                bias = quant.vmin * sum_q;
-            }
+            Sim::adjust_query_for_raw_decode(
+                    x,
+                    q_adj.data(),
+                    quant.d,
+                    quant.vmin,
+                    quant.vdiff,
+                    scale_factor,
+                    bias);
         }
     }
 
     float query_to_code_predecoded(const uint8_t* code) const {
-        __m256 acc0 = _mm256_setzero_ps();
-        __m256 acc1 = _mm256_setzero_ps();
-        const float* qptr = q_adj.data(); // hoist out of loop
-
-        size_t i = 0;
-        for (; i + 16 <= quant.d; i += 16) {
-            __m256 x0 = quant.decode_8_raw(code, static_cast<int>(i)).f;
-            __m256 x1 = quant.decode_8_raw(code, static_cast<int>(i + 8)).f;
-            __m256 q0 = _mm256_loadu_ps(qptr + i);
-            __m256 q1 = _mm256_loadu_ps(qptr + i + 8);
-            if constexpr (Sim::metric_type == METRIC_L2) {
-                __m256 d0 = _mm256_sub_ps(q0, x0);
-                __m256 d1 = _mm256_sub_ps(q1, x1);
-                acc0 = _mm256_fmadd_ps(d0, d0, acc0);
-                acc1 = _mm256_fmadd_ps(d1, d1, acc1);
-            } else {
-                acc0 = _mm256_fmadd_ps(q0, x0, acc0);
-                acc1 = _mm256_fmadd_ps(q1, x1, acc1);
-            }
-        }
-        // tail for remaining 8-lane block if d isn't a multiple of 16
-        for (; i < quant.d; i += 8) {
-            __m256 xi = quant.decode_8_raw(code, static_cast<int>(i)).f;
-            __m256 qi = _mm256_loadu_ps(qptr + i);
-            if constexpr (Sim::metric_type == METRIC_L2) {
-                __m256 diff = _mm256_sub_ps(qi, xi);
-                acc0 = _mm256_fmadd_ps(diff, diff, acc0);
-            } else {
-                acc0 = _mm256_fmadd_ps(qi, xi, acc0);
-            }
+        Similarity sim(q_adj.data());
+        sim.begin_8();
+        for (size_t i = 0; i < quant.d; i += 8) {
+            simd8float32 xi = quant.decode_8_raw(code, static_cast<int>(i));
+            sim.add_8_components(xi);
         }
-        __m256 accu = _mm256_add_ps(acc0, acc1);
-
-        // horizontal sum
-        __m128 sum = _mm_add_ps(
-                _mm256_castps256_ps128(accu), _mm256_extractf128_ps(accu, 1));
-        sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
-        sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 1));
-        return bias + scale_factor * _mm_cvtss_f32(sum);
+        return bias + scale_factor * sim.result_8();
     }
 
     float symmetric_dis(idx_t i, idx_t j) override {
diff --git a/faiss/impl/scalar_quantizer/sq-avx512.cpp b/faiss/impl/scalar_quantizer/sq-avx512.cpp
@@ -417,6 +417,22 @@ struct SimilarityL2<SIMDLevel::AVX512> {
     FAISS_ALWAYS_INLINE float result_16() {
         return horizontal_add(accu16);
     }
+
+    static void adjust_query_for_raw_decode(
+            const float* x,
+            float* q_adj,
+            size_t d,
+            float vmin,
+            float vdiff,
+            float& scale_factor,
+            float& bias) {
+        float inv_vdiff = (vdiff != 0) ? 1.0f / vdiff : 0.0f;
+        for (size_t i = 0; i < d; i++) {
+            q_adj[i] = (x[i] - vmin) * inv_vdiff;
+        }
+        scale_factor = vdiff * vdiff;
+        bias = 0;
+    }
 };
 
 template <>
@@ -451,6 +467,23 @@ struct SimilarityIP<SIMDLevel::AVX512> {
     FAISS_ALWAYS_INLINE float result_16() {
         return horizontal_add(accu16);
     }
+
+    static void adjust_query_for_raw_decode(
+            const float* x,
+            float* q_adj,
+            size_t d,
+            float vmin,
+            float vdiff,
+            float& scale_factor,
+            float& bias) {
+        float sum_q = 0;
+        for (size_t i = 0; i < d; i++) {
+            q_adj[i] = x[i];
+            sum_q += x[i];
+        }
+        scale_factor = vdiff;
+        bias = vmin * sum_q;
+    }
 };
 
 /**********************************************************
@@ -507,39 +540,25 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::AVX512>
     void set_query(const float* x) final {
         q = x;
         if constexpr (has_decode_raw()) {
-            if constexpr (Sim::metric_type == METRIC_L2) {
-                float inv_vdiff =
-                        (quant.vdiff != 0) ? 1.0f / quant.vdiff : 0.0f;
-                for (size_t i = 0; i < quant.d; i++) {
-                    q_adj[i] = (x[i] - quant.vmin) * inv_vdiff;
-                }
-                scale_factor = quant.vdiff * quant.vdiff;
-                bias = 0;
-            } else {
-                float sum_q = 0;
-                for (size_t i = 0; i < quant.d; i++) {
-                    q_adj[i] = x[i];
-                    sum_q += x[i];
-                }
-                scale_factor = quant.vdiff;
-                bias = quant.vmin * sum_q;
-            }
+            Sim::adjust_query_for_raw_decode(
+                    x,
+                    q_adj.data(),
+                    quant.d,
+                    quant.vmin,
+                    quant.vdiff,
+                    scale_factor,
+                    bias);
         }
     }
 
     float query_to_code_predecoded(const uint8_t* code) const {
-        __m512 accu = _mm512_setzero_ps();
+        Similarity sim(q_adj.data());
+        sim.begin_16();
         for (size_t i = 0; i < quant.d; i += 16) {
-            __m512 xi = quant.decode_16_raw(code, i).f;
-            __m512 qi = _mm512_loadu_ps(q_adj.data() + i);
-            if constexpr (Sim::metric_type == METRIC_L2) {
-                __m512 diff = _mm512_sub_ps(qi, xi);
-                accu = _mm512_fmadd_ps(diff, diff, accu);
-            } else {
-                accu = _mm512_fmadd_ps(qi, xi, accu);
-            }
+            simd16float32 xi = quant.decode_16_raw(code, i);
+            sim.add_16_components(xi);
         }
-        return bias + scale_factor * _mm512_reduce_add_ps(accu);
+        return bias + scale_factor * sim.result_16();
     }
 
     float symmetric_dis(idx_t i, idx_t j) override {
diff --git a/faiss/impl/scalar_quantizer/sq-neon.cpp b/faiss/impl/scalar_quantizer/sq-neon.cpp
@@ -403,6 +403,22 @@ struct SimilarityL2<SIMDLevel::ARM_NEON> {
     FAISS_ALWAYS_INLINE float result_8() {
         return horizontal_add(accu8);
     }
+
+    static void adjust_query_for_raw_decode(
+            const float* x,
+            float* q_adj,
+            size_t d,
+            float vmin,
+            float vdiff,
+            float& scale_factor,
+            float& bias) {
+        float inv_vdiff = (vdiff != 0) ? 1.0f / vdiff : 0.0f;
+        for (size_t i = 0; i < d; i++) {
+            q_adj[i] = (x[i] - vmin) * inv_vdiff;
+        }
+        scale_factor = vdiff * vdiff;
+        bias = 0;
+    }
 };
 
 template <>
@@ -437,6 +453,23 @@ struct SimilarityIP<SIMDLevel::ARM_NEON> {
     FAISS_ALWAYS_INLINE float result_8() {
         return horizontal_add(accu8);
     }
+
+    static void adjust_query_for_raw_decode(
+            const float* x,
+            float* q_adj,
+            size_t d,
+            float vmin,
+            float vdiff,
+            float& scale_factor,
+            float& bias) {
+        float sum_q = 0;
+        for (size_t i = 0; i < d; i++) {
+            q_adj[i] = x[i];
+            sum_q += x[i];
+        }
+        scale_factor = vdiff;
+        bias = vmin * sum_q;
+    }
 };
 
 /**********************************************************
@@ -493,46 +526,25 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::ARM_NEON>
     void set_query(const float* x) final {
         q = x;
         if constexpr (has_decode_raw()) {
-            if constexpr (Sim::metric_type == METRIC_L2) {
-                float inv_vdiff =
-                        (quant.vdiff != 0) ? 1.0f / quant.vdiff : 0.0f;
-                for (size_t i = 0; i < quant.d; i++) {
-                    q_adj[i] = (x[i] - quant.vmin) * inv_vdiff;
-                }
-                scale_factor = quant.vdiff * quant.vdiff;
-                bias = 0;
-            } else {
-                float sum_q = 0;
-                for (size_t i = 0; i < quant.d; i++) {
-                    q_adj[i] = x[i];
-                    sum_q += x[i];
-                }
-                scale_factor = quant.vdiff;
-                bias = quant.vmin * sum_q;
-            }
+            Sim::adjust_query_for_raw_decode(
+                    x,
+                    q_adj.data(),
+                    quant.d,
+                    quant.vmin,
+                    quant.vdiff,
+                    scale_factor,
+                    bias);
         }
     }
 
     float query_to_code_predecoded(const uint8_t* code) const {
-        float32x4_t accu0 = vdupq_n_f32(0);
-        float32x4_t accu1 = vdupq_n_f32(0);
+        Similarity sim(q_adj.data());
+        sim.begin_8();
         for (size_t i = 0; i < quant.d; i += 8) {
             simd8float32 xi = quant.decode_8_raw(code, i);
-            float32x4_t qi0 = vld1q_f32(q_adj.data() + i);
-            float32x4_t qi1 = vld1q_f32(q_adj.data() + i + 4);
-            if constexpr (Sim::metric_type == METRIC_L2) {
-                float32x4_t d0 = vsubq_f32(qi0, xi.data.val[0]);
-                float32x4_t d1 = vsubq_f32(qi1, xi.data.val[1]);
-                accu0 = vfmaq_f32(accu0, d0, d0);
-                accu1 = vfmaq_f32(accu1, d1, d1);
-            } else {
-                accu0 = vfmaq_f32(accu0, qi0, xi.data.val[0]);
-                accu1 = vfmaq_f32(accu1, qi1, xi.data.val[1]);
-            }
+            sim.add_8_components(xi);
         }
-        float32x4_t sum4 = vaddq_f32(accu0, accu1);
-        float result = vaddvq_f32(sum4);
-        return bias + scale_factor * result;
+        return bias + scale_factor * sim.result_8();
     }
 
     float symmetric_dis(idx_t i, idx_t j) override {