Skip to content

Commit 7a26eae

Browse files
alibeklfc authored and facebook-github-bot committed
SIMD optimization RaBitQ (facebookresearch#4515)
Summary: This diff introduces a new file, rabitq_simd.h, with multiple SIMD-optimized implementations of the dot-product calculation using population count (popcnt) operations:
1. AVX-512 implementation with AVX512VPOPCNTDQ: processes data in 512-bit (64-byte) chunks using dedicated AVX-512 popcnt instructions, with fallbacks to smaller vector sizes for the remaining data.
2. AVX-512 fallback implementation without AVX512VPOPCNTDQ: uses AVX512F instructions with a lookup-based popcount method for 512-bit vectors, falling back to smaller vectors for the remaining data.
3. AVX2 implementation: uses a lookup-based popcount method with 256-bit (32-byte) AVX2 instructions, handling leftovers with 128-bit SSE operations and scalar processing.
4. Scalar fallback: processes data in 64-bit chunks with builtin popcount operations for systems without SIMD support.
Differential Revision: D79301607
1 parent 8482842 commit 7a26eae

2 files changed

Lines changed: 545 additions & 24 deletions

File tree

faiss/impl/RaBitQuantizer.cpp

Lines changed: 6 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,16 @@
77

88
#include <faiss/impl/RaBitQuantizer.h>
99

10+
#include <faiss/impl/FaissAssert.h>
11+
#include <faiss/utils/distances.h>
12+
#include <faiss/utils/rabitq_simd.h>
1013
#include <algorithm>
1114
#include <cmath>
1215
#include <cstring>
1316
#include <limits>
1417
#include <memory>
1518
#include <vector>
1619

17-
#include <faiss/impl/FaissAssert.h>
18-
#include <faiss/utils/distances.h>
19-
2020
namespace faiss {
2121

2222
struct FactorsData {
@@ -351,27 +351,9 @@ float RaBitDistanceComputerQ::distance_to_code(const uint8_t* code) {
351351
const size_t di_8b = (d + 7) / 8;
352352
const size_t di_64b = (di_8b / 8) * 8;
353353

354-
uint64_t dot_qo = 0;
355-
for (size_t j = 0; j < qb; j++) {
356-
const uint8_t* query_j = rearranged_rotated_qq.data() + j * di_8b;
357-
358-
// process 64-bit popcounts
359-
uint64_t count_dot = 0;
360-
for (size_t i = 0; i < di_64b; i += 8) {
361-
const auto qv = *(const uint64_t*)(query_j + i);
362-
const auto yv = *(const uint64_t*)(binary_data + i);
363-
count_dot += __builtin_popcountll(qv & yv);
364-
}
365-
366-
// process leftovers
367-
for (size_t i = di_64b; i < di_8b; i++) {
368-
const auto qv = *(query_j + i);
369-
const auto yv = *(binary_data + i);
370-
count_dot += __builtin_popcount(qv & yv);
371-
}
372-
373-
dot_qo += (count_dot << j);
374-
}
354+
// Use the optimized popcount function from rabitq_simd.h
355+
float dot_qo =
356+
rabitq_dp_popcnt(rearranged_rotated_qq.data(), binary_data, d, qb);
375357

376358
// It was a willful decision (after the discussion) not to pre-cache
377359
// the sum of all bits, just in order to reduce the overhead per vector.

0 commit comments

Comments
 (0)