diff --git a/benchs/bench_flat_l2_panorama.py b/benchs/bench_flat_l2_panorama.py index 660109ba6c..0dc101172f 100644 --- a/benchs/bench_flat_l2_panorama.py +++ b/benchs/bench_flat_l2_panorama.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +import argparse import multiprocessing as mp import time @@ -11,11 +12,18 @@ import numpy as np try: - from faiss.contrib.datasets_fb import DatasetGIST1M + from faiss.contrib.datasets_fb import DatasetSIFT1M, DatasetGIST1M except ImportError: - from faiss.contrib.datasets import DatasetGIST1M + from faiss.contrib.datasets import DatasetSIFT1M, DatasetGIST1M -ds = DatasetGIST1M() +parser = argparse.ArgumentParser() +parser.add_argument("--dataset", default="gist1m", choices=["sift1m", "gist1m"]) +args = parser.parse_args() + +if args.dataset == "sift1m": + ds = DatasetSIFT1M() +else: + ds = DatasetGIST1M() nq = 10 xq = ds.get_queries()[:nq] @@ -60,7 +68,7 @@ def build_index(name): return index -nlevels = 8 +nlevels = 16 if args.dataset == "gist1m" else 8 batch_size = 512 plt.figure(figsize=(8, 6), dpi=80) @@ -93,7 +101,8 @@ def build_index(name): ) plt.xticks(x, labels, rotation=0) plt.ylabel("QPS") -plt.title("Flat Indexes on GIST1M") +dataset_label = args.dataset.upper() +plt.title(f"Flat Indexes on {dataset_label}") plt.tight_layout() -plt.savefig("bench_flat_l2_panorama.png", bbox_inches="tight") +plt.savefig(f"bench_flat_l2_panorama_{args.dataset}.png", bbox_inches="tight") diff --git a/benchs/bench_ivf_flat_panorama.py b/benchs/bench_ivf_flat_panorama.py index 85cf840591..4c5fe96870 100644 --- a/benchs/bench_ivf_flat_panorama.py +++ b/benchs/bench_ivf_flat_panorama.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+import argparse import multiprocessing as mp import time @@ -11,11 +12,18 @@ import numpy as np try: - from faiss.contrib.datasets_fb import DatasetGIST1M + from faiss.contrib.datasets_fb import DatasetSIFT1M, DatasetGIST1M except ImportError: - from faiss.contrib.datasets import DatasetGIST1M + from faiss.contrib.datasets import DatasetSIFT1M, DatasetGIST1M -ds = DatasetGIST1M() +parser = argparse.ArgumentParser() +parser.add_argument("--dataset", default="gist1m", choices=["sift1m", "gist1m"]) +args = parser.parse_args() + +if args.dataset == "sift1m": + ds = DatasetSIFT1M() +else: + ds = DatasetGIST1M() xq = ds.get_queries() xb = ds.get_database() @@ -29,7 +37,7 @@ k = 10 gt = gt[:, :k] -nlevels = 8 +nlevels = 16 if args.dataset == "gist1m" else 8 def get_ivf_index(index): @@ -90,12 +98,12 @@ def eval_and_plot(name, plot=True): eval_and_plot(f"IVF{nlist},Flat") # IVFFlatPanorama (with PCA transform to concentrate energy in early dimensions) -eval_and_plot(f"PCA{d},IVF{nlist},FlatPanorama{nlevels}") +eval_and_plot(f"PCA{d},IVF{nlist},FlatPanorama{nlevels}_{1024}") -plt.title("IVF Flat Indexes on GIST1M") -plt.title("Indices on GIST1M") +dataset_label = args.dataset.upper() +plt.title(f"IVF Flat Indexes on {dataset_label}") plt.xlabel(f"Recall@{k}") plt.ylabel("QPS") plt.yscale("log") plt.legend(bbox_to_anchor=(1.02, 0.1), loc="upper left", borderaxespad=0) -plt.savefig("bench_ivf_flat_panorama.png", bbox_inches="tight") +plt.savefig(f"bench_ivf_flat_panorama_{args.dataset}.png", bbox_inches="tight") diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index e6eda3df75..ae5863531c 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -362,6 +362,15 @@ endif() # Export FAISS_HEADERS variable to parent scope. set(FAISS_HEADERS ${FAISS_HEADERS} PARENT_SCOPE) +# Detect BMI2 compiler support. 
+include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-mbmi2" COMPILER_SUPPORTS_BMI2) +if(COMPILER_SUPPORTS_BMI2) + set(FAISS_BMI2_FLAGS "-mbmi2") +else() + set(FAISS_BMI2_FLAGS "") +endif() + add_library(faiss ${FAISS_SRC}) add_library(faiss_avx2 ${FAISS_SRC}) @@ -369,7 +378,7 @@ if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512" set_target_properties(faiss_avx2 PROPERTIES EXCLUDE_FROM_ALL TRUE) endif() if(NOT WIN32) - target_compile_options(faiss_avx2 PRIVATE $<$:-mavx2 -mfma -mf16c -mpopcnt>) + target_compile_options(faiss_avx2 PRIVATE $<$:-mavx2 -mfma -mf16c -mpopcnt ${FAISS_BMI2_FLAGS}>) else() # MSVC enables FMA with /arch:AVX2; no separate flags for F16C, POPCNT # Ref. FMA (under /arch:AVX2): https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64 @@ -389,7 +398,7 @@ endif() if(NOT WIN32) # All modern CPUs support F, CD, VL, DQ, BW extensions. # Ref: https://en.wikipedia.org/wiki/AVX512 - target_compile_options(faiss_avx512 PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt>) + target_compile_options(faiss_avx512 PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt ${FAISS_BMI2_FLAGS}>) else() target_compile_options(faiss_avx512 PRIVATE $<$:/arch:AVX512>) # we need bigobj for the swig wrapper @@ -405,7 +414,7 @@ endif() if(NOT WIN32) # Architecture mode to support AVX512 extensions available since Intel(R) Sapphire Rapids. 
# Ref: https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide - target_compile_options(faiss_avx512_spr PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vpopcntdq -mpopcnt -mavx512fp16 -mavx512bf16>) + target_compile_options(faiss_avx512_spr PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vpopcntdq -mpopcnt -mavx512fp16 -mavx512bf16 ${FAISS_BMI2_FLAGS}>) else() target_compile_options(faiss_avx512_spr PRIVATE $<$:/arch:AVX512>) # we need bigobj for the swig wrapper diff --git a/faiss/IndexFlat.cpp b/faiss/IndexFlat.cpp index 5aa78b9a24..a4b3bcf938 100644 --- a/faiss/IndexFlat.cpp +++ b/faiss/IndexFlat.cpp @@ -628,8 +628,10 @@ inline void flat_pano_search_core( SingleResultHandler res(handler); std::vector query_cum_norms(index.n_levels + 1); - std::vector exact_distances(index.batch_size); std::vector active_indices(index.batch_size); + std::vector active_byteset(index.batch_size); + std::vector exact_distances(index.batch_size); + std::vector dot_buffer(index.batch_size); #pragma omp for for (int64_t i = 0; i < n; i++) { @@ -664,7 +666,9 @@ inline void flat_pano_search_core( nullptr, use_sel, active_indices, + active_byteset, exact_distances, + dot_buffer, threshold, local_stats); }); diff --git a/faiss/IndexIVFFlatPanorama.cpp b/faiss/IndexIVFFlatPanorama.cpp index ba32fce132..ec57162f44 100644 --- a/faiss/IndexIVFFlatPanorama.cpp +++ b/faiss/IndexIVFFlatPanorama.cpp @@ -32,19 +32,23 @@ IndexIVFFlatPanorama::IndexIVFFlatPanorama( size_t nlist_in, int n_levels_in, MetricType metric, - bool own_invlists_in) + bool own_invlists_in, + size_t batch_size_in) : IndexIVFFlat(quantizer_in, d_in, nlist_in, metric, false), - n_levels(n_levels_in) { + n_levels(n_levels_in), + batch_size(batch_size_in) { FAISS_THROW_IF_NOT(metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT); // We construct the 
inverted lists here so that we can use the // level-oriented storage. This does not cause a leak as we constructed // IndexIVF first, with own_invlists set to false. - this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, n_levels); + this->invlists = new ArrayInvertedListsPanorama( + nlist, code_size, n_levels, batch_size); this->own_invlists = own_invlists_in; } -IndexIVFFlatPanorama::IndexIVFFlatPanorama() : n_levels(0) {} +IndexIVFFlatPanorama::IndexIVFFlatPanorama() + : n_levels(0), batch_size(Panorama::kDefaultBatchSize) {} namespace { @@ -55,6 +59,11 @@ struct IVFFlatScannerPanorama : InvertedListScanner { using C = typename VectorDistance::C; static constexpr MetricType metric = VectorDistance::metric; + mutable std::vector active_indices_; + mutable std::vector active_byteset_; + mutable std::vector exact_distances_; + mutable std::vector dot_buffer_; + IVFFlatScannerPanorama( const VectorDistance& vd_in, const ArrayInvertedListsPanorama* storage_in, @@ -65,7 +74,11 @@ struct IVFFlatScannerPanorama : InvertedListScanner { storage(storage_in) { keep_max = vd.is_similarity; code_size = vd.d * sizeof(float); - cum_sums.resize(storage->n_levels + 1); + cum_sums.resize(storage->pano.n_levels + 1); + active_indices_.resize(storage->pano.batch_size); + active_byteset_.resize(storage->pano.batch_size); + exact_distances_.resize(storage->pano.batch_size); + dot_buffer_.resize(storage->pano.batch_size); } const float* xi = nullptr; @@ -90,6 +103,7 @@ struct IVFFlatScannerPanorama : InvertedListScanner { } using InvertedListScanner::scan_codes; + size_t scan_codes( size_t list_size, const uint8_t* codes, @@ -97,20 +111,16 @@ struct IVFFlatScannerPanorama : InvertedListScanner { ResultHandler& handler) const override { size_t nup = 0; - const size_t n_batches = - (list_size + storage->kBatchSize - 1) / storage->kBatchSize; + const size_t bs = storage->pano.batch_size; + const size_t n_batches = (list_size + bs - 1) / bs; const float* cum_sums_data = 
storage->get_cum_sums(list_no); - std::vector exact_distances(storage->kBatchSize); - std::vector active_indices(storage->kBatchSize); - PanoramaStats local_stats; local_stats.reset(); for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { - size_t batch_start = batch_no * storage->kBatchSize; - + size_t batch_start = batch_no * bs; size_t num_active = with_metric_type(metric, [&]() { return storage->pano.progressive_filter_batch( codes, @@ -122,17 +132,18 @@ struct IVFFlatScannerPanorama : InvertedListScanner { sel, ids, use_sel, - active_indices, - exact_distances, + active_indices_, + active_byteset_, + exact_distances_, + dot_buffer_, handler.threshold, local_stats); }); - // Add batch survivors to heap. for (size_t i = 0; i < num_active; i++) { - uint32_t idx = active_indices[i]; + uint32_t idx = active_indices_[i]; size_t global_idx = batch_start + idx; - float dis = exact_distances[idx]; + float dis = exact_distances_[idx]; if (C::cmp(handler.threshold, dis)) { int64_t id = store_pairs ? lo_build(list_no, global_idx) diff --git a/faiss/IndexIVFFlatPanorama.h b/faiss/IndexIVFFlatPanorama.h index 9ec897895d..6144e43ce3 100644 --- a/faiss/IndexIVFFlatPanorama.h +++ b/faiss/IndexIVFFlatPanorama.h @@ -37,6 +37,7 @@ namespace faiss { /// `ArrayInvertedListsPanorama`, which is a struct member of `IndexIVF`. 
struct IndexIVFFlatPanorama : IndexIVFFlat { size_t n_levels; + size_t batch_size; std::vector> cum_sums; @@ -46,7 +47,8 @@ struct IndexIVFFlatPanorama : IndexIVFFlat { size_t nlist_, int n_levels, MetricType = METRIC_L2, - bool own_invlists = true); + bool own_invlists = true, + size_t batch_size = Panorama::kDefaultBatchSize); InvertedListScanner* get_InvertedListScanner( bool store_pairs, diff --git a/faiss/impl/Panorama.h b/faiss/impl/Panorama.h index 79a23a64a7..1ec917ec8f 100644 --- a/faiss/impl/Panorama.h +++ b/faiss/impl/Panorama.h @@ -18,10 +18,190 @@ #include #include #include +#include #include +#if defined(__BMI2__) && defined(__AVX2__) +#include +#endif + namespace faiss { +#ifndef SWIG + +/// Compute dot products between query_level and active vectors. +/// +/// @tparam AllActive If true, vectors are at sequential positions 0..N-1 +/// (first level, full batch). If false, positions come +/// from active_indices (subsequent levels after pruning). +/// @tparam LevelWidth Compile-time level width in floats (0 = use runtime +/// level_width_dims). Enables full loop unrolling. +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template +static inline void compute_level_dot_kernel( + const float* FAISS_RESTRICT query_level, + const float* FAISS_RESTRICT level_storage, + const uint32_t* active_indices, + const size_t num_active, + const size_t level_width_dims, + float* FAISS_RESTRICT dot_products) { + const size_t width = LevelWidth > 0 ? LevelWidth : level_width_dims; + size_t i = 0; + for (; i + 4 <= num_active; i += 4) { + const float* y0 = level_storage + + (AllActive ? (i + 0) : active_indices[i + 0]) * width; + const float* y1 = level_storage + + (AllActive ? (i + 1) : active_indices[i + 1]) * width; + const float* y2 = level_storage + + (AllActive ? (i + 2) : active_indices[i + 2]) * width; + const float* y3 = level_storage + + (AllActive ? 
(i + 3) : active_indices[i + 3]) * width; + + float dp0 = 0, dp1 = 0, dp2 = 0, dp3 = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t j = 0; j < width; j++) { + float q = query_level[j]; + dp0 += q * y0[j]; + dp1 += q * y1[j]; + dp2 += q * y2[j]; + dp3 += q * y3[j]; + } + + dot_products[i + 0] = dp0; + dot_products[i + 1] = dp1; + dot_products[i + 2] = dp2; + dot_products[i + 3] = dp3; + } + for (; i < num_active; i++) { + const float* yj = + level_storage + (AllActive ? i : active_indices[i]) * width; + float dp = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t j = 0; j < width; j++) { + dp += query_level[j] * yj[j]; + } + dot_products[i] = dp; + } +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +/// Update exact distances with the current level's dot products, then apply +/// Panorama pruning: for each active vector, compute a lower bound on +/// the final distance and mark it for removal if it cannot beat the current +/// threshold. Writes 0/1 into active_byteset for subsequent compaction. +/// +/// Uses `if constexpr` on C::is_max rather than C::cmp() to ensure the +/// comparison autovectorizes (C::cmp generates scalar function calls). +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template +static inline void prune_kernel( + float* FAISS_RESTRICT exact_distances, + const float* FAISS_RESTRICT dot_buffer, + const float* FAISS_RESTRICT level_cum_sums, + uint8_t* FAISS_RESTRICT active_byteset, + const uint32_t* FAISS_RESTRICT active_indices, + const uint32_t num_active, + const float query_cum_norm, + const float threshold) { + FAISS_PRAGMA_IMPRECISE_LOOP + for (uint32_t i = 0; i < num_active; i++) { + uint32_t idx = AllActive ? 
i : active_indices[i]; + if constexpr (M == METRIC_INNER_PRODUCT) { + exact_distances[idx] += dot_buffer[i]; + } else { + exact_distances[idx] -= 2.0f * dot_buffer[i]; + } + + float cum_sum = level_cum_sums[idx]; + float cauchy_schwarz_bound; + if constexpr (M == METRIC_INNER_PRODUCT) { + cauchy_schwarz_bound = -cum_sum * query_cum_norm; + } else { + cauchy_schwarz_bound = 2.0f * cum_sum * query_cum_norm; + } + + float lower_bound = exact_distances[idx] - cauchy_schwarz_bound; + if constexpr (C::is_max) { + active_byteset[i] = (threshold > lower_bound) ? 1 : 0; + } else { + active_byteset[i] = (threshold < lower_bound) ? 1 : 0; + } + } +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +/// Compact active_indices in-place, removing entries where active_byteset[i] +/// is zero. Returns the new count of active elements. Uses a branchless BMI2 + +/// AVX2 fast path (8 elements/iteration via _pext_u64 permutation) with a +/// scalar fallback for the tail and non-x86 platforms. +static inline size_t compact_active_kernel( + uint32_t* active_indices, + const uint8_t* FAISS_RESTRICT active_byteset, + const size_t num_active) { + size_t next_active = 0; + size_t i = 0; + +#if defined(__BMI2__) && defined(__AVX2__) + for (; i + 8 <= num_active; i += 8) { + uint64_t bytes; + memcpy(&bytes, &active_byteset[i], 8); + + uint64_t expanded = bytes * 0xFFULL; + uint64_t packed = _pext_u64(0x0706050403020100ULL, expanded); + + __m256i perm = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128((int64_t)packed)); + __m256i data = _mm256_loadu_si256((const __m256i*)&active_indices[i]); + __m256i compacted = _mm256_permutevar8x32_epi32(data, perm); + _mm256_storeu_si256((__m256i*)&active_indices[next_active], compacted); + + next_active += __builtin_popcountll(bytes); + } +#endif + + for (; i < num_active; i++) { + active_indices[next_active] = active_indices[i]; + next_active += active_byteset[i] ? 
1 : 0; + } + + return next_active; +} + +/// Compile-time dispatch: converts a runtime `width` value into a template +/// parameter by generating an if-else chain over [Lo, Hi] in steps of Step. +/// Falls through to LevelWidth=0 (runtime path) if no specialization matches. +/// Allows for specialization of common level widths. +namespace detail { +template +inline auto dispatch_width(size_t width, Lambda&& fn) { + if constexpr (Lo > Hi) { + return fn.template operator()<0>(); + } else { + if (width == Lo) { + return fn.template operator()(); + } + return dispatch_width( + width, std::forward(fn)); + } +} +} // namespace detail + +/// Specialize for common float level widths (multiples of 8 up to 128). +template +inline auto with_level_width(size_t width, LambdaType&& action) { + return detail::dispatch_width<8, 128, 8>( + width, std::forward(action)); +} + +template +inline auto with_bool(bool value, Lambda&& fn) { + if (value) { + return fn.template operator()(); + } else { + return fn.template operator()(); + } +} +#endif // SWIG + /** * Implements the core logic of Panorama-based refinement. * arXiv: https://arxiv.org/abs/2510.00566 @@ -42,6 +222,8 @@ namespace faiss { * accelerating the refinement stage. */ struct Panorama { + static constexpr size_t kDefaultBatchSize = 128; + size_t d = 0; size_t code_size = 0; size_t n_levels = 0; @@ -98,6 +280,7 @@ struct Panorama { /// 4. After all levels, survivors are exact distances; update heap. /// This achieves early termination while maintaining SIMD-friendly /// sequential access patterns in the level-oriented storage layout. 
+#ifndef SWIG template size_t progressive_filter_batch( const uint8_t* codes_base, @@ -110,111 +293,99 @@ struct Panorama { const idx_t* ids, bool use_sel, std::vector& active_indices, + std::vector& active_byteset, std::vector& exact_distances, + std::vector& dot_buffer, float threshold, - PanoramaStats& local_stats) const; + PanoramaStats& local_stats) const { + size_t batch_start = batch_no * batch_size; + size_t curr_batch_size = std::min(list_size - batch_start, batch_size); - void reconstruct(idx_t key, float* recons, const uint8_t* codes_base) const; -}; + size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1); + const float* batch_cum_sums = cum_sums + cumsum_batch_offset; + const float* level_cum_sums = batch_cum_sums + batch_size; + float q_norm = query_cum_sums[0] * query_cum_sums[0]; -template -size_t Panorama::progressive_filter_batch( - const uint8_t* codes_base, - const float* cum_sums, - const float* query, - const float* query_cum_sums, - size_t batch_no, - size_t list_size, - const IDSelector* sel, - const idx_t* ids, - bool use_sel, - std::vector& active_indices, - std::vector& exact_distances, - float threshold, - PanoramaStats& local_stats) const { - size_t batch_start = batch_no * batch_size; - size_t curr_batch_size = std::min(list_size - batch_start, batch_size); - - size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1); - const float* batch_cum_sums = cum_sums + cumsum_batch_offset; - const float* level_cum_sums = batch_cum_sums + batch_size; - float q_norm = query_cum_sums[0] * query_cum_sums[0]; - - size_t batch_offset = batch_no * batch_size * code_size; - const uint8_t* storage_base = codes_base + batch_offset; - - // Initialize active set with ID-filtered vectors. - size_t num_active = 0; - for (size_t i = 0; i < curr_batch_size; i++) { - size_t global_idx = batch_start + i; - idx_t id = (ids == nullptr) ? 
global_idx : ids[global_idx]; - bool include = !use_sel || sel->is_member(id); - - active_indices[num_active] = i; - float cum_sum = batch_cum_sums[i]; + size_t batch_offset = batch_no * batch_size * code_size; + const uint8_t* storage_base = codes_base + batch_offset; - if constexpr (M == METRIC_INNER_PRODUCT) { - exact_distances[i] = 0.0f; - } else { - exact_distances[i] = cum_sum * cum_sum + q_norm; - } + // Initialize active set with ID-filtered vectors. + size_t num_active = 0; + for (size_t i = 0; i < curr_batch_size; i++) { + size_t global_idx = batch_start + i; + idx_t id = (ids == nullptr) ? global_idx : ids[global_idx]; + bool include = !use_sel || sel->is_member(id); - num_active += include; - } + active_indices[num_active] = i; + float cum_sum = batch_cum_sums[i]; - if (num_active == 0) { - return 0; - } - - size_t total_active = num_active; - for (size_t level = 0; level < n_levels; level++) { - local_stats.total_dims_scanned += num_active; - local_stats.total_dims += total_active; - - float query_cum_norm = query_cum_sums[level + 1]; - - size_t level_offset = level * level_width * batch_size; - const float* level_storage = - (const float*)(storage_base + level_offset); + if constexpr (M == METRIC_INNER_PRODUCT) { + exact_distances[i] = 0.0f; + } else { + exact_distances[i] = cum_sum * cum_sum + q_norm; + } - size_t next_active = 0; - for (size_t i = 0; i < num_active; i++) { - uint32_t idx = active_indices[i]; - size_t actual_level_width = std::min( - level_width_floats, d - level * level_width_floats); + num_active += include; + } - const float* yj = level_storage + idx * actual_level_width; - const float* query_level = query + level * level_width_floats; + size_t total_active = num_active; + const bool first_level_full = (num_active == curr_batch_size); - float dot_product = - fvec_inner_product(query_level, yj, actual_level_width); + local_stats.total_dims += total_active * n_levels; - if constexpr (M == METRIC_INNER_PRODUCT) { - 
exact_distances[idx] += dot_product; - } else { - exact_distances[idx] -= 2.0f * dot_product; - } + for (size_t level = 0; (level < n_levels) && (num_active > 0); + level++) { + local_stats.total_dims_scanned += num_active; - float cum_sum = level_cum_sums[idx]; - float cauchy_schwarz_bound; - if constexpr (M == METRIC_INNER_PRODUCT) { - cauchy_schwarz_bound = -cum_sum * query_cum_norm; - } else { - cauchy_schwarz_bound = 2.0f * cum_sum * query_cum_norm; - } + float query_cum_norm = query_cum_sums[level + 1]; - float lower_bound = exact_distances[idx] - cauchy_schwarz_bound; + size_t level_offset = level * level_width * batch_size; + const float* level_storage = + (const float*)(storage_base + level_offset); + const float* query_level = query + level * level_width_floats; + size_t actual_level_width = std::min( + level_width_floats, d - level * level_width_floats); - active_indices[next_active] = idx; - next_active += C::cmp(threshold, lower_bound) ? 1 : 0; + num_active = with_bool( + level == 0 && first_level_full, [&]() { + with_level_width( + actual_level_width, [&]() { + compute_level_dot_kernel< + AllActive, + LevelWidth>( + query_level, + level_storage, + active_indices.data(), + num_active, + actual_level_width, + dot_buffer.data()); + }); + + prune_kernel( + exact_distances.data(), + dot_buffer.data(), + level_cum_sums, + active_byteset.data(), + active_indices.data(), + (uint32_t)num_active, + query_cum_norm, + threshold); + + return compact_active_kernel( + active_indices.data(), + active_byteset.data(), + num_active); + }); + + level_cum_sums += batch_size; } - num_active = next_active; - level_cum_sums += batch_size; - } + return num_active; + }; +#endif // SWIG - return num_active; -} + void reconstruct(idx_t key, float* recons, const uint8_t* codes_base) const; +}; } // namespace faiss #endif diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index abe6746b94..40b31778da 100644 --- a/faiss/impl/index_read.cpp +++ 
b/faiss/impl/index_read.cpp @@ -495,16 +495,47 @@ std::unique_ptr read_InvertedLists_up( READ1(n_levels); FAISS_THROW_IF_NOT_FMT( n_levels > 0, "invalid ilpn n_levels %zd", n_levels); + constexpr size_t bs = Panorama::kDefaultBatchSize; auto ailp = std::make_unique( - nlist, code_size, n_levels); + nlist, code_size, n_levels, bs); std::vector sizes(nlist); read_ArrayInvertedLists_sizes(f, sizes); for (size_t i = 0; i < nlist; i++) { ailp->ids[i].resize(sizes[i]); - size_t num_elems = - ((sizes[i] + ArrayInvertedListsPanorama::kBatchSize - 1) / - ArrayInvertedListsPanorama::kBatchSize) * - ArrayInvertedListsPanorama::kBatchSize; + size_t num_elems = ((sizes[i] + bs - 1) / bs) * bs; + ailp->codes[i].resize(num_elems * code_size); + ailp->cum_sums[i].resize(num_elems * (n_levels + 1)); + } + for (size_t i = 0; i < nlist; i++) { + size_t n = sizes[i]; + if (n > 0) { + read_vector_with_known_size( + ailp->codes[i], f, ailp->codes[i].size()); + read_vector_with_known_size(ailp->ids[i], f, n); + read_vector_with_known_size( + ailp->cum_sums[i], f, ailp->cum_sums[i].size()); + } + } + return ailp; + } else if (h == fourcc("ilp2") && !(io_flags & IO_FLAG_SKIP_IVF_DATA)) { + size_t nlist, code_size, n_levels, bs; + READ1(nlist); + FAISS_CHECK_DESERIALIZATION_LOOP_LIMIT(nlist, "ilp2 nlist"); + READ1(code_size); + READ1(n_levels); + READ1(bs); + FAISS_THROW_IF_NOT_FMT( + n_levels > 0, "invalid ilp2 n_levels %zd", n_levels); + // Reject bs == 0: the padding math below divides by bs. + FAISS_THROW_IF_NOT_FMT( + bs > 0, "invalid ilp2 batch_size %zd", bs); + auto ailp = std::make_unique( + nlist, code_size, n_levels, bs); + std::vector sizes(nlist); + read_ArrayInvertedLists_sizes(f, sizes); + for (size_t i = 0; i < nlist; i++) { + ailp->ids[i].resize(sizes[i]); + size_t num_elems = ((sizes[i] + bs - 1) / bs) * bs; ailp->codes[i].resize(num_elems * code_size); ailp->cum_sums[i].resize(num_elems * (n_levels + 1)); } @@ -1633,6 +1664,15 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { read_ivf_header(ivfp.get(), f); ivfp->code_size = ivfp->d * sizeof(float); READ1(ivfp->n_levels); +
ivfp->batch_size = Panorama::kDefaultBatchSize; + read_InvertedLists(*ivfp, f, io_flags); + idx = std::move(ivfp); + } else if (h == fourcc("IwP2")) { + auto ivfp = std::make_unique(); + read_ivf_header(ivfp.get(), f); + ivfp->code_size = ivfp->d * sizeof(float); + READ1(ivfp->n_levels); + READ1(ivfp->batch_size); read_InvertedLists(*ivfp, f, io_flags); idx = std::move(ivfp); } else if (h == fourcc("IwFl")) { diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index 03cc9bdd69..f9b5c8d346 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -269,11 +269,20 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) { } else if ( const auto& ailp = dynamic_cast(ils)) { - uint32_t h = fourcc("ilpn"); - WRITE1(h); - WRITE1(ailp->nlist); - WRITE1(ailp->code_size); - WRITE1(ailp->n_levels); + if (ailp->pano.batch_size == Panorama::kDefaultBatchSize) { + uint32_t h = fourcc("ilpn"); + WRITE1(h); + WRITE1(ailp->nlist); + WRITE1(ailp->code_size); + WRITE1(ailp->n_levels); + } else { + uint32_t h = fourcc("ilp2"); + WRITE1(h); + WRITE1(ailp->nlist); + WRITE1(ailp->code_size); + WRITE1(ailp->n_levels); + WRITE1(ailp->pano.batch_size); + } uint32_t list_type = fourcc("full"); WRITE1(list_type); std::vector sizes; @@ -707,10 +716,18 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) { } else if ( const IndexIVFFlatPanorama* ivfp = dynamic_cast(idx)) { - uint32_t h = fourcc("IwPn"); - WRITE1(h); - write_ivf_header(ivfp, f); - WRITE1(ivfp->n_levels); + if (ivfp->batch_size == Panorama::kDefaultBatchSize) { + uint32_t h = fourcc("IwPn"); + WRITE1(h); + write_ivf_header(ivfp, f); + WRITE1(ivfp->n_levels); + } else { + uint32_t h = fourcc("IwP2"); + WRITE1(h); + write_ivf_header(ivfp, f); + WRITE1(ivfp->n_levels); + WRITE1(ivfp->batch_size); + } write_InvertedLists(ivfp->invlists, f); } else if ( const IndexIVFFlat* ivfl_2 = diff --git a/faiss/impl/platform_macros.h b/faiss/impl/platform_macros.h index 
b7aa4111e3..d72d2d37da 100644 --- a/faiss/impl/platform_macros.h +++ b/faiss/impl/platform_macros.h @@ -110,6 +110,7 @@ inline int __builtin_clzll(uint64_t x) { // MSVC uses pragma pack instead of __attribute__((packed)) // Use FAISS_PACK_STRUCTS_BEGIN/END to wrap packed structure definitions #define FAISS_PACKED +#define FAISS_RESTRICT __restrict #define FAISS_PACK_STRUCTS_BEGIN __pragma(pack(push, 1)) #define FAISS_PACK_STRUCTS_END __pragma(pack(pop)) @@ -126,9 +127,11 @@ inline int __builtin_clzll(uint64_t x) { #ifdef SWIG #define ALIGNED(x) #define FAISS_PACKED +#define FAISS_RESTRICT #else #define ALIGNED(x) __attribute__((aligned(x))) #define FAISS_PACKED __attribute__((packed)) +#define FAISS_RESTRICT __restrict #endif // On non-Windows, FAISS_PACKED handles packing, so these are no-ops diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp index 1e66227aea..ea3af51ad7 100644 --- a/faiss/index_factory.cpp +++ b/faiss/index_factory.cpp @@ -341,9 +341,14 @@ IndexIVF* parse_IndexIVF( if (match("FlatDedup")) { return new IndexIVFFlatDedup(get_q(), d, nlist, mt, own_il); } - if (match("FlatPanorama([0-9]+)?")) { + if (match("FlatPanorama([0-9]+)?(_([0-9]+))?")) { int nlevels = mres_to_int(sm[1], 8); // default to 8 levels - return new IndexIVFFlatPanorama(get_q(), d, nlist, nlevels, mt, own_il); + int bs = mres_to_int(sm[3], (int)Panorama::kDefaultBatchSize); + // A zero batch size would divide by zero when lists are padded. + FAISS_THROW_IF_NOT_MSG( + bs > 0, "FlatPanorama batch size must be positive"); + return new IndexIVFFlatPanorama( + get_q(), d, nlist, nlevels, mt, own_il, bs); } if (match(sq_pattern)) { return new IndexIVFScalarQuantizer( diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp index 6f8e88ef35..c92b9d94e5 100644 --- a/faiss/invlists/InvertedLists.cpp +++ b/faiss/invlists/InvertedLists.cpp @@ -354,14 +354,15 @@ ArrayInvertedLists::~ArrayInvertedLists() {} ArrayInvertedListsPanorama::ArrayInvertedListsPanorama( size_t nlist_in, size_t code_size_in, - size_t n_levels_in) + size_t n_levels_in, + size_t batch_size) : ArrayInvertedLists(nlist_in, code_size_in), n_levels(n_levels_in),
level_width( (((code_size_in / sizeof(float)) + n_levels_in - 1) / n_levels_in) * sizeof(float)), - pano(code_size_in, n_levels_in, kBatchSize) { + pano(code_size_in, n_levels_in, batch_size) { FAISS_THROW_IF_NOT(n_levels_in > 0); FAISS_THROW_IF_NOT(code_size_in % sizeof(float) == 0); FAISS_THROW_IF_NOT_MSG( @@ -389,9 +390,9 @@ size_t ArrayInvertedListsPanorama::add_entries( memcpy(&ids[list_no][o], ids_in, sizeof(ids_in[0]) * n_entry); size_t new_size = o + n_entry; - size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize; - codes[list_no].resize(num_batches * kBatchSize * code_size); - cum_sums[list_no].resize(num_batches * kBatchSize * (n_levels + 1)); + size_t num_batches = (new_size + pano.batch_size - 1) / pano.batch_size; + codes[list_no].resize(num_batches * pano.batch_size * code_size); + cum_sums[list_no].resize(num_batches * pano.batch_size * (n_levels + 1)); // Cast to float* is safe here as we guarantee codes are always float // vectors for `IndexIVFFlatPanorama` (verified by the constructor). 
@@ -425,9 +426,9 @@ void ArrayInvertedListsPanorama::update_entries( void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) { ids[list_no].resize(new_size); - size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize; - codes[list_no].resize(num_batches * kBatchSize * code_size); - cum_sums[list_no].resize(num_batches * kBatchSize * (n_levels + 1)); + size_t num_batches = (new_size + pano.batch_size - 1) / pano.batch_size; + codes[list_no].resize(num_batches * pano.batch_size * code_size); + cum_sums[list_no].resize(num_batches * pano.batch_size * (n_levels + 1)); } const uint8_t* ArrayInvertedListsPanorama::get_single_code( diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h index 743bcad62d..6b0e37c5c6 100644 --- a/faiss/invlists/InvertedLists.h +++ b/faiss/invlists/InvertedLists.h @@ -280,7 +280,6 @@ struct ArrayInvertedLists : InvertedLists { /// Level-oriented storage as defined in the IVFFlat section of Panorama /// (https://www.arxiv.org/pdf/2510.00566). struct ArrayInvertedListsPanorama : ArrayInvertedLists { - static constexpr size_t kBatchSize = 128; std::vector> cum_sums; const size_t n_levels; const size_t level_width; // in code units @@ -289,7 +288,8 @@ struct ArrayInvertedListsPanorama : ArrayInvertedLists { ArrayInvertedListsPanorama( size_t nlist_in, size_t code_size_in, - size_t n_levels_in); + size_t n_levels_in, + size_t batch_size = Panorama::kDefaultBatchSize); const float* get_cum_sums(size_t list_no) const;