diff --git a/benchs/bench_flat_l2_panorama.py b/benchs/bench_flat_l2_panorama.py
index 660109ba6c..0dc101172f 100644
--- a/benchs/bench_flat_l2_panorama.py
+++ b/benchs/bench_flat_l2_panorama.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+import argparse
 import multiprocessing as mp
 import time
 
@@ -11,11 +12,18 @@
 import numpy as np
 
 try:
-    from faiss.contrib.datasets_fb import DatasetGIST1M
+    from faiss.contrib.datasets_fb import DatasetSIFT1M, DatasetGIST1M
 except ImportError:
-    from faiss.contrib.datasets import DatasetGIST1M
+    from faiss.contrib.datasets import DatasetSIFT1M, DatasetGIST1M
 
-ds = DatasetGIST1M()
+# The benchmark dataset is chosen on the command line; GIST1M is the default.
+parser = argparse.ArgumentParser()
+parser.add_argument("--dataset", default="gist1m", choices=["sift1m", "gist1m"])
+args = parser.parse_args()
+
+# `choices` restricts the value, so anything that is not sift1m is gist1m.
+dataset_cls = DatasetSIFT1M if args.dataset == "sift1m" else DatasetGIST1M
+ds = dataset_cls()
 
 nq = 10
 xq = ds.get_queries()[:nq]
@@ -60,7 +68,7 @@ def build_index(name):
     return index
 
 
-nlevels = 8
+nlevels = 16 if args.dataset == "gist1m" else 8
 batch_size = 512
 
 plt.figure(figsize=(8, 6), dpi=80)
@@ -93,7 +101,8 @@ def build_index(name):
 )
 plt.xticks(x, labels, rotation=0)
 plt.ylabel("QPS")
-plt.title("Flat Indexes on GIST1M")
+dataset_label = args.dataset.upper()
+plt.title(f"Flat Indexes on {dataset_label}")
 
 plt.tight_layout()
-plt.savefig("bench_flat_l2_panorama.png", bbox_inches="tight")
+plt.savefig(f"bench_flat_l2_panorama_{args.dataset}.png", bbox_inches="tight")
diff --git a/benchs/bench_ivf_flat_panorama.py b/benchs/bench_ivf_flat_panorama.py
index 85cf840591..4c5fe96870 100644
--- a/benchs/bench_ivf_flat_panorama.py
+++ b/benchs/bench_ivf_flat_panorama.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+import argparse
 import multiprocessing as mp
 import time
 
@@ -11,11 +12,18 @@
 import numpy as np
 
 try:
-    from faiss.contrib.datasets_fb import DatasetGIST1M
+    from faiss.contrib.datasets_fb import DatasetSIFT1M, DatasetGIST1M
 except ImportError:
-    from faiss.contrib.datasets import DatasetGIST1M
+    from faiss.contrib.datasets import DatasetSIFT1M, DatasetGIST1M
 
-ds = DatasetGIST1M()
+# The benchmark dataset is chosen on the command line; GIST1M is the default.
+parser = argparse.ArgumentParser()
+parser.add_argument("--dataset", default="gist1m", choices=["sift1m", "gist1m"])
+args = parser.parse_args()
+
+# `choices` restricts the value, so anything that is not sift1m is gist1m.
+dataset_cls = DatasetSIFT1M if args.dataset == "sift1m" else DatasetGIST1M
+ds = dataset_cls()
 
 xq = ds.get_queries()
 xb = ds.get_database()
@@ -29,7 +37,7 @@
 
 k = 10
 gt = gt[:, :k]
-nlevels = 8
+nlevels = 16 if args.dataset == "gist1m" else 8
 
 
 def get_ivf_index(index):
@@ -90,12 +98,13 @@ def eval_and_plot(name, plot=True):
 eval_and_plot(f"IVF{nlist},Flat")
 
 # IVFFlatPanorama (with PCA transform to concentrate energy in early dimensions)
-eval_and_plot(f"PCA{d},IVF{nlist},FlatPanorama{nlevels}")
+# NOTE(review): "_1024" is presumably the Panorama batch size; confirm.
+eval_and_plot(f"PCA{d},IVF{nlist},FlatPanorama{nlevels}_1024")
 
-plt.title("IVF Flat Indexes on GIST1M")
-plt.title("Indices on GIST1M")
+dataset_label = args.dataset.upper()
+plt.title(f"IVF Flat Indexes on {dataset_label}")
 plt.xlabel(f"Recall@{k}")
 plt.ylabel("QPS")
 plt.yscale("log")
 plt.legend(bbox_to_anchor=(1.02, 0.1), loc="upper left", borderaxespad=0)
-plt.savefig("bench_ivf_flat_panorama.png", bbox_inches="tight")
+plt.savefig(f"bench_ivf_flat_panorama_{args.dataset}.png", bbox_inches="tight")
diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
index e6eda3df75..ae5863531c 100644
--- a/faiss/CMakeLists.txt
+++ b/faiss/CMakeLists.txt
@@ -362,6 +362,15 @@ endif()
 # Export FAISS_HEADERS variable to parent scope.
 set(FAISS_HEADERS ${FAISS_HEADERS} PARENT_SCOPE)
 
+# Check whether the compiler accepts -mbmi2; FAISS_BMI2_FLAGS stays empty if not.
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag("-mbmi2" COMPILER_SUPPORTS_BMI2)
+if(COMPILER_SUPPORTS_BMI2)
+  set(FAISS_BMI2_FLAGS "-mbmi2")
+else()
+  set(FAISS_BMI2_FLAGS "")
+endif()
+
 add_library(faiss ${FAISS_SRC})
 
 add_library(faiss_avx2 ${FAISS_SRC})
@@ -369,7 +378,7 @@ if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512"
   set_target_properties(faiss_avx2 PROPERTIES EXCLUDE_FROM_ALL TRUE)
 endif()
 if(NOT WIN32)
-  target_compile_options(faiss_avx2 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mpopcnt>)
+  target_compile_options(faiss_avx2 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mpopcnt ${FAISS_BMI2_FLAGS}>)
 else()
   # MSVC enables FMA with /arch:AVX2; no separate flags for F16C, POPCNT
   # Ref. FMA (under /arch:AVX2): https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64
@@ -389,7 +398,7 @@ endif()
 if(NOT WIN32)
   # All modern CPUs support F, CD, VL, DQ, BW extensions.
   # Ref: https://en.wikipedia.org/wiki/AVX512
-  target_compile_options(faiss_avx512 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt>)
+  target_compile_options(faiss_avx512 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt ${FAISS_BMI2_FLAGS}>)
 else()
   target_compile_options(faiss_avx512 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
   # we need bigobj for the swig wrapper
@@ -405,7 +414,7 @@ endif()
 if(NOT WIN32)
   # Architecture mode to support AVX512 extensions available since Intel(R) Sapphire Rapids.
   # Ref: https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide
-  target_compile_options(faiss_avx512_spr PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vpopcntdq -mpopcnt -mavx512fp16 -mavx512bf16>)
+  target_compile_options(faiss_avx512_spr PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vpopcntdq -mpopcnt -mavx512fp16 -mavx512bf16 ${FAISS_BMI2_FLAGS}>)
 else()
   target_compile_options(faiss_avx512_spr PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
   # we need bigobj for the swig wrapper
diff --git a/faiss/IndexFlat.cpp b/faiss/IndexFlat.cpp
index 5aa78b9a24..a4b3bcf938 100644
--- a/faiss/IndexFlat.cpp
+++ b/faiss/IndexFlat.cpp
@@ -628,8 +628,10 @@ inline void flat_pano_search_core(
         SingleResultHandler res(handler);
 
         std::vector<float> query_cum_norms(index.n_levels + 1);
-        std::vector<float> exact_distances(index.batch_size);
         std::vector<uint32_t> active_indices(index.batch_size);
+        std::vector<uint8_t> active_byteset(index.batch_size);
+        std::vector<float> exact_distances(index.batch_size);
+        std::vector<float> dot_buffer(index.batch_size);
 
 #pragma omp for
         for (int64_t i = 0; i < n; i++) {
@@ -664,7 +666,9 @@ inline void flat_pano_search_core(
                                     nullptr,
                                     use_sel,
                                     active_indices,
+                                    active_byteset,
                                     exact_distances,
+                                    dot_buffer,
                                     threshold,
                                     local_stats);
                         });
diff --git a/faiss/IndexIVFFlatPanorama.cpp b/faiss/IndexIVFFlatPanorama.cpp
index ba32fce132..ec57162f44 100644
--- a/faiss/IndexIVFFlatPanorama.cpp
+++ b/faiss/IndexIVFFlatPanorama.cpp
@@ -32,19 +32,23 @@ IndexIVFFlatPanorama::IndexIVFFlatPanorama(
         size_t nlist_in,
         int n_levels_in,
         MetricType metric,
-        bool own_invlists_in)
+        bool own_invlists_in,
+        size_t batch_size_in)
         : IndexIVFFlat(quantizer_in, d_in, nlist_in, metric, false),
-          n_levels(n_levels_in) {
+          n_levels(n_levels_in),
+          batch_size(batch_size_in) {
     FAISS_THROW_IF_NOT(metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT);
 
     // We construct the inverted lists here so that we can use the
     // level-oriented storage. This does not cause a leak as we constructed
     // IndexIVF first, with own_invlists set to false.
-    this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, n_levels);
+    this->invlists = new ArrayInvertedListsPanorama(
+            nlist, code_size, n_levels, batch_size);
     this->own_invlists = own_invlists_in;
 }
 
-IndexIVFFlatPanorama::IndexIVFFlatPanorama() : n_levels(0) {}
+IndexIVFFlatPanorama::IndexIVFFlatPanorama()
+        : n_levels(0), batch_size(Panorama::kDefaultBatchSize) {}
 
 namespace {
 
@@ -55,6 +59,11 @@ struct IVFFlatScannerPanorama : InvertedListScanner {
     using C = typename VectorDistance::C;
     static constexpr MetricType metric = VectorDistance::metric;
 
+    mutable std::vector<uint32_t> active_indices_;
+    mutable std::vector<uint8_t> active_byteset_;
+    mutable std::vector<float> exact_distances_;
+    mutable std::vector<float> dot_buffer_;
+
     IVFFlatScannerPanorama(
             const VectorDistance& vd_in,
             const ArrayInvertedListsPanorama* storage_in,
@@ -65,7 +74,11 @@ struct IVFFlatScannerPanorama : InvertedListScanner {
               storage(storage_in) {
         keep_max = vd.is_similarity;
         code_size = vd.d * sizeof(float);
-        cum_sums.resize(storage->n_levels + 1);
+        cum_sums.resize(storage->pano.n_levels + 1);
+        active_indices_.resize(storage->pano.batch_size);
+        active_byteset_.resize(storage->pano.batch_size);
+        exact_distances_.resize(storage->pano.batch_size);
+        dot_buffer_.resize(storage->pano.batch_size);
     }
 
     const float* xi = nullptr;
@@ -90,6 +103,7 @@ struct IVFFlatScannerPanorama : InvertedListScanner {
     }
 
     using InvertedListScanner::scan_codes;
+
     size_t scan_codes(
             size_t list_size,
             const uint8_t* codes,
@@ -97,20 +111,16 @@ struct IVFFlatScannerPanorama : InvertedListScanner {
             ResultHandler& handler) const override {
         size_t nup = 0;
 
-        const size_t n_batches =
-                (list_size + storage->kBatchSize - 1) / storage->kBatchSize;
+        const size_t bs = storage->pano.batch_size;
+        const size_t n_batches = (list_size + bs - 1) / bs;
 
         const float* cum_sums_data = storage->get_cum_sums(list_no);
 
-        std::vector<float> exact_distances(storage->kBatchSize);
-        std::vector<uint32_t> active_indices(storage->kBatchSize);
-
         PanoramaStats local_stats;
         local_stats.reset();
 
         for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
-            size_t batch_start = batch_no * storage->kBatchSize;
-
+            size_t batch_start = batch_no * bs;
             size_t num_active = with_metric_type(metric, [&]<MetricType M>() {
                 return storage->pano.progressive_filter_batch<C, M>(
                         codes,
@@ -122,17 +132,18 @@ struct IVFFlatScannerPanorama : InvertedListScanner {
                         sel,
                         ids,
                         use_sel,
-                        active_indices,
-                        exact_distances,
+                        active_indices_,
+                        active_byteset_,
+                        exact_distances_,
+                        dot_buffer_,
                         handler.threshold,
                         local_stats);
             });
 
-            // Add batch survivors to heap.
             for (size_t i = 0; i < num_active; i++) {
-                uint32_t idx = active_indices[i];
+                uint32_t idx = active_indices_[i];
                 size_t global_idx = batch_start + idx;
-                float dis = exact_distances[idx];
+                float dis = exact_distances_[idx];
 
                 if (C::cmp(handler.threshold, dis)) {
                     int64_t id = store_pairs ? lo_build(list_no, global_idx)
diff --git a/faiss/IndexIVFFlatPanorama.h b/faiss/IndexIVFFlatPanorama.h
index 9ec897895d..6144e43ce3 100644
--- a/faiss/IndexIVFFlatPanorama.h
+++ b/faiss/IndexIVFFlatPanorama.h
@@ -37,6 +37,7 @@ namespace faiss {
 /// `ArrayInvertedListsPanorama`, which is a struct member of `IndexIVF`.
 struct IndexIVFFlatPanorama : IndexIVFFlat {
     size_t n_levels;
+    size_t batch_size;
 
     std::vector<MaybeOwnedVector<float>> cum_sums;
 
@@ -46,7 +47,8 @@ struct IndexIVFFlatPanorama : IndexIVFFlat {
             size_t nlist_,
             int n_levels,
             MetricType = METRIC_L2,
-            bool own_invlists = true);
+            bool own_invlists = true,
+            size_t batch_size = Panorama::kDefaultBatchSize);
 
     InvertedListScanner* get_InvertedListScanner(
             bool store_pairs,
diff --git a/faiss/impl/Panorama.h b/faiss/impl/Panorama.h
index 79a23a64a7..1ec917ec8f 100644
--- a/faiss/impl/Panorama.h
+++ b/faiss/impl/Panorama.h
@@ -18,10 +18,190 @@
 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
+#include <cstring>
 #include <vector>
 
+#if defined(__BMI2__) && defined(__AVX2__)
+#include <immintrin.h>
+#endif
+
 namespace faiss {
 
+#ifndef SWIG
+
+/// Compute dot products between query_level and the active vectors of one
+/// level; dot_products[i] is for the i-th active entry, not the vector slot.
+/// @tparam AllActive  If true, vectors are at sequential positions 0..N-1
+///                    (first level, full batch). If false, positions come
+///                    from active_indices (subsequent levels after pruning).
+/// @tparam LevelWidth Compile-time level width in floats (0 = use runtime
+///                    level_width_dims). Enables full loop unrolling.
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+template <bool AllActive = false, size_t LevelWidth = 0>
+static inline void compute_level_dot_kernel(
+        const float* FAISS_RESTRICT query_level,
+        const float* FAISS_RESTRICT level_storage,
+        const uint32_t* active_indices,
+        const size_t num_active,
+        const size_t level_width_dims,
+        float* FAISS_RESTRICT dot_products) {
+    const size_t width = LevelWidth > 0 ? LevelWidth : level_width_dims;
+    size_t i = 0;
+    for (; i + 4 <= num_active; i += 4) { // four vectors per pass
+        const float* y0 = level_storage +
+                (AllActive ? (i + 0) : active_indices[i + 0]) * width;
+        const float* y1 = level_storage +
+                (AllActive ? (i + 1) : active_indices[i + 1]) * width;
+        const float* y2 = level_storage +
+                (AllActive ? (i + 2) : active_indices[i + 2]) * width;
+        const float* y3 = level_storage +
+                (AllActive ? (i + 3) : active_indices[i + 3]) * width;
+
+        float dp0 = 0, dp1 = 0, dp2 = 0, dp3 = 0;
+        FAISS_PRAGMA_IMPRECISE_LOOP
+        for (size_t j = 0; j < width; j++) {
+            float q = query_level[j]; // read once, used by all four
+            dp0 += q * y0[j];
+            dp1 += q * y1[j];
+            dp2 += q * y2[j];
+            dp3 += q * y3[j];
+        }
+
+        dot_products[i + 0] = dp0;
+        dot_products[i + 1] = dp1;
+        dot_products[i + 2] = dp2;
+        dot_products[i + 3] = dp3;
+    }
+    for (; i < num_active; i++) { // tail: fewer than four entries left
+        const float* yj =
+                level_storage + (AllActive ? i : active_indices[i]) * width;
+        float dp = 0;
+        FAISS_PRAGMA_IMPRECISE_LOOP
+        for (size_t j = 0; j < width; j++) {
+            dp += query_level[j] * yj[j];
+        }
+        dot_products[i] = dp;
+    }
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
+/// Update exact distances with the current level's dot products, then apply
+/// Panorama pruning: for each active vector, compute a lower bound on
+/// the final distance and mark it for removal if it cannot beat the current
+/// threshold. Writes active_byteset[i] = 1 (keep) or 0 (drop) per active
+/// position i, for subsequent compaction.
+/// Uses `if constexpr` on C::is_max rather than C::cmp() to ensure the
+/// comparison autovectorizes (C::cmp generates scalar function calls).
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+template <bool AllActive, typename C, MetricType M>
+static inline void prune_kernel(
+        float* FAISS_RESTRICT exact_distances,
+        const float* FAISS_RESTRICT dot_buffer,
+        const float* FAISS_RESTRICT level_cum_sums,
+        uint8_t* FAISS_RESTRICT active_byteset,
+        const uint32_t* FAISS_RESTRICT active_indices,
+        const uint32_t num_active,
+        const float query_cum_norm,
+        const float threshold) {
+    FAISS_PRAGMA_IMPRECISE_LOOP
+    for (uint32_t i = 0; i < num_active; i++) {
+        // dot_buffer / active_byteset use position i; the rest use idx.
+        uint32_t idx = AllActive ? i : active_indices[i];
+        if constexpr (M == METRIC_INNER_PRODUCT) {
+            exact_distances[idx] += dot_buffer[i];
+        } else {
+            exact_distances[idx] -= 2.0f * dot_buffer[i];
+        }
+
+        float cum_sum = level_cum_sums[idx];
+        float cauchy_schwarz_bound;
+        if constexpr (M == METRIC_INNER_PRODUCT) {
+            cauchy_schwarz_bound = -cum_sum * query_cum_norm;
+        } else {
+            cauchy_schwarz_bound = 2.0f * cum_sum * query_cum_norm;
+        }
+
+        float lower_bound = exact_distances[idx] - cauchy_schwarz_bound;
+        if constexpr (C::is_max) {
+            active_byteset[i] = (threshold > lower_bound) ? 1 : 0;
+        } else {
+            active_byteset[i] = (threshold < lower_bound) ? 1 : 0;
+        }
+    }
+}
+FAISS_PRAGMA_IMPRECISE_FUNCTION_END
+
+/// Compact active_indices in-place, keeping entries whose active_byteset[i]
+/// is nonzero; returns the new active count. The BMI2 + AVX2 fast path
+/// (8 entries per iteration via a _pext_u64-built permutation) requires the
+/// flags to be exactly 0 or 1; a scalar loop handles the tail and other CPUs.
+static inline size_t compact_active_kernel(
+        uint32_t* active_indices,
+        const uint8_t* FAISS_RESTRICT active_byteset,
+        const size_t num_active) {
+    size_t next_active = 0;
+    size_t i = 0;
+
+#if defined(__BMI2__) && defined(__AVX2__)
+    for (; i + 8 <= num_active; i += 8) {
+        uint64_t bytes;
+        std::memcpy(&bytes, &active_byteset[i], 8);
+
+        uint64_t expanded = bytes * 0xFFULL; // 0x01 flags -> 0xFF masks
+        uint64_t packed = _pext_u64(0x0706050403020100ULL, expanded);
+
+        __m256i perm = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128((int64_t)packed));
+        __m256i data = _mm256_loadu_si256((const __m256i*)&active_indices[i]);
+        __m256i compacted = _mm256_permutevar8x32_epi32(data, perm);
+        _mm256_storeu_si256((__m256i*)&active_indices[next_active], compacted);
+
+        next_active += __builtin_popcountll(bytes); // one bit per kept entry
+    }
+#endif
+
+    for (; i < num_active; i++) {
+        active_indices[next_active] = active_indices[i]; // branchless store
+        next_active += active_byteset[i] ? 1 : 0;
+    }
+
+    return next_active;
+}
+
+/// Turns the runtime `width` into a compile-time template argument by
+/// walking the candidates Lo, Lo + Step, ... up to Hi. A match invokes
+/// fn.operator()<candidate>(); once the range is exhausted the lambda is
+/// invoked with 0, i.e. no specialization (runtime width path).
+namespace detail {
+template <size_t Lo, size_t Hi, size_t Step, typename Lambda>
+inline auto dispatch_width(size_t width, Lambda&& fn) {
+    if constexpr (Lo <= Hi) {
+        if (width == Lo) {
+            return fn.template operator()<Lo>();
+        }
+        return dispatch_width<Lo + Step, Hi, Step>(
+                width, std::forward<Lambda>(fn));
+    } else {
+        return fn.template operator()<0>();
+    }
+}
+} // namespace detail
+
+/// Calls `action` with LevelWidth = width for 8, 16, ..., 128, else with 0.
+template <typename LambdaType>
+inline auto with_level_width(size_t width, LambdaType&& action) {
+    return detail::dispatch_width<8, 128, 8>(
+            width, std::forward<LambdaType>(action));
+}
+
+/// Lifts a runtime bool into a template argument of `fn`.
+template <typename Lambda>
+inline auto with_bool(bool value, Lambda&& fn) {
+    if (!value) {
+        return fn.template operator()<false>();
+    }
+    return fn.template operator()<true>();
+}
+#endif // SWIG
+
 /**
  * Implements the core logic of Panorama-based refinement.
  * arXiv: https://arxiv.org/abs/2510.00566
@@ -42,6 +222,8 @@ namespace faiss {
  * accelerating the refinement stage.
  */
 struct Panorama {
+    static constexpr size_t kDefaultBatchSize = 128;
+
     size_t d = 0;
     size_t code_size = 0;
     size_t n_levels = 0;
@@ -98,6 +280,7 @@ struct Panorama {
     /// 4. After all levels, survivors are exact distances; update heap.
     /// This achieves early termination while maintaining SIMD-friendly
     /// sequential access patterns in the level-oriented storage layout.
+#ifndef SWIG
     template <typename C, MetricType M>
     size_t progressive_filter_batch(
             const uint8_t* codes_base,
@@ -110,111 +293,99 @@ struct Panorama {
             const idx_t* ids,
             bool use_sel,
             std::vector<uint32_t>& active_indices,
+            std::vector<uint8_t>& active_byteset,
             std::vector<float>& exact_distances,
+            std::vector<float>& dot_buffer,
             float threshold,
-            PanoramaStats& local_stats) const;
+            PanoramaStats& local_stats) const {
+        size_t batch_start = batch_no * batch_size;
+        size_t curr_batch_size = std::min(list_size - batch_start, batch_size);
 
-    void reconstruct(idx_t key, float* recons, const uint8_t* codes_base) const;
-};
+        size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1);
+        const float* batch_cum_sums = cum_sums + cumsum_batch_offset;
+        const float* level_cum_sums = batch_cum_sums + batch_size;
+        float q_norm = query_cum_sums[0] * query_cum_sums[0];
 
-template <typename C, MetricType M>
-size_t Panorama::progressive_filter_batch(
-        const uint8_t* codes_base,
-        const float* cum_sums,
-        const float* query,
-        const float* query_cum_sums,
-        size_t batch_no,
-        size_t list_size,
-        const IDSelector* sel,
-        const idx_t* ids,
-        bool use_sel,
-        std::vector<uint32_t>& active_indices,
-        std::vector<float>& exact_distances,
-        float threshold,
-        PanoramaStats& local_stats) const {
-    size_t batch_start = batch_no * batch_size;
-    size_t curr_batch_size = std::min(list_size - batch_start, batch_size);
-
-    size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1);
-    const float* batch_cum_sums = cum_sums + cumsum_batch_offset;
-    const float* level_cum_sums = batch_cum_sums + batch_size;
-    float q_norm = query_cum_sums[0] * query_cum_sums[0];
-
-    size_t batch_offset = batch_no * batch_size * code_size;
-    const uint8_t* storage_base = codes_base + batch_offset;
-
-    // Initialize active set with ID-filtered vectors.
-    size_t num_active = 0;
-    for (size_t i = 0; i < curr_batch_size; i++) {
-        size_t global_idx = batch_start + i;
-        idx_t id = (ids == nullptr) ? global_idx : ids[global_idx];
-        bool include = !use_sel || sel->is_member(id);
-
-        active_indices[num_active] = i;
-        float cum_sum = batch_cum_sums[i];
+        size_t batch_offset = batch_no * batch_size * code_size;
+        const uint8_t* storage_base = codes_base + batch_offset;
 
-        if constexpr (M == METRIC_INNER_PRODUCT) {
-            exact_distances[i] = 0.0f;
-        } else {
-            exact_distances[i] = cum_sum * cum_sum + q_norm;
-        }
+        // Initialize active set with ID-filtered vectors.
+        size_t num_active = 0;
+        for (size_t i = 0; i < curr_batch_size; i++) {
+            size_t global_idx = batch_start + i;
+            idx_t id = (ids == nullptr) ? global_idx : ids[global_idx];
+            bool include = !use_sel || sel->is_member(id);
 
-        num_active += include;
-    }
+            active_indices[num_active] = i;
+            float cum_sum = batch_cum_sums[i];
 
-    if (num_active == 0) {
-        return 0;
-    }
-
-    size_t total_active = num_active;
-    for (size_t level = 0; level < n_levels; level++) {
-        local_stats.total_dims_scanned += num_active;
-        local_stats.total_dims += total_active;
-
-        float query_cum_norm = query_cum_sums[level + 1];
-
-        size_t level_offset = level * level_width * batch_size;
-        const float* level_storage =
-                (const float*)(storage_base + level_offset);
+            if constexpr (M == METRIC_INNER_PRODUCT) {
+                exact_distances[i] = 0.0f;
+            } else {
+                exact_distances[i] = cum_sum * cum_sum + q_norm;
+            }
 
-        size_t next_active = 0;
-        for (size_t i = 0; i < num_active; i++) {
-            uint32_t idx = active_indices[i];
-            size_t actual_level_width = std::min(
-                    level_width_floats, d - level * level_width_floats);
+            num_active += include; // an excluded slot is overwritten next
+        }
 
-            const float* yj = level_storage + idx * actual_level_width;
-            const float* query_level = query + level * level_width_floats;
+        size_t total_active = num_active;
+        const bool first_level_full = (num_active == curr_batch_size);
 
-            float dot_product =
-                    fvec_inner_product(query_level, yj, actual_level_width);
+        local_stats.total_dims += total_active * n_levels;
 
-            if constexpr (M == METRIC_INNER_PRODUCT) {
-                exact_distances[idx] += dot_product;
-            } else {
-                exact_distances[idx] -= 2.0f * dot_product;
-            }
+        for (size_t level = 0; (level < n_levels) && (num_active > 0);
+             level++) {
+            local_stats.total_dims_scanned += num_active;
 
-            float cum_sum = level_cum_sums[idx];
-            float cauchy_schwarz_bound;
-            if constexpr (M == METRIC_INNER_PRODUCT) {
-                cauchy_schwarz_bound = -cum_sum * query_cum_norm;
-            } else {
-                cauchy_schwarz_bound = 2.0f * cum_sum * query_cum_norm;
-            }
+            float query_cum_norm = query_cum_sums[level + 1];
 
-            float lower_bound = exact_distances[idx] - cauchy_schwarz_bound;
+            size_t level_offset = level * level_width * batch_size;
+            const float* level_storage =
+                    (const float*)(storage_base + level_offset);
+            const float* query_level = query + level * level_width_floats;
+            size_t actual_level_width = std::min(
+                    level_width_floats, d - level * level_width_floats);
 
-            active_indices[next_active] = idx;
-            next_active += C::cmp(threshold, lower_bound) ? 1 : 0;
+            num_active = with_bool(
+                    level == 0 && first_level_full, [&]<bool AllActive>() {
+                        with_level_width(
+                                actual_level_width, [&]<size_t LevelWidth>() {
+                                    compute_level_dot_kernel<
+                                            AllActive,
+                                            LevelWidth>(
+                                            query_level,
+                                            level_storage,
+                                            active_indices.data(),
+                                            num_active,
+                                            actual_level_width,
+                                            dot_buffer.data());
+                                });
+
+                        prune_kernel<AllActive, C, M>(
+                                exact_distances.data(),
+                                dot_buffer.data(),
+                                level_cum_sums,
+                                active_byteset.data(),
+                                active_indices.data(),
+                                (uint32_t)num_active,
+                                query_cum_norm,
+                                threshold);
+
+                        return compact_active_kernel(
+                                active_indices.data(),
+                                active_byteset.data(),
+                                num_active);
+                    });
+
+            level_cum_sums += batch_size; // advance to the next level's sums
         }
 
-        num_active = next_active;
-        level_cum_sums += batch_size;
-    }
+        return num_active;
+    }
+#endif // SWIG
 
-    return num_active;
-}
+    void reconstruct(idx_t key, float* recons, const uint8_t* codes_base) const;
+};
 } // namespace faiss
 
 #endif
diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index abe6746b94..40b31778da 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -495,16 +495,46 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
         READ1(n_levels);
         FAISS_THROW_IF_NOT_FMT(
                 n_levels > 0, "invalid ilpn n_levels %zd", n_levels);
+        constexpr size_t bs = Panorama::kDefaultBatchSize;
         auto ailp = std::make_unique<ArrayInvertedListsPanorama>(
-                nlist, code_size, n_levels);
+                nlist, code_size, n_levels, bs);
         std::vector<size_t> sizes(nlist);
         read_ArrayInvertedLists_sizes(f, sizes);
         for (size_t i = 0; i < nlist; i++) {
             ailp->ids[i].resize(sizes[i]);
-            size_t num_elems =
-                    ((sizes[i] + ArrayInvertedListsPanorama::kBatchSize - 1) /
-                     ArrayInvertedListsPanorama::kBatchSize) *
-                    ArrayInvertedListsPanorama::kBatchSize;
+            size_t num_elems = ((sizes[i] + bs - 1) / bs) * bs;
+            ailp->codes[i].resize(num_elems * code_size);
+            ailp->cum_sums[i].resize(num_elems * (n_levels + 1));
+        }
+        for (size_t i = 0; i < nlist; i++) {
+            size_t n = sizes[i];
+            if (n > 0) {
+                read_vector_with_known_size(
+                        ailp->codes[i], f, ailp->codes[i].size());
+                read_vector_with_known_size(ailp->ids[i], f, n);
+                read_vector_with_known_size(
+                        ailp->cum_sums[i], f, ailp->cum_sums[i].size());
+            }
+        }
+        return ailp;
+    } else if (h == fourcc("ilp2") && !(io_flags & IO_FLAG_SKIP_IVF_DATA)) {
+        size_t nlist, code_size, n_levels, bs;
+        READ1(nlist);
+        FAISS_CHECK_DESERIALIZATION_LOOP_LIMIT(nlist, "ilp2 nlist");
+        READ1(code_size);
+        READ1(n_levels);
+        READ1(bs);
+        FAISS_THROW_IF_NOT_FMT(
+                n_levels > 0, "invalid ilp2 n_levels %zd", n_levels);
+        // bs is read from the file and used as a divisor below.
+        FAISS_THROW_IF_NOT_FMT(bs > 0, "invalid ilp2 batch_size %zd", bs);
+        auto ailp = std::make_unique<ArrayInvertedListsPanorama>(
+                nlist, code_size, n_levels, bs);
+        std::vector<size_t> sizes(nlist);
+        read_ArrayInvertedLists_sizes(f, sizes);
+        for (size_t i = 0; i < nlist; i++) {
+            ailp->ids[i].resize(sizes[i]);
+            size_t num_elems = ((sizes[i] + bs - 1) / bs) * bs;
             ailp->codes[i].resize(num_elems * code_size);
             ailp->cum_sums[i].resize(num_elems * (n_levels + 1));
         }
@@ -1633,6 +1663,19 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
         read_ivf_header(ivfp.get(), f);
         ivfp->code_size = ivfp->d * sizeof(float);
         READ1(ivfp->n_levels);
+        ivfp->batch_size = Panorama::kDefaultBatchSize;
+        read_InvertedLists(*ivfp, f, io_flags);
+        idx = std::move(ivfp);
+    } else if (h == fourcc("IwP2")) {
+        auto ivfp = std::make_unique<IndexIVFFlatPanorama>();
+        read_ivf_header(ivfp.get(), f);
+        ivfp->code_size = ivfp->d * sizeof(float);
+        READ1(ivfp->n_levels);
+        READ1(ivfp->batch_size);
+        FAISS_THROW_IF_NOT_FMT(
+                ivfp->batch_size > 0,
+                "invalid IwP2 batch_size %zd",
+                ivfp->batch_size);
         read_InvertedLists(*ivfp, f, io_flags);
         idx = std::move(ivfp);
     } else if (h == fourcc("IwFl")) {
diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp
index 03cc9bdd69..f9b5c8d346 100644
--- a/faiss/impl/index_write.cpp
+++ b/faiss/impl/index_write.cpp
@@ -269,11 +269,20 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) {
     } else if (
             const auto& ailp =
                     dynamic_cast<const ArrayInvertedListsPanorama*>(ils)) {
-        uint32_t h = fourcc("ilpn");
-        WRITE1(h);
-        WRITE1(ailp->nlist);
-        WRITE1(ailp->code_size);
-        WRITE1(ailp->n_levels);
+        if (ailp->pano.batch_size == Panorama::kDefaultBatchSize) {
+            uint32_t h = fourcc("ilpn");
+            WRITE1(h);
+            WRITE1(ailp->nlist);
+            WRITE1(ailp->code_size);
+            WRITE1(ailp->n_levels);
+        } else {
+            uint32_t h = fourcc("ilp2");
+            WRITE1(h);
+            WRITE1(ailp->nlist);
+            WRITE1(ailp->code_size);
+            WRITE1(ailp->n_levels);
+            WRITE1(ailp->pano.batch_size);
+        }
         uint32_t list_type = fourcc("full");
         WRITE1(list_type);
         std::vector<size_t> sizes;
@@ -707,10 +716,18 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) {
     } else if (
             const IndexIVFFlatPanorama* ivfp =
                     dynamic_cast<const IndexIVFFlatPanorama*>(idx)) {
-        uint32_t h = fourcc("IwPn");
-        WRITE1(h);
-        write_ivf_header(ivfp, f);
-        WRITE1(ivfp->n_levels);
+        if (ivfp->batch_size == Panorama::kDefaultBatchSize) {
+            uint32_t h = fourcc("IwPn");
+            WRITE1(h);
+            write_ivf_header(ivfp, f);
+            WRITE1(ivfp->n_levels);
+        } else {
+            uint32_t h = fourcc("IwP2");
+            WRITE1(h);
+            write_ivf_header(ivfp, f);
+            WRITE1(ivfp->n_levels);
+            WRITE1(ivfp->batch_size);
+        }
         write_InvertedLists(ivfp->invlists, f);
     } else if (
             const IndexIVFFlat* ivfl_2 =
diff --git a/faiss/impl/platform_macros.h b/faiss/impl/platform_macros.h
index b7aa4111e3..d72d2d37da 100644
--- a/faiss/impl/platform_macros.h
+++ b/faiss/impl/platform_macros.h
@@ -110,6 +110,7 @@ inline int __builtin_clzll(uint64_t x) {
 // MSVC uses pragma pack instead of __attribute__((packed))
 // Use FAISS_PACK_STRUCTS_BEGIN/END to wrap packed structure definitions
 #define FAISS_PACKED
+#define FAISS_RESTRICT __restrict
 #define FAISS_PACK_STRUCTS_BEGIN __pragma(pack(push, 1))
 #define FAISS_PACK_STRUCTS_END __pragma(pack(pop))
 
@@ -126,9 +127,11 @@ inline int __builtin_clzll(uint64_t x) {
 #ifdef SWIG
 #define ALIGNED(x)
 #define FAISS_PACKED
+#define FAISS_RESTRICT
 #else
 #define ALIGNED(x) __attribute__((aligned(x)))
 #define FAISS_PACKED __attribute__((packed))
+#define FAISS_RESTRICT __restrict
 #endif
 
 // On non-Windows, FAISS_PACKED handles packing, so these are no-ops
diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp
index 1e66227aea..ea3af51ad7 100644
--- a/faiss/index_factory.cpp
+++ b/faiss/index_factory.cpp
@@ -341,9 +341,11 @@ IndexIVF* parse_IndexIVF(
     if (match("FlatDedup")) {
         return new IndexIVFFlatDedup(get_q(), d, nlist, mt, own_il);
     }
-    if (match("FlatPanorama([0-9]+)?")) {
+    if (match("FlatPanorama([0-9]+)?(_([0-9]+))?")) {
         int nlevels = mres_to_int(sm[1], 8); // default to 8 levels
-        return new IndexIVFFlatPanorama(get_q(), d, nlist, nlevels, mt, own_il);
+        int bs = mres_to_int(sm[3], int(Panorama::kDefaultBatchSize));
+        return new IndexIVFFlatPanorama(
+                get_q(), d, nlist, nlevels, mt, own_il, bs);
     }
     if (match(sq_pattern)) {
         return new IndexIVFScalarQuantizer(
diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp
index 6f8e88ef35..c92b9d94e5 100644
--- a/faiss/invlists/InvertedLists.cpp
+++ b/faiss/invlists/InvertedLists.cpp
@@ -354,14 +354,15 @@ ArrayInvertedLists::~ArrayInvertedLists() {}
 ArrayInvertedListsPanorama::ArrayInvertedListsPanorama(
         size_t nlist_in,
         size_t code_size_in,
-        size_t n_levels_in)
+        size_t n_levels_in,
+        size_t batch_size)
         : ArrayInvertedLists(nlist_in, code_size_in),
           n_levels(n_levels_in),
           level_width(
                   (((code_size_in / sizeof(float)) + n_levels_in - 1) /
                    n_levels_in) *
                   sizeof(float)),
-          pano(code_size_in, n_levels_in, kBatchSize) {
+          pano(code_size_in, n_levels_in, batch_size) {
     FAISS_THROW_IF_NOT(n_levels_in > 0);
     FAISS_THROW_IF_NOT(code_size_in % sizeof(float) == 0);
     FAISS_THROW_IF_NOT_MSG(
@@ -389,9 +390,9 @@ size_t ArrayInvertedListsPanorama::add_entries(
     memcpy(&ids[list_no][o], ids_in, sizeof(ids_in[0]) * n_entry);
 
     size_t new_size = o + n_entry;
-    size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize;
-    codes[list_no].resize(num_batches * kBatchSize * code_size);
-    cum_sums[list_no].resize(num_batches * kBatchSize * (n_levels + 1));
+    size_t num_batches = (new_size + pano.batch_size - 1) / pano.batch_size;
+    codes[list_no].resize(num_batches * pano.batch_size * code_size);
+    cum_sums[list_no].resize(num_batches * pano.batch_size * (n_levels + 1));
 
     // Cast to float* is safe here as we guarantee codes are always float
     // vectors for `IndexIVFFlatPanorama` (verified by the constructor).
@@ -425,9 +426,9 @@ void ArrayInvertedListsPanorama::update_entries(
 void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) {
     ids[list_no].resize(new_size);
 
-    size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize;
-    codes[list_no].resize(num_batches * kBatchSize * code_size);
-    cum_sums[list_no].resize(num_batches * kBatchSize * (n_levels + 1));
+    size_t num_batches = (new_size + pano.batch_size - 1) / pano.batch_size;
+    codes[list_no].resize(num_batches * pano.batch_size * code_size);
+    cum_sums[list_no].resize(num_batches * pano.batch_size * (n_levels + 1));
 }
 
 const uint8_t* ArrayInvertedListsPanorama::get_single_code(
diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h
index 743bcad62d..6b0e37c5c6 100644
--- a/faiss/invlists/InvertedLists.h
+++ b/faiss/invlists/InvertedLists.h
@@ -280,7 +280,6 @@ struct ArrayInvertedLists : InvertedLists {
 /// Level-oriented storage as defined in the IVFFlat section of Panorama
 /// (https://www.arxiv.org/pdf/2510.00566).
 struct ArrayInvertedListsPanorama : ArrayInvertedLists {
-    static constexpr size_t kBatchSize = 128;
     std::vector<MaybeOwnedVector<float>> cum_sums;
     const size_t n_levels;
     const size_t level_width; // in code units
@@ -289,7 +288,8 @@ struct ArrayInvertedListsPanorama : ArrayInvertedLists {
     ArrayInvertedListsPanorama(
             size_t nlist_in,
             size_t code_size_in,
-            size_t n_levels_in);
+            size_t n_levels_in,
+            size_t batch_size = Panorama::kDefaultBatchSize);
 
     const float* get_cum_sums(size_t list_no) const;