Add experimental advise_will_need for page cache prefetching (#131)

marcin-krystianc · web-flow · commit 93f0c003da23 · 2026-04-20T12:05:27.000+02:00
Adds an `experimental_advise_will_need()` method that computes the
coalesced byte ranges via `ParquetFileReader::GetReadRanges` and passes
them to `RandomAccessFile::WillNeed` (`posix_fadvise(WILLNEED)` for
local files) to trigger kernel readahead into the page cache.

Arrow's `pre_buffer=True` dispatches reads to a shared IO thread pool
(`ReadAsync`), allocating large buffers that are not CPU-cache friendly.
This causes LLC misses when a different thread later decodes from that
buffer. The new method lets users warm the page cache before calling
`read_into_numpy` with `pre_buffer=False`. Each worker thread then
performs its own `pread` and decoding, keeping allocations small and
CPU-cache-friendly.

Note: the method carries an `experimental_` prefix because it is an
experimental feature, not guaranteed to become part of the stable API.
diff --git a/jollyjack/cjollyjack.pxd b/jollyjack/cjollyjack.pxd
@@ -32,6 +32,14 @@ cdef extern from "jollyjack.h":
         size_t dst_stride1_size,
         vector[int] row_indices) except + nogil
 
+    cdef void ExperimentalAdviseWillNeed (shared_ptr[CRandomAccessFile] source  # declared in jollyjack.h
+        , shared_ptr[CFileMetaData] file_metadata  # left empty by the wrapper when metadata is None
+        , vector[int] column_indices  # by value: the C++ side appends indices resolved from names
+        , const vector[int] &row_groups
+        , const vector[string] &column_names  # UTF-8 encoded by the Python wrapper
+        , CCacheOptions cache_options
+        ) except + nogil  # C++ exceptions surface as Python exceptions; callable without the GIL
+
     cdef shared_ptr[CRandomAccessFile] GetIOUringReader1 (const string& path) except + nogil
     cdef shared_ptr[CRandomAccessFile] GetDirectReader (const string& path) except + nogil
 
diff --git a/jollyjack/jollyjack.cc b/jollyjack/jollyjack.cc
@@ -649,6 +649,41 @@ void CopyToRowMajor (void* src_buffer, size_t src_stride0_size, size_t src_strid
 
 }
 
+// Asks `source` (via WillNeed) to prefetch the byte ranges that GetReadRanges reports for
+// the given row groups and columns; `column_names` are resolved through the schema and
+// appended to `column_indices`. Throws std::logic_error on unknown names or Arrow errors.
+void ExperimentalAdviseWillNeed(
+    std::shared_ptr<arrow::io::RandomAccessFile> source,
+    std::shared_ptr<parquet::FileMetaData> file_metadata,
+    std::vector<int> column_indices,
+    const std::vector<int>& row_groups,
+    const std::vector<std::string>& column_names,
+    arrow::io::CacheOptions cache_options) {
+  auto reader = parquet::ParquetFileReader::Open(source, parquet::default_reader_properties(), file_metadata);
+  if (!column_names.empty()) {
+    column_indices.reserve(column_indices.size() + column_names.size());
+    const auto* schema = reader->metadata()->schema();
+    for (const auto& name : column_names) {
+      const int idx = schema->ColumnIndex(name);
+      if (idx < 0) {
+        throw std::logic_error(std::string("Column '") + name + "' was not found!");
+      }
+      column_indices.push_back(idx);
+    }
+  }
+
+  // Report failures as exceptions: ValueOrDie() would abort the whole (Python) process.
+  auto read_ranges = reader->GetReadRanges(row_groups, column_indices,
+      cache_options.hole_size_limit, cache_options.range_size_limit);
+  if (!read_ranges.ok()) {
+    throw std::logic_error(read_ranges.status().ToString());
+  }
+  const auto status = source->WillNeed(*read_ranges);
+  if (!status.ok()) {
+    throw std::logic_error(status.message());
+  }
+}
+
 #ifdef WITH_IO_URING
 #include "io_uring_reader_1.h"
 std::shared_ptr<arrow::io::RandomAccessFile> GetIOUringReader1(const std::string& filename)
diff --git a/jollyjack/jollyjack.h b/jollyjack/jollyjack.h
@@ -26,6 +26,14 @@ void CopyToRowMajor (void* src_buffer,
     size_t dst_stride1_size,
     std::vector<int> row_indices);
 
+void ExperimentalAdviseWillNeed(  // Issues a WillNeed hint on `source` for the selected data.
+    std::shared_ptr<arrow::io::RandomAccessFile> source,    // opened as a Parquet file, then advised
+    std::shared_ptr<parquet::FileMetaData> file_metadata,   // handed to ParquetFileReader::Open
+    std::vector<int> column_indices,                        // columns selected by index
+    const std::vector<int>& row_groups,                     // row groups whose ranges are requested
+    const std::vector<std::string>& column_names,           // resolved via the schema, appended to indices
+    arrow::io::CacheOptions cache_options);                 // hole/range size limits passed to GetReadRanges
+
 std::shared_ptr<arrow::io::RandomAccessFile> GetIOUringReader1(const std::string& filename);
 std::shared_ptr<arrow::io::RandomAccessFile> GetDirectReader(const std::string& filename);
 
diff --git a/jollyjack/jollyjack_cython.pyx b/jollyjack/jollyjack_cython.pyx
@@ -150,6 +150,41 @@ cpdef void read_into_numpy (object source, FileMetaData metadata, cnp.ndarray np
             , c_cache_options)
         return
 
+cpdef void experimental_advise_will_need (object source, FileMetaData metadata, row_group_indices, column_indices = [], column_names = [], use_memory_map = False, CacheOptions cache_options = None):
+    """Forward a will-need hint for the selected row groups and columns to the C++ layer.
+
+    Exactly one of `column_indices` / `column_names` must be non-empty. When
+    `cache_options` is None, `CCacheOptions.LazyDefaults()` is used.
+    """
+    cdef vector[int] c_row_groups = row_group_indices
+    cdef vector[int] c_col_indices
+    cdef vector[string] c_col_names
+    cdef shared_ptr[CFileMetaData] c_metadata
+    cdef shared_ptr[CRandomAccessFile] rd_handle
+    cdef CCacheOptions c_cache_options
+
+    if metadata is not None:
+        c_metadata = metadata.sp_metadata
+    if column_indices:
+        c_col_indices = column_indices
+    if column_names:
+        c_col_names = [name.encode('utf8') for name in column_names]
+
+    # Exactly one of the two column selectors may be non-empty.
+    assert bool(column_indices) != bool(column_names), f"Either column_indices or column_names needs to be set"
+
+    if cache_options is None:
+        c_cache_options = CCacheOptions.LazyDefaults()
+    else:
+        c_cache_options = cache_options.unwrap()
+
+    get_reader(source, use_memory_map, &rd_handle)
+
+    # Only C++ values are passed below, so the call runs with the GIL released.
+    with nogil:
+        cjollyjack.ExperimentalAdviseWillNeed (rd_handle, c_metadata, c_col_indices
+            , c_row_groups, c_col_names, c_cache_options)
+
 cpdef void copy_to_torch_row_major (src_tensor, dst_tensor, row_indices):
     import torch
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "jollyjack"
-version = "0.22.1"
+version = "0.22.2"
 description = "High-performance Parquet reader for loading data directly into NumPy arrays and PyTorch tensors"
 readme = "README.md"
 requires-python = ">=3.9"