
Commit 3398281

Page cache prefetching with prefetch_page_cache
1 parent 05268ad commit 3398281

12 files changed

Lines changed: 453 additions & 49 deletions

File tree

.github/workflows/python.yml

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -225,7 +225,7 @@ jobs:
225225
pip install JollyJack --pre --find-links ./dist --break-system-packages --only-binary=:all: --force-reinstall --no-index --no-deps
226226
python3 ./benchmarks/benchmark_jollyjack.py
227227
env:
228-
JJ_benchmark_mode: ${{ matrix.mode }}
228+
JJB_benchmark_mode: ${{ matrix.mode }}
229229

230230
publish:
231231
if: ${{ !github.event.repository.fork && startsWith(github.ref, 'refs/tags/v') }}

README.md

Lines changed: 107 additions & 6 deletions
@@ -48,21 +48,28 @@ your workload is I/O-bound or memory-/CPU-bound.
4848

4949
### Large datasets (exceed filesystem cache)
5050

51-
For datasets larger than the available page cache, performance is typically I/O-bound.
51+
For datasets larger than the available page cache, performance is typically
52+
I/O-bound. Enabling either `pre_buffer=True` or `prefetch_page_cache=True`
53+
brings throughput close to the raw I/O ceiling.
5254

5355
Recommended configuration:
5456

55-
- `use_threads = True`, `pre_buffer = True`, `JJ_READER_BACKEND = io_uring_odirect`
57+
- `use_threads = True`, `prefetch_page_cache = True`, `pre_buffer = False`,
58+
with the default reader backend.
5659

57-
This combination bypasses the page cache, reduces double-buffering, and allows deeper I/O queues via io_uring.
60+
Both options reach near-identical throughput, but `prefetch_page_cache`
61+
avoids the temporary buffer copies that `pre_buffer` makes (see section
62+
below) and the LLC misses they cause.
5863

5964
### Small datasets (fit in filesystem cache)
6065

61-
For datasets that comfortably fit in RAM, performance is typically CPU- or memory-bound.
66+
For datasets that comfortably fit in RAM, performance is typically CPU- or
67+
memory-bound.
6268

6369
Recommended configuration:
6470

65-
- `use_threads = False`, `pre_buffer = False`, and the default reader backend (no io_uring).
71+
- `use_threads = True`, `prefetch_page_cache = True`, `pre_buffer = False`,
72+
with the default reader backend.
6673

6774
### Pre-buffering and `cache_options`
6875

@@ -102,10 +109,62 @@ To debug allocator issues with mimalloc, run with `MIMALLOC_SHOW_STATS=1` and
102109

103110
### Pre-buffering and `ARROW_IO_THREADS`
104111

105-
When `pre_buffer=True`, Arrow dispatches reads to its IO thread pooll,
112+
When `pre_buffer=True`, Arrow dispatches reads to its IO thread pool,
106113
configured via the `ARROW_IO_THREADS` environment variable (default: 8).
107114
Tuning this value may improve performance.
108115
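A short sketch of the tuning knob described above: `ARROW_IO_THREADS` is read when pyarrow creates its IO thread pool, so it has to be in the environment before the first pre-buffered read. The value `16` here is purely illustrative, not a recommendation.

```python
import os

# ARROW_IO_THREADS must be set before pyarrow spawns its IO thread pool,
# i.e. before the first read with pre_buffer=True. "16" is an
# illustrative value; benchmark your workload to pick the right one.
os.environ["ARROW_IO_THREADS"] = "16"
```

After this, any subsequent pyarrow reads in the process use the resized IO pool.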

116+
### Page cache prefetching with `prefetch_page_cache`
117+
118+
With `pre_buffer=True`, Arrow's IO thread pool allocates temporary buffers
119+
and fills them on the IO thread's core. When worker threads on different
120+
cores later consume those buffers, the data is cold in their caches,
121+
causing LLC misses.
122+
123+
`prefetch_page_cache` provides an alternative: it calls
124+
`posix_fadvise(POSIX_FADV_WILLNEED)` to tell the kernel to start loading
125+
the relevant byte ranges into the page cache. Each worker thread then
126+
reads directly via `pread` into its own locally-allocated buffer, keeping
127+
data hot in its local CPU caches.
128+
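A minimal sketch of the mechanism described above, using only the standard library (the file, sizes, and buffer layout are illustrative, not jollyjack's internals): `posix_fadvise(POSIX_FADV_WILLNEED)` asks the kernel to start loading a byte range into the page cache, and the consuming thread later reads it with `pread` into its own buffer.

```python
import os
import tempfile

# Create a small scratch file to demonstrate against.
fd_tmp, path = tempfile.mkstemp()
os.write(fd_tmp, b"x" * 4096)
os.close(fd_tmp)

fd = os.open(path, os.O_RDONLY)
try:
    # Non-blocking hint: kernel readahead into the page cache begins
    # in the background.
    os.posix_fadvise(fd, 0, 4096, os.POSIX_FADV_WILLNEED)
    # Later, pread lands the (now likely cached) bytes in a buffer
    # allocated by the reading thread itself.
    data = os.pread(fd, 4096, 0)
finally:
    os.close(fd)
    os.unlink(path)
```

Because the hint is asynchronous, issuing it well before the `pread` (e.g. from another thread, as with the standalone `jj.prefetch_page_cache` call below) gives the kernel time to complete the readahead.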
129+
Two ways to use it:
130+
131+
**As a parameter on `read_into_numpy`:**
132+
133+
```python
134+
jj.read_into_numpy(
135+
source=path,
136+
metadata=pr.metadata,
137+
np_array=np_array,
138+
row_group_indices=range(pr.metadata.num_row_groups),
139+
column_indices=range(pr.metadata.num_columns),
140+
prefetch_page_cache=True,
141+
)
142+
```
143+
144+
This is only useful for local or network-mounted file systems that have a
145+
page cache. Remote file systems such as S3 will not benefit from this.
146+
147+
**As a standalone call** (when you want to prefetch ahead of time, e.g.
148+
from a different thread):
149+
150+
```python
151+
jj.prefetch_page_cache(
152+
source=path,
153+
metadata=pr.metadata,
154+
row_group_indices=range(pr.metadata.num_row_groups),
155+
column_indices=range(pr.metadata.num_columns),
156+
)
157+
158+
jj.read_into_numpy(
159+
source=path,
160+
metadata=pr.metadata,
161+
np_array=np_array,
162+
row_group_indices=range(pr.metadata.num_row_groups),
163+
column_indices=range(pr.metadata.num_columns),
164+
pre_buffer=False,
165+
)
166+
```
167+
109168
## Requirements
110169

111170
- pyarrow ~= 24.0.0
@@ -253,6 +312,48 @@ with fs.LocalFileSystem().open_input_file(path) as f:
253312
print(np_array)
254313
```
255314

315+
### Using page cache prefetching
316+
```python
317+
np_array = np.zeros((n_rows, n_columns), dtype="f", order="F")
318+
pr = pq.ParquetReader()
319+
pr.open(path)
320+
321+
# cache_options controls which byte ranges are prefetched into the page cache
322+
cache_options = pa.CacheOptions(
323+
hole_size_limit=8192,
324+
range_size_limit=16*1024*1024,
325+
lazy=False,
326+
)
327+
328+
# Prefetch and read in one call
329+
jj.read_into_numpy(
330+
source=path,
331+
metadata=pr.metadata,
332+
np_array=np_array,
333+
row_group_indices=range(pr.metadata.num_row_groups),
334+
column_indices=range(pr.metadata.num_columns),
335+
cache_options=cache_options,
336+
prefetch_page_cache=True,
337+
)
338+
339+
# Or prefetch separately, then read
340+
jj.prefetch_page_cache(
341+
source=path,
342+
metadata=pr.metadata,
343+
row_group_indices=range(pr.metadata.num_row_groups),
344+
column_indices=range(pr.metadata.num_columns),
345+
cache_options=cache_options,
346+
)
347+
jj.read_into_numpy(
348+
source=path,
349+
metadata=pr.metadata,
350+
np_array=np_array,
351+
row_group_indices=range(pr.metadata.num_row_groups),
352+
column_indices=range(pr.metadata.num_columns),
353+
pre_buffer=False,
354+
)
355+
```
356+
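To build intuition for the `cache_options` values above, here is an illustrative pure-Python sketch of range coalescing in the style of Arrow's `CacheOptions` (this is not jollyjack's actual implementation): byte ranges separated by a gap of at most `hole_size_limit` are merged, but a merged range never grows past `range_size_limit`.

```python
# Illustrative sketch of CacheOptions-style range coalescing, under the
# assumption that hole_size_limit bounds the gap between merged ranges
# and range_size_limit caps the merged range's total size.
def coalesce(ranges, hole_size_limit, range_size_limit):
    merged = []
    for start, end in sorted(ranges):
        if merged:
            last_start, last_end = merged[-1]
            if (start - last_end <= hole_size_limit
                    and end - last_start <= range_size_limit):
                # Small hole: extend the previous range over the gap.
                merged[-1] = (last_start, max(last_end, end))
                continue
        merged.append((start, end))
    return merged

# Two column chunks 4 KiB apart get merged; a distant one stays separate.
print(coalesce([(0, 8192), (12288, 20480), (10_000_000, 10_008_192)],
               hole_size_limit=8192, range_size_limit=16 * 1024 * 1024))
```

A larger `hole_size_limit` trades some wasted prefetching inside holes for fewer, larger I/O requests.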
256357
### Generating a PyTorch tensor to read into
257358
```python
258359
import torch

benchmarks/benchmark_jollyjack.py

Lines changed: 10 additions & 6 deletions
@@ -48,6 +48,7 @@ class BenchmarkSettings(BaseSettings):
4848
dtypes: list[str] = ["float32", "float16"]
4949
compressions: list[str] = ["none"]
5050
pre_buffer: list[bool] = [False, True]
51+
prefetch_page_cache: list[bool] = [False, True]
5152
use_threads: list[bool] = [False, True]
5253

5354
@classmethod
@@ -170,7 +171,6 @@ def generate_data(n_columns, n_row_groups, path, compression, dtype):
170171
if parquet_matches(
171172
path, n_columns, n_row_groups, cfg.chunk_size, compression, dtype
172173
):
173-
print(f"Reusing existing {path}")
174174
return
175175

176176
t = time.time()
@@ -215,21 +215,23 @@ def get_thread_local_np_array(dtype):
215215
return np_array
216216

217217

218-
def worker_jollyjack_numpy(use_threads, pre_buffer, dtype, path):
218+
def worker_jollyjack_numpy(use_threads, pre_buffer, prefetch_page_cache, dtype, path):
219219

220220
np_array = get_thread_local_np_array(dtype)
221221
cache_options = pa.CacheOptions(
222222
hole_size_limit=8192, # default
223223
range_size_limit=16 * 1024 * 1024, # 16 MB, fits in mimalloc arena
224224
lazy=False,
225225
)
226+
226227
jj.read_into_numpy(
227228
source=path,
228229
metadata=None,
229230
np_array=np_array,
230231
row_group_indices=row_groups_to_read,
231232
column_indices=column_indices_to_read,
232233
pre_buffer=pre_buffer,
234+
prefetch_page_cache=prefetch_page_cache,
233235
use_threads=use_threads,
234236
cache_options=cache_options,
235237
)
@@ -436,10 +438,12 @@ def measure_reading(max_workers, worker):
436438
print(f".")
437439
for n_workers in cfg.worker_counts:
438440
for pre_buffer in cfg.pre_buffer:
439-
for use_threads in cfg.use_threads:
440-
print(
441-
f"`jj.read_into_numpy` jj_reader:{jj_reader}, n_workers:{n_workers}, use_threads:{use_threads}, pre_buffer:{pre_buffer}, duration:{measure_reading(n_workers, lambda path:worker_jollyjack_numpy(use_threads, pre_buffer, dtype.to_pandas_dtype(), path = path))}"
442-
)
441+
for prefetch_page_cache in cfg.prefetch_page_cache:
442+
for use_threads in cfg.use_threads:
443+
444+
print(
445+
f"`jj.read_into_numpy` jj_reader:{jj_reader}, n_workers:{n_workers}, use_threads:{use_threads}, pre_buffer:{pre_buffer}, prefetch_page_cache:{prefetch_page_cache}, duration:{measure_reading(n_workers, lambda path:worker_jollyjack_numpy(use_threads, pre_buffer, prefetch_page_cache, dtype.to_pandas_dtype(), path = path))}"
446+
)
443447

444448
if {"all", "jj_torch"} & cfg.benchmarks_to_run:
445449
print(f".")

jollyjack/cjollyjack.pxd

Lines changed: 2 additions & 1 deletion
@@ -17,6 +17,7 @@ cdef extern from "jollyjack.h":
1717
, const vector[string] &column_names
1818
, const vector[int] &target_column_indices
1919
, bool pre_buffer
20+
, bool prefetch_page_cache
2021
, bool use_threads
2122
, int64_t expected_rows
2223
, CCacheOptions cache_options
@@ -32,7 +33,7 @@ cdef extern from "jollyjack.h":
3233
size_t dst_stride1_size,
3334
vector[int] row_indices) except + nogil
3435

35-
cdef void ExperimentalAdviseWillNeed (shared_ptr[CRandomAccessFile] source
36+
cdef void PrefetchPageCache (shared_ptr[CRandomAccessFile] source
3637
, shared_ptr[CFileMetaData] file_metadata
3738
, vector[int] column_indices
3839
, const vector[int] &row_groups

jollyjack/jollyjack.cc

Lines changed: 15 additions & 1 deletion
@@ -295,6 +295,7 @@ void ReadIntoMemory (std::shared_ptr<arrow::io::RandomAccessFile> source
295295
, const std::vector<std::string> &column_names
296296
, const std::vector<int> &target_column_indices
297297
, bool pre_buffer
298+
, bool prefetch_page_cache
298299
, bool use_threads
299300
, int64_t expected_rows
300301
, arrow::io::CacheOptions cache_options)
@@ -328,6 +329,19 @@ void ReadIntoMemory (std::shared_ptr<arrow::io::RandomAccessFile> source
328329
}
329330
}
330331

332+
if (prefetch_page_cache)
333+
{
334+
auto read_ranges = parquet_reader->GetReadRanges(row_groups,
335+
column_indices,
336+
cache_options.hole_size_limit,
337+
cache_options.range_size_limit
338+
).ValueOrDie();
339+
auto status = source->WillNeed(read_ranges);
340+
if (!status.ok()) {
341+
throw std::logic_error(status.message());
342+
}
343+
}
344+
331345
if (pre_buffer)
332346
{
333347
parquet_reader->PreBuffer(row_groups, column_indices, arrowReaderProperties.io_context(), cache_options);
@@ -649,7 +663,7 @@ void CopyToRowMajor (void* src_buffer, size_t src_stride0_size, size_t src_strid
649663

650664
}
651665

652-
void ExperimentalAdviseWillNeed(
666+
void PrefetchPageCache(
653667
std::shared_ptr<arrow::io::RandomAccessFile> source,
654668
std::shared_ptr<parquet::FileMetaData> file_metadata,
655669
std::vector<int> column_indices,

jollyjack/jollyjack.h

Lines changed: 3 additions & 1 deletion
@@ -12,6 +12,7 @@ void ReadIntoMemory (std::shared_ptr<arrow::io::RandomAccessFile> source
1212
, const std::vector<std::string> &column_names
1313
, const std::vector<int> &target_column_indices
1414
, bool pre_buffer
15+
, bool prefetch_page_cache
1516
, bool use_threads
1617
, int64_t expected_rows
1718
, arrow::io::CacheOptions cache_options);
@@ -26,7 +27,8 @@ void CopyToRowMajor (void* src_buffer,
2627
size_t dst_stride1_size,
2728
std::vector<int> row_indices);
2829

29-
void ExperimentalAdviseWillNeed(
30+
// Calls posix_fadvise(POSIX_FADV_WILLNEED) on the byte ranges for the requested columns/row groups.
31+
void PrefetchPageCache(
3032
std::shared_ptr<arrow::io::RandomAccessFile> source,
3133
std::shared_ptr<parquet::FileMetaData> file_metadata,
3234
std::vector<int> column_indices,
