Skip to content

Commit 93f0c00

Browse files
Add experimental advise_will_need for page cache prefetching (#131)
Adds an `experimental_advise_will_need()` method that computes the coalesced byte ranges via `ParquetFileReader::GetReadRanges` and calls `posix_fadvise(WILLNEED)` to trigger kernel readahead into the page cache. Arrow's `pre_buffer=True` dispatches reads to a shared IO thread pool (`ReadAsync`), allocating large buffers that are not CPU-cache friendly. This causes LLC misses when a different thread later decodes from that buffer. The new method lets users warm the page cache before calling `read_into_numpy` with `pre_buffer=False`. Each worker thread then performs its own `pread` and decoding, keeping allocations small and CPU-cache-friendly. FYI, using an experimental prefix, as it is an experimental feature that is not guaranteed to become a part of the stable API.
1 parent dc32b42 commit 93f0c00

5 files changed

Lines changed: 87 additions & 1 deletion

File tree

jollyjack/cjollyjack.pxd

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,14 @@ cdef extern from "jollyjack.h":
3232
size_t dst_stride1_size,
3333
vector[int] row_indices) except + nogil
3434

35+
cdef void ExperimentalAdviseWillNeed (shared_ptr[CRandomAccessFile] source
36+
, shared_ptr[CFileMetaData] file_metadata
37+
, vector[int] column_indices
38+
, const vector[int] &row_groups
39+
, const vector[string] &column_names
40+
, CCacheOptions cache_options
41+
) except + nogil
42+
3543
cdef shared_ptr[CRandomAccessFile] GetIOUringReader1 (const string& path) except + nogil
3644
cdef shared_ptr[CRandomAccessFile] GetDirectReader (const string& path) except + nogil
3745

jollyjack/jollyjack.cc

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -649,6 +649,41 @@ void CopyToRowMajor (void* src_buffer, size_t src_stride0_size, size_t src_strid
649649

650650
}
651651

652+
void ExperimentalAdviseWillNeed(
653+
std::shared_ptr<arrow::io::RandomAccessFile> source,
654+
std::shared_ptr<parquet::FileMetaData> file_metadata,
655+
std::vector<int> column_indices,
656+
const std::vector<int>& row_groups,
657+
const std::vector<std::string>& column_names,
658+
arrow::io::CacheOptions cache_options) {
659+
parquet::ReaderProperties reader_properties = parquet::default_reader_properties();
660+
auto reader = parquet::ParquetFileReader::Open(source, reader_properties, file_metadata);
661+
auto metadata = reader->metadata();
662+
663+
if (!column_names.empty()) {
664+
column_indices.reserve(column_names.size());
665+
auto schema = metadata->schema();
666+
for (const auto& name : column_names) {
667+
int idx = schema->ColumnIndex(name);
668+
if (idx < 0) {
669+
throw std::logic_error(std::string("Column '") + name + "' was not found!");
670+
}
671+
column_indices.push_back(idx);
672+
}
673+
}
674+
675+
auto read_ranges = reader->GetReadRanges(row_groups,
676+
column_indices,
677+
cache_options.hole_size_limit,
678+
cache_options.range_size_limit
679+
).ValueOrDie();
680+
681+
auto status = source->WillNeed(read_ranges);
682+
if (!status.ok()) {
683+
throw std::logic_error(status.message());
684+
}
685+
}
686+
652687
#ifdef WITH_IO_URING
653688
#include "io_uring_reader_1.h"
654689
std::shared_ptr<arrow::io::RandomAccessFile> GetIOUringReader1(const std::string& filename)

jollyjack/jollyjack.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,14 @@ void CopyToRowMajor (void* src_buffer,
2626
size_t dst_stride1_size,
2727
std::vector<int> row_indices);
2828

29+
void ExperimentalAdviseWillNeed(
30+
std::shared_ptr<arrow::io::RandomAccessFile> source,
31+
std::shared_ptr<parquet::FileMetaData> file_metadata,
32+
std::vector<int> column_indices,
33+
const std::vector<int>& row_groups,
34+
const std::vector<std::string>& column_names,
35+
arrow::io::CacheOptions cache_options);
36+
2937
std::shared_ptr<arrow::io::RandomAccessFile> GetIOUringReader1(const std::string& filename);
3038
std::shared_ptr<arrow::io::RandomAccessFile> GetDirectReader(const std::string& filename);
3139

jollyjack/jollyjack_cython.pyx

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,41 @@ cpdef void read_into_numpy (object source, FileMetaData metadata, cnp.ndarray np
150150
, c_cache_options)
151151
return
152152

153+
cpdef void experimental_advise_will_need (object source, FileMetaData metadata, row_group_indices, column_indices = [], column_names = [], use_memory_map = False, CacheOptions cache_options = None):
154+
155+
cdef vector[int] crow_group_indices = row_group_indices
156+
cdef vector[int] ccolumn_indices
157+
cdef vector[string] ccolumn_names
158+
cdef shared_ptr[CFileMetaData] c_metadata
159+
cdef shared_ptr[CRandomAccessFile] rd_handle
160+
161+
if metadata is not None:
162+
c_metadata = metadata.sp_metadata
163+
164+
if column_indices:
165+
ccolumn_indices = column_indices
166+
167+
if column_names:
168+
ccolumn_names = [c.encode('utf8') for c in column_names]
169+
170+
assert (column_indices or column_names) and (not column_indices or not column_names), f"Either column_indices or column_names needs to be set"
171+
172+
cdef CCacheOptions c_cache_options
173+
if cache_options is not None:
174+
c_cache_options = cache_options.unwrap()
175+
else:
176+
c_cache_options = CCacheOptions.LazyDefaults()
177+
178+
get_reader(source, use_memory_map, &rd_handle)
179+
180+
with nogil:
181+
cjollyjack.ExperimentalAdviseWillNeed (rd_handle
182+
, c_metadata
183+
, ccolumn_indices
184+
, crow_group_indices
185+
, ccolumn_names
186+
, c_cache_options)
187+
153188
cpdef void copy_to_torch_row_major (src_tensor, dst_tensor, row_indices):
154189
import torch
155190

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta"
1010

1111
[project]
1212
name = "jollyjack"
13-
version = "0.22.1"
13+
version = "0.22.2"
1414
description = "High-performance Parquet reader for loading data directly into NumPy arrays and PyTorch tensors"
1515
readme = "README.md"
1616
requires-python = ">=3.9"

0 commit comments

Comments
 (0)