Skip to content

Commit b413698

Browse files
xiaoxmeng authored and meta-codesync[bot] committed
feat: Add block bloom filter for dense index (#637)
Summary:
Pull Request resolved: #637

CONTEXT: The dense index needs a bloom filter for fast negative lookups, to avoid unnecessary hash table probes.

WHAT: Add a BloomFilter class implementing a split block bloom filter (Parquet-style, 256-bit blocks with multiple hash probes per block). It uses velox::BufferPtr for pool-tracked memory. Includes comprehensive unit tests.

Reviewed By: HuamengJiang

Differential Revision: D99539155

fbshipit-source-id: d3d2dc2d5b80d9d0a13a6f11da59b5badc843a94
1 parent 9568682 commit b413698

5 files changed

Lines changed: 481 additions & 0 deletions

File tree

dwio/nimble/index/BloomFilter.cpp

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include "dwio/nimble/index/BloomFilter.h"
17+
18+
#include <algorithm>
19+
#include <cstring>
20+
21+
#include "dwio/nimble/common/Exceptions.h"
22+
23+
#define XXH_INLINE_ALL
24+
#include <xxhash.h>
25+
26+
#include "velox/common/base/BitUtil.h"
27+
28+
namespace facebook::nimble::index {
29+
30+
namespace {
31+
32+
// Computes the number of 256-bit blocks needed for the bloom filter based on
33+
// the expected number of entries and the desired bits per key.
34+
uint32_t computeNumBlocks(uint64_t numEntries, float bitsPerKey) {
35+
// Total bits needed, rounded up to block boundaries.
36+
const uint64_t totalBits = std::max(
37+
static_cast<uint64_t>(numEntries * bitsPerKey),
38+
uint64_t{BloomFilter::kBlockSizeBits});
39+
const uint32_t numBlocks = static_cast<uint32_t>(
40+
velox::bits::divRoundUp(totalBits, BloomFilter::kBlockSizeBits));
41+
// Always allocate at least one block to avoid degenerate cases.
42+
return std::max(numBlocks, uint32_t{1});
43+
}
44+
45+
} // namespace
46+
47+
BloomFilter::BloomFilter(
48+
uint64_t numEntries,
49+
float bitsPerKey,
50+
velox::memory::MemoryPool* pool)
51+
: numBlocks_{computeNumBlocks(numEntries, bitsPerKey)},
52+
bitsPerKey_{bitsPerKey},
53+
data_{velox::AlignedBuffer::allocate<uint8_t>(
54+
numBlocks_ * kBlockSizeBytes,
55+
pool,
56+
0)} {
57+
NIMBLE_CHECK_GT(bitsPerKey, 0.0f, "bitsPerKey must be positive");
58+
}
59+
60+
BloomFilter::BloomFilter(
61+
uint32_t numBlocks,
62+
const uint8_t* data,
63+
size_t dataSize,
64+
velox::memory::MemoryPool* pool)
65+
: numBlocks_{numBlocks},
66+
bitsPerKey_{0.0f},
67+
data_{velox::AlignedBuffer::allocate<uint8_t>(dataSize, pool)} {
68+
NIMBLE_CHECK_GT(numBlocks, 0u, "numBlocks must be positive");
69+
NIMBLE_CHECK_EQ(
70+
dataSize,
71+
static_cast<size_t>(numBlocks_) * kBlockSizeBytes,
72+
"data size mismatch");
73+
std::memcpy(data_->asMutable<uint8_t>(), data, dataSize);
74+
}
75+
76+
uint64_t BloomFilter::hashKey(std::string_view key) {
77+
return XXH64(key.data(), key.size(), /*seed=*/0);
78+
}
79+
80+
// Adds 'key' to the filter: hashes it once, then sets the probe bits derived
// from that hash.
void BloomFilter::insert(std::string_view key) {
  const uint64_t keyHash = hashKey(key);
  insertHash(keyHash);
}
83+
84+
bool BloomFilter::testKey(std::string_view key) const {
85+
return testHash(hashKey(key));
86+
}
87+
88+
void BloomFilter::insertHash(uint64_t hash) {
89+
auto* words = mutableBlock(blockIndex(hash));
90+
const auto key32 = static_cast<uint32_t>(hash);
91+
for (uint32_t i = 0; i < kNumProbesPerBlock; ++i) {
92+
const uint32_t bitIndex = (key32 * kSalts[i]) >> kBitShift;
93+
words[i] |= (uint32_t{1} << bitIndex);
94+
}
95+
}
96+
97+
bool BloomFilter::testHash(uint64_t hash) const {
98+
const auto* words = block(blockIndex(hash));
99+
const auto key32 = static_cast<uint32_t>(hash);
100+
for (uint32_t i = 0; i < kNumProbesPerBlock; ++i) {
101+
const uint32_t bitIndex = (key32 * kSalts[i]) >> kBitShift;
102+
if ((words[i] & (uint32_t{1} << bitIndex)) == 0) {
103+
return false;
104+
}
105+
}
106+
return true;
107+
}
108+
109+
} // namespace facebook::nimble::index

dwio/nimble/index/BloomFilter.h

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#include <cstdint>
19+
#include <string_view>
20+
21+
#include "velox/buffer/Buffer.h"
22+
#include "velox/common/memory/Memory.h"
23+
24+
namespace facebook::nimble::index {
25+
26+
/// Split block bloom filter following the Parquet bloom filter design.
27+
///
28+
/// Uses 256-bit (32-byte) blocks with multiple hash probes per block.
29+
/// Each lookup touches exactly one block, making it cache-friendly.
30+
/// The filter uses xxHash64 to generate hash values.
31+
///
32+
/// Design reference: "Cache-, Hash- and Space-Efficient Bloom Filters"
33+
/// (Putze, Sanders, Singler, 2007) and Apache Parquet format spec.
34+
class BloomFilter {
35+
public:
36+
/// Number of hash probes per block. Each probe sets/tests one bit in a
37+
/// separate uint32_t word, so the block has exactly this many words.
38+
static constexpr uint32_t kNumProbesPerBlock{8};
39+
/// Bytes per word used in each block.
40+
static constexpr uint32_t kBytesPerWord{sizeof(uint32_t)};
41+
/// Bits per word.
42+
static constexpr uint32_t kBitsPerWord{kBytesPerWord * 8};
43+
/// Block size in bytes (one word per probe).
44+
static constexpr uint32_t kBlockSizeBytes{kNumProbesPerBlock * kBytesPerWord};
45+
/// Block size in bits.
46+
static constexpr uint32_t kBlockSizeBits{kNumProbesPerBlock * kBitsPerWord};
47+
48+
/// Constructs a bloom filter sized for the expected number of entries.
49+
///
50+
/// @param numEntries Expected number of distinct entries.
51+
/// @param bitsPerKey Target bits per key (controls false positive rate).
52+
/// @param pool Memory pool for filter data allocation.
53+
BloomFilter(
54+
uint64_t numEntries,
55+
float bitsPerKey,
56+
velox::memory::MemoryPool* pool);
57+
58+
/// Constructs a bloom filter from pre-existing data (for reading).
59+
///
60+
/// @param numBlocks Number of 256-bit blocks.
61+
/// @param data Raw filter data. Must be numBlocks * kBlockSizeBytes bytes.
62+
/// @param pool Memory pool for filter data allocation.
63+
BloomFilter(
64+
uint32_t numBlocks,
65+
const uint8_t* data,
66+
size_t dataSize,
67+
velox::memory::MemoryPool* pool);
68+
69+
/// Inserts a key into the filter.
70+
void insert(std::string_view key);
71+
72+
/// Tests whether a key might be in the filter.
73+
/// Returns false if the key is definitely not present (no false negatives).
74+
/// Returns true if the key might be present (possible false positives).
75+
bool testKey(std::string_view key) const;
76+
77+
/// Returns the number of 256-bit blocks.
78+
uint32_t numBlocks() const {
79+
return numBlocks_;
80+
}
81+
82+
/// Returns the raw filter data.
83+
const uint8_t* data() const {
84+
return data_->as<uint8_t>();
85+
}
86+
87+
/// Returns the size of the raw filter data in bytes.
88+
size_t dataSize() const {
89+
return data_->size();
90+
}
91+
92+
/// Returns the bits per key used during construction.
93+
float bitsPerKey() const {
94+
return bitsPerKey_;
95+
}
96+
97+
private:
98+
// Right-shift to select a bit position within a word (top 5 bits of the
99+
// salted 32-bit hash select one of 32 bit positions).
100+
static constexpr uint32_t kBitShift{32 - 5};
101+
102+
// Salts for generating multiple hash probes from a single hash value.
103+
// These are odd constants that provide good bit mixing when multiplied
104+
// with the hash value.
105+
static constexpr uint32_t kSalts[kNumProbesPerBlock] = {
106+
0x47b6137bU,
107+
0x44974d91U,
108+
0x8824ad5bU,
109+
0xa2b7289dU,
110+
0x705495c7U,
111+
0x2df1424bU,
112+
0x9efc4947U,
113+
0x5c6bfb31U,
114+
};
115+
116+
// Maps hash to block index using the upper 32 bits.
117+
uint32_t blockIndex(uint64_t hash) const {
118+
return static_cast<uint32_t>((hash >> 32) % numBlocks_);
119+
}
120+
121+
// Accesses the block words at the given block index.
122+
uint32_t* mutableBlock(uint32_t index) {
123+
return reinterpret_cast<uint32_t*>(
124+
data_->asMutable<uint8_t>() +
125+
static_cast<size_t>(index) * kBlockSizeBytes);
126+
}
127+
128+
const uint32_t* block(uint32_t index) const {
129+
return reinterpret_cast<const uint32_t*>(
130+
data_->as<uint8_t>() + static_cast<size_t>(index) * kBlockSizeBytes);
131+
}
132+
133+
void insertHash(uint64_t hash);
134+
bool testHash(uint64_t hash) const;
135+
136+
static uint64_t hashKey(std::string_view key);
137+
138+
const uint32_t numBlocks_;
139+
const float bitsPerKey_;
140+
velox::BufferPtr data_;
141+
};
142+
143+
} // namespace facebook::nimble::index

dwio/nimble/index/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414
add_library(
1515
nimble_index
16+
BloomFilter.cpp
1617
ChunkIndex.cpp
1718
ChunkIndexGroup.cpp
1819
ClusterIndex.cpp

0 commit comments

Comments
 (0)