Skip to content

Commit 8ccce3e

Browse files
srsuryadev authored and meta-codesync[bot] committed
perf(encoding): varint encoding - manual loop-unroll in bulkDecodeSingleByteRun for single-byte varints (#578)
Summary: Pull Request resolved: #578. Manually loop-unroll `bulkDecodeSingleByteRun` with a 3-tier approach: (1) a 32-element (4-word) unrolled loop with a combined high-bit check `(w0 | w1 | w2 | w3) & kHighBits` to minimize branch overhead; (2) an 8-element (1-word) loop for smaller runs; (3) a single-element trailing loop to pick up individual single-byte varints before multi-byte values. Also extracts the byte-expansion logic into a reusable `expandByteWord()` helper for clarity. Reviewed By: xiaoxmeng. Differential Revision: D96619597
1 parent e876453 commit 8ccce3e

File tree

1 file changed

+56
-13
lines changed

1 file changed

+56
-13
lines changed

dwio/nimble/common/Varint.cpp

Lines changed: 56 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -846,28 +846,71 @@ using varint::bulkVarintDecodeBmi2;
846846
using varint::readVarint32;
847847
using varint::readVarint64;
848848

849+
// Expands one 8-byte word holding eight single-byte varints into eight
// output elements.
//
// Byte k of `word` (bits [8k, 8k+7], i.e. least-significant byte first) is
// written to output[k], converted to T. The caller is expected to have
// already checked that no byte has its 0x80 continuation bit set; this helper
// performs no such check and simply copies each byte value.
//
// `output` must have room for 8 elements.
template <typename T>
inline void expandByteWord(uint64_t word, T* output) {
  output[0] = static_cast<T>(word & 0xFF);
  output[1] = static_cast<T>((word >> 8) & 0xFF);
  output[2] = static_cast<T>((word >> 16) & 0xFF);
  output[3] = static_cast<T>((word >> 24) & 0xFF);
  output[4] = static_cast<T>((word >> 32) & 0xFF);
  output[5] = static_cast<T>((word >> 40) & 0xFF);
  output[6] = static_cast<T>((word >> 48) & 0xFF);
  output[7] = static_cast<T>((word >> 56) & 0xFF);
}
861+
849862
// Decodes a leading run of single-byte varints (values 0-127) from `pos` into
// `output`, stopping at the first byte whose continuation bit (0x80) is set or
// once `n` elements have been produced.
//
// Three tiers, widest first:
//   1. 32 elements (four 8-byte words) per iteration, with one combined
//      high-bit test over all four words;
//   2. 8 elements (one word) per iteration;
//   3. one element per iteration, to pick up the single-byte values that sit
//      directly in front of a multi-byte value.
//
// `pos` and `output` are advanced past everything consumed / produced.
// Returns the number of elements remaining after processing (0 when the whole
// run was single-byte).
//
// NOTE(review): the word loops read 32 (resp. 8) bytes whenever n >= 32
// (resp. n >= 8); this assumes the caller guarantees at least one readable
// byte per remaining element - confirm against callers.
template <typename T>
inline uint64_t
bulkDecodeSingleByteRun(uint64_t n, const char*& pos, T*& output) {
  constexpr uint64_t kHighBits = 0x8080808080808080ULL;
  constexpr uint64_t wordSize = 8;
  constexpr uint64_t batchSize = 4 * wordSize;

  // Assemble 8 input bytes into a word, first byte in the low 8 bits.
  //
  // `pos` is an arbitrary byte pointer, so dereferencing it as a
  // `const uint64_t*` is undefined behaviour (misaligned access and a
  // type-aliasing violation). Reading through `unsigned char` is always
  // valid, and building the word explicitly in little-endian order keeps
  // expandByteWord()'s "byte k -> output[k]" mapping correct on any host
  // byte order. Optimizing compilers typically fold this pattern into a
  // single 8-byte load on little-endian targets.
  const auto loadWord = [](const char* p) -> uint64_t {
    const auto* b = reinterpret_cast<const unsigned char*>(p);
    return static_cast<uint64_t>(b[0]) |
        (static_cast<uint64_t>(b[1]) << 8) |
        (static_cast<uint64_t>(b[2]) << 16) |
        (static_cast<uint64_t>(b[3]) << 24) |
        (static_cast<uint64_t>(b[4]) << 32) |
        (static_cast<uint64_t>(b[5]) << 40) |
        (static_cast<uint64_t>(b[6]) << 48) |
        (static_cast<uint64_t>(b[7]) << 56);
  };

  // Process 32 elements (4 words) at a time.
  while (n >= batchSize) {
    const uint64_t w0 = loadWord(pos);
    const uint64_t w1 = loadWord(pos + wordSize);
    const uint64_t w2 = loadWord(pos + 2 * wordSize);
    const uint64_t w3 = loadWord(pos + 3 * wordSize);
    // One branch for all 32 bytes: any set high bit means a multi-byte varint
    // is somewhere in this batch, so fall through to the narrower loops.
    if ((w0 | w1 | w2 | w3) & kHighBits) {
      break;
    }
    expandByteWord(w0, output);
    expandByteWord(w1, output + wordSize);
    expandByteWord(w2, output + (2 * wordSize));
    expandByteWord(w3, output + (3 * wordSize));
    pos += batchSize;
    output += batchSize;
    n -= batchSize;
  }

  // Process 8 elements (1 word) at a time.
  while (n >= wordSize) {
    const uint64_t word = loadWord(pos);
    if (word & kHighBits) {
      break;
    }
    expandByteWord(word, output);
    pos += wordSize;
    output += wordSize;
    n -= wordSize;
  }

  // Handle trailing single-byte varints one at a time.
  while (n > 0) {
    const uint8_t byte = static_cast<uint8_t>(*pos);
    if (byte & 0x80) {
      break;
    }
    *output++ = static_cast<T>(byte);
    ++pos;
    --n;
  }

  return n;
}
}
873916

0 commit comments

Comments
 (0)