@@ -846,28 +846,71 @@ using varint::bulkVarintDecodeBmi2;
846846using varint::readVarint32;
847847using varint::readVarint64;
848848
// Expands one 64-bit word holding eight packed single-byte values into eight
// consecutive output elements.
//
// Byte k of `word`, counted from the least significant byte (bits
// [8k, 8k + 7]), is written to output[k]. For a word that was loaded from
// memory this matches stream order only if the load placed the first stream
// byte in the low 8 bits (i.e. a little-endian load).
//
// @tparam T      Destination element type; each byte value (0-255) is
//                converted with static_cast<T>.
// @param word    The eight packed bytes. This function does not inspect the
//                varint continuation bit (0x80); callers that require
//                single-byte varints must check it before calling.
// @param output  Destination buffer; exactly output[0] .. output[7] are
//                written, so it must have room for at least 8 elements.
template <typename T>
inline void expandByteWord(uint64_t word, T* output) {
  output[0] = static_cast<T>(word & 0xFF);
  output[1] = static_cast<T>((word >> 8) & 0xFF);
  output[2] = static_cast<T>((word >> 16) & 0xFF);
  output[3] = static_cast<T>((word >> 24) & 0xFF);
  output[4] = static_cast<T>((word >> 32) & 0xFF);
  output[5] = static_cast<T>((word >> 40) & 0xFF);
  output[6] = static_cast<T>((word >> 48) & 0xFF);
  output[7] = static_cast<T>((word >> 56) & 0xFF);
}
861+
// Loads the 8 bytes at src[0] .. src[7] as a 64-bit word in little-endian
// order: src[0] lands in the least significant byte, src[7] in the most
// significant one.
//
// The word is assembled from individual byte reads instead of
// *reinterpret_cast<const uint64_t*>(src). The source is a plain byte stream,
// so it carries no 8-byte alignment guarantee and no uint64_t object lives
// there; dereferencing the cast pointer is undefined behaviour (misaligned
// access and a type-aliasing violation). Reading through unsigned char is
// always permitted, and fixing the byte order here also keeps the result
// independent of host endianness. Optimizing compilers commonly merge this
// pattern into a single 64-bit load on little-endian targets.
inline uint64_t loadByteRunWord(const char* src) {
  const auto* bytes = reinterpret_cast<const unsigned char*>(src);
  return static_cast<uint64_t>(bytes[0]) |
      (static_cast<uint64_t>(bytes[1]) << 8) |
      (static_cast<uint64_t>(bytes[2]) << 16) |
      (static_cast<uint64_t>(bytes[3]) << 24) |
      (static_cast<uint64_t>(bytes[4]) << 32) |
      (static_cast<uint64_t>(bytes[5]) << 40) |
      (static_cast<uint64_t>(bytes[6]) << 48) |
      (static_cast<uint64_t>(bytes[7]) << 56);
}

// Decodes a leading run of single-byte varints (bytes whose 0x80 continuation
// bit is clear, i.e. values 0-127) from `pos` into `output`.
//
// Works in three tiers, each of which stops as soon as it sees a byte with
// the continuation bit set:
//   1. 32 bytes (four 8-byte words) per iteration, for the common case where
//      most values are single-byte;
//   2. 8 bytes (one word) per iteration;
//   3. one byte per iteration for whatever is left.
//
// @tparam T       Output element type; each decoded byte is stored via
//                 static_cast<T>.
// @param n        Number of elements still to decode. NOTE(review): the word
//                 tiers read 8/32 bytes whenever n >= 8/32, which assumes at
//                 least n bytes are readable at `pos` (every remaining varint
//                 occupies at least one byte) - confirm against callers.
// @param pos      In/out read cursor; advanced by one byte per decoded
//                 element. On return it points at the first byte that was not
//                 consumed (a byte with 0x80 set, unless n reached 0).
// @param output   In/out write cursor; advanced by one element per decoded
//                 value. Must have room for the elements written.
// @return The number of elements remaining after processing (n minus the
//         number of elements decoded here).
template <typename T>
inline uint64_t
bulkDecodeSingleByteRun(uint64_t n, const char*& pos, T*& output) {
  // One continuation bit per byte lane of a 64-bit word.
  constexpr uint64_t kHighBits = 0x8080808080808080ULL;
  constexpr uint64_t batchSize = 32;
  constexpr uint64_t wordSize = 8;

  // Tier 1: 32 elements (4 words) at a time.
  while (n >= batchSize) {
    const uint64_t w0 = loadByteRunWord(pos);
    const uint64_t w1 = loadByteRunWord(pos + wordSize);
    const uint64_t w2 = loadByteRunWord(pos + 2 * wordSize);
    const uint64_t w3 = loadByteRunWord(pos + 3 * wordSize);
    // OR-ing the words first lets a single test cover all 32 bytes. Nothing
    // is consumed on failure; the finer-grained tiers below pick up the
    // single-byte prefix of this batch.
    if ((w0 | w1 | w2 | w3) & kHighBits) {
      break;
    }
    expandByteWord(w0, output);
    expandByteWord(w1, output + wordSize);
    expandByteWord(w2, output + (2 * wordSize));
    expandByteWord(w3, output + (3 * wordSize));
    pos += batchSize;
    output += batchSize;
    n -= batchSize;
  }

  // Tier 2: 8 elements (1 word) at a time.
  while (n >= wordSize) {
    const uint64_t word = loadByteRunWord(pos);
    if (word & kHighBits) {
      break;
    }
    expandByteWord(word, output);
    pos += wordSize;
    output += wordSize;
    n -= wordSize;
  }

  // Tier 3: trailing single-byte varints one at a time, up to (not
  // including) the first byte that has the continuation bit set.
  while (n > 0) {
    const uint8_t byte = static_cast<uint8_t>(*pos);
    if (byte & 0x80) {
      break;
    }
    *output++ = static_cast<T>(byte);
    ++pos;
    --n;
  }

  return n;
}
873916
0 commit comments