Skip to content

Commit 224cac5

Browse files
committed
Sync engine source with McBopomofo
This includes changes from: - openvanilla/McBopomofo#499 - openvanilla/McBopomofo#794 - openvanilla/McBopomofo#822 Also did a clang-format pass.
1 parent a01f482 commit 224cac5

12 files changed

Lines changed: 305 additions & 56 deletions

src/Engine/ByteBlockBackedDictionary.cpp

Lines changed: 224 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,16 @@
3131
#endif
3232
#endif
3333

34+
#ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON
35+
#if defined(__ARM_NEON)
36+
#include <arm_neon.h>
37+
38+
#include <cstdint>
39+
#else
40+
#error ARM NEON support required
41+
#endif
42+
#endif
43+
3444
#include "ByteBlockBackedDictionary.h"
3545

3646
namespace McBopomofo {
@@ -84,7 +94,8 @@ const char* AdvanceToNextNonContentCharacter(const char* ptr, const char* end) {
8494
return ptr;
8595
}
8696

87-
#ifndef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
97+
#if !defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512) && \
98+
!defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
8899
const char* FindFirstNULL(const char* ptr, const char* end,
89100
size_t* firstLineNumber = nullptr) {
90101
const char* i = ptr;
@@ -271,6 +282,188 @@ const char* AVX512_FindFirstNULL(const char* ptr, const char* end,
271282

272283
#endif
273284

285+
#ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON
286+
287+
// Returns the index of the first non-zero byte in v, or 16 if all zero
288+
static inline int FirstNonZeroLane16(uint8x16_t v) {
289+
if (vmaxvq_u8(v) == 0) {
290+
return 16;
291+
}
292+
// Scalar fallback for the position
293+
alignas(16) uint8_t tmp[16];
294+
vst1q_u8(tmp, v);
295+
for (int i = 0; i < 16; ++i) {
296+
if (tmp[i]) {
297+
return i;
298+
}
299+
}
300+
return 16;
301+
}
302+
303+
const char* NEON_AdvanceToNextCRLF(const char* ptr, const char* unaligned16End,
304+
const char* end) {
305+
const uint8x16_t lfs = vdupq_n_u8(static_cast<uint8_t>('\n'));
306+
const uint8x16_t crs = vdupq_n_u8(static_cast<uint8_t>('\r'));
307+
308+
while (ptr < unaligned16End) {
309+
const uint8x16_t block = vld1q_u8(reinterpret_cast<const uint8_t*>(ptr));
310+
const uint8x16_t matchLF = vceqq_u8(block, lfs);
311+
const uint8x16_t matchCR = vceqq_u8(block, crs);
312+
const uint8x16_t match = vorrq_u8(matchLF, matchCR);
313+
const int pos = FirstNonZeroLane16(match);
314+
if (pos < 16) {
315+
return ptr + pos;
316+
}
317+
ptr += 16;
318+
}
319+
320+
return AdvanceToNextCRLF(ptr, end);
321+
}
322+
323+
// Four chars: 0x09 (Tab), 0x0a (LF), 0x0d (CR), 0x20 (Space)
324+
// Tab maps to 0x01
325+
// LF maps to 0x02
326+
// CR maps to 0x04
327+
// Tab|LF|CR = 0x07
328+
// Space maps to 0x08
329+
alignas(16) constexpr uint8_t NEON_LO_NIBBLES_LOOKUP[16] = {
330+
0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
331+
0x00, 0x01, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00,
332+
};
333+
334+
alignas(16) constexpr uint8_t NEON_HI_NIBBLES_LOOKUP[16] = {
335+
0x07, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
336+
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
337+
};
338+
339+
const char* NEON_AdvanceToNextNonContentCharacter(const char* ptr,
340+
const char* unaligned16End,
341+
const char* end) {
342+
const uint8x16_t loTbl =
343+
vld1q_u8(reinterpret_cast<const uint8_t*>(NEON_LO_NIBBLES_LOOKUP));
344+
const uint8x16_t hiTbl =
345+
vld1q_u8(reinterpret_cast<const uint8_t*>(NEON_HI_NIBBLES_LOOKUP));
346+
const uint8x16_t nibbleMask = vdupq_n_u8(0x0f);
347+
348+
while (ptr < unaligned16End) {
349+
const uint8x16_t input = vld1q_u8(reinterpret_cast<const uint8_t*>(ptr));
350+
const uint8x16_t loNibbles = vandq_u8(input, nibbleMask);
351+
const uint8x16_t hiNibbles = vandq_u8(vshrq_n_u8(input, 4), nibbleMask);
352+
const uint8x16_t lo = vqtbl1q_u8(loTbl, loNibbles);
353+
const uint8x16_t hi = vqtbl1q_u8(hiTbl, hiNibbles);
354+
const uint8x16_t intersection = vandq_u8(lo, hi);
355+
// non-content characters have a non-zero intersection
356+
const int pos = FirstNonZeroLane16(intersection);
357+
if (pos < 16) {
358+
return ptr + pos;
359+
}
360+
ptr += 16;
361+
}
362+
363+
return AdvanceToNextNonContentCharacter(ptr, end);
364+
}
365+
366+
constexpr uintptr_t NEON_ALIGN16 = 16;
367+
constexpr uintptr_t NEON_ALIGN16_MASK = NEON_ALIGN16 - 1;
368+
369+
const char* NEON_FindFirstNULL(const char* ptr, const char* end,
370+
size_t* firstLineNumber = nullptr) {
371+
const char* i = ptr;
372+
bool found = false;
373+
374+
// Handle unaligned head
375+
if ((reinterpret_cast<uintptr_t>(i) & NEON_ALIGN16_MASK) != 0) {
376+
const char* headEnd = reinterpret_cast<const char*>(
377+
(reinterpret_cast<uintptr_t>(i) + NEON_ALIGN16_MASK) &
378+
~NEON_ALIGN16_MASK);
379+
headEnd = headEnd < end ? headEnd : end;
380+
while (i != headEnd) {
381+
if (*i == '\0') {
382+
found = true;
383+
break;
384+
}
385+
++i;
386+
}
387+
}
388+
389+
if (!found && i != end) {
390+
const char* middleEnd = reinterpret_cast<const char*>(
391+
reinterpret_cast<uintptr_t>(end) & ~NEON_ALIGN16_MASK);
392+
const uint8x16_t zeros = vdupq_n_u8(0);
393+
while (i < middleEnd) {
394+
const uint8x16_t block = vld1q_u8(reinterpret_cast<const uint8_t*>(i));
395+
const uint8x16_t match = vceqq_u8(block, zeros);
396+
const int pos = FirstNonZeroLane16(match);
397+
if (pos < 16) {
398+
i += pos;
399+
found = true;
400+
break;
401+
}
402+
i += NEON_ALIGN16;
403+
}
404+
}
405+
406+
// Handle tail
407+
if (!found) {
408+
while (i != end) {
409+
if (*i == '\0') {
410+
found = true;
411+
break;
412+
}
413+
++i;
414+
}
415+
}
416+
417+
if (!found || firstLineNumber == nullptr) {
418+
return i;
419+
}
420+
421+
// Count newlines from ptr to i
422+
size_t lineCounter = 1;
423+
const char* p = ptr;
424+
425+
// Scalar head
426+
if ((reinterpret_cast<uintptr_t>(p) & NEON_ALIGN16_MASK) != 0) {
427+
const char* headEnd = reinterpret_cast<const char*>(
428+
(reinterpret_cast<uintptr_t>(p) + NEON_ALIGN16_MASK) &
429+
~NEON_ALIGN16_MASK);
430+
headEnd = headEnd < i ? headEnd : i;
431+
while (p != headEnd) {
432+
if (*p == '\n') {
433+
++lineCounter;
434+
}
435+
++p;
436+
}
437+
}
438+
439+
// NEON middle
440+
if (p != i) {
441+
const char* middleEnd = reinterpret_cast<const char*>(
442+
reinterpret_cast<uintptr_t>(i) & ~NEON_ALIGN16_MASK);
443+
const uint8x16_t linefeeds = vdupq_n_u8(static_cast<uint8_t>('\n'));
444+
while (p < middleEnd) {
445+
const uint8x16_t block = vld1q_u8(reinterpret_cast<const uint8_t*>(p));
446+
const uint8x16_t match = vceqq_u8(block, linefeeds);
447+
// Count set bytes
448+
lineCounter += vaddvq_u8(vshrq_n_u8(match, 7));
449+
p += NEON_ALIGN16;
450+
}
451+
}
452+
453+
// Scalar tail
454+
while (p != i) {
455+
if (*p == '\n') {
456+
++lineCounter;
457+
}
458+
++p;
459+
}
460+
461+
*firstLineNumber = lineCounter;
462+
return i;
463+
}
464+
465+
#endif
466+
274467
} // namespace
275468

276469
void ByteBlockBackedDictionary::clear() {
@@ -305,12 +498,20 @@ bool ByteBlockBackedDictionary::parse(const char* block, size_t size,
305498
if (unaligned32End < ptr) {
306499
unaligned32End = ptr;
307500
}
501+
#elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
502+
const char* unaligned16End =
503+
reinterpret_cast<const char*>(reinterpret_cast<uintptr_t>(end) - 16);
504+
if (unaligned16End < ptr) {
505+
unaligned16End = ptr;
506+
}
308507
#endif
309508

310509
// Validate that no NULL characters are in the text.
311510
size_t errorAtLine = 0;
312-
#ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
511+
#if defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
313512
const char* ctrlCharPtr = AVX512_FindFirstNULL(ptr, end, &errorAtLine);
513+
#elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
514+
const char* ctrlCharPtr = NEON_FindFirstNULL(ptr, end, &errorAtLine);
314515
#else
315516
const char* ctrlCharPtr = FindFirstNULL(ptr, end, &errorAtLine);
316517
#endif
@@ -330,17 +531,21 @@ bool ByteBlockBackedDictionary::parse(const char* block, size_t size,
330531
}
331532

332533
if (*ptr == '#') {
333-
#ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
534+
#if defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
334535
ptr = AVX512_AdvanceToNextCRLF(ptr, unaligned32End, end);
536+
#elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
537+
ptr = NEON_AdvanceToNextCRLF(ptr, unaligned16End, end);
335538
#else
336539
ptr = AdvanceToNextCRLF(ptr, end);
337540
#endif
338541
continue;
339542
}
340543

341544
const char* keyStart = ptr;
342-
#ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
545+
#if defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
343546
ptr = AVX512_AdvanceToNextNonContentCharacter(ptr, unaligned32End, end);
547+
#elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
548+
ptr = NEON_AdvanceToNextNonContentCharacter(ptr, unaligned16End, end);
344549
#else
345550
ptr = AdvanceToNextNonContentCharacter(ptr, end);
346551
#endif
@@ -356,8 +561,10 @@ bool ByteBlockBackedDictionary::parse(const char* block, size_t size,
356561
}
357562

358563
const char* valueStart = ptr;
359-
#ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
564+
#if defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
360565
ptr = AVX512_AdvanceToNextCRLF(ptr, unaligned32End, end);
566+
#elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
567+
ptr = NEON_AdvanceToNextCRLF(ptr, unaligned16End, end);
361568
#else
362569
ptr = AdvanceToNextCRLF(ptr, end);
363570
#endif
@@ -405,17 +612,21 @@ bool ByteBlockBackedDictionary::parse(const char* block, size_t size,
405612
}
406613

407614
if (*ptr == '#') {
408-
#ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
615+
#if defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
409616
ptr = AVX512_AdvanceToNextCRLF(ptr, unaligned32End, end);
617+
#elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
618+
ptr = NEON_AdvanceToNextCRLF(ptr, unaligned16End, end);
410619
#else
411620
ptr = AdvanceToNextCRLF(ptr, end);
412621
#endif
413622
continue;
414623
}
415624

416625
const char* valueStart = ptr;
417-
#ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
626+
#if defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
418627
ptr = AVX512_AdvanceToNextNonContentCharacter(ptr, unaligned32End, end);
628+
#elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
629+
ptr = NEON_AdvanceToNextNonContentCharacter(ptr, unaligned16End, end);
419630
#else
420631
ptr = AdvanceToNextNonContentCharacter(ptr, end);
421632
#endif
@@ -430,8 +641,10 @@ bool ByteBlockBackedDictionary::parse(const char* block, size_t size,
430641
}
431642

432643
const char* maybeKeyStart = ptr;
433-
#ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
644+
#if defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
434645
ptr = AVX512_AdvanceToNextNonContentCharacter(ptr, unaligned32End, end);
646+
#elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
647+
ptr = NEON_AdvanceToNextNonContentCharacter(ptr, unaligned16End, end);
435648
#else
436649
ptr = AdvanceToNextNonContentCharacter(ptr, end);
437650
#endif
@@ -457,8 +670,10 @@ bool ByteBlockBackedDictionary::parse(const char* block, size_t size,
457670
// More content incoming.
458671
valueEnd = maybeKeyEnd;
459672
maybeKeyStart = ptr;
460-
#ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
673+
#if defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
461674
ptr = AVX512_AdvanceToNextNonContentCharacter(ptr, unaligned32End, end);
675+
#elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
676+
ptr = NEON_AdvanceToNextNonContentCharacter(ptr, unaligned16End, end);
462677
#else
463678
ptr = AdvanceToNextNonContentCharacter(ptr, end);
464679
#endif

src/Engine/CMakeLists.txt

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,19 @@ if (ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
4646
endif()
4747
endif ()
4848

49+
# NEON is supported by default on AArch64 (ARM64), so we can enable it automatically
50+
# Users can disable it by setting ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON to OFF
51+
# if (NOT DEFINED ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
52+
# if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
53+
# message(STATUS "Detected architecture: ${CMAKE_SYSTEM_PROCESSOR}, enabling NEON support automatically")
54+
# set(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON ON)
55+
# endif ()
56+
# endif ()
57+
58+
if (ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
59+
target_compile_definitions(McBopomofoLMLib PRIVATE ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON=1)
60+
endif ()
61+
4962
if (ENABLE_TEST)
5063
enable_testing()
5164
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
@@ -75,17 +88,43 @@ if (ENABLE_TEST)
7588
)
7689
add_dependencies(runMcBopomofoLMLibTest McBopomofoLMLibTest)
7790

78-
# Benchmark for ByteBlockBackedDictionary; not enabled by default
79-
#
80-
# find_package(benchmark)
81-
# add_executable(ByteBlockBackedDictionaryBenchmark
82-
# ByteBlockBackedDictionaryBenchmark.cpp)
83-
# target_link_libraries(ByteBlockBackedDictionaryBenchmark McBopomofoLMLib benchmark::benchmark)
84-
85-
# Stressed benchmark for ParselessLM; not enabled by default
86-
#
87-
# find_package(benchmark)
88-
# add_executable(ParselessLMBenchmark
89-
# ParselessLMBenchmark.cpp)
90-
# target_link_libraries(ParselessLMBenchmark McBopomofoLMLib benchmark::benchmark)
91+
if (ENABLE_BENCHMARK)
92+
set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
93+
set(BENCHMARK_ENABLE_GTEST_TESTS OFF CACHE BOOL "" FORCE)
94+
set(BENCHMARK_VERSION "v1.9.5")
95+
MESSAGE(STATUS "Fetching Google Benchmark ${BENCHMARK_VERSION} from GitHub")
96+
include(FetchContent)
97+
FetchContent_Declare(
98+
benchmark
99+
URL "https://github.com/google/benchmark/archive/refs/tags/${BENCHMARK_VERSION}.zip"
100+
)
101+
FetchContent_MakeAvailable(benchmark)
102+
103+
add_executable(ByteBlockBackedDictionaryBenchmark
104+
ByteBlockBackedDictionaryBenchmark.cpp)
105+
target_link_libraries(ByteBlockBackedDictionaryBenchmark McBopomofoLMLib benchmark::benchmark)
106+
107+
if (ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
108+
target_compile_definitions(ByteBlockBackedDictionaryBenchmark PRIVATE ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON=1)
109+
endif ()
110+
if (ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
111+
target_compile_options(ByteBlockBackedDictionaryBenchmark PRIVATE -mavx512f -march=native -DENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512=1)
112+
endif ()
113+
114+
add_custom_target(
115+
runByteBlockBackedDictionaryBenchmark
116+
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ByteBlockBackedDictionaryBenchmark
117+
)
118+
add_dependencies(runByteBlockBackedDictionaryBenchmark ByteBlockBackedDictionaryBenchmark)
119+
120+
add_executable(ParselessLMBenchmark
121+
ParselessLMBenchmark.cpp)
122+
target_link_libraries(ParselessLMBenchmark McBopomofoLMLib benchmark::benchmark)
123+
124+
add_custom_target(
125+
runParselessLMBenchmark
126+
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ParselessLMBenchmark
127+
)
128+
add_dependencies(runParselessLMBenchmark ParselessLMBenchmark)
129+
endif ()
91130
endif ()

0 commit comments

Comments
 (0)