3131#endif
3232#endif
3333
34+ #ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON
35+ #if defined(__ARM_NEON)
36+ #include < arm_neon.h>
37+
38+ #include < cstdint>
39+ #else
40+ #error ARM NEON support required
41+ #endif
42+ #endif
43+
3444#include " ByteBlockBackedDictionary.h"
3545
3646namespace McBopomofo {
@@ -84,7 +94,8 @@ const char* AdvanceToNextNonContentCharacter(const char* ptr, const char* end) {
8494 return ptr;
8595}
8696
87- #ifndef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
97+ #if !defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512) && \
98+ !defined (ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
8899const char * FindFirstNULL (const char * ptr, const char * end,
89100 size_t * firstLineNumber = nullptr ) {
90101 const char * i = ptr;
@@ -271,6 +282,188 @@ const char* AVX512_FindFirstNULL(const char* ptr, const char* end,
271282
272283#endif
273284
285+ #ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON
286+
287+ // Returns the index of the first non-zero byte in v, or 16 if all zero
288+ static inline int FirstNonZeroLane16 (uint8x16_t v) {
289+ if (vmaxvq_u8 (v) == 0 ) {
290+ return 16 ;
291+ }
292+ // Scalar fallback for the position
293+ alignas (16 ) uint8_t tmp[16 ];
294+ vst1q_u8 (tmp, v);
295+ for (int i = 0 ; i < 16 ; ++i) {
296+ if (tmp[i]) {
297+ return i;
298+ }
299+ }
300+ return 16 ;
301+ }
302+
303+ const char * NEON_AdvanceToNextCRLF (const char * ptr, const char * unaligned16End,
304+ const char * end) {
305+ const uint8x16_t lfs = vdupq_n_u8 (static_cast <uint8_t >(' \n ' ));
306+ const uint8x16_t crs = vdupq_n_u8 (static_cast <uint8_t >(' \r ' ));
307+
308+ while (ptr < unaligned16End) {
309+ const uint8x16_t block = vld1q_u8 (reinterpret_cast <const uint8_t *>(ptr));
310+ const uint8x16_t matchLF = vceqq_u8 (block, lfs);
311+ const uint8x16_t matchCR = vceqq_u8 (block, crs);
312+ const uint8x16_t match = vorrq_u8 (matchLF, matchCR);
313+ const int pos = FirstNonZeroLane16 (match);
314+ if (pos < 16 ) {
315+ return ptr + pos;
316+ }
317+ ptr += 16 ;
318+ }
319+
320+ return AdvanceToNextCRLF (ptr, end);
321+ }
322+
323+ // Four chars: 0x09 (Tab), 0x0a (LF), 0x0d (CR), 0x20 (Space)
324+ // Tab maps to 0x01
325+ // LF maps to 0x02
326+ // CR maps to 0x04
327+ // Tab|LF|CR = 0x07
328+ // Space maps to 0x08
329+ alignas (16 ) constexpr uint8_t NEON_LO_NIBBLES_LOOKUP[16 ] = {
330+ 0x08 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
331+ 0x00 , 0x01 , 0x02 , 0x00 , 0x00 , 0x04 , 0x00 , 0x00 ,
332+ };
333+
334+ alignas (16 ) constexpr uint8_t NEON_HI_NIBBLES_LOOKUP[16 ] = {
335+ 0x07 , 0x00 , 0x08 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
336+ 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
337+ };
338+
339+ const char * NEON_AdvanceToNextNonContentCharacter (const char * ptr,
340+ const char * unaligned16End,
341+ const char * end) {
342+ const uint8x16_t loTbl =
343+ vld1q_u8 (reinterpret_cast <const uint8_t *>(NEON_LO_NIBBLES_LOOKUP));
344+ const uint8x16_t hiTbl =
345+ vld1q_u8 (reinterpret_cast <const uint8_t *>(NEON_HI_NIBBLES_LOOKUP));
346+ const uint8x16_t nibbleMask = vdupq_n_u8 (0x0f );
347+
348+ while (ptr < unaligned16End) {
349+ const uint8x16_t input = vld1q_u8 (reinterpret_cast <const uint8_t *>(ptr));
350+ const uint8x16_t loNibbles = vandq_u8 (input, nibbleMask);
351+ const uint8x16_t hiNibbles = vandq_u8 (vshrq_n_u8 (input, 4 ), nibbleMask);
352+ const uint8x16_t lo = vqtbl1q_u8 (loTbl, loNibbles);
353+ const uint8x16_t hi = vqtbl1q_u8 (hiTbl, hiNibbles);
354+ const uint8x16_t intersection = vandq_u8 (lo, hi);
355+ // non-content characters have a non-zero intersection
356+ const int pos = FirstNonZeroLane16 (intersection);
357+ if (pos < 16 ) {
358+ return ptr + pos;
359+ }
360+ ptr += 16 ;
361+ }
362+
363+ return AdvanceToNextNonContentCharacter (ptr, end);
364+ }
365+
366+ constexpr uintptr_t NEON_ALIGN16 = 16 ;
367+ constexpr uintptr_t NEON_ALIGN16_MASK = NEON_ALIGN16 - 1 ;
368+
369+ const char * NEON_FindFirstNULL (const char * ptr, const char * end,
370+ size_t * firstLineNumber = nullptr ) {
371+ const char * i = ptr;
372+ bool found = false ;
373+
374+ // Handle unaligned head
375+ if ((reinterpret_cast <uintptr_t >(i) & NEON_ALIGN16_MASK) != 0 ) {
376+ const char * headEnd = reinterpret_cast <const char *>(
377+ (reinterpret_cast <uintptr_t >(i) + NEON_ALIGN16_MASK) &
378+ ~NEON_ALIGN16_MASK);
379+ headEnd = headEnd < end ? headEnd : end;
380+ while (i != headEnd) {
381+ if (*i == ' \0 ' ) {
382+ found = true ;
383+ break ;
384+ }
385+ ++i;
386+ }
387+ }
388+
389+ if (!found && i != end) {
390+ const char * middleEnd = reinterpret_cast <const char *>(
391+ reinterpret_cast <uintptr_t >(end) & ~NEON_ALIGN16_MASK);
392+ const uint8x16_t zeros = vdupq_n_u8 (0 );
393+ while (i < middleEnd) {
394+ const uint8x16_t block = vld1q_u8 (reinterpret_cast <const uint8_t *>(i));
395+ const uint8x16_t match = vceqq_u8 (block, zeros);
396+ const int pos = FirstNonZeroLane16 (match);
397+ if (pos < 16 ) {
398+ i += pos;
399+ found = true ;
400+ break ;
401+ }
402+ i += NEON_ALIGN16;
403+ }
404+ }
405+
406+ // Handle tail
407+ if (!found) {
408+ while (i != end) {
409+ if (*i == ' \0 ' ) {
410+ found = true ;
411+ break ;
412+ }
413+ ++i;
414+ }
415+ }
416+
417+ if (!found || firstLineNumber == nullptr ) {
418+ return i;
419+ }
420+
421+ // Count newlines from ptr to i
422+ size_t lineCounter = 1 ;
423+ const char * p = ptr;
424+
425+ // Scalar head
426+ if ((reinterpret_cast <uintptr_t >(p) & NEON_ALIGN16_MASK) != 0 ) {
427+ const char * headEnd = reinterpret_cast <const char *>(
428+ (reinterpret_cast <uintptr_t >(p) + NEON_ALIGN16_MASK) &
429+ ~NEON_ALIGN16_MASK);
430+ headEnd = headEnd < i ? headEnd : i;
431+ while (p != headEnd) {
432+ if (*p == ' \n ' ) {
433+ ++lineCounter;
434+ }
435+ ++p;
436+ }
437+ }
438+
439+ // NEON middle
440+ if (p != i) {
441+ const char * middleEnd = reinterpret_cast <const char *>(
442+ reinterpret_cast <uintptr_t >(i) & ~NEON_ALIGN16_MASK);
443+ const uint8x16_t linefeeds = vdupq_n_u8 (static_cast <uint8_t >(' \n ' ));
444+ while (p < middleEnd) {
445+ const uint8x16_t block = vld1q_u8 (reinterpret_cast <const uint8_t *>(p));
446+ const uint8x16_t match = vceqq_u8 (block, linefeeds);
447+ // Count set bytes
448+ lineCounter += vaddvq_u8 (vshrq_n_u8 (match, 7 ));
449+ p += NEON_ALIGN16;
450+ }
451+ }
452+
453+ // Scalar tail
454+ while (p != i) {
455+ if (*p == ' \n ' ) {
456+ ++lineCounter;
457+ }
458+ ++p;
459+ }
460+
461+ *firstLineNumber = lineCounter;
462+ return i;
463+ }
464+
465+ #endif
466+
274467} // namespace
275468
276469void ByteBlockBackedDictionary::clear () {
@@ -305,12 +498,20 @@ bool ByteBlockBackedDictionary::parse(const char* block, size_t size,
305498 if (unaligned32End < ptr) {
306499 unaligned32End = ptr;
307500 }
501+ #elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
502+ const char * unaligned16End =
503+ reinterpret_cast <const char *>(reinterpret_cast <uintptr_t >(end) - 16 );
504+ if (unaligned16End < ptr) {
505+ unaligned16End = ptr;
506+ }
308507#endif
309508
310509 // Validate that no NULL characters are in the text.
311510 size_t errorAtLine = 0 ;
312- #ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
511+ #if defined( ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
313512 const char * ctrlCharPtr = AVX512_FindFirstNULL (ptr, end, &errorAtLine);
513+ #elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
514+ const char * ctrlCharPtr = NEON_FindFirstNULL (ptr, end, &errorAtLine);
314515#else
315516 const char * ctrlCharPtr = FindFirstNULL (ptr, end, &errorAtLine);
316517#endif
@@ -330,17 +531,21 @@ bool ByteBlockBackedDictionary::parse(const char* block, size_t size,
330531 }
331532
332533 if (*ptr == ' #' ) {
333- #ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
534+ #if defined( ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
334535 ptr = AVX512_AdvanceToNextCRLF (ptr, unaligned32End, end);
536+ #elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
537+ ptr = NEON_AdvanceToNextCRLF (ptr, unaligned16End, end);
335538#else
336539 ptr = AdvanceToNextCRLF (ptr, end);
337540#endif
338541 continue ;
339542 }
340543
341544 const char * keyStart = ptr;
342- #ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
545+ #if defined( ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
343546 ptr = AVX512_AdvanceToNextNonContentCharacter (ptr, unaligned32End, end);
547+ #elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
548+ ptr = NEON_AdvanceToNextNonContentCharacter (ptr, unaligned16End, end);
344549#else
345550 ptr = AdvanceToNextNonContentCharacter (ptr, end);
346551#endif
@@ -356,8 +561,10 @@ bool ByteBlockBackedDictionary::parse(const char* block, size_t size,
356561 }
357562
358563 const char * valueStart = ptr;
359- #ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
564+ #if defined( ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
360565 ptr = AVX512_AdvanceToNextCRLF (ptr, unaligned32End, end);
566+ #elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
567+ ptr = NEON_AdvanceToNextCRLF (ptr, unaligned16End, end);
361568#else
362569 ptr = AdvanceToNextCRLF (ptr, end);
363570#endif
@@ -405,17 +612,21 @@ bool ByteBlockBackedDictionary::parse(const char* block, size_t size,
405612 }
406613
407614 if (*ptr == ' #' ) {
408- #ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
615+ #if defined( ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
409616 ptr = AVX512_AdvanceToNextCRLF (ptr, unaligned32End, end);
617+ #elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
618+ ptr = NEON_AdvanceToNextCRLF (ptr, unaligned16End, end);
410619#else
411620 ptr = AdvanceToNextCRLF (ptr, end);
412621#endif
413622 continue ;
414623 }
415624
416625 const char * valueStart = ptr;
417- #ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
626+ #if defined( ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
418627 ptr = AVX512_AdvanceToNextNonContentCharacter (ptr, unaligned32End, end);
628+ #elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
629+ ptr = NEON_AdvanceToNextNonContentCharacter (ptr, unaligned16End, end);
419630#else
420631 ptr = AdvanceToNextNonContentCharacter (ptr, end);
421632#endif
@@ -430,8 +641,10 @@ bool ByteBlockBackedDictionary::parse(const char* block, size_t size,
430641 }
431642
432643 const char * maybeKeyStart = ptr;
433- #ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
644+ #if defined( ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
434645 ptr = AVX512_AdvanceToNextNonContentCharacter (ptr, unaligned32End, end);
646+ #elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
647+ ptr = NEON_AdvanceToNextNonContentCharacter (ptr, unaligned16End, end);
435648#else
436649 ptr = AdvanceToNextNonContentCharacter (ptr, end);
437650#endif
@@ -457,8 +670,10 @@ bool ByteBlockBackedDictionary::parse(const char* block, size_t size,
457670 // More content incoming.
458671 valueEnd = maybeKeyEnd;
459672 maybeKeyStart = ptr;
460- #ifdef ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512
673+ #if defined( ENABLE_EXPERIMENTAL_SIMD_SUPPORT_AVX512)
461674 ptr = AVX512_AdvanceToNextNonContentCharacter (ptr, unaligned32End, end);
675+ #elif defined(ENABLE_EXPERIMENTAL_SIMD_SUPPORT_NEON)
676+ ptr = NEON_AdvanceToNextNonContentCharacter (ptr, unaligned16End, end);
462677#else
463678 ptr = AdvanceToNextNonContentCharacter (ptr, end);
464679#endif
0 commit comments