/*
* This file is part of the Aaru Data Preservation Suite.
* Copyright (c) 2019-2026 Natalia Portillo.
*
 * This file is in the public domain:
* https://github.com/rawrunprotected/crc
*/
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>
#include "library.h"
#include "arm_vmull.h"
#include "crc64.h"
static const uint8_t shuffleMasks[] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
};
TARGET_WITH_NEON FORCE_INLINE void shiftRight128(uint64x2_t in, size_t n, uint64x2_t *outLeft, uint64x2_t *outRight)
{
    const uint64x2_t maskA =
        vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(shuffleMasks + (16 - n))));
    // All-ones vector: comparing a zero vector against itself sets every bit
    const uint64x2_t b = vreinterpretq_u64_u8(vceqq_u8(vdupq_n_u8(0), vdupq_n_u8(0)));
    // Inverting the control mask swaps which bytes are kept and which are zeroed
    const uint64x2_t maskB = veorq_u64(maskA, b);
    // outLeft = in << (n bytes); outRight = in >> (16 - n bytes)
    *outLeft  = mm_shuffle_epi8(in, maskB);
    *outRight = mm_shuffle_epi8(in, maskA);
}
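/*
 * Worked example (hypothetical values): shiftRight128(in, 3, &l, &r) gives
 * l = in << 24 bits and r = in >> 104 bits, i.e. [r:l] is `in` placed three
 * bytes up inside a zero-padded 32-byte window.
 */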
TARGET_WITH_NEON FORCE_INLINE uint64x2_t fold(uint64x2_t in, uint64x2_t foldConstants)
{
    // Fold a 128-bit chunk forward over the next block: carryless-multiply the
    // low half by k1 (x^192 mod P, reflected) and the high half by k2 (x^128 mod P)
    return veorq_u64(sse2neon_vmull_p64(vget_low_u64(in), vget_low_u64(foldConstants)),
                     sse2neon_vmull_p64(vget_high_u64(in), vget_high_u64(foldConstants)));
}
/**
 * @brief Calculates the CRC-64 checksum using the vmull instruction.
 *
 * This function calculates the CRC-64 checksum of the given data using the
 * vmull carryless-multiply instruction for optimized performance. It takes
 * the previous CRC value, the data buffer, and the length of the data as
 * parameters and returns the resulting CRC-64 checksum.
 *
 * @param previous_crc The previous CRC value.
 * @param data The data buffer.
 * @param len The length of the data buffer.
 *
 * @return The CRC-64 checksum of the given data.
 */
AARU_EXPORT TARGET_WITH_NEON uint64_t AARU_CALL crc64_vmull(uint64_t previous_crc, const uint8_t *data, long len)
{
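    // The constants below appear to be derived from the CRC-64/ECMA-182
    // polynomial 0x42f0e1eba9ea3693 in its reflected form: p matches
    // (bitReflect(0x42f0e1eba9ea3693) << 1) | 1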
const uint64_t k1 = 0xe05dd497ca393ae4; // bitReflect(expMod65(128 + 64, poly, 1)) << 1;
const uint64_t k2 = 0xdabe95afc7875f40; // bitReflect(expMod65(128, poly, 1)) << 1;
const uint64_t mu = 0x9c3e466c172963d5; // (bitReflect(div129by65(poly)) << 1) | 1;
const uint64_t p = 0x92d8af2baf0e1e85; // (bitReflect(poly) << 1) | 1;
const uint64x2_t foldConstants1 = vcombine_u64(vcreate_u64(k1), vcreate_u64(k2));
const uint64x2_t foldConstants2 = vcombine_u64(vcreate_u64(mu), vcreate_u64(p));
const uint8_t *end = data + len;
// Align pointers
const uint64x2_t *alignedData = (const uint64x2_t *)((uintptr_t)data & ~(uintptr_t)15);
const uint64x2_t *alignedEnd = (const uint64x2_t *)(((uintptr_t)end + 15) & ~(uintptr_t)15);
const size_t leadInSize = data - (const uint8_t *)alignedData;
const size_t leadOutSize = (const uint8_t *)alignedEnd - end;
const size_t alignedLength = alignedEnd - alignedData;
    const uint64x2_t leadInMask =
        vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(shuffleMasks + (16 - leadInSize))));
    uint64x2_t a = vreinterpretq_u64_u32(vdupq_n_u32(0));
    uint64x2_t b = vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData));
    // Use a signed shift right to turn the sign bit of each lead-in mask byte
    // into a full-byte mask, then select the loaded bytes (b) inside the buffer
    // and zeros (a) for the lead-in bytes before it
    const uint64x2_t data0 =
        vreinterpretq_u64_u8(vbslq_u8(vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u64(leadInMask), 7)),
                                      vreinterpretq_u8_u64(b), vreinterpretq_u8_u64(a)));
const uint64x2_t initialCrc = vsetq_lane_u64(~previous_crc, vdupq_n_u64(0), 0);
uint64x2_t R;
if(alignedLength == 1)
{
// Single data block, initial CRC possibly bleeds into zero padding
uint64x2_t crc0, crc1;
shiftRight128(initialCrc, 16 - len, &crc0, &crc1);
uint64x2_t A, B;
shiftRight128(data0, leadOutSize, &A, &B);
const uint64x2_t P = veorq_u64(A, crc0);
R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)),
veorq_u64(mm_srli_si128(P, 8), mm_slli_si128(crc1, 8)));
}
else if(alignedLength == 2)
{
const uint64x2_t data1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(alignedData + 1)));
if(len < 8)
{
// Initial CRC bleeds into the zero padding
uint64x2_t crc0, crc1;
shiftRight128(initialCrc, 16 - len, &crc0, &crc1);
uint64x2_t A, B, C, D;
shiftRight128(data0, leadOutSize, &A, &B);
shiftRight128(data1, leadOutSize, &C, &D);
const uint64x2_t P = veorq_u64(veorq_u64(B, C), crc0);
R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)),
veorq_u64(mm_srli_si128(P, 8), mm_slli_si128(crc1, 8)));
}
else
{
// We can fit the initial CRC into the data without bleeding into the zero padding
uint64x2_t crc0, crc1;
shiftRight128(initialCrc, leadInSize, &crc0, &crc1);
uint64x2_t A, B, C, D;
shiftRight128(veorq_u64(data0, crc0), leadOutSize, &A, &B);
shiftRight128(veorq_u64(data1, crc1), leadOutSize, &C, &D);
const uint64x2_t P = veorq_u64(fold(A, foldConstants1), veorq_u64(B, C));
R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)), mm_srli_si128(P, 8));
}
}
else
{
alignedData++;
len -= 16 - leadInSize;
// Initial CRC can simply be added to data
uint64x2_t crc0, crc1;
shiftRight128(initialCrc, leadInSize, &crc0, &crc1);
uint64x2_t accumulator = veorq_u64(fold(veorq_u64(crc0, data0), foldConstants1), crc1);
while(len >= 32)
{
accumulator = fold(veorq_u64(vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData)), accumulator),
foldConstants1);
len -= 16;
alignedData++;
}
uint64x2_t P;
if(len == 16)
P = veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData)));
else
{
const uint64x2_t end0 =
veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData)));
const uint64x2_t end1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(alignedData + 1)));
uint64x2_t A, B, C, D;
shiftRight128(end0, leadOutSize, &A, &B);
shiftRight128(end1, leadOutSize, &C, &D);
P = veorq_u64(fold(A, foldConstants1),
vreinterpretq_u64_u32(vorrq_u32(vreinterpretq_u32_u64(B), vreinterpretq_u32_u64(C))));
}
R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)), mm_srli_si128(P, 8));
}
    // Final Barrett reduction: multiply by mu to estimate the quotient, then by
    // the polynomial p to subtract quotient * P; the 64-bit remainder lands in
    // the high lane
    const uint64x2_t T1 = sse2neon_vmull_p64(vget_low_u64(R), vget_low_u64(foldConstants2));
    const uint64x2_t T2 = veorq_u64(
        veorq_u64(sse2neon_vmull_p64(vget_low_u64(T1), vget_high_u64(foldConstants2)), mm_slli_si128(T1, 8)), R);
    return ~vgetq_lane_u64(T2, 1);
}
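#ifdef CRC64_VMULL_USAGE_EXAMPLE
/*
 * Hypothetical usage sketch (CRC64_VMULL_USAGE_EXAMPLE is not a real build
 * flag of this project). Because the function inverts the CRC on entry and
 * on return, a caller can seed with 0 and chain the returned value across
 * buffers; the chained result should equal the one-shot result.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
    const uint8_t buf[] = "123456789";
    const long    n     = (long)strlen((const char *)buf);
    // One-shot CRC over the whole buffer
    const uint64_t oneShot = crc64_vmull(0, buf, n);
    // Same data processed as two chunks, chaining the intermediate CRC
    uint64_t chained = crc64_vmull(0, buf, 4);
    chained          = crc64_vmull(chained, buf + 4, n - 4);
    printf("%016llx %016llx\n", (unsigned long long)oneShot, (unsigned long long)chained);
    return 0;
}
#endif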
#endif