/*
 * Hardware CRC-32 over `vec_count` 16-byte vectors starting at `p`, folding
 * 128-bit lanes with the PCLMULQDQ carry-less multiply (_mm_clmulepi64_si128)
 * and finishing with a Barrett reduction.
 *
 * NOTE(review): this chunk is a fragment — the return type/attributes of the
 * signature and several spans of the body (the initial loads of x0..x3, the
 * `_128_bits_at_a_time` label and its loop, the per-iteration loads of
 * y0..y3, and the tail handling) are not visible here. Comments below
 * describe only the visible lines.
 *
 * The constant pairs appear to be the precomputed x^k mod P(x) folding
 * multipliers for a bit-reflected CRC-32 polynomial, in the style of
 * Gopal et al., "Fast CRC Computation for Generic Polynomials Using
 * PCLMULQDQ" — TODO confirm which polynomial against the enclosing file.
 */
118 #if FOLLY_SSE_PREREQ(4, 2) 121 crc32_hw_aligned(
uint32_t remainder,
const __m128i* p,
size_t vec_count) {
/* Folding multipliers: _4 folds a vector across 4 vectors (64 bytes),
 * _2 across 2 vectors (32 bytes), _1 across 1 vector (16 bytes). Each
 * holds two 64-bit constants selected by the clmul immediate (0x00/0x11). */
123 const __m128i multipliers_4 = _mm_set_epi32(0, 0x1D9513D7, 0, 0x8F352D95);
124 const __m128i multipliers_2 = _mm_set_epi32(0, 0x81256527, 0, 0xF1DA05AA);
125 const __m128i multipliers_1 = _mm_set_epi32(0, 0xCCAA009E, 0, 0xAE689191);
/* Multiplier used to reduce the last 96 bits down to 64 (see below). */
126 const __m128i final_multiplier = _mm_set_epi32(0, 0, 0, 0xB8BC6765);
/* Mask selecting the low 32 bits of a lane. */
127 const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
/* Barrett constants: low qword = mu (floor(x^64 / P(x))), high qword =
 * P(x); both are 33-bit values, hence the leading 0x1 words. */
128 const __m128i barrett_reduction_constants =
129 _mm_set_epi32(0x1, 0xDB710641, 0x1, 0xF7011641);
/* `end` = one past the last input vector; `end512` = end of the portion
 * processable in full 4-vector (64-byte) groups (vec_count rounded down
 * to a multiple of 4). */
131 const __m128i*
const end = p + vec_count;
132 const __m128i*
const end512 = p + (vec_count & ~3);
133 __m128i x0, x1, x2, x3;
/* [lines 134-156 not visible: presumably load x0..x3 from p and advance p] */
/* Inject the caller's running CRC remainder into the low 32 bits of the
 * first data vector. */
157 x0 = _mm_xor_si128(x0, _mm_set_epi32(0, 0, 0, remainder));
/* [lines 158-159, 161-165 not visible; the label and the branch condition
 * around this goto are outside this fragment] */
160 goto _128_bits_at_a_time;
/* Main loop: fold the 4 accumulated vectors x0..x3 forward over each new
 * 64-byte group y0..y3. Each fold is clmul(low qword)*K XOR
 * clmul(high qword)*K' XOR new-data (loads of y0..y3 are in the missing
 * lines 168-180). */
166 for (; p != end512; p += 4) {
167 __m128i y0, y1, y2, y3;
181 y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x00));
182 y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x00));
183 y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x00));
184 y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x00));
185 y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x11));
186 y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x11));
187 y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x11));
188 y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x11));
/* [lines 189-196 not visible: presumably x0..x3 = y0..y3 / loop close] */
/* After the 4-wide loop: collapse 4 accumulators to 2 (fold x0,x1 into
 * x2,x3 over a 32-byte distance), then 2 to 1 (fold x2 into x3). */
197 x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x00));
198 x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x00));
199 x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x11));
200 x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x11));
201 x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x00));
202 x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x11));
/* [lines 203-208 not visible; this pair appears to be the 16-bytes-at-a-
 * time tail fold that the `goto _128_bits_at_a_time` above targets] */
209 x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x00));
210 x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x11));
/* [lines 211-221 not visible] */
/* Reduce 128 bits to 96: fold the low qword up, keep the high 64 bits. */
222 _mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, multipliers_1, 0x10));
/* Reduce 96 bits to 64: multiply the low 32 bits by final_multiplier and
 * combine with the remaining bits shifted down 4 bytes. */
226 _mm_srli_si128(x0, 4),
227 _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), final_multiplier, 0x00));
/* [lines 228-271 not visible] */
/* Barrett reduction 64 -> 32 bits: t = (x mod x^32) * mu, then
 * r = (t mod x^32) * P(x); the remainder is bits [32..63] of x XOR r,
 * extracted by the 4-byte shift before _mm_cvtsi128_si32. */
272 x0 = _mm_clmulepi64_si128(
273 _mm_and_si128(x0, mask32), barrett_reduction_constants, 0x00);
274 x0 = _mm_clmulepi64_si128(
275 _mm_and_si128(x0, mask32), barrett_reduction_constants, 0x10);
276 return _mm_cvtsi128_si32(_mm_srli_si128(_mm_xor_si128(x0, x1), 4));
—— Concurrent Priority Queue Implementation ——
auto end(TestAdlIterable &instance)