proxygen
Crc32cDetail.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands
3  * This software is provided 'as-is', without any express or implied
4  * warranty. In no event will the author be held liable for any damages
5  * arising from the use of this software.
6  * Permission is granted to anyone to use this software for any purpose,
7  * including commercial applications, and to alter it and redistribute it
8  * freely, subject to the following restrictions:
9  * 1. The origin of this software must not be misrepresented; you must not
10  * claim that you wrote the original software. If you use this software
11  * in a product, an acknowledgment in the product documentation would be
12  * appreciated but is not required.
13  * 2. Altered source versions must be plainly marked as such, and must not be
14  * misrepresented as being the original software.
15  * 3. This notice may not be removed or altered from any source distribution.
16  * Ferry Toth
17  * ftoth@exalondelft.nl
18  *
19  * https://github.com/htot/crc32c
20  *
21  * Modified by Facebook
22  *
23  * Original intel whitepaper:
24  * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
25  * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
26  *
27  * 32-bit support dropped
28  * use intrinsics instead of inline asm
29  * other code cleanup
30  */
31 
32 #include <stdexcept>
33 
35 
36 #include <folly/CppAttributes.h>
37 
38 #include <boost/preprocessor/arithmetic/add.hpp>
39 #include <boost/preprocessor/arithmetic/sub.hpp>
40 #include <boost/preprocessor/repetition/repeat_from_to.hpp>
41 
42 namespace folly {
43 namespace detail {
44 
45 #if defined(FOLLY_X64) && FOLLY_SSE_PREREQ(4, 2)
46 
47 namespace crc32_detail {
48 
49 #define CRCtriplet(crc, buf, offset) \
50  crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
51  crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \
52  crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset)); \
53  FOLLY_FALLTHROUGH;
54 
55 #define CRCduplet(crc, buf, offset) \
56  crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
57  crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset));
58 
59 #define CRCsinglet(crc, buf, offset) \
60  crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset)); \
61  FOLLY_FALLTHROUGH;
62 
63 #define CASEREPEAT_TRIPLET(unused, count, total) \
64  case BOOST_PP_ADD(1, BOOST_PP_SUB(total, count)): \
65  CRCtriplet(crc, next, -BOOST_PP_ADD(1, BOOST_PP_SUB(total, count)));
66 
67 #define CASEREPEAT_SINGLET(unused, count, total) \
68  case BOOST_PP_SUB(total, count): \
69  CRCsinglet(crc0, next, -BOOST_PP_SUB(total, count) * 8);
70 
71 // Numbers taken directly from intel whitepaper.
72 // clang-format off
73 const uint64_t clmul_constants[] = {
74  0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6,
75  0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e,
76  0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da,
77  0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8,
78  0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296,
79  0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2,
80  0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6,
81  0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092,
82  0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0,
83  0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456,
84  0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e,
85  0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a,
86  0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574,
87  0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832,
88  0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124,
89  0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86,
90  0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e,
91  0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a,
92  0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46,
93  0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a,
94  0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a,
95  0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4,
96  0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56,
97  0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2,
98  0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c,
99  0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac,
100  0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64,
101  0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e,
102  0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c,
103  0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28,
104  0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26,
105  0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c,
106  0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c,
107  0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c,
108  0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4,
109  0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844,
110  0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c,
111  0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730,
112  0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c,
113  0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2,
114  0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2,
115  0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e,
116  0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a,
117  0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a,
118  0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a,
119  0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768,
120  0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4,
121  0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c,
122  0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba,
123  0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312,
124  0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544,
125  0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a,
126  0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e,
127  0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a,
128  0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c,
129  0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a,
130  0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6,
131  0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca,
132  0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888,
133  0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e,
134  0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528,
135  0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a,
136  0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e,
137  0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa,
138 };
139 // clang-format on
140 
141 /*
142  * CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well
143  * chosen constant and xor's these with the remaining CRC.
144  */
145 uint64_t CombineCRC(
146  size_t block_size,
147  uint64_t crc0,
148  uint64_t crc1,
149  uint64_t crc2,
150  const uint64_t* next2) {
151  const auto multiplier =
152  *(reinterpret_cast<const __m128i*>(clmul_constants) + block_size - 1);
153  const auto crc0_xmm = _mm_set_epi64x(0, crc0);
154  const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00);
155  const auto crc1_xmm = _mm_set_epi64x(0, crc1);
156  const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10);
157  const auto res = _mm_xor_si128(res0, res1);
158  crc0 = _mm_cvtsi128_si64(res);
159  crc0 = crc0 ^ *((uint64_t*)next2 - 1);
160  crc2 = _mm_crc32_u64(crc2, crc0);
161  return crc2;
162 }
163 
164 // Generates a block that will crc up to 7 bytes of unaligned data.
165 // Always inline to avoid overhead on small crc sizes.
166 FOLLY_ALWAYS_INLINE void align_to_8(
167  size_t align,
168  uint64_t& crc0, // crc so far, updated on return
169  const unsigned char*& next) { // next data pointer, updated on return
170  uint32_t crc32bit = static_cast<uint32_t>(crc0);
171  if (align & 0x04) {
172  crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next);
173  next += sizeof(uint32_t);
174  }
175  if (align & 0x02) {
176  crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*)next);
177  next += sizeof(uint16_t);
178  }
179  if (align & 0x01) {
180  crc32bit = _mm_crc32_u8(crc32bit, *(next));
181  next++;
182  }
183  crc0 = crc32bit;
184 }
185 
186 // The main loop for large crc sizes. Generates three crc32c
187 // streams, of varying block sizes, using a duff's device.
188 void triplet_loop(
189  size_t block_size,
190  uint64_t& crc0, // crc so far, updated on return
191  const unsigned char*& next, // next data pointer, updated on return
192  size_t n) { // block count
193  uint64_t crc1 = 0, crc2 = 0;
194  // points to the first byte of the next block
195  const uint64_t* next0 = (uint64_t*)next + block_size;
196  const uint64_t* next1 = next0 + block_size;
197  const uint64_t* next2 = next1 + block_size;
198 
199  // Use Duff's device, a for() loop inside a switch()
200  // statement. This needs to execute at least once, round len
201  // down to nearest triplet multiple
202  switch (block_size) {
203  case 128:
204  do {
205  // jumps here for a full block of len 128
206  CRCtriplet(crc, next, -128);
207 
208  // Generates case statements from 127 to 2 of form:
209  // case 127:
210  // CRCtriplet(crc, next, -127);
211  //
212  // MSVC has a max preprocessor expansion depth of 255, which is
213  // exceeded if this is a single statement.
214  BOOST_PP_REPEAT_FROM_TO(0, 64, CASEREPEAT_TRIPLET, 126);
215  BOOST_PP_REPEAT_FROM_TO(0, 62, CASEREPEAT_TRIPLET, 62);
216 
217  // For the last byte, the three crc32c streams must be combined
218  // using carry-less multiplication.
219  case 1:
220  CRCduplet(crc, next, -1); // the final triplet is actually only 2
221  crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2);
222  if (--n > 0) {
223  crc1 = crc2 = 0;
224  block_size = 128;
225  // points to the first byte of the next block
226  next0 = next2 + 128;
227  next1 = next0 + 128; // from here on all blocks are 128 long
228  next2 = next1 + 128;
229  }
231  case 0:;
232  } while (n > 0);
233  }
234 
235  next = (const unsigned char*)next2;
236 }
237 
238 } // namespace crc32_detail
239 
240 /* Compute CRC-32C using the Intel hardware instruction. */
241 FOLLY_TARGET_ATTRIBUTE("sse4.2")
242 uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) {
243  const unsigned char* next = (const unsigned char*)buf;
244  size_t count;
245  uint64_t crc0;
246  crc0 = crc;
247 
248  if (len >= 8) {
249  // if len > 216 then align and use triplets
250  if (len > 216) {
251  size_t align = (8 - (uintptr_t)next) & 7;
252  crc32_detail::align_to_8(align, crc0, next);
253  len -= align;
254 
255  count = len / 24; // number of triplets
256  len %= 24; // bytes remaining
257  size_t n = count >> 7; // #blocks = first block + full blocks
258  size_t block_size = count & 127;
259  if (block_size == 0) {
260  block_size = 128;
261  } else {
262  n++;
263  }
264 
265  // This is a separate function call mainly to stop
266  // clang from spilling registers.
267  crc32_detail::triplet_loop(block_size, crc0, next, n);
268  }
269 
270  size_t count2 = len >> 3;
271  len = len & 7;
272  next += (count2 * 8);
273 
274  // Generates a duff device for the last 128 bytes of aligned data.
275  switch (count2) {
276  // Generates case statements of the form:
277  // case 27:
278  // CRCsinglet(crc0, next, -27 * 8);
279  BOOST_PP_REPEAT_FROM_TO(0, 27, CASEREPEAT_SINGLET, 27);
280  case 0:;
281  }
282  }
283 
284  // compute the crc for up to seven trailing bytes
285  crc32_detail::align_to_8(len, crc0, next);
286  return (uint32_t)crc0;
287 }
288 
289 #else
290 
291 uint32_t
292 crc32c_hw(const uint8_t* /* buf */, size_t /* len */, uint32_t /* crc */) {
293  throw std::runtime_error("crc32_hw is not implemented on this platform");
294 }
295 
296 #endif
297 
298 } // namespace detail
299 } // namespace folly
#define FOLLY_ALWAYS_INLINE
Definition: CPortability.h:151
—— Concurrent Priority Queue Implementation ——
Definition: AtomicBitSet.h:29
uint32_t crc32c_hw(const uint8_t *data, size_t nbytes, uint32_t startingChecksum=~0U)
int * count
const
Definition: upload.py:398
#define FOLLY_FALLTHROUGH
Definition: CppAttributes.h:63
#define FOLLY_TARGET_ATTRIBUTE(target)
Definition: Portability.h:72
def next(obj)
Definition: ast.py:58