/* * (C) 2025 Jack Lloyd * * Botan is released under the Simplified BSD License (see license.txt) */ #include #include #include #include #include namespace Botan { namespace SM4_AVX512_GFNI { namespace { template BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI SIMD_T sm4_sbox(const SIMD_T& x) { /* * See https://eprint.iacr.org/2022/1154 section 3.3 for details on * how this works */ constexpr uint64_t pre_a = gfni_matrix(R"( 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 1 1 1 1 0 1 0 0 1 1 1 0 1 0 1 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0)"); constexpr uint8_t pre_c = 0b00111110; constexpr uint64_t post_a = gfni_matrix(R"( 1 1 0 0 1 1 1 1 1 1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 1 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 0 0 1)"); constexpr uint8_t post_c = 0b11010011; auto y = gf2p8affine(x); return gf2p8affineinv(y); } template BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI SIMD_T sm4_f(const SIMD_T& x) { const auto sx = sm4_sbox(x); return sx ^ sx.template rotl<2>() ^ sx.template rotl<10>() ^ sx.template rotl<18>() ^ sx.template rotl<24>(); } template BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void encrypt(const uint8_t ptext[16 * 4 * M], uint8_t ctext[16 * 4 * M], std::span RK) { SIMD_T B0 = SIMD_T::load_be(ptext); SIMD_T B1 = SIMD_T::load_be(ptext + 16 * M); SIMD_T B2 = SIMD_T::load_be(ptext + 16 * 2 * M); SIMD_T B3 = SIMD_T::load_be(ptext + 16 * 3 * M); SIMD_T::transpose(B0, B1, B2, B3); B0 = B0.rev_words(); B1 = B1.rev_words(); B2 = B2.rev_words(); B3 = B3.rev_words(); for(size_t j = 0; j != 8; ++j) { B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[4 * j])); B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[4 * j + 1])); B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[4 * j + 2])); B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[4 * j + 3])); } SIMD_T::transpose(B0, B1, B2, B3); B3.rev_words().store_be(ctext); B2.rev_words().store_be(ctext + 16 * M); B1.rev_words().store_be(ctext + 16 * 2 * M); B0.rev_words().store_be(ctext + 16 * 3 * M); } template BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void encrypt_x2(const uint8_t ptext[32 * 4 * M], uint8_t ctext[32 * 4 * M], std::span RK) { SIMD_T B0 = SIMD_T::load_be(ptext); SIMD_T B1 = SIMD_T::load_be(ptext + 16 * M); SIMD_T B2 = SIMD_T::load_be(ptext + 16 * 2 * M); SIMD_T B3 = SIMD_T::load_be(ptext + 16 * 3 * M); SIMD_T B4 = SIMD_T::load_be(ptext + 16 * 4 * M); SIMD_T B5 = SIMD_T::load_be(ptext + 16 * 5 * M); SIMD_T B6 = SIMD_T::load_be(ptext + 16 * 6 * M); SIMD_T B7 = SIMD_T::load_be(ptext + 16 * 7 * M); SIMD_T::transpose(B0, B1, B2, B3); SIMD_T::transpose(B4, B5, B6, B7); B0 = B0.rev_words(); B1 = B1.rev_words(); B2 = B2.rev_words(); B3 = B3.rev_words(); B4 = B4.rev_words(); B5 = B5.rev_words(); B6 = B6.rev_words(); B7 = B7.rev_words(); for(size_t j = 0; j != 8; ++j) { B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[4 * j])); B4 ^= sm4_f(B5 ^ B6 ^ B7 ^ SIMD_T::splat(RK[4 * j])); B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[4 * j + 1])); B5 ^= sm4_f(B6 ^ B7 ^ B4 ^ SIMD_T::splat(RK[4 * j + 1])); B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[4 * j + 2])); B6 ^= sm4_f(B7 ^ B4 ^ B5 ^ SIMD_T::splat(RK[4 * j + 2])); B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[4 * j + 3])); B7 ^= sm4_f(B4 ^ B5 ^ B6 ^ SIMD_T::splat(RK[4 * j + 3])); } SIMD_T::transpose(B0, B1, B2, B3); SIMD_T::transpose(B4, B5, B6, B7); B3.rev_words().store_be(ctext); B2.rev_words().store_be(ctext + 16 * M); B1.rev_words().store_be(ctext + 16 * 2 * M); B0.rev_words().store_be(ctext + 16 * 3 * M); B7.rev_words().store_be(ctext + 16 * 4 * M); B6.rev_words().store_be(ctext + 16 * 5 * M); B5.rev_words().store_be(ctext + 16 * 6 * M); B4.rev_words().store_be(ctext + 16 * 7 * M); } template BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void decrypt(const uint8_t ctext[16 * 4 * M], uint8_t ptext[16 * 4 * M], std::span RK) { SIMD_T B0 = SIMD_T::load_be(ctext); SIMD_T B1 = SIMD_T::load_be(ctext + 16 * M); SIMD_T B2 = SIMD_T::load_be(ctext + 16 * 2 * M); SIMD_T B3 = SIMD_T::load_be(ctext + 16 * 3 * M); SIMD_T::transpose(B0, B1, B2, B3); B0 = B0.rev_words(); B1 = B1.rev_words(); B2 = B2.rev_words(); B3 = B3.rev_words(); for(size_t j = 0; j != 8; ++j) { B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[32 - (4 * j + 1)])); B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[32 - (4 * j + 2)])); B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[32 - (4 * j + 3)])); B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[32 - (4 * j + 4)])); } SIMD_T::transpose(B0, B1, B2, B3); B3.rev_words().store_be(ptext); B2.rev_words().store_be(ptext + 16 * M); B1.rev_words().store_be(ptext + 16 * 2 * M); B0.rev_words().store_be(ptext + 16 * 3 * M); } template BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512_GFNI void decrypt_x2(const uint8_t ctext[32 * 4 * M], uint8_t ptext[32 * 4 * M], std::span RK) { SIMD_T B0 = SIMD_T::load_be(ctext); SIMD_T B1 = SIMD_T::load_be(ctext + 16 * M); SIMD_T B2 = SIMD_T::load_be(ctext + 16 * 2 * M); SIMD_T B3 = SIMD_T::load_be(ctext + 16 * 3 * M); SIMD_T B4 = SIMD_T::load_be(ctext + 16 * 4 * M); SIMD_T B5 = SIMD_T::load_be(ctext + 16 * 5 * M); SIMD_T B6 = SIMD_T::load_be(ctext + 16 * 6 * M); SIMD_T B7 = SIMD_T::load_be(ctext + 16 * 7 * M); SIMD_T::transpose(B0, B1, B2, B3); SIMD_T::transpose(B4, B5, B6, B7); B0 = B0.rev_words(); B1 = B1.rev_words(); B2 = B2.rev_words(); B3 = B3.rev_words(); B4 = B4.rev_words(); B5 = B5.rev_words(); B6 = B6.rev_words(); B7 = B7.rev_words(); for(size_t j = 0; j != 8; ++j) { B0 ^= sm4_f(B1 ^ B2 ^ B3 ^ SIMD_T::splat(RK[32 - (4 * j + 1)])); B4 ^= sm4_f(B5 ^ B6 ^ B7 ^ SIMD_T::splat(RK[32 - (4 * j + 1)])); B1 ^= sm4_f(B2 ^ B3 ^ B0 ^ SIMD_T::splat(RK[32 - (4 * j + 2)])); B5 ^= sm4_f(B6 ^ B7 ^ B4 ^ SIMD_T::splat(RK[32 - (4 * j + 2)])); B2 ^= sm4_f(B3 ^ B0 ^ B1 ^ SIMD_T::splat(RK[32 - (4 * j + 3)])); B6 ^= sm4_f(B7 ^ B4 ^ B5 ^ SIMD_T::splat(RK[32 - (4 * j + 3)])); B3 ^= sm4_f(B0 ^ B1 ^ B2 ^ SIMD_T::splat(RK[32 - (4 * j + 4)])); B7 ^= sm4_f(B4 ^ B5 ^ B6 ^ SIMD_T::splat(RK[32 - (4 * j + 4)])); } SIMD_T::transpose(B0, B1, B2, B3); SIMD_T::transpose(B4, B5, B6, B7); B3.rev_words().store_be(ptext); B2.rev_words().store_be(ptext + 16 * M); B1.rev_words().store_be(ptext + 16 * 2 * M); B0.rev_words().store_be(ptext + 16 * 3 * M); B7.rev_words().store_be(ptext + 16 * 4 * M); B6.rev_words().store_be(ptext + 16 * 5 * M); B5.rev_words().store_be(ptext + 16 * 6 * M); B4.rev_words().store_be(ptext + 16 * 7 * M); } } // namespace } // namespace SM4_AVX512_GFNI void BOTAN_FN_ISA_AVX512_GFNI SM4::sm4_avx512_gfni_encrypt(const uint8_t ptext[], uint8_t ctext[], size_t blocks) const { while(blocks >= 32) { SM4_AVX512_GFNI::encrypt_x2(ptext, ctext, m_RK); ptext += 16 * 32; ctext += 16 * 32; blocks -= 32; } while(blocks >= 16) { SM4_AVX512_GFNI::encrypt(ptext, ctext, m_RK); ptext += 16 * 16; ctext += 16 * 16; blocks -= 16; } while(blocks >= 8) { SM4_AVX512_GFNI::encrypt(ptext, ctext, m_RK); ptext += 16 * 8; ctext += 16 * 8; blocks -= 8; } if(blocks > 0) { uint8_t pbuf[16 * 8] = {0}; uint8_t cbuf[16 * 8] = {0}; copy_mem(pbuf, ptext, blocks * 16); SM4_AVX512_GFNI::encrypt(pbuf, cbuf, m_RK); copy_mem(ctext, cbuf, blocks * 16); } } void BOTAN_FN_ISA_AVX512_GFNI SM4::sm4_avx512_gfni_decrypt(const uint8_t ctext[], uint8_t ptext[], size_t blocks) const { while(blocks >= 32) { SM4_AVX512_GFNI::decrypt_x2(ctext, ptext, m_RK); ptext += 16 * 32; ctext += 16 * 32; blocks -= 32; } while(blocks >= 16) { SM4_AVX512_GFNI::decrypt(ctext, ptext, m_RK); ptext += 16 * 16; ctext += 16 * 16; blocks -= 16; } while(blocks >= 8) { SM4_AVX512_GFNI::decrypt(ctext, ptext, m_RK); ptext += 16 * 8; ctext += 16 * 8; blocks -= 8; } if(blocks > 0) { uint8_t cbuf[16 * 8] = {0}; uint8_t pbuf[16 * 8] = {0}; copy_mem(cbuf, ctext, blocks * 16); SM4_AVX512_GFNI::decrypt(cbuf, pbuf, m_RK); copy_mem(ptext, pbuf, blocks * 16); } } } // namespace Botan