/* * (C) 2026 Jack Lloyd * * Botan is released under the Simplified BSD License (see license.txt) */ #include #include #include namespace Botan { namespace Camellia_HWAES { namespace { /* Helpers for 64-bit operations on SIMD_4x32 */ BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 load_be64(const uint8_t* in) { const auto bswap64 = SIMD_4x32(0x04050607, 0x00010203, 0x0C0D0E0F, 0x08090A0B); return SIMD_4x32::byte_shuffle(SIMD_4x32::load_le(in), bswap64); } BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void store_be64(uint8_t* out, SIMD_4x32 v) { const auto bswap64 = SIMD_4x32(0x04050607, 0x00010203, 0x0C0D0E0F, 0x08090A0B); SIMD_4x32::byte_shuffle(v, bswap64).store_le(out); } BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 splat64(uint64_t v) { const uint32_t lo = static_cast(v); const uint32_t hi = static_cast(v >> 32); return SIMD_4x32(lo, hi, lo, hi); } /* The Camellia round function */ BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 camellia_f(SIMD_4x32 x) { // Pre-affine shared by S1/S2/S3 constexpr uint64_t pre123_a = gfni_matrix(R"( 1 1 1 0 1 1 0 1 0 0 1 1 0 0 1 0 1 1 0 1 0 0 0 0 1 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 1 0)"); // Pre-affine for S4 constexpr uint64_t pre4_a = gfni_matrix(R"( 1 1 0 1 1 0 1 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 1 1 0 0 1 1 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1 1 0 1)"); constexpr uint8_t pre_c = 0x45; // Post-affine for S1 and S4 constexpr uint64_t post14_a = gfni_matrix(R"( 0 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1 0 0 0 0 1 1 0 1 1 1 0 0 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 0 0 0 1 1 1 0 0)"); constexpr uint8_t post14_c = 0x6E; // Post-affine for S2 constexpr uint64_t post2_a = gfni_matrix(R"( 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1 0 0 0 0 1 1 0 1 1 1 0 0 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1)"); constexpr uint8_t post2_c = 0xDC; // Post-affine for S3 constexpr uint64_t post3_a = gfni_matrix(R"( 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1 0 0 0 0 1 1 0 1 1 1 0 0 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1)"); constexpr uint8_t post3_c = 0x37; constexpr auto PRE123 = Gf2AffineTransformation(pre123_a, pre_c); constexpr auto PRE4 = Gf2AffineTransformation(pre4_a, pre_c); constexpr auto POST14 = Gf2AffineTransformation::post_sbox(post14_a, post14_c); constexpr auto POST2 = Gf2AffineTransformation::post_sbox(post2_a, post2_c); constexpr auto POST3 = Gf2AffineTransformation::post_sbox(post3_a, post3_c); const auto mask_s2 = SIMD_4x32(0xFF000000, 0x00FF0000, 0xFF000000, 0x00FF0000); const auto mask_s3 = SIMD_4x32(0x00FF0000, 0x0000FF00, 0x00FF0000, 0x0000FF00); const auto mask_s4 = SIMD_4x32(0x0000FF00, 0x000000FF, 0x0000FF00, 0x000000FF); const auto pre123 = PRE123.affine_transform(x); const auto pre4 = PRE4.affine_transform(x); const auto sub = hw_aes_sbox(SIMD_4x32::byte_blend(mask_s4, pre4, pre123)); const auto s14 = POST14.affine_transform(sub); const auto s2 = POST2.affine_transform(sub); const auto s3 = POST3.affine_transform(sub); // Final merged Sbox output for all bytes const auto sbox = SIMD_4x32::byte_blend(mask_s3, s3, SIMD_4x32::byte_blend(mask_s2, s2, s14)); // The linear mixing step const auto P1 = SIMD_4x32(0x00000001, 0x00000001, 0x08080809, 0x08080809); const auto P2 = SIMD_4x32(0x01010202, 0x01010202, 0x09090A0A, 0x09090A0A); const auto P3 = SIMD_4x32(0x02030303, 0x02030303, 0x0A0B0B0B, 0x0A0B0B0B); const auto P4 = SIMD_4x32(0x06050404, 0x04040504, 0x0E0D0C0C, 0x0C0C0D0C); const auto P5 = SIMD_4x32(0x07060507, 0x05060605, 0x0F0E0D0F, 0x0D0E0E0D); const auto P6 = SIMD_4x32(0xFFFFFFFF, 0x07070706, 0xFFFFFFFF, 0x0F0F0F0E); const auto sxp1 = SIMD_4x32::byte_shuffle(sbox, P1); const auto sxp2 = SIMD_4x32::byte_shuffle(sbox, P2); const auto sxp3 = SIMD_4x32::byte_shuffle(sbox, P3); const auto sxp4 = SIMD_4x32::byte_shuffle(sbox, P4); const auto sxp5 = SIMD_4x32::byte_shuffle(sbox, P5); const auto sxp6 = SIMD_4x32::byte_shuffle(sbox, P6); return (sxp1 ^ sxp2 ^ sxp3 ^ sxp4 ^ sxp5 ^ sxp6); } /* * FL and FL-inverse operate on 32-bit sub-halves within each 64-bit element. * We use byte_shuffle to broadcast each 32-bit half, then recombine with byte_blend. */ BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 FL_2(SIMD_4x32 v, uint64_t K) { const uint32_t k1 = static_cast(K >> 32); const uint32_t k2 = static_cast(K); // Broadcast upper/lower 32-bit halves of each 64-bit element const auto shuf_hi = SIMD_4x32(0x07060504, 0x07060504, 0x0F0E0D0C, 0x0F0E0D0C); const auto shuf_lo = SIMD_4x32(0x03020100, 0x03020100, 0x0B0A0908, 0x0B0A0908); auto x1 = SIMD_4x32::byte_shuffle(v, shuf_hi); auto x2 = SIMD_4x32::byte_shuffle(v, shuf_lo); x2 ^= (x1 & SIMD_4x32::splat(k1)).rotl<1>(); x1 ^= x2 | SIMD_4x32::splat(k2); // Recombine: lo from x2, hi from x1 const auto mask_hi = SIMD_4x32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF); return SIMD_4x32::byte_blend(mask_hi, x1, x2); } BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 FLINV_2(SIMD_4x32 v, uint64_t K) { const uint32_t k1 = static_cast(K >> 32); const uint32_t k2 = static_cast(K); const auto shuf_hi = SIMD_4x32(0x07060504, 0x07060504, 0x0F0E0D0C, 0x0F0E0D0C); const auto shuf_lo = SIMD_4x32(0x03020100, 0x03020100, 0x0B0A0908, 0x0B0A0908); auto x1 = SIMD_4x32::byte_shuffle(v, shuf_hi); auto x2 = SIMD_4x32::byte_shuffle(v, shuf_lo); x1 ^= x2 | SIMD_4x32::splat(k2); x2 ^= (x1 & SIMD_4x32::splat(k1)).rotl<1>(); const auto mask_hi = SIMD_4x32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF); return SIMD_4x32::byte_blend(mask_hi, x1, x2); } BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void load_and_deinterleave(const uint8_t in[], SIMD_4x32& L, SIMD_4x32& R) { auto A = load_be64(in); // block 0: [L0, R0] auto B = load_be64(in + 16); // block 1: [L1, R1] const auto mask_upper = SIMD_4x32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF); L = SIMD_4x32::byte_blend(mask_upper, B.swap_halves(), A); // [L0, L1] R = SIMD_4x32::byte_blend(mask_upper, B, A.swap_halves()); // [R0, R1] } BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void interleave_and_store(uint8_t out[], SIMD_4x32 L, SIMD_4x32 R) { // Camellia output swaps L and R const auto mask_upper = SIMD_4x32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF); auto A = SIMD_4x32::byte_blend(mask_upper, L.swap_halves(), R); // [R0, L0] auto B = SIMD_4x32::byte_blend(mask_upper, L, R.swap_halves()); // [R1, L1] store_be64(out, A); store_be64(out + 16, B); } BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void six_e_rounds(SIMD_4x32& L, SIMD_4x32& R, const uint64_t SK[]) { R ^= camellia_f(L ^ splat64(SK[0])); L ^= camellia_f(R ^ splat64(SK[1])); R ^= camellia_f(L ^ splat64(SK[2])); L ^= camellia_f(R ^ splat64(SK[3])); R ^= camellia_f(L ^ splat64(SK[4])); L ^= camellia_f(R ^ splat64(SK[5])); } BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void six_d_rounds(SIMD_4x32& L, SIMD_4x32& R, const uint64_t SK[]) { R ^= camellia_f(L ^ splat64(SK[5])); L ^= camellia_f(R ^ splat64(SK[4])); R ^= camellia_f(L ^ splat64(SK[3])); L ^= camellia_f(R ^ splat64(SK[2])); R ^= camellia_f(L ^ splat64(SK[1])); L ^= camellia_f(R ^ splat64(SK[0])); } BOTAN_FN_ISA_HWAES void camellia_encrypt_x2_18r(const uint8_t in[], uint8_t out[], std::span SK) { SIMD_4x32 L; SIMD_4x32 R; load_and_deinterleave(in, L, R); L ^= splat64(SK[0]); R ^= splat64(SK[1]); six_e_rounds(L, R, &SK[2]); L = FL_2(L, SK[8]); R = FLINV_2(R, SK[9]); six_e_rounds(L, R, &SK[10]); L = FL_2(L, SK[16]); R = FLINV_2(R, SK[17]); six_e_rounds(L, R, &SK[18]); R ^= splat64(SK[24]); L ^= splat64(SK[25]); interleave_and_store(out, L, R); } BOTAN_FN_ISA_HWAES void camellia_decrypt_x2_18r(const uint8_t in[], uint8_t out[], std::span SK) { SIMD_4x32 L; SIMD_4x32 R; load_and_deinterleave(in, L, R); R ^= splat64(SK[25]); L ^= splat64(SK[24]); six_d_rounds(L, R, &SK[18]); L = FL_2(L, SK[17]); R = FLINV_2(R, SK[16]); six_d_rounds(L, R, &SK[10]); L = FL_2(L, SK[9]); R = FLINV_2(R, SK[8]); six_d_rounds(L, R, &SK[2]); L ^= splat64(SK[1]); R ^= splat64(SK[0]); interleave_and_store(out, L, R); } BOTAN_FN_ISA_HWAES void camellia_encrypt_x2_24r(const uint8_t in[], uint8_t out[], std::span SK) { SIMD_4x32 L; SIMD_4x32 R; load_and_deinterleave(in, L, R); L ^= splat64(SK[0]); R ^= splat64(SK[1]); six_e_rounds(L, R, &SK[2]); L = FL_2(L, SK[8]); R = FLINV_2(R, SK[9]); six_e_rounds(L, R, &SK[10]); L = FL_2(L, SK[16]); R = FLINV_2(R, SK[17]); six_e_rounds(L, R, &SK[18]); L = FL_2(L, SK[24]); R = FLINV_2(R, SK[25]); six_e_rounds(L, R, &SK[26]); R ^= splat64(SK[32]); L ^= splat64(SK[33]); interleave_and_store(out, L, R); } BOTAN_FN_ISA_HWAES void camellia_decrypt_x2_24r(const uint8_t in[], uint8_t out[], std::span SK) { SIMD_4x32 L; SIMD_4x32 R; load_and_deinterleave(in, L, R); R ^= splat64(SK[33]); L ^= splat64(SK[32]); six_d_rounds(L, R, &SK[26]); L = FL_2(L, SK[25]); R = FLINV_2(R, SK[24]); six_d_rounds(L, R, &SK[18]); L = FL_2(L, SK[17]); R = FLINV_2(R, SK[16]); six_d_rounds(L, R, &SK[10]); L = FL_2(L, SK[9]); R = FLINV_2(R, SK[8]); six_d_rounds(L, R, &SK[2]); L ^= splat64(SK[1]); R ^= splat64(SK[0]); interleave_and_store(out, L, R); } } // namespace } // namespace Camellia_HWAES // static void BOTAN_FN_ISA_HWAES Camellia_128::hwaes_encrypt(const uint8_t in[], uint8_t out[], size_t blocks, std::span SK) { while(blocks >= 2) { Camellia_HWAES::camellia_encrypt_x2_18r(in, out, SK); in += 2 * 16; out += 2 * 16; blocks -= 2; } if(blocks > 0) { uint8_t ibuf[2 * 16] = {0}; uint8_t obuf[2 * 16] = {0}; copy_mem(ibuf, in, 16); Camellia_HWAES::camellia_encrypt_x2_18r(ibuf, obuf, SK); copy_mem(out, obuf, 16); } } // static void BOTAN_FN_ISA_HWAES Camellia_128::hwaes_decrypt(const uint8_t in[], uint8_t out[], size_t blocks, std::span SK) { while(blocks >= 2) { Camellia_HWAES::camellia_decrypt_x2_18r(in, out, SK); in += 2 * 16; out += 2 * 16; blocks -= 2; } if(blocks > 0) { uint8_t ibuf[2 * 16] = {0}; uint8_t obuf[2 * 16] = {0}; copy_mem(ibuf, in, 16); Camellia_HWAES::camellia_decrypt_x2_18r(ibuf, obuf, SK); copy_mem(out, obuf, 16); } } // static void BOTAN_FN_ISA_HWAES Camellia_192::hwaes_encrypt(const uint8_t in[], uint8_t out[], size_t blocks, std::span SK) { while(blocks >= 2) { Camellia_HWAES::camellia_encrypt_x2_24r(in, out, SK); in += 2 * 16; out += 2 * 16; blocks -= 2; } if(blocks > 0) { uint8_t ibuf[2 * 16] = {0}; uint8_t obuf[2 * 16] = {0}; copy_mem(ibuf, in, 16); Camellia_HWAES::camellia_encrypt_x2_24r(ibuf, obuf, SK); copy_mem(out, obuf, 16); } } // static void BOTAN_FN_ISA_HWAES Camellia_192::hwaes_decrypt(const uint8_t in[], uint8_t out[], size_t blocks, std::span SK) { while(blocks >= 2) { Camellia_HWAES::camellia_decrypt_x2_24r(in, out, SK); in += 2 * 16; out += 2 * 16; blocks -= 2; } if(blocks > 0) { uint8_t ibuf[2 * 16] = {0}; uint8_t obuf[2 * 16] = {0}; copy_mem(ibuf, in, 16); Camellia_HWAES::camellia_decrypt_x2_24r(ibuf, obuf, SK); copy_mem(out, obuf, 16); } } // static void BOTAN_FN_ISA_HWAES Camellia_256::hwaes_encrypt(const uint8_t in[], uint8_t out[], size_t blocks, std::span SK) { while(blocks >= 2) { Camellia_HWAES::camellia_encrypt_x2_24r(in, out, SK); in += 2 * 16; out += 2 * 16; blocks -= 2; } if(blocks > 0) { uint8_t ibuf[2 * 16] = {0}; uint8_t obuf[2 * 16] = {0}; copy_mem(ibuf, in, 16); Camellia_HWAES::camellia_encrypt_x2_24r(ibuf, obuf, SK); copy_mem(out, obuf, 16); } } // static void BOTAN_FN_ISA_HWAES Camellia_256::hwaes_decrypt(const uint8_t in[], uint8_t out[], size_t blocks, std::span SK) { while(blocks >= 2) { Camellia_HWAES::camellia_decrypt_x2_24r(in, out, SK); in += 2 * 16; out += 2 * 16; blocks -= 2; } if(blocks > 0) { uint8_t ibuf[2 * 16] = {0}; uint8_t obuf[2 * 16] = {0}; copy_mem(ibuf, in, 16); Camellia_HWAES::camellia_decrypt_x2_24r(ibuf, obuf, SK); copy_mem(out, obuf, 16); } } } // namespace Botan