/*
* (C) 2026 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/internal/camellia.h>

#include <botan/mem_ops.h>
#include <botan/internal/simd_hwaes.h>

namespace Botan {

namespace Camellia_HWAES {

namespace {

/* Helpers for 64-bit operations on SIMD_4x32 */

/*
* Load 16 bytes from `in` and permute them with a table that lists the
* byte indices of each 8-byte half in reverse order (7..0 and 15..8).
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 load_be64(const uint8_t* in) {
   const SIMD_4x32 reverse_each_qword(0x04050607, 0x00010203, 0x0C0D0E0F, 0x08090A0B);
   const auto raw = SIMD_4x32::load_le(in);
   return SIMD_4x32::byte_shuffle(raw, reverse_each_qword);
}

/*
* Permute `v` with a table that lists the byte indices of each 8-byte half
* in reverse order (7..0 and 15..8), then write the 16 bytes to `out`.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void store_be64(uint8_t* out, SIMD_4x32 v) {
   const SIMD_4x32 reverse_each_qword(0x04050607, 0x00010203, 0x0C0D0E0F, 0x08090A0B);
   auto swapped = SIMD_4x32::byte_shuffle(v, reverse_each_qword);
   swapped.store_le(out);
}

/*
* Build a vector whose words are (low32(v), high32(v), low32(v), high32(v)),
* i.e. the same 64-bit value repeated in both halves.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 splat64(uint64_t v) {
   const auto low_word = static_cast<uint32_t>(v & 0xFFFFFFFF);
   const auto high_word = static_cast<uint32_t>(v >> 32);
   return SIMD_4x32(low_word, high_word, low_word, high_word);
}

/* The Camellia round function */
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 camellia_f(SIMD_4x32 x) {
   /*
   * Structure of this function, as visible below:
   *  1. apply an affine pre-transform to every byte (two variants, chosen per byte)
   *  2. run the result through hw_aes_sbox
   *  3. apply one of three affine post-transforms, chosen per byte
   *  4. XOR together six byte-shuffled copies of the substituted vector
   *
   * All masks and shuffle tables repeat the same pattern for words 0/1 and
   * words 2/3, so the two 8-byte halves of x are processed independently.
   */

   // Pre-affine shared by S1/S2/S3
   constexpr uint64_t pre123_a = gfni_matrix(R"(
      1 1 1 0 1 1 0 1
      0 0 1 1 0 0 1 0
      1 1 0 1 0 0 0 0
      1 0 1 1 0 0 1 1
      0 0 0 0 1 1 0 0
      1 0 1 0 0 1 0 0
      0 0 1 0 1 1 0 0
      1 0 0 0 0 1 1 0)");

   // Pre-affine for S4
   // (each row below is the corresponding pre123_a row rotated left by one position)
   constexpr uint64_t pre4_a = gfni_matrix(R"(
      1 1 0 1 1 0 1 1
      0 1 1 0 0 1 0 0
      1 0 1 0 0 0 0 1
      0 1 1 0 0 1 1 1
      0 0 0 1 1 0 0 0
      0 1 0 0 1 0 0 1
      0 1 0 1 1 0 0 0
      0 0 0 0 1 1 0 1)");

   // Additive constant used by both pre-affine transforms
   constexpr uint8_t pre_c = 0x45;

   // Post-affine for S1 and S4
   constexpr uint64_t post14_a = gfni_matrix(R"(
      0 0 0 0 0 0 0 1
      0 1 1 0 0 1 1 0
      1 0 1 1 1 1 1 0
      0 0 0 1 1 0 1 1
      1 0 0 0 1 1 1 0
      0 1 0 1 1 1 1 0
      0 1 1 1 1 1 1 1
      0 0 0 1 1 1 0 0)");
   constexpr uint8_t post14_c = 0x6E;

   // Post-affine for S2
   // (the post14_a rows cyclically moved down by one; 0xDC is 0x6E rotated left by one bit)
   constexpr uint64_t post2_a = gfni_matrix(R"(
      0 0 0 1 1 1 0 0
      0 0 0 0 0 0 0 1
      0 1 1 0 0 1 1 0
      1 0 1 1 1 1 1 0
      0 0 0 1 1 0 1 1
      1 0 0 0 1 1 1 0
      0 1 0 1 1 1 1 0
      0 1 1 1 1 1 1 1)");
   constexpr uint8_t post2_c = 0xDC;

   // Post-affine for S3
   // (the post14_a rows cyclically moved up by one; 0x37 is 0x6E rotated right by one bit)
   constexpr uint64_t post3_a = gfni_matrix(R"(
      0 1 1 0 0 1 1 0
      1 0 1 1 1 1 1 0
      0 0 0 1 1 0 1 1
      1 0 0 0 1 1 1 0
      0 1 0 1 1 1 1 0
      0 1 1 1 1 1 1 1
      0 0 0 1 1 1 0 0
      0 0 0 0 0 0 0 1)");
   constexpr uint8_t post3_c = 0x37;

   // Note the pre transforms use the plain constructor while the post
   // transforms are built via post_sbox()
   constexpr auto PRE123 = Gf2AffineTransformation(pre123_a, pre_c);
   constexpr auto PRE4 = Gf2AffineTransformation(pre4_a, pre_c);
   constexpr auto POST14 = Gf2AffineTransformation::post_sbox(post14_a, post14_c);
   constexpr auto POST2 = Gf2AffineTransformation::post_sbox(post2_a, post2_c);
   constexpr auto POST3 = Gf2AffineTransformation::post_sbox(post3_a, post3_c);

   // Per-byte selectors: one byte is marked in each 32-bit word. Bytes not
   // covered by any of these three masks take the S1 path.
   const auto mask_s2 = SIMD_4x32(0xFF000000, 0x00FF0000, 0xFF000000, 0x00FF0000);
   const auto mask_s3 = SIMD_4x32(0x00FF0000, 0x0000FF00, 0x00FF0000, 0x0000FF00);
   const auto mask_s4 = SIMD_4x32(0x0000FF00, 0x000000FF, 0x0000FF00, 0x000000FF);

   // Both pre-transforms are computed over the whole vector, then merged per byte
   const auto pre123 = PRE123.affine_transform(x);
   const auto pre4 = PRE4.affine_transform(x);

   // NOTE(review): presumably byte_blend(mask, a, b) takes a where mask is set
   // and b elsewhere, so only the mask_s4 bytes come from pre4 - confirm in the
   // SIMD_4x32 header
   const auto sub = hw_aes_sbox(SIMD_4x32::byte_blend(mask_s4, pre4, pre123));

   // All three post-transforms are computed over the whole vector as well
   const auto s14 = POST14.affine_transform(sub);
   const auto s2 = POST2.affine_transform(sub);
   const auto s3 = POST3.affine_transform(sub);

   // Final merged Sbox output for all bytes
   const auto sbox = SIMD_4x32::byte_blend(mask_s3, s3, SIMD_4x32::byte_blend(mask_s2, s2, s14));

   // The linear mixing step
   // Each Pn is a byte-index table; indices 0..7 appear only in words 0/1 and
   // indices 8..15 only in words 2/3, so bytes never cross between the halves.
   // The XOR of the six shuffles gives six source bytes for each output byte
   // of words 1/3 and five for words 0/2, where P6 holds 0xFF.
   // NOTE(review): this relies on a 0xFF index producing a zero byte from
   // byte_shuffle - confirm this holds for every SIMD_4x32 backend
   const auto P1 = SIMD_4x32(0x00000001, 0x00000001, 0x08080809, 0x08080809);
   const auto P2 = SIMD_4x32(0x01010202, 0x01010202, 0x09090A0A, 0x09090A0A);
   const auto P3 = SIMD_4x32(0x02030303, 0x02030303, 0x0A0B0B0B, 0x0A0B0B0B);
   const auto P4 = SIMD_4x32(0x06050404, 0x04040504, 0x0E0D0C0C, 0x0C0C0D0C);
   const auto P5 = SIMD_4x32(0x07060507, 0x05060605, 0x0F0E0D0F, 0x0D0E0E0D);
   const auto P6 = SIMD_4x32(0xFFFFFFFF, 0x07070706, 0xFFFFFFFF, 0x0F0F0F0E);

   const auto sxp1 = SIMD_4x32::byte_shuffle(sbox, P1);
   const auto sxp2 = SIMD_4x32::byte_shuffle(sbox, P2);
   const auto sxp3 = SIMD_4x32::byte_shuffle(sbox, P3);
   const auto sxp4 = SIMD_4x32::byte_shuffle(sbox, P4);
   const auto sxp5 = SIMD_4x32::byte_shuffle(sbox, P5);
   const auto sxp6 = SIMD_4x32::byte_shuffle(sbox, P6);

   return (sxp1 ^ sxp2 ^ sxp3 ^ sxp4 ^ sxp5 ^ sxp6);
}

/*
* FL and FL-inverse operate on 32-bit sub-halves within each 64-bit element.
* We use byte_shuffle to broadcast each 32-bit half, then recombine with byte_blend.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 FL_2(SIMD_4x32 v, uint64_t K) {
   // Upper 32 bits of K are ANDed with the high half, lower 32 bits are ORed with the low half
   const auto key_and = SIMD_4x32::splat(static_cast<uint32_t>(K >> 32));
   const auto key_or = SIMD_4x32::splat(static_cast<uint32_t>(K));

   // Shuffle tables copying the odd (high) or even (low) word of each pair into both words
   const auto dup_high = SIMD_4x32(0x07060504, 0x07060504, 0x0F0E0D0C, 0x0F0E0D0C);
   const auto dup_low = SIMD_4x32(0x03020100, 0x03020100, 0x0B0A0908, 0x0B0A0908);
   // Marks the odd words, i.e. where the high result goes
   const auto odd_words = SIMD_4x32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF);

   const auto high_in = SIMD_4x32::byte_shuffle(v, dup_high);
   const auto low_in = SIMD_4x32::byte_shuffle(v, dup_low);

   // The low half is updated first; the high half then depends on the new low half
   const auto low_out = low_in ^ (high_in & key_and).rotl<1>();
   const auto high_out = high_in ^ (low_out | key_or);

   return SIMD_4x32::byte_blend(odd_words, high_out, low_out);
}

/*
* Counterpart of FL_2: the same two steps are applied in the opposite order,
* so the high half is updated first and the low half uses the new high half.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES SIMD_4x32 FLINV_2(SIMD_4x32 v, uint64_t K) {
   const auto key_and = SIMD_4x32::splat(static_cast<uint32_t>(K >> 32));
   const auto key_or = SIMD_4x32::splat(static_cast<uint32_t>(K));

   // Shuffle tables copying the odd (high) or even (low) word of each pair into both words
   const auto dup_high = SIMD_4x32(0x07060504, 0x07060504, 0x0F0E0D0C, 0x0F0E0D0C);
   const auto dup_low = SIMD_4x32(0x03020100, 0x03020100, 0x0B0A0908, 0x0B0A0908);
   // Marks the odd words, i.e. where the high result goes
   const auto odd_words = SIMD_4x32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF);

   const auto high_in = SIMD_4x32::byte_shuffle(v, dup_high);
   const auto low_in = SIMD_4x32::byte_shuffle(v, dup_low);

   const auto high_out = high_in ^ (low_in | key_or);
   const auto low_out = low_in ^ (high_out & key_and).rotl<1>();

   return SIMD_4x32::byte_blend(odd_words, high_out, low_out);
}

/*
* Read two consecutive 16-byte blocks from `in` and regroup them so that L
* receives the first 8-byte half of each block and R the second half.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void load_and_deinterleave(const uint8_t in[], SIMD_4x32& L, SIMD_4x32& R) {
   // Selects words 2 and 3 of the first blend operand
   const auto top_half = SIMD_4x32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF);

   auto block0 = load_be64(in);       // [L0, R0]
   auto block1 = load_be64(in + 16);  // [L1, R1]

   const auto block0_swapped = block0.swap_halves();  // [R0, L0]
   const auto block1_swapped = block1.swap_halves();  // [R1, L1]

   L = SIMD_4x32::byte_blend(top_half, block1_swapped, block0);  // [L0, L1]
   R = SIMD_4x32::byte_blend(top_half, block1, block0_swapped);  // [R0, R1]
}

/*
* Regroup L = [L0, L1] and R = [R0, R1] into per-block order and write two
* 16-byte blocks to `out`. The R half is written first in each block.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void interleave_and_store(uint8_t out[], SIMD_4x32 L, SIMD_4x32 R) {
   // Selects words 2 and 3 of the first blend operand
   const auto top_half = SIMD_4x32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF);

   const auto L_swapped = L.swap_halves();  // [L1, L0]
   const auto R_swapped = R.swap_halves();  // [R1, R0]

   const auto block0 = SIMD_4x32::byte_blend(top_half, L_swapped, R);  // [R0, L0]
   const auto block1 = SIMD_4x32::byte_blend(top_half, L, R_swapped);  // [R1, L1]

   store_be64(out, block0);
   store_be64(out + 16, block1);
}

/*
* Six rounds using SK[0] through SK[5] in ascending order. Each loop
* iteration performs two rounds: first R is updated from L, then L from the
* new R.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void six_e_rounds(SIMD_4x32& L, SIMD_4x32& R, const uint64_t SK[]) {
   for(size_t i = 0; i != 6; i += 2) {
      R ^= camellia_f(L ^ splat64(SK[i]));
      L ^= camellia_f(R ^ splat64(SK[i + 1]));
   }
}

/*
* Six rounds using SK[5] down to SK[0], i.e. the same six subkeys as
* six_e_rounds but consumed in descending order.
*/
BOTAN_FORCE_INLINE BOTAN_FN_ISA_HWAES void six_d_rounds(SIMD_4x32& L, SIMD_4x32& R, const uint64_t SK[]) {
   for(size_t i = 6; i != 0; i -= 2) {
      R ^= camellia_f(L ^ splat64(SK[i - 1]));
      L ^= camellia_f(R ^ splat64(SK[i - 2]));
   }
}

/*
* Encrypt two 16-byte blocks at once with 18 rounds.
* Reads 32 bytes from `in`, writes 32 bytes to `out`, uses SK[0] through SK[25].
*/
BOTAN_FN_ISA_HWAES void camellia_encrypt_x2_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
   SIMD_4x32 left;
   SIMD_4x32 right;
   load_and_deinterleave(in, left, right);

   // Initial key addition
   left ^= splat64(SK[0]);
   right ^= splat64(SK[1]);

   // Three groups of six rounds; groups two and three are each preceded by
   // an FL / FLINV layer keyed with SK[8..9] and SK[16..17]
   six_e_rounds(left, right, &SK[2]);
   for(size_t fl = 8; fl != 24; fl += 8) {
      left = FL_2(left, SK[fl]);
      right = FLINV_2(right, SK[fl + 1]);
      six_e_rounds(left, right, &SK[fl + 2]);
   }

   // Final key addition
   right ^= splat64(SK[24]);
   left ^= splat64(SK[25]);

   interleave_and_store(out, left, right);
}

/*
* Decrypt two 16-byte blocks at once with 18 rounds.
* Reads 32 bytes from `in`, writes 32 bytes to `out`, and walks the subkeys
* SK[25] down to SK[0].
*/
BOTAN_FN_ISA_HWAES void camellia_decrypt_x2_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
   SIMD_4x32 left;
   SIMD_4x32 right;
   load_and_deinterleave(in, left, right);

   // Initial key addition, using the last two subkeys
   right ^= splat64(SK[25]);
   left ^= splat64(SK[24]);

   // Three groups of six rounds; groups two and three are each preceded by
   // an FL / FLINV layer keyed with SK[17..16] and SK[9..8]
   six_d_rounds(left, right, &SK[18]);
   for(size_t fl = 16; fl != 0; fl -= 8) {
      left = FL_2(left, SK[fl + 1]);
      right = FLINV_2(right, SK[fl]);
      six_d_rounds(left, right, &SK[fl - 6]);
   }

   // Final key addition, using the first two subkeys
   left ^= splat64(SK[1]);
   right ^= splat64(SK[0]);

   interleave_and_store(out, left, right);
}

/*
* Encrypt two 16-byte blocks at once with 24 rounds.
* Reads 32 bytes from `in`, writes 32 bytes to `out`, uses SK[0] through SK[33].
*/
BOTAN_FN_ISA_HWAES void camellia_encrypt_x2_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
   SIMD_4x32 left;
   SIMD_4x32 right;
   load_and_deinterleave(in, left, right);

   // Initial key addition
   left ^= splat64(SK[0]);
   right ^= splat64(SK[1]);

   // Four groups of six rounds; every group after the first is preceded by
   // an FL / FLINV layer keyed with SK[8..9], SK[16..17] and SK[24..25]
   six_e_rounds(left, right, &SK[2]);
   for(size_t fl = 8; fl != 32; fl += 8) {
      left = FL_2(left, SK[fl]);
      right = FLINV_2(right, SK[fl + 1]);
      six_e_rounds(left, right, &SK[fl + 2]);
   }

   // Final key addition
   right ^= splat64(SK[32]);
   left ^= splat64(SK[33]);

   interleave_and_store(out, left, right);
}

/*
* Decrypt two 16-byte blocks at once with 24 rounds.
* Reads 32 bytes from `in`, writes 32 bytes to `out`, and walks the subkeys
* SK[33] down to SK[0].
*/
BOTAN_FN_ISA_HWAES void camellia_decrypt_x2_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
   SIMD_4x32 left;
   SIMD_4x32 right;
   load_and_deinterleave(in, left, right);

   // Initial key addition, using the last two subkeys
   right ^= splat64(SK[33]);
   left ^= splat64(SK[32]);

   // Four groups of six rounds; every group after the first is preceded by
   // an FL / FLINV layer keyed with SK[25..24], SK[17..16] and SK[9..8]
   six_d_rounds(left, right, &SK[26]);
   for(size_t fl = 24; fl != 0; fl -= 8) {
      left = FL_2(left, SK[fl + 1]);
      right = FLINV_2(right, SK[fl]);
      six_d_rounds(left, right, &SK[fl - 6]);
   }

   // Final key addition, using the first two subkeys
   left ^= splat64(SK[1]);
   right ^= splat64(SK[0]);

   interleave_and_store(out, left, right);
}

}  // namespace

}  // namespace Camellia_HWAES

// static
void BOTAN_FN_ISA_HWAES Camellia_128::hwaes_encrypt(const uint8_t in[],
                                                    uint8_t out[],
                                                    size_t blocks,
                                                    std::span<const uint64_t> SK) {
   constexpr size_t block_bytes = 16;
   constexpr size_t pair_bytes = 2 * block_bytes;

   // The kernel always handles exactly two blocks per call
   for(; blocks >= 2; blocks -= 2) {
      Camellia_HWAES::camellia_encrypt_x2_18r(in, out, SK);
      in += pair_bytes;
      out += pair_bytes;
   }

   if(blocks == 0) {
      return;
   }

   // One block remains: run it through the two-block kernel using zero-padded
   // scratch buffers and keep only the first block of the result
   uint8_t padded_in[pair_bytes] = {0};
   uint8_t padded_out[pair_bytes] = {0};
   copy_mem(padded_in, in, block_bytes);
   Camellia_HWAES::camellia_encrypt_x2_18r(padded_in, padded_out, SK);
   copy_mem(out, padded_out, block_bytes);
}

// static
void BOTAN_FN_ISA_HWAES Camellia_128::hwaes_decrypt(const uint8_t in[],
                                                    uint8_t out[],
                                                    size_t blocks,
                                                    std::span<const uint64_t> SK) {
   constexpr size_t block_bytes = 16;
   constexpr size_t pair_bytes = 2 * block_bytes;

   // The kernel always handles exactly two blocks per call
   for(; blocks >= 2; blocks -= 2) {
      Camellia_HWAES::camellia_decrypt_x2_18r(in, out, SK);
      in += pair_bytes;
      out += pair_bytes;
   }

   if(blocks == 0) {
      return;
   }

   // One block remains: run it through the two-block kernel using zero-padded
   // scratch buffers and keep only the first block of the result
   uint8_t padded_in[pair_bytes] = {0};
   uint8_t padded_out[pair_bytes] = {0};
   copy_mem(padded_in, in, block_bytes);
   Camellia_HWAES::camellia_decrypt_x2_18r(padded_in, padded_out, SK);
   copy_mem(out, padded_out, block_bytes);
}

// static
void BOTAN_FN_ISA_HWAES Camellia_192::hwaes_encrypt(const uint8_t in[],
                                                    uint8_t out[],
                                                    size_t blocks,
                                                    std::span<const uint64_t> SK) {
   constexpr size_t block_bytes = 16;
   constexpr size_t pair_bytes = 2 * block_bytes;

   // The kernel always handles exactly two blocks per call
   for(; blocks >= 2; blocks -= 2) {
      Camellia_HWAES::camellia_encrypt_x2_24r(in, out, SK);
      in += pair_bytes;
      out += pair_bytes;
   }

   if(blocks == 0) {
      return;
   }

   // One block remains: run it through the two-block kernel using zero-padded
   // scratch buffers and keep only the first block of the result
   uint8_t padded_in[pair_bytes] = {0};
   uint8_t padded_out[pair_bytes] = {0};
   copy_mem(padded_in, in, block_bytes);
   Camellia_HWAES::camellia_encrypt_x2_24r(padded_in, padded_out, SK);
   copy_mem(out, padded_out, block_bytes);
}

// static
void BOTAN_FN_ISA_HWAES Camellia_192::hwaes_decrypt(const uint8_t in[],
                                                    uint8_t out[],
                                                    size_t blocks,
                                                    std::span<const uint64_t> SK) {
   constexpr size_t block_bytes = 16;
   constexpr size_t pair_bytes = 2 * block_bytes;

   // The kernel always handles exactly two blocks per call
   for(; blocks >= 2; blocks -= 2) {
      Camellia_HWAES::camellia_decrypt_x2_24r(in, out, SK);
      in += pair_bytes;
      out += pair_bytes;
   }

   if(blocks == 0) {
      return;
   }

   // One block remains: run it through the two-block kernel using zero-padded
   // scratch buffers and keep only the first block of the result
   uint8_t padded_in[pair_bytes] = {0};
   uint8_t padded_out[pair_bytes] = {0};
   copy_mem(padded_in, in, block_bytes);
   Camellia_HWAES::camellia_decrypt_x2_24r(padded_in, padded_out, SK);
   copy_mem(out, padded_out, block_bytes);
}

// static
void BOTAN_FN_ISA_HWAES Camellia_256::hwaes_encrypt(const uint8_t in[],
                                                    uint8_t out[],
                                                    size_t blocks,
                                                    std::span<const uint64_t> SK) {
   constexpr size_t block_bytes = 16;
   constexpr size_t pair_bytes = 2 * block_bytes;

   // The kernel always handles exactly two blocks per call
   for(; blocks >= 2; blocks -= 2) {
      Camellia_HWAES::camellia_encrypt_x2_24r(in, out, SK);
      in += pair_bytes;
      out += pair_bytes;
   }

   if(blocks == 0) {
      return;
   }

   // One block remains: run it through the two-block kernel using zero-padded
   // scratch buffers and keep only the first block of the result
   uint8_t padded_in[pair_bytes] = {0};
   uint8_t padded_out[pair_bytes] = {0};
   copy_mem(padded_in, in, block_bytes);
   Camellia_HWAES::camellia_encrypt_x2_24r(padded_in, padded_out, SK);
   copy_mem(out, padded_out, block_bytes);
}

// static
void BOTAN_FN_ISA_HWAES Camellia_256::hwaes_decrypt(const uint8_t in[],
                                                    uint8_t out[],
                                                    size_t blocks,
                                                    std::span<const uint64_t> SK) {
   constexpr size_t block_bytes = 16;
   constexpr size_t pair_bytes = 2 * block_bytes;

   // The kernel always handles exactly two blocks per call
   for(; blocks >= 2; blocks -= 2) {
      Camellia_HWAES::camellia_decrypt_x2_24r(in, out, SK);
      in += pair_bytes;
      out += pair_bytes;
   }

   if(blocks == 0) {
      return;
   }

   // One block remains: run it through the two-block kernel using zero-padded
   // scratch buffers and keep only the first block of the result
   uint8_t padded_in[pair_bytes] = {0};
   uint8_t padded_out[pair_bytes] = {0};
   copy_mem(padded_in, in, block_bytes);
   Camellia_HWAES::camellia_decrypt_x2_24r(padded_in, padded_out, SK);
   copy_mem(out, padded_out, block_bytes);
}

}  // namespace Botan