/* * (C) 2025 Jack Lloyd * * Botan is released under the Simplified BSD License (see license.txt) */ #include #include #include namespace Botan { namespace SHACAL2_AVX512_F { namespace { // NOLINTBEGIN(portability-simd-intrinsics) /* * 8x16 Transpose * * Convert from * * A00 B00 C00 ... H00 * A01 B01 C01 ... H01 * .. * A15 B15 C15 ... H15 * * with two blocks stored in each register, into * * A00 A01 ... A15 * B00 B01 ... B15 * ... * H00 H01 ... H15 */ BOTAN_FN_ISA_AVX512 void transpose_in(SIMD_16x32& B0, SIMD_16x32& B1, SIMD_16x32& B2, SIMD_16x32& B3, SIMD_16x32& B4, SIMD_16x32& B5, SIMD_16x32& B6, SIMD_16x32& B7) { auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw()); auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw()); auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw()); auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw()); auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw()); auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw()); auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw()); auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw()); auto r0 = _mm512_unpacklo_epi64(t0, t2); auto r1 = _mm512_unpackhi_epi64(t0, t2); auto r2 = _mm512_unpacklo_epi64(t1, t3); auto r3 = _mm512_unpackhi_epi64(t1, t3); auto r4 = _mm512_unpacklo_epi64(t4, t6); auto r5 = _mm512_unpackhi_epi64(t4, t6); auto r6 = _mm512_unpacklo_epi64(t5, t7); auto r7 = _mm512_unpackhi_epi64(t5, t7); const __m512i tbl0 = _mm512_set_epi32(27, 19, 26, 18, 25, 17, 24, 16, 11, 3, 10, 2, 9, 1, 8, 0); const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(4)); B0 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl0, r4)); B1 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl0, r5)); B2 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl0, r6)); B3 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl0, r7)); B4 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl1, r4)); B5 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl1, r5)); B6 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl1, r6)); B7 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl1, r7)); } BOTAN_FN_ISA_AVX512 void transpose_out(SIMD_16x32& B0, SIMD_16x32& B1, SIMD_16x32& B2, SIMD_16x32& B3, SIMD_16x32& B4, SIMD_16x32& B5, SIMD_16x32& B6, SIMD_16x32& B7) { auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw()); auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw()); auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw()); auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw()); auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw()); auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw()); auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw()); auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw()); auto r0 = _mm512_unpacklo_epi64(t0, t2); auto r1 = _mm512_unpackhi_epi64(t0, t2); auto r2 = _mm512_unpacklo_epi64(t1, t3); auto r3 = _mm512_unpackhi_epi64(t1, t3); auto r4 = _mm512_unpacklo_epi64(t4, t6); auto r5 = _mm512_unpackhi_epi64(t4, t6); auto r6 = _mm512_unpacklo_epi64(t5, t7); auto r7 = _mm512_unpackhi_epi64(t5, t7); const __m512i tbl0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0); const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(8)); auto s0 = _mm512_permutex2var_epi32(r0, tbl0, r4); auto s1 = _mm512_permutex2var_epi32(r1, tbl0, r5); auto s2 = _mm512_permutex2var_epi32(r2, tbl0, r6); auto s3 = _mm512_permutex2var_epi32(r3, tbl0, r7); auto s4 = _mm512_permutex2var_epi32(r0, tbl1, r4); auto s5 = _mm512_permutex2var_epi32(r1, tbl1, r5); auto s6 = _mm512_permutex2var_epi32(r2, tbl1, r6); auto s7 = _mm512_permutex2var_epi32(r3, tbl1, r7); B0 = SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b01000100)); B1 = SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b01000100)); B2 = SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b11101110)); B3 = SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b11101110)); B4 = SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b01000100)); B5 = SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b01000100)); B6 = SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b11101110)); B7 = SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b11101110)); } // NOLINTEND(portability-simd-intrinsics) template void BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SHACAL2_Fwd(const SimdT& A, const SimdT& B, const SimdT& C, SimdT& D, const SimdT& E, const SimdT& F, const SimdT& G, SimdT& H, uint32_t RK) { H += E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK); D += H; H += A.sigma0() + SimdT::majority(A, B, C); } template void BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SHACAL2_Rev(const SimdT& A, const SimdT& B, const SimdT& C, SimdT& D, const SimdT& E, const SimdT& F, const SimdT& G, SimdT& H, uint32_t RK) { H -= A.sigma0() + SimdT::majority(A, B, C); D -= H; H -= E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK); } } // namespace } // namespace SHACAL2_AVX512_F size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const { using namespace SHACAL2_AVX512_F; size_t consumed = 0; while(blocks >= 16) { SIMD_16x32 A = SIMD_16x32::load_be(in + 64 * 0); SIMD_16x32 B = SIMD_16x32::load_be(in + 64 * 1); SIMD_16x32 C = SIMD_16x32::load_be(in + 64 * 2); SIMD_16x32 D = SIMD_16x32::load_be(in + 64 * 3); SIMD_16x32 E = SIMD_16x32::load_be(in + 64 * 4); SIMD_16x32 F = SIMD_16x32::load_be(in + 64 * 5); SIMD_16x32 G = SIMD_16x32::load_be(in + 64 * 6); SIMD_16x32 H = SIMD_16x32::load_be(in + 64 * 7); transpose_in(A, B, C, D, E, F, G, H); for(size_t r = 0; r != 64; r += 8) { SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]); SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]); SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]); SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]); SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]); SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]); SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]); SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]); } transpose_out(A, B, C, D, E, F, G, H); A.store_be(out + 64 * 0); B.store_be(out + 64 * 1); C.store_be(out + 64 * 2); D.store_be(out + 64 * 3); E.store_be(out + 64 * 4); F.store_be(out + 64 * 5); G.store_be(out + 64 * 6); H.store_be(out + 64 * 7); in += 16 * BLOCK_SIZE; out += 16 * BLOCK_SIZE; blocks -= 16; consumed += 16; } while(blocks >= 8) { SIMD_8x32 A = SIMD_8x32::load_be(in + 32 * 0); SIMD_8x32 B = SIMD_8x32::load_be(in + 32 * 1); SIMD_8x32 C = SIMD_8x32::load_be(in + 32 * 2); SIMD_8x32 D = SIMD_8x32::load_be(in + 32 * 3); SIMD_8x32 E = SIMD_8x32::load_be(in + 32 * 4); SIMD_8x32 F = SIMD_8x32::load_be(in + 32 * 5); SIMD_8x32 G = SIMD_8x32::load_be(in + 32 * 6); SIMD_8x32 H = SIMD_8x32::load_be(in + 32 * 7); SIMD_8x32::transpose(A, B, C, D, E, F, G, H); for(size_t r = 0; r != 64; r += 8) { SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]); SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]); SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]); SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]); SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]); SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]); SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]); SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]); } SIMD_8x32::transpose(A, B, C, D, E, F, G, H); A.store_be(out + 32 * 0); B.store_be(out + 32 * 1); C.store_be(out + 32 * 2); D.store_be(out + 32 * 3); E.store_be(out + 32 * 4); F.store_be(out + 32 * 5); G.store_be(out + 32 * 6); H.store_be(out + 32 * 7); in += 8 * BLOCK_SIZE; out += 8 * BLOCK_SIZE; blocks -= 8; consumed += 8; } return consumed; } size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const { using namespace SHACAL2_AVX512_F; size_t consumed = 0; while(blocks >= 16) { SIMD_16x32 A = SIMD_16x32::load_be(in + 64 * 0); SIMD_16x32 B = SIMD_16x32::load_be(in + 64 * 1); SIMD_16x32 C = SIMD_16x32::load_be(in + 64 * 2); SIMD_16x32 D = SIMD_16x32::load_be(in + 64 * 3); SIMD_16x32 E = SIMD_16x32::load_be(in + 64 * 4); SIMD_16x32 F = SIMD_16x32::load_be(in + 64 * 5); SIMD_16x32 G = SIMD_16x32::load_be(in + 64 * 6); SIMD_16x32 H = SIMD_16x32::load_be(in + 64 * 7); transpose_in(A, B, C, D, E, F, G, H); for(size_t r = 0; r != 64; r += 8) { SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]); SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]); SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]); SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]); SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]); SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]); SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]); SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]); } transpose_out(A, B, C, D, E, F, G, H); A.store_be(out + 64 * 0); B.store_be(out + 64 * 1); C.store_be(out + 64 * 2); D.store_be(out + 64 * 3); E.store_be(out + 64 * 4); F.store_be(out + 64 * 5); G.store_be(out + 64 * 6); H.store_be(out + 64 * 7); in += 16 * BLOCK_SIZE; out += 16 * BLOCK_SIZE; blocks -= 16; consumed += 16; } while(blocks >= 8) { SIMD_8x32 A = SIMD_8x32::load_be(in + 32 * 0); SIMD_8x32 B = SIMD_8x32::load_be(in + 32 * 1); SIMD_8x32 C = SIMD_8x32::load_be(in + 32 * 2); SIMD_8x32 D = SIMD_8x32::load_be(in + 32 * 3); SIMD_8x32 E = SIMD_8x32::load_be(in + 32 * 4); SIMD_8x32 F = SIMD_8x32::load_be(in + 32 * 5); SIMD_8x32 G = SIMD_8x32::load_be(in + 32 * 6); SIMD_8x32 H = SIMD_8x32::load_be(in + 32 * 7); SIMD_8x32::transpose(A, B, C, D, E, F, G, H); for(size_t r = 0; r != 64; r += 8) { SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]); SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]); SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]); SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]); SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]); SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]); SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]); SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]); } SIMD_8x32::transpose(A, B, C, D, E, F, G, H); A.store_be(out + 32 * 0); B.store_be(out + 32 * 1); C.store_be(out + 32 * 2); D.store_be(out + 32 * 3); E.store_be(out + 32 * 4); F.store_be(out + 32 * 5); G.store_be(out + 32 * 6); H.store_be(out + 32 * 7); in += 8 * BLOCK_SIZE; out += 8 * BLOCK_SIZE; blocks -= 8; consumed += 8; } return consumed; } } // namespace Botan