/* * (C) 2026 Jack Lloyd * * Botan is released under the Simplified BSD License (see license.txt) */ #include #include #include namespace Botan { namespace { // clang-format off alignas(64) const uint32_t SM3_TJ[64] = { 0x79CC4519, 0xF3988A32, 0xE7311465, 0xCE6228CB, 0x9CC45197, 0x3988A32F, 0x7311465E, 0xE6228CBC, 0xCC451979, 0x988A32F3, 0x311465E7, 0x6228CBCE, 0xC451979C, 0x88A32F39, 0x11465E73, 0x228CBCE6, 0x9D8A7A87, 0x3B14F50F, 0x7629EA1E, 0xEC53D43C, 0xD8A7A879, 0xB14F50F3, 0x629EA1E7, 0xC53D43CE, 0x8A7A879D, 0x14F50F3B, 0x29EA1E76, 0x53D43CEC, 0xA7A879D8, 0x4F50F3B1, 0x9EA1E762, 0x3D43CEC5, 0x7A879D8A, 0xF50F3B14, 0xEA1E7629, 0xD43CEC53, 0xA879D8A7, 0x50F3B14F, 0xA1E7629E, 0x43CEC53D, 0x879D8A7A, 0x0F3B14F5, 0x1E7629EA, 0x3CEC53D4, 0x79D8A7A8, 0xF3B14F50, 0xE7629EA1, 0xCEC53D43, 0x9D8A7A87, 0x3B14F50F, 0x7629EA1E, 0xEC53D43C, 0xD8A7A879, 0xB14F50F3, 0x629EA1E7, 0xC53D43CE, 0x8A7A879D, 0x14F50F3B, 0x29EA1E76, 0x53D43CEC, 0xA7A879D8, 0x4F50F3B1, 0x9EA1E762, 0x3D43CEC5, }; // clang-format on // The SM3 instructions expect the state words in reverse order (why??) BOTAN_FORCE_INLINE BOTAN_FN_ISA_SM3 uint32x4_t sm3_reverse_words(uint32x4_t v) { v = vrev64q_u32(v); return vextq_u32(v, v, 2); } BOTAN_FORCE_INLINE BOTAN_FN_ISA_SM3 uint32x4_t sm3_tj(size_t round) { // vsm3ss1q expects the constant to be in the top word return vsetq_lane_u32(SM3_TJ[round], vdupq_n_u32(0), 3); } BOTAN_FORCE_INLINE BOTAN_FN_ISA_SM3 void sm3_x4_r1( uint32x4_t& S0, uint32x4_t& S1, uint32x4_t w, uint32x4_t w_prime, size_t round) { auto t = vsm3ss1q_u32(S0, S1, sm3_tj(round)); S0 = vsm3tt1aq_u32(S0, t, w_prime, 0); S1 = vsm3tt2aq_u32(S1, t, w, 0); t = vsm3ss1q_u32(S0, S1, sm3_tj(round + 1)); S0 = vsm3tt1aq_u32(S0, t, w_prime, 1); S1 = vsm3tt2aq_u32(S1, t, w, 1); t = vsm3ss1q_u32(S0, S1, sm3_tj(round + 2)); S0 = vsm3tt1aq_u32(S0, t, w_prime, 2); S1 = vsm3tt2aq_u32(S1, t, w, 2); t = vsm3ss1q_u32(S0, S1, sm3_tj(round + 3)); S0 = vsm3tt1aq_u32(S0, t, w_prime, 3); S1 = vsm3tt2aq_u32(S1, t, w, 3); } BOTAN_FORCE_INLINE BOTAN_FN_ISA_SM3 void sm3_x4_r2( uint32x4_t& S0, uint32x4_t& S1, uint32x4_t w, uint32x4_t w_prime, size_t round) { auto t = vsm3ss1q_u32(S0, S1, sm3_tj(round)); S0 = vsm3tt1bq_u32(S0, t, w_prime, 0); S1 = vsm3tt2bq_u32(S1, t, w, 0); t = vsm3ss1q_u32(S0, S1, sm3_tj(round + 1)); S0 = vsm3tt1bq_u32(S0, t, w_prime, 1); S1 = vsm3tt2bq_u32(S1, t, w, 1); t = vsm3ss1q_u32(S0, S1, sm3_tj(round + 2)); S0 = vsm3tt1bq_u32(S0, t, w_prime, 2); S1 = vsm3tt2bq_u32(S1, t, w, 2); t = vsm3ss1q_u32(S0, S1, sm3_tj(round + 3)); S0 = vsm3tt1bq_u32(S0, t, w_prime, 3); S1 = vsm3tt2bq_u32(S1, t, w, 3); } BOTAN_FORCE_INLINE BOTAN_FN_ISA_SM3 void sm3_msg_expand(uint32x4_t& w0, const uint32x4_t& w1, const uint32x4_t& w2, const uint32x4_t& w3) { const uint32x4_t w7_10 = vextq_u32(w1, w2, 3); const uint32x4_t w3_6 = vextq_u32(w0, w1, 3); const uint32x4_t w10_13 = vextq_u32(w2, w3, 2); uint32x4_t t = vsm3partw1q_u32(w0, w7_10, w3); w0 = vsm3partw2q_u32(t, w10_13, w3_6); } } // namespace void BOTAN_FN_ISA_SM3 SM3::compress_digest_armv8(digest_type& digest, std::span input, size_t blocks) { uint32x4_t S0 = sm3_reverse_words(vld1q_u32(&digest[0])); // NOLINT(*-container-data-pointer) uint32x4_t S1 = sm3_reverse_words(vld1q_u32(&digest[4])); const uint8_t* data = input.data(); while(blocks > 0) { const uint32x4_t S0_save = S0; const uint32x4_t S1_save = S1; uint32x4_t W0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data))); uint32x4_t W1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 16))); uint32x4_t W2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 32))); uint32x4_t W3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 48))); data += block_bytes; blocks -= 1; sm3_x4_r1(S0, S1, W0, veorq_u32(W0, W1), 0); sm3_msg_expand(W0, W1, W2, W3); sm3_x4_r1(S0, S1, W1, veorq_u32(W1, W2), 4); sm3_msg_expand(W1, W2, W3, W0); sm3_x4_r1(S0, S1, W2, veorq_u32(W2, W3), 8); sm3_msg_expand(W2, W3, W0, W1); sm3_x4_r1(S0, S1, W3, veorq_u32(W3, W0), 12); sm3_msg_expand(W3, W0, W1, W2); sm3_x4_r2(S0, S1, W0, veorq_u32(W0, W1), 16); sm3_msg_expand(W0, W1, W2, W3); sm3_x4_r2(S0, S1, W1, veorq_u32(W1, W2), 20); sm3_msg_expand(W1, W2, W3, W0); sm3_x4_r2(S0, S1, W2, veorq_u32(W2, W3), 24); sm3_msg_expand(W2, W3, W0, W1); sm3_x4_r2(S0, S1, W3, veorq_u32(W3, W0), 28); sm3_msg_expand(W3, W0, W1, W2); sm3_x4_r2(S0, S1, W0, veorq_u32(W0, W1), 32); sm3_msg_expand(W0, W1, W2, W3); sm3_x4_r2(S0, S1, W1, veorq_u32(W1, W2), 36); sm3_msg_expand(W1, W2, W3, W0); sm3_x4_r2(S0, S1, W2, veorq_u32(W2, W3), 40); sm3_msg_expand(W2, W3, W0, W1); sm3_x4_r2(S0, S1, W3, veorq_u32(W3, W0), 44); sm3_msg_expand(W3, W0, W1, W2); sm3_x4_r2(S0, S1, W0, veorq_u32(W0, W1), 48); sm3_msg_expand(W0, W1, W2, W3); sm3_x4_r2(S0, S1, W1, veorq_u32(W1, W2), 52); sm3_x4_r2(S0, S1, W2, veorq_u32(W2, W3), 56); sm3_x4_r2(S0, S1, W3, veorq_u32(W3, W0), 60); S0 = veorq_u32(S0, S0_save); S1 = veorq_u32(S1, S1_save); } vst1q_u32(&digest[0], sm3_reverse_words(S0)); // NOLINT(*-container-data-pointer) vst1q_u32(&digest[4], sm3_reverse_words(S1)); } } // namespace Botan