/* * Copyright (c) 2025, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_ #define AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "third_party/highway/hwy/highway.h" #include "aom_dsp/txfm_common.h" #include "av1/common/av1_txfm.h" #include "av1/common/enums.h" #include "av1/encoder/av1_fwd_txfm1d.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" #define FOR_EACH_TXFM2D(X, suffix) \ X(4, 4, suffix) \ X(8, 8, suffix) \ X(16, 16, suffix) \ X(32, 32, suffix) \ X(64, 64, suffix) \ X(4, 8, suffix) \ X(8, 4, suffix) \ X(8, 16, suffix) \ X(16, 8, suffix) \ X(16, 32, suffix) \ X(32, 16, suffix) \ X(32, 64, suffix) \ X(64, 32, suffix) \ X(4, 16, suffix) \ X(16, 4, suffix) \ X(8, 32, suffix) \ X(32, 8, suffix) \ X(16, 64, suffix) \ X(64, 16, suffix) #if HWY_CXX_LANG >= 201703L #define CONSTEXPR_IF constexpr #else #define CONSTEXPR_IF #endif HWY_BEFORE_NAMESPACE(); namespace { namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; constexpr int8_t kForwardTransformShift[TX_SIZES_ALL][3] = { { 2, 0, 0 }, // { 2, -1, 0 }, // { 2, -2, 0 }, // { 2, -4, 0 }, // { 0, -2, -2 }, // { 2, -1, 0 }, // { 2, -1, 0 }, // { 2, -2, 0 }, // { 2, -2, 0 }, // { 2, -4, 0 }, // { 2, -4, 0 }, // { 0, -2, -2 }, // { 2, -4, -2 }, // { 2, -1, 0 }, // { 2, -1, 0 }, // { 2, -2, 0 }, // { 2, -2, 0 }, // { 0, -2, 0 }, // { 2, -4, 0 }, // }; constexpr int kTxSizeWideLog2[TX_SIZES_ALL] = { 2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6, }; // 
Transform block height in log2 constexpr int kTxSizeHighLog2[TX_SIZES_ALL] = { 2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4, }; constexpr bool kApplyRectScaleList[TX_SIZES_ALL] = { false, false, false, false, false, true, true, true, true, true, true, true, true, false, false, false, false, false, false, }; constexpr int8_t kForwardCosBitCol[MAX_TXWH_IDX /*txw_idx*/] [MAX_TXWH_IDX /*txh_idx*/] = { { 13, 13, 13, 0, 0 }, { 13, 13, 13, 12, 0 }, { 13, 13, 13, 12, 13 }, { 0, 13, 13, 12, 13 }, { 0, 0, 13, 12, 13 } }; constexpr int8_t kForwardCosBitRow[MAX_TXWH_IDX /*txw_idx*/] [MAX_TXWH_IDX /*txh_idx*/] = { { 13, 13, 12, 0, 0 }, { 13, 13, 13, 12, 0 }, { 13, 13, 12, 13, 12 }, { 0, 12, 13, 12, 11 }, { 0, 0, 12, 11, 10 } }; // Transform block width in pixels constexpr int8_t kTxSizeWide[TX_SIZES_ALL] = { 4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64, }; // Transform block height in pixels constexpr int8_t kTxSizeHigh[TX_SIZES_ALL] = { 4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16, }; constexpr int GetTxwIndex(TX_SIZE tx_size) { return kTxSizeWideLog2[tx_size] - kTxSizeWideLog2[0]; } constexpr int GetTxhIndex(TX_SIZE tx_size) { return kTxSizeHighLog2[tx_size] - kTxSizeHighLog2[0]; } template HWY_ATTR HWY_INLINE hn::VFromD SetPair(D int_tag, int a, int b) { return hn::BitCast( int_tag, hn::Set(hn::RepartitionToWide(), static_cast( static_cast(a) | (static_cast(static_cast(b)) << 16)))); } template struct ButterflyTraits {}; template <> struct ButterflyTraits<2> { template HWY_ATTR HWY_INLINE static void Whole( D int_tag, int w0, int w1, const hn::TFromD *HWY_RESTRICT in0, const hn::TFromD *HWY_RESTRICT in1, hn::TFromD *HWY_RESTRICT out0, hn::TFromD *HWY_RESTRICT out1, int bit, hn::VFromD> round) { constexpr hn::RepartitionToWide int32_tag; const auto ww0 = SetPair(int_tag, w0, w1); const auto ww1 = SetPair(int_tag, w1, -w0); const auto i0 = hn::Load(int_tag, in0); const auto i1 = hn::Load(int_tag, in1); const auto t0 = 
hn::InterleaveLower(int_tag, i0, i1); const auto t1 = hn::InterleaveUpper(int_tag, i0, i1); const auto u0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww0); const auto u1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww0); const auto v0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww1); const auto v1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww1); const auto c0 = hn::ShiftRightSame(hn::Add(u0, round), bit); const auto c1 = hn::ShiftRightSame(hn::Add(u1, round), bit); const auto d0 = hn::ShiftRightSame(hn::Add(v0, round), bit); const auto d1 = hn::ShiftRightSame(hn::Add(v1, round), bit); hn::Store(hn::ReorderDemote2To(int_tag, c0, c1), int_tag, out0); hn::Store(hn::ReorderDemote2To(int_tag, d0, d1), int_tag, out1); } template HWY_ATTR HWY_INLINE static void Half( D int_tag, int w0, int w1, const hn::TFromD *HWY_RESTRICT in0, const hn::TFromD *HWY_RESTRICT in1, hn::TFromD *HWY_RESTRICT out, int bit, hn::VFromD> round) { constexpr hn::RepartitionToWide int32_tag; const auto i0 = hn::Load(int_tag, in0); const auto i1 = hn::Load(int_tag, in1); const auto t0 = hn::InterleaveLower(int_tag, i0, i1); const auto t1 = hn::InterleaveUpper(int_tag, i0, i1); const auto ww0 = SetPair(int_tag, w0, w1); const auto u0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww0); const auto u1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww0); const auto c0 = hn::ShiftRightSame(hn::Add(u0, round), bit); const auto c1 = hn::ShiftRightSame(hn::Add(u1, round), bit); hn::Store(hn::ReorderDemote2To(int_tag, c0, c1), int_tag, out); } }; template <> struct ButterflyTraits<4> { template HWY_ATTR HWY_INLINE static void Whole( D int_tag, int w0, int w1, const hn::TFromD *HWY_RESTRICT in0, const hn::TFromD *HWY_RESTRICT in1, hn::TFromD *HWY_RESTRICT out0, hn::TFromD *HWY_RESTRICT out1, int bit, hn::VFromD> round) { const auto i0 = hn::Load(int_tag, in0); const auto i1 = hn::Load(int_tag, in1); const auto ww0 = hn::Set(int_tag, w0); const auto ww1 = hn::Set(int_tag, w1); const auto in1_w1 = hn::Mul(i1, ww1); const auto o0 = 
hn::MulAdd(i0, ww0, in1_w1); hn::Store(hn::ShiftRightSame(hn::Add(o0, round), bit), int_tag, out0); const auto in1_w0 = hn::Mul(i1, ww0); const auto o1 = hn::MulSub(i0, ww1, in1_w0); hn::Store(hn::ShiftRightSame(hn::Add(o1, round), bit), int_tag, out1); } template HWY_ATTR HWY_INLINE static void Half( D int_tag, int w0, int w1, const hn::TFromD *HWY_RESTRICT in0, const hn::TFromD *HWY_RESTRICT in1, hn::TFromD *HWY_RESTRICT out, int bit, hn::VFromD> round) { const auto i0 = hn::Load(int_tag, in0); const auto i1 = hn::Load(int_tag, in1); const auto ww0 = hn::Set(int_tag, w0); const auto ww1 = hn::Set(int_tag, w1); const auto in1_w1 = hn::Mul(i1, ww1); const auto o0 = hn::MulAdd(i0, ww0, in1_w1); hn::Store(hn::ShiftRightSame(hn::Add(o0, round), bit), int_tag, out); } }; template HWY_ATTR HWY_INLINE void Butterfly( D int_tag, int w0, int w1, const hn::TFromD *HWY_RESTRICT in0, const hn::TFromD *HWY_RESTRICT in1, hn::TFromD *HWY_RESTRICT out0, hn::TFromD *HWY_RESTRICT out1, int bit, hn::VFromD> round) { ButterflyTraits)>::Whole(int_tag, w0, w1, in0, in1, out0, out1, bit, round); } template HWY_ATTR HWY_INLINE void HalfButterfly( D int_tag, int w0, int w1, const hn::TFromD *HWY_RESTRICT in0, const hn::TFromD *HWY_RESTRICT in1, hn::TFromD *HWY_RESTRICT out, int bit, hn::VFromD> round) { ButterflyTraits)>::Half(int_tag, w0, w1, in0, in1, out, bit, round); } template HWY_ATTR HWY_INLINE void AddSub(D int_tag, const hn::TFromD *in0, const hn::TFromD *in1, hn::TFromD *out_add, hn::TFromD *out_sub) { const auto i0 = hn::Load(int_tag, in0); const auto i1 = hn::Load(int_tag, in1); if CONSTEXPR_IF (sizeof(hn::TFromD) == 2) { hn::Store(hn::SaturatedAdd(i0, i1), int_tag, out_add); hn::Store(hn::SaturatedSub(i0, i1), int_tag, out_sub); } else { hn::Store(hn::Add(i0, i1), int_tag, out_add); hn::Store(hn::Sub(i0, i1), int_tag, out_sub); } } template struct Fdct4Traits { template HWY_ATTR HWY_INLINE static void Fdct4(D int_tag, hn::TFromD *HWY_RESTRICT in, const int8_t cos_bit, size_t 
instride) { using T = hn::TFromD; constexpr size_t kNumLanes = hn::MaxLanes(int_tag); HWY_ALIGN_MAX T buf0[4 * kNumLanes]; const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); constexpr hn::Repartition int32_tag; const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1)); AddSub(int_tag, &in[0 * instride], &in[3 * instride], &buf0[0 * kNumLanes], &buf0[3 * kNumLanes]); AddSub(int_tag, &in[1 * instride], &in[2 * instride], &buf0[1 * kNumLanes], &buf0[2 * kNumLanes]); Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes], &buf0[1 * kNumLanes], &in[0 * instride], &in[2 * instride], cos_bit, round); Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes], &buf0[2 * kNumLanes], &in[1 * instride], &in[3 * instride], cos_bit, round); } }; template <> struct Fdct4Traits<2, 4> { template HWY_ATTR HWY_INLINE static void Fdct4(D int_tag, hn::TFromD *HWY_RESTRICT in, const int8_t cos_bit, size_t instride) { const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); constexpr hn::FixedTag, 8> demote_tag; constexpr hn::Repartition int32_tag; const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1)); const auto cospi_p32_p32 = SetPair(int_tag, cospi[32], cospi[32]); const auto cospi_p32_m32 = SetPair(int_tag, cospi[32], -cospi[32]); const auto cospi_p16_p48 = SetPair(int_tag, cospi[16], cospi[48]); const auto cospi_p48_m16 = SetPair(int_tag, cospi[48], -cospi[16]); const auto i0 = hn::Load(int_tag, &in[0 * instride]); const auto i1 = hn::Load(int_tag, &in[1 * instride]); const auto i2 = hn::Load(int_tag, &in[2 * instride]); const auto i3 = hn::Load(int_tag, &in[3 * instride]); const auto u0 = hn::InterleaveLower(int_tag, i0, i1); const auto u1 = hn::InterleaveLower(int_tag, i3, i2); const auto v0 = hn::Add(u0, u1); const auto v1 = hn::Sub(u0, u1); const auto x0 = hn::WidenMulPairwiseAdd(int32_tag, v0, cospi_p32_p32); const auto x1 = hn::WidenMulPairwiseAdd(int32_tag, v0, cospi_p32_m32); const auto x2 = hn::WidenMulPairwiseAdd(int32_tag, v1, 
cospi_p16_p48); const auto x3 = hn::WidenMulPairwiseAdd(int32_tag, v1, cospi_p48_m16); const auto v0w0 = hn::ShiftRightSame(hn::Add(x0, round), cos_bit); const auto v0w1 = hn::ShiftRightSame(hn::Add(x1, round), cos_bit); const auto v1w0 = hn::ShiftRightSame(hn::Add(x2, round), cos_bit); const auto v1w1 = hn::ShiftRightSame(hn::Add(x3, round), cos_bit); const auto o0 = hn::ReorderDemote2To(demote_tag, v0w0, v0w1); const auto o1 = hn::ReorderDemote2To(demote_tag, v1w0, v1w1); hn::Store(o0, demote_tag, &in[0 * instride]); hn::Store(o1, demote_tag, &in[1 * instride]); hn::Store(hn::ShiftRightLanes<4>(demote_tag, o0), demote_tag, &in[2 * instride]); hn::Store(hn::ShiftRightLanes<4>(demote_tag, o1), demote_tag, &in[3 * instride]); } }; template HWY_ATTR HWY_INLINE void Fdct4(D int_tag, hn::TFromD *HWY_RESTRICT in, const int8_t cos_bit, size_t instride) { Fdct4Traits), hn::MaxLanes(int_tag)>::Fdct4( int_tag, in, cos_bit, instride); } template HWY_ATTR HWY_INLINE void Fdct8(D int_tag, hn::TFromD *HWY_RESTRICT in, const int8_t cos_bit, size_t instride) { constexpr size_t kNumLanes = hn::MaxLanes(int_tag); HWY_ALIGN_MAX hn::TFromD buf0[8 * kNumLanes]; HWY_ALIGN_MAX hn::TFromD buf1[8 * kNumLanes]; const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); const auto round = hn::Set(hn::Repartition(), 1 << (cos_bit - 1)); // Even 8 points 0, 2, ..., 14 // stage 0 // stage 1 // buf0/buf1 AddSub(int_tag, &in[0 * instride], &in[7 * instride], &buf0[0 * kNumLanes], &buf1[7 * kNumLanes]); // buf0/buf0 AddSub(int_tag, &in[1 * instride], &in[6 * instride], &buf0[1 * kNumLanes], &buf0[6 * kNumLanes]); // buf0/buf0 AddSub(int_tag, &in[2 * instride], &in[5 * instride], &buf0[2 * kNumLanes], &buf0[5 * kNumLanes]); // buf0/buf1 AddSub(int_tag, &in[3 * instride], &in[4 * instride], &buf0[3 * kNumLanes], &buf1[4 * kNumLanes]); // stage 2 for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes], &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes]); } 
Butterfly(int_tag, -cospi[32], cospi[32], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], &buf1[5 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round); // stage 3 // type 0 Butterfly(int_tag, cospi[32], cospi[32], &buf1[0 * kNumLanes], &buf1[1 * kNumLanes], &in[0 * instride], &in[4 * instride], cos_bit, round); // type 1 Butterfly(int_tag, cospi[16], cospi[48], &buf1[3 * kNumLanes], &buf1[2 * kNumLanes], &in[2 * instride], &in[6 * instride], cos_bit, round); AddSub(int_tag, &buf1[4 * kNumLanes], &buf1[5 * kNumLanes], &buf0[4 * kNumLanes], &buf0[5 * kNumLanes]); AddSub(int_tag, &buf1[7 * kNumLanes], &buf1[6 * kNumLanes], &buf0[7 * kNumLanes], &buf0[6 * kNumLanes]); // stage 4 // stage 5 Butterfly(int_tag, cospi[8], cospi[56], &buf0[7 * kNumLanes], &buf0[4 * kNumLanes], &in[1 * instride], &in[7 * instride], cos_bit, round); Butterfly(int_tag, cospi[40], cospi[24], &buf0[6 * kNumLanes], &buf0[5 * kNumLanes], &in[5 * instride], &in[3 * instride], cos_bit, round); } template HWY_ATTR HWY_INLINE void Fdct16(D int_tag, hn::TFromD *HWY_RESTRICT in, const int8_t cos_bit, size_t instride) { constexpr size_t kNumLanes = hn::MaxLanes(int_tag); HWY_ALIGN_MAX hn::TFromD buf0[16 * kNumLanes]; HWY_ALIGN_MAX hn::TFromD buf1[16 * kNumLanes]; const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); const auto round = hn::Set(hn::Repartition(), 1 << (cos_bit - 1)); // Calculate the column 0, 1, 2, 3 // stage 0 // stage 1 for (size_t i = 0; i < 8; ++i) { AddSub(int_tag, &in[i * instride], &in[(15 - i) * instride], &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]); } // stage 2 for (size_t i = 0; i < 4; ++i) { AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(7 - i) * kNumLanes], &buf1[i * kNumLanes], &buf1[(7 - i) * kNumLanes]); } Butterfly(int_tag, -cospi[32], cospi[32], &buf0[10 * kNumLanes], &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[32], cospi[32], &buf0[11 * kNumLanes], &buf0[12 * kNumLanes], &buf1[11 * kNumLanes], 
&buf1[12 * kNumLanes], cos_bit, round); // stage 3 for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes], &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes]); } Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes], &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], cos_bit, round); for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes], &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes], &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]); } // stage 4 Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes], &buf0[1 * kNumLanes], &in[0 * instride], &in[8 * instride], cos_bit, round); Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes], &buf0[2 * kNumLanes], &in[4 * instride], &in[12 * instride], cos_bit, round); AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]); AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes], &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]); Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes], &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes], &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); // stage 5 Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes], &buf1[4 * kNumLanes], &in[2 * instride], &in[14 * instride], cos_bit, round); Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes], &buf1[5 * kNumLanes], &in[10 * instride], &in[6 * instride], cos_bit, round); AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes], &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]); AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]); 
AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes], &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]); AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]); // stage 6 Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes], &buf0[8 * kNumLanes], &in[1 * instride], &in[15 * instride], cos_bit, round); Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes], &buf0[9 * kNumLanes], &in[9 * instride], &in[7 * instride], cos_bit, round); Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes], &buf0[10 * kNumLanes], &in[5 * instride], &in[11 * instride], cos_bit, round); Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes], &buf0[11 * kNumLanes], &in[13 * instride], &in[3 * instride], cos_bit, round); } template HWY_ATTR HWY_INLINE void Fdct32(D int_tag, hn::TFromD *HWY_RESTRICT in, const int8_t cos_bit, size_t instride) { constexpr size_t kNumLanes = hn::MaxLanes(int_tag); HWY_ALIGN_MAX hn::TFromD buf0[32 * kNumLanes]; HWY_ALIGN_MAX hn::TFromD buf1[32 * kNumLanes]; const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); const auto round = hn::Set(hn::Repartition(), 1 << (cos_bit - 1)); // stage 0 // stage 1 for (size_t i = 0; i < 16; ++i) { AddSub(int_tag, &in[i * instride], &in[(31 - i) * instride], &buf1[i * kNumLanes], &buf1[(31 - i) * kNumLanes]); } // stage 2 for (size_t i = 0; i < 8; ++i) { AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(15 - i) * kNumLanes], &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]); } Butterfly(int_tag, -cospi[32], cospi[32], &buf1[20 * kNumLanes], &buf1[27 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[32], cospi[32], &buf1[21 * kNumLanes], &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[32], cospi[32], &buf1[22 * kNumLanes], &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], cos_bit, round); 
Butterfly(int_tag, -cospi[32], cospi[32], &buf1[23 * kNumLanes], &buf1[24 * kNumLanes], &buf0[23 * kNumLanes], &buf0[24 * kNumLanes], cos_bit, round); // stage 3 for (size_t i = 0; i < 4; ++i) { AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(7 - i) * kNumLanes], &buf1[i * kNumLanes], &buf1[(7 - i) * kNumLanes]); } Butterfly(int_tag, -cospi[32], cospi[32], &buf0[10 * kNumLanes], &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[32], cospi[32], &buf0[11 * kNumLanes], &buf0[12 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round); for (size_t i = 0; i < 4; ++i) { AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(23 - i) * kNumLanes], &buf1[(16 + i) * kNumLanes], &buf1[(23 - i) * kNumLanes]); } for (size_t i = 0; i < 4; ++i) { AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(24 + i) * kNumLanes], &buf1[(31 - i) * kNumLanes], &buf1[(24 + i) * kNumLanes]); } // stage 4 for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes], &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes]); } Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes], &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], cos_bit, round); for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes], &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes], &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]); } Butterfly(int_tag, -cospi[16], cospi[48], &buf1[18 * kNumLanes], &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[16], cospi[48], &buf1[19 * kNumLanes], &buf1[28 * kNumLanes], &buf0[19 * kNumLanes], &buf0[28 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[20 * kNumLanes], &buf1[27 * kNumLanes], &buf0[20 * 
kNumLanes], &buf0[27 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[21 * kNumLanes], &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], cos_bit, round); // stage 5 Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes], &buf0[1 * kNumLanes], &in[0 * instride], &in[16 * instride], cos_bit, round); Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes], &buf0[2 * kNumLanes], &in[8 * instride], &in[24 * instride], cos_bit, round); AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]); AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes], &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]); Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes], &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes], &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[19 * kNumLanes], &buf1[16 * kNumLanes], &buf1[19 * kNumLanes]); AddSub(int_tag, &buf1[17 * kNumLanes], &buf0[18 * kNumLanes], &buf1[17 * kNumLanes], &buf1[18 * kNumLanes]); AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[20 * kNumLanes], &buf1[23 * kNumLanes], &buf1[20 * kNumLanes]); AddSub(int_tag, &buf1[22 * kNumLanes], &buf0[21 * kNumLanes], &buf1[22 * kNumLanes], &buf1[21 * kNumLanes]); AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[27 * kNumLanes], &buf1[24 * kNumLanes], &buf1[27 * kNumLanes]); AddSub(int_tag, &buf1[25 * kNumLanes], &buf0[26 * kNumLanes], &buf1[25 * kNumLanes], &buf1[26 * kNumLanes]); AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[28 * kNumLanes], &buf1[31 * kNumLanes], &buf1[28 * kNumLanes]); AddSub(int_tag, &buf1[30 * kNumLanes], &buf0[29 * kNumLanes], &buf1[30 * kNumLanes], &buf1[29 * kNumLanes]); // stage 6 Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes], &buf1[4 * kNumLanes], &in[4 * instride], 
&in[28 * instride], cos_bit, round); Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes], &buf1[5 * kNumLanes], &in[20 * instride], &in[12 * instride], cos_bit, round); AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes], &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]); AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]); AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes], &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]); AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]); Butterfly(int_tag, -cospi[8], cospi[56], &buf1[17 * kNumLanes], &buf1[30 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[56], -cospi[8], &buf1[18 * kNumLanes], &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[40], cospi[24], &buf1[21 * kNumLanes], &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[24], -cospi[40], &buf1[22 * kNumLanes], &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], cos_bit, round); // stage 7 Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes], &buf0[8 * kNumLanes], &in[2 * instride], &in[30 * instride], cos_bit, round); Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes], &buf0[9 * kNumLanes], &in[18 * instride], &in[14 * instride], cos_bit, round); Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes], &buf0[10 * kNumLanes], &in[10 * instride], &in[22 * instride], cos_bit, round); Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes], &buf0[11 * kNumLanes], &in[26 * instride], &in[6 * instride], cos_bit, round); AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[17 * kNumLanes], &buf1[16 * kNumLanes], &buf1[17 * kNumLanes]); AddSub(int_tag, &buf1[19 * kNumLanes], &buf0[18 * kNumLanes], &buf1[19 * kNumLanes], &buf1[18 
* kNumLanes]); AddSub(int_tag, &buf1[20 * kNumLanes], &buf0[21 * kNumLanes], &buf1[20 * kNumLanes], &buf1[21 * kNumLanes]); AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[22 * kNumLanes], &buf1[23 * kNumLanes], &buf1[22 * kNumLanes]); AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[25 * kNumLanes], &buf1[24 * kNumLanes], &buf1[25 * kNumLanes]); AddSub(int_tag, &buf1[27 * kNumLanes], &buf0[26 * kNumLanes], &buf1[27 * kNumLanes], &buf1[26 * kNumLanes]); AddSub(int_tag, &buf1[28 * kNumLanes], &buf0[29 * kNumLanes], &buf1[28 * kNumLanes], &buf1[29 * kNumLanes]); AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[30 * kNumLanes], &buf1[31 * kNumLanes], &buf1[30 * kNumLanes]); // stage 8 & 9 Butterfly(int_tag, cospi[2], cospi[62], &buf1[31 * kNumLanes], &buf1[16 * kNumLanes], &in[1 * instride], &in[31 * instride], cos_bit, round); Butterfly(int_tag, cospi[34], cospi[30], &buf1[30 * kNumLanes], &buf1[17 * kNumLanes], &in[17 * instride], &in[15 * instride], cos_bit, round); Butterfly(int_tag, cospi[18], cospi[46], &buf1[29 * kNumLanes], &buf1[18 * kNumLanes], &in[9 * instride], &in[23 * instride], cos_bit, round); Butterfly(int_tag, cospi[50], cospi[14], &buf1[28 * kNumLanes], &buf1[19 * kNumLanes], &in[25 * instride], &in[7 * instride], cos_bit, round); Butterfly(int_tag, cospi[10], cospi[54], &buf1[27 * kNumLanes], &buf1[20 * kNumLanes], &in[5 * instride], &in[27 * instride], cos_bit, round); Butterfly(int_tag, cospi[42], cospi[22], &buf1[26 * kNumLanes], &buf1[21 * kNumLanes], &in[21 * instride], &in[11 * instride], cos_bit, round); Butterfly(int_tag, cospi[26], cospi[38], &buf1[25 * kNumLanes], &buf1[22 * kNumLanes], &in[13 * instride], &in[19 * instride], cos_bit, round); Butterfly(int_tag, cospi[58], cospi[6], &buf1[24 * kNumLanes], &buf1[23 * kNumLanes], &in[29 * instride], &in[3 * instride], cos_bit, round); // stage 9 was fused with prior stages. 
} template HWY_ATTR HWY_NOINLINE void Fdct64(D int_tag, hn::TFromD *HWY_RESTRICT in, const int8_t cos_bit) { constexpr size_t kNumLanes = hn::MaxLanes(int_tag); constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD); HWY_ALIGN_MAX hn::TFromD buf0[64 * kNumLanes]; HWY_ALIGN_MAX hn::TFromD buf1[64 * kNumLanes]; const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); const auto round = hn::Set(hn::Repartition(), 1 << (cos_bit - 1)); // stage 1 #if HWY_TARGET == HWY_SSE4 // For whatever reason, some compilers don't unroll this when building for // SSE4; help them along. HWY_UNROLL(32) #endif for (size_t i = 0; i < 32; ++i) { AddSub(int_tag, &in[i * InStride], &in[(63 - i) * InStride], &buf0[i * kNumLanes], &buf0[(63 - i) * kNumLanes]); } // stage 2 for (size_t i = 0; i < 16; ++i) { AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(31 - i) * kNumLanes], &buf1[i * kNumLanes], &buf1[(31 - i) * kNumLanes]); } for (size_t i = 0; i < 8; ++i) { Butterfly(int_tag, -cospi[32], cospi[32], &buf0[(40 + i) * kNumLanes], &buf0[(55 - i) * kNumLanes], &buf1[(40 + i) * kNumLanes], &buf1[(55 - i) * kNumLanes], cos_bit, round); } // stage 3 for (size_t i = 0; i < 8; ++i) { AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(15 - i) * kNumLanes], &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]); } for (size_t i = 0; i < 4; ++i) { Butterfly(int_tag, -cospi[32], cospi[32], &buf1[(20 + i) * kNumLanes], &buf1[(27 - i) * kNumLanes], &buf0[(20 + i) * kNumLanes], &buf0[(27 - i) * kNumLanes], cos_bit, round); } for (size_t i = 0; i < 8; ++i) { AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(47 - i) * kNumLanes], &buf0[(32 + i) * kNumLanes], &buf0[(47 - i) * kNumLanes]); } for (size_t i = 0; i < 8; ++i) { AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(48 + i) * kNumLanes], &buf0[(63 - i) * kNumLanes], &buf0[(48 + i) * kNumLanes]); } // stage 4 for (size_t i = 0; i < 4; ++i) { AddSub(int_tag, &buf0[(0 + i) * kNumLanes], &buf0[(7 - i) * kNumLanes], &buf1[(0 + i) * kNumLanes], &buf1[(7 - i) 
* kNumLanes]); } for (size_t i = 0; i < 2; ++i) { Butterfly(int_tag, -cospi[32], cospi[32], &buf0[(10 + i) * kNumLanes], &buf0[(13 - i) * kNumLanes], &buf1[(10 + i) * kNumLanes], &buf1[(13 - i) * kNumLanes], cos_bit, round); } for (size_t i = 0; i < 4; ++i) { AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(23 - i) * kNumLanes], &buf1[(16 + i) * kNumLanes], &buf1[(23 - i) * kNumLanes]); } for (size_t i = 0; i < 4; ++i) { AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(24 + i) * kNumLanes], &buf1[(31 - i) * kNumLanes], &buf1[(24 + i) * kNumLanes]); } for (size_t i = 0; i < 4; ++i) { Butterfly(int_tag, -cospi[16], cospi[48], &buf0[(36 + i) * kNumLanes], &buf0[(59 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes], &buf1[(59 - i) * kNumLanes], cos_bit, round); } for (size_t i = 4; i < 8; ++i) { Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[(36 + i) * kNumLanes], &buf0[(59 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes], &buf1[(59 - i) * kNumLanes], cos_bit, round); } // stage 5 for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(3 - i) * kNumLanes], &buf0[(0 + i) * kNumLanes], &buf0[(3 - i) * kNumLanes]); } Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes], &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], cos_bit, round); for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes], &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes], &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { Butterfly(int_tag, -cospi[16], cospi[48], &buf1[(18 + i) * kNumLanes], &buf1[(29 - i) * kNumLanes], &buf0[(18 + i) * kNumLanes], &buf0[(29 - i) * kNumLanes], cos_bit, round); } for (size_t i = 2; i < 4; ++i) { Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[(18 + i) * kNumLanes], &buf1[(29 - i) * kNumLanes], &buf0[(18 + i) 
* kNumLanes], &buf0[(29 - i) * kNumLanes], cos_bit, round); } for (size_t i = 0; i < 4; ++i) { AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(39 - i) * kNumLanes], &buf0[(32 + i) * kNumLanes], &buf0[(39 - i) * kNumLanes]); } for (size_t i = 0; i < 4; ++i) { AddSub(int_tag, &buf0[(47 - i) * kNumLanes], &buf1[(40 + i) * kNumLanes], &buf0[(47 - i) * kNumLanes], &buf0[(40 + i) * kNumLanes]); } for (size_t i = 0; i < 4; ++i) { AddSub(int_tag, &buf0[(48 + i) * kNumLanes], &buf1[(55 - i) * kNumLanes], &buf0[(48 + i) * kNumLanes], &buf0[(55 - i) * kNumLanes]); } for (size_t i = 0; i < 4; ++i) { AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(56 + i) * kNumLanes], &buf0[(63 - i) * kNumLanes], &buf0[(56 + i) * kNumLanes]); } // stage 6 Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes], &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], &buf1[1 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes], &buf0[2 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes], cos_bit, round); AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]); AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes], &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]); Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes], &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes], &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(19 - i) * kNumLanes], &buf1[(16 + i) * kNumLanes], &buf1[(19 - i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf1[(23 - i) * kNumLanes], &buf0[(20 + i) * kNumLanes], &buf1[(23 - i) * kNumLanes], &buf1[(20 + i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf1[(24 + i) * kNumLanes], &buf0[(27 - i) * kNumLanes], 
&buf1[(24 + i) * kNumLanes], &buf1[(27 - i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(28 + i) * kNumLanes], &buf1[(31 - i) * kNumLanes], &buf1[(28 + i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { Butterfly(int_tag, -cospi[8], cospi[56], &buf0[(34 + i) * kNumLanes], &buf0[(61 - i) * kNumLanes], &buf1[(34 + i) * kNumLanes], &buf1[(61 - i) * kNumLanes], cos_bit, round); } for (size_t i = 2; i < 4; ++i) { Butterfly(int_tag, -cospi[56], -cospi[8], &buf0[(34 + i) * kNumLanes], &buf0[(61 - i) * kNumLanes], &buf1[(34 + i) * kNumLanes], &buf1[(61 - i) * kNumLanes], cos_bit, round); } for (size_t i = 0; i < 2; ++i) { Butterfly(int_tag, -cospi[40], cospi[24], &buf0[(42 + i) * kNumLanes], &buf0[(53 - i) * kNumLanes], &buf1[(42 + i) * kNumLanes], &buf1[(53 - i) * kNumLanes], cos_bit, round); } for (size_t i = 2; i < 4; ++i) { Butterfly(int_tag, -cospi[24], -cospi[40], &buf0[(42 + i) * kNumLanes], &buf0[(53 - i) * kNumLanes], &buf1[(42 + i) * kNumLanes], &buf1[(53 - i) * kNumLanes], cos_bit, round); } // stage 7 Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes], &buf1[4 * kNumLanes], &buf0[4 * kNumLanes], &buf0[7 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes], &buf1[5 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], cos_bit, round); AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes], &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]); AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]); AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes], &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]); AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]); Butterfly(int_tag, -cospi[8], cospi[56], &buf1[17 * kNumLanes], &buf1[30 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[56], -cospi[8], 
&buf1[18 * kNumLanes], &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[40], cospi[24], &buf1[21 * kNumLanes], &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[24], -cospi[40], &buf1[22 * kNumLanes], &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], cos_bit, round); for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(35 - i) * kNumLanes], &buf0[(32 + i) * kNumLanes], &buf0[(35 - i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(39 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes], &buf0[(39 - i) * kNumLanes], &buf0[(36 + i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(40 + i) * kNumLanes], &buf1[(43 - i) * kNumLanes], &buf0[(40 + i) * kNumLanes], &buf0[(43 - i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(47 - i) * kNumLanes], &buf1[(44 + i) * kNumLanes], &buf0[(47 - i) * kNumLanes], &buf0[(44 + i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(48 + i) * kNumLanes], &buf1[(51 - i) * kNumLanes], &buf0[(48 + i) * kNumLanes], &buf0[(51 - i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(55 - i) * kNumLanes], &buf1[(52 + i) * kNumLanes], &buf0[(55 - i) * kNumLanes], &buf0[(52 + i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(56 + i) * kNumLanes], &buf1[(59 - i) * kNumLanes], &buf0[(56 + i) * kNumLanes], &buf0[(59 - i) * kNumLanes]); } for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(60 + i) * kNumLanes], &buf0[(63 - i) * kNumLanes], &buf0[(60 + i) * kNumLanes]); } // stage 8 Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes], &buf0[8 * kNumLanes], &buf1[8 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes], &buf0[9 * kNumLanes], 
&buf1[9 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes], &buf0[10 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes], &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round); AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[17 * kNumLanes], &buf1[16 * kNumLanes], &buf1[17 * kNumLanes]); AddSub(int_tag, &buf1[19 * kNumLanes], &buf0[18 * kNumLanes], &buf1[19 * kNumLanes], &buf1[18 * kNumLanes]); AddSub(int_tag, &buf1[20 * kNumLanes], &buf0[21 * kNumLanes], &buf1[20 * kNumLanes], &buf1[21 * kNumLanes]); AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[22 * kNumLanes], &buf1[23 * kNumLanes], &buf1[22 * kNumLanes]); AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[25 * kNumLanes], &buf1[24 * kNumLanes], &buf1[25 * kNumLanes]); AddSub(int_tag, &buf1[27 * kNumLanes], &buf0[26 * kNumLanes], &buf1[27 * kNumLanes], &buf1[26 * kNumLanes]); AddSub(int_tag, &buf1[28 * kNumLanes], &buf0[29 * kNumLanes], &buf1[28 * kNumLanes], &buf1[29 * kNumLanes]); AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[30 * kNumLanes], &buf1[31 * kNumLanes], &buf1[30 * kNumLanes]); Butterfly(int_tag, -cospi[4], cospi[60], &buf0[33 * kNumLanes], &buf0[62 * kNumLanes], &buf1[33 * kNumLanes], &buf1[62 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[60], -cospi[4], &buf0[34 * kNumLanes], &buf0[61 * kNumLanes], &buf1[34 * kNumLanes], &buf1[61 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[36], cospi[28], &buf0[37 * kNumLanes], &buf0[58 * kNumLanes], &buf1[37 * kNumLanes], &buf1[58 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[28], -cospi[36], &buf0[38 * kNumLanes], &buf0[57 * kNumLanes], &buf1[38 * kNumLanes], &buf1[57 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[20], cospi[44], &buf0[41 * kNumLanes], &buf0[54 * kNumLanes], &buf1[41 * kNumLanes], &buf1[54 * kNumLanes], cos_bit, round); 
Butterfly(int_tag, -cospi[44], -cospi[20], &buf0[42 * kNumLanes], &buf0[53 * kNumLanes], &buf1[42 * kNumLanes], &buf1[53 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[52], cospi[12], &buf0[45 * kNumLanes], &buf0[50 * kNumLanes], &buf1[45 * kNumLanes], &buf1[50 * kNumLanes], cos_bit, round); Butterfly(int_tag, -cospi[12], -cospi[52], &buf0[46 * kNumLanes], &buf0[49 * kNumLanes], &buf1[46 * kNumLanes], &buf1[49 * kNumLanes], cos_bit, round); // stage 9 Butterfly(int_tag, cospi[2], cospi[62], &buf1[31 * kNumLanes], &buf1[16 * kNumLanes], &buf0[16 * kNumLanes], &buf0[31 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[34], cospi[30], &buf1[30 * kNumLanes], &buf1[17 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[18], cospi[46], &buf1[29 * kNumLanes], &buf1[18 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[50], cospi[14], &buf1[28 * kNumLanes], &buf1[19 * kNumLanes], &buf0[19 * kNumLanes], &buf0[28 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[10], cospi[54], &buf1[27 * kNumLanes], &buf1[20 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[42], cospi[22], &buf1[26 * kNumLanes], &buf1[21 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[26], cospi[38], &buf1[25 * kNumLanes], &buf1[22 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[58], cospi[6], &buf1[24 * kNumLanes], &buf1[23 * kNumLanes], &buf0[23 * kNumLanes], &buf0[24 * kNumLanes], cos_bit, round); AddSub(int_tag, &buf0[32 * kNumLanes], &buf1[33 * kNumLanes], &buf0[32 * kNumLanes], &buf0[33 * kNumLanes]); AddSub(int_tag, &buf0[35 * kNumLanes], &buf1[34 * kNumLanes], &buf0[35 * kNumLanes], &buf0[34 * kNumLanes]); AddSub(int_tag, &buf0[36 * kNumLanes], &buf1[37 * kNumLanes], &buf0[36 * kNumLanes], &buf0[37 * kNumLanes]); 
AddSub(int_tag, &buf0[39 * kNumLanes], &buf1[38 * kNumLanes], &buf0[39 * kNumLanes], &buf0[38 * kNumLanes]); AddSub(int_tag, &buf0[40 * kNumLanes], &buf1[41 * kNumLanes], &buf0[40 * kNumLanes], &buf0[41 * kNumLanes]); AddSub(int_tag, &buf0[43 * kNumLanes], &buf1[42 * kNumLanes], &buf0[43 * kNumLanes], &buf0[42 * kNumLanes]); AddSub(int_tag, &buf0[44 * kNumLanes], &buf1[45 * kNumLanes], &buf0[44 * kNumLanes], &buf0[45 * kNumLanes]); AddSub(int_tag, &buf0[47 * kNumLanes], &buf1[46 * kNumLanes], &buf0[47 * kNumLanes], &buf0[46 * kNumLanes]); AddSub(int_tag, &buf0[48 * kNumLanes], &buf1[49 * kNumLanes], &buf0[48 * kNumLanes], &buf0[49 * kNumLanes]); AddSub(int_tag, &buf0[51 * kNumLanes], &buf1[50 * kNumLanes], &buf0[51 * kNumLanes], &buf0[50 * kNumLanes]); AddSub(int_tag, &buf0[52 * kNumLanes], &buf1[53 * kNumLanes], &buf0[52 * kNumLanes], &buf0[53 * kNumLanes]); AddSub(int_tag, &buf0[55 * kNumLanes], &buf1[54 * kNumLanes], &buf0[55 * kNumLanes], &buf0[54 * kNumLanes]); AddSub(int_tag, &buf0[56 * kNumLanes], &buf1[57 * kNumLanes], &buf0[56 * kNumLanes], &buf0[57 * kNumLanes]); AddSub(int_tag, &buf0[59 * kNumLanes], &buf1[58 * kNumLanes], &buf0[59 * kNumLanes], &buf0[58 * kNumLanes]); AddSub(int_tag, &buf0[60 * kNumLanes], &buf1[61 * kNumLanes], &buf0[60 * kNumLanes], &buf0[61 * kNumLanes]); AddSub(int_tag, &buf0[63 * kNumLanes], &buf1[62 * kNumLanes], &buf0[63 * kNumLanes], &buf0[62 * kNumLanes]); // stage 10 Butterfly(int_tag, cospi[1], cospi[63], &buf0[63 * kNumLanes], &buf0[32 * kNumLanes], &buf1[32 * kNumLanes], &buf1[63 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[33], cospi[31], &buf0[62 * kNumLanes], &buf0[33 * kNumLanes], &buf1[33 * kNumLanes], &buf1[62 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[17], cospi[47], &buf0[61 * kNumLanes], &buf0[34 * kNumLanes], &buf1[34 * kNumLanes], &buf1[61 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[49], cospi[15], &buf0[60 * kNumLanes], &buf0[35 * kNumLanes], &buf1[35 * kNumLanes], &buf1[60 * 
kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[9], cospi[55], &buf0[59 * kNumLanes], &buf0[36 * kNumLanes], &buf1[36 * kNumLanes], &buf1[59 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[41], cospi[23], &buf0[58 * kNumLanes], &buf0[37 * kNumLanes], &buf1[37 * kNumLanes], &buf1[58 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[25], cospi[39], &buf0[57 * kNumLanes], &buf0[38 * kNumLanes], &buf1[38 * kNumLanes], &buf1[57 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[57], cospi[7], &buf0[56 * kNumLanes], &buf0[39 * kNumLanes], &buf1[39 * kNumLanes], &buf1[56 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[05], cospi[59], &buf0[55 * kNumLanes], &buf0[40 * kNumLanes], &buf1[40 * kNumLanes], &buf1[55 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[37], cospi[27], &buf0[54 * kNumLanes], &buf0[41 * kNumLanes], &buf1[41 * kNumLanes], &buf1[54 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[21], cospi[43], &buf0[53 * kNumLanes], &buf0[42 * kNumLanes], &buf1[42 * kNumLanes], &buf1[53 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[53], cospi[11], &buf0[52 * kNumLanes], &buf0[43 * kNumLanes], &buf1[43 * kNumLanes], &buf1[52 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[13], cospi[51], &buf0[51 * kNumLanes], &buf0[44 * kNumLanes], &buf1[44 * kNumLanes], &buf1[51 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[45], cospi[19], &buf0[50 * kNumLanes], &buf0[45 * kNumLanes], &buf1[45 * kNumLanes], &buf1[50 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[29], cospi[35], &buf0[49 * kNumLanes], &buf0[46 * kNumLanes], &buf1[46 * kNumLanes], &buf1[49 * kNumLanes], cos_bit, round); Butterfly(int_tag, cospi[61], cospi[3], &buf0[48 * kNumLanes], &buf0[47 * kNumLanes], &buf1[47 * kNumLanes], &buf1[48 * kNumLanes], cos_bit, round); // stage 11 hwy::CopyBytes(&buf1[0 * kNumLanes], &in[0 * OutStride]); hwy::CopyBytes(&buf1[63 * kNumLanes], &in[63 * OutStride]); hwy::CopyBytes(&buf1[32 * kNumLanes], &in[1 
* OutStride]); hwy::CopyBytes(&buf1[31 * kNumLanes], &in[62 * OutStride]); hwy::CopyBytes(&buf0[16 * kNumLanes], &in[2 * OutStride]); hwy::CopyBytes(&buf1[47 * kNumLanes], &in[61 * OutStride]); hwy::CopyBytes(&buf1[48 * kNumLanes], &in[3 * OutStride]); hwy::CopyBytes(&buf1[15 * kNumLanes], &in[60 * OutStride]); hwy::CopyBytes(&buf1[8 * kNumLanes], &in[4 * OutStride]); hwy::CopyBytes(&buf1[55 * kNumLanes], &in[59 * OutStride]); hwy::CopyBytes(&buf1[40 * kNumLanes], &in[5 * OutStride]); hwy::CopyBytes(&buf1[23 * kNumLanes], &in[58 * OutStride]); hwy::CopyBytes(&buf0[24 * kNumLanes], &in[6 * OutStride]); hwy::CopyBytes(&buf1[39 * kNumLanes], &in[57 * OutStride]); hwy::CopyBytes(&buf1[56 * kNumLanes], &in[7 * OutStride]); hwy::CopyBytes(&buf1[7 * kNumLanes], &in[56 * OutStride]); hwy::CopyBytes(&buf0[4 * kNumLanes], &in[8 * OutStride]); hwy::CopyBytes(&buf1[59 * kNumLanes], &in[55 * OutStride]); hwy::CopyBytes(&buf1[36 * kNumLanes], &in[9 * OutStride]); hwy::CopyBytes(&buf1[27 * kNumLanes], &in[54 * OutStride]); hwy::CopyBytes(&buf0[20 * kNumLanes], &in[10 * OutStride]); hwy::CopyBytes(&buf1[43 * kNumLanes], &in[53 * OutStride]); hwy::CopyBytes(&buf1[52 * kNumLanes], &in[11 * OutStride]); hwy::CopyBytes(&buf1[11 * kNumLanes], &in[52 * OutStride]); hwy::CopyBytes(&buf1[12 * kNumLanes], &in[12 * OutStride]); hwy::CopyBytes(&buf1[51 * kNumLanes], &in[51 * OutStride]); hwy::CopyBytes(&buf1[44 * kNumLanes], &in[13 * OutStride]); hwy::CopyBytes(&buf1[19 * kNumLanes], &in[50 * OutStride]); hwy::CopyBytes(&buf0[28 * kNumLanes], &in[14 * OutStride]); hwy::CopyBytes(&buf1[35 * kNumLanes], &in[49 * OutStride]); hwy::CopyBytes(&buf1[60 * kNumLanes], &in[15 * OutStride]); hwy::CopyBytes(&buf1[3 * kNumLanes], &in[48 * OutStride]); hwy::CopyBytes(&buf1[2 * kNumLanes], &in[16 * OutStride]); hwy::CopyBytes(&buf1[61 * kNumLanes], &in[47 * OutStride]); hwy::CopyBytes(&buf1[34 * kNumLanes], &in[17 * OutStride]); hwy::CopyBytes(&buf1[29 * kNumLanes], &in[46 * OutStride]); 
hwy::CopyBytes(&buf0[18 * kNumLanes], &in[18 * OutStride]); hwy::CopyBytes(&buf1[45 * kNumLanes], &in[45 * OutStride]); hwy::CopyBytes(&buf1[50 * kNumLanes], &in[19 * OutStride]); hwy::CopyBytes(&buf1[13 * kNumLanes], &in[44 * OutStride]); hwy::CopyBytes(&buf1[10 * kNumLanes], &in[20 * OutStride]); hwy::CopyBytes(&buf1[53 * kNumLanes], &in[43 * OutStride]); hwy::CopyBytes(&buf1[42 * kNumLanes], &in[21 * OutStride]); hwy::CopyBytes(&buf1[21 * kNumLanes], &in[42 * OutStride]); hwy::CopyBytes(&buf0[26 * kNumLanes], &in[22 * OutStride]); hwy::CopyBytes(&buf1[37 * kNumLanes], &in[41 * OutStride]); hwy::CopyBytes(&buf1[58 * kNumLanes], &in[23 * OutStride]); hwy::CopyBytes(&buf1[5 * kNumLanes], &in[40 * OutStride]); hwy::CopyBytes(&buf0[6 * kNumLanes], &in[24 * OutStride]); hwy::CopyBytes(&buf1[57 * kNumLanes], &in[39 * OutStride]); hwy::CopyBytes(&buf1[38 * kNumLanes], &in[25 * OutStride]); hwy::CopyBytes(&buf1[25 * kNumLanes], &in[38 * OutStride]); hwy::CopyBytes(&buf0[22 * kNumLanes], &in[26 * OutStride]); hwy::CopyBytes(&buf1[41 * kNumLanes], &in[37 * OutStride]); hwy::CopyBytes(&buf1[54 * kNumLanes], &in[27 * OutStride]); hwy::CopyBytes(&buf1[9 * kNumLanes], &in[36 * OutStride]); hwy::CopyBytes(&buf1[14 * kNumLanes], &in[28 * OutStride]); hwy::CopyBytes(&buf1[49 * kNumLanes], &in[35 * OutStride]); hwy::CopyBytes(&buf1[46 * kNumLanes], &in[29 * OutStride]); hwy::CopyBytes(&buf1[17 * kNumLanes], &in[34 * OutStride]); hwy::CopyBytes(&buf0[30 * kNumLanes], &in[30 * OutStride]); hwy::CopyBytes(&buf1[33 * kNumLanes], &in[33 * OutStride]); hwy::CopyBytes(&buf1[62 * kNumLanes], &in[31 * OutStride]); hwy::CopyBytes(&buf1[1 * kNumLanes], &in[32 * OutStride]); } template struct Fadst4Traits { template HWY_ATTR HWY_INLINE static void Fadst4(D int_tag, hn::TFromD *HWY_RESTRICT in, const int8_t cos_bit, const size_t instride) { const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit); const auto round = hn::Set(int_tag, 1 << (cos_bit - 1)); const auto sinpi1 = hn::Set(int_tag, 
sinpi[1]); const auto sinpi2 = hn::Set(int_tag, sinpi[2]); const auto sinpi3 = hn::Set(int_tag, sinpi[3]); const auto sinpi4 = hn::Set(int_tag, sinpi[4]); const auto in0 = hn::Load(int_tag, &in[0 * instride]); const auto in1 = hn::Load(int_tag, &in[1 * instride]); const auto in2 = hn::Load(int_tag, &in[2 * instride]); const auto in3 = hn::Load(int_tag, &in[3 * instride]); auto s0 = hn::Mul(in0, sinpi1); auto s1 = hn::Mul(in0, sinpi4); auto s2 = hn::Mul(in1, sinpi2); auto s3 = hn::Mul(in1, sinpi1); auto s4 = hn::Mul(in2, sinpi3); auto s5 = hn::Mul(in3, sinpi4); auto s6 = hn::Mul(in3, sinpi2); auto s7 = hn::Sub(hn::Add(in0, in1), in3); auto x0 = hn::Add(hn::Add(s0, s2), s5); auto x1 = hn::Mul(s7, sinpi3); auto x2 = hn::Add(hn::Sub(s1, s3), s6); auto x3 = s4; s0 = hn::Add(x0, x3); s1 = x1; s2 = hn::Sub(x2, x3); s3 = hn::Add(hn::Sub(x2, x0), x3); auto u0 = hn::Add(s0, round); u0 = hn::ShiftRightSame(u0, cos_bit); auto u1 = hn::Add(s1, round); u1 = hn::ShiftRightSame(u1, cos_bit); auto u2 = hn::Add(s2, round); u2 = hn::ShiftRightSame(u2, cos_bit); auto u3 = hn::Add(s3, round); u3 = hn::ShiftRightSame(u3, cos_bit); hn::Store(u0, int_tag, &in[0 * instride]); hn::Store(u1, int_tag, &in[1 * instride]); hn::Store(u2, int_tag, &in[2 * instride]); hn::Store(u3, int_tag, &in[3 * instride]); } }; template <> struct Fadst4Traits<2, 4> { template HWY_ATTR HWY_INLINE static void Fadst4(D int_tag, hn::TFromD *HWY_RESTRICT in, const int8_t cos_bit, const size_t instride) { (void)int_tag; const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit); constexpr hn::FixedTag, 8> demote_tag; constexpr hn::RepartitionToWide int32_tag; const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1)); const auto sinpi_p01_p02 = SetPair(demote_tag, sinpi[1], sinpi[2]); const auto sinpi_p04_m01 = SetPair(demote_tag, sinpi[4], -sinpi[1]); const auto sinpi_p03_p04 = SetPair(demote_tag, sinpi[3], sinpi[4]); const auto sinpi_m03_p02 = SetPair(demote_tag, -sinpi[3], sinpi[2]); const auto sinpi_p03_p03 = 
hn::Set(demote_tag, sinpi[3]); const auto in0 = hn::Load(demote_tag, &in[0 * instride]); const auto in1 = hn::Load(demote_tag, &in[1 * instride]); const auto in2 = hn::Load(demote_tag, &in[2 * instride]); const auto in3 = hn::Load(demote_tag, &in[3 * instride]); const auto in7 = hn::Add(in0, in1); auto u0 = hn::InterleaveLower(in0, in1); auto u1 = hn::InterleaveLower(in2, in3); auto u2 = hn::InterleaveLower(in7, hn::Zero(demote_tag)); auto u3 = hn::InterleaveLower(in2, hn::Zero(demote_tag)); auto u4 = hn::InterleaveLower(in3, hn::Zero(demote_tag)); auto v0 = hn::WidenMulPairwiseAdd(int32_tag, u0, sinpi_p01_p02); // s0 + s2 auto v1 = hn::WidenMulPairwiseAdd(int32_tag, u1, sinpi_p03_p04); // s4 + s5 auto v2 = hn::WidenMulPairwiseAdd(int32_tag, u2, sinpi_p03_p03); // x1 auto v3 = hn::WidenMulPairwiseAdd(int32_tag, u0, sinpi_p04_m01); // s1 - s3 auto v4 = hn::WidenMulPairwiseAdd(int32_tag, u1, sinpi_m03_p02); // -s4 + s6 auto v5 = hn::WidenMulPairwiseAdd(int32_tag, u3, sinpi_p03_p03); // s4 auto v6 = hn::WidenMulPairwiseAdd(int32_tag, u4, sinpi_p03_p03); auto w0 = hn::Add(v0, v1); auto w1 = hn::Sub(v2, v6); auto w2 = hn::Add(v3, v4); auto w3 = hn::Sub(w2, w0); auto w4 = hn::ShiftLeft<2>(v5); auto w5 = hn::Sub(w4, v5); auto w6 = hn::Add(w3, w5); v0 = hn::Add(w0, round); v1 = hn::Add(w1, round); v2 = hn::Add(w2, round); v3 = hn::Add(w6, round); w0 = hn::ShiftRightSame(v0, cos_bit); w1 = hn::ShiftRightSame(v1, cos_bit); w2 = hn::ShiftRightSame(v2, cos_bit); w3 = hn::ShiftRightSame(v3, cos_bit); auto o0 = hn::ReorderDemote2To(demote_tag, w0, w2); auto o1 = hn::ReorderDemote2To(demote_tag, w1, w3); hn::Store(o0, demote_tag, &in[0 * instride]); hn::Store(o1, demote_tag, &in[1 * instride]); hn::Store(hn::ShiftRightLanes<4>(demote_tag, o0), demote_tag, &in[2 * instride]); hn::Store(hn::ShiftRightLanes<4>(demote_tag, o1), demote_tag, &in[3 * instride]); } }; template struct Fadst4Traits<2, NumLanes> { template HWY_ATTR HWY_INLINE static void Fadst4(D int_tag, hn::TFromD 
*HWY_RESTRICT in, const int8_t cos_bit, const size_t instride) { const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit); constexpr hn::RepartitionToWide int32_tag; const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1)); const auto sinpi_p01_p02 = SetPair(int_tag, sinpi[1], sinpi[2]); const auto sinpi_p04_m01 = SetPair(int_tag, sinpi[4], -sinpi[1]); const auto sinpi_p03_p04 = SetPair(int_tag, sinpi[3], sinpi[4]); const auto sinpi_m03_p02 = SetPair(int_tag, -sinpi[3], sinpi[2]); const auto sinpi_p03_p03 = hn::Set(int_tag, sinpi[3]); const auto in0 = hn::Load(int_tag, &in[0 * instride]); const auto in1 = hn::Load(int_tag, &in[1 * instride]); const auto in2 = hn::Load(int_tag, &in[2 * instride]); const auto in3 = hn::Load(int_tag, &in[3 * instride]); const auto in7 = hn::Add(in0, in1); auto ul0 = hn::InterleaveLower(int_tag, in0, in1); auto uh0 = hn::InterleaveUpper(int_tag, in0, in1); auto ul1 = hn::InterleaveLower(int_tag, in2, in3); auto uh1 = hn::InterleaveUpper(int_tag, in2, in3); auto ul2 = hn::InterleaveLower(int_tag, in7, hn::Zero(int_tag)); auto uh2 = hn::InterleaveUpper(int_tag, in7, hn::Zero(int_tag)); auto ul3 = hn::InterleaveLower(int_tag, in2, hn::Zero(int_tag)); auto uh3 = hn::InterleaveUpper(int_tag, in2, hn::Zero(int_tag)); auto ul4 = hn::InterleaveLower(int_tag, in3, hn::Zero(int_tag)); auto uh4 = hn::InterleaveUpper(int_tag, in3, hn::Zero(int_tag)); auto vl0 = hn::WidenMulPairwiseAdd(int32_tag, ul0, sinpi_p01_p02); // s0 + s2 auto vh0 = hn::WidenMulPairwiseAdd(int32_tag, uh0, sinpi_p01_p02); // s0 + s2 auto vl1 = hn::WidenMulPairwiseAdd(int32_tag, ul1, sinpi_p03_p04); // s4 + s5 auto vh1 = hn::WidenMulPairwiseAdd(int32_tag, uh1, sinpi_p03_p04); // s4 + s5 auto vl2 = hn::WidenMulPairwiseAdd(int32_tag, ul2, sinpi_p03_p03); // x1 auto vh2 = hn::WidenMulPairwiseAdd(int32_tag, uh2, sinpi_p03_p03); // x1 auto vl3 = hn::WidenMulPairwiseAdd(int32_tag, ul0, sinpi_p04_m01); // s1 - s3 auto vh3 = hn::WidenMulPairwiseAdd(int32_tag, uh0, sinpi_p04_m01); 
// s1 - s3 auto vl4 = hn::WidenMulPairwiseAdd(int32_tag, ul1, sinpi_m03_p02); // -s4 + s6 auto vh4 = hn::WidenMulPairwiseAdd(int32_tag, uh1, sinpi_m03_p02); // -s4 + s6 auto vl5 = hn::WidenMulPairwiseAdd(int32_tag, ul3, sinpi_p03_p03); // s4 auto vh5 = hn::WidenMulPairwiseAdd(int32_tag, uh3, sinpi_p03_p03); // s4 auto vl6 = hn::WidenMulPairwiseAdd(int32_tag, ul4, sinpi_p03_p03); auto vh6 = hn::WidenMulPairwiseAdd(int32_tag, uh4, sinpi_p03_p03); auto wl0 = hn::Add(vl0, vl1); auto wh0 = hn::Add(vh0, vh1); auto wl1 = hn::Sub(vl2, vl6); auto wh1 = hn::Sub(vh2, vh6); auto wl2 = hn::Add(vl3, vl4); auto wh2 = hn::Add(vh3, vh4); auto wl3 = hn::Sub(wl2, wl0); auto wh3 = hn::Sub(wh2, wh0); auto wl4 = hn::ShiftLeft<2>(vl5); auto wh4 = hn::ShiftLeft<2>(vh5); auto wl5 = hn::Sub(wl4, vl5); auto wh5 = hn::Sub(wh4, vh5); auto wl6 = hn::Add(wl3, wl5); auto wh6 = hn::Add(wh3, wh5); vl0 = hn::Add(wl0, round); vh0 = hn::Add(wh0, round); vl1 = hn::Add(wl1, round); vh1 = hn::Add(wh1, round); vl2 = hn::Add(wl2, round); vh2 = hn::Add(wh2, round); vl3 = hn::Add(wl6, round); vh3 = hn::Add(wh6, round); wl0 = hn::ShiftRightSame(vl0, cos_bit); wh0 = hn::ShiftRightSame(vh0, cos_bit); wl1 = hn::ShiftRightSame(vl1, cos_bit); wh1 = hn::ShiftRightSame(vh1, cos_bit); wl2 = hn::ShiftRightSame(vl2, cos_bit); wh2 = hn::ShiftRightSame(vh2, cos_bit); wl3 = hn::ShiftRightSame(vl3, cos_bit); wh3 = hn::ShiftRightSame(vh3, cos_bit); auto o0 = hn::ReorderDemote2To(int_tag, wl0, wh0); auto o1 = hn::ReorderDemote2To(int_tag, wl1, wh1); auto o2 = hn::ReorderDemote2To(int_tag, wl2, wh2); auto o3 = hn::ReorderDemote2To(int_tag, wl3, wh3); hn::Store(o0, int_tag, &in[0 * instride]); hn::Store(o1, int_tag, &in[1 * instride]); hn::Store(o2, int_tag, &in[2 * instride]); hn::Store(o3, int_tag, &in[3 * instride]); } }; template HWY_ATTR HWY_INLINE void Fadst4(D int_tag, hn::TFromD *HWY_RESTRICT in, const int8_t cos_bit, const size_t instride) { Fadst4Traits), hn::MaxLanes(int_tag)>::template Fadst4(int_tag, in, cos_bit, 
instride); } template HWY_ATTR HWY_INLINE void Fadst8(D int_tag, hn::TFromD *HWY_RESTRICT in, const int8_t cos_bit, const size_t instride) { constexpr size_t kNumLanes = hn::MaxLanes(int_tag); constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD); HWY_ALIGN_MAX hn::TFromD buf0[8 * kNumLanes]; HWY_ALIGN_MAX hn::TFromD buf1[8 * kNumLanes]; const int32_t *HWY_RESTRICT cospi = cospi_arr(cos_bit); const auto round = hn::Set(hn::Repartition(), 1 << (cos_bit - 1)); // stage 0 // stage 1 hn::Store(hn::Load(int_tag, &in[0 * instride]), int_tag, &buf0[0 * kNumLanes]); hn::Store(hn::Neg(hn::Load(int_tag, &in[7 * instride])), int_tag, &buf0[1 * kNumLanes]); hn::Store(hn::Neg(hn::Load(int_tag, &in[3 * instride])), int_tag, &buf0[2 * kNumLanes]); hn::Store(hn::Load(int_tag, &in[4 * instride]), int_tag, &buf0[3 * kNumLanes]); hn::Store(hn::Neg(hn::Load(int_tag, &in[1 * instride])), int_tag, &buf0[4 * kNumLanes]); hn::Store(hn::Load(int_tag, &in[6 * instride]), int_tag, &buf0[5 * kNumLanes]); hn::Store(hn::Load(int_tag, &in[2 * instride]), int_tag, &buf0[6 * kNumLanes]); hn::Store(hn::Neg(hn::Load(int_tag, &in[5 * instride])), int_tag, &buf0[7 * kNumLanes]); // stage 2 hwy::CopyBytes<2 * kNumBytes>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); Butterfly(int_tag, cospi[32], cospi[32], &buf0[2 * kNumLanes], &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes], cos_bit, round); hwy::CopyBytes<2 * kNumBytes>(&buf0[4 * kNumLanes], &buf1[4 * kNumLanes]); Butterfly(int_tag, cospi[32], cospi[32], &buf0[6 * kNumLanes], &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); // stage 3 for (size_t j = 0; j < 8; j += 4) { for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes], &buf1[(2 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes], &buf0[(2 + i + j) * kNumLanes]); } } // stage 4 hwy::CopyBytes<4 * kNumBytes>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[4 * 
kNumLanes], &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[4 * kNumLanes], &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[6 * kNumLanes], &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[6 * kNumLanes], &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); // stage 5 for (size_t i = 0; i < 4; ++i) { AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(4 + i) * kNumLanes], &buf0[(0 + i) * kNumLanes], &buf0[(4 + i) * kNumLanes]); } // stage 6 HalfButterfly(int_tag, cospi[4], cospi[60], &buf0[0 * kNumLanes], &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[60], -cospi[4], &buf0[0 * kNumLanes], &buf0[1 * kNumLanes], &buf1[1 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[20], cospi[44], &buf0[2 * kNumLanes], &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[44], -cospi[20], &buf0[2 * kNumLanes], &buf0[3 * kNumLanes], &buf1[3 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[36], cospi[28], &buf0[4 * kNumLanes], &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[28], -cospi[36], &buf0[4 * kNumLanes], &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[52], cospi[12], &buf0[6 * kNumLanes], &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[12], -cospi[52], &buf0[6 * kNumLanes], &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); // stage 7 hwy::CopyBytes(&buf1[1 * kNumLanes], &in[0 * instride]); hwy::CopyBytes(&buf1[6 * kNumLanes], &in[1 * instride]); hwy::CopyBytes(&buf1[3 * kNumLanes], &in[2 * instride]); hwy::CopyBytes(&buf1[4 * kNumLanes], &in[3 * instride]); hwy::CopyBytes(&buf1[5 * kNumLanes], &in[4 * instride]); hwy::CopyBytes(&buf1[2 * 
kNumLanes], &in[5 * instride]); hwy::CopyBytes(&buf1[7 * kNumLanes], &in[6 * instride]); hwy::CopyBytes(&buf1[0 * kNumLanes], &in[7 * instride]); } template HWY_ATTR HWY_INLINE void Fadst16(D int_tag, hn::TFromD *HWY_RESTRICT in, const int8_t cos_bit, const size_t instride) { constexpr size_t kNumLanes = hn::MaxLanes(int_tag); constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD); HWY_ALIGN_MAX hn::TFromD buf0[16 * kNumLanes]; HWY_ALIGN_MAX hn::TFromD buf1[16 * kNumLanes]; const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); const auto round = hn::Set(hn::Repartition(), 1 << (cos_bit - 1)); // stage 0 // stage 1 hn::Store(hn::Load(int_tag, &in[0 * instride]), int_tag, &buf0[0 * kNumLanes]); hn::Store(hn::Neg(hn::Load(int_tag, &in[15 * instride])), int_tag, &buf0[1 * kNumLanes]); hn::Store(hn::Neg(hn::Load(int_tag, &in[7 * instride])), int_tag, &buf0[2 * kNumLanes]); hn::Store(hn::Load(int_tag, &in[8 * instride]), int_tag, &buf0[3 * kNumLanes]); hn::Store(hn::Neg(hn::Load(int_tag, &in[3 * instride])), int_tag, &buf0[4 * kNumLanes]); hn::Store(hn::Load(int_tag, &in[12 * instride]), int_tag, &buf0[5 * kNumLanes]); hn::Store(hn::Load(int_tag, &in[4 * instride]), int_tag, &buf0[6 * kNumLanes]); hn::Store(hn::Neg(hn::Load(int_tag, &in[11 * instride])), int_tag, &buf0[7 * kNumLanes]); hn::Store(hn::Neg(hn::Load(int_tag, &in[1 * instride])), int_tag, &buf0[8 * kNumLanes]); hn::Store(hn::Load(int_tag, &in[14 * instride]), int_tag, &buf0[9 * kNumLanes]); hn::Store(hn::Load(int_tag, &in[6 * instride]), int_tag, &buf0[10 * kNumLanes]); hn::Store(hn::Neg(hn::Load(int_tag, &in[9 * instride])), int_tag, &buf0[11 * kNumLanes]); hn::Store(hn::Load(int_tag, &in[2 * instride]), int_tag, &buf0[12 * kNumLanes]); hn::Store(hn::Neg(hn::Load(int_tag, &in[13 * instride])), int_tag, &buf0[13 * kNumLanes]); hn::Store(hn::Neg(hn::Load(int_tag, &in[5 * instride])), int_tag, &buf0[14 * kNumLanes]); hn::Store(hn::Load(int_tag, &in[10 * instride]), int_tag, &buf0[15 * kNumLanes]); 
// stage 2 hwy::CopyBytes(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); Butterfly(int_tag, cospi[32], cospi[32], &buf0[2 * kNumLanes], &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes], cos_bit, round); hwy::CopyBytes(&buf0[4 * kNumLanes], &buf1[4 * kNumLanes]); Butterfly(int_tag, cospi[32], cospi[32], &buf0[6 * kNumLanes], &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); hwy::CopyBytes(&buf0[8 * kNumLanes], &buf1[8 * kNumLanes]); Butterfly(int_tag, cospi[32], cospi[32], &buf0[10 * kNumLanes], &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], &buf1[11 * kNumLanes], cos_bit, round); hwy::CopyBytes(&buf0[12 * kNumLanes], &buf1[12 * kNumLanes]); Butterfly(int_tag, cospi[32], cospi[32], &buf0[14 * kNumLanes], &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round); // stage 3 for (size_t j = 0; j < 16; j += 4) { for (size_t i = 0; i < 2; ++i) { AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes], &buf1[(2 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes], &buf0[(2 + i + j) * kNumLanes]); } } // stage 4 hwy::CopyBytes(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[4 * kNumLanes], &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[4 * kNumLanes], &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[6 * kNumLanes], &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[6 * kNumLanes], &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); hwy::CopyBytes(&buf0[8 * kNumLanes], &buf1[8 * kNumLanes]); HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[12 * kNumLanes], &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[12 * kNumLanes], &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); 
HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[14 * kNumLanes], &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[14 * kNumLanes], &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round); // stage 5 for (size_t j = 0; j < 16; j += 8) { for (size_t i = 0; i < 4; ++i) { AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes], &buf1[(4 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes], &buf0[(4 + i + j) * kNumLanes]); } } // stage 6 hwy::CopyBytes(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); HalfButterfly(int_tag, cospi[8], cospi[56], &buf0[8 * kNumLanes], &buf0[9 * kNumLanes], &buf1[8 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[56], -cospi[8], &buf0[8 * kNumLanes], &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[40], cospi[24], &buf0[10 * kNumLanes], &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[24], -cospi[40], &buf0[10 * kNumLanes], &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, -cospi[56], cospi[8], &buf0[12 * kNumLanes], &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[8], cospi[56], &buf0[12 * kNumLanes], &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, -cospi[24], cospi[40], &buf0[14 * kNumLanes], &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[40], cospi[24], &buf0[14 * kNumLanes], &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round); // stage 7 for (size_t i = 0; i < 8; ++i) { AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(8 + i) * kNumLanes], &buf0[(0 + i) * kNumLanes], &buf0[(8 + i) * kNumLanes]); } // stage 8 HalfButterfly(int_tag, cospi[2], cospi[62], &buf0[0 * kNumLanes], &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[62], -cospi[2], &buf0[0 * kNumLanes], &buf0[1 * 
kNumLanes], &buf1[1 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[10], cospi[54], &buf0[2 * kNumLanes], &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[54], -cospi[10], &buf0[2 * kNumLanes], &buf0[3 * kNumLanes], &buf1[3 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[18], cospi[46], &buf0[4 * kNumLanes], &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[46], -cospi[18], &buf0[4 * kNumLanes], &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[26], cospi[38], &buf0[6 * kNumLanes], &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[38], -cospi[26], &buf0[6 * kNumLanes], &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[34], cospi[30], &buf0[8 * kNumLanes], &buf0[9 * kNumLanes], &buf1[8 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[30], -cospi[34], &buf0[8 * kNumLanes], &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[42], cospi[22], &buf0[10 * kNumLanes], &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[22], -cospi[42], &buf0[10 * kNumLanes], &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[50], cospi[14], &buf0[12 * kNumLanes], &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[14], -cospi[50], &buf0[12 * kNumLanes], &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[58], cospi[6], &buf0[14 * kNumLanes], &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round); HalfButterfly(int_tag, cospi[6], -cospi[58], &buf0[14 * kNumLanes], &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round); // stage 9 hwy::CopyBytes(&buf1[1 * kNumLanes], &in[0 * instride]); hwy::CopyBytes(&buf1[14 * kNumLanes], &in[1 * instride]); 
hwy::CopyBytes(&buf1[3 * kNumLanes], &in[2 * instride]);
  hwy::CopyBytes(&buf1[12 * kNumLanes], &in[3 * instride]);
  hwy::CopyBytes(&buf1[5 * kNumLanes], &in[4 * instride]);
  hwy::CopyBytes(&buf1[10 * kNumLanes], &in[5 * instride]);
  hwy::CopyBytes(&buf1[7 * kNumLanes], &in[6 * instride]);
  hwy::CopyBytes(&buf1[8 * kNumLanes], &in[7 * instride]);
  hwy::CopyBytes(&buf1[9 * kNumLanes], &in[8 * instride]);
  hwy::CopyBytes(&buf1[6 * kNumLanes], &in[9 * instride]);
  hwy::CopyBytes(&buf1[11 * kNumLanes], &in[10 * instride]);
  hwy::CopyBytes(&buf1[4 * kNumLanes], &in[11 * instride]);
  hwy::CopyBytes(&buf1[13 * kNumLanes], &in[12 * instride]);
  hwy::CopyBytes(&buf1[2 * kNumLanes], &in[13 * instride]);
  hwy::CopyBytes(&buf1[15 * kNumLanes], &in[14 * instride]);
  hwy::CopyBytes(&buf1[0 * kNumLanes], &in[15 * instride]);
}

// Doubles every element of one row in place (in[x] = in[x] + in[x]), handling
// hn::MaxLanes(tag) elements per iteration over [0, Width).
// `Width` is a compile-time constant, not a function argument. The aligned
// hn::Load / hn::Store require `in` to be vector-aligned.
template HWY_ATTR HWY_INLINE void IdtxAdd2(D tag,
                                           hn::TFromD *HWY_RESTRICT in) {
  for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) {
    auto v = hn::Load(tag, &in[x]);
    hn::Store(hn::Add(v, v), tag, &in[x]);
  }
}

// Left-shifts every element of one row in place by a compile-time shift
// count, handling hn::MaxLanes(tag) elements per iteration over [0, Width).
template HWY_ATTR HWY_INLINE void IdtxShift(D tag,
                                            hn::TFromD *HWY_RESTRICT in) {
  for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) {
    hn::Store(hn::ShiftLeft(hn::Load(tag, &in[x])), tag, &in[x]);
  }
}

// Computes (v * Scale * NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >> shift with
// widened intermediates, returning the wide results through `out0` / `out1`.
// The right-shift count is a compile-time constant (presumably NewSqrt2Bits).
//
// How it works: `v` is interleaved with a vector of ones so that every pair of
// narrow lanes is (v[i], 1). SetPair() builds the matching pair
// (Scale * NewSqrt2, rounding), so one WidenMulPairwiseAdd yields
// v[i] * scale + 1 * rounding in a single wide lane.
//
// `out0` holds the results for the InterleaveLower half of `v` and `out1` for
// the InterleaveUpper half, i.e. the outputs follow interleave order.
// Per the name, this is intended for 16-bit input lanes.
template HWY_ATTR HWY_INLINE void PromoteScale2x16ByNewSqrt2(
    D tag, hn::VFromD v, hn::VFromD> &out0, hn::VFromD> &out1) {
  constexpr hn::RepartitionToWide int32_tag;
  auto one = hn::Set(tag, 1);
  // Each wide lane of `scale_rounding` packs (scale, rounding offset).
  auto scale_rounding =
      SetPair(tag, Scale * NewSqrt2, 1 << (NewSqrt2Bits - 1));
  auto a0 = hn::InterleaveLower(tag, v, one);
  auto a1 = hn::InterleaveUpper(tag, v, one);
  out0 = hn::ShiftRight(
      hn::WidenMulPairwiseAdd(int32_tag, a0, scale_rounding));
  out1 = hn::ShiftRight(
      hn::WidenMulPairwiseAdd(int32_tag, a1, scale_rounding));
}

// Generic implementation of the NewSqrt2 scaling: multiplies by
// Scale * NewSqrt2, adds the rounding offset 1 << (NewSqrt2Bits - 1) and
// shifts right by a compile-time count, all in the lane type of `tag`.
// The specializations below replace it for specific lane configurations.
template struct ScaleByNewSqrt2Traits {
  template HWY_ATTR HWY_INLINE static hn::VFromD ScaleByNewSqrt2(
      D tag, hn::VFromD v) {
    auto fact = hn::Set(tag, Scale * NewSqrt2);
    auto offset = hn::Set(tag, 1 << (NewSqrt2Bits - 1));
    return hn::ShiftRight(hn::MulAdd(v, fact, offset));
  }
};

// Specialization for <2, 4>. Going by the dispatcher ScaleByNewSqrt2(), the
// second argument is the lane count of the tag; the first is presumably the
// lane size in bytes (i.e. four 16-bit lanes) -- TODO confirm.
// Only the InterleaveLower half is needed here: the products are formed in the
// wider `int32_tag` lanes and then narrowed back with hn::DemoteTo.
template <> struct ScaleByNewSqrt2Traits<2, 4> {
  template HWY_ATTR HWY_INLINE static hn::VFromD ScaleByNewSqrt2(
      D tag, hn::VFromD v) {
    auto one = hn::Set(tag, 1);
    auto scale_rounding =
        SetPair(tag, Scale * NewSqrt2, 1 << (NewSqrt2Bits - 1));
    constexpr hn::Rebind int32_tag;
    auto a = hn::InterleaveLower(tag, v, one);
    auto b = hn::ShiftRight(
        hn::WidenMulPairwiseAdd(int32_tag, a, scale_rounding));
    return hn::DemoteTo(tag, b);
  }
};

// Partial specialization for <2, NumLanes> (any lane count not matched by the
// <2, 4> specialization above). Widens and scales both halves with
// PromoteScale2x16ByNewSqrt2(), then narrows the two wide vectors back into a
// single vector with hn::ReorderDemote2To.
template struct ScaleByNewSqrt2Traits<2, NumLanes> {
  template HWY_ATTR HWY_INLINE static hn::VFromD ScaleByNewSqrt2(
      D tag, hn::VFromD v) {
    hn::VFromD> b0, b1;
    PromoteScale2x16ByNewSqrt2(tag, v, b0, b1);
    return hn::ReorderDemote2To(tag, b0, b1);
  }
};

// Entry point for NewSqrt2 scaling of a single vector. Picks the
// ScaleByNewSqrt2Traits implementation at compile time; the visible selector
// includes hn::MaxLanes(tag).
template HWY_ATTR HWY_INLINE hn::VFromD ScaleByNewSqrt2(D tag,
                                                        hn::VFromD v) {
  return ScaleByNewSqrt2Traits), hn::MaxLanes(tag)>::
      template ScaleByNewSqrt2(tag, v);
}

// Applies ScaleByNewSqrt2() in place to one row, handling hn::MaxLanes(tag)
// elements per iteration over [0, Width). Uses aligned loads and stores.
template HWY_ATTR HWY_INLINE void IdtxSqrt2(D tag,
                                            hn::TFromD *HWY_RESTRICT in) {
  for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) {
    hn::Store(ScaleByNewSqrt2(tag, hn::Load(tag, &in[x])), tag, &in[x]);
  }
}

// Runs Fdct4 across the block: one call per vector-width group of elements
// starting at offsets 0, MaxLanes, 2 * MaxLanes, ... below `Width`.
// `cos_bit` and `Stride` are forwarded to Fdct4 unchanged (`Stride` is
// presumably the element distance between consecutive rows of `in`).
template HWY_ATTR void FdctNx4Block(T *HWY_RESTRICT in, int8_t cos_bit) {
  constexpr auto int_tag = hn::CappedTag();
  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
    Fdct4(int_tag, &in[i], cos_bit, Stride);
  }
}

// Same driver as FdctNx4Block, but invoking Fdct8.
template HWY_ATTR void FdctNx8Block(T *HWY_RESTRICT in, int8_t cos_bit) {
  constexpr auto int_tag = hn::CappedTag();
  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
    Fdct8(int_tag, &in[i], cos_bit, Stride);
  }
}

// Same driver as FdctNx4Block, but invoking Fdct16.
template HWY_ATTR void FdctNx16Block(T *HWY_RESTRICT in, int8_t cos_bit) {
  constexpr auto int_tag = hn::CappedTag();
  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
    Fdct16(int_tag, &in[i], cos_bit, Stride);
  }
}

// Same driver as FdctNx4Block, but invoking Fdct32.
template HWY_ATTR void FdctNx32Block(T *HWY_RESTRICT in, int8_t cos_bit) {
  constexpr auto int_tag = hn::CappedTag();
  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
    Fdct32(int_tag, &in[i], cos_bit, Stride);
  }
}
// Runs Fdct64 across the block: one call per vector-width group of elements
// starting at offsets 0, MaxLanes, 2 * MaxLanes, ... below `OutWidth`.
// Unlike the smaller Fdct drivers, no stride is passed as a function argument.
template HWY_ATTR void FdctNx64Block(T *HWY_RESTRICT in, int8_t cos_bit) {
  constexpr auto int_tag = hn::CappedTag();
  for (size_t i = 0; i < OutWidth; i += hn::MaxLanes(int_tag)) {
    Fdct64(int_tag, &in[i], cos_bit);
  }
}

// Runs Fadst4 across the block: one call per vector-width group of elements
// below `Width`. `cos_bit` and `Stride` are forwarded unchanged.
template HWY_ATTR HWY_INLINE void FadstNx4Block(T *HWY_RESTRICT in,
                                                int8_t cos_bit) {
  constexpr auto int_tag = hn::CappedTag();
  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
    Fadst4(int_tag, &in[i], cos_bit, Stride);
  }
}

// Same driver as FadstNx4Block, but invoking Fadst8.
template HWY_ATTR void FadstNx8Block(T *HWY_RESTRICT in, int8_t cos_bit) {
  constexpr auto int_tag = hn::CappedTag();
  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
    Fadst8(int_tag, &in[i], cos_bit, Stride);
  }
}

// Same driver as FadstNx4Block, but invoking Fadst16.
template HWY_ATTR void FadstNx16Block(T *HWY_RESTRICT in, int8_t cos_bit) {
  constexpr auto int_tag = hn::CappedTag();
  for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) {
    Fadst16(int_tag, &in[i], cos_bit, Stride);
  }
}

// Identity transform that doubles every element: applies IdtxAdd2 to each of
// the `BlockHeight` rows, with rows `Stride` elements apart.
// `cos_bit` is unused; it is kept so the signature matches Transform1D.
template HWY_ATTR void IdtxAdd2Block(T *HWY_RESTRICT in, int8_t cos_bit) {
  (void)cos_bit;
  constexpr auto int_tag = hn::CappedTag();
  for (size_t y = 0; y < BlockHeight; ++y) {
    IdtxAdd2(int_tag, &in[y * Stride]);
  }
}

// Identity transform with NewSqrt2 scaling: applies IdtxSqrt2 to each of the
// `BlockHeight` rows, with rows `Stride` elements apart.
// `cos_bit` is unused; it is kept so the signature matches Transform1D.
template HWY_ATTR void IdtxSqrt2Block(T *HWY_RESTRICT in, int8_t cos_bit) {
  (void)cos_bit;
  constexpr auto int_tag = hn::CappedTag();
  for (size_t y = 0; y < BlockHeight; ++y) {
    IdtxSqrt2(int_tag, &in[y * Stride]);
  }
}

// Identity transform implemented as a left shift: applies IdtxShift to each of
// the `BlockHeight` rows, with rows `Stride` elements apart.
// `cos_bit` is unused; it is kept so the signature matches Transform1D.
template HWY_ATTR void IdtxShiftBlock(T *HWY_RESTRICT in, int8_t cos_bit) {
  (void)cos_bit;
  constexpr auto int_tag = hn::CappedTag();
  for (size_t y = 0; y < BlockHeight; ++y) {
    IdtxShift(int_tag, &in[y * Stride]);
  }
}

// Placeholder for transform types that are not valid at a given size (see the
// dispatch table in Transform32). Always trips the assert; when asserts are
// compiled out it does nothing and leaves `in` untouched.
template void TransformFail(T *in, int8_t cos_bit) {
  (void)in;
  (void)cos_bit;
  assert(false && "Incorrect transform requested.");
}

// Function-pointer type shared by the block-level 1D transforms above so that
// they can be stored in dispatch tables.
template using Transform1D = void (*)(T *in, int8_t cos_bit);

// Compile-time selection between a left shift and a rounding right shift.
// The primary template is intentionally empty; RoundShift() instantiates it
// with (Bit >= 0).
template struct RoundShiftTraits {};

// Non-negative shift (presumably the `true` specialization): returns `value`
// unchanged when Bit == 0, otherwise left-shifts by a compile-time count.
template <> struct RoundShiftTraits {
  template HWY_ATTR HWY_INLINE static hn::VFromD Shift(D int_tag,
                                                       hn::VFromD value) {
    (void)int_tag;
    if CONSTEXPR_IF (Bit == 0) {
      return value;
    } else {
      return hn::ShiftLeft(value);
    }
  }
};

// Negative shift (presumably the `false` specialization): rounding right shift
// by -Bit, i.e. (value + (1 << (-Bit - 1))) >> -Bit.
template <> struct RoundShiftTraits {
  template HWY_ATTR HWY_INLINE static hn::VFromD Shift(D int_tag,
                                                       hn::VFromD value) {
    const auto round = hn::Set(int_tag, 1 << (-Bit - 1));
    return hn::ShiftRight<-Bit>(hn::Add(value, round));
  }
};

// Shifts `value` by the compile-time amount `Bit`: a left shift when
// Bit >= 0, a rounding right shift by -Bit when Bit < 0.
template HWY_ATTR HWY_INLINE hn::VFromD RoundShift(D int_tag,
                                                   hn::VFromD value) {
  return RoundShiftTraits<(Bit >= 0)>::template Shift(int_tag, value);
}

// Applies the rectangular-block scaling ScaleByNewSqrt2<1>() when the
// compile-time flag `ApplyRectScale` is set; otherwise returns `v` unchanged.
template HWY_ATTR HWY_INLINE hn::VFromD RectScale(D int_tag, hn::VFromD v) {
  if CONSTEXPR_IF (ApplyRectScale) {
    return ScaleByNewSqrt2<1>(int_tag, v);
  }
  return v;
}

// Compile-time selection between "pass through" and "promote to a wider lane
// type". The primary template is intentionally empty.
template struct MaybePromoteTraits {};

// Pass-through case (presumably selected when input and output lane types
// already match): no conversion is performed.
template <> struct MaybePromoteTraits {
  // Returns `in` unchanged.
  template HWY_ATTR HWY_INLINE static hn::VFromD PromoteTo(D out_tag,
                                                           VIn in) {
    (void)out_tag;
    return in;
  }
  // Stores `v` to `out` with an unaligned store.
  template HWY_ATTR HWY_INLINE static void PromoteStore2(D int_tag, VIn v,
                                                         hn::TFromD *out) {
    hn::StoreU(v, int_tag, out);
  }
};

// Promoting case (presumably selected when the output lane type differs from
// the input lane type).
template <> struct MaybePromoteTraits {
  // Widens `in` to the lane type of `out_tag` via hn::PromoteTo.
  template HWY_ATTR HWY_INLINE static hn::VFromD PromoteTo(D out_tag,
                                                           VIn in) {
    return hn::PromoteTo(out_tag, in);
  }
  // Widens `v` in two halves and stores them back to back: the promoted lower
  // half at `out`, the promoted upper half hn::MaxLanes(store_tag) elements
  // later. Both stores are unaligned.
  template HWY_ATTR HWY_INLINE static void PromoteStore2(D int_tag, VIn v,
                                                         TOut *out) {
    (void)int_tag;
    constexpr hn::Repartition store_tag;
    hn::StoreU(hn::PromoteLowerTo(store_tag, v), store_tag, out);
    hn::StoreU(hn::PromoteUpperTo(store_tag, v), store_tag,
               out + hn::MaxLanes(store_tag));
  }
};

// Converts `in` to the vector type of `out_tag`, choosing at compile time
// (through a std::is_same test) between MaybePromoteTraits' pass-through and
// promoting implementations.
template HWY_ATTR HWY_INLINE hn::VFromD MaybePromoteTo(D out_tag, VIn in) {
  return MaybePromoteTraits<
      std::is_same, hn::TFromV>::value>::PromoteTo(out_tag, in);
}

// Transposes one 4x4 tile from `in` (rows `instride` elements apart) to `out`
// (rows `outstride` elements apart). Each input row is first passed through
// RoundShift() and RectScale(), and converted with MaybePromoteTo() on the way
// out.
//
// The transpose itself falls out of hn::StoreInterleaved4: it writes lane 0 of
// all four rows, then lane 1 of all four rows, and so on, so each group of
// four consecutive scratch elements is one output row.
// Input rows are read with aligned loads.
template HWY_ATTR HWY_INLINE void Transpose4(const TIn *HWY_RESTRICT in,
                                             TOut *HWY_RESTRICT out,
                                             size_t instride,
                                             size_t outstride) {
  constexpr hn::FixedTag int_tag;
  auto i0 = RectScale(
      int_tag, RoundShift(int_tag, hn::Load(int_tag, &in[0 * instride])));
  auto i1 = RectScale(
      int_tag, RoundShift(int_tag, hn::Load(int_tag, &in[1 * instride])));
  auto i2 = RectScale(
      int_tag, RoundShift(int_tag, hn::Load(int_tag, &in[2 * instride])));
  auto i3 = RectScale(
      int_tag, RoundShift(int_tag, hn::Load(int_tag, &in[3 * instride])));
  // Scratch for the interleaved (already transposed) 4x4 tile.
  HWY_ALIGN_MAX TOut interleaved[16];
  constexpr hn::FixedTag out_tag;
  hn::StoreInterleaved4(MaybePromoteTo(out_tag, i0),
                        MaybePromoteTo(out_tag, i1),
                        MaybePromoteTo(out_tag, i2),
                        MaybePromoteTo(out_tag, i3), out_tag, interleaved);
  // Scatter the four transposed rows to their destinations.
  for (size_t i = 0; i < 4; ++i) {
    hwy::CopyBytes(&interleaved[i * 4], &out[i * outstride]);
  }
}

// Transposes one 8x8 tile from `in` (rows `instride` elements apart) to `out`
// (rows `outstride` elements apart). Rows are converted with MaybePromoteTo()
// first and then passed through RoundShift() and RectScale() in the output
// lane type.
//
// Steps: adjacent row pairs are zipped into double-width lanes, the zipped
// vectors are written through two hn::StoreInterleaved4 calls into two scratch
// buffers, and finally eight rows are copied from scratch to `out`. The
// scratch-to-output row mapping differs between 2-byte and wider output
// elements.
template HWY_ATTR HWY_INLINE void Transpose8(const TIn *HWY_RESTRICT in,
                                             TOut *HWY_RESTRICT out,
                                             size_t instride,
                                             size_t outstride) {
  constexpr hn::FixedTag int_tag;
  constexpr hn::Rebind out_tag;
  // N.B. there isn't a StoreInterleaved8, so hand-code Transpose8.
  constexpr hn::RepartitionToWide wide_int_tag;
  HWY_ALIGN_MAX hn::TFromD interleaved0[16];
  HWY_ALIGN_MAX hn::TFromD interleaved1[16];
  auto i0 = hn::Load(int_tag, &in[0 * instride]);
  auto i1 = hn::Load(int_tag, &in[1 * instride]);
  auto i2 = hn::Load(int_tag, &in[2 * instride]);
  auto i3 = hn::Load(int_tag, &in[3 * instride]);
  auto i4 = hn::Load(int_tag, &in[4 * instride]);
  auto i5 = hn::Load(int_tag, &in[5 * instride]);
  auto i6 = hn::Load(int_tag, &in[6 * instride]);
  auto i7 = hn::Load(int_tag, &in[7 * instride]);
  // Declared up front with the output vector type; every one of these is
  // assigned below before it is read.
  auto s0 = hn::Undefined(out_tag);
  auto s1 = hn::Undefined(out_tag);
  auto s2 = hn::Undefined(out_tag);
  auto s3 = hn::Undefined(out_tag);
  auto s4 = hn::Undefined(out_tag);
  auto s5 = hn::Undefined(out_tag);
  auto s6 = hn::Undefined(out_tag);
  auto s7 = hn::Undefined(out_tag);
  auto ip0 = MaybePromoteTo(out_tag, i0);
  auto ip1 = MaybePromoteTo(out_tag, i1);
  auto ip2 = MaybePromoteTo(out_tag, i2);
  auto ip3 = MaybePromoteTo(out_tag, i3);
  auto ip4 = MaybePromoteTo(out_tag, i4);
  auto ip5 = MaybePromoteTo(out_tag, i5);
  auto ip6 = MaybePromoteTo(out_tag, i6);
  auto ip7 = MaybePromoteTo(out_tag, i7);
  s0 = RectScale(out_tag, RoundShift(out_tag, ip0));
  s1 = RectScale(out_tag, RoundShift(out_tag, ip1));
  s2 = RectScale(out_tag, RoundShift(out_tag, ip2));
  s3 = RectScale(out_tag, RoundShift(out_tag, ip3));
  s4 = RectScale(out_tag, RoundShift(out_tag, ip4));
  s5 = RectScale(out_tag, RoundShift(out_tag, ip5));
  s6 = RectScale(out_tag, RoundShift(out_tag, ip6));
  s7 = RectScale(out_tag, RoundShift(out_tag, ip7));
  // Pair up rows (0,1), (2,3), (4,5), (6,7): each wide lane now holds the
  // same column of two adjacent rows.
  auto u0 = hn::ZipLower(wide_int_tag, s0, s1);
  auto u1 = hn::ZipUpper(wide_int_tag, s0, s1);
  auto u2 = hn::ZipLower(wide_int_tag, s2, s3);
  auto u3 = hn::ZipUpper(wide_int_tag, s2, s3);
  auto u4 = hn::ZipLower(wide_int_tag, s4, s5);
  auto u5 = hn::ZipUpper(wide_int_tag, s4, s5);
  auto u6 = hn::ZipLower(wide_int_tag, s6, s7);
  auto u7 = hn::ZipUpper(wide_int_tag, s6, s7);
  // Lower zips go to interleaved0, upper zips to interleaved1.
  hn::StoreInterleaved4(u0, u2, u4, u6, wide_int_tag, interleaved0);
  hn::StoreInterleaved4(u1, u3, u5, u7, wide_int_tag, interleaved1);
  // Size in bytes of one output row of the tile.
  constexpr size_t kNumBytes = hn::MaxLanes(int_tag) * sizeof(*out);
  if CONSTEXPR_IF (sizeof(TOut) == 2) {
    // 2-byte output: rows 0-3 come from interleaved0, rows 4-7 from
    // interleaved1.
    hwy::CopyBytes(&interleaved0[0], &out[0 * outstride]);
    hwy::CopyBytes(&interleaved0[4], &out[1 * outstride]);
    hwy::CopyBytes(&interleaved0[8], &out[2 * outstride]);
    hwy::CopyBytes(&interleaved0[12], &out[3 * outstride]);
    hwy::CopyBytes(&interleaved1[0], &out[4 * outstride]);
    hwy::CopyBytes(&interleaved1[4], &out[5 * outstride]);
    hwy::CopyBytes(&interleaved1[8], &out[6 * outstride]);
    hwy::CopyBytes(&interleaved1[12], &out[7 * outstride]);
  } else {
    // Wider output: the two scratch buffers alternate in groups of two rows.
    hwy::CopyBytes(&interleaved0[0], &out[0 * outstride]);
    hwy::CopyBytes(&interleaved0[4], &out[1 * outstride]);
    hwy::CopyBytes(&interleaved1[0], &out[2 * outstride]);
    hwy::CopyBytes(&interleaved1[4], &out[3 * outstride]);
    hwy::CopyBytes(&interleaved0[8], &out[4 * outstride]);
    hwy::CopyBytes(&interleaved0[12], &out[5 * outstride]);
    hwy::CopyBytes(&interleaved1[8], &out[6 * outstride]);
    hwy::CopyBytes(&interleaved1[12], &out[7 * outstride]);
  }
}

// Builds a vector from lane pairs of `a` and `b` with a two-table lookup.
// Resulting lanes, in order: a[0], a[1], b[0], b[1], a[4], a[5], b[4], b[5]
// (indices of 8 and above select from `b`). The index table has eight entries,
// so this assumes an 8-lane vector of 64-bit lanes.
template HWY_ATTR HWY_INLINE hn::VFromD LocalInterleaveEvenBlocks(
    D tag, hn::VFromD a, hn::VFromD b) {
  static_assert(sizeof(hn::TFromD) == 8,
                "LocalInterleaveEvenBlocks requires 64-bit lanes.");
  HWY_ALIGN static constexpr int64_t kIndices[] = { 0, 1, 8 + 0, 8 + 1,
                                                    4, 5, 8 + 4, 8 + 5 };
  auto indices = hn::SetTableIndices(tag, kIndices);
  return hn::TwoTablesLookupLanes(tag, a, b, indices);
}

// Counterpart of LocalInterleaveEvenBlocks for the remaining lane pairs.
// Resulting lanes, in order: a[2], a[3], b[2], b[3], a[6], a[7], b[6], b[7].
template HWY_ATTR HWY_INLINE hn::VFromD LocalInterleaveOddBlocks(
    D tag, hn::VFromD a, hn::VFromD b) {
  static_assert(sizeof(hn::TFromD) == 8,
                "LocalInterleaveOddBlocks requires 64-bit lanes.");
  HWY_ALIGN static constexpr int64_t kIndices[] = { 2, 3, 8 + 2, 8 + 3,
                                                    6, 7, 8 + 6, 8 + 7 };
  auto indices = hn::SetTableIndices(tag, kIndices);
  return hn::TwoTablesLookupLanes(tag, a, b, indices);
}

// 16x16 tile transpose, specialized on a compile-time integer (presumably the
// element size in bytes). The primary template is intentionally empty.
template struct Transpose16Traits {};

// 16x16 transpose for 16-bit elements (per the static_assert message).
// RoundShift() and RectScale() are applied to each input row as it is loaded.
template <> struct Transpose16Traits<2> {
  template HWY_ATTR HWY_INLINE static void Transpose16(
      const TIn *HWY_RESTRICT in, TOut *HWY_RESTRICT out, size_t instride,
      size_t outstride) {
    constexpr hn::FixedTag int_tag;
    static_assert(hn::MaxLanes(int_tag) == 16,
                  "16-bit Transpose16 requires an 16-lane int_tag");
    constexpr hn::RepartitionToWide wide_int_tag;
    constexpr hn::RepartitionToWide widex2_int_tag;
    // Scratch for the intermediate passes: `y` after the first zip, `z` after
    // the second.
    HWY_ALIGN_MAX hn::TFromD y[16 * hn::MaxLanes(wide_int_tag)];
    HWY_ALIGN_MAX hn::TFromD z[16 * hn::MaxLanes(widex2_int_tag)];
    // Pass 1: zip adjacent row pairs (i, i + 1) into `y`.
    for (size_t i = 0; i < 16; i += 2) {
      auto i0 = RectScale(
          int_tag,
          RoundShift(int_tag, hn::Load(int_tag, &in[(i + 0) * instride])));
      auto i1 = RectScale(
          int_tag,
          RoundShift(int_tag, hn::Load(int_tag, &in[(i + 1) * instride])));
      hn::Store(hn::ZipLower(wide_int_tag, i0, i1), wide_int_tag,
                &y[(i + 0) * hn::MaxLanes(wide_int_tag)]);
      hn::Store(hn::ZipUpper(wide_int_tag, i0, i1), wide_int_tag,
                &y[(i + 1) * hn::MaxLanes(wide_int_tag)]);
    }
    // Pass 2: zip entries two apart (i + j, i + j + 2) from `y` into `z`.
    for (size_t i = 0; i < 16; i += 4) {
      for (size_t j = 0; j < 2; ++j) {
        auto i0 = hn::Load(wide_int_tag,
                           &y[(i + j + 0) * hn::MaxLanes(wide_int_tag)]);
        auto i2 = hn::Load(wide_int_tag,
                           &y[(i + j + 2) * hn::MaxLanes(wide_int_tag)]);
        hn::Store(hn::ZipLower(widex2_int_tag, i0, i2), widex2_int_tag,
                  &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]);
        hn::Store(hn::ZipUpper(widex2_int_tag, i0, i2), widex2_int_tag,
                  &z[(i + j + 2) * hn::MaxLanes(widex2_int_tag)]);
      }
    }
    // Pass 3: interleave entries four apart (i + j, i + j + 4), in place in
    // `z`.
    for (size_t i = 0; i < 16; i += 8) {
      for (size_t j = 0; j < 4; ++j) {
        auto i0 = hn::Load(widex2_int_tag,
                           &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]);
        auto i4 = hn::Load(widex2_int_tag,
                           &z[(i + j + 4) * hn::MaxLanes(widex2_int_tag)]);
        hn::Store(hn::InterleaveLower(widex2_int_tag, i0, i4), widex2_int_tag,
                  &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]);
        hn::Store(hn::InterleaveUpper(widex2_int_tag, i0, i4), widex2_int_tag,
                  &z[(i + j + 4) * hn::MaxLanes(widex2_int_tag)]);
      }
    }
    // Maps the order in which rows come out of the passes above to their
    // destination row in `out`.
    static constexpr size_t kStoreIndex[] = { 0, 4, 2,  6,  1, 5,  3,  7,
                                              8, 12, 10, 14, 9, 13, 11, 15 };
    // Final pass: join the halves of entries eight apart (j, j + 8) and store
    // each completed row (unaligned) at its mapped position.
    for (size_t j = 0; j < 8; ++j) {
      auto i0 = hn::Load(widex2_int_tag,
                         &z[(j + 0) * hn::MaxLanes(widex2_int_tag)]);
      auto i8 = hn::Load(widex2_int_tag,
                         &z[(j + 8) * hn::MaxLanes(widex2_int_tag)]);
      hn::StoreU(
          hn::BitCast(int_tag, hn::ConcatLowerLower(widex2_int_tag, i8, i0)),
          int_tag, &out[kStoreIndex[j + 0] * outstride]);
      hn::StoreU(
          hn::BitCast(int_tag, hn::ConcatUpperUpper(widex2_int_tag, i8, i0)),
          int_tag, &out[kStoreIndex[j + 8] * outstride]);
    }
  }
};

// 16x16 transpose for 32-bit elements (per the static_assert message).
// Follows the same pass structure as the 16-bit version, using one scratch
// buffer that is updated in place, and the LocalInterleave*Blocks helpers for
// the third pass.
template <> struct Transpose16Traits<4> {
  template HWY_ATTR HWY_INLINE static void Transpose16(
      const TIn *HWY_RESTRICT in, TOut *HWY_RESTRICT out, size_t instride,
      size_t outstride) {
    constexpr hn::FixedTag int_tag;
    static_assert(hn::MaxLanes(int_tag) == 16,
                  "32-bit Transpose16 requires an 16-lane int_tag");
    constexpr hn::RepartitionToWide wide_int_tag;
    HWY_ALIGN_MAX hn::TFromD z[16 * hn::MaxLanes(wide_int_tag)];
    // Pass 1: zip adjacent row pairs (i, i + 1) into `z`.
    for (size_t i = 0; i < 16; i += 2) {
      auto i0 = RectScale(
          int_tag,
          RoundShift(int_tag, hn::Load(int_tag, &in[(i + 0) * instride])));
      auto i1 = RectScale(
          int_tag,
          RoundShift(int_tag, hn::Load(int_tag, &in[(i + 1) * instride])));
      hn::Store(hn::ZipLower(wide_int_tag, i0, i1), wide_int_tag,
                &z[(i + 0) * hn::MaxLanes(wide_int_tag)]);
      hn::Store(hn::ZipUpper(wide_int_tag, i0, i1), wide_int_tag,
                &z[(i + 1) * hn::MaxLanes(wide_int_tag)]);
    }
    // Pass 2: interleave entries two apart (i + j, i + j + 2), in place.
    for (size_t i = 0; i < 16; i += 4) {
      for (size_t j = 0; j < 2; ++j) {
        auto i0 = hn::Load(wide_int_tag,
                           &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]);
        auto i2 = hn::Load(wide_int_tag,
                           &z[(i + j + 2) * hn::MaxLanes(wide_int_tag)]);
        hn::Store(hn::InterleaveLower(wide_int_tag, i0, i2), wide_int_tag,
                  &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]);
        hn::Store(hn::InterleaveUpper(wide_int_tag, i0, i2), wide_int_tag,
                  &z[(i + j + 2) * hn::MaxLanes(wide_int_tag)]);
      }
    }
    // Pass 3: combine entries four apart (i + j, i + j + 4), in place.
    for (size_t i = 0; i < 16; i += 8) {
      for (size_t j = 0; j < 4; ++j) {
        auto i0 = hn::Load(wide_int_tag,
                           &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]);
        auto i4 = hn::Load(wide_int_tag,
                           &z[(i + j + 4) * hn::MaxLanes(wide_int_tag)]);
        hn::Store(LocalInterleaveEvenBlocks(wide_int_tag, i0, i4),
                  wide_int_tag, &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]);
        hn::Store(LocalInterleaveOddBlocks(wide_int_tag, i0, i4), wide_int_tag,
                  &z[(i + j + 4) * hn::MaxLanes(wide_int_tag)]);
      }
    }
    // Maps the order in which rows come out of the passes above to their
    // destination row in `out`.
    static constexpr size_t kStoreIndex[] = { 0, 2,  1, 3,  4,  6,  5,  7,
                                              8, 10, 9, 11, 12, 14, 13, 15 };
    // Final pass: join the halves of entries eight apart (j, j + 8) and store
    // each completed row (unaligned) at its mapped position.
    for (size_t j = 0; j < 8; ++j) {
      auto i0 =
          hn::Load(wide_int_tag, &z[(j + 0) * hn::MaxLanes(wide_int_tag)]);
      auto i8 =
          hn::Load(wide_int_tag, &z[(j + 8) * hn::MaxLanes(wide_int_tag)]);
      hn::StoreU(
          hn::BitCast(int_tag, hn::ConcatLowerLower(wide_int_tag, i8, i0)),
          int_tag, &out[kStoreIndex[j + 0] * outstride]);
      hn::StoreU(
          hn::BitCast(int_tag, hn::ConcatUpperUpper(wide_int_tag, i8, i0)),
          int_tag, &out[kStoreIndex[j + 8] * outstride]);
    }
  }
};

// Transposes one 16x16 tile by forwarding to the matching Transpose16Traits
// specialization. Input and output element types must have the same size; any
// promotion has to happen before calling this (see TransposeTraits<16, true>).
template HWY_ATTR HWY_INLINE void Transpose16(const TIn *HWY_RESTRICT in,
                                              TOut *HWY_RESTRICT out,
                                              size_t instride,
                                              size_t outstride) {
  static_assert(sizeof(TOut) == sizeof(TIn),
                "Transpose16 does not directly support integer promotion.");
  Transpose16Traits::template Transpose16(
      in, out, instride, outstride);
}

// Whole-block transpose, specialized on (tile size, requires promotion).
// Each specialization walks the Height x Width block in square tiles and
// writes the tile at input (row r, column c) to output (row c, column r).
// The primary template is intentionally empty.
template struct TransposeTraits {};

// 16x16 tiles with promotion. Transpose16 cannot promote, so each tile is
// first widened row by row into the aligned scratch buffer `p` and then
// transposed from there.
template <> struct TransposeTraits<16, true> {
  template HWY_ATTR HWY_INLINE static void Transpose(
      const TIn *HWY_RESTRICT in, TOut *HWY_RESTRICT out, size_t instride,
      size_t outstride) {
    constexpr auto int_tag = hn::CappedTag();
    constexpr hn::Rebind input_tag;
    HWY_ALIGN_MAX hn::TFromD p[16 * hn::MaxLanes(int_tag)];
    for (size_t r = 0; r < Height; r += 16) {
      for (size_t c = 0; c < Width; c += 16) {
        // Promote the 16 rows of this tile into `p`.
        for (size_t i = 0; i < 16; ++i) {
          hn::Store(
              hn::PromoteTo(int_tag, hn::Load(input_tag,
                                              &in[(r + i) * instride + c])),
              int_tag, &p[i * hn::MaxLanes(int_tag)]);
        }
        // Rows in `p` are hn::MaxLanes(int_tag) elements apart.
        Transpose16(p, &out[c * outstride + r], hn::MaxLanes(int_tag),
                    outstride);
      }
    }
  }
};

// 16x16 tiles without promotion: tiles are transposed straight from `in`.
template <> struct TransposeTraits<16, false> {
  template HWY_ATTR HWY_INLINE static void Transpose(
      const TIn *HWY_RESTRICT in, TOut *HWY_RESTRICT out, size_t instride,
      size_t outstride) {
    for (size_t r = 0; r < Height; r += 16) {
      for (size_t c = 0; c < Width; c += 16) {
        Transpose16(&in[r * instride + c], &out[c * outstride + r], instride,
                    outstride);
      }
    }
  }
};

// 8x8 tiles. Transpose8 handles promotion itself, so one implementation
// serves both values of `RequiresPromotion`.
template struct TransposeTraits<8, RequiresPromotion> {
  template HWY_ATTR HWY_INLINE static void Transpose(
      const TIn *HWY_RESTRICT in, TOut *HWY_RESTRICT out, size_t instride,
      size_t outstride) {
    for (size_t r = 0; r < Height; r += 8) {
      for (size_t c = 0; c < Width; c += 8) {
        Transpose8(&in[r * instride + c], &out[c * outstride + r], instride,
                   outstride);
      }
    }
  }
};

// 4x4 tiles. Transpose4 handles promotion itself, so one implementation
// serves both values of `RequiresPromotion`.
template struct TransposeTraits<4, RequiresPromotion> {
  template HWY_ATTR HWY_INLINE static void Transpose(
      const TIn *HWY_RESTRICT in, TOut *HWY_RESTRICT out, size_t instride,
      size_t outstride) {
    for (size_t r = 0; r < Height; r += 4) {
      for (size_t c = 0; c < Width; c += 4) {
        Transpose4(&in[r * instride + c], &out[c * outstride + r], instride,
                   outstride);
      }
    }
  }
};

// Transposes a whole block from `in` to `out`, selecting the tile
// implementation at compile time through TransposeTraits (the selector
// includes a std::is_same test for whether promotion is required).
template HWY_ATTR HWY_INLINE void Transpose(const TIn *HWY_RESTRICT in,
                                            TOut *HWY_RESTRICT out,
                                            size_t instride,
                                            size_t outstride) {
  constexpr auto int_tag = hn::CappedTag();
  TransposeTraits::value>::
      template Transpose(in, out, instride, outstride);
}

// Writes a Height x Width block from `in` (rows `instride` apart) to `out`
// (rows `outstride` apart). Each vector is loaded with an aligned load, passed
// through RoundShift() and RectScale(), and written with
// MaybePromoteTraits::PromoteStore2, which widens the lanes when the selected
// specialization requires it.
template HWY_ATTR HWY_INLINE void StoreBlock(const TIn *HWY_RESTRICT in,
                                             size_t instride,
                                             TOut *HWY_RESTRICT out,
                                             size_t outstride) {
  constexpr hn::CappedTag load_tag;
  for (size_t r = 0; r < Height; ++r) {
    for (size_t c = 0; c < Width; c += hn::MaxLanes(load_tag)) {
      auto v = RectScale(
          load_tag,
          RoundShift(
              load_tag, hn::Load(load_tag, &in[r * instride + c])));
      MaybePromoteTraits::value>::PromoteStore2(
          load_tag, v, &out[r * outstride + c]);
    }
  }
}

// Loads one row of `Width` source samples from `input` (unaligned) into the
// working buffer `in` (aligned), converting with MaybePromoteTo() and applying
// a compile-time left shift.
// When `FlipLeftRight` is set the row is mirrored: lanes are reversed within
// each vector and the vector is written to the mirrored vector position.
template HWY_ATTR HWY_INLINE void LoadLine(const TInput *HWY_RESTRICT input,
                                           TIn *HWY_RESTRICT in) {
  constexpr hn::CappedTag store_tag;
  // Rebind keeps the lane count of `store_tag`, so both tags step through the
  // row by the same number of elements.
  constexpr hn::Rebind load_tag;
  for (size_t x = 0; x < Width / hn::MaxLanes(load_tag); ++x) {
    auto v = hn::LoadU(load_tag, &input[x * hn::MaxLanes(load_tag)]);
    if CONSTEXPR_IF (FlipLeftRight) {
      v = hn::Reverse(load_tag, v);
    }
    auto vp = MaybePromoteTo(store_tag, v);
    hn::Store(
        hn::ShiftLeft(vp), store_tag,
        &in[(FlipLeftRight ? (Width / hn::MaxLanes(store_tag)) - x - 1 : x) *
            hn::MaxLanes(store_tag)]);
  }
}

// Loads `Height` rows from `input` (rows `stride` elements apart) into `in`
// (rows `OutStride` elements apart) using LoadLine(). When `FlipUpDown` is set
// the rows are written in reverse vertical order.
template HWY_ATTR HWY_INLINE void LoadBuffer(const TInput *HWY_RESTRICT input,
                                             TIn *HWY_RESTRICT in,
                                             size_t stride) {
  for (size_t y = 0; y < Height; ++y) {
    LoadLine(
        input + y * stride,
        &in[(FlipUpDown ? Height - y - 1 : y) * OutStride]);
  }
}

// Dispatches a length-4 1D transform on `tx_type`: DCT_1D -> FdctNx4Block,
// IDTX_1D -> IdtxSqrt2Block, anything else -> FadstNx4Block.
template HWY_ATTR HWY_FLATTEN HWY_INLINE void Transform4(TX_TYPE_1D tx_type,
                                                         T *in,
                                                         int8_t cos_bit) {
  switch (tx_type) {
    case DCT_1D: FdctNx4Block(in, cos_bit); break;
    case IDTX_1D: IdtxSqrt2Block(in, cos_bit); break;
    default: FadstNx4Block(in, cos_bit); break;
  }
}

// Dispatches a length-8 1D transform on `tx_type`: DCT_1D -> FdctNx8Block,
// IDTX_1D -> IdtxAdd2Block, anything else -> FadstNx8Block.
template HWY_ATTR HWY_FLATTEN HWY_INLINE void Transform8(TX_TYPE_1D tx_type,
                                                         T *in,
                                                         int8_t cos_bit) {
  switch (tx_type) {
    case DCT_1D: FdctNx8Block(in, cos_bit); break;
    case IDTX_1D: IdtxAdd2Block(in, cos_bit); break;
    default: FadstNx8Block(in, cos_bit); break;
  }
}

// Dispatches a length-16 1D transform through a table indexed directly by
// `tx_type`. ADST_1D and FLIPADST_1D share FadstNx16Block.
// NOTE(review): `tx_type` is used as an array index without a range check;
// callers must pass one of the four listed values.
template HWY_ATTR HWY_INLINE void Transform16(TX_TYPE_1D tx_type, T *in,
                                              int8_t cos_bit) {
  static const Transform1D kTransform[] = {
    FdctNx16Block,   // DCT_1D
    FadstNx16Block,  // ADST_1D
    FadstNx16Block,  // FLIPADST_1D
    IdtxSqrt2Block,  // IDTX_1D
  };
  kTransform[tx_type](in, cos_bit);
}

// Dispatches a length-32 1D transform through a table indexed directly by
// `tx_type`. Only DCT_1D and IDTX_1D are implemented; ADST_1D and FLIPADST_1D
// route to TransformFail.
// NOTE(review): `tx_type` is used as an array index without a range check.
template HWY_ATTR HWY_INLINE void Transform32(TX_TYPE_1D tx_type, T *in,
                                              int8_t cos_bit) {
  static const Transform1D kTransform[] = {
    FdctNx32Block,   // DCT_1D
    TransformFail,   // ADST_1D
    TransformFail,   // FLIPADST_1D
    IdtxShiftBlock,  // IDTX_1D
  };
  kTransform[tx_type](in, cos_bit);
}

// Length-64 1D transform. Only DCT_1D is accepted (asserted); the call always
// goes to FdctNx64Block. The (void) cast keeps `tx_type` referenced when
// asserts are compiled out.
template HWY_ATTR HWY_INLINE void TransformFull64(TX_TYPE_1D tx_type, T *in,
                                                  int8_t cos_bit) {
  (void)tx_type;
  assert(tx_type == DCT_1D);
  FdctNx64Block(
      in, cos_bit);
}

// Selects the 1D transform for the compile-time length `TransformHeight`
// (4, 8, 16 or 32) and forwards `tx_type` to it. Any other length trips the
// assert.
template HWY_ATTR HWY_INLINE void TransformBelow32(TX_TYPE_1D tx_type, T *in,
                                                   int8_t cos_bit) {
  if CONSTEXPR_IF (TransformHeight == 4) {
    Transform4(tx_type, in, cos_bit);
  } else if CONSTEXPR_IF (TransformHeight == 8) {
    Transform8(tx_type, in, cos_bit);
  } else if CONSTEXPR_IF (TransformHeight == 16) {
    Transform16(tx_type, in, cos_bit);
  } else if CONSTEXPR_IF (TransformHeight == 32) {
    Transform32(tx_type, in, cos_bit);
  } else {
    assert(false && "Unsupported transform size.");
  }
}

// Second (row) pass of the 2D transform, selected at compile time:
//  - 64x64: calls FdctNx64Block directly (DCT_1D only, asserted).
//  - TransformHeight == 64: TransformFull64.
//  - otherwise: TransformBelow32.
template HWY_ATTR HWY_INLINE void RowTransform(TX_TYPE_1D tx_type, T *in,
                                               int8_t cos_bit) {
  if CONSTEXPR_IF (TransformWidth == 64 && TransformHeight == 64) {
    assert(tx_type == DCT_1D);
    // 64x64 only writes 32x32 of coefficients.
    FdctNx64Block(in, cos_bit);
  } else if CONSTEXPR_IF (TransformHeight == 64) {
    TransformFull64(tx_type, in, cos_bit);
  } else {
    TransformBelow32(
        tx_type, in, cos_bit);
  }
}

// Forward 2D transform of one block for the compile-time size `TxSize`.
//
// input:   source samples, rows `stride` elements apart.
// output:  destination for the transform coefficients.
// tx_type: 2D transform type; split into per-direction 1D types through
//          vtx_tab / htx_tab.
//
// Pipeline:
//  1. LoadBuffer() copies the input into the aligned scratch buffer `buf0`.
//     The variant is chosen from whether the vertical and/or horizontal 1D
//     type is FLIPADST_1D (presumably an up/down and a left/right flip
//     respectively -- TODO confirm).
//  2. The first 1D pass runs on `buf0` with `cos_bit_col`.
//  3. Transpose() moves the data into `buf1`.
//  4. The second 1D pass runs on `buf1` with `cos_bit_row`.
//  5. StoreBlock() writes the result to `output`.
// Sizes with a 64-long dimension take special paths that emit fewer
// coefficients; see the comments in the body.
template HWY_ATTR HWY_MAYBE_UNUSED void ForwardTransform2D(
    const int16_t *input, int32_t *output, size_t stride, TX_TYPE tx_type) {
  constexpr size_t kWidth = kTxSizeWide[TxSize];
  constexpr size_t kHeight = kTxSizeHigh[TxSize];
  // Ensure the storage is aligned to the architecture's block width.
  constexpr size_t kMinVectorSize =
      hn::BlockDFromD>().MaxBytes() / sizeof(uint8_t);
  // Scratch dimensions: at least one block-width of elements in each
  // direction, even for the smallest transforms.
  constexpr size_t kBlockWidth = AOMMAX(kMinVectorSize / sizeof(T), kWidth);
  constexpr size_t kBlockHeight = AOMMAX(kMinVectorSize / sizeof(T), kHeight);
  HWY_ALIGN_MAX T buf0[kBlockWidth * kBlockHeight];
  constexpr bool kBigRectangle = (kBlockWidth == 64 && kBlockHeight >= 32) ||
                                 (kBlockWidth >= 32 && kBlockHeight == 64);
  // Element type of the second scratch buffer.
  using T2 = typename std::conditional::type;
  HWY_ALIGN_MAX T2 buf1[kBlockWidth * kBlockHeight];
  // Per-size shift amounts, copied from the kForwardTransformShift row for
  // this size.
  constexpr int8_t kShift[3] = { kForwardTransformShift[TxSize][0],
                                 kForwardTransformShift[TxSize][1],
                                 kForwardTransformShift[TxSize][2] };
  constexpr int kTransformWidthIndex = GetTxwIndex(TxSize);
  constexpr int kTransformHeightIndex = GetTxhIndex(TxSize);
  constexpr int8_t cos_bit_col =
      kForwardCosBitCol[kTransformWidthIndex][kTransformHeightIndex];
  constexpr int8_t cos_bit_row =
      kForwardCosBitRow[kTransformWidthIndex][kTransformHeightIndex];
  const TX_TYPE_1D vertical_transform = vtx_tab[tx_type];
  const TX_TYPE_1D horizontal_transform = htx_tab[tx_type];
  constexpr bool kApplyRectScale = kApplyRectScaleList[TxSize];
  // Bit 0 is set for a vertical FLIPADST, bit 1 for a horizontal FLIPADST;
  // each of the four combinations has its own LoadBuffer call.
  switch ((vertical_transform == FLIPADST_1D ? 1 : 0) |
          (horizontal_transform == FLIPADST_1D ? 2 : 0)) {
    case 0:
      LoadBuffer(
          input, buf0, stride);
      break;
    case 1:
      LoadBuffer(
          input, buf0, stride);
      break;
    case 2:
      LoadBuffer(
          input, buf0, stride);
      break;
    case 3:
      LoadBuffer(
          input, buf0, stride);
      break;
  }
  // First 1D pass, using the vertical transform type.
  if CONSTEXPR_IF (kHeight == 64) {
    TransformFull64(vertical_transform, buf0, cos_bit_col);
  } else {
    TransformBelow32(
        vertical_transform, buf0, cos_bit_col);
  }
  Transpose(buf0, buf1, kBlockWidth, kBlockHeight);
  // Second 1D pass, using the horizontal transform type, followed by the
  // store.
  if CONSTEXPR_IF (kWidth == 64 && kHeight == 64) {
    // 64x64 only writes 32x32 of coefficients.
    assert(tx_type == DCT_1D);
    FdctNx64Block(buf1, cos_bit_row);
    StoreBlock<32, 32, kShift[2], kApplyRectScale>(buf1, 32, output, 32);
  } else if CONSTEXPR_IF (kHeight == 64 && (kWidth == 16 || kWidth == 32)) {
    // 32x64 and 16x64 coefficients are packed into Wx32, discarding the
    // right-most results.
    RowTransform<32, kBlockHeight, kWidth, kBlockWidth>(horizontal_transform,
                                                        buf1, cos_bit_row);
    StoreBlock(buf1, kBlockHeight, output, 32);
  } else {
    RowTransform(
        horizontal_transform, buf1, cos_bit_row);
    StoreBlock(buf1, kBlockHeight, output, kHeight);
  }
  // For 64-wide blocks of height <= 16, clear output starting at element
  // offset kHeight * 32.
  if CONSTEXPR_IF (kHeight <= 16 && kWidth == 64) {
    hwy::ZeroBytes(output + kHeight * 32);
  }
}

// Runtime entry point for the low-bitdepth forward transform.
//
// src_diff:    source residual samples.
// coeff:       destination for the coefficients.
// diff_stride: element stride between rows of `src_diff`.
// txfm_param:  supplies `lossless`, `tx_size` and `tx_type`.
//
// A lossless 4x4 block is handled by av1_fwht4x4 (DCT_DCT only, asserted).
// Everything else is dispatched through a table of ForwardTransform2D
// instantiations indexed by `tx_size`; the table is generated with
// FOR_EACH_TXFM2D, so its order is the order in which that macro lists sizes.
// NOTE(review): `tx_size` is used as an array index without a range check.
HWY_MAYBE_UNUSED void LowBitdepthForwardTransform2D(const int16_t *src_diff,
                                                    tran_low_t *coeff,
                                                    int diff_stride,
                                                    TxfmParam *txfm_param) {
  if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
    assert(txfm_param->tx_type == DCT_DCT);
    av1_fwht4x4(src_diff, coeff, diff_stride);
    return;
  }
  using TransformFunction = decltype(&ForwardTransform2D);
  constexpr TransformFunction kTable[] = {
#define POINTER(w, h, _) &ForwardTransform2D,
    FOR_EACH_TXFM2D(POINTER, _)
#undef POINTER
  };
  kTable[txfm_param->tx_size](src_diff, coeff, diff_stride,
                              txfm_param->tx_type);
}

}  // namespace HWY_NAMESPACE
}  // namespace
HWY_AFTER_NAMESPACE();

// Declares and defines the C-linkage function
// av1_fwd_txfm2d_<w>x<h>_<suffix>, which forwards to
// HWY_NAMESPACE::ForwardTransform2D. The `bd` argument is accepted for
// signature compatibility and ignored.
#define MAKE_HIGHBD_TXFM2D(w, h, suffix) \
  extern "C" void av1_fwd_txfm2d_##w##x##h##_##suffix( \
      const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, \
      int bd); \
  HWY_ATTR void av1_fwd_txfm2d_##w##x##h##_##suffix( \
      const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, \
      int bd) { \
    (void)bd; \
    HWY_NAMESPACE::ForwardTransform2D(input, output, \
                                      stride, tx_type); \
  }

// Declares and defines the C-linkage function
// av1_lowbd_fwd_txfm2d_<w>x<h>_<suffix>, which forwards to
// HWY_NAMESPACE::ForwardTransform2D. The `bd` argument is accepted for
// signature compatibility and ignored.
#define MAKE_LOWBD_TXFM2D(w, h, suffix) \
  extern "C" void av1_lowbd_fwd_txfm2d_##w##x##h##_##suffix( \
      const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, \
      int bd); \
  HWY_ATTR void av1_lowbd_fwd_txfm2d_##w##x##h##_##suffix( \
      const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, \
      int bd) { \
    (void)bd; \
    HWY_NAMESPACE::ForwardTransform2D(input, output, \
                                      stride, tx_type); \
  }

// Declares and defines the C-linkage function av1_lowbd_fwd_txfm_<suffix>,
// which forwards all of its arguments to
// HWY_NAMESPACE::LowBitdepthForwardTransform2D.
#define MAKE_LOWBD_TXFM2D_DISPATCH(suffix) \
  extern "C" void av1_lowbd_fwd_txfm_##suffix( \
      const int16_t *src_diff, tran_low_t *coeff, int diff_stride, \
      TxfmParam *txfm_param); \
  HWY_ATTR void av1_lowbd_fwd_txfm_##suffix( \
      const int16_t *src_diff, tran_low_t *coeff, int diff_stride, \
      TxfmParam *txfm_param) { \
    HWY_NAMESPACE::LowBitdepthForwardTransform2D(src_diff, coeff, diff_stride, \
                                                 txfm_param); \
  }

#endif  // AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_