/* * Copyright (c) 2025, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_REDUCE_SUM_HWY_H_ #define AOM_AOM_DSP_REDUCE_SUM_HWY_H_ #include #include "third_party/highway/hwy/highway.h" HWY_BEFORE_NAMESPACE(); namespace { namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; template struct BlockReduceTraits; template <> struct BlockReduceTraits<1> { template HWY_ATTR HWY_INLINE static hn::VFromD ReduceSum(D d, hn::VFromD v) { (void)d; return v; } }; template struct BlockReduceTraits { static_assert(NumBlocks > 1, "Primary template BlockReduceTraits assumes NumBlocks > 1"); static_assert((NumBlocks & (NumBlocks - 1)) == 0, "BlockReduceTraits requires NumBlocks to be a power of 2."); template HWY_ATTR HWY_INLINE static hn::VFromD> ReduceSum( D d, hn::VFromD v) { (void)d; constexpr hn::Half half_d; auto v_half = hn::Add(hn::LowerHalf(half_d, v), hn::UpperHalf(half_d, v)); return BlockReduceTraits::ReduceSum(half_d, v_half); } }; // ReduceSum across blocks. // For example, with a 4-block vector with 16 lanes of uint32_t: // [a3 b3 c3 d3 a2 b2 c2 d2 a1 b1 c1 d1 a0 b0 c0 d0] // returns a vector with 4 lanes: // [a3+a2+a1+a0 b3+b2+b1+b0 c3+c2+c1+c0 d3+d2+d1+d0] template HWY_ATTR HWY_INLINE hn::Vec> BlockReduceSum( D int_tag, hn::VFromD v) { return BlockReduceTraits::ReduceSum(int_tag, v); } } // namespace HWY_NAMESPACE } // namespace HWY_AFTER_NAMESPACE(); #endif // AOM_AOM_DSP_REDUCE_SUM_HWY_H_