/*
 * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <riscv_vector.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/cdef_block.h"

// partial A is a 16-bit vector of the form:
// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
// [0  y1 y2 y3 y4 y5 y6 y7].
// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
// (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
// and const2.
static inline vuint32m1_t fold_mul_and_sum_rvv(vint16m1_t partiala,
                                               vint16m1_t partialb,
                                               vuint32m1_t const1,
                                               vuint32m1_t const2) {
  // Square and add the corresponding x and y values.
  vint32m2_t cost = __riscv_vwmul_vv_i32m2(partiala, partiala, 8);
  cost = __riscv_vwmacc_vv_i32m2(cost, partialb, partialb, 8);

  // Multiply by constant.
  vuint32m2_t tmp1_u32m2 = __riscv_vreinterpret_v_i32m2_u32m2(cost);
  vuint32m1_t cost_u32m1 = __riscv_vmul_vv_u32m1(
      __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const1, 4);
  tmp1_u32m2 = __riscv_vslidedown_vx_u32m2(tmp1_u32m2, 4, 8);
  vuint32m1_t ret = __riscv_vmacc_vv_u32m1(
      cost_u32m1, __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const2, 4);
  return ret;
}
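// A scalar model of the fold above, for exposition only (the helper name and
// the plain loop are ours; the vector code produces the same eight products
// four lanes at a time, and the caller reduces those lanes with a vredsum):
//
//   static uint32_t fold_mul_and_sum_model(const int16_t a[8],
//                                          const int16_t b[8],
//                                          const uint32_t c[8]) {
//     // a and b are the element-aligned partial sums (partial B is reversed
//     // by the caller first); c[0..3] holds const1, c[4..7] holds const2.
//     uint32_t sum = 0;
//     for (int i = 0; i < 8; i++) {
//       int32_t sq = (int32_t)a[i] * a[i] + (int32_t)b[i] * b[i];
//       sum += (uint32_t)sq * c[i];
//     }
//     return sum;
//   }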
// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal
// down-right, 6 is vertical).
//
// For each direction the lines are shifted so that we can perform a
// basic sum on each vector element. For example, direction 5 is "south by
// southeast", so we need to add the pixels along each line i below:
//
// 0  1 2 3 4 5 6 7
// 0  1 2 3 4 5 6 7
// 8  0 1 2 3 4 5 6
// 8  0 1 2 3 4 5 6
// 9  8 0 1 2 3 4 5
// 9  8 0 1 2 3 4 5
// 10 9 8 0 1 2 3 4
// 10 9 8 0 1 2 3 4
//
// For this to fit nicely in vectors, the lines need to be shifted like so:
//        0 1 2 3 4 5 6 7
//        0 1 2 3 4 5 6 7
//      8 0 1 2 3 4 5 6
//      8 0 1 2 3 4 5 6
//    9 8 0 1 2 3 4 5
//    9 8 0 1 2 3 4 5
// 10 9 8 0 1 2 3 4
// 10 9 8 0 1 2 3 4
//
// In this configuration we can now perform SIMD additions to get the cost
// along direction 5. Since this won't fit into a single 128-bit vector, we use
// two of them to compute each half of the new configuration, and pad the empty
// spaces with zeros. Similar shifting is done for other directions, except
// direction 6 which is straightforward as it's the vertical direction.
static vuint32m1_t compute_vert_directions_rvv(
    vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2,
    vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5,
    vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) {
  size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16);
  vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl);

  // Partial sums for lines 0 and 1.
  vint16m1_t partial4a =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 1), vl);
  vint16m1_t tmp1_i16m1 =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 2), vl);
  partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl);
  vint16m1_t partial4b = __riscv_vslide1down_vx_i16m1(lines_0, 0, vl);
  tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_1, 2, VL_SLIDE_DOWN);
  partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_0, lines_1, VL_SLIDE_DOWN);
  vint16m1_t partial5a =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl);
  vint16m1_t partial5b =
      __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN);
  vint16m1_t partial7a =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl);
  vint16m1_t partial7b =
      __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN);
  vint16m1_t partial6 = __riscv_vmv_v_v_i16m1(tmp1_i16m1, vl);

  // Partial sums for lines 2 and 3.
  tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 3), vl);
  partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 4), vl);
  partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_2, 3, VL_SLIDE_DOWN);
  partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_3, 4, VL_SLIDE_DOWN);
  partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_2, lines_3, VL_SLIDE_DOWN);
  partial5a = __riscv_vadd_vv_i16m1(
      partial5a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl);
  partial5b = __riscv_vadd_vv_i16m1(
      partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN),
      vl);
  partial7a = __riscv_vadd_vv_i16m1(
      partial7a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl);
  partial7b = __riscv_vadd_vv_i16m1(
      partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN),
      vl);
  partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl);

  // Partial sums for lines 4 and 5.
  partial4a = __riscv_vadd_vv_i16m1(
      partial4a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 5), vl), vl);
  partial4a = __riscv_vadd_vv_i16m1(
      partial4a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl);
  partial4b = __riscv_vadd_vv_i16m1(
      partial4b, __riscv_vslidedown_vx_i16m1(lines_4, 5, VL_SLIDE_DOWN), vl);
  partial4b = __riscv_vadd_vv_i16m1(
      partial4b, __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN), vl);
  tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_4, lines_5, VL_SLIDE_DOWN);
  partial5a = __riscv_vadd_vv_i16m1(
      partial5a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl);
  partial5b = __riscv_vadd_vv_i16m1(
      partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN),
      vl);
  partial7a = __riscv_vadd_vv_i16m1(
      partial7a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl);
  partial7b = __riscv_vadd_vv_i16m1(
      partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN),
      vl);
  partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl);
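  // Note: each __riscv_vslideup_vx_i16m1(vec_zero_i16m1, v, 8 - k, vl) above
  // builds the zero-padded top half of the shifted configuration, while the
  // matching __riscv_vslidedown_vx_i16m1(v, k, ...) extracts the elements
  // that were shifted out into the bottom half.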
  // Partial sums for lines 6 and 7.
  partial4a = __riscv_vadd_vv_i16m1(
      partial4a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 7), vl), vl);
  partial4a = __riscv_vadd_vv_i16m1(partial4a, lines_7, vl);
  partial4b = __riscv_vadd_vv_i16m1(
      partial4b, __riscv_vslidedown_vx_i16m1(lines_6, 7, VL_SLIDE_DOWN), vl);
  tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_6, lines_7, VL_SLIDE_DOWN);
  partial5a = __riscv_vadd_vv_i16m1(
      partial5a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl), vl);
  partial5b = __riscv_vadd_vv_i16m1(
      partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN),
      vl);
  partial7a = __riscv_vadd_vv_i16m1(
      partial7a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl), vl);
  partial7b = __riscv_vadd_vv_i16m1(
      partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN),
      vl);
  partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl);
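  // The weights below are 840 / n for a line covering n pixels (840 being the
  // least common multiple of 1..8), which normalizes each squared partial sum
  // by its line length; compare the "divide by 840" note in
  // cdef_find_dir_rvv.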
  // const0 = { 840, 420, 280, 210, }
  vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4);
  // const1 = { 168, 140, 120, 105, }
  vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4);
  // const2 = { 0, 0, 420, 210, }
  vuint32m1_t const2 = __riscv_vmv_v_x_u32m1(0, 4);
  const2 = __riscv_vslide1down_vx_u32m1(const2, 420, 4);
  const2 = __riscv_vslide1down_vx_u32m1(const2, 210, 4);
  // const3 = { 140, 105, 105, 105, };
  vuint32m1_t const3 = __riscv_vmv_v_x_u32m1(105, 4);
  const3 = __riscv_vslide1up_vx_u32m1(const3, 140, 4);

  // Compute costs in terms of partial sums.
  vint32m2_t tmp1_i32m2 = __riscv_vwmul_vv_i32m2(partial6, partial6, vl);
  vint32m2_t partial6_s32 = __riscv_vslidedown_vx_i32m2(tmp1_i32m2, 4, vl);
  partial6_s32 = __riscv_vadd_vv_i32m2(partial6_s32, tmp1_i32m2, 4);

  // Reverse partial B.
  // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, }.
  vuint32m1_t costs_0, costs_1, costs_2, costs_3;
  static const uint16_t tab_u16[8] = { 6, 5, 4, 3, 2, 1, 0, 7 };
  vuint16m1_t index_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8);

  vint16m1_t partial4b_rv =
      __riscv_vrgather_vv_i16m1(partial4b, index_u16m1, 8);
  costs_0 = fold_mul_and_sum_rvv(partial4a, partial4b_rv, const0, const1);

  vuint32m1_t partial6_u32 = __riscv_vreinterpret_v_i32m1_u32m1(
      __riscv_vlmul_trunc_v_i32m2_i32m1(partial6_s32));
  costs_2 = __riscv_vmul_vx_u32m1(partial6_u32, 105, 4);

  vint16m1_t partial5b_rv =
      __riscv_vrgather_vv_i16m1(partial5b, index_u16m1, 8);
  costs_1 = fold_mul_and_sum_rvv(partial5a, partial5b_rv, const2, const3);

  vint16m1_t partial7b_rv =
      __riscv_vrgather_vv_i16m1(partial7b, index_u16m1, 8);
  costs_3 = fold_mul_and_sum_rvv(partial7a, partial7b_rv, const2, const3);

  // combine values
  vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1);
  vuint32m1_t cost0_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4);
  vuint32m1_t cost1_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4);
  vuint32m1_t cost2_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4);
  vuint32m1_t cost3_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4);

  vuint32m1_t cost47 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4);
  cost47 = __riscv_vslideup_vx_u32m1(cost47, cost2_sum, 2, 4);
  cost47 = __riscv_vslideup_vx_u32m1(cost47, cost3_sum, 3, 4);
  __riscv_vse32_v_u32m1(&cost[0], cost47, 4);
  return cost47;
}
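// For directions 1 and 3 the pixels pair up along each line (see the diagrams
// before compute_horiz_directions_rvv below), so each partial sum is first
// reduced pairwise before being squared and weighted. A rough scalar reading
// of the fold below (ours, for exposition only): with p[i] = a[2*i] +
// a[2*i + 1], a partial vector contributes sum_i p[i]^2 * w[i], where w is
// 105 for the middle (full-length) lines in partialb and the const0 weights
// for the shorter edge lines in partiala and partialc.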
static inline vuint32m1_t fold_mul_and_sum_pairwise_rvv(vint16m1_t partiala,
                                                        vint16m1_t partialb,
                                                        vint16m1_t partialc,
                                                        vuint32m1_t const0) {
  vuint16m1_t vid_u16m1 = __riscv_vid_v_u16m1(4);
  vuint16m1_t index_u16m1 = __riscv_vsll_vx_u16m1(vid_u16m1, 1, 4);

  vint16m1_t tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partiala, 0, 8);
  vint32m2_t partiala_i32m2 = __riscv_vwadd_vv_i32m2(partiala, tmp_i16m1, 8);
  tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialb, 0, 8);
  vint32m2_t partialb_i32m2 = __riscv_vwadd_vv_i32m2(partialb, tmp_i16m1, 8);
  tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialc, 0, 8);
  vint32m2_t partialc_i32m2 = __riscv_vwadd_vv_i32m2(partialc, tmp_i16m1, 8);

  partiala_i32m2 = __riscv_vmul_vv_i32m2(partiala_i32m2, partiala_i32m2, 8);
  partialb_i32m2 = __riscv_vmul_vv_i32m2(partialb_i32m2, partialb_i32m2, 8);
  vint32m1_t partialb_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1(
      __riscv_vrgatherei16_vv_i32m2(partialb_i32m2, index_u16m1, 4));
  partialc_i32m2 = __riscv_vmul_vv_i32m2(partialc_i32m2, partialc_i32m2, 8);
  partiala_i32m2 = __riscv_vadd_vv_i32m2(partiala_i32m2, partialc_i32m2, 8);
  vint32m1_t partiala_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1(
      __riscv_vrgatherei16_vv_i32m2(partiala_i32m2, index_u16m1, 4));

  vuint32m1_t cost = __riscv_vmul_vx_u32m1(
      __riscv_vreinterpret_v_i32m1_u32m1(partialb_i32m1), 105, 4);
  cost = __riscv_vmacc_vv_u32m1(
      cost, __riscv_vreinterpret_v_i32m1_u32m1(partiala_i32m1), const0, 4);
  return cost;
}

static inline vint32m1_t horizontal_add_4d_s16x8(vint16m1_t lines_0,
                                                 vint16m1_t lines_1,
                                                 vint16m1_t lines_2,
                                                 vint16m1_t lines_3) {
  vint32m1_t vec_scalar_i32m1 = __riscv_vmv_s_x_i32m1(0, 1);
  vint32m1_t lines0_sum =
      __riscv_vwredsum_vs_i16m1_i32m1(lines_0, vec_scalar_i32m1, 8);
  vint32m1_t lines1_sum =
      __riscv_vwredsum_vs_i16m1_i32m1(lines_1, vec_scalar_i32m1, 8);
  vint32m1_t lines2_sum =
      __riscv_vwredsum_vs_i16m1_i32m1(lines_2, vec_scalar_i32m1, 8);
  vint32m1_t lines3_sum =
      __riscv_vwredsum_vs_i16m1_i32m1(lines_3, vec_scalar_i32m1, 8);

  vint32m1_t ret = __riscv_vslideup_vx_i32m1(lines0_sum, lines1_sum, 1, 4);
  ret = __riscv_vslideup_vx_i32m1(ret, lines2_sum, 2, 4);
  ret = __riscv_vslideup_vx_i32m1(ret, lines3_sum, 3, 4);
  return ret;
}

// This function computes the cost along directions 0, 1, 2, 3. (0 means
// 45-degree up-right, 2 is horizontal).
//
// For direction 1 and 3 ("east northeast" and "east southeast") the shifted
// lines need three vectors instead of two. For direction 1 for example, we
// need to compute the sums along the line i below:
// 0 0 1 1 2 2 3  3
// 1 1 2 2 3 3 4  4
// 2 2 3 3 4 4 5  5
// 3 3 4 4 5 5 6  6
// 4 4 5 5 6 6 7  7
// 5 5 6 6 7 7 8  8
// 6 6 7 7 8 8 9  9
// 7 7 8 8 9 9 10 10
//
// Which means we need the following configuration:
// 0 0 1 1 2 2 3 3
//   1 1 2 2 3 3 4 4
//     2 2 3 3 4 4 5 5
//       3 3 4 4 5 5 6 6
//         4 4 5 5 6 6 7 7
//           5 5 6 6 7 7 8 8
//             6 6 7 7 8 8 9 9
//               7 7 8 8 9 9 10 10
//
// Three vectors are needed to compute this, as well as some extra pairwise
// additions.
static vuint32m1_t compute_horiz_directions_rvv(
    vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2,
    vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5,
    vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) {
  // Compute diagonal directions (1, 2, 3).
  // Partial sums for lines 0 and 1.
  size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16);
  vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl);

  vint16m1_t partial0a = __riscv_vmv_v_v_i16m1(lines_0, vl);
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 7), vl), vl);
  vint16m1_t partial0b =
      __riscv_vslidedown_vx_i16m1(lines_1, 7, VL_SLIDE_DOWN);
  vint16m1_t partial1a = __riscv_vadd_vv_i16m1(
      lines_0,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 6), vl), vl);
  vint16m1_t partial1b =
      __riscv_vslidedown_vx_i16m1(lines_1, 6, VL_SLIDE_DOWN);
  vint16m1_t partial3a =
      __riscv_vslidedown_vx_i16m1(lines_0, 2, VL_SLIDE_DOWN);
  partial3a = __riscv_vadd_vv_i16m1(
      partial3a, __riscv_vslidedown_vx_i16m1(lines_1, 4, VL_SLIDE_DOWN), vl);
  vint16m1_t partial3b =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 2), vl);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, 4, vl),
      vl);

  // Partial sums for lines 2 and 3.
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl);
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 5), vl), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_3, 5, VL_SLIDE_DOWN), vl);
  partial1a = __riscv_vadd_vv_i16m1(
      partial1a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 4), vl), vl);
  partial1a = __riscv_vadd_vv_i16m1(
      partial1a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 2), vl), vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b, __riscv_vslidedown_vx_i16m1(lines_2, 4, VL_SLIDE_DOWN), vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b, __riscv_vslidedown_vx_i16m1(lines_3, 2, VL_SLIDE_DOWN), vl);
  partial3a = __riscv_vadd_vv_i16m1(
      partial3a, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl);
  partial3b = __riscv_vadd_vv_i16m1(partial3b, lines_3, vl);

  // Partial sums for lines 4 and 5.
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 4), vl), vl);
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 3), vl), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_4, 4, VL_SLIDE_DOWN), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_5, 3, VL_SLIDE_DOWN), vl);
  partial1b = __riscv_vadd_vv_i16m1(partial1b, lines_4, vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl);
  vint16m1_t partial1c =
      __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b, __riscv_vslidedown_vx_i16m1(lines_4, 2, VL_SLIDE_DOWN), vl);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b, __riscv_vslidedown_vx_i16m1(lines_5, 4, VL_SLIDE_DOWN), vl);
  vint16m1_t partial3c =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 2), vl);
  partial3c = __riscv_vadd_vv_i16m1(
      partial3c,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 4), vl), vl);

  // Partial sums for lines 6 and 7.
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 2), vl), vl);
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 1), vl), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_6, 2, VL_SLIDE_DOWN), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslide1down_vx_i16m1(lines_7, 0, vl), vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 4), vl), vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 2), vl), vl);
  partial1c = __riscv_vadd_vv_i16m1(
      partial1c, __riscv_vslidedown_vx_i16m1(lines_6, 4, VL_SLIDE_DOWN), vl);
  partial1c = __riscv_vadd_vv_i16m1(
      partial1c, __riscv_vslidedown_vx_i16m1(lines_7, 2, VL_SLIDE_DOWN), vl);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b, __riscv_vslidedown_vx_i16m1(lines_6, 6, VL_SLIDE_DOWN), vl);
  partial3c = __riscv_vadd_vv_i16m1(
      partial3c,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 6), vl), vl);
  partial3c = __riscv_vadd_vv_i16m1(partial3c, lines_7, vl);

  // Special case for direction 2 as it's just a sum along each line.
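  // Each line contributes the square of its own 8-pixel sum, weighted by
  // 840 / 8 = 105 like the other full-length lines, so no shifting is
  // required.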
  vint32m1_t partial2a =
      horizontal_add_4d_s16x8(lines_0, lines_1, lines_2, lines_3);
  vint32m1_t partial2b =
      horizontal_add_4d_s16x8(lines_4, lines_5, lines_6, lines_7);
  vuint32m1_t partial2a_u32 = __riscv_vreinterpret_v_i32m1_u32m1(
      __riscv_vmul_vv_i32m1(partial2a, partial2a, 4));
  vuint32m1_t partial2b_u32 = __riscv_vreinterpret_v_i32m1_u32m1(
      __riscv_vmul_vv_i32m1(partial2b, partial2b, 4));

  // const0 = { 840, 420, 280, 210, }
  vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4);
  // const1 = { 168, 140, 120, 105, }
  vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4);
  // const2 = { 420, 210, 140, 105, };
  vuint32m1_t const2 = __riscv_vmv_s_x_u32m1(105, 4);
  const2 = __riscv_vslide1up_vx_u32m1(const2, 140, 4);
  const2 = __riscv_vslide1up_vx_u32m1(const2, 210, 4);
  const2 = __riscv_vslide1up_vx_u32m1(const2, 420, 4);

  static const uint16_t tab_u16[8] = { 0, 6, 5, 4, 3, 2, 1, 0 };
  vuint32m1_t costs_0, costs_1, costs_2, costs_3;
  vuint16m1_t template_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8);

  // Reverse partial b.
  // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, }
  vuint16m1_t index_u16m1 =
      __riscv_vslide1down_vx_u16m1(template_u16m1, 7, 8);
  vint16m1_t partial0b_rv =
      __riscv_vrgather_vv_i16m1(partial0b, index_u16m1, 8);
  costs_0 = fold_mul_and_sum_rvv(partial0a, partial0b_rv, const0, const1);

  // Reverse partial c.
  // pattern = { 5, 4, 3, 2, 1, 0, 6, 7, }
  vuint16m1_t index_pair_u16m1 =
      __riscv_vslide1down_vx_u16m1(template_u16m1, 6, 8);
  index_pair_u16m1 = __riscv_vslide1down_vx_u16m1(index_pair_u16m1, 7, 8);
  vint16m1_t partialc_rv =
      __riscv_vrgather_vv_i16m1(partial1c, index_pair_u16m1, 8);
  costs_1 =
      fold_mul_and_sum_pairwise_rvv(partial1a, partial1b, partialc_rv, const2);

  costs_2 = __riscv_vadd_vv_u32m1(partial2a_u32, partial2b_u32, 4);
  costs_2 = __riscv_vmul_vx_u32m1(costs_2, 105, 4);

  vint16m1_t partial3a_rv =
      __riscv_vrgather_vv_i16m1(partial3a, index_pair_u16m1, 8);
  costs_3 = fold_mul_and_sum_pairwise_rvv(partial3c, partial3b, partial3a_rv,
                                          const2);

  // combine values
  vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1);
  vuint32m1_t cost0_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4);
  vuint32m1_t cost1_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4);
  vuint32m1_t cost2_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4);
  vuint32m1_t cost3_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4);

  costs_0 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4);
  costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost2_sum, 2, 4);
  costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost3_sum, 3, 4);
  __riscv_vse32_v_u32m1(&cost[0], costs_0, 4);
  return costs_0;
}

int cdef_find_dir_rvv(const uint16_t *img, int stride, int32_t *var,
                      int coeff_shift) {
  size_t vl = 8;
  size_t vlmax = __riscv_vsetvlmax_e16m1();
  vuint16m1_t s;
  vint16m1_t lines_0, lines_1, lines_2, lines_3;
  vint16m1_t lines_4, lines_5, lines_6, lines_7;
  vuint16m1_t vec_zero_u16m1 =
      __riscv_vmv_v_x_u16m1(0, __riscv_vsetvl_e16m1(16));

  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_0 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_0 = __riscv_vsub_vx_i16m1(lines_0, 128, vl);
  img += stride;
  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_1 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_1 = __riscv_vsub_vx_i16m1(lines_1, 128, vl);
  img += stride;

  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_2 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_2 = __riscv_vsub_vx_i16m1(lines_2, 128, vl);
  img += stride;

  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_3 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_3 = __riscv_vsub_vx_i16m1(lines_3, 128, vl);
  img += stride;

  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_4 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_4 = __riscv_vsub_vx_i16m1(lines_4, 128, vl);
  img += stride;

  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_5 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_5 = __riscv_vsub_vx_i16m1(lines_5, 128, vl);
  img += stride;

  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_6 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_6 = __riscv_vsub_vx_i16m1(lines_6, 128, vl);
  img += stride;

  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_7 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_7 = __riscv_vsub_vx_i16m1(lines_7, 128, vl);

  // Compute "mostly vertical" directions.
  uint32_t cost[8];
  vuint32m1_t cost47 =
      compute_vert_directions_rvv(lines_0, lines_1, lines_2, lines_3, lines_4,
                                  lines_5, lines_6, lines_7, cost + 4, vl);

  // Compute "mostly horizontal" directions.
  vuint32m1_t cost03 =
      compute_horiz_directions_rvv(lines_0, lines_1, lines_2, lines_3,
                                   lines_4, lines_5, lines_6, lines_7, cost,
                                   vl);

  // Find max cost as well as its index to get best_dir.
  // The max cost needs to be propagated in the whole vector to find its
  // position in the original cost vectors cost03 and cost47.
  vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1);
  vuint32m1_t cost07 = __riscv_vmaxu_vv_u32m1(cost03, cost47, 4);
  uint32_t best_cost = __riscv_vmv_x_s_u32m1_u32(
      __riscv_vredmaxu_vs_u32m1_u32m1(cost07, vec_scalar_u32m1, 4));

  vbool32_t mask_cost = __riscv_vmseq_vx_u32m1_b32(cost03, best_cost, 4);
  long best_dir = __riscv_vfirst_m_b32(mask_cost, 4);
  if (best_dir == -1) {
    mask_cost = __riscv_vmseq_vx_u32m1_b32(cost47, best_cost, 4);
    best_dir = __riscv_vfirst_m_b32(mask_cost, 4);
    best_dir += 4;
  }

  // Difference between the optimal variance and the variance along the
  // orthogonal direction. Again, the sum(x^2) terms cancel out.
  *var = best_cost - cost[(best_dir + 4) & 7];
  // We'd normally divide by 840, but dividing by 1024 is close enough
  // for what we're going to do with this.
  *var >>= 10;
  return (int)best_dir;
}
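// Illustrative use (ours, not part of this file): given an 8x8 block of
// pixels already widened to uint16_t,
//   int32_t var;
//   int dir = cdef_find_dir_rvv(img, stride, &var, coeff_shift);
// yields the dominant direction in 0..7, with `var` measuring how much
// stronger it is than the orthogonal direction.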
void cdef_copy_rect8_8bit_to_16bit_rvv(uint16_t *dst, int dstride,
                                       const uint8_t *src, int sstride,
                                       int width, int height) {
  do {
    int w = 0;
    size_t num_cols = width;

    while (num_cols > 0) {
      size_t vl = __riscv_vsetvl_e8mf2(num_cols);
      vuint8mf2_t u8_src = __riscv_vle8_v_u8mf2(src + w, vl);
      vuint16m1_t u16_src = __riscv_vwcvtu_x_x_v_u16m1(u8_src, vl);
      __riscv_vse16_v_u16m1(dst + w, u16_src, vl);

      w += vl;
      num_cols -= vl;
    }

    src += sstride;
    dst += dstride;
  } while (--height != 0);
}

void cdef_copy_rect8_16bit_to_16bit_rvv(uint16_t *dst, int dstride,
                                        const uint16_t *src, int sstride,
                                        int width, int height) {
  do {
    int w = 0;
    size_t num_cols = width;

    while (num_cols > 0) {
      size_t vl = __riscv_vsetvl_e16m1(num_cols);
      vuint16m1_t u16_src = __riscv_vle16_v_u16m1(src + w, vl);
      __riscv_vse16_v_u16m1(dst + w, u16_src, vl);

      w += vl;
      num_cols -= vl;
    }

    src += sstride;
    dst += dstride;
  } while (--height != 0);
}

static inline vint16m1_t constrain16(vint16m1_t a, vint16m1_t b,
                                     int16_t threshold, int16_t adjdamp,
                                     size_t vl) {
  if (!threshold) return __riscv_vmv_v_x_i16m1(0, vl);
  const vbool16_t mask = __riscv_vmslt_vv_i16m1_b16(a, b, vl);
  const vint16m1_t diff = __riscv_vsub_vv_i16m1(a, b, vl);
  const vint16m1_t abs_diff = __riscv_vneg_v_i16m1_tumu(mask, diff, diff, vl);
  const vint16m1_t shift = __riscv_vsra_vx_i16m1(abs_diff, adjdamp, vl);
  const vint16m1_t thr = __riscv_vmv_v_x_i16m1(threshold, vl);
  const vint16m1_t sub = __riscv_vsub_vv_i16m1(thr, shift, vl);
  const vint16m1_t max = __riscv_vmax_vx_i16m1(sub, 0, vl);
  const vint16m1_t min = __riscv_vmin_vv_i16m1(abs_diff, max, vl);
  return __riscv_vneg_v_i16m1_tumu(mask, min, min, vl);
}
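// Per-lane scalar model of constrain16 above (exposition only; the helper
// name is ours). This is the usual CDEF constraint: a tap difference is
// progressively attenuated once it exceeds what the strength/damping pair
// allows.
//
//   static int16_t constrain_model(int16_t a, int16_t b, int16_t strength,
//                                  int16_t adjdamp) {
//     const int diff = a - b;
//     const int abs_diff = diff < 0 ? -diff : diff;
//     const int clamped =
//         AOMMIN(abs_diff, AOMMAX(0, strength - (abs_diff >> adjdamp)));
//     return (int16_t)(diff < 0 ? -clamped : clamped);
//   }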
static inline vint16m1_t vmax_mask(vint16m1_t a, vint16m1_t b, size_t vl) {
  const vbool16_t mask =
      __riscv_vmseq_vx_i16m1_b16(a, (int16_t)CDEF_VERY_LARGE, vl);
  const vint16m1_t val = __riscv_vmerge_vvm_i16m1(a, b, mask, vl);
  return __riscv_vmax_vv_i16m1(val, b, vl);
}

static inline vint16m1_t load_strided_i16_4x2(int16_t *addr,
                                              const ptrdiff_t stride,
                                              size_t vl) {
  const vint16m1_t px_l1 = __riscv_vle16_v_i16m1(addr + stride, vl);
  const vint16m1_t px_l0 = __riscv_vle16_v_i16m1(addr, vl);
  return __riscv_vslideup_vx_i16m1(px_l0, px_l1, 4, vl);
}

static inline void store_strided_u8_4x2(uint8_t *addr, vuint8mf2_t vdst,
                                        const ptrdiff_t stride, size_t vl) {
  __riscv_vse8_v_u8mf2(addr, vdst, vl >> 1);
  vdst = __riscv_vslidedown_vx_u8mf2(vdst, 4, vl);
  __riscv_vse8_v_u8mf2(addr + stride, vdst, vl >> 1);
}

static inline void store_strided_u16_4x2(uint16_t *addr, vuint16m1_t vdst,
                                         const ptrdiff_t stride, size_t vl) {
  __riscv_vse16_v_u16m1(addr, vdst, vl >> 1);
  vdst = __riscv_vslidedown_vx_u16m1(vdst, 4, vl);
  __riscv_vse16_v_u16m1(addr + stride, vdst, vl >> 1);
}

#define LOAD_PIX(addr)                                              \
  const vint16m1_t px = __riscv_vle16_v_i16m1((int16_t *)addr, vl); \
  vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl)

#define LOAD_PIX4(addr)                                        \
  const vint16m1_t px =                                        \
      load_strided_i16_4x2((int16_t *)addr, CDEF_BSTRIDE, vl); \
  vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl)

#define LOAD_DIR(p, addr, o0, o1)                                          \
  const vint16m1_t p##0 = __riscv_vle16_v_i16m1((int16_t *)addr + o0, vl); \
  const vint16m1_t p##1 = __riscv_vle16_v_i16m1((int16_t *)addr - o0, vl); \
  const vint16m1_t p##2 = __riscv_vle16_v_i16m1((int16_t *)addr + o1, vl); \
  const vint16m1_t p##3 = __riscv_vle16_v_i16m1((int16_t *)addr - o1, vl)

#define LOAD_DIR4(p, addr, o0, o1)                                  \
  const vint16m1_t p##0 =                                           \
      load_strided_i16_4x2((int16_t *)addr + o0, CDEF_BSTRIDE, vl); \
  const vint16m1_t p##1 =                                           \
      load_strided_i16_4x2((int16_t *)addr - o0, CDEF_BSTRIDE, vl); \
  const vint16m1_t p##2 =                                           \
      load_strided_i16_4x2((int16_t *)addr + o1, CDEF_BSTRIDE, vl); \
  const vint16m1_t p##3 =                                           \
      load_strided_i16_4x2((int16_t *)addr - o1, CDEF_BSTRIDE, vl)

#define MAKE_TAPS                                                         \
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; \
  const int16_t tap0 = (int16_t)(pri_taps[0]);                            \
  const int16_t tap1 = (int16_t)(pri_taps[1])

#define CONSTRAIN(p, strength, shift)                               \
  vint16m1_t p##_c0 =                                               \
      constrain16(p##0, px, (int16_t)strength, (int16_t)shift, vl); \
  vint16m1_t p##_c1 =                                               \
      constrain16(p##1, px, (int16_t)strength, (int16_t)shift, vl); \
  vint16m1_t p##_c2 =                                               \
      constrain16(p##2, px, (int16_t)strength, (int16_t)shift, vl); \
  vint16m1_t p##_c3 =                                               \
      constrain16(p##3, px, (int16_t)strength, (int16_t)shift, vl)

#define SETUP_MINMAX   \
  vint16m1_t max = px; \
  vint16m1_t min = px

#define MIN_MAX(p)                              \
  do {                                          \
    max = vmax_mask(p##0, max, vl);             \
    min = __riscv_vmin_vv_i16m1(p##0, min, vl); \
    max = vmax_mask(p##1, max, vl);             \
    min = __riscv_vmin_vv_i16m1(p##1, min, vl); \
    max = vmax_mask(p##2, max, vl);             \
    min = __riscv_vmin_vv_i16m1(p##2, min, vl); \
    max = vmax_mask(p##3, max, vl);             \
    min = __riscv_vmin_vv_i16m1(p##3, min, vl); \
  } while (0)

#define PRI_0_UPDATE_SUM(p)                                             \
  const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \
  const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \
  sum = __riscv_vmacc_vx_i16m1(sum, tap0, p##sum0, vl);                 \
  sum = __riscv_vmacc_vx_i16m1(sum, tap1, p##sum1, vl)

#define UPDATE_SUM(p)                                                   \
  const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \
  const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \
  sum = __riscv_vadd_vv_i16m1(sum, p##sum0, vl);                        \
  sum = __riscv_vadd_vv_i16m1(sum, p##sum1, vl)

#define SEC_0_UPDATE_SUM(p)                                               \
  const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl);   \
  const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl);   \
  const vint16m1_t p##sum2 = __riscv_vadd_vv_i16m1(p##sum0, p##sum1, vl); \
  sum = __riscv_vadd_vv_i16m1(sum, __riscv_vsll_vx_i16m1(p##sum2, 1, vl), vl)

#define BIAS                                                                  \
  const vbool16_t mask = __riscv_vmslt_vx_i16m1_b16(sum, 0, vl);              \
  const vint16m1_t v_8 = __riscv_vmv_v_x_i16m1(8, vl);                        \
  const vint16m1_t bias = __riscv_vsub_vx_i16m1_tumu(mask, v_8, v_8, 1, vl);  \
  const vint16m1_t unclamped = __riscv_vadd_vv_i16m1(                         \
      px, __riscv_vsra_vx_i16m1(__riscv_vadd_vv_i16m1(bias, sum, vl), 4, vl), \
      vl)

#define STORE4                                     \
  do {                                             \
    store_strided_u8_4x2(dst8, vdst, dstride, vl); \
                                                   \
    in += (CDEF_BSTRIDE << 1);                     \
    dst8 += (dstride << 1);                        \
  } while (0)

#define STORE4_CLAMPED                                       \
  do {                                                       \
    BIAS;                                                    \
    vint16m1_t clamped = __riscv_vmin_vv_i16m1(              \
        __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \
    vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(            \
        __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl);    \
    STORE4;                                                  \
  } while (0)

#define STORE4_UNCLAMPED                                    \
  do {                                                      \
    BIAS;                                                   \
    vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(           \
        __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl); \
    STORE4;                                                 \
  } while (0)

#define STORE8                            \
  do {                                    \
    __riscv_vse8_v_u8mf2(dst8, vdst, vl); \
                                          \
    in += CDEF_BSTRIDE;                   \
    dst8 += dstride;                      \
  } while (0)

#define STORE8_CLAMPED                                       \
  do {                                                       \
    BIAS;                                                    \
    vint16m1_t clamped = __riscv_vmin_vv_i16m1(              \
        __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \
    vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(            \
        __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl);    \
    STORE8;                                                  \
  } while (0)

#define STORE8_UNCLAMPED                                    \
  do {                                                      \
    BIAS;                                                   \
    vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(           \
        __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl); \
    STORE8;                                                 \
  } while (0)

#define STORE16_4                                    \
  do {                                               \
    store_strided_u16_4x2(dst16, vdst, dstride, vl); \
                                                     \
    in += (CDEF_BSTRIDE << 1);                       \
    dst16 += (dstride << 1);                         \
  } while (0)
#define STORE16_4_CLAMPED                                           \
  do {                                                              \
    BIAS;                                                           \
    vint16m1_t clamped = __riscv_vmin_vv_i16m1(                     \
        __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl);        \
    vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped); \
    STORE16_4;                                                      \
  } while (0)

#define STORE16_4_UNCLAMPED                                           \
  do {                                                                \
    BIAS;                                                             \
    vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped); \
    STORE16_4;                                                        \
  } while (0)

#define STORE16                             \
  do {                                      \
    __riscv_vse16_v_u16m1(dst16, vdst, vl); \
                                            \
    in += CDEF_BSTRIDE;                     \
    dst16 += dstride;                       \
  } while (0)

#define STORE16_CLAMPED                                             \
  do {                                                              \
    BIAS;                                                           \
    vint16m1_t clamped = __riscv_vmin_vv_i16m1(                     \
        __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl);        \
    vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped); \
    STORE16;                                                        \
  } while (0)

#define STORE16_UNCLAMPED                                             \
  do {                                                                \
    BIAS;                                                             \
    vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped); \
    STORE16;                                                          \
  } while (0)
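// Four variants of the 8-bit filter follow (the 16-bit versions further down
// mirror them): _0 applies both primary and secondary taps and clamps the
// result to the local min/max, _1 applies primary taps only, _2 applies
// secondary taps only, and _3 is a straight copy with no filtering.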
void cdef_filter_8_0_rvv(void *dest, int dstride, const uint16_t *in,
                         int pri_strength, int sec_strength, int dir,
                         int pri_damping, int sec_damping, int coeff_shift,
                         int block_width, int block_height) {
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  MAKE_TAPS;

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;
    int h = block_height;
    const size_t vl = block_width;

    do {
      LOAD_PIX(in);
      SETUP_MINMAX;

      // Primary pass
      LOAD_DIR(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      MIN_MAX(p);
      PRI_0_UPDATE_SUM(p);

      // Secondary pass 1
      LOAD_DIR(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      MIN_MAX(s);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      MIN_MAX(s2);
      UPDATE_SUM(s2);

      // Store
      STORE8_CLAMPED;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;
    int h = block_height;
    const size_t vl = block_width << 1;

    do {
      LOAD_PIX4(in);
      SETUP_MINMAX;

      // Primary pass
      LOAD_DIR4(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      MIN_MAX(p);
      PRI_0_UPDATE_SUM(p);

      // Secondary pass 1
      LOAD_DIR4(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      MIN_MAX(s);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR4(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      MIN_MAX(s2);
      UPDATE_SUM(s2);

      // Store
      STORE4_CLAMPED;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_8_1_rvv(void *dest, int dstride, const uint16_t *in,
                         int pri_strength, int sec_strength, int dir,
                         int pri_damping, int sec_damping, int coeff_shift,
                         int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  MAKE_TAPS;

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;
    int h = block_height;
    const size_t vl = block_width;

    do {
      LOAD_PIX(in);

      // Primary pass
      LOAD_DIR(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      PRI_0_UPDATE_SUM(p);

      // Store
      STORE8_UNCLAMPED;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;
    int h = block_height;
    const size_t vl = block_width << 1;

    do {
      LOAD_PIX4(in);

      // Primary pass
      LOAD_DIR4(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      PRI_0_UPDATE_SUM(p);

      // Store
      STORE4_UNCLAMPED;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_8_2_rvv(void *dest, int dstride, const uint16_t *in,
                         int pri_strength, int sec_strength, int dir,
                         int pri_damping, int sec_damping, int coeff_shift,
                         int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];

  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;
    int h = block_height;
    const size_t vl = block_width;

    do {
      LOAD_PIX(in);

      // Secondary pass 1
      LOAD_DIR(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      UPDATE_SUM(s2);

      // Store
      STORE8_UNCLAMPED;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;
    int h = block_height;
    const size_t vl = block_width << 1;

    do {
      LOAD_PIX4(in);

      // Secondary pass 1
      LOAD_DIR4(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR4(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      UPDATE_SUM(s2);

      // Store
      STORE4_UNCLAMPED;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_8_3_rvv(void *dest, int dstride, const uint16_t *in,
                         int pri_strength, int sec_strength, int dir,
                         int pri_damping, int sec_damping, int coeff_shift,
                         int block_width, int block_height) {
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;
    int h = block_height;
    const size_t vl = block_width;

    do {
      const vuint16m1_t px = __riscv_vle16_v_u16m1(in, vl);
      const vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(px, vl);
      __riscv_vse8_v_u8mf2(dst8, vdst, vl);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;
    int h = block_height;
    const size_t vl = block_width << 1;

    do {
      const vint16m1_t px =
          load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl);
      vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(
          __riscv_vreinterpret_v_i16m1_u16m1(px), vl);
      store_strided_u8_4x2(dst8, vdst, dstride, vl);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}
void cdef_filter_16_0_rvv(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  MAKE_TAPS;

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;
    int h = block_height;
    const size_t vl = block_width;

    do {
      LOAD_PIX(in);
      SETUP_MINMAX;

      // Primary pass
      LOAD_DIR(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      MIN_MAX(p);
      PRI_0_UPDATE_SUM(p);

      // Secondary pass 1
      LOAD_DIR(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      MIN_MAX(s);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      MIN_MAX(s2);
      UPDATE_SUM(s2);

      // Store
      STORE16_CLAMPED;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;
    int h = block_height;
    const size_t vl = block_width << 1;

    do {
      LOAD_PIX4(in);
      SETUP_MINMAX;

      // Primary pass
      LOAD_DIR4(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      MIN_MAX(p);
      PRI_0_UPDATE_SUM(p);

      // Secondary pass 1
      LOAD_DIR4(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      MIN_MAX(s);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR4(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      MIN_MAX(s2);
      UPDATE_SUM(s2);

      // Store
      STORE16_4_CLAMPED;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_1_rvv(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  MAKE_TAPS;

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;
    int h = block_height;
    const size_t vl = block_width;

    do {
      LOAD_PIX(in);

      // Primary pass
      LOAD_DIR(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      PRI_0_UPDATE_SUM(p);

      // Store
      STORE16_UNCLAMPED;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;
    int h = block_height;
    const size_t vl = block_width << 1;

    do {
      LOAD_PIX4(in);

      // Primary pass
      LOAD_DIR4(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      PRI_0_UPDATE_SUM(p);

      // Store
      STORE16_4_UNCLAMPED;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_2_rvv(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];

  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;
    int h = block_height;
    const size_t vl = block_width;

    do {
      LOAD_PIX(in);

      // Secondary pass 1
      LOAD_DIR(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      UPDATE_SUM(s2);

      // Store
      STORE16_UNCLAMPED;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;
    int h = block_height;
    const size_t vl = block_width << 1;

    do {
      LOAD_PIX4(in);

      // Secondary pass 1
      LOAD_DIR4(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR4(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      UPDATE_SUM(s2);

      // Store
      STORE16_4_UNCLAMPED;
      h -= 2;
    } while (h != 0);
  }
}
void cdef_filter_16_3_rvv(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;
    int h = block_height;
    const size_t vl = block_width;

    do {
      const vuint16m1_t px = __riscv_vle16_v_u16m1(in, vl);
      __riscv_vse16_v_u16m1(dst16, px, vl);

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;
    int h = block_height;
    const size_t vl = block_width << 1;

    do {
      const vint16m1_t px =
          load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl);
      vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(px);
      store_strided_u16_4x2(dst16, vdst, dstride, vl);

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}