From 0c494114ccbedf339c3ac6ea9923d726b6563879 Mon Sep 17 00:00:00 2001 From: Seppo Tomperi Date: Thu, 5 Feb 2015 06:22:19 +0000 Subject: [PATCH] hevcdsp: ARM NEON optimized deblocking filter cherry picked from commit 1b9ee47d2f43b0a029a9468233626102eb1473b8 Signed-off-by: Michael Niedermayer --- libavcodec/arm/Makefile | 2 + libavcodec/arm/hevcdsp_deblock_neon.S | 385 ++++++++++++++++++++++++++++++++++ libavcodec/arm/hevcdsp_init_neon.c | 48 +++++ libavcodec/hevcdsp.c | 2 + libavcodec/hevcdsp.h | 2 +- 5 files changed, 438 insertions(+), 1 deletion(-) create mode 100644 libavcodec/arm/hevcdsp_deblock_neon.S create mode 100644 libavcodec/arm/hevcdsp_init_neon.c diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index e0af6bc..eb9da26 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -132,6 +132,8 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \ arm/synth_filter_neon.o +NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ + arm/hevcdsp_deblock_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S new file mode 100644 index 0000000..e5998c1 --- /dev/null +++ b/libavcodec/arm/hevcdsp_deblock_neon.S @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2014 Seppo Tomperi + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +#include "libavutil/arm/asm.S" +#include "neon.S" + +.macro hevc_loop_filter_chroma_start + ldr r12, [r2] + ldr r3, [r2, #4] + add r2, r3, r12 + cmp r2, #0 + it eq + bxeq lr +.endm + +.macro hevc_loop_filter_chroma_body + vsubl.u8 q4, d4, d2 + vsubl.u8 q11, d18, d19 + vshl.i16 q4, #2 + vadd.i16 q11, q4 + vdup.16 d0, r12 + vdup.16 d1, r3 + vrshr.s16 q11, q11, #3 + vneg.s16 q12, q0 + vmovl.u8 q2, d4 + vmin.s16 q11, q11, q0 + vmax.s16 q11, q11, q12 + vaddw.u8 q1, q11, d2 + vsub.i16 q2, q11 + vqmovun.s16 d2, q1 + vqmovun.s16 d4, q2 +.endm + +.macro hevc_loop_filter_luma_start + ldr r12, [r3] + ldr r3, [r3, #4] + lsl r3, #16 + orr r3, r12 + cmp r3, #0 + it eq + bxeq lr + lsr r3, #16 +.endm + +.macro hevc_loop_filter_luma_body + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + vmovl.u8 q12, d24 + vmovl.u8 q13, d26 + vmovl.u8 q14, d28 + vmovl.u8 q15, d30 + + vadd.i16 q7, q9, q11 + vadd.i16 q6, q14, q12 + vsub.i16 q7, q10 + vsub.i16 q6, q13 + vabd.s16 q7, q7, q10 + vabd.s16 q6, q6, q13 + + + vdup.16 q0, r2 + vmov q4, q7 + vmov q5, q6 + vdup.16 d4, r12 + vtrn.16 q7, q4 + vtrn.16 q6, q5 + + vshl.u64 q7, #32 + vshr.u64 q4, #32 + vshl.u64 q6, #32 + vshr.u64 q5, #32 + vshr.u64 q7, #32 + vshr.u64 q6, #32 + vshl.u64 q5, #32 + vshl.u64 q4, #32 + vorr q6, q5 + vorr q7, q4 + vdup.16 d5, r3 + vadd.i16 q5, q7, q6 + + vmov q4, q5 + vmov q3, q5 + vtrn.32 q3, q4 + + vadd.i16 q4, q3 + + vshl.s16 q5, q5, #1 + vcgt.s16 q3, q0, q4 + + vmovn.i16 d6, q3 + vshr.s16 q1, q0, #2 + vmovn.i16 d6, q3 + vcgt.s16 q5, q1, q5 + vmov r7, s12 + cmp r7, #0 + beq bypasswrite + + vpadd.i32 d0, d14, d12 + vpadd.i32 d1, d15, d13 + vmov q4, q2 + vshl.s16 q2, #2 + vshr.s16 q1, q1, #1 + vrhadd.s16 q2, q4 + + vabd.s16 q7, q8, q11 + vaba.s16 q7, q15, q12 + + vmovn.i32 d0, q0 + vmov r5, r6, s0, s1 + vcgt.s16 q6, q1, q7 + vand q5, q5, q6 + vabd.s16 q7, q11, q12 + vcgt.s16 q6, q2, q7 + vand q5, q5, q6 + + vmov q2, q5 + vtrn.s16 q5, q2 + vshr.u64 q2, #32 + vshl.u64 q5, #32 + vshl.u64 q2, #32 + vshr.u64 q5, #32 + vorr q5, q2 + + vmov q2, q5 + vshl.i16 q7, q4, #1 + vtrn.32 q2, q5 + vand q5, q2 + vneg.s16 q6, q7 + vmovn.i16 d4, q5 + vmovn.i16 d4, q2 + vmov r8, s8 + + and r9, r8, r7 + cmp r9, #0 + beq weakfilter_\@ + + vadd.i16 q2, q11, q12 + vadd.i16 q4, q9, q8 + vadd.i16 q1, q2, q10 + vdup.16 d10, r9 + vadd.i16 q0, q1, q9 + vshl.i16 q4, #1 + lsr r9, #16 + vadd.i16 q1, q0 + vrshr.s16 q3, q0, #2 + vadd.i16 q1, q13 + vadd.i16 q4, q0 + vsub.i16 q3, q10 + vrshr.s16 q1, #3 + vrshr.s16 q4, #3 + vmax.s16 q3, q6 + vsub.i16 q1, q11 + vsub.i16 q4, q9 + vmin.s16 q3, q7 + vmax.s16 q4, q6 + vmax.s16 q1, q6 + vadd.i16 q3, q10 + vmin.s16 q4, q7 + vmin.s16 q1, q7 + vdup.16 d11, r9 + vadd.i16 q4, q9 + vadd.i16 q1, q11 + vbit q9, q4, q5 + vadd.i16 q4, q2, q13 + vbit q11, q1, q5 + vadd.i16 q0, q4, q14 + vadd.i16 q2, q15, q14 + vadd.i16 q4, q0 + + vshl.i16 q2, #1 + vadd.i16 q4, q10 + vbit q10, q3, q5 + vrshr.s16 q4, #3 + vadd.i16 q2, q0 + vrshr.s16 q3, q0, #2 + vsub.i16 q4, q12 + vrshr.s16 q2, #3 + vsub.i16 q3, q13 + vmax.s16 q4, q6 + vsub.i16 q2, q14 + vmax.s16 q3, q6 + vmin.s16 q4, q7 + vmax.s16 q2, q6 + vmin.s16 q3, q7 + vadd.i16 q4, q12 + vmin.s16 q2, q7 + vadd.i16 q3, q13 + vbit q12, q4, q5 + vadd.i16 q2, q14 + vbit q13, q3, q5 + vbit q14, q2, q5 + +weakfilter_\@: + mvn r8, r8 + and r9, r8, r7 + cmp r9, #0 + 
beq ready_\@ + + vdup.16 q4, r2 + + vdup.16 d10, r9 + lsr r9, #16 + vmov q1, q4 + vdup.16 d11, r9 + vshr.s16 q1, #1 + vsub.i16 q2, q12, q11 + vadd.i16 q4, q1 + vshl.s16 q0, q2, #3 + vshr.s16 q4, #3 + vadd.i16 q2, q0 + vsub.i16 q0, q13, q10 + vsub.i16 q2, q0 + vshl.i16 q0, q0, #1 + vsub.i16 q2, q0 + vshl.s16 q1, q7, 2 + vrshr.s16 q2, q2, #4 + vadd.i16 q1, q7 + vabs.s16 q3, q2 + vshr.s16 q6, q6, #1 + vcgt.s16 q1, q1, q3 + vand q5, q1 + vshr.s16 q7, q7, #1 + vmax.s16 q2, q2, q6 + vmin.s16 q2, q2, q7 + + vshr.s16 q7, q7, #1 + vrhadd.s16 q3, q9, q11 + vneg.s16 q6, q7 + vsub.s16 q3, q10 + vdup.16 d2, r5 + vhadd.s16 q3, q2 + vdup.16 d3, r6 + vmax.s16 q3, q3, q6 + vcgt.s16 q1, q4, q1 + vmin.s16 q3, q3, q7 + vand q1, q5 + vadd.i16 q3, q10 + lsr r5, #16 + lsr r6, #16 + vbit q10, q3, q1 + + vrhadd.s16 q3, q14, q12 + vdup.16 d2, r5 + vsub.s16 q3, q13 + vdup.16 d3, r6 + vhsub.s16 q3, q2 + vcgt.s16 q1, q4, q1 + vmax.s16 q3, q3, q6 + vand q1, q5 + vmin.s16 q3, q3, q7 + vadd.i16 q3, q13 + vbit q13, q3, q1 + vadd.i16 q0, q11, q2 + vsub.i16 q4, q12, q2 + vbit q11, q0, q5 + vbit q12, q4, q5 + +ready_\@: + vqmovun.s16 d16, q8 + vqmovun.s16 d18, q9 + vqmovun.s16 d20, q10 + vqmovun.s16 d22, q11 + vqmovun.s16 d24, q12 + vqmovun.s16 d26, q13 + vqmovun.s16 d28, q14 + vqmovun.s16 d30, q15 +.endm + +function ff_hevc_v_loop_filter_luma_neon, export=1 + hevc_loop_filter_luma_start + push {r5-r11} + vpush {d8-d15} + sub r0, #4 + vld1.8 {d16}, [r0], r1 + vld1.8 {d18}, [r0], r1 + vld1.8 {d20}, [r0], r1 + vld1.8 {d22}, [r0], r1 + vld1.8 {d24}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d28}, [r0], r1 + vld1.8 {d30}, [r0], r1 + sub r0, r0, r1, lsl #3 + transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 + hevc_loop_filter_luma_body + transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 + vst1.8 {d16}, [r0], r1 + vst1.8 {d18}, [r0], r1 + vst1.8 {d20}, [r0], r1 + vst1.8 {d22}, [r0], r1 + vst1.8 {d24}, [r0], r1 + vst1.8 {d26}, [r0], r1 + vst1.8 {d28}, [r0], r1 + vst1.8 {d30}, [r0] + vpop {d8-d15} + pop {r5-r11} + bx lr +endfunc + +function ff_hevc_h_loop_filter_luma_neon, export=1 + hevc_loop_filter_luma_start + push {r5-r11} + vpush {d8-d15} + sub r0, r0, r1, lsl #2 + vld1.8 {d16}, [r0], r1 + vld1.8 {d18}, [r0], r1 + vld1.8 {d20}, [r0], r1 + vld1.8 {d22}, [r0], r1 + vld1.8 {d24}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d28}, [r0], r1 + vld1.8 {d30}, [r0], r1 + sub r0, r0, r1, lsl #3 + add r0, r1 + hevc_loop_filter_luma_body + vst1.8 {d18}, [r0], r1 + vst1.8 {d20}, [r0], r1 + vst1.8 {d22}, [r0], r1 + vst1.8 {d24}, [r0], r1 + vst1.8 {d26}, [r0], r1 + vst1.8 {d28}, [r0] +bypasswrite: + vpop {d8-d15} + pop {r5-r11} + bx lr +endfunc + +function ff_hevc_v_loop_filter_chroma_neon, export=1 + hevc_loop_filter_chroma_start + sub r0, #4 + vld1.8 {d16}, [r0], r1 + vld1.8 {d17}, [r0], r1 + vld1.8 {d18}, [r0], r1 + vld1.8 {d2}, [r0], r1 + vld1.8 {d4}, [r0], r1 + vld1.8 {d19}, [r0], r1 + vld1.8 {d20}, [r0], r1 + vld1.8 {d21}, [r0], r1 + sub r0, r0, r1, lsl #3 + transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 + hevc_loop_filter_chroma_body + transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 + vst1.8 {d16}, [r0], r1 + vst1.8 {d17}, [r0], r1 + vst1.8 {d18}, [r0], r1 + vst1.8 {d2}, [r0], r1 + vst1.8 {d4}, [r0], r1 + vst1.8 {d19}, [r0], r1 + vst1.8 {d20}, [r0], r1 + vst1.8 {d21}, [r0] + bx lr +endfunc + +function ff_hevc_h_loop_filter_chroma_neon, export=1 + hevc_loop_filter_chroma_start + sub r0, r0, r1, lsl #1 + vld1.8 {d18}, [r0], r1 + vld1.8 {d2}, [r0], r1 + vld1.8 {d4}, [r0], r1 + vld1.8 {d19}, [r0] + sub r0, r0, r1, lsl #1 + 
hevc_loop_filter_chroma_body + vst1.8 {d2}, [r0], r1 + vst1.8 {d4}, [r0] + bx lr +endfunc diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c new file mode 100644 index 0000000..d63b293 --- /dev/null +++ b/libavcodec/arm/hevcdsp_init_neon.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2014 Seppo Tomperi + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/hevcdsp.h" + +void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + +static av_cold void hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +{ +#if HAVE_NEON + if (bit_depth == 8) { + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; + } +#endif // HAVE_NEON +} + +void ff_hevcdsp_init_arm(HEVCDSPContext *c, const int bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) + hevcdsp_init_neon(c, bit_depth); +} diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c index 7dcdfff..04af178 100644 --- a/libavcodec/hevcdsp.c +++ b/libavcodec/hevcdsp.c @@ -259,4 +259,6 @@ int i = 0; if (ARCH_X86) ff_hevc_dsp_init_x86(hevcdsp, bit_depth); + if (ARCH_ARM) + ff_hevcdsp_init_arm(hevcdsp, bit_depth); } diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h index bf7777f..a891ea7 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h @@ -128,5 +128,5 @@ extern const int8_t ff_hevc_epel_filters[7][4]; extern const int8_t ff_hevc_qpel_filters[3][16]; void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth); - +void ff_hevcdsp_init_arm(HEVCDSPContext *c, const int bit_depth); #endif /* AVCODEC_HEVCDSP_H */
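
For reference, hevc_loop_filter_chroma_body above vectorizes the standard HEVC chroma deblocking step. Below is a minimal scalar sketch of the same per-sample computation, under these assumptions: 8-bit samples only, and hypothetical helper names (chroma_deblock_edge_scalar, clip_uint8) that do not exist in the patch. The real entry points take the int *tc array and the no_p/no_q flags declared in hevcdsp_init_neon.c, and the NEON body filters eight lines at once, with tc[0] applied to the first four lines and tc[1] to the last four (vdup.16 d0, r12 / vdup.16 d1, r3 after hevc_loop_filter_chroma_start loads them). The "+ 4 >> 3" rounding in the formula is what vrshr.s16 #3 provides in the NEON code.

    #include <stddef.h>
    #include <stdint.h>

    /* Clamp an int to the 8-bit pixel range (per-lane, this is what
     * vqmovun.s16 does when narrowing back to bytes). */
    static inline uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* Scalar sketch: filter four lines across one chroma edge.
     * pix points at q0 of the first line; xstride steps across the edge,
     * ystride steps from line to line; tc is the clipping threshold. */
    static void chroma_deblock_edge_scalar(uint8_t *pix, ptrdiff_t xstride,
                                           ptrdiff_t ystride, int tc)
    {
        for (int d = 0; d < 4; d++) {
            int p1 = pix[-2 * xstride];
            int p0 = pix[-1 * xstride];
            int q0 = pix[0];
            int q1 = pix[ 1 * xstride];
            /* delta = clip3(-tc, tc, ((q0 - p0) * 4 + p1 - q1 + 4) >> 3) */
            int delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3;
            if (delta < -tc) delta = -tc;
            if (delta >  tc) delta =  tc;
            pix[-1 * xstride] = clip_uint8(p0 + delta);
            pix[0]            = clip_uint8(q0 - delta);
            pix += ystride;
        }
    }

The vertical variant (ff_hevc_v_loop_filter_chroma_neon) loads an 8x8 block and transposes so the same body works for both edge orientations; the horizontal variant reads the p1/p0/q0/q1 rows directly. The luma macro follows the same pattern at a larger scale: it computes the spec's second-derivative terms (|p2 - 2*p1 + p0| and |q2 - 2*q1 + q0|) per line, compares their sums against beta (passed in r2) to pick the strong or normal filter per 4-line segment, and clips the corrections to the tc-derived ranges (plus/minus 2*tc for the strong filter, plus/minus tc and plus/minus tc/2 for the normal filter), with the two tc values left in r12 and r3 by hevc_loop_filter_luma_start.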