From ebee0f22455e65e615b63fab3376156d74882271 Mon Sep 17 00:00:00 2001
From: shinchiro
Date: Sun, 27 Dec 2015 16:39:01 +0800
Subject: [PATCH] Add HEVC intrinsics

All patches come from:
http://git.1f0.de/gitweb?p=ffmpeg.git;a=summary;js=1
https://github.com/mpc-hc/FFmpeg.git

Series of patches:
hevc: port intrinsic SSE2 IDCT from OpenHEVC
hevc: port intra pred SIMD from OpenHEVC
x86/hevc: use DECLARE_ALIGNED for on-stack tmp arrays to avoid crashes
hevc: don't use deprecated YUVJ pixel formats
x86/hevc: correctly mark intrapred functions as SSE4
---
 libavcodec/hevc_ps.c                  |   2 -
 libavcodec/hevcpred.c                 |   2 +
 libavcodec/hevcpred.h                 |   1 +
 libavcodec/x86/Makefile               |   4 +-
 libavcodec/x86/hevc_idct_intrinsic.c  | 763 ++++++++++++++++++++++++++++
 libavcodec/x86/hevc_intra_intrinsic.c | 922 ++++++++++++++++++++++++++++++++++
 libavcodec/x86/hevcdsp.h              |  22 +
 libavcodec/x86/hevcdsp_init.c         |  50 ++
 libavcodec/x86/hevcpred.h             |  24 +
 9 files changed, 1787 insertions(+), 3 deletions(-)
 create mode 100644 libavcodec/x86/hevc_idct_intrinsic.c
 create mode 100644 libavcodec/x86/hevc_intra_intrinsic.c
 create mode 100644 libavcodec/x86/hevcpred.h

diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
index 14f908e..e68cf88 100644
--- a/libavcodec/hevc_ps.c
+++ b/libavcodec/hevc_ps.c
@@ -543,8 +543,6 @@ static void decode_vui(GetBitContext *gb, AVCodecContext *avctx,
     vui->video_format                    = get_bits(gb, 3);
     vui->video_full_range_flag           = get_bits1(gb);
     vui->colour_description_present_flag = get_bits1(gb);
-    if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P)
-        sps->pix_fmt = AV_PIX_FMT_YUVJ420P;
     if (vui->colour_description_present_flag) {
         vui->colour_primaries        = get_bits(gb, 8);
         vui->transfer_characteristic = get_bits(gb, 8);
diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c
index 02c1766..bd7ddc9 100644
--- a/libavcodec/hevcpred.c
+++ b/libavcodec/hevcpred.c
@@ -77,4 +77,6 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
 
     if (ARCH_MIPS)
         ff_hevc_pred_init_mips(hpc, bit_depth);
+    if (ARCH_X86)
+        ff_hevc_pred_init_x86(hpc, bit_depth);
 }
diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h
index eb17663..57e233d 100644
--- a/libavcodec/hevcpred.h
+++ b/libavcodec/hevcpred.h
@@ -42,5 +42,6 @@ typedef struct HEVCPredContext {
 
 void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
 void ff_hevc_pred_init_mips(HEVCPredContext *hpc, int bit_depth);
+void ff_hevc_pred_init_x86(HEVCPredContext *hpc, int bit_depth);
 
 #endif /* AVCODEC_HEVCPRED_H */
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 0d09fe6..9cbfaeb 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -145,7 +145,9 @@ YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o \
                                           x86/hevc_idct.o \
                                           x86/hevc_res_add.o \
                                           x86/hevc_sao.o \
-                                          x86/hevc_sao_10bit.o
+                                          x86/hevc_sao_10bit.o \
+                                          x86/hevc_idct_intrinsic.o \
+                                          x86/hevc_intra_intrinsic.o
 YASM-OBJS-$(CONFIG_JPEG2000_DECODER)   += x86/jpeg2000dsp.o
 YASM-OBJS-$(CONFIG_MLP_DECODER)        += x86/mlpdsp.o
 YASM-OBJS-$(CONFIG_MPEG4_DECODER)      += x86/xvididct.o
diff --git a/libavcodec/x86/hevc_idct_intrinsic.c b/libavcodec/x86/hevc_idct_intrinsic.c
new file mode 100644
index 0000000..ef970d7
--- /dev/null
+++ b/libavcodec/x86/hevc_idct_intrinsic.c
@@ -0,0 +1,763 @@
+#include "config.h"
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
+#include "libavcodec/hevc.h"
+#include "libavcodec/x86/hevcdsp.h"
+
+#ifdef __GNUC__
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#endif
+
+#if HAVE_SSE2
+#include <emmintrin.h>
+#endif
+
+DECLARE_ALIGNED(16, static const
int16_t, transform4x4_luma[8][8] )= +{ + { 29, +84, 29, +84, 29, +84, 29, +84 }, + { +74, +55, +74, +55, +74, +55, +74, +55 }, + { 55, -29, 55, -29, 55, -29, 55, -29 }, + { +74, -84, +74, -84, +74, -84, +74, -84 }, + { 74, -74, 74, -74, 74, -74, 74, -74 }, + { 0, +74, 0, +74, 0, +74, 0, +74 }, + { 84, +55, 84, +55, 84, +55, 84, +55 }, + { -74, -29, -74, -29, -74, -29, -74, -29 } +}; + +DECLARE_ALIGNED( 16, static const int16_t, transform4x4[4][8] ) = { + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, -64, 64, -64, 64, -64, 64, -64 }, + { 83, 36, 83, 36, 83, 36, 83, 36 }, + { 36, -83, 36, -83, 36, -83, 36, -83 } +}; + +DECLARE_ALIGNED(16, static const int16_t, transform8x8[12][1][8] )= +{ + {{ 89, 75, 89, 75, 89, 75, 89, 75 }}, + {{ 50, 18, 50, 18, 50, 18, 50, 18 }}, + {{ 75, -18, 75, -18, 75, -18, 75, -18 }}, + {{ -89, -50, -89, -50,-89, -50,-89, -50 }}, + {{ 50, -89, 50, -89, 50, -89, 50, -89 }}, + {{ 18, 75, 18, 75, 18, 75, 18, 75 }}, + {{ 18, -50, 18, -50, 18, -50, 18, -50 }}, + {{ 75, -89, 75, -89, 75, -89, 75, -89 }}, + {{ 64, 64, 64, 64, 64, 64, 64, 64 }}, + {{ 64, -64, 64, -64, 64, -64, 64, -64 }}, + {{ 83, 36, 83, 36, 83, 36, 83, 36 }}, + {{ 36, -83, 36, -83, 36, -83, 36, -83 }} +}; + +DECLARE_ALIGNED(16, static const int16_t, transform16x16_1[4][8][8] )= +{ + {/*1-3*/ /*2-6*/ + { 90, 87, 90, 87, 90, 87, 90, 87 }, + { 87, 57, 87, 57, 87, 57, 87, 57 }, + { 80, 9, 80, 9, 80, 9, 80, 9 }, + { 70, -43, 70, -43, 70, -43, 70, -43 }, + { 57, -80, 57, -80, 57, -80, 57, -80 }, + { 43, -90, 43, -90, 43, -90, 43, -90 }, + { 25, -70, 25, -70, 25, -70, 25, -70 }, + { 9, -25, 9, -25, 9, -25, 9, -25 }, + },{ /*5-7*/ /*10-14*/ + { 80, 70, 80, 70, 80, 70, 80, 70 }, + { 9, -43, 9, -43, 9, -43, 9, -43 }, + { -70, -87, -70, -87, -70, -87, -70, -87 }, + { -87, 9, -87, 9, -87, 9, -87, 9 }, + { -25, 90, -25, 90, -25, 90, -25, 90 }, + { 57, 25, 57, 25, 57, 25, 57, 25 }, + { 90, -80, 90, -80, 90, -80, 90, -80 }, + { 43, -57, 43, -57, 43, -57, 43, -57 }, + },{ /*9-11*/ /*18-22*/ + { 57, 43, 57, 43, 57, 43, 57, 43 }, + { -80, -90, -80, -90, -80, -90, -80, -90 }, + { -25, 57, -25, 57, -25, 57, -25, 57 }, + { 90, 25, 90, 25, 90, 25, 90, 25 }, + { -9, -87, -9, -87, -9, -87, -9, -87 }, + { -87, 70, -87, 70, -87, 70, -87, 70 }, + { 43, 9, 43, 9, 43, 9, 43, 9 }, + { 70, -80, 70, -80, 70, -80, 70, -80 }, + },{/*13-15*/ /* 26-30 */ + { 25, 9, 25, 9, 25, 9, 25, 9 }, + { -70, -25, -70, -25, -70, -25, -70, -25 }, + { 90, 43, 90, 43, 90, 43, 90, 43 }, + { -80, -57, -80, -57, -80, -57, -80, -57 }, + { 43, 70, 43, 70, 43, 70, 43, 70 }, + { 9, -80, 9, -80, 9, -80, 9, -80 }, + { -57, 87, -57, 87, -57, 87, -57, 87 }, + { 87, -90, 87, -90, 87, -90, 87, -90 }, + } +}; + +DECLARE_ALIGNED(16, static const int16_t, transform32x32[8][16][8] )= +{ + { /* 1-3 */ + { 90, 90, 90, 90, 90, 90, 90, 90 }, + { 90, 82, 90, 82, 90, 82, 90, 82 }, + { 88, 67, 88, 67, 88, 67, 88, 67 }, + { 85, 46, 85, 46, 85, 46, 85, 46 }, + { 82, 22, 82, 22, 82, 22, 82, 22 }, + { 78, -4, 78, -4, 78, -4, 78, -4 }, + { 73, -31, 73, -31, 73, -31, 73, -31 }, + { 67, -54, 67, -54, 67, -54, 67, -54 }, + { 61, -73, 61, -73, 61, -73, 61, -73 }, + { 54, -85, 54, -85, 54, -85, 54, -85 }, + { 46, -90, 46, -90, 46, -90, 46, -90 }, + { 38, -88, 38, -88, 38, -88, 38, -88 }, + { 31, -78, 31, -78, 31, -78, 31, -78 }, + { 22, -61, 22, -61, 22, -61, 22, -61 }, + { 13, -38, 13, -38, 13, -38, 13, -38 }, + { 4, -13, 4, -13, 4, -13, 4, -13 }, + },{/* 5-7 */ + { 88, 85, 88, 85, 88, 85, 88, 85 }, + { 67, 46, 67, 46, 67, 46, 67, 46 }, + { 31, -13, 31, -13, 31, -13, 31, -13 }, + { -13, 
-67, -13, -67, -13, -67, -13, -67 }, + { -54, -90, -54, -90, -54, -90, -54, -90 }, + { -82, -73, -82, -73, -82, -73, -82, -73 }, + { -90, -22, -90, -22, -90, -22, -90, -22 }, + { -78, 38, -78, 38, -78, 38, -78, 38 }, + { -46, 82, -46, 82, -46, 82, -46, 82 }, + { -4, 88, -4, 88, -4, 88, -4, 88 }, + { 38, 54, 38, 54, 38, 54, 38, 54 }, + { 73, -4, 73, -4, 73, -4, 73, -4 }, + { 90, -61, 90, -61, 90, -61, 90, -61 }, + { 85, -90, 85, -90, 85, -90, 85, -90 }, + { 61, -78, 61, -78, 61, -78, 61, -78 }, + { 22, -31, 22, -31, 22, -31, 22, -31 }, + },{/* 9-11 */ + { 82, 78, 82, 78, 82, 78, 82, 78 }, + { 22, -4, 22, -4, 22, -4, 22, -4 }, + { -54, -82, -54, -82, -54, -82, -54, -82 }, + { -90, -73, -90, -73, -90, -73, -90, -73 }, + { -61, 13, -61, 13, -61, 13, -61, 13 }, + { 13, 85, 13, 85, 13, 85, 13, 85 }, + { 78, 67, 78, 67, 78, 67, 78, 67 }, + { 85, -22, 85, -22, 85, -22, 85, -22 }, + { 31, -88, 31, -88, 31, -88, 31, -88 }, + { -46, -61, -46, -61, -46, -61, -46, -61 }, + { -90, 31, -90, 31, -90, 31, -90, 31 }, + { -67, 90, -67, 90, -67, 90, -67, 90 }, + { 4, 54, 4, 54, 4, 54, 4, 54 }, + { 73, -38, 73, -38, 73, -38, 73, -38 }, + { 88, -90, 88, -90, 88, -90, 88, -90 }, + { 38, -46, 38, -46, 38, -46, 38, -46 }, + },{/* 13-15 */ + { 73, 67, 73, 67, 73, 67, 73, 67 }, + { -31, -54, -31, -54, -31, -54, -31, -54 }, + { -90, -78, -90, -78, -90, -78, -90, -78 }, + { -22, 38, -22, 38, -22, 38, -22, 38 }, + { 78, 85, 78, 85, 78, 85, 78, 85 }, + { 67, -22, 67, -22, 67, -22, 67, -22 }, + { -38, -90, -38, -90, -38, -90, -38, -90 }, + { -90, 4, -90, 4, -90, 4, -90, 4 }, + { -13, 90, -13, 90, -13, 90, -13, 90 }, + { 82, 13, 82, 13, 82, 13, 82, 13 }, + { 61, -88, 61, -88, 61, -88, 61, -88 }, + { -46, -31, -46, -31, -46, -31, -46, -31 }, + { -88, 82, -88, 82, -88, 82, -88, 82 }, + { -4, 46, -4, 46, -4, 46, -4, 46 }, + { 85, -73, 85, -73, 85, -73, 85, -73 }, + { 54, -61, 54, -61, 54, -61, 54, -61 }, + },{/* 17-19 */ + { 61, 54, 61, 54, 61, 54, 61, 54 }, + { -73, -85, -73, -85, -73, -85, -73, -85 }, + { -46, -4, -46, -4, -46, -4, -46, -4 }, + { 82, 88, 82, 88, 82, 88, 82, 88 }, + { 31, -46, 31, -46, 31, -46, 31, -46 }, + { -88, -61, -88, -61, -88, -61, -88, -61 }, + { -13, 82, -13, 82, -13, 82, -13, 82 }, + { 90, 13, 90, 13, 90, 13, 90, 13 }, + { -4, -90, -4, -90, -4, -90, -4, -90 }, + { -90, 38, -90, 38, -90, 38, -90, 38 }, + { 22, 67, 22, 67, 22, 67, 22, 67 }, + { 85, -78, 85, -78, 85, -78, 85, -78 }, + { -38, -22, -38, -22, -38, -22, -38, -22 }, + { -78, 90, -78, 90, -78, 90, -78, 90 }, + { 54, -31, 54, -31, 54, -31, 54, -31 }, + { 67, -73, 67, -73, 67, -73, 67, -73 }, + },{ /* 21-23 */ + { 46, 38, 46, 38, 46, 38, 46, 38 }, + { -90, -88, -90, -88, -90, -88, -90, -88 }, + { 38, 73, 38, 73, 38, 73, 38, 73 }, + { 54, -4, 54, -4, 54, -4, 54, -4 }, + { -90, -67, -90, -67, -90, -67, -90, -67 }, + { 31, 90, 31, 90, 31, 90, 31, 90 }, + { 61, -46, 61, -46, 61, -46, 61, -46 }, + { -88, -31, -88, -31, -88, -31, -88, -31 }, + { 22, 85, 22, 85, 22, 85, 22, 85 }, + { 67, -78, 67, -78, 67, -78, 67, -78 }, + { -85, 13, -85, 13, -85, 13, -85, 13 }, + { 13, 61, 13, 61, 13, 61, 13, 61 }, + { 73, -90, 73, -90, 73, -90, 73, -90 }, + { -82, 54, -82, 54, -82, 54, -82, 54 }, + { 4, 22, 4, 22, 4, 22, 4, 22 }, + { 78, -82, 78, -82, 78, -82, 78, -82 }, + },{ /* 25-27 */ + { 31, 22, 31, 22, 31, 22, 31, 22 }, + { -78, -61, -78, -61, -78, -61, -78, -61 }, + { 90, 85, 90, 85, 90, 85, 90, 85 }, + { -61, -90, -61, -90, -61, -90, -61, -90 }, + { 4, 73, 4, 73, 4, 73, 4, 73 }, + { 54, -38, 54, -38, 54, -38, 54, -38 }, + { -88, -4, -88, -4, -88, -4, -88, 
-4 }, + { 82, 46, 82, 46, 82, 46, 82, 46 }, + { -38, -78, -38, -78, -38, -78, -38, -78 }, + { -22, 90, -22, 90, -22, 90, -22, 90 }, + { 73, -82, 73, -82, 73, -82, 73, -82 }, + { -90, 54, -90, 54, -90, 54, -90, 54 }, + { 67, -13, 67, -13, 67, -13, 67, -13 }, + { -13, -31, -13, -31, -13, -31, -13, -31 }, + { -46, 67, -46, 67, -46, 67, -46, 67 }, + { 85, -88, 85, -88, 85, -88, 85, -88 }, + },{/* 29-31 */ + { 13, 4, 13, 4, 13, 4, 13, 4 }, + { -38, -13, -38, -13, -38, -13, -38, -13 }, + { 61, 22, 61, 22, 61, 22, 61, 22 }, + { -78, -31, -78, -31, -78, -31, -78, -31 }, + { 88, 38, 88, 38, 88, 38, 88, 38 }, + { -90, -46, -90, -46, -90, -46, -90, -46 }, + { 85, 54, 85, 54, 85, 54, 85, 54 }, + { -73, -61, -73, -61, -73, -61, -73, -61 }, + { 54, 67, 54, 67, 54, 67, 54, 67 }, + { -31, -73, -31, -73, -31, -73, -31, -73 }, + { 4, 78, 4, 78, 4, 78, 4, 78 }, + { 22, -82, 22, -82, 22, -82, 22, -82 }, + { -46, 85, -46, 85, -46, 85, -46, 85 }, + { 67, -88, 67, -88, 67, -88, 67, -88 }, + { -82, 90, -82, 90, -82, 90, -82, 90 }, + { 90, -90, 90, -90, 90, -90, 90, -90 }, + } +}; + +#define shift_1st 7 +#define add_1st (1 << (shift_1st - 1)) + +#define CLIP_PIXEL_MAX_10 0x03FF +#define CLIP_PIXEL_MAX_12 0x0FFF + +#if HAVE_SSE2 +void ff_hevc_transform_skip_8_sse(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride) +{ + uint8_t *dst = (uint8_t*)_dst; + ptrdiff_t stride = _stride; + int shift = 5; + int offset = 16; + __m128i r0, r1, r2, r3, r4, r5, r6, r9; + + r9 = _mm_setzero_si128(); + r2 = _mm_set1_epi16(offset); + + r0 = _mm_load_si128((__m128i*)(coeffs)); + r1 = _mm_load_si128((__m128i*)(coeffs + 8)); + + + r0 = _mm_adds_epi16(r0, r2); + r1 = _mm_adds_epi16(r1, r2); + + r0 = _mm_srai_epi16(r0, shift); + r1 = _mm_srai_epi16(r1, shift); + + r3 = _mm_loadl_epi64((__m128i*)(dst)); + r4 = _mm_loadl_epi64((__m128i*)(dst + stride)); + r5 = _mm_loadl_epi64((__m128i*)(dst + 2 * stride)); + r6 = _mm_loadl_epi64((__m128i*)(dst + 3 * stride)); + + r3 = _mm_unpacklo_epi8(r3, r9); + r4 = _mm_unpacklo_epi8(r4, r9); + r5 = _mm_unpacklo_epi8(r5, r9); + r6 = _mm_unpacklo_epi8(r6, r9); + r3 = _mm_unpacklo_epi64(r3, r4); + r4 = _mm_unpacklo_epi64(r5, r6); + + + r3 = _mm_adds_epi16(r3, r0); + r4 = _mm_adds_epi16(r4, r1); + + r3 = _mm_packus_epi16(r3, r4); + + *((uint32_t *)(dst)) = _mm_cvtsi128_si32(r3); + dst+=stride; + *((uint32_t *)(dst)) = _mm_cvtsi128_si32(_mm_srli_si128(r3, 4)); + dst+=stride; + *((uint32_t *)(dst)) = _mm_cvtsi128_si32(_mm_srli_si128(r3, 8)); + dst+=stride; + *((uint32_t *)(dst)) = _mm_cvtsi128_si32(_mm_srli_si128(r3, 12)); +} + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define INIT_8() \ + uint8_t *dst = (uint8_t*) _dst; \ + ptrdiff_t stride = _stride +#define INIT_10() \ + uint16_t *dst = (uint16_t*) _dst; \ + ptrdiff_t stride = _stride>>1 + +#define INIT_12() INIT_10() +#define INIT8_12() INIT8_10() + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define LOAD_EMPTY(dst, src) +#define LOAD4x4(dst, src) \ + dst ## 0 = _mm_load_si128((__m128i *) &src[0]); \ + dst ## 1 = _mm_load_si128((__m128i *) &src[8]) +#define LOAD4x4_STEP(dst, src, sstep) \ + tmp0 = _mm_loadl_epi64((__m128i *) &src[0 * sstep]); \ + tmp1 = _mm_loadl_epi64((__m128i *) &src[1 * sstep]); \ + tmp2 = _mm_loadl_epi64((__m128i *) &src[2 * sstep]); \ + tmp3 = _mm_loadl_epi64((__m128i *) &src[3 * 
sstep]); \ + dst ## 0 = _mm_unpacklo_epi16(tmp0, tmp2); \ + dst ## 1 = _mm_unpacklo_epi16(tmp1, tmp3) +#define LOAD8x8_E(dst, src, sstep) \ + dst ## 0 = _mm_load_si128((__m128i *) &src[0 * sstep]); \ + dst ## 1 = _mm_load_si128((__m128i *) &src[1 * sstep]); \ + dst ## 2 = _mm_load_si128((__m128i *) &src[2 * sstep]); \ + dst ## 3 = _mm_load_si128((__m128i *) &src[3 * sstep]) +#define LOAD8x8_O(dst, src, sstep) \ + tmp0 = _mm_load_si128((__m128i *) &src[1 * sstep]); \ + tmp1 = _mm_load_si128((__m128i *) &src[3 * sstep]); \ + tmp2 = _mm_load_si128((__m128i *) &src[5 * sstep]); \ + tmp3 = _mm_load_si128((__m128i *) &src[7 * sstep]); \ + dst ## 0 = _mm_unpacklo_epi16(tmp0, tmp1); \ + dst ## 1 = _mm_unpackhi_epi16(tmp0, tmp1); \ + dst ## 2 = _mm_unpacklo_epi16(tmp2, tmp3); \ + dst ## 3 = _mm_unpackhi_epi16(tmp2, tmp3) +#define LOAD16x16_O(dst, src, sstep) \ + LOAD8x8_O(dst, src, sstep); \ + tmp0 = _mm_load_si128((__m128i *) &src[ 9 * sstep]); \ + tmp1 = _mm_load_si128((__m128i *) &src[11 * sstep]); \ + tmp2 = _mm_load_si128((__m128i *) &src[13 * sstep]); \ + tmp3 = _mm_load_si128((__m128i *) &src[15 * sstep]); \ + dst ## 4 = _mm_unpacklo_epi16(tmp0, tmp1); \ + dst ## 5 = _mm_unpackhi_epi16(tmp0, tmp1); \ + dst ## 6 = _mm_unpacklo_epi16(tmp2, tmp3); \ + dst ## 7 = _mm_unpackhi_epi16(tmp2, tmp3) + +#define LOAD_8x32(dst, dst_stride, src0, src1, idx) \ + src0 = _mm_load_si128((__m128i *) &dst[idx*dst_stride]); \ + src1 = _mm_load_si128((__m128i *) &dst[idx*dst_stride+4]) + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define ASSIGN_EMPTY(dst, dst_stride, src) +#define SAVE_8x16(dst, dst_stride, src) \ + _mm_store_si128((__m128i *) dst, src); \ + dst += dst_stride +#define SAVE_8x32(dst, dst_stride, src0, src1, idx) \ + _mm_store_si128((__m128i *) &dst[idx*dst_stride] , src0); \ + _mm_store_si128((__m128i *) &dst[idx*dst_stride+4], src1) + +#define ASSIGN2(dst, dst_stride, src0, src1, assign) \ + assign(dst, dst_stride, src0); \ + assign(dst, dst_stride, _mm_srli_si128(src0, 8)); \ + assign(dst, dst_stride, src1); \ + assign(dst, dst_stride, _mm_srli_si128(src1, 8)) +#define ASSIGN4(dst, dst_stride, src0, src1, src2, src3, assign) \ + assign(dst, dst_stride, src0); \ + assign(dst, dst_stride, src1); \ + assign(dst, dst_stride, src2); \ + assign(dst, dst_stride, src3) +#define ASSIGN4_LO(dst, dst_stride, src, assign) \ + ASSIGN4(dst, dst_stride, src ## 0, src ## 1, src ## 2, src ## 3, assign) +#define ASSIGN4_HI(dst, dst_stride, src, assign) \ + ASSIGN4(dst, dst_stride, src ## 4, src ## 5, src ## 6, src ## 7, assign) + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define TRANSPOSE4X4_16(dst) \ + tmp0 = _mm_unpacklo_epi16(dst ## 0, dst ## 1); \ + tmp1 = _mm_unpackhi_epi16(dst ## 0, dst ## 1); \ + dst ## 0 = _mm_unpacklo_epi16(tmp0, tmp1); \ + dst ## 1 = _mm_unpackhi_epi16(tmp0, tmp1) +#define TRANSPOSE4X4_16_S(dst, dst_stride, src, assign) \ + TRANSPOSE4X4_16(src); \ + ASSIGN2(dst, dst_stride, src ## 0, src ## 1, assign) + +#define TRANSPOSE8X8_16(dst) \ + tmp0 = _mm_unpacklo_epi16(dst ## 0, dst ## 1); \ + tmp1 = _mm_unpacklo_epi16(dst ## 2, dst ## 3); \ + tmp2 = _mm_unpacklo_epi16(dst ## 4, dst ## 5); \ + tmp3 = _mm_unpacklo_epi16(dst ## 6, dst ## 7); \ + src0 = _mm_unpacklo_epi32(tmp0, tmp1); \ + src1 = _mm_unpacklo_epi32(tmp2, tmp3); \ + src2 = 
_mm_unpackhi_epi32(tmp0, tmp1); \ + src3 = _mm_unpackhi_epi32(tmp2, tmp3); \ + tmp0 = _mm_unpackhi_epi16(dst ## 0, dst ## 1); \ + tmp1 = _mm_unpackhi_epi16(dst ## 2, dst ## 3); \ + tmp2 = _mm_unpackhi_epi16(dst ## 4, dst ## 5); \ + tmp3 = _mm_unpackhi_epi16(dst ## 6, dst ## 7); \ + dst ## 0 = _mm_unpacklo_epi64(src0 , src1); \ + dst ## 1 = _mm_unpackhi_epi64(src0 , src1); \ + dst ## 2 = _mm_unpacklo_epi64(src2 , src3); \ + dst ## 3 = _mm_unpackhi_epi64(src2 , src3); \ + src0 = _mm_unpacklo_epi32(tmp0, tmp1); \ + src1 = _mm_unpacklo_epi32(tmp2, tmp3); \ + src2 = _mm_unpackhi_epi32(tmp0, tmp1); \ + src3 = _mm_unpackhi_epi32(tmp2, tmp3); \ + dst ## 4 = _mm_unpacklo_epi64(src0 , src1); \ + dst ## 5 = _mm_unpackhi_epi64(src0 , src1); \ + dst ## 6 = _mm_unpacklo_epi64(src2 , src3); \ + dst ## 7 = _mm_unpackhi_epi64(src2 , src3) +#define TRANSPOSE8x8_16_S(out, sstep_out, src, assign) \ + TRANSPOSE8X8_16(src); \ + p_dst = out; \ + ASSIGN4_LO(p_dst, sstep_out, src, assign); \ + ASSIGN4_HI(p_dst, sstep_out, src, assign) +#define TRANSPOSE8x8_16_LS(out, sstep_out, in, sstep_in, assign) \ + e0 = _mm_load_si128((__m128i *) &in[0*sstep_in]); \ + e1 = _mm_load_si128((__m128i *) &in[1*sstep_in]); \ + e2 = _mm_load_si128((__m128i *) &in[2*sstep_in]); \ + e3 = _mm_load_si128((__m128i *) &in[3*sstep_in]); \ + e4 = _mm_load_si128((__m128i *) &in[4*sstep_in]); \ + e5 = _mm_load_si128((__m128i *) &in[5*sstep_in]); \ + e6 = _mm_load_si128((__m128i *) &in[6*sstep_in]); \ + e7 = _mm_load_si128((__m128i *) &in[7*sstep_in]); \ + TRANSPOSE8x8_16_S(out, sstep_out, e, assign) + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define TR_COMPUTE_TRANFORM(dst1, dst2, src0, src1, src2, src3, i, j, transform)\ + tmp1 = _mm_load_si128((__m128i *) transform[i ][j]); \ + tmp3 = _mm_load_si128((__m128i *) transform[i+1][j]); \ + tmp0 = _mm_madd_epi16(src0, tmp1); \ + tmp1 = _mm_madd_epi16(src1, tmp1); \ + tmp2 = _mm_madd_epi16(src2, tmp3); \ + tmp3 = _mm_madd_epi16(src3, tmp3); \ + dst1 = _mm_add_epi32(tmp0, tmp2); \ + dst2 = _mm_add_epi32(tmp1, tmp3) + +#define SCALE8x8_2x32(dst0, src0, src1) \ + src0 = _mm_srai_epi32(src0, shift); \ + src1 = _mm_srai_epi32(src1, shift); \ + dst0 = _mm_packs_epi32(src0, src1) +#define SCALE_4x32(dst0, dst1, src0, src1, src2, src3) \ + SCALE8x8_2x32(dst0, src0, src1); \ + SCALE8x8_2x32(dst1, src2, src3) +#define SCALE16x16_2x32(dst, dst_stride, src0, src1, j) \ + e0 = _mm_load_si128((__m128i *) &o16[j*8+0]); \ + e7 = _mm_load_si128((__m128i *) &o16[j*8+4]); \ + tmp4 = _mm_add_epi32(src0, e0); \ + src0 = _mm_sub_epi32(src0, e0); \ + e0 = _mm_add_epi32(src1, e7); \ + src1 = _mm_sub_epi32(src1, e7); \ + SCALE_4x32(e0, e7, tmp4, e0, src0, src1); \ + _mm_store_si128((__m128i *) &dst[dst_stride*( j)] , e0); \ + _mm_store_si128((__m128i *) &dst[dst_stride*(dst_stride-1-j)] , e7) + +#define SCALE32x32_2x32(dst, dst_stride, j) \ + e0 = _mm_load_si128((__m128i *) &e32[j*16+0]); \ + e1 = _mm_load_si128((__m128i *) &e32[j*16+4]); \ + e4 = _mm_load_si128((__m128i *) &o32[j*16+0]); \ + e5 = _mm_load_si128((__m128i *) &o32[j*16+4]); \ + tmp0 = _mm_add_epi32(e0, e4); \ + tmp1 = _mm_add_epi32(e1, e5); \ + tmp2 = _mm_sub_epi32(e1, e5); \ + tmp3 = _mm_sub_epi32(e0, e4); \ + SCALE_4x32(tmp0, tmp1, tmp0, tmp1, tmp3, tmp2); \ + _mm_store_si128((__m128i *) &dst[dst_stride*i+0] , tmp0); \ + _mm_store_si128((__m128i *) &dst[dst_stride*(dst_stride-1-i)+0] , tmp1) + +#define SAVE16x16_2x32(dst, 
dst_stride, src0, src1, j) \ + e0 = _mm_load_si128((__m128i *) &o16[j*8+0]); \ + e7 = _mm_load_si128((__m128i *) &o16[j*8+4]); \ + tmp4 = _mm_add_epi32(src0, e0); \ + src0 = _mm_sub_epi32(src0, e0); \ + e0 = _mm_add_epi32(src1, e7); \ + src1 = _mm_sub_epi32(src1, e7); \ + _mm_store_si128((__m128i *) &dst[dst_stride*( j)] , tmp4); \ + _mm_store_si128((__m128i *) &dst[dst_stride*( j)+4], e0); \ + _mm_store_si128((__m128i *) &dst[dst_stride*(dst_stride-1-j)] , src0); \ + _mm_store_si128((__m128i *) &dst[dst_stride*(dst_stride-1-j)+4], src1) + + +#define SCALE8x8_2x32_WRAPPER(dst, dst_stride, dst0, src0, src1, idx) \ + SCALE8x8_2x32(dst0, src0, src1) +#define SCALE16x16_2x32_WRAPPER(dst, dst_stride, dst0, src0, src1, idx) \ + SCALE16x16_2x32(dst, dst_stride, src0, src1, idx) +#define SAVE16x16_2x32_WRAPPER(dst, dst_stride, dst0, src0, src1, idx) \ + SAVE16x16_2x32(dst, dst_stride, src0, src1, idx) + +//////////////////////////////////////////////////////////////////////////////// +// ff_hevc_transform_4x4_luma_X_sse2 +//////////////////////////////////////////////////////////////////////////////// +#define COMPUTE_LUMA(dst , idx) \ + tmp0 = _mm_load_si128((__m128i *) (transform4x4_luma[idx ])); \ + tmp1 = _mm_load_si128((__m128i *) (transform4x4_luma[idx+1])); \ + tmp0 = _mm_madd_epi16(src0, tmp0); \ + tmp1 = _mm_madd_epi16(src1, tmp1); \ + dst = _mm_add_epi32(tmp0, tmp1); \ + dst = _mm_add_epi32(dst, add); \ + dst = _mm_srai_epi32(dst, shift) +#define COMPUTE_LUMA_ALL() \ + add = _mm_set1_epi32(1 << (shift - 1)); \ + src0 = _mm_unpacklo_epi16(tmp0, tmp1); \ + src1 = _mm_unpackhi_epi16(tmp0, tmp1); \ + COMPUTE_LUMA(res2 , 0); \ + COMPUTE_LUMA(res3 , 2); \ + res0 = _mm_packs_epi32(res2, res3); \ + COMPUTE_LUMA(res2 , 4); \ + COMPUTE_LUMA(res3 , 6); \ + res1 = _mm_packs_epi32(res2, res3) + +#define TRANSFORM_LUMA(D) \ +void ff_hevc_transform_4x4_luma ## _ ## D ## _sse2(int16_t *_coeffs) { \ + uint8_t shift = 7; \ + int16_t *src = _coeffs; \ + int16_t *coeffs = _coeffs; \ + __m128i res0, res1, res2, res3; \ + __m128i tmp0, tmp1, src0, src1, add; \ + LOAD4x4(tmp, src); \ + COMPUTE_LUMA_ALL(); \ + shift = 20 - D; \ + res2 = _mm_unpacklo_epi16(res0, res1); \ + res3 = _mm_unpackhi_epi16(res0, res1); \ + tmp0 = _mm_unpacklo_epi16(res2, res3); \ + tmp1 = _mm_unpackhi_epi16(res2, res3); \ + COMPUTE_LUMA_ALL(); \ + TRANSPOSE4X4_16(res); \ + _mm_store_si128((__m128i *) coeffs , res0); \ + _mm_store_si128((__m128i *) (coeffs + 8), res1); \ +} + +TRANSFORM_LUMA( 8); +TRANSFORM_LUMA( 10); +TRANSFORM_LUMA( 12); + +//////////////////////////////////////////////////////////////////////////////// +// ff_hevc_transform_4x4_X_sse2 +//////////////////////////////////////////////////////////////////////////////// +#define COMPUTE4x4(dst0, dst1, dst2, dst3) \ + tmp0 = _mm_load_si128((__m128i *) transform4x4[0]); \ + tmp1 = _mm_load_si128((__m128i *) transform4x4[1]); \ + tmp2 = _mm_load_si128((__m128i *) transform4x4[2]); \ + tmp3 = _mm_load_si128((__m128i *) transform4x4[3]); \ + tmp0 = _mm_madd_epi16(e6, tmp0); \ + tmp1 = _mm_madd_epi16(e6, tmp1); \ + tmp2 = _mm_madd_epi16(e7, tmp2); \ + tmp3 = _mm_madd_epi16(e7, tmp3); \ + e6 = _mm_set1_epi32(add); \ + tmp0 = _mm_add_epi32(tmp0, e6); \ + tmp1 = _mm_add_epi32(tmp1, e6); \ + dst0 = _mm_add_epi32(tmp0, tmp2); \ + dst1 = _mm_add_epi32(tmp1, tmp3); \ + dst2 = _mm_sub_epi32(tmp1, tmp3); \ + dst3 = _mm_sub_epi32(tmp0, tmp2) +#define COMPUTE4x4_LO() \ + COMPUTE4x4(e0, e1, e2, e3) +#define COMPUTE4x4_HI(dst) \ + COMPUTE4x4(e7, e6, e5, e4) + +#define TR_4(dst, dst_stride, 
in, sstep, load, assign) \ + load(e, in); \ + e6 = _mm_unpacklo_epi16(e0, e1); \ + e7 = _mm_unpackhi_epi16(e0, e1); \ + COMPUTE4x4_LO(); \ + SCALE_4x32(e0, e1, e0, e1, e2, e3); \ + TRANSPOSE4X4_16_S(dst, dst_stride, e, assign) \ + +#define TR_4_1( dst, dst_stride, src) TR_4( dst, dst_stride, src, 4, LOAD4x4, ASSIGN_EMPTY) +#define TR_4_2( dst, dst_stride, src, D) TR_4( dst, dst_stride, src, 4, LOAD_EMPTY, ASSIGN_EMPTY) + +//////////////////////////////////////////////////////////////////////////////// +// ff_hevc_transform_8x8_X_sse2 +//////////////////////////////////////////////////////////////////////////////// +#define TR_4_set8x4(in, sstep) \ + LOAD8x8_E(src, in, sstep); \ + e6 = _mm_unpacklo_epi16(src0, src2); \ + e7 = _mm_unpacklo_epi16(src1, src3); \ + COMPUTE4x4_LO(); \ + e6 = _mm_unpackhi_epi16(src0, src2); \ + e7 = _mm_unpackhi_epi16(src1, src3); \ + COMPUTE4x4_HI() + +#define TR_COMPUTE8x8(e0, e1, i) \ + TR_COMPUTE_TRANFORM(tmp2, tmp3, src0, src1, src2, src3, i, 0, transform8x8);\ + tmp0 = _mm_add_epi32(e0, tmp2); \ + tmp1 = _mm_add_epi32(e1, tmp3); \ + tmp3 = _mm_sub_epi32(e1, tmp3); \ + tmp2 = _mm_sub_epi32(e0, tmp2) + +#define TR_8(dst, dst_stride, in, sstep, assign) \ + TR_4_set8x4(in, 2 * sstep); \ + LOAD8x8_O(src, in, sstep); \ + TR_COMPUTE8x8(e0, e7, 0); \ + assign(dst, dst_stride, e0, tmp0, tmp1, 0); \ + assign(dst, dst_stride, e7, tmp2, tmp3, 7); \ + TR_COMPUTE8x8(e1, e6, 2); \ + assign(dst, dst_stride, e1, tmp0, tmp1, 1); \ + assign(dst, dst_stride, e6, tmp2, tmp3, 6); \ + TR_COMPUTE8x8(e2, e5, 4); \ + assign(dst, dst_stride, e2, tmp0, tmp1, 2); \ + assign(dst, dst_stride, e5, tmp2, tmp3, 5); \ + TR_COMPUTE8x8(e3, e4, 6); \ + assign(dst, dst_stride, e3, tmp0, tmp1, 3); \ + assign(dst, dst_stride, e4, tmp2, tmp3, 4); \ + +#define TR_8_1( dst, dst_stride, src) \ + TR_8( dst, dst_stride, src, 8, SCALE8x8_2x32_WRAPPER); \ + TRANSPOSE8x8_16_S(dst, dst_stride, e, SAVE_8x16) + +//////////////////////////////////////////////////////////////////////////////// +// ff_hevc_transform_XxX_X_sse2 +//////////////////////////////////////////////////////////////////////////////// + +#define TRANSFORM_4x4(D) \ +void ff_hevc_transform_4x4_ ## D ## _sse2 (int16_t *_coeffs, int col_limit) { \ + int16_t *src = _coeffs; \ + int16_t *coeffs = _coeffs; \ + int shift = 7; \ + int add = 1 << (shift - 1); \ + __m128i tmp0, tmp1, tmp2, tmp3; \ + __m128i e0, e1, e2, e3, e6, e7; \ + TR_4_1(p_dst1, 4, src); \ + shift = 20 - D; \ + add = 1 << (shift - 1); \ + TR_4_2(coeffs, 8, tmp, D); \ + _mm_store_si128((__m128i *) coeffs , e0); \ + _mm_store_si128((__m128i *) (coeffs + 8), e1); \ +} +#define TRANSFORM_8x8(D) \ +void ff_hevc_transform_8x8_ ## D ## _sse2 (int16_t *coeffs, int col_limit) { \ + DECLARE_ALIGNED(16, int16_t, tmp[8*8]); \ + int16_t *src = coeffs; \ + int16_t *p_dst1 = tmp; \ + int16_t *p_dst; \ + int shift = 7; \ + int add = 1 << (shift - 1); \ + __m128i src0, src1, src2, src3; \ + __m128i tmp0, tmp1, tmp2, tmp3; \ + __m128i e0, e1, e2, e3, e4, e5, e6, e7; \ + TR_8_1(p_dst1, 8, src); \ + shift = 20 - D; \ + add = 1 << (shift - 1); \ + TR_8_1(coeffs, 8, tmp); \ +} + +TRANSFORM_4x4( 8) +TRANSFORM_4x4(10) +TRANSFORM_4x4(12) +TRANSFORM_8x8( 8) +TRANSFORM_8x8(10) +TRANSFORM_8x8(12) + +//////////////////////////////////////////////////////////////////////////////// +// ff_hevc_transform_16x16_X_sse2 +//////////////////////////////////////////////////////////////////////////////// +#define TR_COMPUTE16x16(dst1, dst2,src0, src1, src2, src3, i, j) \ + TR_COMPUTE_TRANFORM(dst1, dst2,src0, src1, 
src2, src3, i, j, transform16x16_1) +#define TR_COMPUTE16x16_FIRST(j) \ + TR_COMPUTE16x16(src0, src1, e0, e1, e2, e3, 0, j) +#define TR_COMPUTE16x16_NEXT(i, j) \ + TR_COMPUTE16x16(tmp0, tmp1, e4, e5, e6, e7, i, j); \ + src0 = _mm_add_epi32(src0, tmp0); \ + src1 = _mm_add_epi32(src1, tmp1) + +#define TR_16(dst, dst_stride, in, sstep, assign) \ + { \ + int i; \ + int o16[8*8]; \ + LOAD16x16_O(e, in, sstep); \ + for (i = 0; i < 8; i++) { \ + TR_COMPUTE16x16_FIRST(i); \ + TR_COMPUTE16x16_NEXT(2, i); \ + SAVE_8x32(o16, 8, src0, src1, i); \ + } \ + TR_8(dst, dst_stride, in, 2 * sstep, assign); \ + } + +#define TR_16_1( dst, dst_stride, src) TR_16( dst, dst_stride, src, 16, SCALE16x16_2x32_WRAPPER) +#define TR_16_2( dst, dst_stride, src, sstep) TR_16( dst, dst_stride, src, sstep, SAVE16x16_2x32_WRAPPER ) + +//////////////////////////////////////////////////////////////////////////////// +// ff_hevc_transform_32x32_X_sse2 +//////////////////////////////////////////////////////////////////////////////// +#define TR_COMPUTE32x32(dst1, dst2,src0, src1, src2, src3, i, j) \ + TR_COMPUTE_TRANFORM(dst1, dst2, src0, src1, src2, src3, i, j, transform32x32) +#define TR_COMPUTE32x32_FIRST(i, j) \ + TR_COMPUTE32x32(tmp0, tmp1, e0, e1, e2, e3, i, j); \ + src0 = _mm_add_epi32(src0, tmp0); \ + src1 = _mm_add_epi32(src1, tmp1) +#define TR_COMPUTE32x32_NEXT(i, j) \ + TR_COMPUTE32x32(tmp0, tmp1, e4, e5, e6, e7, i, j); \ + src0 = _mm_add_epi32(src0, tmp0); \ + src1 = _mm_add_epi32(src1, tmp1) + +#define TR_32(dst, dst_stride, in, sstep) \ + { \ + int i; \ + DECLARE_ALIGNED(16, int, e32[16*16]); \ + DECLARE_ALIGNED(16, int, o32[16*16]); \ + LOAD16x16_O(e, in, sstep); \ + for (i = 0; i < 16; i++) { \ + src0 = _mm_setzero_si128(); \ + src1 = _mm_setzero_si128(); \ + TR_COMPUTE32x32_FIRST(0, i); \ + TR_COMPUTE32x32_NEXT(2, i); \ + SAVE_8x32(o32, 16, src0, src1, i); \ + } \ + LOAD16x16_O(e, (&in[16*sstep]), sstep); \ + for (i = 0; i < 16; i++) { \ + LOAD_8x32(o32, 16, src0, src1, i); \ + TR_COMPUTE32x32_FIRST(4, i); \ + TR_COMPUTE32x32_NEXT(6, i); \ + SAVE_8x32(o32, 16, src0, src1, i); \ + } \ + TR_16_2(e32, 16, in, 2 * sstep); \ + for (i = 0; i < 16; i++) { \ + SCALE32x32_2x32(dst, dst_stride, i); \ + } \ + } + +#define TR_32_1( dst, dst_stride, src) TR_32( dst, dst_stride, src, 32) + +//////////////////////////////////////////////////////////////////////////////// +// ff_hevc_transform_XxX_X_sse2 +//////////////////////////////////////////////////////////////////////////////// +#define TRANSFORM2(H, D) \ +void ff_hevc_transform_ ## H ## x ## H ## _ ## D ## _sse2 ( \ + int16_t *coeffs, int col_limit) { \ + int i, j, k, add; \ + int shift = 7; \ + int16_t *src = coeffs; \ + DECLARE_ALIGNED(16, int16_t, tmp[H*H]); \ + DECLARE_ALIGNED(16, int16_t, tmp_2[H*H]); \ + int16_t *p_dst, *p_tra = tmp_2; \ + __m128i src0, src1, src2, src3; \ + __m128i tmp0, tmp1, tmp2, tmp3, tmp4; \ + __m128i e0, e1, e2, e3, e4, e5, e6, e7; \ + for (k = 0; k < 2; k++) { \ + add = 1 << (shift - 1); \ + for (i = 0; i < H; i+=8) { \ + p_dst = tmp + i; \ + TR_ ## H ## _1(p_dst, H, src); \ + src += 8; \ + for (j = 0; j < H; j+=8) { \ + TRANSPOSE8x8_16_LS((&p_tra[i*H+j]), H, (&tmp[j*H+i]), H, SAVE_8x16);\ + } \ + } \ + src = tmp_2; \ + p_tra = coeffs; \ + shift = 20 - D; \ + } \ +} + +TRANSFORM2(16, 8); +TRANSFORM2(16, 10); +TRANSFORM2(16, 12); + +TRANSFORM2(32, 8); +TRANSFORM2(32, 10); +TRANSFORM2(32, 12); + +#endif + +#ifdef __GNUC__ +#pragma GCC pop_options +#endif diff --git a/libavcodec/x86/hevc_intra_intrinsic.c 
b/libavcodec/x86/hevc_intra_intrinsic.c
new file mode 100644
index 0000000..3a8a331
--- /dev/null
+++ b/libavcodec/x86/hevc_intra_intrinsic.c
@@ -0,0 +1,922 @@
+#include "config.h"
+#include "libavutil/avassert.h"
+#include "libavutil/pixdesc.h"
+#include "libavcodec/get_bits.h"
+#include "libavcodec/hevc.h"
+#include "libavcodec/x86/hevcpred.h"
+
+#ifdef __GNUC__
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#endif
+
+#if HAVE_SSE2
+#include <emmintrin.h>
+#endif
+#if HAVE_SSSE3
+#include <tmmintrin.h>
+#endif
+#if HAVE_SSE4
+#include <smmintrin.h>
+#endif
+
+#if HAVE_SSE4
+#define _MM_PACKUS_EPI32 _mm_packus_epi32
+#else
+static av_always_inline __m128i _MM_PACKUS_EPI32( __m128i a, __m128i b )
+{
+    a = _mm_slli_epi32 (a, 16);
+    a = _mm_srai_epi32 (a, 16);
+    b = _mm_slli_epi32 (b, 16);
+    b = _mm_srai_epi32 (b, 16);
+    a = _mm_packs_epi32 (a, b);
+    return a;
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+#if HAVE_SSE4
+#define PLANAR_INIT_8() \
+    uint8_t *src = (uint8_t*)_src; \
+    const uint8_t *top = (const uint8_t*)_top; \
+    const uint8_t *left = (const uint8_t*)_left
+#define PLANAR_INIT_10() \
+    uint16_t *src = (uint16_t*)_src; \
+    const uint16_t *top = (const uint16_t*)_top; \
+    const uint16_t *left = (const uint16_t*)_left
+
+#define PLANAR_COMPUTE(val, shift) \
+    add = _mm_mullo_epi16(_mm_set1_epi16(1+y), l0); \
+    ly1 = _mm_unpacklo_epi16(ly , ly ); \
+    ly1 = _mm_unpacklo_epi32(ly1, ly1); \
+    ly1 = _mm_unpacklo_epi64(ly1, ly1); \
+    c0  = _mm_mullo_epi16(tmp1, ly1); \
+    x0  = _mm_mullo_epi16(_mm_set1_epi16(val - y), tx); \
+    c0  = _mm_add_epi16(c0, c1); \
+    x0  = _mm_add_epi16(x0, c0); \
+    x0  = _mm_add_epi16(x0, add); \
+    c0  = _mm_srli_epi16(x0, shift)
+
+#define PLANAR_COMPUTE_HI(val, shift) \
+    C0 = _mm_mullo_epi16(tmp2, ly1); \
+    x0 = _mm_mullo_epi16(_mm_set1_epi16(val - y), th); \
+    C0 = _mm_add_epi16(C0, C1); \
+    x0 = _mm_add_epi16(x0, C0); \
+    x0 = _mm_add_epi16(x0, add); \
+    C0 = _mm_srli_epi16(x0, shift)
+
+////////////////////////////////////////////////////////////////////////////////
+//
+////////////////////////////////////////////////////////////////////////////////
+#define PLANAR_LOAD_0_8() \
+    ly = _mm_loadl_epi64((__m128i*) left); \
+    tx = _mm_loadl_epi64((__m128i*) top); \
+    ly = _mm_unpacklo_epi8(ly, _mm_setzero_si128()); \
+    tx = _mm_unpacklo_epi8(tx, _mm_setzero_si128()); \
+    ly = _mm_unpacklo_epi16(ly, ly); \
+    tx = _mm_unpacklo_epi64(tx, tx)
+#define PLANAR_LOAD_0_10() \
+    ly = _mm_loadl_epi64((__m128i*) left); \
+    tx = _mm_loadl_epi64((__m128i*) top); \
+    ly = _mm_unpacklo_epi16(ly, ly); \
+    tx = _mm_unpacklo_epi64(tx, tx)
+
+#define PLANAR_COMPUTE_0(dst , v1, v2, v3, v4) \
+    dst = _mm_mullo_epi16(tmp1, ly1); \
+    x0  = _mm_mullo_epi16(_mm_set_epi16(v1,v1,v1,v1,v2,v2,v2,v2), tx); \
+    add = _mm_mullo_epi16(_mm_set_epi16(v3,v3,v3,v3,v4,v4,v4,v4), l0); \
+    dst = _mm_add_epi16(dst, c1); \
+    x0  = _mm_add_epi16(x0, add); \
+    dst = _mm_add_epi16(dst, x0); \
+    dst = _mm_srli_epi16(dst, 3)
+
+#define PLANAR_STORE_0_8() \
+    c0 = _mm_packus_epi16(c0,C0); \
+    *((uint32_t *) src )              = _mm_cvtsi128_si32(c0 ); \
+    *((uint32_t *)(src + stride))     = _mm_extract_epi32(c0, 1); \
+    *((uint32_t *)(src + 2 * stride)) = _mm_extract_epi32(c0, 2); \
+    *((uint32_t *)(src + 3 * stride)) = _mm_extract_epi32(c0, 3)
+#define PLANAR_STORE_0_10() \
+    _mm_storel_epi64((__m128i*)(src ), c0); \
+    _mm_storel_epi64((__m128i*)(src + stride), _mm_unpackhi_epi64(c0, c0));\
+    _mm_storel_epi64((__m128i*)(src +
2 * stride), C0); \ + _mm_storel_epi64((__m128i*)(src + 3 * stride), _mm_unpackhi_epi64(C0, C0)) + +#define PRED_PLANAR_0(D) \ +void pred_planar_0_ ## D ## _sse(uint8_t *_src, const uint8_t *_top, \ + const uint8_t *_left, ptrdiff_t stride) { \ + __m128i ly, l0, tx, ly1; \ + __m128i tmp1, add, x0, c0, c1, C0; \ + PLANAR_INIT_ ## D(); \ + tx = _mm_set1_epi16(top[4]); \ + l0 = _mm_set1_epi16(left[4]); \ + add = _mm_set1_epi16(4); \ + tmp1 = _mm_set_epi16(0,1,2,3,0,1,2,3); \ + c1 = _mm_mullo_epi16(_mm_set_epi16(4,3,2,1,4,3,2,1), tx); \ + c1 = _mm_add_epi16(c1, add); \ + PLANAR_LOAD_0_ ##D(); \ + \ + ly1 = _mm_unpacklo_epi32(ly, ly); \ + PLANAR_COMPUTE_0(c0, 2, 3, 2, 1); \ + ly1 = _mm_unpackhi_epi32(ly, ly); \ + PLANAR_COMPUTE_0(C0, 0, 1, 4, 3); \ + PLANAR_STORE_0_ ## D(); \ +} +PRED_PLANAR_0( 8) +PRED_PLANAR_0(10) + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define PLANAR_LOAD_1_8() \ + ly = _mm_loadl_epi64((__m128i*)left); \ + tx = _mm_loadl_epi64((__m128i*)top); \ + ly = _mm_unpacklo_epi8(ly,_mm_setzero_si128()); \ + tx = _mm_unpacklo_epi8(tx,_mm_setzero_si128()) +#define PLANAR_LOAD_1_10() \ + ly = _mm_loadu_si128((__m128i*)left); \ + tx = _mm_loadu_si128((__m128i*)top) + +#define PLANAR_COMPUTE_1() \ + PLANAR_COMPUTE(7, 4) + +#define PLANAR_STORE_1_8() \ + c0 = _mm_packus_epi16(c0,_mm_setzero_si128()); \ + _mm_storel_epi64((__m128i*)(src), c0); \ + src+= stride; \ + ly = _mm_srli_si128(ly,2) +#define PLANAR_STORE_1_10() \ + _mm_storeu_si128((__m128i*)(src), c0); \ + src+= stride; \ + ly = _mm_srli_si128(ly,2) + +#define PRED_PLANAR_1(D) \ +void pred_planar_1_ ## D ## _sse(uint8_t *_src, const uint8_t *_top, \ + const uint8_t *_left, ptrdiff_t stride) { \ + int y; \ + __m128i ly, l0, tx, ly1; \ + __m128i tmp1, add, x0, c0, c1; \ + PLANAR_INIT_ ## D(); \ + tx = _mm_set1_epi16(top[8]); \ + l0 = _mm_set1_epi16(left[8]); \ + add = _mm_set1_epi16(8); \ + tmp1 = _mm_set_epi16(0,1,2,3,4,5,6,7); \ + c1 = _mm_mullo_epi16(_mm_set_epi16(8,7,6,5,4,3,2,1), tx); \ + c1 = _mm_add_epi16(c1,add); \ + PLANAR_LOAD_1_ ## D(); \ + for (y = 0; y < 8; y++) { \ + PLANAR_COMPUTE_1(); \ + PLANAR_STORE_1_ ## D(); \ + } \ +} + +PRED_PLANAR_1( 8) +PRED_PLANAR_1(10) + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define PLANAR_LOAD_2_8() \ + ly = _mm_loadu_si128((__m128i*) left); \ + tx = _mm_loadu_si128((__m128i*) top); \ + lh = _mm_unpackhi_epi8(ly,_mm_setzero_si128()); \ + ly = _mm_unpacklo_epi8(ly,_mm_setzero_si128()); \ + th = _mm_unpackhi_epi8(tx,_mm_setzero_si128()); \ + tx = _mm_unpacklo_epi8(tx,_mm_setzero_si128()) + +#define PLANAR_LOAD_2_10() \ + ly = _mm_loadu_si128((__m128i*) left); \ + lh = _mm_loadu_si128((__m128i*)&left[8]); \ + tx = _mm_loadu_si128((__m128i*) top); \ + th = _mm_loadu_si128((__m128i*)&top[8]) + +#define PLANAR_COMPUTE_2() \ + PLANAR_COMPUTE(15, 5) +#define PLANAR_COMPUTE_HI_2() \ + PLANAR_COMPUTE_HI(15, 5) + +#define PLANAR_STORE_2_8() \ + c0 = _mm_packus_epi16(c0, C0); \ + _mm_storeu_si128((__m128i*) src, c0); \ + src+= stride; \ + ly = _mm_srli_si128(ly,2) +#define PLANAR_STORE_2_10() \ + _mm_storeu_si128((__m128i*) src , c0); \ + _mm_storeu_si128((__m128i*)&src[8], C0); \ + src+= stride; \ + ly = _mm_srli_si128(ly,2) + +#define PRED_PLANAR_2(D) \ +void pred_planar_2_ ## D ## _sse(uint8_t *_src, const uint8_t *_top, \ + const uint8_t 
*_left, ptrdiff_t stride) { \ + int y, i; \ + __m128i ly, lh, l0, tx, th, ly1; \ + __m128i tmp1, tmp2, add, x0, c0, c1, C0, C1; \ + PLANAR_INIT_ ## D(); \ + tx = _mm_set1_epi16(top[16]); \ + l0 = _mm_set1_epi16(left[16]); \ + add = _mm_set1_epi16(16); \ + tmp1 = _mm_set_epi16( 8, 9,10,11,12,13,14,15); \ + tmp2 = _mm_set_epi16( 0, 1, 2, 3, 4, 5, 6, 7); \ + c1 = _mm_mullo_epi16(_mm_set_epi16( 8, 7, 6, 5, 4, 3, 2, 1), tx); \ + C1 = _mm_mullo_epi16(_mm_set_epi16(16,15,14,13,12,11,10, 9), tx); \ + c1 = _mm_add_epi16(c1, add); \ + C1 = _mm_add_epi16(C1, add); \ + PLANAR_LOAD_2_ ## D(); \ + for (i = 0; i < 2; i++) { \ + for (y = i*8; y < i*8+8; y++) { \ + PLANAR_COMPUTE_2(); \ + PLANAR_COMPUTE_HI_2(); \ + PLANAR_STORE_2_ ## D(); \ + } \ + ly = lh; \ + } \ +} + +PRED_PLANAR_2( 8) +PRED_PLANAR_2(10) + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define PLANAR_LOAD_3_8() \ + ly = _mm_loadu_si128((__m128i*) left); \ + lh = _mm_unpackhi_epi8(ly,_mm_setzero_si128()); \ + ly = _mm_unpacklo_epi8(ly,_mm_setzero_si128()); \ + tx = _mm_loadu_si128((__m128i*) top); \ + th = _mm_unpackhi_epi8(tx,_mm_setzero_si128()); \ + tx = _mm_unpacklo_epi8(tx,_mm_setzero_si128()); \ + TX = _mm_loadu_si128((__m128i*)(top + 16)); \ + TH = _mm_unpackhi_epi8(TX,_mm_setzero_si128()); \ + TX = _mm_unpacklo_epi8(TX,_mm_setzero_si128()) +#define PLANAR_LOAD_3_10() \ + ly = _mm_loadu_si128((__m128i*) left ); \ + lh = _mm_loadu_si128((__m128i*)&left[8]); \ + tx = _mm_loadu_si128((__m128i*) top ); \ + th = _mm_loadu_si128((__m128i*)&top[ 8]); \ + TX = _mm_loadu_si128((__m128i*)&top[16]); \ + TH = _mm_loadu_si128((__m128i*)&top[24]) + +#define PLANAR_RELOAD_3_8() \ + ly = _mm_loadu_si128((__m128i*)(left+16)); \ + lh = _mm_unpackhi_epi8(ly,_mm_setzero_si128()); \ + ly = _mm_unpacklo_epi8(ly,_mm_setzero_si128()) +#define PLANAR_RELOAD_3_10() \ + ly = _mm_loadu_si128((__m128i*)&left[16]); \ + lh = _mm_loadu_si128((__m128i*)&left[24]) + +#define PLANAR_COMPUTE_3() \ + PLANAR_COMPUTE(31, 6) +#define PLANAR_COMPUTE_HI_3() \ + PLANAR_COMPUTE_HI(31, 6) +#define PLANAR_COMPUTE_HI2_3() \ + c0 = _mm_mullo_epi16(TMP1, ly1); \ + x0 = _mm_mullo_epi16(_mm_set1_epi16(31 - y), TX); \ + c0 = _mm_add_epi16(c0, c2); \ + x0 = _mm_add_epi16(x0, c0); \ + x0 = _mm_add_epi16(x0, add); \ + c0 = _mm_srli_epi16(x0, 6) +#define PLANAR_COMPUTE_HI3_3() \ + C0 = _mm_mullo_epi16(TMP2, ly1); \ + x0 = _mm_mullo_epi16(_mm_set1_epi16(31 - y), TH); \ + C0 = _mm_add_epi16(C0, C2); \ + x0 = _mm_add_epi16(x0, C0); \ + x0 = _mm_add_epi16(x0, add); \ + C0 = _mm_srli_epi16(x0, 6) + +#define PLANAR_STORE1_3_8() \ + c0 = _mm_packus_epi16(c0, C0); \ + _mm_storeu_si128((__m128i*) src, c0) +#define PLANAR_STORE2_3_8() \ + c0 = _mm_packus_epi16(c0, C0); \ + _mm_storeu_si128((__m128i*) (src + 16), c0); \ + src+= stride; \ + ly = _mm_srli_si128(ly, 2) + +#define PLANAR_STORE1_3_10() \ + _mm_storeu_si128((__m128i*) src , c0); \ + _mm_storeu_si128((__m128i*)&src[ 8], C0) +#define PLANAR_STORE2_3_10() \ + _mm_storeu_si128((__m128i*)&src[16], c0); \ + _mm_storeu_si128((__m128i*)&src[24], C0); \ + src+= stride; \ + ly = _mm_srli_si128(ly, 2) + + +#define PRED_PLANAR_3(D) \ +void pred_planar_3_ ## D ## _sse(uint8_t *_src, const uint8_t *_top, \ + const uint8_t *_left, ptrdiff_t stride) { \ + int y, i; \ + __m128i l0, ly, lh, ly1, tx, th, TX, TH, tmp1, tmp2, TMP1, TMP2; \ + __m128i x0, c0, c1, c2, C0, C1, C2, add; \ + PLANAR_INIT_ ## D(); \ + tx = 
_mm_set1_epi16(top[32]); \ + l0 = _mm_set1_epi16(left[32]); \ + add = _mm_set1_epi16(32); \ + tmp1 = _mm_set_epi16(24,25,26,27,28,29,30,31); \ + tmp2 = _mm_set_epi16(16,17,18,19,20,21,22,23); \ + TMP1 = _mm_set_epi16( 8, 9,10,11,12,13,14,15); \ + TMP2 = _mm_set_epi16( 0, 1, 2, 3, 4, 5, 6, 7); \ + c1 = _mm_mullo_epi16(_mm_set_epi16( 8, 7, 6, 5, 4, 3, 2, 1), tx); \ + C1 = _mm_mullo_epi16(_mm_set_epi16(16,15,14,13,12,11,10, 9), tx); \ + c2 = _mm_mullo_epi16(_mm_set_epi16(24,23,22,21,20,19,18,17), tx); \ + C2 = _mm_mullo_epi16(_mm_set_epi16(32,31,30,29,28,27,26,25), tx); \ + c1 = _mm_add_epi16(c1, add); \ + C1 = _mm_add_epi16(C1, add); \ + c2 = _mm_add_epi16(c2, add); \ + C2 = _mm_add_epi16(C2, add); \ + PLANAR_LOAD_3_ ## D(); \ + for (i = 0; i < 4; i++) { \ + for (y = 0+i*8; y < 8+i*8; y++) { \ + PLANAR_COMPUTE_3(); \ + PLANAR_COMPUTE_HI_3(); \ + PLANAR_STORE1_3_ ## D(); \ + PLANAR_COMPUTE_HI2_3(); \ + PLANAR_COMPUTE_HI3_3(); \ + PLANAR_STORE2_3_ ## D(); \ + } \ + if (i == 0 || i == 2) { \ + ly = lh; \ + } else { \ + PLANAR_RELOAD_3_ ## D(); \ + } \ + } \ +} + +PRED_PLANAR_3( 8) +PRED_PLANAR_3(10) + +#endif + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define STORE8(out, sstep_out) \ + _mm_storel_epi64((__m128i*)&out[0*sstep_out], m10); \ + _mm_storel_epi64((__m128i*)&out[2*sstep_out], m12); \ + _mm_storel_epi64((__m128i*)&out[4*sstep_out], m11); \ + _mm_storel_epi64((__m128i*)&out[6*sstep_out], m13); \ + m10 = _mm_unpackhi_epi64(m10, m10); \ + m12 = _mm_unpackhi_epi64(m12, m12); \ + m11 = _mm_unpackhi_epi64(m11, m11); \ + m13 = _mm_unpackhi_epi64(m13, m13); \ + _mm_storel_epi64((__m128i*)&out[1*sstep_out], m10); \ + _mm_storel_epi64((__m128i*)&out[3*sstep_out], m12); \ + _mm_storel_epi64((__m128i*)&out[5*sstep_out], m11); \ + _mm_storel_epi64((__m128i*)&out[7*sstep_out], m13) + +#define STORE16(out, sstep_out) \ + _mm_storeu_si128((__m128i *) &out[0*sstep_out], m0); \ + _mm_storeu_si128((__m128i *) &out[1*sstep_out], m1); \ + _mm_storeu_si128((__m128i *) &out[2*sstep_out], m2); \ + _mm_storeu_si128((__m128i *) &out[3*sstep_out], m3); \ + _mm_storeu_si128((__m128i *) &out[4*sstep_out], m4); \ + _mm_storeu_si128((__m128i *) &out[5*sstep_out], m5); \ + _mm_storeu_si128((__m128i *) &out[6*sstep_out], m6); \ + _mm_storeu_si128((__m128i *) &out[7*sstep_out], m7) + +#define TRANSPOSE4x4_8(in, sstep_in, out, sstep_out) \ + { \ + __m128i m0 = _mm_loadl_epi64((__m128i *) &in[0*sstep_in]); \ + __m128i m1 = _mm_loadl_epi64((__m128i *) &in[1*sstep_in]); \ + __m128i m2 = _mm_loadl_epi64((__m128i *) &in[2*sstep_in]); \ + __m128i m3 = _mm_loadl_epi64((__m128i *) &in[3*sstep_in]); \ + \ + __m128i m10 = _mm_unpacklo_epi8(m0, m1); \ + __m128i m11 = _mm_unpacklo_epi8(m2, m3); \ + \ + m0 = _mm_unpacklo_epi16(m10, m11); \ + \ + *((uint32_t *) (out+0*sstep_out)) =_mm_cvtsi128_si32(m0); \ + *((uint32_t *) (out+1*sstep_out)) =_mm_extract_epi32(m0, 1); \ + *((uint32_t *) (out+2*sstep_out)) =_mm_extract_epi32(m0, 2); \ + *((uint32_t *) (out+3*sstep_out)) =_mm_extract_epi32(m0, 3); \ + } +#define TRANSPOSE8x8_8(in, sstep_in, out, sstep_out) \ + { \ + __m128i m0 = _mm_loadl_epi64((__m128i *) &in[0*sstep_in]); \ + __m128i m1 = _mm_loadl_epi64((__m128i *) &in[1*sstep_in]); \ + __m128i m2 = _mm_loadl_epi64((__m128i *) &in[2*sstep_in]); \ + __m128i m3 = _mm_loadl_epi64((__m128i *) &in[3*sstep_in]); \ + __m128i m4 = _mm_loadl_epi64((__m128i *) &in[4*sstep_in]); \ + __m128i m5 = 
_mm_loadl_epi64((__m128i *) &in[5*sstep_in]); \ + __m128i m6 = _mm_loadl_epi64((__m128i *) &in[6*sstep_in]); \ + __m128i m7 = _mm_loadl_epi64((__m128i *) &in[7*sstep_in]); \ + \ + __m128i m10 = _mm_unpacklo_epi8(m0, m1); \ + __m128i m11 = _mm_unpacklo_epi8(m2, m3); \ + __m128i m12 = _mm_unpacklo_epi8(m4, m5); \ + __m128i m13 = _mm_unpacklo_epi8(m6, m7); \ + \ + m0 = _mm_unpacklo_epi16(m10, m11); \ + m1 = _mm_unpacklo_epi16(m12, m13); \ + m2 = _mm_unpackhi_epi16(m10, m11); \ + m3 = _mm_unpackhi_epi16(m12, m13); \ + \ + m10 = _mm_unpacklo_epi32(m0 , m1 ); \ + m11 = _mm_unpacklo_epi32(m2 , m3 ); \ + m12 = _mm_unpackhi_epi32(m0 , m1 ); \ + m13 = _mm_unpackhi_epi32(m2 , m3 ); \ + \ + STORE8(out, sstep_out); \ + } +#define TRANSPOSE16x16_8(in, sstep_in, out, sstep_out) \ + for (y = 0; y < sstep_in; y+=8) \ + for (x = 0; x < sstep_in; x+=8) \ + TRANSPOSE8x8_8((&in[y*sstep_in+x]), sstep_in, (&out[x*sstep_out+y]), sstep_out) +#define TRANSPOSE32x32_8(in, sstep_in, out, sstep_out) \ + for (y = 0; y < sstep_in; y+=8) \ + for (x = 0; x < sstep_in; x+=8) \ + TRANSPOSE8x8_8((&in[y*sstep_in+x]), sstep_in, (&out[x*sstep_out+y]), sstep_out) + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define TRANSPOSE4x4_10(in, sstep_in, out, sstep_out) \ + { \ + __m128i m0 = _mm_loadl_epi64((__m128i *) &in[0*sstep_in]); \ + __m128i m1 = _mm_loadl_epi64((__m128i *) &in[1*sstep_in]); \ + __m128i m2 = _mm_loadl_epi64((__m128i *) &in[2*sstep_in]); \ + __m128i m3 = _mm_loadl_epi64((__m128i *) &in[3*sstep_in]); \ + \ + __m128i m10 = _mm_unpacklo_epi16(m0, m1); \ + __m128i m11 = _mm_unpacklo_epi16(m2, m3); \ + \ + m0 = _mm_unpacklo_epi32(m10, m11); \ + m1 = _mm_unpackhi_epi32(m10, m11); \ + \ + _mm_storel_epi64((__m128i *) (out+0*sstep_out) , m0); \ + _mm_storel_epi64((__m128i *) (out+1*sstep_out) , _mm_unpackhi_epi64(m0, m0));\ + _mm_storel_epi64((__m128i *) (out+2*sstep_out) , m1); \ + _mm_storel_epi64((__m128i *) (out+3*sstep_out) , _mm_unpackhi_epi64(m1, m1));\ + } +#define TRANSPOSE8x8_10(in, sstep_in, out, sstep_out) \ + { \ + __m128i tmp0, tmp1, tmp2, tmp3, src0, src1, src2, src3; \ + __m128i m0 = _mm_loadu_si128((__m128i *) &in[0*sstep_in]); \ + __m128i m1 = _mm_loadu_si128((__m128i *) &in[1*sstep_in]); \ + __m128i m2 = _mm_loadu_si128((__m128i *) &in[2*sstep_in]); \ + __m128i m3 = _mm_loadu_si128((__m128i *) &in[3*sstep_in]); \ + __m128i m4 = _mm_loadu_si128((__m128i *) &in[4*sstep_in]); \ + __m128i m5 = _mm_loadu_si128((__m128i *) &in[5*sstep_in]); \ + __m128i m6 = _mm_loadu_si128((__m128i *) &in[6*sstep_in]); \ + __m128i m7 = _mm_loadu_si128((__m128i *) &in[7*sstep_in]); \ + \ + tmp0 = _mm_unpacklo_epi16(m0, m1); \ + tmp1 = _mm_unpacklo_epi16(m2, m3); \ + tmp2 = _mm_unpacklo_epi16(m4, m5); \ + tmp3 = _mm_unpacklo_epi16(m6, m7); \ + src0 = _mm_unpacklo_epi32(tmp0, tmp1); \ + src1 = _mm_unpacklo_epi32(tmp2, tmp3); \ + src2 = _mm_unpackhi_epi32(tmp0, tmp1); \ + src3 = _mm_unpackhi_epi32(tmp2, tmp3); \ + tmp0 = _mm_unpackhi_epi16(m0, m1); \ + tmp1 = _mm_unpackhi_epi16(m2, m3); \ + tmp2 = _mm_unpackhi_epi16(m4, m5); \ + tmp3 = _mm_unpackhi_epi16(m6, m7); \ + m0 = _mm_unpacklo_epi64(src0 , src1); \ + m1 = _mm_unpackhi_epi64(src0 , src1); \ + m2 = _mm_unpacklo_epi64(src2 , src3); \ + m3 = _mm_unpackhi_epi64(src2 , src3); \ + src0 = _mm_unpacklo_epi32(tmp0, tmp1); \ + src1 = _mm_unpacklo_epi32(tmp2, tmp3); \ + src2 = _mm_unpackhi_epi32(tmp0, tmp1); \ + src3 = _mm_unpackhi_epi32(tmp2, tmp3); \ + m4 = 
_mm_unpacklo_epi64(src0 , src1); \ + m5 = _mm_unpackhi_epi64(src0 , src1); \ + m6 = _mm_unpacklo_epi64(src2 , src3); \ + m7 = _mm_unpackhi_epi64(src2 , src3); \ + STORE16(out, sstep_out); \ + } +#define TRANSPOSE16x16_10(in, sstep_in, out, sstep_out) \ + for (y = 0; y < sstep_in; y+=8) \ + for (x = 0; x < sstep_in; x+=8) \ + TRANSPOSE8x8_10((&in[y*sstep_in+x]), sstep_in, (&out[x*sstep_out+y]), sstep_out) +#define TRANSPOSE32x32_10(in, sstep_in, out, sstep_out) \ + for (y = 0; y < sstep_in; y+=8) \ + for (x = 0; x < sstep_in; x+=8) \ + TRANSPOSE8x8_10((&in[y*sstep_in+x]), sstep_in, (&out[x*sstep_out+y]), sstep_out) + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define ANGULAR_COMPUTE_8(W) \ + for (x = 0; x < W; x += 8) { \ + r3 = _mm_set1_epi16((fact << 8) + (32 - fact)); \ + r1 = _mm_loadu_si128((__m128i*)(&ref[x+idx+1])); \ + r0 = _mm_srli_si128(r1, 1); \ + r1 = _mm_unpacklo_epi8(r1, r0); \ + r1 = _mm_maddubs_epi16(r1, r3); \ + r1 = _mm_mulhrs_epi16(r1, _mm_set1_epi16(1024)); \ + r1 = _mm_packus_epi16(r1, r1); \ + _mm_storel_epi64((__m128i *) &p_src[x], r1); \ + } + + +#define ANGULAR_COMPUTE4_8() \ + r3 = _mm_set1_epi16((fact << 8) + (32 - fact)); \ + r1 = _mm_loadu_si128((__m128i*)(&ref[idx+1])); \ + r0 = _mm_srli_si128(r1, 1); \ + r1 = _mm_unpacklo_epi8(r1, r0); \ + r1 = _mm_maddubs_epi16(r1, r3); \ + r1 = _mm_mulhrs_epi16(r1, _mm_set1_epi16(1024)); \ + r1 = _mm_packus_epi16(r1, r1); \ + *((uint32_t *)p_src) = _mm_cvtsi128_si32(r1) +#define ANGULAR_COMPUTE8_8() ANGULAR_COMPUTE_8( 8) +#define ANGULAR_COMPUTE16_8() ANGULAR_COMPUTE_8(16) +#define ANGULAR_COMPUTE32_8() ANGULAR_COMPUTE_8(32) + +#define ANGULAR_COMPUTE_ELSE4_8() \ + r1 = _mm_loadl_epi64((__m128i*) &ref[idx+1]); \ + *((uint32_t *)p_src) = _mm_cvtsi128_si32(r1) +#define ANGULAR_COMPUTE_ELSE8_8() \ + r1 = _mm_loadl_epi64((__m128i*) &ref[idx+1]); \ + _mm_storel_epi64((__m128i *) p_src, r1) +#define ANGULAR_COMPUTE_ELSE16_8() \ + r1 = _mm_loadu_si128((__m128i*) &ref[idx+1]); \ + _mm_storeu_si128((__m128i *) p_src, r1) +#define ANGULAR_COMPUTE_ELSE32_8() \ + r1 = _mm_loadu_si128((__m128i*) &ref[idx+1]); \ + _mm_storeu_si128((__m128i *) p_src ,r1); \ + r1 = _mm_loadu_si128((__m128i*) &ref[idx+17]); \ + _mm_storeu_si128((__m128i *)&p_src[16] ,r1) + +#define CLIP_PIXEL(src1, src2) \ + r3 = _mm_loadu_si128((__m128i*)src1); \ + r1 = _mm_set1_epi16(src1[-1]); \ + r2 = _mm_set1_epi16(src2[0]); \ + r0 = _mm_unpacklo_epi8(r3,_mm_setzero_si128()); \ + r0 = _mm_subs_epi16(r0, r1); \ + r0 = _mm_srai_epi16(r0, 1); \ + r0 = _mm_add_epi16(r0, r2) +#define CLIP_PIXEL_HI() \ + r3 = _mm_unpackhi_epi8(r3,_mm_setzero_si128()); \ + r3 = _mm_subs_epi16(r3, r1); \ + r3 = _mm_srai_epi16(r3, 1); \ + r3 = _mm_add_epi16(r3, r2) + +#define CLIP_PIXEL1_4_8() \ + p_src = src; \ + CLIP_PIXEL(src2, src1); \ + r0 = _mm_packus_epi16(r0, r0); \ + *((char *) p_src) = _mm_extract_epi8(r0, 0); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 1); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 2); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 3) +#define CLIP_PIXEL1_8_8() \ + CLIP_PIXEL1_4_8(); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 4); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 5); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 6); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 7) +#define 
CLIP_PIXEL1_16_8() \ + p_src = src; \ + CLIP_PIXEL(src2, src1); \ + CLIP_PIXEL_HI(); \ + r0 = _mm_packus_epi16(r0, r3); \ + *((char *) p_src) = _mm_extract_epi8(r0, 0); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 1); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 2); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 3); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 4); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 5); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 6); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 7); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 8); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0, 9); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0,10); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0,11); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0,12); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0,13); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0,14); \ + p_src += stride; \ + *((char *) p_src) = _mm_extract_epi8(r0,15) +#define CLIP_PIXEL1_32_8() + +#define CLIP_PIXEL2_4_8() \ + CLIP_PIXEL(src2, src1); \ + r0 = _mm_packus_epi16(r0, r0); \ + *((uint32_t *)_src) = _mm_cvtsi128_si32(r0) +#define CLIP_PIXEL2_8_8() \ + CLIP_PIXEL(src2, src1); \ + r0 = _mm_packus_epi16(r0, r0); \ + _mm_storel_epi64((__m128i*)_src, r0) +#define CLIP_PIXEL2_16_8() \ + CLIP_PIXEL(src2, src1); \ + CLIP_PIXEL_HI(); \ + r0 = _mm_packus_epi16(r0, r3); \ + _mm_storeu_si128((__m128i*) _src , r0) +#define CLIP_PIXEL2_32_8() + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#if HAVE_SSE4 +#define ANGULAR_COMPUTE_10(W) \ + for (x = 0; x < W; x += 4) { \ + r3 = _mm_set1_epi32((fact << 16) + (32 - fact)); \ + r1 = _mm_loadu_si128((__m128i*)(&ref[x+idx+1])); \ + r0 = _mm_srli_si128(r1, 2); \ + r1 = _mm_unpacklo_epi16(r1, r0); \ + r1 = _mm_madd_epi16(r1, r3); \ + r1 = _mm_mulhrs_epi16(r1, _mm_set1_epi16(1024)); \ + r1 = _MM_PACKUS_EPI32(r1, r1); \ + _mm_storel_epi64((__m128i *) &p_src[x], r1); \ + } +#define ANGULAR_COMPUTE4_10() ANGULAR_COMPUTE_10( 4) +#define ANGULAR_COMPUTE8_10() ANGULAR_COMPUTE_10( 8) +#define ANGULAR_COMPUTE16_10() ANGULAR_COMPUTE_10(16) +#define ANGULAR_COMPUTE32_10() ANGULAR_COMPUTE_10(32) + +#define ANGULAR_COMPUTE_ELSE_10(W) \ + for (x = 0; x < W; x += 8) { \ + r1 = _mm_loadu_si128((__m128i*)(&ref[x+idx+1])); \ + _mm_storeu_si128((__m128i *) &p_src[x], r1); \ + } + +#define ANGULAR_COMPUTE_ELSE4_10() \ + r1 = _mm_loadl_epi64((__m128i*)(&ref[idx+1])); \ + _mm_storel_epi64((__m128i *) p_src, r1) + +#define ANGULAR_COMPUTE_ELSE8_10() ANGULAR_COMPUTE_ELSE_10(8) +#define ANGULAR_COMPUTE_ELSE16_10() ANGULAR_COMPUTE_ELSE_10(16) +#define ANGULAR_COMPUTE_ELSE32_10() ANGULAR_COMPUTE_ELSE_10(32) + +#define CLIP_PIXEL_10() \ + r0 = _mm_loadu_si128((__m128i*)src2); \ + r1 = _mm_set1_epi16(src2[-1]); \ + r2 = _mm_set1_epi16(src1[0]); \ + r0 = _mm_subs_epi16(r0, r1); \ + r0 = _mm_srai_epi16(r0, 1); \ + r0 = _mm_add_epi16(r0, r2) +#define CLIP_PIXEL_HI_10() \ + r3 = _mm_loadu_si128((__m128i*)&src2[8]); \ + r3 = _mm_subs_epi16(r3, r1); \ + r3 = _mm_srai_epi16(r3, 1); \ + r3 = _mm_add_epi16(r3, r2) + +#define CLIP_PIXEL1_4_10() \ + p_src = src; \ + CLIP_PIXEL_10(); \ + r0 = _mm_max_epi16(r0, _mm_setzero_si128()); \ + r0 = 
_mm_min_epi16(r0, _mm_set1_epi16(0x03ff)); \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 0); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 1); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 2); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 3) +#define CLIP_PIXEL1_8_10() \ + CLIP_PIXEL1_4_10(); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 4); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 5); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 6); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 7) +#define CLIP_PIXEL1_16_10() \ + p_src = src; \ + CLIP_PIXEL_10(); \ + CLIP_PIXEL_HI_10(); \ + r0 = _mm_max_epi16(r0, _mm_setzero_si128()); \ + r0 = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff)); \ + r3 = _mm_max_epi16(r3, _mm_setzero_si128()); \ + r3 = _mm_min_epi16(r3, _mm_set1_epi16(0x03ff)); \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 0); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 1); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 2); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 3); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 4); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 5); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 6); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r0, 7); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r3, 0); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r3, 1); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r3, 2); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r3, 3); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r3, 4); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r3, 5); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r3, 6); \ + p_src += stride; \ + *((uint16_t *) p_src) = _mm_extract_epi16(r3, 7) +#define CLIP_PIXEL1_32_10() + +#define CLIP_PIXEL2_4_10() \ + CLIP_PIXEL_10(); \ + r0 = _mm_max_epi16(r0, _mm_setzero_si128()); \ + r0 = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff)); \ + _mm_storel_epi64((__m128i*) _src , r0) +#define CLIP_PIXEL2_8_10() \ + CLIP_PIXEL_10(); \ + r0 = _mm_max_epi16(r0, _mm_setzero_si128()); \ + r0 = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff)); \ + _mm_storeu_si128((__m128i*) _src , r0) +#define CLIP_PIXEL2_16_10() \ + CLIP_PIXEL_10(); \ + CLIP_PIXEL_HI_10(); \ + r0 = _mm_max_epi16(r0, _mm_setzero_si128()); \ + r0 = _mm_min_epi16(r0, _mm_set1_epi16(0x03ff)); \ + r3 = _mm_max_epi16(r3, _mm_setzero_si128()); \ + r3 = _mm_min_epi16(r3, _mm_set1_epi16(0x03ff)); \ + _mm_storeu_si128((__m128i*) p_out , r0); \ + _mm_storeu_si128((__m128i*) &p_out[8], r3); + +#define CLIP_PIXEL2_32_10() + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define PRED_ANGULAR_INIT_8(W) \ + const uint8_t *src1; \ + const uint8_t *src2; \ + uint8_t *ref, *p_src, *src, *p_out; \ + uint8_t src_tmp[W*W]; \ + if (mode >= 18) { \ + src1 = (const uint8_t*) _top; \ + src2 = (const uint8_t*) _left; \ + src = (uint8_t*) _src; \ + stride = _stride; \ + p_src = src; \ + } else { \ + src1 = (const uint8_t*) _left; \ + src2 = (const uint8_t*) _top; \ + src = &src_tmp[0]; \ + stride = 
+#define PRED_ANGULAR_INIT_8(W) \
+    const uint8_t *src1; \
+    const uint8_t *src2; \
+    uint8_t *ref, *p_src, *src, *p_out; \
+    uint8_t src_tmp[W*W]; \
+    if (mode >= 18) { \
+        src1 = (const uint8_t*) _top; \
+        src2 = (const uint8_t*) _left; \
+        src = (uint8_t*) _src; \
+        stride = _stride; \
+        p_src = src; \
+    } else { \
+        src1 = (const uint8_t*) _left; \
+        src2 = (const uint8_t*) _top; \
+        src = &src_tmp[0]; \
+        stride = W; \
+        p_src = src; \
+    } \
+    p_out = (uint8_t*) _src; \
+    ref = (uint8_t*) (src1 - 1)
+#define PRED_ANGULAR_INIT_10(W) \
+    const uint16_t *src1; \
+    const uint16_t *src2; \
+    uint16_t *ref, *p_src, *src, *p_out; \
+    uint16_t src_tmp[W*W]; \
+    if (mode >= 18) { \
+        src1 = (const uint16_t*) _top; \
+        src2 = (const uint16_t*) _left; \
+        src = (uint16_t*) _src; \
+        stride = _stride; \
+        p_src = src; \
+    } else { \
+        src1 = (const uint16_t*) _left; \
+        src2 = (const uint16_t*) _top; \
+        src = &src_tmp[0]; \
+        stride = W; \
+        p_src = src; \
+    } \
+    p_out = (uint16_t*) _src; \
+    ref = (uint16_t*) (src1 - 1)
+
+#define PRED_ANGULAR_WAR() \
+    int y; \
+    __m128i r0, r1, r3
+
+#define PRED_ANGULAR_WAR4_8() \
+    PRED_ANGULAR_WAR(); \
+    __m128i r2
+#define PRED_ANGULAR_WAR8_8() \
+    PRED_ANGULAR_WAR4_8(); \
+    int x
+#define PRED_ANGULAR_WAR16_8() \
+    PRED_ANGULAR_WAR8_8()
+#define PRED_ANGULAR_WAR32_8() \
+    PRED_ANGULAR_WAR(); \
+    int x
+
+#define PRED_ANGULAR_WAR4_10()  PRED_ANGULAR_WAR8_8()
+#define PRED_ANGULAR_WAR8_10()  PRED_ANGULAR_WAR8_8()
+#define PRED_ANGULAR_WAR16_10() PRED_ANGULAR_WAR16_8()
+#define PRED_ANGULAR_WAR32_10() PRED_ANGULAR_WAR32_8()
+
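+/* Angular prediction (modes 2..34): intra_pred_angle[] is the per-row
+ * displacement in 1/32-sample units, inv_angle[] its inverse, used to project
+ * the second reference onto the main one when the angle is negative.  Each
+ * row either interpolates two reference samples with weights (32 - fact, fact)
+ * or, when fact == 0, copies the reference samples directly. */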
+#define PRED_ANGULAR(W, D) \
+static av_always_inline void pred_angular_ ## W ##_ ## D ## _sse(uint8_t *_src,\
+    const uint8_t *_top, const uint8_t *_left, ptrdiff_t _stride, int c_idx, int mode) {\
+    const int intra_pred_angle[] = { \
+        32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9,-13,-17,-21,-26, \
+        -32,-26,-21,-17,-13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 \
+    }; \
+    const int inv_angle[] = { \
+        -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, \
+        -630, -910, -1638, -4096 \
+    }; \
+    PRED_ANGULAR_WAR ## W ## _ ## D(); \
+    int angle = intra_pred_angle[mode-2]; \
+    int angle_i = angle; \
+    int last = (W * angle) >> 5; \
+    int stride; \
+    PRED_ANGULAR_INIT_ ## D(W); \
+    if (angle < 0 && last < -1) { \
+        for (y = last; y <= -1; y++) \
+            ref[y] = src2[-1 + ((y * inv_angle[mode-11] + 128) >> 8)]; \
+    } \
+    for (y = 0; y < W; y++) { \
+        int idx = (angle_i) >> 5; \
+        int fact = (angle_i) & 31; \
+        if (fact) { \
+            ANGULAR_COMPUTE ## W ## _ ## D(); \
+        } else { \
+            ANGULAR_COMPUTE_ELSE ## W ## _ ## D(); \
+        } \
+        angle_i += angle; \
+        p_src += stride; \
+    } \
+    if (mode >= 18) { \
+        if (mode == 26 && c_idx == 0) { \
+            CLIP_PIXEL1_ ## W ## _ ## D(); \
+        } \
+    } else { \
+        TRANSPOSE ## W ## x ## W ## _ ## D(src_tmp, W, p_out, _stride); \
+        if (mode == 10 && c_idx == 0) { \
+            CLIP_PIXEL2_ ## W ## _ ## D(); \
+        } \
+    } \
+}
+
+PRED_ANGULAR( 4, 8)
+PRED_ANGULAR( 8, 8)
+PRED_ANGULAR(16, 8)
+PRED_ANGULAR(32, 8)
+
+PRED_ANGULAR( 4,10)
+PRED_ANGULAR( 8,10)
+PRED_ANGULAR(16,10)
+PRED_ANGULAR(32,10)
+
+void pred_angular_0_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
+                          ptrdiff_t _stride, int c_idx, int mode) {
+    pred_angular_4_8_sse(_src, _top, _left, _stride, c_idx, mode);
+}
+void pred_angular_1_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
+                          ptrdiff_t _stride, int c_idx, int mode) {
+    pred_angular_8_8_sse(_src, _top, _left, _stride, c_idx, mode);
+}
+void pred_angular_2_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
+                          ptrdiff_t _stride, int c_idx, int mode) {
+    pred_angular_16_8_sse(_src, _top, _left, _stride, c_idx, mode);
+}
+void pred_angular_3_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
+                          ptrdiff_t _stride, int c_idx, int mode) {
+    pred_angular_32_8_sse(_src, _top, _left, _stride, c_idx, mode);
+}
+
+void pred_angular_0_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
+                           ptrdiff_t _stride, int c_idx, int mode) {
+    pred_angular_4_10_sse(_src, _top, _left, _stride, c_idx, mode);
+}
+void pred_angular_1_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
+                           ptrdiff_t _stride, int c_idx, int mode) {
+    pred_angular_8_10_sse(_src, _top, _left, _stride, c_idx, mode);
+}
+void pred_angular_2_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
+                           ptrdiff_t _stride, int c_idx, int mode) {
+    pred_angular_16_10_sse(_src, _top, _left, _stride, c_idx, mode);
+}
+void pred_angular_3_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left,
+                           ptrdiff_t _stride, int c_idx, int mode) {
+    pred_angular_32_10_sse(_src, _top, _left, _stride, c_idx, mode);
+}
+#endif
+
+#ifdef __GNUC__
+#pragma GCC pop_options
+#endif
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index ad8168f..74184f6 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -258,4 +258,26 @@ void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t st
 void ff_hevc_transform_add16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_transform_add32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_skip_8_sse(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+
+void ff_hevc_transform_4x4_luma_8_sse2(int16_t *coeffs);
+void ff_hevc_transform_4x4_luma_10_sse2(int16_t *coeffs);
+void ff_hevc_transform_4x4_luma_12_sse2(int16_t *coeffs);
+
+#define IDCT_FUNC(s, b) void ff_hevc_transform_ ## s ## x ## s ##_## b ##_sse2\
+    (int16_t *coeffs, int col_limit);
+
+IDCT_FUNC(4, 8)
+IDCT_FUNC(4, 10)
+IDCT_FUNC(4, 12)
+IDCT_FUNC(8, 8)
+IDCT_FUNC(8, 10)
+IDCT_FUNC(8, 12)
+IDCT_FUNC(16, 8)
+IDCT_FUNC(16, 10)
+IDCT_FUNC(16, 12)
+IDCT_FUNC(32, 8)
+IDCT_FUNC(32, 10)
+IDCT_FUNC(32, 12)
+
 
 #endif // AVCODEC_X86_HEVCDSP_H
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 0de0163..5ca8df1 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -719,6 +719,12 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->transform_add[1] = ff_hevc_transform_add8_8_sse2;
             c->transform_add[2] = ff_hevc_transform_add16_8_sse2;
             c->transform_add[3] = ff_hevc_transform_add32_8_sse2;
+
+            c->idct_4x4_luma = ff_hevc_transform_4x4_luma_8_sse2;
+            c->idct[0] = ff_hevc_transform_4x4_8_sse2;
+            c->idct[1] = ff_hevc_transform_8x8_8_sse2;
+            c->idct[2] = ff_hevc_transform_16x16_8_sse2;
+            c->idct[3] = ff_hevc_transform_32x32_8_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags)) {
             if(ARCH_X86_64) {
@@ -871,6 +877,12 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->transform_add[1] = ff_hevc_transform_add8_10_sse2;
             c->transform_add[2] = ff_hevc_transform_add16_10_sse2;
             c->transform_add[3] = ff_hevc_transform_add32_10_sse2;
+
+            c->idct_4x4_luma = ff_hevc_transform_4x4_luma_10_sse2;
+            c->idct[0] = ff_hevc_transform_4x4_10_sse2;
+            c->idct[1] = ff_hevc_transform_8x8_10_sse2;
+            c->idct[2] = ff_hevc_transform_16x16_10_sse2;
+            c->idct[3] = ff_hevc_transform_32x32_10_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
@@ -1069,6 +1081,12 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
             c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
             c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2;
+
+            c->idct_4x4_luma = ff_hevc_transform_4x4_luma_12_sse2;
+            c->idct[0] = ff_hevc_transform_4x4_12_sse2;
+            c->idct[1] = ff_hevc_transform_8x8_12_sse2;
+            c->idct[2] = ff_hevc_transform_16x16_12_sse2;
+            c->idct[3] = ff_hevc_transform_32x32_12_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
@@ -1103,3 +1121,35 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         }
     }
 }
+
+#include "libavcodec/hevcpred.h"
+#include "libavcodec/x86/hevcpred.h"
+
+#undef FUNC
+#define FUNC(a, depth) a ## _ ## depth ## _sse
+
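+/* Wire the SSE4 planar and angular intra predictors into HEVCPredContext;
+ * entry n handles blocks of size (1 << (n + 2)). */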
+#define HEVC_PRED(depth) \
+    hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \
+    hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \
+    hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \
+    hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \
+    hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
+    hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
+    hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
+    hpc->pred_angular[3] = FUNC(pred_angular_3, depth)
+
+void ff_hevc_pred_init_x86(HEVCPredContext *hpc, int bit_depth)
+{
+    int mm_flags = av_get_cpu_flags();
+
+    if (bit_depth == 8) {
+        if (EXTERNAL_SSE4(mm_flags)) {
+            HEVC_PRED(8);
+        }
+    }
+    if (bit_depth == 10) {
+        if (EXTERNAL_SSE4(mm_flags)) {
+            HEVC_PRED(10);
+        }
+    }
+}
diff --git a/libavcodec/x86/hevcpred.h b/libavcodec/x86/hevcpred.h
new file mode 100644
index 0000000..d26e5bf
--- /dev/null
+++ b/libavcodec/x86/hevcpred.h
@@ -0,0 +1,24 @@
+#ifndef AVCODEC_X86_HEVCPRED_H
+#define AVCODEC_X86_HEVCPRED_H
+
+void pred_planar_0_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+void pred_planar_1_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+void pred_planar_2_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+void pred_planar_3_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+
+void pred_angular_0_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+void pred_angular_1_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+void pred_angular_2_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+void pred_angular_3_8_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+
+void pred_planar_0_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+void pred_planar_1_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+void pred_planar_2_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+void pred_planar_3_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride);
+
+void pred_angular_0_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+void pred_angular_1_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+void pred_angular_2_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+void pred_angular_3_10_sse(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int c_idx, int mode);
+
+#endif // AVCODEC_X86_HEVCPRED_H
\ No newline at end of file
-- 
2.6.3