From 998fd438172395d82d41b1ff064374d98d2f3376 Mon Sep 17 00:00:00 2001
From: Lin Xie <lin.xie@intel.com>
Date: Tue, 26 Mar 2019 14:03:35 +0800
Subject: [PATCH] Enable vaapi-scale for IE inference filter

* This patch requires matched IE C API patch update

Change-Id: Ic0fa65963a0708f424a4c2dd98260d38f0fc96af
---
 libavfilter/dnn_backend_intel_ie.c  |  11 +-
 libavfilter/dnn_data.h              |   8 +-
 libavfilter/inference.c             | 373 ++++++++++++++++++++++++++++++++----
 libavfilter/inference.h             |  97 ++++++++--
 libavfilter/vf_inference_classify.c | 120 ++++++++----
 libavfilter/vf_inference_detect.c   | 132 +++++++++----
 6 files changed, 611 insertions(+), 130 deletions(-)

diff --git a/libavfilter/dnn_backend_intel_ie.c b/libavfilter/dnn_backend_intel_ie.c
index 5742a8e..a600ee6 100644
--- a/libavfilter/dnn_backend_intel_ie.c
+++ b/libavfilter/dnn_backend_intel_ie.c
@@ -221,7 +221,7 @@ static DNNReturnType get_execute_result_intel_ie(void *model, DNNIOData *result)
     if (!model || !result)
         return DNN_ERROR;
 
-    result->data = IEGetResultSpace(ie_model->context, result->in_out_idx, &size);
+    result->data[0] = IEGetResultSpace(ie_model->context, result->in_out_idx, &size);
     if (!result->data)
         return DNN_ERROR;
 
@@ -330,6 +330,7 @@ static DNNReturnType set_output_info_intel_ie(void *model, DNNModelInfo *info)
 
 static DNNReturnType set_input_intel_ie(void *model, const DNNIOData *input)
 {
+    int i;
     IEData data;
     DNNIntelIEModel *ie_model = (DNNIntelIEModel *)model;
 
@@ -338,12 +339,12 @@ static DNNReturnType set_input_intel_ie(void *model, const DNNIOData *input)
 
     memset(&data, 0, sizeof(IEData));
 
-    data.size         = input->size;
+    for (i = 0; i < NUM_DATA_POINTS; i++) {
+        data.data[i]     = input->data[i];
+        data.linesize[i] = input->linesize[i];
+    }
     data.width        = input->width;
     data.height       = input->height;
-    data.widthStride  = input->width_stride;
-    data.heightStride = input->height_stride;
-    data.buffer       = (void *)input->data;
     data.channelNum   = input->channels;
     data.batchIdx     = input->batch_idx;
     data.precision    = get_precision(input->precision);
diff --git a/libavfilter/dnn_data.h b/libavfilter/dnn_data.h
index 8c70366..0add8ee 100644
--- a/libavfilter/dnn_data.h
+++ b/libavfilter/dnn_data.h
@@ -20,6 +20,8 @@
 #ifndef AVFILTER_DNN_DATA_H
 #define AVFILTER_DNN_DATA_H
 
+#include <stdint.h>
+
 /**
 * @enum TargetDevice
 * @brief Describes known device types
@@ -113,12 +115,12 @@ typedef struct DNNDevice {
 * spencial for single 1D: height/height_stride/channels are 1 width_stride=width, output only
 */
 typedef struct DNNIOData {
-    void *data;             // the type depend on the data precision
+#define NUM_DATA_POINTS 4
+    uint8_t *data[NUM_DATA_POINTS];
+    int  linesize[NUM_DATA_POINTS];
     unsigned int size;      // size=width x height x channels,it is for 1D output/input. unit is byte.
     unsigned int width;
     unsigned int height;
-    unsigned int width_stride;      // it is for HW memory or padding memory
-    unsigned int height_stride;
     unsigned int channels;
     // the index of the batch when batch size is bigger than 1. default value is zero when batch size is 1.
     unsigned int batch_idx;
diff --git a/libavfilter/inference.c b/libavfilter/inference.c
index 20934fc..596c5b4 100644
--- a/libavfilter/inference.c
+++ b/libavfilter/inference.c
@@ -33,6 +33,19 @@
 
 #include "inference.h"
 
+#if CONFIG_VAAPI
+#define VA_CALL(_FUNC)                                     \
+    {                                                      \
+        VAStatus _status = _FUNC;                          \
+        if (_status != VA_STATUS_SUCCESS)                  \
+        {                                                  \
+            printf(#_FUNC " failed, sts = %d (%s).\n",     \
+                    _status, vaErrorStr(_status));         \
+            return AVERROR(EINVAL);                        \
+        }                                                  \
+    }
+#endif
+
 struct InferenceBaseContext
 {
     char *infer_type;
@@ -49,13 +62,19 @@ struct InferenceBaseContext
     InferencePreProcess preprocess;
 };
 
+static int va_vpp_crop_and_scale(VAAPIVpp *va_vpp, AVFrame *input, Rect *crop_rect,
+        int scale_w, int scale_h, uint8_t *data[],  int stride[]);
+
+static int va_vpp_scale(VAAPIVpp *va_vpp, AVFrame *input,
+        int scale_w, int scale_h, uint8_t *data[],  int stride[]);
+
 static int fill_dnn_data_from_frame(DNNIOData *data,
                                     const AVFrame *frame,
                                     int batch_idx,
                                     int is_image,
                                     int input_idx)
 {
-    int channels_nb;
+    int i, channels_nb;
     DNNDataFormat dnn_fmt;
     DNNDataPrecisionType precision;
     enum AVPixelFormat pix_fmt = frame->format;
@@ -68,6 +87,7 @@ static int fill_dnn_data_from_frame(DNNIOData *data,
         channels_nb = 1;
         break;
     case AV_PIX_FMT_BGRA:
+    case AV_PIX_FMT_BGR0:
         precision = DNN_DATA_PRECISION_U8;
         dnn_fmt = DNN_DATA_BGR_PACKED;
         channels_nb = 4;
@@ -77,16 +97,22 @@ static int fill_dnn_data_from_frame(DNNIOData *data,
         dnn_fmt = DNN_DATA_BGR_PACKED;
         channels_nb = 3;
         break;
+    case AV_PIX_FMT_RGBP:
+        precision = DNN_DATA_PRECISION_U8;
+        dnn_fmt = DNN_DATA_RGB_PLANAR;
+        channels_nb = 3;
+        break;
     default:
         av_log(NULL, AV_LOG_ERROR, "format unsupport!\n");
         return AVERROR(EINVAL);
     };
 
-    data->data           = (void *)frame->data[0];
+    for (i = 0; i < NUM_DATA_POINTS; i++) {
+        data->data[i]     = frame->data[i];
+        data->linesize[i] = frame->linesize[i];
+    }
     data->width          = frame->width;
     data->height         = frame->height;
-    data->width_stride   = frame->linesize[0]/channels_nb;
-    data->height_stride  = frame->height;
     data->channels       = channels_nb;
     data->data_format    = dnn_fmt;
     data->precision      = precision;
@@ -98,25 +124,26 @@ static int fill_dnn_data_from_frame(DNNIOData *data,
     return 0;
 }
 
-static int sw_crop_and_scale(AVFrame *frame,
-                             float x0, float y0,
-                             float x1, float y1,
-                             int out_w, int out_h,
+static int sw_crop_and_scale(AVFrame *frame, Rect *crop_rect,
+                             int out_w,      int out_h,
                              enum AVPixelFormat out_format,
                              uint8_t *data[], int stride[])
 {
-    int err, bufsize;
+    int bufsize;
+    AVFrame *temp;
     struct SwsContext *sws_ctx;
     const AVPixFmtDescriptor *desc;
     int x, y, w, h, hsub, vsub;
     int max_step[4]; ///< max pixel step for each plane, expressed as a number of bytes
     enum AVPixelFormat expect_format = out_format;
 
-    AVFrame *temp = av_frame_alloc();
-    if (!temp) {
-        err = AVERROR(ENOMEM);
-        return err;
-    }
+    if (!crop_rect)
+        return AVERROR(EINVAL);
+
+    temp = av_frame_alloc();
+    if (!temp)
+        return AVERROR(ENOMEM);
+
     av_frame_ref(temp, frame);
 
     desc = av_pix_fmt_desc_get(temp->format);
@@ -126,10 +153,10 @@ static int sw_crop_and_scale(AVFrame *frame,
 
     /* cropping */
     {
-        x = lrintf(x0);
-        y = lrintf(y0);
-        w = lrintf(x1) - x;
-        h = lrintf(y1) - y;
+        x = lrintf(crop_rect->x0);
+        y = lrintf(crop_rect->y0);
+        w = lrintf(crop_rect->x1) - x;
+        h = lrintf(crop_rect->y1) - y;
 
         temp->width  = w;
         temp->height = h;
@@ -157,8 +184,7 @@ static int sw_crop_and_scale(AVFrame *frame,
                              SWS_BILINEAR, NULL, NULL, NULL);
     if (!sws_ctx) {
         av_log(NULL, AV_LOG_ERROR, "Create scaling context failed!\n");
-        err = AVERROR(EINVAL);
-        return err;
+        return AVERROR(EINVAL);
     }
 
     if (!data[0]) {
@@ -250,8 +276,6 @@ int ff_inference_base_create(AVFilterContext *ctx,
     if (!s)
         return AVERROR(ENOMEM);
 
-    // TODO: handle hw ctx
-
     s->module = ff_get_dnn_module(param->backend_type);
     if (!s->module) {
         av_log(ctx, AV_LOG_ERROR, "could not create DNN backend module\n");
@@ -307,9 +331,13 @@ int ff_inference_base_create(AVFilterContext *ctx,
     vpp = &s->vpp;
 
     // vpp init
-    vpp->swscale        = &sws_scale;
-    vpp->crop_and_scale = &sw_crop_and_scale;
-    vpp->expect_format  = AV_PIX_FMT_BGR24;
+    vpp->sw_vpp = av_mallocz(sizeof(*vpp->sw_vpp));
+    if (!vpp->sw_vpp)
+        goto fail;
+
+    vpp->expect_format          = AV_PIX_FMT_BGR24;
+    vpp->sw_vpp->scale          = &sws_scale;
+    vpp->sw_vpp->crop_and_scale = &sw_crop_and_scale;
 
     *base = s;
 #undef DNN_ERR_CHECK
@@ -326,14 +354,21 @@ int ff_inference_base_free(InferenceBaseContext **base)
     if (!s)
         return 0;
 
-    if (s->vpp.device == VPP_DEVICE_SW) {
-        for (int i = 0; i < MAX_VPP_NUM; i++) {
-            if (s->vpp.frames[i])
-                av_frame_free(&s->vpp.frames[i]);
-            if (s->vpp.scale_contexts[i])
-                sws_freeContext(s->vpp.scale_contexts[i]);
-        }
+    // VPP clean up
+    for (int i = 0; i < MAX_VPP_NUM; i++) {
+        if (s->vpp.frames[i])
+            av_frame_free(&s->vpp.frames[i]);
+        if (s->vpp.sw_vpp->scale_contexts[i])
+            sws_freeContext(s->vpp.sw_vpp->scale_contexts[i]);
     }
+    av_freep(&s->vpp.sw_vpp);
+
+#if CONFIG_VAAPI
+    if (s->vpp.va_vpp) {
+        va_vpp_device_free(s->vpp.va_vpp);
+        av_freep(&s->vpp.va_vpp);
+    }
+#endif
 
     if (s->module) {
         s->module->free_model(&s->model);
@@ -353,6 +388,11 @@ int ff_inference_base_submit_frame(InferenceBaseContext *base,
     fill_dnn_data_from_frame(&input, frame, batch_idx, 1, input_idx);
     base->model->set_input(base->model->model, &input);
 
+#if CONFIG_VAAPI
+    if (base->vpp.va_vpp)
+        va_vpp_surface_release(base->vpp.va_vpp);
+#endif
+
     return 0;
 }
 
@@ -373,10 +413,16 @@ int ff_inference_base_filter_frame(InferenceBaseContext *base, AVFrame *in)
     for (int i = 0; i < info->numbers; i++) {
         AVFrame *processed_frame;
         for (int j = 0; j < base->batch_size; j++) {
-            if (base->preprocess)
-                base->preprocess(base, i, in, &processed_frame);
+            if (base->preprocess) {
+                if (base->preprocess(base, i, in, &processed_frame) < 0)
+                    return AVERROR(EINVAL);
+            }
             fill_dnn_data_from_frame(&input, processed_frame, j, 1, i);
             base->model->set_input(base->model->model, &input);
+#if CONFIG_VAAPI
+            if (base->vpp.va_vpp)
+                va_vpp_surface_release(base->vpp.va_vpp);
+#endif
         }
     }
 
@@ -410,7 +456,7 @@ int ff_inference_base_get_infer_result(InferenceBaseContext *base,
     metadata->layout    = info->layout[0];
     metadata->precision = info->precision[0];
 
-    metadata->data        = data.data;
+    metadata->data        = data.data[0];
     metadata->total_bytes = data.size;
 
     return 0;
@@ -430,3 +476,260 @@ VideoPP* ff_inference_base_get_vpp(InferenceBaseContext *base)
 {
     return &base->vpp;
 }
+
+/********************************************
+ *                                          *
+ *              VAAPI VPP APIs              *
+ *                                          *
+ *******************************************/
+#if CONFIG_VAAPI
+static int ff_vaapi_vpp_colour_standard(enum AVColorSpace av_cs)
+{
+    switch(av_cs) {
+#define CS(av, va) case AVCOL_SPC_ ## av: return VAProcColorStandard ## va;
+        CS(BT709,     BT709);
+        CS(BT470BG,   BT601);
+        CS(SMPTE170M, SMPTE170M);
+        CS(SMPTE240M, SMPTE240M);
+#undef CS
+    default:
+        return VAProcColorStandardNone;
+    }
+}
+
+int va_vpp_device_create(VAAPIVpp *va_vpp, AVFilterLink *inlink)
+{
+    AVFilterContext *avctx = inlink->dst;
+    VADisplay display = NULL;
+    VAImageFormat *image_list = NULL;
+    VAStatus vas;
+    int err, image_count;
+    AVBufferRef *device_ref = NULL;
+    AVHWFramesContext *hw_frames_ctx;
+
+    hw_frames_ctx = (AVHWFramesContext *)inlink->hw_frames_ctx->data;
+    av_assert0(hw_frames_ctx);
+
+    device_ref = av_buffer_ref(hw_frames_ctx->device_ref);
+    if (!device_ref) {
+        av_log(avctx, AV_LOG_ERROR, "A device reference create failed.\n");
+        return AVERROR(ENOMEM);
+    }
+
+    va_vpp->hwctx         = ((AVHWDeviceContext *)device_ref->data)->hwctx;
+    va_vpp->hw_frames_ref = inlink->hw_frames_ctx;
+
+    av_buffer_unref(&device_ref);
+
+    display = va_vpp->hwctx->display;
+
+    image_count = vaMaxNumImageFormats(display);
+    if (image_count <= 0) {
+        err = AVERROR(EIO);
+        goto fail;
+    }
+    image_list = av_malloc(image_count * sizeof(*image_list));
+    if (!image_list) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+    vas = vaQueryImageFormats(display, image_list, &image_count);
+    if (vas != VA_STATUS_SUCCESS) {
+        err = AVERROR(EIO);
+        goto fail;
+    }
+
+    va_vpp->format_list = image_list;
+    va_vpp->nb_formats  = image_count;
+    va_vpp->va_config   = VA_INVALID_ID;
+    va_vpp->va_context  = VA_INVALID_ID;
+    va_vpp->va_surface  = VA_INVALID_ID;
+
+    va_vpp->scale          = &va_vpp_scale;
+    va_vpp->crop_and_scale = &va_vpp_crop_and_scale;
+
+    return VA_STATUS_SUCCESS;
+fail:
+    if (image_list)
+        av_free(image_list);
+    return err;
+}
+
+int va_vpp_device_free(VAAPIVpp *va_vpp)
+{
+    VAStatus vas;
+
+    if (!va_vpp)
+        return 0;
+
+    if (va_vpp->va_surface != VA_INVALID_ID) {
+        vaDestroySurfaces(va_vpp->hwctx->display, &va_vpp->va_surface, 1);
+        if (vas != VA_STATUS_SUCCESS) {
+            av_log(NULL, AV_LOG_ERROR, "Failed to destroy surface %#x: "
+                    "%d (%s).\n", va_vpp->va_surface, vas, vaErrorStr(vas));
+        }
+    }
+
+    if (va_vpp->va_context != VA_INVALID_ID) {
+        vaDestroyContext(va_vpp->hwctx->display, va_vpp->va_context);
+        va_vpp->va_context = VA_INVALID_ID;
+    }
+
+    if (va_vpp->va_config != VA_INVALID_ID) {
+        vaDestroyConfig(va_vpp->hwctx->display, va_vpp->va_config);
+        va_vpp->va_config = VA_INVALID_ID;
+    }
+
+    av_free(va_vpp->format_list);
+
+    return VA_STATUS_SUCCESS;
+}
+
+int va_vpp_surface_alloc(VAAPIVpp *va_vpp, size_t width, size_t height, const char *format)
+{
+    int i, rt_format, fourcc;
+    VASurfaceAttrib surface_attrib;
+    enum AVPixelFormat av_format;
+
+    if (!va_vpp)
+        return -1;
+
+    if (format == NULL || strstr(format, "bgrx")) {
+        fourcc = VA_FOURCC_BGRX; rt_format = VA_RT_FORMAT_RGB32; av_format = AV_PIX_FMT_BGR0;
+    } else if (strstr(format, "rgbp")) {
+        fourcc = VA_FOURCC_RGBP; rt_format = VA_RT_FORMAT_RGBP;  av_format = AV_PIX_FMT_RGBP;
+    } else
+        return -1;
+
+    surface_attrib.type          = VASurfaceAttribPixelFormat;
+    surface_attrib.flags         = VA_SURFACE_ATTRIB_SETTABLE;
+    surface_attrib.value.type    = VAGenericValueTypeInteger;
+    surface_attrib.value.value.i = fourcc;
+
+    VA_CALL(vaCreateConfig(va_vpp->hwctx->display, VAProfileNone,
+                           VAEntrypointVideoProc, 0, 0, &va_vpp->va_config));
+
+    VA_CALL(vaCreateSurfaces(va_vpp->hwctx->display, rt_format, width, height,
+                             &va_vpp->va_surface, 1, &surface_attrib, 1));
+
+    VA_CALL(vaCreateContext(va_vpp->hwctx->display, va_vpp->va_config,
+                            width, height, VA_PROGRESSIVE,
+                            &va_vpp->va_surface, 1, &va_vpp->va_context));
+
+    for (i = 0; i < va_vpp->nb_formats; i++) {
+        if (va_vpp->format_list[i].fourcc == fourcc) {
+            va_vpp->va_format_selected = va_vpp->format_list[i];
+            break;
+        }
+    }
+
+    va_vpp->av_format = av_format;
+
+    return VA_STATUS_SUCCESS;
+}
+
+/* release mapped memory */
+int va_vpp_surface_release(VAAPIVpp *va_vpp)
+{
+    VA_CALL(vaUnmapBuffer(va_vpp->hwctx->display, va_vpp->va_image.buf));
+    VA_CALL(vaDestroyImage(va_vpp->hwctx->display, va_vpp->va_image.image_id));
+
+    return VA_STATUS_SUCCESS;
+}
+
+/* HW scale and map to system memory */
+static int va_vpp_scale(VAAPIVpp *va_vpp, AVFrame *input,
+                        int scale_w,      int scale_h,
+                        uint8_t *data[],  int stride[])
+{
+    return va_vpp_crop_and_scale(va_vpp, input, NULL, scale_w, scale_h, data, stride);
+}
+
+static int va_vpp_crop_and_scale(VAAPIVpp *va_vpp,
+                                 AVFrame *input,   Rect *crop_rect,
+                                 int scale_w,      int scale_h,
+                                 uint8_t *data[],  int stride[])
+{
+    int i;
+    void *address = NULL;
+    VAImage   *va_image_ptr;
+    VABufferID params_id;
+    VASurfaceID input_surface, output_surface;
+    VAProcPipelineParameterBuffer params;
+    VARectangle input_region;
+
+    input_surface = (VASurfaceID)(uintptr_t)input->data[3];
+    av_log(NULL, AV_LOG_DEBUG, "Using surface %#x for scale input.\n",
+           input_surface);
+
+    output_surface = va_vpp->va_surface;
+
+    if (crop_rect == NULL) {
+        input_region = (VARectangle) {
+            .x      = input->crop_left,
+            .y      = input->crop_top,
+            .width  = input->width -
+                     (input->crop_left + input->crop_right),
+            .height = input->height -
+                     (input->crop_top + input->crop_bottom),
+        };
+    } else {
+        int _x = lrintf(crop_rect->x0);
+        int _y = lrintf(crop_rect->y0);
+        input_region = (VARectangle) {
+            .x      = _x,
+            .y      = _y,
+            .width  = lrintf(crop_rect->x1) - _x,
+            .height = lrintf(crop_rect->y1) - _y,
+        };
+    }
+
+    memset(&params, 0, sizeof(params));
+
+    params.surface = input_surface;
+    params.surface_region = &input_region;
+    params.surface_color_standard =
+        ff_vaapi_vpp_colour_standard(input->colorspace);
+
+    params.output_region = 0;
+    params.output_background_color = 0xff000000;
+    params.output_color_standard = params.surface_color_standard;
+
+    params.pipeline_flags = 0;
+    params.filter_flags = VA_FILTER_SCALING_HQ;
+
+    VA_CALL(vaBeginPicture(va_vpp->hwctx->display, va_vpp->va_context, output_surface));
+
+    VA_CALL(vaCreateBuffer(va_vpp->hwctx->display, va_vpp->va_context,
+                           VAProcPipelineParameterBufferType,
+                           sizeof(params), 1, &params, &params_id));
+
+    VA_CALL(vaRenderPicture(va_vpp->hwctx->display, va_vpp->va_context,
+                            &params_id, 1));
+
+    VA_CALL(vaEndPicture(va_vpp->hwctx->display, va_vpp->va_context));
+
+    VA_CALL(vaDestroyBuffer(va_vpp->hwctx->display, params_id));
+
+    VA_CALL(vaSyncSurface(va_vpp->hwctx->display, output_surface));
+
+    // map surface to system memory
+    va_image_ptr = &va_vpp->va_image;
+
+    VA_CALL(vaCreateImage(va_vpp->hwctx->display, &va_vpp->va_format_selected,
+                          scale_w, scale_h, va_image_ptr));
+
+    VA_CALL(vaGetImage(va_vpp->hwctx->display, output_surface,
+                       0, 0, scale_w, scale_h,
+                       va_image_ptr->image_id));
+
+    VA_CALL(vaMapBuffer(va_vpp->hwctx->display, va_image_ptr->buf, &address));
+
+    for (i = 0; i < va_image_ptr->num_planes; i++) {
+        data[i]   = (uint8_t *)address + va_image_ptr->offsets[i];
+        stride[i] = va_image_ptr->pitches[i];
+    }
+
+    return VA_STATUS_SUCCESS;
+}
+#endif
diff --git a/libavfilter/inference.h b/libavfilter/inference.h
index 33de54b..d0515e2 100644
--- a/libavfilter/inference.h
+++ b/libavfilter/inference.h
@@ -19,8 +19,17 @@
 #ifndef AVFILTER_INFERENCE_H
 #define AVFILTER_INFERENCE_H
 
+#if CONFIG_VAAPI
+#include <va/va.h>
+#endif
+
 #include "libavutil/common.h"
 #include "libswscale/swscale.h"
+#include "libavutil/hwcontext.h"
+#if CONFIG_VAAPI
+#include "libavutil/hwcontext_vaapi.h"
+#endif
+
 #include "dnn_interface.h"
 
 typedef struct InferenceBaseContext InferenceBaseContext;
@@ -47,25 +56,71 @@ typedef struct InferenceParam {
 
 #define MAX_VPP_NUM DNN_INPUT_OUTPUT_NUM
 
+/*
+ * Vpp device type detected according to frame format
+ */
 typedef enum { VPP_DEVICE_HW, VPP_DEVICE_SW } VPPDevice;
 
+typedef struct _SwVpp    SwVpp;
+
+typedef struct _VAAPIVpp VAAPIVpp;
+
+/*
+ * Generic rectangle structure consists of two diagonal points
+ */
+typedef struct Rect {
+    float x0; float y0; float x1; float y1;
+} Rect;
+
+#if CONFIG_VAAPI
+struct _VAAPIVpp {
+    AVVAAPIDeviceContext *hwctx;
+    AVBufferRef  *hw_frames_ref;
+    VASurfaceID   va_surface;
+    VAConfigID    va_config;
+    VAContextID   va_context;
+
+    VAImageFormat *format_list; //!< Surface formats which can be used with this device.
+    int            nb_formats;
+
+    VAImage            va_image;
+    VAImageFormat      va_format_selected;
+    enum AVPixelFormat av_format;
+
+    int  (*scale)(VAAPIVpp *va_vpp, AVFrame *input,
+                  int scale_w,      int scale_h,
+                  uint8_t *data[],  int stride[]);
+
+    int  (*crop_and_scale)(VAAPIVpp *va_vpp, AVFrame *input,
+                           Rect *crop_rect,
+                           int scale_w, int scale_h,
+                           uint8_t *data[],  int stride[]);
+};
+#endif
+
+struct _SwVpp {
+    struct SwsContext *scale_contexts[MAX_VPP_NUM];
+
+    int  (*scale)(struct SwsContext *context,
+                  const uint8_t * const srcSlice[],
+                  const int srcStride[], int srcSliceY,
+                  int srcSliceH, uint8_t *const dst[],
+                  const int dstStride[]);
+
+    int  (*crop_and_scale)(AVFrame *frame, Rect *crop_rect,
+                           int   scale_w,  int   scale_h,
+                           enum AVPixelFormat scale_format,
+                           uint8_t *dst[], int   dstStride[]);
+};
+
 typedef struct VideoPP {
-    int      device;
-    int      expect_format;
-    void    *scale_contexts[MAX_VPP_NUM];
-    AVFrame *frames[MAX_VPP_NUM];          //<! frames to save vpp output
-
-    int    (*swscale)(struct SwsContext *context,
-                      const uint8_t * const srcSlice[],
-                      const int srcStride[], int srcSliceY,
-                      int srcSliceH, uint8_t *const dst[],
-                      const int dstStride[]);
-    int    (*crop_and_scale)(AVFrame *frame,
-                             float crop_x0,  float crop_y0,
-                             float crop_x1,  float crop_y1,
-                             int   scale_w,  int   scale_h,
-                             enum AVPixelFormat scale_format,
-                             uint8_t *dst[], int   dstStride[]);
+    int       device;
+    int       expect_format;
+    AVFrame  *frames[MAX_VPP_NUM];  //<! frames to save vpp output
+    SwVpp    *sw_vpp;
+#if CONFIG_VAAPI
+    VAAPIVpp *va_vpp;
+#endif
 } VideoPP;
 
 #define MAX_TENSOR_DIM_NUM 8
@@ -152,4 +207,14 @@ DNNModelInfo* ff_inference_base_get_input_info(InferenceBaseContext *base);
 DNNModelInfo* ff_inference_base_get_output_info(InferenceBaseContext *base);
 VideoPP*      ff_inference_base_get_vpp(InferenceBaseContext *base);
 
+#if CONFIG_VAAPI
+int va_vpp_device_create(VAAPIVpp *ctx, AVFilterLink *inlink);
+
+int va_vpp_device_free(VAAPIVpp *ctx);
+
+int va_vpp_surface_alloc(VAAPIVpp *ctx, size_t width, size_t height, const char *format);
+
+int va_vpp_surface_release(VAAPIVpp *ctx);
+#endif
+
 #endif
diff --git a/libavfilter/vf_inference_classify.c b/libavfilter/vf_inference_classify.c
index b2e435b..0681a2c 100644
--- a/libavfilter/vf_inference_classify.c
+++ b/libavfilter/vf_inference_classify.c
@@ -62,6 +62,7 @@ typedef struct InferenceClassifyContext {
     char  *labels;
     char  *names;
     char  *model_file;
+    char  *vpp_format;
     char  *feature_file;    ///< binary feature file for face identification
     int    feature_num;     ///< identification face feature number
     double feature_angle;   ///< face identification threshold angle value
@@ -410,7 +411,8 @@ static int query_formats(AVFilterContext *context)
         AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV422P,  AV_PIX_FMT_YUV444P,
         AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ444P,
         AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV411P,  AV_PIX_FMT_GRAY8,
-        AV_PIX_FMT_BGR24,    AV_PIX_FMT_BGRA,     AV_PIX_FMT_NONE};
+        AV_PIX_FMT_BGR24,    AV_PIX_FMT_BGRA,     AV_PIX_FMT_VAAPI,
+        AV_PIX_FMT_NONE};
 
     formats_list = ff_make_format_list(pixel_formats);
     if (!formats_list) {
@@ -511,11 +513,6 @@ static av_cold int classify_init(AVFilterContext *ctx)
         } else if (strstr(names[i], "age") && strstr(names[i], "gend")) {
             s->post_process[i] = &age_gender_classify_result_process;
         } else if (strstr(names[i], "face")) {
-            VideoPP *vpp = ff_inference_base_get_vpp(s->infer_bases[i]);
-
-            // face reidentification model requires RGB format
-            vpp->expect_format = AV_PIX_FMT_RGB24;
-
             s->init[i]         = &face_identify_init;
             s->uninit[i]       = &face_identify_uninit;
             s->post_process[i] = &face_identify_result_process;
@@ -603,16 +600,27 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
             DNNModelInfo *iinfo = ff_inference_base_get_input_info(base);
             DNNModelInfo *oinfo = ff_inference_base_get_output_info(base);
 
-            ret = vpp->crop_and_scale(in,
-                                      bbox->x_min * in->width,
-                                      bbox->y_min * in->height,
-                                      bbox->x_max * in->width,
-                                      bbox->y_max * in->height,
-                                      iinfo->width[0],
-                                      iinfo->height[0],
-                                      vpp->expect_format,
-                                      tmp->data,
-                                      tmp->linesize);
+            Rect crop_rect = (Rect) {
+                .x0 = bbox->x_min * in->width,
+                .y0 = bbox->y_min * in->height,
+                .x1 = bbox->x_max * in->width,
+                .y1 = bbox->y_max * in->height,
+            };
+
+            if (vpp->device == VPP_DEVICE_SW) {
+                ret = vpp->sw_vpp->crop_and_scale(in, &crop_rect,
+                        iinfo->width[0], iinfo->height[0],
+                        vpp->expect_format, tmp->data, tmp->linesize);
+            } else {
+#if CONFIG_VAAPI
+                ret = vpp->va_vpp->crop_and_scale(vpp->va_vpp, in, &crop_rect,
+                        iinfo->width[0], iinfo->height[0], tmp->data, tmp->linesize);
+#endif
+            }
+            if (ret != 0) {
+                ret = AVERROR(EINVAL);
+                goto fail;
+            }
 
             // TODO: support dynamic batch for faces
             ff_inference_base_submit_frame(base, tmp, 0, 0);
@@ -651,41 +659,84 @@ fail:
 
 static av_cold int config_input(AVFilterLink *inlink)
 {
-    int i, j;
-    AVFilterContext      *ctx        = inlink->dst;
-    InferenceClassifyContext *s      = ctx->priv;
+    int i, ret;
+    AVFrame *frame;
+
+    AVFilterContext             *ctx = inlink->dst;
+    InferenceClassifyContext      *s = ctx->priv;
     enum AVPixelFormat expect_format = AV_PIX_FMT_BGR24;
-    const AVPixFmtDescriptor *desc   = av_pix_fmt_desc_get(inlink->format);
+    const AVPixFmtDescriptor   *desc = av_pix_fmt_desc_get(inlink->format);
 
     for (i = 0; i < s->loaded_num; i++) {
         InferenceBaseContext *base = s->infer_bases[i];
-        DNNModelInfo *info         = ff_inference_base_get_input_info(base);
-        VideoPP *vpp               = ff_inference_base_get_vpp(base);
+        DNNModelInfo         *info = ff_inference_base_get_input_info(base);
+        VideoPP               *vpp = ff_inference_base_get_vpp(base);
+
+        // right now, no model needs multiple inputs
+        av_assert0(info->numbers == 1);
 
         vpp->device = (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) ?
             VPP_DEVICE_HW : VPP_DEVICE_SW;
 
         // allocate avframes to save preprocessed data
-        for (j = 0; j < info->numbers; j++) {
-            int ret;
-            AVFrame *frame = av_frame_alloc();
-            if (!frame)
-                return AVERROR(ENOMEM);
+        frame = av_frame_alloc();
+        if (!frame)
+            return AVERROR(ENOMEM);
+        frame->width   = info->width[0];
+        frame->height  = info->height[0];
+        frame->format  = expect_format;
+        vpp->frames[0] = frame;
+
+        if (vpp->device == VPP_DEVICE_SW) {
+            if (ret = av_frame_get_buffer(frame, 0) < 0)
+                goto fail;
+        } else {
+#if CONFIG_VAAPI
+            vpp->va_vpp = av_mallocz(sizeof(*vpp->va_vpp));
+            if (!vpp->va_vpp) {
+                ret = AVERROR(ENOMEM);
+                goto fail;
+            }
 
-            frame->format = expect_format;
-            frame->width  = info->width[j];
-            frame->height = info->height[j];
+            ret = va_vpp_device_create(vpp->va_vpp, inlink);
+            if (ret < 0) {
+                av_log(ctx, AV_LOG_ERROR, "Create va vpp device failed\n");
+                ret = AVERROR(EINVAL);
+                goto fail;
+            }
 
-            ret = av_frame_get_buffer(frame, 0);
+            ret = va_vpp_surface_alloc(vpp->va_vpp,
+                    info->width[0], info->height[0], s->vpp_format);
             if (ret < 0) {
-                av_frame_free(&frame);
-                return ret;
+                av_log(ctx, AV_LOG_ERROR, "Create va surface failed\n");
+                ret = AVERROR(EINVAL);
+                goto fail;
             }
-            vpp->frames[j] = frame;
+
+            frame->format = vpp->va_vpp->av_format;
+#endif
         }
     }
 
     return 0;
+fail:
+    for (i = 0; i < s->loaded_num; i++) {
+        VideoPP *vpp = ff_inference_base_get_vpp(s->infer_bases[i]);
+
+        frame = vpp->frames[0];
+        if (!frame)
+            continue;
+
+        av_frame_free(&frame);
+
+#if CONFIG_VAAPI
+        if (vpp->va_vpp) {
+            va_vpp_device_free(vpp->va_vpp);
+            av_freep(&vpp->va_vpp);
+        }
+#endif
+    }
+    return ret;
 }
 
 static av_cold int config_output(AVFilterLink *outlink)
@@ -698,6 +749,7 @@ static const AVOption inference_classify_options[] = {
     { "model",          "path to model files for network", OFFSET(model_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
     { "label",          "labels for classify",             OFFSET(labels),          AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
     { "name",           "classify type names",             OFFSET(names),           AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
+    { "vpp_format",     "specify vpp output format",       OFFSET(vpp_format),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
     { "device",         "running on device type",          OFFSET(device_type),     AV_OPT_TYPE_FLAGS,  { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12,   FLAGS },
     { "interval",       "do infer every Nth frame",        OFFSET(every_nth_frame), AV_OPT_TYPE_INT,    { .i64 = 1 },                     1, 1024, FLAGS },
     { "batch_size",     "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },                     1, 1024, FLAGS },
diff --git a/libavfilter/vf_inference_detect.c b/libavfilter/vf_inference_detect.c
index 624cb47..42a0e02 100644
--- a/libavfilter/vf_inference_detect.c
+++ b/libavfilter/vf_inference_detect.c
@@ -51,6 +51,7 @@ typedef struct InferenceDetectContext {
 
     char  *model_file;
     char  *label_file;
+    char  *vpp_format;
     int    backend_type;
     int    device_type;
 
@@ -176,13 +177,21 @@ static int detect_preprocess(InferenceBaseContext *base, int index, AVFrame *in,
     VideoPP *vpp = ff_inference_base_get_vpp(base);
     AVFrame *tmp = vpp->frames[index];
 
-    if (!vpp->scale_contexts[index]) {
-        *out = in;
-        return 0;
-    }
+    if (vpp->device == VPP_DEVICE_SW) {
+        if (!vpp->sw_vpp->scale_contexts[index]) {
+            *out = in;
+            return 0;
+        }
 
-    ret = vpp->swscale(vpp->scale_contexts[index], (const uint8_t * const*)in->data,
-                       in->linesize, 0, in->height, tmp->data, tmp->linesize);
+        ret = vpp->sw_vpp->scale(vpp->sw_vpp->scale_contexts[index],
+                (const uint8_t * const*)in->data,
+                in->linesize, 0, in->height, tmp->data, tmp->linesize);
+    } else {
+#if CONFIG_VAAPI
+        ret = vpp->va_vpp->scale(vpp->va_vpp, in,
+                tmp->width, tmp->height, tmp->data, tmp->linesize);
+#endif
+    }
     *out = tmp;
     return ret;
 }
@@ -194,7 +203,8 @@ static int query_formats(AVFilterContext *context)
         AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV422P,  AV_PIX_FMT_YUV444P,
         AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ444P,
         AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV411P,  AV_PIX_FMT_GRAY8,
-        AV_PIX_FMT_BGR24,    AV_PIX_FMT_BGRA,     AV_PIX_FMT_NONE};
+        AV_PIX_FMT_BGR24,    AV_PIX_FMT_BGRA,     AV_PIX_FMT_VAAPI,
+        AV_PIX_FMT_NONE};
 
     formats_list = ff_make_format_list(pixel_formats);
     if (!formats_list) {
@@ -207,7 +217,8 @@ static int query_formats(AVFilterContext *context)
 
 static int config_input(AVFilterLink *inlink)
 {
-    int i;
+    int i, ret;
+    AVFrame *frame;
     AVFilterContext      *ctx        = inlink->dst;
     InferenceDetectContext *s        = ctx->priv;
     enum AVPixelFormat expect_format = AV_PIX_FMT_BGR24;
@@ -222,51 +233,86 @@ static int config_input(AVFilterLink *inlink)
                info->is_image[i], info->precision[i], info->layout[i]);
     }
 
-    vpp->device = (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) ? VPP_DEVICE_HW : VPP_DEVICE_SW;
+    // right now, no model needs multiple inputs
+    av_assert0(info->numbers == 1);
 
-    // TODO: now just handle sw vpp
-    for (i = 0; i < info->numbers; i++) {
-        if (expect_format   != inlink->format ||
-            info->width[i]  != inlink->w      ||
-            info->height[i] != inlink->h)
-        {
-            int ret;
-            AVFrame *frame;
-
-            vpp->scale_contexts[i] = sws_getContext(
-                inlink->w,      inlink->h,       inlink->format,
-                info->width[i], info->height[i], expect_format,
-                SWS_BILINEAR, NULL, NULL, NULL);
-
-            if (!vpp->scale_contexts[i]) {
-                av_log(ctx, AV_LOG_ERROR, "Impossible to create scale context");
-                return AVERROR(EINVAL);
-            }
+    vpp->device = (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) ? VPP_DEVICE_HW : VPP_DEVICE_SW;
 
-            frame = av_frame_alloc();
-            if (!frame)
+    // allocate frame to save scaled output
+    frame = av_frame_alloc();
+    if (!frame)
+        return AVERROR(ENOMEM);
+    frame->width   = info->width[0];
+    frame->height  = info->height[0];
+    frame->format  = expect_format;
+    vpp->frames[0] = frame;
+
+    if (vpp->device == VPP_DEVICE_SW) {
+        int need_scale = expect_format   != inlink->format ||
+                         info->width[0]  != inlink->w      ||
+                         info->height[0] != inlink->h;
+
+        if (need_scale) {
+            if (av_frame_get_buffer(frame, 0) < 0) {
+                av_frame_free(&frame);
                 return AVERROR(ENOMEM);
+            }
 
-            frame->format = expect_format;
-            frame->width  = info->width[i];
-            frame->height = info->height[i];
+            vpp->sw_vpp->scale_contexts[0] = sws_getContext(
+                    inlink->w, inlink->h, inlink->format,
+                    info->width[0], info->height[0], expect_format,
+                    SWS_BILINEAR, NULL, NULL, NULL);
 
-            ret = av_frame_get_buffer(frame, 0);
-            if (ret < 0) {
+            if (!vpp->sw_vpp->scale_contexts[0]) {
+                av_log(ctx, AV_LOG_ERROR, "Impossible to create scale context\n");
                 av_frame_free(&frame);
-                return ret;
+                return AVERROR(EINVAL);
             }
-            vpp->frames[i] = frame;
         }
+    } else {
+#if CONFIG_VAAPI
+        vpp->va_vpp = av_mallocz(sizeof(*vpp->va_vpp));
+        if (!vpp->va_vpp) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        ret = va_vpp_device_create(vpp->va_vpp, inlink);
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Create va vpp device failed\n");
+            ret = AVERROR(EINVAL);
+            goto fail;
+        }
+
+        ret = va_vpp_surface_alloc(vpp->va_vpp,
+                info->width[0], info->height[0], s->vpp_format);
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Create va surface failed\n");
+            ret = AVERROR(EINVAL);
+            goto fail;
+        }
+
+        frame->format = vpp->va_vpp->av_format;
+#endif
     }
 
     return 0;
+fail:
+    av_frame_free(&frame);
+#if CONFIG_VAAPI
+    if (vpp->va_vpp) {
+        va_vpp_device_free(vpp->va_vpp);
+        av_freep(&vpp->va_vpp);
+    }
+#endif
+    return ret;
 }
 
 static int config_output(AVFilterLink *outlink)
 {
     AVFilterContext      *ctx = outlink->src;
     InferenceDetectContext *s = ctx->priv;
+    VideoPP *vpp              = ff_inference_base_get_vpp(s->base);
 
     DNNModelInfo *info = ff_inference_base_get_output_info(s->base);
 
@@ -277,7 +323,18 @@ static int config_output(AVFilterLink *outlink)
             info->is_image[i], info->precision[i], info->layout[i]);
     }
 
-    // TODO: define how to handle model output data
+#if CONFIG_VAAPI
+    if (vpp->device == VPP_DEVICE_HW) {
+        if (!vpp->va_vpp || !vpp->va_vpp->hw_frames_ref) {
+            av_log(ctx, AV_LOG_ERROR, "The input must have a hardware frame "
+                    "reference.\n");
+            return AVERROR(EINVAL);
+        }
+        outlink->hw_frames_ctx = av_buffer_ref(vpp->va_vpp->hw_frames_ref);
+        if (!outlink->hw_frames_ctx)
+            return AVERROR(ENOMEM);
+    }
+#endif
 
     return 0;
 }
@@ -385,6 +442,7 @@ static const AVOption inference_detect_options[] = {
     { "model",       "path to model file for network",  OFFSET(model_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,  FLAGS },
     { "device",      "running on device type",          OFFSET(device_type),     AV_OPT_TYPE_FLAGS,  { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12, FLAGS },
     { "label",       "label file path for detection",   OFFSET(label_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,  FLAGS },
+    { "vpp_format",  "specify vpp output format",       OFFSET(vpp_format),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,  FLAGS },
     { "interval",    "detect every Nth frame",          OFFSET(every_nth_frame), AV_OPT_TYPE_INT,    { .i64 = 1 },  1, 1024, FLAGS},
     { "batch_size",  "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },  1, 1024, FLAGS},
     { "threshold",   "threshod to filter output data",  OFFSET(threshold),       AV_OPT_TYPE_FLOAT,  { .dbl = 0.5}, 0, 1,    FLAGS},
-- 
2.7.4