From 496aa35db719e03b1847845581a96bff096f1e2c Mon Sep 17 00:00:00 2001
From: Lin Xie
Date: Thu, 17 Jan 2019 15:48:31 +0800
Subject: [PATCH] New filter for inference classification

* Add classify filter, with support for multiple models in a row
* Keep side data reference buffers when copying frames
* Refine some inference interfaces
* Add classification metadata and label-file processing

Signed-off-by: Lin Xie
---
 configure                           |   4 +
 libavfilter/Makefile                |   1 +
 libavfilter/allfilters.c            |   1 +
 libavfilter/inference.c             | 127 ++++++++-
 libavfilter/inference.h             |  44 ++-
 libavfilter/vf_inference_classify.c | 523 ++++++++++++++++++++++++++++++++++++
 libavfilter/vf_inference_detect.c   |  28 +-
 libavutil/frame.c                   |   7 +-
 libavutil/frame.h                   |   2 +
 9 files changed, 717 insertions(+), 20 deletions(-)
 create mode 100644 libavfilter/vf_inference_classify.c
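
For context: the classify filter is designed to run downstream of the detect
filter and takes '&'-separated lists in its "models", "labels" and "names"
options (parsed by split() in classify_init() below). A minimal sketch of the
intended filtergraph as a C string; the "detect" filter name and all file
paths are assumptions for illustration, while "classify" and its option names
come from this patch:

    /* Hypothetical graph: face detection followed by two classifications.
     * The "detect" name and the file paths are placeholders, not part of
     * this patch; "age-gend"/"emotion" match the strstr() keys below. */
    static const char *graph_desc =
        "detect=model=face.xml,"
        "classify=models=age-gender.xml&emotion.xml"
               ":labels=age-gender.label&emotion.label"
               ":names=age-gend&emotion";

Such a string could be passed to avfilter_graph_parse2() or to ffmpeg's -vf
option once both filters are compiled in.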
diff --git a/configure b/configure
index 68b7dfb..fb87f47 100755
--- a/configure
+++ b/configure
@@ -3408,6 +3408,10 @@ fspp_filter_deps="gpl"
 geq_filter_deps="gpl"
 histeq_filter_deps="gpl"
 hqdn3d_filter_deps="gpl"
+inference_classify_filter_deps="libinference_engine"
+inference_classify_filter_select="dnn"
+inference_detect_filter_deps="libinference_engine"
+inference_detect_filter_select="dnn"
 interlace_filter_deps="gpl"
 kerndeint_filter_deps="gpl"
 ladspa_filter_deps="ladspa libdl"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 06ebd61..d9e0602 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -258,6 +258,7 @@ OBJS-$(CONFIG_HWUPLOAD_FILTER)               += vf_hwupload.o
 OBJS-$(CONFIG_HYSTERESIS_FILTER)             += vf_hysteresis.o framesync.o
 OBJS-$(CONFIG_IDET_FILTER)                   += vf_idet.o
 OBJS-$(CONFIG_IL_FILTER)                     += vf_il.o
+OBJS-$(CONFIG_INFERENCE_CLASSIFY_FILTER)     += vf_inference_classify.o
 OBJS-$(CONFIG_INFERENCE_DETECT_FILTER)       += vf_inference_detect.o
 OBJS-$(CONFIG_INFLATE_FILTER)                += vf_neighbor.o
 OBJS-$(CONFIG_INTERLACE_FILTER)              += vf_tinterlace.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 4c6fa26..158c75b 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -244,6 +244,7 @@ extern AVFilter ff_vf_hwupload_cuda;
 extern AVFilter ff_vf_hysteresis;
 extern AVFilter ff_vf_idet;
 extern AVFilter ff_vf_il;
+extern AVFilter ff_vf_inference_classify;
 extern AVFilter ff_vf_inference_detect;
 extern AVFilter ff_vf_inflate;
 extern AVFilter ff_vf_interlace;
diff --git a/libavfilter/inference.c b/libavfilter/inference.c
index ea788ba..14b4093 100644
--- a/libavfilter/inference.c
+++ b/libavfilter/inference.c
@@ -29,6 +29,7 @@
 #include "libswscale/swscale.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/avassert.h"
+#include "libavutil/imgutils.h"
 
 #include "inference.h"
@@ -46,6 +47,8 @@ struct InferenceBaseContext
     DNNModelInfo output_info;
 
     VideoPP vpp;
+
+    InferencePreProcess preprocess;
 };
 
 static int fill_dnn_data_from_frame(DNNIOData *data,
@@ -97,12 +100,88 @@ static int fill_dnn_data_from_frame(DNNIOData *data,
     return 0;
 }
 
+static int sw_crop_and_scale(AVFrame *frame,
+                             float x0, float y0, float x1, float y1,
+                             int out_w, int out_h,
+                             uint8_t *data[], int stride[])
+{
+    int err, bufsize;
+    struct SwsContext *sws_ctx;
+    const AVPixFmtDescriptor *desc;
+    int x, y, w, h, hsub, vsub;
+    int max_step[4]; ///< max pixel step for each plane, expressed as a number of bytes
+    enum AVPixelFormat expect_format = AV_PIX_FMT_BGR24;
+
+    AVFrame *temp = av_frame_alloc();
+    if (!temp)
+        return AVERROR(ENOMEM);
+
+    err = av_frame_ref(temp, frame);
+    if (err < 0) {
+        av_frame_free(&temp);
+        return err;
+    }
+
+    desc = av_pix_fmt_desc_get(temp->format);
+    hsub = desc->log2_chroma_w;
+    vsub = desc->log2_chroma_h;
+    av_image_fill_max_pixsteps(max_step, NULL, desc);
+
+    /* cropping */
+    {
+        x = lrintf(x0);
+        y = lrintf(y0);
+        w = lrintf(x1) - x;
+        h = lrintf(y1) - y;
+
+        temp->width  = w;
+        temp->height = h;
+
+        temp->data[0] += y * temp->linesize[0];
+        temp->data[0] += x * max_step[0];
+
+        for (int i = 1; i < 3; i++) {
+            if (temp->data[i]) {
+                temp->data[i] += (y >> vsub) * temp->linesize[i];
+                temp->data[i] += (x * max_step[i]) >> hsub;
+            }
+        }
+
+        /* alpha plane */
+        if (temp->data[3]) {
+            temp->data[3] += y * temp->linesize[3];
+            temp->data[3] += x * max_step[3];
+        }
+    }
+
+    /* create scaling context */
+    sws_ctx = sws_getContext(temp->width, temp->height, temp->format,
+                             out_w, out_h, expect_format,
+                             SWS_BILINEAR, NULL, NULL, NULL);
+    if (!sws_ctx) {
+        av_log(NULL, AV_LOG_ERROR, "Could not create scaling context!\n");
+        av_frame_free(&temp);
+        return AVERROR(EINVAL);
+    }
+
+    if (!data[0]) {
+        bufsize = av_image_alloc(data, stride, out_w, out_h, expect_format, 1);
+        if (bufsize < 0) {
+            av_frame_free(&temp);
+            sws_freeContext(sws_ctx);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    sws_scale(sws_ctx, (const uint8_t * const *)temp->data,
+              temp->linesize, 0, temp->height, data, stride);
+
+    av_frame_free(&temp);
+    sws_freeContext(sws_ctx);
+
+    return 0;
+}
+
 int ff_inference_base_create(AVFilterContext *ctx,
                              InferenceBaseContext **base,
                              InferenceParam *param)
 {
     int i, ret;
     InferenceBaseContext *s;
     DNNModelInfo *info;
+    VideoPP *vpp;
 
     if (!param)
         return AVERROR(EINVAL);
@@ -162,10 +241,17 @@ int ff_inference_base_create(AVFilterContext *ctx,
     s->batch_size      = param->batch_size;
     s->every_nth_frame = param->every_nth_frame;
     s->threshold       = param->threshold;
+    s->preprocess      = param->preprocess;
 
     ret = s->model->create_model(s->model->model);
     DNN_ERR_CHECK(ctx);
 
+    vpp = &s->vpp;
+
+    // vpp init
+    vpp->swscale        = &sws_scale;
+    vpp->crop_and_scale = &sw_crop_and_scale;
+
     *base = s;
 #undef DNN_ERR_CHECK
     return 0;
@@ -199,23 +285,40 @@ int ff_inference_base_free(InferenceBaseContext **base)
     return 0;
 }
 
+int ff_inference_base_submit_frame(InferenceBaseContext *base,
+                                   AVFrame *frame,
+                                   int input_idx,
+                                   int batch_idx)
+{
+    DNNIOData input = { };
+    fill_dnn_data_from_frame(&input, frame, batch_idx, 1, input_idx);
+    base->model->set_input(base->model->model, &input);
+
+    return 0;
+}
+
+int ff_inference_base_infer(InferenceBaseContext *base)
+{
+    DNNReturnType dnn_ret;
+    dnn_ret = base->module->execute_model(base->model);
+    av_assert0(dnn_ret == DNN_SUCCESS);
+    return 0;
+}
+
 int ff_inference_base_filter_frame(InferenceBaseContext *base, AVFrame *in)
 {
-    VideoPP *vpp = &base->vpp;
     DNNModelInfo *info = &base->input_info;
     DNNReturnType dnn_ret;
     DNNIOData input = { };
 
     for (int i = 0; i < info->numbers; i++) {
-        if (!vpp->scale_contexts[i]) {
-            fill_dnn_data_from_frame(&input, in, 0, 1, i);
-        } else {
-            AVFrame *tmp = vpp->frames[i];
-            sws_scale(vpp->scale_contexts[i], (const uint8_t * const*)in->data,
-                      in->linesize, 0, in->height, tmp->data, tmp->linesize);
-            fill_dnn_data_from_frame(&input, tmp, 0, 1, i);
+        AVFrame *processed_frame = in;
+        for (int j = 0; j < base->batch_size; j++) {
+            if (base->preprocess)
+                base->preprocess(base, i, in, &processed_frame);
+            fill_dnn_data_from_frame(&input, processed_frame, j, 1, i);
+            base->model->set_input(base->model->model, &input);
         }
-        base->model->set_input(base->model->model, &input);
     }
 
     dnn_ret = base->module->execute_model(base->model);
@@ -224,7 +327,9 @@ int ff_inference_base_filter_frame(InferenceBaseContext *base, AVFrame *in)
     return 0;
 }
 
-int ff_inference_base_get_infer_result(InferenceBaseContext *base, InferTensorMeta *metadata)
+int ff_inference_base_get_infer_result(InferenceBaseContext *base,
+                                       int output_index,
+                                       InferTensorMeta *metadata)
 {
     DNNModelInfo *info = &base->output_info;
     DNNIOData data = { };
@@ -233,7 +338,7 @@ int ff_inference_base_get_infer_result(InferenceBaseContext *base, InferTensorMe
     av_assert0(metadata != NULL);
 
     // TODO: change to layer name for multiple outputs
-    data.in_out_idx = 0;
+    data.in_out_idx = output_index;
 
     ret = base->model->get_execute_result(base->model->model, &data);
     av_assert0(ret == DNN_SUCCESS);
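
Taken together, the refactor splits the old monolithic filter-frame path into
separate submit / execute / fetch steps, which is what lets the classify
filter loop over several cropped regions and several output layers per model.
A minimal sketch of the calling convention, assuming one input layer, one
batch slot and one output layer:

    /* Sketch: classify a single pre-cropped ROI frame with one model,
     * using the three entry points introduced above. */
    static int classify_one_roi(InferenceBaseContext *base, AVFrame *roi,
                                InferTensorMeta *meta)
    {
        int ret;
        if ((ret = ff_inference_base_submit_frame(base, roi, 0, 0)) < 0)
            return ret;
        if ((ret = ff_inference_base_infer(base)) < 0)
            return ret;
        return ff_inference_base_get_infer_result(base, 0, meta);
    }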
diff --git a/libavfilter/inference.h b/libavfilter/inference.h
index 8466f90..eebfd10 100644
--- a/libavfilter/inference.h
+++ b/libavfilter/inference.h
@@ -24,6 +24,8 @@
 
 typedef struct InferenceBaseContext InferenceBaseContext;
 
+typedef int (*InferencePreProcess)(InferenceBaseContext *base, int index, AVFrame *in, AVFrame **out);
+
 typedef struct InferenceParam {
     char *model_file;
     char *labels_file;
@@ -40,6 +42,8 @@ typedef struct InferenceParam {
     int input_layout;
     int input_precision;
     int input_is_image;  //!< image or data
+
+    InferencePreProcess preprocess;
 } InferenceParam;
 
 #define MAX_VPP_NUM DNN_INPUT_OUTPUT_NUM
 
@@ -49,7 +53,18 @@ typedef enum { VPP_DEVICE_HW, VPP_DEVICE_SW } VPPDevice;
 typedef struct VideoPP {
     int device;
     void *scale_contexts[MAX_VPP_NUM];
-    AVFrame *frames[MAX_VPP_NUM];
+    AVFrame *frames[MAX_VPP_NUM];      //!< frames to hold preprocessed data
+
+    int (*swscale)(struct SwsContext *context,
+                   const uint8_t * const srcSlice[], const int srcStride[],
+                   int srcSliceY, int srcSliceH,
+                   uint8_t *const dst[], const int dstStride[]);
+    int (*crop_and_scale)(AVFrame *frame,
+                          float x0, float y0, float x1, float y1,
+                          int out_w, int out_h,
+                          uint8_t *data[], int stride[]);
 } VideoPP;
[...]
diff --git a/libavfilter/vf_inference_classify.c b/libavfilter/vf_inference_classify.c
new file mode 100644
--- /dev/null
+++ b/libavfilter/vf_inference_classify.c
@@ -0,0 +1,523 @@
[...]
+static void infer_labels_buffer_free(void *opaque, uint8_t *data)
+{
+    int i;
+    LabelsArray *labels = (LabelsArray *)data;
+
+    for (i = 0; i < labels->num; i++)
+        av_freep(&labels->label[i]);
+    av_freep(&labels->label);
+
+    av_free(data);
+}
+
+static void infer_classify_metadata_buffer_free(void *opaque, uint8_t *data)
+{
+    int i;
+    InferClassificationMeta *meta = (InferClassificationMeta *)data;
+    ClassifyArray *classes = meta->c_array;
+
+    if (classes) {
+        for (i = 0; i < classes->num; i++) {
+            InferClassification *c = classes->classifications[i];
+            av_buffer_unref(&c->label_buf);
+            av_freep(&c);
+        }
+        av_freep(&classes->classifications);
+        av_freep(&classes);
+    }
+
+    av_free(data);
+}
+
+static av_cold void dump_emotion(AVFilterContext *ctx, int label_id)
+{
+    const char *emotions[] = { "neutral", "happy", "sad", "surprise", "anger" };
+
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - label:%d emotion:%s\n",
+           label_id, emotions[label_id]);
+}
+
+static int emotion_classify_result_process(AVFilterContext *ctx,
+                                           int detect_id,
+                                           int result_id,
+                                           int model_index,
+                                           InferTensorMeta *meta,
+                                           InferClassificationMeta *c_meta)
+{
+    int i, label_id = 0;
+    InferenceClassifyContext *s = ctx->priv;
+    const float *emo_confidence = (float *)meta->data;
+    size_t labels_num = meta->dims[2];
+    float max = emo_confidence[0];
+
+    InferClassification *classify = av_mallocz(sizeof(*classify));
+    if (!classify)
+        return AVERROR(ENOMEM);
+
+    // get the emotion with max confidence
+    for (i = 1; i < labels_num; i++) {
+        if (emo_confidence[i] > max) {
+            max      = emo_confidence[i];
+            label_id = i;
+        }
+    }
+
+    classify->detect_id  = detect_id;
+    classify->name       = s->name_array[model_index];
+    classify->label_id   = label_id;
+    classify->confidence = emo_confidence[label_id];
+    classify->label_buf  = av_buffer_ref(s->label_bufs[model_index]);
+
+    dump_emotion(ctx, classify->label_id);
+
+    av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
+
+    return 0;
+}
+
+static av_cold void dump_gender(AVFilterContext *ctx, int label_id, float conf)
+{
+    const char *genders[] = { "female", "male" };
+
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Gender:%s Confidence:%f\n",
+           genders[label_id], conf);
+}
+
+static av_cold void dump_age(AVFilterContext *ctx, float age)
+{
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Age:%f\n", age);
+}
+
+static int age_gender_classify_result_process(AVFilterContext *ctx,
+                                              int detect_id,
+                                              int result_id,
+                                              int model_index,
+                                              InferTensorMeta *meta,
+                                              InferClassificationMeta *c_meta)
+{
+    InferenceClassifyContext *s = ctx->priv;
+    const float *data = (float *)meta->data;
+
+    InferClassification *classify = av_mallocz(sizeof(*classify));
+    if (!classify)
+        return AVERROR(ENOMEM);
+
+    classify->detect_id = detect_id;
+
+    if (result_id == 0) {
+        // age
+        classify->name  = string_age;
+        classify->value = *data * 100.0;
+        dump_age(ctx, classify->value);
+    } else {
+        // gender: 0 - female, 1 - male
+        classify->name       = string_gender;
+        classify->label_id   = data[0] > data[1] ? 0 : 1;
+        classify->confidence = data[classify->label_id];
+        classify->label_buf  = av_buffer_ref(s->label_bufs[model_index]);
+        dump_gender(ctx, classify->label_id, classify->confidence);
+    }
+
+    av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
+
+    return 0;
+}
+
+static int query_formats(AVFilterContext *context)
+{
+    AVFilterFormats *formats_list;
+    const enum AVPixelFormat pixel_formats[] = {
+        AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV422P,  AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ444P,
+        AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV411P,  AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_BGR24,    AV_PIX_FMT_BGRA,     AV_PIX_FMT_NONE};
+
+    formats_list = ff_make_format_list(pixel_formats);
+    if (!formats_list) {
+        av_log(context, AV_LOG_ERROR, "could not create formats list\n");
+        return AVERROR(ENOMEM);
+    }
+
+    return ff_set_common_formats(context, formats_list);
+}
+
+static av_cold int classify_init(AVFilterContext *ctx)
+{
+    InferenceClassifyContext *s = ctx->priv;
+    int i, ret;
+    int model_num = 0, label_num = 0, name_num = 0;
+    const int max_num = MAX_MODEL_NUM;
+    char *names[MAX_MODEL_NUM]  = { };
+    char *models[MAX_MODEL_NUM] = { };
+    char *labels[MAX_MODEL_NUM] = { };
+    InferenceParam p = {};
+
+    av_assert0(s->model_file);
+
+    split(s->model_file, "&", models, &model_num, max_num);
+    for (i = 0; i < model_num; i++)
+        av_log(ctx, AV_LOG_INFO, "model[%d]:%s\n", i, models[i]);
+
+    split(s->labels, "&", labels, &label_num, max_num);
+    for (i = 0; i < label_num; i++)
+        av_log(ctx, AV_LOG_INFO, "label[%d]:%s\n", i, labels[i]);
+
+    split(s->names, "&", names, &name_num, max_num);
+    for (i = 0; i < name_num; i++)
+        av_log(ctx, AV_LOG_INFO, "name[%d]:%s\n", i, names[i]);
+
+    av_assert0(s->backend_type == DNN_INTEL_IE);
+
+    p.backend_type    = s->backend_type;
+    p.device_type     = s->device_type;
+    p.batch_size      = s->batch_size;
+    p.every_nth_frame = s->every_nth_frame;
+    p.input_precision = DNN_DATA_PRECISION_U8;
+    p.input_layout    = DNN_DATA_LAYOUT_NCHW;
+    p.input_is_image  = 1;
+
+    for (i = 0; i < model_num; i++) {
+        InferenceBaseContext *base = NULL;
+
+        p.model_file = models[i];
+        ret = ff_inference_base_create(ctx, &base, &p);
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_ERROR, "could not create inference\n");
+            return ret;
+        }
+
+        s->infer_bases[i] = base;
+    }
+    s->loaded_num = model_num;
+
+    for (i = 0; i < label_num; i++) {
+        int n, labels_num;
+        AVBufferRef *ref    = NULL;
+        LabelsArray *larray = NULL;
+        char buffer[4096]   = { };
+        char *_labels[100]  = { };
+
+        FILE *fp = fopen(labels[i], "rb");
+        if (!fp) {
+            av_log(ctx, AV_LOG_ERROR, "could not open file:%s\n", labels[i]);
+            ret = AVERROR(EIO);
+            goto fail;
+        }
+        fread(buffer, 1, sizeof(buffer) - 1, fp);
+        fclose(fp);
+
+        buffer[strcspn(buffer, "\n")] = 0;
+        split(buffer, ",", _labels, &labels_num, 100);
+
+        larray = av_mallocz(sizeof(*larray));
+        if (!larray) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        for (n = 0; n < labels_num; n++) {
+            char *l = av_strdup(_labels[n]);
+            av_dynarray_add(&larray->label, &larray->num, l);
+        }
+
+        ref = av_buffer_create((uint8_t *)larray, sizeof(*larray),
+                               &infer_labels_buffer_free, NULL, 0);
+        if (!ref) {
+            infer_labels_buffer_free(NULL, (uint8_t *)larray);
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        s->label_bufs[i] = ref;
+    }
+
+    for (i = 0; i < name_num; i++) {
+        s->name_array[i] = names[i];
+        if (strstr(names[i], "emotion"))
+            s->post_process[i] = &emotion_classify_result_process;
+        else if (strstr(names[i], "age") && strstr(names[i], "gend"))
+            s->post_process[i] = &age_gender_classify_result_process;
+    }
+
+    return 0;
+
+fail:
+    for (i = 0; i < model_num; i++) {
+        ff_inference_base_free(&s->infer_bases[i]);
+        if (s->label_bufs[i])
+            av_buffer_unref(&s->label_bufs[i]);
+    }
+
+    return ret;
+}
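
classify_init() reads each labels file as a single comma-separated line; the
strcspn() call cuts the buffer at the first newline, so only line one is used.
As an assumed example (no label files ship with this patch), a labels file
matching the emotions table in dump_emotion() above would contain:

    neutral,happy,sad,surprise,anger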
+static av_cold void classify_uninit(AVFilterContext *ctx)
+{
+    int i;
+    InferenceClassifyContext *s = ctx->priv;
+
+    for (i = 0; i < s->loaded_num; i++) {
+        ff_inference_base_free(&s->infer_bases[i]);
+        av_buffer_unref(&s->label_bufs[i]);
+    }
+}
+
+static int filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    int i, ret;
+    AVFilterContext *ctx         = inlink->dst;
+    InferenceClassifyContext *s  = ctx->priv;
+    AVFilterLink *outlink        = inlink->dst->outputs[0];
+    AVBufferRef *ref;
+    AVFrameSideData *sd, *new_sd;
+    BBoxesArray *boxes;
+    InferDetectionMeta *d_meta;
+    ClassifyArray *c_array;
+    InferClassificationMeta *c_meta;
+
+    sd = av_frame_get_side_data(in, AV_FRAME_DATA_INFERENCE_DETECTION);
+    if (!sd)
+        goto done;
+
+    d_meta = (InferDetectionMeta *)sd->data;
+    if (!d_meta) {
+        ret = AVERROR(EINVAL);
+        goto fail;
+    }
+
+    boxes = d_meta->bboxes;
+    if (!boxes || !boxes->num)
+        goto done;
+
+    c_meta  = av_mallocz(sizeof(*c_meta));
+    c_array = av_mallocz(sizeof(*c_array));
+    if (!c_meta || !c_array) {
+        av_freep(&c_meta);
+        av_freep(&c_array);
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    c_meta->c_array = c_array;
+
+    // handle detected boxes one by one
+    for (i = 0; i < boxes->num; i++) {
+        int j;
+        InferDetection *bbox = boxes->bbox[i];
+
+        // process with each model
+        for (j = 0; j < s->loaded_num; j++) {
+            int output;
+            InferenceBaseContext *base = s->infer_bases[j];
+
+            VideoPP *vpp        = ff_inference_base_get_vpp(base);
+            AVFrame *tmp        = vpp->frames[0];
+            DNNModelInfo *iinfo = ff_inference_base_get_input_info(base);
+            DNNModelInfo *oinfo = ff_inference_base_get_output_info(base);
+
+            ret = vpp->crop_and_scale(in,
+                                      bbox->x_min * in->width,
+                                      bbox->y_min * in->height,
+                                      bbox->x_max * in->width,
+                                      bbox->y_max * in->height,
+                                      iinfo->width[0],
+                                      iinfo->height[0],
+                                      tmp->data,
+                                      tmp->linesize);
+
+            // TODO: support dynamic batch for faces
+            ff_inference_base_submit_frame(base, tmp, 0, 0);
+            ff_inference_base_infer(base);
+
+            for (output = 0; output < oinfo->numbers; output++) {
+                InferTensorMeta tensor_meta = { };
+                ff_inference_base_get_infer_result(base, output, &tensor_meta);
+
+                if (s->post_process[j])
+                    s->post_process[j](ctx, i, output, j, &tensor_meta, c_meta);
+            }
+        }
+    }
+
+    ref = av_buffer_create((uint8_t *)c_meta, sizeof(*c_meta),
+                           &infer_classify_metadata_buffer_free, NULL, 0);
+    if (!ref) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    // attach classification metadata as frame side data
+    new_sd = av_frame_new_side_data_from_buf(in, AV_FRAME_DATA_INFERENCE_CLASSIFICATION, ref);
+    if (!new_sd) {
+        av_buffer_unref(&ref);
+        av_log(NULL, AV_LOG_ERROR, "could not add new side data\n");
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+done:
+    return ff_filter_frame(outlink, in);
+fail:
+    av_frame_free(&in);
+    return ret;
+}
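
A downstream filter or application can read the result back the same way
filter_frame() attached it. A sketch, assuming the InferClassificationMeta /
ClassifyArray / InferClassification layouts used above (their declarations
sit in the truncated inference.h portion of this patch):

    /* Sketch: iterate the classification list attached to a frame. */
    static void dump_classifications(const AVFrame *frame)
    {
        AVFrameSideData *sd =
            av_frame_get_side_data(frame, AV_FRAME_DATA_INFERENCE_CLASSIFICATION);
        InferClassificationMeta *meta;

        if (!sd)
            return;
        meta = (InferClassificationMeta *)sd->data;
        for (int i = 0; i < meta->c_array->num; i++) {
            InferClassification *c = meta->c_array->classifications[i];
            av_log(NULL, AV_LOG_INFO, "box:%d type:%s label:%d conf:%f\n",
                   c->detect_id, c->name, c->label_id, c->confidence);
        }
    }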
+static av_cold int config_input(AVFilterLink *inlink)
+{
+    int i, j;
+    AVFilterContext *ctx             = inlink->dst;
+    InferenceClassifyContext *s     = ctx->priv;
+    enum AVPixelFormat expect_format = AV_PIX_FMT_BGR24;
+    const AVPixFmtDescriptor *desc   = av_pix_fmt_desc_get(inlink->format);
+
+    for (i = 0; i < s->loaded_num; i++) {
+        InferenceBaseContext *base = s->infer_bases[i];
+        DNNModelInfo *info = ff_inference_base_get_input_info(base);
+        VideoPP *vpp       = ff_inference_base_get_vpp(base);
+
+        vpp->device = (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) ?
+                      VPP_DEVICE_HW : VPP_DEVICE_SW;
+
+        // allocate avframes to save preprocessed data
+        for (j = 0; j < info->numbers; j++) {
+            int ret;
+            AVFrame *frame = av_frame_alloc();
+            if (!frame)
+                return AVERROR(ENOMEM);
+
+            frame->format = expect_format;
+            frame->width  = info->width[j];
+            frame->height = info->height[j];
+
+            ret = av_frame_get_buffer(frame, 0);
+            if (ret < 0) {
+                av_frame_free(&frame);
+                return ret;
+            }
+            vpp->frames[j] = frame;
+        }
+    }
+
+    return 0;
+}
+
+static av_cold int config_output(AVFilterLink *outlink)
+{
+    return 0;
+}
+
+static const AVOption inference_classify_options[] = {
+    { "dnn_backend", "DNN backend for model execution", OFFSET(backend_type),    AV_OPT_TYPE_FLAGS,  { .i64 = DNN_INTEL_IE },          0,    2, FLAGS, "engine" },
+    { "models",      "path to model files for network", OFFSET(model_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0,    0, FLAGS },
+    { "labels",      "labels for classify",             OFFSET(labels),          AV_OPT_TYPE_STRING, { .str = NULL},                   0,    0, FLAGS },
+    { "names",       "classify type names",             OFFSET(names),           AV_OPT_TYPE_STRING, { .str = NULL},                   0,    0, FLAGS },
+    { "device",      "running on device type",          OFFSET(device_type),     AV_OPT_TYPE_FLAGS,  { .i64 = DNN_TARGET_DEVICE_CPU }, 0,   12, FLAGS },
+    { "interval",    "do infer every Nth frame",        OFFSET(every_nth_frame), AV_OPT_TYPE_INT,    { .i64 = 1 },                     0,   15, FLAGS},
+    { "batch_size",  "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },                     1, 1024, FLAGS},
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(inference_classify);
+
+static const AVFilterPad classify_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = config_input,
+        .filter_frame = filter_frame,
+    },
+    { NULL }
+};
+
+static const AVFilterPad classify_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = config_output,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_inference_classify = {
+    .name           = "classify",
+    .description    = NULL_IF_CONFIG_SMALL("DNN Inference classification."),
+    .priv_size      = sizeof(InferenceClassifyContext),
+    .query_formats  = query_formats,
+    .init           = classify_init,
+    .uninit         = classify_uninit,
+    .inputs         = classify_inputs,
+    .outputs        = classify_outputs,
+    .priv_class     = &inference_classify_class,
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
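
Since the filter only builds when configure finds libinference_engine,
applications may want to probe for it at runtime. A small sketch using the
public libavfilter API (names from this patch; nothing else assumed):

    #include <libavfilter/avfilter.h>

    /* Sketch: check whether the classify filter was compiled in. */
    static int have_classify_filter(void)
    {
        return avfilter_get_by_name("classify") != NULL;
    }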
diff --git a/libavfilter/vf_inference_detect.c b/libavfilter/vf_inference_detect.c
index 66aa494..8d676cf 100644
--- a/libavfilter/vf_inference_detect.c
+++ b/libavfilter/vf_inference_detect.c
@@ -114,11 +114,11 @@ static int face_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *meta, A
     AVFrameSideData *sd;
     InferDetectionMeta *detect_meta = NULL;
 
-    BBoxesArray *boxes = av_mallocz(sizeof(BBoxesArray));
+    BBoxesArray *boxes = av_mallocz(sizeof(*boxes));
     if (!boxes)
         return AVERROR(ENOMEM);
 
-    detect_meta = av_malloc(sizeof(InferDetectionMeta));
+    detect_meta = av_malloc(sizeof(*detect_meta));
     if (!detect_meta)
         return AVERROR(ENOMEM);
 
@@ -130,7 +130,7 @@ static int face_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *meta, A
     av_assert0(meta->total_bytes >= max_proposal_count * object_size * sizeof(float));
 
     for (i = 0; i < max_proposal_count; i++) {
-        InferDetection *new_bbox = av_mallocz(sizeof(InferDetection));
+        InferDetection *new_bbox = av_mallocz(sizeof(*new_bbox));
 
         new_bbox->label_id   = (int)detection[i * object_size + 1];
         new_bbox->confidence = detection[i * object_size + 2];
@@ -189,6 +189,23 @@ static int  logo_init(AVFilterContext *ctx, const char *args) {return 0;}
 static void logo_uninit(AVFilterContext *ctx) {}
 static int  logo_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *data, AVFrame *frame) { return 0; }
 
+static int detect_preprocess(InferenceBaseContext *base, int index, AVFrame *in, AVFrame **out)
+{
+    int ret;
+    VideoPP *vpp = ff_inference_base_get_vpp(base);
+    AVFrame *tmp = vpp->frames[index];
+
+    if (!vpp->scale_contexts[index]) {
+        *out = in;
+        return 0;
+    }
+
+    ret = vpp->swscale(vpp->scale_contexts[index], (const uint8_t * const *)in->data,
+                       in->linesize, 0, in->height, tmp->data, tmp->linesize);
+    *out = tmp;
+    return ret;
+}
+
 static int query_formats(AVFilterContext *context)
 {
     AVFilterFormats *formats_list;
@@ -331,6 +348,7 @@ static av_cold int detect_init(AVFilterContext *ctx)
     p.input_precision = DNN_DATA_PRECISION_U8;
     p.input_layout    = DNN_DATA_LAYOUT_NCHW;
     p.input_is_image  = 1;
+    p.preprocess      = &detect_preprocess;
 
     ret = ff_inference_base_create(ctx, &s->base, &p);
     if (ret < 0) {
@@ -369,7 +387,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     if (ret < 0)
         goto fail;
 
-    ret = ff_inference_base_get_infer_result(s->base, &tensor_meta);
+    ret = ff_inference_base_get_infer_result(s->base, 0, &tensor_meta);
     if (ret < 0)
         goto fail;
 
@@ -386,7 +404,7 @@ static const AVOption inference_detect_options[] = {
     { "model",      "path to model file for network",  OFFSET(model_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0,    0, FLAGS },
     { "device",     "running on device type",          OFFSET(device_type),     AV_OPT_TYPE_FLAGS,  { .i64 = DNN_TARGET_DEVICE_CPU }, 0,   12, FLAGS },
     { "interval",   "detect every Nth frame",          OFFSET(every_nth_frame), AV_OPT_TYPE_INT,    { .i64 = 1 },                     0,   15, FLAGS},
-    { "batch_size", "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },                     0, 1024, FLAGS},
+    { "batch_size", "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },                     1, 1024, FLAGS},
     { "threshold",  "threshold to filter output data", OFFSET(threshold),       AV_OPT_TYPE_FLOAT,  { .dbl = 0.5},                    0,    1, FLAGS},
 
     { "name", "detection type name", OFFSET(name), AV_OPT_TYPE_STRING, .flags = FLAGS, "detection" },
diff --git a/libavutil/frame.c b/libavutil/frame.c
index 0b228cd..1866d85 100644
--- a/libavutil/frame.c
+++ b/libavutil/frame.c
@@ -383,12 +383,16 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
     for (i = 0; i < src->nb_side_data; i++) {
+        int keep_ref = 0;
         const AVFrameSideData *sd_src = src->side_data[i];
         AVFrameSideData *sd_dst;
         if (   sd_src->type == AV_FRAME_DATA_PANSCAN
             && (src->width != dst->width || src->height != dst->height))
             continue;
-        if (force_copy) {
+        if (sd_src->type == AV_FRAME_DATA_INFERENCE_CLASSIFICATION ||
+            sd_src->type == AV_FRAME_DATA_INFERENCE_DETECTION)
+            keep_ref = 1;
+        if (force_copy && !keep_ref) {
             sd_dst = av_frame_new_side_data(dst, sd_src->type,
                                             sd_src->size);
             if (!sd_dst) {
@@ -836,6 +840,7 @@ const char *av_frame_side_data_name(enum AVFrameSideDataType type)
     case AV_FRAME_DATA_S12M_TIMECODE:            return "SMPTE 12-1 timecode";
     case AV_FRAME_DATA_SPHERICAL:                return "Spherical Mapping";
     case AV_FRAME_DATA_ICC_PROFILE:              return "ICC profile";
+    case AV_FRAME_DATA_INFERENCE_CLASSIFICATION: return "Inference classification metadata";
    case AV_FRAME_DATA_INFERENCE_DETECTION:      return "Inference detection metadata";
 #if FF_API_FRAME_QP
     case AV_FRAME_DATA_QP_TABLE_PROPERTIES:      return "QP table properties";
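
The keep_ref special case is the "trick" from the commit message:
av_frame_copy_props() now propagates inference side data by reference instead
of deep-copying it, so a copied frame shares one AVBufferRef with the source
and the metadata must be treated as read-only. A sketch of the observable
effect, using only public frame APIs:

    /* Sketch: verify two frames share one inference metadata buffer after
     * av_frame_copy_props(), rather than each holding a private copy. */
    static int inference_meta_is_shared(const AVFrame *a, const AVFrame *b)
    {
        AVFrameSideData *sa =
            av_frame_get_side_data(a, AV_FRAME_DATA_INFERENCE_DETECTION);
        AVFrameSideData *sb =
            av_frame_get_side_data(b, AV_FRAME_DATA_INFERENCE_DETECTION);

        return sa && sb && sa->buf && sb->buf && sa->buf->data == sb->buf->data;
    }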
return "Inference detection metadata"; #if FF_API_FRAME_QP case AV_FRAME_DATA_QP_TABLE_PROPERTIES: return "QP table properties"; diff --git a/libavutil/frame.h b/libavutil/frame.h index 2dcf8da..a7e5caa 100644 --- a/libavutil/frame.h +++ b/libavutil/frame.h @@ -142,6 +142,8 @@ enum AVFrameSideDataType { */ AV_FRAME_DATA_ICC_PROFILE, + AV_FRAME_DATA_INFERENCE_CLASSIFICATION, + AV_FRAME_DATA_INFERENCE_DETECTION, #if FF_API_FRAME_QP -- 2.7.4