From ca12fa90c7eef79992dffc1dcc6edfec5e94b673 Mon Sep 17 00:00:00 2001
From: Lin Xie <lin.xie@intel.com>
Date: Tue, 23 Apr 2019 15:41:30 +0800
Subject: [PATCH] Change IE filters to use model proc

* dnn interfaces changed to match refined IE APIs
* enable JSON file reader based on libcjson
* removed obsolete parameters in detect filter

TODO:
* clean up parameters in the classify filter once
  face reidentify is changed to use model proc.
* use model proc to config input color format

Change-Id: I6cd78390b9e3bf4726607ceecf36d10e93609eb6
Signed-off-by: Lin Xie <lin.xie@intel.com>
---
 libavfilter/dnn_backend_intel_ie.c  |  76 +++-----
 libavfilter/dnn_data.h              |  12 +-
 libavfilter/inference.c             | 304 ++++++++++++++++++++++++++++++--
 libavfilter/inference.h             |  49 +++++-
 libavfilter/vf_inference_classify.c | 337 +++++++++++++++++++++++++++---------
 libavfilter/vf_inference_detect.c   | 107 +++++-------
 6 files changed, 659 insertions(+), 226 deletions(-)

diff --git a/libavfilter/dnn_backend_intel_ie.c b/libavfilter/dnn_backend_intel_ie.c
index a600ee6..561cc15 100644
--- a/libavfilter/dnn_backend_intel_ie.c
+++ b/libavfilter/dnn_backend_intel_ie.c
@@ -241,19 +241,20 @@ static DNNReturnType get_input_info_intel_ie(void *model, DNNModelInfo *info)
 
     IEGetModelInputInfo(ie_model->context, ie_model->input_infos);
 
-    if (ie_model->input_infos->numbers > DNN_INPUT_OUTPUT_NUM)
+    if (ie_model->input_infos->number > DNN_INPUT_OUTPUT_NUM)
         return DNN_ERROR;
 
-    for (id = 0; id < ie_model->input_infos->numbers; id++) {
-        info->width[id]     = ie_model->input_infos->width[id];
-        info->height[id]    = ie_model->input_infos->height[id];
-        info->channels[id]  = ie_model->input_infos->channels[id];
-        info->precision[id] = get_dnn_precision(ie_model->input_infos->precision[id]);
-        info->layout[id]    = get_dnn_layout(ie_model->input_infos->layout[id]);
-        info->is_image[id]  = 0;
+    for (id = 0; id < ie_model->input_infos->number; id++) {
+        memcpy(&info->dims[id][0],
+               &ie_model->input_infos->tensorMeta[id].dims[0],
+               4 * sizeof(info->dims[id][0]));
+
+        info->layer_name[id] = ie_model->input_infos->tensorMeta[id].layer_name;
+        info->precision[id]  = get_dnn_precision(ie_model->input_infos->tensorMeta[id].precision);
+        info->layout[id]     = get_dnn_layout(ie_model->input_infos->tensorMeta[id].layout);
     }
     info->batch_size = ie_model->input_infos->batch_size;
-    info->numbers    = ie_model->input_infos->numbers;
+    info->number     = ie_model->input_infos->number;
 
     return DNN_SUCCESS;
 }
@@ -263,15 +264,15 @@ static DNNReturnType set_input_info_intel_ie(void *model, DNNModelInfo *info)
     int id = 0;
     DNNIntelIEModel *ie_model = (DNNIntelIEModel *)model;
 
-    if (!model || !info || info->numbers > DNN_INPUT_OUTPUT_NUM)
+    if (!model || !info || info->number > DNN_INPUT_OUTPUT_NUM)
         return DNN_ERROR;
 
-    for (id = 0; id < info->numbers; id++) {
-        ie_model->input_infos->precision[id] = get_precision(info->precision[id]);
-        ie_model->input_infos->layout[id]    = get_layout(info->layout[id]);
-        ie_model->input_infos->dataType[id]  = info->is_image[id];
-    }
-    ie_model->input_infos->numbers = info->numbers;
+    // image set to input 0
+    ie_model->input_infos->tensorMeta[0].precision = get_precision(info->precision[id]);
+    ie_model->input_infos->tensorMeta[0].layout    = get_layout(info->layout[id]);
+    ie_model->input_infos->tensorMeta[0].dataType  = info->is_image[id];
+
+    ie_model->input_infos->number = info->number;
 
     IESetModelInputInfo(ie_model->context, ie_model->input_infos);
 
@@ -288,42 +289,20 @@ static DNNReturnType get_output_info_intel_ie(void *model, DNNModelInfo *info)
 
     IEGetModelOutputInfo(ie_model->context, ie_model->output_infos);
 
-    if (ie_model->output_infos->numbers > DNN_INPUT_OUTPUT_NUM)
+    if (ie_model->output_infos->number > DNN_INPUT_OUTPUT_NUM)
         return DNN_ERROR;
 
-    for (id = 0; id < ie_model->output_infos->numbers; id++) {
-        info->width[id]     = ie_model->output_infos->width[id];
-        info->height[id]    = ie_model->output_infos->height[id];
-        info->channels[id]  = ie_model->output_infos->channels[id];
-        info->precision[id] = get_dnn_precision(ie_model->output_infos->precision[id]);
-        info->layout[id]    = get_dnn_layout(ie_model->output_infos->layout[id]);
-        info->is_image[id]  = 0;
-    }
-    info->batch_size = ie_model->output_infos->batch_size;
-    info->numbers    = ie_model->output_infos->numbers;
-
-    return DNN_SUCCESS;
-}
+    for (id = 0; id < ie_model->output_infos->number; id++) {
+        memcpy(&info->dims[id][0],
+               &ie_model->output_infos->tensorMeta[id].dims[0],
+               4 * sizeof(info->dims[id][0]));
 
-static DNNReturnType set_output_info_intel_ie(void *model, DNNModelInfo *info)
-{
-    int id = 0;
-    DNNIntelIEModel *ie_model = (DNNIntelIEModel *)model;
-
-    if (!model || !info)
-        return DNN_ERROR;
-
-    if (info->numbers > DNN_INPUT_OUTPUT_NUM)
-        return DNN_ERROR;
-
-    for (id = 0; id < info->numbers; id++) {
-        ie_model->output_infos->precision[id] = get_precision(info->precision[id]);
-        ie_model->output_infos->layout[id]    = get_layout(info->layout[id]);
-        ie_model->output_infos->dataType[id]  = info->is_image[id];
+        info->layer_name[id] = ie_model->output_infos->tensorMeta[id].layer_name;
+        info->precision[id]  = get_dnn_precision(ie_model->output_infos->tensorMeta[id].precision);
+        info->layout[id]     = get_dnn_layout(ie_model->output_infos->tensorMeta[id].layout);
     }
-    ie_model->output_infos->numbers = info->numbers;
-
-    IESetModelOutputInfo(ie_model->context, ie_model->output_infos);
+    info->batch_size = ie_model->output_infos->batch_size;
+    info->number     = ie_model->output_infos->number;
 
     return DNN_SUCCESS;
 }
@@ -407,7 +386,6 @@ DNNModel* ff_dnn_load_model_intel_ie(void *config)
     model->get_input_info     = &get_input_info_intel_ie;
     model->set_input_info     = &set_input_info_intel_ie;
     model->get_output_info    = &get_output_info_intel_ie;
-    model->set_output_info    = &set_output_info_intel_ie;
     model->create_model       = &create_model_intel_ie;
 
     return model;
diff --git a/libavfilter/dnn_data.h b/libavfilter/dnn_data.h
index 0add8ee..7b7f4a5 100644
--- a/libavfilter/dnn_data.h
+++ b/libavfilter/dnn_data.h
@@ -21,6 +21,7 @@
 #define AVFILTER_DNN_DATA_H
 
 #include <stdint.h>
+#include <stddef.h>
 
 /**
 * @enum TargetDevice
@@ -136,17 +137,18 @@ typedef struct DNNIOData {
 * @struct model input info
 * @brief model input info
 */
-#define DNN_INPUT_OUTPUT_NUM 10
+#define DNN_INPUT_OUTPUT_NUM 8
 typedef struct DNNModelInfo {
-    unsigned int width[DNN_INPUT_OUTPUT_NUM];
-    unsigned int height[DNN_INPUT_OUTPUT_NUM];
-    unsigned int channels[DNN_INPUT_OUTPUT_NUM];
+    char  *layer_name[DNN_INPUT_OUTPUT_NUM];
+    size_t       dims[DNN_INPUT_OUTPUT_NUM][4];
+
     DNNDataPrecisionType precision[DNN_INPUT_OUTPUT_NUM];
     DNNDataLayoutType layout[DNN_INPUT_OUTPUT_NUM];
+
     // 0 non-image; 1 image.
     unsigned int is_image[DNN_INPUT_OUTPUT_NUM];
     unsigned int batch_size;
-    unsigned int numbers;
+    unsigned int number;
 } DNNModelInfo;
 
 /**
diff --git a/libavfilter/inference.c b/libavfilter/inference.c
index 596c5b4..380a5ed 100644
--- a/libavfilter/inference.c
+++ b/libavfilter/inference.c
@@ -33,6 +33,10 @@
 
 #include "inference.h"
 
+#if CONFIG_LIBCJSON
+#include <cjson/cJSON.h>
+#endif
+
 #if CONFIG_VAAPI
 #define VA_CALL(_FUNC)                                     \
     {                                                      \
@@ -46,7 +50,7 @@
     }
 #endif
 
-struct InferenceBaseContext
+struct _InferenceBaseContext
 {
     char *infer_type;
     int   batch_size;
@@ -68,6 +72,50 @@ static int va_vpp_crop_and_scale(VAAPIVpp *va_vpp, AVFrame *input, Rect *crop_re
 static int va_vpp_scale(VAAPIVpp *va_vpp, AVFrame *input,
         int scale_w, int scale_h, uint8_t *data[],  int stride[]);
 
+// AVBuffer free callback: release each label string, the label array, then the container.
+static void infer_labels_buffer_free(void *opaque, uint8_t *data)
+{
+    LabelsArray *labels = (LabelsArray *)data;
+
+    for (int i = 0; i < labels->num; i++)
+        av_freep(&labels->label[i]);
+    av_freep(&labels->label);   // pointer array grown by av_dynarray_add()
+    av_free(data);
+}
+
+// debug helper: print every label of the LabelsArray behind data to stdout on one line
+static void infer_labels_dump(uint8_t *data)
+{
+    int i;
+    LabelsArray *labels = (LabelsArray *)data;
+    printf("labels: ");
+    for (i = 0; i < labels->num; i++)
+        printf("%s ", labels->label[i]);
+    printf("\n");
+}
+
+// Return the size of fp in bytes, or -1 on failure; the read position is restored.
+int ff_get_file_size(FILE *fp)
+{
+    long file_size, current_pos;
+
+    if (!fp || (current_pos = ftell(fp)) < 0)
+        return -1;
+
+    if (fseek(fp, 0, SEEK_END)) {
+        fprintf(stderr, "Couldn't seek to the end of feature file.\n");
+        return -1;
+    }
+
+    file_size = ftell(fp);
+
+    // ftell() yields long: fail on rewind errors and on sizes that do not fit in int
+    if (fseek(fp, current_pos, SEEK_SET) || file_size != (int)file_size)
+        return -1;
+    return (int)file_size;
+}
+
+
 static int fill_dnn_data_from_frame(DNNIOData *data,
                                     const AVFrame *frame,
                                     int batch_idx,
@@ -314,7 +362,7 @@ int ff_inference_base_create(AVFilterContext *ctx,
     DNN_ERR_CHECK(ctx);
 
     info = &s->input_info;
-    for (i = 0; i < info->numbers; i++) {
+    for (i = 0; i < info->number; i++) {
         info->layout[i]    = param->input_layout;
         info->precision[i] = param->input_precision;
         info->is_image[i]  = param->input_is_image;
@@ -387,7 +435,6 @@ int ff_inference_base_submit_frame(InferenceBaseContext *base,
     DNNIOData input = { };
     fill_dnn_data_from_frame(&input, frame, batch_idx, 1, input_idx);
     base->model->set_input(base->model->model, &input);
-
 #if CONFIG_VAAPI
     if (base->vpp.va_vpp)
         va_vpp_surface_release(base->vpp.va_vpp);
@@ -410,7 +457,7 @@ int ff_inference_base_filter_frame(InferenceBaseContext *base, AVFrame *in)
     DNNReturnType dnn_ret;
     DNNIOData input = { };
 
-    for (int i = 0; i < info->numbers; i++) {
+    for (int i = 0; i < info->number; i++) {
         AVFrame *processed_frame;
         for (int j = 0; j < base->batch_size; j++) {
             if (base->preprocess) {
@@ -433,7 +480,7 @@ int ff_inference_base_filter_frame(InferenceBaseContext *base, AVFrame *in)
 }
 
 int ff_inference_base_get_infer_result(InferenceBaseContext *base,
-                                       int output_index,
+                                       int id,
                                        InferTensorMeta *metadata)
 {
     DNNModelInfo *info = &base->output_info;
@@ -443,18 +490,17 @@ int ff_inference_base_get_infer_result(InferenceBaseContext *base,
     av_assert0(metadata != NULL);
 
     // TODO: change to layer name for multiple outputs
-    data.in_out_idx = output_index;
+    data.in_out_idx = id;
 
     ret = base->model->get_execute_result(base->model->model, &data);
     av_assert0(ret == DNN_SUCCESS);
 
-    //TODO: refine by new interface
-    metadata->dim_size  = 3;
-    metadata->dims[0]   = info->width[0];
-    metadata->dims[1]   = info->height[0];
-    metadata->dims[2]   = info->channels[0];
-    metadata->layout    = info->layout[0];
-    metadata->precision = info->precision[0];
+    metadata->dim_size  = 4;
+    memcpy(&metadata->dims[0], &info->dims[id][0],
+            metadata->dim_size * sizeof(metadata->dims[0]));
+
+    metadata->layout    = info->layout[id];
+    metadata->precision = info->precision[id];
 
     metadata->data        = data.data[0];
     metadata->total_bytes = data.size;
@@ -477,11 +523,23 @@ VideoPP* ff_inference_base_get_vpp(InferenceBaseContext *base)
     return &base->vpp;
 }
 
-/********************************************
- *                                          *
- *              VAAPI VPP APIs              *
- *                                          *
- *******************************************/
+// Log (at debug level) name, batch size, dims and flags of every tensor described by info.
+void ff_inference_dump_model_info(void *ctx, DNNModelInfo *info)
+{
+    for (unsigned i = 0; i < info->number; i++) {
+        const size_t *p = &info->dims[i][0];
+        av_log(ctx, AV_LOG_DEBUG, "Info id:%u layer\"%-16s\" "
+               "batch size:%u - dim: %3zu %3zu %3zu %3zu - img:%u pre:%d layout:%d\n",
+               i, info->layer_name[i],
+               info->batch_size, p[0], p[1], p[2], p[3],
+               info->is_image[i], info->precision[i], info->layout[i]);
+    }
+}
+
+/*
+ * VAAPI VPP APIs
+ */
+
 #if CONFIG_VAAPI
 static int ff_vaapi_vpp_colour_standard(enum AVColorSpace av_cs)
 {
@@ -733,3 +791,213 @@ static int va_vpp_crop_and_scale(VAAPIVpp *va_vpp,
     return VA_STATUS_SUCCESS;
 }
 #endif
+
+#if CONFIG_LIBCJSON
+/* model proc parsing functions using cJSON */
+/* json_print: debug helper that pretty-prints a cJSON tree to stdout */
+static inline void json_print(cJSON *j)
+{
+    char *string = cJSON_Print(j);
+    if (string)
+        printf("%s\n", string);
+    cJSON_free(string); /* cJSON_Print() returns an allocated string */
+}
+
+// Read and parse the model-proc JSON file at path.
+// Returns the cJSON tree (released by ff_release_model_proc()) or NULL on any failure.
+void *ff_read_model_proc(const char *path)
+{
+    int file_size;
+    cJSON *proc_config = NULL;
+    char *proc_json = NULL;
+    FILE *fp = fopen(path, "rb");
+    if (!fp) {
+        fprintf(stderr, "File open error:%s\n", path);
+        return NULL;
+    }
+
+    // an empty file or a failed size query leaves nothing to parse
+    file_size = ff_get_file_size(fp);
+    if (file_size <= 0)
+        goto end;
+
+    // one extra zeroed byte: cJSON_Parse() needs a NUL-terminated string
+    proc_json = av_mallocz((size_t)file_size + 1);
+    if (!proc_json || fread(proc_json, file_size, 1, fp) != 1)
+        goto end;
+
+    proc_config = cJSON_Parse(proc_json);
+    if (proc_config == NULL) {
+        const char *error_ptr = cJSON_GetErrorPtr();
+        if (error_ptr != NULL)
+            fprintf(stderr, "Error before: %s\n", error_ptr);
+    }
+
+end:
+    av_freep(&proc_json);
+    fclose(fp);
+    return proc_config;
+}
+
+void ff_load_default_model_proc(ModelInputPreproc *preproc, ModelOutputPostproc *postproc)
+{
+    if (preproc) {
+        /*
+         * The format is a little tricky: the ideal input format for IE is BGR
+         * planar, but neither soft CSC nor hardware VPP supports that format, so
+         * a close software format is set here. The format actually converted to
+         * before sending to IE depends on user config and on whether VPP is used.
+         */
+        preproc->color_format = AV_PIX_FMT_BGR24;
+        preproc->layer_name   = NULL;
+    }
+
+    if (postproc) {
+        // no default post-processing settings are applied
+    }
+}
+
+int ff_parse_input_preproc(const void *json, ModelInputPreproc *m_preproc)
+{
+    // start from NULL: the loop below never runs for an empty "input_preproc" array
+    cJSON *item, *preproc, *color = NULL, *layer = NULL, *object_class = NULL;
+    preproc = cJSON_GetObjectItem(json, "input_preproc");
+    if (preproc == NULL) {
+        av_log(NULL, AV_LOG_DEBUG, "No input_preproc.\n");
+        return 0;
+    }
+
+    // not support multiple inputs yet
+    av_assert0(cJSON_GetArraySize(preproc) <= 1);
+
+    cJSON_ArrayForEach(item, preproc)
+    {
+        color = cJSON_GetObjectItemCaseSensitive(item, "color_format");
+        layer = cJSON_GetObjectItemCaseSensitive(item, "layer_name");
+        object_class = cJSON_GetObjectItemCaseSensitive(item, "object_class");
+    }
+
+    if (color) {
+        if (!cJSON_IsString(color) || (color->valuestring == NULL))
+            return -1;
+
+        av_log(NULL, AV_LOG_INFO, "Color Format:\"%s\"\n", color->valuestring);
+
+        if (!strcmp(color->valuestring, "BGR"))
+            m_preproc->color_format = AV_PIX_FMT_BGR24;
+        else if (!strcmp(color->valuestring, "RGB"))
+            m_preproc->color_format = AV_PIX_FMT_RGB24;
+        else
+            return -1;
+    }
+
+    if (object_class) {
+        if (!cJSON_IsString(object_class) || (object_class->valuestring == NULL))
+            return -1;
+
+        av_log(NULL, AV_LOG_INFO, "Object_class:\"%s\"\n", object_class->valuestring);
+
+        m_preproc->object_class = object_class->valuestring;
+    }
+
+    UNUSED(layer);
+
+    return 0;
+}
+
+// Fill m_postproc from the "output_postproc" array of the model-proc JSON.
+// Returns 0 on success (also when the array is absent) or AVERROR(ENOMEM).
+int ff_parse_output_postproc(const void *json, ModelOutputPostproc *m_postproc)
+{
+    size_t index = 0;
+    cJSON *item, *postproc;
+    cJSON *attribute, *converter, *labels, *layer, *method, *threshold;
+    cJSON *tensor2text_scale, *tensor2text_precision;
+
+    postproc = cJSON_GetObjectItem(json, "output_postproc");
+    if (postproc == NULL) {
+        av_log(NULL, AV_LOG_DEBUG, "No output_postproc.\n");
+        return 0;
+    }
+
+    av_assert0(cJSON_GetArraySize(postproc) <= MAX_MODEL_OUTPUT);
+    cJSON_ArrayForEach(item, postproc)
+    {
+        OutputPostproc *proc = &m_postproc->procs[index];
+
+#define FETCH_STRING(var, name)                                  \
+        do { var = cJSON_GetObjectItemCaseSensitive(item, #name);\
+            if (var) proc->name = var->valuestring;              \
+        } while(0)
+#define FETCH_DOUBLE(var, name)                                  \
+        do { var = cJSON_GetObjectItemCaseSensitive(item, #name);\
+            if (var) proc->name = var->valuedouble;              \
+        } while(0)
+#define FETCH_INTEGER(var, name)                                 \
+        do { var = cJSON_GetObjectItemCaseSensitive(item, #name);\
+            if (var) proc->name = var->valueint;                 \
+        } while(0)
+
+        FETCH_STRING(layer, layer_name);
+        FETCH_STRING(method, method);
+        FETCH_STRING(attribute, attribute_name);
+        FETCH_STRING(converter, converter);
+
+        FETCH_DOUBLE(threshold, threshold);
+        FETCH_DOUBLE(tensor2text_scale, tensor2text_scale);
+
+        FETCH_INTEGER(tensor2text_precision, tensor2text_precision);
+
+        // copy the labels into a ref-counted LabelsArray owned by proc->labels
+        labels = cJSON_GetObjectItemCaseSensitive(item, "labels");
+        if (labels) {
+            cJSON *label;
+            size_t labels_num = cJSON_GetArraySize(labels);
+
+            if (labels_num > 0) {
+                AVBufferRef *ref    = NULL;
+                LabelsArray *larray = av_mallocz(sizeof(*larray));
+                if (!larray)
+                    return AVERROR(ENOMEM);
+                cJSON_ArrayForEach(label, labels) {
+                    // non-string entries become "" so label indices stay aligned
+                    char *l = av_strdup(label->valuestring ? label->valuestring : "");
+                    av_dynarray_add(&larray->label, &larray->num, l);
+                }
+                ref = av_buffer_create((uint8_t *)larray, sizeof(*larray),
+                        &infer_labels_buffer_free, NULL, 0);
+                if (!ref) {
+                    infer_labels_buffer_free(NULL, (uint8_t *)larray);
+                    return AVERROR(ENOMEM);
+                }
+                proc->labels = ref;
+                infer_labels_dump(ref->data);
+            }
+        }
+
+        index++;
+    }
+#undef FETCH_STRING
+#undef FETCH_DOUBLE
+#undef FETCH_INTEGER
+
+    return 0;
+}
+
+void ff_release_model_proc(const void *json,
+        ModelInputPreproc *preproc, ModelOutputPostproc *postproc)
+{
+    size_t index = 0;
+
+    if (!json) return;
+
+    if (postproc) {
+        for (index = 0; index < MAX_MODEL_OUTPUT; index++) {
+            if (postproc->procs[index].labels)
+                av_buffer_unref(&postproc->procs[index].labels);
+        }
+    }
+
+    cJSON_Delete((cJSON *)json);
+}
+#endif
diff --git a/libavfilter/inference.h b/libavfilter/inference.h
index d0515e2..0512403 100644
--- a/libavfilter/inference.h
+++ b/libavfilter/inference.h
@@ -32,10 +32,15 @@
 
 #include "dnn_interface.h"
 
-typedef struct InferenceBaseContext InferenceBaseContext;
+typedef struct _InferenceBaseContext InferenceBaseContext;
+typedef struct _InputPreproc         ModelInputPreproc;
+typedef struct _OutputPostproc       OutputPostproc;
+typedef struct _ModelOutputPostproc  ModelOutputPostproc;
 
 typedef int (*InferencePreProcess)(InferenceBaseContext *base, int index, AVFrame *in, AVFrame **out);
 
+#define UNUSED(x) (void)(x)
+
 typedef struct InferenceParam {
     char  *model_file;
     char  *labels_file;
@@ -116,14 +121,36 @@ struct _SwVpp {
 typedef struct VideoPP {
     int       device;
     int       expect_format;
-    AVFrame  *frames[MAX_VPP_NUM];  //<! frames to save vpp output
+    AVFrame  *frames[MAX_VPP_NUM];  ///<! frames to save vpp output
     SwVpp    *sw_vpp;
 #if CONFIG_VAAPI
     VAAPIVpp *va_vpp;
 #endif
 } VideoPP;
 
-#define MAX_TENSOR_DIM_NUM 8
+struct _InputPreproc {
+    int   color_format;     ///<! input data format
+    char *layer_name;       ///<! layer name of input
+    char *object_class;     ///<! interested object class
+};
+
+struct _OutputPostproc {
+    char  *layer_name;
+    char  *converter;
+    char  *attribute_name;
+    char  *method;
+    double threshold;
+    double tensor2text_scale;
+    int    tensor2text_precision;
+    AVBufferRef *labels;
+};
+
+#define MAX_MODEL_OUTPUT 4
+struct _ModelOutputPostproc {
+    OutputPostproc procs[MAX_MODEL_OUTPUT];
+};
+
+#define MAX_TENSOR_DIM_NUM 4
 typedef struct InferTensorMeta {
     size_t  dim_size;
     size_t  dims[MAX_TENSOR_DIM_NUM];
@@ -207,6 +234,8 @@ DNNModelInfo* ff_inference_base_get_input_info(InferenceBaseContext *base);
 DNNModelInfo* ff_inference_base_get_output_info(InferenceBaseContext *base);
 VideoPP*      ff_inference_base_get_vpp(InferenceBaseContext *base);
 
+void ff_inference_dump_model_info(void *ctx, DNNModelInfo *info);
+
 #if CONFIG_VAAPI
 int va_vpp_device_create(VAAPIVpp *ctx, AVFilterLink *inlink);
 
@@ -217,4 +246,18 @@ int va_vpp_surface_alloc(VAAPIVpp *ctx, size_t width, size_t height, const char
 int va_vpp_surface_release(VAAPIVpp *ctx);
 #endif
 
+int ff_get_file_size(FILE *fp);
+
+#if CONFIG_LIBCJSON
+void *ff_read_model_proc(const char *path);
+
+void ff_load_default_model_proc(ModelInputPreproc *preproc, ModelOutputPostproc *postproc);
+
+int ff_parse_input_preproc(const void *json, ModelInputPreproc *m_preproc);
+
+int ff_parse_output_postproc(const void *json, ModelOutputPostproc *m_postproc);
+
+void ff_release_model_proc(const void *json, ModelInputPreproc *preproc, ModelOutputPostproc *postproc);
+#endif
+
 #endif
diff --git a/libavfilter/vf_inference_classify.c b/libavfilter/vf_inference_classify.c
index 0681a2c..283f9f4 100644
--- a/libavfilter/vf_inference_classify.c
+++ b/libavfilter/vf_inference_classify.c
@@ -44,9 +44,6 @@
 #define MAX_MODEL_NUM 8
 #define FACE_FEATURE_VECTOR_LEN 256
 
-static char string_age[]    = "age";
-static char string_gender[] = "gender";
-
 typedef int (*ClassifyInit)(AVFilterContext *ctx, size_t index);
 
 typedef int (*ClassifyUnInit)(AVFilterContext *ctx, size_t index);
@@ -61,7 +58,9 @@ typedef struct InferenceClassifyContext {
 
     char  *labels;
     char  *names;
+
     char  *model_file;
+    char  *model_proc;
     char  *vpp_format;
     char  *feature_file;    ///< binary feature file for face identification
     int    feature_num;     ///< identification face feature number
@@ -81,6 +80,10 @@ typedef struct InferenceClassifyContext {
     ClassifyInit    init[MAX_MODEL_NUM];
     ClassifyUnInit  uninit[MAX_MODEL_NUM];
     ClassifyProcess post_process[MAX_MODEL_NUM];
+
+    void *proc_config[MAX_MODEL_NUM];
+    ModelInputPreproc   model_preproc[MAX_MODEL_NUM];
+    ModelOutputPostproc model_postproc[MAX_MODEL_NUM];
 } InferenceClassifyContext;
 
 typedef struct FaceIdentifyContext {
@@ -118,95 +121,189 @@ static void infer_classify_metadata_buffer_free(void *opaque, uint8_t *data)
     av_free(data);
 }
 
-static av_cold void dump_emotion(AVFilterContext *ctx, int label_id,
+static av_cold void dump_softmax(AVFilterContext *ctx, char *name, int label_id,
                                  float conf, AVBufferRef *label_buf)
 {
     LabelsArray *array = (LabelsArray *)label_buf->data;
 
-    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Label id:%d Emotion:%s Conf:%f\n",
-           label_id, array->label[label_id], conf);
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Label id:%d %s:%s Conf:%f\n",
+           label_id, name, array->label[label_id], conf);
 }
 
-static int emotion_classify_result_process(AVFilterContext *ctx,
-                                           int detect_id,
-                                           int result_id,
-                                           int model_index,
-                                           InferTensorMeta *meta,
-                                           InferClassificationMeta *c_meta)
+static av_cold void dump_tensor_value(AVFilterContext *ctx, char *name, float value)
 {
-    int i, label_id = 0;
-    InferenceClassifyContext *s = ctx->priv;
-    const float *emo_confidence = (float *)meta->data;
-    size_t labels_num           = meta->dims[2];
-    float max                   = emo_confidence[0];
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - %s:%1.2f\n", name, value);
+}
+
+static void find_max_element_index(const float *array, int len,
+                                   int *index, float *value)
+{
+    int i;
+    *index = 0;
+    *value = array[0];
+    for (i = 1; i < len; i++) {
+        if (array[i] > *value) {
+            *index = i;
+            *value = array[i];
+        }
+    }
+}
 
-    InferClassification *classify = av_mallocz(sizeof(*classify));
+static int attributes_to_text(AVFilterContext *ctx,
+                              int detect_id,
+                              OutputPostproc *proc,
+                              InferTensorMeta *meta,
+                              InferClassificationMeta *c_meta)
+{
+    InferClassification *classify;
+    uint32_t method_max, method_compound, method_index;
+    const float *data = (const float *)meta->data;
+
+    // a proc without "method" cannot be dispatched; bail out before strcmp()
+    if (!data || !proc->method) return -1;
+    method_max      = !strcmp(proc->method, "max");
+    method_compound = !strcmp(proc->method, "compound");
+    method_index    = !strcmp(proc->method, "index");
+
+    classify = av_mallocz(sizeof(*classify));
     if (!classify)
         return AVERROR(ENOMEM);
 
-    // Get the emotion with max confidence
-    for (i = 1; i < labels_num; i++)
-        if (emo_confidence[i] > max) { max = emo_confidence[i]; label_id = i; }
+    if (method_max) {
+        int    index;
+        float  confidence;
+        size_t n = meta->dims[1];
+
+        find_max_element_index(data, n, &index, &confidence);
+
+        classify->detect_id  = detect_id;
+        classify->name       = proc->attribute_name;
+        classify->label_id   = index;
+        classify->confidence = confidence;
+        classify->label_buf  = av_buffer_ref(proc->labels);
+
+        dump_softmax(ctx, classify->name, classify->label_id,
+                     classify->confidence,classify->label_buf);
+    } else if (method_compound) {
+        int i;
+        double threshold  = 0.5;
+        float  confidence = 0;
+        char attributes[4096] = {};
+        LabelsArray *array;
+
+        if (proc->threshold != 0)
+            threshold = proc->threshold;
+
+        array = (LabelsArray *)proc->labels->data;
+        for (i = 0; i < array->num; i++) {
+            if (data[i] >= threshold)
+                strncat(attributes, array->label[i], sizeof(attributes) - strlen(attributes) - 1);
+            if (data[i] > confidence)
+                confidence = data[i];
+        }
 
-    classify->detect_id  = detect_id;
-    classify->name       = s->name_array[model_index];
-    classify->label_id   = label_id;
-    classify->confidence = emo_confidence[label_id];
-    classify->label_buf  = av_buffer_ref(s->label_bufs[model_index]);
+        classify->name = proc->attribute_name;
+        classify->confidence = confidence;
+
+        av_log(ctx, AV_LOG_DEBUG, "Attributes: %s\n", attributes);
+        // TODO: to add into side data
+        av_free(classify);
+        return 0;
+    } else if (method_index) {
+        int i;
+        char attributes[1024] = {};
+        LabelsArray *array;
+
+        array = (LabelsArray *)proc->labels->data;
+        for (i = 0; i < array->num; i++) {
+            int value = data[i];
+            if (value < 0 || value >= array->num)
+                break;
+            strncat(attributes, array->label[value], sizeof(attributes) - strlen(attributes) - 1);
+        }
 
-    dump_emotion(ctx, classify->label_id, classify->confidence, classify->label_buf);
+        classify->name = proc->attribute_name;
 
-    av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
+        av_log(ctx, AV_LOG_DEBUG, "Attributes: %s\n", attributes);
+        // TODO: to add into side data
+        av_free(classify);
+        return 0;
+    }
 
+    av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
     return 0;
 }
 
-static av_cold void dump_gender(AVFilterContext *ctx, int label_id,
-                                float conf, AVBufferRef *label_buf)
+static int tensor_to_text(AVFilterContext *ctx,
+                          int detect_id,
+                          OutputPostproc *proc,
+                          InferTensorMeta *meta,
+                          InferClassificationMeta *c_meta)
 {
-    LabelsArray *array = (LabelsArray *)label_buf->data;
+    InferClassification *classify;
+    const float *data = (const float *)meta->data;
+    double scale = 1.0;
 
-    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Gender:%s Conf:%1.2f\n",
-           array->label[label_id], conf);
-}
+    if (!data) return -1;
 
-static av_cold void dump_age(AVFilterContext *ctx, float age)
-{
-    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Age:%1.2f\n", age);
+    classify = av_mallocz(sizeof(*classify));
+    if (!classify)
+        return AVERROR(ENOMEM);
+
+    if (proc->tensor2text_scale != 0)
+        scale = proc->tensor2text_scale;
+
+    classify->detect_id = detect_id;
+    classify->name      = proc->attribute_name;
+    classify->value     = *data * scale;
+
+    dump_tensor_value(ctx, classify->name, classify->value);
+
+    av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
+    return 0;
 }
 
-static int age_gender_classify_result_process(AVFilterContext *ctx,
-                                              int detect_id,
-                                              int result_id,
-                                              int model_index,
-                                              InferTensorMeta *meta,
-                                              InferClassificationMeta *c_meta)
+static int commmon_postprocess(AVFilterContext *ctx,
+                               int detect_id,
+                               int result_id,
+                               int model_id,
+                               InferTensorMeta *meta,
+                               InferClassificationMeta *c_meta)
 {
+    int proc_id;
     InferenceClassifyContext *s = ctx->priv;
-    const float *data = (float *)meta->data;
+    InferenceBaseContext *base  = s->infer_bases[model_id];
 
-    InferClassification *classify = av_mallocz(sizeof(*classify));
-    if (!classify)
-        return AVERROR(ENOMEM);
+    OutputPostproc *proc;
+    DNNModelInfo *info = ff_inference_base_get_output_info(base);
 
-    classify->detect_id  = detect_id;
+    // search model postproc
+    for (proc_id = 0; proc_id < MAX_MODEL_OUTPUT; proc_id++) {
+        char *proc_layer_name = s->model_postproc[model_id].procs[proc_id].layer_name;
 
-    if (result_id == 0) {
-        // Age
-        classify->name  = string_age;
-        classify->value = *data * 100.0;
-        dump_age(ctx, classify->value);
-    } else {
-        // Gender
-        classify->name       = string_gender;
-        // 0 - Femal, 1 - Male
-        classify->label_id   = data[0] > data[1] ? 0 : 1;
-        classify->confidence = data[classify->label_id];
-        classify->label_buf  = av_buffer_ref(s->label_bufs[model_index]);
-        dump_gender(ctx, classify->label_id, classify->confidence, classify->label_buf);
+        // skip this output process
+        if (!proc_layer_name)
+            continue;
+
+        if (!strcmp(info->layer_name[result_id], proc_layer_name))
+            break;
     }
 
-    av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
+    if (proc_id == MAX_MODEL_OUTPUT) {
+        av_log(ctx, AV_LOG_DEBUG, "Could not find proc:%s\n", info->layer_name[result_id]);
+        return 0;
+    }
+
+    proc = &s->model_postproc[model_id].procs[proc_id];
+
+    if (proc->converter == NULL)
+        return 0;
+
+    if (!strcmp(proc->converter, "attributes"))
+        return attributes_to_text(ctx, detect_id, proc, meta, c_meta);
+
+    if (!strcmp(proc->converter, "tensor2text"))
+        return tensor_to_text(ctx, detect_id, proc, meta, c_meta);
 
     return 0;
 }
@@ -227,13 +324,7 @@ static int face_identify_init(AVFilterContext *ctx, size_t index)
 
     av_assert0(index < MAX_MODEL_NUM);
 
-    if (fseek(fp, 0, SEEK_END)) {
-        av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the end of feature file.\n");
-        fclose(fp);
-        return AVERROR(EINVAL);
-    }
-
-    feature_size = ftell(fp);
+    feature_size = ff_get_file_size(fp);
 
     if (feature_size == -1) {
         fclose(fp);
@@ -427,11 +518,12 @@ static av_cold int classify_init(AVFilterContext *ctx)
 {
     InferenceClassifyContext *s = ctx->priv;
     int i, ret;
-    int model_num = 0, label_num = 0, name_num = 0;
+    int model_num = 0, model_proc_num = 0, label_num = 0, name_num = 0;
     const int max_num = MAX_MODEL_NUM;
     char  *names[MAX_MODEL_NUM] = { };
     char *models[MAX_MODEL_NUM] = { };
     char *labels[MAX_MODEL_NUM] = { };
+    char *models_proc[MAX_MODEL_NUM] = { };
     InferenceParam p = {};
 
     av_assert0(s->model_file);
@@ -448,6 +540,13 @@ static av_cold int classify_init(AVFilterContext *ctx)
     for (i = 0; i < name_num; i++)
         av_log(ctx, AV_LOG_INFO, "name[%d]:%s\n", i, names[i]);
 
+    av_split(s->model_proc, "&", models_proc, &model_proc_num, max_num);
+    for (i = 0; i < model_proc_num; i++)
+        av_log(ctx, AV_LOG_INFO, "proc[%d]:%s\n", i, models_proc[i]);
+
+    // TODO: enable this assertion once face re-identification uses a proc file
+    // av_assert0(model_proc_num == model_num);
+
     av_assert0(s->backend_type == DNN_INTEL_IE);
 
     p.backend_type    = s->backend_type;
@@ -458,9 +557,10 @@ static av_cold int classify_init(AVFilterContext *ctx)
     p.input_is_image  = 1;
 
     for (i = 0; i < model_num; i++) {
+        void *proc;
         InferenceBaseContext *base = NULL;
 
-        p.model_file  = models[i];
+        p.model_file = models[i];
         ret = ff_inference_base_create(ctx, &base, &p);
         if (ret < 0) {
             av_log(ctx, AV_LOG_ERROR, "Could not create inference\n");
@@ -468,6 +568,33 @@ static av_cold int classify_init(AVFilterContext *ctx)
         }
 
         s->infer_bases[i] = base;
+
+        ff_load_default_model_proc(&s->model_preproc[i], &s->model_postproc[i]);
+
+        if (!models_proc[i])
+            continue;
+
+        proc = ff_read_model_proc(models_proc[i]);
+        if (!proc) {
+            av_log(ctx, AV_LOG_ERROR, "Could not read proc config file:"
+                    "%s\n", models_proc[i]);
+            ret = AVERROR(EIO);
+            goto fail;
+        }
+        // store before parsing so a failed parse leaves it reachable for ff_release_model_proc()
+        s->proc_config[i] = proc;
+
+        if (ff_parse_input_preproc(proc, &s->model_preproc[i]) < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Parse input preproc error.\n");
+            ret = AVERROR(EIO);
+            goto fail;
+        }
+
+        if (ff_parse_output_postproc(proc, &s->model_postproc[i]) < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Parse output postproc error.\n");
+            ret = AVERROR(EIO);
+            goto fail;
+        }
     }
     s->loaded_num = model_num;
 
@@ -506,16 +633,14 @@ static av_cold int classify_init(AVFilterContext *ctx)
         s->label_bufs[i] = ref;
     }
 
-    for (i = 0; i < name_num; i++) {
+    for (i = 0; i < model_num; i++) {
         s->name_array[i] = names[i];
-        if (strstr(names[i], "emotion")) {
-            s->post_process[i] = &emotion_classify_result_process;
-        } else if (strstr(names[i], "age") && strstr(names[i], "gend")) {
-            s->post_process[i] = &age_gender_classify_result_process;
-        } else if (strstr(names[i], "face")) {
+        if (names[i] && strstr(names[i], "face")) {
             s->init[i]         = &face_identify_init;
             s->uninit[i]       = &face_identify_uninit;
             s->post_process[i] = &face_identify_result_process;
+        } else {
+            s->post_process[i] = &commmon_postprocess;
         }
 
         if (s->init[i] && s->init[i](ctx, i) < 0)
@@ -545,6 +670,8 @@ static av_cold void classify_uninit(AVFilterContext *ctx)
         ff_inference_base_free(&s->infer_bases[i]);
 
         av_buffer_unref(&s->label_bufs[i]);
+
+        ff_release_model_proc(s->proc_config[i], &s->model_preproc[i], &s->model_postproc[i]);
     }
 }
 
@@ -594,11 +721,14 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
         for (j = 0; j < s->loaded_num; j++) {
             int output;
             InferenceBaseContext *base = s->infer_bases[j];
+            ModelInputPreproc *preproc = &s->model_preproc[j];
 
             VideoPP *vpp        = ff_inference_base_get_vpp(base);
             AVFrame *tmp        = vpp->frames[0];
             DNNModelInfo *iinfo = ff_inference_base_get_input_info(base);
             DNNModelInfo *oinfo = ff_inference_base_get_output_info(base);
+            int scale_width     = iinfo->dims[0][0];
+            int scale_height    = iinfo->dims[0][1];
 
             Rect crop_rect = (Rect) {
                 .x0 = bbox->x_min * in->width,
@@ -607,14 +737,21 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
                 .y1 = bbox->y_max * in->height,
             };
 
+            // only classify objects whose label matches this model's object_class
+            if (preproc && preproc->object_class && bbox->label_buf) {
+                LabelsArray *array = (LabelsArray *)bbox->label_buf->data;
+                if (0 != strcmp(preproc->object_class, array->label[bbox->label_id]))
+                    continue;
+            }
+
             if (vpp->device == VPP_DEVICE_SW) {
                 ret = vpp->sw_vpp->crop_and_scale(in, &crop_rect,
-                        iinfo->width[0], iinfo->height[0],
+                        scale_width, scale_height,
                         vpp->expect_format, tmp->data, tmp->linesize);
             } else {
 #if CONFIG_VAAPI
                 ret = vpp->va_vpp->crop_and_scale(vpp->va_vpp, in, &crop_rect,
-                        iinfo->width[0], iinfo->height[0], tmp->data, tmp->linesize);
+                        scale_width, scale_height, tmp->data, tmp->linesize);
 #endif
             }
             if (ret != 0) {
@@ -626,7 +763,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
             ff_inference_base_submit_frame(base, tmp, 0, 0);
             ff_inference_base_infer(base);
 
-            for (output = 0; output < oinfo->numbers; output++) {
+            for (output = 0; output < oinfo->number; output++) {
                 InferTensorMeta tensor_meta = { };
                 ff_inference_base_get_infer_result(base, output, &tensor_meta);
 
@@ -672,8 +809,13 @@ static av_cold int config_input(AVFilterLink *inlink)
         DNNModelInfo         *info = ff_inference_base_get_input_info(base);
         VideoPP               *vpp = ff_inference_base_get_vpp(base);
 
+        int input_width  = info->dims[0][0];
+        int input_height = info->dims[0][1];
+
         // right now, no model needs multiple inputs
-        av_assert0(info->numbers == 1);
+        // av_assert0(info->number == 1);
+
+        ff_inference_dump_model_info(ctx, info);
 
         vpp->device = (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) ?
             VPP_DEVICE_HW : VPP_DEVICE_SW;
@@ -682,8 +824,8 @@ static av_cold int config_input(AVFilterLink *inlink)
         frame = av_frame_alloc();
         if (!frame)
             return AVERROR(ENOMEM);
-        frame->width   = info->width[0];
-        frame->height  = info->height[0];
+        frame->width   = input_width;
+        frame->height  = input_height;
         frame->format  = expect_format;
         vpp->frames[0] = frame;
 
@@ -706,7 +848,7 @@ static av_cold int config_input(AVFilterLink *inlink)
             }
 
             ret = va_vpp_surface_alloc(vpp->va_vpp,
-                    info->width[0], info->height[0], s->vpp_format);
+                    input_width, input_height, s->vpp_format);
             if (ret < 0) {
                 av_log(ctx, AV_LOG_ERROR, "Create va surface failed\n");
                 ret = AVERROR(EINVAL);
@@ -741,12 +883,39 @@ fail:
 
 static av_cold int config_output(AVFilterLink *outlink)
 {
+    int i;
+    AVFilterContext        *ctx = outlink->src;
+    InferenceClassifyContext *s = ctx->priv;
+
+    for (i = 0; i < s->loaded_num; i++) {
+        InferenceBaseContext *base = s->infer_bases[i];
+        DNNModelInfo *info = ff_inference_base_get_output_info(base);
+        ff_inference_dump_model_info(ctx, info);
+
+#if CONFIG_VAAPI
+        if (!outlink->hw_frames_ctx) {
+            VideoPP *vpp = ff_inference_base_get_vpp(base);
+            if (vpp->device == VPP_DEVICE_HW) {
+                if (!vpp->va_vpp || !vpp->va_vpp->hw_frames_ref) {
+                    av_log(ctx, AV_LOG_ERROR, "The input must have a hardware frame "
+                            "reference.\n");
+                    return AVERROR(EINVAL);
+                }
+                outlink->hw_frames_ctx = av_buffer_ref(vpp->va_vpp->hw_frames_ref);
+                if (!outlink->hw_frames_ctx)
+                    return AVERROR(ENOMEM);
+            }
+        }
+#endif
+    }
+
     return 0;
 }
 
 static const AVOption inference_classify_options[] = {
     { "dnn_backend",    "DNN backend for model execution", OFFSET(backend_type),    AV_OPT_TYPE_FLAGS,  { .i64 = DNN_INTEL_IE },          0, 2,    FLAGS, "engine" },
     { "model",          "path to model files for network", OFFSET(model_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
+    { "model_proc",     "model preproc and postproc",      OFFSET(model_proc),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
     { "label",          "labels for classify",             OFFSET(labels),          AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
     { "name",           "classify type names",             OFFSET(names),           AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
     { "vpp_format",     "specify vpp output format",       OFFSET(vpp_format),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
diff --git a/libavfilter/vf_inference_detect.c b/libavfilter/vf_inference_detect.c
index 42a0e02..cfc0524 100644
--- a/libavfilter/vf_inference_detect.c
+++ b/libavfilter/vf_inference_detect.c
@@ -50,8 +50,8 @@ typedef struct InferenceDetectContext {
     InferenceBaseContext *base;
 
     char  *model_file;
-    char  *label_file;
     char  *vpp_format;
+    char  *model_proc;
     int    backend_type;
     int    device_type;
 
@@ -64,22 +64,12 @@ typedef struct InferenceDetectContext {
     int    input_precision;
     int    input_is_image;
 
-    char  *name;
+    void  *proc_config;
 
-    AVBufferRef *label_buf;
+    ModelInputPreproc   model_preproc;
+    ModelOutputPostproc model_postproc;
 } InferenceDetectContext;
 
-static void infer_labels_buffer_free(void *opaque, uint8_t *data)
-{
-    int i;
-    LabelsArray *labels = (LabelsArray *)data;
-
-    for (i = 0; i < labels->num; i++)
-        av_freep(&labels->label[i]);
-
-    av_free(data);
-}
-
 static void infer_detect_metadata_buffer_free(void *opaque, uint8_t *data)
 {
     BBoxesArray *bboxes = ((InferDetectionMeta *)data)->bboxes;
@@ -102,8 +92,8 @@ static int detect_postprocess(AVFilterContext *ctx, InferTensorMeta *meta, AVFra
 {
     int i;
     InferenceDetectContext *s = ctx->priv;
-    int object_size           = meta->dims[0];
-    int max_proposal_count    = meta->dims[1];
+    int object_size           = meta->dims[3];
+    int max_proposal_count    = meta->dims[2];
     const float *detection    = (float *)meta->data;
     AVBufferRef *ref;
     AVFrameSideData *sd;
@@ -136,11 +126,12 @@ static int detect_postprocess(AVFilterContext *ctx, InferTensorMeta *meta, AVFra
 
         if (new_bbox->confidence < s->threshold) {
             av_freep(&new_bbox);
-            break;
+            continue;
         }
 
-        if (s->label_buf)
-            new_bbox->label_buf = av_buffer_ref(s->label_buf);
+        // TODO: use layer name to get proc
+        if (s->model_postproc.procs[0].labels)
+            new_bbox->label_buf = av_buffer_ref(s->model_postproc.procs[0].labels);
 
         av_dynarray_add(&boxes->bbox, &boxes->num, new_bbox);
     }
@@ -217,7 +208,7 @@ static int query_formats(AVFilterContext *context)
 
 static int config_input(AVFilterLink *inlink)
 {
-    int i, ret;
+    int ret;
     AVFrame *frame;
     AVFilterContext      *ctx        = inlink->dst;
     InferenceDetectContext *s        = ctx->priv;
@@ -227,14 +218,12 @@ static int config_input(AVFilterLink *inlink)
     DNNModelInfo *info               = ff_inference_base_get_input_info(s->base);
     VideoPP *vpp                     = ff_inference_base_get_vpp(s->base);
 
-    for (i = 0; i < info->numbers; i++) {
-        av_log(ctx, AV_LOG_DEBUG, "Input info [%d] %d - %d %d %d - %d %d %d\n",
-               i, info->batch_size, info->width[i], info->height[i], info->channels[i],
-               info->is_image[i], info->precision[i], info->layout[i]);
-    }
+    int width = info->dims[0][0], height = info->dims[0][1];
+
+    ff_inference_dump_model_info(ctx, info);
 
     // right now, no model needs multiple inputs
-    av_assert0(info->numbers == 1);
+    av_assert0(info->number == 1);
 
     vpp->device = (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) ? VPP_DEVICE_HW : VPP_DEVICE_SW;
 
@@ -242,15 +231,15 @@ static int config_input(AVFilterLink *inlink)
     frame = av_frame_alloc();
     if (!frame)
         return AVERROR(ENOMEM);
-    frame->width   = info->width[0];
-    frame->height  = info->height[0];
+    frame->width   = width;
+    frame->height  = height;
     frame->format  = expect_format;
     vpp->frames[0] = frame;
 
     if (vpp->device == VPP_DEVICE_SW) {
-        int need_scale = expect_format   != inlink->format ||
-                         info->width[0]  != inlink->w      ||
-                         info->height[0] != inlink->h;
+        int need_scale = expect_format != inlink->format ||
+                         width         != inlink->w      ||
+                         height        != inlink->h;
 
         if (need_scale) {
             if (av_frame_get_buffer(frame, 0) < 0) {
@@ -260,7 +249,7 @@ static int config_input(AVFilterLink *inlink)
 
             vpp->sw_vpp->scale_contexts[0] = sws_getContext(
                     inlink->w, inlink->h, inlink->format,
-                    info->width[0], info->height[0], expect_format,
+                    width, height, expect_format,
                     SWS_BILINEAR, NULL, NULL, NULL);
 
             if (!vpp->sw_vpp->scale_contexts[0]) {
@@ -284,8 +273,7 @@ static int config_input(AVFilterLink *inlink)
             goto fail;
         }
 
-        ret = va_vpp_surface_alloc(vpp->va_vpp,
-                info->width[0], info->height[0], s->vpp_format);
+        ret = va_vpp_surface_alloc(vpp->va_vpp, width, height, s->vpp_format);
         if (ret < 0) {
             av_log(ctx, AV_LOG_ERROR, "Create va surface failed\n");
             ret = AVERROR(EINVAL);
@@ -316,12 +304,7 @@ static int config_output(AVFilterLink *outlink)
 
     DNNModelInfo *info = ff_inference_base_get_output_info(s->base);
 
-    for (int i = 0; i < info->numbers; i++) {
-        av_log(ctx, AV_LOG_DEBUG, "Output info [%d] %d - %d %d %d - %d %d %d\n",
-            i, info->batch_size,
-            info->width[i], info->height[i], info->channels[i],
-            info->is_image[i], info->precision[i], info->layout[i]);
-    }
+    ff_inference_dump_model_info(ctx, info);
 
 #if CONFIG_VAAPI
     if (vpp->device == VPP_DEVICE_HW) {
@@ -345,40 +328,31 @@ static av_cold int detect_init(AVFilterContext *ctx)
     InferenceDetectContext *s = ctx->priv;
     InferenceParam p = {};
 
-    av_assert0(s->model_file && s->name);
+    av_assert0(s->model_file);
 
     av_assert0(s->backend_type == DNN_INTEL_IE);
 
-    if (s->label_file) {
-        int n, labels_num;
-        AVBufferRef *ref    = NULL;
-        LabelsArray *larray = NULL;
-        char buffer[4096]   = { };
-        char *_labels[100]  = { };
+    ff_load_default_model_proc(&s->model_preproc, &s->model_postproc);
 
-        FILE *fp = fopen(s->label_file, "rb");
-        if (!fp) {
-            av_log(ctx, AV_LOG_ERROR, "Could not open file:%s\n", s->label_file);
+    if (s->model_proc) {
+        void *proc = ff_read_model_proc(s->model_proc);
+        if (!proc) {
+            av_log(ctx, AV_LOG_ERROR, "Could not read proc config file:"
+                    "%s\n", s->model_proc);
             return AVERROR(EIO);
         }
+        // owned by the context from here on so detect_uninit() can release it on failure
+        s->proc_config = proc;
 
-        n = fread(buffer, sizeof(buffer), 1, fp);
-        fclose(fp);
-
-        av_split(buffer, ",", _labels, &labels_num, 100);
-
-        larray = av_mallocz(sizeof(*larray));
-        if (!larray)
-            return AVERROR(ENOMEM);
+        if (ff_parse_input_preproc(proc, &s->model_preproc) < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Parse input preproc error.\n");
+            return AVERROR(EIO);
+        }
 
-        for (n = 0; n < labels_num; n++) {
-            char *l = av_strdup(_labels[n]);
-            av_dynarray_add(&larray->label, &larray->num, l);
+        if (ff_parse_output_postproc(proc, &s->model_postproc) < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Parse output postproc error.\n");
+            return AVERROR(EIO);
         }
-
-        ref = av_buffer_create((uint8_t *)larray, sizeof(*larray),
-                               &infer_labels_buffer_free, NULL, 0);
-        s->label_buf = ref;
     }
 
     p.model_file      = s->model_file;
@@ -405,7 +379,7 @@ static av_cold void detect_uninit(AVFilterContext *ctx)
 
     ff_inference_base_free(&s->base);
 
-    if (s->label_buf) av_buffer_unref(&s->label_buf);
+    ff_release_model_proc(s->proc_config, &s->model_preproc, &s->model_postproc);
 }
 
 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
@@ -440,14 +414,13 @@ fail:
 static const AVOption inference_detect_options[] = {
     { "dnn_backend", "DNN backend for model execution", OFFSET(backend_type),    AV_OPT_TYPE_FLAGS,  { .i64 = DNN_INTEL_IE },          0, 2,  FLAGS, "engine" },
     { "model",       "path to model file for network",  OFFSET(model_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,  FLAGS },
+    { "model_proc",  "model preproc and postproc",      OFFSET(model_proc),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,  FLAGS },
     { "device",      "running on device type",          OFFSET(device_type),     AV_OPT_TYPE_FLAGS,  { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12, FLAGS },
-    { "label",       "label file path for detection",   OFFSET(label_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,  FLAGS },
     { "vpp_format",  "specify vpp output format",       OFFSET(vpp_format),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,  FLAGS },
     { "interval",    "detect every Nth frame",          OFFSET(every_nth_frame), AV_OPT_TYPE_INT,    { .i64 = 1 },  1, 1024, FLAGS},
     { "batch_size",  "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },  1, 1024, FLAGS},
     { "threshold",   "threshod to filter output data",  OFFSET(threshold),       AV_OPT_TYPE_FLOAT,  { .dbl = 0.5}, 0, 1,    FLAGS},
 
-    { "name",        "detection type name",             OFFSET(name),            AV_OPT_TYPE_STRING, .flags = FLAGS, "detection" },
     { NULL }
 };
 
-- 
2.7.4