From 819d5169680f30b7176c450b2298b70400b41691 Mon Sep 17 00:00:00 2001
From: Lin Xie
Date: Sun, 3 Feb 2019 11:04:19 +0800
Subject: [PATCH] Support object detection and feature-based face identification

Signed-off-by: Lin Xie
---
 libavfilter/inference.c             |  59 ++++++-
 libavfilter/inference.h             |  33 ++--
 libavfilter/vf_inference_classify.c | 321 ++++++++++++++++++++++++++++++------
 libavfilter/vf_inference_detect.c   | 148 +++++++----------
 libavformat/iemetadataenc.c         |  12 +-
 5 files changed, 416 insertions(+), 157 deletions(-)

diff --git a/libavfilter/inference.c b/libavfilter/inference.c
index 14b4093..e569620 100644
--- a/libavfilter/inference.c
+++ b/libavfilter/inference.c
@@ -101,15 +101,18 @@ static int fill_dnn_data_from_frame(DNNIOData *data,
 }
 
 static int sw_crop_and_scale(AVFrame *frame,
-                             float x0, float y0, float x1, float y1,
-                             int out_w, int out_h, uint8_t *data[], int stride[])
+                             float x0, float y0,
+                             float x1, float y1,
+                             int out_w, int out_h,
+                             enum AVPixelFormat out_format,
+                             uint8_t *data[], int stride[])
 {
     int err, bufsize;
     struct SwsContext *sws_ctx;
     const AVPixFmtDescriptor *desc;
     int x, y, w, h, hsub, vsub;
     int max_step[4]; ///< max pixel step for each plane, expressed as a number of bytes
-    enum AVPixelFormat expect_format = AV_PIX_FMT_BGR24;
+    enum AVPixelFormat expect_format = out_format;
 
     AVFrame *temp = av_frame_alloc();
     if (!temp) {
@@ -175,13 +178,56 @@ static int sw_crop_and_scale(AVFrame *frame,
     return 0;
 }
 
+void av_split(char *str, const char *delim, char **array, int *num, int max)
+{
+    char *p;
+    int i = 0;
+
+    if (!str || !delim || !array || !num)
+        return;
+
+    p = strtok(str, delim);
+    while (p != NULL) {
+        array[i++] = p;
+
+        av_assert0(i < max);
+
+        p = strtok(NULL, delim);
+    }
+    *num = i;
+}
+
+double av_norm(float vec[], size_t num)
+{
+    size_t i;
+    double result = 0.0;
+
+    for (i = 0; i < num; i++)
+        result += vec[i] * vec[i];
+
+    return sqrt(result);
+}
+
+double av_dot(float vec1[], float vec2[], size_t num)
+{
+    size_t i;
+    double result = 0.0;
+
+    for (i = 0; i < num; i++)
+        result += vec1[i] * vec2[i];
+
+    return result;
+}
+
 int ff_inference_base_create(AVFilterContext *ctx,
                              InferenceBaseContext **base,
-                             InferenceParam *param) {
+                             InferenceParam *param)
+{
     int i, ret;
     InferenceBaseContext *s;
-    DNNModelInfo *info;
     VideoPP *vpp;
+    DNNModelInfo *info;
+    DNNModelIntelIEConfig config;
 
     if (!param)
         return AVERROR(EINVAL);
@@ -202,7 +248,7 @@ int ff_inference_base_create(AVFilterContext *ctx,
     // parameter sanity check
     if (param->batch_size <= 0)
         param->batch_size = 1;
-    DNNModelIntelIEConfig config = {
+    config = (DNNModelIntelIEConfig) {
         .model  = param->model_file,
         .labels = param->labels_file,
         .device = param->device_type,
@@ -251,6 +297,7 @@ int ff_inference_base_create(AVFilterContext *ctx,
     // vpp init
     vpp->swscale        = &sws_scale;
     vpp->crop_and_scale = &sw_crop_and_scale;
+    vpp->expect_format  = AV_PIX_FMT_BGR24;
 
     *base = s;
 #undef DNN_ERR_CHECK
diff --git a/libavfilter/inference.h b/libavfilter/inference.h
index eebfd10..843e05c 100644
--- a/libavfilter/inference.h
+++ b/libavfilter/inference.h
@@ -20,6 +20,7 @@
 #define AVFILTER_INFERENCE_H
 
 #include "libavutil/common.h"
+#include "libswscale/swscale.h"
 #include "dnn_interface.h"
 
 typedef struct InferenceBaseContext InferenceBaseContext;
@@ -52,10 +53,11 @@ typedef enum { VPP_DEVICE_HW, VPP_DEVICE_SW } VPPDevice;
 
 typedef struct VideoPP {
     int device;
+    int expect_format;
     void *scale_contexts[MAX_VPP_NUM];
     AVFrame *frames[MAX_VPP_NUM]; //...
diff --git a/libavfilter/vf_inference_classify.c b/libavfilter/vf_inference_classify.c
@@ ... @@ static av_cold void dump_emotion(AVFilterContext *ctx, int label_id,
+    LabelsArray *array = (LabelsArray *)label_buf->data;
 
-    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - label:%d emotion:%s \n",
-           label_id, emotions[label_id]);
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Label id:%d Emotion:%s Conf:%f\n",
+           label_id, array->label[label_id], conf);
 }
 
 static int emotion_classify_result_process(AVFilterContext *ctx,
@@ -150,24 +152,25 @@ static int emotion_classify_result_process(AVFilterContext *ctx,
     classify->confidence = emo_confidence[label_id];
     classify->label_buf  = av_buffer_ref(s->label_bufs[model_index]);
 
-    dump_emotion(ctx, classify->label_id);
+    dump_emotion(ctx, classify->label_id, classify->confidence, classify->label_buf);
 
     av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
 
     return 0;
 }
 
-static av_cold void dump_gender(AVFilterContext *ctx, int label_id, float conf)
+static av_cold void dump_gender(AVFilterContext *ctx, int label_id,
+                                float conf, AVBufferRef *label_buf)
 {
-    const char *genders[] = { "female", "male" };
+    LabelsArray *array = (LabelsArray *)label_buf->data;
 
-    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Gender:%s Confidence:%f\n",
-           genders[label_id], conf);
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Gender:%s Conf:%1.2f\n",
+           array->label[label_id], conf);
 }
 
 static av_cold void dump_age(AVFilterContext *ctx, float age)
 {
-    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Age:%f \n", age);
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Age:%1.2f\n", age);
 }
 
 static int age_gender_classify_result_process(AVFilterContext *ctx,
@@ -198,7 +201,7 @@ static int age_gender_classify_result_process(AVFilterContext *ctx,
         classify->label_id   = data[0] > data[1] ? 0 : 1;
         classify->confidence = data[classify->label_id];
         classify->label_buf  = av_buffer_ref(s->label_bufs[model_index]);
-        dump_gender(ctx, classify->label_id, classify->confidence);
+        dump_gender(ctx, classify->label_id, classify->confidence, classify->label_buf);
     }
 
     av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
 
@@ -206,6 +209,199 @@ static int age_gender_classify_result_process(AVFilterContext *ctx,
     return 0;
 }
 
+static int face_identify_init(AVFilterContext *ctx, size_t index)
+{
+    FaceIdentifyContext *identify_ctx;
+    InferenceClassifyContext *s = ctx->priv;
+
+    int i, ret, feature_size, expected_size;
+    size_t vec_size_in_bytes = sizeof(float) * FACE_FEATURE_VECTOR_LEN;
+
+    FILE *fp = fopen(s->feature_file, "rb");
+    if (!fp) {
+        av_log(ctx, AV_LOG_ERROR, "Could not open feature file:%s\n", s->feature_file);
+        return AVERROR(EIO);
+    }
+
+    av_assert0(index < MAX_MODEL_NUM);
+
+    if (fseek(fp, 0, SEEK_END)) {
+        av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the end of feature file.\n");
+        fclose(fp);
+        return AVERROR(EINVAL);
+    }
+
+    feature_size = ftell(fp);
+
+    if (feature_size == -1) {
+        fclose(fp);
+        av_log(ctx, AV_LOG_ERROR, "Couldn't get size of feature file.\n");
+        return AVERROR(EINVAL);
+    } else if (feature_size % FACE_FEATURE_VECTOR_LEN) {
+        fclose(fp);
+        av_log(ctx, AV_LOG_ERROR, "Feature data must align to %d.\n", FACE_FEATURE_VECTOR_LEN);
+        return AVERROR(EINVAL);
+    }
+
+    if (s->feature_num > 0) {
+        expected_size = s->feature_num * vec_size_in_bytes;
+        if (expected_size != feature_size) {
+            fclose(fp);
+            av_log(ctx, AV_LOG_ERROR, "Unexpected feature file size.\n");
+            return AVERROR(EINVAL);
+        }
+    } else {
+        s->feature_num = feature_size / vec_size_in_bytes;
+    }
+
+    identify_ctx = av_mallocz(sizeof(*identify_ctx));
+    if (!identify_ctx) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    identify_ctx->vector_num = s->feature_num;
+
+    identify_ctx->feature_vecs = av_mallocz(sizeof(float *) * identify_ctx->vector_num);
+    if (!identify_ctx->feature_vecs) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    rewind(fp);
+
+    for (i = 0; i < identify_ctx->vector_num; i++) {
+        identify_ctx->feature_vecs[i] = av_malloc(vec_size_in_bytes);
+        if (!identify_ctx->feature_vecs[i]) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        if (fread(identify_ctx->feature_vecs[i], vec_size_in_bytes, 1, fp) != 1) {
+            ret = AVERROR(EINVAL);
+            goto fail;
+        }
+    }
+
+    identify_ctx->norm_std = av_mallocz(sizeof(double) * identify_ctx->vector_num);
+    if (!identify_ctx->norm_std) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    for (i = 0; i < identify_ctx->vector_num; i++)
+        identify_ctx->norm_std[i] = av_norm(identify_ctx->feature_vecs[i],
+                                            FACE_FEATURE_VECTOR_LEN);
+
+    s->priv[index] = identify_ctx;
+    fclose(fp);
+    return 0;
+fail:
+    fclose(fp);
+
+    if (identify_ctx) {
+        if (identify_ctx->feature_vecs) {
+            for (i = 0; i < identify_ctx->vector_num; i++) {
+                if (identify_ctx->feature_vecs[i])
+                    av_free(identify_ctx->feature_vecs[i]);
+            }
+            av_free(identify_ctx->feature_vecs);
+        }
+        av_free(identify_ctx);
+    }
+    return ret;
+}
+
+static int face_identify_uninit(AVFilterContext *ctx, size_t index)
+{
+    int i;
+    InferenceClassifyContext *s = ctx->priv;
+    FaceIdentifyContext *identify_ctx = s->priv[index];
+
+    if (!identify_ctx) {
+        av_log(ctx, AV_LOG_WARNING, "Empty face identify ctx.\n");
+        return 0;
+    }
+
+    if (identify_ctx->feature_vecs) {
+        for (i = 0; i < identify_ctx->vector_num; i++)
+            av_free(identify_ctx->feature_vecs[i]);
+        av_free(identify_ctx->feature_vecs);
+    }
+
+    if (identify_ctx->norm_std)
+        av_free(identify_ctx->norm_std);
+
+    av_free(identify_ctx);
+    s->priv[index] = NULL;
+
+    return 0;
+}
+
+static av_cold void dump_face_id(AVFilterContext *ctx, int label_id,
+                                 float conf, AVBufferRef *label_buf)
+{
+    LabelsArray *array = (LabelsArray *)label_buf->data;
+
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Face_id:%d Name:%s Conf:%1.2f\n",
+           label_id, array->label[label_id], conf);
+}
+
+static int face_identify_result_process(AVFilterContext *ctx,
+                                        int detect_id,
+                                        int result_id,
+                                        int model_index,
+                                        InferTensorMeta *meta,
+                                        InferClassificationMeta *c_meta)
+{
+    int i, label_id = 0;
+    InferClassification *classify;
+    double dot_product, norm_feature, confidence, *angles;
+    InferenceClassifyContext *s = ctx->priv;
+    FaceIdentifyContext *f = s->priv[model_index];
+    double min_angle = 180.0f;
+    float *feature_vector = meta->data;
+
+    angles = av_malloc(sizeof(double) * f->vector_num);
+    if (!angles)
+        return AVERROR(ENOMEM);
+
+    norm_feature = av_norm(feature_vector, FACE_FEATURE_VECTOR_LEN);
+
+    for (i = 0; i < f->vector_num; i++) {
+        dot_product = av_dot(feature_vector,
+                             f->feature_vecs[i],
+                             FACE_FEATURE_VECTOR_LEN);
+
+        angles[i] = acos((dot_product - 0.0001f) /
+                         (f->norm_std[i] * norm_feature)) /
+                    PI * 180.0;
+        if (angles[i] < THRESHOLD_RECOGNITION && angles[i] < min_angle) {
+            label_id  = i;
+            min_angle = angles[i];
+        }
+    }
+
+    confidence = (90.0f - min_angle) / 90.0f;
+
+    av_free(angles);
+
+    classify = av_mallocz(sizeof(*classify));
+    if (!classify)
+        return AVERROR(ENOMEM);
+
+    classify->detect_id  = detect_id;
+    classify->name       = s->name_array[model_index];
+    classify->label_id   = label_id;
+    classify->confidence = (float)confidence;
+    classify->label_buf  = av_buffer_ref(s->label_bufs[model_index]);
+
+    dump_face_id(ctx, label_id, confidence, s->label_bufs[model_index]);
+
+    av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
+
+    return 0;
+}
+
 static int query_formats(AVFilterContext *context)
 {
     AVFilterFormats *formats_list;
@@ -217,7 +413,7 @@ static int query_formats(AVFilterContext *context)
 
     formats_list = ff_make_format_list(pixel_formats);
     if (!formats_list) {
-        av_log(context, AV_LOG_ERROR, "could not create formats list\n");
+        av_log(context, AV_LOG_ERROR, "Could not create formats list\n");
         return AVERROR(ENOMEM);
     }
 
@@ -237,15 +433,15 @@ static av_cold int classify_init(AVFilterContext *ctx)
 
     av_assert0(s->model_file);
 
-    split(s->model_file, "&", models, &model_num, max_num);
+    av_split(s->model_file, "&", models, &model_num, max_num);
     for (i = 0; i < model_num; i++)
         av_log(ctx, AV_LOG_INFO, "model[%d]:%s\n", i, models[i]);
 
-    split(s->labels, "&", labels, &label_num, max_num);
+    av_split(s->labels, "&", labels, &label_num, max_num);
     for (i = 0; i < label_num; i++)
        av_log(ctx, AV_LOG_INFO, "label[%d]:%s\n", i, labels[i]);
 
-    split(s->names, "&", names, &name_num, max_num);
+    av_split(s->names, "&", names, &name_num, max_num);
     for (i = 0; i < name_num; i++)
         av_log(ctx, AV_LOG_INFO, "name[%d]:%s\n", i, names[i]);
 
@@ -265,7 +461,7 @@ static av_cold int classify_init(AVFilterContext *ctx)
         p.model_file = models[i];
         ret = ff_inference_base_create(ctx, &base, &p);
         if (ret < 0) {
-            av_log(ctx, AV_LOG_ERROR, "could not create inference\n");
+            av_log(ctx, AV_LOG_ERROR, "Could not create inference\n");
             return ret;
         }
 
@@ -280,18 +476,18 @@ static av_cold int classify_init(AVFilterContext *ctx)
         char buffer[4096] = { };
         char *_labels[100] = { };
 
-        FILE *fp = fopen(labels[i], "r+b");
+        FILE *fp = fopen(labels[i], "rb");
         if (!fp) {
-            av_log(ctx, AV_LOG_ERROR, "could not open file:%s\n", labels[i]);
+            av_log(ctx, AV_LOG_ERROR, "Could not open file:%s\n", labels[i]);
             ret = AVERROR(EIO);
-            fclose(fp);
             goto fail;
         }
-        fread(buffer, sizeof(buffer), 1, fp);
+
+        n = fread(buffer, 1, sizeof(buffer) - 1, fp);
         fclose(fp);
+        buffer[n] = '\0';
 
         buffer[strcspn(buffer, "\n")] = 0;
-        split(buffer, ",", _labels, &labels_num, 100);
+        av_split(buffer, ",", _labels, &labels_num, 100);
 
         larray = av_mallocz(sizeof(*larray));
         if (!larray) {
@@ -311,10 +507,23 @@ static av_cold int classify_init(AVFilterContext *ctx)
 
     for (i = 0; i < name_num; i++) {
         s->name_array[i] = names[i];
-        if (strstr(names[i], "emotion"))
+        if (strstr(names[i], "emotion")) {
             s->post_process[i] = &emotion_classify_result_process;
-        else if (strstr(names[i], "age") && strstr(names[i], "gend"))
+        } else if (strstr(names[i], "age") && strstr(names[i], "gend")) {
             s->post_process[i] = &age_gender_classify_result_process;
+        } else if (strstr(names[i], "face")) {
+            VideoPP *vpp = ff_inference_base_get_vpp(s->infer_bases[i]);
+
+            // face reidentification model requires RGB format
+            vpp->expect_format = AV_PIX_FMT_RGB24;
+
+            s->init[i]         = &face_identify_init;
+            s->uninit[i]       = &face_identify_uninit;
+            s->post_process[i] = &face_identify_result_process;
+        }
+
+        if (s->init[i] && s->init[i](ctx, i) < 0)
+            goto fail;
     }
 
     return 0;
@@ -335,7 +544,10 @@ static av_cold void classify_uninit(AVFilterContext *ctx)
     InferenceClassifyContext *s = ctx->priv;
 
     for (i = 0; i < s->loaded_num; i++) {
+        if (s->uninit[i])
+            s->uninit[i](ctx, i);
+
         ff_inference_base_free(&s->infer_bases[i]);
         av_buffer_unref(&s->label_bufs[i]);
     }
 }
@@ -396,6 +608,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
                                      bbox->y_max * in->height,
                                      iinfo->width[0],
                                      iinfo->height[0],
+                                     vpp->expect_format,
                                      tmp->data,
                                      tmp->linesize);
 
@@ -422,7 +635,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     new_sd = av_frame_new_side_data_from_buf(in, AV_FRAME_DATA_INFERENCE_CLASSIFICATION, ref);
     if (!new_sd) {
         av_buffer_unref(&ref);
-        av_log(NULL, AV_LOG_ERROR, "could not add new side data\n");
+        av_log(NULL, AV_LOG_ERROR, "Could not add new side data\n");
         return AVERROR(ENOMEM);
     }
 
@@ -478,13 +691,15 @@
 }
 
 static const AVOption inference_classify_options[] = {
-    { "dnn_backend", "DNN backend for model execution", OFFSET(backend_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_INTEL_IE }, 0, 2, FLAGS, "engine" },
-    { "models", "path to model files for network", OFFSET(model_file), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
-    { "labels", "labels for classify", OFFSET(labels), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
-    { "names", "classify type names", OFFSET(names), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
-    { "device", "running on device type", OFFSET(device_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12, FLAGS },
-    { "interval", "do infer every Nth frame", OFFSET(every_nth_frame), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 15, FLAGS},
-    { "batch_size", "batch size per infer", OFFSET(batch_size), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS},
+    { "dnn_backend",  "DNN backend for model execution", OFFSET(backend_type),    AV_OPT_TYPE_FLAGS,  { .i64 = DNN_INTEL_IE },          0, 2,    FLAGS, "engine" },
+    { "model",        "path to model files for network", OFFSET(model_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
+    { "label",        "labels for classify",             OFFSET(labels),          AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
+    { "name",         "classify type names",             OFFSET(names),           AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
+    { "device",       "running on device type",          OFFSET(device_type),     AV_OPT_TYPE_FLAGS,  { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12,   FLAGS },
+    { "interval",     "do infer every Nth frame",        OFFSET(every_nth_frame), AV_OPT_TYPE_INT,    { .i64 = 1 },                     0, 15,   FLAGS },
+    { "batch_size",   "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },                     1, 1024, FLAGS },
+    { "feature_file", "registered face feature data",    OFFSET(feature_file),    AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS, "face_identify" },
+    { "feature_num",  "registered face number",          OFFSET(feature_num),     AV_OPT_TYPE_INT,    { .i64 = 0},                      0, 1024, FLAGS, "face_identify" },
     { NULL }
 };
diff --git a/libavfilter/vf_inference_detect.c b/libavfilter/vf_inference_detect.c
index 8d676cf..73d77ab 100644
--- a/libavfilter/vf_inference_detect.c
+++ b/libavfilter/vf_inference_detect.c
@@ -50,6 +50,7 @@ typedef struct InferenceDetectContext {
     InferenceBaseContext *base;
 
     char *model_file;
+    char *label_file;
 
     int backend_type;
     int device_type;
@@ -63,47 +64,40 @@ typedef struct InferenceDetectContext {
     char *name;
     char *params;
 
-    int  (*init) (AVFilterContext *ctx, const char *args);
-    void (*uninit)(AVFilterContext *ctx);
-    int  (*end_frame_filter)(AVFilterContext *ctx, InferTensorMeta *data, AVFrame *frame);
-    void *priv;
+
+    AVBufferRef *label_buf;
 } InferenceDetectContext;
 
-static void infer_detect_metadata_buffer_free(void *opaque, uint8_t *data)
+static void infer_labels_buffer_free(void *opaque, uint8_t *data)
 {
     int i;
-    InferDetectionMeta *meta = (InferDetectionMeta *)data;
-    LabelsArray *labels = meta->labels;
-    BBoxesArray *bboxes = meta->bboxes;
+    LabelsArray *labels = (LabelsArray *)data;
+
+    for (i = 0; i < labels->num; i++)
+        av_freep(&labels->label[i]);
+
+    av_free(data);
+}
+
+static void infer_detect_metadata_buffer_free(void *opaque, uint8_t *data)
+{
+    BBoxesArray *bboxes = ((InferDetectionMeta *)data)->bboxes;
 
     if (bboxes) {
+        int i;
         for (i = 0; i < bboxes->num; i++) {
            InferDetection *p = bboxes->bbox[i];
+            if (p->label_buf)
+                av_buffer_unref(&p->label_buf);
             av_freep(&p);
         }
         av_freep(&bboxes);
     }
 
-    if (labels) {
-        for (i = 0; i < labels->num; i++) {
-            char *l = labels->label[i];
-            av_freep(&l);
-        }
-        av_freep(&labels);
-    }
-
     av_free(data);
 }
 
-typedef struct FaceDetectContext {
-    int max_num;
-
-} FaceDetectContext;
-
-static int face_init(AVFilterContext *ctx, const char *args) {return 0;}
-static void face_uninit(AVFilterContext *ctx) {}
-
-static int face_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *meta, AVFrame *frame)
+static int detect_postprocess(AVFilterContext *ctx, InferTensorMeta *meta, AVFrame *frame)
 {
     int i;
     InferenceDetectContext *s = ctx->priv;
@@ -144,14 +138,18 @@ static int face_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *meta, A
             break;
         }
 
+        if (s->label_buf)
+            new_bbox->label_buf = av_buffer_ref(s->label_buf);
+
         av_dynarray_add(&boxes->bbox, &boxes->num, new_bbox);
     }
 
     // dump face detected meta
     for (i = 0; i < boxes->num; i++) {
         InferDetection *p = boxes->bbox[i];
-        av_log(ctx, AV_LOG_DEBUG, "DETECT META - label:%d confi:%f coord:%f %f %f %f\n",
-               p->label_id, p->confidence, p->x_min, p->y_min, p->x_max, p->y_max);
+        av_log(ctx, AV_LOG_DEBUG,
+               "DETECT META - label:%d confi:%f coord:%f %f %f %f\n",
+               p->label_id, p->confidence, p->x_min, p->y_min, p->x_max, p->y_max);
     }
 
     ref = av_buffer_create((uint8_t *)detect_meta, sizeof(*detect_meta),
@@ -160,35 +158,18 @@ static int face_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *meta, A
         return AVERROR(ENOMEM);
 
     detect_meta->bboxes = boxes;
-    detect_meta->labels = NULL;
 
     // add meta data to side data
     sd = av_frame_new_side_data_from_buf(frame, AV_FRAME_DATA_INFERENCE_DETECTION, ref);
     if (!sd) {
         av_buffer_unref(&ref);
-        av_log(NULL, AV_LOG_ERROR, "could not add new side data\n");
+        av_log(NULL, AV_LOG_ERROR, "Could not add new side data\n");
         return AVERROR(ENOMEM);
     }
 
     return 0;
 }
 
-typedef struct EmotionDetectContext {
-    int max_num;
-
-} EmotionDetectContext;
-static int emotion_init(AVFilterContext *ctx, const char *args) {return 0;}
-static void emotion_uninit(AVFilterContext *ctx) {}
-static int emotion_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *data, AVFrame *frame) { return 0; }
-
-typedef struct LogoDetectContext {
-    int max_num;
-
-} LogoDetectContext;
-static int logo_init(AVFilterContext *ctx, const char *args) {return 0;}
-static void logo_uninit(AVFilterContext *ctx) {}
-static int logo_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *data, AVFrame *frame) { return 0; }
-
 static int detect_preprocess(InferenceBaseContext *base, int index, AVFrame *in, AVFrame **out)
 {
     int ret;
@@ -217,7 +198,7 @@ static int query_formats(AVFilterContext *context)
 
     formats_list = ff_make_format_list(pixel_formats);
     if (!formats_list) {
-        av_log(context, AV_LOG_ERROR, "could not create formats list\n");
+        av_log(context, AV_LOG_ERROR, "Could not create formats list\n");
         return AVERROR(ENOMEM);
     }
 
@@ -236,7 +217,7 @@ static int config_input(AVFilterLink *inlink)
     VideoPP *vpp = ff_inference_base_get_vpp(s->base);
 
     for (i = 0; i < info->numbers; i++) {
-        av_log(ctx, AV_LOG_DEBUG, "input info [%d] %d - %d %d %d - %d %d %d\n",
+        av_log(ctx, AV_LOG_DEBUG, "Input info [%d] %d - %d %d %d - %d %d %d\n",
               i, info->batch_size, info->width[i], info->height[i], info->channels[i],
               info->is_image[i], info->precision[i], info->layout[i]);
    }
@@ -290,7 +271,7 @@ static int config_output(AVFilterLink *outlink)
     DNNModelInfo *info = ff_inference_base_get_output_info(s->base);
 
     for (int i = 0; i < info->numbers; i++) {
-        av_log(ctx, AV_LOG_DEBUG, "output info [%d] %d - %d %d %d - %d %d %d\n",
+        av_log(ctx, AV_LOG_DEBUG, "Output info [%d] %d - %d %d %d - %d %d %d\n",
               i, info->batch_size,
               info->width[i], info->height[i], info->channels[i],
               info->is_image[i], info->precision[i], info->layout[i]);
@@ -301,43 +282,48 @@ static int config_output(AVFilterLink *outlink)
     return 0;
 }
 
-typedef struct DetectFilterEntry {
-    const char *name;
-    size_t priv_size;
-    int  (*init)(AVFilterContext *ctx, const char *args);
-    void (*uninit)(AVFilterContext *ctx);
-    int  (*end_frame_filter)(AVFilterContext *ctx, InferTensorMeta *data, AVFrame *frame);
-} DetectFilterEntry;
-
-static const DetectFilterEntry detect_filter_entries[] = {
-    { "face",    sizeof(FaceDetectContext),    face_init,    face_uninit,    face_end_frame_filter },
-    { "emotion", sizeof(EmotionDetectContext), emotion_init, emotion_uninit, emotion_end_frame_filter },
-    { "logo",    sizeof(LogoDetectContext),    logo_init,    logo_uninit,    logo_end_frame_filter },
-};
-
 static av_cold int detect_init(AVFilterContext *ctx)
 {
-    int i, ret;
+    int ret;
     InferenceDetectContext *s = ctx->priv;
     InferenceParam p = {};
 
     av_assert0(s->model_file && s->name);
 
-    for (i = 0; i < FF_ARRAY_ELEMS(detect_filter_entries); i++) {
-        const DetectFilterEntry *entry = &detect_filter_entries[i];
-        if (!strcmp(s->name, entry->name)) {
-            s->init             = entry->init;
-            s->uninit           = entry->uninit;
-            s->end_frame_filter = entry->end_frame_filter;
+    av_assert0(s->backend_type == DNN_INTEL_IE);
 
-            if (!(s->priv = av_mallocz(entry->priv_size)))
-                return AVERROR(ENOMEM);
+    if (s->label_file) {
+        int n, labels_num;
+        AVBufferRef *ref = NULL;
+        LabelsArray *larray = NULL;
+        char buffer[4096] = { };
+        char *_labels[100] = { };
+
+        FILE *fp = fopen(s->label_file, "rb");
+        if (!fp) {
+            av_log(ctx, AV_LOG_ERROR, "Could not open file:%s\n", s->label_file);
+            return AVERROR(EIO);
         }
-    }
 
-    av_assert0(s->init);
+        n = fread(buffer, 1, sizeof(buffer) - 1, fp);
+        fclose(fp);
+        buffer[n] = '\0';
 
-    av_assert0(s->backend_type == DNN_INTEL_IE);
+        buffer[strcspn(buffer, "\n")] = 0;
+        av_split(buffer, ",", _labels, &labels_num, 100);
+
+        larray = av_mallocz(sizeof(*larray));
+        if (!larray)
+            return AVERROR(ENOMEM);
+
+        for (n = 0; n < labels_num; n++) {
+            char *l = av_strdup(_labels[n]);
+            av_dynarray_add(&larray->label, &larray->num, l);
+        }
+
+        ref = av_buffer_create((uint8_t *)larray, sizeof(*larray),
+                               &infer_labels_buffer_free, NULL, 0);
+        s->label_buf = ref;
+    }
 
     p.model_file   = s->model_file;
     p.backend_type = s->backend_type;
@@ -352,14 +338,7 @@ static av_cold int detect_init(AVFilterContext *ctx)
 
     ret = ff_inference_base_create(ctx, &s->base, &p);
     if (ret < 0) {
-        av_log(ctx, AV_LOG_ERROR, "could not create inference\n");
-        return ret;
-    }
-
-    ret = s->init(ctx, s->params);
-    if (ret < 0) {
-        ff_inference_base_free(&s->base);
-        av_log(ctx, AV_LOG_ERROR, "init '%s' failed\n", s->name);
+        av_log(ctx, AV_LOG_ERROR, "Could not create inference\n");
         return ret;
     }
 
@@ -372,7 +351,7 @@ static av_cold void detect_uninit(AVFilterContext *ctx)
 
     ff_inference_base_free(&s->base);
 
-    av_freep(&s->priv);
+    if (s->label_buf) av_buffer_unref(&s->label_buf);
 }
 
 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
@@ -391,7 +370,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     if (ret < 0)
         goto fail;
 
-    s->end_frame_filter(ctx, &tensor_meta, in);
+    detect_postprocess(ctx, &tensor_meta, in);
 
     return ff_filter_frame(outlink, in);
 fail:
@@ -403,6 +382,7 @@ static const AVOption inference_detect_options[] = {
execution", OFFSET(backend_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_INTEL_IE }, 0, 2, FLAGS, "engine" }, { "model", "path to model file for network", OFFSET(model_file), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "device", "running on device type", OFFSET(device_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12, FLAGS }, + { "label", "label file path for detection", OFFSET(label_file), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "interval", "detect every Nth frame", OFFSET(every_nth_frame), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 15, FLAGS}, { "batch_size", "batch size per infer", OFFSET(batch_size), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS}, { "threshold", "threshod to filter output data", OFFSET(threshold), AV_OPT_TYPE_FLOAT, { .dbl = 0.5}, 0, 1, FLAGS}, diff --git a/libavformat/iemetadataenc.c b/libavformat/iemetadataenc.c index eb332ef..7d47aae 100644 --- a/libavformat/iemetadataenc.c +++ b/libavformat/iemetadataenc.c @@ -154,7 +154,6 @@ static int write_packet(AVFormatContext *s, AVPacket *pkt) return 0; AVFrameSideData *sd; InferDetectionMeta *meta; - LabelsArray *labels; BBoxesArray *bboxes; AVFrameSideData *c_sd; InferClassificationMeta *cmeta; @@ -167,7 +166,6 @@ static int write_packet(AVFormatContext *s, AVPacket *pkt) if (meta) { bboxes = meta->bboxes; - labels = meta->labels; if (bboxes) { if (bboxes->num > 0) { jhead_write(s, frm_data); @@ -198,7 +196,15 @@ static int write_packet(AVFormatContext *s, AVPacket *pkt) JSON_IVALUE(tmp_str, "object_id", bboxes->bbox[i]->object_id); fill_line(s, tmp_str, md->current_escape_num, 0); - JSON_STRING(tmp_str, "label", ""); + if (!bboxes->bbox[i]->label_buf) { + JSON_STRING(tmp_str, "label", "face"); + } else { + // object detection label index start from 1 + int label_id = bboxes->bbox[i]->label_id - 1; + LabelsArray *array = (LabelsArray*)(bboxes->bbox[i]->label_buf->data); + JSON_STRING(tmp_str, "label", array->label[label_id]); + } + fill_line(s, tmp_str, md->current_escape_num, 0); JSON_IVALUE(tmp_str, "label_id", bboxes->bbox[i]->label_id); fill_line(s, tmp_str, md->current_escape_num, 0); -- 2.7.4