From 66fac57b2945c4f75aa7efebabf5e6ea12db29bf Mon Sep 17 00:00:00 2001
From: Lin Xie <lin.xie@intel.com>
Date: Wed, 27 Feb 2019 07:32:27 +0800
Subject: [PATCH] Refine features of IE filters

* Enable do inference with specific intervals
* Change detection label index starting from 0

Change-Id: I619ba4fe301fba5b0fd077b8f6bb2d5c714f137e
---
 libavfilter/inference.c             | 28 ++++++++++++++++++++--------
 libavfilter/inference.h             |  2 --
 libavfilter/vf_inference_classify.c | 30 +++++++++++++++++-------------
 libavfilter/vf_inference_detect.c   | 17 +++++++++--------
 libavformat/iemetadataenc.c         |  3 +--
 5 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/libavfilter/inference.c b/libavfilter/inference.c
index e569620..20934fc 100644
--- a/libavfilter/inference.c
+++ b/libavfilter/inference.c
@@ -37,8 +37,6 @@ struct InferenceBaseContext
 {
     char *infer_type;
     int   batch_size;
-    int   every_nth_frame;
-    float threshold;
 
     DNNModule *module;
     DNNModel  *model;
@@ -186,14 +184,30 @@ void av_split(char *str, const char *delim, char **array, int *num, int max)
     if (!str || !delim || !array || !num)
         return;
 
-    p = strtok(str, delim);
-    while (p != NULL) {
-        array[i++] = p;
+    while (p = strtok(str, delim)) {
+        int j = 0;
+        char *s;
+        size_t end;
 
+        /* remove head blanks */
+        while (p[j] == '\n' || p[j] == ' ')
+            j++;
+
+        if (!p[j]) continue;
+
+        /* remove tail blanks */
+        s   = p + j;
+        end = strlen(s) - 1;
+        while (s[end] == '\n' || s[end] == ' ')
+            s[end--] = '\0';
+
+        array[i++] = s;
         av_assert0 (i < max);
 
-        p = strtok(NULL, delim);
+        /* string is cached */
+        str = NULL;
     }
+
     *num = i;
 }
 
@@ -285,8 +299,6 @@ int ff_inference_base_create(AVFilterContext *ctx,
     DNN_ERR_CHECK(ctx);
 
     s->batch_size      = param->batch_size;
-    s->every_nth_frame = param->every_nth_frame;
-    s->threshold       = param->threshold;
     s->preprocess      = param->preprocess;
 
     ret = s->model->create_model(s->model->model);
diff --git a/libavfilter/inference.h b/libavfilter/inference.h
index 843e05c..33de54b 100644
--- a/libavfilter/inference.h
+++ b/libavfilter/inference.h
@@ -36,8 +36,6 @@ typedef struct InferenceParam {
     char  *gpu_extension;
 
     int    batch_size;
-    int    every_nth_frame;
-    float  threshold;
 
     // TODO: inputs attributes are different
     int    input_layout;
diff --git a/libavfilter/vf_inference_classify.c b/libavfilter/vf_inference_classify.c
index cba547a..b2e435b 100644
--- a/libavfilter/vf_inference_classify.c
+++ b/libavfilter/vf_inference_classify.c
@@ -43,7 +43,6 @@
 #define PI 3.1415926
 #define MAX_MODEL_NUM 8
 #define FACE_FEATURE_VECTOR_LEN 256
-#define THRESHOLD_RECOGNITION   70
 
 static char string_age[]    = "age";
 static char string_gender[] = "gender";
@@ -65,11 +64,13 @@ typedef struct InferenceClassifyContext {
     char  *model_file;
     char  *feature_file;    ///< binary feature file for face identification
     int    feature_num;     ///< identification face feature number
+    double feature_angle;   ///< face identification threshold angle value
     int    loaded_num;
     int    backend_type;
     int    device_type;
 
     int    batch_size;
+    int    frame_number;
     int    every_nth_frame;
 
     void           *priv[MAX_MODEL_NUM];
@@ -375,7 +376,7 @@ static int face_identify_result_process(AVFilterContext *ctx,
         angles[i] = acos((dot_product - 0.0001f) /
                          (f->norm_std[i] * norm_feature)) /
                     PI * 180.0;
-        if (angles[i] < THRESHOLD_RECOGNITION && angles[i] < min_angle) {
+        if (angles[i] < s->feature_angle && angles[i] < min_angle) {
             label_id  = i;
             min_angle = angles[i];
         }
@@ -450,7 +451,6 @@ static av_cold int classify_init(AVFilterContext *ctx)
     p.backend_type    = s->backend_type;
     p.device_type     = s->device_type;
     p.batch_size      = s->batch_size;
-    p.every_nth_frame = s->every_nth_frame;
     p.input_precision = DNN_DATA_PRECISION_U8;
     p.input_layout    = DNN_DATA_LAYOUT_NCHW;
     p.input_is_image  = 1;
@@ -486,7 +486,6 @@ static av_cold int classify_init(AVFilterContext *ctx)
         n = fread(buffer, sizeof(buffer), 1, fp);
         fclose(fp);
 
-        buffer[strcspn(buffer, "\n")] = 0;
         av_split(buffer, ",", _labels, &labels_num, 100);
 
         larray = av_mallocz(sizeof(*larray));
@@ -565,6 +564,9 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     ClassifyArray           *c_array;
     InferClassificationMeta *c_meta;
 
+    if (s->frame_number % s->every_nth_frame != 0)
+        goto done;
+
     sd = av_frame_get_side_data(in, AV_FRAME_DATA_INFERENCE_DETECTION);
     if (!sd)
         goto done;
@@ -640,6 +642,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     }
 
 done:
+    s->frame_number++;
     return ff_filter_frame(outlink, in);
 fail:
     av_frame_free(&in);
@@ -691,15 +694,16 @@ static av_cold int config_output(AVFilterLink *outlink)
 }
 
 static const AVOption inference_classify_options[] = {
-    { "dnn_backend",  "DNN backend for model execution", OFFSET(backend_type),    AV_OPT_TYPE_FLAGS,  { .i64 = DNN_INTEL_IE },          0, 2,    FLAGS, "engine" },
-    { "model",        "path to model files for network", OFFSET(model_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
-    { "label",        "labels for classify",             OFFSET(labels),          AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
-    { "name",         "classify type names",             OFFSET(names),           AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
-    { "device",       "running on device type",          OFFSET(device_type),     AV_OPT_TYPE_FLAGS,  { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12,   FLAGS },
-    { "interval",     "do infer every Nth frame",        OFFSET(every_nth_frame), AV_OPT_TYPE_INT,    { .i64 = 1 },                     0, 15,   FLAGS },
-    { "batch_size",   "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },                     1, 1024, FLAGS },
-    { "feature_file", "registered face feature data",    OFFSET(feature_file),    AV_OPT_TYPE_STRING, { .str = NULL},                   0,    0, FLAGS, "face_identify" },
-    { "feature_num",  "registered face number",          OFFSET(feature_num),     AV_OPT_TYPE_INT,    { .i64 = 0},                      0, 1024, FLAGS, "face_identify" },
+    { "dnn_backend",    "DNN backend for model execution", OFFSET(backend_type),    AV_OPT_TYPE_FLAGS,  { .i64 = DNN_INTEL_IE },          0, 2,    FLAGS, "engine" },
+    { "model",          "path to model files for network", OFFSET(model_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
+    { "label",          "labels for classify",             OFFSET(labels),          AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
+    { "name",           "classify type names",             OFFSET(names),           AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
+    { "device",         "running on device type",          OFFSET(device_type),     AV_OPT_TYPE_FLAGS,  { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12,   FLAGS },
+    { "interval",       "do infer every Nth frame",        OFFSET(every_nth_frame), AV_OPT_TYPE_INT,    { .i64 = 1 },                     1, 1024, FLAGS },
+    { "batch_size",     "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },                     1, 1024, FLAGS },
+    { "feature_file",   "registered face feature data",    OFFSET(feature_file),    AV_OPT_TYPE_STRING, { .str = NULL},                   0,    0, FLAGS, "face_identify" },
+    { "feature_num",    "registered face number",          OFFSET(feature_num),     AV_OPT_TYPE_INT,    { .i64 = 0},                      0, 1024, FLAGS, "face_identify" },
+    { "identify_angle", "face identify threshold angle",   OFFSET(feature_angle),   AV_OPT_TYPE_DOUBLE, { .dbl = 70},                     0, 90,   FLAGS, "face_identify" },
     { NULL }
 };
 
diff --git a/libavfilter/vf_inference_detect.c b/libavfilter/vf_inference_detect.c
index 73d77ab..624cb47 100644
--- a/libavfilter/vf_inference_detect.c
+++ b/libavfilter/vf_inference_detect.c
@@ -55,6 +55,7 @@ typedef struct InferenceDetectContext {
     int    device_type;
 
     int    batch_size;
+    int    frame_number;
     int    every_nth_frame;
     float  threshold;
 
@@ -63,7 +64,6 @@ typedef struct InferenceDetectContext {
     int    input_is_image;
 
     char  *name;
-    char  *params;
 
     AVBufferRef *label_buf;
 } InferenceDetectContext;
@@ -308,7 +308,6 @@ static av_cold int detect_init(AVFilterContext *ctx)
         n = fread(buffer, sizeof(buffer), 1, fp);
         fclose(fp);
 
-        buffer[strcspn(buffer, "\n")] = 0;
         av_split(buffer, ",", _labels, &labels_num, 100);
 
         larray = av_mallocz(sizeof(*larray));
@@ -329,8 +328,6 @@ static av_cold int detect_init(AVFilterContext *ctx)
     p.backend_type    = s->backend_type;
     p.device_type     = s->device_type;
     p.batch_size      = s->batch_size;
-    p.every_nth_frame = s->every_nth_frame;
-    p.threshold       = s->threshold;
     p.input_precision = DNN_DATA_PRECISION_U8;
     p.input_layout    = DNN_DATA_LAYOUT_NCHW;
     p.input_is_image  = 1;
@@ -362,6 +359,9 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     AVFilterLink *outlink     = inlink->dst->outputs[0];
     InferTensorMeta tensor_meta = { };
 
+    if (s->frame_number % s->every_nth_frame != 0)
+        goto done;
+
     ret = ff_inference_base_filter_frame(s->base, in);
     if (ret < 0)
         goto fail;
@@ -372,6 +372,8 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 
     detect_postprocess(ctx, &tensor_meta, in);
 
+done:
+    s->frame_number++;
     return ff_filter_frame(outlink, in);
 fail:
     av_frame_free(&in);
@@ -383,12 +385,11 @@ static const AVOption inference_detect_options[] = {
     { "model",       "path to model file for network",  OFFSET(model_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,  FLAGS },
     { "device",      "running on device type",          OFFSET(device_type),     AV_OPT_TYPE_FLAGS,  { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12, FLAGS },
     { "label",       "label file path for detection",   OFFSET(label_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,  FLAGS },
-    { "interval",    "detect every Nth frame",          OFFSET(every_nth_frame), AV_OPT_TYPE_INT,    { .i64 = 1 }, 0, 15, FLAGS},
-    { "batch_size",  "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 }, 1, 1024, FLAGS},
-    { "threshold",   "threshod to filter output data",  OFFSET(threshold),       AV_OPT_TYPE_FLOAT,  { .dbl = 0.5}, 0, 1, FLAGS},
+    { "interval",    "detect every Nth frame",          OFFSET(every_nth_frame), AV_OPT_TYPE_INT,    { .i64 = 1 },  1, 1024, FLAGS},
+    { "batch_size",  "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },  1, 1024, FLAGS},
+    { "threshold",   "threshod to filter output data",  OFFSET(threshold),       AV_OPT_TYPE_FLOAT,  { .dbl = 0.5}, 0, 1,    FLAGS},
 
     { "name",        "detection type name",             OFFSET(name),            AV_OPT_TYPE_STRING, .flags = FLAGS, "detection" },
-    { "filter_params", NULL,                            OFFSET(params),          AV_OPT_TYPE_STRING, .flags = FLAGS, "detection" },
     { NULL }
 };
 
diff --git a/libavformat/iemetadataenc.c b/libavformat/iemetadataenc.c
index 8539e7f..506b82a 100644
--- a/libavformat/iemetadataenc.c
+++ b/libavformat/iemetadataenc.c
@@ -241,8 +241,7 @@ static int write_packet(AVFormatContext *s, AVPacket *pkt)
                     if (!bboxes->bbox[i]->label_buf) {
                         sprintf(tmp_str, "%s", "face");
                     } else {
-                        // object detection label index start from 1
-                        int label_id = bboxes->bbox[i]->label_id - 1;
+                        int label_id = bboxes->bbox[i]->label_id;
                         LabelsArray *array = (LabelsArray*)(bboxes->bbox[i]->label_buf->data);
                         sprintf(tmp_str, "%s", array->label[label_id]);
                     }
-- 
2.7.4