From 819d5169680f30b7176c450b2298b70400b41691 Mon Sep 17 00:00:00 2001
From: Lin Xie
Date: Sun, 3 Feb 2019 11:04:19 +0800
Subject: [PATCH] Support object detection and feature-based face identification

Signed-off-by: Lin Xie
---
 libavfilter/inference.c             |  59 ++++++-
 libavfilter/inference.h             |  33 ++--
 libavfilter/vf_inference_classify.c | 321 ++++++++++++++++++++++++++++++------
 libavfilter/vf_inference_detect.c   | 148 +++++++----------
 libavformat/iemetadataenc.c         |  12 +-
 5 files changed, 416 insertions(+), 157 deletions(-)

diff --git a/libavfilter/inference.c b/libavfilter/inference.c
index 14b4093..e569620 100644
--- a/libavfilter/inference.c
+++ b/libavfilter/inference.c
@@ -101,15 +101,18 @@ static int fill_dnn_data_from_frame(DNNIOData *data,
 }
 
 static int sw_crop_and_scale(AVFrame *frame,
-                             float x0, float y0, float x1, float y1,
-                             int out_w, int out_h, uint8_t *data[], int stride[])
+                             float x0, float y0,
+                             float x1, float y1,
+                             int out_w, int out_h,
+                             enum AVPixelFormat out_format,
+                             uint8_t *data[], int stride[])
 {
     int err, bufsize;
     struct SwsContext *sws_ctx;
     const AVPixFmtDescriptor *desc;
     int x, y, w, h, hsub, vsub;
     int max_step[4]; ///< max pixel step for each plane, expressed as a number of bytes
-    enum AVPixelFormat expect_format = AV_PIX_FMT_BGR24;
+    enum AVPixelFormat expect_format = out_format;
 
     AVFrame *temp = av_frame_alloc();
     if (!temp) {
@@ -175,13 +178,56 @@ static int sw_crop_and_scale(AVFrame *frame,
     return 0;
 }
 
+void av_split(char *str, const char *delim, char **array, int *num, int max)
+{
+    char *p;
+    int i = 0;
+
+    if (!str || !delim || !array || !num)
+        return;
+
+    p = strtok(str, delim);
+    while (p != NULL) {
+        array[i++] = p;
+
+        av_assert0(i < max);
+
+        p = strtok(NULL, delim);
+    }
+    *num = i;
+}
+
+double av_norm(float vec[], size_t num)
+{
+    size_t i;
+    double result = 0.0;
+
+    for (i = 0; i < num; i++)
+        result += vec[i] * vec[i];
+
+    return sqrt(result);
+}
+
+double av_dot(float vec1[], float vec2[], size_t num)
+{
+    size_t i;
+    double result = 0.0;
+
+    for (i = 0; i < num; i++)
+        result += vec1[i] * vec2[i];
+
+    return result;
+}
+
 int ff_inference_base_create(AVFilterContext *ctx,
                              InferenceBaseContext **base,
-                             InferenceParam *param) {
+                             InferenceParam *param)
+{
     int i, ret;
     InferenceBaseContext *s;
-    DNNModelInfo *info;
     VideoPP *vpp;
+    DNNModelInfo *info;
+    DNNModelIntelIEConfig config;
 
     if (!param)
         return AVERROR(EINVAL);
@@ -202,7 +248,7 @@ int ff_inference_base_create(AVFilterContext *ctx,
     // parameter sanity check
     if (param->batch_size <= 0)
         param->batch_size = 1;
-    DNNModelIntelIEConfig config = {
+    config = (DNNModelIntelIEConfig) {
         .model  = param->model_file,
         .labels = param->labels_file,
         .device = param->device_type,
@@ -251,6 +297,7 @@ int ff_inference_base_create(AVFilterContext *ctx,
     // vpp init
     vpp->swscale        = &sws_scale;
     vpp->crop_and_scale = &sw_crop_and_scale;
+    vpp->expect_format  = AV_PIX_FMT_BGR24;
 
     *base = s;
 #undef DNN_ERR_CHECK
diff --git a/libavfilter/inference.h b/libavfilter/inference.h
index eebfd10..843e05c 100644
--- a/libavfilter/inference.h
+++ b/libavfilter/inference.h
@@ -20,6 +20,7 @@
 #define AVFILTER_INFERENCE_H
 
 #include "libavutil/common.h"
+#include "libswscale/swscale.h"
 #include "dnn_interface.h"
 
 typedef struct InferenceBaseContext InferenceBaseContext;
@@ -52,10 +53,11 @@ typedef enum { VPP_DEVICE_HW, VPP_DEVICE_SW } VPPDevice;
 
 typedef struct VideoPP {
     int device;
+    int expect_format;
     void *scale_contexts[MAX_VPP_NUM];
     AVFrame *frames[MAX_VPP_NUM]; //...
diff --git a/libavfilter/vf_inference_classify.c b/libavfilter/vf_inference_classify.c
@@ ... @@ static av_cold void dump_emotion(AVFilterContext *ctx, int label_id,
+    LabelsArray *array = (LabelsArray *)label_buf->data;
 
-    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - label:%d emotion:%s \n",
-           label_id, emotions[label_id]);
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Label id:%d Emotion:%s Conf:%f\n",
+           label_id, array->label[label_id], conf);
 }
 
 static int emotion_classify_result_process(AVFilterContext *ctx,
@@ -150,24 +152,25 @@ static int emotion_classify_result_process(AVFilterContext *ctx,
     classify->confidence = emo_confidence[label_id];
     classify->label_buf  = av_buffer_ref(s->label_bufs[model_index]);
 
-    dump_emotion(ctx, classify->label_id);
+    dump_emotion(ctx, classify->label_id, classify->confidence, classify->label_buf);
 
     av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
 
     return 0;
 }
 
-static av_cold void dump_gender(AVFilterContext *ctx, int label_id, float conf)
+static av_cold void dump_gender(AVFilterContext *ctx, int label_id,
+                                float conf, AVBufferRef *label_buf)
 {
-    const char *genders[] = { "female", "male" };
+    LabelsArray *array = (LabelsArray *)label_buf->data;
 
-    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Gender:%s Confidence:%f\n",
-           genders[label_id], conf);
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Gender:%s Conf:%1.2f\n",
+           array->label[label_id], conf);
 }
 
 static av_cold void dump_age(AVFilterContext *ctx, float age)
 {
-    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Age:%f \n", age);
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Age:%1.2f\n", age);
 }
 
 static int age_gender_classify_result_process(AVFilterContext *ctx,
@@ -198,7 +201,7 @@ static int age_gender_classify_result_process(AVFilterContext *ctx,
         classify->label_id   = data[0] > data[1] ? 0 : 1;
         classify->confidence = data[classify->label_id];
         classify->label_buf  = av_buffer_ref(s->label_bufs[model_index]);
-        dump_gender(ctx, classify->label_id, classify->confidence);
+        dump_gender(ctx, classify->label_id, classify->confidence, classify->label_buf);
     }
 
     av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
 
@@ -206,6 +209,199 @@ static int age_gender_classify_result_process(AVFilterContext *ctx,
     return 0;
 }
 
+static int face_identify_init(AVFilterContext *ctx, size_t index)
+{
+    FaceIdentifyContext *identify_ctx;
+    InferenceClassifyContext *s = ctx->priv;
+
+    int i, ret, feature_size, expected_size;
+    size_t vec_size_in_bytes = sizeof(float) * FACE_FEATURE_VECTOR_LEN;
+
+    FILE *fp = fopen(s->feature_file, "rb");
+    if (!fp) {
+        av_log(ctx, AV_LOG_ERROR, "Could not open feature file:%s\n", s->feature_file);
+        return AVERROR(EIO);
+    }
+
+    av_assert0(index < MAX_MODEL_NUM);
+
+    if (fseek(fp, 0, SEEK_END)) {
+        av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the end of feature file.\n");
+        fclose(fp);
+        return AVERROR(EINVAL);
+    }
+
+    feature_size = ftell(fp);
+
+    if (feature_size == -1) {
+        fclose(fp);
+        av_log(ctx, AV_LOG_ERROR, "Couldn't get size of feature file.\n");
+        return AVERROR(EINVAL);
+    } else if (feature_size % FACE_FEATURE_VECTOR_LEN) {
+        fclose(fp);
+        av_log(ctx, AV_LOG_ERROR, "Feature data must align to %d.\n", FACE_FEATURE_VECTOR_LEN);
+        return AVERROR(EINVAL);
+    }
+
+    if (s->feature_num > 0) {
+        expected_size = s->feature_num * vec_size_in_bytes;
+        if (expected_size != feature_size) {
+            fclose(fp);
+            av_log(ctx, AV_LOG_ERROR, "Unexpected feature file size.\n");
+            return AVERROR(EINVAL);
+        }
+    } else {
+        s->feature_num = feature_size / vec_size_in_bytes;
+    }
+
+    identify_ctx = av_mallocz(sizeof(*identify_ctx));
+    if (!identify_ctx) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    identify_ctx->vector_num = s->feature_num;
+
+    identify_ctx->feature_vecs = av_mallocz(sizeof(float *) * identify_ctx->vector_num);
+    if (!identify_ctx->feature_vecs) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    rewind(fp);
+
+    for (i = 0; i < identify_ctx->vector_num; i++) {
+        identify_ctx->feature_vecs[i] = av_malloc(vec_size_in_bytes);
+        if (!identify_ctx->feature_vecs[i]) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        if (fread(identify_ctx->feature_vecs[i], vec_size_in_bytes, 1, fp) != 1) {
+            ret = AVERROR(EINVAL);
+            goto fail;
+        }
+    }
+
+    identify_ctx->norm_std = av_mallocz(sizeof(double) * identify_ctx->vector_num);
+    if (!identify_ctx->norm_std) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    for (i = 0; i < identify_ctx->vector_num; i++)
+        identify_ctx->norm_std[i] = av_norm(identify_ctx->feature_vecs[i],
+                                            FACE_FEATURE_VECTOR_LEN);
+
+    s->priv[index] = identify_ctx;
+    fclose(fp);
+    return 0;
+fail:
+    fclose(fp);
+
+    if (identify_ctx) {
+        if (identify_ctx->feature_vecs) {
+            for (i = 0; i < identify_ctx->vector_num; i++) {
+                if (identify_ctx->feature_vecs[i])
+                    av_free(identify_ctx->feature_vecs[i]);
+            }
+            av_free(identify_ctx->feature_vecs);
+        }
+        av_free(identify_ctx);
+    }
+    return ret;
+}
+
+static int face_identify_uninit(AVFilterContext *ctx, size_t index)
+{
+    int i;
+    InferenceClassifyContext *s = ctx->priv;
+    FaceIdentifyContext *identify_ctx = s->priv[index];
+
+    if (!identify_ctx) {
+        av_log(ctx, AV_LOG_WARNING, "Empty face identify ctx.\n");
+        return 0;
+    }
+
+    if (identify_ctx->feature_vecs) {
+        for (i = 0; i < identify_ctx->vector_num; i++)
+            av_free(identify_ctx->feature_vecs[i]);
+        av_free(identify_ctx->feature_vecs);
+    }
+
+    if (identify_ctx->norm_std)
+        av_free(identify_ctx->norm_std);
+
+    av_free(identify_ctx);
+    s->priv[index] = NULL;
+
+    return 0;
+}
+
+static av_cold void dump_face_id(AVFilterContext *ctx, int label_id,
+                                 float conf, AVBufferRef *label_buf)
+{
+    LabelsArray *array = (LabelsArray *)label_buf->data;
+
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Face_id:%d Name:%s Conf:%1.2f\n",
+           label_id, array->label[label_id], conf);
+}
+
+static int face_identify_result_process(AVFilterContext *ctx,
+                                        int detect_id,
+                                        int result_id,
+                                        int model_index,
+                                        InferTensorMeta *meta,
+                                        InferClassificationMeta *c_meta)
+{
+    int i, label_id = 0;
+    InferClassification *classify;
+    double dot_product, norm_feature, confidence, *angles;
+    InferenceClassifyContext *s = ctx->priv;
+    FaceIdentifyContext *f = s->priv[model_index];
+    double min_angle = 180.0f;
+    float *feature_vector = meta->data;
+
+    angles = av_malloc(sizeof(double) * f->vector_num);
+    if (!angles)
+        return AVERROR(ENOMEM);
+
+    norm_feature = av_norm(feature_vector, FACE_FEATURE_VECTOR_LEN);
+
+    for (i = 0; i < f->vector_num; i++) {
+        dot_product = av_dot(feature_vector,
+                             f->feature_vecs[i],
+                             FACE_FEATURE_VECTOR_LEN);
+
+        angles[i] = acos((dot_product - 0.0001f) /
+                         (f->norm_std[i] * norm_feature)) /
+                    PI * 180.0;
+        if (angles[i] < THRESHOLD_RECOGNITION && angles[i] < min_angle) {
+            label_id  = i;
+            min_angle = angles[i];
+        }
+    }
+
+    confidence = (90.0f - min_angle) / 90.0f;
+
+    av_free(angles);
+
+    classify = av_mallocz(sizeof(*classify));
+    if (!classify)
+        return AVERROR(ENOMEM);
+
+    classify->detect_id  = detect_id;
+    classify->name       = s->name_array[model_index];
+    classify->label_id   = label_id;
+    classify->confidence = (float)confidence;
+    classify->label_buf  = av_buffer_ref(s->label_bufs[model_index]);
+
+    dump_face_id(ctx, label_id, confidence, s->label_bufs[model_index]);
+
+    av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
+
+    return 0;
+}
+
 static int query_formats(AVFilterContext *context)
 {
     AVFilterFormats *formats_list;
@@ -217,7 +413,7 @@ static int query_formats(AVFilterContext *context)
 
     formats_list = ff_make_format_list(pixel_formats);
     if (!formats_list) {
-        av_log(context, AV_LOG_ERROR, "could not create formats list\n");
+        av_log(context, AV_LOG_ERROR, "Could not create formats list\n");
         return AVERROR(ENOMEM);
     }
 
@@ -237,15 +433,15 @@ static av_cold int classify_init(AVFilterContext *ctx)
 
     av_assert0(s->model_file);
 
-    split(s->model_file, "&", models, &model_num, max_num);
+    av_split(s->model_file, "&", models, &model_num, max_num);
     for (i = 0; i < model_num; i++)
         av_log(ctx, AV_LOG_INFO, "model[%d]:%s\n", i, models[i]);
 
-    split(s->labels, "&", labels, &label_num, max_num);
+    av_split(s->labels, "&", labels, &label_num, max_num);
     for (i = 0; i < label_num; i++)
        av_log(ctx, AV_LOG_INFO, "label[%d]:%s\n", i, labels[i]);
 
-    split(s->names, "&", names, &name_num, max_num);
+    av_split(s->names, "&", names, &name_num, max_num);
     for (i = 0; i < name_num; i++)
         av_log(ctx, AV_LOG_INFO, "name[%d]:%s\n", i, names[i]);
 
@@ -265,7 +461,7 @@ static av_cold int classify_init(AVFilterContext *ctx)
         p.model_file = models[i];
         ret = ff_inference_base_create(ctx, &base, &p);
         if (ret < 0) {
-            av_log(ctx, AV_LOG_ERROR, "could not create inference\n");
+            av_log(ctx, AV_LOG_ERROR, "Could not create inference\n");
             return ret;
         }
 
@@ -280,18 +476,18 @@ static av_cold int classify_init(AVFilterContext *ctx)
         char buffer[4096] = { };
         char *_labels[100] = { };
 
-        FILE *fp = fopen(labels[i], "r+b");
+        FILE *fp = fopen(labels[i], "rb");
         if (!fp) {
-            av_log(ctx, AV_LOG_ERROR, "could not open file:%s\n", labels[i]);
+            av_log(ctx, AV_LOG_ERROR, "Could not open file:%s\n", labels[i]);
             ret = AVERROR(EIO);
-            fclose(fp);
             goto fail;
         }
-        fread(buffer, sizeof(buffer), 1, fp);
+
+        n = fread(buffer, 1, sizeof(buffer) - 1, fp);
         fclose(fp);
+        buffer[n] = '\0';
 
         buffer[strcspn(buffer, "\n")] = 0;
-        split(buffer, ",", _labels, &labels_num, 100);
+        av_split(buffer, ",", _labels, &labels_num, 100);
 
         larray = av_mallocz(sizeof(*larray));
         if (!larray) {
@@ -311,10 +507,23 @@ static av_cold int classify_init(AVFilterContext *ctx)
 
     for (i = 0; i < name_num; i++) {
         s->name_array[i] = names[i];
-        if (strstr(names[i], "emotion"))
+        if (strstr(names[i], "emotion")) {
             s->post_process[i] = &emotion_classify_result_process;
-        else if (strstr(names[i], "age") && strstr(names[i], "gend"))
+        } else if (strstr(names[i], "age") && strstr(names[i], "gend")) {
             s->post_process[i] = &age_gender_classify_result_process;
+        } else if (strstr(names[i], "face")) {
+            VideoPP *vpp = ff_inference_base_get_vpp(s->infer_bases[i]);
+
+            // face reidentification model requires RGB format
+            vpp->expect_format = AV_PIX_FMT_RGB24;
+
+            s->init[i]         = &face_identify_init;
+            s->uninit[i]       = &face_identify_uninit;
+            s->post_process[i] = &face_identify_result_process;
+        }
+
+        if (s->init[i] && s->init[i](ctx, i) < 0)
+            goto fail;
     }
 
     return 0;
@@ -335,7 +544,10 @@ static av_cold void classify_uninit(AVFilterContext *ctx)
     InferenceClassifyContext *s = ctx->priv;
 
     for (i = 0; i < s->loaded_num; i++) {
+        if (s->uninit[i])
+            s->uninit[i](ctx, i);
+
         ff_inference_base_free(&s->infer_bases[i]);
         av_buffer_unref(&s->label_bufs[i]);
     }
 }
@@ -396,6 +608,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
                                      bbox->y_max * in->height,
                                      iinfo->width[0],
                                      iinfo->height[0],
+                                     vpp->expect_format,
                                      tmp->data,
                                      tmp->linesize);
 
@@ -422,7 +635,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     new_sd = av_frame_new_side_data_from_buf(in, AV_FRAME_DATA_INFERENCE_CLASSIFICATION, ref);
     if (!new_sd) {
         av_buffer_unref(&ref);
-        av_log(NULL, AV_LOG_ERROR, "could not add new side data\n");
+        av_log(NULL, AV_LOG_ERROR, "Could not add new side data\n");
         return AVERROR(ENOMEM);
     }
 
@@ -478,13 +691,15 @@
 }
 
 static const AVOption inference_classify_options[] = {
-    { "dnn_backend", "DNN backend for model execution", OFFSET(backend_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_INTEL_IE }, 0, 2, FLAGS, "engine" },
-    { "models", "path to model files for network", OFFSET(model_file), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
-    { "labels", "labels for classify", OFFSET(labels), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
-    { "names", "classify type names", OFFSET(names), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
-    { "device", "running on device type", OFFSET(device_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12, FLAGS },
-    { "interval", "do infer every Nth frame", OFFSET(every_nth_frame), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 15, FLAGS},
-    { "batch_size", "batch size per infer", OFFSET(batch_size), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS},
+    { "dnn_backend",  "DNN backend for model execution", OFFSET(backend_type),    AV_OPT_TYPE_FLAGS,  { .i64 = DNN_INTEL_IE },          0, 2,    FLAGS, "engine" },
+    { "model",        "path to model files for network", OFFSET(model_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
+    { "label",        "labels for classify",             OFFSET(labels),          AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
+    { "name",         "classify type names",             OFFSET(names),           AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS },
+    { "device",       "running on device type",          OFFSET(device_type),     AV_OPT_TYPE_FLAGS,  { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12,   FLAGS },
+    { "interval",     "do infer every Nth frame",        OFFSET(every_nth_frame), AV_OPT_TYPE_INT,    { .i64 = 1 },                     0, 15,   FLAGS },
+    { "batch_size",   "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },                     1, 1024, FLAGS },
+    { "feature_file", "registered face feature data",    OFFSET(feature_file),    AV_OPT_TYPE_STRING, { .str = NULL},                   0, 0,    FLAGS, "face_identify" },
+    { "feature_num",  "registered face number",          OFFSET(feature_num),     AV_OPT_TYPE_INT,    { .i64 = 0},                      0, 1024, FLAGS, "face_identify" },
     { NULL }
 };
diff --git a/libavfilter/vf_inference_detect.c b/libavfilter/vf_inference_detect.c
index 8d676cf..73d77ab 100644
--- a/libavfilter/vf_inference_detect.c
+++ b/libavfilter/vf_inference_detect.c
@@ -50,6 +50,7 @@ typedef struct InferenceDetectContext {
     InferenceBaseContext *base;
 
     char *model_file;
+    char *label_file;
 
     int backend_type;
     int device_type;
@@ -63,47 +64,40 @@ typedef struct InferenceDetectContext {
     char *name;
     char *params;
 
-    int  (*init) (AVFilterContext *ctx, const char *args);
-    void (*uninit)(AVFilterContext *ctx);
-    int  (*end_frame_filter)(AVFilterContext *ctx, InferTensorMeta *data, AVFrame *frame);
-    void *priv;
+
+    AVBufferRef *label_buf;
 } InferenceDetectContext;
 
-static void infer_detect_metadata_buffer_free(void *opaque, uint8_t *data)
+static void infer_labels_buffer_free(void *opaque, uint8_t *data)
 {
     int i;
-    InferDetectionMeta *meta = (InferDetectionMeta *)data;
-    LabelsArray *labels = meta->labels;
-    BBoxesArray *bboxes = meta->bboxes;
+    LabelsArray *labels = (LabelsArray *)data;
+
+    for (i = 0; i < labels->num; i++)
+        av_freep(&labels->label[i]);
+
+    av_free(data);
+}
+
+static void infer_detect_metadata_buffer_free(void *opaque, uint8_t *data)
+{
+    BBoxesArray *bboxes = ((InferDetectionMeta *)data)->bboxes;
 
     if (bboxes) {
+        int i;
         for (i = 0; i < bboxes->num; i++) {
            InferDetection *p = bboxes->bbox[i];
+            if (p->label_buf)
+                av_buffer_unref(&p->label_buf);
             av_freep(&p);
         }
         av_freep(&bboxes);
     }
 
-    if (labels) {
-        for (i = 0; i < labels->num; i++) {
-            char *l = labels->label[i];
-            av_freep(&l);
-        }
-        av_freep(&labels);
-    }
-
     av_free(data);
 }
 
-typedef struct FaceDetectContext {
-    int max_num;
-
-} FaceDetectContext;
-
-static int face_init(AVFilterContext *ctx, const char *args) {return 0;}
-static void face_uninit(AVFilterContext *ctx) {}
-
-static int face_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *meta, AVFrame *frame)
+static int detect_postprocess(AVFilterContext *ctx, InferTensorMeta *meta, AVFrame *frame)
 {
     int i;
     InferenceDetectContext *s = ctx->priv;
@@ -144,14 +138,18 @@ static int face_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *meta, A
             break;
         }
 
+        if (s->label_buf)
+            new_bbox->label_buf = av_buffer_ref(s->label_buf);
+
         av_dynarray_add(&boxes->bbox, &boxes->num, new_bbox);
     }
 
     // dump face detected meta
     for (i = 0; i < boxes->num; i++) {
         InferDetection *p = boxes->bbox[i];
-        av_log(ctx, AV_LOG_DEBUG, "DETECT META - label:%d confi:%f coord:%f %f %f %f\n",
-               p->label_id, p->confidence, p->x_min, p->y_min, p->x_max, p->y_max);
+        av_log(ctx, AV_LOG_DEBUG,
+               "DETECT META - label:%d confi:%f coord:%f %f %f %f\n",
+               p->label_id, p->confidence, p->x_min, p->y_min, p->x_max, p->y_max);
     }
 
     ref = av_buffer_create((uint8_t *)detect_meta, sizeof(*detect_meta),
@@ -160,35 +158,18 @@ static int face_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *meta, A
         return AVERROR(ENOMEM);
 
     detect_meta->bboxes = boxes;
-    detect_meta->labels = NULL;
 
     // add meta data to side data
     sd = av_frame_new_side_data_from_buf(frame, AV_FRAME_DATA_INFERENCE_DETECTION, ref);
     if (!sd) {
         av_buffer_unref(&ref);
-        av_log(NULL, AV_LOG_ERROR, "could not add new side data\n");
+        av_log(NULL, AV_LOG_ERROR, "Could not add new side data\n");
         return AVERROR(ENOMEM);
     }
 
     return 0;
 }
 
-typedef struct EmotionDetectContext {
-    int max_num;
-
-} EmotionDetectContext;
-static int emotion_init(AVFilterContext *ctx, const char *args) {return 0;}
-static void emotion_uninit(AVFilterContext *ctx) {}
-static int emotion_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *data, AVFrame *frame) { return 0; }
-
-typedef struct LogoDetectContext {
-    int max_num;
-
-} LogoDetectContext;
-static int logo_init(AVFilterContext *ctx, const char *args) {return 0;}
-static void logo_uninit(AVFilterContext *ctx) {}
-static int logo_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *data, AVFrame *frame) { return 0; }
-
 static int detect_preprocess(InferenceBaseContext *base, int index, AVFrame *in, AVFrame **out)
 {
     int ret;
@@ -217,7 +198,7 @@ static int query_formats(AVFilterContext *context)
 
     formats_list = ff_make_format_list(pixel_formats);
     if (!formats_list) {
-        av_log(context, AV_LOG_ERROR, "could not create formats list\n");
+        av_log(context, AV_LOG_ERROR, "Could not create formats list\n");
         return AVERROR(ENOMEM);
     }
 
@@ -236,7 +217,7 @@ static int config_input(AVFilterLink *inlink)
     VideoPP *vpp = ff_inference_base_get_vpp(s->base);
 
     for (i = 0; i < info->numbers; i++) {
-        av_log(ctx, AV_LOG_DEBUG, "input info [%d] %d - %d %d %d - %d %d %d\n",
+        av_log(ctx, AV_LOG_DEBUG, "Input info [%d] %d - %d %d %d - %d %d %d\n",
               i, info->batch_size, info->width[i], info->height[i], info->channels[i],
               info->is_image[i], info->precision[i], info->layout[i]);
    }
@@ -290,7 +271,7 @@ static int config_output(AVFilterLink *outlink)
     DNNModelInfo *info = ff_inference_base_get_output_info(s->base);
 
     for (int i = 0; i < info->numbers; i++) {
-        av_log(ctx, AV_LOG_DEBUG, "output info [%d] %d - %d %d %d - %d %d %d\n",
+        av_log(ctx, AV_LOG_DEBUG, "Output info [%d] %d - %d %d %d - %d %d %d\n",
               i, info->batch_size,
               info->width[i], info->height[i], info->channels[i],
               info->is_image[i], info->precision[i], info->layout[i]);
@@ -301,43 +282,48 @@ static int config_output(AVFilterLink *outlink)
     return 0;
 }
 
-typedef struct DetectFilterEntry {
-    const char *name;
-    size_t priv_size;
-    int  (*init)(AVFilterContext *ctx, const char *args);
-    void (*uninit)(AVFilterContext *ctx);
-    int  (*end_frame_filter)(AVFilterContext *ctx, InferTensorMeta *data, AVFrame *frame);
-} DetectFilterEntry;
-
-static const DetectFilterEntry detect_filter_entries[] = {
-    { "face",    sizeof(FaceDetectContext),    face_init,    face_uninit,    face_end_frame_filter },
-    { "emotion", sizeof(EmotionDetectContext), emotion_init, emotion_uninit, emotion_end_frame_filter },
-    { "logo",    sizeof(LogoDetectContext),    logo_init,    logo_uninit,    logo_end_frame_filter },
-};
-
 static av_cold int detect_init(AVFilterContext *ctx)
 {
-    int i, ret;
+    int ret;
     InferenceDetectContext *s = ctx->priv;
     InferenceParam p = {};
 
     av_assert0(s->model_file && s->name);
 
-    for (i = 0; i < FF_ARRAY_ELEMS(detect_filter_entries); i++) {
-        const DetectFilterEntry *entry = &detect_filter_entries[i];
-        if (!strcmp(s->name, entry->name)) {
-            s->init             = entry->init;
-            s->uninit           = entry->uninit;
-            s->end_frame_filter = entry->end_frame_filter;
+    av_assert0(s->backend_type == DNN_INTEL_IE);
 
-            if (!(s->priv = av_mallocz(entry->priv_size)))
-                return AVERROR(ENOMEM);
+    if (s->label_file) {
+        int n, labels_num;
+        AVBufferRef *ref = NULL;
+        LabelsArray *larray = NULL;
+        char buffer[4096] = { };
+        char *_labels[100] = { };
+
+        FILE *fp = fopen(s->label_file, "rb");
+        if (!fp) {
+            av_log(ctx, AV_LOG_ERROR, "Could not open file:%s\n", s->label_file);
+            return AVERROR(EIO);
         }
-    }
 
-    av_assert0(s->init);
+        n = fread(buffer, 1, sizeof(buffer) - 1, fp);
+        fclose(fp);
+        buffer[n] = '\0';
 
-    av_assert0(s->backend_type == DNN_INTEL_IE);
+        buffer[strcspn(buffer, "\n")] = 0;
+        av_split(buffer, ",", _labels, &labels_num, 100);
+
+        larray = av_mallocz(sizeof(*larray));
+        if (!larray)
+            return AVERROR(ENOMEM);
+
+        for (n = 0; n < labels_num; n++) {
+            char *l = av_strdup(_labels[n]);
+            av_dynarray_add(&larray->label, &larray->num, l);
+        }
+
+        ref = av_buffer_create((uint8_t *)larray, sizeof(*larray),
+                               &infer_labels_buffer_free, NULL, 0);
+        s->label_buf = ref;
+    }
 
     p.model_file   = s->model_file;
     p.backend_type = s->backend_type;
@@ -352,14 +338,7 @@ static av_cold int detect_init(AVFilterContext *ctx)
 
     ret = ff_inference_base_create(ctx, &s->base, &p);
     if (ret < 0) {
-        av_log(ctx, AV_LOG_ERROR, "could not create inference\n");
-        return ret;
-    }
-
-    ret = s->init(ctx, s->params);
-    if (ret < 0) {
-        ff_inference_base_free(&s->base);
-        av_log(ctx, AV_LOG_ERROR, "init '%s' failed\n", s->name);
+        av_log(ctx, AV_LOG_ERROR, "Could not create inference\n");
         return ret;
     }
 
@@ -372,7 +351,7 @@ static av_cold void detect_uninit(AVFilterContext *ctx)
 
     ff_inference_base_free(&s->base);
 
-    av_freep(&s->priv);
+    if (s->label_buf) av_buffer_unref(&s->label_buf);
 }
 
 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
@@ -391,7 +370,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     if (ret < 0)
         goto fail;
 
-    s->end_frame_filter(ctx, &tensor_meta, in);
+    detect_postprocess(ctx, &tensor_meta, in);
 
     return ff_filter_frame(outlink, in);
 fail:
@@ -403,6 +382,7 @@ static const AVOption inference_detect_options[] = {
execution", OFFSET(backend_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_INTEL_IE }, 0, 2, FLAGS, "engine" }, { "model", "path to model file for network", OFFSET(model_file), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "device", "running on device type", OFFSET(device_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12, FLAGS }, + { "label", "label file path for detection", OFFSET(label_file), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "interval", "detect every Nth frame", OFFSET(every_nth_frame), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 15, FLAGS}, { "batch_size", "batch size per infer", OFFSET(batch_size), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS}, { "threshold", "threshod to filter output data", OFFSET(threshold), AV_OPT_TYPE_FLOAT, { .dbl = 0.5}, 0, 1, FLAGS}, diff --git a/libavformat/iemetadataenc.c b/libavformat/iemetadataenc.c index eb332ef..7d47aae 100644 --- a/libavformat/iemetadataenc.c +++ b/libavformat/iemetadataenc.c @@ -154,7 +154,6 @@ static int write_packet(AVFormatContext *s, AVPacket *pkt) return 0; AVFrameSideData *sd; InferDetectionMeta *meta; - LabelsArray *labels; BBoxesArray *bboxes; AVFrameSideData *c_sd; InferClassificationMeta *cmeta; @@ -167,7 +166,6 @@ static int write_packet(AVFormatContext *s, AVPacket *pkt) if (meta) { bboxes = meta->bboxes; - labels = meta->labels; if (bboxes) { if (bboxes->num > 0) { jhead_write(s, frm_data); @@ -198,7 +196,15 @@ static int write_packet(AVFormatContext *s, AVPacket *pkt) JSON_IVALUE(tmp_str, "object_id", bboxes->bbox[i]->object_id); fill_line(s, tmp_str, md->current_escape_num, 0); - JSON_STRING(tmp_str, "label", ""); + if (!bboxes->bbox[i]->label_buf) { + JSON_STRING(tmp_str, "label", "face"); + } else { + // object detection label index start from 1 + int label_id = bboxes->bbox[i]->label_id - 1; + LabelsArray *array = (LabelsArray*)(bboxes->bbox[i]->label_buf->data); + JSON_STRING(tmp_str, "label", array->label[label_id]); + } + fill_line(s, tmp_str, md->current_escape_num, 0); JSON_IVALUE(tmp_str, "label_id", bboxes->bbox[i]->label_id); fill_line(s, tmp_str, md->current_escape_num, 0); -- 2.7.4