From ff921d4331896e7b1bfcda980694298214bc6145 Mon Sep 17 00:00:00 2001 From: Lin Xie Date: Mon, 10 Jun 2019 16:03:28 +0800 Subject: [PATCH] Face reidentification refine --- configure | 3 + libavfilter/Makefile | 2 + libavfilter/allfilters.c | 2 + libavfilter/inference.h | 5 +- libavfilter/vf_inference_classify.c | 335 ++++----------------------------- libavfilter/vf_inference_identify.c | 328 ++++++++++++++++++++++++++++++++ libavfilter/vf_inference_metaconvert.c | 190 +++++++++++++++++++ 7 files changed, 570 insertions(+), 295 deletions(-) create mode 100644 libavfilter/vf_inference_identify.c create mode 100644 libavfilter/vf_inference_metaconvert.c diff --git a/configure b/configure index b8f9c4a..dcaaf95 100755 --- a/configure +++ b/configure @@ -3416,6 +3416,9 @@ inference_classify_filter_deps="libinference_engine libcjson" inference_classify_filter_select="dnn" inference_detect_filter_deps="libinference_engine libcjson" inference_detect_filter_select="dnn" +inference_identify_filter_deps="libinference_engine libcjson" +inference_identify_filter_select="dnn" +inference_metaconvert_filter_deps="libinference_engine libcjson" interlace_filter_deps="gpl" kerndeint_filter_deps="gpl" ladspa_filter_deps="ladspa libdl" diff --git a/libavfilter/Makefile b/libavfilter/Makefile index d9e0602..11f0fb4 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -260,6 +260,8 @@ OBJS-$(CONFIG_IDET_FILTER) += vf_idet.o OBJS-$(CONFIG_IL_FILTER) += vf_il.o OBJS-$(CONFIG_INFERENCE_CLASSIFY_FILTER) += vf_inference_classify.o OBJS-$(CONFIG_INFERENCE_DETECT_FILTER) += vf_inference_detect.o +OBJS-$(CONFIG_INFERENCE_IDENTIFY_FILTER) += vf_inference_identify.o +OBJS-$(CONFIG_INFERENCE_METACONVERT_FILTER) += vf_inference_metaconvert.o OBJS-$(CONFIG_INFLATE_FILTER) += vf_neighbor.o OBJS-$(CONFIG_INTERLACE_FILTER) += vf_tinterlace.o OBJS-$(CONFIG_INTERLEAVE_FILTER) += f_interleave.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 158c75b..4588c2b 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -246,6 +246,8 @@ extern AVFilter ff_vf_idet; extern AVFilter ff_vf_il; extern AVFilter ff_vf_inference_classify; extern AVFilter ff_vf_inference_detect; +extern AVFilter ff_vf_inference_identify; +extern AVFilter ff_vf_inference_metaconvert; extern AVFilter ff_vf_inflate; extern AVFilter ff_vf_interlace; extern AVFilter ff_vf_interleave; diff --git a/libavfilter/inference.h b/libavfilter/inference.h index 0512403..1d7971e 100644 --- a/libavfilter/inference.h +++ b/libavfilter/inference.h @@ -193,10 +193,13 @@ typedef struct InferDetectionMeta { typedef struct InferClassification { int detect_id; ///< detected bbox index char *name; ///< class name, e.g. emotion, age + char *layer_name; ///< output layer name + char *model; ///< model name int label_id; ///< label index in labels float confidence; float value; - AVBufferRef *label_buf; ///< label ref buf from label file + AVBufferRef *label_buf; ///< label buffer + AVBufferRef *tensor_buf; ///< output tensor buffer } InferClassification; /* dynamic classifications array */ diff --git a/libavfilter/vf_inference_classify.c b/libavfilter/vf_inference_classify.c index 2367a8c..b36a7ca 100644 --- a/libavfilter/vf_inference_classify.c +++ b/libavfilter/vf_inference_classify.c @@ -40,13 +40,7 @@ #define OFFSET(x) offsetof(InferenceClassifyContext, x) #define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM) -#define PI 3.1415926 #define MAX_MODEL_NUM 8 -#define FACE_FEATURE_VECTOR_LEN 256 - -typedef int (*ClassifyInit)(AVFilterContext *ctx, size_t index); - -typedef int (*ClassifyUnInit)(AVFilterContext *ctx, size_t index); typedef int (*ClassifyProcess)(AVFilterContext*, int, int, int, InferTensorMeta*, InferClassificationMeta*); @@ -56,15 +50,10 @@ typedef struct InferenceClassifyContext { InferenceBaseContext *infer_bases[MAX_MODEL_NUM]; - char *labels; - char *names; - char *model_file; char *model_proc; char *vpp_format; - char *feature_file; ///< binary feature file for face identification - int feature_num; ///< identification face feature number - double feature_angle; ///< face identification threshold angle value + int loaded_num; int backend_type; int device_type; @@ -73,12 +62,6 @@ typedef struct InferenceClassifyContext { int frame_number; int every_nth_frame; - void *priv[MAX_MODEL_NUM]; - char *name_array[MAX_MODEL_NUM]; - AVBufferRef *label_bufs[MAX_MODEL_NUM]; - - ClassifyInit init[MAX_MODEL_NUM]; - ClassifyUnInit uninit[MAX_MODEL_NUM]; ClassifyProcess post_process[MAX_MODEL_NUM]; void *proc_config[MAX_MODEL_NUM]; @@ -86,23 +69,6 @@ typedef struct InferenceClassifyContext { ModelOutputPostproc model_postproc[MAX_MODEL_NUM]; } InferenceClassifyContext; -typedef struct FaceIdentifyContext { - size_t vector_num; - double *norm_std; - float **feature_vecs; -} FaceIdentifyContext; - -static void infer_labels_buffer_free(void *opaque, uint8_t *data) -{ - int i; - LabelsArray *labels = (LabelsArray *)data; - - for (i = 0; i < labels->num; i++) - av_freep(&labels->label[i]); - - av_free(data); -} - static void infer_classify_metadata_buffer_free(void *opaque, uint8_t *data) { int i; @@ -113,6 +79,7 @@ static void infer_classify_metadata_buffer_free(void *opaque, uint8_t *data) for (i = 0; i < classes->num; i++) { InferClassification *c = classes->classifications[i]; av_buffer_unref(&c->label_buf); + av_buffer_unref(&c->tensor_buf); av_freep(&c); } av_freep(&classes); @@ -265,6 +232,40 @@ static int tensor_to_text(AVFilterContext *ctx, return 0; } +static int default_postprocess(AVFilterContext *ctx, + int detect_id, + int result_id, + int model_id, + InferTensorMeta *meta, + InferClassificationMeta *c_meta) +{ + InferenceClassifyContext *s = ctx->priv; + InferenceBaseContext *base = s->infer_bases[model_id]; + DNNModelInfo *info = ff_inference_base_get_output_info(base); + InferClassification *classify; + + if (!meta->data) return -1; + + classify = av_mallocz(sizeof(*classify)); + if (!classify) + return AVERROR(ENOMEM); + + classify->detect_id = detect_id; + classify->layer_name = info->layer_name[result_id]; + classify->model = s->model_file; + + classify->tensor_buf = av_buffer_alloc(meta->total_bytes); + if (!classify->tensor_buf) + return AVERROR(ENOMEM); + if (meta->total_bytes > 0) + memcpy(classify->tensor_buf->data, meta->data, meta->total_bytes); + + av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify); + + av_log(ctx, AV_LOG_DEBUG, "default output[%s] size: %zu\n", classify->layer_name, meta->total_bytes); + return 0; +} + static int commmon_postprocess(AVFilterContext *ctx, int detect_id, int result_id, @@ -299,7 +300,7 @@ static int commmon_postprocess(AVFilterContext *ctx, proc = &s->model_postproc[model_id].procs[proc_id]; if (proc->converter == NULL) - return 0; + return default_postprocess(ctx, detect_id, result_id, model_id, meta, c_meta); if (!strcmp(proc->converter, "attributes")) return attributes_to_text(ctx, detect_id, proc, meta, c_meta); @@ -310,193 +311,6 @@ static int commmon_postprocess(AVFilterContext *ctx, return 0; } -static int face_identify_init(AVFilterContext *ctx, size_t index) -{ - FaceIdentifyContext *identify_ctx; - InferenceClassifyContext *s = ctx->priv; - - int i, ret, feature_size, expected_size; - size_t vec_size_in_bytes = sizeof(float) * FACE_FEATURE_VECTOR_LEN; - - FILE *fp = fopen(s->feature_file, "rb"); - if (!fp) { - av_log(ctx, AV_LOG_ERROR, "Could not open feature file:%s\n", s->feature_file); - return AVERROR(EIO); - } - - av_assert0(index < MAX_MODEL_NUM); - - feature_size = ff_get_file_size(fp); - - if (feature_size == -1) { - fclose(fp); - av_log(ctx, AV_LOG_ERROR, "Couldn't get size of feature file.\n"); - return AVERROR(EINVAL); - } else if (feature_size % FACE_FEATURE_VECTOR_LEN) { - fclose(fp); - av_log(ctx, AV_LOG_ERROR, "Feature data must align to %d.\n", FACE_FEATURE_VECTOR_LEN); - return AVERROR(EINVAL); - } - - if (s->feature_num > 0) { - expected_size = s->feature_num * vec_size_in_bytes; - if (expected_size != feature_size) { - fclose(fp); - av_log(ctx, AV_LOG_ERROR, "Unexpected feature file size.\n"); - return AVERROR(EINVAL); - } - } else { - s->feature_num = feature_size / vec_size_in_bytes; - } - - identify_ctx = av_mallocz(sizeof(*identify_ctx)); - if (!identify_ctx) { - ret = AVERROR(ENOMEM); - goto fail; - } - - identify_ctx->vector_num = s->feature_num; - - identify_ctx->feature_vecs = av_mallocz(sizeof(float *) * identify_ctx->vector_num); - if (!identify_ctx->feature_vecs) { - ret = AVERROR(ENOMEM); - goto fail; - } - - rewind(fp); - - for (i = 0; i vector_num; i++) { - identify_ctx->feature_vecs[i] = av_malloc(vec_size_in_bytes); - if (!identify_ctx->feature_vecs[i]) { - ret = AVERROR(ENOMEM); - goto fail; - } - if (fread(identify_ctx->feature_vecs[i], vec_size_in_bytes, 1, fp) != 1) { - ret = AVERROR(EINVAL); - goto fail; - } - } - - identify_ctx->norm_std = av_mallocz(sizeof(double) * identify_ctx->vector_num); - if (!identify_ctx->norm_std) { - ret = AVERROR(ENOMEM); - goto fail; - } - - for (i = 0; i < identify_ctx->vector_num; i++) - identify_ctx->norm_std[i] = av_norm(identify_ctx->feature_vecs[i], - FACE_FEATURE_VECTOR_LEN); - - s->priv[index] = identify_ctx; - fclose(fp); - return 0; -fail: - fclose(fp); - - if (identify_ctx) { - if (identify_ctx->feature_vecs) { - for (i = 0; i vector_num; i++) { - if (identify_ctx->feature_vecs[i]) - av_free(identify_ctx->feature_vecs[i]); - } - av_free(identify_ctx->feature_vecs); - } - av_free(identify_ctx); - } - return ret; -} - -static int face_identify_uninit(AVFilterContext *ctx, size_t index) -{ - int i; - InferenceClassifyContext *s = ctx->priv; - FaceIdentifyContext *identify_ctx = s->priv[index]; - - if (!identify_ctx) { - av_log(ctx, AV_LOG_WARNING, "Empty face identify ctx.\n"); - return 0; - } - - if (identify_ctx->feature_vecs) { - for (i = 0; i < identify_ctx->vector_num; i++) - av_free(identify_ctx->feature_vecs[i]); - av_free(identify_ctx->feature_vecs); - } - - if (identify_ctx->norm_std) - av_free(identify_ctx->norm_std); - - av_free(identify_ctx); - s->priv[index] = NULL; - - return 0; -} - -static av_cold void dump_face_id(AVFilterContext *ctx, int label_id, - float conf, AVBufferRef *label_buf) -{ - LabelsArray *array = (LabelsArray *)label_buf->data; - - av_log(ctx, AV_LOG_DEBUG,"CLASSIFY META - Face_id:%d Name:%s Conf:%1.2f\n", - label_id, array->label[label_id], conf); -} - -static int face_identify_result_process(AVFilterContext *ctx, - int detect_id, - int result_id, - int model_index, - InferTensorMeta *meta, - InferClassificationMeta *c_meta) -{ - int i, label_id = 0; - InferClassification *classify; - double dot_product, norm_feature, confidence, *angles; - InferenceClassifyContext *s = ctx->priv; - FaceIdentifyContext *f = s->priv[model_index]; - double min_angle = 180.0f; - float *feature_vector = meta->data; - - angles = av_malloc(sizeof(double) * f->vector_num); - if (!angles) - return AVERROR(ENOMEM); - - norm_feature = av_norm(feature_vector, FACE_FEATURE_VECTOR_LEN); - - for (i = 0; i < f->vector_num; i++) { - dot_product = av_dot(feature_vector, - f->feature_vecs[i], - FACE_FEATURE_VECTOR_LEN); - - angles[i] = acos((dot_product - 0.0001f) / - (f->norm_std[i] * norm_feature)) / - PI * 180.0; - if (angles[i] < s->feature_angle && angles[i] < min_angle) { - label_id = i; - min_angle = angles[i]; - } - } - - confidence = (90.0f - min_angle) / 90.0f; - - av_free(angles); - - classify = av_mallocz(sizeof(*classify)); - if (!classify) - return AVERROR(ENOMEM); - - classify->detect_id = detect_id; - classify->name = s->name_array[model_index]; - classify->label_id = label_id; - classify->confidence = (float)confidence; - classify->label_buf = av_buffer_ref(s->label_bufs[model_index]); - - dump_face_id(ctx, label_id, confidence, s->label_bufs[model_index]); - - av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify); - - return 0; -} - static int query_formats(AVFilterContext *context) { AVFilterFormats *formats_list; @@ -520,11 +334,9 @@ static av_cold int classify_init(AVFilterContext *ctx) { InferenceClassifyContext *s = ctx->priv; int i, ret; - int model_num = 0, model_proc_num = 0, label_num = 0, name_num = 0; + int model_num = 0, model_proc_num = 0; const int max_num = MAX_MODEL_NUM; - char *names[MAX_MODEL_NUM] = { }; char *models[MAX_MODEL_NUM] = { }; - char *labels[MAX_MODEL_NUM] = { }; char *models_proc[MAX_MODEL_NUM] = { }; InferenceParam p = {}; @@ -534,21 +346,10 @@ static av_cold int classify_init(AVFilterContext *ctx) for (i = 0; i < model_num; i++) av_log(ctx, AV_LOG_INFO, "model[%d]:%s\n", i, models[i]); - av_split(s->labels, "&", labels, &label_num, max_num); - for (i = 0; i < label_num; i++) - av_log(ctx, AV_LOG_INFO, "label[%d]:%s\n", i, labels[i]); - - av_split(s->names, "&", names, &name_num, max_num); - for (i = 0; i < name_num; i++) - av_log(ctx, AV_LOG_INFO, "name[%d]:%s\n", i, names[i]); - av_split(s->model_proc, "&", models_proc, &model_proc_num, max_num); for (i = 0; i < model_proc_num; i++) av_log(ctx, AV_LOG_INFO, "proc[%d]:%s\n", i, models_proc[i]); - // TODO: uncomment this after face reidentify use proc file - // av_assert0(model_proc_num == model_num); - av_assert0(s->backend_type == DNN_INTEL_IE); p.backend_type = s->backend_type; @@ -600,53 +401,11 @@ static av_cold int classify_init(AVFilterContext *ctx) } s->loaded_num = model_num; - for (i = 0; i < label_num; i++) { - int n, labels_num; - AVBufferRef *ref = NULL; - LabelsArray *larray = NULL; - char buffer[4096] = { }; - char *_labels[100] = { }; - - FILE *fp = fopen(labels[i], "rb"); - if (!fp) { - av_log(ctx, AV_LOG_ERROR, "Could not open file:%s\n", labels[i]); - ret = AVERROR(EIO); - goto fail; - } - - n = fread(buffer, sizeof(buffer), 1, fp); - fclose(fp); - - av_split(buffer, ",", _labels, &labels_num, 100); - - larray = av_mallocz(sizeof(*larray)); - if (!larray) { - ret = AVERROR(ENOMEM); - goto fail; - } - - for (n = 0; n < labels_num; n++) { - char *l = av_strdup(_labels[n]); - av_dynarray_add(&larray->label, &larray->num, l); - } - - ref = av_buffer_create((uint8_t *)larray, sizeof(*larray), - &infer_labels_buffer_free, NULL, 0); - s->label_bufs[i] = ref; - } - for (i = 0; i < model_num; i++) { - s->name_array[i] = names[i]; - if (names[i] && strstr(names[i], "face")) { - s->init[i] = &face_identify_init; - s->uninit[i] = &face_identify_uninit; - s->post_process[i] = &face_identify_result_process; - } else { + if (!models_proc[i]) + s->post_process[i] = &default_postprocess; + else s->post_process[i] = &commmon_postprocess; - } - - if (s->init[i] && s->init[i](ctx, i) < 0) - goto fail; } return 0; @@ -654,8 +413,6 @@ static av_cold int classify_init(AVFilterContext *ctx) fail: for (i = 0; i < model_num; i++) { ff_inference_base_free(&s->infer_bases[i]); - if (s->label_bufs[i]) - av_buffer_unref(&s->label_bufs[i]); } return ret; @@ -667,12 +424,7 @@ static av_cold void classify_uninit(AVFilterContext *ctx) InferenceClassifyContext *s = ctx->priv; for (i = 0; i < s->loaded_num; i++) { - if (s->uninit[i]) s->uninit[i](ctx, i); - ff_inference_base_free(&s->infer_bases[i]); - - av_buffer_unref(&s->label_bufs[i]); - ff_release_model_proc(s->proc_config[i], &s->model_preproc[i], &s->model_postproc[i]); } } @@ -926,15 +678,10 @@ static const AVOption inference_classify_options[] = { { "dnn_backend", "DNN backend for model execution", OFFSET(backend_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_INTEL_IE }, 0, 2, FLAGS, "engine" }, { "model", "path to model files for network", OFFSET(model_file), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "model_proc", "model preproc and postproc", OFFSET(model_proc), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, - { "label", "labels for classify", OFFSET(labels), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, - { "name", "classify type names", OFFSET(names), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "vpp_format", "specify vpp output format", OFFSET(vpp_format), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "device", "running on device type", OFFSET(device_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12, FLAGS }, { "interval", "do infer every Nth frame", OFFSET(every_nth_frame), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS }, { "batch_size", "batch size per infer", OFFSET(batch_size), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS }, - { "feature_file", "registered face feature data", OFFSET(feature_file), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS, "face_identify" }, - { "feature_num", "registered face number", OFFSET(feature_num), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 1024, FLAGS, "face_identify" }, - { "identify_angle", "face identify threshold angle", OFFSET(feature_angle), AV_OPT_TYPE_DOUBLE, { .dbl = 70}, 0, 90, FLAGS, "face_identify" }, { NULL } }; diff --git a/libavfilter/vf_inference_identify.c b/libavfilter/vf_inference_identify.c new file mode 100644 index 0000000..5e676dd --- /dev/null +++ b/libavfilter/vf_inference_identify.c @@ -0,0 +1,328 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * dnn inference identify filter + */ +#include "libavutil/opt.h" +#include "libavutil/mem.h" +#include "libavutil/eval.h" +#include "libavutil/avassert.h" +#include "libavutil/avstring.h" +#include "libavutil/pixdesc.h" +#include "libavformat/avformat.h" + +#include "formats.h" +#include "internal.h" +#include "avfilter.h" + +#include "inference.h" + +#include + +#define OFFSET(x) offsetof(InferenceIdentifyContext, x) +#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM) + +#define PI 3.1415926 +#define FACE_FEATURE_VECTOR_LEN 256 + +typedef struct FeatureLabelPair { + float *feature; + size_t label_id; +} FeatureLabelPair; + +typedef struct InferenceIdentifyContext { + const AVClass *class; + + char *gallery; ///<< gallery for identify features + double *norm_std; + + AVBufferRef *labels; + FeatureLabelPair **features; + int features_num; +} InferenceIdentifyContext; + +static const char *get_filename_ext(const char *filename) { + const char *dot = strrchr(filename, '.'); + if (!dot || dot == filename) + return NULL; + + return dot + 1; +} + +const char *gallery_file_suffix = "json"; + +static void infer_labels_buffer_free(void *opaque, uint8_t *data) +{ + int i; + LabelsArray *labels = (LabelsArray *)data; + + for (i = 0; i < labels->num; i++) + av_freep(&labels->label[i]); + + av_free(data); +} + +static int query_formats(AVFilterContext *context) +{ + AVFilterFormats *formats_list; + const enum AVPixelFormat pixel_formats[] = { + AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P, + AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ444P, + AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P, AV_PIX_FMT_GRAY8, + AV_PIX_FMT_BGR24, AV_PIX_FMT_BGRA, AV_PIX_FMT_VAAPI, + AV_PIX_FMT_NONE}; + + formats_list = ff_make_format_list(pixel_formats); + if (!formats_list) { + av_log(context, AV_LOG_ERROR, "Could not create formats list\n"); + return AVERROR(ENOMEM); + } + + return ff_set_common_formats(context, formats_list); +} + +static av_cold int identify_init(AVFilterContext *ctx) +{ + size_t i, index = 1; + char *dup, *unknown; + const char *dirname; + cJSON *entry, *item; + LabelsArray *larray = NULL; + AVBufferRef *ref = NULL; + InferenceIdentifyContext *s = ctx->priv; + size_t vec_size_in_bytes = sizeof(float) * FACE_FEATURE_VECTOR_LEN; + + av_assert0(s->gallery); + + if (strcmp(get_filename_ext(s->gallery), gallery_file_suffix)) { + av_log(ctx, AV_LOG_ERROR, "Face gallery '%s' is not a json file\n", s->gallery); + return AVERROR(EINVAL); + } + + entry = ff_read_model_proc(s->gallery); + if (!entry) { + av_log(ctx, AV_LOG_ERROR, "Could not open gallery file:%s\n", s->gallery); + return AVERROR(EIO); + } + + dup = av_strdup(s->gallery); + dirname = av_dirname(dup); + + larray = av_mallocz(sizeof(*larray)); + if (!larray) + return AVERROR(ENOMEM); + + // label id 0 reserved for unknown person + unknown = av_strdup("Unknown_Person"); + av_dynarray_add(&larray->label, &larray->num, unknown); + + cJSON_ArrayForEach(item, entry) + { + char *l = av_strdup(item->string); + cJSON *features, *feature; + + av_dynarray_add(&larray->label, &larray->num, l); + + features = cJSON_GetObjectItem(item, "features"); + + cJSON_ArrayForEach(feature, features) + { + FILE *vec_fp; + FeatureLabelPair *pair; + char path[4096]; + + memset(path, 0, sizeof(path)); + + if (!cJSON_IsString(feature) || !feature->valuestring) + continue; + + strncpy(path, dirname, strlen(dirname)); + strncat(path, "/", 1); + strncat(path, feature->valuestring, strlen(feature->valuestring)); + + vec_fp = fopen(path, "rb"); + if (!vec_fp) { + av_log(ctx, AV_LOG_ERROR, "Could not open feature file:%s\n", path); + continue; + } + + pair = av_mallocz(sizeof(FeatureLabelPair)); + if (!pair) + return AVERROR(ENOMEM); + + pair->feature = av_malloc(vec_size_in_bytes); + if (!pair->feature) + return AVERROR(ENOMEM); + + if (fread(pair->feature, vec_size_in_bytes, 1, vec_fp) != 1) { + av_log(ctx, AV_LOG_ERROR, "Feature vector size mismatch:%s\n", path); + fclose(vec_fp); + return AVERROR(EINVAL); + } + + fclose(vec_fp); + + pair->label_id = index; + av_dynarray_add(&s->features, &s->features_num, pair); + } + index++; + } + + s->norm_std = av_mallocz(sizeof(double) * s->features_num); + if (!s->norm_std) + return AVERROR(ENOMEM); + + for (i = 0; i < s->features_num; i++) + s->norm_std[i] = av_norm(s->features[i]->feature, FACE_FEATURE_VECTOR_LEN); + + ref = av_buffer_create((uint8_t *)larray, sizeof(*larray), + &infer_labels_buffer_free, NULL, 0); + + s->labels = ref; + av_free(dup); + + return 0; +} + +static av_cold void identify_uninit(AVFilterContext *ctx) +{ + int i; + InferenceIdentifyContext *s = ctx->priv; + + av_buffer_unref(&s->labels); + + for (i = 0; i < s->features_num; i++) { + av_freep(&s->features[i]->feature); + av_freep(&s->features[i]); + } + if (s->norm_std) + av_free(s->norm_std); +} + +static av_cold void dump_face_id(AVFilterContext *ctx, int label_id, + float conf, AVBufferRef *label_buf) +{ + LabelsArray *array = (LabelsArray *)label_buf->data; + + av_log(ctx, AV_LOG_DEBUG,"CLASSIFY META - Face_id:%d Name:%s Conf:%1.2f\n", + label_id, array->label[label_id], conf); +} + +static int face_identify(AVFilterContext *ctx, AVFrame *frame) +{ + int i; + InferenceIdentifyContext *s = ctx->priv; + AVFrameSideData *side_data; + ClassifyArray *c_array; + InferClassificationMeta *meta; + + side_data = av_frame_get_side_data(frame, + AV_FRAME_DATA_INFERENCE_CLASSIFICATION); + + if (!side_data) + return 0; + + meta = (InferClassificationMeta *)side_data->data; + if (!meta) + return 0; + + c_array = meta->c_array; + for (i = 0; i < c_array->num; i++) { + int n, label = 0; + float *vector; + InferClassification *c; + double dot_product, norm_feature, confidence, angle; + double min_angle = 180.0f; + + c = c_array->classifications[i]; + vector = (float *)c->tensor_buf->data; + norm_feature = av_norm(vector, FACE_FEATURE_VECTOR_LEN); + for (n = 0; n < s->features_num; n++) { + dot_product = av_dot(vector, s->features[n]->feature, FACE_FEATURE_VECTOR_LEN); + + angle = acos((dot_product - 0.0001f) / (s->norm_std[n] * norm_feature)) + / + PI * 180.0; + if (angle < 70 && angle < min_angle) { + label = s->features[n]->label_id; + min_angle = angle; + } + } + + confidence = (90.0f - min_angle) / 90.0f; + + c->label_id = label; + c->name = (char *)"face_id"; + c->confidence = (float)confidence; + c->label_buf = av_buffer_ref(s->labels); + + dump_face_id(ctx, label, confidence, s->labels); + } + + return 0; +} + +static int filter_frame(AVFilterLink *inlink, AVFrame *in) +{ + AVFilterContext *ctx = inlink->dst; + AVFilterLink *outlink = inlink->dst->outputs[0]; + + face_identify(ctx, in); + + return ff_filter_frame(outlink, in); +} + +static const AVOption inference_identify_options[] = { + { "gallery", "JSON file with list of image examples for each known object/face/person", + OFFSET(gallery), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, + { NULL } +}; + +AVFILTER_DEFINE_CLASS(inference_identify); + +static const AVFilterPad identify_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = filter_frame, + }, + { NULL } +}; + +static const AVFilterPad identify_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + }, + { NULL } +}; + +AVFilter ff_vf_inference_identify= { + .name = "identify", + .description = NULL_IF_CONFIG_SMALL("DNN Inference identification."), + .priv_size = sizeof(InferenceIdentifyContext), + .query_formats = query_formats, + .init = identify_init, + .uninit = identify_uninit, + .inputs = identify_inputs, + .outputs = identify_outputs, + .priv_class = &inference_identify_class, + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; diff --git a/libavfilter/vf_inference_metaconvert.c b/libavfilter/vf_inference_metaconvert.c new file mode 100644 index 0000000..89178a8 --- /dev/null +++ b/libavfilter/vf_inference_metaconvert.c @@ -0,0 +1,190 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * dnn inference metadata convert filter + */ + +#include "libavutil/opt.h" +#include "libavutil/mem.h" +#include "libavutil/eval.h" +#include "libavutil/avassert.h" +#include "libavutil/pixdesc.h" +#include "libavutil/mathematics.h" + +#include "formats.h" +#include "internal.h" +#include "avfilter.h" +#include "libavcodec/avcodec.h" +#include "libavformat/avformat.h" + +#include "inference.h" +#include "dnn_interface.h" + +#define OFFSET(x) offsetof(MetaConvertContext, x) +#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM) + +typedef struct MetaConvertContext { + const AVClass *class; + + char *model; + char *converter; + char *method; + char *location; + char *layer; + + void (*convert_func)(AVFilterContext *ctx, AVFrame *frame); + +} MetaConvertContext; + +static int query_formats(AVFilterContext *ctx) +{ + AVFilterFormats *formats_list; + const enum AVPixelFormat pixel_formats[] = { + AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P, + AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ444P, + AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P, AV_PIX_FMT_GRAY8, + AV_PIX_FMT_BGR24, AV_PIX_FMT_BGRA, AV_PIX_FMT_VAAPI, + AV_PIX_FMT_NONE}; + + formats_list = ff_make_format_list(pixel_formats); + if (!formats_list) { + av_log(ctx, AV_LOG_ERROR, "Could not create formats list\n"); + return AVERROR(ENOMEM); + } + + return ff_set_common_formats(ctx, formats_list); +} + +static av_cold void tensors_to_file(AVFilterContext *ctx, AVFrame *frame) +{ + AVFrameSideData *sd; + MetaConvertContext *s = ctx->priv; + InferClassificationMeta *c_meta; + + static uint32_t frame_num = 0; + + if (!(sd = av_frame_get_side_data(frame, AV_FRAME_DATA_INFERENCE_CLASSIFICATION))) + return; + + c_meta = (InferClassificationMeta *)sd->data; + + if (c_meta) { + int i; + uint32_t index = 0; + char filename[1024] = {0}; + const int meta_num = c_meta->c_array->num; + for (i = 0; i < meta_num; i++) { + FILE *f = NULL; + InferClassification *c = c_meta->c_array->classifications[i]; + //TODO:check model and layer + if (!c->tensor_buf || !c->tensor_buf->data) + continue; + + snprintf(filename, sizeof(filename), "%s/%s_frame_%u_idx_%u.tensor", s->location, + s->method, frame_num, index); + f = fopen(filename, "wb"); + if (!f) { + av_log(ctx, AV_LOG_WARNING, "Failed to open/create file: %s\n", filename); + } else { + fwrite(c->tensor_buf->data, sizeof(float), c->tensor_buf->size / sizeof(float), f); + fclose(f); + } + index++; + } + } + + frame_num++; +} + +static av_cold int metaconvert_init(AVFilterContext *ctx) +{ + MetaConvertContext *s = ctx->priv; + + if (!s->model || !s->converter || !s->method) { + av_log(ctx, AV_LOG_ERROR, "Missing key parameters!!\n"); + return AVERROR(EINVAL); + } + + av_log(ctx, AV_LOG_INFO, "\nmodel:%s\nconverter:%s\nmethod:%s\nlocation:%s\n", + s->model, s->converter, s->method, s->location); + + if (!strcmp(s->converter, "tensors-to-file")) { + if (!s->location) { + av_log(ctx, AV_LOG_ERROR, "Missing parameters location!"); + return AVERROR(EINVAL); + } + s->convert_func = &tensors_to_file; + } + + return 0; +} + +static int filter_frame(AVFilterLink *inlink, AVFrame *in) +{ + AVFilterContext *ctx = inlink->dst; + MetaConvertContext *s = ctx->priv; + AVFilterLink *outlink = inlink->dst->outputs[0]; + + if (s->convert_func) + s->convert_func(ctx, in); + + return ff_filter_frame(outlink, in); +} + +static const AVOption inference_metaconvert_options[] = { + { "model", "select tensor by model name", OFFSET(model), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, + { "layer", "select tensor by layer name", OFFSET(layer), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, + { "converter", "metadata conversion group", OFFSET(converter), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, + { "method", "metadata conversion method", OFFSET(method), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, + { "location", "location for output files", OFFSET(location), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, + + { NULL } +}; + +AVFILTER_DEFINE_CLASS(inference_metaconvert); + +static const AVFilterPad metaconvert_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = filter_frame, + }, + { NULL } +}; + +static const AVFilterPad metaconvert_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + }, + { NULL } +}; + +AVFilter ff_vf_inference_metaconvert = { + .name = "metaconvert", + .description = NULL_IF_CONFIG_SMALL("DNN Inference metaconvert."), + .priv_size = sizeof(MetaConvertContext), + .query_formats = query_formats, + .init = metaconvert_init, + .inputs = metaconvert_inputs, + .outputs = metaconvert_outputs, + .priv_class = &inference_metaconvert_class, + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; -- 2.7.4