From 496aa35db719e03b1847845581a96bff096f1e2c Mon Sep 17 00:00:00 2001
From: Lin Xie
Date: Thu, 17 Jan 2019 15:48:31 +0800
Subject: [PATCH] New filter for inference classification

* Add classify filter, with support for multiple models in a row
* Keep side data reference buffers when copying frames
* Refine some inference interfaces
* Add classification metadata and label-file processing

Signed-off-by: Lin Xie
---
 configure                           |   4 +
 libavfilter/Makefile                |   1 +
 libavfilter/allfilters.c            |   1 +
 libavfilter/inference.c             | 127 ++++++++-
 libavfilter/inference.h             |  44 ++-
 libavfilter/vf_inference_classify.c | 523 ++++++++++++++++++++++++++++++++++++
 libavfilter/vf_inference_detect.c   |  28 +-
 libavutil/frame.c                   |   7 +-
 libavutil/frame.h                   |   2 +
 9 files changed, 717 insertions(+), 20 deletions(-)
 create mode 100644 libavfilter/vf_inference_classify.c
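
For context: the classify filter is designed to run downstream of the detect
filter and takes '&'-separated lists in its "models", "labels" and "names"
options (parsed by split() in classify_init() below). A minimal sketch of the
intended filtergraph as a C string; the "detect" filter name and all file
paths are assumptions for illustration, while "classify" and its option names
come from this patch:

    /* Hypothetical graph: face detection followed by two classifications.
     * The "detect" name and the file paths are placeholders, not part of
     * this patch; "age-gend"/"emotion" match the strstr() keys below. */
    static const char *graph_desc =
        "detect=model=face.xml,"
        "classify=models=age-gender.xml&emotion.xml"
               ":labels=age-gender.label&emotion.label"
               ":names=age-gend&emotion";

Such a string could be passed to avfilter_graph_parse2() or to ffmpeg's -vf
option once both filters are compiled in.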
diff --git a/configure b/configure
index 68b7dfb..fb87f47 100755
--- a/configure
+++ b/configure
@@ -3408,6 +3408,10 @@ fspp_filter_deps="gpl"
 geq_filter_deps="gpl"
 histeq_filter_deps="gpl"
 hqdn3d_filter_deps="gpl"
+inference_classify_filter_deps="libinference_engine"
+inference_classify_filter_select="dnn"
+inference_detect_filter_deps="libinference_engine"
+inference_detect_filter_select="dnn"
 interlace_filter_deps="gpl"
 kerndeint_filter_deps="gpl"
 ladspa_filter_deps="ladspa libdl"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 06ebd61..d9e0602 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -258,6 +258,7 @@ OBJS-$(CONFIG_HWUPLOAD_FILTER)               += vf_hwupload.o
 OBJS-$(CONFIG_HYSTERESIS_FILTER)             += vf_hysteresis.o framesync.o
 OBJS-$(CONFIG_IDET_FILTER)                   += vf_idet.o
 OBJS-$(CONFIG_IL_FILTER)                     += vf_il.o
+OBJS-$(CONFIG_INFERENCE_CLASSIFY_FILTER)     += vf_inference_classify.o
 OBJS-$(CONFIG_INFERENCE_DETECT_FILTER)       += vf_inference_detect.o
 OBJS-$(CONFIG_INFLATE_FILTER)                += vf_neighbor.o
 OBJS-$(CONFIG_INTERLACE_FILTER)              += vf_tinterlace.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 4c6fa26..158c75b 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -244,6 +244,7 @@ extern AVFilter ff_vf_hwupload_cuda;
 extern AVFilter ff_vf_hysteresis;
 extern AVFilter ff_vf_idet;
 extern AVFilter ff_vf_il;
+extern AVFilter ff_vf_inference_classify;
 extern AVFilter ff_vf_inference_detect;
 extern AVFilter ff_vf_inflate;
 extern AVFilter ff_vf_interlace;
diff --git a/libavfilter/inference.c b/libavfilter/inference.c
index ea788ba..14b4093 100644
--- a/libavfilter/inference.c
+++ b/libavfilter/inference.c
@@ -29,6 +29,7 @@
 #include "libswscale/swscale.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/avassert.h"
+#include "libavutil/imgutils.h"
 
 #include "inference.h"
@@ -46,6 +47,8 @@ struct InferenceBaseContext
     DNNModelInfo output_info;
 
     VideoPP vpp;
+
+    InferencePreProcess preprocess;
 };
 
 static int fill_dnn_data_from_frame(DNNIOData *data,
@@ -97,12 +100,88 @@ static int fill_dnn_data_from_frame(DNNIOData *data,
     return 0;
 }
 
+static int sw_crop_and_scale(AVFrame *frame,
+                             float x0, float y0, float x1, float y1,
+                             int out_w, int out_h,
+                             uint8_t *data[], int stride[])
+{
+    int err, bufsize;
+    struct SwsContext *sws_ctx;
+    const AVPixFmtDescriptor *desc;
+    int x, y, w, h, hsub, vsub;
+    int max_step[4]; ///< max pixel step for each plane, expressed as a number of bytes
+    enum AVPixelFormat expect_format = AV_PIX_FMT_BGR24;
+
+    AVFrame *temp = av_frame_alloc();
+    if (!temp)
+        return AVERROR(ENOMEM);
+
+    err = av_frame_ref(temp, frame);
+    if (err < 0) {
+        av_frame_free(&temp);
+        return err;
+    }
+
+    desc = av_pix_fmt_desc_get(temp->format);
+    hsub = desc->log2_chroma_w;
+    vsub = desc->log2_chroma_h;
+    av_image_fill_max_pixsteps(max_step, NULL, desc);
+
+    /* cropping */
+    {
+        x = lrintf(x0);
+        y = lrintf(y0);
+        w = lrintf(x1) - x;
+        h = lrintf(y1) - y;
+
+        temp->width  = w;
+        temp->height = h;
+
+        temp->data[0] += y * temp->linesize[0];
+        temp->data[0] += x * max_step[0];
+
+        for (int i = 1; i < 3; i++) {
+            if (temp->data[i]) {
+                temp->data[i] += (y >> vsub) * temp->linesize[i];
+                temp->data[i] += (x * max_step[i]) >> hsub;
+            }
+        }
+
+        /* alpha plane */
+        if (temp->data[3]) {
+            temp->data[3] += y * temp->linesize[3];
+            temp->data[3] += x * max_step[3];
+        }
+    }
+
+    /* create scaling context */
+    sws_ctx = sws_getContext(temp->width, temp->height, temp->format,
+                             out_w, out_h, expect_format,
+                             SWS_BILINEAR, NULL, NULL, NULL);
+    if (!sws_ctx) {
+        av_log(NULL, AV_LOG_ERROR, "Could not create scaling context!\n");
+        av_frame_free(&temp);
+        return AVERROR(EINVAL);
+    }
+
+    if (!data[0]) {
+        bufsize = av_image_alloc(data, stride, out_w, out_h, expect_format, 1);
+        if (bufsize < 0) {
+            av_frame_free(&temp);
+            sws_freeContext(sws_ctx);
+            return AVERROR(ENOMEM);
+        }
+    }
+
+    sws_scale(sws_ctx, (const uint8_t * const *)temp->data,
+              temp->linesize, 0, temp->height, data, stride);
+
+    av_frame_free(&temp);
+    sws_freeContext(sws_ctx);
+
+    return 0;
+}
+
 int ff_inference_base_create(AVFilterContext *ctx,
                              InferenceBaseContext **base,
                              InferenceParam *param)
 {
     int i, ret;
     InferenceBaseContext *s;
     DNNModelInfo *info;
+    VideoPP *vpp;
 
     if (!param)
         return AVERROR(EINVAL);
@@ -162,10 +241,17 @@ int ff_inference_base_create(AVFilterContext *ctx,
     s->batch_size      = param->batch_size;
     s->every_nth_frame = param->every_nth_frame;
     s->threshold       = param->threshold;
+    s->preprocess      = param->preprocess;
 
     ret = s->model->create_model(s->model->model);
     DNN_ERR_CHECK(ctx);
 
+    vpp = &s->vpp;
+
+    // vpp init
+    vpp->swscale        = &sws_scale;
+    vpp->crop_and_scale = &sw_crop_and_scale;
+
     *base = s;
 #undef DNN_ERR_CHECK
     return 0;
@@ -199,23 +285,40 @@ int ff_inference_base_free(InferenceBaseContext **base)
     return 0;
 }
 
+int ff_inference_base_submit_frame(InferenceBaseContext *base,
+                                   AVFrame *frame,
+                                   int input_idx,
+                                   int batch_idx)
+{
+    DNNIOData input = { };
+    fill_dnn_data_from_frame(&input, frame, batch_idx, 1, input_idx);
+    base->model->set_input(base->model->model, &input);
+
+    return 0;
+}
+
+int ff_inference_base_infer(InferenceBaseContext *base)
+{
+    DNNReturnType dnn_ret;
+    dnn_ret = base->module->execute_model(base->model);
+    av_assert0(dnn_ret == DNN_SUCCESS);
+    return 0;
+}
+
 int ff_inference_base_filter_frame(InferenceBaseContext *base, AVFrame *in)
 {
-    VideoPP *vpp = &base->vpp;
     DNNModelInfo *info = &base->input_info;
     DNNReturnType dnn_ret;
     DNNIOData input = { };
 
     for (int i = 0; i < info->numbers; i++) {
-        if (!vpp->scale_contexts[i]) {
-            fill_dnn_data_from_frame(&input, in, 0, 1, i);
-        } else {
-            AVFrame *tmp = vpp->frames[i];
-            sws_scale(vpp->scale_contexts[i], (const uint8_t * const*)in->data,
-                      in->linesize, 0, in->height, tmp->data, tmp->linesize);
-            fill_dnn_data_from_frame(&input, tmp, 0, 1, i);
+        AVFrame *processed_frame = in;
+        for (int j = 0; j < base->batch_size; j++) {
+            if (base->preprocess)
+                base->preprocess(base, i, in, &processed_frame);
+            fill_dnn_data_from_frame(&input, processed_frame, j, 1, i);
+            base->model->set_input(base->model->model, &input);
         }
-        base->model->set_input(base->model->model, &input);
     }
 
     dnn_ret = base->module->execute_model(base->model);
@@ -224,7 +327,9 @@ int ff_inference_base_filter_frame(InferenceBaseContext *base, AVFrame *in)
     return 0;
 }
 
-int ff_inference_base_get_infer_result(InferenceBaseContext *base, InferTensorMeta *metadata)
+int ff_inference_base_get_infer_result(InferenceBaseContext *base,
+                                       int output_index,
+                                       InferTensorMeta *metadata)
 {
     DNNModelInfo *info = &base->output_info;
     DNNIOData data = { };
@@ -233,7 +338,7 @@ int ff_inference_base_get_infer_result(InferenceBaseContext *base, InferTensorMe
     av_assert0(metadata != NULL);
 
     // TODO: change to layer name for multiple outputs
-    data.in_out_idx = 0;
+    data.in_out_idx = output_index;
 
     ret = base->model->get_execute_result(base->model->model, &data);
     av_assert0(ret == DNN_SUCCESS);
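
Taken together, the refactor splits the old monolithic filter-frame path into
separate submit / execute / fetch steps, which is what lets the classify
filter loop over several cropped regions and several output layers per model.
A minimal sketch of the calling convention, assuming one input layer, one
batch slot and one output layer:

    /* Sketch: classify a single pre-cropped ROI frame with one model,
     * using the three entry points introduced above. */
    static int classify_one_roi(InferenceBaseContext *base, AVFrame *roi,
                                InferTensorMeta *meta)
    {
        int ret;
        if ((ret = ff_inference_base_submit_frame(base, roi, 0, 0)) < 0)
            return ret;
        if ((ret = ff_inference_base_infer(base)) < 0)
            return ret;
        return ff_inference_base_get_infer_result(base, 0, meta);
    }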
diff --git a/libavfilter/inference.h b/libavfilter/inference.h
index 8466f90..eebfd10 100644
--- a/libavfilter/inference.h
+++ b/libavfilter/inference.h
@@ -24,6 +24,8 @@
 
 typedef struct InferenceBaseContext InferenceBaseContext;
 
+typedef int (*InferencePreProcess)(InferenceBaseContext *base, int index, AVFrame *in, AVFrame **out);
+
 typedef struct InferenceParam {
     char *model_file;
     char *labels_file;
@@ -40,6 +42,8 @@ typedef struct InferenceParam {
     int input_layout;
     int input_precision;
     int input_is_image;  //!< image or data
+
+    InferencePreProcess preprocess;
 } InferenceParam;
 
 #define MAX_VPP_NUM DNN_INPUT_OUTPUT_NUM
 
@@ -49,7 +53,18 @@ typedef enum { VPP_DEVICE_HW, VPP_DEVICE_SW } VPPDevice;
 typedef struct VideoPP {
     int device;
     void *scale_contexts[MAX_VPP_NUM];
-    AVFrame *frames[MAX_VPP_NUM];
+    AVFrame *frames[MAX_VPP_NUM];      //!< frames to hold preprocessed data
+
+    int (*swscale)(struct SwsContext *context,
+                   const uint8_t * const srcSlice[], const int srcStride[],
+                   int srcSliceY, int srcSliceH,
+                   uint8_t *const dst[], const int dstStride[]);
+    int (*crop_and_scale)(AVFrame *frame,
+                          float x0, float y0, float x1, float y1,
+                          int out_w, int out_h,
+                          uint8_t *data[], int stride[]);
 } VideoPP;
[...]
diff --git a/libavfilter/vf_inference_classify.c b/libavfilter/vf_inference_classify.c
new file mode 100644
--- /dev/null
+++ b/libavfilter/vf_inference_classify.c
@@ -0,0 +1,523 @@
[...]
+static void infer_labels_buffer_free(void *opaque, uint8_t *data)
+{
+    int i;
+    LabelsArray *labels = (LabelsArray *)data;
+
+    for (i = 0; i < labels->num; i++)
+        av_freep(&labels->label[i]);
+    av_freep(&labels->label);
+
+    av_free(data);
+}
+
+static void infer_classify_metadata_buffer_free(void *opaque, uint8_t *data)
+{
+    int i;
+    InferClassificationMeta *meta = (InferClassificationMeta *)data;
+    ClassifyArray *classes = meta->c_array;
+
+    if (classes) {
+        for (i = 0; i < classes->num; i++) {
+            InferClassification *c = classes->classifications[i];
+            av_buffer_unref(&c->label_buf);
+            av_freep(&c);
+        }
+        av_freep(&classes->classifications);
+        av_freep(&classes);
+    }
+
+    av_free(data);
+}
+
+static av_cold void dump_emotion(AVFilterContext *ctx, int label_id)
+{
+    const char *emotions[] = { "neutral", "happy", "sad", "surprise", "anger" };
+
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - label:%d emotion:%s\n",
+           label_id, emotions[label_id]);
+}
+
+static int emotion_classify_result_process(AVFilterContext *ctx,
+                                           int detect_id,
+                                           int result_id,
+                                           int model_index,
+                                           InferTensorMeta *meta,
+                                           InferClassificationMeta *c_meta)
+{
+    int i, label_id = 0;
+    InferenceClassifyContext *s = ctx->priv;
+    const float *emo_confidence = (float *)meta->data;
+    size_t labels_num = meta->dims[2];
+    float max = emo_confidence[0];
+
+    InferClassification *classify = av_mallocz(sizeof(*classify));
+    if (!classify)
+        return AVERROR(ENOMEM);
+
+    // get the emotion with max confidence
+    for (i = 1; i < labels_num; i++) {
+        if (emo_confidence[i] > max) {
+            max      = emo_confidence[i];
+            label_id = i;
+        }
+    }
+
+    classify->detect_id  = detect_id;
+    classify->name       = s->name_array[model_index];
+    classify->label_id   = label_id;
+    classify->confidence = emo_confidence[label_id];
+    classify->label_buf  = av_buffer_ref(s->label_bufs[model_index]);
+
+    dump_emotion(ctx, classify->label_id);
+
+    av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
+
+    return 0;
+}
+
+static av_cold void dump_gender(AVFilterContext *ctx, int label_id, float conf)
+{
+    const char *genders[] = { "female", "male" };
+
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Gender:%s Confidence:%f\n",
+           genders[label_id], conf);
+}
+
+static av_cold void dump_age(AVFilterContext *ctx, float age)
+{
+    av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Age:%f\n", age);
+}
+
+static int age_gender_classify_result_process(AVFilterContext *ctx,
+                                              int detect_id,
+                                              int result_id,
+                                              int model_index,
+                                              InferTensorMeta *meta,
+                                              InferClassificationMeta *c_meta)
+{
+    InferenceClassifyContext *s = ctx->priv;
+    const float *data = (float *)meta->data;
+
+    InferClassification *classify = av_mallocz(sizeof(*classify));
+    if (!classify)
+        return AVERROR(ENOMEM);
+
+    classify->detect_id = detect_id;
+
+    if (result_id == 0) {
+        // age
+        classify->name  = string_age;
+        classify->value = *data * 100.0;
+        dump_age(ctx, classify->value);
+    } else {
+        // gender: 0 - female, 1 - male
+        classify->name       = string_gender;
+        classify->label_id   = data[0] > data[1] ? 0 : 1;
+        classify->confidence = data[classify->label_id];
+        classify->label_buf  = av_buffer_ref(s->label_bufs[model_index]);
+        dump_gender(ctx, classify->label_id, classify->confidence);
+    }
+
+    av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify);
+
+    return 0;
+}
+
+static int query_formats(AVFilterContext *context)
+{
+    AVFilterFormats *formats_list;
+    const enum AVPixelFormat pixel_formats[] = {
+        AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV422P,  AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ444P,
+        AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV411P,  AV_PIX_FMT_GRAY8,
+        AV_PIX_FMT_BGR24,    AV_PIX_FMT_BGRA,     AV_PIX_FMT_NONE};
+
+    formats_list = ff_make_format_list(pixel_formats);
+    if (!formats_list) {
+        av_log(context, AV_LOG_ERROR, "could not create formats list\n");
+        return AVERROR(ENOMEM);
+    }
+
+    return ff_set_common_formats(context, formats_list);
+}
+
+static av_cold int classify_init(AVFilterContext *ctx)
+{
+    InferenceClassifyContext *s = ctx->priv;
+    int i, ret;
+    int model_num = 0, label_num = 0, name_num = 0;
+    const int max_num = MAX_MODEL_NUM;
+    char *names[MAX_MODEL_NUM]  = { };
+    char *models[MAX_MODEL_NUM] = { };
+    char *labels[MAX_MODEL_NUM] = { };
+    InferenceParam p = {};
+
+    av_assert0(s->model_file);
+
+    split(s->model_file, "&", models, &model_num, max_num);
+    for (i = 0; i < model_num; i++)
+        av_log(ctx, AV_LOG_INFO, "model[%d]:%s\n", i, models[i]);
+
+    split(s->labels, "&", labels, &label_num, max_num);
+    for (i = 0; i < label_num; i++)
+        av_log(ctx, AV_LOG_INFO, "label[%d]:%s\n", i, labels[i]);
+
+    split(s->names, "&", names, &name_num, max_num);
+    for (i = 0; i < name_num; i++)
+        av_log(ctx, AV_LOG_INFO, "name[%d]:%s\n", i, names[i]);
+
+    av_assert0(s->backend_type == DNN_INTEL_IE);
+
+    p.backend_type    = s->backend_type;
+    p.device_type     = s->device_type;
+    p.batch_size      = s->batch_size;
+    p.every_nth_frame = s->every_nth_frame;
+    p.input_precision = DNN_DATA_PRECISION_U8;
+    p.input_layout    = DNN_DATA_LAYOUT_NCHW;
+    p.input_is_image  = 1;
+
+    for (i = 0; i < model_num; i++) {
+        InferenceBaseContext *base = NULL;
+
+        p.model_file = models[i];
+        ret = ff_inference_base_create(ctx, &base, &p);
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_ERROR, "could not create inference\n");
+            return ret;
+        }
+
+        s->infer_bases[i] = base;
+    }
+    s->loaded_num = model_num;
+
+    for (i = 0; i < label_num; i++) {
+        int n, labels_num;
+        AVBufferRef *ref    = NULL;
+        LabelsArray *larray = NULL;
+        char buffer[4096]   = { };
+        char *_labels[100]  = { };
+
+        FILE *fp = fopen(labels[i], "rb");
+        if (!fp) {
+            av_log(ctx, AV_LOG_ERROR, "could not open file:%s\n", labels[i]);
+            ret = AVERROR(EIO);
+            goto fail;
+        }
+        fread(buffer, 1, sizeof(buffer) - 1, fp);
+        fclose(fp);
+
+        buffer[strcspn(buffer, "\n")] = 0;
+        split(buffer, ",", _labels, &labels_num, 100);
+
+        larray = av_mallocz(sizeof(*larray));
+        if (!larray) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+        for (n = 0; n < labels_num; n++) {
+            char *l = av_strdup(_labels[n]);
+            av_dynarray_add(&larray->label, &larray->num, l);
+        }
+
+        ref = av_buffer_create((uint8_t *)larray, sizeof(*larray),
+                               &infer_labels_buffer_free, NULL, 0);
+        if (!ref) {
+            infer_labels_buffer_free(NULL, (uint8_t *)larray);
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        s->label_bufs[i] = ref;
+    }
+
+    for (i = 0; i < name_num; i++) {
+        s->name_array[i] = names[i];
+        if (strstr(names[i], "emotion"))
+            s->post_process[i] = &emotion_classify_result_process;
+        else if (strstr(names[i], "age") && strstr(names[i], "gend"))
+            s->post_process[i] = &age_gender_classify_result_process;
+    }
+
+    return 0;
+
+fail:
+    for (i = 0; i < model_num; i++) {
+        ff_inference_base_free(&s->infer_bases[i]);
+        if (s->label_bufs[i])
+            av_buffer_unref(&s->label_bufs[i]);
+    }
+
+    return ret;
+}
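
classify_init() reads each labels file as a single comma-separated line; the
strcspn() call cuts the buffer at the first newline, so only line one is used.
As an assumed example (no label files ship with this patch), a labels file
matching the emotions table in dump_emotion() above would contain:

    neutral,happy,sad,surprise,anger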
+static av_cold void classify_uninit(AVFilterContext *ctx)
+{
+    int i;
+    InferenceClassifyContext *s = ctx->priv;
+
+    for (i = 0; i < s->loaded_num; i++) {
+        ff_inference_base_free(&s->infer_bases[i]);
+        av_buffer_unref(&s->label_bufs[i]);
+    }
+}
+
+static int filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    int i, ret;
+    AVFilterContext *ctx         = inlink->dst;
+    InferenceClassifyContext *s  = ctx->priv;
+    AVFilterLink *outlink        = inlink->dst->outputs[0];
+    AVBufferRef *ref;
+    AVFrameSideData *sd, *new_sd;
+    BBoxesArray *boxes;
+    InferDetectionMeta *d_meta;
+    ClassifyArray *c_array;
+    InferClassificationMeta *c_meta;
+
+    sd = av_frame_get_side_data(in, AV_FRAME_DATA_INFERENCE_DETECTION);
+    if (!sd)
+        goto done;
+
+    d_meta = (InferDetectionMeta *)sd->data;
+    if (!d_meta) {
+        ret = AVERROR(EINVAL);
+        goto fail;
+    }
+
+    boxes = d_meta->bboxes;
+    if (!boxes || !boxes->num)
+        goto done;
+
+    c_meta  = av_mallocz(sizeof(*c_meta));
+    c_array = av_mallocz(sizeof(*c_array));
+    if (!c_meta || !c_array) {
+        av_freep(&c_meta);
+        av_freep(&c_array);
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    c_meta->c_array = c_array;
+
+    // handle detected boxes one by one
+    for (i = 0; i < boxes->num; i++) {
+        int j;
+        InferDetection *bbox = boxes->bbox[i];
+
+        // process with each model
+        for (j = 0; j < s->loaded_num; j++) {
+            int output;
+            InferenceBaseContext *base = s->infer_bases[j];
+
+            VideoPP *vpp        = ff_inference_base_get_vpp(base);
+            AVFrame *tmp        = vpp->frames[0];
+            DNNModelInfo *iinfo = ff_inference_base_get_input_info(base);
+            DNNModelInfo *oinfo = ff_inference_base_get_output_info(base);
+
+            ret = vpp->crop_and_scale(in,
+                                      bbox->x_min * in->width,
+                                      bbox->y_min * in->height,
+                                      bbox->x_max * in->width,
+                                      bbox->y_max * in->height,
+                                      iinfo->width[0],
+                                      iinfo->height[0],
+                                      tmp->data,
+                                      tmp->linesize);
+
+            // TODO: support dynamic batch for faces
+            ff_inference_base_submit_frame(base, tmp, 0, 0);
+            ff_inference_base_infer(base);
+
+            for (output = 0; output < oinfo->numbers; output++) {
+                InferTensorMeta tensor_meta = { };
+                ff_inference_base_get_infer_result(base, output, &tensor_meta);
+
+                if (s->post_process[j])
+                    s->post_process[j](ctx, i, output, j, &tensor_meta, c_meta);
+            }
+        }
+    }
+
+    ref = av_buffer_create((uint8_t *)c_meta, sizeof(*c_meta),
+                           &infer_classify_metadata_buffer_free, NULL, 0);
+    if (!ref) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    // attach classification metadata as frame side data
+    new_sd = av_frame_new_side_data_from_buf(in, AV_FRAME_DATA_INFERENCE_CLASSIFICATION, ref);
+    if (!new_sd) {
+        av_buffer_unref(&ref);
+        av_log(NULL, AV_LOG_ERROR, "could not add new side data\n");
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+done:
+    return ff_filter_frame(outlink, in);
+fail:
+    av_frame_free(&in);
+    return ret;
+}
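
A downstream filter or application can read the result back the same way
filter_frame() attached it. A sketch, assuming the InferClassificationMeta /
ClassifyArray / InferClassification layouts used above (their declarations
sit in the truncated inference.h portion of this patch):

    /* Sketch: iterate the classification list attached to a frame. */
    static void dump_classifications(const AVFrame *frame)
    {
        AVFrameSideData *sd =
            av_frame_get_side_data(frame, AV_FRAME_DATA_INFERENCE_CLASSIFICATION);
        InferClassificationMeta *meta;

        if (!sd)
            return;
        meta = (InferClassificationMeta *)sd->data;
        for (int i = 0; i < meta->c_array->num; i++) {
            InferClassification *c = meta->c_array->classifications[i];
            av_log(NULL, AV_LOG_INFO, "box:%d type:%s label:%d conf:%f\n",
                   c->detect_id, c->name, c->label_id, c->confidence);
        }
    }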
+static av_cold int config_input(AVFilterLink *inlink)
+{
+    int i, j;
+    AVFilterContext *ctx             = inlink->dst;
+    InferenceClassifyContext *s     = ctx->priv;
+    enum AVPixelFormat expect_format = AV_PIX_FMT_BGR24;
+    const AVPixFmtDescriptor *desc   = av_pix_fmt_desc_get(inlink->format);
+
+    for (i = 0; i < s->loaded_num; i++) {
+        InferenceBaseContext *base = s->infer_bases[i];
+        DNNModelInfo *info = ff_inference_base_get_input_info(base);
+        VideoPP *vpp       = ff_inference_base_get_vpp(base);
+
+        vpp->device = (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) ?
+                      VPP_DEVICE_HW : VPP_DEVICE_SW;
+
+        // allocate avframes to save preprocessed data
+        for (j = 0; j < info->numbers; j++) {
+            int ret;
+            AVFrame *frame = av_frame_alloc();
+            if (!frame)
+                return AVERROR(ENOMEM);
+
+            frame->format = expect_format;
+            frame->width  = info->width[j];
+            frame->height = info->height[j];
+
+            ret = av_frame_get_buffer(frame, 0);
+            if (ret < 0) {
+                av_frame_free(&frame);
+                return ret;
+            }
+            vpp->frames[j] = frame;
+        }
+    }
+
+    return 0;
+}
+
+static av_cold int config_output(AVFilterLink *outlink)
+{
+    return 0;
+}
+
+static const AVOption inference_classify_options[] = {
+    { "dnn_backend", "DNN backend for model execution", OFFSET(backend_type),    AV_OPT_TYPE_FLAGS,  { .i64 = DNN_INTEL_IE },          0,    2, FLAGS, "engine" },
+    { "models",      "path to model files for network", OFFSET(model_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0,    0, FLAGS },
+    { "labels",      "labels for classify",             OFFSET(labels),          AV_OPT_TYPE_STRING, { .str = NULL},                   0,    0, FLAGS },
+    { "names",       "classify type names",             OFFSET(names),           AV_OPT_TYPE_STRING, { .str = NULL},                   0,    0, FLAGS },
+    { "device",      "running on device type",          OFFSET(device_type),     AV_OPT_TYPE_FLAGS,  { .i64 = DNN_TARGET_DEVICE_CPU }, 0,   12, FLAGS },
+    { "interval",    "do infer every Nth frame",        OFFSET(every_nth_frame), AV_OPT_TYPE_INT,    { .i64 = 1 },                     0,   15, FLAGS},
+    { "batch_size",  "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },                     1, 1024, FLAGS},
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(inference_classify);
+
+static const AVFilterPad classify_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = config_input,
+        .filter_frame = filter_frame,
+    },
+    { NULL }
+};
+
+static const AVFilterPad classify_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = config_output,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_inference_classify = {
+    .name           = "classify",
+    .description    = NULL_IF_CONFIG_SMALL("DNN Inference classification."),
+    .priv_size      = sizeof(InferenceClassifyContext),
+    .query_formats  = query_formats,
+    .init           = classify_init,
+    .uninit         = classify_uninit,
+    .inputs         = classify_inputs,
+    .outputs        = classify_outputs,
+    .priv_class     = &inference_classify_class,
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
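
Since the filter only builds when configure finds libinference_engine,
applications may want to probe for it at runtime. A small sketch using the
public libavfilter API (names from this patch; nothing else assumed):

    #include <libavfilter/avfilter.h>

    /* Sketch: check whether the classify filter was compiled in. */
    static int have_classify_filter(void)
    {
        return avfilter_get_by_name("classify") != NULL;
    }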
diff --git a/libavfilter/vf_inference_detect.c b/libavfilter/vf_inference_detect.c
index 66aa494..8d676cf 100644
--- a/libavfilter/vf_inference_detect.c
+++ b/libavfilter/vf_inference_detect.c
@@ -114,11 +114,11 @@ static int face_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *meta, A
     AVFrameSideData *sd;
     InferDetectionMeta *detect_meta = NULL;
 
-    BBoxesArray *boxes = av_mallocz(sizeof(BBoxesArray));
+    BBoxesArray *boxes = av_mallocz(sizeof(*boxes));
     if (!boxes)
         return AVERROR(ENOMEM);
 
-    detect_meta = av_malloc(sizeof(InferDetectionMeta));
+    detect_meta = av_malloc(sizeof(*detect_meta));
     if (!detect_meta)
         return AVERROR(ENOMEM);
 
@@ -130,7 +130,7 @@ static int face_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *meta, A
     av_assert0(meta->total_bytes >= max_proposal_count * object_size * sizeof(float));
 
     for (i = 0; i < max_proposal_count; i++) {
-        InferDetection *new_bbox = av_mallocz(sizeof(InferDetection));
+        InferDetection *new_bbox = av_mallocz(sizeof(*new_bbox));
 
         new_bbox->label_id   = (int)detection[i * object_size + 1];
         new_bbox->confidence = detection[i * object_size + 2];
@@ -189,6 +189,23 @@ static int  logo_init(AVFilterContext *ctx, const char *args) {return 0;}
 static void logo_uninit(AVFilterContext *ctx) {}
 static int  logo_end_frame_filter(AVFilterContext *ctx, InferTensorMeta *data, AVFrame *frame) { return 0; }
 
+static int detect_preprocess(InferenceBaseContext *base, int index, AVFrame *in, AVFrame **out)
+{
+    int ret;
+    VideoPP *vpp = ff_inference_base_get_vpp(base);
+    AVFrame *tmp = vpp->frames[index];
+
+    if (!vpp->scale_contexts[index]) {
+        *out = in;
+        return 0;
+    }
+
+    ret = vpp->swscale(vpp->scale_contexts[index], (const uint8_t * const *)in->data,
+                       in->linesize, 0, in->height, tmp->data, tmp->linesize);
+    *out = tmp;
+    return ret;
+}
+
 static int query_formats(AVFilterContext *context)
 {
     AVFilterFormats *formats_list;
@@ -331,6 +348,7 @@ static av_cold int detect_init(AVFilterContext *ctx)
     p.input_precision = DNN_DATA_PRECISION_U8;
     p.input_layout    = DNN_DATA_LAYOUT_NCHW;
     p.input_is_image  = 1;
+    p.preprocess      = &detect_preprocess;
 
     ret = ff_inference_base_create(ctx, &s->base, &p);
     if (ret < 0) {
@@ -369,7 +387,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     if (ret < 0)
         goto fail;
 
-    ret = ff_inference_base_get_infer_result(s->base, &tensor_meta);
+    ret = ff_inference_base_get_infer_result(s->base, 0, &tensor_meta);
     if (ret < 0)
         goto fail;
 
@@ -386,7 +404,7 @@ static const AVOption inference_detect_options[] = {
     { "model",      "path to model file for network",  OFFSET(model_file),      AV_OPT_TYPE_STRING, { .str = NULL},                   0,    0, FLAGS },
     { "device",     "running on device type",          OFFSET(device_type),     AV_OPT_TYPE_FLAGS,  { .i64 = DNN_TARGET_DEVICE_CPU }, 0,   12, FLAGS },
     { "interval",   "detect every Nth frame",          OFFSET(every_nth_frame), AV_OPT_TYPE_INT,    { .i64 = 1 },                     0,   15, FLAGS},
-    { "batch_size", "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },                     0, 1024, FLAGS},
+    { "batch_size", "batch size per infer",            OFFSET(batch_size),      AV_OPT_TYPE_INT,    { .i64 = 1 },                     1, 1024, FLAGS},
     { "threshold",  "threshold to filter output data", OFFSET(threshold),       AV_OPT_TYPE_FLOAT,  { .dbl = 0.5},                    0,    1, FLAGS},
 
     { "name", "detection type name", OFFSET(name), AV_OPT_TYPE_STRING, .flags = FLAGS, "detection" },
diff --git a/libavutil/frame.c b/libavutil/frame.c
index 0b228cd..1866d85 100644
--- a/libavutil/frame.c
+++ b/libavutil/frame.c
@@ -383,12 +383,16 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
     for (i = 0; i < src->nb_side_data; i++) {
+        int keep_ref = 0;
         const AVFrameSideData *sd_src = src->side_data[i];
         AVFrameSideData *sd_dst;
         if (   sd_src->type == AV_FRAME_DATA_PANSCAN
             && (src->width != dst->width || src->height != dst->height))
             continue;
-        if (force_copy) {
+        if (sd_src->type == AV_FRAME_DATA_INFERENCE_CLASSIFICATION ||
+            sd_src->type == AV_FRAME_DATA_INFERENCE_DETECTION)
+            keep_ref = 1;
+        if (force_copy && !keep_ref) {
             sd_dst = av_frame_new_side_data(dst, sd_src->type,
                                             sd_src->size);
             if (!sd_dst) {
@@ -836,6 +840,7 @@ const char *av_frame_side_data_name(enum AVFrameSideDataType type)
     case AV_FRAME_DATA_S12M_TIMECODE:            return "SMPTE 12-1 timecode";
     case AV_FRAME_DATA_SPHERICAL:                return "Spherical Mapping";
     case AV_FRAME_DATA_ICC_PROFILE:              return "ICC profile";
+    case AV_FRAME_DATA_INFERENCE_CLASSIFICATION: return "Inference classification metadata";
    case AV_FRAME_DATA_INFERENCE_DETECTION:      return "Inference detection metadata";
 #if FF_API_FRAME_QP
     case AV_FRAME_DATA_QP_TABLE_PROPERTIES:      return "QP table properties";
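
The keep_ref special case is the "trick" from the commit message:
av_frame_copy_props() now propagates inference side data by reference instead
of deep-copying it, so a copied frame shares one AVBufferRef with the source
and the metadata must be treated as read-only. A sketch of the observable
effect, using only public frame APIs:

    /* Sketch: verify two frames share one inference metadata buffer after
     * av_frame_copy_props(), rather than each holding a private copy. */
    static int inference_meta_is_shared(const AVFrame *a, const AVFrame *b)
    {
        AVFrameSideData *sa =
            av_frame_get_side_data(a, AV_FRAME_DATA_INFERENCE_DETECTION);
        AVFrameSideData *sb =
            av_frame_get_side_data(b, AV_FRAME_DATA_INFERENCE_DETECTION);

        return sa && sb && sa->buf && sb->buf && sa->buf->data == sb->buf->data;
    }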
return "Inference detection metadata"; #if FF_API_FRAME_QP case AV_FRAME_DATA_QP_TABLE_PROPERTIES: return "QP table properties"; diff --git a/libavutil/frame.h b/libavutil/frame.h index 2dcf8da..a7e5caa 100644 --- a/libavutil/frame.h +++ b/libavutil/frame.h @@ -142,6 +142,8 @@ enum AVFrameSideDataType { */ AV_FRAME_DATA_ICC_PROFILE, + AV_FRAME_DATA_INFERENCE_CLASSIFICATION, + AV_FRAME_DATA_INFERENCE_DETECTION, #if FF_API_FRAME_QP -- 2.7.4