From ca12fa90c7eef79992dffc1dcc6edfec5e94b673 Mon Sep 17 00:00:00 2001 From: Lin Xie Date: Tue, 23 Apr 2019 15:41:30 +0800 Subject: [PATCH] Change IE filters to use model proc * dnn interfaces changed to match refined IE APIs * enable JSON file reader based on libcjson * removed obsolete parameters in detect filter TODO: * do parameters clean up in classify filter after face reidentify changed to use model proc. * use model proc to config input color format Change-Id: I6cd78390b9e3bf4726607ceecf36d10e93609eb6 Signed-off-by: Lin Xie --- libavfilter/dnn_backend_intel_ie.c | 76 +++----- libavfilter/dnn_data.h | 12 +- libavfilter/inference.c | 304 ++++++++++++++++++++++++++++++-- libavfilter/inference.h | 49 +++++- libavfilter/vf_inference_classify.c | 337 +++++++++++++++++++++++++++--------- libavfilter/vf_inference_detect.c | 107 +++++------- 6 files changed, 659 insertions(+), 226 deletions(-) diff --git a/libavfilter/dnn_backend_intel_ie.c b/libavfilter/dnn_backend_intel_ie.c index a600ee6..561cc15 100644 --- a/libavfilter/dnn_backend_intel_ie.c +++ b/libavfilter/dnn_backend_intel_ie.c @@ -241,19 +241,20 @@ static DNNReturnType get_input_info_intel_ie(void *model, DNNModelInfo *info) IEGetModelInputInfo(ie_model->context, ie_model->input_infos); - if (ie_model->input_infos->numbers > DNN_INPUT_OUTPUT_NUM) + if (ie_model->input_infos->number > DNN_INPUT_OUTPUT_NUM) return DNN_ERROR; - for (id = 0; id < ie_model->input_infos->numbers; id++) { - info->width[id] = ie_model->input_infos->width[id]; - info->height[id] = ie_model->input_infos->height[id]; - info->channels[id] = ie_model->input_infos->channels[id]; - info->precision[id] = get_dnn_precision(ie_model->input_infos->precision[id]); - info->layout[id] = get_dnn_layout(ie_model->input_infos->layout[id]); - info->is_image[id] = 0; + for (id = 0; id < ie_model->input_infos->number; id++) { + memcpy(&info->dims[id][0], + &ie_model->input_infos->tensorMeta[id].dims[0], + 4 * sizeof(info->dims[id][0])); + + 
info->layer_name[id] = ie_model->input_infos->tensorMeta[id].layer_name; + info->precision[id] = get_dnn_precision(ie_model->input_infos->tensorMeta[id].precision); + info->layout[id] = get_dnn_layout(ie_model->input_infos->tensorMeta[id].layout); } info->batch_size = ie_model->input_infos->batch_size; - info->numbers = ie_model->input_infos->numbers; + info->number = ie_model->input_infos->number; return DNN_SUCCESS; } @@ -263,15 +264,15 @@ static DNNReturnType set_input_info_intel_ie(void *model, DNNModelInfo *info) int id = 0; DNNIntelIEModel *ie_model = (DNNIntelIEModel *)model; - if (!model || !info || info->numbers > DNN_INPUT_OUTPUT_NUM) + if (!model || !info || info->number > DNN_INPUT_OUTPUT_NUM) return DNN_ERROR; - for (id = 0; id < info->numbers; id++) { - ie_model->input_infos->precision[id] = get_precision(info->precision[id]); - ie_model->input_infos->layout[id] = get_layout(info->layout[id]); - ie_model->input_infos->dataType[id] = info->is_image[id]; - } - ie_model->input_infos->numbers = info->numbers; + // image set to input 0 + ie_model->input_infos->tensorMeta[0].precision = get_precision(info->precision[id]); + ie_model->input_infos->tensorMeta[0].layout = get_layout(info->layout[id]); + ie_model->input_infos->tensorMeta[0].dataType = info->is_image[id]; + + ie_model->input_infos->number = info->number; IESetModelInputInfo(ie_model->context, ie_model->input_infos); @@ -288,42 +289,20 @@ static DNNReturnType get_output_info_intel_ie(void *model, DNNModelInfo *info) IEGetModelOutputInfo(ie_model->context, ie_model->output_infos); - if (ie_model->output_infos->numbers > DNN_INPUT_OUTPUT_NUM) + if (ie_model->output_infos->number > DNN_INPUT_OUTPUT_NUM) return DNN_ERROR; - for (id = 0; id < ie_model->output_infos->numbers; id++) { - info->width[id] = ie_model->output_infos->width[id]; - info->height[id] = ie_model->output_infos->height[id]; - info->channels[id] = ie_model->output_infos->channels[id]; - info->precision[id] = 
get_dnn_precision(ie_model->output_infos->precision[id]); - info->layout[id] = get_dnn_layout(ie_model->output_infos->layout[id]); - info->is_image[id] = 0; - } - info->batch_size = ie_model->output_infos->batch_size; - info->numbers = ie_model->output_infos->numbers; - - return DNN_SUCCESS; -} + for (id = 0; id < ie_model->output_infos->number; id++) { + memcpy(&info->dims[id][0], + &ie_model->output_infos->tensorMeta[id].dims[0], + 4 * sizeof(info->dims[id][0])); -static DNNReturnType set_output_info_intel_ie(void *model, DNNModelInfo *info) -{ - int id = 0; - DNNIntelIEModel *ie_model = (DNNIntelIEModel *)model; - - if (!model || !info) - return DNN_ERROR; - - if (info->numbers > DNN_INPUT_OUTPUT_NUM) - return DNN_ERROR; - - for (id = 0; id < info->numbers; id++) { - ie_model->output_infos->precision[id] = get_precision(info->precision[id]); - ie_model->output_infos->layout[id] = get_layout(info->layout[id]); - ie_model->output_infos->dataType[id] = info->is_image[id]; + info->layer_name[id] = ie_model->output_infos->tensorMeta[id].layer_name; + info->precision[id] = get_dnn_precision(ie_model->output_infos->tensorMeta[id].precision); + info->layout[id] = get_dnn_layout(ie_model->output_infos->tensorMeta[id].layout); } - ie_model->output_infos->numbers = info->numbers; - - IESetModelOutputInfo(ie_model->context, ie_model->output_infos); + info->batch_size = ie_model->output_infos->batch_size; + info->number = ie_model->output_infos->number; return DNN_SUCCESS; } @@ -407,7 +386,6 @@ DNNModel* ff_dnn_load_model_intel_ie(void *config) model->get_input_info = &get_input_info_intel_ie; model->set_input_info = &set_input_info_intel_ie; model->get_output_info = &get_output_info_intel_ie; - model->set_output_info = &set_output_info_intel_ie; model->create_model = &create_model_intel_ie; return model; diff --git a/libavfilter/dnn_data.h b/libavfilter/dnn_data.h index 0add8ee..7b7f4a5 100644 --- a/libavfilter/dnn_data.h +++ b/libavfilter/dnn_data.h @@ -21,6 +21,7 @@ 
#define AVFILTER_DNN_DATA_H #include +#include /** * @enum TargetDevice @@ -136,17 +137,18 @@ typedef struct DNNIOData { * @struct model input info * @brief model input info */ -#define DNN_INPUT_OUTPUT_NUM 10 +#define DNN_INPUT_OUTPUT_NUM 8 typedef struct DNNModelInfo { - unsigned int width[DNN_INPUT_OUTPUT_NUM]; - unsigned int height[DNN_INPUT_OUTPUT_NUM]; - unsigned int channels[DNN_INPUT_OUTPUT_NUM]; + char *layer_name[DNN_INPUT_OUTPUT_NUM]; + size_t dims[DNN_INPUT_OUTPUT_NUM][4]; + DNNDataPrecisionType precision[DNN_INPUT_OUTPUT_NUM]; DNNDataLayoutType layout[DNN_INPUT_OUTPUT_NUM]; + // 0 non-image; 1 image. unsigned int is_image[DNN_INPUT_OUTPUT_NUM]; unsigned int batch_size; - unsigned int numbers; + unsigned int number; } DNNModelInfo; /** diff --git a/libavfilter/inference.c b/libavfilter/inference.c index 596c5b4..380a5ed 100644 --- a/libavfilter/inference.c +++ b/libavfilter/inference.c @@ -33,6 +33,10 @@ #include "inference.h" +#if CONFIG_LIBCJSON +#include +#endif + #if CONFIG_VAAPI #define VA_CALL(_FUNC) \ { \ @@ -46,7 +50,7 @@ } #endif -struct InferenceBaseContext +struct _InferenceBaseContext { char *infer_type; int batch_size; @@ -68,6 +72,50 @@ static int va_vpp_crop_and_scale(VAAPIVpp *va_vpp, AVFrame *input, Rect *crop_re static int va_vpp_scale(VAAPIVpp *va_vpp, AVFrame *input, int scale_w, int scale_h, uint8_t *data[], int stride[]); +static void infer_labels_buffer_free(void *opaque, uint8_t *data) +{ + int i; + LabelsArray *labels = (LabelsArray *)data; + + for (i = 0; i < labels->num; i++) + av_freep(&labels->label[i]); + + av_free(data); +} + +// helper functions +static void infer_labels_dump(uint8_t *data) +{ + int i; + LabelsArray *labels = (LabelsArray *)data; + printf("labels: "); + for (i = 0; i < labels->num; i++) + printf("%s ", labels->label[i]); + printf("\n"); +} + +int ff_get_file_size(FILE *fp) +{ + int file_size, current_pos; + + if (!fp) + return -1; + + current_pos = ftell(fp); + + if (fseek(fp, 0, SEEK_END)) { + 
fprintf(stderr, "Couldn't seek to the end of feature file.\n"); + return -1; + } + + file_size = ftell(fp); + + fseek(fp, current_pos, SEEK_SET); + + return file_size; +} + + static int fill_dnn_data_from_frame(DNNIOData *data, const AVFrame *frame, int batch_idx, @@ -314,7 +362,7 @@ int ff_inference_base_create(AVFilterContext *ctx, DNN_ERR_CHECK(ctx); info = &s->input_info; - for (i = 0; i < info->numbers; i++) { + for (i = 0; i < info->number; i++) { info->layout[i] = param->input_layout; info->precision[i] = param->input_precision; info->is_image[i] = param->input_is_image; @@ -387,7 +435,6 @@ int ff_inference_base_submit_frame(InferenceBaseContext *base, DNNIOData input = { }; fill_dnn_data_from_frame(&input, frame, batch_idx, 1, input_idx); base->model->set_input(base->model->model, &input); - #if CONFIG_VAAPI if (base->vpp.va_vpp) va_vpp_surface_release(base->vpp.va_vpp); @@ -410,7 +457,7 @@ int ff_inference_base_filter_frame(InferenceBaseContext *base, AVFrame *in) DNNReturnType dnn_ret; DNNIOData input = { }; - for (int i = 0; i < info->numbers; i++) { + for (int i = 0; i < info->number; i++) { AVFrame *processed_frame; for (int j = 0; j < base->batch_size; j++) { if (base->preprocess) { @@ -433,7 +480,7 @@ int ff_inference_base_filter_frame(InferenceBaseContext *base, AVFrame *in) } int ff_inference_base_get_infer_result(InferenceBaseContext *base, - int output_index, + int id, InferTensorMeta *metadata) { DNNModelInfo *info = &base->output_info; @@ -443,18 +490,17 @@ int ff_inference_base_get_infer_result(InferenceBaseContext *base, av_assert0(metadata != NULL); // TODO: change to layer name for multiple outputs - data.in_out_idx = output_index; + data.in_out_idx = id; ret = base->model->get_execute_result(base->model->model, &data); av_assert0(ret == DNN_SUCCESS); - //TODO: refine by new interface - metadata->dim_size = 3; - metadata->dims[0] = info->width[0]; - metadata->dims[1] = info->height[0]; - metadata->dims[2] = info->channels[0]; - 
metadata->layout = info->layout[0]; - metadata->precision = info->precision[0]; + metadata->dim_size = 4; + memcpy(&metadata->dims[0], &info->dims[id][0], + metadata->dim_size * sizeof(metadata->dims[0])); + + metadata->layout = info->layout[id]; + metadata->precision = info->precision[id]; metadata->data = data.data[0]; metadata->total_bytes = data.size; @@ -477,11 +523,23 @@ VideoPP* ff_inference_base_get_vpp(InferenceBaseContext *base) return &base->vpp; } -/******************************************** - * * - * VAAPI VPP APIs * - * * - *******************************************/ +void ff_inference_dump_model_info(void *ctx, DNNModelInfo *info) +{ + int i; + for (i = 0; i < info->number; i++) { + size_t *p = &info->dims[i][0]; + av_log(ctx, AV_LOG_DEBUG, "Info id:%d layer\"%-16s\" " + "batch size:%d - dim: %3lu %3lu %3lu %3lu - img:%d pre:%d layout:%d\n", + i, info->layer_name[i], + info->batch_size, p[0], p[1], p[2], p[3], + info->is_image[i], info->precision[i], info->layout[i]); + } +} + +/* + * VAAPI VPP APIs + */ + #if CONFIG_VAAPI static int ff_vaapi_vpp_colour_standard(enum AVColorSpace av_cs) { @@ -733,3 +791,213 @@ static int va_vpp_crop_and_scale(VAAPIVpp *va_vpp, return VA_STATUS_SUCCESS; } #endif + +#if CONFIG_LIBCJSON +/* + * model proc parsing functions using cJSON + */ +static inline void json_print(cJSON *j) +{ + char *string = cJSON_Print(j); + if (string) + printf("%s\n", string); +} + +void *ff_read_model_proc(const char *path) +{ + int n, file_size; + cJSON *proc_config = NULL; + uint8_t *proc_json = NULL; + FILE *fp = fopen(path, "rb"); + if (!fp) { + fprintf(stderr, "File open error:%s\n", path); + return NULL; + } + + file_size = ff_get_file_size(fp); + + proc_json = av_mallocz(file_size); + if (!proc_json) + goto end; + + n = fread(proc_json, file_size, 1, fp); + + UNUSED(n); + + proc_config = cJSON_Parse(proc_json); + if (proc_config == NULL) { + const char *error_ptr = cJSON_GetErrorPtr(); + if (error_ptr != NULL) + fprintf(stderr, 
"Error before: %s\n", error_ptr); + goto end; + } + +end: + if (proc_json) + av_freep(&proc_json); + fclose(fp); + return proc_config; +} + +void ff_load_default_model_proc(ModelInputPreproc *preproc, ModelOutputPostproc *postproc) +{ + if (preproc) { + /* + * format is a little tricky, an ideal input format for IE is BGR planar + * however, neither soft csc nor hardware vpp could support that format. + * Here, we set a close soft format. The actual one converted before sent + * to IE will be decided by user config and hardware vpp used or not. + */ + preproc->color_format = AV_PIX_FMT_BGR24; + preproc->layer_name = NULL; + } + + if (postproc) { + // do nothing + } +} + +int ff_parse_input_preproc(const void *json, ModelInputPreproc *m_preproc) +{ + cJSON *item, *preproc, *color, *layer, *object_class; + + preproc = cJSON_GetObjectItem(json, "input_preproc"); + if (preproc == NULL) { + av_log(NULL, AV_LOG_DEBUG, "No input_preproc.\n"); + return 0; + } + + // not support multiple inputs yet + av_assert0(cJSON_GetArraySize(preproc) <= 1); + + cJSON_ArrayForEach(item, preproc) + { + color = cJSON_GetObjectItemCaseSensitive(item, "color_format"); + layer = cJSON_GetObjectItemCaseSensitive(item, "layer_name"); + object_class = cJSON_GetObjectItemCaseSensitive(item, "object_class"); + } + + if (color) { + if (!cJSON_IsString(color) || (color->valuestring == NULL)) + return -1; + + av_log(NULL, AV_LOG_INFO, "Color Format:\"%s\"\n", color->valuestring); + + if (!strcmp(color->valuestring, "BGR")) + m_preproc->color_format = AV_PIX_FMT_BGR24; + else if (!strcmp(color->valuestring, "RGB")) + m_preproc->color_format = AV_PIX_FMT_RGB24; + else + return -1; + } + + if (object_class) { + if (!cJSON_IsString(object_class) || (object_class->valuestring == NULL)) + return -1; + + av_log(NULL, AV_LOG_INFO, "Object_class:\"%s\"\n", object_class->valuestring); + + m_preproc->object_class = object_class->valuestring; + } + + UNUSED(layer); + + return 0; +} + +// For detection, we now
care about labels only. +// Layer name and type can be obtained from output blob. +int ff_parse_output_postproc(const void *json, ModelOutputPostproc *m_postproc) +{ + size_t index = 0; + cJSON *item, *postproc; + cJSON *attribute, *converter, *labels, *layer, *method, *threshold; + cJSON *tensor2text_scale, *tensor2text_precision; + + postproc = cJSON_GetObjectItem(json, "output_postproc"); + if (postproc == NULL) { + av_log(NULL, AV_LOG_DEBUG, "No output_postproc.\n"); + return 0; + } + + av_assert0(cJSON_GetArraySize(postproc) <= MAX_MODEL_OUTPUT); + cJSON_ArrayForEach(item, postproc) + { + OutputPostproc *proc = &m_postproc->procs[index]; + +#define FETCH_STRING(var, name) \ + do { var = cJSON_GetObjectItemCaseSensitive(item, #name);\ + if (var) proc->name = var->valuestring; \ + } while(0) +#define FETCH_DOUBLE(var, name) \ + do { var = cJSON_GetObjectItemCaseSensitive(item, #name);\ + if (var) proc->name = var->valuedouble; \ + } while(0) +#define FETCH_INTEGER(var, name) \ + do { var = cJSON_GetObjectItemCaseSensitive(item, #name);\ + if (var) proc->name = var->valueint; \ + } while(0) + + FETCH_STRING(layer, layer_name); + FETCH_STRING(method, method); + FETCH_STRING(attribute, attribute_name); + FETCH_STRING(converter, converter); + + FETCH_DOUBLE(threshold, threshold); + FETCH_DOUBLE(tensor2text_scale, tensor2text_scale); + + FETCH_INTEGER(tensor2text_precision, tensor2text_precision); + + // handle labels + labels = cJSON_GetObjectItemCaseSensitive(item, "labels"); + if (labels) { + cJSON *label; + size_t labels_num = cJSON_GetArraySize(labels); + + if (labels_num > 0) { + AVBufferRef *ref = NULL; + LabelsArray *larray = av_mallocz(sizeof(*larray)); + + if (!larray) + return AVERROR(ENOMEM); + + cJSON_ArrayForEach(label, labels) { + char *l = av_strdup(label->valuestring); + av_dynarray_add(&larray->label, &larray->num, l); + } + + ref = av_buffer_create((uint8_t *)larray, sizeof(*larray), + &infer_labels_buffer_free, NULL, 0); + + proc->labels = ref; + +
infer_labels_dump(ref->data); + } + } + + index++; + } +#undef FETCH_STRING +#undef FETCH_DOUBLE +#undef FETCH_INTEGER + + return 0; +} + +void ff_release_model_proc(const void *json, + ModelInputPreproc *preproc, ModelOutputPostproc *postproc) +{ + size_t index = 0; + + if (!json) return; + + if (postproc) { + for (index = 0; index < MAX_MODEL_OUTPUT; index++) { + if (postproc->procs[index].labels) + av_buffer_unref(&postproc->procs[index].labels); + } + } + + cJSON_Delete((cJSON *)json); +} +#endif diff --git a/libavfilter/inference.h b/libavfilter/inference.h index d0515e2..0512403 100644 --- a/libavfilter/inference.h +++ b/libavfilter/inference.h @@ -32,10 +32,15 @@ #include "dnn_interface.h" -typedef struct InferenceBaseContext InferenceBaseContext; +typedef struct _InferenceBaseContext InferenceBaseContext; +typedef struct _InputPreproc ModelInputPreproc; +typedef struct _OutputPostproc OutputPostproc; +typedef struct _ModelOutputPostproc ModelOutputPostproc; typedef int (*InferencePreProcess)(InferenceBaseContext *base, int index, AVFrame *in, AVFrame **out); +#define UNUSED(x) (void)(x) + typedef struct InferenceParam { char *model_file; char *labels_file; @@ -116,14 +121,36 @@ struct _SwVpp { typedef struct VideoPP { int device; int expect_format; - AVFrame *frames[MAX_VPP_NUM]; //data; - av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Label id:%d Emotion:%s Conf:%f\n", - label_id, array->label[label_id], conf); + av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Label id:%d %s:%s Conf:%f\n", + label_id, name, array->label[label_id], conf); } -static int emotion_classify_result_process(AVFilterContext *ctx, - int detect_id, - int result_id, - int model_index, - InferTensorMeta *meta, - InferClassificationMeta *c_meta) +static av_cold void dump_tensor_value(AVFilterContext *ctx, char *name, float value) { - int i, label_id = 0; - InferenceClassifyContext *s = ctx->priv; - const float *emo_confidence = (float *)meta->data; - size_t labels_num = meta->dims[2]; - float 
max = emo_confidence[0]; + av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - %s:%1.2f\n", name, value); +} + +static void find_max_element_index(const float *array, int len, + int *index, float *value) +{ + int i; + *index = 0; + *value = array[0]; + for (i = 1; i < len; i++) { + if (array[i] > *value) { + *index = i; + *value = array[i]; + } + } +} - InferClassification *classify = av_mallocz(sizeof(*classify)); +static int attributes_to_text(AVFilterContext *ctx, + int detect_id, + OutputPostproc *proc, + InferTensorMeta *meta, + InferClassificationMeta *c_meta) +{ + InferClassification *classify; + uint32_t method_max, method_compound, method_index; + const float *data = (const float *)meta->data; + + method_max = !strcmp(proc->method, "max"); + method_compound = !strcmp(proc->method, "compound"); + method_index = !strcmp(proc->method, "index"); + + if (!data) return -1; + + classify = av_mallocz(sizeof(*classify)); if (!classify) return AVERROR(ENOMEM); - // Get the emotion with max confidence - for (i = 1; i < labels_num; i++) - if (emo_confidence[i] > max) { max = emo_confidence[i]; label_id = i; } + if (method_max) { + int index; + float confidence; + size_t n = meta->dims[1]; + + find_max_element_index(data, n, &index, &confidence); + + classify->detect_id = detect_id; + classify->name = proc->attribute_name; + classify->label_id = index; + classify->confidence = confidence; + classify->label_buf = av_buffer_ref(proc->labels); + + dump_softmax(ctx, classify->name, classify->label_id, + classify->confidence,classify->label_buf); + } else if (method_compound) { + int i; + double threshold = 0.5; + float confidence = 0; + char attributes[4096] = {}; + LabelsArray *array; + + if (proc->threshold != 0) + threshold = proc->threshold; + + array = (LabelsArray *)proc->labels->data; + for (i = 0; i < array->num; i++) { + if (data[i] >= threshold) + strncat(attributes, array->label[i], (strlen(array->label[i]) + 1)); + if (data[i] > confidence) + confidence = data[i]; + } - 
classify->detect_id = detect_id; - classify->name = s->name_array[model_index]; - classify->label_id = label_id; - classify->confidence = emo_confidence[label_id]; - classify->label_buf = av_buffer_ref(s->label_bufs[model_index]); + classify->name = proc->attribute_name; + classify->confidence = confidence; + + av_log(ctx, AV_LOG_DEBUG, "Attributes: %s\n", attributes); + // TODO: to add into side data + av_free(classify); + return 0; + } else if (method_index) { + int i; + char attributes[1024] = {}; + LabelsArray *array; + + array = (LabelsArray *)proc->labels->data; + for (i = 0; i < array->num; i++) { + int value = data[i]; + if (value < 0 || value >= array->num) + break; + strncat(attributes, array->label[value], (strlen(array->label[value]) + 1)); + } - dump_emotion(ctx, classify->label_id, classify->confidence, classify->label_buf); + classify->name = proc->attribute_name; - av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify); + av_log(ctx, AV_LOG_DEBUG, "Attributes: %s\n", attributes); + // TODO: to add into side data + av_free(classify); + return 0; + } + av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify); return 0; } -static av_cold void dump_gender(AVFilterContext *ctx, int label_id, - float conf, AVBufferRef *label_buf) +static int tensor_to_text(AVFilterContext *ctx, + int detect_id, + OutputPostproc *proc, + InferTensorMeta *meta, + InferClassificationMeta *c_meta) { - LabelsArray *array = (LabelsArray *)label_buf->data; + InferClassification *classify; + const float *data = (const float *)meta->data; + double scale = 1.0; - av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Gender:%s Conf:%1.2f\n", - array->label[label_id], conf); -} + if (!data) return -1; -static av_cold void dump_age(AVFilterContext *ctx, float age) -{ - av_log(ctx, AV_LOG_DEBUG, "CLASSIFY META - Age:%1.2f\n", age); + classify = av_mallocz(sizeof(*classify)); + if (!classify) + return AVERROR(ENOMEM); + + if 
(proc->tensor2text_scale != 0) + scale = proc->tensor2text_scale; + + classify->detect_id = detect_id; + classify->name = proc->attribute_name; + classify->value = *data * scale; + + dump_tensor_value(ctx, classify->name, classify->value); + + av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify); + return 0; } -static int age_gender_classify_result_process(AVFilterContext *ctx, - int detect_id, - int result_id, - int model_index, - InferTensorMeta *meta, - InferClassificationMeta *c_meta) +static int commmon_postprocess(AVFilterContext *ctx, + int detect_id, + int result_id, + int model_id, + InferTensorMeta *meta, + InferClassificationMeta *c_meta) { + int proc_id; InferenceClassifyContext *s = ctx->priv; - const float *data = (float *)meta->data; + InferenceBaseContext *base = s->infer_bases[model_id]; - InferClassification *classify = av_mallocz(sizeof(*classify)); - if (!classify) - return AVERROR(ENOMEM); + OutputPostproc *proc; + DNNModelInfo *info = ff_inference_base_get_output_info(base); - classify->detect_id = detect_id; + // search model postproc + for (proc_id = 0; proc_id < MAX_MODEL_OUTPUT; proc_id++) { + char *proc_layer_name = s->model_postproc[model_id].procs[proc_id].layer_name; - if (result_id == 0) { - // Age - classify->name = string_age; - classify->value = *data * 100.0; - dump_age(ctx, classify->value); - } else { - // Gender - classify->name = string_gender; - // 0 - Femal, 1 - Male - classify->label_id = data[0] > data[1] ? 
0 : 1; - classify->confidence = data[classify->label_id]; - classify->label_buf = av_buffer_ref(s->label_bufs[model_index]); - dump_gender(ctx, classify->label_id, classify->confidence, classify->label_buf); + // skip this output process + if (!proc_layer_name) + continue; + + if (!strcmp(info->layer_name[result_id], proc_layer_name)) + break; } - av_dynarray_add(&c_meta->c_array->classifications, &c_meta->c_array->num, classify); + if (proc_id == MAX_MODEL_OUTPUT) { + av_log(ctx, AV_LOG_DEBUG, "Could not find proc:%s\n", info->layer_name[result_id]); + return 0; + } + + proc = &s->model_postproc[model_id].procs[proc_id]; + + if (proc->converter == NULL) + return 0; + + if (!strcmp(proc->converter, "attributes")) + return attributes_to_text(ctx, detect_id, proc, meta, c_meta); + + if (!strcmp(proc->converter, "tensor2text")) + return tensor_to_text(ctx, detect_id, proc, meta, c_meta); return 0; } @@ -227,13 +324,7 @@ static int face_identify_init(AVFilterContext *ctx, size_t index) av_assert0(index < MAX_MODEL_NUM); - if (fseek(fp, 0, SEEK_END)) { - av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the end of feature file.\n"); - fclose(fp); - return AVERROR(EINVAL); - } - - feature_size = ftell(fp); + feature_size = ff_get_file_size(fp); if (feature_size == -1) { fclose(fp); @@ -427,11 +518,12 @@ static av_cold int classify_init(AVFilterContext *ctx) { InferenceClassifyContext *s = ctx->priv; int i, ret; - int model_num = 0, label_num = 0, name_num = 0; + int model_num = 0, model_proc_num = 0, label_num = 0, name_num = 0; const int max_num = MAX_MODEL_NUM; char *names[MAX_MODEL_NUM] = { }; char *models[MAX_MODEL_NUM] = { }; char *labels[MAX_MODEL_NUM] = { }; + char *models_proc[MAX_MODEL_NUM] = { }; InferenceParam p = {}; av_assert0(s->model_file); @@ -448,6 +540,13 @@ static av_cold int classify_init(AVFilterContext *ctx) for (i = 0; i < name_num; i++) av_log(ctx, AV_LOG_INFO, "name[%d]:%s\n", i, names[i]); + av_split(s->model_proc, "&", models_proc, &model_proc_num, 
max_num); + for (i = 0; i < model_proc_num; i++) + av_log(ctx, AV_LOG_INFO, "proc[%d]:%s\n", i, models_proc[i]); + + // TODO: uncomment this after face reidentify use proc file + // av_assert0(model_proc_num == model_num); + av_assert0(s->backend_type == DNN_INTEL_IE); p.backend_type = s->backend_type; @@ -458,9 +557,10 @@ static av_cold int classify_init(AVFilterContext *ctx) p.input_is_image = 1; for (i = 0; i < model_num; i++) { + void *proc; InferenceBaseContext *base = NULL; - p.model_file = models[i]; + p.model_file = models[i]; ret = ff_inference_base_create(ctx, &base, &p); if (ret < 0) { av_log(ctx, AV_LOG_ERROR, "Could not create inference\n"); @@ -468,6 +568,33 @@ static av_cold int classify_init(AVFilterContext *ctx) } s->infer_bases[i] = base; + + ff_load_default_model_proc(&s->model_preproc[i], &s->model_postproc[i]); + + if (!models_proc[i]) + continue; + + proc = ff_read_model_proc(models_proc[i]); + if (!proc) { + av_log(ctx, AV_LOG_ERROR, "Could not read proc config file:" + "%s\n", models_proc[i]); + ret = AVERROR(EIO); + goto fail; + } + + if (ff_parse_input_preproc(proc, &s->model_preproc[i]) < 0) { + av_log(ctx, AV_LOG_ERROR, "Parse input preproc error.\n"); + ret = AVERROR(EIO); + goto fail; + } + + if (ff_parse_output_postproc(proc, &s->model_postproc[i]) < 0) { + av_log(ctx, AV_LOG_ERROR, "Parse output postproc error.\n"); + ret = AVERROR(EIO); + goto fail; + } + + s->proc_config[i] = proc; } s->loaded_num = model_num; @@ -506,16 +633,14 @@ static av_cold int classify_init(AVFilterContext *ctx) s->label_bufs[i] = ref; } - for (i = 0; i < name_num; i++) { + for (i = 0; i < model_num; i++) { s->name_array[i] = names[i]; - if (strstr(names[i], "emotion")) { - s->post_process[i] = &emotion_classify_result_process; - } else if (strstr(names[i], "age") && strstr(names[i], "gend")) { - s->post_process[i] = &age_gender_classify_result_process; - } else if (strstr(names[i], "face")) { + if (names[i] && strstr(names[i], "face")) { s->init[i] = 
&face_identify_init; s->uninit[i] = &face_identify_uninit; s->post_process[i] = &face_identify_result_process; + } else { + s->post_process[i] = &commmon_postprocess; } if (s->init[i] && s->init[i](ctx, i) < 0) @@ -545,6 +670,8 @@ static av_cold void classify_uninit(AVFilterContext *ctx) ff_inference_base_free(&s->infer_bases[i]); av_buffer_unref(&s->label_bufs[i]); + + ff_release_model_proc(s->proc_config[i], &s->model_preproc[i], &s->model_postproc[i]); } } @@ -594,11 +721,14 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) for (j = 0; j < s->loaded_num; j++) { int output; InferenceBaseContext *base = s->infer_bases[j]; + ModelInputPreproc *preproc = &s->model_preproc[j]; VideoPP *vpp = ff_inference_base_get_vpp(base); AVFrame *tmp = vpp->frames[0]; DNNModelInfo *iinfo = ff_inference_base_get_input_info(base); DNNModelInfo *oinfo = ff_inference_base_get_output_info(base); + int scale_width = iinfo->dims[0][0]; + int scale_height = iinfo->dims[0][1]; Rect crop_rect = (Rect) { .x0 = bbox->x_min * in->width, @@ -607,14 +737,21 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) .y1 = bbox->y_max * in->height, }; + // care interested object class only + if (preproc && preproc->object_class && bbox->label_buf) { + LabelsArray *array = (LabelsArray *)bbox->label_buf->data; + if (0 != strcmp(preproc->object_class, array->label[bbox->label_id])) + continue; + } + if (vpp->device == VPP_DEVICE_SW) { ret = vpp->sw_vpp->crop_and_scale(in, &crop_rect, - iinfo->width[0], iinfo->height[0], + scale_width, scale_height, vpp->expect_format, tmp->data, tmp->linesize); } else { #if CONFIG_VAAPI ret = vpp->va_vpp->crop_and_scale(vpp->va_vpp, in, &crop_rect, - iinfo->width[0], iinfo->height[0], tmp->data, tmp->linesize); + scale_width, scale_height, tmp->data, tmp->linesize); #endif } if (ret != 0) { @@ -626,7 +763,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) ff_inference_base_submit_frame(base, tmp, 0, 0); ff_inference_base_infer(base); - 
for (output = 0; output < oinfo->numbers; output++) { + for (output = 0; output < oinfo->number; output++) { InferTensorMeta tensor_meta = { }; ff_inference_base_get_infer_result(base, output, &tensor_meta); @@ -672,8 +809,13 @@ static av_cold int config_input(AVFilterLink *inlink) DNNModelInfo *info = ff_inference_base_get_input_info(base); VideoPP *vpp = ff_inference_base_get_vpp(base); + int input_width = info->dims[0][0]; + int input_height = info->dims[0][1]; + // right now, no model needs multiple inputs - av_assert0(info->numbers == 1); + // av_assert0(info->number == 1); + + ff_inference_dump_model_info(ctx, info); vpp->device = (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) ? VPP_DEVICE_HW : VPP_DEVICE_SW; @@ -682,8 +824,8 @@ static av_cold int config_input(AVFilterLink *inlink) frame = av_frame_alloc(); if (!frame) return AVERROR(ENOMEM); - frame->width = info->width[0]; - frame->height = info->height[0]; + frame->width = input_width; + frame->height = input_height; frame->format = expect_format; vpp->frames[0] = frame; @@ -706,7 +848,7 @@ static av_cold int config_input(AVFilterLink *inlink) } ret = va_vpp_surface_alloc(vpp->va_vpp, - info->width[0], info->height[0], s->vpp_format); + input_width, input_height, s->vpp_format); if (ret < 0) { av_log(ctx, AV_LOG_ERROR, "Create va surface failed\n"); ret = AVERROR(EINVAL); @@ -741,12 +883,39 @@ fail: static av_cold int config_output(AVFilterLink *outlink) { + int i; + AVFilterContext *ctx = outlink->src; + InferenceClassifyContext *s = ctx->priv; + + for (i = 0; i < s->loaded_num; i++) { + InferenceBaseContext *base = s->infer_bases[i]; + DNNModelInfo *info = ff_inference_base_get_output_info(base); + ff_inference_dump_model_info(ctx, info); + +#if CONFIG_VAAPI + if (!outlink->hw_frames_ctx) { + VideoPP *vpp = ff_inference_base_get_vpp(base); + if (vpp->device == VPP_DEVICE_HW) { + if (!vpp->va_vpp || !vpp->va_vpp->hw_frames_ref) { + av_log(ctx, AV_LOG_ERROR, "The input must have a hardware frame " + 
"reference.\n"); + return AVERROR(EINVAL); + } + outlink->hw_frames_ctx = av_buffer_ref(vpp->va_vpp->hw_frames_ref); + if (!outlink->hw_frames_ctx) + return AVERROR(ENOMEM); + } + } +#endif + } + return 0; } static const AVOption inference_classify_options[] = { { "dnn_backend", "DNN backend for model execution", OFFSET(backend_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_INTEL_IE }, 0, 2, FLAGS, "engine" }, { "model", "path to model files for network", OFFSET(model_file), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, + { "model_proc", "model preproc and postproc", OFFSET(model_proc), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "label", "labels for classify", OFFSET(labels), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "name", "classify type names", OFFSET(names), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "vpp_format", "specify vpp output format", OFFSET(vpp_format), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, diff --git a/libavfilter/vf_inference_detect.c b/libavfilter/vf_inference_detect.c index 42a0e02..cfc0524 100644 --- a/libavfilter/vf_inference_detect.c +++ b/libavfilter/vf_inference_detect.c @@ -50,8 +50,8 @@ typedef struct InferenceDetectContext { InferenceBaseContext *base; char *model_file; - char *label_file; char *vpp_format; + char *model_proc; int backend_type; int device_type; @@ -64,22 +64,12 @@ typedef struct InferenceDetectContext { int input_precision; int input_is_image; - char *name; + void *proc_config; - AVBufferRef *label_buf; + ModelInputPreproc model_preproc; + ModelOutputPostproc model_postproc; } InferenceDetectContext; -static void infer_labels_buffer_free(void *opaque, uint8_t *data) -{ - int i; - LabelsArray *labels = (LabelsArray *)data; - - for (i = 0; i < labels->num; i++) - av_freep(&labels->label[i]); - - av_free(data); -} - static void infer_detect_metadata_buffer_free(void *opaque, uint8_t *data) { BBoxesArray *bboxes = ((InferDetectionMeta *)data)->bboxes; @@ -102,8 +92,8 @@ static int 
detect_postprocess(AVFilterContext *ctx, InferTensorMeta *meta, AVFra { int i; InferenceDetectContext *s = ctx->priv; - int object_size = meta->dims[0]; - int max_proposal_count = meta->dims[1]; + int object_size = meta->dims[3]; + int max_proposal_count = meta->dims[2]; const float *detection = (float *)meta->data; AVBufferRef *ref; AVFrameSideData *sd; @@ -136,11 +126,12 @@ static int detect_postprocess(AVFilterContext *ctx, InferTensorMeta *meta, AVFra if (new_bbox->confidence < s->threshold) { av_freep(&new_bbox); - break; + continue; } - if (s->label_buf) - new_bbox->label_buf = av_buffer_ref(s->label_buf); + // TODO: use layer name to get proc + if (s->model_postproc.procs[0].labels) + new_bbox->label_buf = av_buffer_ref(s->model_postproc.procs[0].labels); av_dynarray_add(&boxes->bbox, &boxes->num, new_bbox); } @@ -217,7 +208,7 @@ static int query_formats(AVFilterContext *context) static int config_input(AVFilterLink *inlink) { - int i, ret; + int ret; AVFrame *frame; AVFilterContext *ctx = inlink->dst; InferenceDetectContext *s = ctx->priv; @@ -227,14 +218,12 @@ static int config_input(AVFilterLink *inlink) DNNModelInfo *info = ff_inference_base_get_input_info(s->base); VideoPP *vpp = ff_inference_base_get_vpp(s->base); - for (i = 0; i < info->numbers; i++) { - av_log(ctx, AV_LOG_DEBUG, "Input info [%d] %d - %d %d %d - %d %d %d\n", - i, info->batch_size, info->width[i], info->height[i], info->channels[i], - info->is_image[i], info->precision[i], info->layout[i]); - } + int width = info->dims[0][0], height = info->dims[0][1]; + + ff_inference_dump_model_info(ctx, info); // right now, no model needs multiple inputs - av_assert0(info->numbers == 1); + av_assert0(info->number == 1); vpp->device = (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) ? 
VPP_DEVICE_HW : VPP_DEVICE_SW; @@ -242,15 +231,15 @@ static int config_input(AVFilterLink *inlink) frame = av_frame_alloc(); if (!frame) return AVERROR(ENOMEM); - frame->width = info->width[0]; - frame->height = info->height[0]; + frame->width = width; + frame->height = height; frame->format = expect_format; vpp->frames[0] = frame; if (vpp->device == VPP_DEVICE_SW) { - int need_scale = expect_format != inlink->format || - info->width[0] != inlink->w || - info->height[0] != inlink->h; + int need_scale = expect_format != inlink->format || + width != inlink->w || + height != inlink->h; if (need_scale) { if (av_frame_get_buffer(frame, 0) < 0) { @@ -260,7 +249,7 @@ static int config_input(AVFilterLink *inlink) vpp->sw_vpp->scale_contexts[0] = sws_getContext( inlink->w, inlink->h, inlink->format, - info->width[0], info->height[0], expect_format, + width, height, expect_format, SWS_BILINEAR, NULL, NULL, NULL); if (!vpp->sw_vpp->scale_contexts[0]) { @@ -284,8 +273,7 @@ static int config_input(AVFilterLink *inlink) goto fail; } - ret = va_vpp_surface_alloc(vpp->va_vpp, - info->width[0], info->height[0], s->vpp_format); + ret = va_vpp_surface_alloc(vpp->va_vpp, width, height, s->vpp_format); if (ret < 0) { av_log(ctx, AV_LOG_ERROR, "Create va surface failed\n"); ret = AVERROR(EINVAL); @@ -316,12 +304,7 @@ static int config_output(AVFilterLink *outlink) DNNModelInfo *info = ff_inference_base_get_output_info(s->base); - for (int i = 0; i < info->numbers; i++) { - av_log(ctx, AV_LOG_DEBUG, "Output info [%d] %d - %d %d %d - %d %d %d\n", - i, info->batch_size, - info->width[i], info->height[i], info->channels[i], - info->is_image[i], info->precision[i], info->layout[i]); - } + ff_inference_dump_model_info(ctx, info); #if CONFIG_VAAPI if (vpp->device == VPP_DEVICE_HW) { @@ -345,40 +328,31 @@ static av_cold int detect_init(AVFilterContext *ctx) InferenceDetectContext *s = ctx->priv; InferenceParam p = {}; - av_assert0(s->model_file && s->name); + av_assert0(s->model_file); 
av_assert0(s->backend_type == DNN_INTEL_IE); - if (s->label_file) { - int n, labels_num; - AVBufferRef *ref = NULL; - LabelsArray *larray = NULL; - char buffer[4096] = { }; - char *_labels[100] = { }; + ff_load_default_model_proc(&s->model_preproc, &s->model_postproc); - FILE *fp = fopen(s->label_file, "rb"); - if (!fp) { - av_log(ctx, AV_LOG_ERROR, "Could not open file:%s\n", s->label_file); + if (s->model_proc) { + void *proc = ff_read_model_proc(s->model_proc); + if (!proc) { + av_log(ctx, AV_LOG_ERROR, "Could not read proc config file:" + "%s\n", s->model_proc); return AVERROR(EIO); } - n = fread(buffer, sizeof(buffer), 1, fp); - fclose(fp); - - av_split(buffer, ",", _labels, &labels_num, 100); - - larray = av_mallocz(sizeof(*larray)); - if (!larray) - return AVERROR(ENOMEM); + if (ff_parse_input_preproc(proc, &s->model_preproc) < 0) { + av_log(ctx, AV_LOG_ERROR, "Parse input preproc error.\n"); + return AVERROR(EIO); + } - for (n = 0; n < labels_num; n++) { - char *l = av_strdup(_labels[n]); - av_dynarray_add(&larray->label, &larray->num, l); + if (ff_parse_output_postproc(proc, &s->model_postproc) < 0) { + av_log(ctx, AV_LOG_ERROR, "Parse output postproc error.\n"); + return AVERROR(EIO); } - ref = av_buffer_create((uint8_t *)larray, sizeof(*larray), - &infer_labels_buffer_free, NULL, 0); - s->label_buf = ref; + s->proc_config = proc; } p.model_file = s->model_file; @@ -405,7 +379,7 @@ static av_cold void detect_uninit(AVFilterContext *ctx) ff_inference_base_free(&s->base); - if (s->label_buf) av_buffer_unref(&s->label_buf); + ff_release_model_proc(s->proc_config, &s->model_preproc, &s->model_postproc); } static int filter_frame(AVFilterLink *inlink, AVFrame *in) @@ -440,14 +414,13 @@ fail: static const AVOption inference_detect_options[] = { { "dnn_backend", "DNN backend for model execution", OFFSET(backend_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_INTEL_IE }, 0, 2, FLAGS, "engine" }, { "model", "path to model file for network", OFFSET(model_file), 
AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, + { "model_proc", "model preproc and postproc", OFFSET(model_proc), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "device", "running on device type", OFFSET(device_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12, FLAGS }, - { "label", "label file path for detection", OFFSET(label_file), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "vpp_format", "specify vpp output format", OFFSET(vpp_format), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "interval", "detect every Nth frame", OFFSET(every_nth_frame), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS}, { "batch_size", "batch size per infer", OFFSET(batch_size), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS}, { "threshold", "threshod to filter output data", OFFSET(threshold), AV_OPT_TYPE_FLOAT, { .dbl = 0.5}, 0, 1, FLAGS}, - { "name", "detection type name", OFFSET(name), AV_OPT_TYPE_STRING, .flags = FLAGS, "detection" }, { NULL } }; -- 2.7.4