From 998fd438172395d82d41b1ff064374d98d2f3376 Mon Sep 17 00:00:00 2001 From: Lin Xie Date: Tue, 26 Mar 2019 14:03:35 +0800 Subject: [PATCH] Enable vaapi-scale for IE inference filter * This patch requires matched IE C API patch update Change-Id: Ic0fa65963a0708f424a4c2dd98260d38f0fc96af --- libavfilter/dnn_backend_intel_ie.c | 11 +- libavfilter/dnn_data.h | 8 +- libavfilter/inference.c | 373 ++++++++++++++++++++++++++++++++---- libavfilter/inference.h | 97 ++++++++-- libavfilter/vf_inference_classify.c | 120 ++++++++---- libavfilter/vf_inference_detect.c | 132 +++++++++---- 6 files changed, 611 insertions(+), 130 deletions(-) diff --git a/libavfilter/dnn_backend_intel_ie.c b/libavfilter/dnn_backend_intel_ie.c index 5742a8e..a600ee6 100644 --- a/libavfilter/dnn_backend_intel_ie.c +++ b/libavfilter/dnn_backend_intel_ie.c @@ -221,7 +221,7 @@ static DNNReturnType get_execute_result_intel_ie(void *model, DNNIOData *result) if (!model || !result) return DNN_ERROR; - result->data = IEGetResultSpace(ie_model->context, result->in_out_idx, &size); + result->data[0] = IEGetResultSpace(ie_model->context, result->in_out_idx, &size); if (!result->data) return DNN_ERROR; @@ -330,6 +330,7 @@ static DNNReturnType set_output_info_intel_ie(void *model, DNNModelInfo *info) static DNNReturnType set_input_intel_ie(void *model, const DNNIOData *input) { + int i; IEData data; DNNIntelIEModel *ie_model = (DNNIntelIEModel *)model; @@ -338,12 +339,12 @@ static DNNReturnType set_input_intel_ie(void *model, const DNNIOData *input) memset(&data, 0, sizeof(IEData)); - data.size = input->size; + for (i = 0; i < NUM_DATA_POINTS; i++) { + data.data[i] = input->data[i]; + data.linesize[i] = input->linesize[i]; + } data.width = input->width; data.height = input->height; - data.widthStride = input->width_stride; - data.heightStride = input->height_stride; - data.buffer = (void *)input->data; data.channelNum = input->channels; data.batchIdx = input->batch_idx; data.precision = get_precision(input->precision); diff --git a/libavfilter/dnn_data.h b/libavfilter/dnn_data.h index 8c70366..0add8ee 100644 --- a/libavfilter/dnn_data.h +++ b/libavfilter/dnn_data.h @@ -20,6 +20,8 @@ #ifndef AVFILTER_DNN_DATA_H #define AVFILTER_DNN_DATA_H +#include + /** * @enum TargetDevice * @brief Describes known device types @@ -113,12 +115,12 @@ typedef struct DNNDevice { * spencial for single 1D: height/height_stride/channels are 1 width_stride=width, output only */ typedef struct DNNIOData { - void *data; // the type depend on the data precision +#define NUM_DATA_POINTS 4 + uint8_t *data[NUM_DATA_POINTS]; + int linesize[NUM_DATA_POINTS]; unsigned int size; // size=width x height x channels,it is for 1D output/input. unit is byte. unsigned int width; unsigned int height; - unsigned int width_stride; // it is for HW memory or padding memory - unsigned int height_stride; unsigned int channels; // the index of the batch when batch size is bigger than 1. default value is zero when batch size is 1. unsigned int batch_idx; diff --git a/libavfilter/inference.c b/libavfilter/inference.c index 20934fc..596c5b4 100644 --- a/libavfilter/inference.c +++ b/libavfilter/inference.c @@ -33,6 +33,19 @@ #include "inference.h" +#if CONFIG_VAAPI +#define VA_CALL(_FUNC) \ + { \ + VAStatus _status = _FUNC; \ + if (_status != VA_STATUS_SUCCESS) \ + { \ + printf(#_FUNC " failed, sts = %d (%s).\n", \ + _status, vaErrorStr(_status)); \ + return AVERROR(EINVAL); \ + } \ + } +#endif + struct InferenceBaseContext { char *infer_type; @@ -49,13 +62,19 @@ struct InferenceBaseContext InferencePreProcess preprocess; }; +static int va_vpp_crop_and_scale(VAAPIVpp *va_vpp, AVFrame *input, Rect *crop_rect, + int scale_w, int scale_h, uint8_t *data[], int stride[]); + +static int va_vpp_scale(VAAPIVpp *va_vpp, AVFrame *input, + int scale_w, int scale_h, uint8_t *data[], int stride[]); + static int fill_dnn_data_from_frame(DNNIOData *data, const AVFrame *frame, int batch_idx, int is_image, int input_idx) { - int channels_nb; + int i, channels_nb; DNNDataFormat dnn_fmt; DNNDataPrecisionType precision; enum AVPixelFormat pix_fmt = frame->format; @@ -68,6 +87,7 @@ static int fill_dnn_data_from_frame(DNNIOData *data, channels_nb = 1; break; case AV_PIX_FMT_BGRA: + case AV_PIX_FMT_BGR0: precision = DNN_DATA_PRECISION_U8; dnn_fmt = DNN_DATA_BGR_PACKED; channels_nb = 4; @@ -77,16 +97,22 @@ static int fill_dnn_data_from_frame(DNNIOData *data, dnn_fmt = DNN_DATA_BGR_PACKED; channels_nb = 3; break; + case AV_PIX_FMT_RGBP: + precision = DNN_DATA_PRECISION_U8; + dnn_fmt = DNN_DATA_RGB_PLANAR; + channels_nb = 3; + break; default: av_log(NULL, AV_LOG_ERROR, "format unsupport!\n"); return AVERROR(EINVAL); }; - data->data = (void *)frame->data[0]; + for (i = 0; i < NUM_DATA_POINTS; i++) { + data->data[i] = frame->data[i]; + data->linesize[i] = frame->linesize[i]; + } data->width = frame->width; data->height = frame->height; - data->width_stride = frame->linesize[0]/channels_nb; - data->height_stride = frame->height; data->channels = channels_nb; data->data_format = dnn_fmt; data->precision = precision; @@ -98,25 +124,26 @@ static int fill_dnn_data_from_frame(DNNIOData *data, return 0; } -static int sw_crop_and_scale(AVFrame *frame, - float x0, float y0, - float x1, float y1, - int out_w, int out_h, +static int sw_crop_and_scale(AVFrame *frame, Rect *crop_rect, + int out_w, int out_h, enum AVPixelFormat out_format, uint8_t *data[], int stride[]) { - int err, bufsize; + int bufsize; + AVFrame *temp; struct SwsContext *sws_ctx; const AVPixFmtDescriptor *desc; int x, y, w, h, hsub, vsub; int max_step[4]; ///< max pixel step for each plane, expressed as a number of bytes enum AVPixelFormat expect_format = out_format; - AVFrame *temp = av_frame_alloc(); - if (!temp) { - err = AVERROR(ENOMEM); - return err; - } + if (!crop_rect) + return AVERROR(EINVAL); + + temp = av_frame_alloc(); + if (!temp) + return AVERROR(ENOMEM); + av_frame_ref(temp, frame); desc = av_pix_fmt_desc_get(temp->format); @@ -126,10 +153,10 @@ static int sw_crop_and_scale(AVFrame *frame, /* cropping */ { - x = lrintf(x0); - y = lrintf(y0); - w = lrintf(x1) - x; - h = lrintf(y1) - y; + x = lrintf(crop_rect->x0); + y = lrintf(crop_rect->y0); + w = lrintf(crop_rect->x1) - x; + h = lrintf(crop_rect->y1) - y; temp->width = w; temp->height = h; @@ -157,8 +184,7 @@ static int sw_crop_and_scale(AVFrame *frame, SWS_BILINEAR, NULL, NULL, NULL); if (!sws_ctx) { av_log(NULL, AV_LOG_ERROR, "Create scaling context failed!\n"); - err = AVERROR(EINVAL); - return err; + return AVERROR(EINVAL); } if (!data[0]) { @@ -250,8 +276,6 @@ int ff_inference_base_create(AVFilterContext *ctx, if (!s) return AVERROR(ENOMEM); - // TODO: handle hw ctx - s->module = ff_get_dnn_module(param->backend_type); if (!s->module) { av_log(ctx, AV_LOG_ERROR, "could not create DNN backend module\n"); @@ -307,9 +331,13 @@ int ff_inference_base_create(AVFilterContext *ctx, vpp = &s->vpp; // vpp init - vpp->swscale = &sws_scale; - vpp->crop_and_scale = &sw_crop_and_scale; - vpp->expect_format = AV_PIX_FMT_BGR24; + vpp->sw_vpp = av_mallocz(sizeof(*vpp->sw_vpp)); + if (!vpp->sw_vpp) + goto fail; + + vpp->expect_format = AV_PIX_FMT_BGR24; + vpp->sw_vpp->scale = &sws_scale; + vpp->sw_vpp->crop_and_scale = &sw_crop_and_scale; *base = s; #undef DNN_ERR_CHECK @@ -326,14 +354,21 @@ int ff_inference_base_free(InferenceBaseContext **base) if (!s) return 0; - if (s->vpp.device == VPP_DEVICE_SW) { - for (int i = 0; i < MAX_VPP_NUM; i++) { - if (s->vpp.frames[i]) - av_frame_free(&s->vpp.frames[i]); - if (s->vpp.scale_contexts[i]) - sws_freeContext(s->vpp.scale_contexts[i]); - } + // VPP clean up + for (int i = 0; i < MAX_VPP_NUM; i++) { + if (s->vpp.frames[i]) + av_frame_free(&s->vpp.frames[i]); + if (s->vpp.sw_vpp->scale_contexts[i]) + sws_freeContext(s->vpp.sw_vpp->scale_contexts[i]); } + av_freep(&s->vpp.sw_vpp); + +#if CONFIG_VAAPI + if (s->vpp.va_vpp) { + va_vpp_device_free(s->vpp.va_vpp); + av_freep(&s->vpp.va_vpp); + } +#endif if (s->module) { s->module->free_model(&s->model); @@ -353,6 +388,11 @@ int ff_inference_base_submit_frame(InferenceBaseContext *base, fill_dnn_data_from_frame(&input, frame, batch_idx, 1, input_idx); base->model->set_input(base->model->model, &input); +#if CONFIG_VAAPI + if (base->vpp.va_vpp) + va_vpp_surface_release(base->vpp.va_vpp); +#endif + return 0; } @@ -373,10 +413,16 @@ int ff_inference_base_filter_frame(InferenceBaseContext *base, AVFrame *in) for (int i = 0; i < info->numbers; i++) { AVFrame *processed_frame; for (int j = 0; j < base->batch_size; j++) { - if (base->preprocess) - base->preprocess(base, i, in, &processed_frame); + if (base->preprocess) { + if (base->preprocess(base, i, in, &processed_frame) < 0) + return AVERROR(EINVAL); + } fill_dnn_data_from_frame(&input, processed_frame, j, 1, i); base->model->set_input(base->model->model, &input); +#if CONFIG_VAAPI + if (base->vpp.va_vpp) + va_vpp_surface_release(base->vpp.va_vpp); +#endif } } @@ -410,7 +456,7 @@ int ff_inference_base_get_infer_result(InferenceBaseContext *base, metadata->layout = info->layout[0]; metadata->precision = info->precision[0]; - metadata->data = data.data; + metadata->data = data.data[0]; metadata->total_bytes = data.size; return 0; @@ -430,3 +476,260 @@ VideoPP* ff_inference_base_get_vpp(InferenceBaseContext *base) { return &base->vpp; } + +/******************************************** + * * + * VAAPI VPP APIs * + * * + *******************************************/ +#if CONFIG_VAAPI +static int ff_vaapi_vpp_colour_standard(enum AVColorSpace av_cs) +{ + switch(av_cs) { +#define CS(av, va) case AVCOL_SPC_ ## av: return VAProcColorStandard ## va; + CS(BT709, BT709); + CS(BT470BG, BT601); + CS(SMPTE170M, SMPTE170M); + CS(SMPTE240M, SMPTE240M); +#undef CS + default: + return VAProcColorStandardNone; + } +} + +int va_vpp_device_create(VAAPIVpp *va_vpp, AVFilterLink *inlink) +{ + AVFilterContext *avctx = inlink->dst; + VADisplay display = NULL; + VAImageFormat *image_list = NULL; + VAStatus vas; + int err, image_count; + AVBufferRef *device_ref = NULL; + AVHWFramesContext *hw_frames_ctx; + + hw_frames_ctx = (AVHWFramesContext *)inlink->hw_frames_ctx->data; + av_assert0(hw_frames_ctx); + + device_ref = av_buffer_ref(hw_frames_ctx->device_ref); + if (!device_ref) { + av_log(avctx, AV_LOG_ERROR, "A device reference create failed.\n"); + return AVERROR(ENOMEM); + } + + va_vpp->hwctx = ((AVHWDeviceContext *)device_ref->data)->hwctx; + va_vpp->hw_frames_ref = inlink->hw_frames_ctx; + + av_buffer_unref(&device_ref); + + display = va_vpp->hwctx->display; + + image_count = vaMaxNumImageFormats(display); + if (image_count <= 0) { + err = AVERROR(EIO); + goto fail; + } + image_list = av_malloc(image_count * sizeof(*image_list)); + if (!image_list) { + err = AVERROR(ENOMEM); + goto fail; + } + vas = vaQueryImageFormats(display, image_list, &image_count); + if (vas != VA_STATUS_SUCCESS) { + err = AVERROR(EIO); + goto fail; + } + + va_vpp->format_list = image_list; + va_vpp->nb_formats = image_count; + va_vpp->va_config = VA_INVALID_ID; + va_vpp->va_context = VA_INVALID_ID; + va_vpp->va_surface = VA_INVALID_ID; + + va_vpp->scale = &va_vpp_scale; + va_vpp->crop_and_scale = &va_vpp_crop_and_scale; + + return VA_STATUS_SUCCESS; +fail: + if (image_list) + av_free(image_list); + return err; +} + +int va_vpp_device_free(VAAPIVpp *va_vpp) +{ + VAStatus vas; + + if (!va_vpp) + return 0; + + if (va_vpp->va_surface != VA_INVALID_ID) { + vaDestroySurfaces(va_vpp->hwctx->display, &va_vpp->va_surface, 1); + if (vas != VA_STATUS_SUCCESS) { + av_log(NULL, AV_LOG_ERROR, "Failed to destroy surface %#x: " + "%d (%s).\n", va_vpp->va_surface, vas, vaErrorStr(vas)); + } + } + + if (va_vpp->va_context != VA_INVALID_ID) { + vaDestroyContext(va_vpp->hwctx->display, va_vpp->va_context); + va_vpp->va_context = VA_INVALID_ID; + } + + if (va_vpp->va_config != VA_INVALID_ID) { + vaDestroyConfig(va_vpp->hwctx->display, va_vpp->va_config); + va_vpp->va_config = VA_INVALID_ID; + } + + av_free(va_vpp->format_list); + + return VA_STATUS_SUCCESS; +} + +int va_vpp_surface_alloc(VAAPIVpp *va_vpp, size_t width, size_t height, const char *format) +{ + int i, rt_format, fourcc; + VASurfaceAttrib surface_attrib; + enum AVPixelFormat av_format; + + if (!va_vpp) + return -1; + + if (format == NULL || strstr(format, "bgrx")) { + fourcc = VA_FOURCC_BGRX; rt_format = VA_RT_FORMAT_RGB32; av_format = AV_PIX_FMT_BGR0; + } else if (strstr(format, "rgbp")) { + fourcc = VA_FOURCC_RGBP; rt_format = VA_RT_FORMAT_RGBP; av_format = AV_PIX_FMT_RGBP; + } else + return -1; + + surface_attrib.type = VASurfaceAttribPixelFormat; + surface_attrib.flags = VA_SURFACE_ATTRIB_SETTABLE; + surface_attrib.value.type = VAGenericValueTypeInteger; + surface_attrib.value.value.i = fourcc; + + VA_CALL(vaCreateConfig(va_vpp->hwctx->display, VAProfileNone, + VAEntrypointVideoProc, 0, 0, &va_vpp->va_config)); + + VA_CALL(vaCreateSurfaces(va_vpp->hwctx->display, rt_format, width, height, + &va_vpp->va_surface, 1, &surface_attrib, 1)); + + VA_CALL(vaCreateContext(va_vpp->hwctx->display, va_vpp->va_config, + width, height, VA_PROGRESSIVE, + &va_vpp->va_surface, 1, &va_vpp->va_context)); + + for (i = 0; i < va_vpp->nb_formats; i++) { + if (va_vpp->format_list[i].fourcc == fourcc) { + va_vpp->va_format_selected = va_vpp->format_list[i]; + break; + } + } + + va_vpp->av_format = av_format; + + return VA_STATUS_SUCCESS; +} + +/* release mapped memory */ +int va_vpp_surface_release(VAAPIVpp *va_vpp) +{ + VA_CALL(vaUnmapBuffer(va_vpp->hwctx->display, va_vpp->va_image.buf)); + VA_CALL(vaDestroyImage(va_vpp->hwctx->display, va_vpp->va_image.image_id)); + + return VA_STATUS_SUCCESS; +} + +/* HW scale and map to system memory */ +static int va_vpp_scale(VAAPIVpp *va_vpp, AVFrame *input, + int scale_w, int scale_h, + uint8_t *data[], int stride[]) +{ + return va_vpp_crop_and_scale(va_vpp, input, NULL, scale_w, scale_h, data, stride); +} + +static int va_vpp_crop_and_scale(VAAPIVpp *va_vpp, + AVFrame *input, Rect *crop_rect, + int scale_w, int scale_h, + uint8_t *data[], int stride[]) +{ + int i; + void *address = NULL; + VAImage *va_image_ptr; + VABufferID params_id; + VASurfaceID input_surface, output_surface; + VAProcPipelineParameterBuffer params; + VARectangle input_region; + + input_surface = (VASurfaceID)(uintptr_t)input->data[3]; + av_log(NULL, AV_LOG_DEBUG, "Using surface %#x for scale input.\n", + input_surface); + + output_surface = va_vpp->va_surface; + + if (crop_rect == NULL) { + input_region = (VARectangle) { + .x = input->crop_left, + .y = input->crop_top, + .width = input->width - + (input->crop_left + input->crop_right), + .height = input->height - + (input->crop_top + input->crop_bottom), + }; + } else { + int _x = lrintf(crop_rect->x0); + int _y = lrintf(crop_rect->y0); + input_region = (VARectangle) { + .x = _x, + .y = _y, + .width = lrintf(crop_rect->x1) - _x, + .height = lrintf(crop_rect->y1) - _y, + }; + } + + memset(¶ms, 0, sizeof(params)); + + params.surface = input_surface; + params.surface_region = &input_region; + params.surface_color_standard = + ff_vaapi_vpp_colour_standard(input->colorspace); + + params.output_region = 0; + params.output_background_color = 0xff000000; + params.output_color_standard = params.surface_color_standard; + + params.pipeline_flags = 0; + params.filter_flags = VA_FILTER_SCALING_HQ; + + VA_CALL(vaBeginPicture(va_vpp->hwctx->display, va_vpp->va_context, output_surface)); + + VA_CALL(vaCreateBuffer(va_vpp->hwctx->display, va_vpp->va_context, + VAProcPipelineParameterBufferType, + sizeof(params), 1, ¶ms, ¶ms_id)); + + VA_CALL(vaRenderPicture(va_vpp->hwctx->display, va_vpp->va_context, + ¶ms_id, 1)); + + VA_CALL(vaEndPicture(va_vpp->hwctx->display, va_vpp->va_context)); + + VA_CALL(vaDestroyBuffer(va_vpp->hwctx->display, params_id)); + + VA_CALL(vaSyncSurface(va_vpp->hwctx->display, output_surface)); + + // map surface to system memory + va_image_ptr = &va_vpp->va_image; + + VA_CALL(vaCreateImage(va_vpp->hwctx->display, &va_vpp->va_format_selected, + scale_w, scale_h, va_image_ptr)); + + VA_CALL(vaGetImage(va_vpp->hwctx->display, output_surface, + 0, 0, scale_w, scale_h, + va_image_ptr->image_id)); + + VA_CALL(vaMapBuffer(va_vpp->hwctx->display, va_image_ptr->buf, &address)); + + for (i = 0; i < va_image_ptr->num_planes; i++) { + data[i] = (uint8_t *)address + va_image_ptr->offsets[i]; + stride[i] = va_image_ptr->pitches[i]; + } + + return VA_STATUS_SUCCESS; +} +#endif diff --git a/libavfilter/inference.h b/libavfilter/inference.h index 33de54b..d0515e2 100644 --- a/libavfilter/inference.h +++ b/libavfilter/inference.h @@ -19,8 +19,17 @@ #ifndef AVFILTER_INFERENCE_H #define AVFILTER_INFERENCE_H +#if CONFIG_VAAPI +#include +#endif + #include "libavutil/common.h" #include "libswscale/swscale.h" +#include "libavutil/hwcontext.h" +#if CONFIG_VAAPI +#include "libavutil/hwcontext_vaapi.h" +#endif + #include "dnn_interface.h" typedef struct InferenceBaseContext InferenceBaseContext; @@ -47,25 +56,71 @@ typedef struct InferenceParam { #define MAX_VPP_NUM DNN_INPUT_OUTPUT_NUM +/* + * Vpp device type detected according to frame format + */ typedef enum { VPP_DEVICE_HW, VPP_DEVICE_SW } VPPDevice; +typedef struct _SwVpp SwVpp; + +typedef struct _VAAPIVpp VAAPIVpp; + +/* + * Generic rectangle structure consists of two diagonal points + */ +typedef struct Rect { + float x0; float y0; float x1; float y1; +} Rect; + +#if CONFIG_VAAPI +struct _VAAPIVpp { + AVVAAPIDeviceContext *hwctx; + AVBufferRef *hw_frames_ref; + VASurfaceID va_surface; + VAConfigID va_config; + VAContextID va_context; + + VAImageFormat *format_list; //!< Surface formats which can be used with this device. + int nb_formats; + + VAImage va_image; + VAImageFormat va_format_selected; + enum AVPixelFormat av_format; + + int (*scale)(VAAPIVpp *va_vpp, AVFrame *input, + int scale_w, int scale_h, + uint8_t *data[], int stride[]); + + int (*crop_and_scale)(VAAPIVpp *va_vpp, AVFrame *input, + Rect *crop_rect, + int scale_w, int scale_h, + uint8_t *data[], int stride[]); +}; +#endif + +struct _SwVpp { + struct SwsContext *scale_contexts[MAX_VPP_NUM]; + + int (*scale)(struct SwsContext *context, + const uint8_t * const srcSlice[], + const int srcStride[], int srcSliceY, + int srcSliceH, uint8_t *const dst[], + const int dstStride[]); + + int (*crop_and_scale)(AVFrame *frame, Rect *crop_rect, + int scale_w, int scale_h, + enum AVPixelFormat scale_format, + uint8_t *dst[], int dstStride[]); +}; + typedef struct VideoPP { - int device; - int expect_format; - void *scale_contexts[MAX_VPP_NUM]; - AVFrame *frames[MAX_VPP_NUM]; //post_process[i] = &age_gender_classify_result_process; } else if (strstr(names[i], "face")) { - VideoPP *vpp = ff_inference_base_get_vpp(s->infer_bases[i]); - - // face reidentification model requires RGB format - vpp->expect_format = AV_PIX_FMT_RGB24; - s->init[i] = &face_identify_init; s->uninit[i] = &face_identify_uninit; s->post_process[i] = &face_identify_result_process; @@ -603,16 +600,27 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) DNNModelInfo *iinfo = ff_inference_base_get_input_info(base); DNNModelInfo *oinfo = ff_inference_base_get_output_info(base); - ret = vpp->crop_and_scale(in, - bbox->x_min * in->width, - bbox->y_min * in->height, - bbox->x_max * in->width, - bbox->y_max * in->height, - iinfo->width[0], - iinfo->height[0], - vpp->expect_format, - tmp->data, - tmp->linesize); + Rect crop_rect = (Rect) { + .x0 = bbox->x_min * in->width, + .y0 = bbox->y_min * in->height, + .x1 = bbox->x_max * in->width, + .y1 = bbox->y_max * in->height, + }; + + if (vpp->device == VPP_DEVICE_SW) { + ret = vpp->sw_vpp->crop_and_scale(in, &crop_rect, + iinfo->width[0], iinfo->height[0], + vpp->expect_format, tmp->data, tmp->linesize); + } else { +#if CONFIG_VAAPI + ret = vpp->va_vpp->crop_and_scale(vpp->va_vpp, in, &crop_rect, + iinfo->width[0], iinfo->height[0], tmp->data, tmp->linesize); +#endif + } + if (ret != 0) { + ret = AVERROR(EINVAL); + goto fail; + } // TODO: support dynamic batch for faces ff_inference_base_submit_frame(base, tmp, 0, 0); @@ -651,41 +659,84 @@ fail: static av_cold int config_input(AVFilterLink *inlink) { - int i, j; - AVFilterContext *ctx = inlink->dst; - InferenceClassifyContext *s = ctx->priv; + int i, ret; + AVFrame *frame; + + AVFilterContext *ctx = inlink->dst; + InferenceClassifyContext *s = ctx->priv; enum AVPixelFormat expect_format = AV_PIX_FMT_BGR24; - const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format); + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format); for (i = 0; i < s->loaded_num; i++) { InferenceBaseContext *base = s->infer_bases[i]; - DNNModelInfo *info = ff_inference_base_get_input_info(base); - VideoPP *vpp = ff_inference_base_get_vpp(base); + DNNModelInfo *info = ff_inference_base_get_input_info(base); + VideoPP *vpp = ff_inference_base_get_vpp(base); + + // right now, no model needs multiple inputs + av_assert0(info->numbers == 1); vpp->device = (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) ? VPP_DEVICE_HW : VPP_DEVICE_SW; // allocate avframes to save preprocessed data - for (j = 0; j < info->numbers; j++) { - int ret; - AVFrame *frame = av_frame_alloc(); - if (!frame) - return AVERROR(ENOMEM); + frame = av_frame_alloc(); + if (!frame) + return AVERROR(ENOMEM); + frame->width = info->width[0]; + frame->height = info->height[0]; + frame->format = expect_format; + vpp->frames[0] = frame; + + if (vpp->device == VPP_DEVICE_SW) { + if (ret = av_frame_get_buffer(frame, 0) < 0) + goto fail; + } else { +#if CONFIG_VAAPI + vpp->va_vpp = av_mallocz(sizeof(*vpp->va_vpp)); + if (!vpp->va_vpp) { + ret = AVERROR(ENOMEM); + goto fail; + } - frame->format = expect_format; - frame->width = info->width[j]; - frame->height = info->height[j]; + ret = va_vpp_device_create(vpp->va_vpp, inlink); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, "Create va vpp device failed\n"); + ret = AVERROR(EINVAL); + goto fail; + } - ret = av_frame_get_buffer(frame, 0); + ret = va_vpp_surface_alloc(vpp->va_vpp, + info->width[0], info->height[0], s->vpp_format); if (ret < 0) { - av_frame_free(&frame); - return ret; + av_log(ctx, AV_LOG_ERROR, "Create va surface failed\n"); + ret = AVERROR(EINVAL); + goto fail; } - vpp->frames[j] = frame; + + frame->format = vpp->va_vpp->av_format; +#endif } } return 0; +fail: + for (i = 0; i < s->loaded_num; i++) { + VideoPP *vpp = ff_inference_base_get_vpp(s->infer_bases[i]); + + frame = vpp->frames[0]; + if (!frame) + continue; + + av_frame_free(&frame); + +#if CONFIG_VAAPI + if (vpp->va_vpp) { + va_vpp_device_free(vpp->va_vpp); + av_freep(&vpp->va_vpp); + } +#endif + } + return ret; } static av_cold int config_output(AVFilterLink *outlink) @@ -698,6 +749,7 @@ static const AVOption inference_classify_options[] = { { "model", "path to model files for network", OFFSET(model_file), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "label", "labels for classify", OFFSET(labels), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "name", "classify type names", OFFSET(names), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, + { "vpp_format", "specify vpp output format", OFFSET(vpp_format), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "device", "running on device type", OFFSET(device_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12, FLAGS }, { "interval", "do infer every Nth frame", OFFSET(every_nth_frame), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS }, { "batch_size", "batch size per infer", OFFSET(batch_size), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS }, diff --git a/libavfilter/vf_inference_detect.c b/libavfilter/vf_inference_detect.c index 624cb47..42a0e02 100644 --- a/libavfilter/vf_inference_detect.c +++ b/libavfilter/vf_inference_detect.c @@ -51,6 +51,7 @@ typedef struct InferenceDetectContext { char *model_file; char *label_file; + char *vpp_format; int backend_type; int device_type; @@ -176,13 +177,21 @@ static int detect_preprocess(InferenceBaseContext *base, int index, AVFrame *in, VideoPP *vpp = ff_inference_base_get_vpp(base); AVFrame *tmp = vpp->frames[index]; - if (!vpp->scale_contexts[index]) { - *out = in; - return 0; - } + if (vpp->device == VPP_DEVICE_SW) { + if (!vpp->sw_vpp->scale_contexts[index]) { + *out = in; + return 0; + } - ret = vpp->swscale(vpp->scale_contexts[index], (const uint8_t * const*)in->data, - in->linesize, 0, in->height, tmp->data, tmp->linesize); + ret = vpp->sw_vpp->scale(vpp->sw_vpp->scale_contexts[index], + (const uint8_t * const*)in->data, + in->linesize, 0, in->height, tmp->data, tmp->linesize); + } else { +#if CONFIG_VAAPI + ret = vpp->va_vpp->scale(vpp->va_vpp, in, + tmp->width, tmp->height, tmp->data, tmp->linesize); +#endif + } *out = tmp; return ret; } @@ -194,7 +203,8 @@ static int query_formats(AVFilterContext *context) AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P, AV_PIX_FMT_GRAY8, - AV_PIX_FMT_BGR24, AV_PIX_FMT_BGRA, AV_PIX_FMT_NONE}; + AV_PIX_FMT_BGR24, AV_PIX_FMT_BGRA, AV_PIX_FMT_VAAPI, + AV_PIX_FMT_NONE}; formats_list = ff_make_format_list(pixel_formats); if (!formats_list) { @@ -207,7 +217,8 @@ static int query_formats(AVFilterContext *context) static int config_input(AVFilterLink *inlink) { - int i; + int i, ret; + AVFrame *frame; AVFilterContext *ctx = inlink->dst; InferenceDetectContext *s = ctx->priv; enum AVPixelFormat expect_format = AV_PIX_FMT_BGR24; @@ -222,51 +233,86 @@ static int config_input(AVFilterLink *inlink) info->is_image[i], info->precision[i], info->layout[i]); } - vpp->device = (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) ? VPP_DEVICE_HW : VPP_DEVICE_SW; + // right now, no model needs multiple inputs + av_assert0(info->numbers == 1); - // TODO: now just handle sw vpp - for (i = 0; i < info->numbers; i++) { - if (expect_format != inlink->format || - info->width[i] != inlink->w || - info->height[i] != inlink->h) - { - int ret; - AVFrame *frame; - - vpp->scale_contexts[i] = sws_getContext( - inlink->w, inlink->h, inlink->format, - info->width[i], info->height[i], expect_format, - SWS_BILINEAR, NULL, NULL, NULL); - - if (!vpp->scale_contexts[i]) { - av_log(ctx, AV_LOG_ERROR, "Impossible to create scale context"); - return AVERROR(EINVAL); - } + vpp->device = (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) ? VPP_DEVICE_HW : VPP_DEVICE_SW; - frame = av_frame_alloc(); - if (!frame) + // allocate frame to save scaled output + frame = av_frame_alloc(); + if (!frame) + return AVERROR(ENOMEM); + frame->width = info->width[0]; + frame->height = info->height[0]; + frame->format = expect_format; + vpp->frames[0] = frame; + + if (vpp->device == VPP_DEVICE_SW) { + int need_scale = expect_format != inlink->format || + info->width[0] != inlink->w || + info->height[0] != inlink->h; + + if (need_scale) { + if (av_frame_get_buffer(frame, 0) < 0) { + av_frame_free(&frame); return AVERROR(ENOMEM); + } - frame->format = expect_format; - frame->width = info->width[i]; - frame->height = info->height[i]; + vpp->sw_vpp->scale_contexts[0] = sws_getContext( + inlink->w, inlink->h, inlink->format, + info->width[0], info->height[0], expect_format, + SWS_BILINEAR, NULL, NULL, NULL); - ret = av_frame_get_buffer(frame, 0); - if (ret < 0) { + if (!vpp->sw_vpp->scale_contexts[0]) { + av_log(ctx, AV_LOG_ERROR, "Impossible to create scale context\n"); av_frame_free(&frame); - return ret; + return AVERROR(EINVAL); } - vpp->frames[i] = frame; } + } else { +#if CONFIG_VAAPI + vpp->va_vpp = av_mallocz(sizeof(*vpp->va_vpp)); + if (!vpp->va_vpp) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = va_vpp_device_create(vpp->va_vpp, inlink); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, "Create va vpp device failed\n"); + ret = AVERROR(EINVAL); + goto fail; + } + + ret = va_vpp_surface_alloc(vpp->va_vpp, + info->width[0], info->height[0], s->vpp_format); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, "Create va surface failed\n"); + ret = AVERROR(EINVAL); + goto fail; + } + + frame->format = vpp->va_vpp->av_format; +#endif } return 0; +fail: + av_frame_free(&frame); +#if CONFIG_VAAPI + if (vpp->va_vpp) { + va_vpp_device_free(vpp->va_vpp); + av_freep(&vpp->va_vpp); + } +#endif + return ret; } static int config_output(AVFilterLink *outlink) { AVFilterContext *ctx = outlink->src; InferenceDetectContext *s = ctx->priv; + VideoPP *vpp = ff_inference_base_get_vpp(s->base); DNNModelInfo *info = ff_inference_base_get_output_info(s->base); @@ -277,7 +323,18 @@ static int config_output(AVFilterLink *outlink) info->is_image[i], info->precision[i], info->layout[i]); } - // TODO: define how to handle model output data +#if CONFIG_VAAPI + if (vpp->device == VPP_DEVICE_HW) { + if (!vpp->va_vpp || !vpp->va_vpp->hw_frames_ref) { + av_log(ctx, AV_LOG_ERROR, "The input must have a hardware frame " + "reference.\n"); + return AVERROR(EINVAL); + } + outlink->hw_frames_ctx = av_buffer_ref(vpp->va_vpp->hw_frames_ref); + if (!outlink->hw_frames_ctx) + return AVERROR(ENOMEM); + } +#endif return 0; } @@ -385,6 +442,7 @@ static const AVOption inference_detect_options[] = { { "model", "path to model file for network", OFFSET(model_file), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "device", "running on device type", OFFSET(device_type), AV_OPT_TYPE_FLAGS, { .i64 = DNN_TARGET_DEVICE_CPU }, 0, 12, FLAGS }, { "label", "label file path for detection", OFFSET(label_file), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, + { "vpp_format", "specify vpp output format", OFFSET(vpp_format), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS }, { "interval", "detect every Nth frame", OFFSET(every_nth_frame), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS}, { "batch_size", "batch size per infer", OFFSET(batch_size), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS}, { "threshold", "threshod to filter output data", OFFSET(threshold), AV_OPT_TYPE_FLOAT, { .dbl = 0.5}, 0, 1, FLAGS}, -- 2.7.4