{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# 边界框与锚框 " ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n" ], "text/plain": [ "

" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "import d2l\n", "from mxnet import image, nd, contrib, np, npx\n", "\n", "d2l.set_figsize()\n", "img = image.imread('catdog.jpg').asnumpy()\n", "d2l.plt.imshow(img)\n", "\n", "npx.set_np()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## 边界框\n", "\n", "一个边界框可由（左上角x，左上角y，右下角x，右下角y）来定义。" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": true }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n" ], "text/plain": [ "

" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "dog_bbox, cat_bbox = [60, 45, 378, 516], [400, 112, 655, 493]\n", "\n", "def bbox_to_rect(bbox, color): \n", " # Convert to matplotlib format: ((upper-left x, upper-left y), width, height).\n", " return d2l.plt.Rectangle(\n", " xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1],\n", " fill=False, edgecolor=color, linewidth=2)\n", "\n", "fig = d2l.plt.imshow(img)\n", "fig.axes.add_patch(bbox_to_rect(dog_bbox, 'blue'))\n", "fig.axes.add_patch(bbox_to_rect(cat_bbox, 'red'));" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## 锚框\n", "\n", "定义一个在一张图里画多个框的函数 " ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def show_bboxes(axes, bboxes, labels=None, colors=None):\n", " def _make_list(obj, default_values=None):\n", " if obj is None:\n", " obj = default_values\n", " elif not isinstance(obj, (list, tuple)):\n", " obj = [obj]\n", " return obj\n", " labels = _make_list(labels)\n", " colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])\n", " for i, bbox in enumerate(bboxes):\n", " color = colors[i % len(colors)]\n", " rect = d2l.bbox_to_rect(bbox.asnumpy(), color)\n", " axes.add_patch(rect)\n", " if labels and len(labels) > i:\n", " text_color = 'k' if color == 'w' else 'w'\n", " axes.text(rect.xy[0], rect.xy[1], labels[i],\n", " va='center', ha='center', fontsize=9, color=text_color,\n", " bbox=dict(facecolor=color, lw=0))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "中心位于 (250, 250) 的锚框" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0.05511677 0.07152405 0.63307005 0.821524 ]\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n" ], "text/plain": [ "

" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "h, w = img.shape[0:2]\n", "X = np.random.uniform(size=(1, 3, h, w)) # Construct input data.\n", "Y = npx.multibox_prior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])\n", "boxes = Y.reshape((h, w, 5, 4))\n", "print(boxes[250, 250, 0, :])\n", "\n", "bbox_scale = np.array((w, h, w, h))\n", "fig = d2l.plt.imshow(img)\n", "show_bboxes(fig.axes, boxes[250, 250, :, :] * bbox_scale,\n", " ['s=0.75, r=1', 's=0.5, r=1', 's=0.25, r=1', 's=0.75, r=2',\n", " 's=0.75, r=0.5'])" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## 标记训练集的锚框" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n" ], "text/plain": [ "

" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "ground_truth = np.array([[0, 0.1, 0.08, 0.52, 0.92],\n", " [1, 0.55, 0.2, 0.9, 0.88]])\n", "anchors = np.array([[0, 0.1, 0.2, 0.3], [0.15, 0.2, 0.4, 0.4],\n", " [0.63, 0.05, 0.88, 0.98], [0.66, 0.45, 0.8, 0.8],\n", " [0.57, 0.3, 0.92, 0.9]])\n", "fig = d2l.plt.imshow(img)\n", "show_bboxes(fig.axes, ground_truth[:, 1:] * bbox_scale, ['dog', 'cat'], 'k')\n", "show_bboxes(fig.axes, anchors * bbox_scale, ['0', '1', '2', '3', '4']);" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "每个锚框都被标记为一个类别或者是背景" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0. 1. 2. 0. 2.]]\n", "[[0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1.]]\n", "[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00\n", " 1.3999999e+00 9.9999990e+00 2.5939689e+00 7.1754227e+00\n", " -1.1999989e+00 2.6881757e-01 1.6823606e+00 -1.5654588e+00\n", " 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00\n", " -5.7142794e-01 -1.0000001e+00 -8.9406973e-07 6.2581623e-01]]\n" ] } ], "source": [ "labels = npx.multibox_target(np.expand_dims(anchors, axis=0),\n", " np.expand_dims(ground_truth, axis=0),\n", " np.zeros((1, 3, 5)))\n", "# assigned labels: (batch_size, #anchors)\n", "print(labels[2])\n", "# masks: (batch_size, 4 x #anchors), 0 for background, 1 for object\n", "print(labels[1])\n", "# offset to bounding boxes: (batch_size, 4 x #anchors)\n", "print(labels[0])" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## 输出预测的边界框" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n" ], "text/plain": [ "

" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "anchors = np.array([[0.1, 0.08, 0.52, 0.92], [0.08, 0.2, 0.56, 0.95],\n", " [0.15, 0.3, 0.62, 0.91], [0.55, 0.2, 0.9, 0.88]])\n", "offset_preds = np.array([0] * anchors.size)\n", "cls_probs = np.array([[0] * 4, # Predicted probability for background\n", " [0.9, 0.8, 0.7, 0.1], # Predicted probability for dog\n", " [0.1, 0.2, 0.3, 0.9]]) # Predicted probability for cat\n", "fig = d2l.plt.imshow(img)\n", "show_bboxes(fig.axes, anchors * bbox_scale,\n", " ['dog=0.9', 'dog=0.8', 'dog=0.7', 'cat=0.9'])" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "非极大值抑制（Non-maximum suppression）:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[[ 0. , 0.9 , 0.10000001, 0.07999998,\n", " 0.52 , 0.92 ],\n", " [ 1. , 0.9 , 0.5500001 , 0.20000002,\n", " 0.9 , 0.88 ],\n", " [-1. , 0.8 , 0.07999998, 0.19999999,\n", " 0.56 , 0.95 ],\n", " [-1. , 0.7 , 0.14999999, 0.3 ,\n", " 0.62 , 0.91 ]]])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output = npx.multibox_detection(\n", " np.expand_dims(cls_probs, axis=0), np.expand_dims(offset_preds, axis=0),\n", " np.expand_dims(anchors, axis=0), nms_threshold=0.5)\n", "output" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "结果可视化" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": false }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n" ], "text/plain": [ "

" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "fig = d2l.plt.imshow(img)\n", "for i in output[0].asnumpy():\n", " if i[0] == -1:\n", " continue\n", " label = ('dog=', 'cat=')[int(i[0])] + str(i[1])\n", " show_bboxes(fig.axes, [np.array(i[2:]) * bbox_scale], label)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## 多尺度目标检测" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n" ], "text/plain": [ "

" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "def display_anchors(fmap_w, fmap_h, s):\n", " fmap = np.zeros((1, 10, fmap_w, fmap_h)) # The values from the first two dimensions will not affect the output.\n", " anchors = npx.multibox_prior(fmap, sizes=s, ratios=[1, 2, 0.5])\n", " bbox_scale = np.array((w, h, w, h))\n", " d2l.show_bboxes(d2l.plt.imshow(img).axes, anchors[0] * bbox_scale)\n", " \n", "display_anchors(fmap_w=4, fmap_h=4, s=[0.15])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n" ], "text/plain": [ "

" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "display_anchors(fmap_w=2, fmap_h=2, s=[0.4])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n" ], "text/plain": [ "

" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "display_anchors(fmap_w=1, fmap_h=1, s=[0.8])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }