{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# 边界框与锚框 "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"%matplotlib inline\n",
"import d2l\n",
"from mxnet import image, nd, contrib, np, npx\n",
"\n",
"d2l.set_figsize()\n",
"img = image.imread('catdog.jpg').asnumpy()\n",
"d2l.plt.imshow(img)\n",
"\n",
"npx.set_np()"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## 边界框\n",
"\n",
"一个边界框可由(左上角x,左上角y,右下角x,右下角y)来定义。"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"dog_bbox, cat_bbox = [60, 45, 378, 516], [400, 112, 655, 493]\n",
"\n",
"def bbox_to_rect(bbox, color): \n",
" # Convert to matplotlib format: ((upper-left x, upper-left y), width, height).\n",
" return d2l.plt.Rectangle(\n",
" xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1],\n",
" fill=False, edgecolor=color, linewidth=2)\n",
"\n",
"fig = d2l.plt.imshow(img)\n",
"fig.axes.add_patch(bbox_to_rect(dog_bbox, 'blue'))\n",
"fig.axes.add_patch(bbox_to_rect(cat_bbox, 'red'));"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## 锚框\n",
"\n",
"定义一个在一张图里画多个框的函数 "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def show_bboxes(axes, bboxes, labels=None, colors=None):\n",
" def _make_list(obj, default_values=None):\n",
" if obj is None:\n",
" obj = default_values\n",
" elif not isinstance(obj, (list, tuple)):\n",
" obj = [obj]\n",
" return obj\n",
" labels = _make_list(labels)\n",
" colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])\n",
" for i, bbox in enumerate(bboxes):\n",
" color = colors[i % len(colors)]\n",
" rect = d2l.bbox_to_rect(bbox.asnumpy(), color)\n",
" axes.add_patch(rect)\n",
" if labels and len(labels) > i:\n",
" text_color = 'k' if color == 'w' else 'w'\n",
" axes.text(rect.xy[0], rect.xy[1], labels[i],\n",
" va='center', ha='center', fontsize=9, color=text_color,\n",
" bbox=dict(facecolor=color, lw=0))"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"中心位于 (250, 250) 的锚框"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0.05511677 0.07152405 0.63307005 0.821524 ]\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"h, w = img.shape[0:2]\n",
"X = np.random.uniform(size=(1, 3, h, w)) # Construct input data.\n",
"Y = npx.multibox_prior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])\n",
"boxes = Y.reshape((h, w, 5, 4))\n",
"print(boxes[250, 250, 0, :])\n",
"\n",
"bbox_scale = np.array((w, h, w, h))\n",
"fig = d2l.plt.imshow(img)\n",
"show_bboxes(fig.axes, boxes[250, 250, :, :] * bbox_scale,\n",
" ['s=0.75, r=1', 's=0.5, r=1', 's=0.25, r=1', 's=0.75, r=2',\n",
" 's=0.75, r=0.5'])"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## 标记训练集的锚框"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"ground_truth = np.array([[0, 0.1, 0.08, 0.52, 0.92],\n",
" [1, 0.55, 0.2, 0.9, 0.88]])\n",
"anchors = np.array([[0, 0.1, 0.2, 0.3], [0.15, 0.2, 0.4, 0.4],\n",
" [0.63, 0.05, 0.88, 0.98], [0.66, 0.45, 0.8, 0.8],\n",
" [0.57, 0.3, 0.92, 0.9]])\n",
"fig = d2l.plt.imshow(img)\n",
"show_bboxes(fig.axes, ground_truth[:, 1:] * bbox_scale, ['dog', 'cat'], 'k')\n",
"show_bboxes(fig.axes, anchors * bbox_scale, ['0', '1', '2', '3', '4']);"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"每个锚框都被标记为一个类别或者是背景"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0. 1. 2. 0. 2.]]\n",
"[[0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1.]]\n",
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00\n",
" 1.3999999e+00 9.9999990e+00 2.5939689e+00 7.1754227e+00\n",
" -1.1999989e+00 2.6881757e-01 1.6823606e+00 -1.5654588e+00\n",
" 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00\n",
" -5.7142794e-01 -1.0000001e+00 -8.9406973e-07 6.2581623e-01]]\n"
]
}
],
"source": [
"labels = npx.multibox_target(np.expand_dims(anchors, axis=0),\n",
" np.expand_dims(ground_truth, axis=0),\n",
" np.zeros((1, 3, 5)))\n",
"# assigned labels: (batch_size, #anchors)\n",
"print(labels[2])\n",
"# masks: (batch_size, 4 x #anchors), 0 for background, 1 for object\n",
"print(labels[1])\n",
"# offset to bounding boxes: (batch_size, 4 x #anchors)\n",
"print(labels[0])"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## 输出预测的边界框"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"anchors = np.array([[0.1, 0.08, 0.52, 0.92], [0.08, 0.2, 0.56, 0.95],\n",
" [0.15, 0.3, 0.62, 0.91], [0.55, 0.2, 0.9, 0.88]])\n",
"offset_preds = np.array([0] * anchors.size)\n",
"cls_probs = np.array([[0] * 4, # Predicted probability for background\n",
" [0.9, 0.8, 0.7, 0.1], # Predicted probability for dog\n",
" [0.1, 0.2, 0.3, 0.9]]) # Predicted probability for cat\n",
"fig = d2l.plt.imshow(img)\n",
"show_bboxes(fig.axes, anchors * bbox_scale,\n",
" ['dog=0.9', 'dog=0.8', 'dog=0.7', 'cat=0.9'])"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"非极大值抑制(Non-maximum suppression):"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[[ 0. , 0.9 , 0.10000001, 0.07999998,\n",
" 0.52 , 0.92 ],\n",
" [ 1. , 0.9 , 0.5500001 , 0.20000002,\n",
" 0.9 , 0.88 ],\n",
" [-1. , 0.8 , 0.07999998, 0.19999999,\n",
" 0.56 , 0.95 ],\n",
" [-1. , 0.7 , 0.14999999, 0.3 ,\n",
" 0.62 , 0.91 ]]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"output = npx.multibox_detection(\n",
" np.expand_dims(cls_probs, axis=0), np.expand_dims(offset_preds, axis=0),\n",
" np.expand_dims(anchors, axis=0), nms_threshold=0.5)\n",
"output"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"结果可视化"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"fig = d2l.plt.imshow(img)\n",
"for i in output[0].asnumpy():\n",
" if i[0] == -1:\n",
" continue\n",
" label = ('dog=', 'cat=')[int(i[0])] + str(i[1])\n",
" show_bboxes(fig.axes, [np.array(i[2:]) * bbox_scale], label)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## 多尺度目标检测"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"def display_anchors(fmap_w, fmap_h, s):\n",
" fmap = np.zeros((1, 10, fmap_w, fmap_h)) # The values from the first two dimensions will not affect the output.\n",
" anchors = npx.multibox_prior(fmap, sizes=s, ratios=[1, 2, 0.5])\n",
" bbox_scale = np.array((w, h, w, h))\n",
" d2l.show_bboxes(d2l.plt.imshow(img).axes, anchors[0] * bbox_scale)\n",
" \n",
"display_anchors(fmap_w=4, fmap_h=4, s=[0.15])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"display_anchors(fmap_w=2, fmap_h=2, s=[0.4])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"display_anchors(fmap_w=1, fmap_h=1, s=[0.8])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"celltoolbar": "Slideshow",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}