{
"cells": [
{
"cell_type": "markdown",
"id": "ac6561ad",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"# Anchor Boxes\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "14c6e4a6",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-18T19:32:40.922407Z",
"iopub.status.busy": "2023-08-18T19:32:40.921572Z",
"iopub.status.idle": "2023-08-18T19:32:43.887912Z",
"shell.execute_reply": "2023-08-18T19:32:43.886600Z"
},
"origin_pos": 2,
"tab": [
"pytorch"
]
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import torch\n",
"from d2l import torch as d2l\n",
"\n",
"torch.set_printoptions(2)"
]
},
{
"cell_type": "markdown",
"id": "3e327544",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"The width and height of the anchor box are $ws\\sqrt{r}$ and $hs/\\sqrt{r}$, respectively.\n",
"In practice, we only consider those combinations\n",
"containing $s_1$ or $r_1$:\n",
"$$(s_1, r_1), (s_1, r_2), \\ldots, (s_1, r_m), (s_2, r_1), (s_3, r_1), \\ldots, (s_n, r_1)$$"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c0e17016",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-18T19:32:43.893598Z",
"iopub.status.busy": "2023-08-18T19:32:43.893199Z",
"iopub.status.idle": "2023-08-18T19:32:43.902717Z",
"shell.execute_reply": "2023-08-18T19:32:43.901834Z"
},
"origin_pos": 5,
"tab": [
"pytorch"
]
},
"outputs": [],
"source": [
"def multibox_prior(data, sizes, ratios):\n",
"    \"\"\"Generate anchor boxes with different shapes centered on each pixel.\n",
"\n",
"    Args:\n",
"        data: feature map of shape (..., height, width); only its last two\n",
"            dimensions and its device are used.\n",
"        sizes: list of anchor scales s (fractions of the image size).\n",
"        ratios: list of width/height aspect ratios r.\n",
"\n",
"    Returns:\n",
"        Tensor of shape (1, height * width * (len(sizes) + len(ratios) - 1), 4)\n",
"        holding (xmin, ymin, xmax, ymax) coordinates normalized to [0, 1]\n",
"        relative to the image width and height.\n",
"    \"\"\"\n",
"    in_height, in_width = data.shape[-2:]\n",
"    device, num_sizes, num_ratios = data.device, len(sizes), len(ratios)\n",
"    # Only combinations containing s1 or r1 are kept: n + m - 1 per pixel\n",
"    boxes_per_pixel = (num_sizes + num_ratios - 1)\n",
"    size_tensor = torch.tensor(sizes, device=device)\n",
"    ratio_tensor = torch.tensor(ratios, device=device)\n",
"    # Offsets move each anchor to the center of its pixel; the step sizes\n",
"    # rescale pixel coordinates to the [0, 1] range\n",
"    offset_h, offset_w = 0.5, 0.5\n",
"    steps_h = 1.0 / in_height\n",
"    steps_w = 1.0 / in_width\n",
"\n",
"    # Normalized (y, x) centers of all pixels, flattened in row-major order\n",
"    center_h = (torch.arange(in_height, device=device) + offset_h) * steps_h\n",
"    center_w = (torch.arange(in_width, device=device) + offset_w) * steps_w\n",
"    shift_y, shift_x = torch.meshgrid(center_h, center_w, indexing='ij')\n",
"    shift_y, shift_x = shift_y.reshape(-1), shift_x.reshape(-1)\n",
"\n",
"    # Widths/heights of the boxes_per_pixel anchor shapes: every size paired\n",
"    # with ratios[0], then sizes[0] paired with the remaining ratios. The\n",
"    # in_height / in_width factor keeps a ratio-1 anchor square in pixels\n",
"    # when the image is rectangular\n",
"    w = torch.cat((size_tensor * torch.sqrt(ratio_tensor[0]),\n",
"                   size_tensor[0] * torch.sqrt(ratio_tensor[1:])))\\\n",
"                   * in_height / in_width\n",
"    h = torch.cat((size_tensor / torch.sqrt(ratio_tensor[0]),\n",
"                   size_tensor[0] / torch.sqrt(ratio_tensor[1:])))\n",
"    # Per-anchor (dx_min, dy_min, dx_max, dy_max) half-offsets, tiled over\n",
"    # all pixels\n",
"    anchor_manipulations = torch.stack((-w, -h, w, h)).T.repeat(\n",
"        in_height * in_width, 1) / 2\n",
"\n",
"    # Each pixel center repeated boxes_per_pixel times as (x, y, x, y)\n",
"    out_grid = torch.stack([shift_x, shift_y, shift_x, shift_y],\n",
"                           dim=1).repeat_interleave(boxes_per_pixel, dim=0)\n",
"    output = out_grid + anchor_manipulations\n",
"    return output.unsqueeze(0)"
]
},
{
"cell_type": "markdown",
"id": "fca5f107",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"The shape of the returned anchor box variable `Y` is (batch size, number of anchor boxes, 4)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0509b5af",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-18T19:32:43.906825Z",
"iopub.status.busy": "2023-08-18T19:32:43.906238Z",
"iopub.status.idle": "2023-08-18T19:32:44.026992Z",
"shell.execute_reply": "2023-08-18T19:32:44.025888Z"
},
"origin_pos": 8,
"tab": [
"pytorch"
]
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"561 728\n"
]
},
{
"data": {
"text/plain": [
"torch.Size([1, 2042040, 4])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"img = d2l.plt.imread('../img/catdog.jpg')\n",
"h, w = img.shape[0], img.shape[1]  # image height and width in pixels\n",
"\n",
"print(h, w)\n",
"# Dummy batch: 1 image, 3 channels; only the spatial dims matter here\n",
"X = torch.rand(1, 3, h, w)\n",
"Y = multibox_prior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])\n",
"Y.shape"
]
},
{
"cell_type": "markdown",
"id": "3b70f7ac",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Access the first anchor box centered on the pixel at (250, 250)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "68fde78e",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-18T19:32:44.031304Z",
"iopub.status.busy": "2023-08-18T19:32:44.030463Z",
"iopub.status.idle": "2023-08-18T19:32:44.039395Z",
"shell.execute_reply": "2023-08-18T19:32:44.038385Z"
},
"origin_pos": 10,
"tab": [
"pytorch"
]
},
"outputs": [
{
"data": {
"text/plain": [
"tensor([0.06, 0.07, 0.63, 0.82])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 5 anchors per pixel: len(sizes) + len(ratios) - 1 = 3 + 3 - 1\n",
"boxes = Y.reshape(h, w, 5, 4)\n",
"boxes[250, 250, 0, :]"
]
},
{
"cell_type": "markdown",
"id": "29c2016a",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Show all the anchor boxes centered on one pixel in the image"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f4e0c959",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-18T19:32:44.055283Z",
"iopub.status.busy": "2023-08-18T19:32:44.054735Z",
"iopub.status.idle": "2023-08-18T19:32:44.372820Z",
"shell.execute_reply": "2023-08-18T19:32:44.371734Z"
},
"origin_pos": 14,
"tab": [
"pytorch"
]
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
"