{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ff142b6a",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "# Data Preprocessing\n",
    "\n",
    "Create a CSV file below"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9ae201e1",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-18T19:31:20.510380Z",
     "iopub.status.busy": "2023-08-18T19:31:20.509849Z",
     "iopub.status.idle": "2023-08-18T19:31:21.105668Z",
     "shell.execute_reply": "2023-08-18T19:31:21.104596Z"
    },
    "origin_pos": 4,
    "tab": [
     "pytorch"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " NumRooms RoofType Price\n",
      "0 NaN NaN 127500\n",
      "1 2.0 NaN 106000\n",
      "2 4.0 Slate 178100\n",
      "3 NaN NaN 140000\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Write a tiny CSV in which the literal token NA marks a missing value.\n",
    "os.makedirs(os.path.join('..', 'data'), exist_ok=True)\n",
    "data_file = os.path.join('..', 'data', 'house_tiny.csv')\n",
    "with open(data_file, 'w') as f:\n",
    "    f.write('''NumRooms,RoofType,Price\n",
    "NA,NA,127500\n",
    "2,NA,106000\n",
    "4,Slate,178100\n",
    "NA,NA,140000''')\n",
    "\n",
    "# read_csv turns the NA tokens into NaN, as the printed frame shows.\n",
    "data = pd.read_csv(data_file)\n",
    "print(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "686956d8",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "For categorical input fields, \n",
    "we can treat `NaN` as a category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f92e80b6",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-18T19:31:21.109879Z",
     "iopub.status.busy": "2023-08-18T19:31:21.109243Z",
     "iopub.status.idle": "2023-08-18T19:31:21.120081Z",
     "shell.execute_reply": "2023-08-18T19:31:21.119081Z"
    },
    "origin_pos": 6,
    "tab": [
     "pytorch"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " NumRooms RoofType_Slate RoofType_nan\n",
      "0 NaN False True\n",
      "1 2.0 False True\n",
      "2 4.0 True False\n",
      "3 NaN False True\n"
     ]
    }
   ],
   "source": [
    "# The first two columns are the inputs; the third column (Price) is the target.\n",
    "inputs, targets = data.iloc[:, 0:2], data.iloc[:, 2]\n",
    "# dummy_na=True adds an indicator column (RoofType_nan) for missing categories.\n",
    "inputs = pd.get_dummies(inputs, dummy_na=True)\n",
    "print(inputs)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "77db8d46",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "Replace the `NaN` entries with \n",
    "the mean value of the corresponding column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "37e8e900",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-18T19:31:21.123941Z",
     "iopub.status.busy": "2023-08-18T19:31:21.123273Z",
     "iopub.status.idle": "2023-08-18T19:31:21.132513Z",
     "shell.execute_reply": "2023-08-18T19:31:21.131522Z"
    },
    "origin_pos": 8,
    "tab": [
     "pytorch"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " NumRooms RoofType_Slate RoofType_nan\n",
      "0 3.0 False True\n",
      "1 2.0 False True\n",
      "2 4.0 True False\n",
      "3 3.0 False True\n"
     ]
    }
   ],
   "source": [
    "# inputs.mean() yields one mean per column; fillna applies it column by column.\n",
    "inputs = inputs.fillna(inputs.mean())\n",
    "print(inputs)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9f5dbc25",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "Now that all the entries in `inputs` and `targets` are numerical,\n",
    "we can load them into a tensor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d211233b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-18T19:31:21.137043Z",
     "iopub.status.busy": "2023-08-18T19:31:21.136126Z",
     "iopub.status.idle": "2023-08-18T19:31:23.159251Z",
     "shell.execute_reply": "2023-08-18T19:31:23.158224Z"
    },
    "origin_pos": 11,
    "tab": [
     "pytorch"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(tensor([[3., 0., 1.],\n",
       " [2., 0., 1.],\n",
       " [4., 1., 0.],\n",
       " [3., 0., 1.]], dtype=torch.float64),\n",
       " tensor([127500., 106000., 178100., 140000.], dtype=torch.float64))"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "# dtype=float gives float64 arrays; the tensors keep that dtype (see output).\n",
    "X = torch.tensor(inputs.to_numpy(dtype=float))\n",
    "y = torch.tensor(targets.to_numpy(dtype=float))\n",
    "X, y"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Slideshow",
  "language_info": {
   "name": "python"
  },
  "required_libs": [],
  "rise": {
   "autolaunch": true,
   "enable_chalkboard": true,
   "overlay": "\n",
   "scroll": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}