{ "cells": [ { "cell_type": "markdown", "id": "290111d9", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [ "# Predicting House Prices on Kaggle\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "1c33eb92", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:32:21.587414Z", "iopub.status.busy": "2023-08-18T19:32:21.586752Z", "iopub.status.idle": "2023-08-18T19:32:24.821984Z", "shell.execute_reply": "2023-08-18T19:32:24.820834Z" }, "origin_pos": 3, "tab": [ "pytorch" ] }, "outputs": [], "source": [ "%matplotlib inline\n", "import pandas as pd\n", "import torch\n", "from torch import nn\n", "from d2l import torch as d2l" ] }, { "cell_type": "markdown", "id": "a51ad534", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [ "Implement two utility functions" ] }, { "cell_type": "code", "execution_count": 2, "id": "c5b9dd70", "metadata": { "attributes": { "classes": [], "id": "", "n": "2" }, "execution": { "iopub.execute_input": "2023-08-18T19:32:24.826201Z", "iopub.status.busy": "2023-08-18T19:32:24.825720Z", "iopub.status.idle": "2023-08-18T19:32:24.831209Z", "shell.execute_reply": "2023-08-18T19:32:24.830384Z" }, "origin_pos": 7, "tab": [ "pytorch" ] }, "outputs": [], "source": [ "def download(url, folder, sha1_hash=None):\n", " \"\"\"Download a file to folder and return the local filepath.\"\"\"\n", "\n", "def extract(filename, folder):\n", " \"\"\"Extract a zip/tar file into folder.\"\"\"" ] }, { "cell_type": "markdown", "id": "2892cc66", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Read in and process the data\n", "using `pandas`" ] }, { "cell_type": "code", "execution_count": 4, "id": "7e9e8f7c", "metadata": { "attributes": { "classes": [], "id": "", "n": "31" }, "execution": { "iopub.execute_input": "2023-08-18T19:32:24.844705Z", "iopub.status.busy": "2023-08-18T19:32:24.843955Z", "iopub.status.idle": "2023-08-18T19:32:25.218067Z", "shell.execute_reply": "2023-08-18T19:32:25.217232Z" }, "origin_pos": 11, "tab": [ "pytorch" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading ../data/kaggle_house_pred_train.csv from http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_train.csv...\n", "Downloading ../data/kaggle_house_pred_test.csv from http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_test.csv...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "(1460, 81)\n", "(1459, 80)\n" ] } ], "source": [ "class KaggleHouse(d2l.DataModule):\n", " def __init__(self, batch_size, train=None, val=None):\n", " super().__init__()\n", " self.save_hyperparameters()\n", " if self.train is None:\n", " self.raw_train = pd.read_csv(d2l.download(\n", " d2l.DATA_URL + 'kaggle_house_pred_train.csv', self.root,\n", " sha1_hash='585e9cc93e70b39160e7921475f9bcd7d31219ce'))\n", " self.raw_val = pd.read_csv(d2l.download(\n", " d2l.DATA_URL + 'kaggle_house_pred_test.csv', self.root,\n", " sha1_hash='fa19780a7b011d9b009e8bff8e99922a8ee2eb90'))\n", "\n", "data = KaggleHouse(batch_size=64)\n", "print(data.raw_train.shape)\n", "print(data.raw_val.shape)" ] }, { "cell_type": "markdown", "id": "68194e07", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Take a look at the first four and final two features\n", "as well as the label (SalePrice)" ] }, { "cell_type": "code", "execution_count": 5, "id": "92621a85", "metadata": { "attributes": { "classes": [], "id": "", "n": "10" }, "execution": { "iopub.execute_input": "2023-08-18T19:32:25.221755Z", "iopub.status.busy": "2023-08-18T19:32:25.221161Z", "iopub.status.idle": "2023-08-18T19:32:25.230323Z", "shell.execute_reply": "2023-08-18T19:32:25.229502Z" }, "origin_pos": 13, "tab": [ "pytorch" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Id MSSubClass MSZoning LotFrontage SaleType SaleCondition SalePrice\n", "0 1 60 RL 65.0 WD Normal 208500\n", "1 2 20 RL 80.0 WD Normal 181500\n", "2 3 60 RL 68.0 WD Normal 223500\n", "3 4 70 RL 60.0 WD Abnorml 140000\n" ] } ], "source": [ "print(data.raw_train.iloc[:4, [0, 1, 2, 3, -3, -2, -1]])" ] }, { "cell_type": "markdown", "id": "a7ccf8f1", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Replacing all missing values\n", "by the corresponding feature's mean.\n", "*standardize* the data by\n", "rescaling features to zero mean and unit variance\n", "Next we deal with discrete values.\n", "We replace them by a one-hot encoding" ] }, { "cell_type": "code", "execution_count": 7, "id": "a9e39c34", "metadata": { "attributes": { "classes": [], "id": "", "n": "33" }, "execution": { "iopub.execute_input": "2023-08-18T19:32:25.242819Z", "iopub.status.busy": "2023-08-18T19:32:25.242192Z", "iopub.status.idle": "2023-08-18T19:32:25.356247Z", "shell.execute_reply": "2023-08-18T19:32:25.355251Z" }, "origin_pos": 17, "tab": [ "pytorch" ] }, "outputs": [ { "data": { "text/plain": [ "(1460, 331)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "@d2l.add_to_class(KaggleHouse)\n", "def preprocess(self):\n", " label = 'SalePrice'\n", " features = pd.concat(\n", " (self.raw_train.drop(columns=['Id', label]),\n", " self.raw_val.drop(columns=['Id'])))\n", " numeric_features = features.dtypes[features.dtypes!='object'].index\n", " features[numeric_features] = features[numeric_features].apply(\n", " lambda x: (x - x.mean()) / (x.std()))\n", " features[numeric_features] = features[numeric_features].fillna(0)\n", " features = pd.get_dummies(features, dummy_na=True)\n", " self.train = features[:self.raw_train.shape[0]].copy()\n", " self.train[label] = self.raw_train[label]\n", " self.val = features[self.raw_train.shape[0]:].copy()\n", "\n", "data.preprocess()\n", "data.train.shape" ] }, { "cell_type": "markdown", "id": "49669e79", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "We tend to care more about\n", "the relative error $\\frac{y - \\hat{y}}{y}$\n", "One way to address this problem is to\n", "measure the discrepancy in the logarithm of the price estimates" ] }, { "cell_type": "code", "execution_count": 8, "id": "22cee03d", "metadata": { "attributes": { "classes": [], "id": "", "n": "60" }, "execution": { "iopub.execute_input": "2023-08-18T19:32:25.360088Z", "iopub.status.busy": "2023-08-18T19:32:25.359480Z", "iopub.status.idle": "2023-08-18T19:32:25.365132Z", "shell.execute_reply": "2023-08-18T19:32:25.364342Z" }, "origin_pos": 19, "tab": [ "pytorch" ] }, "outputs": [], "source": [ "@d2l.add_to_class(KaggleHouse)\n", "def get_dataloader(self, train):\n", " label = 'SalePrice'\n", " data = self.train if train else self.val\n", " if label not in data: return\n", " get_tensor = lambda x: torch.tensor(x.values.astype(float),\n", " dtype=torch.float32)\n", " tensors = (get_tensor(data.drop(columns=[label])),\n", " torch.log(get_tensor(data[label])).reshape((-1, 1)))\n", " return self.get_tensorloader(tensors, train)" ] }, { "cell_type": "markdown", "id": "c1f670ec", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Cross-validation" ] }, { "cell_type": "code", "execution_count": 9, "id": "e6949856", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:32:25.368517Z", "iopub.status.busy": "2023-08-18T19:32:25.367949Z", "iopub.status.idle": "2023-08-18T19:32:25.372985Z", "shell.execute_reply": "2023-08-18T19:32:25.372067Z" }, "origin_pos": 21, "tab": [ "pytorch" ] }, "outputs": [], "source": [ "def k_fold_data(data, k):\n", " rets = []\n", " fold_size = data.train.shape[0] // k\n", " for j in range(k):\n", " idx = range(j * fold_size, (j+1) * fold_size)\n", " rets.append(KaggleHouse(data.batch_size, data.train.drop(index=idx),\n", " data.train.loc[idx]))\n", " return rets" ] }, { "cell_type": "markdown", "id": "64fe33cc", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "The average validation error is returned" ] }, { "cell_type": "code", "execution_count": 10, "id": "c626ec24", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:32:25.376435Z", "iopub.status.busy": "2023-08-18T19:32:25.375867Z", "iopub.status.idle": "2023-08-18T19:32:25.381314Z", "shell.execute_reply": "2023-08-18T19:32:25.380464Z" }, "origin_pos": 23, "tab": [ "pytorch" ] }, "outputs": [], "source": [ "def k_fold(trainer, data, k, lr):\n", " val_loss, models = [], []\n", " for i, data_fold in enumerate(k_fold_data(data, k)):\n", " model = d2l.LinearRegression(lr)\n", " model.board.yscale='log'\n", " if i != 0: model.board.display = False\n", " trainer.fit(model, data_fold)\n", " val_loss.append(float(model.board.data['val_loss'][-1].y))\n", " models.append(model)\n", " print(f'average validation log mse = {sum(val_loss)/len(val_loss)}')\n", " return models" ] }, { "cell_type": "markdown", "id": "96a99091", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Model Selection" ] }, { "cell_type": "code", "execution_count": 11, "id": "c86184c4", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:32:25.384646Z", "iopub.status.busy": "2023-08-18T19:32:25.384079Z", "iopub.status.idle": "2023-08-18T19:32:37.095341Z", "shell.execute_reply": "2023-08-18T19:32:37.094054Z" }, "origin_pos": 25, "tab": [ "pytorch" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "average validation log mse = 0.17325432986021042\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", " \n", " 2023-08-18T19:32:36.970536\n", " image/svg+xml\n", " \n", " \n", " Matplotlib v3.7.2, https://matplotlib.org/\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trainer = d2l.Trainer(max_epochs=10)\n", "models = k_fold(trainer, data, k=5, lr=0.01)" ] }, { "cell_type": "markdown", "id": "64f028c8", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Submitting Predictions on Kaggle" ] }, { "cell_type": "code", "execution_count": 12, "id": "f4a3bcde", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:32:37.100208Z", "iopub.status.busy": "2023-08-18T19:32:37.099453Z", "iopub.status.idle": "2023-08-18T19:32:37.266811Z", "shell.execute_reply": "2023-08-18T19:32:37.265844Z" }, "origin_pos": 27, "tab": [ "pytorch" ] }, "outputs": [], "source": [ "preds = [model(torch.tensor(data.val.values.astype(float), dtype=torch.float32))\n", " for model in models]\n", "ensemble_preds = torch.exp(torch.cat(preds, 1)).mean(1)\n", "submission = pd.DataFrame({'Id':data.raw_val.Id,\n", " 'SalePrice':ensemble_preds.detach().numpy()})\n", "submission.to_csv('submission.csv', index=False)" ] } ], "metadata": { "celltoolbar": "Slideshow", "language_info": { "name": "python" }, "required_libs": [], "rise": { "autolaunch": true, "enable_chalkboard": true, "overlay": "
", "scroll": true } }, "nbformat": 4, "nbformat_minor": 5 }