{
"cells": [
{
"cell_type": "markdown",
"id": "290111d9",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"# Predicting House Prices on Kaggle\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1c33eb92",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-18T19:32:21.587414Z",
"iopub.status.busy": "2023-08-18T19:32:21.586752Z",
"iopub.status.idle": "2023-08-18T19:32:24.821984Z",
"shell.execute_reply": "2023-08-18T19:32:24.820834Z"
},
"origin_pos": 3,
"tab": [
"pytorch"
]
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import pandas as pd\n",
"import torch\n",
"from torch import nn\n",
"from d2l import torch as d2l"
]
},
{
"cell_type": "markdown",
"id": "a51ad534",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"Implement two utility functions"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c5b9dd70",
"metadata": {
"attributes": {
"classes": [],
"id": "",
"n": "2"
},
"execution": {
"iopub.execute_input": "2023-08-18T19:32:24.826201Z",
"iopub.status.busy": "2023-08-18T19:32:24.825720Z",
"iopub.status.idle": "2023-08-18T19:32:24.831209Z",
"shell.execute_reply": "2023-08-18T19:32:24.830384Z"
},
"origin_pos": 7,
"tab": [
"pytorch"
]
},
"outputs": [],
"source": [
"def download(url, folder, sha1_hash=None):\n",
" \"\"\"Download a file to folder and return the local filepath.\"\"\"\n",
"\n",
"def extract(filename, folder):\n",
" \"\"\"Extract a zip/tar file into folder.\"\"\""
]
},
{
"cell_type": "markdown",
"id": "2892cc66",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Read in and process the data\n",
"using `pandas`"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7e9e8f7c",
"metadata": {
"attributes": {
"classes": [],
"id": "",
"n": "31"
},
"execution": {
"iopub.execute_input": "2023-08-18T19:32:24.844705Z",
"iopub.status.busy": "2023-08-18T19:32:24.843955Z",
"iopub.status.idle": "2023-08-18T19:32:25.218067Z",
"shell.execute_reply": "2023-08-18T19:32:25.217232Z"
},
"origin_pos": 11,
"tab": [
"pytorch"
]
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading ../data/kaggle_house_pred_train.csv from http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_train.csv...\n",
"Downloading ../data/kaggle_house_pred_test.csv from http://d2l-data.s3-accelerate.amazonaws.com/kaggle_house_pred_test.csv...\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1460, 81)\n",
"(1459, 80)\n"
]
}
],
"source": [
"class KaggleHouse(d2l.DataModule):\n",
"    \"\"\"Kaggle house-price data kept as pandas DataFrames.\n",
"\n",
"    If no train frame is supplied, the training and test CSV files are\n",
"    fetched with d2l.download and read into raw_train and raw_val.\n",
"    \"\"\"\n",
"    def __init__(self, batch_size, train=None, val=None):\n",
"        super().__init__()\n",
"        # self.train is read right below, so it is presumably set here from\n",
"        # the constructor arguments.\n",
"        self.save_hyperparameters()\n",
"        if self.train is not None:\n",
"            # Frames were handed in by the caller; skip the download.\n",
"            return\n",
"        train_csv = d2l.download(\n",
"            d2l.DATA_URL + 'kaggle_house_pred_train.csv', self.root,\n",
"            sha1_hash='585e9cc93e70b39160e7921475f9bcd7d31219ce')\n",
"        self.raw_train = pd.read_csv(train_csv)\n",
"        test_csv = d2l.download(\n",
"            d2l.DATA_URL + 'kaggle_house_pred_test.csv', self.root,\n",
"            sha1_hash='fa19780a7b011d9b009e8bff8e99922a8ee2eb90')\n",
"        self.raw_val = pd.read_csv(test_csv)\n",
"\n",
"data = KaggleHouse(batch_size=64)\n",
"# Show (rows, columns) of the raw training and test frames, one per line.\n",
"print(data.raw_train.shape, data.raw_val.shape, sep='\\n')"
]
},
{
"cell_type": "markdown",
"id": "68194e07",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Take a look at the first four and final two features\n",
"as well as the label (SalePrice)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "92621a85",
"metadata": {
"attributes": {
"classes": [],
"id": "",
"n": "10"
},
"execution": {
"iopub.execute_input": "2023-08-18T19:32:25.221755Z",
"iopub.status.busy": "2023-08-18T19:32:25.221161Z",
"iopub.status.idle": "2023-08-18T19:32:25.230323Z",
"shell.execute_reply": "2023-08-18T19:32:25.229502Z"
},
"origin_pos": 13,
"tab": [
"pytorch"
]
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Id MSSubClass MSZoning LotFrontage SaleType SaleCondition SalePrice\n",
"0 1 60 RL 65.0 WD Normal 208500\n",
"1 2 20 RL 80.0 WD Normal 181500\n",
"2 3 60 RL 68.0 WD Normal 223500\n",
"3 4 70 RL 60.0 WD Abnorml 140000\n"
]
}
],
"source": [
"# First four rows; columns are picked by position from both ends of the frame.\n",
"print(data.raw_train.head(4).iloc[:, [0, 1, 2, 3, -3, -2, -1]])"
]
},
{
"cell_type": "markdown",
"id": "a7ccf8f1",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Replace all missing values\n",
"by the corresponding feature's mean,\n",
"and *standardize* the data by\n",
"rescaling features to zero mean and unit variance.\n",
"Next we deal with discrete values:\n",
"we replace them by a one-hot encoding."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a9e39c34",
"metadata": {
"attributes": {
"classes": [],
"id": "",
"n": "33"
},
"execution": {
"iopub.execute_input": "2023-08-18T19:32:25.242819Z",
"iopub.status.busy": "2023-08-18T19:32:25.242192Z",
"iopub.status.idle": "2023-08-18T19:32:25.356247Z",
"shell.execute_reply": "2023-08-18T19:32:25.355251Z"
},
"origin_pos": 17,
"tab": [
"pytorch"
]
},
"outputs": [
{
"data": {
"text/plain": [
"(1460, 331)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"@d2l.add_to_class(KaggleHouse)\n",
"def preprocess(self):\n",
"    \"\"\"Build model-ready feature frames from raw_train and raw_val.\n",
"\n",
"    Numeric columns are standardized with statistics computed over the\n",
"    training and test rows together, and missing numeric entries are set\n",
"    to 0, which is the column mean after standardization. The remaining\n",
"    columns are one-hot encoded, with an extra indicator column for\n",
"    missing values.\n",
"\n",
"    Sets self.train (features plus SalePrice) and self.val (features only).\n",
"    \"\"\"\n",
"    label = 'SalePrice'\n",
"    n_train = self.raw_train.shape[0]\n",
"    # Stack training and test rows so both receive the same scaling and the\n",
"    # same set of one-hot columns.\n",
"    features = pd.concat(\n",
"        (self.raw_train.drop(columns=['Id', label]),\n",
"         self.raw_val.drop(columns=['Id'])))\n",
"    # Pick numeric columns by kind instead of by dtype != 'object': text\n",
"    # columns are not guaranteed to have dtype object (pandas also has a\n",
"    # dedicated string dtype), and those must reach get_dummies untouched.\n",
"    numeric_features = features.select_dtypes(include='number').columns\n",
"    numeric = features[numeric_features]\n",
"    # A constant or all-missing column yields NaN here; fillna maps it to 0.\n",
"    features[numeric_features] = (\n",
"        (numeric - numeric.mean()) / numeric.std()).fillna(0)\n",
"    features = pd.get_dummies(features, dummy_na=True)\n",
"    # The first n_train rows are the training rows, in their original order.\n",
"    self.train = features[:n_train].copy()\n",
"    self.train[label] = self.raw_train[label]\n",
"    self.val = features[n_train:].copy()\n",
"\n",
"data.preprocess()\n",
"data.train.shape"
]
},
{
"cell_type": "markdown",
"id": "49669e79",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"With house prices, we tend to care more about\n",
"the relative error $\\frac{y - \\hat{y}}{y}$ than about the absolute error.\n",
"One way to account for this is to\n",
"measure the discrepancy in the logarithm of the price estimates."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "22cee03d",
"metadata": {
"attributes": {
"classes": [],
"id": "",
"n": "60"
},
"execution": {
"iopub.execute_input": "2023-08-18T19:32:25.360088Z",
"iopub.status.busy": "2023-08-18T19:32:25.359480Z",
"iopub.status.idle": "2023-08-18T19:32:25.365132Z",
"shell.execute_reply": "2023-08-18T19:32:25.364342Z"
},
"origin_pos": 19,
"tab": [
"pytorch"
]
},
"outputs": [],
"source": [
"@d2l.add_to_class(KaggleHouse)\n",
"def get_dataloader(self, train):\n",
"    \"\"\"Hand (features, log SalePrice) tensors to get_tensorloader.\n",
"\n",
"    Uses self.train when train is truthy, otherwise self.val. Returns None\n",
"    when the selected frame has no SalePrice column.\n",
"    \"\"\"\n",
"    target = 'SalePrice'\n",
"    frame = self.train if train else self.val\n",
"    if target not in frame:\n",
"        return None\n",
"\n",
"    def as_float32(obj):\n",
"        # Cast the underlying array to float before building the tensor.\n",
"        return torch.tensor(obj.values.astype(float), dtype=torch.float32)\n",
"\n",
"    inputs = as_float32(frame.drop(columns=[target]))\n",
"    # The target is the log of the price, reshaped to a single column.\n",
"    log_prices = torch.log(as_float32(frame[target])).reshape((-1, 1))\n",
"    return self.get_tensorloader((inputs, log_prices), train)"
]
},
{
"cell_type": "markdown",
"id": "c1f670ec",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Cross-validation"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "e6949856",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-18T19:32:25.368517Z",
"iopub.status.busy": "2023-08-18T19:32:25.367949Z",
"iopub.status.idle": "2023-08-18T19:32:25.372985Z",
"shell.execute_reply": "2023-08-18T19:32:25.372067Z"
},
"origin_pos": 21,
"tab": [
"pytorch"
]
},
"outputs": [],
"source": [
"def k_fold_data(data, k):\n",
"    \"\"\"Split data.train into k consecutive folds.\n",
"\n",
"    Returns a list of k KaggleHouse objects. The j-th one receives the index\n",
"    labels in [j * fold_size, (j + 1) * fold_size) as its val frame and\n",
"    data.train without those labels as its train frame, where fold_size is\n",
"    the number of rows in data.train floor-divided by k.\n",
"    \"\"\"\n",
"    fold_size = data.train.shape[0] // k\n",
"\n",
"    def make_fold(j):\n",
"        held_out = range(j * fold_size, (j + 1) * fold_size)\n",
"        return KaggleHouse(data.batch_size,\n",
"                           data.train.drop(index=held_out),\n",
"                           data.train.loc[held_out])\n",
"\n",
"    return [make_fold(j) for j in range(k)]"
]
},
{
"cell_type": "markdown",
"id": "64fe33cc",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"The average validation error is printed, and the model trained on each fold is returned"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c626ec24",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-18T19:32:25.376435Z",
"iopub.status.busy": "2023-08-18T19:32:25.375867Z",
"iopub.status.idle": "2023-08-18T19:32:25.381314Z",
"shell.execute_reply": "2023-08-18T19:32:25.380464Z"
},
"origin_pos": 23,
"tab": [
"pytorch"
]
},
"outputs": [],
"source": [
"def k_fold(trainer, data, k, lr):\n",
"    \"\"\"Fit one d2l.LinearRegression per fold produced by k_fold_data.\n",
"\n",
"    Prints the mean over folds of the last recorded val_loss value and\n",
"    returns the list of fitted models, one per fold.\n",
"    \"\"\"\n",
"    fold_losses = []\n",
"    fitted = []\n",
"    for fold_no, fold in enumerate(k_fold_data(data, k)):\n",
"        net = d2l.LinearRegression(lr)\n",
"        net.board.yscale = 'log'\n",
"        # Board display is switched off for every fold after the first.\n",
"        if fold_no > 0:\n",
"            net.board.display = False\n",
"        trainer.fit(net, fold)\n",
"        last_point = net.board.data['val_loss'][-1]\n",
"        fold_losses.append(float(last_point.y))\n",
"        fitted.append(net)\n",
"    print(f'average validation log mse = {sum(fold_losses)/len(fold_losses)}')\n",
"    return fitted"
]
},
{
"cell_type": "markdown",
"id": "96a99091",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Model Selection"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c86184c4",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-18T19:32:25.384646Z",
"iopub.status.busy": "2023-08-18T19:32:25.384079Z",
"iopub.status.idle": "2023-08-18T19:32:37.095341Z",
"shell.execute_reply": "2023-08-18T19:32:37.094054Z"
},
"origin_pos": 25,
"tab": [
"pytorch"
]
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"average validation log mse = 0.17325432986021042\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
"