{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Listing price regression with XGBoost (Los Angeles, 2022)\n",
    "\n",
    "Loads `LosAngeles_2022.csv`, cleans and encodes the features, fits an `XGBRegressor` on the log-transformed price, and pickles the fitted scaler and model under `./pickles/`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import warnings\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.preprocessing import RobustScaler\n",
    "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "from xgboost import XGBRegressor\n",
    "\n",
    "warnings.filterwarnings(\"ignore\", category=FutureWarning)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the raw listings and drop the columns that are not used as features.\n",
    "Combined_data = pd.read_csv('LosAngeles_2022.csv').drop(\n",
    "    columns=['host_id', 'id', 'host_name', 'name',\n",
    "             'last_review', 'neighbourhood', 'license', 'number_of_reviews_ltm'])\n",
    "\n",
    "# fill NAs\n",
    "Combined_data['reviews_per_month'] = Combined_data['reviews_per_month'].fillna(0)\n",
    "Combined_data['neighbourhood_group'] = Combined_data['neighbourhood_group'].fillna('unknown')\n",
    "\n",
    "print(Combined_data.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean and transform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove price outliers: keep rows with 3 < log1p(price) < 8.\n",
    "# .copy() makes the filtered frame independent of the original, so the\n",
    "# column assignments below do not raise SettingWithCopyWarning.\n",
    "log_price = np.log1p(Combined_data['price'])\n",
    "Combined_data = Combined_data[(log_price > 3) & (log_price < 8)].copy()\n",
    "\n",
    "# log transformation of the target and of minimum_nights\n",
    "Combined_data['price'] = np.log1p(Combined_data['price'])\n",
    "Combined_data['minimum_nights'] = np.log1p(Combined_data['minimum_nights'])\n",
    "\n",
    "# reviews_per_month values of 17.5 or more are treated as outliers and set to 0.\n",
    "# NOTE: those rows therefore also get no_reviews == 1 below.\n",
    "Combined_data['reviews_per_month'] = Combined_data['reviews_per_month'].where(\n",
    "    Combined_data['reviews_per_month'] < 17.5, 0)\n",
    "\n",
    "# segment numeric variables into 0/1 indicator columns\n",
    "Combined_data['all_year_avail'] = (Combined_data['availability_365'] > 353).astype(int)\n",
    "Combined_data['low_avail'] = (Combined_data['availability_365'] < 12).astype(int)\n",
    "Combined_data['no_reviews'] = (Combined_data['reviews_per_month'] == 0).astype(int)\n",
    "\n",
    "print(np.unique(Combined_data['room_type']))\n",
    "print(Combined_data.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Encode features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# one hot encode categorical variables\n",
    "categorical_features = Combined_data.select_dtypes(include=['object'])\n",
    "print(categorical_features.columns)\n",
    "print(categorical_features.shape)\n",
    "categorical_features_one_hot = pd.get_dummies(categorical_features)\n",
    "\n",
    "# select numerical variables\n",
    "numerical_features = Combined_data.select_dtypes(exclude=['object'])\n",
    "print(numerical_features.columns)\n",
    "print(numerical_features.shape)\n",
    "\n",
    "# target is the log-transformed price; everything else is a feature\n",
    "y = numerical_features.price\n",
    "numerical_features = numerical_features.drop(['price'], axis=1)\n",
    "\n",
    "X = np.concatenate((numerical_features, categorical_features_one_hot), axis=1)  # no column names\n",
    "X_df = pd.concat([numerical_features, categorical_features_one_hot], axis=1)  # with column names\n",
    "\n",
    "# sanity checks: both views of the features agree and line up with the target\n",
    "assert X.shape == X_df.shape\n",
    "assert X.shape[0] == len(y)\n",
    "\n",
    "print(X_df.shape)\n",
    "print(X_df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show a preview of both feature frames (a bare expression only renders the last one).\n",
    "display(categorical_features_one_hot.head(), numerical_features.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train / test split and scaling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "print('Dimensions of the training feature matrix: {}'.format(X_train.shape))\n",
    "print('Dimensions of the training target vector: {}'.format(y_train.shape))\n",
    "print('Dimensions of the test feature matrix: {}'.format(X_test.shape))\n",
    "print('Dimensions of the test target vector: {}'.format(y_test.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the scaler on the training split only, then apply it to the test split.\n",
    "scaler = RobustScaler()\n",
    "X_train = scaler.fit_transform(X_train)\n",
    "X_test = scaler.transform(X_test)\n",
    "X_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output directory first so this works on a fresh checkout.\n",
    "os.makedirs('./pickles', exist_ok=True)\n",
    "with open('./pickles/scaler', 'wb') as file:\n",
    "    pickle.dump(scaler, file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5-fold cross validation; the reported error is the mean squared error\n",
    "n_folds = 5\n",
    "\n",
    "\n",
    "def rmse_cv(model, X_train=X_train):\n",
    "    \"\"\"Cross-validate ``model`` on the training data.\n",
    "\n",
    "    Despite the name, the scores are negated mean squared errors\n",
    "    (scoring='neg_mean_squared_error'), one per fold; negate them to get\n",
    "    the MSE. The targets are always the notebook-level ``y_train``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model : estimator passed to ``cross_val_score``.\n",
    "    X_train : feature matrix; defaults to the ``X_train`` that exists when\n",
    "        this cell is run.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    Array with one negated MSE value per fold.\n",
    "    \"\"\"\n",
    "    # Pass the KFold object itself. KFold(...).get_n_splits(...) is just the\n",
    "    # integer number of folds, which would drop shuffle and random_state.\n",
    "    kf = KFold(n_splits=n_folds, shuffle=True, random_state=2022)\n",
    "    return cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=kf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `early_stopping` is not an XGBRegressor parameter (the real one is\n",
    "# `early_stopping_rounds`, which also requires an eval_set), so it is not\n",
    "# passed here.\n",
    "xbgreg_best = XGBRegressor(n_estimators=1000, max_depth=9, min_child_weight=5)\n",
    "\n",
    "# rmse_cv returns negated MSE per fold; flip the sign to get the MSE\n",
    "xbgreg_CV_best = -rmse_cv(xbgreg_best)\n",
    "\n",
    "xbgreg_best.fit(X_train, y_train)\n",
    "y_train_xgbreg = xbgreg_best.predict(X_train)\n",
    "y_test_xgbreg = xbgreg_best.predict(X_test)\n",
    "\n",
    "# all errors are mean squared errors on the log1p(price) scale\n",
    "xgb_best_results = pd.DataFrame({'algorithm': ['XGBRegressor'],\n",
    "                                 'CV error': [xbgreg_CV_best.mean()],\n",
    "                                 'CV std': [xbgreg_CV_best.std()],\n",
    "                                 'training error': [mean_squared_error(y_train, y_train_xgbreg)],\n",
    "                                 'test error': [mean_squared_error(y_test, y_test_xgbreg)],\n",
    "                                 'training_r2_score': [r2_score(y_train, y_train_xgbreg)],\n",
    "                                 'test_r2_score': [r2_score(y_test, y_test_xgbreg)]})\n",
    "xgb_best_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output directory first so this works on a fresh checkout.\n",
    "os.makedirs('./pickles', exist_ok=True)\n",
    "with open('./pickles/model', 'wb') as file:\n",
    "    pickle.dump(xbgreg_best, file)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.4 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}