{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import warnings\n",
    "\n",
    "import numpy as np \n",
    "import pandas as pd \n",
    "from sklearn.preprocessing import RobustScaler\n",
    "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "from xgboost import XGBRegressor\n",
    "\n",
    "warnings.filterwarnings(\"ignore\", category=FutureWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the listings CSV and drop the columns that are not used as features.\n",
    "Combined_data = pd.read_csv('LosAngeles_2022.csv')\n",
    "# Combined_data['last_review'] = pd.to_datetime(Combined_data['last_review'], infer_datetime_format=True) \n",
    "Combined_data = Combined_data.drop(columns=['host_id', 'id', 'host_name', 'name',\n",
    "                                            'last_review', 'neighbourhood', 'license',\n",
    "                                            'number_of_reviews_ltm'])\n",
    "\n",
    "# fill NAs\n",
    "Combined_data['reviews_per_month'] = Combined_data['reviews_per_month'].fillna(0)\n",
    "Combined_data['neighbourhood_group'] = Combined_data['neighbourhood_group'].fillna('unknown')\n",
    "\n",
    "# remove outliers and log transformation\n",
    "# Keep only rows with 3 < log1p(price) < 8. The explicit .copy() makes the\n",
    "# filtered frame independent of the original, so the column assignments below\n",
    "# do not raise SettingWithCopyWarning.\n",
    "log_price = np.log1p(Combined_data['price'])\n",
    "Combined_data = Combined_data[(log_price > 3) & (log_price < 8)].copy()\n",
    "Combined_data['price'] = np.log1p(Combined_data['price'])\n",
    "# reviews_per_month values that are not below 17.5 are reset to 0, which means\n",
    "# those rows are also flagged by `no_reviews` further down.\n",
    "Combined_data['reviews_per_month'] = Combined_data['reviews_per_month'].where(\n",
    "    Combined_data['reviews_per_month'] < 17.5, 0)\n",
    "Combined_data['minimum_nights'] = np.log1p(Combined_data['minimum_nights'])\n",
    "\n",
    "# segment numeric variable (0/1 indicator columns)\n",
    "Combined_data['all_year_avail'] = 1*(Combined_data['availability_365']>353)\n",
    "Combined_data['low_avail'] = 1*(Combined_data['availability_365']< 12)\n",
    "Combined_data['no_reviews'] = 1*(Combined_data['reviews_per_month']==0)\n",
    "\n",
    "# Combined_data['room_type'] = Combined_data['room_type'].apply(lambda x: re.sub(r'[^\\w\\s]', ' ', x))\n",
    "# Combined_data['room_type'] = (Combined_data['room_type']).str.replace(' ', '_')\n",
    "print(np.unique(Combined_data['room_type']))\n",
    "\n",
    "print(Combined_data.shape)\n",
    "\n",
    "# one hot encode categorical variables (every object-dtype column)\n",
    "categorical_features = Combined_data.select_dtypes(include=['object'])\n",
    "print(categorical_features.columns)\n",
    "print(categorical_features.shape)\n",
    "categorical_features_one_hot = pd.get_dummies(categorical_features)\n",
    "\n",
    "# select numerical variables (every non-object column, still including the target)\n",
    "numerical_features =  Combined_data.select_dtypes(exclude=['object'])\n",
    "print(numerical_features.columns)\n",
    "print(numerical_features.shape)\n",
    "\n",
    "# The target is the log1p-transformed price; remove it from the feature table.\n",
    "y = numerical_features['price']\n",
    "numerical_features = numerical_features.drop(columns=['price'])\n",
    "\n",
    "X = np.concatenate((numerical_features, categorical_features_one_hot), axis=1) # no column names\n",
    "X_df = pd.concat([numerical_features, categorical_features_one_hot], axis=1) # with column names\n",
    "\n",
    "print(X_df.shape)\n",
    "print(X_df.columns)\n",
    "# Processed_data = pd.concat([X_df, y], axis = 1)\n",
    "# Processed_data.to_csv('Airbnb_LA_Processed.dat')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only the last bare expression of a cell is rendered, so display() is used\n",
    "# to show both feature tables from this one cell.\n",
    "display(categorical_features_one_hot, numerical_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 20% of the rows as a test set; the fixed random_state keeps the\n",
    "# split the same on every run.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42\n",
    ")\n",
    "\n",
    "# Report the shape of each of the four pieces.\n",
    "for message, piece in (\n",
    "    ('Dimensions of the training feature matrix: {}', X_train),\n",
    "    ('Dimensions of the training target vector: {}', y_train),\n",
    "    ('Dimensions of the test feature matrix: {}', X_test),\n",
    "    ('Dimensions of the test target vector: {}', y_test),\n",
    "):\n",
    "    print(message.format(piece.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the scaler on the training split only, then apply that same fitted\n",
    "# transformation to both the training and the test features.\n",
    "scaler = RobustScaler().fit(X_train)\n",
    "X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)\n",
    "X_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the fitted scaler to disk.\n",
    "# open() does not create missing parent directories, so make sure the\n",
    "# output folder exists before writing.\n",
    "os.makedirs('./pickles', exist_ok=True)\n",
    "with open('./pickles/scaler', 'wb') as file:\n",
    "    pickle.dump(scaler, file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# aim to report RMSE metric over 5-fold cross validation\n",
    "n_folds = 5\n",
    "\n",
    "\n",
    "def rmse_cv(model, X_train = X_train):\n",
    "    \"\"\"Cross-validate `model` on the training data with shuffled K-fold splits.\n",
    "\n",
    "    Despite the name, the returned scores are negative mean squared errors\n",
    "    (scoring='neg_mean_squared_error'), one per fold. Negate them to get the\n",
    "    MSE, and take the square root of that for an RMSE.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model : estimator passed to cross_val_score.\n",
    "    X_train : training feature matrix. Defaults to the global `X_train` as it\n",
    "        was when this cell was executed.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The array of per-fold scores produced by cross_val_score.\n",
    "\n",
    "    Notes\n",
    "    -----\n",
    "    The targets are read from the global `y_train`.\n",
    "    \"\"\"\n",
    "    # Pass the KFold splitter itself as `cv`. KFold(...).get_n_splits() is just\n",
    "    # the integer number of folds, and handing that to cross_val_score would\n",
    "    # discard shuffle=True and the fixed random_state.\n",
    "    kf = KFold(n_folds, shuffle=True, random_state=2022)\n",
    "    return cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=kf)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: XGBRegressor has no `early_stopping` argument, so none is passed here.\n",
    "# Real early stopping uses `early_stopping_rounds` together with a validation\n",
    "# set supplied at fit time, which cross_val_score does not provide; all\n",
    "# n_estimators trees are therefore trained.\n",
    "xbgreg_best = XGBRegressor(n_estimators=1000, #learning_rate=0.1,\n",
    "                           max_depth=9, min_child_weight=5)\n",
    "\n",
    "# rmse_cv returns negative MSE scores, so negate them to get per-fold MSE.\n",
    "xbgreg_CV_best = -rmse_cv(xbgreg_best)\n",
    "\n",
    "# Fit on the full training split and predict on both splits.\n",
    "xbgreg_best.fit(X_train, y_train)\n",
    "y_train_xgbreg = xbgreg_best.predict(X_train)\n",
    "y_test_xgbreg = xbgreg_best.predict(X_test)\n",
    "\n",
    "# One-row summary table; the error columns are mean squared errors of the\n",
    "# target as stored in y_train / y_test.\n",
    "xgb_best_results = pd.DataFrame({'algorithm': ['XGBRegressor'],\n",
    "                                 'CV error': [xbgreg_CV_best.mean()],\n",
    "                                 'CV std': [xbgreg_CV_best.std()],\n",
    "                                 'training error': [mean_squared_error(y_train, y_train_xgbreg)],\n",
    "                                 'test error': [mean_squared_error(y_test, y_test_xgbreg)],\n",
    "                                 'training_r2_score': [r2_score(y_train, y_train_xgbreg)],\n",
    "                                 'test_r2_score': [r2_score(y_test, y_test_xgbreg)]})\n",
    "xgb_best_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the fitted XGBoost model to disk.\n",
    "# open() does not create missing parent directories, so make sure the\n",
    "# output folder exists before writing.\n",
    "os.makedirs('./pickles', exist_ok=True)\n",
    "with open('./pickles/model', 'wb') as file:\n",
    "    pickle.dump(xbgreg_best, file)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.4 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}