{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import mean_squared_error" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "# scoring metric\n", "def log_rmse(yhat, ytrue):\n", "    \"\"\"Return the root mean squared error between log(yhat) and log(ytrue).\"\"\"\n", "    log_yhat = np.log(yhat)\n", "    log_ytrue = np.log(ytrue)\n", "    return np.sqrt(mean_squared_error(log_yhat, log_ytrue))\n" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPub...0NaNNaNNaN0122008WDNormal250000
\n", "

5 rows × 81 columns

\n", "
" ], "text/plain": [ " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", "0 1 60 RL 65.0 8450 Pave NaN Reg \n", "1 2 20 RL 80.0 9600 Pave NaN Reg \n", "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", "\n", " LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal \\\n", "0 Lvl AllPub ... 0 NaN NaN NaN 0 \n", "1 Lvl AllPub ... 0 NaN NaN NaN 0 \n", "2 Lvl AllPub ... 0 NaN NaN NaN 0 \n", "3 Lvl AllPub ... 0 NaN NaN NaN 0 \n", "4 Lvl AllPub ... 0 NaN NaN NaN 0 \n", "\n", " MoSold YrSold SaleType SaleCondition SalePrice \n", "0 2 2008 WD Normal 208500 \n", "1 5 2007 WD Normal 181500 \n", "2 9 2008 WD Normal 223500 \n", "3 2 2006 WD Abnorml 140000 \n", "4 12 2008 WD Normal 250000 \n", "\n", "[5 rows x 81 columns]" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load the training data and preview the first rows\n", "df = pd.read_csv('ames/train.csv')\n", "df.head()\n" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MoSold: (0, 81)\n", "YrSold: (0, 81)\n", "1stFlrSF: (0, 81)\n", "2ndFlrSF: (0, 81)\n", "BedroomAbvGr: (0, 81)\n" ] } ], "source": [ "# arbitrary selection of feature columns\n", "cols = ['MoSold', 'YrSold', '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr']\n", "\n", "# check for NaN: print the shape of the subset of rows where each column is null\n", "for col in cols:\n", "    print(\"{}: {}\".format(col, df[df[col].isnull()].shape))\n", "\n", "# the rest of the notebook relies on these columns having no null values,\n", "# so fail here with a clear message instead of later inside the model\n", "assert not df[cols].isnull().any().any(), \"selected feature columns contain NaN\"\n", "\n", "X = df[cols]\n", "y = df['SalePrice']\n" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "# split train test (train_test_split comes from the imports cell at the top)\n", "X_train, X_test, y_train, y_test = train_test_split(\n", "    X, y, test_size=0.3, random_state=2\n", ")\n" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,\n", " max_features='auto', max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,\n", " oob_score=False, random_state=None, verbose=0, warm_start=False)\n", "\n", " -- RMSE test 0.2365\n", "\n", " -- RMSE train 0.2117\n" ] } ], "source": [ "# train regressor\n", "# fixed seed (same value as the split) so the forest, the scores and the\n", "# submission are reproducible from one run to the next\n", "clf = RandomForestRegressor(max_depth=5, n_estimators=300, random_state=2)\n", "\n", "clf.fit(X_train, y_train)\n", "print(clf)\n", "yhat_test = clf.predict(X_test)\n", "yhat_train = clf.predict(X_train)\n", "\n", "# log_rmse is declared as (yhat, ytrue): predictions first, targets second\n", "print(\"\\n -- RMSE test {:.4f}\".format(log_rmse(yhat_test, y_test)))\n", "print(\"\\n -- RMSE train {:.4f}\".format(log_rmse(yhat_train, y_train)))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Appliquer le model sur le test" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# load validation dataset\n", "vdf = pd.read_csv('ames/test.csv')\n", "# at this point, apply to X_valid the same transformations as on the training features\n", "X_valid = vdf[cols]\n", "\n", "y_valid = clf.predict(X_valid)\n" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "# build the results dataframe\n", "# Use the ids shipped with the test file rather than assuming they are\n", "# contiguous and start at 1461; the old offset is kept only as a fallback\n", "# for a file without an 'Id' column.\n", "if 'Id' in vdf.columns:\n", "    ids = vdf['Id'].values\n", "else:\n", "    ids = X_valid.index + 1461\n", "\n", "results = pd.DataFrame({'Id': ids, 'SalePrice': y_valid},\n", "                       columns=['Id', 'SalePrice'])\n", "\n", "# write the results to the csv file\n", "results.to_csv(\"submission_01.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", 
"name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 2 }