{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "**We're going to start by importing the necessary libraries**" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "import seaborn as sns\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.tree import DecisionTreeClassifier, export_graphviz\n", "from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.svm import SVR\n", "from sklearn.datasets import load_iris\n", "import pydot\n", "from IPython.display import Image\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**1-Exploration de la base de données**" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "house = 'C:/Users/HP/Anaconda3/Lib/site-packages/notebook/train.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df_train = pd.read_csv(house, sep = ',')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAddMasVnrAreaBsmtFinSF1...WoodDeckSFOpenPorchSFEnclosedPorch3SsnPorchScreenPorchPoolAreaMiscValMoSoldYrSoldSalePrice
count1460.0000001460.0000001201.0000001460.0000001460.0000001460.0000001460.0000001460.0000001452.0000001460.000000...1460.0000001460.0000001460.0000001460.0000001460.0000001460.0000001460.0000001460.0000001460.0000001460.000000
mean730.50000056.89726070.04995810516.8280826.0993155.5753421971.2678081984.865753103.685262443.639726...94.24452146.66027421.9541103.40958915.0609592.75890443.4890416.3219182007.815753180921.195890
std421.61000942.30057124.2847529981.2649321.3829971.11279930.20290420.645407181.066207456.098091...125.33879466.25602861.11914929.31733155.75741540.177307496.1230242.7036261.32809579442.502883
min1.00000020.00000021.0000001300.0000001.0000001.0000001872.0000001950.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000001.0000002006.00000034900.000000
25%365.75000020.00000059.0000007553.5000005.0000005.0000001954.0000001967.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000005.0000002007.000000129975.000000
50%730.50000050.00000069.0000009478.5000006.0000005.0000001973.0000001994.0000000.000000383.500000...0.00000025.0000000.0000000.0000000.0000000.0000000.0000006.0000002008.000000163000.000000
75%1095.25000070.00000080.00000011601.5000007.0000006.0000002000.0000002004.000000166.000000712.250000...168.00000068.0000000.0000000.0000000.0000000.0000000.0000008.0000002009.000000214000.000000
max1460.000000190.000000313.000000215245.00000010.0000009.0000002010.0000002010.0000001600.0000005644.000000...857.000000547.000000552.000000508.000000480.000000738.00000015500.00000012.0000002010.000000755000.000000
\n", "

8 rows × 38 columns

\n", "
" ], "text/plain": [ " Id MSSubClass LotFrontage LotArea OverallQual \\\n", "count 1460.000000 1460.000000 1201.000000 1460.000000 1460.000000 \n", "mean 730.500000 56.897260 70.049958 10516.828082 6.099315 \n", "std 421.610009 42.300571 24.284752 9981.264932 1.382997 \n", "min 1.000000 20.000000 21.000000 1300.000000 1.000000 \n", "25% 365.750000 20.000000 59.000000 7553.500000 5.000000 \n", "50% 730.500000 50.000000 69.000000 9478.500000 6.000000 \n", "75% 1095.250000 70.000000 80.000000 11601.500000 7.000000 \n", "max 1460.000000 190.000000 313.000000 215245.000000 10.000000 \n", "\n", " OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 \\\n", "count 1460.000000 1460.000000 1460.000000 1452.000000 1460.000000 \n", "mean 5.575342 1971.267808 1984.865753 103.685262 443.639726 \n", "std 1.112799 30.202904 20.645407 181.066207 456.098091 \n", "min 1.000000 1872.000000 1950.000000 0.000000 0.000000 \n", "25% 5.000000 1954.000000 1967.000000 0.000000 0.000000 \n", "50% 5.000000 1973.000000 1994.000000 0.000000 383.500000 \n", "75% 6.000000 2000.000000 2004.000000 166.000000 712.250000 \n", "max 9.000000 2010.000000 2010.000000 1600.000000 5644.000000 \n", "\n", " ... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \\\n", "count ... 1460.000000 1460.000000 1460.000000 1460.000000 \n", "mean ... 94.244521 46.660274 21.954110 3.409589 \n", "std ... 125.338794 66.256028 61.119149 29.317331 \n", "min ... 0.000000 0.000000 0.000000 0.000000 \n", "25% ... 0.000000 0.000000 0.000000 0.000000 \n", "50% ... 0.000000 25.000000 0.000000 0.000000 \n", "75% ... 168.000000 68.000000 0.000000 0.000000 \n", "max ... 
857.000000 547.000000 552.000000 508.000000 \n", "\n", " ScreenPorch PoolArea MiscVal MoSold YrSold \\\n", "count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 \n", "mean 15.060959 2.758904 43.489041 6.321918 2007.815753 \n", "std 55.757415 40.177307 496.123024 2.703626 1.328095 \n", "min 0.000000 0.000000 0.000000 1.000000 2006.000000 \n", "25% 0.000000 0.000000 0.000000 5.000000 2007.000000 \n", "50% 0.000000 0.000000 0.000000 6.000000 2008.000000 \n", "75% 0.000000 0.000000 0.000000 8.000000 2009.000000 \n", "max 480.000000 738.000000 15500.000000 12.000000 2010.000000 \n", "\n", " SalePrice \n", "count 1460.000000 \n", "mean 180921.195890 \n", "std 79442.502883 \n", "min 34900.000000 \n", "25% 129975.000000 \n", "50% 163000.000000 \n", "75% 214000.000000 \n", "max 755000.000000 \n", "\n", "[8 rows x 38 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.describe()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPub...0NaNNaNNaN0122008WDNormal250000
\n", "

5 rows × 81 columns

\n", "
" ], "text/plain": [ " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", "0 1 60 RL 65.0 8450 Pave NaN Reg \n", "1 2 20 RL 80.0 9600 Pave NaN Reg \n", "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", "\n", " LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal \\\n", "0 Lvl AllPub ... 0 NaN NaN NaN 0 \n", "1 Lvl AllPub ... 0 NaN NaN NaN 0 \n", "2 Lvl AllPub ... 0 NaN NaN NaN 0 \n", "3 Lvl AllPub ... 0 NaN NaN NaN 0 \n", "4 Lvl AllPub ... 0 NaN NaN NaN 0 \n", "\n", " MoSold YrSold SaleType SaleCondition SalePrice \n", "0 2 2008 WD Normal 208500 \n", "1 5 2007 WD Normal 181500 \n", "2 9 2008 WD Normal 223500 \n", "3 2 2006 WD Abnorml 140000 \n", "4 12 2008 WD Normal 250000 \n", "\n", "[5 rows x 81 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**2-Analyse de la variable cible : 'SalePrice'**" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 1460.000000\n", "mean 180921.195890\n", "std 79442.502883\n", "min 34900.000000\n", "25% 129975.000000\n", "50% 163000.000000\n", "75% 214000.000000\n", "max 755000.000000\n", "Name: SalePrice, dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train['SalePrice'].describe()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\matplotlib\\axes\\_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.\n", " warnings.warn(\"The 'normed' kwarg is deprecated, and has been \"\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, { 
"data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#Histogramme de la variable cible pour trouver s'il y a 'skewness'\n", "sns.distplot(df_train['SalePrice'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**On remarque une asymétrie (skewness) positive ; il est préférable d'appliquer log(1+x) à cette variable pour rendre sa distribution plus proche d'une loi normale.**" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\matplotlib\\axes\\_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.\n", " warnings.warn(\"The 'normed' kwarg is deprecated, and has been \"\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# NOTE: 'SalePrice' is overwritten in place, so running this cell a second time\n", "# without reloading the data would apply log1p twice.\n", "log_sale_price = np.log1p(df_train[\"SalePrice\"])\n", "df_train[\"SalePrice\"] = log_sale_price\n", "# The new distribution\n", "sns.distplot(log_sale_price)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**3-Etudier les corrélations entre les variables**" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Number of variables (the target included) shown in the heatmap\n", "k = 10\n", "# Select the numeric columns explicitly: since pandas 2.0 DataFrame.corr() no longer\n", "# drops object columns silently and raises on the string-valued ones instead.\n", "corrmat = df_train.select_dtypes(include=[np.number]).corr()\n", "cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index\n", "cm = np.corrcoef(df_train[cols].values.T)\n", "sns.set(font_scale=1.25)\n", "hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)\n", "# NOTE(review): the figure size is set after the heatmap has been drawn, so it\n", "# presumably only applies to the figures created later - confirm this is intended.\n", "plt.rcParams['figure.figsize'] = ((15,5))\n", "plt.show()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**On remarque que les variables les plus corrélées à notre variable cible sont : 'OverallQual' et 'GrLivArea'. D'autre part, il est aussi intéressant d'examiner les multicolinéarités entre les variables prédictives : par exemple, 'GarageArea' et 'GarageCars' ont une corrélation très élevée, ce qui se justifie d'après la description des variables donnée par la consigne ; on peut donc n'en garder qu'une seule. De même pour 'TotalBsmtSF' et '1stFlrSF', ainsi que pour 'TotRmsAbvGrd' et 'GrLivArea'.**" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**4-Les variables manquantes**" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "valid = 'C:/Users/HP/Anaconda3/Lib/site-packages/notebook/test.csv'\n", "df_test = pd.read_csv(valid, sep = ',')" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Deal with multicollinearity: keep only one variable of each highly correlated pair\n", "multicollinear_cols = ['1stFlrSF', 'GarageCars', 'TotRmsAbvGrd']\n", "# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts;\n", "# errors='ignore' keeps the cell re-runnable once the columns are already gone.\n", "df_train = df_train.drop(columns=multicollinear_cols, errors='ignore')\n", "df_test = df_test.drop(columns=multicollinear_cols, errors='ignore')" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',\n", " 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',\n", " 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',\n", " 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',\n", " 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',\n", " 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',\n", " 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',\n", " 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',\n", " 'HeatingQC', 'CentralAir', 'Electrical', '2ndFlrSF', 'LowQualFinSF',\n", " 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',\n", " 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'Functional',\n", " 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',\n", " 'GarageFinish', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',\n", " 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',\n", " 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal',\n", " 'MoSold', 'YrSold', 'SaleType', 'SaleCondition'],\n", " dtype='object')" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test.columns" ] }, { "cell_type": "code", 
"execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Id 0\n", "MSSubClass 0\n", "MSZoning 0\n", "LotFrontage 259\n", "LotArea 0\n", "Street 0\n", "Alley 1369\n", "LotShape 0\n", "LandContour 0\n", "Utilities 0\n", "LotConfig 0\n", "LandSlope 0\n", "Neighborhood 0\n", "Condition1 0\n", "Condition2 0\n", "BldgType 0\n", "HouseStyle 0\n", "OverallQual 0\n", "OverallCond 0\n", "YearBuilt 0\n", "YearRemodAdd 0\n", "RoofStyle 0\n", "RoofMatl 0\n", "Exterior1st 0\n", "Exterior2nd 0\n", "MasVnrType 8\n", "MasVnrArea 8\n", "ExterQual 0\n", "ExterCond 0\n", "Foundation 0\n", " ... \n", "FullBath 0\n", "HalfBath 0\n", "BedroomAbvGr 0\n", "KitchenAbvGr 0\n", "KitchenQual 0\n", "Functional 0\n", "Fireplaces 0\n", "FireplaceQu 690\n", "GarageType 81\n", "GarageYrBlt 81\n", "GarageFinish 81\n", "GarageArea 0\n", "GarageQual 81\n", "GarageCond 81\n", "PavedDrive 0\n", "WoodDeckSF 0\n", "OpenPorchSF 0\n", "EnclosedPorch 0\n", "3SsnPorch 0\n", "ScreenPorch 0\n", "PoolArea 0\n", "PoolQC 1453\n", "Fence 1179\n", "MiscFeature 1406\n", "MiscVal 0\n", "MoSold 0\n", "YrSold 0\n", "SaleType 0\n", "SaleCondition 0\n", "SalePrice 0\n", "Length: 78, dtype: int64\n" ] } ], "source": [ "print(df_train.isnull().sum())\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Id 0\n", "MSSubClass 0\n", "MSZoning 4\n", "LotFrontage 227\n", "LotArea 0\n", "Street 0\n", "Alley 1352\n", "LotShape 0\n", "LandContour 0\n", "Utilities 2\n", "LotConfig 0\n", "LandSlope 0\n", "Neighborhood 0\n", "Condition1 0\n", "Condition2 0\n", "BldgType 0\n", "HouseStyle 0\n", "OverallQual 0\n", "OverallCond 0\n", "YearBuilt 0\n", "YearRemodAdd 0\n", "RoofStyle 0\n", "RoofMatl 0\n", "Exterior1st 1\n", "Exterior2nd 1\n", "MasVnrType 16\n", "MasVnrArea 15\n", "ExterQual 0\n", "ExterCond 0\n", "Foundation 0\n", " ... 
\n", "BsmtHalfBath 2\n", "FullBath 0\n", "HalfBath 0\n", "BedroomAbvGr 0\n", "KitchenAbvGr 0\n", "KitchenQual 1\n", "Functional 2\n", "Fireplaces 0\n", "FireplaceQu 730\n", "GarageType 76\n", "GarageYrBlt 78\n", "GarageFinish 78\n", "GarageArea 1\n", "GarageQual 78\n", "GarageCond 78\n", "PavedDrive 0\n", "WoodDeckSF 0\n", "OpenPorchSF 0\n", "EnclosedPorch 0\n", "3SsnPorch 0\n", "ScreenPorch 0\n", "PoolArea 0\n", "PoolQC 1456\n", "Fence 1169\n", "MiscFeature 1408\n", "MiscVal 0\n", "MoSold 0\n", "YrSold 0\n", "SaleType 1\n", "SaleCondition 0\n", "Length: 77, dtype: int64\n" ] } ], "source": [ "print(df_test.isnull().sum())" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "def log_rmse(yhat, ytrue):\n", "    \"\"\"Return the root-mean-squared error between the natural logs of two arrays.\n", "\n", "    Parameters\n", "    ----------\n", "    yhat, ytrue : array-like\n", "        Predicted and reference values. np.log is applied to both, so they\n", "        must be positive for the result to be finite. The metric is symmetric,\n", "        so the order of the two arguments does not change the result.\n", "\n", "    Returns\n", "    -------\n", "    float\n", "        sqrt(mean((log(yhat) - log(ytrue)) ** 2))\n", "\n", "    NOTE(review): 'SalePrice' is replaced by log1p(SalePrice) earlier in this\n", "    notebook. If the values passed here are already on that log scale, this\n", "    function takes the log a second time - confirm which scale is intended.\n", "    \"\"\"\n", "    log_pred = np.log(yhat)\n", "    log_true = np.log(ytrue)\n", "    return np.sqrt(mean_squared_error(log_pred, log_true))\n" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Y_train / Y_test are only created by the train/test split further down, and no\n", "# yhat_* prediction is defined above this cell, so on a fresh 'Restart & Run All'\n", "# these names do not exist yet. Report the scores only when they are available\n", "# instead of aborting the run with a NameError.\n", "rmse_inputs = ('Y_test', 'yhat_test', 'Y_train', 'yhat_train')\n", "missing_inputs = [name for name in rmse_inputs if name not in globals()]\n", "if missing_inputs:\n", "    print(\"RMSE not reported, still undefined: {}\".format(', '.join(missing_inputs)))\n", "else:\n", "    print(\"\\n -- RMSE test {:.4f}\".format(log_rmse(Y_test, yhat_test) ))\n", "    print(\"\\n -- RMSE train {:.4f}\".format(log_rmse(Y_train, yhat_train)))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**On fait l'hypothèse que si le pourcentage de valeurs manquantes d'une colonne est supérieur à 50 %, on supprime cette colonne.**" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TotalPercent
PoolQC145399.520548
MiscFeature140696.301370
Alley136993.767123
Fence117980.753425
FireplaceQu69047.260274
LotFrontage25917.739726
GarageCond815.547945
GarageType815.547945
GarageYrBlt815.547945
GarageFinish815.547945
GarageQual815.547945
BsmtFinType2382.602740
BsmtExposure382.602740
BsmtCond372.534247
BsmtQual372.534247
BsmtFinType1372.534247
MasVnrArea80.547945
MasVnrType80.547945
Electrical10.068493
LandSlope00.000000
\n", "
" ], "text/plain": [ " Total Percent\n", "PoolQC 1453 99.520548\n", "MiscFeature 1406 96.301370\n", "Alley 1369 93.767123\n", "Fence 1179 80.753425\n", "FireplaceQu 690 47.260274\n", "LotFrontage 259 17.739726\n", "GarageCond 81 5.547945\n", "GarageType 81 5.547945\n", "GarageYrBlt 81 5.547945\n", "GarageFinish 81 5.547945\n", "GarageQual 81 5.547945\n", "BsmtFinType2 38 2.602740\n", "BsmtExposure 38 2.602740\n", "BsmtCond 37 2.534247\n", "BsmtQual 37 2.534247\n", "BsmtFinType1 37 2.534247\n", "MasVnrArea 8 0.547945\n", "MasVnrType 8 0.547945\n", "Electrical 1 0.068493\n", "LandSlope 0 0.000000" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "total = df_train.isnull().sum().sort_values(ascending=False)\n", "percent = ((df_train.isnull().sum()/df_train.isnull().count())*100).sort_values(ascending=False)\n", "missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])\n", "missing_data.head(20)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "FireplaceQu 690\n", "LotFrontage 259\n", "GarageFinish 81\n", "GarageYrBlt 81\n", "GarageType 81\n", "GarageQual 81\n", "GarageCond 81\n", "BsmtExposure 38\n", "BsmtFinType2 38\n", "BsmtQual 37\n", "BsmtCond 37\n", "BsmtFinType1 37\n", "MasVnrArea 8\n", "MasVnrType 8\n", "Electrical 1\n", "RoofMatl 0\n", "RoofStyle 0\n", "Exterior2nd 0\n", "Exterior1st 0\n", "YearBuilt 0\n", "ExterQual 0\n", "ExterCond 0\n", "Foundation 0\n", "YearRemodAdd 0\n", "SalePrice 0\n", "OverallCond 0\n", "OverallQual 0\n", "HouseStyle 0\n", "BldgType 0\n", "Condition1 0\n", " ... 
\n", "SaleType 0\n", "YrSold 0\n", "MoSold 0\n", "MiscVal 0\n", "PoolArea 0\n", "ScreenPorch 0\n", "3SsnPorch 0\n", "EnclosedPorch 0\n", "OpenPorchSF 0\n", "WoodDeckSF 0\n", "PavedDrive 0\n", "GarageArea 0\n", "Fireplaces 0\n", "Functional 0\n", "KitchenQual 0\n", "KitchenAbvGr 0\n", "BedroomAbvGr 0\n", "HalfBath 0\n", "FullBath 0\n", "BsmtHalfBath 0\n", "BsmtFullBath 0\n", "GrLivArea 0\n", "LowQualFinSF 0\n", "2ndFlrSF 0\n", "CentralAir 0\n", "HeatingQC 0\n", "Heating 0\n", "TotalBsmtSF 0\n", "SaleCondition 0\n", "Id 0\n", "Length: 74, dtype: int64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Drop every column whose share of missing values in the training set exceeds 50 %\n", "sparse_cols = missing_data[missing_data['Percent'] > 50].index\n", "# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts;\n", "# errors='ignore' lets the cell be re-run after the columns are already gone.\n", "df_train = df_train.drop(columns=sparse_cols, errors='ignore')\n", "df_train.isnull().sum().sort_values(ascending=False)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "FireplaceQu 730\n", "LotFrontage 227\n", "GarageCond 78\n", "GarageQual 78\n", "GarageYrBlt 78\n", "GarageFinish 78\n", "GarageType 76\n", "BsmtCond 45\n", "BsmtQual 44\n", "BsmtExposure 44\n", "BsmtFinType1 42\n", "BsmtFinType2 42\n", "MasVnrType 16\n", "MasVnrArea 15\n", "MSZoning 4\n", "BsmtHalfBath 2\n", "Utilities 2\n", "Functional 2\n", "BsmtFullBath 2\n", "BsmtFinSF2 1\n", "BsmtFinSF1 1\n", "Exterior2nd 1\n", "SaleType 1\n", "TotalBsmtSF 1\n", "Exterior1st 1\n", "BsmtUnfSF 1\n", "GarageArea 1\n", "KitchenQual 1\n", "HouseStyle 0\n", "MSSubClass 0\n", " ... 
\n", "OverallCond 0\n", "OverallQual 0\n", "Condition2 0\n", "SaleCondition 0\n", "ExterQual 0\n", "ExterCond 0\n", "YrSold 0\n", "MoSold 0\n", "MiscVal 0\n", "PoolArea 0\n", "ScreenPorch 0\n", "3SsnPorch 0\n", "EnclosedPorch 0\n", "OpenPorchSF 0\n", "WoodDeckSF 0\n", "PavedDrive 0\n", "Fireplaces 0\n", "KitchenAbvGr 0\n", "BedroomAbvGr 0\n", "HalfBath 0\n", "FullBath 0\n", "GrLivArea 0\n", "LowQualFinSF 0\n", "2ndFlrSF 0\n", "Electrical 0\n", "CentralAir 0\n", "HeatingQC 0\n", "Heating 0\n", "Foundation 0\n", "Id 0\n", "Length: 73, dtype: int64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Remove from the test set the same sparse columns that were identified on the training set\n", "sparse_cols = missing_data[missing_data['Percent'] > 50].index\n", "# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts;\n", "# errors='ignore' lets the cell be re-run after the columns are already gone.\n", "df_test = df_test.drop(columns=sparse_cols, errors='ignore')\n", "df_test.isnull().sum().sort_values(ascending=False)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**Compléter les valeurs numériques manquantes par la moyenne**" ] },
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',\n", " 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',\n", " 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '2ndFlrSF', 'LowQualFinSF',\n", " 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',\n", " 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageYrBlt',\n", " 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',\n", " 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],\n", " dtype='object')" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Complete missing values for numerical variables with the mean values : \n", "numerical_features = df_train.select_dtypes(exclude=['object']).columns\n", "numerical_features" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# Every numeric column except the target, which only exists in the training set\n", "N = [col for col in numerical_features if col != 'SalePrice']\n", "for col in N:\n", "    # Assign the filled column back instead of calling fillna(inplace=True) on a\n", "    # column selection: the chained in-place form does not update the frame under\n", "    # pandas Copy-on-Write.\n", "    # NOTE(review): the test set is imputed with its own means rather than the\n", "    # training means - confirm this is intended.\n", "    df_train[col] = df_train[col].fillna(df_train[col].mean())\n", "    df_test[col] = df_test[col].fillna(df_test[col].mean())\n" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**Remplacer les valeurs manquantes des variables catégorielles par 'None'**" ] },
{ "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',\n", " 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',\n", " 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',\n", " 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',\n", " 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',\n", " 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',\n", " 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',\n", " 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition'],\n", " dtype='object')" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "categorical_features = df_train.select_dtypes(include=['object']).columns\n", "categorical_features" ] },
{ "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# Reuse the columns detected above instead of a hand-copied list that can drift\n", "C = list(categorical_features)\n", "for col in C:\n", "    # Missing categories become the explicit level 'None' in both data sets\n", "    df_train[col] = df_train[col].fillna('None')\n", "    df_test[col] = df_test[col].fillna('None')\n" ] },
{ "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# Label encoding\n", "# Join the two datasets before encoding, so that a category present in only one of\n", "# them still gets a code. sort=False states the column handling explicitly and\n", "# avoids the FutureWarning raised because df_test has no 'SalePrice' column.\n", "df_join = pd.concat([df_train, df_test], sort=False)\n", "for col in C:\n", "    le = LabelEncoder()\n", "    le.fit(df_join[col])\n", "    df_train[col] = le.transform(df_train[col])\n", "    df_test[col] = le.transform(df_test[col])\n" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**5-Tester les différents modèles**" ] },
{ "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "Y = df_train['SalePrice']\n", "# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts\n", "X = df_train.drop(columns=['SalePrice'])" ] },
{ "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# Fixed seed so that the hold-out split, and every score computed from it, is reproducible\n", "SEED = 42\n", "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=SEED)" ] },
{ "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# Seed the forest as well: bootstrap samples and feature sub-sampling are random\n", "clf = RandomForestRegressor(n_estimators=1000, random_state=SEED)\n", "param_grid = {\"max_depth\": [3,6, 9, 12],\n", "              \"min_samples_split\": [2, 3, 5]}\n" ] },
{ "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "gs = GridSearchCV(clf, param_grid=param_grid, cv=3, verbose = 2)" ] },
{ "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 12 candidates, totalling 36 fits\n", "[CV] max_depth=3, min_samples_split=2 ................................\n", "[CV] ................. max_depth=3, min_samples_split=2, total= 3.8s\n", "[CV] max_depth=3, min_samples_split=2 ................................\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 3.9s remaining: 0.0s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[CV] ................. max_depth=3, min_samples_split=2, total= 3.7s\n", "[CV] max_depth=3, min_samples_split=2 ................................\n", "[CV] ................. max_depth=3, min_samples_split=2, total= 3.5s\n", "[CV] max_depth=3, min_samples_split=3 ................................\n", "[CV] ................. max_depth=3, min_samples_split=3, total= 3.5s\n", "[CV] max_depth=3, min_samples_split=3 ................................\n", "[CV] ................. max_depth=3, min_samples_split=3, total= 3.5s\n", "[CV] max_depth=3, min_samples_split=3 ................................\n", "[CV] ................. max_depth=3, min_samples_split=3, total= 3.7s\n", "[CV] max_depth=3, min_samples_split=5 ................................\n", "[CV] ................. max_depth=3, min_samples_split=5, total= 3.5s\n", "[CV] max_depth=3, min_samples_split=5 ................................\n", "[CV] ................. max_depth=3, min_samples_split=5, total= 3.5s\n", "[CV] max_depth=3, min_samples_split=5 ................................\n", "[CV] ................. 
max_depth=3, min_samples_split=5, total= 3.6s\n", "[CV] max_depth=6, min_samples_split=2 ................................\n", "[CV] ................. max_depth=6, min_samples_split=2, total= 6.4s\n", "[CV] max_depth=6, min_samples_split=2 ................................\n", "[CV] ................. max_depth=6, min_samples_split=2, total= 6.3s\n", "[CV] max_depth=6, min_samples_split=2 ................................\n", "[CV] ................. max_depth=6, min_samples_split=2, total= 6.5s\n", "[CV] max_depth=6, min_samples_split=3 ................................\n", "[CV] ................. max_depth=6, min_samples_split=3, total= 6.2s\n", "[CV] max_depth=6, min_samples_split=3 ................................\n", "[CV] ................. max_depth=6, min_samples_split=3, total= 6.3s\n", "[CV] max_depth=6, min_samples_split=3 ................................\n", "[CV] ................. max_depth=6, min_samples_split=3, total= 6.8s\n", "[CV] max_depth=6, min_samples_split=5 ................................\n", "[CV] ................. max_depth=6, min_samples_split=5, total= 6.2s\n", "[CV] max_depth=6, min_samples_split=5 ................................\n", "[CV] ................. max_depth=6, min_samples_split=5, total= 6.3s\n", "[CV] max_depth=6, min_samples_split=5 ................................\n", "[CV] ................. max_depth=6, min_samples_split=5, total= 6.3s\n", "[CV] max_depth=9, min_samples_split=2 ................................\n", "[CV] ................. max_depth=9, min_samples_split=2, total= 9.2s\n", "[CV] max_depth=9, min_samples_split=2 ................................\n", "[CV] ................. max_depth=9, min_samples_split=2, total= 9.2s\n", "[CV] max_depth=9, min_samples_split=2 ................................\n", "[CV] ................. max_depth=9, min_samples_split=2, total= 9.5s\n", "[CV] max_depth=9, min_samples_split=3 ................................\n", "[CV] ................. 
max_depth=9, min_samples_split=3, total= 8.9s\n", "[CV] max_depth=9, min_samples_split=3 ................................\n", "[CV] ................. max_depth=9, min_samples_split=3, total= 9.1s\n", "[CV] max_depth=9, min_samples_split=3 ................................\n", "[CV] ................. max_depth=9, min_samples_split=3, total= 9.1s\n", "[CV] max_depth=9, min_samples_split=5 ................................\n", "[CV] ................. max_depth=9, min_samples_split=5, total= 8.7s\n", "[CV] max_depth=9, min_samples_split=5 ................................\n", "[CV] ................. max_depth=9, min_samples_split=5, total= 8.7s\n", "[CV] max_depth=9, min_samples_split=5 ................................\n", "[CV] ................. max_depth=9, min_samples_split=5, total= 8.6s\n", "[CV] max_depth=12, min_samples_split=2 ...............................\n", "[CV] ................ max_depth=12, min_samples_split=2, total= 11.4s\n", "[CV] max_depth=12, min_samples_split=2 ...............................\n", "[CV] ................ max_depth=12, min_samples_split=2, total= 11.5s\n", "[CV] max_depth=12, min_samples_split=2 ...............................\n", "[CV] ................ max_depth=12, min_samples_split=2, total= 11.3s\n", "[CV] max_depth=12, min_samples_split=3 ...............................\n", "[CV] ................ max_depth=12, min_samples_split=3, total= 11.0s\n", "[CV] max_depth=12, min_samples_split=3 ...............................\n", "[CV] ................ max_depth=12, min_samples_split=3, total= 10.9s\n", "[CV] max_depth=12, min_samples_split=3 ...............................\n", "[CV] ................ max_depth=12, min_samples_split=3, total= 10.9s\n", "[CV] max_depth=12, min_samples_split=5 ...............................\n", "[CV] ................ max_depth=12, min_samples_split=5, total= 9.7s\n", "[CV] max_depth=12, min_samples_split=5 ...............................\n", "[CV] ................ 
max_depth=12, min_samples_split=5, total= 9.8s\n", "[CV] max_depth=12, min_samples_split=5 ...............................\n", "[CV] ................ max_depth=12, min_samples_split=5, total= 9.8s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=1)]: Done 36 out of 36 | elapsed: 4.6min finished\n" ] }, { "data": { "text/plain": [ "GridSearchCV(cv=3, error_score='raise',\n", " estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n", " max_features='auto', max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,\n", " oob_score=False, random_state=None, verbose=0, warm_start=False),\n", " fit_params=None, iid=True, n_jobs=1,\n", " param_grid={'max_depth': [3, 6, 9, 12], 'min_samples_split': [2, 3, 5]},\n", " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n", " scoring=None, verbose=2)" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Run the grid search on the training split only. Fitting on the full (X, Y)\n", "# would let the model see X_test, so the test RMSE computed below would not be\n", "# a hold-out estimate. With refit=True, best_estimator_ is trained on X_train.\n", "gs.fit(X_train, Y_train)\n" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# Predictions of the refit best model on both splits.\n", "yhat_train = gs.best_estimator_.predict(X_train)\n", "yhat_test = gs.best_estimator_.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "# RMSE on each split; arguments follow the (y_true, y_pred) convention.\n", "Train_score = np.sqrt(mean_squared_error(Y_train, yhat_train))\n", "Test_score = np.sqrt(mean_squared_error(Y_test, yhat_test))" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test 0.0533 train 0.0549 \n" ] } ], "source": [ "print(\"test {:.4f} train {:.4f} \".format(Test_score, Train_score))\n" ] }, { "cell_type": "code", "execution_count": 988, "metadata": {}, "outputs": [], "source": [ "X_valid = df_test[C+N]\n", "yhat_valid =
np.exp(yhat_valid) -1\n", "\n" ] }, { "cell_type": "code", "execution_count": 984, "metadata": {}, "outputs": [], "source": [ "results = pd.DataFrame(columns = ['Id', 'SalePrice'])\n", "results['Id'] = X_valid.index + 1461\n", "results['SalePrice'] =yhat_valid\n", "results.to_csv(\"submission_RFNew.csv\", index = False)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Après avoir construit le modèle Random Forest, je vais essayer le modèle SVR (Support Vector Regression).**" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "clf1 = SVR()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',\n", " kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf1.fit(X_train,Y_train)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "yhat_train1 = clf1.predict(X_train)\n", "yhat_test1 = clf1.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "#Scoring\n", "Train_score1 = np.sqrt(mean_squared_error(yhat_train1, Y_train))\n", "Test_score1 = np.sqrt(mean_squared_error(yhat_test1, Y_test))" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test 0.3778 train 0.1007 \n" ] } ], "source": [ "print(\"test {:.4f} train {:.4f} \".format(Test_score1, Train_score1))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**On remarque un overfitting du modèle : l'erreur de test est nettement supérieure à l'erreur d'entraînement.**" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "#Validation dans la base de données test \n", "X_valid = df_test\n", "yhat_valid1 = clf1.predict(X_valid)\n", "yhat_valid1=
np.exp(yhat_valid1) -1\n", "\n" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "#Submit :\n", "results = pd.DataFrame(columns = ['Id', 'SalePrice'])\n", "results['Id'] = X_valid.index + 1461\n", "results['SalePrice'] =yhat_valid1\n", "results.to_csv(\"submission_LR5.csv\", index = False)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "#Lasso model \n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import RobustScaler\n", "from sklearn.linear_model import ElasticNet, Lasso\n", "\n", "lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))\n" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "lasso.fit(X_train,Y_train)\n", "yhat_train2 = lasso.predict(X_train)\n", "yhat_test2 = lasso.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "Train_score2 = np.sqrt(mean_squared_error(yhat_train2, Y_train))\n", "Test_score2 = np.sqrt(mean_squared_error(yhat_test2, Y_test))" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test 0.2050 train 0.1238 \n" ] } ], "source": [ "print(\"test {:.4f} train {:.4f} \".format(Test_score2, Train_score2))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Le score sur Kaggle pour ce modèle est de 0.12666, avec un classement de 1503.**" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "#Validation dans la base de données test \n", "X_valid = df_test\n", "yhat_valid2 = lasso.predict(X_valid)\n", "yhat_valid2= np.exp(yhat_valid2) -1" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "results = pd.DataFrame(columns = ['Id', 'SalePrice'])\n", "results['Id'] = X_valid.index + 1461\n", "results['SalePrice']
=yhat_valid2\n", "results.to_csv(\"submission_Lasso.csv\", index = False)\n" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "#Elastic net :\n", "\n", "#Shuffle dataframe: \n", "df_train= df_train.sample(frac=1)\n", "df_test = df_test.sample(frac=1)\n" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "#Refaire le split du dataset\n", "X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. 
You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n" ] } ], "source": [ "# Build the ElasticNetCV model: it cross-validates over the alpha / l1_ratio grid below.\n", "# The estimator is only constructed here; the fit happens once, in the next\n", "# cell. Calling .fit() here as well would train the same model twice.\n", "from sklearn import linear_model\n", "EN = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000)\n" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations.
Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. 
You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n", "C:\\Users\\HP\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. 
Fitting data with very small alpha may cause precision problems.\n", " ConvergenceWarning)\n" ] }, { "data": { "text/plain": [ "ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], copy_X=True,\n", " cv=None, eps=0.001, fit_intercept=True,\n", " l1_ratio=[0.01, 0.1, 0.5, 0.9, 0.99], max_iter=5000, n_alphas=100,\n", " n_jobs=1, normalize=False, positive=False, precompute='auto',\n", " random_state=None, selection='cyclic', tol=0.0001, verbose=0)" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Fit the model : \n", "EN.fit(X_train,Y_train)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "yhat_train3 = EN.predict(X_train)\n", "yhat_test3 = EN.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "Train_score3 = np.sqrt(mean_squared_error(yhat_train3, Y_train))\n", "Test_score3 = np.sqrt(mean_squared_error(yhat_test3, Y_test))" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test 0.1456 train 0.1360 \n" ] } ], "source": [ "print(\"test {:.4f} train {:.4f} \".format(Test_score3, Train_score3))\n" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "#Validation dans la base de données test \n", "X_valid = df_test\n", "yhat_valid3 = EN.predict(X_valid)\n", "yhat_valid3= np.exp(yhat_valid3) -1" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "results = pd.DataFrame(columns = ['Id', 'SalePrice'])\n", "results['Id'] = X_valid.index + 1461\n", "results['SalePrice'] =yhat_valid3\n", "results.to_csv(\"submission_ENET.csv\", index = False)\n" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "yhat_validAvg =np.exp(lasso.predict(X_valid)) +np.exp(EN.predict(X_valid)) -2" ] }, { "cell_type": "code", 
"execution_count": 88, "metadata": {}, "outputs": [], "source": [ "yhat_validAvg= yhat_validAvg/2" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "results = pd.DataFrame(columns = ['Id', 'SalePrice'])\n", "results['Id'] = X_valid.index + 1461\n", "results['SalePrice'] =yhat_validAvg\n", "results.to_csv(\"submission_Avg2.csv\", index = False)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "** Cette technique n'as pas permis de minimiser les erreurs, ainsi j'opte pour le modèle Lasso vu qu'il présente moins d'erreurs sur la prédiction**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }