{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the Oilanalysis workbook into a pandas dataframe\n",
    "\n",
    "DATA_URL = \"https://raw.githubusercontent.com/chrisrijsdijk/RAMS/master/data/Oilanalysis.xlsx\"\n",
    "# for those who would like to work from a local drive, point DATA_URL at the local copy instead:\n",
    "#DATA_URL = \"C:/Users/Admin/Pythonprojects/RAMS/data/Oilanalysis.xlsx\"\n",
    "\n",
    "df = None  # drop any dataframe left over from an earlier run before loading\n",
    "df = pd.read_excel(DATA_URL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# optional check: inspect the dtypes to verify that every column except \"Age\" is numerical\n",
    "\n",
    "#df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# optional check: preview the data (first rows, summary statistics, number of rows)\n",
    "\n",
    "#df.head(3)\n",
    "#df.describe()\n",
    "#len(df[\"BRSTVD\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# optional check: plot a histogram for every column of the dataframe\n",
    "# a column that holds just one value never varies, so its causal effect stays invisible (\"ceteris paribus\")\n",
    "# look for outliers and try to explain them\n",
    "\n",
    "#for col in df.columns:\n",
    "#    try:\n",
    "#        df[col] = pd.to_numeric(df[col])\n",
    "#        df.hist(column=col)\n",
    "#    except ValueError:\n",
    "#        print(\"The column \"+col+' can not be represented as a histogram')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# optional check: build a correlation matrix to look for pairwise linear dependencies among the columns\n",
    "\n",
    "#dummy=df.iloc[:,1:] #remove the \"Age\" column that is not numerical\n",
    "#dummy.corr(min_periods=15)\n",
    "#plt.matshow(dummy.corr(min_periods=15))\n",
    "#plt.show()\n",
    "#print(dummy.columns)\n",
    "\n",
    "#del dummy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs":
[],
   "source": [
    "# convert the categorical \"Age\" column into indicator (0/1) columns named \"Age_<category>\"\n",
    "# the guard keeps this cell re-runnable: after the first run the \"Age\" column no longer exists\n",
    "\n",
    "if \"Age\" in df.columns:\n",
    "    df = pd.get_dummies(df, columns=[\"Age\"])\n",
    "#df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define the response variable (the \"Age_New\" indicator) and convert it into an np.array\n",
    "\n",
    "y = np.array(df[\"Age_New\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports for the random forest classification (RF-C)\n",
    "\n",
    "from sklearn.impute import KNNImputer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn import tree\n",
    "#from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "from sklearn.inspection import permutation_importance\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define the explanatory variables: the first 34 columns of the dataframe\n",
    "# get_dummies appends the \"Age_*\" indicator columns at the end, so they stay outside this slice\n",
    "# (assumes the sheet holds 34 measurement columns - adjust the slice if that changes)\n",
    "\n",
    "X = df.iloc[:, :34]\n",
    "X_names = X.columns  # keep the column names: the imputer below returns a bare numpy array\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# impute data in case of NaN's by using K nearest neighbours (20 neighbours, weighted by distance)\n",
    "# NOTE(review): the imputer is fitted on all rows before the train/test split below, so rows of the\n",
    "# validation set influence the imputed training values; consider fitting on the training rows only\n",
    "\n",
    "imputer = KNNImputer(n_neighbors=20, weights=\"distance\")\n",
    "X = imputer.fit_transform(X)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a training set (75%) and a validation set (25%), stratified on the response\n",
    "# the row positions are split along with the data, so idx_test tells which dataframe rows ended up in the validation set\n",
    "\n",
    "SEED = 42  # fixed seed: the split and the forest are identical on every run; set to None for a random run\n",
    "\n",
    "X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(\n",
    "    X, y, np.arange(len(X)), test_size=0.25, random_state=SEED, stratify=y\n",
    ")\n",
    "\n",
    "#print('X_train Shape:', X_train.shape)\n",
    "#print('y_train Shape:', y_train.shape)\n",
    "#print('X_test Shape:', X_test.shape)\n",
    "#print('Y_test Shape:', y_test.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(n_estimators=1000)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Instantiate model with 1000 decision trees\n",
    "# random_state=SEED makes the bootstrap samples and feature draws repeatable\n",
    "\n",
    "rf = RandomForestClassifier(n_estimators=1000, criterion=\"gini\", random_state=SEED)\n",
    "\n",
    "# Train the model on training data\n",
    "\n",
    "rf.fit(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# validate the random forest using the test set\n",
    "\n",
    "predictions = rf.predict(X_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [ "
" ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# plot the result of the validation: predicted labels (red line) against observed labels (black crosses)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(16, 3))\n",
    "ax.plot(range(len(predictions)), predictions, '-', label=\"Predicted labels\", color=\"red\")\n",
    "ax.plot(range(len(y_test)), y_test, 'x', label=\"Observed labels\", color=\"black\")\n",
    "\n",
    "# rf.score returns the mean accuracy of the forest on the validation set\n",
    "accuracy = rf.score(X_test, y_test)\n",
    "ax.set_title(\"Accuracy score of the classification: \" + str(accuracy))\n",
    "ax.set_xlabel('number of samples in the validation set')\n",
    "ax.set_ylabel('Indicator of \"Age_New\"')\n",
    "ax.legend()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# optional: depict a single tree from the forest\n",
    "\n",
    "#plt.figure(figsize=(10,10))\n",
    "#clf = rf.estimators_[5]\n",
    "#tree.plot_tree(clf,feature_names=X_names, filled=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
CU0.121197
VIS990.090836
ISO 4406 large0.090261
VIS400.090237
WATER0.083972
TBN0.080551
VLAMCC0.060583
FE0.046925
ISO 4406 small0.042201
LNF-ROET0.040431
TAN0.036852
BRSTVD0.033758
SI0.029886
LNF-NMW0.027464
ISO 4406 medium0.027061
P0.023858
CA0.016000
MG0.013672
LNF-SSW0.010024
LNF-UNC0.009453
ZN0.007079
LNF-FW0.006504
LNF-CUT0.006081
LNF-FIB0.004499
PB0.000259
NA0.000177
BA0.000146
CR0.000023
SN0.000011
MN0.000000
NI0.000000
LI0.000000
AL0.000000
V0.000000
\n", "
" ],
      "text/plain": [
       " 0\n",
       "CU 0.121197\n",
       "VIS99 0.090836\n",
       "ISO 4406 large 0.090261\n",
       "VIS40 0.090237\n",
       "WATER 0.083972\n",
       "TBN 0.080551\n",
       "VLAMCC 0.060583\n",
       "FE 0.046925\n",
       "ISO 4406 small 0.042201\n",
       "LNF-ROET 0.040431\n",
       "TAN 0.036852\n",
       "BRSTVD 0.033758\n",
       "SI 0.029886\n",
       "LNF-NMW 0.027464\n",
       "ISO 4406 medium 0.027061\n",
       "P 0.023858\n",
       "CA 0.016000\n",
       "MG 0.013672\n",
       "LNF-SSW 0.010024\n",
       "LNF-UNC 0.009453\n",
       "ZN 0.007079\n",
       "LNF-FW 0.006504\n",
       "LNF-CUT 0.006081\n",
       "LNF-FIB 0.004499\n",
       "PB 0.000259\n",
       "NA 0.000177\n",
       "BA 0.000146\n",
       "CR 0.000023\n",
       "SN 0.000011\n",
       "MN 0.000000\n",
       "NI 0.000000\n",
       "LI 0.000000\n",
       "AL 0.000000\n",
       "V 0.000000"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# rank the explanatory variables by their Gini importance, most important first\n",
    "\n",
    "gini_importance = pd.DataFrame(rf.feature_importances_, index=X_names)\n",
    "gini_importance.sort_values(by=0, ascending=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
BRSTVD0.0
SN0.0
MG0.0
MN0.0
NA0.0
NI0.0
PB0.0
SI0.0
ZN0.0
ISO 4406 large0.0
LI0.0
TAN0.0
TBN0.0
VIS400.0
VIS990.0
VLAMCC0.0
FE0.0
CU0.0
CR0.0
CA0.0
BA0.0
AL0.0
V0.0
P0.0
LNF-UNC0.0
LNF-SSW0.0
LNF-NMW0.0
LNF-FW0.0
LNF-FIB0.0
LNF-CUT0.0
LNF-ROET0.0
ISO 4406 small0.0
ISO 4406 medium0.0
WATER0.0
\n", "
" ],
      "text/plain": [
       " 0\n",
       "BRSTVD 0.0\n",
       "SN 0.0\n",
       "MG 0.0\n",
       "MN 0.0\n",
       "NA 0.0\n",
       "NI 0.0\n",
       "PB 0.0\n",
       "SI 0.0\n",
       "ZN 0.0\n",
       "ISO 4406 large 0.0\n",
       "LI 0.0\n",
       "TAN 0.0\n",
       "TBN 0.0\n",
       "VIS40 0.0\n",
       "VIS99 0.0\n",
       "VLAMCC 0.0\n",
       "FE 0.0\n",
       "CU 0.0\n",
       "CR 0.0\n",
       "CA 0.0\n",
       "BA 0.0\n",
       "AL 0.0\n",
       "V 0.0\n",
       "P 0.0\n",
       "LNF-UNC 0.0\n",
       "LNF-SSW 0.0\n",
       "LNF-NMW 0.0\n",
       "LNF-FW 0.0\n",
       "LNF-FIB 0.0\n",
       "LNF-CUT 0.0\n",
       "LNF-ROET 0.0\n",
       "ISO 4406 small 0.0\n",
       "ISO 4406 medium 0.0\n",
       "WATER 0.0"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# rank the explanatory variables by permutation importance, measured on the validation set\n",
    "# random_state is fixed so that the 30 shuffles per variable are the same on every run\n",
    "result = permutation_importance(rf, X_test, y_test, n_repeats=30, random_state=42, n_jobs=2)\n",
    "\n",
    "pd.DataFrame(result.importances_mean, index=X_names).sort_values(0,ascending=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save the rows of the validation set together with the predicted labels in an Excel file\n",
    "\n",
    "dum = df.iloc[idx_test,:].copy()\n",
    "# store the model output, not y_test: the observed label is already present as the \"Age_New\" column\n",
    "# predictions[i] belongs to row idx_test[i], because X_test and idx_test come from the same split\n",
    "dum[\"Prediction of -Age_New-\"] = predictions\n",
    "# relative path: the file is written to the current working directory instead of a machine-specific folder\n",
    "dum.to_excel(\"outputRF_testset.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "toc-autonumbering": false,
  "toc-showcode": true,
  "toc-showmarkdowntxt": true
 },
 "nbformat": 4,
 "nbformat_minor": 4
}