{ "cells": [ { "cell_type": "markdown", "id": "ee702cec", "metadata": {}, "source": [ "# Sequential Ensemble Model (Voting, Stacking generalization) of Precipitation Downscaling" ] }, { "cell_type": "markdown", "id": "9299c596", "metadata": {}, "source": [ "### This notebook investigates the model performance of individual estimators and of an ensemble approach. Precipitation data (processed) from DWD and a climate dataset (processed and standardized) from ERA5 are used. \n", "\n", "### 1. Recursive feature selection is used to tune the predictors and select the optimized variables. \n", "\n", "### 2. Different estimators are established using sklearn and tensorflow for the dense models. (I used the sklearn wrapper in keras.utils to make both models compatible) ---level 0\n", "\n", "### 3. Ensemble models (voting regressor and stacking regressor) with different final estimators are tested to evaluate their performance on precipitation data --level 1\n", "\n", "### 4. Prediction and visualization \n", "\n", "### 5. 
Model performance evaluation\n", "\n", "### Note: This example adopts climate data (precipitation), therefore the focus of the approach is tailored towards regression using advanced models.\n", "\n", "#### However, it can be adapted for classification problems or even improved with complex networks like CNNs\n", "\n", "### Next: Use future projection data to feed the model for future local predictions" ] }, { "cell_type": "code", "execution_count": 55, "id": "03714017", "metadata": {}, "outputs": [], "source": [ "#@dboateng (13.01.2022)\n", "\n", "#importing models\n", "import numpy as np\n", "import pandas as pd \n", "import matplotlib.pyplot as plt \n", "from sklearn.feature_selection import RFECV\n", "from sklearn.model_selection import TimeSeriesSplit\n", "\n", "from sklearn.linear_model import LassoCV, RidgeCV, BayesianRidge, ARDRegression, GammaRegressor, LassoLarsCV, PoissonRegressor\n", "from sklearn.ensemble import BaggingRegressor, VotingRegressor, GradientBoostingRegressor, StackingRegressor\n", "from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, RandomForestRegressor, HistGradientBoostingRegressor\n", "from sklearn.svm import SVR\n", "from sklearn.neural_network import MLPRegressor, BernoulliRBM\n", "from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor \n", "\n", "from sklearn.metrics import mean_squared_error, accuracy_score, r2_score, explained_variance_score, max_error\n", "from sklearn.metrics import mean_absolute_error, mean_squared_log_error, mean_absolute_percentage_error\n", "\n", "import tensorflow as tf\n", "import tensorflow.keras as keras \n", "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import Dense, Dropout\n", "from keras.metrics import RootMeanSquaredError\n", "\n", "from xgboost import XGBRegressor\n", "\n", "from sklearn import set_config" ] }, { "cell_type": "code", "execution_count": 20, "id": "4d4df0ca", "metadata": {}, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ "(744, 22) (744,)\n" ] }, { "data": { "text/html": [ "
\n", " | t2m | \n", "msl | \n", "v10 | \n", "u10 | \n", "z500 | \n", "z850 | \n", "tp | \n", "q500 | \n", "q850 | \n", "t500 | \n", "... | \n", "vo500 | \n", "vo850 | \n", "pv500 | \n", "pv850 | \n", "u500 | \n", "u850 | \n", "v500 | \n", "v850 | \n", "d2m | \n", "Precipitation | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
time | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
1958-01-01 | \n", "-0.599361 | \n", "-349.782636 | \n", "0.057458 | \n", "0.030403 | \n", "-341.345388 | \n", "-287.186476 | \n", "0.000917 | \n", "-0.000109 | \n", "0.000011 | \n", "-0.024671 | \n", "... | \n", "0.000002 | \n", "-0.000004 | \n", "1.244580e-07 | \n", "-6.846847e-08 | \n", "-0.218904 | \n", "0.298243 | \n", "0.319975 | \n", "1.885623 | \n", "-0.739180 | \n", "82.3 | \n", "
1958-02-01 | \n", "1.573796 | \n", "-493.167969 | \n", "0.659133 | \n", "1.702194 | \n", "-131.800592 | \n", "-299.689910 | \n", "0.003127 | \n", "0.000399 | \n", "0.000109 | \n", "1.271700 | \n", "... | \n", "-0.000016 | \n", "-0.000014 | \n", "-2.016244e-07 | \n", "-5.969440e-08 | \n", "7.091857 | \n", "11.915790 | \n", "0.032295 | \n", "-0.834710 | \n", "1.693005 | \n", "179.4 | \n", "
1958-03-01 | \n", "-4.403323 | \n", "-461.871850 | \n", "-0.570981 | \n", "-0.675173 | \n", "-1185.306200 | \n", "-586.329590 | \n", "-0.000708 | \n", "-0.000709 | \n", "-0.000149 | \n", "-3.920263 | \n", "... | \n", "0.000008 | \n", "0.000024 | \n", "3.843178e-08 | \n", "1.189981e-07 | \n", "-2.767158 | \n", "0.286086 | \n", "-1.069146 | \n", "-3.006988 | \n", "-3.907467 | \n", "27.6 | \n", "
1958-04-01 | \n", "-2.771583 | \n", "27.638861 | \n", "-0.502666 | \n", "-0.029506 | \n", "-529.931263 | \n", "-140.020177 | \n", "0.000243 | \n", "-0.000413 | \n", "-0.000057 | \n", "-3.140417 | \n", "... | \n", "0.000005 | \n", "0.000009 | \n", "-7.566521e-09 | \n", "4.726872e-08 | \n", "-0.809763 | \n", "-1.252618 | \n", "-1.183645 | \n", "-1.840023 | \n", "-1.817981 | \n", "62.5 | \n", "
1958-05-01 | \n", "1.452935 | \n", "75.049647 | \n", "0.493083 | \n", "0.678390 | \n", "470.619645 | \n", "135.861848 | \n", "-0.000161 | \n", "0.000497 | \n", "0.000142 | \n", "1.807232 | \n", "... | \n", "-0.000014 | \n", "-0.000017 | \n", "5.571874e-08 | \n", "-2.804966e-08 | \n", "3.167779 | \n", "6.612231 | \n", "1.108756 | \n", "3.515472 | \n", "1.917891 | \n", "77.2 | \n", "
5 rows × 23 columns
\n", "StackingRegressor(estimators=[('lassoCV',\n", " LassoCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),\n", " selection='random')),\n", " ('ARD', ARDRegression()),\n", " ('Gamma', GammaRegressor()),\n", " ('MLP',\n", " MLPRegressor(max_iter=1000, random_state=42)),\n", " ('RandomForest',\n", " RandomForestRegressor(random_state=42)),\n", " ('Dense',\n", " <keras.wrappers.scikit_learn.KerasRegressor object at 0x0000026486D4B2B0>),\n", " ('Lars',\n", " LassoLarsCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)))],\n", " final_estimator=RandomForestRegressor(random_state=42))Please rerun this cell to show the HTML repr or trust the notebook.
StackingRegressor(estimators=[('lassoCV',\n", " LassoCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),\n", " selection='random')),\n", " ('ARD', ARDRegression()),\n", " ('Gamma', GammaRegressor()),\n", " ('MLP',\n", " MLPRegressor(max_iter=1000, random_state=42)),\n", " ('RandomForest',\n", " RandomForestRegressor(random_state=42)),\n", " ('Dense',\n", " <keras.wrappers.scikit_learn.KerasRegressor object at 0x0000026486D4B2B0>),\n", " ('Lars',\n", " LassoLarsCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)))],\n", " final_estimator=RandomForestRegressor(random_state=42))
LassoCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),\n", " selection='random')
ARDRegression()
GammaRegressor()
MLPRegressor(max_iter=1000, random_state=42)
RandomForestRegressor(random_state=42)
<keras.wrappers.scikit_learn.KerasRegressor object at 0x0000026486D4B2B0>
LassoLarsCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None))
RandomForestRegressor(random_state=42)