{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Now that we have gone through a manual process of modeling our dataset, let's see if we can replicate this using an Automated workflow. As a reminder, our plan of action was as follows:\n", "\n", "1. Perform EDA on the dataset to extract valuable insight about the process generating the time series **(COMPLETED)**.\n", "2. Build a baseline model (univariable model without exogenous variables) for benchmarking purposes **(COMPLETED)**.\n", "3. Build a univariate model with all exogenous variables to check best possible performance **(COMPLETED)**.\n", "4. Evaluate the model with exogenous variables and discuss any potential issues **(COMPLETED)**.\n", "5. Overcome issues identified above **(COMPLETED)**.\n", "6. Make future predictions with the best model **(COMPLETED)**.\n", "7. Replicate flow with Automated Time Series Modeling (AutoML) **(Covered in this notebook)**" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Only enable critical logging (Optional)\n", "import os\n", "os.environ[\"PYCARET_CUSTOM_LOGGING_LEVEL\"] = \"CRITICAL\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "System:\n", " python: 3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]\n", "executable: C:\\Users\\Nikhil\\.conda\\envs\\pycaret_dev_sktime_0p11_2\\python.exe\n", " machine: Windows-10-10.0.19044-SP0\n", "\n", "PyCaret required dependencies:\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Nikhil\\.conda\\envs\\pycaret_dev_sktime_0p11_2\\lib\\site-packages\\_distutils_hack\\__init__.py:30: UserWarning: Setuptools is replacing distutils.\n", " warnings.warn(\"Setuptools is replacing distutils.\")\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " pip: 21.2.2\n", " setuptools: 61.2.0\n", " pycaret: 3.0.0\n", " ipython: Not installed\n", " ipywidgets: 7.7.0\n", " numpy: 1.21.6\n", " pandas: 1.4.2\n", " jinja2: 3.1.2\n", " scipy: 1.8.0\n", " joblib: 1.1.0\n", " sklearn: 1.0.2\n", " pyod: Installed but version unavailable\n", " imblearn: 0.9.0\n", " category_encoders: 2.4.1\n", " lightgbm: 3.3.2\n", " numba: 0.55.1\n", " requests: 2.27.1\n", " matplotlib: 3.5.2\n", " scikitplot: 0.3.7\n", " yellowbrick: 1.4\n", " plotly: 5.8.0\n", " kaleido: 0.2.1\n", " statsmodels: 0.13.2\n", " sktime: 0.11.4\n", " tbats: Installed but version unavailable\n", " pmdarima: 1.8.5\n", "\n", "PyCaret optional dependencies:\n", " shap: Not installed\n", " interpret: Not installed\n", " umap: Not installed\n", " pandas_profiling: Not installed\n", " explainerdashboard: Not installed\n", " autoviz: Not installed\n", " fairlearn: Not installed\n", " xgboost: Not installed\n", " catboost: Not installed\n", " kmodes: Not installed\n", " mlxtend: Not installed\n", " statsforecast: 0.5.5\n", " tune_sklearn: Not installed\n", " ray: Not installed\n", " hyperopt: Not installed\n", " optuna: Not installed\n", " skopt: Not installed\n", " mlflow: 1.25.1\n", " gradio: Not installed\n", " fastapi: Not installed\n", " uvicorn: Not installed\n", " m2cgen: Not installed\n", " evidently: Not installed\n", " nltk: Not installed\n", " pyLDAvis: Not installed\n", " gensim: Not installed\n", " spacy: Not installed\n", " wordcloud: Not installed\n", " textblob: Not installed\n", " psutil: 5.9.0\n", " fugue: Not installed\n", " streamlit: Not installed\n", " prophet: Not installed\n" ] } ], "source": [ "def what_is_installed():\n", " from pycaret import show_versions\n", " show_versions()\n", "\n", "try:\n", " what_is_installed()\n", "except ModuleNotFoundError:\n", " !pip install pycaret\n", " what_is_installed()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from pycaret.datasets import get_data\n", "from pycaret.time_series import TSForecastingExperiment" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Global Figure Settings for notebook ----\n", "global_fig_settings = {\"renderer\": \"notebook\", \"width\": 1000, \"height\": 600}" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CO(GT)NOx(GT)PT08.S3(NOx)RH
index
2004-03-10 18:00:002.6166.01056.048.9
2004-03-10 19:00:002.0103.01174.047.7
2004-03-10 20:00:002.2131.01140.054.0
2004-03-10 21:00:002.2172.01092.060.0
2004-03-10 22:00:001.6131.01205.059.6
\n", "
" ], "text/plain": [ " CO(GT) NOx(GT) PT08.S3(NOx) RH\n", "index \n", "2004-03-10 18:00:00 2.6 166.0 1056.0 48.9\n", "2004-03-10 19:00:00 2.0 103.0 1174.0 47.7\n", "2004-03-10 20:00:00 2.2 131.0 1140.0 54.0\n", "2004-03-10 21:00:00 2.2 172.0 1092.0 60.0\n", "2004-03-10 22:00:00 1.6 131.0 1205.0 59.6" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = get_data(\"airquality\", verbose=False)\n", "data[\"index\"] = pd.to_datetime(data[\"Date\"] + \" \" + data[\"Time\"])\n", "data.drop(columns=[\"Date\", \"Time\"], inplace=True)\n", "data.replace(-200, np.nan, inplace=True)\n", "data.set_index(\"index\", inplace=True)\n", "\n", "target = \"CO(GT)\"\n", "exog_vars = ['NOx(GT)', 'PT08.S3(NOx)', 'RH']\n", "include = [target] + exog_vars\n", "data = data[include]\n", "data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Step 7: AutoML" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "FH=48\n", "metric = \"mase\"\n", "exclude = [\"auto_arima\", \"bats\", \"tbats\", \"lar_cds_dt\", \"par_cds_dt\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Step 7A: Univariate AutoML with and without Exogenous Variables" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
 DescriptionValue
0session_id42
1TargetCO(GT)
2ApproachUnivariate
3Exogenous VariablesPresent
4Original data shape(9357, 4)
5Transformed data shape(9357, 4)
6Transformed train set shape(9309, 4)
7Transformed test set shape(48, 4)
8Rows with missing values25.8%
9Fold GeneratorExpandingWindowSplitter
10Fold Number3
11Enforce Prediction IntervalFalse
12Seasonal Period(s) Tested24
13Seasonality PresentTrue
14Seasonalities Detected[24]
15Primary Seasonality24
16Target Strictly PositiveTrue
17Target White NoiseNo
18Recommended d1
19Recommended Seasonal D0
20PreprocessTrue
21Numerical Imputation (Target)ffill
22Transformation (Target)None
23Scaling (Target)None
24Numerical Imputation (Exogenous)ffill
25Transformation (Exogenous)None
26Scaling (Exogenous)None
27CPU Jobs-1
28Use GPUFalse
29Log ExperimentFalse
30Experiment Namets-default-name
31USIaa6b
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "exp_auto = TSForecastingExperiment()\n", "\n", "# enforce_exogenous=False --> Use multivariate forecasting when model supports it, else use univariate forecasting\n", "exp_auto.setup(\n", " data=data, target=target, fh=FH, enforce_exogenous=False,\n", " numeric_imputation_target=\"ffill\", numeric_imputation_exogenous=\"ffill\",\n", " fig_kwargs=global_fig_settings, session_id=42\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# # Check available models ----\n", "# exp_auto_noexo.models()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
 ModelMASERMSSEMAERMSEMAPESMAPER2TT (Sec)
arimaARIMA0.19630.17580.16740.23010.13800.15120.865224.1733
br_cds_dtBayesian Ridge w/ Cond. Deseasonalize & Detrending0.55760.48410.47560.63380.36830.2969-0.01348.9533
ridge_cds_dtRidge w/ Cond. Deseasonalize & Detrending0.56390.48980.48090.64130.37250.2997-0.03678.8033
lr_cds_dtLinear w/ Cond. Deseasonalize & Detrending0.56400.48990.48110.64140.37260.2998-0.03729.2867
omp_cds_dtOrthogonal Matching Pursuit w/ Cond. Deseasonalize & Detrending0.63730.56160.54350.73530.42230.3325-0.40629.1567
exp_smoothExponential Smoothing0.67290.61500.57420.80550.38130.3681-0.02784.5233
thetaTheta Forecaster0.70120.58880.59860.77140.40090.40670.14675.2033
rf_cds_dtRandom Forest w/ Cond. Deseasonalize & Detrending0.71310.58030.60820.75990.48870.3689-0.289625.8700
gbr_cds_dtGradient Boosting w/ Cond. Deseasonalize & Detrending0.71440.57980.60920.75900.51950.3738-0.534218.5533
lightgbm_cds_dtLight Gradient Boosting w/ Cond. Deseasonalize & Detrending0.77320.61650.65940.80720.54270.3930-0.32969.8100
snaiveSeasonal Naive Forecaster0.81750.78470.69721.02750.46450.3643-1.86165.9833
et_cds_dtExtra Trees w/ Cond. Deseasonalize & Detrending0.84100.67290.71730.88100.59270.4188-1.051514.6700
huber_cds_dtHuber w/ Cond. Deseasonalize & Detrending0.86750.66340.74000.86860.61350.4357-0.93049.2500
en_cds_dtElastic Net w/ Cond. Deseasonalize & Detrending0.87200.67120.74380.87890.60330.4312-1.01728.6667
lasso_cds_dtLasso w/ Cond. Deseasonalize & Detrending0.87640.67880.74750.88880.59340.4267-1.06968.9633
naiveNaive Forecaster0.92120.75540.78610.98950.61250.5160-0.37847.1367
crostonCroston0.93370.74640.79660.97750.77440.5053-0.64743.7600
llar_cds_dtLasso Least Angular Regressor w/ Cond. Deseasonalize & Detrending1.05510.77350.89991.01270.83150.5344-2.31128.7700
grand_meansGrand Means Forecaster1.13320.83800.96661.09731.02690.5836-1.87076.4433
knn_cds_dtK Neighbors w/ Cond. Deseasonalize & Detrending1.15070.89520.98141.17210.80980.5266-3.02019.0300
dt_cds_dtDecision Tree w/ Cond. Deseasonalize & Detrending1.21281.08151.03461.41630.87800.5389-2.88799.5533
polytrendPolynomial Trend Forecaster1.26650.91681.08031.20031.17410.6261-2.66104.0433
etsETS1.87341.56721.59852.05281.08190.7912-4.81387.0567
ada_cds_dtAdaBoost w/ Cond. Deseasonalize & Detrending1.97461.40701.68391.84181.71050.7690-10.971012.0200
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Include slower models like Prophet (turbo=False), but exclude some specific models ----\n", "best = exp_auto.compare_models(sort=metric, turbo=False, exclude=exclude)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "exp_auto.plot_model(best)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "final_auto_model = exp_auto.finalize_model(best)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def safe_predict(exp, model):\n", " \"\"\"Prediction wrapper for demo purposes.\"\"\"\n", " try: \n", " future_preds = exp.predict_model(model)\n", " except ValueError as exception:\n", " print(exception)\n", " exo_vars = exp.exogenous_variables\n", " print(f\"{len(exo_vars)} exogenous variables (X) needed in order to make future predictions:\\n{exo_vars}\")\n", " \n", " \n", " exog_exps = []\n", " exog_models = []\n", " for exog_var in exog_vars:\n", " exog_exp = TSForecastingExperiment()\n", " exog_exp.setup(\n", " data=data[exog_var], fh=FH,\n", " numeric_imputation_target=\"ffill\", numeric_imputation_exogenous=\"ffill\",\n", " fig_kwargs=global_fig_settings, session_id=42\n", " )\n", "\n", " # Users can customize how to model future exogenous variables i.e. add\n", " # more steps and models to potentially get better models at the expense\n", " # of higher modeling time.\n", " best = exog_exp.compare_models(\n", " sort=metric, include=[\"arima\", \"ets\", \"exp_smooth\", \"theta\", \"lightgbm_cds_dt\",] \n", " )\n", " final_exog_model = exog_exp.finalize_model(best)\n", "\n", " exog_exps.append(exog_exp)\n", " exog_models.append(final_exog_model)\n", "\n", " # Step 2: Get future predictions for exog variables ----\n", " future_exog = [\n", " exog_exp.predict_model(exog_model)\n", " for exog_exp, exog_model in zip(exog_exps, exog_models)\n", " ]\n", " future_exog = pd.concat(future_exog, axis=1)\n", " future_exog.columns = exog_vars\n", " \n", " future_preds = exp.predict_model(model, X=future_exog)\n", " \n", " return future_preds " ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
 ModelMASERMSSEMAERMSEMAPESMAPER2TT (Sec)
lightgbm_cds_dtLight Gradient Boosting w/ Cond. Deseasonalize & Detrending1.19901.048611.083613.58790.28620.2304-0.95025.1967
exp_smoothExponential Smoothing1.48901.190413.751015.41400.30370.2478-0.42483.1167
arimaARIMA1.56571.369314.469617.74120.35390.2739-2.11913.7867
thetaTheta Forecaster1.91021.490217.642019.29580.37130.2986-1.26293.2633
etsETS3.45042.655731.865234.38390.62950.4585-6.173310.4200
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "future_preds = safe_predict(exp_auto, final_auto_model)\n", "future_preds.plot()" ] } ], "metadata": { "interpreter": { "hash": "c161a91f6f4623a54f30c5492a42e7cf0592610fb90c8abd312086f09f8fbe0f" }, "kernelspec": { "display_name": "pycaret_sktime_0p11_2", "language": "python", "name": "pycaret_sktime_0p11_2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 2 }