{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Time Series Features with tsfresh Tutorial" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook explains how to create time series features with `tsfresh`.\n", "\n", "This notebook will use the [Beijing Multi-Site Air-Quality Data](https://archive.ics.uci.edu/ml/datasets/Beijing+Multi-Site+Air-Quality+Data) downloaded from the **UCI Machine Learning Repository**." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Packages" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The documentation for each package used in this tutorial is linked below:\n", "* [pandas](https://pandas.pydata.org/docs/)\n", "* [tsfresh](https://tsfresh.readthedocs.io/en/latest/)\n", "* [urllib](https://docs.python.org/3/library/urllib.html)\n", "* [io](https://docs.python.org/3/library/io.html)\n", "* [zipfile](https://docs.python.org/3/library/zipfile.html)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import tsfresh\n", "from urllib.request import urlopen\n", "from io import BytesIO\n", "from zipfile import ZipFile" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create initial dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The zipfile is downloaded from **UCI Machine Learning Repository** using `urllib` and unzipped with `zipfile`. This zipfile contains one csv for each reporting station. Read each of these csv files and append to the pandas dataframe." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearmonthdayhourPM2.5PM10SO2NO2COO3TEMPPRESDEWPRAINwdWSPMstationtimestamp
020133104.04.04.07.0300.077.0-0.71023.0-18.80.0NNW4.4Aotizhongxin2013-03-01
020133103.06.013.07.0300.085.0-2.31020.8-19.70.0E0.5Changping2013-03-01
020133104.04.03.0NaN200.082.0-2.31020.8-19.70.0E0.5Dingling2013-03-01
020133109.09.03.017.0300.089.0-0.51024.5-21.40.0NNW5.7Dongsi2013-03-01
020133104.04.014.020.0300.069.0-0.71023.0-18.80.0NNW4.4Guanyuan2013-03-01
020133106.018.05.0NaN800.088.00.11021.1-18.60.0NW4.4Gucheng2013-03-01
020133107.07.03.02.0100.091.0-2.31020.3-20.70.0WNW3.1Huairou2013-03-01
020133105.014.04.012.0200.085.0-0.51024.5-21.40.0NNW5.7Nongzhanguan2013-03-01
020133103.06.03.08.0300.044.0-0.91025.8-20.50.0NW9.3Shunyi2013-03-01
020133106.06.04.08.0300.081.0-0.51024.5-21.40.0NNW5.7Tiantan2013-03-01
\n", "
" ], "text/plain": [ " year month day hour PM2.5 PM10 SO2 NO2 CO O3 TEMP PRES \\\n", "0 2013 3 1 0 4.0 4.0 4.0 7.0 300.0 77.0 -0.7 1023.0 \n", "0 2013 3 1 0 3.0 6.0 13.0 7.0 300.0 85.0 -2.3 1020.8 \n", "0 2013 3 1 0 4.0 4.0 3.0 NaN 200.0 82.0 -2.3 1020.8 \n", "0 2013 3 1 0 9.0 9.0 3.0 17.0 300.0 89.0 -0.5 1024.5 \n", "0 2013 3 1 0 4.0 4.0 14.0 20.0 300.0 69.0 -0.7 1023.0 \n", "0 2013 3 1 0 6.0 18.0 5.0 NaN 800.0 88.0 0.1 1021.1 \n", "0 2013 3 1 0 7.0 7.0 3.0 2.0 100.0 91.0 -2.3 1020.3 \n", "0 2013 3 1 0 5.0 14.0 4.0 12.0 200.0 85.0 -0.5 1024.5 \n", "0 2013 3 1 0 3.0 6.0 3.0 8.0 300.0 44.0 -0.9 1025.8 \n", "0 2013 3 1 0 6.0 6.0 4.0 8.0 300.0 81.0 -0.5 1024.5 \n", "\n", " DEWP RAIN wd WSPM station timestamp \n", "0 -18.8 0.0 NNW 4.4 Aotizhongxin 2013-03-01 \n", "0 -19.7 0.0 E 0.5 Changping 2013-03-01 \n", "0 -19.7 0.0 E 0.5 Dingling 2013-03-01 \n", "0 -21.4 0.0 NNW 5.7 Dongsi 2013-03-01 \n", "0 -18.8 0.0 NNW 4.4 Guanyuan 2013-03-01 \n", "0 -18.6 0.0 NW 4.4 Gucheng 2013-03-01 \n", "0 -20.7 0.0 WNW 3.1 Huairou 2013-03-01 \n", "0 -21.4 0.0 NNW 5.7 Nongzhanguan 2013-03-01 \n", "0 -20.5 0.0 NW 9.3 Shunyi 2013-03-01 \n", "0 -21.4 0.0 NNW 5.7 Tiantan 2013-03-01 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00501/PRSA2017_Data_20130301-20170228.zip\"\n", "r = urlopen(url)\n", "zf = ZipFile(BytesIO(r.read()))\n", "\n", "df = pd.DataFrame()\n", "for file in zf.infolist():\n", " if file.filename.endswith('.csv'):\n", " df = df.append(pd.read_csv(zf.open(file)))\n", "\n", "df['timestamp'] = pd.to_datetime(df[[\"year\", \"month\", \"day\", \"hour\"]])\n", "df.drop(columns=['No'], inplace=True)\n", "df.sort_values(by=['timestamp', 'station']).head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`tsfresh` doesn't handle missing value well, so check for missing values." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "year 0\n", "month 0\n", "day 0\n", "hour 0\n", "PM2.5 8739\n", "PM10 6449\n", "SO2 9021\n", "NO2 12116\n", "CO 20701\n", "O3 13277\n", "TEMP 398\n", "PRES 393\n", "DEWP 403\n", "RAIN 390\n", "wd 1822\n", "WSPM 318\n", "station 0\n", "timestamp 0\n", "dtype: int64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As this is hourly time series, replace missing values by the previous value." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df.fillna(method='ffill', inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`tsfresh` computes a large number of features by default. For the purposes of this tutorial, limit the data to just one month for three stations." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "df2014 = df[df.timestamp.between(\"2014-03-01\", \"2014-04-01\")]\n", "df2014_limited = df2014[df2014.station.isin(['Dongsi', 'Wanliu', 'Shunyi'])]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Time series features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Remove categorical features as `tsfresh` doesn't process this type of feature. They can be added back in before modeling if necessary." 
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "ts_df = df2014_limited.drop(columns=['year', 'month', 'day', 'hour', 'wd'])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Feature Extraction: 100%|██████████| 17/17 [00:05<00:00, 3.36it/s]\n" ] }, { "data": { "text/plain": [ "Index(['CO__variance_larger_than_standard_deviation', 'CO__has_duplicate_max',\n", " 'CO__has_duplicate_min', 'CO__has_duplicate', 'CO__sum_values',\n", " 'CO__abs_energy', 'CO__mean_abs_change', 'CO__mean_change',\n", " 'CO__mean_second_derivative_central', 'CO__median',\n", " ...\n", " 'WSPM__permutation_entropy__dimension_5__tau_1',\n", " 'WSPM__permutation_entropy__dimension_6__tau_1',\n", " 'WSPM__permutation_entropy__dimension_7__tau_1',\n", " 'WSPM__query_similarity_count__query_None__threshold_0.0',\n", " 'WSPM__matrix_profile__feature_\"min\"__threshold_0.98',\n", " 'WSPM__matrix_profile__feature_\"max\"__threshold_0.98',\n", " 'WSPM__matrix_profile__feature_\"mean\"__threshold_0.98',\n", " 'WSPM__matrix_profile__feature_\"median\"__threshold_0.98',\n", " 'WSPM__matrix_profile__feature_\"25\"__threshold_0.98',\n", " 'WSPM__matrix_profile__feature_\"75\"__threshold_0.98'],\n", " dtype='object', length=8657)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_features = tsfresh.extract_features(ts_df, column_id='station', column_sort='timestamp')\n", "df_features.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Control features created" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`tsfresh` allows control over what features are created. `tsfresh` supports several methods to determine this list: `tsfresh.feature_extraction.ComprehensiveFCParameters` (the default value) includes all features with common parameters, `tsfresh.feature_extraction.MinimalFCParameters` includes a small number of easily calculated features, `tsfresh.feature_extraction.EfficientFCParameters` drops high computational cost features from the comprehensive list." 
] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Feature Extraction: 100%|██████████| 17/17 [00:00<00:00, 805.99it/s]\n" ] }, { "data": { "text/plain": [ "Index(['PM2.5__sum_values', 'PM2.5__median', 'PM2.5__mean', 'PM2.5__length',\n", " 'PM2.5__standard_deviation', 'PM2.5__variance',\n", " 'PM2.5__root_mean_square', 'PM2.5__maximum', 'PM2.5__minimum',\n", " 'PM10__sum_values', 'PM10__median', 'PM10__mean', 'PM10__length',\n", " 'PM10__standard_deviation', 'PM10__variance', 'PM10__root_mean_square',\n", " 'PM10__maximum', 'PM10__minimum', 'DEWP__sum_values', 'DEWP__median',\n", " 'DEWP__mean', 'DEWP__length', 'DEWP__standard_deviation',\n", " 'DEWP__variance', 'DEWP__root_mean_square', 'DEWP__maximum',\n", " 'DEWP__minimum', 'RAIN__sum_values', 'RAIN__median', 'RAIN__mean',\n", " 'RAIN__length', 'RAIN__standard_deviation', 'RAIN__variance',\n", " 'RAIN__root_mean_square', 'RAIN__maximum', 'RAIN__minimum',\n", " 'SO2__sum_values', 'SO2__median', 'SO2__mean', 'SO2__length',\n", " 'SO2__standard_deviation', 'SO2__variance', 'SO2__root_mean_square',\n", " 'SO2__maximum', 'SO2__minimum', 'NO2__sum_values', 'NO2__median',\n", " 'NO2__mean', 'NO2__length', 'NO2__standard_deviation', 'NO2__variance',\n", " 'NO2__root_mean_square', 'NO2__maximum', 'NO2__minimum',\n", " 'CO__sum_values', 'CO__median', 'CO__mean', 'CO__length',\n", " 'CO__standard_deviation', 'CO__variance', 'CO__root_mean_square',\n", " 'CO__maximum', 'CO__minimum', 'O3__sum_values', 'O3__median',\n", " 'O3__mean', 'O3__length', 'O3__standard_deviation', 'O3__variance',\n", " 'O3__root_mean_square', 'O3__maximum', 'O3__minimum',\n", " 'WSPM__sum_values', 'WSPM__median', 'WSPM__mean', 'WSPM__length',\n", " 'WSPM__standard_deviation', 'WSPM__variance', 'WSPM__root_mean_square',\n", " 'WSPM__maximum', 'WSPM__minimum', 'TEMP__sum_values', 'TEMP__median',\n", " 'TEMP__mean', 'TEMP__length', 'TEMP__standard_deviation',\n", " 'TEMP__variance', 'TEMP__root_mean_square', 'TEMP__maximum',\n", " 'TEMP__minimum', 'PRES__sum_values', 'PRES__median', 'PRES__mean',\n", " 'PRES__length', 'PRES__standard_deviation', 'PRES__variance',\n", " 'PRES__root_mean_square', 'PRES__maximum', 'PRES__minimum'],\n", " dtype='object')" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_features = tsfresh.extract_features(ts_df, column_id='station', column_sort='timestamp', \n", " default_fc_parameters=tsfresh.feature_extraction.MinimalFCParameters())\n", "df_features.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A dictionary of features and settings can also be created to control the features created." 
] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "fc_settings = {'variance_larger_than_standard_deviation': None,\n", " 'has_duplicate_max': None,\n", " 'has_duplicate_min': None,\n", " 'has_duplicate': None,\n", " 'sum_values': None,\n", " 'abs_energy': None,\n", " 'mean_abs_change': None,\n", " 'mean_change': None,\n", " 'mean_second_derivative_central': None,\n", " 'median': None,\n", " 'mean': None,\n", " 'length': None,\n", " 'standard_deviation': None,\n", " 'variation_coefficient': None,\n", " 'variance': None,\n", " 'skewness': None,\n", " 'kurtosis': None,\n", " 'root_mean_square': None,\n", " 'absolute_sum_of_changes': None,\n", " 'longest_strike_below_mean': None,\n", " 'longest_strike_above_mean': None,\n", " 'count_above_mean': None,\n", " 'count_below_mean': None,\n", " 'last_location_of_maximum': None,\n", " 'first_location_of_maximum': None,\n", " 'last_location_of_minimum': None,\n", " 'first_location_of_minimum': None,\n", " 'percentage_of_reoccurring_values_to_all_values': None,\n", " 'percentage_of_reoccurring_datapoints_to_all_datapoints': None,\n", " 'sum_of_reoccurring_values': None,\n", " 'sum_of_reoccurring_data_points': None,\n", " 'ratio_value_number_to_time_series_length': None,\n", " 'maximum': None,\n", " 'minimum': None,\n", " 'benford_correlation': None,\n", " 'time_reversal_asymmetry_statistic': [{'lag': 1}, {'lag': 2}, {'lag': 3}],\n", " 'c3': [{'lag': 1}, {'lag': 2}, {'lag': 3}],\n", " 'cid_ce': [{'normalize': True}, {'normalize': False}],\n", " 'symmetry_looking': [{'r': 0.0},\n", " {'r': 0.1},\n", " {'r': 0.2},\n", " {'r': 0.30000000000000004},\n", " {'r': 0.4},\n", " {'r': 0.5}],\n", " 'large_standard_deviation': [{'r': 0.5},\n", " {'r': 0.75},\n", " {'r': 0.9500000000000001}],\n", " 'quantile': [{'q': 0.1},\n", " {'q': 0.2},\n", " {'q': 0.3},\n", " {'q': 0.4},\n", " {'q': 0.6},\n", " {'q': 0.7},\n", " {'q': 0.8},\n", " {'q': 0.9}],\n", " 'autocorrelation': [{'lag': 0},\n", " {'lag': 1},\n", " {'lag': 2},\n", " {'lag': 3},\n", " {'lag': 4},\n", " {'lag': 5},\n", " {'lag': 6},\n", " {'lag': 7},\n", " {'lag': 8},\n", " {'lag': 9}],\n", " 'agg_autocorrelation': [{'f_agg': 'mean', 'maxlag': 40},\n", " {'f_agg': 'median', 'maxlag': 40},\n", " {'f_agg': 'var', 'maxlag': 40}],\n", " 'partial_autocorrelation': [{'lag': 0},\n", " {'lag': 1},\n", " {'lag': 2},\n", " {'lag': 3},\n", " {'lag': 4},\n", " {'lag': 5},\n", " {'lag': 6},\n", " {'lag': 7},\n", " {'lag': 8},\n", " {'lag': 9}],\n", " 'number_cwt_peaks': [{'n': 1}, {'n': 5}],\n", " 'number_peaks': [{'n': 1}, {'n': 3}, {'n': 5}, {'n': 10}, {'n': 50}],\n", " 'binned_entropy': [{'max_bins': 10}],\n", " 'index_mass_quantile': [{'q': 0.1},\n", " {'q': 0.2},\n", " {'q': 0.3},\n", " {'q': 0.4},\n", " {'q': 0.6},\n", " {'q': 0.7},\n", " {'q': 0.8},\n", " {'q': 0.9}],\n", " 'spkt_welch_density': [{'coeff': 2}, {'coeff': 5}, {'coeff': 8}],\n", " 'ar_coefficient': [{'coeff': 0, 'k': 10},\n", " {'coeff': 1, 'k': 10},\n", " {'coeff': 2, 'k': 10},\n", " {'coeff': 3, 'k': 10},\n", " {'coeff': 4, 'k': 10},\n", " {'coeff': 5, 'k': 10},\n", " {'coeff': 6, 'k': 10},\n", " {'coeff': 7, 'k': 10},\n", " {'coeff': 8, 'k': 10},\n", " {'coeff': 9, 'k': 10},\n", " {'coeff': 10, 'k': 10}],\n", " 'value_count': [{'value': 0}, {'value': 1}, {'value': -1}],\n", " 'range_count': [{'min': -1, 'max': 1}],\n", " 'linear_trend': [{'attr': 'pvalue'},\n", " {'attr': 'rvalue'},\n", " {'attr': 'intercept'},\n", " {'attr': 'slope'},\n", " {'attr': 'stderr'}],\n", " 'augmented_dickey_fuller': [{'attr': 
'teststat'},\n", " {'attr': 'pvalue'},\n", " {'attr': 'usedlag'}],\n", " 'number_crossing_m': [{'m': 0}, {'m': -1}, {'m': 1}],\n", " 'energy_ratio_by_chunks': [{'num_segments': 10, 'segment_focus': 0},\n", " {'num_segments': 10, 'segment_focus': 1},\n", " {'num_segments': 10, 'segment_focus': 2},\n", " {'num_segments': 10, 'segment_focus': 3},\n", " {'num_segments': 10, 'segment_focus': 4},\n", " {'num_segments': 10, 'segment_focus': 5},\n", " {'num_segments': 10, 'segment_focus': 6},\n", " {'num_segments': 10, 'segment_focus': 7},\n", " {'num_segments': 10, 'segment_focus': 8},\n", " {'num_segments': 10, 'segment_focus': 9}],\n", " 'ratio_beyond_r_sigma': [{'r': 0.5},\n", " {'r': 1},\n", " {'r': 1.5},\n", " {'r': 2},\n", " {'r': 2.5},\n", " {'r': 3},\n", " {'r': 5},\n", " {'r': 6},\n", " {'r': 7},\n", " {'r': 10}],\n", " 'linear_trend_timewise': [{'attr': 'pvalue'},\n", " {'attr': 'rvalue'},\n", " {'attr': 'intercept'},\n", " {'attr': 'slope'},\n", " {'attr': 'stderr'}],\n", " 'count_above': [{'t': 0}],\n", " 'count_below': [{'t': 0}],\n", " 'permutation_entropy': [{'tau': 1, 'dimension': 3},\n", " {'tau': 1, 'dimension': 4},\n", " {'tau': 1, 'dimension': 5},\n", " {'tau': 1, 'dimension': 6},\n", " {'tau': 1, 'dimension': 7}],\n", " 'query_similarity_count': [{'query': None, 'threshold': 0.0}]}" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Feature Extraction: 100%|██████████| 17/17 [00:01<00:00, 15.05it/s]\n" ] }, { "data": { "text/plain": [ "Index(['TEMP__variance_larger_than_standard_deviation',\n", " 'TEMP__has_duplicate_max', 'TEMP__has_duplicate_min',\n", " 'TEMP__has_duplicate', 'TEMP__sum_values', 'TEMP__abs_energy',\n", " 'TEMP__mean_abs_change', 'TEMP__mean_change',\n", " 'TEMP__mean_second_derivative_central', 'TEMP__median',\n", " ...\n", " 'WSPM__ratio_beyond_r_sigma__r_7', 'WSPM__ratio_beyond_r_sigma__r_10',\n", " 'WSPM__count_above__t_0', 'WSPM__count_below__t_0',\n", " 'WSPM__permutation_entropy__dimension_3__tau_1',\n", " 'WSPM__permutation_entropy__dimension_4__tau_1',\n", " 'WSPM__permutation_entropy__dimension_5__tau_1',\n", " 'WSPM__permutation_entropy__dimension_6__tau_1',\n", " 'WSPM__permutation_entropy__dimension_7__tau_1',\n", " 'WSPM__query_similarity_count__query_None__threshold_0.0'],\n", " dtype='object', length=1716)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_features = tsfresh.extract_features(ts_df, column_id='station', column_sort='timestamp', \n", " default_fc_parameters=fc_settings)\n", "df_features.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Time-series forecasting use case" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The above method rolls all time series data up into a single record per `column_id` (station in this case). For time series, this summarization often needs to be done at each timestamp and summarize the data from prior to the current timestamp. `roll_time_series` creates a dataframe that allows `tsfresh` to calculate the features at each timestamp correctly. We control the maximum window of the data with the parameter **max_timeshift**." 
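] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To make the rolling mechanics concrete, here is a small illustrative sketch on a toy series (the names `toy` and `toy_rolled` are placeholders). Each rolled window is identified by a tuple of the original id and the window's end timestamp, holds at most `max_timeshift` prior rows plus the current row, and very short leading windows are dropped according to `min_timeshift`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Toy illustration of roll_time_series on five hourly observations of one station\n", "toy = pd.DataFrame({\n", "    'station': ['A'] * 5,\n", "    'timestamp': pd.date_range('2014-03-01', periods=5, freq='H'),\n", "    'value': [1.0, 2.0, 3.0, 4.0, 5.0],\n", "})\n", "toy_rolled = tsfresh.utilities.dataframe_functions.roll_time_series(\n", "    toy, column_id='station', column_sort='timestamp', min_timeshift=2, max_timeshift=2)\n", "# Expect one window per timestamp with enough history, each holding up to 3 rows\n", "toy_rolled.groupby('id').size()"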
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Rolling: 100%|██████████| 20/20 [00:06<00:00, 3.29it/s]\n" ] } ], "source": [ "df_rolled = tsfresh.utilities.dataframe_functions.roll_time_series(df2014,\n", " column_id='station',\n", " column_sort='timestamp',\n", " min_timeshift=24,\n", " max_timeshift=24)\n", "df_rolled.drop(columns=['year', 'month', 'day', 'hour', 'wd', 'station'], inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now that the rolled dataframe has been created, `extract_features` can be run just as was done before" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Feature Extraction: 100%|██████████| 20/20 [00:16<00:00, 1.19it/s]\n" ] }, { "data": { "text/plain": [ "Index(['PM2.5__sum_values', 'PM2.5__median', 'PM2.5__mean', 'PM2.5__length',\n", " 'PM2.5__standard_deviation', 'PM2.5__variance',\n", " 'PM2.5__root_mean_square', 'PM2.5__maximum', 'PM2.5__minimum',\n", " 'PM10__sum_values', 'PM10__median', 'PM10__mean', 'PM10__length',\n", " 'PM10__standard_deviation', 'PM10__variance', 'PM10__root_mean_square',\n", " 'PM10__maximum', 'PM10__minimum', 'SO2__sum_values', 'SO2__median',\n", " 'SO2__mean', 'SO2__length', 'SO2__standard_deviation', 'SO2__variance',\n", " 'SO2__root_mean_square', 'SO2__maximum', 'SO2__minimum',\n", " 'NO2__sum_values', 'NO2__median', 'NO2__mean', 'NO2__length',\n", " 'NO2__standard_deviation', 'NO2__variance', 'NO2__root_mean_square',\n", " 'NO2__maximum', 'NO2__minimum', 'CO__sum_values', 'CO__median',\n", " 'CO__mean', 'CO__length', 'CO__standard_deviation', 'CO__variance',\n", " 'CO__root_mean_square', 'CO__maximum', 'CO__minimum', 'O3__sum_values',\n", " 'O3__median', 'O3__mean', 'O3__length', 'O3__standard_deviation',\n", " 'O3__variance', 'O3__root_mean_square', 'O3__maximum', 'O3__minimum',\n", " 'TEMP__sum_values', 'TEMP__median', 'TEMP__mean', 'TEMP__length',\n", " 'TEMP__standard_deviation', 'TEMP__variance', 'TEMP__root_mean_square',\n", " 'TEMP__maximum', 'TEMP__minimum', 'PRES__sum_values', 'PRES__median',\n", " 'PRES__mean', 'PRES__length', 'PRES__standard_deviation',\n", " 'PRES__variance', 'PRES__root_mean_square', 'PRES__maximum',\n", " 'PRES__minimum', 'DEWP__sum_values', 'DEWP__median', 'DEWP__mean',\n", " 'DEWP__length', 'DEWP__standard_deviation', 'DEWP__variance',\n", " 'DEWP__root_mean_square', 'DEWP__maximum', 'DEWP__minimum',\n", " 'RAIN__sum_values', 'RAIN__median', 'RAIN__mean', 'RAIN__length',\n", " 'RAIN__standard_deviation', 'RAIN__variance', 'RAIN__root_mean_square',\n", " 'RAIN__maximum', 'RAIN__minimum', 'WSPM__sum_values', 'WSPM__median',\n", " 'WSPM__mean', 'WSPM__length', 'WSPM__standard_deviation',\n", " 'WSPM__variance', 'WSPM__root_mean_square', 'WSPM__maximum',\n", " 'WSPM__minimum'],\n", " dtype='object')" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_features = tsfresh.extract_features(df_rolled, column_id='id', column_sort='timestamp', \n", " default_fc_parameters=tsfresh.feature_extraction.MinimalFCParameters())\n", "df_features.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, each timestamp has the data summarized from the prior 24 hours." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stationtimestampPM2.5__sum_valuesPM2.5__medianPM2.5__meanPM2.5__lengthPM2.5__standard_deviationPM2.5__variancePM2.5__root_mean_squarePM2.5__maximum...RAIN__minimumWSPM__sum_valuesWSPM__medianWSPM__meanWSPM__lengthWSPM__standard_deviationWSPM__varianceWSPM__root_mean_squareWSPM__maximumWSPM__minimum
0Aotizhongxin2014-03-02 00:00:002053.067.082.1225.067.6581534577.6256106.401692210.0...0.051.11.82.04425.00.9646060.9304642.2601774.30.1
1Aotizhongxin2014-03-02 01:00:001976.067.079.0425.064.1089574109.9584101.770723210.0...0.052.32.02.09225.00.9661970.9335362.3043444.30.1
2Aotizhongxin2014-03-02 02:00:001902.067.076.0825.059.5395133544.953696.608074177.0...0.052.62.12.10425.00.9643570.9299842.3144764.30.1
3Aotizhongxin2014-03-02 03:00:001852.067.074.0825.056.8970443237.273693.408351176.0...0.053.52.22.14025.00.9503680.9032002.3415384.30.1
4Aotizhongxin2014-03-02 04:00:001790.067.071.6025.053.6596682879.360089.475807175.0...0.054.32.22.17225.00.9348880.8740162.3646564.30.1
\n", "

5 rows × 101 columns

\n", "
" ], "text/plain": [ " station timestamp PM2.5__sum_values PM2.5__median \\\n", "0 Aotizhongxin 2014-03-02 00:00:00 2053.0 67.0 \n", "1 Aotizhongxin 2014-03-02 01:00:00 1976.0 67.0 \n", "2 Aotizhongxin 2014-03-02 02:00:00 1902.0 67.0 \n", "3 Aotizhongxin 2014-03-02 03:00:00 1852.0 67.0 \n", "4 Aotizhongxin 2014-03-02 04:00:00 1790.0 67.0 \n", "\n", " PM2.5__mean PM2.5__length PM2.5__standard_deviation PM2.5__variance \\\n", "0 82.12 25.0 67.658153 4577.6256 \n", "1 79.04 25.0 64.108957 4109.9584 \n", "2 76.08 25.0 59.539513 3544.9536 \n", "3 74.08 25.0 56.897044 3237.2736 \n", "4 71.60 25.0 53.659668 2879.3600 \n", "\n", " PM2.5__root_mean_square PM2.5__maximum ... RAIN__minimum \\\n", "0 106.401692 210.0 ... 0.0 \n", "1 101.770723 210.0 ... 0.0 \n", "2 96.608074 177.0 ... 0.0 \n", "3 93.408351 176.0 ... 0.0 \n", "4 89.475807 175.0 ... 0.0 \n", "\n", " WSPM__sum_values WSPM__median WSPM__mean WSPM__length \\\n", "0 51.1 1.8 2.044 25.0 \n", "1 52.3 2.0 2.092 25.0 \n", "2 52.6 2.1 2.104 25.0 \n", "3 53.5 2.2 2.140 25.0 \n", "4 54.3 2.2 2.172 25.0 \n", "\n", " WSPM__standard_deviation WSPM__variance WSPM__root_mean_square \\\n", "0 0.964606 0.930464 2.260177 \n", "1 0.966197 0.933536 2.304344 \n", "2 0.964357 0.929984 2.314476 \n", "3 0.950368 0.903200 2.341538 \n", "4 0.934888 0.874016 2.364656 \n", "\n", " WSPM__maximum WSPM__minimum \n", "0 4.3 0.1 \n", "1 4.3 0.1 \n", "2 4.3 0.1 \n", "3 4.3 0.1 \n", "4 4.3 0.1 \n", "\n", "[5 rows x 101 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_features = df_features.reset_index().rename(columns={'level_0': 'station', 'level_1': 'timestamp'})\n", "df_features.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 4 }