# -*- coding: utf-8 -*-
"""Time_series_1D_v1.0.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Zzc5_s-OhGwbtpE0XJa10830dfq464cm

**Univariate time series visualization with python**

An easy way to transform and visualize a univariate time data series.

A time series is a sequence of single data points, or groups of data points,
that occurred and were recorded over regular time intervals.

A time series can be univariate or multivariate, depending on the number of
variables recorded in each time interval. In this case we are going to graph
a time series of variables that represent the different temperatures recorded
by a sensor with an hourly period.

**Loading dataset**

The dataset with temperature information is based on the Max Planck Weather
Dataset (https://www.kaggle.com/datasets/arashnic/max-planck-weather-dataset)
and is stored in a public github repository.

Once the dataset is loaded, we can obtain the information from the dataset,
as well as preview a sample of the data and some basic statistics.
"""

import matplotlib.pyplot as plt
import numpy as np  # not used below; kept from the original notebook export
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose

# Name of the temperature column in the CSV.
TEMP_COL = 'T (degC)'

# Rolling-window sizes, expressed as a NUMBER OF ROWS. They correspond to a
# day, a week and four weeks ("month") only if the rows are hourly samples,
# as the introduction above states.
ROWS_PER_DAY = 24
ROWS_PER_WEEK = 24 * 7
ROWS_PER_MONTH = 24 * 7 * 4

# Seasonal period (in rows) handed to seasonal_decompose.
# NOTE(review): 360*24*6 looks like one ~year of 10-minute samples, which is
# inconsistent with the hourly assumption behind the rolling windows above.
# The value is kept as in the original; confirm against the real sampling
# interval of the CSV.
SEASONAL_PERIOD = 360 * 24 * 6


def _aggregate_by_period(frame, freq):
    """Return min/mean/max of the temperature column grouped by period *freq*.

    *freq* is a pandas period alias ("D", "W", "M", "Y"); the result is
    indexed by the corresponding PeriodIndex.
    """
    return frame.groupby(frame.index.to_period(freq))[TEMP_COL].agg(
        ['min', 'mean', 'max']
    )


def _plot_min_mean_max(ax, agg, title):
    """Draw the mean line of *agg* on *ax* with shaded min/max envelopes."""
    # to_timestamp() converts each period to its start timestamp. It replaces
    # astype('datetime64'), a unit-less cast that pandas >= 2 rejects.
    x = agg.index.to_timestamp()
    ax.plot(x, agg['mean'], color='black')
    ax.fill_between(x, agg['max'], agg['mean'], alpha=0.2, color='red')
    ax.fill_between(x, agg['min'], agg['mean'], alpha=0.2, color='blue')
    ax.set_title(title)
    # Rotate per axis: plt.xticks() would only affect the last subplot.
    ax.tick_params(axis='x', labelrotation=30)


# --- Loading dataset --------------------------------------------------------
# based on dataset
# https://www.kaggle.com/datasets/arashnic/max-planck-weather-dataset

# Import dataset from github
url = 'https://raw.githubusercontent.com/mabrotons/datasets/master/weather_1D.csv'
df = pd.read_csv(url, index_col=False, parse_dates=['Date Time'])
# Extra UTC-aware copy of the timestamp, used for sorting; the parsed
# 'Date Time' column itself becomes the index.
df['DateTime'] = pd.to_datetime(df['Date Time'], utc=True)
df.sort_values(by='DateTime', inplace=True)
df.set_index('Date Time', inplace=True)

print("Data frame info: ")
# info() writes its report to stdout and returns None, so it is called
# directly instead of being wrapped in print() (which printed a stray "None").
df.info()
print("\nData frame head: ")
print(df.head())
print("\nDescribe: ")
print(df.describe())

# --- Time line with mean / max / min ----------------------------------------
# Let's plot the time series with a time line for temperature, including the
# mean and the maximum and minimum bounds.

f = plt.figure(figsize=(18, 5))

# basic stats
mean_temp = df[TEMP_COL].mean()
max_temp = df[TEMP_COL].max()
min_temp = df[TEMP_COL].min()
stdev_temp = df[TEMP_COL].std()

plt.plot(df.index, df[TEMP_COL], label='Temperature C')

# add horizontal lines for mean, max and min values of temp
plt.axhline(y=mean_temp, color='blue', linewidth=2, label='mean')
plt.axhline(y=max_temp, color='brown', linewidth=1, label='max', linestyle='dashed')
plt.axhline(y=min_temp, color='brown', linewidth=1, label='min', linestyle='dashed')

plt.xticks(rotation=30)
plt.legend()
plt.show()

# --- Histogram ----------------------------------------------------------------
# In addition to the time line plot, we build a histogram of all the data in
# the dataset in order to get a first view of the data distribution.

f = plt.figure(figsize=(18, 5))
plt.hist(df[TEMP_COL], bins=50, edgecolor='black')
plt.xticks(rotation=30)
plt.title("T (degC)")
plt.show()

# --- Standard-deviation bands -------------------------------------------------
# Now, having a normal distribution, with a z-score calculation we can measure
# how many standard deviations a value is away from the mean, and the
# probability of a data point being unusual in the distribution.
#
# For that, we will plot:
# - a blue line with the mean
# - two green lines for [mean - (1*std)] and [mean + (1*std)]
# - two orange lines for [mean - (2*std)] and [mean + (2*std)]
# - two red lines for [mean - (3*std)] and [mean + (3*std)]

f = plt.figure(figsize=(18, 5))
plt.plot(df.index, df[TEMP_COL], label='Temperature C')

# add horizontal lines showing how many stds from the mean the data lies
plt.axhline(y=mean_temp, color='blue', linewidth=2, label='mean')
for n_std, band_color in ((1, 'green'), (2, 'orange'), (3, 'red')):
    plt.axhline(y=mean_temp + n_std * stdev_temp, color=band_color,
                linewidth=1, label=f'std+{n_std}')
    plt.axhline(y=mean_temp - n_std * stdev_temp, color=band_color,
                linewidth=1, label=f'std-{n_std}')

plt.xticks(rotation=30)
plt.title("T (degC)")
plt.legend()
plt.show()

# --- Moving average -----------------------------------------------------------
# A moving average is an indicator commonly used in technical analysis. The
# reason for calculating one or several moving averages is to help smooth time
# lines, mitigating the impact of random and short-term fluctuations.
#
# Let's compute three different moving averages (daily, weekly and monthly)
# for a better visualization of time series trends. The greater the range of
# the moving average, the greater the attenuation of the time series.

df['temp_mean_24H'] = df[TEMP_COL].rolling(ROWS_PER_DAY).mean()      # daily
df['temp_mean_week'] = df[TEMP_COL].rolling(ROWS_PER_WEEK).mean()    # weekly
df['temp_mean_month'] = df[TEMP_COL].rolling(ROWS_PER_MONTH).mean()  # monthly

fig, axes = plt.subplots(4, 1, figsize=(20, 20))
rolling_panels = (
    (TEMP_COL, {}, 'Temperature (C) - Hourly'),
    ('temp_mean_24H', {'color': 'orange'}, 'Temperature (C) - Daily'),
    ('temp_mean_week', {'color': 'green'}, 'Temperature (C) - Weekly'),
    ('temp_mean_month', {'color': 'black'}, 'Temperature (C) - Monthly'),
)
for ax, (column, line_style, panel_title) in zip(axes, rolling_panels):
    ax.plot(df.index, df[column], **line_style)
    ax.set_title(panel_title)
    # Rotate per axis: plt.xticks() would only affect the last subplot.
    ax.tick_params(axis='x', labelrotation=30)
plt.show()

# --- Aggregating data ---------------------------------------------------------
# Pandas dataframe.groupby() is a function that helps us aggregate data by
# applying simple functions to the values of each group.
#
# Let's aggregate data by four different periods: Day, Week, Month and Year,
# drawing the mean line and painting the maximum and minimum boundary area.

day_agg = _aggregate_by_period(df, "D")    # Daily
week_agg = _aggregate_by_period(df, "W")   # Weekly
month_agg = _aggregate_by_period(df, "M")  # Monthly
year_agg = _aggregate_by_period(df, "Y")   # Yearly

fig, axes = plt.subplots(4, 1, figsize=(20, 20))
_plot_min_mean_max(axes[0], day_agg, 'Temperature (C) - Daily')
_plot_min_mean_max(axes[1], week_agg, 'Temperature (C) - Weekly')
_plot_min_mean_max(axes[2], month_agg, 'Temperature (C) - Monthly')
_plot_min_mean_max(axes[3], year_agg, 'Temperature (C) - Yearly')
plt.show()

# --- Time Series analysis (tsa) -----------------------------------------------
# With the tsa library we can decompose a time series into its trend, seasonal
# and residual components. Using moving averages we can see:
# - the trend component, which refers to patterns in the data that span
#   across seasonal periods
# - the seasonal component, which explains the periodic ups and downs in many
#   data sets
# - the residual component, which allows us to see what remains after removing
#   trend and seasonality

res_decompose = seasonal_decompose(df[TEMP_COL], model="additive",
                                   period=SEASONAL_PERIOD)
f = res_decompose.plot()
f.set_figwidth(15)
f.set_figheight(15)
# Without an explicit show() the figure never appears when run as a script.
plt.show()

# --- Conclusion ---------------------------------------------------------------
# With some basic statistical functions and graphics we can start a basic
# analysis of a univariate time series data set and determine its
# distribution, trends and patterns, and detect outlier candidates.