In [1]:
import numpy as np
import pandas as pd
from statsmodels.tsa import stattools
from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
##########################################################################################

def csv_to_df(input_filename):
    """Load a count series from CSV and index it by date.

    Parameters
    ----------
    input_filename : str, path object or file-like
        CSV source passed straight to ``pandas.read_csv``. It must contain
        the columns ``D`` (day), ``M`` (month), ``Y`` (year) and ``CNT``;
        any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by date and indexed by ``Date``, with the columns
        ``CNT`` and ``WDN``. ``WDN`` is the pandas weekday number
        (Monday=0 ... Sunday=6).
    """
    raw = pd.read_csv(input_filename, usecols=['D', 'M', 'Y', 'CNT'])
    date_text = (raw['D'].astype(str) + '-' + raw['M'].astype(str)
                 + '-' + raw['Y'].astype(str))
    # Build a fresh frame instead of assigning into a column slice of `raw`;
    # that pattern raises SettingWithCopyWarning and mutates a view in place.
    df = pd.DataFrame({
        'Date': pd.to_datetime(date_text, format='%d-%m-%Y'),
        'CNT': raw['CNT'],
    })
    df = df.sort_values(by='Date')
    # Vectorised weekday lookup; cast keeps the int64 dtype callers had before.
    df['WDN'] = df['Date'].dt.weekday.astype('int64')
    return df.set_index('Date')

##########################################################################################
# Weekend start/end: 5 - Saturday (start), 0 - Monday (end); pandas weekday is Monday=0 ... Sunday=6


def ws(df):
    """Return (weekend_start, weekend_end) index pairs for ``df``.

    A weekend starts at a row whose ``WDN`` is 5 and ends at the first
    later row whose ``WDN`` is 0. ``WDN`` is expected to hold pandas
    weekday numbers (Monday=0 ... Sunday=6), i.e. 5 = Saturday, 0 = Monday.

    Each start is matched with the next end *after* it. Pairing the two
    lists purely by position goes wrong when the data begins on a Monday
    or has missing days: every Saturday is then paired with an earlier
    Monday and the weekdays are highlighted instead. A Saturday with no
    later Monday in the data is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``WDN`` column; its index values become the pair members.

    Returns
    -------
    zip
        Iterator of ``(start, end)`` tuples of index values, ordered by start.
    """
    wknd_start = df.index[df['WDN'] == 5].sort_values()
    wknd_end = df.index[df['WDN'] == 0].sort_values()
    # Position of the first end strictly greater than each start.
    next_end = np.searchsorted(wknd_end.values, wknd_start.values, side='right')
    has_end = next_end < len(wknd_end)
    return zip(wknd_start[has_end], wknd_end[next_end[has_end]])

###########################################################################################
# Highlighted weekends


def highlight_wknds(w, wknds_highlight_color):
    """Build one Plotly rectangle-shape dict per weekend span.

    Parameters
    ----------
    w : iterable
        Items indexable as ``item[0]`` (left x edge) and ``item[1]``
        (right x edge).
    wknds_highlight_color : str
        Fill colour applied to every rectangle.

    Returns
    -------
    list of dict
        Shape specifications suitable for ``layout.shapes``. The vertical
        extent is given in paper coordinates from 0 to 1, and the
        rectangles are drawn below the traces without an outline.
    """
    shapes_list = []
    for span in w:
        rect = {
            'type': 'rect',
            'xref': 'x',
            'yref': 'paper',
            'x0': span[0],
            'y0': 0,
            'x1': span[1],
            'y1': 1,
            'fillcolor': wknds_highlight_color,
            'opacity': 0.5,
            'layer': 'below',
            'line_width': 0,
        }
        shapes_list.append(rect)
    return shapes_list

###########################################################################################
# Plot TS and highlight weekends


def plot_ts(df, title_text):
    """Show ``df['CNT']`` over the index as a line chart with weekend bands.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``CNT`` column (plotted) and a ``WDN`` column (read by
        ``ws`` to locate the weekend spans); the index supplies the x values.
    title_text : str
        Title of the y-axis.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    weekend_shapes = highlight_wknds(
        ws(df), wknds_highlight_color='lightskyblue')
    counts_trace = go.Scatter(x=df.index, y=df['CNT'], line_color='blue')

    fig = go.Figure()
    fig.add_trace(counts_trace)
    # Fixed-size canvas with the weekend rectangles attached to the layout.
    fig.update_layout(
        shapes=weekend_shapes,
        autosize=False,
        width=1800,
        height=450)
    # nticks is set to the number of rows in df.
    fig.update_xaxes(
        tickangle=-90,
        title_text="Date",
        title_font={"size": 12},
        title_standoff=25,
        nticks=df.shape[0])
    fig.update_yaxes(
        title_text=title_text,
        title_font={"size": 12},
        title_standoff=25)
    fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show()

###########################################################################################
# Plot ACF


def plot_acf(df, alpha):
    """Plot the autocorrelation of ``df['CNT']`` with a confidence band.

    The ACF is computed for every lag from 0 to ``len(df) - 1``. The
    confidence interval returned by ``statsmodels.tsa.stattools.acf`` is
    drawn as a band symmetric about zero, using the interval's half-width
    at each lag >= 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``CNT`` column.
    alpha : float
        Significance level forwarded to ``stattools.acf`` (e.g. 0.05).

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    nlags = df.shape[0] - 1
    acf, confint = stattools.acf(
        df['CNT'].values, nlags=nlags, qstat=False, fft=False,
        alpha=alpha, missing='none')

    # Half-width of the interval at each lag >= 1, always (upper - lower) / 2.
    # The previous sign-dependent formula returned the magnitude of the
    # interval's midpoint whenever both bounds were negative, which drew a
    # wrong band for strongly negative autocorrelations.
    confint = np.asarray(confint)
    half_width = (confint[1:, 1] - confint[1:, 0]) / 2

    lags = np.arange(0, df.shape[0], 1, dtype=int)
    fig = go.Figure()
    fig.update_layout(width=900, height=450)
    fig.update_xaxes(
        tickangle=-90,
        title_text="Lag",
        title_font={"size": 12},
        title_standoff=25,
        nticks=32)
    fig.update_yaxes(
        title_text="Autocorrelation",
        title_font={"size": 12},
        title_standoff=25,
        nticks=10)
    # Upper and lower halves of the band, each filled towards y = 0.
    fig.add_trace(go.Scatter(x=lags[1:], y=half_width,
                             fill='tozeroy',
                             fillcolor='rgba(135, 206, 250, 0.5)',
                             line_color='rgba(135, 206, 250, 0.5)',
                             showlegend=False,
                             name='CI'
                             ))
    fig.add_trace(go.Scatter(x=lags[1:], y=-half_width,
                             fill='tozeroy',
                             fillcolor='rgba(135, 206, 250, 0.5)',
                             line_color='rgba(135, 206, 250, 0.5)',
                             showlegend=False,
                             name='CI'))
    fig.add_trace(go.Scatter(x=lags, y=acf,
                             mode='markers', line_color='blue',
                             name='Autocorr'))
    # One vertical stem per lag, from zero up (or down) to the ACF value.
    for lag, value in zip(lags, acf):
        fig.add_shape(type="line",
                      x0=lag, y0=0, x1=lag, y1=value,
                      line=dict(color="Blue", width=1.5))
    # Add zero line
    fig.add_shape(type="line",
                  x0=0, y0=0, x1=len(lags), y1=0,
                  line=dict(color="Blue", width=1))
    fig.show()

###########################################################################################
# Plot TS decomposition


def plot_ts_decomp(df, model, period):
    """Decompose ``df['CNT']`` and plot the components in stacked subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``CNT`` column.
    model : str
        Forwarded to ``seasonal_decompose`` as ``model`` (the notebook
        uses 'additive' and 'multiplicative').
    period : int or None
        Forwarded to ``seasonal_decompose`` as ``period``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    subplot_titles = ["Observed", "Trend", "Seasonality", "Residuals"]
    d = seasonal_decompose(df['CNT'],
                           model=model, period=period)

    components = [d.observed, d.trend, d.seasonal, d.resid]
    fig = make_subplots(rows=len(components), cols=1,
                        subplot_titles=subplot_titles)

    for row, (title, component) in enumerate(
            zip(subplot_titles, components), start=1):
        # Plot only the points that are not NaN; compute the mask once.
        keep = np.logical_not(np.isnan(component.values))
        # add_trace(row=, col=) replaces the deprecated Figure.append_trace.
        fig.add_trace(go.Scatter(
            x=component.index[keep],
            y=component.values[keep],
            name=title), row=row, col=1)

    fig.update_layout(height=1000, width=750,
                      title_text="Time series decomposition")

    fig.show()

###########################################################################################
# ADF test


def calc_adf(x, a):
    """Run the Augmented Dickey-Fuller test on ``x`` and print a report.

    Parameters
    ----------
    x : array-like
        Series passed to ``stattools.adfuller``.
    a : str or None
        Forwarded as ``autolag`` (the notebook passes 'AIC').

    Returns
    -------
    None
        One ``label: value`` line is printed per returned element; the
        critical-values dict is expanded into indented ``level: value`` lines.
    """
    labels = ['ADF', 'p-value', 'Used Lag', 'Nobs', 'Critical Values',
              'ICbest']
    outcome = stattools.adfuller(x, autolag=a)
    report = []
    for label, value in dict(zip(labels, outcome)).items():
        if label == 'Critical Values':
            levels = '\n'.join(
                [f'  {level}: {threshold}'
                 for level, threshold in value.items()])
            report.append(f'{label}:\n' + levels)
        else:
            report.append(f'{label}: {value}')
    print('\n'.join(report))

###########################################################################################
# KPSS test


def calc_kpss(x, n):
    """Run the KPSS test on ``x`` and print a report.

    Parameters
    ----------
    x : array-like
        Series passed to ``stattools.kpss``.
    n : str or int
        Forwarded as ``nlags`` (the notebook passes 'auto').

    Returns
    -------
    None
        One ``label: value`` line is printed per returned element; the
        critical-values dict is printed under a 'Critical Values:' header
        as indented ``level: value`` lines.
    """
    labels = ['statistic', 'p_value', 'n_lags', 'critical_values']
    outcome = stattools.kpss(x, nlags=n)
    report = []
    for label, value in dict(zip(labels, outcome)).items():
        if label == 'critical_values':
            levels = '\n'.join(
                [f'  {level}: {threshold}'
                 for level, threshold in value.items()])
            report.append('Critical Values:\n' + levels)
        else:
            report.append(f'{label}: {value}')
    print('\n'.join(report))

###########################################################################################
# Histogram


def plot_msgsz(df, x_scale):
    """Overlay one histogram of ``MSGSZ`` per distinct ``MN`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``MN`` (group label, used as trace name) and
        ``MSGSZ`` (values to bin).
    x_scale : {'lin', 'log10'}
        'lin' plots the raw values. 'log10' plots ``log10(value)`` for
        positive values and 0 for values <= 0.

    Raises
    ------
    ValueError
        If a required column is missing or ``x_scale`` is not recognised.
        Both checks run before any figure is built.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    if ('MN' not in df.columns) or ('MSGSZ' not in df.columns):
        raise ValueError('Wrong DF!')
    if x_scale not in ('lin', 'log10'):
        raise ValueError('Wrong x-scale!')

    def _scaled(value):
        # Map one message size onto the requested x-scale.
        if x_scale == 'lin':
            return value
        if value > 0:
            return np.log10(value)
        if value <= 0:
            return 0
        # Neither > 0 nor <= 0 (i.e. NaN): keep the historical -1 placeholder.
        return -1

    colors = ['blue', 'yellow', 'red', 'green', 'magenta']
    fig = go.Figure()
    for pos, month in enumerate(df['MN'].unique()):
        sizes = df.loc[df['MN'] == month, 'MSGSZ'].values
        fig.add_trace(go.Histogram(
            x=[_scaled(v) for v in sizes],
            name=month,
            # Cycle through the palette so groups beyond the fifth are still
            # drawn; zipping against the palette silently dropped them.
            marker_color=colors[pos % len(colors)]))

    # Overlay the histograms
    fig.update_layout(barmode='overlay',
                      bargap=0.2,
                      bargroupgap=0.1)
    # Reduce opacity to see all the histograms
    fig.update_traces(opacity=0.85)
    fig.show()

###########################################################################################
# Boxplot


def plot_box(df):
    """Draw one box plot of ``MSGSZ`` per ``MN`` group, ordered by ``M``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``M`` (sort key), ``MN`` (group label, used as
        trace name) and ``MSGSZ`` (values).

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    fig = go.Figure()
    fig.update_layout(width=1200, height=900)
    # Use the `df` argument: the body previously read the notebook-global
    # `df_msgsz`, so the parameter was ignored and the function failed
    # unless that global happened to exist.
    groups = df[['M', 'MN']].sort_values(by='M')['MN'].unique()
    for m in groups:
        fig.add_trace(
            go.Box(y=df.loc[df['MN'] == m, 'MSGSZ'],
                   name=m,
                   boxpoints='all',
                   jitter=0.5,
                   pointpos=-1.8))
    fig.show()
###########################################################################################

In [3]:
# Load the R2 message counts into a date-indexed frame with the columns
# CNT and WDN (weekday number), as built by csv_to_df.
input_filename = './Regions/R2_msgcnt.csv'
df_msgcnt = csv_to_df(input_filename)

In [4]:
# Preview the first rows of the loaded series.
df_msgcnt.head()

Unnamed: 0_level_0,CNT,WDN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-07-18,16991,5
2020-07-19,9202,6
2020-07-20,81548,0
2020-07-21,95604,1
2020-07-22,94348,2


In [5]:
# Plot the message-count time series with weekends highlighted;
# the y-axis is titled "SS Count".
plot_ts(df_msgcnt, title_text="SS Count")

In [6]:
# Multiplicative decomposition of the message counts with period = 7
# observations (one week, given one row per day).
plot_ts_decomp(df_msgcnt, 'multiplicative', 7)

In [7]:
# Plot the ACF of the message counts over all available lags,
# with the confidence band at alpha = 0.05.
plot_acf(df_msgcnt, alpha=.05)

In [8]:
# Augmented Dickey-Fuller test on the message counts (autolag='AIC').
# In the recorded output the p-value is about 0.15 and the statistic lies
# above all three critical values, so the unit-root null is not rejected.
calc_adf(df_msgcnt['CNT'], a = 'AIC')

ADF: -2.372305673971571
p-value: 0.14970595526957609
Used Lag: 7
Nobs: 125
Critical Values:
  1%: -3.4837793736959997
  5%: -2.88495387648
  10%: -2.579256976
ICbest: 2716.6582805405005


In [9]:
# Load the R2 message-size data; the cells below read its
# M, MN and MSGSZ columns.
input_filename = './Regions/R2_msgsz_cnt.csv'
df_msgsz = pd.read_csv(input_filename)

In [10]:
# Overlaid per-month histograms of message size on a linear x-scale.
plot_msgsz(df_msgsz,'lin')

In [11]:
# Same histograms with log10-transformed sizes (values <= 0 are plotted as 0).
plot_msgsz(df_msgsz,'log10')

In [12]:
# Box plot of message size per month, with all points shown beside each box.
plot_box(df_msgsz)

In [13]:
# Mean, median and maximum message size for each (M, MN) group.
df_msgsz.groupby(['M', 'MN'])['MSGSZ'].agg(['mean','median', 'max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,max
M,MN,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7,Jul,11.26565,8.930206,123.821259
8,Aug,14.353889,11.335373,123.825073
9,Sep,14.63552,11.650085,123.825073
10,Oct,12.859234,11.363983,120.887756
11,Nov,13.011319,11.516571,97.877502


In [14]:
# Load the R2 error counts and index them by the parsed 'Date' column
# (expected as YYYY-MM-DD text).
input_filename = './Regions/R2_errors_cnt.csv'
df_errors = (
    pd.read_csv(input_filename)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'],
                                              format='%Y-%m-%d'))
    .set_index('Date')
)

In [15]:
# Plot the error-count time series with weekends highlighted.
# NOTE(review): plot_ts reads a 'WDN' column through ws(), so the errors CSV
# must already provide one - confirm.
plot_ts(df_errors, title_text="Errors Count")

In [16]:
# Additive decomposition of the error counts.
# NOTE(review): period=None leaves the period to seasonal_decompose,
# presumably inferred from the Date index frequency - confirm.
plot_ts_decomp(df_errors, model='additive', period = None)

In [17]:
# Plot the ACF of the error counts with the confidence band at alpha = 0.05.
plot_acf(df_errors, alpha=.05)

In [18]:
# Augmented Dickey-Fuller test on the error counts (autolag='AIC').
# In the recorded output the p-value is about 0.0002 and the statistic lies
# below the 1% critical value, so the unit-root null is rejected.
calc_adf(df_errors['CNT'], a = 'AIC')

ADF: -4.463673425646953
p-value: 0.00022859535464419574
Used Lag: 5
Nobs: 131
Critical Values:
  1%: -3.481281802271349
  5%: -2.883867891664528
  10%: -2.5786771965503177
ICbest: 956.7840678152355


In [19]:
# KPSS test on the error counts with automatic lag selection.
# In the recorded output the statistic (about 0.097) is below every listed
# critical value, so stationarity is not rejected. The printed p_value of 0.1
# is only a bound: the accompanying warning says the actual p-value is greater.
calc_kpss(df_errors['CNT'], n = 'auto')

statistic: 0.09692950176266624
p_value: 0.1
n_lags: 1
Critical Values:
  10%: 0.347
  5%: 0.463
  2.5%: 0.574
  1%: 0.739



p-value is greater than the indicated p-value

