""" This code extracts temperature, humidity, and pressure information from 5 west coast cities and cleans the data by discarding outlier points that are either due to sensor malfunction or data collection error. It then saves the cleaned data in csv format for further processing. Original dataset: https://www.kaggle.com/selfishgene/historical-hourly-weather-data """ import pandas as pd from pandas import Series import numpy as np import matplotlib.pyplot as plt # Read in temperature, humidity, and pressure temperature = pd.read_csv('temperature.csv', usecols=["datetime","Vancouver", "Portland", "San Francisco", "Seattle", "Los Angeles"]) humidity = pd.read_csv('humidity.csv', usecols=["Vancouver", "Portland", "San Francisco", "Seattle", "Los Angeles"]) pressure = pd.read_csv('pressure.csv', usecols=["Vancouver", "Portland", "San Francisco", "Seattle", "Los Angeles"]) # Create a single dataframe from the information we are interested in. df = pd.concat([temperature,humidity,pressure], axis=1) df.columns = ['datetime', 't-Van', 't-Port', 't-SF', 't-Sea', 't-LA', 'h-Van', 'h-Port', 'h-SF', 'h-Sea', 'h-LA', 'p-Van', 'p-Port', 'p-SF', 'p-Sea', 'p-LA'] kw = dict(marker='o', linestyle='none', color='r', alpha=0.3) # Fix the pressures; for readings that deviate too far from the local median, discard the datapoint # because it is likely a sensor error (or a data scraping error in the dataset) fix_cols = ['p-Van', 'p-Port', 'p-SF', 'p-Sea', 'p-LA'] for c in fix_cols: threshold = 70 # If the pressure differs by more than 70 millibars from the 5000-hour window, discard the point df['pandas'] = Series.rolling(df[c], window=5000, min_periods=10, center=True).median() difference = np.abs(df[c] - df['pandas']) outlier_idx = difference > threshold df.loc[outlier_idx,c] = np.nan # If the pressure differs by more thna 40 millibars in a 400-hour window, discard the point threshold = 40 df['pandas'] = Series.rolling(df[c], window=400, min_periods=5, center=True).median() difference = np.abs(df[c] - df['pandas']) outlier_idx = difference > threshold df.loc[outlier_idx,c] = np.nan num_outliers = 1 threshold = 10 while num_outliers > 0: print(num_outliers) # Using a window of 5 hours, discard outliers of more than 10 millibars from the median # of those 5 points (minimum of 3 points in window to take action) df['pandas'] = Series.rolling(df[c], window=5, min_periods=3, center=True).median() difference = np.abs(df[c] - df['pandas']) outlier_idx = difference > threshold df.loc[outlier_idx,c] = np.nan num_outliers = np.sum(outlier_idx.get_values()) # Interpolate missing values with a linear interpolation df = df.interpolate(method='linear', axis=0, limit_direction='both') bad_idx = df.index[df[c].diff().abs().ge(20)] # Fix outliers in temperatures fix_cols = ['t-Van', 't-Port', 't-SF', 't-Sea', 't-LA'] for c in fix_cols: threshold = 10 # Discard points that are more than 10 C away from the two neighboring hours. df['pandas'] = Series.rolling(df[c], window=3, center=True).median() difference = np.abs(df[c] - df['pandas']) outlier_idx = difference > threshold df.loc[outlier_idx,c] = np.nan num_outliers = np.sum(outlier_idx.get_values()) # Interpolate missing values. df = df.interpolate(method='linear', axis=0, limit_direction='both') bad_idx = df.index[df[c].diff().abs().ge(20)] ## Replace remaining with the following searchable number, -99999 so we can discard sections of ## data at the beginning and end of the time period that cannot be interpolated. 
# Replace any remaining NaNs with the searchable sentinel value -99999, so that
# sections of data at the beginning and end of the time period that cannot be
# interpolated can be identified and discarded later.
df = df.fillna(-99999)
print(df.head(3))
df.to_csv('final_dataset/west_coast_weather.csv', index=False)
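# --- Downstream usage sketch --------------------------------------------------
# A minimal sketch of how a downstream script might load the saved CSV and mask
# the -99999 sentinel before analysis. The flag and variable names below are
# illustrative assumptions, not part of the original pipeline.
LOAD_BACK_CHECK = False  # hypothetical flag; set to True to verify the round trip
if LOAD_BACK_CHECK:
    cleaned = pd.read_csv('final_dataset/west_coast_weather.csv')
    # Restore NaNs so the un-interpolatable edges can be dropped before analysis.
    cleaned = cleaned.replace(-99999, np.nan).dropna()
    print(cleaned.describe())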