{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 서울 날씨 트위터를 위한 머신러닝\n", "\n", "![서울타워](https://raw.githubusercontent.com/leehaesung/AWS_BIG_DATA_SPECIALTY/master/data/QRCode.png.png)\n", "\n", "* Data set: pm2p5c.csv\n", "\n", "* Date: 23rd Apr 2019\n", "\n", "* Written by Haesung Lee\n", "\n", "* AWS-IoT Diagram:\n", "\n", "![Image](https://raw.githubusercontent.com/leehaesung/AWS_BIG_DATA_SPECIALTY/master/data/AWS-IoT_Analytics.png)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### S3 myiotstation 버킷에서 pm2p5c.csv 초미세먼지 데이터 가져오기" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "# imports and one-time load of the dust series shared by the cells below\n", "import pandas as pa\n", "import matplotlib as plot\n", "from pandas import read_csv\n", "from matplotlib import pyplot\n", "from statsmodels.tsa.ar_model import AR\n", "from sklearn.metrics import mean_squared_error\n", "from math import sqrt\n", "# load dataset\n", "series = read_csv('https://s3.amazonaws.com/myiotstation/pm2p5c.csv', header=0, index_col=0, parse_dates=True, squeeze=True)\n", "#\n", "# Data sets" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 35.000000\n", "mean 21.685714\n", "std 15.914794\n", "min 3.000000\n", "25% 8.500000\n", "50% 18.000000\n", "75% 33.000000\n", "max 63.000000\n", "Name: Dust, dtype: float64" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# view basic summary statistics of the series\n", "series.describe()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "time\n", "2019-04-20 20\n", "2019-04-21 21\n", "2019-04-21 41\n", "2019-04-21 21\n", "2019-04-21 31\n", "2019-04-22 37\n", "2019-04-22 32\n", "2019-04-22 38\n", "2019-04-22 34\n", "2019-04-23 30\n", "Name: Dust, dtype: int64" ] }, "execution_count": 33, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "#get data\n", "def GetData(fileName):\n", " return read_csv(fileName, header=0, parse_dates=[0], index_col=0)\n", "\n", "#read time series from the exchange.csv file \n", "\n", "#view top 10 records\n", "series.head(10)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "scrolled": true }, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAADPtJREFUeJzt3V9sXYV9wPHvb0krQlwIXb0MBTSvovJW4RVKxEBslQ37wwjaXlqNSWVl6uSHdh2bMlVG01btYVoeRlUepkmItnto1cmjdKtIVYpoPakPZYshXRKC167LCoGS9qHpjNCY198efKK6yXV8cn2Pr39X349k2fdwru/vJ5Mvl2PfODITSVIdPzHsASRJl8ZwS1IxhluSijHcklSM4ZakYgy3JBVjuCWpGMMtScUYbkkqZmcXn3TPnj153XXXdfGpt9Srr77K7t27hz3GQLjL9jMqe4C7DMLi4uL3MnO8zbmdhHvv3r0cOXKki0+9pRYWFpienh72GAPhLtvPqOwB7jIIEfFfbc/1UokkFWO4JakYwy1JxRhuSSrGcEtSMYZbkoox3JJUjOGWpGIMtyQV08krJzdjYu7wUB731KEDQ3lcSbpUPuOWpGIMtyQVY7glqRjDLUnFGG5JKsZwS1IxhluSijHcklSM4ZakYgy3JBVjuCWpGMMtScUYbkkqxnBLUjGGW5KKMdySVIzhlqRiDLckFdMq3BHxxxFxIiKOR8RnIuKyrgeTJPW2YbgjYh/wh8D+zLwe2AHc0/VgkqTe2l4q2QnsioidwOXAS92NJEm6mA3DnZmngb8Gvg28DJzNzC91PZgkqbfIzIufEHEV8Fngt4HvA/8APJqZnzrvvFlgFmB8fPym+fn5vgY6dvpsX/fbrKl9V15wbHl5mbGxsSFMM3jusv2Myh7gLoMwMzOzmJn725zbJtzvAe7MzPc3t38XuCUzP7DefSYnJ3NpaekSRv6RibnDfd1vs04dOnDBsYWFBaanp7d+mA64y/YzKnuAuwxCRLQOd5tr3N8GbomIyyMigDuAk5sZUJLUvzbXuJ8GHgWeAY4193m447kkSevY2eakzPwI8JGOZ5EkteArJyWpGMMtScUYbkkqxnBLUjGGW5KKMdySVIzhlqRiDLckFWO4JakYwy1JxRhuSSrGcEtSMYZbkoox3JJUjOGWpGIMtyQVY7glqRjDLUnFGG5JKsZwS1IxhluSijHcklSM4ZakYgy3JBVjuCWpGMMtScUYbkkqxnBLUjGGW5KKMdySVIzhlqRiDLckFWO4JakYwy1JxRhuSSrGcEtSMYZbkoppFe6I2BMRj0bE8xFxMiJu7XowSVJvO1ue9xDwxcx8d0S8Ebi8w5kkSRexYbgj4grgXcB9AJn5OvB6t2NJktbT5lLJW4HvAp+MiGcj4pGI2N3xXJKkdURmXvyEiP3A14DbMvPpiHgI+EFm/tl5580CswDj4+M3zc/P9zXQsdNn+7rfZk3tu/KCY8vLy4yNjXX+2Fux895d8MprP36s184VbNXXpWujsge4yyDMzMwsZub+Nue2CfdPA1/LzInm9i8Dc5l5YL37TE5O5tLSUvuJ15iYO9zX/Tbr
1KEL11lYWGB6errzx96KnQ9OrfDgsR+/MtZr5wq26uvStVHZA9xlECKidbg3vFSSmd8BXoiIyebQHcBzm5hPkrQJbX+q5EPAp5ufKPkW8HvdjSRJuphW4c7Mo0Crp/CSpG75yklJKsZwS1IxhluSijHcklSM4ZakYgy3JBVjuCWpGMMtScUYbkkqxnBLUjGGW5KKMdySVIzhlqRiDLckFWO4JakYwy1JxRhuSSrGcEtSMW1/5+TI6/Wb1g9OrXDfkH7rvCStx2fcklSM4ZakYgy3JBVjuCWpGMMtScUYbkkqxnBLUjGGW5KKMdySVIzhlqRiDLckFWO4JakYwy1JxRhuSSrGcEtSMYZbkoox3JJUjOGWpGJahzsidkTEsxHxeJcDSZIu7lKecd8PnOxqEElSO63CHRHXAAeAR7odR5K0kbbPuD8GfBj4YYezSJJaiMy8+AkRdwN3ZeYHImIa+JPMvLvHebPALMD4+PhN8/PzfQ107PTZvu7Xhb274JXXhj3FYPTaZWrflcMZZpOWl5cZGxsb9hibNip7gLsMwszMzGJm7m9zbptw/xVwL7ACXAZcATyWme9d7z6Tk5O5tLTUfuI1JuYO93W/LhycWuHBYzuHPcZA9Nrl1KEDQ5pmcxYWFpienh72GJs2KnuAuwxCRLQO94aXSjLzgcy8JjMngHuAL18s2pKkbvlz3JJUzCVdB8jMBWChk0kkSa34jFuSijHcklSM4ZakYgy3JBVjuCWpGMMtScUYbkkqxnBLUjGGW5KKMdySVIzhlqRiDLckFWO4JakYwy1JxRhuSSrGcEtSMYZbkoox3JJUzGj8CnP1ZWLu8LBH6MvBqRXuKzb7qUMHhvbYW/F17vU1GebOo85n3JJUjOGWpGIMtyQVY7glqRjDLUnFGG5JKsZwS1IxhluSijHcklSM4ZakYgy3JBVjuCWpGMMtScUYbkkqxnBLUjGGW5KKMdySVIzhlqRiNgx3RFwbEV+JiJMRcSIi7t+KwSRJvbX5nZMrwMHMfCYi3gQsRsSTmflcx7NJknrY8Bl3Zr6cmc80H/83cBLY1/VgkqTeLukad0RMADcCT3cxjCRpY5GZ7U6MGAP+GfjLzHysxz+fBWYBxsfHb5qfn+9roGOnz/Z1vy7s3QWvvDbsKQbDXbafUdkDeu8yte/K4QyzScvLy4yNjW35487MzCxm5v4257YKd0S8AXgceCIzP7rR+ZOTk7m0tNTm8S8wMXe4r/t14eDUCg8ea/NtgO3PXbafUdkDeu9y6tCBIU2zOQsLC0xPT2/540ZE63C3+amSAD4OnGwTbUlSt9pc474NuBe4PSKONm93dTyXJGkdG/5/WmZ+FYgtmEWS1IKvnJSkYgy3JBVjuCWpGMMtScUYbkkqxnBLUjGGW5KKMdySVIzhlqRiDLckFWO4JakYwy1JxRhuSSrGcEtSMYZbkoox3JJUjOGWpGIMtyQVMxq/YlqS1piYO9z3fQ9OrXBfn/ffqt9s7zNuSSrGcEtSMYZbkoox3JJUjOGWpGIMtyQVY7glqRjDLUnFGG5JKsZwS1IxhluSijHcklSM4ZakYgy3JBVjuCWpGMMtScUYbkkqxnBLUjGtwh0Rd0bEUkR8MyLmuh5KkrS+DcMdETuAvwF+A3g78DsR8fauB5Mk9dbmGffNwDcz81uZ+Trw98BvdTuWJGk9bcK9D3hhze0Xm2OSpCGIzLz4CRHvAX49M3+/uX0vcHNmfui882aB2ebm9cDxwY+75d4CfG/YQwyIu2w/o7IHuMsg/Exmjrc5cWeLc14Erl1z+xrgpfNPysyHgYcBIuJIZu5vM8B2Nip7gLtsR6OyB7jLVmtzqeRfgbdFxM9GxBuBe4DPdzuWJGk9Gz7jzsyViPgD4AlgB/CJzDzR+WSSpJ7aXCohM78AfOESPu/D/Y2z7YzKHuAu29Go7AHusqU2/OakJGl78SXvklTMQMNd+aXxEfGJiDgTEcfXHHtzRDwZEd9o
3l81zBnbiIhrI+IrEXEyIk5ExP3N8Yq7XBYR/xIRX292+YvmeLldzomIHRHxbEQ83twuuUtEnIqIYxFxNCKONMfK7RIReyLi0Yh4vvkzc2uFPQYW7hF4afzfAXeed2wOeCoz3wY81dze7laAg5n588AtwAebr0PFXf4HuD0z3wHcANwZEbdQc5dz7gdOrrldeZeZzLxhzY/OVdzlIeCLmflzwDtY/dps/z0ycyBvwK3AE2tuPwA8MKjPvxVvwARwfM3tJeDq5uOrgaVhz9jHTv8E/Gr1XYDLgWeAX6y6C6uvgXgKuB14vDlWdZdTwFvOO1ZqF+AK4D9pvtdXaY9BXioZxZfG783MlwGa9z815HkuSURMADcCT1N0l+bSwlHgDPBkZpbdBfgY8GHgh2uOVd0lgS9FxGLzqmmot8tbge8Cn2wuXz0SEbspsMcgwx09jvkjK0MSEWPAZ4E/yswfDHuefmXm/2XmDaw+W705Iq4f9kz9iIi7gTOZuTjsWQbktsx8J6uXRj8YEe8a9kB92Am8E/jbzLwReJXteFmkh0GGu9VL44t5JSKuBmjenxnyPK1ExBtYjfanM/Ox5nDJXc7JzO8DC6x+H6LiLrcBvxkRp1j9GzZvj4hPUXMXMvOl5v0Z4HOs/i2i1XZ5EXix+b84gEdZDfm232OQ4R7Fl8Z/Hnhf8/H7WL1evK1FRAAfB05m5kfX/KOKu4xHxJ7m413ArwDPU3CXzHwgM6/JzAlW/2x8OTPfS8FdImJ3RLzp3MfAr7H6l8qV2iUzvwO8EBGTzaE7gOeosMeAL/bfBfw78B/Anw77Av4lzv4Z4GXgf1n9L/H7gZ9k9ZtJ32jev3nYc7bY45dYvUT1b8DR5u2uorv8AvBss8tx4M+b4+V2OW+vaX70zclyu7B6bfjrzduJc3/Wi+5yA3Ck+XfsH4GrKuzhKyclqRhfOSlJxRhuSSrGcEtSMYZbkoox3JJUjOGWpGIMtyQVY7glqZj/B142+dAmNoGnAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# create a histogram plot\n", "from pandas import read_csv\n", "from matplotlib import pyplot\n", "series = read_csv('https://s3.amazonaws.com/myiotstation/pm2p5c.csv', header=0, index_col=0, parse_dates=True, squeeze=True)\n", "series.hist()\n", "pyplot.show()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# create a density plot\n", "from pandas import read_csv\n", "from matplotlib import pyplot\n", "series = read_csv('https://s3.amazonaws.com/myiotstation/pm2p5c.csv', header=0, index_col=0, parse_dates=True, squeeze=True)\n", "series.plot(kind='kde')\n", "pyplot.show()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "scrolled": true }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from pandas import Series\n", "from matplotlib import pyplot\n", "from pandas.plotting import lag_plot\n", "lag_plot(series)\n", "pyplot.show()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " t t+1\n", "t 1.000000 0.805891\n", "t+1 0.805891 1.000000\n" ] } ], "source": [ "# correlation of lag=1\n", "from pandas import read_csv\n", "from pandas import DataFrame\n", "from pandas import concat\n", "series = read_csv('https://s3.amazonaws.com/myiotstation/pm2p5c.csv', header=0, index_col=0, parse_dates=True, squeeze=True)\n", "values = DataFrame(series.values)\n", "dataframe = concat([values.shift(1), values], axis=1)\n", "dataframe.columns = ['t', 't+1']\n", "result = dataframe.corr()\n", "print(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Autocorrelation (자기상관도)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "scrolled": true }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# autocorrelation plot of time series\n", "from pandas import read_csv\n", "from matplotlib import pyplot\n", "from statsmodels.graphics.tsaplots import plot_acf\n", "series = read_csv('https://s3.amazonaws.com/myiotstation/pm2p5c.csv', header=0, index_col=0, parse_dates=True, squeeze=True)\n", "plot_acf(series, lags=34)\n", "pyplot.show()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "scrolled": true }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# autocorrelation plot of time series\n", "from pandas import read_csv\n", "from matplotlib import pyplot\n", "from pandas.plotting import autocorrelation_plot\n", "series = read_csv('https://s3.amazonaws.com/myiotstation/pm2p5c.csv', header=0, index_col=0, parse_dates=True, squeeze=True)\n", "autocorrelation_plot(series)\n", "pyplot.show()" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{sensor:30.000000,prediction:36.868843}\n", "{sensor:63.000000,prediction:38.295336}\n", "{sensor:47.000000,prediction:41.722092}\n", "{sensor:46.000000,prediction:48.042836}\n", "{sensor:50.000000,prediction:48.080494}\n", "{sensor:41.000000,prediction:47.277171}\n", "{sensor:22.000000,prediction:56.122334}\n", "{sensor:8.000000,prediction:47.248394}\n", "{sensor:10.000000,prediction:37.402841}\n", "{sensor:14.000000,prediction:31.303013}\n", "{sensor:4.000000,prediction:25.419424}\n", "{sensor:5.000000,prediction:16.639138}\n", "{sensor:3.000000,prediction:9.284652}\n", "{sensor:3.000000,prediction:7.737093}\n", "{sensor:3.000000,prediction:7.935095}\n", "{sensor:7.000000,prediction:4.457086}\n", "{sensor:9.000000,prediction:4.794741}\n", "{sensor:13.000000,prediction:5.555003}\n", "{sensor:15.000000,prediction:7.387235}\n", "{sensor:18.000000,prediction:9.509336}\n", "{sensor:15.000000,prediction:12.833780}\n", "{sensor:8.000000,prediction:14.696294}\n", "{sensor:6.000000,prediction:14.851742}\n", "{sensor:9.000000,prediction:13.348264}\n", "{sensor:17.000000,prediction:13.008920}\n", "{sensor:18.000000,prediction:13.352161}\n", "{TestRMSE:14.626}\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# create and evaluate an updated autoregressive model\n", "from pandas import read_csv\n", "from matplotlib import pyplot\n", "from statsmodels.tsa.ar_model import AR\n", "from sklearn.metrics import mean_squared_error\n", "from math import sqrt\n", "\n", "# load dataset\n", "series = read_csv('https://s3.amazonaws.com/myiotstation/pm2p5c.csv', header=0, index_col=0, parse_dates=True, squeeze=True)\n", "\n", "# split dataset\n", "X = series.values\n", "testLength = 26;\n", "train, test = X[1:len(X)-testLength], X[len(X)-testLength:]\n", "\n", "# train autoregression\n", "model = AR(train)\n", "model_fit = model.fit()\n", "window = model_fit.k_ar\n", "coef = model_fit.params\n", "\n", "# walk forward over time steps in test\n", "history = train[len(train)-window:]\n", "history = [history[i] for i in range(len(history))]\n", "predictions = list()\n", "for t in range(len(test)):\n", " length = len(history)\n", " lag = [history[i] for i in range(length-window,length)]\n", " yhat = coef[0]\n", " for d in range(window):\n", " yhat += coef[d+1] * lag[window-d-1]\n", " obs = test[t]\n", " predictions.append(yhat)\n", " history.append(obs)\n", " print('{sensor:%f,prediction:%f}' % (obs, yhat))\n", "rmse = sqrt(mean_squared_error(test, predictions))\n", "\n", "print('{TestRMSE:%.3f}' % rmse)\n", "# plot\n", "pyplot.plot(test)\n", "pyplot.plot(predictions, color='red')\n", "pyplot.show()" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Test RMSE: 9.911\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# evaluate a persistence model\n", "from pandas import read_csv\n", "from pandas import DataFrame\n", "from pandas import concat\n", "from matplotlib import pyplot\n", "from sklearn.metrics import mean_squared_error\n", "from math import sqrt\n", "\n", "# load dataset\n", "series = read_csv('https://s3.amazonaws.com/myiotstation/pm2p5c.csv', header=0, index_col=0, parse_dates=True, squeeze=True)\n", "# create lagged dataset\n", "values = DataFrame(series.values)\n", "dataframe = concat([values.shift(1), values], axis=1)\n", "dataframe.columns = ['t', 't+1']\n", "# split into train and test sets\n", "X = dataframe.values\n", "testLength = 34;\n", "train, test = X[1:len(X)-testLength], X[len(X)-testLength:]\n", "train_X, train_y = train[:,0], train[:,1]\n", "test_X, test_y = test[:,0], test[:,1]\n", "# persistence model\n", "def model_persistence(x):\n", " return x\n", "# walk-forward validation\n", "predictions = list()\n", "for x in test_X:\n", " yhat = model_persistence(x)\n", " predictions.append(yhat)\n", "rmse = sqrt(mean_squared_error(test_y, predictions))\n", "print('Test RMSE: %.3f' % rmse)\n", "# plot predictions vs expected\n", "pyplot.plot(test_y)\n", "pyplot.plot(predictions, color='red')\n", "pyplot.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# END" ] } ], "metadata": { "kernelspec": { "display_name": "Containerized conda_python2", "language": "python", "name": "containerized_conda_python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.15" } }, "nbformat": 4, "nbformat_minor": 2 }