{ "cells": [
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Import the JSON File with Industry Lab Sensor Data" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "\n",
  "import getpass\n",
  "import os\n",
  "import sys\n",
  "import types\n",
  "import pandas as pd\n",
  "from botocore.client import Config\n",
  "import ibm_boto3\n",
  "\n",
  "# Shim that is bound onto the response body below when it lacks __iter__,\n",
  "# so that pandas accepts the body as a file-like object.\n",
  "def __iter__(self): return 0\n",
  "\n",
  "# @hidden_cell\n",
  "# The following code accesses a file in your IBM Cloud Object Storage.\n",
  "# The API key is intentionally NOT stored in the notebook: it is read from the\n",
  "# IBM_COS_API_KEY_ID environment variable, with an interactive prompt as fallback.\n",
  "# Any key that was ever saved inside a shared notebook should be revoked and re-issued.\n",
  "ibm_api_key_id = os.environ.get('IBM_COS_API_KEY_ID') or getpass.getpass('IBM COS API key: ')\n",
  "\n",
  "client_915ea66450e44183938b1aab8572887f = ibm_boto3.client(service_name='s3',\n",
  "    ibm_api_key_id=ibm_api_key_id,\n",
  "    ibm_auth_endpoint=\"https://iam.ng.bluemix.net/oidc/token\",\n",
  "    config=Config(signature_version='oauth'),\n",
  "    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')\n",
  "\n",
  "body = client_915ea66450e44183938b1aab8572887f.get_object(Bucket='dsbootcampac3431d743f2492ebe1cfe6103674873',Key='floorsensordata2604.json')['Body']\n",
  "# add missing __iter__ method, so pandas accepts body as file-like object\n",
  "\n",
  "if not hasattr(body, \"__iter__\"): body.__iter__ = types.MethodType( __iter__, body )\n",
  "\n",
  "# Since JSON data can be semi-structured and contain additional metadata, it is possible that you might face an error during data loading.\n",
  "# Please read the documentation of 'pandas.read_json()' and 'pandas.io.json.json_normalize' to learn more about the possibilities to adjust the data loading.\n",
  "# pandas documentation: http://pandas.pydata.org/pandas-docs/stable/io.html#io-json-reader\n",
  "# and http://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html\n",
  "\n",
  "df_data_1 = pd.read_json(body, orient='values')\n",
  "df_data_1.head()\n",
  "\n"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Check the Structure of the Data Frame" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Work on a copy so the derived columns added below never leak into the raw frame.\n",
  "df = df_data_1.copy()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head(10)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.tail(10)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "df.itemname.unique()" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Format Yanzi Sensor Data" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# The sensor id is the part of `itemname` before the first underscore.\n",
  "df['id'] = df['itemname'].str.split('_').str[0]"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Rows whose `itemname` mentions a sensor type get their `value` copied into that\n",
  "# sensor's column; every other row gets NaN there through index alignment.\n",
  "numeric_sensors = ['temperature', 'carbonDioxide', 'humidity', 'illuminance', 'pressure']\n",
  "for sensor in numeric_sensors:\n",
  "    is_sensor = df['itemname'].str.contains(sensor)\n",
  "    df[sensor] = df.loc[is_sensor, 'value'].astype(float)\n",
  "\n",
  "# Occupancy values are copied without a float cast.\n",
  "df['Occupancy'] = df.loc[df['itemname'].str.contains('Occupancy'), 'value']"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.tail()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.describe()" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Visualize Data with PixieDust" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pixiedust" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "pixiedust": { "displayParams": { "filter":
"{\"regex\": \"false\", \"constraint\": \"None\", \"field\": \"id\", \"case_matter\": \"false\", \"value\": \"4674C\"}", "handlerId": "lineChart", "keyFields": "time", "legend": "true", "lineChartType": "subplots", "no_margin": "true", "rendererId": "matplotlib", "timeseries": "true", "title": "Shinano Meeting Room", "valueFields": "temperature" } } }, "outputs": [], "source": [ "display(df)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Filter on Shinano and Fill Missing Values" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "df_shinano = df[df['id'].isin(['4674C', 'Shinano'])].copy()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Chronological order is required for the forward/backward fill below to be meaningful.\n",
  "df_shinano = df_shinano.sort_values(by='time')"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "df_shinano.index = pd.to_datetime(df_shinano.time)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Carry the last known reading forward (DataFrame.ffill replaces the deprecated fillna(method='ffill')).\n",
  "df_shinano = df_shinano.ffill()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Fill whatever is still missing at the start of the series from the first later reading.\n",
  "df_shinano = df_shinano.bfill()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_shinano.head()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# errors='ignore' keeps this cell re-runnable after the column has already been dropped.\n",
  "df_shinano = df_shinano.drop(['illuminance'], axis=1, errors='ignore')"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_shinano.describe()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_shinano.Occupancy.unique()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_shinano.groupby('Occupancy')['time'].nunique()" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Plot Sensor Values
depending on Occupancy State" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_shinano.temperature.plot()" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Build Simple Prediction Model" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_shinano_model = df_shinano.copy()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Recode the occupancy labels as integers: free -> 0, occupied -> 1.\n",
  "df_shinano_model.loc[df_shinano_model.Occupancy == 'free', 'Occupancy'] = 0\n",
  "df_shinano_model.loc[df_shinano_model.Occupancy == 'occupied', 'Occupancy'] = 1"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# The .loc assignments above leave the column with dtype=object; cast to int so that\n",
  "# scikit-learn recognises y as a binary target instead of an 'unknown' label type.\n",
  "# The cast also fails loudly if any label other than free/occupied is still present.\n",
  "y = df_shinano_model['Occupancy'].astype(int).values\n",
  "y"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "X = df_shinano_model.loc[:, ['temperature', 'carbonDioxide', 'humidity', 'pressure']].values\n",
  "X"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "import numpy as np\n",
  "from sklearn.ensemble import RandomForestClassifier\n",
  "from sklearn.metrics import classification_report\n",
  "from sklearn.model_selection import learning_curve"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,\n",
  "                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):\n",
  "    \"\"\"Plot mean training and cross-validation scores against training-set size.\n",
  "\n",
  "    A new figure is started, the scores are computed with sklearn's\n",
  "    ``learning_curve`` and drawn as two lines (red = training, green =\n",
  "    cross-validation) with a shaded band of +/- one standard deviation.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    estimator : estimator object forwarded to ``learning_curve``.\n",
  "    title : str, title of the figure.\n",
  "    X, y : features and targets forwarded to ``learning_curve``.\n",
  "    ylim : optional (ymin, ymax) pair applied to the y-axis.\n",
  "    cv : cross-validation specification forwarded to ``learning_curve``.\n",
  "    n_jobs : number of parallel jobs forwarded to ``learning_curve``.\n",
  "    train_sizes : training-set sizes forwarded to ``learning_curve``.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    The ``matplotlib.pyplot`` module, so the caller can call ``show()`` on it.\n",
  "    \"\"\"\n",
  "    plt.figure()\n",
  "    plt.title(title)\n",
  "    if ylim is not None:\n",
  "        plt.ylim(*ylim)\n",
  "    plt.xlabel(\"Training examples\")\n",
  "    plt.ylabel(\"Score\")\n",
  "    train_sizes, train_scores, test_scores = learning_curve(\n",
  "        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)\n",
  "    # One row per training-set size, one column per CV fold: aggregate over the folds.\n",
  "    train_scores_mean = np.mean(train_scores, axis=1)\n",
  "    train_scores_std = np.std(train_scores, axis=1)\n",
  "    test_scores_mean = np.mean(test_scores, axis=1)\n",
  "    test_scores_std = np.std(test_scores, axis=1)\n",
  "    plt.grid()\n",
  "\n",
  "    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,\n",
  "                     train_scores_mean + train_scores_std, alpha=0.1,\n",
  "                     color=\"r\")\n",
  "    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,\n",
  "                     test_scores_mean + test_scores_std, alpha=0.1, color=\"g\")\n",
  "    plt.plot(train_sizes, train_scores_mean, 'o-', color=\"r\",\n",
  "             label=\"Training score\")\n",
  "    plt.plot(train_sizes, test_scores_mean, 'o-', color=\"g\",\n",
  "             label=\"Cross-validation score\")\n",
  "\n",
  "    plt.legend(loc=\"best\")\n",
  "    return plt"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Fixed seed (same value as the train/test split) so a re-run reproduces the scores.\n",
  "clf = RandomForestClassifier(n_estimators=10, random_state=42)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "clf.fit(X_train, y_train)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "clf.score(X_test, y_test)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predictions = clf.predict(X_test)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(classification_report(y_test, predictions))" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "title = 'Learning Curves (Random Forest)'\n",
  "estimator = clf\n",
  "plot_learning_curve(estimator, title, X_train, y_train)\n",
  "plt.show()"
 ] }
 ],
 "metadata": {
  "kernelspec": {
"display_name": "Python 3.5", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.5" } }, "nbformat": 4, "nbformat_minor": 1 }