{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import the JSON File with Industry Lab Sensor Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import sys\n",
    "import types\n",
    "import pandas as pd\n",
    "from botocore.client import Config\n",
    "import ibm_boto3\n",
    "\n",
    "def __iter__(self): return 0\n",
    "\n",
    "# @hidden_cell\n",
    "# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.\n",
    "# You might want to remove those credentials before you share your notebook.\n",
    "client_915ea66450e44183938b1aab8572887f = ibm_boto3.client(service_name='s3',\n",
    "    ibm_api_key_id='x8LKJUWfwBvzMvyp4glOaZk6VQOcrYmbWV80lKToccNW',\n",
    "    ibm_auth_endpoint=\"https://iam.ng.bluemix.net/oidc/token\",\n",
    "    config=Config(signature_version='oauth'),\n",
    "    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')\n",
    "\n",
    "body = client_915ea66450e44183938b1aab8572887f.get_object(Bucket='dsbootcampac3431d743f2492ebe1cfe6103674873',Key='floorsensordata2604.json')['Body']\n",
    "# add missing __iter__ method, so pandas accepts body as file-like object \n",
    "\n",
    "if not hasattr(body, \"__iter__\"): body.__iter__ = types.MethodType( __iter__, body )\n",
    "\n",
    "# Since JSON data can be semi-structured and contain additional metadata, it is possible that you might face an error during data loading.\n",
    "# Please read the documentation of 'pandas.read_json()' and 'pandas.io.json.json_normalize' to learn more about the possibilities to adjust the data loading.\n",
    "# pandas documentation: http://pandas.pydata.org/pandas-docs/stable/io.html#io-json-reader\n",
    "# and http://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html\n",
    "\n",
    "df_data_1 = pd.read_json(body, orient='values')\n",
    "df_data_1.head()\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check the Structure of the Data Frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df_data_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.tail(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "df.itemname.unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Format Yanzi Sensor Data "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['id'] = df['itemname'].str.split('_').str[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['temperature'] = df[df['itemname'].str.contains('temperature')]['value'].astype(float)\n",
    "df['carbonDioxide'] = df[df['itemname'].str.contains('carbonDioxide')]['value'].astype(float)\n",
    "df['humidity'] = df[df['itemname'].str.contains('humidity')]['value'].astype(float)\n",
    "df['illuminance'] = df[df['itemname'].str.contains('illuminance')]['value'].astype(float)\n",
    "df['pressure'] = df[df['itemname'].str.contains('pressure')]['value'].astype(float)\n",
    "df['Occupancy'] = df[df['itemname'].str.contains('Occupancy')]['value']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualize Data with Pixidust"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pixiedust"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pixiedust": {
     "displayParams": {
      "filter": "{\"regex\": \"false\", \"constraint\": \"None\", \"field\": \"id\", \"case_matter\": \"false\", \"value\": \"4674C\"}",
      "handlerId": "lineChart",
      "keyFields": "time",
      "legend": "true",
      "lineChartType": "subplots",
      "no_margin": "true",
      "rendererId": "matplotlib",
      "timeseries": "true",
      "title": "Shinano Meeting Room",
      "valueFields": "temperature"
     }
    }
   },
   "outputs": [],
   "source": [
    "display(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filter on Shinano and Fill Missing Values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_shinano = df[df['id'].isin(['4674C', 'Shinano'])].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_shinano.sort_values(by='time', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_shinano.index = pd.to_datetime(df_shinano.time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_shinano.fillna(method='ffill', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_shinano.fillna(method='bfill', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_shinano.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_shinano.drop(['illuminance'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_shinano.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_shinano.Occupancy.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_shinano.groupby('Occupancy')['time'].nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Plot Sensor Values depending on Occupancy State"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_shinano.temperature.plot()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build Simple Prediction Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_shinano_model = df_shinano.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_shinano_model.loc[df_shinano_model.Occupancy == 'free', 'Occupancy'] = 0\n",
    "df_shinano_model.loc[df_shinano_model.Occupancy == 'occupied', 'Occupancy'] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = df_shinano_model['Occupancy'].values\n",
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df_shinano_model.loc[:, ['temperature', 'carbonDioxide', 'humidity', 'pressure']].values\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.model_selection import learning_curve"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,\n",
    "                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):\n",
    "    plt.figure()\n",
    "    plt.title(title)\n",
    "    if ylim is not None:\n",
    "        plt.ylim(*ylim)\n",
    "    plt.xlabel(\"Training examples\")\n",
    "    plt.ylabel(\"Score\")\n",
    "    train_sizes, train_scores, test_scores = learning_curve(\n",
    "        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)\n",
    "    train_scores_mean = np.mean(train_scores, axis=1)\n",
    "    train_scores_std = np.std(train_scores, axis=1)\n",
    "    test_scores_mean = np.mean(test_scores, axis=1)\n",
    "    test_scores_std = np.std(test_scores, axis=1)\n",
    "    plt.grid()\n",
    "\n",
    "    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,\n",
    "                     train_scores_mean + train_scores_std, alpha=0.1,\n",
    "                     color=\"r\")\n",
    "    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,\n",
    "                     test_scores_mean + test_scores_std, alpha=0.1, color=\"g\")\n",
    "    plt.plot(train_sizes, train_scores_mean, 'o-', color=\"r\",\n",
    "             label=\"Training score\")\n",
    "    plt.plot(train_sizes, test_scores_mean, 'o-', color=\"g\",\n",
    "             label=\"Cross-validation score\")\n",
    "\n",
    "    plt.legend(loc=\"best\")\n",
    "    return plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = RandomForestClassifier(n_estimators=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(classification_report(y_test, predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "title = 'Learning Curves (Random Forest)'\n",
    "estimator = clf\n",
    "plot_learning_curve(estimator, title, X_train, y_train)\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.5",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}