{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Convert NASA data to Orion format\n",
    "\n",
    "In this notebook we download the data from the Telemanom S3 bucket and reformat it\n",
    "as Orion pipelines expect."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "import os\n",
    "import urllib.request\n",
    "import zipfile\n",
    "\n",
    "DATA_URL = 'https://s3-us-west-2.amazonaws.com/telemanom/data.zip'\n",
    "\n",
    "# Skip the download when a `data` path already exists, so re-running the\n",
    "# notebook does not fetch the archive again.\n",
    "if not os.path.exists('data'):\n",
    "    # `import urllib` alone does not load the `request` submodule, which is\n",
    "    # why `urllib.request` is imported explicitly above.\n",
    "    with urllib.request.urlopen(DATA_URL) as response:\n",
    "        bytes_io = io.BytesIO(response.read())\n",
    "\n",
    "    # Extract the archive into the current working directory.\n",
    "    with zipfile.ZipFile(bytes_io) as zf:\n",
    "        zf.extractall()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# os.listdir returns entries in arbitrary order; sort both listings so the\n",
    "# train/test comparison and the processing order are deterministic.\n",
    "train_signals = sorted(os.listdir('data/train'))\n",
    "test_signals = sorted(os.listdir('data/test'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_signals == test_signals"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert the NPY matrices to CSVs\n",
    "\n",
    "We convert the NPY matrices to CSV files with two columns: `timestamp` and `value`.\n",
    "\n",
    "To do this, we load both the train and test matrices for each signal\n",
    "and concatenate them to generate a single matrix per signal.\n",
    "\n",
    "Afterwards, we add a timestamp column by using the value 1222819200 (2008-10-01T00:00:00)\n",
    "for the first row and then increasing the timestamp by 21600 seconds (6h) for each subsequent row."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "NASA_DIR = os.path.join('data', '{}', '{}')\n",
    "\n",
    "# Timestamp assigned to row 0: 1222819200 is 2008-10-01T00:00:00 in epoch seconds.\n",
    "FIRST_TIMESTAMP = 1222819200\n",
    "# Spacing between consecutive rows: 21600 seconds (6 hours).\n",
    "TIMESTAMP_STEP = 21600\n",
    "\n",
    "\n",
    "def build_df(data, start=0):\n",
    "    \"\"\"Build a `timestamp`/`value` DataFrame from a signal matrix.\n",
    "\n",
    "    Args:\n",
    "        data: 2-D array; only its first column is kept, as `value`.\n",
    "        start: Row offset used to compute the timestamp of the first row.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with one row per row of `data`.\n",
    "    \"\"\"\n",
    "    # Explicit int64 so the timestamps do not depend on the platform's\n",
    "    # default integer width.\n",
    "    index = np.arange(start, start + len(data), dtype=np.int64)\n",
    "    timestamp = index * TIMESTAMP_STEP + FIRST_TIMESTAMP\n",
    "\n",
    "    return pd.DataFrame({'timestamp': timestamp, 'value': data[:, 0]})\n",
    "\n",
    "\n",
    "data = build_df(np.load(NASA_DIR.format('train', 'S-1.npy')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>timestamp</th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1222819200</td>\n",
       "      <td>-0.366359</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1222840800</td>\n",
       "      <td>-0.394108</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1222862400</td>\n",
       "      <td>0.403625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1222884000</td>\n",
       "      <td>-0.362759</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1222905600</td>\n",
       "      <td>-0.370746</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    timestamp     value\n",
       "0  1222819200 -0.366359\n",
       "1  1222840800 -0.394108\n",
       "2  1222862400  0.403625\n",
       "3  1222884000 -0.362759\n",
       "4  1222905600 -0.370746"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Store the results as CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs('csv', exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PATH_DIR = os.path.join('csv', '{}')\n",
    "\n",
    "for signal in train_signals:\n",
    "    # Derive the signal name from the file name and skip any directory\n",
    "    # entry that is not an NPY matrix, instead of failing in np.load.\n",
    "    name, extension = os.path.splitext(signal)\n",
    "    if extension != '.npy':\n",
    "        continue\n",
    "\n",
    "    train_np = np.load(NASA_DIR.format('train', signal))\n",
    "    test_np = np.load(NASA_DIR.format('test', signal))\n",
    "\n",
    "    # Full signal: train rows followed by test rows.\n",
    "    data = build_df(np.concatenate([train_np, test_np]))\n",
    "    data.to_csv(PATH_DIR.format(name + '.csv'), index=False)\n",
    "\n",
    "    train = build_df(train_np)\n",
    "    train.to_csv(PATH_DIR.format(name + '-train.csv'), index=False)\n",
    "\n",
    "    # Offset the test rows so their timestamps continue right after the\n",
    "    # last train row, matching the combined file written above.\n",
    "    test = build_df(test_np, start=len(train))\n",
    "    test.to_csv(PATH_DIR.format(name + '-test.csv'), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1 = pd.read_csv(PATH_DIR.format('S-1.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Prepare Anomaly Labels\n",
    "\n",
    "We will use the `labeled_anomalies.csv` file from the telemanom project\n",
    "and convert it to the CSV that we will later on use in Orion."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "from orion.data import load_signal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "CSV_URL = 'https://github.com/khundman/telemanom/raw/master/labeled_anomalies.csv'\n",
    "\n",
    "df = pd.read_csv(CSV_URL)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "labels_data = list()\n",
    "\n",
    "for _, row in df.iterrows():\n",
    "    signal = row.chan_id\n",
    "    data = load_signal(os.path.join('csv', signal + '.csv'))\n",
    "    # The anomaly indices are applied to the last `num_values` rows of the\n",
    "    # signal. Selecting the timestamp column directly keeps its own dtype;\n",
    "    # taking a whole row would mix it with the float `value` column.\n",
    "    test_timestamps = data['timestamp'].iloc[-row.num_values:]\n",
    "\n",
    "    events = list()\n",
    "    for start, end in json.loads(row.anomaly_sequences):\n",
    "        # Plain Python ints, so the lists are written to the CSV as bare\n",
    "        # numbers regardless of how the installed NumPy prints its scalars.\n",
    "        start_ts = int(test_timestamps.iloc[start])\n",
    "        end_ts = int(test_timestamps.iloc[end])\n",
    "        events.append([start_ts, end_ts])\n",
    "\n",
    "    labels_data.append({\n",
    "        'signal': signal,\n",
    "        'events': events\n",
    "    })\n",
    "\n",
    "# Passing `columns` fixes the column order and also yields an empty frame\n",
    "# with the right header when there are no labels.\n",
    "labels = pd.DataFrame(labels_data, columns=['signal', 'events'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels.to_csv('labels.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}