{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Convert NASA data to Orion format\n", "\n", "In this notebook we download the data from the Telemanom S3 bucket and reformat it\n", "as Orion pipelines expect." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Download the data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import io\n", "import os\n", "import urllib.request\n", "import zipfile\n", "\n", "DATA_URL = 'https://s3-us-west-2.amazonaws.com/telemanom/data.zip'\n", "\n", "# Skip the download when a `data` directory already exists.\n", "if not os.path.exists('data'):\n", "    # `import urllib` alone does not guarantee that the `request` submodule is\n", "    # loaded, hence the explicit `urllib.request` import above. The `with` block\n", "    # closes the HTTP response once the archive has been read into memory.\n", "    with urllib.request.urlopen(DATA_URL) as response:\n", "        bytes_io = io.BytesIO(response.read())\n", "\n", "    # Extract into the current working directory.\n", "    with zipfile.ZipFile(bytes_io) as zf:\n", "        zf.extractall()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# os.listdir returns entries in arbitrary order; sorting makes the comparison\n", "# below and the later processing order deterministic.\n", "train_signals = sorted(os.listdir('data/train'))\n", "test_signals = sorted(os.listdir('data/test'))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_signals == test_signals" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert the NPY matrices to CSVs\n", "\n", "We convert the NPY matrices to CSV files with two columns: `timestamp` and `value`.\n", "\n", "To do this, we load both the train and test matrices for each signal\n", "and concatenate them to generate a single matrix per signal.\n", "\n", "Afterwards, we add a timestamp column by using the value 1222819200 (2008-10-01T00:00:00 UTC)\n", "for the first row and then increasing the timestamp by 21600 seconds (6h) for each subsequent row."
] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "NASA_DIR = os.path.join('data', '{}', '{}')\n", "\n", "# Timestamp assigned to row 0: 2008-10-01T00:00:00 UTC.\n", "START_TIMESTAMP = 1222819200\n", "# Seconds between two consecutive rows (6 hours).\n", "INTERVAL = 21600\n", "\n", "\n", "def build_df(data, start=0):\n", "    \"\"\"Build a `timestamp`/`value` DataFrame from a signal matrix.\n", "\n", "    Args:\n", "        data (numpy.ndarray): 2D matrix with one row per observation. Only its\n", "            first column is kept, as the `value` column.\n", "        start (int): Index of the first row; shifts the generated timestamps by\n", "            `start` intervals.\n", "\n", "    Returns:\n", "        pandas.DataFrame: One row per row of `data`, with columns `timestamp`\n", "        and `value`.\n", "    \"\"\"\n", "    # An explicit int64 dtype keeps the timestamps integral for empty input and\n", "    # on platforms where NumPy's default integer is 32 bits wide.\n", "    index = np.arange(start, start + len(data), dtype=np.int64)\n", "    timestamp = index * INTERVAL + START_TIMESTAMP\n", "\n", "    return pd.DataFrame({'timestamp': timestamp, 'value': data[:, 0]})\n", "\n", "\n", "data = build_df(np.load(NASA_DIR.format('train', 'S-1.npy')))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timestampvalue
01222819200-0.366359
11222840800-0.394108
212228624000.403625
31222884000-0.362759
41222905600-0.370746
\n", "
" ], "text/plain": [ " timestamp value\n", "0 1222819200 -0.366359\n", "1 1222840800 -0.394108\n", "2 1222862400 0.403625\n", "3 1222884000 -0.362759\n", "4 1222905600 -0.370746" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Store the results as CSV" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "os.makedirs('csv', exist_ok=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "PATH_DIR = os.path.join('csv', '{}')\n", "\n", "for signal in train_signals:\n", " name = signal[:-4]\n", " train_np = np.load(NASA_DIR.format('train', signal))\n", " test_np = np.load(NASA_DIR.format('test', signal))\n", " \n", " data = build_df(np.concatenate([train_np, test_np]))\n", " data.to_csv(PATH_DIR.format(name + '.csv'), index=False)\n", " \n", " train = build_df(train_np)\n", " train.to_csv(PATH_DIR.format(name + '-train.csv'), index=False)\n", " \n", " test = build_df(test_np, start=len(train))\n", " test.to_csv(PATH_DIR.format(name + '-test.csv'), index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s1 = pd.read_csv(PATH_DIR.format('S-1.csv'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s1.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. Prepare Anomaly Labels\n", "\n", "We will use the `labeled_anomalies.csv` file from the telemanom project\n", "and convert it to the CSV that we will later on use in Orion." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "import pandas as pd\n", "\n", "from orion.data import load_signal" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "CSV_URL = 'https://github.com/khundman/telemanom/raw/master/labeled_anomalies.csv'\n", "\n", "df = pd.read_csv(CSV_URL)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "labels_data = []\n", "\n", "for _, row in df.iterrows():\n", "    signal = row.chan_id\n", "    data = load_signal(os.path.join('csv', signal + '.csv'))\n", "    # The anomaly sequences are positions within the last `num_values` rows of\n", "    # the full signal (the slice the original code called `test`).\n", "    test_timestamps = data[-row.num_values:]['timestamp']\n", "\n", "    # Store plain Python ints: the event lists are written to CSV through their\n", "    # text form, and NumPy scalars can render as e.g. `np.int64(...)` there.\n", "    events = [\n", "        [int(test_timestamps.iloc[start]), int(test_timestamps.iloc[end])]\n", "        for start, end in json.loads(row.anomaly_sequences)\n", "    ]\n", "\n", "    labels_data.append({'signal': signal, 'events': events})\n", "\n", "# Passing `columns` fixes the column order and also works when there are no rows.\n", "labels = pd.DataFrame(labels_data, columns=['signal', 'events'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "labels.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "labels.to_csv('labels.csv', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.11" } }, "nbformat": 4, "nbformat_minor": 2 }