[![Static Badge](https://img.shields.io/badge/notebook-open_in_colab-blue?style=flat&logo=googlecolab&color=blue)](https://colab.research.google.com/drive/1QsYjUX3DgS8ccvDtxEgLeHHmbtPViIqV?usp=drive_link)

### We recommend using the [Google Colab](https://colab.research.google.com/drive/1QsYjUX3DgS8ccvDtxEgLeHHmbtPViIqV?usp=drive_link) verion of the notebook!

# Convert UCR data to Orion format

In this notebook we download the data and reformat it
as Orion pipelines expect.

### Download the data

In [1]:
# download dataset & unzip

import io
import os
import urllib
import zipfile

DATA_URL = 'https://www.cs.ucr.edu/~eamonn/time_series_data_2018/UCR_TimeSeriesAnomalyDatasets2021.zip'

response = urllib.request.urlopen(DATA_URL)
bytes_io = io.BytesIO(response.read())

with zipfile.ZipFile(bytes_io) as zf:
    zf.extractall()

In [2]:
DATA_PATH = os.path.join('AnomalyDatasets_2021',
                         'UCR_TimeSeriesAnomalyDatasets2021',
                         'FilesAreInHere',
                         'UCR_Anomaly_FullData')

SAVE_TO = 'UCR'
os.makedirs(SAVE_TO, exist_ok=True)

In [3]:
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm

#### Format

012_UCR_Anomaly_tiltAPB1_100000_114283_114350.txt

- `012` Dataset number
- `tiltAPB1` Mnemonic name
- `100000` From 1 to X is training data
- `114283` Begin anomaly
- `114350` End anomaly

In [4]:
def build_df(data, start=0):
    index = np.array(range(start, start + len(data)))
    step = 300
    initial_time = 1222819200
    timestamp = index * step + initial_time

    if len(data.shape) > 1 and data.shape[1] > 1:
        print("MULTIVARIATE")
        df = pd.DataFrame(data)
        df['timestamp'] = timestamp
    else:
        df = pd.DataFrame({'timestamp': timestamp, 'value': data.reshape(-1, )})

    df['timestamp'] = df['timestamp'].astype('int64')
    return df

df = build_df(np.loadtxt(DATA_PATH + '/204_UCR_Anomaly_CHARISfive_12412_15000_15070.txt'))
df.head()

Unnamed: 0,timestamp,value
0,1222819200,1990.0
1,1222819500,1996.0
2,1222819800,1958.0
3,1222820100,1958.0
4,1222820400,1923.0


In [5]:
files = os.listdir(DATA_PATH)
file_names, train_sizes, intervals = [], [], []

for file in tqdm(files):
    file_num_str, _, _, file_name, train_size_str, begin_str, end_str = file.split("_")

    train_size, begin_anomaly = int(train_size_str), int(begin_str)
    end_anomaly = int(end_str.split('.')[0])
    file_name = file_num_str + "-" + file_name

    # get timestamp from data
    df = build_df(np.loadtxt(os.path.join(DATA_PATH, file)))
    begin_anomaly = int(df.timestamp.iloc[begin_anomaly])
    end_anomaly = int(df.timestamp.iloc[end_anomaly])

    # train - test split
    train_df = df.iloc[: train_size]
    test_df = df.iloc[train_size: ]

    # save file
    train_df.to_csv(SAVE_TO + '/{}-train.csv'.format(file_name), index=False)
    test_df.to_csv(SAVE_TO + '/{}-test.csv'.format(file_name), index=False)
    df.to_csv(SAVE_TO + '/{}.csv'.format(file_name), index=False)

    file_names.append(file_name)
    train_sizes.append(train_size)
    intervals.append([begin_anomaly, end_anomaly])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 250/250 [00:19<00:00, 12.84it/s]


In [6]:
rows = []
for index, file_name in enumerate(file_names):
    row = [file_name, [intervals[index]]]
    rows.append(row)

In [7]:
rows

[['183-qtdbSel100MLII', [[1226839200, 1226959200]]],
 ['194-sddb49', [[1243204200, 1243279200]]],
 ['069-DISTORTEDinsectEPG5', [[1225369200, 1225369500]]],
 ['023-DISTORTEDGP711MarkerLFM5z5', [[1225402800, 1225434000]]],
 ['212-Italianpowerdemand', [[1231663200, 1231670400]]],
 ['180-ltstdbs30791ES', [[1238599200, 1238659200]]],
 ['058-DISTORTEDapneaecg', [[1226491200, 1226511600]]],
 ['130-GP711MarkerLFM5z4', [[1224777300, 1224812700]]],
 ['079-DISTORTEDresperation2', [[1273294200, 1273294200]]],
 ['224-mit14046longtermecg', [[1280179200, 1280329200]]],
 ['044-DISTORTEDPowerDemand1', [[1228364700, 1228465500]]],
 ['233-mit14157longtermecg', [[1230169200, 1230169500]]],
 ['042-DISTORTEDLab2Cmac011215EPG6', [[1226476200, 1226545200]]],
 ['114-CIMIS44AirTemperature2', [[1224530100, 1224537300]]],
 ['204-CHARISfive', [[1227319200, 1227340200]]],
 ['241-taichidbS0715Master', [[1474039200, 1474549200]]],
 ['028-DISTORTEDInternalBleeding17', [[1223778600, 1223811900]]],
 ['026-DISTORTEDInter

In [8]:
# save anomalies

with open(SAVE_TO + '/anomalies.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(rows)

In [9]:
new_labels = pd.read_csv(SAVE_TO + '/anomalies.csv', header=None, names=['signal', 'events'])
new_labels

Unnamed: 0,signal,events
0,183-qtdbSel100MLII,"[[1226839200, 1226959200]]"
1,194-sddb49,"[[1243204200, 1243279200]]"
2,069-DISTORTEDinsectEPG5,"[[1225369200, 1225369500]]"
3,023-DISTORTEDGP711MarkerLFM5z5,"[[1225402800, 1225434000]]"
4,212-Italianpowerdemand,"[[1231663200, 1231670400]]"
...,...,...
245,075-DISTORTEDqtdbSel100MLII,"[[1226839200, 1226959200]]"
246,132-InternalBleeding10,"[[1224177000, 1224186000]]"
247,109-1sddb40,"[[1238419200, 1238605200]]"
248,176-insectEPG4,"[[1224771600, 1224786600]]"
