{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loading data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Fixed seed so the 5% sample written at the end of the notebook is the same on every run.\n",
    "SAMPLE_SEED = 42\n",
    "\n",
    "df = pd.read_parquet(\"fraud-all.parquet\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cleaning data\n",
    "\n",
    "These data are mostly clean but we need to add a new field for transaction interarrival time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Order each user's transactions chronologically. drop=True discards the old index\n",
    "# outright, so this works whatever the loaded index is called and leaves no\n",
    "# leftover column that has to be deleted afterwards.\n",
    "df = df.sort_values([\"user_id\", \"timestamp\"]).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Interarrival = gap between a transaction and the same user's previous one.\n",
    "# The frame is sorted by (user_id, timestamp), so the preceding row is that previous\n",
    "# transaction exactly when its user_id matches; a user's first transaction has no\n",
    "# predecessor and is left missing (Series.where fills with NaN/NaT by default).\n",
    "same_user = df[\"user_id\"].eq(df[\"user_id\"].shift(1))\n",
    "df[\"interarrival\"] = (df[\"timestamp\"] - df[\"timestamp\"].shift(1)).where(same_user)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pin compact, explicit dtypes before writing the cleaned files.\n",
    "df = df.astype(\n",
    "    {\n",
    "        \"label\": \"string\",\n",
    "        \"trans_type\": \"string\",\n",
    "        \"merchant_id\": \"int32\",\n",
    "        \"user_id\": \"int32\",\n",
    "        \"amount\": \"float32\",\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for this data set, brotli is ~60% the size of snappy (the default)\n",
    "# fastparquet can't currently handle strings (!)\n",
    "df.to_parquet(\"fraud-cleaned.parquet\", engine=\"pyarrow\", compression=\"brotli\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5% random sample of the cleaned data; seeded so the sample file is reproducible.\n",
    "df.sample(frac=0.05, random_state=SAMPLE_SEED).to_parquet(\n",
    "    \"fraud-cleaned-sample.parquet\", engine=\"pyarrow\", compression=\"brotli\"\n",
    ")"
   ]
  }
 ],
 "metadata": {
"kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }