{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every import lives in this first cell so the notebook runs top to bottom on a fresh kernel.\n",
    "import altair as alt\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We'll start by reading in our fraud dataset and looking at the column names:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_parquet(\"fraud-cleaned-sample.parquet\")\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def label_shares(frame, column):\n",
    "    \"\"\"Count rows per (label, column) pair and each pair's share of its label.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    frame : pandas.DataFrame\n",
    "        Must contain a \"label\" column and the column named by `column`.\n",
    "    column : str\n",
    "        Column whose values each label is broken down by.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per observed (label, column) pair with the columns \"label\",\n",
    "        `column`, \"count\", \"total\" (sum of the counts for that label) and\n",
    "        \"percentage\" (count / total, a fraction of 1).\n",
    "    \"\"\"\n",
    "    # observed=True only matters when a key is categorical: it keeps\n",
    "    # never-seen combinations (and their 0/0 shares) out of the result.\n",
    "    shares = (\n",
    "        frame.groupby([\"label\", column], observed=True)\n",
    "        .size()\n",
    "        .reset_index(name=\"count\")\n",
    "    )\n",
    "    # The \"sum\" alias replaces np.sum: recent pandas releases warn when a\n",
    "    # NumPy callable is handed to transform().\n",
    "    shares[\"total\"] = shares.groupby(\"label\", observed=True)[\"count\"].transform(\"sum\")\n",
    "    shares[\"percentage\"] = shares[\"count\"] / shares[\"total\"]\n",
    "    return shares\n",
    "\n",
    "\n",
    "def plot_label_shares(shares, column):\n",
    "    \"\"\"Bar chart of the output of label_shares, one facet per value of `column`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    shares : pandas.DataFrame\n",
    "        Frame with \"label\", \"percentage\" and `column` columns.\n",
    "    column : str\n",
    "        Column used to split the chart into side-by-side facets.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    altair.Chart\n",
    "        Bars of \"percentage\" (axis formatted as percent) per label, colored by label.\n",
    "    \"\"\"\n",
    "    return alt.Chart(shares).mark_bar().encode(\n",
    "        alt.Y(\"percentage:Q\", axis=alt.Axis(format=\".0%\")),\n",
    "        column=column,\n",
    "        x=\"label\",\n",
    "        color=\"label\",\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Transaction type distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trans_type_shares = label_shares(df, \"trans_type\")\n",
    "trans_type_shares"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_label_shares(trans_type_shares, \"trans_type\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Foreign transaction distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Table and chart sit in separate cells: only the last expression of a cell is\n",
    "# rendered, so a table placed above the chart in one cell would never be shown.\n",
    "foreign_shares = label_shares(df, \"foreign\")\n",
    "foreign_shares"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_label_shares(foreign_shares, \"foreign\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Transaction amount distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Per-label quantiles of the transaction amount; the result is indexed by\n",
    "# (label, quantile level) and has a single \"amount\" column.\n",
    "QUANTILE_LEVELS = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
    "qs = df[[\"label\", \"amount\"]].groupby(\"label\").quantile(q=QUANTILE_LEVELS)\n",
    "qs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name the quantile level explicitly. pandas leaves that index level unnamed, and\n",
    "# the placeholder it invents when the index is flattened (level_0 or level_1)\n",
    "# depends on the pandas version, so the chart must not rely on it.\n",
    "amount_quantiles = qs.rename_axis(index=[\"label\", \"quantile\"]).reset_index()\n",
    "\n",
    "alt.Chart(amount_quantiles).mark_line(interpolate=\"monotone\").encode(\n",
    "    alt.Y(\"amount\", axis=alt.Axis(title=\"transaction amounts (log scale)\"), scale=alt.Scale(type=\"log\")),\n",
    "    alt.X(\"quantile\", axis=alt.Axis(title=\"cumulative distribution\"), scale=alt.Scale(type=\"linear\")),\n",
    "    color=\"label\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Interarrival times"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SEED = 42  # fixes both random draws below so reruns produce the same chart\n",
    "MAX_CHART_ROWS = 5000  # the original sample size; also Altair's default row limit\n",
    "\n",
    "\n",
    "def interarrival_ranks(sample):\n",
    "    \"\"\"Distinct interarrival values of `sample` with their dense percentile rank.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    sample : pandas.DataFrame\n",
    "        Must contain \"label\" and \"interarrival\" columns.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per distinct (label, interarrival, irank) with a \"count\" column\n",
    "        holding how many rows of `sample` share that triple. \"irank\" is the\n",
    "        dense rank of \"interarrival\" within `sample`, expressed as a fraction.\n",
    "    \"\"\"\n",
    "    # assign() builds a new frame, so the caller's data is left untouched.\n",
    "    ranked = sample.assign(irank=sample[\"interarrival\"].rank(pct=True, method=\"dense\"))\n",
    "    return (\n",
    "        ranked.groupby([\"label\", \"interarrival\", \"irank\"], observed=True)\n",
    "        .size()\n",
    "        .reset_index(name=\"count\")\n",
    "    )\n",
    "\n",
    "\n",
    "fraud_rows = df[df[\"label\"] == \"fraud\"]\n",
    "legit_rows = df[df[\"label\"] == \"legitimate\"]\n",
    "# Balance the classes: draw as many legitimate rows as there are fraud rows,\n",
    "# capped so the draw cannot fail if legitimate rows are the smaller group.\n",
    "legit_sample = legit_rows.sample(min(len(fraud_rows), len(legit_rows)), random_state=SEED)\n",
    "\n",
    "# Ranks are computed per group before the filter, then values <= 0 are dropped\n",
    "# because they cannot be placed on the log-scaled axis of the chart below.\n",
    "interarrival_cdf = pd.concat(\n",
    "    [interarrival_ranks(fraud_rows), interarrival_ranks(legit_sample)],\n",
    "    ignore_index=True,\n",
    ").loc[lambda ranks: ranks[\"interarrival\"] > 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Plot at most MAX_CHART_ROWS points; an uncapped sample(5000) raises when\n",
    "# fewer rows are left after filtering.\n",
    "chart_rows = interarrival_cdf.sample(\n",
    "    min(len(interarrival_cdf), MAX_CHART_ROWS), random_state=SEED\n",
    ")\n",
    "\n",
    "alt.Chart(chart_rows).mark_line().encode(\n",
    "    alt.Y(\"interarrival\", axis=alt.Axis(title=\"interarrival time\"), scale=alt.Scale(type=\"log\")),\n",
    "    alt.X(\"irank\", axis=alt.Axis(title=\"cumulative distribution\"), scale=alt.Scale(type=\"linear\")),\n",
    "    color=\"label\",\n",
    ").interactive()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
"display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }