In [None]:
import pandas as pd
import numpy as np

We'll start by reading in our fraud dataset and looking at the column names:

In [None]:
# Read the cleaned fraud sample into memory.
fraud_sample_path = "fraud-cleaned-sample.parquet"
df = pd.read_parquet(fraud_sample_path)

# Ending the cell on the column index displays the available field names.
df.columns

# Transaction type distribution

In [None]:
# Count the rows for every (label, trans_type) pair. `timestamp` is carried
# along only so that `len` has a single value column to count over.
pt = pd.pivot_table(
    df[["label", "trans_type", "timestamp"]],
    index=["label", "trans_type"],
    aggfunc=len,
)
pt.columns = ["count"]

# Turn the (label, trans_type) index back into ordinary columns.
gdf = pt.reset_index()

# Share of each transaction type within its own label, so the percentages of
# every label add up to 1. The aggregation is passed by name because handing a
# NumPy callable such as `np.sum` to a groupby is deprecated in pandas 2.1+.
gdf["total"] = gdf.groupby("label")["count"].transform("sum")
gdf["percentage"] = gdf["count"] / gdf["total"]

gdf

In [None]:
import altair as alt

# Bar chart of the per-label shares held in `gdf`: one panel per transaction
# type, one bar per label, with the y axis formatted as whole percentages.
share_axis = alt.Axis(format='.0%')
trans_type_chart = alt.Chart(gdf).mark_bar().encode(
    x="label",
    y=alt.Y('percentage:Q', axis=share_axis),
    color='label',
    column='trans_type',
)
trans_type_chart

# Foreign transaction distribution

In [None]:
# Count the rows for every (label, foreign) pair. `timestamp` is carried along
# only so that `len` has a single value column to count over.
pt = pd.pivot_table(
    df[["label", "foreign", "timestamp"]],
    index=["label", "foreign"],
    aggfunc=len,
)
pt.columns = ["count"]

# Turn the (label, foreign) index back into ordinary columns.
gdf = pt.reset_index()

# Share of foreign / non-foreign transactions within each label. The
# aggregation is passed by name because handing a NumPy callable such as
# `np.sum` to a groupby is deprecated in pandas 2.1+.
gdf["total"] = gdf.groupby("label")["count"].transform("sum")
gdf["pctage"] = gdf["count"] / gdf["total"]

# Only the final expression of a cell is rendered automatically, so the table
# has to be displayed explicitly when a chart follows it in the same cell.
display(gdf)

# One panel per value of `foreign`, one bar per label, y axis as percentages.
alt.Chart(gdf).mark_bar().encode(
    alt.Y('pctage:Q', axis=alt.Axis(format='.0%')),
    column='foreign',
    x="label",
    color='label',
)

# Transaction amount distribution

In [None]:
%%time
qs = df[['label','amount']].groupby('label').quantile(q=[0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99])
qs

In [None]:
# Flatten the (label, quantile level) index of `qs` into ordinary columns.
# The quantile level has no name of its own, and the placeholder name that
# `to_records()` makes up for it ("level_0" vs. "level_1") has differed between
# pandas releases, so the level is named explicitly instead of relying on it.
qdf = qs.rename_axis(index=["label", "quantile"]).reset_index()

# Empirical distribution of amounts per label: quantile level on x, the amount
# at that level on a log-scaled y axis, one line per label.
alt.Chart(qdf).mark_line(interpolate="monotone").encode(
    alt.Y("amount", axis=alt.Axis(title='transaction amounts (log scale)'), scale=alt.Scale(type='log')),
    alt.X("quantile", axis=alt.Axis(title='cumulative distribution'), scale=alt.Scale(type='linear')),
    color="label"
)

# Interarrival times

In [None]:
# Fixed seed so the legitimate subsample (and every chart built from it) is
# identical on each Restart & Run All.
INTERARRIVAL_SEED = 42

fraudsamp = df[df["label"] == "fraud"].copy()
legit_all = df[df["label"] == "legitimate"]

# Draw as many legitimate rows as there are fraud rows so both groups have the
# same size. The request is capped at the number of legitimate rows available,
# because sampling without replacement raises when asked for more than exist.
legitsamp = legit_all.sample(
    n=min(len(fraudsamp), len(legit_all)),
    random_state=INTERARRIVAL_SEED,
).copy()

# Percentile rank of each interarrival value within its own group.
fraudsamp['irank'] = fraudsamp['interarrival'].rank(pct=True, method="dense")
legitsamp['irank'] = legitsamp['interarrival'].rank(pct=True, method="dense")

# One row per distinct (label, interarrival, irank) combination, with the
# number of transactions that share it in `count`.
group_keys = ['label', 'interarrival', 'irank']
qdf = pd.concat(
    [fraudsamp.groupby(group_keys).size(), legitsamp.groupby(group_keys).size()]
).reset_index(name="count")

# Keep strictly positive interarrival times only; the chart that consumes
# `qdf` puts this column on a log scale.
qdf = qdf[qdf['interarrival'] > 0]

In [None]:
# Plot a random subset of at most MAX_CHART_ROWS rows.
# NOTE(review): 5000 presumably matches Altair's default row limit -- confirm.
# The request is capped at len(qdf) because sampling without replacement
# raises when asked for more rows than exist, and the seed keeps the plotted
# subset the same on every run.
MAX_CHART_ROWS = 5000
chart_rows = qdf.sample(n=min(MAX_CHART_ROWS, len(qdf)), random_state=42)

# Empirical distribution of interarrival times per label: percentile rank on
# x, interarrival time on a log-scaled y axis, with pan/zoom enabled.
alt.Chart(chart_rows).mark_line().encode(
    alt.Y("interarrival", axis=alt.Axis(title='interarrival time'), scale=alt.Scale(type='log')),
    alt.X("irank", axis=alt.Axis(title='cumulative distribution'), scale=alt.Scale(type='linear')),
    color="label"
).interactive()