{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Loading data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "df = pd.read_parquet(\"fraud-cleaned-sample.parquet\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Train/test split\n", "\n", "We're using time-series data, so we'll split based on time." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "first = df['timestamp'].min()\n", "last = df['timestamp'].max()\n", "cutoff = first + ((last - first) * 0.7)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train = df[df['timestamp'] <= cutoff]\n", "len(train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test = df[df['timestamp'] > cutoff]\n", "len(test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(train) / (len(train) + len(test))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Encoding categorical features" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sklearn\n", "from sklearn.pipeline import Pipeline\n", "from sklearn import feature_extraction, preprocessing\n", "from sklearn.preprocessing import FunctionTransformer\n", "from sklearn.compose import ColumnTransformer\n", "\n", "stringize = np.frompyfunc(lambda x: \"%s\" % x, 1, 1)\n", "\n", "def mk_stringize(colname):\n", " def stringize(tab):\n", " return [{colname : s} for s in tab]\n", " return stringize\n", "\n", "def amap(s):\n", " return s.map(str)\n", "\n", "# my_func = mk_stringize('merchant_id')\n", "my_func = amap\n", "\n", "def mk_hasher(features=16384, values=None): \n", " return Pipeline([('stringize', \n", " FunctionTransformer(my_func, accept_sparse=True)), \n", " ('hasher', \n", " sklearn.feature_extraction.FeatureHasher(n_features=features, input_type='string'))])\n", "\n", "\n", "tt_xform = ('onehot', sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore', categories=[['online','contactless','chip_and_pin','manual','swipe']]), ['trans_type'])\n", "mu_xform = ('m_hashing', mk_hasher(256), 'merchant_id')\n", "\n", "xform_steps = [tt_xform, mu_xform]\n", "\n", "cat_xform = ColumnTransformer(transformers=xform_steps, n_jobs=None)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Encoding other features\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import RobustScaler\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.pipeline import Pipeline\n", "\n", "\n", "impute_and_scale = Pipeline([('median_imputer', SimpleImputer()), ('interarrival_scaler', RobustScaler())])\n", "ia_scaler = ('interarrival_scaler', impute_and_scale, ['interarrival'])\n", "amount_scaler = ('amount_scaler', RobustScaler(), ['amount'])\n", "\n", "scale_steps = [ia_scaler, amount_scaler]\n", "all_xforms = ColumnTransformer(transformers=(scale_steps + xform_steps))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Fit and save the feature extraction pipeline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "feat_pipeline = Pipeline([\n", " ('feature_extraction',all_xforms)\n", "])\n", "\n", "feat_pipeline.fit(train)\n", "\n", "from mlworkflows import util\n", 
"util.serialize_to(feat_pipeline, \"feature_pipeline.sav\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }