{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Loading data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "df = pd.read_parquet(\"fraud-cleaned-sample.parquet\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Train/test split\n", "\n", "We're using time-series data, so we'll split based on time." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "first = df['timestamp'].min()\n", "last = df['timestamp'].max()\n", "cutoff = first + ((last - first) * 0.7)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train = df[df['timestamp'] <= cutoff]\n", "len(train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test = df[df['timestamp'] > cutoff]\n", "len(test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(train) / (len(train) + len(test))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Encoding categorical features" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sklearn\n", "from sklearn.pipeline import Pipeline\n", "from sklearn import feature_extraction, preprocessing\n", "from sklearn.preprocessing import FunctionTransformer\n", "from sklearn.compose import ColumnTransformer\n", "\n", "stringize = np.frompyfunc(lambda x: \"%s\" % x, 1, 1)\n", "\n", "def mk_stringize(colname):\n", " def stringize(tab):\n", " return [{colname : s} for s in tab]\n", " return stringize\n", "\n", "def amap(s):\n", " return s.map(str)\n", "\n", "# my_func = mk_stringize('merchant_id')\n", "my_func = amap\n", "\n", "def mk_hasher(features=16384, values=None): \n", " return Pipeline([('stringize', \n", " FunctionTransformer(my_func, accept_sparse=True)), \n", " ('hasher', \n", " sklearn.feature_extraction.FeatureHasher(n_features=features, input_type='string'))])\n", "\n", "\n", "tt_xform = ('onehot', sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore', categories=[['online','contactless','chip_and_pin','manual','swipe']]), ['trans_type'])\n", "mu_xform = ('m_hashing', mk_hasher(256), 'merchant_id')\n", "\n", "xform_steps = [tt_xform, mu_xform]\n", "\n", "cat_xform = ColumnTransformer(transformers=xform_steps, n_jobs=None)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Encoding other features\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import RobustScaler\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.pipeline import Pipeline\n", "\n", "\n", "impute_and_scale = Pipeline([('median_imputer', SimpleImputer()), ('interarrival_scaler', RobustScaler())])\n", "ia_scaler = ('interarrival_scaler', impute_and_scale, ['interarrival'])\n", "amount_scaler = ('amount_scaler', RobustScaler(), ['amount'])\n", "\n", "scale_steps = [ia_scaler, amount_scaler]\n", "all_xforms = ColumnTransformer(transformers=(scale_steps + xform_steps))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Fit and save the feature extraction pipeline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "feat_pipeline = Pipeline([\n", " ('feature_extraction',all_xforms)\n", "])\n", "\n", "feat_pipeline.fit(train)\n", "\n", "from mlworkflows import util\n", 
"util.serialize_to(feat_pipeline, \"feature_pipeline.sav\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }