{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cloudpickle as cp\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn import model_selection\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "from mlworkflows import plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Share of the observed time range (not of the row count) that is used for training.\n",
    "TRAIN_FRACTION = 0.7\n",
    "\n",
    "df = pd.read_parquet(\"fraud-cleaned-sample.parquet\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train/test split\n",
    "\n",
    "We're using time-series data, so we'll split based on time: rows whose timestamp is at or before the cutoff (`TRAIN_FRACTION` of the way through the observed time range) form the training set, and later rows form the test set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "first = df['timestamp'].min()\n",
    "last = df['timestamp'].max()\n",
    "cutoff = first + ((last - first) * TRAIN_FRACTION)\n",
    "\n",
    "# Copy the slices themselves so train and test are standalone frames;\n",
    "# copying df before slicing (as this cell used to do) does not achieve that.\n",
    "train = df[df['timestamp'] <= cutoff].copy()\n",
    "test = df[df['timestamp'] > cutoff].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: unpickling can execute arbitrary code -- only load pipeline files from a trusted source.\n",
    "# The context manager closes the file handle once the pipeline has been loaded.\n",
    "with open('feature_pipeline.sav', 'rb') as pipeline_file:\n",
    "    feature_pipeline = cp.load(pipeline_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "rfc = RandomForestClassifier(n_estimators=16, max_depth=8, random_state=404, class_weight=\"balanced_subsample\")\n",
    "\n",
    "# Fit the feature pipeline on the training window only, then train on the resulting vectors.\n",
    "svecs = feature_pipeline.fit_transform(train)\n",
    "rfc.fit(svecs, train[\"label\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluate the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reuse the pipeline exactly as fitted on the training data. Re-fitting it on the\n",
    "# test set would leak test information into the features and could yield a feature\n",
    "# space that differs from the one the classifier was trained on.\n",
    "test_vecs = feature_pipeline.transform(test)\n",
    "predictions = rfc.predict(test_vecs)\n",
    "print(classification_report(test[\"label\"], predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the confusion-matrix table under its own name so the source data in `df`\n",
    "# is not overwritten and the cells above stay re-runnable.\n",
    "confusion_df, chart = plot.binary_confusion_matrix(test[\"label\"], predictions)\n",
    "chart"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "confusion_df"
   ]
  },
{
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save the trained model as a pipeline stage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mlworkflows import util\n",
    "\n",
    "# Hand the classifier trained above to mlworkflows' serialize_to helper,\n",
    "# targeting the file named below.\n",
    "MODEL_PATH = \"rfc.sav\"\n",
    "util.serialize_to(rfc, MODEL_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}