{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cloudpickle as cp\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn import model_selection\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "from mlworkflows import plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SEED = 404  # shared by the row sampling and the random forest\n",
    "SAMPLE_FRAC = 0.1  # fraction of rows kept for fast iteration\n",
    "TRAIN_FRAC = 0.7  # share of the time range assigned to training\n",
    "\n",
    "df = pd.read_parquet(\"fraud.parquet\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train/test split\n",
    "\n",
    "We're using time-series data, so we'll split based on time. The cutoff sits 70% of the way through the full time range; we then work on a 10% random sample of the rows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "first = df[\"timestamp\"].min()\n",
    "last = df[\"timestamp\"].max()\n",
    "cutoff = first + (last - first) * TRAIN_FRAC\n",
    "\n",
    "# Sample into a new, seeded frame: overwriting `df` would shrink the data\n",
    "# again every time this cell is re-run, and an unseeded sample is not reproducible.\n",
    "sampled = df.sample(frac=SAMPLE_FRAC, random_state=SEED)\n",
    "\n",
    "# .copy() makes train/test independent frames, so adding columns to them later\n",
    "# does not raise SettingWithCopyWarning.\n",
    "train = sampled[sampled[\"timestamp\"] <= cutoff].copy()\n",
    "test = sampled[sampled[\"timestamp\"] > cutoff].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): unpickling can execute arbitrary code; only load pipeline files from a trusted source.\n",
    "with open(\"feature_pipeline.sav\", \"rb\") as pipeline_file:\n",
    "    feature_pipeline = cp.load(pipeline_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Weighting samples\n",
    "\n",
    "We're going to weight samples by the inverse of the frequency of their label. With two classes, giving each sample the frequency of the *other* label is proportional to inverse-frequency weighting."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "is_fraud = train[\"label\"] == \"fraud\"\n",
    "fraud_frequency = is_fraud.mean()\n",
    "\n",
    "# Fraud rows get the non-fraud frequency and every other row gets the fraud frequency.\n",
    "train[\"weights\"] = np.where(is_fraud, 1 - fraud_frequency, fraud_frequency)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# WIP from here..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# NOTE(review): scikit-learn multiplies class_weight with sample_weight, so the\n",
    "# classes are re-balanced twice here; confirm that this is intended.\n",
    "rfc = RandomForestClassifier(n_estimators=4, max_depth=3, random_state=SEED, class_weight=\"balanced_subsample\")\n",
    "\n",
    "svecs = feature_pipeline.fit_transform(train)\n",
    "rfc.fit(svecs, train[\"label\"], sample_weight=train[\"weights\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use transform, not fit_transform: the pipeline must keep what it learned from\n",
    "# the training data instead of being re-fitted on the test set.\n",
    "predictions = rfc.predict(feature_pipeline.transform(test))\n",
    "print(classification_report(test[\"label\"], predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bind to a new name so the raw `df` loaded at the top is not overwritten.\n",
    "cm_df, chart = plot.binary_confusion_matrix(test[\"label\"], predictions)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save model\n",
    "\n",
    "**TODO:** save model here."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}