{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fraud risk prediction with LightGBM\n",
    "\n",
    "This notebook loads `fraud_dataset.csv` from the IBM `predict-fraud-using-auto-ai` repository, inspects the `Fraud_Risk` target, trains a LightGBM classifier on a 70/30 train/test split, and computes feature importances and SHAP values for the trained model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -q pandas_profiling lightgbm shap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Warning filters are installed first so they are active while the\n",
    "# third-party packages below are being imported.\n",
    "import warnings\n",
    "warnings.simplefilter(action='ignore', category=FutureWarning)\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import ipaddress\n",
    "import time\n",
    "import types\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas_profiling as pp\n",
    "import seaborn as sns\n",
    "import lightgbm as lgb\n",
    "import shap\n",
    "from lightgbm import LGBMClassifier\n",
    "from sklearn import preprocessing\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Kept after the imports, as in the original cell order.\n",
    "%matplotlib inline\n",
    "\n",
    "plt.rc(\"font\", size=14)\n",
    "sns.set(style=\"whitegrid\", color_codes=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = 'https://raw.githubusercontent.com/IBM/predict-fraud-using-auto-ai/master/data/fraud_dataset.csv'\n",
    "TARGET_COL = 'Fraud_Risk'  # label column used throughout the notebook\n",
    "\n",
    "df = pd.read_csv(url)\n",
    "\n",
    "print(df.shape)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explore the target and features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assumes Fraud_Risk == 1 flags a fraud-risk record and 0 a non-fraud record\n",
    "# -- confirm against the dataset description.\n",
    "count_fraud = len(df[df[TARGET_COL]==1])\n",
    "count_non_fraud = len(df[df[TARGET_COL]==0])\n",
    "total_count = count_non_fraud + count_fraud\n",
    "pct_of_non_fraud = count_non_fraud/total_count\n",
    "print(\"percentage of non Fraud Risk is\", round(pct_of_non_fraud*100,2))\n",
    "pct_of_fraud = count_fraud/total_count\n",
    "print(\"percentage of Fraud Risk\", round(pct_of_fraud*100,2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.countplot(x=TARGET_COL, data=df, palette='hls')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-class mean of every column.\n",
    "df.groupby(TARGET_COL).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.corr(method ='pearson')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Missing values per column (a summary instead of the full boolean frame).\n",
    "df.isna().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train/test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select features and target by name so the split does not depend on the\n",
    "# position or number of columns in the CSV.\n",
    "# y is kept as a single-column DataFrame; it is flattened where a 1-D array is needed.\n",
    "X = df.drop(columns=[TARGET_COL])\n",
    "y = df[[TARGET_COL]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Train_x Shape :: \", X_train.shape)\n",
    "print(\"Train_y Shape :: \", y_train.shape)\n",
    "print(\"Test_x Shape :: \", X_test.shape)\n",
    "print(\"Test_y Shape :: \", y_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train the LightGBM model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: d_train is not consumed by the scikit-learn style model trained below.\n",
    "d_train = lgb.Dataset(X_train, label=y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def LGBM_classifier(features, target):\n",
    "    \"\"\"\n",
    "    Fit a LightGBM binary classifier on the given training data.\n",
    "\n",
    "    The model is created with objective='binary' and metric='binary_logloss';\n",
    "    every other hyper-parameter is left at the LGBMClassifier default.\n",
    "\n",
    "    :param features: training features, passed unchanged to ``LGBMClassifier.fit``\n",
    "    :param target: training labels, passed unchanged to ``LGBMClassifier.fit``\n",
    "    :return: the fitted LGBMClassifier instance\n",
    "    \"\"\"\n",
    "    model = LGBMClassifier(metric='binary_logloss', objective='binary')\n",
    "    model.fit(features, target)\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start = time.time()\n",
    "# y_train is a single-column DataFrame; ravel() hands fit() a 1-D label array.\n",
    "trained_model = LGBM_classifier(X_train, y_train.values.ravel())\n",
    "print(\"> Completion Time : \", time.time() - start)\n",
    "print(\"Trained LGBM model :: \", trained_model)\n",
    "predictions = trained_model.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Train Accuracy :: \", accuracy_score(y_train, trained_model.predict(X_train)))\n",
    "print(\"LGBM Model Test Accuracy is :: \", accuracy_score(y_test, predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\" Confusion matrix \", confusion_matrix(y_test, predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature importance and SHAP values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feat_imp = pd.Series(trained_model.feature_importances_, index=X.columns)\n",
    "# Show the 12 features with the largest importance values.\n",
    "feat_imp.nlargest(12).plot(kind='barh', figsize=(8,10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "shap.initjs()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SHAP values are computed on the training set from the underlying booster.\n",
    "shap_values = shap.TreeExplainer(trained_model.booster_).shap_values(X_train)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.7",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}