# %% [markdown]
# # Resampling approaches to the imbalanced problem

# %%
from __future__ import division, print_function

from builtins import range

import numpy as np
import pandas as pd
from keras.utils import np_utils
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import balanced_accuracy_score, log_loss, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict, train_test_split
from sklearn.preprocessing import LabelEncoder

# %%
# Input data. An earlier variant of the same table lived at
# "/home/zhendi/pm/scripts/result_non_split_rmnum.pkl".
RESULT_PICKLE = "/home/zhendi/pm/scripts/result_non_split_strict.pkl"

# NOTE(review): unpickling runs arbitrary code from the file; only load pickles you produced.
result = pd.read_pickle(RESULT_PICKLE)
labels = result[["Class"]] - 1


# %%
def _reduced_text_features(vectorizer, train_text, test_text, prefix, n_components, random_state):
    """Vectorize text, compress it with its own TruncatedSVD, and return labelled frames.

    The vectorizer and the SVD are fitted on ``train_text`` only and then applied
    to ``test_text``. Columns are named ``<prefix>_1 .. <prefix>_k`` and the rows
    keep the index of the corresponding input Series.
    """
    # A fresh, seeded SVD per text representation: no hidden dependency on call
    # order, and the reduced features are reproducible between runs.
    svd = TruncatedSVD(n_components=n_components, n_iter=5, random_state=random_state)
    train_matrix = svd.fit_transform(vectorizer.fit_transform(train_text))
    test_matrix = svd.transform(vectorizer.transform(test_text))
    columns = ["{}_{}".format(prefix, i + 1) for i in range(train_matrix.shape[1])]
    return (
        pd.DataFrame(train_matrix, index=train_text.index, columns=columns),
        pd.DataFrame(test_matrix, index=test_text.index, columns=columns),
    )


def buildFeatures_split(df, test_size=0.2, random_state=5, n_components=390, max_features=10000):
    """Extract gene, variation and text features and split them into train/test.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``Gene``, ``Variation``, ``TEXT`` and ``Class``.
        It is not modified.
    test_size : float, default 0.2
        Fraction of rows held out for testing (stratified on the label).
    random_state : int, default 5
        Seed for the train/test split and for both SVD reductions.
    n_components : int, default 390
        Number of SVD components kept for each text representation.
    max_features : int, default 10000
        Vocabulary size limit for both vectorizers.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``. The label is ``Class - 1``.
        The feature frames hold the one-hot encoded ``Gene``/``Variation``
        columns followed by ``tfidf_*`` and ``count_*`` columns.
    """
    features = df.drop(columns=["Class"])
    labels = df["Class"] - 1

    # One-hot encoding happens before the split so that train and test share
    # exactly the same dummy columns.
    print("Onehot Encoding...")
    features = pd.get_dummies(features, columns=["Gene", "Variation"], drop_first=True)

    print("Splitting...")
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state, stratify=labels
    )
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    vectorizer_options = dict(
        max_features=max_features,
        ngram_range=(1, 2),
        analyzer="word",
        stop_words="english",
        token_pattern=r"\w+",
    )

    print("Tfidf Vectorizing...")
    tfidf_train, tfidf_test = _reduced_text_features(
        TfidfVectorizer(**vectorizer_options),
        X_train["TEXT"],
        X_test["TEXT"],
        "tfidf",
        n_components,
        random_state,
    )
    print("xtrain tfidf shape: ", tfidf_train.shape)
    print("xtest tfidf shape: ", tfidf_test.shape)

    print("Count Vectorizing...")
    count_train, count_test = _reduced_text_features(
        CountVectorizer(**vectorizer_options),
        X_train["TEXT"],
        X_test["TEXT"],
        "count",
        n_components,
        random_state,
    )
    print("xtrain cvec shape: ", count_train.shape)
    print("xtest cvec shape: ", count_test.shape)

    # The reduced frames already carry their final column names, so replacing
    # the raw TEXT column is a single concat per split.
    print("Renaming...")
    X_train = pd.concat([X_train.drop(columns=["TEXT"]), tfidf_train, count_train], axis=1)
    X_test = pd.concat([X_test.drop(columns=["TEXT"]), tfidf_test, count_test], axis=1)

    print("Finished.")

    return X_train, y_train, X_test, y_test


# %%
X_train, y_train, X_test, y_test = buildFeatures_split(
    result[["Gene", "Variation", "TEXT", "Class"]]
)

# Cheap invariants: both splits expose the same features, rows and labels line up.
assert list(X_train.columns) == list(X_test.columns)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

# %%
# Optional cache of the split (left disabled, as in the recorded run):
# pd.to_pickle((X_train, y_train, X_test, y_test), "/home/zhendi/pm/scripts/resample_data_split4baseline_strict.pkl")
# X_train, y_train, X_test, y_test = pd.read_pickle( "/home/zhendi/pm/scripts/resample_data_split4baseline_strict.pkl")

# %%
# Feature frames with the label attached, used for the per-class bookkeeping below.
df_X_train = X_train.assign(Class=y_train)
df_X_test = X_test.assign(Class=y_test)

# %% [markdown]
# ## Baseline
# %%
# X_train, y_train, X_test, y_test = pd.read_pickle( "/home/zhendi/pm/scripts/resample_data_split4baseline_strict.pkl")


def _print_metrics(y_true, y_pred, y_proba, classes):
    """Print log loss, balanced accuracy and micro-averaged F1 for one prediction set.

    ``classes`` gives the label order of the columns of ``y_proba``. Passing it
    to ``log_loss`` keeps the columns aligned even if ``y_true`` happens not to
    contain every class.
    """
    print("Log loss: {}".format(log_loss(y_true, y_proba, labels=classes)))
    # The value is balanced_accuracy_score (mean per-class recall), so it is
    # labelled as such rather than as plain "Accuracy".
    print("Balanced accuracy: {}".format(balanced_accuracy_score(y_true, y_pred)))
    print('f1score: {}'.format(f1_score(y_true, y_pred, average='micro')))


def evaluate_features(X, y, X_test, y_test, clf=None, n_splits=3):
    """Report cross-validated and held-out metrics for a classifier.

    Parameters
    ----------
    X, y
        Training features and labels. Used for stratified cross-validation and
        afterwards to fit ``clf`` once on all of them.
    X_test, y_test
        Held-out features and labels scored with the fitted ``clf``.
    clf : estimator, optional
        Any estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        Defaults to a 400-tree ``RandomForestClassifier`` with ``random_state=5``.
    n_splits : int, default 3
        Number of stratified folds.

    Returns
    -------
    None
        The metrics are printed. ``clf`` is fitted in place as a side effect.
    """
    if clf is None:
        clf = RandomForestClassifier(n_estimators=400, random_state=5, n_jobs=-1)

    probas = cross_val_predict(
        clf,
        X,
        y,
        cv=StratifiedKFold(n_splits=n_splits),
        n_jobs=-1,
        method="predict_proba",
        verbose=2,
    )
    # The probability columns follow the sorted unique labels of y.
    classes = np.unique(y)
    preds = classes[np.argmax(probas, axis=1)]

    print("Cross validation on training data: ")
    _print_metrics(y, preds, probas, classes)

    print("Validation on testing data: ")
    clf.fit(X, y)
    ytest = clf.predict(X_test)
    yprobas_test = clf.predict_proba(X_test)
    _print_metrics(y_test, ytest, yprobas_test, clf.classes_)


# %%
evaluate_features(X_train, y_train, X_test, y_test)

# Results recorded from two earlier runs. The middle value of each triple was
# printed as "Accuracy" and is the balanced accuracy.
#
# Run A
# Cross validation on training data:
# Log loss: 1.3043531922895704
# Accuracy: 0.4947690264523213
# f1score: 0.6355421686746988
# Validation on testing data:
# Log loss: 1.116404955301456
# Accuracy: 0.5350690829081902
# f1score: 0.6706766917293233
#
# Run B (output saved with the notebook)
# Cross validation on training data:
# Log loss: 1.3795442587887174
# Accuracy: 0.4829926178643152
# f1score: 0.6359186746987951
# Validation on testing data:
# Log loss: 1.196023582675107
# Accuracy: 0.5284083015643508
# f1score: 0.6601503759398496

# %% [markdown]
# ## Class Count and Split

# %%
# Class count: class 6 is the most, class 7 is the least for both training data and testing data
df_X_train.Class.value_counts().sort_values()
# df_X_test.Class.value_counts().sort_values()

# %%
class_counts = df_X_train.Class.value_counts().sort_index()
# The per-class names below assume exactly the nine labels 0..8.
assert list(class_counts.index) == list(range(9))
(
    count_class_0,
    count_class_1,
    count_class_2,
    count_class_3,
    count_class_4,
    count_class_5,
    count_class_6,
    count_class_7,
    count_class_8,
) = class_counts

print(count_class_6)
print(count_class_7)

# %%
# Divide by class (class 6 is the largest, class 7 the smallest)
(
    df_class_0,
    df_class_1,
    df_class_2,
    df_class_3,
    df_class_4,
    df_class_5,
    df_class_6,
    df_class_7,
    df_class_8,
) = (df_X_train[df_X_train['Class'] == label] for label in range(9))

# %% [markdown]
# ## Random up Sampling
# %%
import imblearn
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import RandomOverSampler, SMOTE

# One seed for every stochastic resampling step so the section is reproducible.
RESAMPLE_SEED = 5

# %%
# Manual random up-sampling: draw every minority class (with replacement) up to
# the size of the largest class; the largest class itself is kept untouched.
majority_count = df_X_train["Class"].value_counts().max()
sampling_rng = np.random.RandomState(RESAMPLE_SEED)

oversampled_parts = []
for _, class_rows in df_X_train.groupby("Class"):
    if len(class_rows) < majority_count:
        class_rows = class_rows.sample(majority_count, replace=True, random_state=sampling_rng)
    oversampled_parts.append(class_rows)

df_train_over = pd.concat(oversampled_parts, axis=0)
assert (df_train_over["Class"].value_counts() == majority_count).all()

print('Random over-sampling:')
print(df_train_over.Class.value_counts())

# %%
ax = df_train_over.Class.value_counts().plot(kind='bar', title='Count (target)')
ax.set(xlabel="Class", ylabel="Rows");

# %%
# The up-sampled data gets its own names. Rebinding X_train / y_train here would
# hand already balanced data to the imblearn samplers below, for which
# sampling_strategy="auto" leaves nothing to resample.
y_train_over = df_train_over["Class"]
X_train_over = df_train_over.drop(columns=["Class"])

# %%
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_train_over_enc = le.transform(y_train_over)
y_test_enc = le.transform(y_test)
# One-hot version of the test labels; not used by any cell in this notebook.
encoded_test_y = np_utils.to_categorical((le.inverse_transform(y_test_enc)))

# %%
# NOTE(review): the cross-validation half of this report is optimistic for
# up-sampled data, because copies of the same original row can land in both the
# fitting and the validation fold. Compare methods on the "testing data" half.
evaluate_features(X_train_over, y_train_over_enc, X_test, y_test_enc)

# Recorded results for manual up-sampling. "Accuracy" is the label the earlier
# runs printed; the value is balanced_accuracy_score.
#
# Run A
# Cross validation on training data:
# Log loss: 0.42451396848021816
# Accuracy: 0.9284047827354915
# f1score: 0.9284047827354914
# Validation on testing data:
# Log loss: 1.6780803656137624
# Accuracy: 0.5973130480577082
# f1score: 0.6601503759398496
#
# Run B (output saved with the notebook)
# Cross validation on training data:
# Log loss: 0.48078177807428746
# Accuracy: 0.9270924467774861
# f1score: 0.9270924467774861
# Validation on testing data:
# Log loss: 1.7733442629632505
# Accuracy: 0.5814359446468222
# f1score: 0.6451127819548872

# %% [markdown]
# ## imblearn library

# %%
# fit_resample is the current imblearn API (fit_sample was its deprecated alias).
ros = RandomOverSampler(random_state=RESAMPLE_SEED)
X_ros, y_ros = ros.fit_resample(X_train, y_train_enc)

# %%
evaluate_features(X_ros, y_ros, X_test, y_test_enc)

# The numbers previously recorded for this cell were identical, digit for digit,
# to the manual up-sampling results (runs A and B) listed earlier in this
# section: the sampler had been given the already balanced frame.
# TODO: record fresh numbers now that it resamples the imbalanced training split.

# %% [markdown]
# ## Over-sampling: SMOTE

# %%
smote = SMOTE(sampling_strategy='auto', random_state=RESAMPLE_SEED)
X_sm, y_sm = smote.fit_resample(X_train, y_train_enc)

# %%
evaluate_features(X_sm, y_sm, X_test, y_test_enc)

# As with RandomOverSampler, the numbers previously recorded here were identical
# to the manual up-sampling results (runs A and B), for the same reason.
# TODO: record fresh numbers.

# %% [markdown]
# ## Over-sampling followed by under-sampling

# %%
smt = SMOTETomek(sampling_strategy='auto', random_state=RESAMPLE_SEED)
X_smt, y_smt = smt.fit_resample(X_train, y_train_enc)

# %%
evaluate_features(X_smt, y_smt, X_test, y_test_enc)

# Recorded results from earlier runs, which were fed the already balanced frame.
# They are kept for reference only.
# TODO: record fresh numbers.
#
# Run A
# Cross validation on training data:
# Log loss: 0.4193741176318106
# Accuracy: 0.9330736269724108
# f1score: 0.9335387323943662
# Validation on testing data:
# Log loss: 1.7250979894933764
# Accuracy: 0.5860118374849917
# f1score: 0.6601503759398496
#
# Run B (output saved with the notebook)
# Cross validation on training data:
# Log loss: 0.473950905659361
# Accuracy: 0.9276366124252111
# f1score: 0.9279648609077599
# Validation on testing data:
# Log loss: 1.8214311079902497
# Accuracy: 0.5906275150100488
# f1score: 0.6451127819548872