{ "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3-final" }, "orig_nbformat": 2, "kernelspec": { "name": "python3", "display_name": "Python 3", "language": "python" } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "source": [ "# Resampling approaches to the imbalanced problem" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from __future__ import division, print_function\n", "from sklearn.decomposition import TruncatedSVD\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", "\n", "from builtins import range\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from keras.utils import np_utils\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import balanced_accuracy_score, log_loss, f1_score\n", "from sklearn.model_selection import StratifiedKFold, cross_val_predict, train_test_split\n", "from sklearn.preprocessing import LabelEncoder\n", "\n", "# result = pd.read_pickle(\"/home/zhendi/pm/scripts/result_non_split_rmnum.pkl\")\n", "result = pd.read_pickle(\"/home/zhendi/pm/scripts/result_non_split_strict.pkl\")\n", "labels = result[[\"Class\"]] - 1" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Onehot Encoding...\n", "Splitting...\n", "(2656, 3259) (665, 3259) (2656,) (665,)\n", "Tfidf Vectorizing...\n", "xtrain tfidf shape: (2656, 390)\n", "xtest tfidf shape: (665, 390)\n", "Count Vectorizing...\n", "xtrain cvec shape: (2656, 390)\n", "xtest cvec shape: (665, 390)\n", "Renaming...\n", "Finished.\n" ] } ], "source": [
 "# better way\n",
 "def buildFeatures_split(df, n_components=390, test_size=0.2, random_state=5):\n",
 "    \"\"\"Build train/test feature frames from the gene, variation and text columns.\n",
 "\n",
 "    Gene and Variation are one-hot encoded. TEXT is vectorized twice (TF-IDF and\n",
 "    raw counts) and each representation is reduced with its own TruncatedSVD.\n",
 "    Vectorizers and SVDs are fitted on the training split only and then applied\n",
 "    to the test split.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    df : pandas.DataFrame\n",
 "        Frame with only the Gene, Variation, TEXT and Class columns.\n",
 "    n_components : int, default 390\n",
 "        Number of SVD components kept for each text representation.\n",
 "    test_size : float, default 0.2\n",
 "        Fraction of rows held out for testing.\n",
 "    random_state : int, default 5\n",
 "        Seed for the stratified split and for both SVD reductions.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    X_train, y_train, X_test, y_test\n",
 "        Feature frames (one-hot columns, then tfidf_1..n, then count_1..n) and\n",
 "        the labels, which are Class - 1.\n",
 "    \"\"\"\n",
 "    labels = df[\"Class\"] - 1\n",
 "    features = df.drop(columns=[\"Class\"])\n",
 "\n",
 "    # onehot encode gene and variation\n",
 "    print(\"Onehot Encoding...\")\n",
 "    features = pd.get_dummies(features, columns=[\"Gene\", \"Variation\"], drop_first=True)\n",
 "\n",
 "    # split the data to training data and testing data\n",
 "    print(\"Splitting...\")\n",
 "    X_train, X_test, y_train, y_test = train_test_split(\n",
 "        features, labels, test_size=test_size, random_state=random_state, stratify=labels\n",
 "    )\n",
 "    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)\n",
 "\n",
 "    # Both vectorizers share the same tokenization settings.\n",
 "    vectorizer_kwargs = dict(\n",
 "        max_features=10000,\n",
 "        ngram_range=(1, 2),\n",
 "        analyzer=\"word\",\n",
 "        stop_words=\"english\",\n",
 "        token_pattern=r\"\\w+\",\n",
 "    )\n",
 "    text_representations = (\n",
 "        # (column prefix, tag used in the shape messages, title, vectorizer)\n",
 "        (\"tfidf\", \"tfidf\", \"Tfidf\", TfidfVectorizer(**vectorizer_kwargs)),\n",
 "        (\"count\", \"cvec\", \"Count\", CountVectorizer(**vectorizer_kwargs)),\n",
 "    )\n",
 "\n",
 "    reduced_train, reduced_test = [], []\n",
 "    for prefix, tag, title, vectorizer in text_representations:\n",
 "        print(\"{} Vectorizing...\".format(title))\n",
 "        # A fresh, seeded SVD per representation keeps the features reproducible\n",
 "        # and avoids refitting one shared instance.\n",
 "        svd = TruncatedSVD(n_components=n_components, n_iter=5, random_state=random_state)\n",
 "        train_block = svd.fit_transform(vectorizer.fit_transform(X_train[\"TEXT\"]))\n",
 "        print(\"xtrain {} shape: \".format(tag), train_block.shape)\n",
 "        test_block = svd.transform(vectorizer.transform(X_test[\"TEXT\"]))\n",
 "        print(\"xtest {} shape: \".format(tag), test_block.shape)\n",
 "\n",
 "        # Name the reduced columns up front instead of renaming after the concat.\n",
 "        columns = [\"{}_{}\".format(prefix, i + 1) for i in range(train_block.shape[1])]\n",
 "        reduced_train.append(pd.DataFrame(train_block, index=X_train.index, columns=columns))\n",
 "        reduced_test.append(pd.DataFrame(test_block, index=X_test.index, columns=columns))\n",
 "\n",
 "    # Replace the raw TEXT column with the reduced text features.\n",
 "    print(\"Renaming...\")\n",
 "    X_train = pd.concat([X_train.drop(columns=[\"TEXT\"])] + reduced_train, axis=1)\n",
 "    X_test = pd.concat([X_test.drop(columns=[\"TEXT\"])] + reduced_test, axis=1)\n",
 "\n",
 "    print(\"Finished.\")\n",
 "\n",
 "    return X_train, y_train, X_test, y_test\n",
 "\n",
 "\n",
 "X_train, y_train, X_test, y_test = buildFeatures_split(\n",
 "    result[[\"Gene\", \"Variation\", \"TEXT\", \"Class\"]]\n",
 ")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# pd.to_pickle((X_train, y_train, X_test, y_test), \"/home/zhendi/pm/scripts/resample_data_split4baseline_strict.pkl\")\n", "# X_train, y_train, X_test, y_test = pd.read_pickle( \"/home/zhendi/pm/scripts/resample_data_split4baseline_strict.pkl\")\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "\n", "df_X_train = X_train.copy()\n", "df_X_train['Class'] = y_train\n", "df_X_test = X_test.copy()\n", "df_X_test['Class'] = y_test\n" ] }, { "source": [ "## Baseline" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[Parallel(n_jobs=-1)]: 
Using backend LokyBackend with 40 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 4.1s finished\n", "Cross validation on training data: \n", "Log loss: 1.3795442587887174\n", "Accuracy: 0.4829926178643152\n", "f1score: 0.6359186746987951\n", "Validation on testing data: \n", "Log loss: 1.196023582675107\n", "Accuracy: 0.5284083015643508\n", "f1score: 0.6601503759398496\n" ] } ], "source": [ "# X_train, y_train, X_test, y_test = pd.read_pickle( \"/home/zhendi/pm/scripts/resample_data_split4baseline_strict.pkl\")\n", "\n", "\n", "\n",
 "def evaluate_features(X, y, X_test, y_test, clf=None, n_splits=3):\n",
 "    \"\"\"Print cross-validated and held-out metrics for a classifier.\n",
 "\n",
 "    Runs stratified n_splits-fold cross-validation on (X, y), then refits clf\n",
 "    on all of (X, y) and scores it on (X_test, y_test). Log loss, balanced\n",
 "    accuracy and micro-averaged F1 are printed for both evaluations.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    X, y : training features and labels.\n",
 "    X_test, y_test : held-out features and labels.\n",
 "    clf : estimator with fit / predict / predict_proba, optional\n",
 "        Defaults to a 400-tree RandomForestClassifier with random_state=5.\n",
 "        The estimator is fitted in place on (X, y).\n",
 "    n_splits : int, default 3\n",
 "        Number of stratified cross-validation folds.\n",
 "    \"\"\"\n",
 "    if clf is None:\n",
 "        clf = RandomForestClassifier(n_estimators=400, random_state=5, n_jobs=-1)\n",
 "\n",
 "    def report(title, y_true, y_pred, probas, class_labels):\n",
 "        # One printer for both evaluations so the metric definitions cannot drift.\n",
 "        print(title)\n",
 "        # labels= keeps log_loss well defined when y_true lacks one of the classes.\n",
 "        print(\"Log loss: {}\".format(log_loss(y_true, probas, labels=class_labels)))\n",
 "        # balanced_accuracy_score is the mean per-class recall, not plain accuracy.\n",
 "        print(\"Balanced accuracy: {}\".format(balanced_accuracy_score(y_true, y_pred)))\n",
 "        print(\"f1score: {}\".format(f1_score(y_true, y_pred, average=\"micro\")))\n",
 "\n",
 "    cv_probas = cross_val_predict(\n",
 "        clf,\n",
 "        X,\n",
 "        y,\n",
 "        cv=StratifiedKFold(n_splits=n_splits),\n",
 "        n_jobs=-1,\n",
 "        method=\"predict_proba\",\n",
 "        verbose=2,\n",
 "    )\n",
 "    # Probability columns follow the sorted unique labels of y.\n",
 "    classes = np.unique(y)\n",
 "    cv_preds = classes[np.argmax(cv_probas, axis=1)]\n",
 "    report(\"Cross validation on training data: \", y, cv_preds, cv_probas, classes)\n",
 "\n",
 "    clf.fit(X, y)\n",
 "    report(\n",
 "        \"Validation on testing data: \",\n",
 "        y_test,\n",
 "        clf.predict(X_test),\n",
 "        clf.predict_proba(X_test),\n",
 "        clf.classes_,\n",
 "    )\n",
 "\n",
 "evaluate_features(X_train, y_train, X_test, y_test)\n", "\n", "# Cross validation on training data: \n", "# Log loss: 1.3043531922895704\n", "# Accuracy: 0.4947690264523213\n", "# f1score: 0.6355421686746988\n", "\n", "# Validation on testing data: \n", "# Log loss: 1.116404955301456\n", "# Accuracy: 
0.5350690829081902\n", "# f1score: 0.6706766917293233" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "source": [ "## Class Count and Split" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 7 } ], "source": [
 "# Class count: class 6 is the most, class 7 is the least for both training data and testing data\n",
 "# sort_values must be called; without the parentheses the cell shows a bound method, not counts.\n",
 "df_X_train.Class.value_counts().sort_values()\n",
 "# df_X_test.Class.value_counts().sort_values()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "762\n15\n" ] } ], "source": [
 "# Per-class training counts, unpacked in label order (0..8).\n",
 "class_counts = df_X_train.Class.value_counts().sort_index()\n",
 "# Fail loudly if the labels are not exactly 0..8; otherwise the names below would be misaligned.\n",
 "assert list(class_counts.index) == list(range(9)), \"expected class labels 0..8 in the training split\"\n",
 "(\n",
 "    count_class_0,\n",
 "    count_class_1,\n",
 "    count_class_2,\n",
 "    count_class_3,\n",
 "    count_class_4,\n",
 "    count_class_5,\n",
 "    count_class_6,\n",
 "    count_class_7,\n",
 "    count_class_8,\n",
 ") = class_counts\n",
 "\n",
 "print(count_class_6)\n",
 "print(count_class_7)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [
 "# Divide by class\n",
 "df_class_0 = df_X_train[df_X_train['Class'] == 0]\n",
 "df_class_1 = df_X_train[df_X_train['Class'] == 1]\n",
 "df_class_2 = df_X_train[df_X_train['Class'] == 2]\n",
 "df_class_3 = df_X_train[df_X_train['Class'] == 3]\n",
 "df_class_4 = df_X_train[df_X_train['Class'] == 4]\n",
 "df_class_5 = df_X_train[df_X_train['Class'] == 5]\n",
 "df_class_6 = df_X_train[df_X_train['Class'] == 6] # most\n",
 "df_class_7 = df_X_train[df_X_train['Class'] == 7] # least\n",
 "df_class_8 = df_X_train[df_X_train['Class'] == 8]\n" ] }, { "source": [ "## Random up Sampling" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Random over-sampling:\n7 762\n3 762\n6 762\n2 762\n5 762\n1 762\n8 762\n4 762\n0 762\nName: Class, dtype: 
int64\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 10 }, { "output_type": "display_data", "data": { "text/plain": "
", "image/svg+xml": "\n\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAEFCAYAAAAYKqc0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAW7ElEQVR4nO3df7DVd33n8ecrkGB+acBcEIEotmgk20msV4wba9PiBqytMDtNl1gt7cSys0O2unbdgDoaO2Un3el2jaPpDFN/3N3EIEmTgtWqSDfrprUhN78lBEExcBcC12gSkygR8to/zofk5OZc7oF7zj3k4+sxc+f7/X6+n+/3+z7k5HW/53O+3++VbSIioi4n9bqAiIjovIR7RESFEu4RERVKuEdEVCjhHhFRoYR7RESFEu4Ro5DUJ2m7pJf0upZWJE2R9KCk6b2uJU48CffoKUnvljQo6QlJ+yT9g6S3TsBxLemXx+i2Cvi87Z+VbW6V9L5u1zaakce3fRD4HHBlr2qKE1fCPXpG0geBTwL/FZgBnANcCyzpZV3QOCsGlgPXdXCfkzu1ryZfBJaXeiOelXCPnpD0MuDPgJW2b7b9pO2f2/6y7Q+VPlMkfVLS3vLzySMhJukPJd02Yp/Pno1L+oKkz0j6iqSfSLpd0i+Vdd8qm9xbPjH8uxYlvhl41PZQ2WYN8GvAp8s2ny7t10jaI+lxSXdK+rWmeq6SdJOk6yQ9DvyhpLmSvlVq+map8bqmbS6U9M+SHpV0r6SLj3b8Ut+PgQvH8Z8jKpRwj155C/AS4Jaj9PkIjdC6ADgfWAB89BiOcRnwCWAqsBNYA2D7bWX9+bbPsP2lFtv+CrD9yILtjwD/F7iibHNFWXVHqW8ajbPoG0eM0S8BbgLOAq4vfbYALweuAt57pKOkWcBXgD8v+/vPwN9K6jvK8QG20fj3iXhWwj165eXAD20fOkqf3wf+zPYB28M0gvq9R+k/0s22t5RjXE8jhNt1FvCTsTrZvs72I7YP2f7vwBTgdU1dvm3772w/A/QBbwI+Zvtp27cBG5v6vgf4qu2v2n7G9iZgEPitMcr4Sak34lkJ9+iVR4CzxxiHfiXwUNPyQ6WtXQ83zT8FnHEM2/4YOHOsTpL+VNI2SY9JehR4GXB2U5c9TfOvBH5k+6lR1r8KuLQMyTxa9vdWYOYYZZwJPDpWrfGLJeEevfJt4GfA0qP02Usj8I44p7QBPAmcdmSFpFd0uL77gNeOaHveI1TL+PqVwO8BU22fBTwGaJRt9gHTJJ3W1DanaX4P8L9sn9X0c7rtq1sdv8nrgXvbeVHxiyPhHj1h+zHgY8BnJC2VdJqkkyW9Q9J/K91uAD5arjc/u/Q/8uXjvcB5ki4oY9xXHWMJ+4HXHGX9FuCsMg4+2jZnAoeAYWCypI8BLx1th7YfojHMcpWkUyS9Bfidpi7XAb8jaZGkSZJeIuliSbNHq7nUNw34l6O8lvgFlHCPnrH9V8AHaXxJOkzjzPUK4O9Klz+nEYb3AfcDd5U2bH+XxtU23wR2AM+7cqYNVwEDZfjj91rU9jTwBRrj4EdcA/yupB9L+hTwdeAfgO/SGDL6Gc8fZmnl92l8mfxIeS1fAg6WY+6h8QXsh3nu3+NDPPf/6cjjA7wbGCjXvEc8S/ljHRGtSeqjcYXKG2z/tEvH+BLwoO2PH8e2U2h8gnmb7QMdLy5e1BLuERNI0puAHwG7gEtofEp5i+27e1pYVKetYRlJ/0nSVknfkXRDGQucJmmTpB1lOrWp/2pJO8tzORZ1r/yIF51XALcCTwCfAv5Dgj26Ycwz9/KFzW3AfNs/lbQe+Cown8ZlXVdLWkXjaoErJc2n8UXYAhqXfn0TeK3tw918IRER8Zx2v1CdDJxarkk+jcblaEuAgbJ+gOcuaVsCrLN90PYuGncGLuhcyRERMZYxw932/wP+EthN4zrdx2x/A5hhe1/psw848tjRWTz/ioG
h0hYRERNkzKfUlbH0JcBcGnfB3SjpPUfbpEXbC8Z+JK0AVgCcfvrpbzz33HPbKjgiIhruvPPOH9rua7WunUeQvh3YVZ7tgaSbgX8N7Jc00/Y+STOBI5diDfH8u+5m89xdhc+yvRZYC9Df3+/BwcF2X09ERACSHhptXTtj7ruBC8sdhAIW0ngK3UYaz7umTDeU+Y3AsvK41rnAPBp3+0VExAQZ88zd9u2SbqJxd+Ah4G4aZ9xnAOslXU7jF8Clpf/WckXNA6X/ylwpExExsU6Im5gyLBMRcewk3Wm7v9W6PFsmIqJCCfeIiAol3CMiKpRwj4ioUMI9IqJC7dzEdMJ49aqvdGQ/P7j6nR3ZD9RdE3SurtTUvprfUydiTVDneypn7hERFUq4R0RUKOEeEVGhhHtERIUS7hERFUq4R0RUKOEeEVGhhHtERIUS7hERFUq4R0RUKOEeEVGhhHtERIXGDHdJr5N0T9PP45I+IGmapE2SdpTp1KZtVkvaKWm7pEXdfQkRETHSmOFue7vtC2xfALwReAq4BVgFbLY9D9hclpE0H1gGnAcsBq6VNKlL9UdERAvHOiyzEPie7YeAJcBAaR8Alpb5JcA62wdt7wJ2Ags6UWxERLTnWMN9GXBDmZ9hex9AmU4v7bOAPU3bDJW2iIiYIG2Hu6RTgHcBN47VtUWbW+xvhaRBSYPDw8PtlhEREW04ljP3dwB32d5flvdLmglQpgdK+xAwp2m72cDekTuzvdZ2v+3+vr6+Y688IiJGdSzhfhnPDckAbASWl/nlwIam9mWSpkiaC8wDtoy30IiIaF9bf0NV0mnAvwH+fVPz1cB6SZcDu4FLAWxvlbQeeAA4BKy0fbijVUdExFG1Fe62nwJePqLtERpXz7TqvwZYM+7qIiLiuOQO1YiICiXcIyIqlHCPiKhQwj0iokIJ94iICiXcIyIqlHCPiKhQwj0iokIJ94iICiXcIyIqlHCPiKhQwj0iokIJ94iICiXcIyIqlHCPiKhQwj0iokIJ94iICiXcIyIqlHCPiKhQW+Eu6SxJN0l6UNI2SW+RNE3SJkk7ynRqU//VknZK2i5pUffKj4iIVto9c78G+Jrtc4HzgW3AKmCz7XnA5rKMpPnAMuA8YDFwraRJnS48IiJGN2a4S3op8DbgswC2n7b9KLAEGCjdBoClZX4JsM72Qdu7gJ3Agk4XHhERo2vnzP01wDDweUl3S/obSacDM2zvAyjT6aX/LGBP0/ZDpe15JK2QNChpcHh4eFwvIiIinq+dcJ8M/Crw17bfADxJGYIZhVq0+QUN9lrb/bb7+/r62io2IiLa0064DwFDtm8vyzfRCPv9kmYClOmBpv5zmrafDeztTLkREdGOMcPd9sPAHkmvK00LgQeAjcDy0rYc2FDmNwLLJE2RNBeYB2zpaNUREXFUk9vs9x+B6yWdAnwf+CMavxjWS7oc2A1cCmB7q6T1NH4BHAJW2j7c8cojImJUbYW77XuA/harFo7Sfw2wZhx1RUTEOOQO1YiICiXcIyIqlHCPiKhQwj0iokIJ94iICiXcIyIqlHCPiKhQwj0iokIJ94iICiXcIyIqlHCPiKhQwj0iokIJ94iICiXcIyIqlHCPiKhQwj0iokIJ94iICiXcIyIq1Fa4S/qBpPsl3SNpsLRNk7RJ0o4yndrUf7WknZK2S1rUreIjIqK1Yzlz/w3bF9g+8rdUVwGbbc8DNpdlJM0HlgHnAYuBayVN6mDNERExhvEMyywBBsr8ALC0qX2d7YO2dwE7gQXjOE5ERByjdsPdwDck3SlpRWmbYXsfQJlOL+2zgD1N2w6VtoiImCCT2+x3ke29kqYDmyQ9eJS+atHmF3Rq/JJYAXDOOee0WUZERLSjrTN323vL9ABwC41hlv2SZgKU6YHSfQiY07T5bGBvi32utd1vu7+vr+/4X0FERLzAmOEu6XRJZx6ZBy4BvgNsBJaXbsuBDWV+I7BM0hRJc4F5wJZOFx4REaN
rZ1hmBnCLpCP9v2j7a5LuANZLuhzYDVwKYHurpPXAA8AhYKXtw12pPiIiWhoz3G1/Hzi/RfsjwMJRtlkDrBl3dRERcVxyh2pERIUS7hERFUq4R0RUKOEeEVGhhHtERIUS7hERFUq4R0RUKOEeEVGhhHtERIUS7hERFUq4R0RUKOEeEVGhhHtERIUS7hERFUq4R0RUKOEeEVGhhHtERIUS7hERFWo73CVNknS3pL8vy9MkbZK0o0ynNvVdLWmnpO2SFnWj8IiIGN2xnLm/H9jWtLwK2Gx7HrC5LCNpPrAMOA9YDFwraVJnyo2IiHa0Fe6SZgPvBP6mqXkJMFDmB4ClTe3rbB+0vQvYCSzoTLkREdGOds/cPwn8F+CZprYZtvcBlOn00j4L2NPUb6i0RUTEBBkz3CX9NnDA9p1t7lMt2txivyskDUoaHB4ebnPXERHRjnbO3C8C3iXpB8A64DclXQfslzQToEwPlP5DwJym7WcDe0fu1PZa2/22+/v6+sbxEiIiYqQxw932atuzbb+axhel/2j7PcBGYHnpthzYUOY3AsskTZE0F5gHbOl45RERMarJ49j2amC9pMuB3cClALa3SloPPAAcAlbaPjzuSiMiom3HFO62bwVuLfOPAAtH6bcGWDPO2iIi4jjlDtWIiAol3CMiKpRwj4ioUMI9IqJCCfeIiAol3CMiKpRwj4ioUMI9IqJCCfeIiAol3CMiKpRwj4ioUMI9IqJCCfeIiAol3CMiKpRwj4ioUMI9IqJCCfeIiAol3CMiKjRmuEt6iaQtku6VtFXSJ0r7NEmbJO0o06lN26yWtFPSdkmLuvkCIiLihdo5cz8I/Kbt84ELgMWSLgRWAZttzwM2l2UkzQeWAecBi4FrJU3qRvEREdHamOHuhifK4snlx8ASYKC0DwBLy/wSYJ3tg7Z3ATuBBR2tOiIijqqtMXdJkyTdAxwANtm+HZhhex9AmU4v3WcBe5o2HyptERExQdoKd9uHbV8AzAYWSPpXR+muVrt4QSdphaRBSYPDw8PtVRsREW05pqtlbD8K3EpjLH2/pJkAZXqgdBsC5jRtNhvY22Jfa2332+7v6+s7jtIjImI07Vwt0yfprDJ/KvB24EFgI7C8dFsObCjzG4FlkqZImgvMA7Z0uvCIiBjd5Db6zAQGyhUvJwHrbf+9pG8D6yVdDuwGLgWwvVXSeuAB4BCw0vbh7pQfERGtjBnutu8D3tCi/RFg4SjbrAHWjLu6iIg4LrlDNSKiQgn3iIgKJdwjIiqUcI+IqFDCPSKiQgn3iIgKJdwjIiqUcI+IqFDCPSKiQgn3iIgKJdwjIiqUcI+IqFDCPSKiQgn3iIgKJdwjIiqUcI+IqFDCPSKiQgn3iIgKtfMHsudI+t+StknaKun9pX2apE2SdpTp1KZtVkvaKWm7pEXdfAEREfFC7Zy5HwL+1PbrgQuBlZLmA6uAzbbnAZvLMmXdMuA8YDFwbfnj2hERMUHGDHfb+2zfVeZ/AmwDZgFLgIHSbQBYWuaXAOtsH7S9C9gJLOh04RERMbpjGnOX9GrgDcDtwAzb+6DxCwCYXrrNAvY0bTZU2iIiYoK0He6SzgD+FviA7ceP1rVFm1vsb4WkQUmDw8PD7ZYRERFtaCvcJZ1MI9ivt31zad4vaWZZPxM4UNqHgDlNm88G9o7cp+21tvtt9/f19R1v/RER0UI7V8sI+CywzfZfNa3aCCwv88uBDU3tyyRNkTQXmAds6VzJERExlslt9LkIeC9wv6R7StuHgauB9ZIuB3YDlwLY3ippPfAAjSttVto+3PHKIyJiVGOGu+3baD2ODrBwlG3WAGvGUVdERIxD7lCNiKhQwj0iokIJ94iICiXcIyIqlHCPiKhQwj0iokIJ94iICiXcIyIqlHCPiKhQwj0iokIJ94iICiXcIyIqlHCPiKhQwj0iokIJ94iICiXcIyIqlHCPiKhQwj0iokLt/IHsz0k6IOk7TW3TJG2StKNMpza
tWy1pp6TtkhZ1q/CIiBhdO2fuXwAWj2hbBWy2PQ/YXJaRNB9YBpxXtrlW0qSOVRsREW0ZM9xtfwv40YjmJcBAmR8Alja1r7N90PYuYCewoEO1RkREm453zH2G7X0AZTq9tM8C9jT1GyptERExgTr9hapatLllR2mFpEFJg8PDwx0uIyLiF9vxhvt+STMByvRAaR8C5jT1mw3sbbUD22tt99vu7+vrO84yIiKileMN943A8jK/HNjQ1L5M0hRJc4F5wJbxlRgREcdq8lgdJN0AXAycLWkI+DhwNbBe0uXAbuBSANtbJa0HHgAOASttH+5S7RERMYoxw932ZaOsWjhK/zXAmvEUFRER45M7VCMiKpRwj4ioUMI9IqJCCfeIiAol3CMiKpRwj4ioUMI9IqJCCfeIiAol3CMiKpRwj4ioUMI9IqJCCfeIiAol3CMiKpRwj4ioUMI9IqJCCfeIiAol3CMiKpRwj4ioUMI9IqJCXQt3SYslbZe0U9Kqbh0nIiJeqCvhLmkS8BngHcB84DJJ87txrIiIeKFunbkvAHba/r7tp4F1wJIuHSsiIkaQ7c7vVPpdYLHt95Xl9wJvtn1FU58VwIqy+Dpge4cOfzbwww7tq1NSU/tOxLpSU3tSU/s6VderbPe1WjG5AztvRS3anvdbxPZaYG3HDywN2u7v9H7HIzW170SsKzW1JzW1byLq6tawzBAwp2l5NrC3S8eKiIgRuhXudwDzJM2VdAqwDNjYpWNFRMQIXRmWsX1I0hXA14FJwOdsb+3GsVro+FBPB6Sm9p2IdaWm9qSm9nW9rq58oRoREb2VO1QjIiqUcI+IqFDCPSKiQi/qcJf0J5LmjN1zYklaIOlNZX6+pA9K+q0e1vNmSS8t86dK+oSkL0v6C0kv62Fd50paKOmMEe2Le1XTSJLeWv77XdLrWk4kkk6R9AeS3l6W3y3p05JWSjq51/UdIel/ngA1nCvpSkmfknRNmX9914/7Yv5CVdJjwJPA94AbgBttD/e4po/TeKbOZGAT8GbgVuDtwNdtr+lBTVuB88tVTGuBp4CbgIWl/d/2oKY/AVYC24ALgPfb3lDW3WX7Vye6pnLsLbYXlPk/LjXeAlwCfNn21b2oazSS/sj253tw3OtpvMdPAx4FzgBupvGeku3lPahp5OXWAn4D+EcA2+/qQU1XApfReATLUGmeTePy8HVdfT/ZftH+AHfT+PRxCfBZYBj4GrAcOLNHNd1P4/LP04DHgZeW9lOB+3pU07am+btGrLunh/9OZ5T5VwODNAIe4O5evqea5u8A+sr86cD9varrKPXu7tFx7yvTycB+YFJZVg/f53cB1wEXA79epvvK/K/3qKbvAie3aD8F2NHNY3fr8QMTxbafAb4BfKN8HHwHjd+Ufwm0fOZClx2yfRh4StL3bD9eCv2ppGd6UA/Ad5rO8O6V1G97UNJrgZ/3qKZJtp8AsP0DSRcDN0l6Fa0fXzFRTpI0lcZJg1w+Cdp+UtKhXhQk6b7RVgEzJrKWJieVGxRPp3Ei8zLgR8AUoFfDMv3A+4GPAB+yfY+kn9r+Pz2qB+AZ4JXAQyPaZ5Z1XfNiD/fnhYDtn9O4E3ajpFN7UxJPSzrN9lPAG480lrHtXoX7+4BrJH2UxsOKvi1pD7CnrOuFhyVdYPseANtPSPpt4HPAr/SoJmiE1J003luW9ArbD5fvBXr1S2cGsAj48Yh2Af888eUAjU/KD9L4lPoR4EZJ3wcupDEEMeHKid7/kHRjme6n9xn3AWCzpB00/n8DOAf4ZeCKUbfqgBf7mPtrbX+313U0kzTF9sEW7WcDM23f34OyjtRwJvAaGm/4Idv7e1jLbBqfch5use4i2//Ug7JGJek0YIbtXT049meBz9u+rcW6L9p+90TXVI79SgDbeyWdReN7pd22t/SinpEkvRO4yPaHe1zHSTQegz6Lxi/kIeCO8gm/e8d9MYd7RES09qK+FDIiIlpLuEdEVCjhHhFRoYR
7RESFEu4RERX6//xxkGjiFaQ/AAAAAElFTkSuQmCC\n" }, "metadata": { "needs_background": "light" } } ], "source": [
 "# Up-sample every minority class (with replacement) to the majority-class size.\n",
 "# Seeded so the resampled training set is identical on every run.\n",
 "OVERSAMPLE_SEED = 5\n",
 "majority_count = df_X_train['Class'].value_counts().max()\n",
 "\n",
 "df_train_over = pd.concat(\n",
 "    [\n",
 "        # Classes already at the majority size are kept as they are.\n",
 "        class_rows\n",
 "        if len(class_rows) == majority_count\n",
 "        else class_rows.sample(majority_count, replace=True, random_state=OVERSAMPLE_SEED)\n",
 "        for _, class_rows in df_X_train.groupby('Class')\n",
 "    ],\n",
 "    axis=0,\n",
 ")\n",
 "\n",
 "print('Random over-sampling:')\n",
 "print(df_train_over.Class.value_counts())\n",
 "\n",
 "# The trailing semicolon hides the Axes repr.\n",
 "df_train_over.Class.value_counts().plot(kind='bar', title='Count (target)');" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [
 "# From here on X_train / y_train hold the over-sampled training data.\n",
 "# drop() returns a new frame, so this cell can be re-run without a KeyError.\n",
 "y_train = df_train_over['Class']\n",
 "X_train = df_train_over.drop(columns=['Class'])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "le = LabelEncoder()\n", "y_train = le.fit_transform(y_train)\n", "y_test = le.transform(y_test)\n", "encoded_test_y = np_utils.to_categorical((le.inverse_transform(y_test)))\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 5.1s finished\n", "Cross validation on training data: \n", "Log loss: 0.48078177807428746\n", "Accuracy: 0.9270924467774861\n", "f1score: 0.9270924467774861\n", "Validation on testing data: \n", "Log loss: 
1.7733442629632505\n", "Accuracy: 0.5814359446468222\n", "f1score: 0.6451127819548872\n" ] } ], "source": [
 "# NOTE(review): the cross-validation score on over-sampled data is likely optimistic,\n",
 "# because duplicated rows can land in both the training and validation folds.\n",
 "# Compare resampling strategies on the held-out test metrics.\n",
 "evaluate_features(X_train, y_train, X_test, y_test)\n",
 "\n",
 "# Cross validation on training data: \n",
 "# Log loss: 0.42451396848021816\n",
 "# Accuracy: 0.9284047827354915\n",
 "# f1score: 0.9284047827354914\n",
 "# Validation on testing data: \n",
 "# Log loss: 1.6780803656137624\n",
 "# Accuracy: 0.5973130480577082\n",
 "# f1score: 0.6601503759398496" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "source": [ "## imblearn library" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [
 "import imblearn\n",
 "from imblearn.over_sampling import RandomOverSampler\n",
 "\n",
 "# Resample the original, imbalanced training split kept in df_X_train.\n",
 "# X_train / y_train were overwritten with the manually over-sampled data above,\n",
 "# so feeding them to a resampler is a no-op and reproduces the previous metrics.\n",
 "X_train_orig = df_X_train.drop(columns=['Class'])\n",
 "y_train_orig = df_X_train['Class']\n",
 "\n",
 "# fit_resample replaces the older fit_sample alias; the seed makes the run repeatable.\n",
 "ros = RandomOverSampler(random_state=5)\n",
 "X_ros, y_ros = ros.fit_resample(X_train_orig, y_train_orig)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 5.2s finished\n", "Cross validation on training data: \n", "Log loss: 0.48078177807428746\n", "Accuracy: 0.9270924467774861\n", "f1score: 0.9270924467774861\n", "Validation on testing data: \n", "Log loss: 1.7733442629632505\n", "Accuracy: 0.5814359446468222\n", "f1score: 0.6451127819548872\n" ] } ], "source": [
 "evaluate_features(X_ros, y_ros, X_test, y_test)\n",
 "\n",
 "# Stale: recorded while the resampler was fed the already balanced X_train.\n",
 "# Cross validation on training data: \n",
 "# Log loss: 0.42451396848021816\n",
 "# Accuracy: 0.9284047827354915\n",
 "# f1score: 0.9284047827354914\n",
 "# Validation on testing data: \n",
 "# Log loss: 1.6780803656137624\n",
 "# Accuracy: 0.5973130480577082\n",
 "# f1score: 0.6601503759398496\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "source": [ "## Over-sampling: SMOTE" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [
 "from imblearn.over_sampling import SMOTE\n",
 "\n",
 "# SMOTE synthesizes minority samples, so it must see the imbalanced split.\n",
 "smote = SMOTE(sampling_strategy='auto', random_state=5)\n",
 "X_sm, y_sm = smote.fit_resample(X_train_orig, y_train_orig)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 5.2s finished\n", "Cross validation on training data: \n", "Log loss: 0.48078177807428746\n", "Accuracy: 0.9270924467774861\n", "f1score: 0.9270924467774861\n", "Validation on testing data: \n", "Log loss: 1.7733442629632505\n", "Accuracy: 0.5814359446468222\n", "f1score: 0.6451127819548872\n" ] } ], "source": [
 "evaluate_features(X_sm, y_sm, X_test, y_test)\n",
 "\n",
 "\n",
 "# Stale: recorded while the resampler was fed the already balanced X_train.\n",
 "# Cross validation on training data: \n",
 "# Log loss: 0.42451396848021816\n",
 "# Accuracy: 0.9284047827354915\n",
 "# f1score: 0.9284047827354914\n",
 "# Validation on testing data: \n",
 "# Log loss: 1.6780803656137624\n",
 "# Accuracy: 0.5973130480577082\n",
 "# f1score: 0.6601503759398496" ] }, { "source": [ "## Over-sampling followed by under-sampling" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 5.0s finished\n", "Cross validation on training data: \n", "Log loss: 0.473950905659361\n", "Accuracy: 0.9276366124252111\n", "f1score: 0.9279648609077599\n", "Validation on testing data: \n", "Log loss: 1.8214311079902497\n", "Accuracy: 0.5906275150100488\n", "f1score: 0.6451127819548872\n" ] } ], "source": [
 "from imblearn.combine import SMOTETomek\n",
 "\n",
 "# SMOTE over-sampling followed by Tomek-link cleaning, applied to the imbalanced split.\n",
 "smt = SMOTETomek(sampling_strategy='auto', random_state=5)\n",
 "X_smt, y_smt = smt.fit_resample(X_train_orig, y_train_orig)\n",
 "\n",
 "evaluate_features(X_smt, y_smt, X_test, y_test)\n",
 "\n",
 "\n",
 "# Stale: recorded while the resampler was fed the already balanced X_train.\n",
 "# Cross validation on training data: \n",
 "# Log loss: 0.4193741176318106\n",
 "# Accuracy: 0.9330736269724108\n",
 "# f1score: 0.9335387323943662\n",
 "# Validation on testing data: \n",
 "# Log loss: 1.7250979894933764\n",
 "# Accuracy: 0.5860118374849917\n",
 "# f1score: 0.6601503759398496\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ] }