{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "ca555af9", "metadata": { "execution": { "iopub.execute_input": "2022-04-20T18:28:17.790372Z", "iopub.status.busy": "2022-04-20T18:28:17.789010Z", "iopub.status.idle": "2022-04-20T18:28:20.247943Z", "shell.execute_reply": "2022-04-20T18:28:20.248470Z", "shell.execute_reply.started": "2022-04-20T09:21:59.649817Z" }, "id": "jfZyocATxtuY", "outputId": "241ce5e4-6e24-4b24-ea2f-fb71ec44c241", "papermill": { "duration": 2.481663, "end_time": "2022-04-20T18:28:20.248767", "exception": false, "start_time": "2022-04-20T18:28:17.767104", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /usr/share/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package punkt to /usr/share/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#imports\n", "import pandas as pd\n", "from imblearn.pipeline import Pipeline, make_pipeline\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from sklearn.preprocessing import MultiLabelBinarizer\n", "\n", "from sklearn.metrics import coverage_error\n", "from sklearn.metrics import label_ranking_average_precision_score\n", "\n", "\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.svm import LinearSVC\n", "from sklearn import metrics\n", "from nltk import word_tokenize\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import multilabel_confusion_matrix\n", "import matplotlib.pyplot as plt\n", "\n", "import nltk\n", "nltk.download('stopwords')\n", "from nltk.corpus import stopwords\n", "nltk.download('punkt')" ] }, { "cell_type": "markdown", "id": "43c1970f", "metadata": { "id": "_V7ars6WxrbQ", "papermill": { "duration": 0.016705, "end_time": "2022-04-20T18:28:20.282691", "exception": false, "start_time": "2022-04-20T18:28:20.265986", "status": "completed" }, "tags": [] }, "source": [ "# Data preparation" ] }, { "cell_type": "code", "execution_count": 2, "id": "4530543e", "metadata": { "execution": { "iopub.execute_input": "2022-04-20T18:28:20.325972Z", "iopub.status.busy": "2022-04-20T18:28:20.325191Z", "iopub.status.idle": "2022-04-20T18:28:21.450647Z", "shell.execute_reply": "2022-04-20T18:28:21.450097Z", "shell.execute_reply.started": "2022-04-20T09:22:00.035051Z" }, "papermill": { "duration": 1.15018, "end_time": "2022-04-20T18:28:21.450806", "exception": false, "start_time": "2022-04-20T18:28:20.300626", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
descripcion030914151618192224...73757677798085909298
0Contrato Administrativo de Servicios de diseño...000000000...0000100000
12019(Y)1535 Construcción escalera de emergenci...000000000...0000000000
2Suministro de energía electrica de diversas in...010000000...0000000000
3Servicio desplazamiento del personal operativo...000000000...0000000000
4Contrato de suministro de gas natural en los ...010000000...0000000000
\n", "

5 rows × 46 columns

\n", "
" ], "text/plain": [ " descripcion 03 09 14 15 16 18 \\\n", "0 Contrato Administrativo de Servicios de diseño... 0 0 0 0 0 0 \n", "1 2019(Y)1535 Construcción escalera de emergenci... 0 0 0 0 0 0 \n", "2 Suministro de energía electrica de diversas in... 0 1 0 0 0 0 \n", "3 Servicio desplazamiento del personal operativo... 0 0 0 0 0 0 \n", "4 Contrato de suministro de gas natural en los ... 0 1 0 0 0 0 \n", "\n", " 19 22 24 ... 73 75 76 77 79 80 85 90 92 98 \n", "0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0 \n", "1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 \n", "2 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 \n", "3 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 \n", "4 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 \n", "\n", "[5 rows x 46 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('../input/dataset/train.csv')\n", "dftest = pd.read_csv('../input/dataset/test.csv')\n", "\n", "#df = pd.read_csv('../input/dataset10/train10.csv')\n", "#dftest = pd.read_csv('../input/dataset10/test10.csv')\n", "\n", "dftest['descripcion'] = dftest['descripcion'].apply(lambda x: x.strip('\"'))\n", "df['descripcion'] = df['descripcion'].apply(lambda x: x.strip('\"'))\n", "\n", "df.pop('Unnamed: 0')\n", "dftest.pop('Unnamed: 0')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "id": "5d5545d1", "metadata": { "execution": { "iopub.execute_input": "2022-04-20T18:28:21.491903Z", "iopub.status.busy": "2022-04-20T18:28:21.491201Z", "iopub.status.idle": "2022-04-20T18:28:21.493500Z", "shell.execute_reply": "2022-04-20T18:28:21.493952Z", "shell.execute_reply.started": "2022-04-20T09:22:00.902207Z" }, "papermill": { "duration": 0.025504, "end_time": "2022-04-20T18:28:21.494136", "exception": false, "start_time": "2022-04-20T18:28:21.468632", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "labels = df.columns[1:]" ] }, { "cell_type": "code", "execution_count": 4, "id": "b73d26c6", "metadata": { "execution": { "iopub.execute_input": "2022-04-20T18:28:21.534038Z", "iopub.status.busy": "2022-04-20T18:28:21.533414Z", "iopub.status.idle": "2022-04-20T18:28:21.604422Z", "shell.execute_reply": "2022-04-20T18:28:21.603755Z", "shell.execute_reply.started": "2022-04-20T09:22:00.907786Z" }, "id": "JbbwCXtUNv0G", "outputId": "1f6ef2d2-53ec-44ef-b6a6-a409373ac10a", "papermill": { "duration": 0.092474, "end_time": "2022-04-20T18:28:21.604567", "exception": false, "start_time": "2022-04-20T18:28:21.512093", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
03091415161819222430...73757677798085909298
00000000000...0000100000
10000000000...0000000000
20100000000...0000000000
30000000000...0000000000
40100000000...0000000000
..................................................................
724240000000000...0000000000
724250000000000...0000000000
724260000000000...0000000000
724270100000010...0000000000
724280000000000...0000000000
\n", "

72429 rows × 45 columns

\n", "
" ], "text/plain": [ " 03 09 14 15 16 18 19 22 24 30 ... 73 75 76 77 79 80 \\\n", "0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 \n", "1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", "2 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", "4 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", "... .. .. .. .. .. .. .. .. .. .. ... .. .. .. .. .. .. \n", "72424 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", "72425 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", "72426 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", "72427 0 1 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 \n", "72428 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", "\n", " 85 90 92 98 \n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "... .. .. .. .. \n", "72424 0 0 0 0 \n", "72425 0 0 0 0 \n", "72426 0 0 0 0 \n", "72427 0 0 0 0 \n", "72428 0 0 0 0 \n", "\n", "[72429 rows x 45 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train = df['descripcion']\n", "X_test = dftest['descripcion']\n", "\n", "y_train = df.drop('descripcion', axis=1)\n", "y_test = dftest.drop('descripcion', axis=1)\n", "\n", "y_train" ] }, { "cell_type": "markdown", "id": "438f8c69", "metadata": { "id": "6ER4f-Jlzj3Z", "papermill": { "duration": 0.018085, "end_time": "2022-04-20T18:28:21.641268", "exception": false, "start_time": "2022-04-20T18:28:21.623183", "status": "completed" }, "tags": [] }, "source": [ "# Models" ] }, { "cell_type": "code", "execution_count": 5, "id": "8e81be8e", "metadata": { "execution": { "iopub.execute_input": "2022-04-20T18:28:21.686985Z", "iopub.status.busy": "2022-04-20T18:28:21.686334Z", "iopub.status.idle": "2022-04-20T18:28:21.689527Z", "shell.execute_reply": "2022-04-20T18:28:21.688870Z", "shell.execute_reply.started": "2022-04-20T09:22:00.957103Z" }, "id": "Ti1_48srKepn", "papermill": { "duration": 0.030095, "end_time": "2022-04-20T18:28:21.689673", "exception": false, "start_time": "2022-04-20T18:28:21.659578", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "from sklearn.metrics import f1_score, roc_auc_score, accuracy_score\n", "from sklearn.metrics import coverage_error\n", "from sklearn.metrics import label_ranking_average_precision_score\n", " \n", "# adapted from: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/\n", "def multi_label_metrics(ytest,y_pred):\n", " # finally, compute metrics\n", " y_true = ytest\n", " f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')\n", " roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')\n", " accuracy = accuracy_score(y_true, y_pred)\n", " coverage_err = coverage_error(y_test, y_pred)\n", " label_ranking_average_precision = label_ranking_average_precision_score(y_test, y_pred)\n", " # return as dictionary\n", " metrics = {'f1': f1_micro_average,\n", " 'roc_auc': roc_auc,\n", " 'accuracy': accuracy,\n", " 'coverage_error': coverage_err,\n", " 'label_ranking_average_precision_score': label_ranking_average_precision}\n", " return metrics" ] }, { "cell_type": "code", "execution_count": 6, "id": "79b0b6e3", "metadata": { "execution": { "iopub.execute_input": "2022-04-20T18:28:21.731418Z", "iopub.status.busy": "2022-04-20T18:28:21.730711Z", "iopub.status.idle": "2022-04-20T18:28:21.738394Z", "shell.execute_reply": "2022-04-20T18:28:21.737835Z", "shell.execute_reply.started": "2022-04-20T09:22:00.965259Z" }, "papermill": { "duration": 0.030355, "end_time": "2022-04-20T18:28:21.738549", "exception": false, "start_time": "2022-04-20T18:28:21.708194", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "from sklearn.multiclass import OneVsRestClassifier" ] }, { "cell_type": "code", "execution_count": 7, "id": "d109ddf9", "metadata": { "execution": { "iopub.execute_input": "2022-04-20T18:28:21.783002Z", "iopub.status.busy": "2022-04-20T18:28:21.782297Z", "iopub.status.idle": "2022-04-20T18:50:18.187084Z", "shell.execute_reply": "2022-04-20T18:50:18.187687Z", "shell.execute_reply.started": "2022-04-20T09:22:00.979511Z" }, "id": "R994KBIu8sFK", "outputId": "1e6540de-1a26-404d-f41a-4ed124086392", "papermill": { "duration": 1316.430974, "end_time": "2022-04-20T18:50:18.187926", "exception": false, "start_time": "2022-04-20T18:28:21.756952", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "tfidf_tree = Pipeline([\n", " ('vectorizer', TfidfVectorizer(ngram_range=(1,3),\n", " analyzer=\"word\",\n", " max_features=None,\n", " stop_words=stopwords.words('spanish'),\n", " use_idf=True,\n", " lowercase=True)),\n", " ('tree', DecisionTreeClassifier(random_state=42))\n", "])\n", "\n", "tfidf_tree.fit(X_train, y_train)\n", "ypred = tfidf_tree.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 8, "id": "724c5398", "metadata": { "execution": { "iopub.execute_input": "2022-04-20T18:50:18.229967Z", "iopub.status.busy": "2022-04-20T18:50:18.228968Z", "iopub.status.idle": "2022-04-20T18:50:22.892983Z", "shell.execute_reply": "2022-04-20T18:50:22.893460Z", "shell.execute_reply.started": "2022-04-20T09:37:37.458486Z" }, "papermill": { "duration": 4.686493, "end_time": "2022-04-20T18:50:22.893656", "exception": false, "start_time": "2022-04-20T18:50:18.207163", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tfidf_tree:\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'f1': 0.6247494183579404, 'roc_auc': 0.7993711656698566, 'accuracy': 0.5994136975710328, 'coverage_error': 17.67556858449842, 'label_ranking_average_precision_score': 0.6415911108734487}\n" ] } ], "source": [ "print(f'tfidf_tree:')\t\n", "\n", "print(multi_label_metrics(y_test, ypred))" ] }, { "cell_type": "code", "execution_count": 9, "id": "890b787f", "metadata": { "execution": { "iopub.execute_input": "2022-04-20T18:50:22.967729Z", "iopub.status.busy": "2022-04-20T18:50:22.941631Z", "iopub.status.idle": "2022-04-20T18:53:03.150138Z", "shell.execute_reply": "2022-04-20T18:53:03.149498Z", "shell.execute_reply.started": "2022-04-20T09:37:41.248953Z" }, "id": "gYNZPMwgHPgo", "outputId": "6fb2a505-2b82-4f0d-fcad-1249b1a00495", "papermill": { "duration": 160.236295, "end_time": "2022-04-20T18:53:03.150448", "exception": false, "start_time": "2022-04-20T18:50:22.914153", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tfidf_knn:\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'f1': 0.6233266282860513, 'roc_auc': 0.754034082546732, 'accuracy': 0.5210038013014625, 'coverage_error': 21.68468526512467, 'label_ranking_average_precision_score': 0.5560795536653822}\n" ] } ], "source": [ "from sklearn.neighbors import KNeighborsClassifier\n", "\n", "tfidf_knn = Pipeline([\n", " ('vectorizer', TfidfVectorizer(ngram_range=(1,3),\n", " analyzer=\"word\",\n", " max_features=None,\n", " stop_words=stopwords.words('spanish'),\n", " use_idf=True,\n", " lowercase=True)),\n", " ('knn', KNeighborsClassifier(n_neighbors=5))\n", "])\n", "\n", "tfidf_knn.fit(X_train, y_train)\n", "ypred = tfidf_knn.predict(X_test)\n", "\n", "#ypredprob = tfidf_knn.predict_proba(X_test)\n", "#print(ypredprob)\n", "\n", "print(f'tfidf_knn:')\t\n", "print(multi_label_metrics(y_test, ypred))" ] }, { "cell_type": "code", "execution_count": 10, "id": "bd7f0778", "metadata": { "execution": { "iopub.execute_input": "2022-04-20T18:53:03.194640Z", "iopub.status.busy": "2022-04-20T18:53:03.193928Z", "iopub.status.idle": "2022-04-20T18:53:28.205280Z", "shell.execute_reply": "2022-04-20T18:53:28.205871Z", "shell.execute_reply.started": "2022-04-20T09:39:50.076209Z" }, "id": "xjXlXc3n0op3", "outputId": "695d55da-eeb2-42b4-eba7-fae240f8b2a1", "papermill": { "duration": 25.035848, "end_time": "2022-04-20T18:53:28.206109", "exception": false, "start_time": "2022-04-20T18:53:03.170261", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tfidf_nb:\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'f1': 0.21822499557891015, 'roc_auc': 0.5614031276289032, 'accuracy': 0.13472070098576122, 'coverage_error': 39.0722891566265, 'label_ranking_average_precision_score': 0.1588181603418836}\n" ] } ], "source": [ "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.multiclass import OneVsRestClassifier\n", "\n", "\n", "tfidf_NB = Pipeline([\n", " ('vectorizer', TfidfVectorizer(ngram_range=(1,3),\n", " analyzer=\"word\",\n", " max_features=None,\n", " stop_words=stopwords.words('spanish'),\n", " use_idf=True,\n", " lowercase=True)),\n", " ('nb', OneVsRestClassifier(MultinomialNB()))\n", "])\n", "\n", "\n", "tfidf_NB.fit(X_train, y_train)\n", "ypred2 = tfidf_NB.predict(X_test)\n", "\n", "print(f'tfidf_nb:')\t\n", "\n", "print(multi_label_metrics(y_test, ypred2))" ] }, { "cell_type": "code", "execution_count": 11, "id": "a59dceb0", "metadata": { "execution": { "iopub.execute_input": "2022-04-20T18:53:28.265660Z", "iopub.status.busy": "2022-04-20T18:53:28.259925Z", "iopub.status.idle": "2022-04-20T20:45:14.002182Z", "shell.execute_reply": "2022-04-20T20:45:14.003009Z" }, "papermill": { "duration": 6705.775504, "end_time": "2022-04-20T20:45:14.003407", "exception": false, "start_time": "2022-04-20T18:53:28.227903", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tfidf_random_forest:\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'f1': 0.6383469843633656, 'roc_auc': 0.7434141602805479, 'accuracy': 0.5138522002448296, 'coverage_error': 22.322337478255267, 'label_ranking_average_precision_score': 0.5416257063669199}\n" ] } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", "tfidf_random_forest = Pipeline([\n", " ('vectorizer', TfidfVectorizer(ngram_range=(1,3),\n", " analyzer=\"word\",\n", " max_features=None,\n", " stop_words=stopwords.words('spanish'),\n", " use_idf=True,\n", " lowercase=True)),\n", " ('randomforest', RandomForestClassifier(random_state=42))\n", "])\n", "\n", "\n", "\n", "tfidf_random_forest.fit(X_train, y_train)\n", "ypred3 = tfidf_random_forest.predict(X_test)\n", "\n", "\n", "print(f'tfidf_random_forest:')\t\n", "print(multi_label_metrics(y_test, ypred3))" ] }, { "cell_type": "code", "execution_count": 12, "id": "d673e77c", "metadata": { "execution": { "iopub.execute_input": "2022-04-20T20:45:14.054008Z", "iopub.status.busy": "2022-04-20T20:45:14.053269Z", "iopub.status.idle": "2022-04-20T21:27:18.733115Z", "shell.execute_reply": "2022-04-20T21:27:18.732462Z" }, "papermill": { "duration": 2524.705922, "end_time": "2022-04-20T21:27:18.733316", "exception": false, "start_time": "2022-04-20T20:45:14.027394", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tfidf_adaboost:\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'f1': 0.6022293427870193, 'roc_auc': 0.7491995654601569, 'accuracy': 0.45196830101153274, 'coverage_error': 22.465820501256363, 'label_ranking_average_precision_score': 0.5127305400913543}\n" ] } ], "source": [ "from sklearn.ensemble import AdaBoostClassifier\n", "\n", "tfidf_adaboost = Pipeline([\n", " ('vectorizer', TfidfVectorizer(ngram_range=(1,3),\n", " analyzer=\"word\",\n", " max_features=None,\n", " stop_words=stopwords.words('spanish'),\n", " use_idf=True,\n", " lowercase=True)),\n", " ('adaboost', OneVsRestClassifier(AdaBoostClassifier(random_state=42)))\n", "])\n", "\n", "\n", "\n", "tfidf_adaboost.fit(X_train, y_train)\n", "ypred3 = tfidf_adaboost.predict(X_test)\n", "\n", "\n", "print(f'tfidf_adaboost:')\t\n", "print(multi_label_metrics(y_test, ypred3))" ] }, { "cell_type": "code", "execution_count": null, "id": "d3a4d9d4", "metadata": { "execution": { "iopub.execute_input": "2022-04-20T09:40:07.524385Z", "iopub.status.busy": "2022-04-20T09:40:07.523955Z" }, "id": "kwxxAPm-KP_Y", "outputId": "9e182203-db0d-49ae-bd5a-b8a887614bf9", "papermill": { "duration": null, "end_time": null, "exception": false, "start_time": "2022-04-20T21:27:18.756332", "status": "running" }, "tags": [] }, "outputs": [], "source": [ "from sklearn.svm import SVC\n", "\n", "tfidf_SVC = Pipeline([\n", " ('vectorizer', TfidfVectorizer(ngram_range=(1,3),\n", " analyzer=\"word\",\n", " max_features=None,\n", " stop_words=stopwords.words('spanish'),\n", " use_idf=True,\n", " lowercase=True)),\n", " ('linearsvc', OneVsRestClassifier(SVC(random_state=42)))\n", "])\n", "\n", "\n", "tfidf_SVC.fit(X_train, y_train)\n", "ypred2 = tfidf_SVC.predict(X_test)\n", "\n", "\n", "print(f'tfidf_SVC:')\t\n", "print(multi_label_metrics(y_test, ypred2))" ] }, { "cell_type": "code", "execution_count": null, "id": "b82f9e0a", "metadata": { "id": "j-ZUzhm5wm_X", "outputId": "4d113cc8-cc89-4390-ff81-4cc54815fb43", "papermill": { "duration": null, "end_time": null, "exception": null, "start_time": null, "status": "pending" }, "tags": [] }, "outputs": [], "source": [ "from sklearn.svm import SVC\n", "\n", "tfidf_SVMrbf = Pipeline([\n", " ('vectorizer', TfidfVectorizer(ngram_range=(1,3),\n", " analyzer=\"word\",\n", " max_features=None,\n", " stop_words=stopwords.words('spanish'),\n", " use_idf=True,\n", " lowercase=True)),\n", " ('rbfsvc', OneVsRestClassifier(SVC(kernel=\"rbf\", random_state=42)))\n", "])\n", "\n", "\n", "tfidf_SVMrbf.fit(X_train, y_train)\n", "ypred2 = tfidf_SVMrbf.predict(X_test)\n", "\n", "\n", "print(f'tfidf_SVMrbf:')\t\n", "print(multi_label_metrics(y_test, ypred2))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" }, "papermill": { "default_parameters": {}, "duration": null, "end_time": null, "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": "2022-04-20T18:28:06.182049", "version": "2.3.3" } }, "nbformat": 4, "nbformat_minor": 5 }