{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "WQWllbm7n3yE" }, "source": [ "# **Tweets classified as agressive or not**\n", "\n", "Author: Ezhova Darya (@ezhdi slack)\n", "\n", "Dataset https://www.kaggle.com/dataturks/dataset-for-detection-of-cybertrolls" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "NPVhV-e-C-xx" }, "source": [ "The dataset has 20001 items of which 20001 items have been manually labeled.\n", "\n", "The labels are divided into following 2 categories:\n", "\n", "1 - Cyber-Aggressive\n", "0 - Non Cyber-Aggressive" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 929, "status": "ok", "timestamp": 1545074660860, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "tcxZbNs1_66K", "outputId": "dd7b681f-6ac2-4168-8f7d-72b57f0ae308" }, "outputs": [], "source": [ "from google.colab import drive\n", "drive.mount('/content/gdrive')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "ElmXpP1H_85Q" }, "outputs": [], "source": [ "import os\n", "import json\n", "import numpy as np\n", "import pandas as pd\n", "import scipy\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "%matplotlib inline\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics import mean_absolute_error\n", "from scipy.sparse import csr_matrix, hstack\n", "from sklearn.linear_model import Ridge\n", "from sklearn.model_selection import train_test_split, validation_curve\n", "from sklearn.model_selection import StratifiedKFold" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "X6IuuvvOlPBI" }, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "xeijnNHGAEtX" }, "outputs": [], "source": [ "PATH_TO_DATA = '/content/gdrive/My Drive/Data/'" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "xnYlxlBHq_Fx" }, "source": [ "**Read the data and have a look at it**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 1139, "status": "ok", "timestamp": 1545074661117, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "7bLrv1YJAV-Y", "outputId": "e5359b14-4623-4376-9af8-c2fd2a3e49a1" }, "outputs": [], "source": [ "df = pd.read_json(PATH_TO_DATA + '/Dataset for Detection of Cyber-Trolls.json', lines= True)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 1128, "status": "ok", "timestamp": 1545074661118, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "5ln5bw1pALT8", "outputId": "e716f367-7794-4796-8437-1a3d7fa8fe5c" }, "outputs": [], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 153 }, "colab_type": "code", "executionInfo": { "elapsed": 1629, "status": "ok", "timestamp": 1545074661628, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "xxaueJVaXqFd", "outputId": "e5c0eddc-c9be-497c-b189-2c5a39105e07" }, "outputs": [], "source": [ "df.info()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "esguMF3JrM7V" }, "source": [ "**Delete null column extras, transform target column annotation and make some new features**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "jIxH0DMCXw4l" }, "outputs": [], "source": [ "df.drop(columns = ['extras'], inplace = True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "wuCFTjUvZ3vX" }, "outputs": [], "source": [ "df.rename(columns = {'annotation' : 'label'}, inplace = True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "YSyPv27Rauls" }, "outputs": [], "source": [ "df.label = df.label.apply(lambda x : int(x['label'][0]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 1597, "status": "ok", "timestamp": 1545074661634, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "PlH6iw_l_yVD", "outputId": "da7b9437-d543-4787-9aa1-aa089f16e4ef" }, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 119 }, "colab_type": "code", "executionInfo": { "elapsed": 1587, "status": "ok", "timestamp": 1545074661634, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "PasaixlKdEPc", "outputId": "a1a90568-5512-414e-a0b8-543c5f2776c2" }, "outputs": [], "source": [ "df[df['label'] == 0].sample(5).content" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 119 }, "colab_type": "code", "executionInfo": { "elapsed": 1579, "status": "ok", "timestamp": 1545074661635, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "wteESba_AuP9", "outputId": "70342740-5606-4527-b32a-6ad96cb76a29" }, "outputs": [], "source": [ "df[df['label'] == 1].sample(5).content" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 265 }, "colab_type": "code", "executionInfo": { "elapsed": 1571, "status": "ok", "timestamp": 1545074661635, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "7jY1qqAnA2ro", "outputId": "436024f1-55c9-4380-aa5a-93130dacbbc3" }, "outputs": [], "source": [ "_, ax = plt.subplots()\n", "plt.bar(np.arange(2), df['label'].value_counts(), color = ['blue', 'red'])\n", "ax.set_xticks(np.arange(2))\n", "ax.set_xticklabels(['Non Cyber-Aggressive', 'Cyber-Aggressive']);" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 1563, "status": "ok", "timestamp": 1545074661635, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "rU7LoBULBziZ", "outputId": "519c3845-9d44-409f-ed28-aa7f7942a72a" }, "outputs": [], "source": [ "df['label'].value_counts()[1] / df.shape[0], df['label'].value_counts()[0] / df.shape[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "ZmgQvhomCgqi" }, "outputs": [], "source": [ "df['len'] = df['content'].apply(lambda x : len(x.strip().split()))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "p7Wss-cSDSN6" }, "outputs": [], "source": [ "import regex as re " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "PDcAbnLmJJcu" }, "outputs": [], "source": [ "df['capital'] = df['content'].apply(lambda x : sum(1 for c in x if c.isupper()))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "lgc2UZj3Y8fZ" }, "outputs": [], "source": [ "df['punct'] = df['content'].apply(lambda x : len(re.findall(\"[^\\P{P}-]+\", x)))\n", "df['punct'] = df['content'].apply(lambda x : len(re.findall(\"[^\\P{P}-]+\", x)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "ZNIddyfkY9mR" }, "outputs": [], "source": [ "df['content'] = df['content'].apply(lambda x : re.sub(\"[^\\P{P}-]+\", \"\", x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "6k9s6SikDZUJ" }, "outputs": [], "source": [ "df['content'] = df['content'].apply(lambda x : str.lower(x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 816 }, "colab_type": "code", "executionInfo": { "elapsed": 2706, "status": "ok", "timestamp": 1545074662839, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "FNY_NuDPJ063", "outputId": "32725561-4930-4e63-cea9-f35a62f2201e" }, "outputs": [], "source": [ "symbols = {}\n", "for x in [item for sublist in list(map(list, df['content'].tolist())) for item in sublist] :\n", " if x in symbols :\n", " symbols[x] += 1\n", " else :\n", " symbols[x] = 1\n", "symbols" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "-8WkviCgejql" }, "outputs": [], "source": [ "digits = '0123456789'\n", "df['num'] = df['content'].apply(lambda x : 1 if len([s for s in x if s in digits]) > 0 else 0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 2693, "status": "ok", "timestamp": 1545074662842, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "OILpILKFfxOO", "outputId": "12abb3ff-65d7-4470-e0b5-9dbd73144f8d" }, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "Am5Bxbk9fyvJ" }, "outputs": [], "source": [ "target = df['label'].values" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "b6Qj5kowrvtG" }, "source": [ "**Split dataset on train and test**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "XxoTjFz_gCEL" }, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(df, target, test_size = 0.3, stratify = target, random_state = 31)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 2679, "status": "ok", "timestamp": 1545074662845, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "SgXDrfjzgKFv", "outputId": "22834b26-69c7-4997-f543-af182d87be82" }, "outputs": [], "source": [ "y_train.sum() / len(y_train), y_test.sum() / len(y_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 2674, "status": "ok", "timestamp": 1545074662846, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "nOFQkM99gZJG", "outputId": "1f29112e-706c-4137-b511-f746462a3d95" }, "outputs": [], "source": [ "X_train.shape, X_test.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 2373 }, "colab_type": "code", "executionInfo": { "elapsed": 8334, "status": "ok", "timestamp": 1545074668514, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "hVjajeJJgkDZ", "outputId": "2132836e-9e5a-4c7a-93a9-0735eb856390" }, "outputs": [], "source": [ "for col in X_train.columns[2 :] :\n", " fig, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 10))\n", " axes[0].set_title(col)\n", " axes[0].hist(X_train[col], bins = 200);\n", " axes[1].set_title(col)\n", " axes[1].hist(X_train[col][X_train['label'] == 0], bins = 200, label = 'normal')\n", " axes[1].hist(X_train[col][X_train['label'] == 1], bins = 200, label = 'agressive')\n", " plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 613 }, "colab_type": "code", "executionInfo": { "elapsed": 8704, "status": "ok", "timestamp": 1545074668892, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "6ZHdgzrRgw7_", "outputId": "d4d17823-7ed4-49bb-aea7-f3f29c180577" }, "outputs": [], "source": [ "fig, ax = plt.subplots(figsize = (20, 10))\n", "sns.heatmap(X_train[['label', 'len', 'punct', 'capital','num']].corr())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "1HtcW9nghANu" }, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler\n", "scaler = StandardScaler()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "dzrmSTWDhy0t" }, "outputs": [], "source": [ "cols = ['len', 'punct', 'capital', 'num']\n", "X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train[cols]), columns = cols)\n", "X_test_scaled = pd.DataFrame(scaler.transform(X_test[cols]), columns = cols)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "54UHa62ziJW-" }, "outputs": [], "source": [ "def valid(model, n, bayes = False) :\n", " skf = StratifiedKFold(n_splits = n, random_state = 31)\n", " auc_scores = []\n", " for train_index, valid_index in skf.split(X_train_scaled, y_train):\n", " X_train_part, X_valid = X_train_scaled.iloc[train_index], X_train_scaled.iloc[valid_index]\n", " y_train_part, y_valid = y_train[train_index], y_train[valid_index]\n", " \n", " X_train_sms, X_valid_sms = X_train.iloc[train_index]['content'], X_train.iloc[valid_index]['content']\n", " cv = TfidfVectorizer(ngram_range = (1, 3))\n", " X_train_bow = cv.fit_transform(X_train_sms)\n", " X_valid_bow = cv.transform(X_valid_sms) \n", " if bayes :\n", " X_train_new = X_train_bow\n", " X_valid_new = X_valid_bow\n", " else :\n", " X_train_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_train_bow, X_train_part]))\n", " X_valid_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_valid_bow, X_valid]))\n", " model.fit(X_train_new, y_train_part)\n", " model_pred_for_auc = model.predict_proba(X_valid_new)\n", " auc_scores.append(roc_auc_score(y_valid, model_pred_for_auc[:, 1]))\n", " return np.mean(auc_scores)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "DWLMwdxZiSJO" }, "outputs": [], "source": [ "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "logit = LogisticRegression(random_state = 31)\n", "bayes = MultinomialNB()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "sCvnmUwDlFlt" }, "outputs": [], "source": [ "from sklearn.metrics import roc_auc_score, precision_score" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 29198, "status": "ok", "timestamp": 1545074689425, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "VmrA6oLDkZkU", "outputId": "e35f3012-db68-4174-e741-d66c8b5296d5" }, "outputs": [], "source": [ "scores_logit = valid(logit, 10)\n", "print('Logistic regreession - rocauc : {}'.format(scores_logit))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 42138, "status": "ok", "timestamp": 1545074702373, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "L1jyNyKIkbAh", "outputId": "d1b97bec-ae8b-44d6-db5e-2d0cf610332c" }, "outputs": [], "source": [ "scores_bayes = valid(bayes, 10, True)\n", "print('Bayessian classfier - rocauc : {}'.format(scores_bayes))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "szWD42lYldYE" }, "outputs": [], "source": [ "def valid_for_valid_plots(model, n, bayes = False) :\n", " skf = StratifiedKFold(n_splits = n, random_state = 17)\n", " auc_scores_cv = []\n", " auc_scores_valid = []\n", " for train_index, valid_index in skf.split(X_train_scaled, y_train):\n", " X_train_part, X_valid = X_train_scaled.iloc[train_index], X_train_scaled.iloc[valid_index]\n", " y_train_part, y_valid = y_train[train_index], y_train[valid_index]\n", " \n", " X_train_sms, X_valid_sms = X_train.iloc[train_index]['content'], X_train.iloc[valid_index]['content']\n", " cv = TfidfVectorizer(ngram_range = (1, 3))\n", " X_train_bow = cv.fit_transform(X_train_sms)\n", " X_valid_bow = cv.transform(X_valid_sms) \n", " if bayes :\n", " X_train_new = X_train_bow\n", " X_valid_new = X_valid_bow\n", " else :\n", " X_train_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_train_bow, X_train_part]))\n", " X_valid_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_valid_bow, X_valid]))\n", " \n", " model.fit(X_train_new, y_train_part)\n", " auc_scores_cv.append(roc_auc_score(y_train_part, model.predict_proba(X_train_new)[:, 1]))\n", " model_pred_for_auc = model.predict_proba(X_valid_new)\n", " auc_scores_valid.append(roc_auc_score(y_valid, model_pred_for_auc[:, 1]))\n", " return 1 - np.mean(auc_scores_valid), 1 - np.mean(auc_scores_cv)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "7auqqbVvllfu" }, "outputs": [], "source": [ "Cs = [0.1 * i for i in range(1, 11)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "RVtony63lryo" }, "outputs": [], "source": [ "scores = []\n", "for c in Cs :\n", " logit = LogisticRegression(C = c, random_state = 31)\n", " scores.append(valid_for_valid_plots(logit, 10))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 620 }, "colab_type": "code", "executionInfo": { "elapsed": 241245, "status": "ok", "timestamp": 1545074901501, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "Ix0BNb-IlwTw", "outputId": "a6868c4e-2124-41a1-f0e2-ebb983a7b1db" }, "outputs": [], "source": [ "fig, axes = plt.subplots(nrows = 1, ncols = 1, figsize = (20, 10))\n", "plt.plot(Cs, [i[0] for i in scores], color = 'blue', label='holdout')\n", "plt.plot(Cs, [i[1] for i in scores], color = 'red', label='CV')\n", "plt.ylabel(\"ROCAUC\")\n", "plt.xlabel(\"C\")\n", "plt.title('Validation curve for C in (0.1, 2)');" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "BdockSQKl9eq" }, "outputs": [], "source": [ "Cs = np.linspace(0.5, 1.5, 10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 449100, "status": "ok", "timestamp": 1545075109368, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "TGSx514lmCzT", "outputId": "80d99e95-1394-42c4-a655-1e3f6601b0fc" }, "outputs": [], "source": [ "for c in Cs :\n", " logit = LogisticRegression(C = c, random_state = 31)\n", " print(c, valid(logit, 10))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "0E555wuenXOM" }, "outputs": [], "source": [ "C_opt = 1.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "9QNqUHoFmNUr" }, "outputs": [], "source": [ "cv = TfidfVectorizer(ngram_range = (1, 3))\n", "X_train_content = cv.fit_transform(X_train['content'])\n", "X_test_content = cv.transform(X_test['content'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "rUNkxonKmQwA" }, "outputs": [], "source": [ "train = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_train_content, X_train_scaled]))\n", "test = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_test_content, X_test_scaled]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "CUMb1Dp9ml8-" }, "outputs": [], "source": [ "logit = LogisticRegression(C = C_opt, random_state = 31)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 85 }, "colab_type": "code", "executionInfo": { "elapsed": 451689, "status": "ok", "timestamp": 1545075111998, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "mCfz_Ti3mqsX", "outputId": "fdd0a604-024d-4a68-e9fb-f3708e70465d" }, "outputs": [], "source": [ "logit.fit(train, y_train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 85 }, "colab_type": "code", "executionInfo": { "elapsed": 451918, "status": "ok", "timestamp": 1545075112249, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "q1ychVYBmti1", "outputId": "4d069dde-76eb-421c-ddbe-dbe04c656b02" }, "outputs": [], "source": [ "for x, y in zip(cols, logit.coef_[0][len(cv.get_feature_names()) :]) :\n", " print(x, y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "mgvHMir8mwGj" }, "outputs": [], "source": [ "logit_pred = logit.predict_proba(test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 451893, "status": "ok", "timestamp": 1545075112252, "user": { "displayName": "Darya Ezhova", "photoUrl": "", "userId": "05078344894993071262" }, "user_tz": -240 }, "id": "zU_am3bFmyoW", "outputId": "3782bedb-0c89-41e7-e171-87e104038f47" }, "outputs": [], "source": [ "roc_auc_score(y_test, logit_pred[:, 1])" ] } ], "metadata": { "colab": { "name": "Tweets_classified_Ezhova_Darya.ipynb", "provenance": [], "version": "0.3.2" }, "kernelspec": { "display_name": "Python 3", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 0 }