{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "WQWllbm7n3yE"
   },
   "source": [
    "# **Tweets classified as aggressive or not**\n",
    "\n",
    "Author: Ezhova Darya (@ezhdi slack)\n",
    "\n",
    "Dataset: https://www.kaggle.com/dataturks/dataset-for-detection-of-cybertrolls"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "NPVhV-e-C-xx"
   },
   "source": [
    "The dataset has 20001 items, all of which have been manually labeled.\n",
    "\n",
    "The labels are divided into the following 2 categories:\n",
    "\n",
    "- 1 - Cyber-Aggressive\n",
    "- 0 - Non Cyber-Aggressive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 929,
     "status": "ok",
     "timestamp": 1545074660860,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "tcxZbNs1_66K",
    "outputId": "dd7b681f-6ac2-4168-8f7d-72b57f0ae308"
   },
   "outputs": [],
   "source": [
    "from google.colab import drive\n",
    "drive.mount('/content/gdrive')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "ElmXpP1H_85Q"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scipy\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "%matplotlib inline\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import mean_absolute_error\n",
    "from scipy.sparse import csr_matrix, hstack\n",
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.model_selection import train_test_split, validation_curve\n",
    "from sklearn.model_selection import StratifiedKFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "X6IuuvvOlPBI"
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "xeijnNHGAEtX"
   },
   "outputs": [],
   "source": [
    "PATH_TO_DATA = '/content/gdrive/My Drive/Data/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "xnYlxlBHq_Fx"
   },
   "source": [
    "**Read the data and have a look at it**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 204
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 1139,
     "status": "ok",
     "timestamp": 1545074661117,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "7bLrv1YJAV-Y",
    "outputId": "e5359b14-4623-4376-9af8-c2fd2a3e49a1"
   },
   "outputs": [],
   "source": [
    "# PATH_TO_DATA already ends with '/', so build the path with os.path.join\n",
    "# instead of concatenating another '/' (which produced a double slash).\n",
    "# lines = True: the file holds one JSON object per line.\n",
    "df = pd.read_json(os.path.join(PATH_TO_DATA, 'Dataset for Detection of Cyber-Trolls.json'), lines = True)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 1128,
     "status": "ok",
     "timestamp": 1545074661118,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "5ln5bw1pALT8",
    "outputId": "e716f367-7794-4796-8437-1a3d7fa8fe5c"
   },
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 153
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 1629,
     "status": "ok",
     "timestamp": 1545074661628,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "xxaueJVaXqFd",
    "outputId": "e5c0eddc-c9be-497c-b189-2c5a39105e07"
   },
   "outputs": [],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "esguMF3JrM7V"
   },
   "source": [
    "**Drop the null `extras` column, transform the target column `annotation`, and create some new features**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "jIxH0DMCXw4l"
   },
   "outputs": [],
   "source": [
    "df.drop(columns = ['extras'], inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "wuCFTjUvZ3vX"
   },
   "outputs": [],
   "source": [
    "df.rename(columns = {'annotation' : 'label'}, inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "YSyPv27Rauls"
   },
   "outputs": [],
   "source": [
    "df.label = df.label.apply(lambda x : int(x['label'][0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 204
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 1597,
     "status": "ok",
     "timestamp": 1545074661634,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "PlH6iw_l_yVD",
    "outputId": "da7b9437-d543-4787-9aa1-aa089f16e4ef"
   },
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 119
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 1587,
     "status": "ok",
     "timestamp": 1545074661634,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "PasaixlKdEPc",
    "outputId": "a1a90568-5512-414e-a0b8-543c5f2776c2"
   },
   "outputs": [],
   "source": [
    "df[df['label'] == 0].sample(5).content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 119
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 1579,
     "status": "ok",
     "timestamp": 1545074661635,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "wteESba_AuP9",
    "outputId": "70342740-5606-4527-b32a-6ad96cb76a29"
   },
   "outputs": [],
   "source": [
    "df[df['label'] == 1].sample(5).content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 265
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 1571,
     "status": "ok",
     "timestamp": 1545074661635,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "7jY1qqAnA2ro",
    "outputId": "436024f1-55c9-4380-aa5a-93130dacbbc3"
   },
   "outputs": [],
   "source": [
    "# value_counts() orders by frequency, not by label; sort by label so that\n",
    "# bar 0 / bar 1 always correspond to the tick labels set below.\n",
    "class_counts = df['label'].value_counts().sort_index()\n",
    "_, ax = plt.subplots()\n",
    "ax.bar(np.arange(2), class_counts, color = ['blue', 'red'])\n",
    "ax.set_xticks(np.arange(2))\n",
    "ax.set_xticklabels(['Non Cyber-Aggressive', 'Cyber-Aggressive']);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 1563,
     "status": "ok",
     "timestamp": 1545074661635,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "rU7LoBULBziZ",
    "outputId": "519c3845-9d44-409f-ed28-aa7f7942a72a"
   },
   "outputs": [],
   "source": [
    "df['label'].value_counts()[1] / df.shape[0], df['label'].value_counts()[0] / df.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "ZmgQvhomCgqi"
   },
   "outputs": [],
   "source": [
    "# Number of whitespace-separated tokens per tweet\n",
    "# (str.split() with no argument already ignores leading/trailing whitespace).\n",
    "df['len'] = df['content'].apply(lambda text : len(text.split()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "p7Wss-cSDSN6"
   },
   "outputs": [],
   "source": [
    "import regex as re "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "PDcAbnLmJJcu"
   },
   "outputs": [],
   "source": [
    "# Number of upper-case characters per tweet (summing the booleans counts the True values).\n",
    "df['capital'] = df['content'].apply(lambda text : sum(map(str.isupper, text)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "lgc2UZj3Y8fZ"
   },
   "outputs": [],
   "source": [
    "# Count runs of consecutive punctuation characters (any Unicode punctuation except '-').\n",
    "# Raw string: '\\P' is not a valid Python string escape, so a plain literal triggers an invalid-escape warning.\n",
    "df['punct'] = df['content'].apply(lambda x : len(re.findall(r\"[^\\P{P}-]+\", x)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "ZNIddyfkY9mR"
   },
   "outputs": [],
   "source": [
    "# Remove every punctuation character except '-' from the tweet text.\n",
    "# Raw string: '\\P' is not a valid Python string escape, so a plain literal triggers an invalid-escape warning.\n",
    "df['content'] = df['content'].apply(lambda x : re.sub(r\"[^\\P{P}-]+\", \"\", x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "6k9s6SikDZUJ"
   },
   "outputs": [],
   "source": [
    "# Lower-case every tweet.\n",
    "df['content'] = df['content'].map(str.lower)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 816
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 2706,
     "status": "ok",
     "timestamp": 1545074662839,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "FNY_NuDPJ063",
    "outputId": "32725561-4930-4e63-cea9-f35a62f2201e"
   },
   "outputs": [],
   "source": [
    "# Character frequency table over all tweets, keyed in order of first appearance.\n",
    "symbols = {}\n",
    "for text in df['content'] :\n",
    "    for ch in text :\n",
    "        symbols[ch] = symbols.get(ch, 0) + 1\n",
    "symbols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "-8WkviCgejql"
   },
   "outputs": [],
   "source": [
    "digits = '0123456789'\n",
    "# Binary flag: 1 when the tweet contains at least one of the characters in `digits`, else 0.\n",
    "df['num'] = df['content'].apply(lambda x : int(any(s in digits for s in x)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 204
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 2693,
     "status": "ok",
     "timestamp": 1545074662842,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "OILpILKFfxOO",
    "outputId": "12abb3ff-65d7-4470-e0b5-9dbd73144f8d"
   },
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "Am5Bxbk9fyvJ"
   },
   "outputs": [],
   "source": [
    "target = df['label'].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "b6Qj5kowrvtG"
   },
   "source": [
    "**Split the dataset into train and test sets**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "XxoTjFz_gCEL"
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(df, target, test_size = 0.3, stratify = target, random_state = 31)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 2679,
     "status": "ok",
     "timestamp": 1545074662845,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "SgXDrfjzgKFv",
    "outputId": "22834b26-69c7-4997-f543-af182d87be82"
   },
   "outputs": [],
   "source": [
    "y_train.sum() / len(y_train), y_test.sum() / len(y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 2674,
     "status": "ok",
     "timestamp": 1545074662846,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "nOFQkM99gZJG",
    "outputId": "1f29112e-706c-4137-b511-f746462a3d95"
   },
   "outputs": [],
   "source": [
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 2373
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 8334,
     "status": "ok",
     "timestamp": 1545074668514,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "hVjajeJJgkDZ",
    "outputId": "2132836e-9e5a-4c7a-93a9-0735eb856390"
   },
   "outputs": [],
   "source": [
    "# One figure per feature: overall distribution (left) and per-class distributions (right).\n",
    "# NOTE(review): columns[2 :] assumes the first two columns are the text and the label - confirm.\n",
    "for col in X_train.columns[2 :] :\n",
    "    fig, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 10))\n",
    "    axes[0].set_title(col)\n",
    "    axes[0].hist(X_train[col], bins = 200)\n",
    "    axes[1].set_title(col)\n",
    "    axes[1].hist(X_train[col][X_train['label'] == 0], bins = 200, label = 'normal')\n",
    "    axes[1].hist(X_train[col][X_train['label'] == 1], bins = 200, label = 'aggressive')\n",
    "    # The label= arguments are only rendered once a legend is requested.\n",
    "    axes[1].legend()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 613
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 8704,
     "status": "ok",
     "timestamp": 1545074668892,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "6ZHdgzrRgw7_",
    "outputId": "d4d17823-7ed4-49bb-aea7-f3f29c180577"
   },
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize = (20, 10))\n",
    "sns.heatmap(X_train[['label', 'len', 'punct', 'capital','num']].corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "1HtcW9nghANu"
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "scaler = StandardScaler()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "dzrmSTWDhy0t"
   },
   "outputs": [],
   "source": [
    "cols = ['len', 'punct', 'capital', 'num']\n",
    "X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train[cols]), columns = cols)\n",
    "X_test_scaled = pd.DataFrame(scaler.transform(X_test[cols]), columns = cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "54UHa62ziJW-"
   },
   "outputs": [],
   "source": [
    "def valid(model, n, bayes = False) :\n",
    "    \"\"\"Return the mean ROC AUC of `model` over `n` stratified folds of the training set.\n",
    "\n",
    "    A new TF-IDF vectorizer (1- to 3-grams) is fitted on the training part of every fold\n",
    "    and only applied to the validation part.\n",
    "\n",
    "    model : classifier providing fit() and predict_proba()\n",
    "    n : number of folds\n",
    "    bayes : when True only the TF-IDF matrix is used; otherwise the columns of\n",
    "        X_train_scaled are appended to it\n",
    "\n",
    "    Reads the notebook globals X_train, X_train_scaled and y_train.\n",
    "    \"\"\"\n",
    "    # random_state is deliberately omitted: it has no effect while shuffle is False,\n",
    "    # and newer scikit-learn versions raise a ValueError for that combination.\n",
    "    skf = StratifiedKFold(n_splits = n)\n",
    "    auc_scores = []\n",
    "    for train_index, valid_index in skf.split(X_train_scaled, y_train):\n",
    "        X_train_part, X_valid = X_train_scaled.iloc[train_index], X_train_scaled.iloc[valid_index]\n",
    "        y_train_part, y_valid = y_train[train_index], y_train[valid_index]\n",
    "\n",
    "        X_train_sms, X_valid_sms = X_train.iloc[train_index]['content'], X_train.iloc[valid_index]['content']\n",
    "        cv = TfidfVectorizer(ngram_range = (1, 3))\n",
    "        X_train_bow = cv.fit_transform(X_train_sms)\n",
    "        X_valid_bow = cv.transform(X_valid_sms)\n",
    "        if bayes :\n",
    "            X_train_new = X_train_bow\n",
    "            X_valid_new = X_valid_bow\n",
    "        else :\n",
    "            X_train_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_train_bow, X_train_part]))\n",
    "            X_valid_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_valid_bow, X_valid]))\n",
    "        model.fit(X_train_new, y_train_part)\n",
    "        # Column 1 of predict_proba is the probability of the positive class.\n",
    "        model_pred_for_auc = model.predict_proba(X_valid_new)\n",
    "        auc_scores.append(roc_auc_score(y_valid, model_pred_for_auc[:, 1]))\n",
    "    return np.mean(auc_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "DWLMwdxZiSJO"
   },
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "logit = LogisticRegression(random_state = 31)\n",
    "bayes = MultinomialNB()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "sCvnmUwDlFlt"
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score, precision_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 29198,
     "status": "ok",
     "timestamp": 1545074689425,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "VmrA6oLDkZkU",
    "outputId": "e35f3012-db68-4174-e741-d66c8b5296d5"
   },
   "outputs": [],
   "source": [
    "scores_logit = valid(logit, 10)\n",
    "print('Logistic regreession - rocauc : {}'.format(scores_logit))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 42138,
     "status": "ok",
     "timestamp": 1545074702373,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "L1jyNyKIkbAh",
    "outputId": "d1b97bec-ae8b-44d6-db5e-2d0cf610332c"
   },
   "outputs": [],
   "source": [
    "scores_bayes = valid(bayes, 10, True)\n",
    "print('Bayessian classfier - rocauc : {}'.format(scores_bayes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "szWD42lYldYE"
   },
   "outputs": [],
   "source": [
    "def valid_for_valid_plots(model, n, bayes = False) :\n",
    "    \"\"\"Cross-validate `model` and return error-style scores for a validation curve.\n",
    "\n",
    "    Runs the same per-fold pipeline as a plain stratified n-fold CV (TF-IDF with\n",
    "    1- to 3-grams fitted on the training part of each fold), but scores the model\n",
    "    on both the validation part and the training part of every fold.\n",
    "\n",
    "    model : classifier providing fit() and predict_proba()\n",
    "    n : number of folds\n",
    "    bayes : when True only the TF-IDF matrix is used; otherwise the columns of\n",
    "        X_train_scaled are appended to it\n",
    "\n",
    "    Returns a tuple (1 - mean validation ROC AUC, 1 - mean training ROC AUC).\n",
    "    Reads the notebook globals X_train, X_train_scaled and y_train.\n",
    "    \"\"\"\n",
    "    # random_state is deliberately omitted: it has no effect while shuffle is False,\n",
    "    # and newer scikit-learn versions raise a ValueError for that combination.\n",
    "    skf = StratifiedKFold(n_splits = n)\n",
    "    auc_scores_cv = []\n",
    "    auc_scores_valid = []\n",
    "    for train_index, valid_index in skf.split(X_train_scaled, y_train):\n",
    "        X_train_part, X_valid = X_train_scaled.iloc[train_index], X_train_scaled.iloc[valid_index]\n",
    "        y_train_part, y_valid = y_train[train_index], y_train[valid_index]\n",
    "\n",
    "        X_train_sms, X_valid_sms = X_train.iloc[train_index]['content'], X_train.iloc[valid_index]['content']\n",
    "        cv = TfidfVectorizer(ngram_range = (1, 3))\n",
    "        X_train_bow = cv.fit_transform(X_train_sms)\n",
    "        X_valid_bow = cv.transform(X_valid_sms)\n",
    "        if bayes :\n",
    "            X_train_new = X_train_bow\n",
    "            X_valid_new = X_valid_bow\n",
    "        else :\n",
    "            X_train_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_train_bow, X_train_part]))\n",
    "            X_valid_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_valid_bow, X_valid]))\n",
    "\n",
    "        model.fit(X_train_new, y_train_part)\n",
    "        # Despite the name, auc_scores_cv holds the score on the data the model was fitted on.\n",
    "        auc_scores_cv.append(roc_auc_score(y_train_part, model.predict_proba(X_train_new)[:, 1]))\n",
    "        model_pred_for_auc = model.predict_proba(X_valid_new)\n",
    "        auc_scores_valid.append(roc_auc_score(y_valid, model_pred_for_auc[:, 1]))\n",
    "    return 1 - np.mean(auc_scores_valid), 1 - np.mean(auc_scores_cv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "7auqqbVvllfu"
   },
   "outputs": [],
   "source": [
    "Cs = [0.1 * i for i in range(1, 11)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "RVtony63lryo"
   },
   "outputs": [],
   "source": [
    "scores = []\n",
    "for c in Cs :\n",
    "    logit = LogisticRegression(C = c, random_state = 31)\n",
    "    scores.append(valid_for_valid_plots(logit, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 620
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 241245,
     "status": "ok",
     "timestamp": 1545074901501,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "Ix0BNb-IlwTw",
    "outputId": "a6868c4e-2124-41a1-f0e2-ebb983a7b1db"
   },
   "outputs": [],
   "source": [
    "# Each entry of `scores` is (1 - validation ROC AUC, 1 - training ROC AUC) for one value of C,\n",
    "# so the y axis shows an error (lower is better), not the AUC itself.\n",
    "fig, ax = plt.subplots(figsize = (20, 10))\n",
    "ax.plot(Cs, [i[0] for i in scores], color = 'blue', label = 'validation folds')\n",
    "ax.plot(Cs, [i[1] for i in scores], color = 'red', label = 'training folds')\n",
    "ax.set_ylabel(\"1 - ROCAUC\")\n",
    "ax.set_xlabel(\"C\")\n",
    "# Take the range from Cs itself so the title cannot drift from the grid actually used.\n",
    "ax.set_title('Validation curve for C in ({:.1f}, {:.1f})'.format(min(Cs), max(Cs)))\n",
    "# Without a legend the label= arguments above are never shown.\n",
    "ax.legend();"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "BdockSQKl9eq"
   },
   "outputs": [],
   "source": [
    "Cs = np.linspace(0.5, 1.5, 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 187
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 449100,
     "status": "ok",
     "timestamp": 1545075109368,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "TGSx514lmCzT",
    "outputId": "80d99e95-1394-42c4-a655-1e3f6601b0fc"
   },
   "outputs": [],
   "source": [
    "for c in Cs :\n",
    "    logit = LogisticRegression(C = c, random_state = 31)\n",
    "    print(c, valid(logit, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "0E555wuenXOM"
   },
   "outputs": [],
   "source": [
    "C_opt = 1.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "9QNqUHoFmNUr"
   },
   "outputs": [],
   "source": [
    "cv = TfidfVectorizer(ngram_range = (1, 3))\n",
    "X_train_content = cv.fit_transform(X_train['content'])\n",
    "X_test_content = cv.transform(X_test['content'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "rUNkxonKmQwA"
   },
   "outputs": [],
   "source": [
    "train = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_train_content, X_train_scaled]))\n",
    "test = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_test_content, X_test_scaled]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "CUMb1Dp9ml8-"
   },
   "outputs": [],
   "source": [
    "logit = LogisticRegression(C = C_opt, random_state = 31)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 85
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 451689,
     "status": "ok",
     "timestamp": 1545075111998,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "mCfz_Ti3mqsX",
    "outputId": "fdd0a604-024d-4a68-e9fb-f3708e70465d"
   },
   "outputs": [],
   "source": [
    "logit.fit(train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 85
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 451918,
     "status": "ok",
     "timestamp": 1545075112249,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "q1ychVYBmti1",
    "outputId": "4d069dde-76eb-421c-ddbe-dbe04c656b02"
   },
   "outputs": [],
   "source": [
    "# The scaled numeric features were stacked after the TF-IDF columns, so their\n",
    "# coefficients start right after the vocabulary. len(cv.vocabulary_) replaces\n",
    "# len(cv.get_feature_names()), which no longer exists in recent scikit-learn releases.\n",
    "for x, y in zip(cols, logit.coef_[0][len(cv.vocabulary_) :]) :\n",
    "    print(x, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "mgvHMir8mwGj"
   },
   "outputs": [],
   "source": [
    "logit_pred = logit.predict_proba(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 451893,
     "status": "ok",
     "timestamp": 1545075112252,
     "user": {
      "displayName": "Darya Ezhova",
      "photoUrl": "",
      "userId": "05078344894993071262"
     },
     "user_tz": -240
    },
    "id": "zU_am3bFmyoW",
    "outputId": "3782bedb-0c89-41e7-e171-87e104038f47"
   },
   "outputs": [],
   "source": [
    "roc_auc_score(y_test, logit_pred[:, 1])"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Tweets_classified_Ezhova_Darya.ipynb",
   "provenance": [],
   "version": "0.3.2"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}