{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Project 3\n",
    "\n",
    "\n",
    "# Conversations Toxicity Detection\n",
    "\n",
    "Jigsaw Unintended Bias in Toxicity Classification \n",
    "\n",
    "Detect toxicity across a diverse range of conversations\n",
    "\n",
    "\n",
    "https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/data#\n",
    "\n",
    "# Sample Solution\n",
    "\n",
    "### Install the Kaggle API and download the datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "94JWSTZMH0IB"
   },
   "outputs": [],
   "source": [
    "!pip install -U -q kaggle\n",
    "# Credentials are deliberately NOT written from the notebook: an API key stored in a\n",
    "# cell is leaked to everyone who can read the file. The Kaggle CLI picks them up from\n",
    "# the KAGGLE_USERNAME / KAGGLE_KEY environment variables, or from a kaggle.json file\n",
    "# placed once in the user's ~/.kaggle directory, outside of version control."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 84
    },
    "colab_type": "code",
    "id": "0eQDyQmnKGYh",
    "outputId": "693bfd9a-0e4f-4dd3-fe11-8dbf6d922f4c"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
     ]
    }
   ],
   "source": [
    "!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -f test.csv -p ../datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -f train.csv -p ../datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model with TF-IDF and Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "CWro5KSCPAHG"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "import string\n",
    "from joblib import Parallel, delayed\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 84
    },
    "colab_type": "code",
    "id": "9GO20asAPqte",
    "outputId": "d7956043-e722-4eea-aee5-ec2befb20f2c"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     C:\\Users\\albah\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package punkt to\n",
      "[nltk_data]     C:\\Users\\albah\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "nltk.download('stopwords')\n",
    "nltk.download('punkt')\n",
    "from nltk.corpus import stopwords \n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.stem import SnowballStemmer\n",
    "stop_words = set(stopwords.words('english'))\n",
    "stem = SnowballStemmer('english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "8GbZh9o2PauG"
   },
   "outputs": [],
   "source": [
    "# Parse only the columns the model needs instead of the whole training file.\n",
    "# read_csv's usecols ignores element order, so re-select to keep the original order.\n",
    "train_cols = ['id', 'comment_text', 'target']\n",
    "train_df = pd.read_csv(\"../datasets/train.csv.zip\", usecols=train_cols)[train_cols]\n",
    "test_df = pd.read_csv(\"../datasets/test.csv.zip\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 195
    },
    "colab_type": "code",
    "id": "MmwNcTzXPw12",
    "outputId": "6fe5ad8e-2d84-44ab-9792-56285f1e7206"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>comment_text</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>59848</td>\n",
       "      <td>This is so cool. It's like, 'would you want yo...</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>59849</td>\n",
       "      <td>Thank you!! This would make my life a lot less...</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>59852</td>\n",
       "      <td>This is such an urgent design problem; kudos t...</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>59855</td>\n",
       "      <td>Is this something I'll be able to install on m...</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>59856</td>\n",
       "      <td>haha you guys are a bunch of losers.</td>\n",
       "      <td>0.893617</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      id                                       comment_text    target\n",
       "0  59848  This is so cool. It's like, 'would you want yo...  0.000000\n",
       "1  59849  Thank you!! This would make my life a lot less...  0.000000\n",
       "2  59852  This is such an urgent design problem; kudos t...  0.000000\n",
       "3  59855  Is this something I'll be able to install on m...  0.000000\n",
       "4  59856               haha you guys are a bunch of losers.  0.893617"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 286
    },
    "colab_type": "code",
    "id": "qtWj7h8gQOkF",
    "outputId": "20e09c89-3264-424f-f307-8b3532c2abee"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x22d74af47f0>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAD8CAYAAACyyUlaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAFjpJREFUeJzt3X+QXeV93/H3N1KEZWIsjOIdRlIjMt6kkSGd4B1Qmpl0a6UgcAbxB3TEkCA7SjSl2E2DprVo/lDHLjO4KaWGwSSqpSIyqkGhaaVJhFUNcMdtx5IRIUX8CNVWqGiNaowlVK8Zm8r99o/7rH3Z3P2h++zu0bLv18zOnvM9zznP89yV9Nnz415FZiJJUo2faHoAkqS5zzCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklRtYdMDmC1Lly7NlStX9rTv9773PS688MLpHdB5zjnPD855fqiZ87PPPvtmZv70ZO3mTZisXLmSw4cP97Rvq9VicHBwegd0nnPO84Nznh9q5hwR/2sq7bzMJUmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSao2b94BX+PIN8/wyS1/3kjfx+/5RCP9StK58MxEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFWbNEwiYkdEvBERL3TU/iAi/ioino+I/xgRSzq23RURQxHxSkRc21FfW2pDEbGlo35ZRByKiKMR8VhELCr1C8r6UNm+crI+JEnNmMqZycPA2jG1A8DlmfmLwP8A7gKIiFXAeuCjZZ8vRcSCiFgAPAhcB6wCbiltAb4A3JeZ/cBpYGOpbwROZ+ZHgPtKu3H7OMd5S5Km0aRhkplfA06Nqf3nzDxbVg8Cy8vyOuDRzPxBZr4KDAFXla+hzDyWme8AjwLrIiKAjwOPl/13Ajd2HGtnWX4cWFPaj9eHJKkh03HP5LeAJ8ryMuBEx7bhUhuvfgnwVkcwjdbfdayy/UxpP96xJEkNqfo4lYj4feAssGu01KVZ0j20coL2Ex1ron3Gjm8TsAmgr6+PVqvVrdmk+hbD5ivOTt5wBvQ65lojIyON9d0U5zw/OOeZ0XOYRMQG4NeBNZk5+o/5MLCio9ly4PWy3K3+JrAkIhaWs4/O9qPHGo6IhcAHaV9um6iPd8nMbcA2gIGBgRwcHDz3iQIP7NrDvUea+Riz47cONtJvq9Wi19drrnLO84Nznhk9XeaKiLXAZ4EbMvPtjk17gfXlSazLgH7gG8AzQH95cmsR7Rvoe0sIPQ3cVPbfAOzpONaGsnwT8FRpP14fkqSGTPrrdkR8BRgElkbEMLCV9tNbFwAH2vfEOZiZ/yAzX4yI3cBLtC9/3ZGZPyzH+TSwH1gA7MjMF0sXnwUejYh/ATwHbC/17cAfR8QQ7TOS9QAT9SFJasakYZKZt3Qpb+9SG21/N3B3l/o+YF+X+jG6PI2Vmd8Hbj6XPiRJzfAd8JKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqpNGiYRsSMi3oiIFzpqH4qIAxFxtHy/uNQjIu6PiKGIeD4iruzYZ0NpfzQiNnTUPxYRR8o+90dE9NqHJKkZUzkzeRhYO6a2BXgyM/uBJ8s6wHVAf/naBDwE7WAAtgJXA1cBW0fDobTZ1LHf2l76kCQ1Z9IwycyvAafGlNcBO8vyTuDGjvoj2XYQWBIRlwLXAgcy81RmngYOAGvLtosy8+uZmcAjY451Ln1IkhqysMf9+jLzJEBmnoyID5f6MuBER7vhUpuoPtyl3ksfJ8cOMiI20T57oa+vj1ardW6zHB3IYth8xdme9q3V65hrjYyM
NNZ3U5zz/OCcZ0avYTKe6FLLHuq99PHXi5nbgG0AAwMDOTg4OMmhu3tg1x7uPTLdL9XUHL91sJF+W60Wvb5ec5Vznh+c88zo9Wmub41eWirf3yj1YWBFR7vlwOuT1Jd3qffShySpIb2GyV5g9ImsDcCejvpt5Ymr1cCZcqlqP3BNRFxcbrxfA+wv274bEavLU1y3jTnWufQhSWrIpNduIuIrwCCwNCKGaT+VdQ+wOyI2Aq8BN5fm+4DrgSHgbeBTAJl5KiI+DzxT2n0uM0dv6t9O+4mxxcAT5Ytz7UOS1JxJwyQzbxln05oubRO4Y5zj7AB2dKkfBi7vUv/OufYhSWqG74CXJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUrWqMImI34uIFyPihYj4SkS8LyIui4hDEXE0Ih6LiEWl7QVlfahsX9lxnLtK/ZWIuLajvrbUhiJiS0e9ax+SpGb0HCYRsQz4R8BAZl4OLADWA18A7svMfuA0sLHsshE4nZkfAe4r7YiIVWW/jwJrgS9FxIKIWAA8CFwHrAJuKW2ZoA9JUgNqL3MtBBZHxELg/cBJ4OPA42X7TuDGsryurFO2r4mIKPVHM/MHmfkqMARcVb6GMvNYZr4DPAqsK/uM14ckqQE9h0lmfhP4V8BrtEPkDPAs8FZmni3NhoFlZXkZcKLse7a0v6SzPmaf8eqXTNCHJKkBC3vdMSIupn1WcRnwFvAntC9JjZWju4yzbbx6t6CbqH23MW4CNgH09fXRarW6NZtU32LYfMXZyRvOgF7HXGtkZKSxvpvinOcH5zwzeg4T4NeAVzPz2wAR8afA3waWRMTCcuawHHi9tB8GVgDD5bLYB4FTHfVRnft0q785QR/vkpnbgG0AAwMDOTg42NNEH9i1h3uP1LxUvTt+62Aj/bZaLXp9veYq5zw/OOeZUXPP5DVgdUS8v9zHWAO8BDwN3FTabAD2lOW9ZZ2y/anMzFJfX572ugzoB74BPAP0lye3FtG+Sb+37DNeH5KkBtTcMzlE+yb4XwBHyrG2AZ8F7oyIIdr3N7aXXbYDl5T6ncCWcpwXgd20g+irwB2Z+cNy1vFpYD/wMrC7tGWCPiRJDai6dpOZW4GtY8rHaD+JNbbt94GbxznO3cDdXer7gH1d6l37kCQ1w3fAS5KqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqlYVJhGxJCIej4i/ioiXI+KXI+JDEXEgIo6W7xeXthER90fEUEQ8HxFXdhxnQ2l/NCI2dNQ/FhFHyj73R0SUetc+JEnNqD0z+SLw1cz8m8DfAl4GtgBPZmY/8GRZB7gO6C9fm4CHoB0MwFbgauAqYGtHODxU2o7ut7bUx+tDktSAnsMkIi4CfhXYDpCZ72TmW8A6YGdpthO4sSyvAx7JtoPAkoi4FLgWOJCZpzLzNHAAWFu2XZSZX8/MBB4Zc6xufUiSGlBzZvKzwLeBfxcRz0XElyPiQqAvM08ClO8fLu2XASc69h8utYnqw13qTNCHJKkBCyv3vRL4TGYeiogvMvHlpuhSyx7qUxYRm2hfJqOvr49Wq3Uuu/9I32LYfMXZnvat1euYa42MjDTWd1Oc8/zgnGdGTZgMA8OZeaisP047TL4VEZdm5slyqeqNjvYrOvZfDrxe6oNj6q1SX96lPRP08S6ZuQ3YBjAwMJCDg4Pdmk3qgV17uPdIzUvVu+O3DjbSb6vVotfXa65yzvODc54ZPV/mysz/DZyIiJ8vpTXAS8BeYPSJrA3AnrK8F7itPNW1
GjhTLlHtB66JiIvLjfdrgP1l23cjYnV5iuu2Mcfq1ockqQG1v25/BtgVEYuAY8CnaAfU7ojYCLwG3Fza7gOuB4aAt0tbMvNURHweeKa0+1xmnirLtwMPA4uBJ8oXwD3j9CFJakBVmGTmXwIDXTat6dI2gTvGOc4OYEeX+mHg8i7173TrQ5LUDN8BL0mqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqVp1mETEgoh4LiL+rKxfFhGHIuJoRDwWEYtK/YKyPlS2r+w4xl2l/kpEXNtRX1tqQxGxpaPetQ9JUjOm48zkd4GXO9a/ANyXmf3AaWBjqW8ETmfmR4D7SjsiYhWwHvgosBb4UgmoBcCDwHXAKuCW0naiPiRJDagKk4hYDnwC+HJZD+DjwOOlyU7gxrK8rqxTtq8p7dcBj2bmDzLzVWAIuKp8DWXmscx8B3gUWDdJH5KkBiys3P/fAP8U+EBZvwR4KzPPlvVhYFlZXgacAMjMsxFxprRfBhzsOGbnPifG1K+epI93iYhNwCaAvr4+Wq3Wuc8Q6FsMm684O3nDGdDrmGuNjIw01ndTnPP84JxnRs9hEhG/DryRmc9GxOBouUvTnGTbePVuZ00Ttf/rxcxtwDaAgYGBHBwc7NZsUg/s2sO9R2pztzfHbx1spN9Wq0Wvr9dc5ZznB+c8M2r+hfwV4IaIuB54H3AR7TOVJRGxsJw5LAdeL+2HgRXAcEQsBD4InOqoj+rcp1v9zQn6kCQ1oOd7Jpl5V2Yuz8yVtG+gP5WZtwJPAzeVZhuAPWV5b1mnbH8qM7PU15envS4D+oFvAM8A/eXJrUWlj71ln/H6kCQ1YCbeZ/JZ4M6IGKJ9f2N7qW8HLin1O4EtAJn5IrAbeAn4KnBHZv6wnHV8GthP+2mx3aXtRH1IkhowLTcCMrMFtMryMdpPYo1t833g5nH2vxu4u0t9H7CvS71rH5KkZvgOeElSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRV6zlMImJFRDwdES9HxIsR8bul/qGIOBARR8v3i0s9IuL+iBiKiOcj4sqOY20o7Y9GxIaO+sci4kjZ5/6IiIn6kCQ1o+bM5CywOTN/AVgN3BERq4AtwJOZ2Q88WdYBrgP6y9cm4CFoBwOwFbgauArY2hEOD5W2o/utLfXx+pAkNaDnMMnMk5n5F2X5u8DLwDJgHbCzNNsJ3FiW1wGPZNtBYElEXApcCxzIzFOZeRo4AKwt2y7KzK9nZgKPjDlWtz4kSQ2YlnsmEbES+CXgENCXmSehHTjAh0uzZcCJjt2GS22i+nCXOhP0IUlqwMLaA0TETwH/AfjHmfl/ym2Nrk271LKH+rmMbRPty2T09fXRarXOZfcf6VsMm68429O+tXodc62RkZHG+m6Kc54fnPPMqAqTiPhJ2kGyKzP/tJS/FRGXZubJcqnqjVIfBlZ07L4ceL3UB8fUW6W+vEv7ifp4l8zcBmwDGBgYyMHBwW7NJvXArj3ce6Q6d3ty/NbBRvpttVr0+nrNVc55fnDOM6Pmaa4AtgMvZ+a/7ti0Fxh9ImsDsKejflt5qms1cKZcotoPXBMRF5cb79cA+8u270bE6tLXbWOO1a0PSVIDan7d/hXgN4EjEfGXpfbPgHuA3RGxEXgNuLls2wdcDwwBbwOfAsjMUxHxeeCZ0u5zmXmqLN8OPAwsBp4oX0zQhySpAT2HSWb+V7rf1wBY06V9AneMc6wdwI4u9cPA5V3q3+nWhySpGc3cCNB578g3z/DJLX/eSN/H7/lE
I/1K6p1hcp5b2dA/6JuvaKRbSXOUn80lSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqvkR9NJ5oKn/auDhtRc20q/eezwzkSRV88xEKpr83yWluc4zE0lSNcNEklTNMJEkVTNMJEnVvAEvzWNNPXRw/J5PzHqfmllzOkwiYi3wRWAB8OXMvKfhIUk6zxmgM2POhklELAAeBP4eMAw8ExF7M/OlZkcmaTJNvUkTYPMVjXX9njZnwwS4ChjKzGMAEfEosA4wTCSdd5oM0Nn4pIO5HCbLgBMd68PA1Q2NRdOoqb90/sYq9S4ys+kx9CQibgauzczfLuu/CVyVmZ/paLMJ2FRWfx54pcfulgJvVgx3LnLO84Nznh9q5vwzmfnTkzWay2cmw8CKjvXlwOudDTJzG7CttqOIOJyZA7XHmUuc8/zgnOeH2ZjzXH6fyTNAf0RcFhGLgPXA3obHJEnz0pw9M8nMsxHxaWA/7UeDd2Tmiw0PS5LmpTkbJgCZuQ/YNwtdVV8qm4Oc8/zgnOeHGZ/znL0BL0k6f8zleyaSpPOEYdIhItZGxCsRMRQRW7psvyAiHivbD0XEytkf5fSawpzvjIiXIuL5iHgyIn6miXFOp8nm3NHupojIiJjzT/5MZc4R8ffLz/rFiPj3sz3G6TaFP9t/IyKejojnyp/v65sY53SJiB0R8UZEvDDO9oiI+8vr8XxEXDmtA8hMv9qX+hYA/xP4WWAR8N+BVWPa/EPgD8vyeuCxpsc9C3P+u8D7y/Lt82HOpd0HgK8BB4GBpsc9Cz/nfuA54OKy/uGmxz0Lc94G3F6WVwHHmx535Zx/FbgSeGGc7dcDTwABrAYOTWf/npn82I8+niUz3wFGP56l0zpgZ1l+HFgTETGLY5xuk845M5/OzLfL6kHa7+eZy6bycwb4PPAvge/P5uBmyFTm/DvAg5l5GiAz35jlMU63qcw5gYvK8gcZ8z61uSYzvwacmqDJOuCRbDsILImIS6erf8Pkx7p9PMuy8dpk5lngDHDJrIxuZkxlzp020v7NZi6bdM4R8UvAisz8s9kc2Ayays/554Cfi4j/FhEHyydyz2VTmfM/B34jIoZpPxX6Gd7bzvXv+zmZ048GT7NuZxhjH3WbSpu5ZMrziYjfAAaAvzOjI5p5E845In4CuA/45GwNaBZM5ee8kPalrkHaZ5//JSIuz8y3ZnhsM2Uqc74FeDgz742IXwb+uMz5/8388Boxo/9+eWbyY5N+PEtnm4hYSPvUeKLTyvPdVOZMRPwa8PvADZn5g1ka20yZbM4fAC4HWhFxnPa15b1z/Cb8VP9s78nM/5uZr9L+HLv+WRrfTJjKnDcCuwEy8+vA+2h/htV71ZT+vvfKMPmxqXw8y15gQ1m+CXgqy52tOWrSOZdLPn9EO0jm+nV0mGTOmXkmM5dm5srMXEn7PtENmXm4meFOi6n82f5PtB+2ICKW0r7sdWxWRzm9pjLn14A1ABHxC7TD5NuzOsrZtRe4rTzVtRo4k5knp+vgXuYqcpyPZ4mIzwGHM3MvsJ32qfAQ7TOS9c2NuN4U5/wHwE8Bf1KeNXgtM29obNCVpjjn95Qpznk/cE1EvAT8EPgnmfmd5kZdZ4pz3gz824j4PdqXez45l385jIiv0L5MubTcB9oK/CRAZv4h7ftC1wNDwNvAp6a1/zn82kmSzhNe5pIkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVO3/A7VGdCyrFXx3AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "train_df.target.hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1804874, 3)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 195
    },
    "colab_type": "code",
    "id": "jz3TWXNiQL5X",
    "outputId": "6fbcf6c1-526d-4c5a-dc3f-6e07e49e4431"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>comment_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7000000</td>\n",
       "      <td>Jeff Sessions is another one of Trump's Orwell...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7000001</td>\n",
       "      <td>I actually inspected the infrastructure on Gra...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7000002</td>\n",
       "      <td>No it won't . That's just wishful thinking on ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7000003</td>\n",
       "      <td>Instead of wringing our hands and nibbling the...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7000004</td>\n",
       "      <td>how many of you commenters have garbage piled ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        id                                       comment_text\n",
       "0  7000000  Jeff Sessions is another one of Trump's Orwell...\n",
       "1  7000001  I actually inspected the infrastructure on Gra...\n",
       "2  7000002  No it won't . That's just wishful thinking on ...\n",
       "3  7000003  Instead of wringing our hands and nibbling the...\n",
       "4  7000004  how many of you commenters have garbage piled ..."
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(97320, 2)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = train_df.sample(100000, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100000, 3)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "P6inc2DWQTzT"
   },
   "outputs": [],
   "source": [
    "def tokenize(text):\n",
    "    \"\"\"Return `text` as a single space-joined string of stemmed tokens.\n",
    "\n",
    "    The text is split with NLTK's `word_tokenize`. Tokens made up only of\n",
    "    punctuation characters and tokens found in `stop_words` are dropped; every\n",
    "    remaining token is passed through the module-level stemmer `stem`.\n",
    "    \"\"\"\n",
    "    tokens = []\n",
    "    for token in word_tokenize(text):\n",
    "        # `token in string.punctuation` is a substring test and misses multi-character\n",
    "        # punctuation tokens such as \"...\"; check every character instead.\n",
    "        if all(char in string.punctuation for char in token): continue\n",
    "        # The stop-word list is lowercase, so compare case-insensitively; otherwise\n",
    "        # capitalised stop words (\"What\", \"The\", ...) slip through.\n",
    "        if token.lower() in stop_words: continue\n",
    "        tokens.append(stem.stem(token))\n",
    "\n",
    "    return \" \".join(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n",
      "[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.8s\n",
      "[Parallel(n_jobs=-1)]: Done 8088 tasks      | elapsed:    8.3s\n",
      "[Parallel(n_jobs=-1)]: Done 31088 tasks      | elapsed:   20.7s\n",
      "[Parallel(n_jobs=-1)]: Done 63288 tasks      | elapsed:   37.9s\n",
      "[Parallel(n_jobs=-1)]: Done 100000 out of 100000 | elapsed:   57.5s finished\n"
     ]
    }
   ],
   "source": [
    "# Run tokenize() over every training comment in parallel (n_jobs=-1).\n",
    "train_tokens = Parallel(n_jobs=-1, verbose=1)(\n",
    "    map(delayed(tokenize), train_df['comment_text'].tolist())\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'what breath fresh air someon embrac common sens valu instead leadership canada clear differ page perhap read differ book'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_tokens[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n",
      "[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    0.2s\n",
      "[Parallel(n_jobs=-1)]: Done 9952 tasks      | elapsed:    5.5s\n",
      "[Parallel(n_jobs=-1)]: Done 27452 tasks      | elapsed:   15.0s\n",
      "[Parallel(n_jobs=-1)]: Done 51952 tasks      | elapsed:   27.7s\n",
      "[Parallel(n_jobs=-1)]: Done 83452 tasks      | elapsed:   43.9s\n",
      "[Parallel(n_jobs=-1)]: Done 97320 out of 97320 | elapsed:   50.8s finished\n"
     ]
    }
   ],
   "source": [
    "# Run tokenize() over every test comment in parallel (n_jobs=-1).\n",
    "test_tokens = Parallel(n_jobs=-1, verbose=1)(\n",
    "    map(delayed(tokenize), test_df['comment_text'].tolist())\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "197320"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train_tokens + test_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "3vteOVvNQbf-"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,\n",
       "        stop_words=None, strip_accents=None, sublinear_tf=False,\n",
       "        token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n",
       "        vocabulary=None)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# fit() returns the vectorizer itself, so construct and fit in one expression; the\n",
    "# bare name on the last line keeps the fitted estimator as the cell's displayed result.\n",
    "vect = TfidfVectorizer().fit(train_tokens + test_tokens)\n",
    "vect"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "rGKVqvTrQmLJ"
   },
   "outputs": [],
   "source": [
    "# TF-IDF matrix of the training comments and the `target` column to regress on.\n",
    "X, y = vect.transform(train_tokens), train_df['target']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "oCtm_U8xQm_C"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,\n",
       "           max_features='auto', max_leaf_nodes=None,\n",
       "           min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "           min_samples_leaf=1, min_samples_split=2,\n",
       "           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,\n",
       "           oob_score=False, random_state=42, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# fit() returns the estimator, so it can be chained onto the constructor; the bare\n",
    "# name on the last line keeps the fitted model as the cell's displayed result.\n",
    "reg = RandomForestRegressor(\n",
    "    n_estimators=100,\n",
    "    max_depth=10,\n",
    "    n_jobs=-1,\n",
    "    random_state=42,\n",
    ").fit(X, y)\n",
    "reg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vectorise the test comments with the already-fitted TF-IDF model, then predict.\n",
    "test_X = vect.transform(test_tokens)\n",
    "test_y = reg.predict(test_X)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Untitled0.ipynb",
   "provenance": [],
   "version": "0.3.2"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}