{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Project 3\n",
"\n",
"\n",
"# Conversations Toxicity Detection\n",
"\n",
"Jigsaw Unintended Bias in Toxicity Classification \n",
"\n",
"Detect toxicity across a diverse range of conversations\n",
"\n",
"\n",
"https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/data#\n",
"\n",
"# Sample Solution\n",
"\n",
"### Install the Kaggle API and download the datasets"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "94JWSTZMH0IB"
},
"outputs": [],
"source": [
"!pip install -U -q kaggle\n",
"!echo {\"username\":\"albahnsen\",\"key\":\"1c9e16f2c8e4de73bb6b7db12d17e22d\"} > C:\\Users\\albah\\.kaggle\\kaggle.json"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 84
},
"colab_type": "code",
"id": "0eQDyQmnKGYh",
"outputId": "693bfd9a-0e4f-4dd3-fe11-8dbf6d922f4c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
]
}
],
"source": [
"!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -f test.csv -p ../datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -f train.csv -p ../datasets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model with TF-IDF and Ranfom Forest"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "CWro5KSCPAHG"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"import string\n",
"from joblib import Parallel, delayed\n",
"from tqdm import tqdm_notebook as tqdm\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 84
},
"colab_type": "code",
"id": "9GO20asAPqte",
"outputId": "d7956043-e722-4eea-aee5-ec2befb20f2c"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\albah\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\albah\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"import nltk\n",
"nltk.download('stopwords')\n",
"nltk.download('punkt')\n",
"from nltk.corpus import stopwords \n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.stem import SnowballStemmer\n",
"stop_words = set(stopwords.words('english'))\n",
"stem = SnowballStemmer('english')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "8GbZh9o2PauG"
},
"outputs": [],
"source": [
"train_df = pd.read_csv(\"../datasets/train.csv.zip\")\n",
"train_df = train_df[['id','comment_text', 'target']]\n",
"test_df = pd.read_csv(\"../datasets/test.csv.zip\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 195
},
"colab_type": "code",
"id": "MmwNcTzXPw12",
"outputId": "6fe5ad8e-2d84-44ab-9792-56285f1e7206"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" comment_text | \n",
" target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 59848 | \n",
" This is so cool. It's like, 'would you want yo... | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 1 | \n",
" 59849 | \n",
" Thank you!! This would make my life a lot less... | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 2 | \n",
" 59852 | \n",
" This is such an urgent design problem; kudos t... | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 3 | \n",
" 59855 | \n",
" Is this something I'll be able to install on m... | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 4 | \n",
" 59856 | \n",
" haha you guys are a bunch of losers. | \n",
" 0.893617 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id comment_text target\n",
"0 59848 This is so cool. It's like, 'would you want yo... 0.000000\n",
"1 59849 Thank you!! This would make my life a lot less... 0.000000\n",
"2 59852 This is such an urgent design problem; kudos t... 0.000000\n",
"3 59855 Is this something I'll be able to install on m... 0.000000\n",
"4 59856 haha you guys are a bunch of losers. 0.893617"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 286
},
"colab_type": "code",
"id": "qtWj7h8gQOkF",
"outputId": "20e09c89-3264-424f-f307-8b3532c2abee"
},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAD8CAYAAACyyUlaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAFjpJREFUeJzt3X+QXeV93/H3N1KEZWIsjOIdRlIjMt6kkSGd4B1Qmpl0a6UgcAbxB3TEkCA7SjSl2E2DprVo/lDHLjO4KaWGwSSqpSIyqkGhaaVJhFUNcMdtx5IRIUX8CNVWqGiNaowlVK8Zm8r99o/7rH3Z3P2h++zu0bLv18zOnvM9zznP89yV9Nnz415FZiJJUo2faHoAkqS5zzCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklRtYdMDmC1Lly7NlStX9rTv9773PS688MLpHdB5zjnPD855fqiZ87PPPvtmZv70ZO3mTZisXLmSw4cP97Rvq9VicHBwegd0nnPO84Nznh9q5hwR/2sq7bzMJUmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSao2b94BX+PIN8/wyS1/3kjfx+/5RCP9StK58MxEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFWbNEwiYkdEvBERL3TU/iAi/ioino+I/xgRSzq23RURQxHxSkRc21FfW2pDEbGlo35ZRByKiKMR8VhELCr1C8r6UNm+crI+JEnNmMqZycPA2jG1A8DlmfmLwP8A7gKIiFXAeuCjZZ8vRcSCiFgAPAhcB6wCbiltAb4A3JeZ/cBpYGOpbwROZ+ZHgPtKu3H7OMd5S5Km0aRhkplfA06Nqf3nzDxbVg8Cy8vyOuDRzPxBZr4KDAFXla+hzDyWme8AjwLrIiKAjwOPl/13Ajd2HGtnWX4cWFPaj9eHJKkh03HP5LeAJ8ryMuBEx7bhUhuvfgnwVkcwjdbfdayy/UxpP96xJEkNqfo4lYj4feAssGu01KVZ0j20coL2Ex1ron3Gjm8TsAmgr6+PVqvVrdmk+hbD5ivOTt5wBvQ65lojIyON9d0U5zw/OOeZ0XOYRMQG4NeBNZk5+o/5MLCio9ly4PWy3K3+JrAkIhaWs4/O9qPHGo6IhcAHaV9um6iPd8nMbcA2gIGBgRwcHDz3iQIP7NrDvUea+Riz47cONtJvq9Wi19drrnLO84Nznhk9XeaKiLXAZ4EbMvPtjk17gfXlSazLgH7gG8AzQH95cmsR7Rvoe0sIPQ3cVPbfAOzpONaGsnwT8FRpP14fkqSGTPrrdkR8BRgElkbEMLCV9tNbFwAH2vfEOZiZ/yAzX4yI3cBLtC9/3ZGZPyzH+TSwH1gA7MjMF0sXnwUejYh/ATwHbC/17cAfR8QQ7TOS9QAT9SFJasakYZKZt3Qpb+9SG21/N3B3l/o+YF+X+jG6PI2Vmd8Hbj6XPiRJzfAd8JKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqpNGiYRsSMi3oiIFzpqH4qIAxFxtHy/uNQjIu6PiKGIeD4iruzYZ0NpfzQiNnTUPxYRR8o+90dE9NqHJKkZUzkzeRhYO6a2BXgyM/uBJ8s6wHVAf/naBDwE7WAAtgJXA1cBW0fDobTZ1LHf2l76kCQ1Z9IwycyvAafGlNcBO8vyTuDGjvoj2XYQWBIRlwLXAgcy81RmngYOAGvLtosy8+uZmcAjY451Ln1IkhqysMf9+jLzJEBmnoyID5f6MuBER7vhUpuoPtyl3ksfJ8cOMiI20T57oa+vj1ardW6zHB3IYth8xdme9q3V65hrjYyMNNZ3U5zz/OCcZ0avYTKe6FLLHuq99PHXi5nbgG0AAwMDOTg4OMmhu3tg1x7uPTLdL9XUHL91sJF+W60Wvb5ec5Vznh+c88zo9Wmub41eWirf3yj1YWBFR7vlwOuT1Jd3qffShySpIb2GyV5g9ImsDcCejvpt5Ymr1cCZcqlqP3BNRFxcbrxfA+wv274bEavLU1y3jTnWufQhSWrIpNduIuIrwCCwNCKGaT+VdQ+wOyI2Aq8BN5fm+4DrgSHgbeBTAJl5KiI+DzxT2n0uM0dv6t9O+4mxxcAT5Ytz7UOS1JxJwyQzbxln05oubRO4Y5zj7AB2dKkfBi7vUv/OufYhSWqG74CXJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUrWqMImI34uIFyPihYj4SkS8LyIui4hDEXE0Ih6LiEWl7QVlfahsX9lxnLtK/ZWIuLajvrbUhiJiS0e9ax+SpGb0HCYRsQz4R8BAZl4OLADWA18A7svMfuA0sLHsshE4nZkfAe4r7YiIVWW/jwJrgS9FxIKIWAA8CFwHrAJuKW2ZoA9JUgNqL3MtBBZHxELg/cBJ4OPA42X7TuDGsryurFO2r4mIKPVHM/MHmfkqMARcVb6GMvNYZr4DPAqsK/uM14ckqQE9h0lmfhP4V8BrtEPkDPAs8FZmni3NhoFlZXkZcKLse7a0v6SzPmaf8eqXTNCHJKkBC3vdMSIupn1WcRnwFvAntC9JjZWju4yzbbx6t6CbqH23MW4CNgH09fXRarW6NZtU32LYfMXZyRvOgF7HXGtkZKSxvpvinOcH5zwzeg4T4NeAVzPz2wAR8afA3waWRMTCcuawHHi9tB8GVgDD5bLYB4FTHfVRnft0q785QR/vkpnbgG0AAwMDOTg42NNEH9i1h3uP1LxUvTt+62Aj/bZaLXp9veYq5zw/OOeZUXPP5DVgdUS8v9zHWAO8BDwN3FTabAD2lOW9ZZ2y/anMzFJfX572ugzoB74BPAP0lye3FtG+Sb+37DNeH5KkBtTcMzlE+yb4XwBHyrG2AZ8F7oyIIdr3N7aXXbYDl5T6ncCWcpwXgd20g+irwB2Z+cNy1vFpYD/wMrC7tGWCPiRJDai6dpOZW4GtY8rHaD+JNbbt94GbxznO3cDdXer7gH1d6l37kCQ1w3fAS5KqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqlYVJhGxJCIej4i/ioiXI+KXI+JDEXEgIo6W7xeXthER90fEUEQ8HxFXdhxnQ2l/NCI2dNQ/FhFHyj73R0SUetc+JEnNqD0z+SLw1cz8m8DfAl4GtgBPZmY/8GRZB7gO6C9fm4CHoB0MwFbgauAqYGtHODxU2o7ut7bUx+tDktSAnsMkIi4CfhXYDpCZ72TmW8A6YGdpthO4sSyvAx7JtoPAkoi4FLgWOJCZpzLzNHAAWFu2XZSZX8/MBB4Zc6xufUiSGlBzZvKzwLeBfxcRz0XElyPiQqAvM08ClO8fLu2XASc69h8utYnqw13qTNCHJKkBCyv3vRL4TGYeiogvMvHlpuhSyx7qUxYRm2hfJqOvr49Wq3Uuu/9I32LYfMXZnvat1euYa42MjDTWd1Oc8/zgnGdGTZgMA8OZeaisP047TL4VEZdm5slyqeqNjvYrOvZfDrxe6oNj6q1SX96lPRP08S6ZuQ3YBjAwMJCDg4Pdmk3qgV17uPdIzUvVu+O3DjbSb6vVotfXa65yzvODc54ZPV/mysz/DZyIiJ8vpTXAS8BeYPSJrA3AnrK8F7itPNW1GjhTLlHtB66JiIvLjfdrgP1l23cjYnV5iuu2Mcfq1ockqQG1v25/BtgVEYuAY8CnaAfU7ojYCLwG3Fza7gOuB4aAt0tbMvNURHweeKa0+1xmnirLtwMPA4uBJ8oXwD3j9CFJakBVmGTmXwIDXTat6dI2gTvGOc4OYEeX+mHg8i7173TrQ5LUDN8BL0mqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqVp1mETEgoh4LiL+rKxfFhGHIuJoRDwWEYtK/YKyPlS2r+w4xl2l/kpEXNtRX1tqQxGxpaPetQ9JUjOm48zkd4GXO9a/ANyXmf3AaWBjqW8ETmfmR4D7SjsiYhWwHvgosBb4UgmoBcCDwHXAKuCW0naiPiRJDagKk4hYDnwC+HJZD+DjwOOlyU7gxrK8rqxTtq8p7dcBj2bmDzLzVWAIuKp8DWXmscx8B3gUWDdJH5KkBiys3P/fAP8U+EBZvwR4KzPPlvVhYFlZXgacAMjMsxFxprRfBhzsOGbnPifG1K+epI93iYhNwCaAvr4+Wq3Wuc8Q6FsMm684O3nDGdDrmGuNjIw01ndTnPP84JxnRs9hEhG/DryRmc9GxOBouUvTnGTbePVuZ00Ttf/rxcxtwDaAgYGBHBwc7NZsUg/s2sO9R2pztzfHbx1spN9Wq0Wvr9dc5ZznB+c8M2r+hfwV4IaIuB54H3AR7TOVJRGxsJw5LAdeL+2HgRXAcEQsBD4InOqoj+rcp1v9zQn6kCQ1oOd7Jpl5V2Yuz8yVtG+gP5WZtwJPAzeVZhuAPWV5b1mnbH8qM7PU15envS4D+oFvAM8A/eXJrUWlj71ln/H6kCQ1YCbeZ/JZ4M6IGKJ9f2N7qW8HLin1O4EtAJn5IrAbeAn4KnBHZv6wnHV8GthP+2mx3aXtRH1IkhowLTcCMrMFtMryMdpPYo1t833g5nH2vxu4u0t9H7CvS71rH5KkZvgOeElSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRV6zlMImJFRDwdES9HxIsR8bul/qGIOBARR8v3i0s9IuL+iBiKiOcj4sqOY20o7Y9GxIaO+sci4kjZ5/6IiIn6kCQ1o+bM5CywOTN/AVgN3BERq4AtwJOZ2Q88WdYBrgP6y9cm4CFoBwOwFbgauArY2hEOD5W2o/utLfXx+pAkNaDnMMnMk5n5F2X5u8DLwDJgHbCzNNsJ3FiW1wGPZNtBYElEXApcCxzIzFOZeRo4AKwt2y7KzK9nZgKPjDlWtz4kSQ2YlnsmEbES+CXgENCXmSehHTjAh0uzZcCJjt2GS22i+nCXOhP0IUlqwMLaA0TETwH/AfjHmfl/ym2Nrk271LKH+rmMbRPty2T09fXRarXOZfcf6VsMm68429O+tXodc62RkZHG+m6Kc54fnPPMqAqTiPhJ2kGyKzP/tJS/FRGXZubJcqnqjVIfBlZ07L4ceL3UB8fUW6W+vEv7ifp4l8zcBmwDGBgYyMHBwW7NJvXArj3ce6Q6d3ty/NbBRvpttVr0+nrNVc55fnDOM6Pmaa4AtgMvZ+a/7ti0Fxh9ImsDsKejflt5qms1cKZcotoPXBMRF5cb79cA+8u270bE6tLXbWOO1a0PSVIDan7d/hXgN4EjEfGXpfbPgHuA3RGxEXgNuLls2wdcDwwBbwOfAsjMUxHxeeCZ0u5zmXmqLN8OPAwsBp4oX0zQhySpAT2HSWb+V7rf1wBY06V9AneMc6wdwI4u9cPA5V3q3+nWhySpGc3cCNB578g3z/DJLX/eSN/H7/lEI/1K6p1hcp5b2dA/6JuvaKRbSXOUn80lSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqvkR9NJ5oKn/auDhtRc20q/eezwzkSRV88xEKpr83yWluc4zE0lSNcNEklTNMJEkVTNMJEnVvAEvzWNNPXRw/J5PzHqfmllzOkwiYi3wRWAB8OXMvKfhIUk6zxmgM2POhklELAAeBP4eMAw8ExF7M/OlZkcmaTJNvUkTYPMVjXX9njZnwwS4ChjKzGMAEfEosA4wTCSdd5oM0Nn4pIO5HCbLgBMd68PA1Q2NRdOoqb90/sYq9S4ys+kx9CQibgauzczfLuu/CVyVmZ/paLMJ2FRWfx54pcfulgJvVgx3LnLO84Nznh9q5vwzmfnTkzWay2cmw8CKjvXlwOudDTJzG7CttqOIOJyZA7XHmUuc8/zgnOeH2ZjzXH6fyTNAf0RcFhGLgPXA3obHJEnz0pw9M8nMsxHxaWA/7UeDd2Tmiw0PS5LmpTkbJgCZuQ/YNwtdVV8qm4Oc8/zgnOeHGZ/znL0BL0k6f8zleyaSpPOEYdIhItZGxCsRMRQRW7psvyAiHivbD0XEytkf5fSawpzvjIiXIuL5iHgyIn6miXFOp8nm3NHupojIiJjzT/5MZc4R8ffLz/rFiPj3sz3G6TaFP9t/IyKejojnyp/v65sY53SJiB0R8UZEvDDO9oiI+8vr8XxEXDmtA8hMv9qX+hYA/xP4WWAR8N+BVWPa/EPgD8vyeuCxpsc9C3P+u8D7y/Lt82HOpd0HgK8BB4GBpsc9Cz/nfuA54OKy/uGmxz0Lc94G3F6WVwHHmx535Zx/FbgSeGGc7dcDTwABrAYOTWf/npn82I8+niUz3wFGP56l0zpgZ1l+HFgTETGLY5xuk845M5/OzLfL6kHa7+eZy6bycwb4PPAvge/P5uBmyFTm/DvAg5l5GiAz35jlMU63qcw5gYvK8gcZ8z61uSYzvwacmqDJOuCRbDsILImIS6erf8Pkx7p9PMuy8dpk5lngDHDJrIxuZkxlzp020v7NZi6bdM4R8UvAisz8s9kc2Ayays/554Cfi4j/FhEHyydyz2VTmfM/B34jIoZpPxX6Gd7bzvXv+zmZ048GT7NuZxhjH3WbSpu5ZMrziYjfAAaAvzOjI5p5E845In4CuA/45GwNaBZM5ee8kPalrkHaZ5//JSIuz8y3ZnhsM2Uqc74FeDgz742IXwb+uMz5/8388Boxo/9+eWbyY5N+PEtnm4hYSPvUeKLTyvPdVOZMRPwa8PvADZn5g1ka20yZbM4fAC4HWhFxnPa15b1z/Cb8VP9s78nM/5uZr9L+HLv+WRrfTJjKnDcCuwEy8+vA+2h/htV71ZT+vvfKMPmxqXw8y15gQ1m+CXgqy52tOWrSOZdLPn9EO0jm+nV0mGTOmXkmM5dm5srMXEn7PtENmXm4meFOi6n82f5PtB+2ICKW0r7sdWxWRzm9pjLn14A1ABHxC7TD5NuzOsrZtRe4rTzVtRo4k5knp+vgXuYqcpyPZ4mIzwGHM3MvsJ32qfAQ7TOS9c2NuN4U5/wHwE8Bf1KeNXgtM29obNCVpjjn95Qpznk/cE1EvAT8EPgnmfmd5kZdZ4pz3gz824j4PdqXez45l385jIiv0L5MubTcB9oK/CRAZv4h7ftC1wNDwNvAp6a1/zn82kmSzhNe5pIkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVO3/A7VGdCyrFXx3AAAAAElFTkSuQmCC\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"train_df.target.hist()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1804874, 3)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 195
},
"colab_type": "code",
"id": "jz3TWXNiQL5X",
"outputId": "6fbcf6c1-526d-4c5a-dc3f-6e07e49e4431"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" comment_text | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 7000000 | \n",
" Jeff Sessions is another one of Trump's Orwell... | \n",
"
\n",
" \n",
" 1 | \n",
" 7000001 | \n",
" I actually inspected the infrastructure on Gra... | \n",
"
\n",
" \n",
" 2 | \n",
" 7000002 | \n",
" No it won't . That's just wishful thinking on ... | \n",
"
\n",
" \n",
" 3 | \n",
" 7000003 | \n",
" Instead of wringing our hands and nibbling the... | \n",
"
\n",
" \n",
" 4 | \n",
" 7000004 | \n",
" how many of you commenters have garbage piled ... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id comment_text\n",
"0 7000000 Jeff Sessions is another one of Trump's Orwell...\n",
"1 7000001 I actually inspected the infrastructure on Gra...\n",
"2 7000002 No it won't . That's just wishful thinking on ...\n",
"3 7000003 Instead of wringing our hands and nibbling the...\n",
"4 7000004 how many of you commenters have garbage piled ..."
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(97320, 2)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"train_df = train_df.sample(100000, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(100000, 3)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create tokens"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "P6inc2DWQTzT"
},
"outputs": [],
"source": [
"def tokenize(text):\n",
" \n",
" tokens = []\n",
" for token in word_tokenize(text):\n",
" if token in string.punctuation: continue\n",
" if token in stop_words: continue\n",
" tokens.append(stem.stem(token))\n",
" \n",
" return \" \".join(tokens)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n",
"[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 3.8s\n",
"[Parallel(n_jobs=-1)]: Done 8088 tasks | elapsed: 8.3s\n",
"[Parallel(n_jobs=-1)]: Done 31088 tasks | elapsed: 20.7s\n",
"[Parallel(n_jobs=-1)]: Done 63288 tasks | elapsed: 37.9s\n",
"[Parallel(n_jobs=-1)]: Done 100000 out of 100000 | elapsed: 57.5s finished\n"
]
}
],
"source": [
"train_tokens = Parallel(n_jobs=-1, verbose=1)(delayed(tokenize)(text) for text in train_df['comment_text'].tolist())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'what breath fresh air someon embrac common sens valu instead leadership canada clear differ page perhap read differ book'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_tokens[0]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n",
"[Parallel(n_jobs=-1)]: Done 152 tasks | elapsed: 0.2s\n",
"[Parallel(n_jobs=-1)]: Done 9952 tasks | elapsed: 5.5s\n",
"[Parallel(n_jobs=-1)]: Done 27452 tasks | elapsed: 15.0s\n",
"[Parallel(n_jobs=-1)]: Done 51952 tasks | elapsed: 27.7s\n",
"[Parallel(n_jobs=-1)]: Done 83452 tasks | elapsed: 43.9s\n",
"[Parallel(n_jobs=-1)]: Done 97320 out of 97320 | elapsed: 50.8s finished\n"
]
}
],
"source": [
"test_tokens = Parallel(n_jobs=-1, verbose=1)(delayed(tokenize)(text) for text in test_df['comment_text'].tolist())"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"197320"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(train_tokens + test_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "3vteOVvNQbf-"
},
"outputs": [
{
"data": {
"text/plain": [
"TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,\n",
" stop_words=None, strip_accents=None, sublinear_tf=False,\n",
" token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n",
" vocabulary=None)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vect = TfidfVectorizer()\n",
"vect.fit(train_tokens + test_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "rGKVqvTrQmLJ"
},
"outputs": [],
"source": [
"X = vect.transform(train_tokens)\n",
"y = train_df['target']"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "oCtm_U8xQm_C"
},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,\n",
" max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,\n",
" oob_score=False, random_state=42, verbose=0, warm_start=False)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"reg = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42, max_depth=10)\n",
"reg.fit(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"test_X = vect.transform(test_tokens)\n",
"test_y = reg.predict(test_X)"
]
}
],
"metadata": {
"colab": {
"name": "Untitled0.ipynb",
"provenance": [],
"version": "0.3.2"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}