{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Project 3\n", "\n", "\n", "# Conversations Toxicity Detection\n", "\n", "Jigsaw Unintended Bias in Toxicity Classification \n", "\n", "Detect toxicity across a diverse range of conversations\n", "\n", "\n", "https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/data#\n", "\n", "# Sample Solution\n", "\n", "### Install the Kaggle API and download the datasets" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": {}, "colab_type": "code", "id": "94JWSTZMH0IB" }, "outputs": [], "source": [ "!pip install -U -q kaggle\n", "!echo {\"username\":\"albahnsen\",\"key\":\"1c9e16f2c8e4de73bb6b7db12d17e22d\"} > C:\\Users\\albah\\.kaggle\\kaggle.json" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 84 }, "colab_type": "code", "id": "0eQDyQmnKGYh", "outputId": "693bfd9a-0e4f-4dd3-fe11-8dbf6d922f4c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)\n" ] } ], "source": [ "!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -f test.csv -p ../datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -f train.csv -p ../datasets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model with TF-IDF and Ranfom Forest" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": {}, "colab_type": "code", "id": "CWro5KSCPAHG" }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.ensemble import RandomForestRegressor\n", "import string\n", "from joblib import Parallel, delayed\n", "from tqdm import tqdm_notebook as tqdm\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 84 }, "colab_type": "code", "id": "9GO20asAPqte", "outputId": "d7956043-e722-4eea-aee5-ec2befb20f2c" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\albah\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package punkt to\n", "[nltk_data] C:\\Users\\albah\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] } ], "source": [ "import nltk\n", "nltk.download('stopwords')\n", "nltk.download('punkt')\n", "from nltk.corpus import stopwords \n", "from nltk.tokenize import word_tokenize\n", "from nltk.stem import SnowballStemmer\n", "stop_words = set(stopwords.words('english'))\n", "stem = SnowballStemmer('english')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": {}, "colab_type": "code", "id": "8GbZh9o2PauG" }, "outputs": [], "source": [ "train_df = pd.read_csv(\"../datasets/train.csv.zip\")\n", "train_df = train_df[['id','comment_text', 'target']]\n", "test_df = pd.read_csv(\"../datasets/test.csv.zip\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 195 }, "colab_type": "code", "id": "MmwNcTzXPw12", "outputId": "6fe5ad8e-2d84-44ab-9792-56285f1e7206" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcomment_texttarget
059848This is so cool. It's like, 'would you want yo...0.000000
159849Thank you!! This would make my life a lot less...0.000000
259852This is such an urgent design problem; kudos t...0.000000
359855Is this something I'll be able to install on m...0.000000
459856haha you guys are a bunch of losers.0.893617
\n", "
" ], "text/plain": [ " id comment_text target\n", "0 59848 This is so cool. It's like, 'would you want yo... 0.000000\n", "1 59849 Thank you!! This would make my life a lot less... 0.000000\n", "2 59852 This is such an urgent design problem; kudos t... 0.000000\n", "3 59855 Is this something I'll be able to install on m... 0.000000\n", "4 59856 haha you guys are a bunch of losers. 0.893617" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 286 }, "colab_type": "code", "id": "qtWj7h8gQOkF", "outputId": "20e09c89-3264-424f-f307-8b3532c2abee" }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAD8CAYAAACyyUlaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAFjpJREFUeJzt3X+QXeV93/H3N1KEZWIsjOIdRlIjMt6kkSGd4B1Qmpl0a6UgcAbxB3TEkCA7SjSl2E2DprVo/lDHLjO4KaWGwSSqpSIyqkGhaaVJhFUNcMdtx5IRIUX8CNVWqGiNaowlVK8Zm8r99o/7rH3Z3P2h++zu0bLv18zOnvM9zznP89yV9Nnz415FZiJJUo2faHoAkqS5zzCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklRtYdMDmC1Lly7NlStX9rTv9773PS688MLpHdB5zjnPD855fqiZ87PPPvtmZv70ZO3mTZisXLmSw4cP97Rvq9VicHBwegd0nnPO84Nznh9q5hwR/2sq7bzMJUmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSao2b94BX+PIN8/wyS1/3kjfx+/5RCP9StK58MxEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFWbNEwiYkdEvBERL3TU/iAi/ioino+I/xgRSzq23RURQxHxSkRc21FfW2pDEbGlo35ZRByKiKMR8VhELCr1C8r6UNm+crI+JEnNmMqZycPA2jG1A8DlmfmLwP8A7gKIiFXAeuCjZZ8vRcSCiFgAPAhcB6wCbiltAb4A3JeZ/cBpYGOpbwROZ+ZHgPtKu3H7OMd5S5Km0aRhkplfA06Nqf3nzDxbVg8Cy8vyOuDRzPxBZr4KDAFXla+hzDyWme8AjwLrIiKAjwOPl/13Ajd2HGtnWX4cWFPaj9eHJKkh03HP5LeAJ8ryMuBEx7bhUhuvfgnwVkcwjdbfdayy/UxpP96xJEkNqfo4lYj4feAssGu01KVZ0j20coL2Ex1ron3Gjm8TsAmgr6+PVqvVrdmk+hbD5ivOTt5wBvQ65lojIyON9d0U5zw/OOeZ0XOYRMQG4NeBNZk5+o/5MLCio9ly4PWy3K3+JrAkIhaWs4/O9qPHGo6IhcAHaV9um6iPd8nMbcA2gIGBgRwcHDz3iQIP7NrDvUea+Riz47cONtJvq9Wi19drrnLO84Nznhk9XeaKiLXAZ4EbMvPtjk17gfXlSazLgH7gG8AzQH95cmsR7Rvoe0sIPQ3cVPbfAOzpONaGsnwT8FRpP14fkqSGTPrrdkR8BRgElkbEMLCV9tNbFwAH2vfEOZiZ/yAzX4yI3cBLtC9/3ZGZPyzH+TSwH1gA7MjMF0sXnwUejYh/ATwHbC/17cAfR8QQ7TOS9QAT9SFJasakYZKZt3Qpb+9SG21/N3B3l/o+YF+X+jG6PI2Vmd8Hbj6XPiRJzfAd8JKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqpNGiYRsSMi3oiIFzpqH4qIAxFxtHy/uNQjIu6PiKGIeD4iruzYZ0NpfzQiNnTUPxYRR8o+90dE9NqHJKkZUzkzeRhYO6a2BXgyM/uBJ8s6wHVAf/naBDwE7WAAtgJXA1cBW0fDobTZ1LHf2l76kCQ1Z9IwycyvAafGlNcBO8vyTuDGjvoj2XYQWBIRlwLXAgcy81RmngYOAGvLtosy8+uZmcAjY451Ln1IkhqysMf9+jLzJEBmnoyID5f6MuBER7vhUpuoPtyl3ksfJ8cOMiI20T57oa+vj1ardW6zHB3IYth8xdme9q3V65hrjYyMNNZ3U5zz/OCcZ0avYTKe6FLLHuq99PHXi5nbgG0AAwMDOTg4OMmhu3tg1x7uPTLdL9XUHL91sJF+W60Wvb5ec5Vznh+c88zo9Wmub41eWirf3yj1YWBFR7vlwOuT1Jd3qffShySpIb2GyV5g9ImsDcCejvpt5Ymr1cCZcqlqP3BNRFxcbrxfA+wv274bEavLU1y3jTnWufQhSWrIpNduIuIrwCCwNCKGaT+VdQ+wOyI2Aq8BN5fm+4DrgSHgbeBTAJl5KiI+DzxT2n0uM0dv6t9O+4mxxcAT5Ytz7UOS1JxJwyQzbxln05oubRO4Y5zj7AB2dKkfBi7vUv/OufYhSWqG74CXJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUrWqMImI34uIFyPihYj4SkS8LyIui4hDEXE0Ih6LiEWl7QVlfahsX9lxnLtK/ZWIuLajvrbUhiJiS0e9ax+SpGb0HCYRsQz4R8BAZl4OLADWA18A7svMfuA0sLHsshE4nZkfAe4r7YiIVWW/jwJrgS9FxIKIWAA8CFwHrAJuKW2ZoA9JUgNqL3MtBBZHxELg/cBJ4OPA42X7TuDGsryurFO2r4mIKPVHM/MHmfkqMARcVb6GMvNYZr4DPAqsK/uM14ckqQE9h0lmfhP4V8BrtEPkDPAs8FZmni3NhoFlZXkZcKLse7a0v6SzPmaf8eqXTNCHJKkBC3vdMSIupn1WcRnwFvAntC9JjZWju4yzbbx6t6CbqH23MW4CNgH09fXRarW6NZtU32LYfMXZyRvOgF7HXGtkZKSxvpvinOcH5zwzeg4T4NeAVzPz2wAR8afA3waWRMTCcuawHHi9tB8GVgDD5bLYB4FTHfVRnft0q785QR/vkpnbgG0AAwMDOTg42NNEH9i1h3uP1LxUvTt+62Aj/bZaLXp9veYq5zw/OOeZUXPP5DVgdUS8v9zHWAO8BDwN3FTabAD2lOW9ZZ2y/anMzFJfX572ugzoB74BPAP0lye3FtG+Sb+37DNeH5KkBtTcMzlE+yb4XwBHyrG2AZ8F7oyIIdr3N7aXXbYDl5T6ncCWcpwXgd20g+irwB2Z+cNy1vFpYD/wMrC7tGWCPiRJDai6dpOZW4GtY8rHaD+JNbbt94GbxznO3cDdXer7gH1d6l37kCQ1w3fAS5KqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqlYVJhGxJCIej4i/ioiXI+KXI+JDEXEgIo6W7xeXthER90fEUEQ8HxFXdhxnQ2l/NCI2dNQ/FhFHyj73R0SUetc+JEnNqD0z+SLw1cz8m8DfAl4GtgBPZmY/8GRZB7gO6C9fm4CHoB0MwFbgauAqYGtHODxU2o7ut7bUx+tDktSAnsMkIi4CfhXYDpCZ72TmW8A6YGdpthO4sSyvAx7JtoPAkoi4FLgWOJCZpzLzNHAAWFu2XZSZX8/MBB4Zc6xufUiSGlBzZvKzwLeBfxcRz0XElyPiQqAvM08ClO8fLu2XASc69h8utYnqw13qTNCHJKkBCyv3vRL4TGYeiogvMvHlpuhSyx7qUxYRm2hfJqOvr49Wq3Uuu/9I32LYfMXZnvat1euYa42MjDTWd1Oc8/zgnGdGTZgMA8OZeaisP047TL4VEZdm5slyqeqNjvYrOvZfDrxe6oNj6q1SX96lPRP08S6ZuQ3YBjAwMJCDg4Pdmk3qgV17uPdIzUvVu+O3DjbSb6vVotfXa65yzvODc54ZPV/mysz/DZyIiJ8vpTXAS8BeYPSJrA3AnrK8F7itPNW1GjhTLlHtB66JiIvLjfdrgP1l23cjYnV5iuu2Mcfq1ockqQG1v25/BtgVEYuAY8CnaAfU7ojYCLwG3Fza7gOuB4aAt0tbMvNURHweeKa0+1xmnirLtwMPA4uBJ8oXwD3j9CFJakBVmGTmXwIDXTat6dI2gTvGOc4OYEeX+mHg8i7173TrQ5LUDN8BL0mqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqhkmkqRqhokkqVp1mETEgoh4LiL+rKxfFhGHIuJoRDwWEYtK/YKyPlS2r+w4xl2l/kpEXNtRX1tqQxGxpaPetQ9JUjOm48zkd4GXO9a/ANyXmf3AaWBjqW8ETmfmR4D7SjsiYhWwHvgosBb4UgmoBcCDwHXAKuCW0naiPiRJDagKk4hYDnwC+HJZD+DjwOOlyU7gxrK8rqxTtq8p7dcBj2bmDzLzVWAIuKp8DWXmscx8B3gUWDdJH5KkBiys3P/fAP8U+EBZvwR4KzPPlvVhYFlZXgacAMjMsxFxprRfBhzsOGbnPifG1K+epI93iYhNwCaAvr4+Wq3Wuc8Q6FsMm684O3nDGdDrmGuNjIw01ndTnPP84JxnRs9hEhG/DryRmc9GxOBouUvTnGTbePVuZ00Ttf/rxcxtwDaAgYGBHBwc7NZsUg/s2sO9R2pztzfHbx1spN9Wq0Wvr9dc5ZznB+c8M2r+hfwV4IaIuB54H3AR7TOVJRGxsJw5LAdeL+2HgRXAcEQsBD4InOqoj+rcp1v9zQn6kCQ1oOd7Jpl5V2Yuz8yVtG+gP5WZtwJPAzeVZhuAPWV5b1mnbH8qM7PU15envS4D+oFvAM8A/eXJrUWlj71ln/H6kCQ1YCbeZ/JZ4M6IGKJ9f2N7qW8HLin1O4EtAJn5IrAbeAn4KnBHZv6wnHV8GthP+2mx3aXtRH1IkhowLTcCMrMFtMryMdpPYo1t833g5nH2vxu4u0t9H7CvS71rH5KkZvgOeElSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRVM0wkSdUME0lSNcNEklTNMJEkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVM0wkSRV6zlMImJFRDwdES9HxIsR8bul/qGIOBARR8v3i0s9IuL+iBiKiOcj4sqOY20o7Y9GxIaO+sci4kjZ5/6IiIn6kCQ1o+bM5CywOTN/AVgN3BERq4AtwJOZ2Q88WdYBrgP6y9cm4CFoBwOwFbgauArY2hEOD5W2o/utLfXx+pAkNaDnMMnMk5n5F2X5u8DLwDJgHbCzNNsJ3FiW1wGPZNtBYElEXApcCxzIzFOZeRo4AKwt2y7KzK9nZgKPjDlWtz4kSQ2YlnsmEbES+CXgENCXmSehHTjAh0uzZcCJjt2GS22i+nCXOhP0IUlqwMLaA0TETwH/AfjHmfl/ym2Nrk271LKH+rmMbRPty2T09fXRarXOZfcf6VsMm68429O+tXodc62RkZHG+m6Kc54fnPPMqAqTiPhJ2kGyKzP/tJS/FRGXZubJcqnqjVIfBlZ07L4ceL3UB8fUW6W+vEv7ifp4l8zcBmwDGBgYyMHBwW7NJvXArj3ce6Q6d3ty/NbBRvpttVr0+nrNVc55fnDOM6Pmaa4AtgMvZ+a/7ti0Fxh9ImsDsKejflt5qms1cKZcotoPXBMRF5cb79cA+8u270bE6tLXbWOO1a0PSVIDan7d/hXgN4EjEfGXpfbPgHuA3RGxEXgNuLls2wdcDwwBbwOfAsjMUxHxeeCZ0u5zmXmqLN8OPAwsBp4oX0zQhySpAT2HSWb+V7rf1wBY06V9AneMc6wdwI4u9cPA5V3q3+nWhySpGc3cCNB578g3z/DJLX/eSN/H7/lEI/1K6p1hcp5b2dA/6JuvaKRbSXOUn80lSapmmEiSqhkmkqRqhokkqZphIkmqZphIkqoZJpKkaoaJJKmaYSJJqmaYSJKqGSaSpGqGiSSpmmEiSapmmEiSqvkR9NJ5oKn/auDhtRc20q/eezwzkSRV88xEKpr83yWluc4zE0lSNcNEklTNMJEkVTNMJEnVvAEvzWNNPXRw/J5PzHqfmllzOkwiYi3wRWAB8OXMvKfhIUk6zxmgM2POhklELAAeBP4eMAw8ExF7M/OlZkcmaTJNvUkTYPMVjXX9njZnwwS4ChjKzGMAEfEosA4wTCSdd5oM0Nn4pIO5HCbLgBMd68PA1Q2NRdOoqb90/sYq9S4ys+kx9CQibgauzczfLuu/CVyVmZ/paLMJ2FRWfx54pcfulgJvVgx3LnLO84Nznh9q5vwzmfnTkzWay2cmw8CKjvXlwOudDTJzG7CttqOIOJyZA7XHmUuc8/zgnOeH2ZjzXH6fyTNAf0RcFhGLgPXA3obHJEnz0pw9M8nMsxHxaWA/7UeDd2Tmiw0PS5LmpTkbJgCZuQ/YNwtdVV8qm4Oc8/zgnOeHGZ/znL0BL0k6f8zleyaSpPOEYdIhItZGxCsRMRQRW7psvyAiHivbD0XEytkf5fSawpzvjIiXIuL5iHgyIn6miXFOp8nm3NHupojIiJjzT/5MZc4R8ffLz/rFiPj3sz3G6TaFP9t/IyKejojnyp/v65sY53SJiB0R8UZEvDDO9oiI+8vr8XxEXDmtA8hMv9qX+hYA/xP4WWAR8N+BVWPa/EPgD8vyeuCxpsc9C3P+u8D7y/Lt82HOpd0HgK8BB4GBpsc9Cz/nfuA54OKy/uGmxz0Lc94G3F6WVwHHmx535Zx/FbgSeGGc7dcDTwABrAYOTWf/npn82I8+niUz3wFGP56l0zpgZ1l+HFgTETGLY5xuk845M5/OzLfL6kHa7+eZy6bycwb4PPAvge/P5uBmyFTm/DvAg5l5GiAz35jlMU63qcw5gYvK8gcZ8z61uSYzvwacmqDJOuCRbDsILImIS6erf8Pkx7p9PMuy8dpk5lngDHDJrIxuZkxlzp020v7NZi6bdM4R8UvAisz8s9kc2Ayays/554Cfi4j/FhEHyydyz2VTmfM/B34jIoZpPxX6Gd7bzvXv+zmZ048GT7NuZxhjH3WbSpu5ZMrziYjfAAaAvzOjI5p5E845In4CuA/45GwNaBZM5ee8kPalrkHaZ5//JSIuz8y3ZnhsM2Uqc74FeDgz742IXwb+uMz5/8388Boxo/9+eWbyY5N+PEtnm4hYSPvUeKLTyvPdVOZMRPwa8PvADZn5g1ka20yZbM4fAC4HWhFxnPa15b1z/Cb8VP9s78nM/5uZr9L+HLv+WRrfTJjKnDcCuwEy8+vA+2h/htV71ZT+vvfKMPmxqXw8y15gQ1m+CXgqy52tOWrSOZdLPn9EO0jm+nV0mGTOmXkmM5dm5srMXEn7PtENmXm4meFOi6n82f5PtB+2ICKW0r7sdWxWRzm9pjLn14A1ABHxC7TD5NuzOsrZtRe4rTzVtRo4k5knp+vgXuYqcpyPZ4mIzwGHM3MvsJ32qfAQ7TOS9c2NuN4U5/wHwE8Bf1KeNXgtM29obNCVpjjn95Qpznk/cE1EvAT8EPgnmfmd5kZdZ4pz3gz824j4PdqXez45l385jIiv0L5MubTcB9oK/CRAZv4h7ftC1wNDwNvAp6a1/zn82kmSzhNe5pIkVTNMJEnVDBNJUjXDRJJUzTCRJFUzTCRJ1QwTSVI1w0SSVO3/A7VGdCyrFXx3AAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "train_df.target.hist()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1804874, 3)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 195 }, "colab_type": "code", "id": "jz3TWXNiQL5X", "outputId": "6fbcf6c1-526d-4c5a-dc3f-6e07e49e4431" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcomment_text
07000000Jeff Sessions is another one of Trump's Orwell...
17000001I actually inspected the infrastructure on Gra...
27000002No it won't . That's just wishful thinking on ...
37000003Instead of wringing our hands and nibbling the...
47000004how many of you commenters have garbage piled ...
\n", "
" ], "text/plain": [ " id comment_text\n", "0 7000000 Jeff Sessions is another one of Trump's Orwell...\n", "1 7000001 I actually inspected the infrastructure on Gra...\n", "2 7000002 No it won't . That's just wishful thinking on ...\n", "3 7000003 Instead of wringing our hands and nibbling the...\n", "4 7000004 how many of you commenters have garbage piled ..." ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_df.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(97320, 2)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_df.shape" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "train_df = train_df.sample(100000, random_state=42)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(100000, 3)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create tokens" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": {}, "colab_type": "code", "id": "P6inc2DWQTzT" }, "outputs": [], "source": [ "def tokenize(text):\n", " \n", " tokens = []\n", " for token in word_tokenize(text):\n", " if token in string.punctuation: continue\n", " if token in stop_words: continue\n", " tokens.append(stem.stem(token))\n", " \n", " return \" \".join(tokens)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 3.8s\n", "[Parallel(n_jobs=-1)]: Done 8088 tasks | elapsed: 8.3s\n", "[Parallel(n_jobs=-1)]: Done 31088 tasks | elapsed: 20.7s\n", "[Parallel(n_jobs=-1)]: Done 63288 tasks | elapsed: 37.9s\n", "[Parallel(n_jobs=-1)]: Done 100000 out of 100000 | elapsed: 57.5s finished\n" ] } ], "source": [ "train_tokens = Parallel(n_jobs=-1, verbose=1)(delayed(tokenize)(text) for text in train_df['comment_text'].tolist())" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'what breath fresh air someon embrac common sens valu instead leadership canada clear differ page perhap read differ book'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_tokens[0]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 152 tasks | elapsed: 0.2s\n", "[Parallel(n_jobs=-1)]: Done 9952 tasks | elapsed: 5.5s\n", "[Parallel(n_jobs=-1)]: Done 27452 tasks | elapsed: 15.0s\n", "[Parallel(n_jobs=-1)]: Done 51952 tasks | elapsed: 27.7s\n", "[Parallel(n_jobs=-1)]: Done 83452 tasks | elapsed: 43.9s\n", "[Parallel(n_jobs=-1)]: Done 97320 out of 97320 | elapsed: 50.8s finished\n" ] } ], "source": [ "test_tokens = Parallel(n_jobs=-1, verbose=1)(delayed(tokenize)(text) for text in test_df['comment_text'].tolist())" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "197320" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(train_tokens + test_tokens)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": {}, "colab_type": "code", "id": "3vteOVvNQbf-" }, "outputs": [ { "data": { "text/plain": [ "TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", " dtype=, encoding='utf-8', input='content',\n", " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", " ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,\n", " stop_words=None, strip_accents=None, sublinear_tf=False,\n", " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n", " vocabulary=None)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vect = TfidfVectorizer()\n", "vect.fit(train_tokens + test_tokens)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "colab": {}, "colab_type": "code", "id": "rGKVqvTrQmLJ" }, "outputs": [], "source": [ "X = vect.transform(train_tokens)\n", "y = train_df['target']" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "colab": {}, "colab_type": "code", "id": "oCtm_U8xQm_C" }, "outputs": [ { "data": { "text/plain": [ "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,\n", " max_features='auto', max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,\n", " oob_score=False, random_state=42, verbose=0, warm_start=False)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reg = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42, max_depth=10)\n", "reg.fit(X, y)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "test_X = vect.transform(test_tokens)\n", "test_y = reg.predict(test_X)" ] } ], "metadata": { "colab": { "name": "Untitled0.ipynb", "provenance": [], "version": "0.3.2" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 1 }