{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Week11_Assignment.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "73f21b5fada64111939d55196c6bff38": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_a92ee96deebf4b90abcf25edd040a069",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_f678eb268422496f9893cba8e3d1721d",
              "IPY_MODEL_44339ab1489541609584c7b5975c6e85"
            ]
          }
        },
        "a92ee96deebf4b90abcf25edd040a069": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "f678eb268422496f9893cba8e3d1721d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_551a4739e8264837ad7a743e88e14318",
            "_dom_classes": [],
            "description": "100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 19467,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 19467,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_6414f34d6dfe49b9b3ea8cc67468df23"
          }
        },
        "44339ab1489541609584c7b5975c6e85": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_597e2e3fb6744bd6b3815bfa4f61a0c2",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 19467/19467 [02:11<00:00, 148.22it/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_be44ed38b4b2451aaea12210c46f60f0"
          }
        },
        "551a4739e8264837ad7a743e88e14318": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "6414f34d6dfe49b9b3ea8cc67468df23": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "597e2e3fb6744bd6b3815bfa4f61a0c2": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "be44ed38b4b2451aaea12210c46f60f0": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "98063d275f2c41aab44514e08a4ad865": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_1f9db30dd7254f5cb8ecf00cb9cc36d5",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_9d1b72642d05413db3c635496b69b1c8",
              "IPY_MODEL_ff11f7d8684543309f9ec64039fc0cb6"
            ]
          }
        },
        "1f9db30dd7254f5cb8ecf00cb9cc36d5": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "9d1b72642d05413db3c635496b69b1c8": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_6d52348064b2443aa39590a252b0dbf3",
            "_dom_classes": [],
            "description": "100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 19467,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 19467,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_ddf12efe92124d188c177576cd6fe682"
          }
        },
        "ff11f7d8684543309f9ec64039fc0cb6": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_27b4a636b35d4c3a839c1f6183b9e446",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 19467/19467 [00:04<00:00, 4007.04it/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_568d9e4bbb0a4dd5912d70753a573751"
          }
        },
        "6d52348064b2443aa39590a252b0dbf3": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "ddf12efe92124d188c177576cd6fe682": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "27b4a636b35d4c3a839c1f6183b9e446": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "568d9e4bbb0a4dd5912d70753a573751": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        }
      }
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "lgzZ8kc2zVeQ",
        "outputId": "a5ed299a-cdec-4e89-a6ac-c9a4dc60b9d3"
      },
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/data/')\n",
        "data_dir = '/data/My Drive/Colab Notebooks/Experiment'\n",
        "!ls '/data/My Drive/Colab Notebooks/Experiment'\n",
        "!pip install matplotlib"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Mounted at /data/\n",
            "diamonds.csv  Iris.csv\tm_data.csv  news_data.csv  TSLA.csv  w_data.csv\n",
            "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (3.2.2)\n",
            "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.3.1)\n",
            "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (0.10.0)\n",
            "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.4.7)\n",
            "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.8.1)\n",
            "Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.18.5)\n",
            "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from cycler>=0.10->matplotlib) (1.15.0)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "b2tMpgP8zeHE",
        "outputId": "b8a77d80-e525-40c7-d881-77b6e3400318"
      },
      "source": [
        "df = pd.read_csv(data_dir + '/news_data.csv')\n",
        "print(df.shape)\n",
        "\n",
        "df = df.drop_duplicates('description') # drop dupes\n",
        "print(\"drop dupes: \" + str(df.shape))\n",
        "\n",
        "df = df[~df['description'].isnull()] # drop null values\n",
        "print(\"drop null values: \" + str(df.shape))\n",
        "\n",
        "df = df[(df.description.map(len) > 120) & (df.description.map(len) <= 350)] # limit to descriptions between 120 and 350 characters\n",
        "\n",
        "df.reset_index(inplace=True, drop=True)\n",
        "print(\"filter on desc lengths: \" + str(df.shape))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "(50126, 9)\n",
            "drop dupes: (44774, 9)\n",
            "drop null values: (44773, 9)\n",
            "filter on desc lengths: (19467, 9)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "jMa8Nalf3pWp",
        "outputId": "438858e5-83d4-448c-b1b4-2d910f16e26b"
      },
      "source": [
        "df['description'].head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0    Researchers discover what could be one of the ...\n",
              "1    Yemen is now classified as the world's worst h...\n",
              "2    Malcolm Turnbull and Joko Widodo hold talks in...\n",
              "3    KUALA LUMPUR, Malaysia (AP) — Malaysia's healt...\n",
              "4    HANOI, Vietnam (AP) — Two women — a Vietnamese...\n",
              "Name: description, dtype: object"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 4
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MfvcmuFIEN1a",
        "outputId": "3726dba8-b609-4e5b-98f1-34b355baed0d"
      },
      "source": [
        "import nltk\n",
        "from nltk.stem import *\n",
        "nltk.download('punkt')\n",
        "from nltk.tokenize import RegexpTokenizer\n",
        "import pandas as pd\n",
        "from tqdm.notebook import tqdm\n",
        "tqdm.pandas()\n",
        "from functools import reduce\n",
        "import re"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
            "[nltk_data]   Package punkt is already up-to-date!\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "aeAyEoxh8ZeN",
        "outputId": "3446fceb-606c-4a50-f4aa-d375741af15f"
      },
      "source": [
        "import nltk\n",
        "nltk.download('stopwords')\n",
        "\n",
        "from nltk.corpus import stopwords\n",
        "stopwords.words('english')\n",
        "\n",
        "en_stops = set(stopwords.words('english'))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Unzipping corpora/stopwords.zip.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "qxnGQ7lkEiec",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 66,
          "referenced_widgets": [
            "73f21b5fada64111939d55196c6bff38",
            "a92ee96deebf4b90abcf25edd040a069",
            "f678eb268422496f9893cba8e3d1721d",
            "44339ab1489541609584c7b5975c6e85",
            "551a4739e8264837ad7a743e88e14318",
            "6414f34d6dfe49b9b3ea8cc67468df23",
            "597e2e3fb6744bd6b3815bfa4f61a0c2",
            "be44ed38b4b2451aaea12210c46f60f0"
          ]
        },
        "outputId": "222d679d-a633-4c23-b3b8-cf94ce7f34ec"
      },
      "source": [
        "### Cleaning the data set \n",
        "\n",
        "def clean_text(text):\n",
        "    text = text.lower()\n",
        "    text = re.sub(r\"what's\", \"what is \", text)\n",
        "    text = text.replace('(ap)', '')\n",
        "    text = re.sub(r\"\\'s\", \" is \", text)\n",
        "    text = re.sub(r\"\\'ve\", \" have \", text)\n",
        "    text = re.sub(r\"can't\", \"cannot \", text)\n",
        "    text = re.sub(r\"n't\", \" not \", text)\n",
        "    text = re.sub(r\"i'm\", \"i am \", text)\n",
        "    text = re.sub(r\"\\'re\", \" are \", text)\n",
        "    text = re.sub(r\"\\'d\", \" would \", text)\n",
        "    text = re.sub(r\"\\'ll\", \" will \", text)\n",
        "    text = re.sub(r'\\W+', ' ', text)\n",
        "    text = re.sub(r'\\s+', ' ', text)\n",
        "    text = re.sub(r\"\\\\\", \"\", text)\n",
        "    text = re.sub(r\"\\'\", \"\", text)    \n",
        "    text = re.sub(r\"\\\"\", \"\", text)\n",
        "    text = re.sub('[^a-zA-Z ?!]+', '', text)\n",
        "    text = text.strip()\n",
        "    return text\n",
        "\n",
        "df['text_clean'] = df['description'].progress_map(lambda d: clean_text(d))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "73f21b5fada64111939d55196c6bff38",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, max=19467.0), HTML(value='')))"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "RfQG8A4UBr2T",
        "outputId": "ecf241d1-d759-47cf-de5a-3d1085c440e9"
      },
      "source": [
        "df['text_clean'].head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0    researchers discover what could be one of the ...\n",
              "1    yemen is now classified as the world is worst ...\n",
              "2    malcolm turnbull and joko widodo hold talks in...\n",
              "3    kuala lumpur malaysia malaysia is health minis...\n",
              "4    hanoi vietnam two women a vietnamese and an in...\n",
              "Name: text_clean, dtype: object"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 15
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XeqFkFN9Wu9I"
      },
      "source": [
        "text = df['text_clean'].to_list()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 66,
          "referenced_widgets": [
            "98063d275f2c41aab44514e08a4ad865",
            "1f9db30dd7254f5cb8ecf00cb9cc36d5",
            "9d1b72642d05413db3c635496b69b1c8",
            "ff11f7d8684543309f9ec64039fc0cb6",
            "6d52348064b2443aa39590a252b0dbf3",
            "ddf12efe92124d188c177576cd6fe682",
            "27b4a636b35d4c3a839c1f6183b9e446",
            "568d9e4bbb0a4dd5912d70753a573751"
          ]
        },
        "id": "FjMi3jdxBerA",
        "outputId": "96e3bdc3-a77b-4870-de76-1f0dae6b3cba"
      },
      "source": [
        "from nltk.tokenize import word_tokenize, sent_tokenize\n",
        "from string import punctuation\n",
        "\n",
        "def tokenizer(text):\n",
        "\n",
        "    tokens = [word_tokenize(sent) for sent in sent_tokenize(text)]\n",
        "    tokens = list(reduce(lambda x,y: x+y, tokens))\n",
        "    tokens = list(filter(lambda token: token not in (en_stops), tokens))\n",
        "\n",
        "    return tokens\n",
        "\n",
        "df['token'] = df['text_clean'].progress_map(lambda d: tokenizer(d))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "98063d275f2c41aab44514e08a4ad865",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, max=19467.0), HTML(value='')))"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "LCM85d1lYDPQ",
        "outputId": "5dba0345-8968-414c-f193-3cac624ab1a0"
      },
      "source": [
        "for descripition, tokens in zip(df['description'].head(5), df['token'].head(5)):\n",
        "    print('description:', descripition)\n",
        "    print('tokens:', tokens)\n",
        "    print()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "description: Researchers discover what could be one of the worst cases of mine pollution in the world in the heart of New South Wales' pristine heritage-listed Blue Mountains.\n",
            "tokens: ['researchers', 'discover', 'could', 'one', 'worst', 'cases', 'mine', 'pollution', 'world', 'heart', 'new', 'south', 'wales', 'pristine', 'heritage', 'listed', 'blue', 'mountains']\n",
            "\n",
            "description: Yemen is now classified as the world's worst humanitarian disaster but Australia has committed no funding to help save lives there.\n",
            "tokens: ['yemen', 'classified', 'world', 'worst', 'humanitarian', 'disaster', 'australia', 'committed', 'funding', 'help', 'save', 'lives']\n",
            "\n",
            "description: Malcolm Turnbull and Joko Widodo hold talks in Sydney, reviving cooperation halted after the discovery of insulting posters at a military base, and reaching deals on trade and a new consulate in east Java.\n",
            "tokens: ['malcolm', 'turnbull', 'joko', 'widodo', 'hold', 'talks', 'sydney', 'reviving', 'cooperation', 'halted', 'discovery', 'insulting', 'posters', 'military', 'base', 'reaching', 'deals', 'trade', 'new', 'consulate', 'east', 'java']\n",
            "\n",
            "description: KUALA LUMPUR, Malaysia (AP) — Malaysia's health minister said Sunday that the dose of nerve agent given to North Korean ruler Kim Jong Un's exiled half brother was so high that it killed him within 20 minutes and caused…\n",
            "tokens: ['kuala', 'lumpur', 'malaysia', 'malaysia', 'health', 'minister', 'said', 'sunday', 'dose', 'nerve', 'agent', 'given', 'north', 'korean', 'ruler', 'kim', 'jong', 'un', 'exiled', 'half', 'brother', 'high', 'killed', 'within', 'minutes', 'caused']\n",
            "\n",
            "description: HANOI, Vietnam (AP) — Two women — a Vietnamese and an Indonesian — have been arrested for allegedly coating their hands with the immensely toxic chemical agent VX and wiping them on the face of the North Korean leader's…\n",
            "tokens: ['hanoi', 'vietnam', 'two', 'women', 'vietnamese', 'indonesian', 'arrested', 'allegedly', 'coating', 'hands', 'immensely', 'toxic', 'chemical', 'agent', 'vx', 'wiping', 'face', 'north', 'korean', 'leader']\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "634fSw9d01QI"
      },
      "source": [
        "from collections import Counter\n",
        "from nltk.probability import FreqDist\n",
        "\n",
        "# The 'category' column of the same first 1,000 rows that are vectorised below.\n",
        "vf = pd.DataFrame(df.head(1000)['category'])\n",
        "\n",
        "# Bag-of-words matrix: one row per document, one column per distinct token.\n",
        "# Collect the per-document counts first and build the frame in a single call:\n",
        "# DataFrame.append was removed in pandas 2.0, and growing a frame row by row\n",
        "# re-copies the whole frame on every iteration.\n",
        "# Tokens missing from a document come out as NaN here.\n",
        "token_counts = [dict(FreqDist(row)) for row in df.head(1000)['token']]\n",
        "vectors = pd.DataFrame(token_counts)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VHjCq_4v1atR"
      },
      "source": [
        "# A token that never occurs in a document counts as 0 rather than NaN.\n",
        "vectors = vectors.fillna(0)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "O9TapkcB1tTZ"
      },
      "source": [
        "from sklearn.cluster import KMeans\n",
        "\n",
        "# Partition the bag-of-words rows into 30 clusters. random_state fixes the\n",
        "# centroid seeding. n_init is pinned to 10 (the long-standing default) because\n",
        "# newer scikit-learn releases switched the default to 'auto', which would\n",
        "# otherwise change the fitted clusters depending on the installed version.\n",
        "kmeans = KMeans(n_clusters=30, n_init=10, random_state=123).fit(vectors)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Sobx_zZQ18hB",
        "outputId": "d59bef21-c1b5-4825-8869-20056d24781a"
      },
      "source": [
        "# Tokens whose coordinate in the centroid of cluster 6 is at least 0.1.\n",
        "centers = kmeans.cluster_centers_\n",
        "{\n",
        "    token: weight\n",
        "    for token, weight in dict(zip(vectors.columns, centers[6])).items()\n",
        "    if weight >= 0.1\n",
        "}"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'administration': 0.11999999999999993,\n",
              " 'attend': 0.10000000000000019,\n",
              " 'attorney': 0.11999999999999984,\n",
              " 'correspondents': 0.11999999999999987,\n",
              " 'dinner': 0.1600000000000003,\n",
              " 'donald': 0.56,\n",
              " 'general': 0.1200000000000001,\n",
              " 'house': 0.6799999999999996,\n",
              " 'media': 0.12000000000000015,\n",
              " 'new': 0.10000000000000002,\n",
              " 'news': 0.1400000000000002,\n",
              " 'president': 0.23999999999999969,\n",
              " 'presidential': 0.13999999999999985,\n",
              " 'press': 0.13999999999999987,\n",
              " 'sessions': 0.11999999999999993,\n",
              " 'sunday': 0.1,\n",
              " 'trump': 0.84,\n",
              " 'twitter': 0.10000000000000006,\n",
              " 'us': 0.10000000000000006,\n",
              " 'white': 0.6599999999999995}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 32
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "i3rwNzng2tUr",
        "outputId": "31a9e84d-de6b-4331-bb1a-be13c301b3fb"
      },
      "source": [
        "kmeans.inertia_"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "16532.01116078362"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 33
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Qcc91wET26-G",
        "outputId": "7cf7e336-5e55-4ff8-8fab-a0cda136d99f"
      },
      "source": [
        "# Token counts for `tokens` (defined in an earlier cell), restricted to the\n",
        "# vocabulary columns of `vectors`, i.e. the features kmeans was fitted on.\n",
        "vec = {k: v for k, v in FreqDist(tokens).items() if k in vectors.columns}\n",
        "\n",
        "# Lay the counts out over every vocabulary column (absent tokens -> 0) so the\n",
        "# new row introduces no NaN. DataFrame.append was removed in pandas 2.0, so the\n",
        "# row is attached with pd.concat instead.\n",
        "# NOTE: every run of this cell adds one more row to `vectors`.\n",
        "query_row = pd.Series(vec, dtype=float).reindex(vectors.columns, fill_value=0.0)\n",
        "vectors = pd.concat([vectors, query_row.to_frame().T], ignore_index=True)\n",
        "vectors = vectors.fillna(0)\n",
        "\n",
        "# Predict from a one-row DataFrame slice (not a list of Series) so the input\n",
        "# keeps the column labels the model was fitted with.\n",
        "kmeans.predict(vectors.iloc[[-1]])"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([10], dtype=int32)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 34
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xWG32TQW8HKK",
        "outputId": "01bb3bb5-ada1-4cae-f210-256f94340bc4"
      },
      "source": [
        "# Tokens whose coordinate in the centroid of cluster 1 is at least 0.1.\n",
        "# NOTE(review): the saved output of the preceding cell reports cluster 10,\n",
        "# while this cell inspects cluster 1 - confirm which one is intended.\n",
        "centers = kmeans.cluster_centers_\n",
        "{\n",
        "    token: weight\n",
        "    for token, weight in dict(zip(vectors.columns, centers[1])).items()\n",
        "    if weight >= 0.1\n",
        "}"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'accusations': 1.0,\n",
              " 'carmaker': 1.0,\n",
              " 'ceo': 1.0,\n",
              " 'conditions': 1.0,\n",
              " 'elon': 1.0,\n",
              " 'employee': 1.0,\n",
              " 'factory': 1.0,\n",
              " 'following': 1.0,\n",
              " 'investigation': 1.0,\n",
              " 'musk': 1.0,\n",
              " 'results': 1.0,\n",
              " 'shared': 1.0,\n",
              " 'tesla': 1.0,\n",
              " 'working': 1.0}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 35
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 160
        },
        "id": "ioejpJ1G8MPZ",
        "outputId": "612ac8f3-224f-441c-e735-1e4a37475382"
      },
      "source": [
        "from sklearn.metrics import pairwise_distances\n",
        "from scipy.spatial import distance\n",
        "\n",
        "# Cosine distance between every pair of rows of `vectors`, as a DataFrame\n",
        "# whose row/column labels are the default 0..n-1 positions.\n",
        "dist = pd.DataFrame(pairwise_distances(vectors, metric='cosine'))\n",
        "\n",
        "# Rows of `vectors` whose cosine distance to row 2 is below 0.8.\n",
        "vectors.loc[dist.iloc[2].lt(0.8)]"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>blue</th>\n",
              "      <th>cases</th>\n",
              "      <th>could</th>\n",
              "      <th>discover</th>\n",
              "      <th>heart</th>\n",
              "      <th>heritage</th>\n",
              "      <th>listed</th>\n",
              "      <th>mine</th>\n",
              "      <th>mountains</th>\n",
              "      <th>new</th>\n",
              "      <th>one</th>\n",
              "      <th>pollution</th>\n",
              "      <th>pristine</th>\n",
              "      <th>researchers</th>\n",
              "      <th>south</th>\n",
              "      <th>wales</th>\n",
              "      <th>world</th>\n",
              "      <th>worst</th>\n",
              "      <th>australia</th>\n",
              "      <th>classified</th>\n",
              "      <th>committed</th>\n",
              "      <th>disaster</th>\n",
              "      <th>funding</th>\n",
              "      <th>help</th>\n",
              "      <th>humanitarian</th>\n",
              "      <th>lives</th>\n",
              "      <th>save</th>\n",
              "      <th>yemen</th>\n",
              "      <th>base</th>\n",
              "      <th>consulate</th>\n",
              "      <th>cooperation</th>\n",
              "      <th>deals</th>\n",
              "      <th>discovery</th>\n",
              "      <th>east</th>\n",
              "      <th>halted</th>\n",
              "      <th>hold</th>\n",
              "      <th>insulting</th>\n",
              "      <th>java</th>\n",
              "      <th>joko</th>\n",
              "      <th>malcolm</th>\n",
              "      <th>...</th>\n",
              "      <th>slovacko</th>\n",
              "      <th>attacking</th>\n",
              "      <th>backyard</th>\n",
              "      <th>flying</th>\n",
              "      <th>neighbor</th>\n",
              "      <th>sheriff</th>\n",
              "      <th>summons</th>\n",
              "      <th>ballots</th>\n",
              "      <th>husted</th>\n",
              "      <th>ohio</th>\n",
              "      <th>registered</th>\n",
              "      <th>uncovered</th>\n",
              "      <th>pentagon</th>\n",
              "      <th>belong</th>\n",
              "      <th>hint</th>\n",
              "      <th>lunardi</th>\n",
              "      <th>approaches</th>\n",
              "      <th>jayhawks</th>\n",
              "      <th>sits</th>\n",
              "      <th>lahore</th>\n",
              "      <th>punjab</th>\n",
              "      <th>adrien</th>\n",
              "      <th>midfielder</th>\n",
              "      <th>rabiot</th>\n",
              "      <th>stature</th>\n",
              "      <th>alvaro</th>\n",
              "      <th>impressive</th>\n",
              "      <th>isco</th>\n",
              "      <th>morata</th>\n",
              "      <th>outings</th>\n",
              "      <th>substitute</th>\n",
              "      <th>allows</th>\n",
              "      <th>applications</th>\n",
              "      <th>browser</th>\n",
              "      <th>optimized</th>\n",
              "      <th>qt</th>\n",
              "      <th>remote</th>\n",
              "      <th>webgl</th>\n",
              "      <th>defence</th>\n",
              "      <th>ramp</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>...</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>146</th>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>...</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>2 rows × 6144 columns</p>\n",
              "</div>"
            ],
            "text/plain": [
              "     blue  cases  could  discover  heart  ...   qt  remote  webgl  defence  ramp\n",
              "2     0.0    0.0    0.0       0.0    0.0  ...  0.0     0.0    0.0      0.0   0.0\n",
              "146   0.0    0.0    0.0       0.0    0.0  ...  0.0     0.0    0.0      0.0   0.0\n",
              "\n",
              "[2 rows x 6144 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 36
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "RODLFOf68pmK"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}