{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Week11_Assignment.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "73f21b5fada64111939d55196c6bff38": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_a92ee96deebf4b90abcf25edd040a069", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_f678eb268422496f9893cba8e3d1721d", "IPY_MODEL_44339ab1489541609584c7b5975c6e85" ] } }, "a92ee96deebf4b90abcf25edd040a069": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "f678eb268422496f9893cba8e3d1721d": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_551a4739e8264837ad7a743e88e14318", "_dom_classes": [], "description": "100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 19467, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 19467, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_6414f34d6dfe49b9b3ea8cc67468df23" } }, "44339ab1489541609584c7b5975c6e85": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_597e2e3fb6744bd6b3815bfa4f61a0c2", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 19467/19467 [02:11<00:00, 148.22it/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_be44ed38b4b2451aaea12210c46f60f0" } }, "551a4739e8264837ad7a743e88e14318": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "6414f34d6dfe49b9b3ea8cc67468df23": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "597e2e3fb6744bd6b3815bfa4f61a0c2": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "be44ed38b4b2451aaea12210c46f60f0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "98063d275f2c41aab44514e08a4ad865": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_1f9db30dd7254f5cb8ecf00cb9cc36d5", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_9d1b72642d05413db3c635496b69b1c8", "IPY_MODEL_ff11f7d8684543309f9ec64039fc0cb6" ] } }, "1f9db30dd7254f5cb8ecf00cb9cc36d5": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "9d1b72642d05413db3c635496b69b1c8": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_6d52348064b2443aa39590a252b0dbf3", "_dom_classes": [], "description": "100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 19467, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 19467, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_ddf12efe92124d188c177576cd6fe682" } }, "ff11f7d8684543309f9ec64039fc0cb6": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_27b4a636b35d4c3a839c1f6183b9e446", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 19467/19467 [00:04<00:00, 4007.04it/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_568d9e4bbb0a4dd5912d70753a573751" } }, "6d52348064b2443aa39590a252b0dbf3": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "ddf12efe92124d188c177576cd6fe682": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "27b4a636b35d4c3a839c1f6183b9e446": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "568d9e4bbb0a4dd5912d70753a573751": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } } } } }, "cells": [ { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lgzZ8kc2zVeQ", "outputId": "a5ed299a-cdec-4e89-a6ac-c9a4dc60b9d3" }, "source": [ "from google.colab import drive\n", "drive.mount('/data/')\n", "data_dir = '/data/My Drive/Colab Notebooks/Experiment'\n", "!ls '/data/My Drive/Colab Notebooks/Experiment'\n", "!pip install matplotlib" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Mounted at /data/\n", "diamonds.csv Iris.csv\tm_data.csv news_data.csv TSLA.csv w_data.csv\n", "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (3.2.2)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.3.1)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (0.10.0)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.4.7)\n", "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.8.1)\n", "Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.18.5)\n", "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from cycler>=0.10->matplotlib) (1.15.0)\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "b2tMpgP8zeHE", "outputId": "b8a77d80-e525-40c7-d881-77b6e3400318" }, "source": [ "df = pd.read_csv(data_dir + '/news_data.csv')\n", "print(df.shape)\n", "\n", "df = df.drop_duplicates('description') # drop dupes\n", "print(\"drop dupes: \" + str(df.shape))\n", "\n", "df = df[~df['description'].isnull()] # drop null values\n", "print(\"drop null values: \" + str(df.shape))\n", "\n", "df = df[(df.description.map(len) > 120) & (df.description.map(len) <= 350)] # limit to descriptions between 120 and 350 characters\n", "\n", "df.reset_index(inplace=True, drop=True)\n", "print(\"filter on desc lengths: \" + str(df.shape))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "(50126, 9)\n", "drop dupes: (44774, 9)\n", "drop null values: (44773, 9)\n", "filter on desc lengths: (19467, 9)\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "jMa8Nalf3pWp", "outputId": "438858e5-83d4-448c-b1b4-2d910f16e26b" }, "source": [ "df['description'].head()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0 Researchers discover what could be one of the ...\n", "1 Yemen is now classified as the world's worst h...\n", "2 Malcolm Turnbull and Joko Widodo hold talks in...\n", "3 KUALA LUMPUR, Malaysia (AP) — Malaysia's healt...\n", "4 HANOI, Vietnam (AP) — Two women — a Vietnamese...\n", "Name: description, dtype: object" ] }, "metadata": { "tags": [] }, "execution_count": 4 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MfvcmuFIEN1a", "outputId": "3726dba8-b609-4e5b-98f1-34b355baed0d" }, "source": [ "import nltk\n", "from nltk.stem import *\n", "nltk.download('punkt')\n", "from nltk.tokenize import RegexpTokenizer\n", "import pandas as pd\n", "from tqdm.notebook import tqdm\n", "tqdm.pandas()\n", "from functools import reduce\n", "import re" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aeAyEoxh8ZeN", "outputId": "3446fceb-606c-4a50-f4aa-d375741af15f" }, "source": [ "import nltk\n", "nltk.download('stopwords')\n", "\n", "from nltk.corpus import stopwords\n", "stopwords.words('english')\n", "\n", "en_stops = set(stopwords.words('english'))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Unzipping corpora/stopwords.zip.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "qxnGQ7lkEiec", "colab": { "base_uri": "https://localhost:8080/", "height": 66, "referenced_widgets": [ "73f21b5fada64111939d55196c6bff38", "a92ee96deebf4b90abcf25edd040a069", "f678eb268422496f9893cba8e3d1721d", "44339ab1489541609584c7b5975c6e85", "551a4739e8264837ad7a743e88e14318", "6414f34d6dfe49b9b3ea8cc67468df23", "597e2e3fb6744bd6b3815bfa4f61a0c2", "be44ed38b4b2451aaea12210c46f60f0" ] }, "outputId": "222d679d-a633-4c23-b3b8-cf94ce7f34ec" }, "source": [ "### Cleaning the data set \n", "\n", "def clean_text(text):\n", " text = text.lower()\n", " text = re.sub(r\"what's\", \"what is \", text)\n", " text = text.replace('(ap)', '')\n", " text = re.sub(r\"\\'s\", \" is \", text)\n", " text = re.sub(r\"\\'ve\", \" have \", text)\n", " text = re.sub(r\"can't\", \"cannot \", text)\n", " text = re.sub(r\"n't\", \" not \", text)\n", " text = re.sub(r\"i'm\", \"i am \", text)\n", " text = re.sub(r\"\\'re\", \" are \", text)\n", " text = re.sub(r\"\\'d\", \" would \", text)\n", " text = re.sub(r\"\\'ll\", \" will \", text)\n", " text = re.sub(r'\\W+', ' ', text)\n", " text = re.sub(r'\\s+', ' ', text)\n", " text = re.sub(r\"\\\\\", \"\", text)\n", " text = re.sub(r\"\\'\", \"\", text) \n", " text = re.sub(r\"\\\"\", \"\", text)\n", " text = re.sub('[^a-zA-Z ?!]+', '', text)\n", " text = text.strip()\n", " return text\n", "\n", "df['text_clean'] = df['description'].progress_map(lambda d: clean_text(d))" ], "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "73f21b5fada64111939d55196c6bff38", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=19467.0), HTML(value='')))" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RfQG8A4UBr2T", "outputId": "ecf241d1-d759-47cf-de5a-3d1085c440e9" }, "source": [ "df['text_clean'].head()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0 researchers discover what could be one of the ...\n", "1 yemen is now classified as the world is worst ...\n", "2 malcolm turnbull and joko widodo hold talks in...\n", "3 kuala lumpur malaysia malaysia is health minis...\n", "4 hanoi vietnam two women a vietnamese and an in...\n", "Name: text_clean, dtype: object" ] }, "metadata": { "tags": [] }, "execution_count": 15 } ] }, { "cell_type": "code", "metadata": { "id": "XeqFkFN9Wu9I" }, "source": [ "text = df['text_clean'].to_list()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 66, "referenced_widgets": [ "98063d275f2c41aab44514e08a4ad865", "1f9db30dd7254f5cb8ecf00cb9cc36d5", "9d1b72642d05413db3c635496b69b1c8", "ff11f7d8684543309f9ec64039fc0cb6", "6d52348064b2443aa39590a252b0dbf3", "ddf12efe92124d188c177576cd6fe682", "27b4a636b35d4c3a839c1f6183b9e446", "568d9e4bbb0a4dd5912d70753a573751" ] }, "id": "FjMi3jdxBerA", "outputId": "96e3bdc3-a77b-4870-de76-1f0dae6b3cba" }, "source": [ "from nltk.tokenize import word_tokenize, sent_tokenize\n", "from string import punctuation\n", "\n", "def tokenizer(text):\n", "\n", " tokens = [word_tokenize(sent) for sent in sent_tokenize(text)]\n", " tokens = list(reduce(lambda x,y: x+y, tokens))\n", " tokens = list(filter(lambda token: token not in (en_stops), tokens))\n", "\n", " return tokens\n", "\n", "df['token'] = df['text_clean'].progress_map(lambda d: tokenizer(d))" ], "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "98063d275f2c41aab44514e08a4ad865", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=19467.0), HTML(value='')))" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LCM85d1lYDPQ", "outputId": "5dba0345-8968-414c-f193-3cac624ab1a0" }, "source": [ "for descripition, tokens in zip(df['description'].head(5), df['token'].head(5)):\n", " print('description:', descripition)\n", " print('tokens:', tokens)\n", " print()" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "description: Researchers discover what could be one of the worst cases of mine pollution in the world in the heart of New South Wales' pristine heritage-listed Blue Mountains.\n", "tokens: ['researchers', 'discover', 'could', 'one', 'worst', 'cases', 'mine', 'pollution', 'world', 'heart', 'new', 'south', 'wales', 'pristine', 'heritage', 'listed', 'blue', 'mountains']\n", "\n", "description: Yemen is now classified as the world's worst humanitarian disaster but Australia has committed no funding to help save lives there.\n", "tokens: ['yemen', 'classified', 'world', 'worst', 'humanitarian', 'disaster', 'australia', 'committed', 'funding', 'help', 'save', 'lives']\n", "\n", "description: Malcolm Turnbull and Joko Widodo hold talks in Sydney, reviving cooperation halted after the discovery of insulting posters at a military base, and reaching deals on trade and a new consulate in east Java.\n", "tokens: ['malcolm', 'turnbull', 'joko', 'widodo', 'hold', 'talks', 'sydney', 'reviving', 'cooperation', 'halted', 'discovery', 'insulting', 'posters', 'military', 'base', 'reaching', 'deals', 'trade', 'new', 'consulate', 'east', 'java']\n", "\n", "description: KUALA LUMPUR, Malaysia (AP) — Malaysia's health minister said Sunday that the dose of nerve agent given to North Korean ruler Kim Jong Un's exiled half brother was so high that it killed him within 20 minutes and caused…\n", "tokens: ['kuala', 'lumpur', 'malaysia', 'malaysia', 'health', 'minister', 'said', 'sunday', 'dose', 'nerve', 'agent', 'given', 'north', 'korean', 'ruler', 'kim', 'jong', 'un', 'exiled', 'half', 'brother', 'high', 'killed', 'within', 'minutes', 'caused']\n", "\n", "description: HANOI, Vietnam (AP) — Two women — a Vietnamese and an Indonesian — have been arrested for allegedly coating their hands with the immensely toxic chemical agent VX and wiping them on the face of the North Korean leader's…\n", "tokens: ['hanoi', 'vietnam', 'two', 'women', 'vietnamese', 'indonesian', 'arrested', 'allegedly', 'coating', 'hands', 'immensely', 'toxic', 'chemical', 'agent', 'vx', 'wiping', 'face', 'north', 'korean', 'leader']\n", "\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "634fSw9d01QI" }, "source": [ "from collections import Counter\n", "from nltk.probability import FreqDist\n", "\n", "vf = pd.DataFrame(df.head(1000)['category'])\n", "\n", "vectors = pd.DataFrame()\n", "for row in df.head(1000)['token']:\n", " vectors = vectors.append(dict(FreqDist(row)),ignore_index=True)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "VHjCq_4v1atR" }, "source": [ "vectors.fillna(0,inplace=True)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "O9TapkcB1tTZ" }, "source": [ "from sklearn.cluster import KMeans\n", "\n", "kmeans = KMeans(n_clusters=30, random_state=123).fit(vectors)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Sobx_zZQ18hB", "outputId": "d59bef21-c1b5-4825-8869-20056d24781a" }, "source": [ "centers=kmeans.cluster_centers_\n", "{k:v for k,v in dict(zip(vectors.columns,centers[6])).items() if v >= 0.1}" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'administration': 0.11999999999999993,\n", " 'attend': 0.10000000000000019,\n", " 'attorney': 0.11999999999999984,\n", " 'correspondents': 0.11999999999999987,\n", " 'dinner': 0.1600000000000003,\n", " 'donald': 0.56,\n", " 'general': 0.1200000000000001,\n", " 'house': 0.6799999999999996,\n", " 'media': 0.12000000000000015,\n", " 'new': 0.10000000000000002,\n", " 'news': 0.1400000000000002,\n", " 'president': 0.23999999999999969,\n", " 'presidential': 0.13999999999999985,\n", " 'press': 0.13999999999999987,\n", " 'sessions': 0.11999999999999993,\n", " 'sunday': 0.1,\n", " 'trump': 0.84,\n", " 'twitter': 0.10000000000000006,\n", " 'us': 0.10000000000000006,\n", " 'white': 0.6599999999999995}" ] }, "metadata": { "tags": [] }, "execution_count": 32 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "i3rwNzng2tUr", "outputId": "31a9e84d-de6b-4331-bb1a-be13c301b3fb" }, "source": [ "kmeans.inertia_" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "16532.01116078362" ] }, "metadata": { "tags": [] }, "execution_count": 33 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Qcc91wET26-G", "outputId": "7cf7e336-5e55-4ff8-8fab-a0cda136d99f" }, "source": [ "vec = {k:v for k,v in dict(FreqDist(tokens)).items() if k in vectors.columns}\n", "\n", "vectors = vectors.append(vec,ignore_index=True)\n", "vectors.fillna(0,inplace=True)\n", "kmeans.predict([vectors.iloc[-1]])" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([10], dtype=int32)" ] }, "metadata": { "tags": [] }, "execution_count": 34 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xWG32TQW8HKK", "outputId": "01bb3bb5-ada1-4cae-f210-256f94340bc4" }, "source": [ "centers=kmeans.cluster_centers_\n", "{k:v for k,v in dict(zip(vectors.columns,centers[1])).items() if v >= 0.1}" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'accusations': 1.0,\n", " 'carmaker': 1.0,\n", " 'ceo': 1.0,\n", " 'conditions': 1.0,\n", " 'elon': 1.0,\n", " 'employee': 1.0,\n", " 'factory': 1.0,\n", " 'following': 1.0,\n", " 'investigation': 1.0,\n", " 'musk': 1.0,\n", " 'results': 1.0,\n", " 'shared': 1.0,\n", " 'tesla': 1.0,\n", " 'working': 1.0}" ] }, "metadata": { "tags": [] }, "execution_count": 35 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 160 }, "id": "ioejpJ1G8MPZ", "outputId": "612ac8f3-224f-441c-e735-1e4a37475382" }, "source": [ "from sklearn.metrics import pairwise_distances\n", "from scipy.spatial import distance\n", "\n", "dist = pd.DataFrame(pairwise_distances(vectors, metric='cosine'))\n", "vectors[dist.iloc[2]<0.8]" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>blue</th>\n", " <th>cases</th>\n", " <th>could</th>\n", " <th>discover</th>\n", " <th>heart</th>\n", " <th>heritage</th>\n", " <th>listed</th>\n", " <th>mine</th>\n", " <th>mountains</th>\n", " <th>new</th>\n", " <th>one</th>\n", " <th>pollution</th>\n", " <th>pristine</th>\n", " <th>researchers</th>\n", " <th>south</th>\n", " <th>wales</th>\n", " <th>world</th>\n", " <th>worst</th>\n", " <th>australia</th>\n", " <th>classified</th>\n", " <th>committed</th>\n", " <th>disaster</th>\n", " <th>funding</th>\n", " <th>help</th>\n", " <th>humanitarian</th>\n", " <th>lives</th>\n", " <th>save</th>\n", " <th>yemen</th>\n", " <th>base</th>\n", " <th>consulate</th>\n", " <th>cooperation</th>\n", " <th>deals</th>\n", " <th>discovery</th>\n", " <th>east</th>\n", " <th>halted</th>\n", " <th>hold</th>\n", " <th>insulting</th>\n", " <th>java</th>\n", " <th>joko</th>\n", " <th>malcolm</th>\n", " <th>...</th>\n", " <th>slovacko</th>\n", " <th>attacking</th>\n", " <th>backyard</th>\n", " <th>flying</th>\n", " <th>neighbor</th>\n", " <th>sheriff</th>\n", " <th>summons</th>\n", " <th>ballots</th>\n", " <th>husted</th>\n", " <th>ohio</th>\n", " <th>registered</th>\n", " <th>uncovered</th>\n", " <th>pentagon</th>\n", " <th>belong</th>\n", " <th>hint</th>\n", " <th>lunardi</th>\n", " <th>approaches</th>\n", " <th>jayhawks</th>\n", " <th>sits</th>\n", " <th>lahore</th>\n", " <th>punjab</th>\n", " <th>adrien</th>\n", " <th>midfielder</th>\n", " <th>rabiot</th>\n", " <th>stature</th>\n", " <th>alvaro</th>\n", " <th>impressive</th>\n", " <th>isco</th>\n", " <th>morata</th>\n", " <th>outings</th>\n", " <th>substitute</th>\n", " <th>allows</th>\n", " <th>applications</th>\n", " <th>browser</th>\n", " <th>optimized</th>\n", " <th>qt</th>\n", " <th>remote</th>\n", " <th>webgl</th>\n", " <th>defence</th>\n", " <th>ramp</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>2</th>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>...</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>146</th>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>...</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>2 rows × 6144 columns</p>\n", "</div>" ], "text/plain": [ " blue cases could discover heart ... qt remote webgl defence ramp\n", "2 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0\n", "146 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0\n", "\n", "[2 rows x 6144 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 36 } ] }, { "cell_type": "code", "metadata": { "id": "RODLFOf68pmK" }, "source": [ "" ], "execution_count": null, "outputs": [] } ] }