{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "feature_engg_text.ipynb", "provenance": [], "collapsed_sections": [], "authorship_tag": "ABX9TyNdJbebN9/8/+c0SzmsW5kt", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "<a href=\"https://colab.research.google.com/github/bipinKrishnan/fastai_course/blob/master/feature_engg_text.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" ] }, { "cell_type": "code", "metadata": { "id": "BA5Tgfw-z5fI" }, "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "import nltk\n", "import numpy as np\n", "import pandas as pd\n", "import re\n", "\n", "nltk.download('stopwords')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "-Aqfz4r10G-0" }, "source": [ "corpus = ['The sky is blue and beautiful.',\n", " 'Love this blue and beautiful sky!',\n", " 'The quick brown fox jumps over the lazy dog.',\n", " \"A king's breakfast has sausages, ham, bacon, eggs, toast and beans\",\n", " 'I love green eggs, ham, sausages and bacon!',\n", " 'The brown fox is quick and the blue dog is lazy!',\n", " 'The sky is very blue and the sky is very beautiful today',\n", " 'The dog is lazy but the brown fox is quick!' 
\n", "]\n", "\n", "labels = ['weather', 'weather', 'animals', 'food', 'food', 'animals', 'weather', 'animals']" ], "execution_count": 3, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Hy6Vzhvk0Q_m" }, "source": [ "corpus = np.array(corpus)\n", "df = pd.DataFrame({\"text\": corpus, \"label\": labels})" ], "execution_count": 8, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "PTR8R6Zp0XRZ", "outputId": "ecc4f688-66eb-4ba7-ff7f-2756740aff92", "colab": { "base_uri": "https://localhost:8080/", "height": 206 } }, "source": [ "df.head()" ], "execution_count": 9, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>text</th>\n", " <th>label</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>The sky is blue and beautiful.</td>\n", " <td>weather</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Love this blue and beautiful sky!</td>\n", " <td>weather</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>The quick brown fox jumps over the lazy dog.</td>\n", " <td>animals</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>A king's breakfast has sausages, ham, bacon, e...</td>\n", " <td>food</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>I love green eggs, ham, sausages and bacon!</td>\n", " <td>food</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " text label\n", "0 The sky is blue and beautiful. weather\n", "1 Love this blue and beautiful sky! weather\n", "2 The quick brown fox jumps over the lazy dog. animals\n", "3 A king's breakfast has sausages, ham, bacon, e... 
food\n", "4 I love green eggs, ham, sausages and bacon! food" ] }, "metadata": { "tags": [] }, "execution_count": 9 } ] }, { "cell_type": "code", "metadata": { "id": "XnkiPOyT0s1e", "outputId": "9beae70f-a549-40be-c438-612e666f33e2", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "wpt = nltk.WordPunctTokenizer()\n", "stop_words = nltk.corpus.stopwords.words('english')\n", "# Frozen copy of the stop-word list so preprocess() gets O(1) membership tests.\n", "stop_word_set = frozenset(stop_words)\n", "wpt.tokenize(corpus[0])" ], "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['The', 'sky', 'is', 'blue', 'and', 'beautiful', '.']" ] }, "metadata": { "tags": [] }, "execution_count": 16 } ] }, { "cell_type": "code", "metadata": { "id": "oFzkUsQS10bS" }, "source": [ "def preprocess(doc):\n", "    \"\"\"Normalize one document: lower-case, tokenize and drop English stop words.\n", "\n", "    Args:\n", "        doc: Raw text of a single document.\n", "\n", "    Returns:\n", "        The surviving tokens joined by single spaces. Punctuation tokens\n", "        emitted by the tokenizer are kept as-is.\n", "    \"\"\"\n", "    tokens = wpt.tokenize(doc.lower().strip())\n", "    kept = [token for token in tokens if token not in stop_word_set]\n", "    return ' '.join(kept)" ], "execution_count": 19, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ayitk8hh24BT" }, "source": [ "normalize_corpus = np.vectorize(preprocess)" ], "execution_count": 26, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "hp5zRLYh27DK" }, "source": [ "norm_corp = normalize_corpus(corpus)" ], "execution_count": 28, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "vpU9XPJw3qyx", "outputId": "369df59e-7463-4ecb-c9b2-e5ac9aba0847", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "norm_corp" ], "execution_count": 30, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['sky blue beautiful .', 'love blue beautiful sky !',\n", " 'quick brown fox jumps lazy dog .',\n", " \"king ' breakfast sausages , ham , bacon , eggs , toast beans\",\n", " 'love green eggs , ham , sausages bacon !',\n", " 'brown fox quick blue dog lazy !', 'sky blue sky beautiful today',\n", " 'dog lazy brown fox quick !'], dtype='<U60')" ] }, "metadata": { "tags": [] }, "execution_count": 30 } ] }, { "cell_type": 
"markdown", "metadata": { "id": "SY_of05g5eds" }, "source": [ "## Vectorizing text" ] }, { "cell_type": "code", "metadata": { "id": "ajFvXb_u3w4S", "outputId": "cb422945-2ab4-4bf6-c36b-9bc076b98190", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "cv = CountVectorizer(min_df=0., max_df=1.)\n", "cv_matrix = cv.fit_transform(norm_corp).toarray()\n", "\n", "cv_matrix" ], "execution_count": 41, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],\n", " [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],\n", " [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0],\n", " [1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0],\n", " [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0],\n", " [0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],\n", " [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1],\n", " [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]])" ] }, "metadata": { "tags": [] }, "execution_count": 41 } ] }, { "cell_type": "code", "metadata": { "id": "9Yaq31rI5BKu", "outputId": "58b28c9a-5af6-4090-8493-c46b694dec89", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()\n", "# returns an ndarray, so convert it back to a plain list of str.\n", "vocab = cv.get_feature_names_out().tolist()\n", "vocab" ], "execution_count": 44, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['bacon',\n", " 'beans',\n", " 'beautiful',\n", " 'blue',\n", " 'breakfast',\n", " 'brown',\n", " 'dog',\n", " 'eggs',\n", " 'fox',\n", " 'green',\n", " 'ham',\n", " 'jumps',\n", " 'king',\n", " 'lazy',\n", " 'love',\n", " 'quick',\n", " 'sausages',\n", " 'sky',\n", " 'toast',\n", " 'today']" ] }, "metadata": { "tags": [] }, "execution_count": 44 } ] }, { "cell_type": "code", "metadata": { "id": "lPVEdeww5ocX", "outputId": "394f48f7-1157-47bf-e5c5-61c88b7c7f9d", "colab": { "base_uri": "https://localhost:8080/", "height": 300 } }, "source": [ "pd.DataFrame(cv_matrix, 
columns=vocab)" ], "execution_count": 46, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>bacon</th>\n", " <th>beans</th>\n", " <th>beautiful</th>\n", " <th>blue</th>\n", " <th>breakfast</th>\n", " <th>brown</th>\n", " <th>dog</th>\n", " <th>eggs</th>\n", " <th>fox</th>\n", " <th>green</th>\n", " <th>ham</th>\n", " <th>jumps</th>\n", " <th>king</th>\n", " <th>lazy</th>\n", " <th>love</th>\n", " <th>quick</th>\n", " <th>sausages</th>\n", " <th>sky</th>\n", " <th>toast</th>\n", " <th>today</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", 
" <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>2</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " 
</tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " bacon beans beautiful blue breakfast ... quick sausages sky toast today\n", "0 0 0 1 1 0 ... 0 0 1 0 0\n", "1 0 0 1 1 0 ... 0 0 1 0 0\n", "2 0 0 0 0 0 ... 1 0 0 0 0\n", "3 1 1 0 0 1 ... 0 1 0 1 0\n", "4 1 0 0 0 0 ... 0 1 0 0 0\n", "5 0 0 0 1 0 ... 1 0 0 0 0\n", "6 0 0 1 1 0 ... 0 0 2 0 1\n", "7 0 0 0 0 0 ... 1 0 0 0 0\n", "\n", "[8 rows x 20 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 46 } ] }, { "cell_type": "code", "metadata": { "id": "1uG4fGkt50fJ", "outputId": "cfe664c6-7013-47e2-edc4-466eb1311b1b", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "cv.transform(['sky is good and beautiful beautiful today']).toarray()" ], "execution_count": 52, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]])" ] }, "metadata": { "tags": [] }, "execution_count": 52 } ] }, { "cell_type": "markdown", "metadata": { "id": "-0tPfNKv8F0Y" }, "source": [ "`ngram_range=(min_n, max_n)` sets the smallest and largest n-gram sizes that are extracted:\n", "\n", "- `(1, 2)` --> uni-grams and bi-grams\n", "- `(2, 2)` --> bi-grams only\n", "- `(1, 3)` --> uni-grams, bi-grams and tri-grams" ] }, { "cell_type": "code", "metadata": { "id": "O_0n3ddd6SkV" }, "source": [ "cv = CountVectorizer(ngram_range=(2, 2))" ], "execution_count": 53, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "bMlvArrS8a9m" }, "source": [ "cv_matrix = cv.fit_transform(norm_corp).toarray()" ], "execution_count": 65, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "I47IQd1Y8geI", "outputId": "1eec935c-b332-4f0d-cd26-a55a8d5c9ff8", "colab": { "base_uri": "https://localhost:8080/", "height": 338 } }, "source": [ "# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().\n", "pd.DataFrame(cv_matrix, columns=cv.get_feature_names_out())" ], "execution_count": 55, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr 
th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>bacon eggs</th>\n", " <th>beautiful sky</th>\n", " <th>beautiful today</th>\n", " <th>blue beautiful</th>\n", " <th>blue dog</th>\n", " <th>blue sky</th>\n", " <th>breakfast sausages</th>\n", " <th>brown fox</th>\n", " <th>dog lazy</th>\n", " <th>eggs ham</th>\n", " <th>eggs toast</th>\n", " <th>fox jumps</th>\n", " <th>fox quick</th>\n", " <th>green eggs</th>\n", " <th>ham bacon</th>\n", " <th>ham sausages</th>\n", " <th>jumps lazy</th>\n", " <th>king breakfast</th>\n", " <th>lazy brown</th>\n", " <th>lazy dog</th>\n", " <th>love blue</th>\n", " <th>love green</th>\n", " <th>quick blue</th>\n", " <th>quick brown</th>\n", " <th>sausages bacon</th>\n", " <th>sausages ham</th>\n", " <th>sky beautiful</th>\n", " <th>sky blue</th>\n", " <th>toast beans</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " 
<td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " 
<td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " bacon eggs beautiful sky ... sky blue toast beans\n", "0 0 0 ... 1 0\n", "1 0 1 ... 0 0\n", "2 0 0 ... 0 0\n", "3 1 0 ... 0 1\n", "4 0 0 ... 0 0\n", "5 0 0 ... 0 0\n", "6 0 0 ... 1 0\n", "7 0 0 ... 
0 0\n", "\n", "[8 rows x 29 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 55 } ] }, { "cell_type": "code", "metadata": { "id": "IDG7gNlA8lQs", "outputId": "1bb297ea-daef-4352-8160-4943840143d0", "colab": { "base_uri": "https://localhost:8080/", "height": 355 } }, "source": [ "cv1 = CountVectorizer(ngram_range=(1, 3))\n", "cv1_matrix = cv1.fit_transform(norm_corp).toarray()\n", "# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().\n", "pd.DataFrame(cv1_matrix, columns=cv1.get_feature_names_out())" ], "execution_count": 56, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>bacon</th>\n", " <th>bacon eggs</th>\n", " <th>bacon eggs toast</th>\n", " <th>beans</th>\n", " <th>beautiful</th>\n", " <th>beautiful sky</th>\n", " <th>beautiful today</th>\n", " <th>blue</th>\n", " <th>blue beautiful</th>\n", " <th>blue beautiful sky</th>\n", " <th>blue dog</th>\n", " <th>blue dog lazy</th>\n", " <th>blue sky</th>\n", " <th>blue sky beautiful</th>\n", " <th>breakfast</th>\n", " <th>breakfast sausages</th>\n", " <th>breakfast sausages ham</th>\n", " <th>brown</th>\n", " <th>brown fox</th>\n", " <th>brown fox jumps</th>\n", " <th>brown fox quick</th>\n", " <th>dog</th>\n", " <th>dog lazy</th>\n", " <th>dog lazy brown</th>\n", " <th>eggs</th>\n", " <th>eggs ham</th>\n", " <th>eggs ham sausages</th>\n", " <th>eggs toast</th>\n", " <th>eggs toast beans</th>\n", " <th>fox</th>\n", " <th>fox jumps</th>\n", " <th>fox jumps lazy</th>\n", " <th>fox quick</th>\n", " <th>fox quick blue</th>\n", " <th>green</th>\n", " <th>green eggs</th>\n", " <th>green eggs ham</th>\n", " <th>ham</th>\n", " <th>ham bacon</th>\n", " <th>ham bacon 
eggs</th>\n", " <th>ham sausages</th>\n", " <th>ham sausages bacon</th>\n", " <th>jumps</th>\n", " <th>jumps lazy</th>\n", " <th>jumps lazy dog</th>\n", " <th>king</th>\n", " <th>king breakfast</th>\n", " <th>king breakfast sausages</th>\n", " <th>lazy</th>\n", " <th>lazy brown</th>\n", " <th>lazy brown fox</th>\n", " <th>lazy dog</th>\n", " <th>love</th>\n", " <th>love blue</th>\n", " <th>love blue beautiful</th>\n", " <th>love green</th>\n", " <th>love green eggs</th>\n", " <th>quick</th>\n", " <th>quick blue</th>\n", " <th>quick blue dog</th>\n", " <th>quick brown</th>\n", " <th>quick brown fox</th>\n", " <th>sausages</th>\n", " <th>sausages bacon</th>\n", " <th>sausages ham</th>\n", " <th>sausages ham bacon</th>\n", " <th>sky</th>\n", " <th>sky beautiful</th>\n", " <th>sky beautiful today</th>\n", " <th>sky blue</th>\n", " <th>sky blue beautiful</th>\n", " <th>sky blue sky</th>\n", " <th>toast</th>\n", " <th>toast beans</th>\n", " <th>today</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " 
<td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", 
" <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " 
<td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", 
" <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " 
<td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " bacon bacon eggs bacon eggs toast ... toast toast beans today\n", "0 0 0 0 ... 0 0 0\n", "1 0 0 0 ... 0 0 0\n", "2 0 0 0 ... 0 0 0\n", "3 1 1 1 ... 1 1 0\n", "4 1 0 0 ... 0 0 0\n", "5 0 0 0 ... 0 0 0\n", "6 0 0 0 ... 0 0 1\n", "7 0 0 0 ... 
0 0 0\n", "\n", "[8 rows x 75 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 56 } ] }, { "cell_type": "markdown", "metadata": { "id": "YjrYlsTGAToA" }, "source": [ "# TF-IDF (Term Frequency and Inverse Document Frequency)" ] }, { "cell_type": "code", "metadata": { "id": "Yukd6D658wka" }, "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer" ], "execution_count": 58, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "YZrJJI6jCCCf" }, "source": [ "Creates a vector for each document based on the term frequency and inverse document frequency of each word. The words kept in the vocabulary can be restricted to those whose document frequency lies within the thresholds passed to `min_df` and `max_df`." ] }, { "cell_type": "code", "metadata": { "id": "ijOkvnWuAiOX" }, "source": [ "tfidf = TfidfVectorizer()" ], "execution_count": 59, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Sp1rBzzuAnbQ", "outputId": "5f69f2d7-7871-415d-b25a-b77d72208941", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "tf_matrix = tfidf.fit_transform(norm_corp).toarray()\n", "tf_matrix" ], "execution_count": 61, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0. , 0. , 0.6009782 , 0.52692542, 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0.6009782 , 0. , 0. ],\n", " [0. , 0. , 0.49316188, 0.43239428, 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0.57150495,\n", " 0. , 0. , 0.49316188, 0. , 0. ],\n", " [0. , 0. , 0. , 0. , 0. ,\n", " 0.38036238, 0.38036238, 0. , 0.38036238, 0. ,\n", " 0. , 0.52594895, 0. , 0.38036238, 0. ,\n", " 0.38036238, 0. , 0. , 0. , 0. ],\n", " [0.32116401, 0.38321492, 0. , 0. , 0.38321492,\n", " 0. , 0. , 0.32116401, 0. , 0. ,\n", " 0.32116401, 0. , 0.38321492, 0. , 0. ,\n", " 0. , 0.32116401, 0. , 0.38321492, 0. ],\n", " [0.39455357, 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0.39455357, 0. , 0.47078381,\n", " 0.39455357, 0. , 0. , 0. , 0.39455357,\n", " 0. , 0.39455357, 0. , 0. 
, 0. ],\n", " [0. , 0. , 0. , 0.3650479 , 0. ,\n", " 0.41635082, 0.41635082, 0. , 0.41635082, 0. ,\n", " 0. , 0. , 0. , 0.41635082, 0. ,\n", " 0.41635082, 0. , 0. , 0. , 0. ],\n", " [0. , 0. , 0.36082605, 0.31636491, 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0.72165209, 0. , 0.49893493],\n", " [0. , 0. , 0. , 0. , 0. ,\n", " 0.4472136 , 0.4472136 , 0. , 0.4472136 , 0. ,\n", " 0. , 0. , 0. , 0.4472136 , 0. ,\n", " 0.4472136 , 0. , 0. , 0. , 0. ]])" ] }, "metadata": { "tags": [] }, "execution_count": 61 } ] }, { "cell_type": "code", "metadata": { "id": "jlgzsyVUAu1X", "outputId": "0236bc5c-801a-47a0-8e4e-0aa603956049", "colab": { "base_uri": "https://localhost:8080/", "height": 320 } }, "source": [ "# Label each TF-IDF column with its vocabulary term.\n", "# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.\n", "pd.DataFrame(tf_matrix, columns=tfidf.get_feature_names_out())" ], "execution_count": 63, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>bacon</th>\n", " <th>beans</th>\n", " <th>beautiful</th>\n", " <th>blue</th>\n", " <th>breakfast</th>\n", " <th>brown</th>\n", " <th>dog</th>\n", " <th>eggs</th>\n", " <th>fox</th>\n", " <th>green</th>\n", " <th>ham</th>\n", " <th>jumps</th>\n", " <th>king</th>\n", " <th>lazy</th>\n", " <th>love</th>\n", " <th>quick</th>\n", " <th>sausages</th>\n", " <th>sky</th>\n", " <th>toast</th>\n", " <th>today</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.600978</td>\n", " <td>0.526925</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " 
<td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.600978</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.493162</td>\n", " <td>0.432394</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.571505</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.493162</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.380362</td>\n", " <td>0.380362</td>\n", " <td>0.000000</td>\n", " <td>0.380362</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.525949</td>\n", " <td>0.000000</td>\n", " <td>0.380362</td>\n", " <td>0.000000</td>\n", " <td>0.380362</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0.321164</td>\n", " <td>0.383215</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.383215</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.321164</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.321164</td>\n", " <td>0.000000</td>\n", " <td>0.383215</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.321164</td>\n", " <td>0.000000</td>\n", " <td>0.383215</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0.394554</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " 
<td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.394554</td>\n", " <td>0.000000</td>\n", " <td>0.470784</td>\n", " <td>0.394554</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.394554</td>\n", " <td>0.000000</td>\n", " <td>0.394554</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.365048</td>\n", " <td>0.000000</td>\n", " <td>0.416351</td>\n", " <td>0.416351</td>\n", " <td>0.000000</td>\n", " <td>0.416351</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.416351</td>\n", " <td>0.000000</td>\n", " <td>0.416351</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.360826</td>\n", " <td>0.316365</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.721652</td>\n", " <td>0.000000</td>\n", " <td>0.498935</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.447214</td>\n", " <td>0.447214</td>\n", " <td>0.000000</td>\n", " <td>0.447214</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.447214</td>\n", " <td>0.000000</td>\n", " <td>0.447214</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " bacon 
beans beautiful ... sky toast today\n", "0 0.000000 0.000000 0.600978 ... 0.600978 0.000000 0.000000\n", "1 0.000000 0.000000 0.493162 ... 0.493162 0.000000 0.000000\n", "2 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000\n", "3 0.321164 0.383215 0.000000 ... 0.000000 0.383215 0.000000\n", "4 0.394554 0.000000 0.000000 ... 0.000000 0.000000 0.000000\n", "5 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000\n", "6 0.000000 0.000000 0.360826 ... 0.721652 0.000000 0.498935\n", "7 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000\n", "\n", "[8 rows x 20 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 63 } ] }, { "cell_type": "code", "metadata": { "id": "LnISj957A8BE", "outputId": "94bfb1b2-9140-4df7-b0ef-cdc7be9c1aed", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "# Compare the vocabulary sizes of `tfidf` and `cv` (`cv` is fitted earlier in the notebook).\n", "# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.\n", "len(tfidf.get_feature_names_out()), len(cv.get_feature_names_out())" ], "execution_count": 67, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(20, 29)" ] }, "metadata": { "tags": [] }, "execution_count": 67 } ] }, { "cell_type": "code", "metadata": { "id": "Uuvqfu7CBLqE" }, "source": [ "" ], "execution_count": null, "outputs": [] } ] }