{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "feature_engg_text.ipynb", "provenance": [], "collapsed_sections": [], "authorship_tag": "ABX9TyNdJbebN9/8/+c0SzmsW5kt", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "metadata": { "id": "BA5Tgfw-z5fI" }, "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "import nltk\n", "import numpy as np\n", "import pandas as pd\n", "import re\n", "\n", "nltk.download('stopwords')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "-Aqfz4r10G-0" }, "source": [ "corpus = ['The sky is blue and beautiful.',\n", " 'Love this blue and beautiful sky!',\n", " 'The quick brown fox jumps over the lazy dog.',\n", " \"A king's breakfast has sausages, ham, bacon, eggs, toast and beans\",\n", " 'I love green eggs, ham, sausages and bacon!',\n", " 'The brown fox is quick and the blue dog is lazy!',\n", " 'The sky is very blue and the sky is very beautiful today',\n", " 'The dog is lazy but the brown fox is quick!' \n", "]\n", "\n", "labels = ['weather', 'weather', 'animals', 'food', 'food', 'animals', 'weather', 'animals']" ], "execution_count": 3, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Hy6Vzhvk0Q_m" }, "source": [ "corpus = np.array(corpus)\n", "df = pd.DataFrame({\"text\": corpus, \"label\": labels})" ], "execution_count": 8, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "PTR8R6Zp0XRZ", "outputId": "ecc4f688-66eb-4ba7-ff7f-2756740aff92", "colab": { "base_uri": "https://localhost:8080/", "height": 206 } }, "source": [ "df.head()" ], "execution_count": 9, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textlabel
0The sky is blue and beautiful.weather
1Love this blue and beautiful sky!weather
2The quick brown fox jumps over the lazy dog.animals
3A king's breakfast has sausages, ham, bacon, e...food
4I love green eggs, ham, sausages and bacon!food
\n", "
" ], "text/plain": [ " text label\n", "0 The sky is blue and beautiful. weather\n", "1 Love this blue and beautiful sky! weather\n", "2 The quick brown fox jumps over the lazy dog. animals\n", "3 A king's breakfast has sausages, ham, bacon, e... food\n", "4 I love green eggs, ham, sausages and bacon! food" ] }, "metadata": { "tags": [] }, "execution_count": 9 } ] }, { "cell_type": "code", "metadata": { "id": "XnkiPOyT0s1e", "outputId": "9beae70f-a549-40be-c438-612e666f33e2", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "wpt = nltk.WordPunctTokenizer()\n", "stop_words = nltk.corpus.stopwords.words('english')\n", "wpt.tokenize(corpus[0])" ], "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['The', 'sky', 'is', 'blue', 'and', 'beautiful', '.']" ] }, "metadata": { "tags": [] }, "execution_count": 16 } ] }, { "cell_type": "code", "metadata": { "id": "oFzkUsQS10bS" }, "source": [ "def preprocess(doc):\n", " doc = doc.lower().strip()\n", " tokens = wpt.tokenize(doc)\n", " tok = [token for token in tokens if token not in stop_words]\n", "\n", " doc = ' '.join(tok)\n", " return doc" ], "execution_count": 19, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ayitk8hh24BT" }, "source": [ "normalize_corpus = np.vectorize(preprocess)" ], "execution_count": 26, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "hp5zRLYh27DK" }, "source": [ "norm_corp = normalize_corpus(corpus)" ], "execution_count": 28, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "vpU9XPJw3qyx", "outputId": "369df59e-7463-4ecb-c9b2-e5ac9aba0847", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "norm_corp" ], "execution_count": 30, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['sky blue beautiful .', 'love blue beautiful sky !',\n", " 'quick brown fox jumps lazy dog .',\n", " \"king ' breakfast sausages , ham , bacon , eggs , toast beans\",\n", " 'love 
green eggs , ham , sausages bacon !',\n", " 'brown fox quick blue dog lazy !', 'sky blue sky beautiful today',\n", " 'dog lazy brown fox quick !'], dtype='\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
baconbeansbeautifulbluebreakfastbrowndogeggsfoxgreenhamjumpskinglazylovequicksausagesskytoasttoday
000110000000000000100
100110000000000100100
200000110100101010000
311001001001010001010
410000001011000101000
500010110100001010000
600110000000000000201
700000110100001010000
\n", "" ], "text/plain": [ " bacon beans beautiful blue breakfast ... quick sausages sky toast today\n", "0 0 0 1 1 0 ... 0 0 1 0 0\n", "1 0 0 1 1 0 ... 0 0 1 0 0\n", "2 0 0 0 0 0 ... 1 0 0 0 0\n", "3 1 1 0 0 1 ... 0 1 0 1 0\n", "4 1 0 0 0 0 ... 0 1 0 0 0\n", "5 0 0 0 1 0 ... 1 0 0 0 0\n", "6 0 0 1 1 0 ... 0 0 2 0 1\n", "7 0 0 0 0 0 ... 1 0 0 0 0\n", "\n", "[8 rows x 20 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 46 } ] }, { "cell_type": "code", "metadata": { "id": "1uG4fGkt50fJ", "outputId": "cfe664c6-7013-47e2-edc4-466eb1311b1b", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "cv.transform(['sky is good and beautiful beautiful today']).toarray()" ], "execution_count": 52, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]])" ] }, "metadata": { "tags": [] }, "execution_count": 52 } ] }, { "cell_type": "markdown", "metadata": { "id": "-0tPfNKv8F0Y" }, "source": [ "ngram_range --> if set to (1, 2)\n", "\n", " --> creates uni-grams and bi-grams\n", "\n", " --> if set to (2, 2) --> creates only bi-grams\n", "\n", " --> if set to (1, 3) --> creates uni-grams, bi-grams and tri-grams" ] }, { "cell_type": "code", "metadata": { "id": "O_0n3ddd6SkV" }, "source": [ "cv = CountVectorizer(ngram_range=(2, 2))" ], "execution_count": 53, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "bMlvArrS8a9m" }, "source": [ "cv_matrix = cv.fit_transform(norm_corp).toarray()" ], "execution_count": 65, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "I47IQd1Y8geI", "outputId": "1eec935c-b332-4f0d-cd26-a55a8d5c9ff8", "colab": { "base_uri": "https://localhost:8080/", "height": 338 } }, "source": [ "pd.DataFrame(cv_matrix, columns=cv.get_feature_names())" ], "execution_count": 55, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bacon eggsbeautiful skybeautiful todayblue beautifulblue dogblue skybreakfast sausagesbrown foxdog lazyeggs hameggs toastfox jumpsfox quickgreen eggsham baconham sausagesjumps lazyking breakfastlazy brownlazy doglove bluelove greenquick bluequick brownsausages baconsausages hamsky beautifulsky bluetoast beans
000010000000000000000000000010
101010000000000000000100000000
200000001000100001001000100000
310000010001000100100000001001
400000000010001010000010010000
500001001100010000000001000000
600100100000000000000000000110
700000001100010000010000000000
\n", "
" ], "text/plain": [ " bacon eggs beautiful sky ... sky blue toast beans\n", "0 0 0 ... 1 0\n", "1 0 1 ... 0 0\n", "2 0 0 ... 0 0\n", "3 1 0 ... 0 1\n", "4 0 0 ... 0 0\n", "5 0 0 ... 0 0\n", "6 0 0 ... 1 0\n", "7 0 0 ... 0 0\n", "\n", "[8 rows x 29 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 55 } ] }, { "cell_type": "code", "metadata": { "id": "IDG7gNlA8lQs", "outputId": "1bb297ea-daef-4352-8160-4943840143d0", "colab": { "base_uri": "https://localhost:8080/", "height": 355 } }, "source": [ "cv1 = CountVectorizer(ngram_range=(1, 3))\n", "cv1_matrix = cv1.fit_transform(norm_corp).toarray()\n", "pd.DataFrame(cv1_matrix, columns=cv1.get_feature_names())" ], "execution_count": 56, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
baconbacon eggsbacon eggs toastbeansbeautifulbeautiful skybeautiful todayblueblue beautifulblue beautiful skyblue dogblue dog lazyblue skyblue sky beautifulbreakfastbreakfast sausagesbreakfast sausages hambrownbrown foxbrown fox jumpsbrown fox quickdogdog lazydog lazy browneggseggs hameggs ham sausageseggs toasteggs toast beansfoxfox jumpsfox jumps lazyfox quickfox quick bluegreengreen eggsgreen eggs hamhamham baconham bacon eggsham sausagesham sausages baconjumpsjumps lazyjumps lazy dogkingking breakfastking breakfast sausageslazylazy brownlazy brown foxlazy doglovelove bluelove blue beautifullove greenlove green eggsquickquick bluequick blue dogquick brownquick brown foxsausagessausages baconsausages hamsausages ham baconskysky beautifulsky beautiful todaysky bluesky blue beautifulsky blue skytoasttoast beanstoday
0000010011000000000000000000000000000000000000000000000000000000000100110000
1000011011100000000000000000000000000000000000000000011100000000000100000000
2000000000000000001110100000001110000000000111000100100000100110000000000000
3111100000000001110000000100110000000011100000111000000000000001011000000110
4100000000000000000000000111000000011110011000000000010011000001100000000000
5000000010011000001101110000001001100000000000000100000000111000000000000000
6000010110000110000000000000000000000000000000000000000000000000000211101001
7000000000000000001101111000001001000000000000000111000000100000000000000000
\n", "
" ], "text/plain": [ " bacon bacon eggs bacon eggs toast ... toast toast beans today\n", "0 0 0 0 ... 0 0 0\n", "1 0 0 0 ... 0 0 0\n", "2 0 0 0 ... 0 0 0\n", "3 1 1 1 ... 1 1 0\n", "4 1 0 0 ... 0 0 0\n", "5 0 0 0 ... 0 0 0\n", "6 0 0 0 ... 0 0 1\n", "7 0 0 0 ... 0 0 0\n", "\n", "[8 rows x 75 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 56 } ] }, { "cell_type": "markdown", "metadata": { "id": "YjrYlsTGAToA" }, "source": [ "# Tfidf(Term frequency and Inverse document frequency)" ] }, { "cell_type": "code", "metadata": { "id": "Yukd6D658wka" }, "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer" ], "execution_count": 58, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "YZrJJI6jCCCf" }, "source": [ "Creates vector based on the frequency and inverse document frequency value of ech words and displays the words based on the value or threshold frequency passed to the min_df and max_df value" ] }, { "cell_type": "code", "metadata": { "id": "ijOkvnWuAiOX" }, "source": [ "tfidf = TfidfVectorizer()" ], "execution_count": 59, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Sp1rBzzuAnbQ", "outputId": "5f69f2d7-7871-415d-b25a-b77d72208941", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "tf_matrix = tfidf.fit_transform(norm_corp).toarray()\n", "tf_matrix" ], "execution_count": 61, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0. , 0. , 0.6009782 , 0.52692542, 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0.6009782 , 0. , 0. ],\n", " [0. , 0. , 0.49316188, 0.43239428, 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0.57150495,\n", " 0. , 0. , 0.49316188, 0. , 0. ],\n", " [0. , 0. , 0. , 0. , 0. ,\n", " 0.38036238, 0.38036238, 0. , 0.38036238, 0. ,\n", " 0. , 0.52594895, 0. , 0.38036238, 0. ,\n", " 0.38036238, 0. , 0. , 0. , 0. ],\n", " [0.32116401, 0.38321492, 0. , 0. , 0.38321492,\n", " 0. , 0. 
, 0.32116401, 0. , 0. ,\n", " 0.32116401, 0. , 0.38321492, 0. , 0. ,\n", " 0. , 0.32116401, 0. , 0.38321492, 0. ],\n", " [0.39455357, 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0.39455357, 0. , 0.47078381,\n", " 0.39455357, 0. , 0. , 0. , 0.39455357,\n", " 0. , 0.39455357, 0. , 0. , 0. ],\n", " [0. , 0. , 0. , 0.3650479 , 0. ,\n", " 0.41635082, 0.41635082, 0. , 0.41635082, 0. ,\n", " 0. , 0. , 0. , 0.41635082, 0. ,\n", " 0.41635082, 0. , 0. , 0. , 0. ],\n", " [0. , 0. , 0.36082605, 0.31636491, 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0.72165209, 0. , 0.49893493],\n", " [0. , 0. , 0. , 0. , 0. ,\n", " 0.4472136 , 0.4472136 , 0. , 0.4472136 , 0. ,\n", " 0. , 0. , 0. , 0.4472136 , 0. ,\n", " 0.4472136 , 0. , 0. , 0. , 0. ]])" ] }, "metadata": { "tags": [] }, "execution_count": 61 } ] }, { "cell_type": "code", "metadata": { "id": "jlgzsyVUAu1X", "outputId": "0236bc5c-801a-47a0-8e4e-0aa603956049", "colab": { "base_uri": "https://localhost:8080/", "height": 320 } }, "source": [ "pd.DataFrame(tf_matrix, columns=tfidf.get_feature_names())" ], "execution_count": 63, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
baconbeansbeautifulbluebreakfastbrowndogeggsfoxgreenhamjumpskinglazylovequicksausagesskytoasttoday
00.0000000.0000000.6009780.5269250.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.6009780.0000000.000000
10.0000000.0000000.4931620.4323940.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.5715050.0000000.0000000.4931620.0000000.000000
20.0000000.0000000.0000000.0000000.0000000.3803620.3803620.0000000.3803620.0000000.0000000.5259490.0000000.3803620.0000000.3803620.0000000.0000000.0000000.000000
30.3211640.3832150.0000000.0000000.3832150.0000000.0000000.3211640.0000000.0000000.3211640.0000000.3832150.0000000.0000000.0000000.3211640.0000000.3832150.000000
40.3945540.0000000.0000000.0000000.0000000.0000000.0000000.3945540.0000000.4707840.3945540.0000000.0000000.0000000.3945540.0000000.3945540.0000000.0000000.000000
50.0000000.0000000.0000000.3650480.0000000.4163510.4163510.0000000.4163510.0000000.0000000.0000000.0000000.4163510.0000000.4163510.0000000.0000000.0000000.000000
60.0000000.0000000.3608260.3163650.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.7216520.0000000.498935
70.0000000.0000000.0000000.0000000.0000000.4472140.4472140.0000000.4472140.0000000.0000000.0000000.0000000.4472140.0000000.4472140.0000000.0000000.0000000.000000
\n", "
" ], "text/plain": [ " bacon beans beautiful ... sky toast today\n", "0 0.000000 0.000000 0.600978 ... 0.600978 0.000000 0.000000\n", "1 0.000000 0.000000 0.493162 ... 0.493162 0.000000 0.000000\n", "2 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000\n", "3 0.321164 0.383215 0.000000 ... 0.000000 0.383215 0.000000\n", "4 0.394554 0.000000 0.000000 ... 0.000000 0.000000 0.000000\n", "5 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000\n", "6 0.000000 0.000000 0.360826 ... 0.721652 0.000000 0.498935\n", "7 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000\n", "\n", "[8 rows x 20 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 63 } ] }, { "cell_type": "code", "metadata": { "id": "LnISj957A8BE", "outputId": "94bfb1b2-9140-4df7-b0ef-cdc7be9c1aed", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "len(tfidf.get_feature_names()), len(cv.get_feature_names())" ], "execution_count": 67, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(20, 29)" ] }, "metadata": { "tags": [] }, "execution_count": 67 } ] }, { "cell_type": "code", "metadata": { "id": "Uuvqfu7CBLqE" }, "source": [ "" ], "execution_count": null, "outputs": [] } ] }