{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "feature_engg_text.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyNdJbebN9/8/+c0SzmsW5kt",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"
"
]
},
{
"cell_type": "code",
"metadata": {
"id": "BA5Tgfw-z5fI"
},
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"import nltk\n",
"import numpy as np\n",
"import pandas as pd\n",
"import re\n",
"\n",
"nltk.download('stopwords')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "-Aqfz4r10G-0"
},
"source": [
"corpus = ['The sky is blue and beautiful.',\n",
" 'Love this blue and beautiful sky!',\n",
" 'The quick brown fox jumps over the lazy dog.',\n",
" \"A king's breakfast has sausages, ham, bacon, eggs, toast and beans\",\n",
" 'I love green eggs, ham, sausages and bacon!',\n",
" 'The brown fox is quick and the blue dog is lazy!',\n",
" 'The sky is very blue and the sky is very beautiful today',\n",
" 'The dog is lazy but the brown fox is quick!' \n",
"]\n",
"\n",
"labels = ['weather', 'weather', 'animals', 'food', 'food', 'animals', 'weather', 'animals']"
],
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Hy6Vzhvk0Q_m"
},
"source": [
"corpus = np.array(corpus)\n",
"df = pd.DataFrame({\"text\": corpus, \"label\": labels})"
],
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "PTR8R6Zp0XRZ",
"outputId": "ecc4f688-66eb-4ba7-ff7f-2756740aff92",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
}
},
"source": [
"df.head()"
],
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" The sky is blue and beautiful. | \n",
" weather | \n",
"
\n",
" \n",
" | 1 | \n",
" Love this blue and beautiful sky! | \n",
" weather | \n",
"
\n",
" \n",
" | 2 | \n",
" The quick brown fox jumps over the lazy dog. | \n",
" animals | \n",
"
\n",
" \n",
" | 3 | \n",
" A king's breakfast has sausages, ham, bacon, e... | \n",
" food | \n",
"
\n",
" \n",
" | 4 | \n",
" I love green eggs, ham, sausages and bacon! | \n",
" food | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text label\n",
"0 The sky is blue and beautiful. weather\n",
"1 Love this blue and beautiful sky! weather\n",
"2 The quick brown fox jumps over the lazy dog. animals\n",
"3 A king's breakfast has sausages, ham, bacon, e... food\n",
"4 I love green eggs, ham, sausages and bacon! food"
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "XnkiPOyT0s1e",
"outputId": "9beae70f-a549-40be-c438-612e666f33e2",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"wpt = nltk.WordPunctTokenizer()\n",
"stop_words = nltk.corpus.stopwords.words('english')\n",
"wpt.tokenize(corpus[0])"
],
"execution_count": 16,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['The', 'sky', 'is', 'blue', 'and', 'beautiful', '.']"
]
},
"metadata": {
"tags": []
},
"execution_count": 16
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "oFzkUsQS10bS"
},
"source": [
"def preprocess(doc):\n",
"    \"\"\"Normalize one document for bag-of-words feature extraction.\n",
"\n",
"    Lower-cases and strips the text, tokenizes it with the notebook-level\n",
"    tokenizer `wpt`, drops tokens found in `stop_words` as well as\n",
"    punctuation-only tokens, and re-joins what is left with single spaces.\n",
"\n",
"    Args:\n",
"        doc: Raw document text.\n",
"\n",
"    Returns:\n",
"        The normalized document as one space-separated string.\n",
"    \"\"\"\n",
"    doc = doc.lower().strip()\n",
"    tokens = wpt.tokenize(doc)\n",
"    # The tokenizer emits punctuation ('.', '!', ',') as separate tokens;\n",
"    # keep only tokens that contain at least one alphanumeric character.\n",
"    tok = [\n",
"        token for token in tokens\n",
"        if token not in stop_words and any(ch.isalnum() for ch in token)\n",
"    ]\n",
"    return ' '.join(tok)"
],
"execution_count": 19,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ayitk8hh24BT"
},
"source": [
"normalize_corpus = np.vectorize(preprocess)"
],
"execution_count": 26,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "hp5zRLYh27DK"
},
"source": [
"norm_corp = normalize_corpus(corpus)"
],
"execution_count": 28,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "vpU9XPJw3qyx",
"outputId": "369df59e-7463-4ecb-c9b2-e5ac9aba0847",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"norm_corp"
],
"execution_count": 30,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array(['sky blue beautiful .', 'love blue beautiful sky !',\n",
" 'quick brown fox jumps lazy dog .',\n",
" \"king ' breakfast sausages , ham , bacon , eggs , toast beans\",\n",
" 'love green eggs , ham , sausages bacon !',\n",
" 'brown fox quick blue dog lazy !', 'sky blue sky beautiful today',\n",
" 'dog lazy brown fox quick !'], dtype='\n",
"\n",
"\n",
" \n",
" \n",
" | \n",
" bacon | \n",
" beans | \n",
" beautiful | \n",
" blue | \n",
" breakfast | \n",
" brown | \n",
" dog | \n",
" eggs | \n",
" fox | \n",
" green | \n",
" ham | \n",
" jumps | \n",
" king | \n",
" lazy | \n",
" love | \n",
" quick | \n",
" sausages | \n",
" sky | \n",
" toast | \n",
" today | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 5 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 6 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 7 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
" bacon beans beautiful blue breakfast ... quick sausages sky toast today\n",
"0 0 0 1 1 0 ... 0 0 1 0 0\n",
"1 0 0 1 1 0 ... 0 0 1 0 0\n",
"2 0 0 0 0 0 ... 1 0 0 0 0\n",
"3 1 1 0 0 1 ... 0 1 0 1 0\n",
"4 1 0 0 0 0 ... 0 1 0 0 0\n",
"5 0 0 0 1 0 ... 1 0 0 0 0\n",
"6 0 0 1 1 0 ... 0 0 2 0 1\n",
"7 0 0 0 0 0 ... 1 0 0 0 0\n",
"\n",
"[8 rows x 20 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 46
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "1uG4fGkt50fJ",
"outputId": "cfe664c6-7013-47e2-edc4-466eb1311b1b",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"cv.transform(['sky is good and beautiful beautiful today']).toarray()"
],
"execution_count": 52,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]])"
]
},
"metadata": {
"tags": []
},
"execution_count": 52
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-0tPfNKv8F0Y"
},
"source": [
"ngram_range --> if set to (1, 2)\n",
"\n",
" --> creates uni-gram and bi-gram\n",
"\n",
" --> if set to (2, 2) --> creates only bi-gram\n",
"\n",
" --> if set to (1, 3) --> creates uni-gram, bi-gram and tri-gram"
]
},
{
"cell_type": "code",
"metadata": {
"id": "O_0n3ddd6SkV"
},
"source": [
"cv = CountVectorizer(ngram_range=(2, 2))"
],
"execution_count": 53,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "bMlvArrS8a9m"
},
"source": [
"cv_matrix = cv.fit_transform(norm_corp).toarray()"
],
"execution_count": 65,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "I47IQd1Y8geI",
"outputId": "1eec935c-b332-4f0d-cd26-a55a8d5c9ff8",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 338
}
},
"source": [
"pd.DataFrame(cv_matrix, columns=cv.get_feature_names())"
],
"execution_count": 55,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" bacon eggs | \n",
" beautiful sky | \n",
" beautiful today | \n",
" blue beautiful | \n",
" blue dog | \n",
" blue sky | \n",
" breakfast sausages | \n",
" brown fox | \n",
" dog lazy | \n",
" eggs ham | \n",
" eggs toast | \n",
" fox jumps | \n",
" fox quick | \n",
" green eggs | \n",
" ham bacon | \n",
" ham sausages | \n",
" jumps lazy | \n",
" king breakfast | \n",
" lazy brown | \n",
" lazy dog | \n",
" love blue | \n",
" love green | \n",
" quick blue | \n",
" quick brown | \n",
" sausages bacon | \n",
" sausages ham | \n",
" sky beautiful | \n",
" sky blue | \n",
" toast beans | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 5 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 6 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 7 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bacon eggs beautiful sky ... sky blue toast beans\n",
"0 0 0 ... 1 0\n",
"1 0 1 ... 0 0\n",
"2 0 0 ... 0 0\n",
"3 1 0 ... 0 1\n",
"4 0 0 ... 0 0\n",
"5 0 0 ... 0 0\n",
"6 0 0 ... 1 0\n",
"7 0 0 ... 0 0\n",
"\n",
"[8 rows x 29 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 55
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "IDG7gNlA8lQs",
"outputId": "1bb297ea-daef-4352-8160-4943840143d0",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 355
}
},
"source": [
"cv1 = CountVectorizer(ngram_range=(1, 3))\n",
"cv1_matrix = cv1.fit_transform(norm_corp).toarray()\n",
"pd.DataFrame(cv1_matrix, columns=cv1.get_feature_names())"
],
"execution_count": 56,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" bacon | \n",
" bacon eggs | \n",
" bacon eggs toast | \n",
" beans | \n",
" beautiful | \n",
" beautiful sky | \n",
" beautiful today | \n",
" blue | \n",
" blue beautiful | \n",
" blue beautiful sky | \n",
" blue dog | \n",
" blue dog lazy | \n",
" blue sky | \n",
" blue sky beautiful | \n",
" breakfast | \n",
" breakfast sausages | \n",
" breakfast sausages ham | \n",
" brown | \n",
" brown fox | \n",
" brown fox jumps | \n",
" brown fox quick | \n",
" dog | \n",
" dog lazy | \n",
" dog lazy brown | \n",
" eggs | \n",
" eggs ham | \n",
" eggs ham sausages | \n",
" eggs toast | \n",
" eggs toast beans | \n",
" fox | \n",
" fox jumps | \n",
" fox jumps lazy | \n",
" fox quick | \n",
" fox quick blue | \n",
" green | \n",
" green eggs | \n",
" green eggs ham | \n",
" ham | \n",
" ham bacon | \n",
" ham bacon eggs | \n",
" ham sausages | \n",
" ham sausages bacon | \n",
" jumps | \n",
" jumps lazy | \n",
" jumps lazy dog | \n",
" king | \n",
" king breakfast | \n",
" king breakfast sausages | \n",
" lazy | \n",
" lazy brown | \n",
" lazy brown fox | \n",
" lazy dog | \n",
" love | \n",
" love blue | \n",
" love blue beautiful | \n",
" love green | \n",
" love green eggs | \n",
" quick | \n",
" quick blue | \n",
" quick blue dog | \n",
" quick brown | \n",
" quick brown fox | \n",
" sausages | \n",
" sausages bacon | \n",
" sausages ham | \n",
" sausages ham bacon | \n",
" sky | \n",
" sky beautiful | \n",
" sky beautiful today | \n",
" sky blue | \n",
" sky blue beautiful | \n",
" sky blue sky | \n",
" toast | \n",
" toast beans | \n",
" today | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 5 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 6 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 7 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bacon bacon eggs bacon eggs toast ... toast toast beans today\n",
"0 0 0 0 ... 0 0 0\n",
"1 0 0 0 ... 0 0 0\n",
"2 0 0 0 ... 0 0 0\n",
"3 1 1 1 ... 1 1 0\n",
"4 1 0 0 ... 0 0 0\n",
"5 0 0 0 ... 0 0 0\n",
"6 0 0 0 ... 0 0 1\n",
"7 0 0 0 ... 0 0 0\n",
"\n",
"[8 rows x 75 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 56
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YjrYlsTGAToA"
},
"source": [
"# TF-IDF (Term Frequency - Inverse Document Frequency)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Yukd6D658wka"
},
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer"
],
"execution_count": 58,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "YZrJJI6jCCCf"
},
"source": [
"Creates a vector for each document based on the term frequency and inverse document frequency of each word. Only words whose document frequency falls within the thresholds passed as `min_df` and `max_df` are kept in the vocabulary."
]
},
{
"cell_type": "code",
"metadata": {
"id": "ijOkvnWuAiOX"
},
"source": [
"tfidf = TfidfVectorizer()"
],
"execution_count": 59,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Sp1rBzzuAnbQ",
"outputId": "5f69f2d7-7871-415d-b25a-b77d72208941",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"tf_matrix = tfidf.fit_transform(norm_corp).toarray()\n",
"tf_matrix"
],
"execution_count": 61,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[0. , 0. , 0.6009782 , 0.52692542, 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0.6009782 , 0. , 0. ],\n",
" [0. , 0. , 0.49316188, 0.43239428, 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0.57150495,\n",
" 0. , 0. , 0.49316188, 0. , 0. ],\n",
" [0. , 0. , 0. , 0. , 0. ,\n",
" 0.38036238, 0.38036238, 0. , 0.38036238, 0. ,\n",
" 0. , 0.52594895, 0. , 0.38036238, 0. ,\n",
" 0.38036238, 0. , 0. , 0. , 0. ],\n",
" [0.32116401, 0.38321492, 0. , 0. , 0.38321492,\n",
" 0. , 0. , 0.32116401, 0. , 0. ,\n",
" 0.32116401, 0. , 0.38321492, 0. , 0. ,\n",
" 0. , 0.32116401, 0. , 0.38321492, 0. ],\n",
" [0.39455357, 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0.39455357, 0. , 0.47078381,\n",
" 0.39455357, 0. , 0. , 0. , 0.39455357,\n",
" 0. , 0.39455357, 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , 0.3650479 , 0. ,\n",
" 0.41635082, 0.41635082, 0. , 0.41635082, 0. ,\n",
" 0. , 0. , 0. , 0.41635082, 0. ,\n",
" 0.41635082, 0. , 0. , 0. , 0. ],\n",
" [0. , 0. , 0.36082605, 0.31636491, 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0.72165209, 0. , 0.49893493],\n",
" [0. , 0. , 0. , 0. , 0. ,\n",
" 0.4472136 , 0.4472136 , 0. , 0.4472136 , 0. ,\n",
" 0. , 0. , 0. , 0.4472136 , 0. ,\n",
" 0.4472136 , 0. , 0. , 0. , 0. ]])"
]
},
"metadata": {
"tags": []
},
"execution_count": 61
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "jlgzsyVUAu1X",
"outputId": "0236bc5c-801a-47a0-8e4e-0aa603956049",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 320
}
},
"source": [
"pd.DataFrame(tf_matrix, columns=tfidf.get_feature_names())"
],
"execution_count": 63,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" bacon | \n",
" beans | \n",
" beautiful | \n",
" blue | \n",
" breakfast | \n",
" brown | \n",
" dog | \n",
" eggs | \n",
" fox | \n",
" green | \n",
" ham | \n",
" jumps | \n",
" king | \n",
" lazy | \n",
" love | \n",
" quick | \n",
" sausages | \n",
" sky | \n",
" toast | \n",
" today | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.600978 | \n",
" 0.526925 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.600978 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.493162 | \n",
" 0.432394 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.571505 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.493162 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.380362 | \n",
" 0.380362 | \n",
" 0.000000 | \n",
" 0.380362 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.525949 | \n",
" 0.000000 | \n",
" 0.380362 | \n",
" 0.000000 | \n",
" 0.380362 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 3 | \n",
" 0.321164 | \n",
" 0.383215 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.383215 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.321164 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.321164 | \n",
" 0.000000 | \n",
" 0.383215 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.321164 | \n",
" 0.000000 | \n",
" 0.383215 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0.394554 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.394554 | \n",
" 0.000000 | \n",
" 0.470784 | \n",
" 0.394554 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.394554 | \n",
" 0.000000 | \n",
" 0.394554 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 5 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.365048 | \n",
" 0.000000 | \n",
" 0.416351 | \n",
" 0.416351 | \n",
" 0.000000 | \n",
" 0.416351 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.416351 | \n",
" 0.000000 | \n",
" 0.416351 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 6 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.360826 | \n",
" 0.316365 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.721652 | \n",
" 0.000000 | \n",
" 0.498935 | \n",
"
\n",
" \n",
" | 7 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.447214 | \n",
" 0.447214 | \n",
" 0.000000 | \n",
" 0.447214 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.447214 | \n",
" 0.000000 | \n",
" 0.447214 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bacon beans beautiful ... sky toast today\n",
"0 0.000000 0.000000 0.600978 ... 0.600978 0.000000 0.000000\n",
"1 0.000000 0.000000 0.493162 ... 0.493162 0.000000 0.000000\n",
"2 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000\n",
"3 0.321164 0.383215 0.000000 ... 0.000000 0.383215 0.000000\n",
"4 0.394554 0.000000 0.000000 ... 0.000000 0.000000 0.000000\n",
"5 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000\n",
"6 0.000000 0.000000 0.360826 ... 0.721652 0.000000 0.498935\n",
"7 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000\n",
"\n",
"[8 rows x 20 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 63
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "LnISj957A8BE",
"outputId": "94bfb1b2-9140-4df7-b0ef-cdc7be9c1aed",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"len(tfidf.get_feature_names()), len(cv.get_feature_names())"
],
"execution_count": 67,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(20, 29)"
]
},
"metadata": {
"tags": []
},
"execution_count": 67
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Uuvqfu7CBLqE"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}