{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Raw Data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"X_raw = ['That priduct is poor product!!...', 'I loving this product.', 'That is brilliant!!@#']\n",
"y = ['negative', 'positive', 'positive']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clean Data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /Users/uzaycetin/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"# Importing the libraries\n",
"import numpy as np\n",
"import pandas as pd\n",
"import re\n",
"import pickle \n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from sklearn.datasets import load_files\n",
"nltk.download('stopwords')\n",
"\n",
"from nltk.stem import WordNetLemmatizer\n",
"lema = WordNetLemmatizer()\n",
"\n",
"# stopwordsleri sil\n",
"from nltk.corpus import stopwords\n",
"stop = stopwords.words('english')\n",
"\n",
"from textblob import TextBlob\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def preprocessing(text):\n",
" text = text.lower()\n",
" # get rid of non-alphanumerical characters\n",
" text = re.sub(r'\\W', ' ', text) \n",
" # get rid of spaces\n",
" text = re.sub(r'\\s+', ' ', text) \n",
" # Correct mistakes \n",
" # and do the stemming\n",
" return \" \".join([lema.lemmatize(str(TextBlob(word).correct())) \n",
" for word in text.split() if word not in stop])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['product poor product', 'loving product', 'brilliant']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X = [preprocessing(x) for x in X_raw]\n",
"X"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Term Frequency\n",
"\n",
"$$\n",
"tf(w,d) = \\frac{count_d (w)}{|d|}\n",
"$$"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'brilliant', 'loving', 'poor', 'product'}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"words = {w for d in X for w in d.split()}\n",
"words"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def term_frequency(d):\n",
" tf = {w:0 for w in words}\n",
" for w in d.split():\n",
" if w in tf:\n",
" tf[w] += 1\n",
" return pd.Series(tf)"
]
},
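{
"cell_type": "markdown",
"metadata": {},
"source": [
"The formula above divides by the document length $|d|$, while the table we build next keeps raw counts. The cell below is a minimal sketch of the normalized variant, reusing `term_frequency` (the helper name `term_frequency_normalized` is ours, just for illustration)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def term_frequency_normalized(d):\n",
"    # tf(w, d) = count_d(w) / |d|, guarding against empty documents\n",
"    tokens = d.split()\n",
"    return term_frequency(d) / max(len(tokens), 1)\n",
"\n",
"term_frequency_normalized(X[0])"
]
},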
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"tf = pd.DataFrame(columns=words)\n",
"for i in range(len(X)):\n",
" tf.loc[i] = term_frequency(X[i])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" poor | \n",
" product | \n",
" loving | \n",
" brilliant | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" poor product loving brilliant\n",
"0 1 2 0 0\n",
"1 0 1 1 0\n",
"2 0 0 0 1"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Count Vectorizer"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0, 0, 1, 2],\n",
" [0, 1, 0, 1],\n",
" [1, 0, 0, 0]], dtype=int64)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"vectorizer = CountVectorizer()\n",
"cX = vectorizer.fit_transform(X).toarray()\n",
"cX"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'brilliant': 0, 'loving': 1, 'poor': 2, 'product': 3}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer.vocabulary_"
]
},
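{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small convenience sketch: labeling the count matrix columns with the learned vocabulary, ordered by column index, makes the array above easier to read."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cols = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)\n",
"pd.DataFrame(cX, columns=cols)"
]
},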
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### inverse document frequency\n",
"$$idf(w) = 1 + log(\\frac{ N+1 }{ N_w + 1})$$\n",
"\n",
" - N: Number of documents in the corpus\n",
" - N_w : Number of documents containing word w\n",
" \n",
" \n",
"### Example\n",
"\n",
"\n",
"think of a corpus, with 1000 documents\n",
" - word __cat__ appears 5 of them\n",
" - worf __the__ appears 500 of then\n",
" \n",
"Calculate\n",
"$\\frac{N}{N_w}$"
]
},
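{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# idf for the example corpus above: 1000 documents,\n",
"# with 'cat' in 5 of them and 'the' in 500 of them\n",
"N = 1000\n",
"for word, N_w in [('cat', 5), ('the', 500)]:\n",
"    print(word, 'N/N_w =', N / N_w, 'idf =', 1 + np.log((N + 1) / (N_w + 1)))\n",
"# the rare word gets a much larger idf than the frequent one"
]
},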
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"N = len(X)\n",
"N"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'brilliant', 'loving', 'poor', 'product'}"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"words"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def inverse_document_freq(w, X):\n",
" N = len(X)\n",
" N_d = 0\n",
" for d in X: \n",
" if w in d.split():\n",
" N_d += 1\n",
" return 1 + np.log((N+1)/(N_d+1))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('poor', 1.6931471805599454),\n",
" ('product', 1.2876820724517808),\n",
" ('loving', 1.6931471805599454),\n",
" ('brilliant', 1.6931471805599454)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"idfs = [(w,inverse_document_freq(w, X)) for w in words]\n",
"idfs"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"for c, idf in idfs:\n",
" tf[c] *= idf"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" poor | \n",
" product | \n",
" loving | \n",
" brilliant | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1.69315 | \n",
" 2.57536 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" 1.28768 | \n",
" 1.69315 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1.69315 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" poor product loving brilliant\n",
"0 1.69315 2.57536 0 0\n",
"1 0 1.28768 1.69315 0\n",
"2 0 0 0 1.69315"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tf"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.54935123, 0.83559154, 0. , 0. ],\n",
" [0. , 0.60534851, 0.79596054, 0. ],\n",
" [0. , 0. , 0. , 1. ]])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import normalize\n",
"normalize(tf)"
]
},
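{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check: `normalize` uses the L2 norm by default, so every row of the result should have unit length."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np.linalg.norm(normalize(tf), axis=1)"
]
},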
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SKLEARN TF-IDF Model"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0. , 0. , 1.69314718, 2.57536414],\n",
" [0. , 1.69314718, 0. , 1.28768207],\n",
" [1.69314718, 0. , 0. , 0. ]])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Creating the Tf-Idf model directly\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"vectorizer = TfidfVectorizer(norm = None, smooth_idf=True)\n",
"tX = vectorizer.fit_transform(X).toarray()\n",
"tX"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'brilliant': 0, 'loving': 1, 'poor': 2, 'product': 3}"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer.vocabulary_"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0. , 0. , 0.54935123, 0.83559154],\n",
" [0. , 0.79596054, 0. , 0.60534851],\n",
" [1. , 0. , 0. , 0. ]])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Creating the Tf-Idf model directly\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"vectorizer = TfidfVectorizer()\n",
"tX = vectorizer.fit_transform(X).toarray()\n",
"tX"
]
},
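{
"cell_type": "markdown",
"metadata": {},
"source": [
"A closing sanity check (a sketch, assuming the manual `tf` table from earlier is still in scope): reordering its columns into sklearn's alphabetical vocabulary order and L2-normalizing should reproduce the default `TfidfVectorizer` output."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cols = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)\n",
"np.allclose(normalize(tf[cols]), tX)"
]
}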
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}