{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Raw Data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "X_raw = ['That priduct is poor product!!...', 'I loving this product.', 'That is brilliant!!@#']\n", "y = ['negative', 'positive', 'positive']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Clean Data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] /Users/uzaycetin/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] } ], "source": [ "# Importing the libraries\n", "import numpy as np\n", "import pandas as pd\n", "import re\n", "import pickle \n", "import nltk\n", "from nltk.corpus import stopwords\n", "from sklearn.datasets import load_files\n", "nltk.download('stopwords')\n", "\n", "from nltk.stem import WordNetLemmatizer\n", "lema = WordNetLemmatizer()\n", "\n", "# stopwordsleri sil\n", "from nltk.corpus import stopwords\n", "stop = stopwords.words('english')\n", "\n", "from textblob import TextBlob\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def preprocessing(text):\n", " text = text.lower()\n", " # get rid of non-alphanumerical characters\n", " text = re.sub(r'\\W', ' ', text) \n", " # get rid of spaces\n", " text = re.sub(r'\\s+', ' ', text) \n", " # Correct mistakes \n", " # and do the stemming\n", " return \" \".join([lema.lemmatize(str(TextBlob(word).correct())) \n", " for word in text.split() if word not in stop])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['product poor product', 'loving product', 'brilliant']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = [preprocessing(x) for x in X_raw]\n", "X" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Term Frequency\n", "\n", "$$\n", "tf(w,d) = \\frac{count_d (w)}{|d|}\n", "$$" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'brilliant', 'loving', 'poor', 'product'}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words = {w for d in X for w in d.split()}\n", "words" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def term_frequency(d):\n", " tf = {w:0 for w in words}\n", " for w in d.split():\n", " if w in tf:\n", " tf[w] += 1\n", " return pd.Series(tf)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "tf = pd.DataFrame(columns=words)\n", "for i in range(len(X)):\n", " tf.loc[i] = term_frequency(X[i])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>poor</th>\n", " <th>product</th>\n", " <th>loving</th>\n", " <th>brilliant</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>0</td>\n", " 
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Count Vectorizer" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 1, 2],\n", "       [0, 1, 0, 1],\n", "       [1, 0, 0, 0]], dtype=int64)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The same raw counts, computed with scikit-learn\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "vectorizer = CountVectorizer()\n", "cX = vectorizer.fit_transform(X).toarray()\n", "cX" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'brilliant': 0, 'loving': 1, 'poor': 2, 'product': 3}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# column index of each word in the arrays above\n", "vectorizer.vocabulary_" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Inverse Document Frequency\n", "\n", "$$idf(w) = 1 + \\log\\left(\\frac{N+1}{N_w+1}\\right)$$\n", "\n", "- $N$: number of documents in the corpus\n", "- $N_w$: number of documents containing the word $w$\n", "\n", "This is the smoothed idf used by scikit-learn: rare words get a large weight, while words that occur in every document get a weight close to 1.\n", "\n", "### Example\n", "\n", "Think of a corpus with 1000 documents:\n", "- the word __cat__ appears in 5 of them\n", "- the word __the__ appears in 500 of them\n", "\n", "Calculate $\\frac{N}{N_w}$ for each word (worked out in the next cell)." ] },
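{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick worked version of the exercise (added for illustration; `N_docs` is just the hypothetical corpus size from the example above):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Worked example: N / N_w and the smoothed idf for the two words above.\n", "# 'cat' is rare, so its ratio (1000/5 = 200) and idf are large;\n", "# 'the' is common, so its ratio (1000/500 = 2) and idf are small.\n", "N_docs = 1000\n", "for word, N_w in [('cat', 5), ('the', 500)]:\n", "    print(word, N_docs / N_w, 1 + np.log((N_docs + 1) / (N_w + 1)))" ] },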
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "N = len(X)\n", "N" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'brilliant', 'loving', 'poor', 'product'}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def inverse_document_freq(w, X):\n", "    # smoothed idf, matching the formula above\n", "    N = len(X)\n", "    N_w = 0\n", "    for d in X:\n", "        if w in d.split():\n", "            N_w += 1\n", "    return 1 + np.log((N + 1) / (N_w + 1))" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('poor', 1.6931471805599454),\n", " ('product', 1.2876820724517808),\n", " ('loving', 1.6931471805599454),\n", " ('brilliant', 1.6931471805599454)]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "idfs = [(w, inverse_document_freq(w, X)) for w in words]\n", "idfs" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# weight each term-frequency column by its idf to get tf-idf\n", "for c, idf in idfs:\n", "    tf[c] *= idf" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", "    .dataframe tbody tr th:only-of-type {\n", "        vertical-align: middle;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: right;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>poor</th>\n", "      <th>product</th>\n", "      <th>loving</th>\n", "      <th>brilliant</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>1.69315</td>\n", "      <td>2.57536</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>0</td>\n", "      <td>1.28768</td>\n", "      <td>1.69315</td>\n", "      <td>0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>1.69315</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ "      poor  product   loving  brilliant\n", "0  1.69315  2.57536        0          0\n", "1        0  1.28768  1.69315          0\n", "2        0        0        0    1.69315" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tf" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0.54935123, 0.83559154, 0.        , 0.        ],\n", "       [0.        , 0.60534851, 0.79596054, 0.        ],\n", "       [0.        , 0.        , 0.        , 1.        ]])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# L2-normalize each row, as TfidfVectorizer does by default\n", "from sklearn.preprocessing import normalize\n", "normalize(tf)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## SKLEARN TF-IDF Model" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0.        , 0.        , 1.69314718, 2.57536414],\n", "       [0.        , 1.69314718, 0.        , 1.28768207],\n", "       [1.69314718, 0.        , 0.        , 0.        ]])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Creating the tf-idf model directly; with norm=None it matches\n", "# our manual tf * idf table above (up to column order)\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "vectorizer = TfidfVectorizer(norm=None, smooth_idf=True)\n", "tX = vectorizer.fit_transform(X).toarray()\n", "tX" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'brilliant': 0, 'loving': 1, 'poor': 2, 'product': 3}" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vectorizer.vocabulary_" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0.        , 0.        , 0.54935123, 0.83559154],\n", "       [0.        , 0.79596054, 0.        , 0.60534851],\n", "       [1.        , 0.        , 0.        , 0.        ]])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# With default settings, TfidfVectorizer also applies L2 normalization,\n", "# so this matches normalize(tf) above (up to column order)\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "vectorizer = TfidfVectorizer()\n", "tX = vectorizer.fit_transform(X).toarray()\n", "tX" ] },
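{ "cell_type": "markdown", "metadata": {}, "source": [ "As a final sanity check (a sketch added for illustration, not part of the original run), we can reorder the manual table's columns to match `vectorizer.vocabulary_` and compare it with the sklearn output; the fitted vectorizer can also transform unseen documents." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Align the manual tf-idf columns with sklearn's vocabulary order and compare.\n", "order = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)\n", "manual = normalize(tf)[:, [list(tf.columns).index(w) for w in order]]\n", "print(np.allclose(manual, tX))\n", "\n", "# Transforming a new document (hypothetical input): words outside the\n", "# training vocabulary are simply ignored.\n", "print(vectorizer.transform(['a brilliant product']).toarray())" ] }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }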