{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Raw Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_raw = ['That priduct is poor product!!...', 'I loving this product.', 'That is     brilliant!!@#']\n",
    "y = ['negative', 'positive', 'positive']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Clean Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/uzaycetin/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "# Importing the libraries\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "import pickle \n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from sklearn.datasets import load_files\n",
    "nltk.download('stopwords')\n",
    "\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "lema = WordNetLemmatizer()\n",
    "\n",
    "# stopwordsleri sil\n",
    "from nltk.corpus import stopwords\n",
    "stop = stopwords.words('english')\n",
    "\n",
    "from textblob import TextBlob\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocessing(text):\n",
    "    text = text.lower()\n",
    "    # get rid of non-alphanumerical characters\n",
    "    text = re.sub(r'\\W', ' ', text) \n",
    "    # get rid of spaces\n",
    "    text = re.sub(r'\\s+', ' ', text) \n",
    "    # Correct mistakes \n",
    "    # and do the stemming\n",
    "    return \" \".join([lema.lemmatize(str(TextBlob(word).correct())) \n",
    "                     for word in text.split() if word not in stop])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['product poor product', 'loving product', 'brilliant']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = [preprocessing(x) for x in X_raw]\n",
    "X"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Term Frequency\n",
    "\n",
    "$$\n",
    "tf(w,d) = \\frac{count_d (w)}{|d|}\n",
    "$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'brilliant', 'loving', 'poor', 'product'}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words = {w for d in X for w in d.split()}\n",
    "words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def term_frequency(d):\n",
    "    tf = {w:0 for w in words}\n",
    "    for w in d.split():\n",
    "        if w in tf:\n",
    "            tf[w] += 1\n",
    "    return pd.Series(tf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf = pd.DataFrame(columns=words)\n",
    "for i in range(len(X)):\n",
    "    tf.loc[i] = term_frequency(X[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>poor</th>\n",
       "      <th>product</th>\n",
       "      <th>loving</th>\n",
       "      <th>brilliant</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  poor product loving brilliant\n",
       "0    1       2      0         0\n",
       "1    0       1      1         0\n",
       "2    0       0      0         1"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Count Vectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 1, 2],\n",
       "       [0, 1, 0, 1],\n",
       "       [1, 0, 0, 0]], dtype=int64)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "vectorizer = CountVectorizer()\n",
    "cX = vectorizer.fit_transform(X).toarray()\n",
    "cX"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'brilliant': 0, 'loving': 1, 'poor': 2, 'product': 3}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectorizer.vocabulary_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### inverse document frequency\n",
    "$$idf(w) = 1 + log(\\frac{ N+1 }{ N_w + 1})$$\n",
    "\n",
    " - N: Number of documents in the corpus\n",
    " - N_w : Number of documents containing word w\n",
    " \n",
    " \n",
    "### Example\n",
    "\n",
    "\n",
    "think of a corpus, with 1000 documents\n",
    " - word __cat__ appears 5 of them\n",
    " - worf __the__ appears 500 of then\n",
    " \n",
    "Calculate\n",
    "$\\frac{N}{N_w}$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "N = len(X)\n",
    "N"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'brilliant', 'loving', 'poor', 'product'}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def inverse_document_freq(w, X):\n",
    "    N = len(X)\n",
    "    N_d = 0\n",
    "    for d in X: \n",
    "        if w in d.split():\n",
    "            N_d += 1\n",
    "    return 1 + np.log((N+1)/(N_d+1))\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('poor', 1.6931471805599454),\n",
       " ('product', 1.2876820724517808),\n",
       " ('loving', 1.6931471805599454),\n",
       " ('brilliant', 1.6931471805599454)]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idfs = [(w,inverse_document_freq(w, X)) for w in words]\n",
    "idfs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "for c, idf in idfs:\n",
    "    tf[c] *= idf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>poor</th>\n",
       "      <th>product</th>\n",
       "      <th>loving</th>\n",
       "      <th>brilliant</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.69315</td>\n",
       "      <td>2.57536</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>1.28768</td>\n",
       "      <td>1.69315</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.69315</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      poor  product   loving brilliant\n",
       "0  1.69315  2.57536        0         0\n",
       "1        0  1.28768  1.69315         0\n",
       "2        0        0        0   1.69315"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.54935123, 0.83559154, 0.        , 0.        ],\n",
       "       [0.        , 0.60534851, 0.79596054, 0.        ],\n",
       "       [0.        , 0.        , 0.        , 1.        ]])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import normalize\n",
    "normalize(tf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SKLEARN TF-IDF Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.        , 0.        , 1.69314718, 2.57536414],\n",
       "       [0.        , 1.69314718, 0.        , 1.28768207],\n",
       "       [1.69314718, 0.        , 0.        , 0.        ]])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Creating the Tf-Idf model directly\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "vectorizer = TfidfVectorizer(norm = None, smooth_idf=True)\n",
    "tX = vectorizer.fit_transform(X).toarray()\n",
    "tX"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'brilliant': 0, 'loving': 1, 'poor': 2, 'product': 3}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectorizer.vocabulary_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.        , 0.        , 0.54935123, 0.83559154],\n",
       "       [0.        , 0.79596054, 0.        , 0.60534851],\n",
       "       [1.        , 0.        , 0.        , 0.        ]])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Creating the Tf-Idf model directly\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "vectorizer = TfidfVectorizer()\n",
    "tX = vectorizer.fit_transform(X).toarray()\n",
    "tX"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}