{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<small><small><i>\n",
    "All the IPython Notebooks in **[Python Natural Language Processing](https://github.com/milaan9/Python_Python_Natural_Language_Processing)** lecture series by **[Dr. Milaan Parmar](https://www.linkedin.com/in/milaanparmar/)** are available @ **[GitHub](https://github.com/milaan9)**\n",
    "</i></small></small>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/milaan9/Python_Python_Natural_Language_Processing/blob/main/08_Stemming.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 08 Stemming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 340
    },
    "id": "uROUhD6qDkcs",
    "outputId": "482bc406-753b-44bf-8371-63515e303b18"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I\n",
      "'m\n",
      "always\n",
      "here\n",
      "to\n",
      "help\n",
      "you\n",
      "all\n",
      "!\n",
      "Email:milaanparmar9@gmail.com\n",
      "or\n",
      "visit\n",
      "more\n",
      "at\n",
      "https://github.com/milaan9\n",
      "!\n"
     ]
    }
   ],
   "source": [
    "import spacy\n",
    "nlp = spacy.load('en_core_web_sm')\n",
    "doc2 = nlp(u\"I'm always here to help you all! Email:milaanparmar9@gmail.com or visit more at https://github.com/milaan9!\")\n",
    "for t in doc2:\n",
    "    print(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 170
    },
    "id": "T9SuBFxJD60t",
    "outputId": "ba8e3005-92c2-45dd-8b35-2cbf45df14c0"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A\n",
      "5\n",
      "km\n",
      "NYC\n",
      "cab\n",
      "ride\n",
      "costs\n",
      "$\n",
      "10.30\n"
     ]
    }
   ],
   "source": [
    "doc3 = nlp(u'A 5km NYC cab ride costs $10.30')\n",
    "for t in doc3:\n",
    "    print(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 204
    },
    "id": "W954E2p7EA06",
    "outputId": "7c7a5417-f259-49ab-d813-1022fc34b74a"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Let\n",
      "'s\n",
      "visit\n",
      "St.\n",
      "Louis\n",
      "in\n",
      "the\n",
      "U.S.\n",
      "next\n",
      "year\n",
      ".\n"
     ]
    }
   ],
   "source": [
    "doc4 = nlp(u\"Let's visit St. Louis in the U.S. next year.\")\n",
    "for t in doc4:\n",
    "    print(t)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "MnVieHC_EBd6"
   },
   "source": [
    " ### ----   Porter Stemmer   ------"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "id": "32eUb4IHIxg-"
   },
   "outputs": [],
   "source": [
    "# Import the toolkit and the full Porter Stemmer library\n",
    "import nltk\n",
    "from nltk.stem.porter import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "id": "Mqz3RhM5Ix9p"
   },
   "outputs": [],
   "source": [
    "p_stemmer = PorterStemmer()\n",
    "words = ['run','runner','running','ran','runs','easily','fairly']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 136
    },
    "id": "3BtJibwLI1Ca",
    "outputId": "6a64b851-9353-4a52-cb12-a1f466f66bd7"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "run --> run\n",
      "runner --> runner\n",
      "running --> run\n",
      "ran --> ran\n",
      "runs --> run\n",
      "easily --> easili\n",
      "fairly --> fairli\n"
     ]
    }
   ],
   "source": [
    "for word in words:\n",
    "    print(word+' --> '+p_stemmer.stem(word))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "id": "1YgYkshwI2jo"
   },
   "outputs": [],
   "source": [
    "#SnowballStemmer\n",
    "from nltk.stem.snowball import SnowballStemmer\n",
    "# The Snowball Stemmer requires that you pass a language parameter\n",
    "s_stemmer = SnowballStemmer(language='english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "id": "kDERoxYEI9R4"
   },
   "outputs": [],
   "source": [
    "words = ['run','runner','running','ran','runs','easily','fairly']\n",
    "# words = ['generous','generation','generously','generate']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 136
    },
    "id": "4M0epwwkI-yo",
    "outputId": "755b30e6-6940-42a7-aea4-3ddb52f85321"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "run --> run\n",
      "runner --> runner\n",
      "running --> run\n",
      "ran --> ran\n",
      "runs --> run\n",
      "easily --> easili\n",
      "fairly --> fair\n"
     ]
    }
   ],
   "source": [
    "for word in words:\n",
    "    print(word+' --> '+s_stemmer.stem(word))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "id": "HAhZxPlwJALZ"
   },
   "outputs": [],
   "source": [
    "# ----Do Some more practice -----"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "id": "pjnfD_FOJSx5"
   },
   "outputs": [],
   "source": [
    "words = ['consolingly']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 51
    },
    "id": "WznSJQT6JUa6",
    "outputId": "a897f507-b64e-4a1c-de9e-886e90cc950a"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Porter Stemmer:\n",
      "consolingly --> consolingli\n"
     ]
    }
   ],
   "source": [
    "print('Porter Stemmer:')\n",
    "for word in words:\n",
    "    print(word+' --> '+p_stemmer.stem(word))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 51
    },
    "id": "kRWOhSIkJVvZ",
    "outputId": "c6ba8350-92b6-409f-ba39-eea293d607a3"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Porter2 Stemmer:\n",
      "consolingly --> consol\n"
     ]
    }
   ],
   "source": [
    "print('Porter2 Stemmer:')\n",
    "for word in words:\n",
    "    print(word+' --> '+s_stemmer.stem(word))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 153
    },
    "id": "CHjOtYJcJXLp",
    "outputId": "cc9a6cdc-1062-47f3-c523-b12b9a3815ba"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I --> i\n",
      "am --> am\n",
      "meeting --> meet\n",
      "him --> him\n",
      "tomorrow --> tomorrow\n",
      "at --> at\n",
      "the --> the\n",
      "meeting --> meet\n"
     ]
    }
   ],
   "source": [
    "# Stemming has its drawbacks. If given the token saw, stemming might always return saw, whereas lemmatization would likely return either\n",
    "# see or saw depending on whether the use of the token was as a verb or a noun. As an example, consider the following:\n",
    "\n",
    "phrase = 'I am meeting him tomorrow at the meeting'\n",
    "for word in phrase.split():\n",
    "    print(word+' --> '+p_stemmer.stem(word))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "id": "euP5mgGRLzMP"
   },
   "outputs": [],
   "source": [
    "# Perform standard imports:\n",
    "import spacy\n",
    "nlp = spacy.load('en_core_web_sm')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 272
    },
    "id": "u38F6ATiL3Np",
    "outputId": "da5dd4a5-1cd0-4f7e-ba22-cab74b306b86"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "John \t PROPN \t 11174346320140919546 \t John\n",
      "Adam \t PROPN \t 14264057329400597350 \t Adam\n",
      "is \t AUX \t 10382539506755952630 \t be\n",
      "one \t NUM \t 17454115351911680600 \t one\n",
      "the \t DET \t 7425985699627899538 \t the\n",
      "researcher \t NOUN \t 1317581537614213870 \t researcher\n",
      "who \t PRON \t 3876862883474502309 \t who\n",
      "invent \t VERB \t 5373681334090504585 \t invent\n",
      "the \t DET \t 7425985699627899538 \t the\n",
      "direction \t NOUN \t 895834437038626927 \t direction\n",
      "of \t ADP \t 886050111519832510 \t of\n",
      "way \t NOUN \t 6878210874361030284 \t way\n",
      "towards \t ADP \t 9315050841437086371 \t towards\n",
      "success \t NOUN \t 16089821935113899987 \t success\n",
      "! \t PUNCT \t 17494803046312582752 \t !\n"
     ]
    }
   ],
   "source": [
    "var1 = nlp(u\"John Adam is one the researcher who invent the direction of way towards success!\")\n",
    "\n",
    "for token in var1:\n",
    "    print(token.text, '\\t', token.pos_, '\\t', token.lemma, '\\t', token.lemma_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "id": "15GTnATmL4h3"
   },
   "outputs": [],
   "source": [
    "def show_lemmas(text):\n",
    "    for token in text:\n",
    "        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 272
    },
    "id": "kcR_PwbVL61X",
    "outputId": "44dcddcc-391f-477a-cd7b-ba0042e61a60"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "John         PROPN  11174346320140919546   John\n",
      "Adam         PROPN  14264057329400597350   Adam\n",
      "is           AUX    10382539506755952630   be\n",
      "one          NUM    17454115351911680600   one\n",
      "the          DET    7425985699627899538    the\n",
      "researcher   NOUN   1317581537614213870    researcher\n",
      "who          PRON   3876862883474502309    who\n",
      "invent       VERB   5373681334090504585    invent\n",
      "the          DET    7425985699627899538    the\n",
      "direction    NOUN   895834437038626927     direction\n",
      "of           ADP    886050111519832510     of\n",
      "way          NOUN   6878210874361030284    way\n",
      "towards      ADP    9315050841437086371    towards\n",
      "success      NOUN   16089821935113899987   success\n",
      "!            PUNCT  17494803046312582752   !\n"
     ]
    }
   ],
   "source": [
    "var2 = nlp(u\"John Adam is one the researcher who invent the direction of way towards success!\")\n",
    "show_lemmas(var2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 170
    },
    "id": "u9HofN4pL8S3",
    "outputId": "fe8b7e71-7824-4d88-9d72-d0a4a9251d12"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I            PRON   4690420944186131903    I\n",
      "am           AUX    10382539506755952630   be\n",
      "meeting      VERB   6880656908171229526    meet\n",
      "him          PRON   1655312771067108281    he\n",
      "tomorrow     NOUN   3573583789758258062    tomorrow\n",
      "at           ADP    11667289587015813222   at\n",
      "the          DET    7425985699627899538    the\n",
      "meeting      NOUN   14798207169164081740   meeting\n",
      ".            PUNCT  12646065887601541794   .\n"
     ]
    }
   ],
   "source": [
    "var3 = nlp(u\"I am meeting him tomorrow at the meeting.\")\n",
    "show_lemmas(var3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 170
    },
    "id": "62Rj1pk3L-MH",
    "outputId": "8a271b77-0a39-4c8b-9e27-e62351dc3055"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "That         PRON   4380130941430378203    that\n",
      "'s           AUX    10382539506755952630   be\n",
      "of           ADP    886050111519832510     of\n",
      "the          DET    7425985699627899538    the\n",
      "greate       ADJ    4429768169814447593    greate\n",
      "person       NOUN   14800503047316267216   person\n",
      "in           ADP    3002984154512732771    in\n",
      "the          DET    7425985699627899538    the\n",
      "world        NOUN   1703489418272052182    world\n"
     ]
    }
   ],
   "source": [
    "var4 = nlp(u\"That's of the greate person in the world\")\n",
    "show_lemmas(var4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "UGGTGXLfMtcu"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "8_Stemming.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}