{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "<small><small><i>\n", "All the IPython Notebooks in **[Python Natural Language Processing](https://github.com/milaan9/Python_Python_Natural_Language_Processing)** lecture series by **[Dr. Milaan Parmar](https://www.linkedin.com/in/milaanparmar/)** are available @ **[GitHub](https://github.com/milaan9)**\n", "</i></small></small>" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "view-in-github" }, "source": [ "<a href=\"https://colab.research.google.com/github/milaan9/Python_Python_Natural_Language_Processing/blob/main/08_Stemming.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 08 Stemming" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 340 }, "id": "uROUhD6qDkcs", "outputId": "482bc406-753b-44bf-8371-63515e303b18" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "I\n", "'m\n", "always\n", "here\n", "to\n", "help\n", "you\n", "all\n", "!\n", "Email:milaanparmar9@gmail.com\n", "or\n", "visit\n", "more\n", "at\n", "https://github.com/milaan9\n", "!\n" ] } ], "source": [ "# Load spaCy's small English pipeline, then tokenize a sample sentence\n", "# and print one token per line.\n", "import spacy\n", "\n", "nlp = spacy.load('en_core_web_sm')\n", "doc2 = nlp(\"I'm always here to help you all! 
Email:milaanparmar9@gmail.com or visit more at https://github.com/milaan9!\")\n", "for token in doc2:\n", "    print(token)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 170 }, "id": "T9SuBFxJD60t", "outputId": "ba8e3005-92c2-45dd-8b35-2cbf45df14c0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A\n", "5\n", "km\n", "NYC\n", "cab\n", "ride\n", "costs\n", "$\n", "10.30\n" ] } ], "source": [ "# The unit and the currency symbol come out as separate tokens (see the output).\n", "doc3 = nlp('A 5km NYC cab ride costs $10.30')\n", "for token in doc3:\n", "    print(token)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "W954E2p7EA06", "outputId": "7c7a5417-f259-49ab-d813-1022fc34b74a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Let\n", "'s\n", "visit\n", "St.\n", "Louis\n", "in\n", "the\n", "U.S.\n", "next\n", "year\n", ".\n" ] } ], "source": [ "# Abbreviations such as \"St.\" and \"U.S.\" keep their periods (see the output).\n", "doc4 = nlp(\"Let's visit St. Louis in the U.S. 
next year.\")\n", "for token in doc4:\n", "    print(token)" ] }, { "cell_type": "markdown", "metadata": { "id": "MnVieHC_EBd6" }, "source": [ " ### ---- Porter Stemmer ------" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "32eUb4IHIxg-" }, "outputs": [], "source": [ "# Import the toolkit and the Porter stemmer class.\n", "# An explicit import (instead of `import *`) keeps the notebook namespace clean\n", "# and makes it obvious where `PorterStemmer` comes from.\n", "import nltk\n", "from nltk.stem.porter import PorterStemmer" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "Mqz3RhM5Ix9p" }, "outputs": [], "source": [ "p_stemmer = PorterStemmer()\n", "words = ['run','runner','running','ran','runs','easily','fairly']" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "id": "3BtJibwLI1Ca", "outputId": "6a64b851-9353-4a52-cb12-a1f466f66bd7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "run --> run\n", "runner --> runner\n", "running --> run\n", "ran --> ran\n", "runs --> run\n", "easily --> easili\n", "fairly --> fairli\n" ] } ], "source": [ "for word in words:\n", "    print(word+' --> '+p_stemmer.stem(word))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "1YgYkshwI2jo" }, "outputs": [], "source": [ "# Snowball stemmer\n", "from nltk.stem.snowball import SnowballStemmer\n", "# The Snowball Stemmer requires that you pass a language parameter\n", "s_stemmer = SnowballStemmer(language='english')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "kDERoxYEI9R4" }, "outputs": [], "source": [ "words = ['run','runner','running','ran','runs','easily','fairly']\n", "# words = ['generous','generation','generously','generate']" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "id": "4M0epwwkI-yo", "outputId": "755b30e6-6940-42a7-aea4-3ddb52f85321" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "run --> run\n", "runner --> runner\n", "running --> 
run\n", "ran --> ran\n", "runs --> run\n", "easily --> easili\n", "fairly --> fair\n" ] } ], "source": [ "for word in words:\n", "    print(f'{word} --> {s_stemmer.stem(word)}')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "id": "HAhZxPlwJALZ" }, "outputs": [], "source": [ "# ---- More practice: run both stemmers on the same word ----" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "pjnfD_FOJSx5" }, "outputs": [], "source": [ "words = ['consolingly']" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 51 }, "id": "WznSJQT6JUa6", "outputId": "a897f507-b64e-4a1c-de9e-886e90cc950a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Porter Stemmer:\n", "consolingly --> consolingli\n" ] } ], "source": [ "print('Porter Stemmer:')\n", "for word in words:\n", "    print(f'{word} --> {p_stemmer.stem(word)}')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 51 }, "id": "kRWOhSIkJVvZ", "outputId": "c6ba8350-92b6-409f-ba39-eea293d607a3" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Porter2 Stemmer:\n", "consolingly --> consol\n" ] } ], "source": [ "print('Porter2 Stemmer:')\n", "for word in words:\n", "    print(f'{word} --> {s_stemmer.stem(word)}')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 153 }, "id": "CHjOtYJcJXLp", "outputId": "cc9a6cdc-1062-47f3-c523-b12b9a3815ba" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "I --> i\n", "am --> am\n", "meeting --> meet\n", "him --> him\n", "tomorrow --> tomorrow\n", "at --> at\n", "the --> the\n", "meeting --> meet\n" ] } ], "source": [ "# Stemming has its drawbacks. 
If given the token saw, stemming might always return saw, whereas lemmatization would likely return either\n", "# see or saw depending on whether the use of the token was as a verb or a noun. As an example, consider the following:\n", "\n", "phrase = 'I am meeting him tomorrow at the meeting'\n", "for word in phrase.split():\n", "    print(f'{word} --> {p_stemmer.stem(word)}')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "id": "euP5mgGRLzMP" }, "outputs": [], "source": [ "# Same import and model load as the first cell, repeated here ahead of the\n", "# lemmatization examples.\n", "import spacy\n", "\n", "nlp = spacy.load('en_core_web_sm')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 272 }, "id": "u38F6ATiL3Np", "outputId": "da5dd4a5-1cd0-4f7e-ba22-cab74b306b86" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "John \t PROPN \t 11174346320140919546 \t John\n", "Adam \t PROPN \t 14264057329400597350 \t Adam\n", "is \t AUX \t 10382539506755952630 \t be\n", "one \t NUM \t 17454115351911680600 \t one\n", "the \t DET \t 7425985699627899538 \t the\n", "researcher \t NOUN \t 1317581537614213870 \t researcher\n", "who \t PRON \t 3876862883474502309 \t who\n", "invent \t VERB \t 5373681334090504585 \t invent\n", "the \t DET \t 7425985699627899538 \t the\n", "direction \t NOUN \t 895834437038626927 \t direction\n", "of \t ADP \t 886050111519832510 \t of\n", "way \t NOUN \t 6878210874361030284 \t way\n", "towards \t ADP \t 9315050841437086371 \t towards\n", "success \t NOUN \t 16089821935113899987 \t success\n", "! 
\t PUNCT \t 17494803046312582752 \t !\n" ] } ], "source": [ "var1 = nlp(\"John Adam is one the researcher who invent the direction of way towards success!\")\n", "\n", "# Columns: token text, part-of-speech tag, integer lemma ID, lemma string.\n", "for token in var1:\n", "    print(token.text, token.pos_, token.lemma, token.lemma_, sep=' \\t ')" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "id": "15GTnATmL4h3" }, "outputs": [], "source": [ "def show_lemmas(text):\n", "    \"\"\"Print one fixed-width row per token: text, POS tag, lemma ID and lemma string.\n", "\n", "    `text` is iterated directly; each item must expose `text`, `pos_`, `lemma`\n", "    and `lemma_`. The cells below pass the result of `nlp(...)`.\n", "    \"\"\"\n", "    for token in text:\n", "        row = f'{token.text:12} {token.pos_:6} {token.lemma:<22} {token.lemma_}'\n", "        print(row)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 272 }, "id": "kcR_PwbVL61X", "outputId": "44dcddcc-391f-477a-cd7b-ba0042e61a60" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "John PROPN 11174346320140919546 John\n", "Adam PROPN 14264057329400597350 Adam\n", "is AUX 10382539506755952630 be\n", "one NUM 17454115351911680600 one\n", "the DET 7425985699627899538 the\n", "researcher NOUN 1317581537614213870 researcher\n", "who PRON 3876862883474502309 who\n", "invent VERB 5373681334090504585 invent\n", "the DET 7425985699627899538 the\n", "direction NOUN 895834437038626927 direction\n", "of ADP 886050111519832510 of\n", "way NOUN 6878210874361030284 way\n", "towards ADP 9315050841437086371 towards\n", "success NOUN 16089821935113899987 success\n", "! 
PUNCT 17494803046312582752 !\n" ] } ], "source": [ "# Same sentence as the previous example, now printed with show_lemmas.\n", "var2 = nlp(\"John Adam is one the researcher who invent the direction of way towards success!\")\n", "show_lemmas(var2)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 170 }, "id": "u9HofN4pL8S3", "outputId": "fe8b7e71-7824-4d88-9d72-d0a4a9251d12" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "I PRON 4690420944186131903 I\n", "am AUX 10382539506755952630 be\n", "meeting VERB 6880656908171229526 meet\n", "him PRON 1655312771067108281 he\n", "tomorrow NOUN 3573583789758258062 tomorrow\n", "at ADP 11667289587015813222 at\n", "the DET 7425985699627899538 the\n", "meeting NOUN 14798207169164081740 meeting\n", ". PUNCT 12646065887601541794 .\n" ] } ], "source": [ "# The two occurrences of \"meeting\" get different lemmas (see the output).\n", "var3 = nlp(\"I am meeting him tomorrow at the meeting.\")\n", "show_lemmas(var3)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 170 }, "id": "62Rj1pk3L-MH", "outputId": "8a271b77-0a39-4c8b-9e27-e62351dc3055" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "That PRON 4380130941430378203 that\n", "'s AUX 10382539506755952630 be\n", "of ADP 886050111519832510 of\n", "the DET 7425985699627899538 the\n", "greate ADJ 4429768169814447593 greate\n", "person NOUN 14800503047316267216 person\n", "in ADP 3002984154512732771 in\n", "the DET 7425985699627899538 the\n", "world NOUN 1703489418272052182 world\n" ] } ], "source": [ "var4 = nlp(\"That's of the greate person in the world\")\n", "show_lemmas(var4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "UGGTGXLfMtcu" }, "outputs": [], "source": [] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "8_Stemming.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": 
"ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 4 }