{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "text_preprocessing.ipynb", "provenance": [], "collapsed_sections": [], "authorship_tag": "ABX9TyOszrHP2i3hI07U7AxUKjV8", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "<a href=\"https://colab.research.google.com/github/bipinKrishnan/fastai_course/blob/master/text_preprocessing.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" ] }, { "cell_type": "code", "metadata": { "id": "5KWvUH4OlEkG" }, "source": [ "from torchvision.datasets.utils import download_and_extract_archive\n", "from torch.utils.data import DataLoader, Dataset\n", "\n", "from pathlib import Path" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "havyIu1zl1Up" }, "source": [ "url = 'https://download.pytorch.org/tutorial/data.zip'\n", "download_and_extract_archive(url, '.')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "O4uNub6zmEIE" }, "source": [ "path = '/content/data/names/'" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "0JU7NG-t0utt" }, "source": [ "names, targets, vocabs" ] }, { "cell_type": "code", "metadata": { "id": "Ub92eULe6hFB" }, "source": [ "class Preprocess:\n", " def __init__(self, path):\n", " self.path = path\n", "\n", " def get_names_targets(self):\n", " self.corpus = []\n", " for files in Path(self.path).glob('*.txt'):\n", " with open(files) as f:\n", " for l in f.readlines():\n", " self.corpus.append((l.split('\\n')[0], files.stem))\n", "\n", " return self.corpus\n", "\n", " def get_targets(self):\n", " targets = []\n", " for files in Path(self.path).glob('*.txt'):\n", " targets.append(files.stem)\n", "\n", " return targets\n", "\n", " def get_vocab(self):\n", " vocab = set()\n", "\n", " for name, target in self.corpus:\n", " vocab.add(name)\n", "\n", " return list(vocab)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "k7aljtz65StJ" }, "source": [ "class LoadDataset(Dataset):\n", " def __init__(self, names_targets, names_vocab, target_vocab):\n", " self.name_target = names_targets\n", " self.name_vocab = names_vocab\n", " self.target_vocab = target_vocab\n", " \n", " def __getitem__(self, idx): \n", " item = self.name_target[idx]\n", " return self.name_vocab.index(item[0]), self.target_vocab.index(item[1])\n", "\n", " def __len__(self): return len(self.name_target)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "dGX_2cND8NVZ" }, "source": [ "pre = Preprocess(path)\n", "\n", "data = pre.get_names_targets()\n", "target_vocab = pre.get_targets()\n", "name_vocab = pre.get_vocab()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "zr5ewS27CaaH" }, "source": [ "ds = LoadDataset(data, name_vocab, target_vocab)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "MCMt1c9fDxCN", "outputId": "f4c94d0b-f0f1-4018-a5d3-830f2e1a90df", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "for x, y in ds:\n", " print(x, y)\n", " break" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "5823 0\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "fs7bkRwmD4kT" }, 
"source": [ "dls = DataLoader(ds, 16, shuffle=True)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "dtrOHyAGEqOY", "outputId": "5c5ea3a1-ba58-4eb3-bc73-272b0a27cf71", "colab": { "base_uri": "https://localhost:8080/", "height": 69 } }, "source": [ "for data, label in dls:\n", " print(data.shape, label.shape)\n", " print(data[0], label[0],'\\n', name_vocab[data[0].item()], target_vocab[label[0].item()])\n", " break" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "torch.Size([16]) torch.Size([16])\n", "tensor(1468) tensor(8) \n", " Jigailo Russian\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "b6qnh-FbZBk3" }, "source": [ "# Spacy library" ] }, { "cell_type": "code", "metadata": { "id": "3HdsCjw1ZR-V" }, "source": [ "text = \"In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning). A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer. A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.\"" ], "execution_count": 4, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "DeibLiDwITj7" }, "source": [ "import spacy" ], "execution_count": 1, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "71FJpl1sZFDl" }, "source": [ "spacy_nlp = spacy.load('en_core_web_sm')" ], "execution_count": 3, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "JQI5r6mDZLTS" }, "source": [ "doc = spacy_nlp(text)" ], "execution_count": 5, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "3zLL3VOJZaEn" }, "source": [ "[token.text for token in doc]" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "bbivD9WdZa5U", "outputId": "5ce91758-beba-49d0-e862-7b111603e8f1", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "text1 = \"ConcateStringAnd123 ConcateSepcialCharacter_!@# !@#$%^&*()_+ 0123456\"\n", "doc1 = spacy_nlp(text1)\n", "\n", "[token.text for token in doc1]" ], "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['ConcateStringAnd123',\n", " 'ConcateSepcialCharacter_!@',\n", " '#',\n", " '!',\n", " '@#$%^&*()_+',\n", " '0123456']" ] }, "metadata": { "tags": [] }, "execution_count": 14 } ] }, { "cell_type": "code", "metadata": { "id": "h4bg9fR6aRSp", "outputId": "29e6cf63-196c-47bf-ba66-68ad829268c1", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "text2 = \"Let’s go to N.Y.!\"\n", "doc2 = spacy_nlp(text2)\n", "\n", "[token.text for token in doc2]" ], "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['Let', '’s', 'go', 'to', 'N.Y.', '!']" ] }, "metadata": { "tags": [] }, "execution_count": 16 } ] }, { "cell_type": "markdown", "metadata": { "id": "Se1xwsfXa4tL" }, "source": [ "# NLTK" ] }, { "cell_type": "code", "metadata": { "id": "eHIS6tzJajuq" }, "source": [ "import nltk\n", "nltk.download('punkt')\n", "nltk.download('averaged_perceptron_tagger')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Mc0QxFPfa62S" }, "source": [ "nltk.word_tokenize(text)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", 
"metadata": { "id": "3j0po3zja_Ql" }, "source": [ "nltk.word_tokenize(text1)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "laVtqtNdbL_F", "outputId": "0d0f6b00-4071-4121-e074-2b2e4717a88d", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "nltk.sent_tokenize(text), nltk.sent_tokenize(text1), nltk.sent_tokenize(text2)" ], "execution_count": 27, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(['In computer science, lexical analysis, lexing or tokenization is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an assigned and thus identified meaning).',\n", " 'A program that performs lexical analysis may be termed a lexer, tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer.',\n", " 'A lexer is generally combined with a parser, which together analyze the syntax of programming languages, web pages, and so forth.'],\n", " ['ConcateStringAnd123 ConcateSepcialCharacter_!',\n", " '@# !',\n", " '@#$%^&*()_+ 0123456'],\n", " ['Let’s go to N.Y.!'])" ] }, "metadata": { "tags": [] }, "execution_count": 27 } ] }, { "cell_type": "code", "metadata": { "id": "bTwCWnj8bzYr" }, "source": [ "#parts of speech tagging\n", "tokens = nltk.word_tokenize(text)\n", "nltk.pos_tag(tokens)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ytAtcGYpeROB" }, "source": [ "# Stop word removal" ] }, { "cell_type": "markdown", "metadata": { "id": "jnncf97tlKV2" }, "source": [ "### NLTK" ] }, { "cell_type": "markdown", "metadata": { "id": "reXYbPnMdvkX" }, "source": [ "### Remove Stopwords\n", "\n", "We can remove stopwords while performing the following tasks:\n", "\n", "Text Classification\n", " * Spam Filtering\n", " * Language Classification\n", " * Genre Classification\n", " * Caption Generation\n", " * Auto-Tag Generation\n", "\n", " \n", "### Avoid Stopword Removal\n", "\n", " * Machine Translation\n", " * Language Modeling\n", " * Text Summarization\n", " * Question-Answering problems\n" ] }, { "cell_type": "code", "metadata": { "id": "wkohUSbGcPV9" }, "source": [ "from nltk.corpus import stopwords\n", "nltk.download('stopwords')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "QqRMLljRefKb" }, "source": [ "tokens = nltk.word_tokenize(text)\n", "stop_word = set(stopwords.words('english'))" ], "execution_count": 57, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "tBSSy4N_eizi" }, "source": [ "[token for token in tokens if token not in stop_word]" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "bfhpAcskfaa1" }, "source": [ "t = \"He determined to drop his litigation with the monastry and relinguish his claims to the wood-cuting and \\n fishery rihgts at once. 
{ "cell_type": "markdown", "metadata": { "id": "MogLAMMgiMrO" }, "source": [ "# Text normalization (stemming & lemmatization)" ] },
{ "cell_type": "code", "metadata": { "id": "tqdZhSJdgLji" }, "source": [ "from nltk.stem import PorterStemmer, WordNetLemmatizer\n", "nltk.download('wordnet')" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "4bwa5dvAiW_O", "outputId": "a227b8ab-4dc2-4e98-a590-24905d074111", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "ps = PorterStemmer()\n", "lemmatizer = WordNetLemmatizer()\n", "ps.stem('going'), lemmatizer.lemmatize('going', pos='v')  # lemmatizes according to the specified part of speech" ], "execution_count": 78, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "('go', 'go')" ] }, "metadata": { "tags": [] }, "execution_count": 78 } ] },
{ "cell_type": "code", "metadata": { "id": "fng7M0LLigFg" }, "source": [ "[ps.stem(token) for token in tokens if token not in stop_word]" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "XwkX4iETi6tj" }, "source": [ "# Chain lemmatization over the noun, verb and adjective senses so each\n", "# non-stopword token is reduced whatever its part of speech\n", "lemma = []\n", "\n", "for token in tokens:\n", "    if token not in stop_word:\n", "        word = lemmatizer.lemmatize(token, pos='n')\n", "        word = lemmatizer.lemmatize(word, pos='v')\n", "        word = lemmatizer.lemmatize(word, pos='a')\n", "\n", "        lemma.append(word)" ], "execution_count": 79, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "CTuzZEezkfOe" }, "source": [ "lemma" ], "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "id": "VGAVziWVlExW" }, "source": [ "### spaCy lemmatization" ] },
{ "cell_type": "code", "metadata": { "id": "MHJ2hlATlEYm" }, "source": [ "nlp = spacy.load('en_core_web_sm')" ], "execution_count": 82, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "Qv9ltnjQkyAG" }, "source": [ "doc = nlp(text)\n", "\n", "[token.lemma_ for token in doc if token.text not in stop_word]" ], "execution_count": null, "outputs": [] },
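{ "cell_type": "markdown", "metadata": { "id": "stem_vs_lemma_md" }, "source": [ "To see where the two normalizers disagree, a small sketch (reusing the `ps` stemmer and `lemmatizer` created above) runs a few words through both: stemming can yield non-words, while lemmatization returns dictionary forms." ] },
{ "cell_type": "code", "metadata": { "id": "stem_vs_lemma_code" }, "source": [ "# Illustrative comparison: Porter stems can be non-words (e.g. 'studies' -> 'studi'),\n", "# whereas the WordNet lemmatizer maps each word to a dictionary form\n", "for w in ['studies', 'meeting', 'caring']:\n", "    print(w, '->', ps.stem(w), '|', lemmatizer.lemmatize(w, pos='v'))" ], "execution_count": null, "outputs": [] } ] }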