{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Import necessary dependencies" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import spacy\n", "import pandas as pd\n", "import numpy as np\n", "import nltk\n", "from nltk.tokenize.toktok import ToktokTokenizer\n", "import re\n", "from bs4 import BeautifulSoup\n", "from contractions import CONTRACTION_MAP\n", "import unicodedata\n", "\n", "nlp = spacy.load('en', parse = False, tag=False, entity=False)\n", "tokenizer = ToktokTokenizer()\n", "stopword_list = nltk.corpus.stopwords.words('english')\n", "stopword_list.remove('no')\n", "stopword_list.remove('not')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Cleaning Text - strip HTML" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def strip_html_tags(text):\n", " soup = BeautifulSoup(text, \"html.parser\")\n", " stripped_text = soup.get_text()\n", " return stripped_text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Removing accented characters" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def remove_accented_chars(text):\n", " text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')\n", " return text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Expanding Contractions" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):\n", " \n", " contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), \n", " flags=re.IGNORECASE|re.DOTALL)\n", " def expand_match(contraction):\n", " match = contraction.group(0)\n", " first_char = match[0]\n", " expanded_contraction = contraction_mapping.get(match)\\\n", " if contraction_mapping.get(match)\\\n", " else contraction_mapping.get(match.lower()) \n", " expanded_contraction = first_char+expanded_contraction[1:]\n", " return expanded_contraction\n", " \n", " expanded_text = contractions_pattern.sub(expand_match, text)\n", " expanded_text = re.sub(\"'\", \"\", expanded_text)\n", " return expanded_text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Removing Special Characters" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def remove_special_characters(text):\n", " text = re.sub('[^a-zA-z0-9\\s]', '', text)\n", " return text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Lemmatizing text" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def lemmatize_text(text):\n", " text = nlp(text)\n", " text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])\n", " return text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Removing Stopwords" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def remove_stopwords(text, is_lower_case=False):\n", " tokens = tokenizer.tokenize(text)\n", " tokens = [token.strip() for token in tokens]\n", " if is_lower_case:\n", " filtered_tokens = [token for token in tokens if token not in stopword_list]\n", " else:\n", " filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]\n", " 
    "    filtered_text = ' '.join(filtered_tokens)\n",
    "    return filtered_text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Normalize text corpus - tying it all together"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,\n",
    "                     accented_char_removal=True, text_lower_case=True,\n",
    "                     text_lemmatization=True, special_char_removal=True,\n",
    "                     stopword_removal=True):\n",
    "\n",
    "    normalized_corpus = []\n",
    "    # normalize each document in the corpus\n",
    "    for doc in corpus:\n",
    "        # strip HTML\n",
    "        if html_stripping:\n",
    "            doc = strip_html_tags(doc)\n",
    "        # remove accented characters\n",
    "        if accented_char_removal:\n",
    "            doc = remove_accented_chars(doc)\n",
    "        # expand contractions\n",
    "        if contraction_expansion:\n",
    "            doc = expand_contractions(doc)\n",
    "        # lowercase the text\n",
    "        if text_lower_case:\n",
    "            doc = doc.lower()\n",
    "        # remove extra newlines\n",
    "        doc = re.sub(r'[\\\r\\\n]+', ' ', doc)\n",
    "        # insert spaces around special characters to isolate them\n",
    "        special_char_pattern = re.compile(r'([{.(-)!}])')\n",
    "        doc = special_char_pattern.sub(\" \\\\1 \", doc)\n",
    "        # lemmatize text\n",
    "        if text_lemmatization:\n",
    "            doc = lemmatize_text(doc)\n",
    "        # remove special characters\n",
    "        if special_char_removal:\n",
    "            doc = remove_special_characters(doc)\n",
    "        # collapse extra whitespace\n",
    "        doc = re.sub(' +', ' ', doc)\n",
    "        # remove stopwords\n",
    "        if stopword_removal:\n",
    "            doc = remove_stopwords(doc, is_lower_case=text_lower_case)\n",
    "\n",
    "        normalized_corpus.append(doc)\n",
    "\n",
    "    return normalized_corpus\n"
   ]
  },
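  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal usage sketch for `normalize_corpus` on a hypothetical two-document corpus (illustrative only; the inputs and flag choices are assumptions, not part of the original pipeline). Lemmatization is switched off here just to keep the run cheap; note that `no`/`not` survive stopword removal because they were kept back in the first cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# hypothetical mini-corpus used only to exercise the full pipeline\n",
    "corpus = [\"<p>Héllo! This is NOT a drill, you can't stop now.</p>\",\n",
    "          \"The sky is blue and the weather is great today!\"]\n",
    "normalize_corpus(corpus, text_lemmatization=False)"
   ]
  },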
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sample demo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"Héllo! Héllo! can you hear me! I just heard about Python!\\r\\n \n",
       " It's an amazing language which can be used for Scripting, Web development,\\r\\n\\r\\n\n",
       " Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\\n\n",
       " What are you waiting for? Go and get started. He's learning, she's learning, they've already\\n\\n\n",
       " got a headstart!