{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Import necessary dependencies" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import spacy\n", "import pandas as pd\n", "import numpy as np\n", "import nltk\n", "from nltk.tokenize.toktok import ToktokTokenizer\n", "import re\n", "from bs4 import BeautifulSoup\n", "from contractions import CONTRACTION_MAP\n", "import unicodedata\n", "\n", "nlp = spacy.load('en', parse = False, tag=False, entity=False)\n", "tokenizer = ToktokTokenizer()\n", "stopword_list = nltk.corpus.stopwords.words('english')\n", "stopword_list.remove('no')\n", "stopword_list.remove('not')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Cleaning Text - strip HTML" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def strip_html_tags(text):\n", " soup = BeautifulSoup(text, \"html.parser\")\n", " stripped_text = soup.get_text()\n", " return stripped_text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Removing accented characters" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def remove_accented_chars(text):\n", " text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')\n", " return text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Expanding Contractions" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):\n", " \n", " contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), \n", " flags=re.IGNORECASE|re.DOTALL)\n", " def expand_match(contraction):\n", " match = contraction.group(0)\n", " first_char = match[0]\n", " expanded_contraction = contraction_mapping.get(match)\\\n", " if contraction_mapping.get(match)\\\n", " else contraction_mapping.get(match.lower()) \n", " expanded_contraction = first_char+expanded_contraction[1:]\n", " return expanded_contraction\n", " \n", " expanded_text = contractions_pattern.sub(expand_match, text)\n", " expanded_text = re.sub(\"'\", \"\", expanded_text)\n", " return expanded_text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Removing Special Characters" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def remove_special_characters(text):\n", " text = re.sub('[^a-zA-z0-9\\s]', '', text)\n", " return text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Lemmatizing text" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def lemmatize_text(text):\n", " text = nlp(text)\n", " text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])\n", " return text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Removing Stopwords" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def remove_stopwords(text, is_lower_case=False):\n", " tokens = tokenizer.tokenize(text)\n", " tokens = [token.strip() for token in tokens]\n", " if is_lower_case:\n", " filtered_tokens = [token for token in tokens if token not in stopword_list]\n", " else:\n", " filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]\n", " filtered_text = ' '.join(filtered_tokens) \n", " return filtered_text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Normalize text corpus - tying it all together" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,\n", " accented_char_removal=True, text_lower_case=True, \n", " text_lemmatization=True, special_char_removal=True, \n", " stopword_removal=True):\n", " \n", " normalized_corpus = []\n", " # normalize each document in the corpus\n", " for doc in corpus:\n", " # strip HTML\n", " if html_stripping:\n", " doc = strip_html_tags(doc)\n", " # remove accented characters\n", " if accented_char_removal:\n", " doc = remove_accented_chars(doc)\n", " # expand contractions \n", " if contraction_expansion:\n", " doc = expand_contractions(doc)\n", " # lowercase the text \n", " if text_lower_case:\n", " doc = doc.lower()\n", " # remove extra newlines\n", " doc = re.sub(r'[\\r|\\n|\\r\\n]+', ' ',doc)\n", " # insert spaces between special characters to isolate them \n", " special_char_pattern = re.compile(r'([{.(-)!}])')\n", " doc = special_char_pattern.sub(\" \\\\1 \", doc)\n", " # lemmatize text\n", " if text_lemmatization:\n", " doc = lemmatize_text(doc)\n", " # remove special characters \n", " if special_char_removal:\n", " doc = remove_special_characters(doc) \n", " # remove extra whitespace\n", " doc = re.sub(' +', ' ', doc)\n", " # remove stopwords\n", " if stopword_removal:\n", " doc = remove_stopwords(doc, is_lower_case=text_lower_case)\n", " \n", " normalized_corpus.append(doc)\n", " \n", " return normalized_corpus\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Sample demo" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"

Héllo! Héllo! can you hear me! I just heard about Python!
\\r\\n \\n It's an amazing language which can be used for Scripting, Web development,\\r\\n\\r\\n\\n Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\\n\\n What are you waiting for? Go and get started.
He's learning, she's learning, they've already\\n\\n\\n got a headstart!

\\n \"" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "document = \"\"\"

Héllo! Héllo! can you hear me! I just heard about Python!
\\r\\n \n", " It's an amazing language which can be used for Scripting, Web development,\\r\\n\\r\\n\n", " Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\\n\n", " What are you waiting for? Go and get started.
He's learning, she's learning, they've already\\n\\n\n", " got a headstart!

\n", " \"\"\"\n", "document" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Hello Hello can you hear me I just heard about Python It is an amazing language which can be used for Scripting Web development Information Retrieval Natural Language Processing Machine Learning Artificial Intelligence What are you waiting for Go and get started He is learning she is learning they have already got a headstart ']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "normalize_corpus([document], text_lemmatization=False, stopword_removal=False, text_lower_case=False)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['hello hello hear hear python amazing language use scripting web development information retrieval natural language processing machine learning artificial intelligence wait go get start learn learn already get headstart']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "normalize_corpus([document])" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }