{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Cleaning Text\n", "\n", "- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n", "- **Date:** -\n", "- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n", "- **Note:** - Based on [http://nbviewer.ipython.org/gist/rjweiss/7577004](http://nbviewer.ipython.org/gist/rjweiss/7577004)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create some raw text" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Create a list of three strings.\n", "incoming_reports = [\"We are attacking on their left flank but are losing many men.\", \n", " \"We cannot see the enemy army. Nothing else to report.\", \n", " \"We are ready to attack but are waiting for your orders.\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Seperate by word" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['We',\n", " 'are',\n", " 'attacking',\n", " 'on',\n", " 'their',\n", " 'left',\n", " 'flank',\n", " 'but',\n", " 'are',\n", " 'losing',\n", " 'many',\n", " 'men',\n", " '.'],\n", " ['We',\n", " 'can',\n", " 'not',\n", " 'see',\n", " 'the',\n", " 'enemy',\n", " 'army',\n", " '.',\n", " 'Nothing',\n", " 'else',\n", " 'to',\n", " 'report',\n", " '.'],\n", " ['We',\n", " 'are',\n", " 'ready',\n", " 'to',\n", " 'attack',\n", " 'but',\n", " 'are',\n", " 'waiting',\n", " 'for',\n", " 'your',\n", " 'orders',\n", " '.']]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# import word tokenizer\n", "from nltk.tokenize import word_tokenize\n", "\n", "# Apply word_tokenize to each element of the list called incoming_reports\n", "tokenized_reports = [word_tokenize(report) for report in incoming_reports]\n", "\n", "# View tokenized_reports\n", "tokenized_reports" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['We',\n", " 'are',\n", " 'attacking',\n", " 'on',\n", " 'their',\n", " 'left',\n", " 'flank',\n", " 'but',\n", " 'are',\n", " 'losing',\n", " 'many',\n", " 'men'],\n", " ['We',\n", " 'can',\n", " 'not',\n", " 'see',\n", " 'the',\n", " 'enemy',\n", " 'army',\n", " 'Nothing',\n", " 'else',\n", " 'to',\n", " 'report'],\n", " ['We',\n", " 'are',\n", " 'ready',\n", " 'to',\n", " 'attack',\n", " 'but',\n", " 'are',\n", " 'waiting',\n", " 'for',\n", " 'your',\n", " 'orders']]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Import regex\n", "import re\n", "\n", "# Import string\n", "import string\n", "\n", "\n", "regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html\n", "\n", "tokenized_reports_no_punctuation = []\n", "\n", "for review in tokenized_reports:\n", " \n", " new_review = []\n", " for token in review: \n", " new_token = regex.sub(u'', token)\n", " if not new_token == u'':\n", " new_review.append(new_token)\n", " \n", " tokenized_reports_no_punctuation.append(new_review)\n", " \n", "tokenized_reports_no_punctuation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Remove filler words" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['We', 'attacking', 
 ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.3.5" } }, "nbformat": 4, "nbformat_minor": 0 }