{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Cleaning Text\n",
    "\n",
    "- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n",
    "- **Date:** -\n",
    "- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n",
    "- **Note:** - Based on [http://nbviewer.ipython.org/gist/rjweiss/7577004](http://nbviewer.ipython.org/gist/rjweiss/7577004)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create some raw text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Create a list of three strings.\n",
    "incoming_reports = [\"We are attacking on their left flank but are losing many men.\", \n",
    "               \"We cannot see the enemy army. Nothing else to report.\", \n",
    "               \"We are ready to attack but are waiting for your orders.\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Seperate by word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['We',\n",
       "  'are',\n",
       "  'attacking',\n",
       "  'on',\n",
       "  'their',\n",
       "  'left',\n",
       "  'flank',\n",
       "  'but',\n",
       "  'are',\n",
       "  'losing',\n",
       "  'many',\n",
       "  'men',\n",
       "  '.'],\n",
       " ['We',\n",
       "  'can',\n",
       "  'not',\n",
       "  'see',\n",
       "  'the',\n",
       "  'enemy',\n",
       "  'army',\n",
       "  '.',\n",
       "  'Nothing',\n",
       "  'else',\n",
       "  'to',\n",
       "  'report',\n",
       "  '.'],\n",
       " ['We',\n",
       "  'are',\n",
       "  'ready',\n",
       "  'to',\n",
       "  'attack',\n",
       "  'but',\n",
       "  'are',\n",
       "  'waiting',\n",
       "  'for',\n",
       "  'your',\n",
       "  'orders',\n",
       "  '.']]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# import word tokenizer\n",
    "from nltk.tokenize import word_tokenize\n",
    "\n",
    "# Apply word_tokenize to each element of the list called incoming_reports\n",
    "tokenized_reports = [word_tokenize(report) for report in incoming_reports]\n",
    "\n",
    "# View tokenized_reports\n",
    "tokenized_reports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['We',\n",
       "  'are',\n",
       "  'attacking',\n",
       "  'on',\n",
       "  'their',\n",
       "  'left',\n",
       "  'flank',\n",
       "  'but',\n",
       "  'are',\n",
       "  'losing',\n",
       "  'many',\n",
       "  'men'],\n",
       " ['We',\n",
       "  'can',\n",
       "  'not',\n",
       "  'see',\n",
       "  'the',\n",
       "  'enemy',\n",
       "  'army',\n",
       "  'Nothing',\n",
       "  'else',\n",
       "  'to',\n",
       "  'report'],\n",
       " ['We',\n",
       "  'are',\n",
       "  'ready',\n",
       "  'to',\n",
       "  'attack',\n",
       "  'but',\n",
       "  'are',\n",
       "  'waiting',\n",
       "  'for',\n",
       "  'your',\n",
       "  'orders']]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Import regex\n",
    "import re\n",
    "\n",
    "# Import string\n",
    "import string\n",
    "\n",
    "\n",
    "regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html\n",
    "\n",
    "tokenized_reports_no_punctuation = []\n",
    "\n",
    "for review in tokenized_reports:\n",
    "    \n",
    "    new_review = []\n",
    "    for token in review: \n",
    "        new_token = regex.sub(u'', token)\n",
    "        if not new_token == u'':\n",
    "            new_review.append(new_token)\n",
    "    \n",
    "    tokenized_reports_no_punctuation.append(new_review)\n",
    "    \n",
    "tokenized_reports_no_punctuation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Remove filler words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['We', 'attacking', 'left', 'flank', 'losing', 'many', 'men'],\n",
       " ['We', 'see', 'enemy', 'army', 'Nothing', 'else', 'report'],\n",
       " ['We', 'ready', 'attack', 'waiting', 'orders']]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from nltk.corpus import stopwords\n",
    "\n",
    "tokenized_reports_no_stopwords = []\n",
    "for report in tokenized_reports_no_punctuation:\n",
    "    new_term_vector = []\n",
    "    for word in report:\n",
    "        if not word in stopwords.words('english'):\n",
    "            new_term_vector.append(word)\n",
    "    tokenized_reports_no_stopwords.append(new_term_vector)\n",
    "            \n",
    "tokenized_reports_no_stopwords"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.3.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}