{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "[View / download this notebook](http://nbviewer.ipython.org/urls/raw.github.com/dayton-dynamic/dayton-dynamic.github.com/master/regular_expressions.ipynb)"
     ]
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Our text: The Zen of Python"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "!cat zen.txt"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    print if /never/;\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    print if /is better than/;\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Case-sensitive by default"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    print if /simple/;\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    print if /simple/i;\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Wildcards"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    print if / ... /;\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Extracting"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    print \"$1\\n\" if / (...) /;\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    if (/(....) to (.....)/) {\n",
      "      print;\n",
      "      print \"$1, $2\\n\";\n",
      "      }\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "`*` means \"any number\""
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    if (/(.*) to (.*)/) {\n",
      "      print;\n",
      "      print \"$1, $2\\n\";\n",
      "      }\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Character classes"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    print \"$1\\n\" if / ([aeiou].....) /i;\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    print \"$1\\n\" if / ([aeiou][^ ]*) /i;\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    if (/([^ ]*) to ([^ ]*)/) {\n",
      "      print;\n",
      "      print \"$1, $2\\n\";\n",
      "      }\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Hmm. How do we get rid of that trailing punctuation?"
     ]
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Character class shortcuts"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    if (/([a-z]*) to ([a-z]*)/) {\n",
      "      print;\n",
      "      print \"$1, $2\\n\";\n",
      "      }\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    if (/([\\w]*) to ([\\w]*)/) {\n",
      "      print;\n",
      "      print \"$1, $2\\n\";\n",
      "      }\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Search and replace"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    $_ =~ s/is/might be/; \n",
      "    print $_;\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "# Capture the word before ' to ' and reuse it twice in the replacement.\n",
      "# In the replacement text the capture is spelled $1; \\1 is meant for\n",
      "# backreferences inside the pattern itself.\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    $_ =~ s/ (\\w*) to / $1, really very $1, to /; \n",
      "    print $_;\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Splitting"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    if (/ to /) {\n",
      "      print\n",
      "      print split / /;\n",
      "      }\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "# Pull the first dash-separated run of digits out of a sentence.\n",
      "$data = \"I can be reached at (937) 395-2343 or 122-4235, unless it's raining.\";\n",
      "if ($data =~ /(\\d+-\\d+)/) {\n",
      "  print \"$1\\n\";  # leftmost match only: 395-2343\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Verbose mode"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "From the Python `re` module documentation for `re.VERBOSE` (Perl's `/x` modifier, used below, is the equivalent):\n",
      "\n",
      "> This flag allows you to write regular expressions that look nicer. Whitespace within the pattern is ignored, except when in a character class or preceded by an unescaped backslash, and, when a line contains a '#' neither in a character class or preceded by an unescaped backslash, all characters from the leftmost such '#' through the end of the line are ignored."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    if (/(\\b[A-Z][a-z]+\\b)/) {\n",
      "      print $1;\n",
      "      print \"\\n\";\n",
      "      }\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Random question: the pattern above only reports the first capitalized word on each line. How would we display *all* of them?\n",
      "\n",
      "But anyway:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "  while (<ZENFILE>) {\n",
      "    if (/ # Capitalized word detector\n",
      "          (       # capture the result in $1\n",
      "           \\b     # word boundary\n",
      "           [A-Z]  # one capital letter\n",
      "           [a-z]+ # one or more lowercase letters\n",
      "           \\b     # word boundary\n",
      "           )      # end captured group\n",
      "          /x) {\n",
      "      print $1;\n",
      "      print \"\\n\";\n",
      "      }\n",
      " }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "What are we skipping?\n",
      "---------------------\n",
      "\n",
      "Lots:\n",
      "    \n",
      "* Lookaheads"
     ]
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Verbal expressions"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "see [wiki](https://github.com/VerbalExpressions/JSVerbalExpressions/wiki)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from verbalexpressions import VerEx\n",
      "verbal_expression = VerEx()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create an example of how to test for correctly formed URLs\n",
      "verbal_expression = VerEx()\n",
      "tester = (verbal_expression.\n",
      "            start_of_line().\n",
      "            find('http').\n",
      "            maybe('s').\n",
      "            find('://').\n",
      "            maybe('www.').\n",
      "            anything_but(' ').\n",
      "            end_of_line()\n",
      ")\n",
      "\n",
      "tester.source()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "tester.match(\"https://www.google.com\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "tester.match(\"my nifty website\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "result = tester.match(\"https://www.google.com\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "result.groups()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "VerEx().anything().source()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "VerEx().find('cows').match('how do you like Cows?')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# `or` is a reserved word in Python, so `.or()` cannot even be parsed;\n",
      "# the Python port of VerbalExpressions names this method `OR`.\n",
      "ve2 = VerEx().find(\"ftp\").OR().find(\"http\").maybe(\"s\").then(\"://\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "[RegExpBuilder](https://github.com/thebinarysearchtree/RegExpBuilder)"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "see [comparison](http://thechangelog.com/meet-regexpbuilder-verbal-expressions-rich-older-cousin/)"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Tragically, they haven't yet set it up for `pip install`.  I cloned from git & then created a symlink:\n",
      "\n",
      "    ln -s ~/sw/RegExpBuilder/python/regexpbuilder regexpbuilder\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from regexpbuilder.RegExpBuilder import RegExpBuilder"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "builder1 = (RegExpBuilder()\n",
      "  .find(\"$\")\n",
      "  .min(1).digits()\n",
      "  .then(\".\")\n",
      "  .digit()\n",
      "  .digit())"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "builder1.search(\"is there money in $2.22?\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "money = builder1.get_regexp()\n",
      "result = money.search(\"is there money in $2.22?\")\n",
      "result"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "result.group(0)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "discount = (RegExpBuilder()\n",
      "            .find(\"was\").then(money)\n",
      "            .maybe(\",\").find(\"now\").then(money)\n",
      "            )"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "money"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "type(money)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}