{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "[View / download this notebook](http://nbviewer.ipython.org/urls/raw.github.com/dayton-dynamic/dayton-dynamic.github.com/master/regular_expressions.ipynb)"
     ]
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Our text: The Zen of Python"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "!cat zen.txt"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    print if /never/;\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    print if /is better than/;\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Case-sensitive by default"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    print if /simple/;\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    print if /simple/i;\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Wildcards"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    print if / ... /;\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Extracting"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    print \"$1\\n\" if / (...) /;\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    if (/(....) to (.....)/) {\n",
      "        print;\n",
      "        print \"$1, $2\\n\";\n",
      "    }\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "`*` means \"any number\""
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    if (/(.*) to (.*)/) {\n",
      "        print;\n",
      "        print \"$1, $2\\n\";\n",
      "    }\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Character classes"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    print \"$1\\n\" if / ([aeiou].....) /i;\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    print \"$1\\n\" if / ([aeiou][^ ]*) /i;\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    if (/([^ ]*) to ([^ ]*)/) {\n",
      "        print;\n",
      "        print \"$1, $2\\n\";\n",
      "    }\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Hmm. How to get rid of that trailing punctuation?"
     ]
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Character class shortcuts"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    if (/([a-z]*) to ([a-z]*)/) {\n",
      "        print;\n",
      "        print \"$1, $2\\n\";\n",
      "    }\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    if (/([\\w]*) to ([\\w]*)/) {\n",
      "        print;\n",
      "        print \"$1, $2\\n\";\n",
      "    }\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Search and replace"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    $_ =~ s/is/might be/;\n",
      "    print $_;\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    $_ =~ s/ (\\w*) to / $1, really very $1, to /;\n",
      "    print $_;\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Splitting"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    if (/ to /) {\n",
      "        print;\n",
      "        print split / /;\n",
      "    }\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "$data = \"I can be reached at (937) 395-2343 or 122-4235, unless it's raining.\";\n",
      "if ($data =~ /(\\d+-\\d+)/) {\n",
      "    print \"$1\\n\";\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Verbose mode"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "> This flag allows you to write regular expressions that look nicer. Whitespace within the pattern is ignored, except when in a character class or preceded by an unescaped backslash, and, when a line contains a '#' neither in a character class or preceded by an unescaped backslash, all characters from the leftmost such '#' through the end of the line are ignored."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    if (/(\\b[A-Z][a-z]+\\b)/) {\n",
      "        print $1;\n",
      "        print \"\\n\";\n",
      "    }\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Random question: How to display all the capitalized words?\n",
      "\n",
      "But anyway:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%script perl\n",
      "open (ZENFILE, 'zen.txt');\n",
      "while (<ZENFILE>) {\n",
      "    if (/            # Capitalized word detector\n",
      "        (            # capture the result in $1\n",
      "          \\b         # word boundary\n",
      "          [A-Z]      # one capital letter\n",
      "          [a-z]+     # one or more lowercase letters\n",
      "          \\b         # word boundary\n",
      "        )            # end captured group\n",
      "        /x) {\n",
      "        print $1;\n",
      "        print \"\\n\";\n",
      "    }\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "What are we skipping?\n",
      "---------------------\n",
      "\n",
      "Lots:\n",
      " \n",
      "* Lookaheads"
     ]
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Verbal expressions"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "see [wiki](https://github.com/VerbalExpressions/JSVerbalExpressions/wiki)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from verbalexpressions import VerEx\n",
      "verbal_expression = VerEx()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Create an example of how to test for correctly formed URLs\n",
      "verbal_expression = VerEx()\n",
      "tester = (verbal_expression.\n",
      "    start_of_line().\n",
      "    find('http').\n",
      "    maybe('s').\n",
      "    find('://').\n",
      "    maybe('www.').\n",
      "    anything_but(' ').\n",
      "    end_of_line()\n",
      ")\n",
      "\n",
      "tester.source()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "tester.match(\"https://www.google.com\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "tester.match(\"my nifty website\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "result = tester.match(\"https://www.google.com\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "result.groups()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "VerEx().anything().source()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "VerEx().find('cows').match('how do you like Cows?')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "ve2 = VerEx().find(\"ftp\").OR().find(\"http\").maybe(\"s\").then(\"://\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "[RegExpBuilder](https://github.com/thebinarysearchtree/RegExpBuilder)"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "see [comparison](http://thechangelog.com/meet-regexpbuilder-verbal-expressions-rich-older-cousin/)"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Tragically, they haven't yet set it up for `pip install`. I cloned from git & then created a symlink:\n",
      "\n",
      "    ln -s ~/sw/RegExpBuilder/python/regexpbuilder regexpbuilder\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from regexpbuilder.RegExpBuilder import RegExpBuilder"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "builder1 = (RegExpBuilder()\n",
      "    .find(\"$\")\n",
      "    .min(1).digits()\n",
      "    .then(\".\")\n",
      "    .digit()\n",
      "    .digit())"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "builder1.search(\"is there money in $2.22?\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "money = builder1.get_regexp()\n",
      "result = money.search(\"is there money in $2.22?\")\n",
      "result"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "result.group(0)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "discount = (RegExpBuilder()\n",
      "    .find(\"was\").then(money)\n",
      "    .maybe(\",\").find(\"now\").then(money)\n",
      ")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "money"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "type(money)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}