{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import re" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Regular Expressions" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Basic Regular Expression Patterns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['ome ', 'tring', '.']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# neither S nor s\n", "re.compile(r'[^Ss]+').findall(\"Some strings.\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['work']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# not a period\n", "re.compile(r'[^\\.]+').findall(\"work\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['e', '^']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Either e or ^\n", "re.compile(r'[e^]+').findall(\"egg, ^\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "['a^b']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# a^b\n", "re.compile(r'[a^b]+').findall(\"hello, a^b.\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['color', 'colour']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# optional elements\n", "re.compile(r'colou?r').findall(\"color, colour\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['2']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# an integer\n", "re.compile(r'[0-9][0-9]*').findall(\"2\")" 
] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['2']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# an integer\n", "re.compile(r'[0-9]+').findall(\"2\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['begin', \"beg'n\", 'begun']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# any single character\n", "re.compile(r'beg.n').findall(\"begin, beg'n, begun\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['dog.']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# begin and end\n", "# \\. means . is a period, not the wildcard\n", "re.compile(r'dog\\.$').findall(\"the dog.\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['the']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# boundary\n", "re.compile(r'\\bthe\\b').findall(\"other, the, $they\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Disjunction, Grouping, and Precedence" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['cat', 'dog']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# disjunction\n", "re.compile(r'cat|dog').findall(\"there are a cat and a dog.\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['y', 'ies']" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# precedence\n", "re.compile(r'gupp(y|ies)').findall(\"guppy and guppies.\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "scrolled": true }, "outputs": [ { 
"data": { "text/plain": [ "['Column 1 ', 'Column 2 ', 'Column 3 ', 'Column 4']" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.compile(r'Column [0-9]+ *').findall(\"Column 1 Column 2 Column 3 Column 4.\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Column 4', '', '']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# () as a whole\n", "re.compile(r'(Column [0-9]+ *)*').findall(\"Column 1 Column 2 Column 3 Column 4.\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['the', 'any', 'the']" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sequences have a higher precedence than disjunction, so this matches \"the\" or \"any\" but cannot match \"theny\"\n", "re.compile(r'the|any').findall(\"the, any, theny\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A Simple Example" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['the', 'the', 'the']\n", "['the', 'the', 'the', 'the']\n", "['the']\n", "[' the_', ' the2']\n", "[('', ','), (' ', '_'), (' ', '2')]\n" ] } ], "source": [ "# match the word \"the\"\n", "print(re.compile(r'the').findall(\"the, The, the_, the25\"))\n", "print(re.compile(r'[tT]he').findall(\"the, their, the_, the25\"))\n", "print(re.compile(r'\\b[tT]he\\b').findall(\"the, their, the_, the25\"))\n", "print(re.compile(r'[^a-zA-Z][tT]he[^a-zA-Z]').findall(\"the, their, the_, the25\"))\n", "print(re.compile(r'(^|[^a-zA-Z])[tT]he([^a-zA-Z]|$)').findall(\"the, their, the_, the25\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A More Complex Example" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; 
span=(0, 6), match='$199.9'>\n", "<_sre.SRE_Match object; span=(15, 22), match='$199.99'>\n", "<_sre.SRE_Match object; span=(41, 45), match='$199'>\n", "<_sre.SRE_Match object; span=(59, 60), match='$'>\n", "<_sre.SRE_Match object; span=(74, 80), match='$199.1'>\n" ] } ], "source": [ "p = re.compile(r'\\$[0-9]{0,3}(\\.[0-9]+)?\\b')\n", "for item in p.finditer(\"$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . a$199.1\"):\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(0, 6), match='$199.9'>\n", "<_sre.SRE_Match object; span=(14, 22), match=' $199.99'>\n", "<_sre.SRE_Match object; span=(40, 45), match=' $199'>\n", "<_sre.SRE_Match object; span=(58, 60), match=' $'>\n" ] } ], "source": [ "p = re.compile(r'(^|\\W)\\$[0-9]{0,3}(\\.[0-9]+)?\\b')\n", "for item in p.finditer(\"$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . 
a$199.1\"):\n", " print(item)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Regular Expression Substitution, Capture Groups, and ELIZA" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(0, 45), match='the bigger they were, the bigger they will be'>\n" ] } ], "source": [ "for item in re.compile(r'the (.*)er they were, the \\1er they will be').finditer(\n", " \"the bigger they were, the bigger they will be but not the bigger they were, the faster they will be.\"):\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(0, 38), match='the faster they ran, the faster we ran'>\n" ] } ], "source": [ "for item in re.compile(r'the (.*)er they (.*), the \\1er we \\2').finditer(\n", " \"the faster they ran, the faster we ran but not the faster they ran, the faster we ate.\"):\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_sre.SRE_Match object; span=(0, 25), match='a few cats like some cats'>\n" ] } ], "source": [ "# non-capturing\n", "for item in re.compile(r'(?:some|a few) (people|cats) like some \\1').finditer(\n", " \"a few cats like some cats but not some cats like some a few.\"):\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "abc\n", "abc\n" ] } ], "source": [ "print(re.match(\"([abc])+\", \"abc\").group())\n", "print(re.match(\"(?:[abc])+\", \"abc\").group())" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('c',)\n", "()\n" ] } ], "source": [ "print(re.match(\"([abc])+\", \"abc\").groups())\n", 
"print(re.match(\"(?:[abc])+\", \"abc\").groups())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Lookahead assertions" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Volcano']\n", "[]\n" ] } ], "source": [ "# 前向\n", "test = re.compile(r'^(?=Volcano)[a-zA-Z]+')\n", "print(test.findall(\"Volcano I\"))\n", "test = re.compile(r'^(?!Volcano)[a-zA-Z]+')\n", "print(test.findall(\"Volcano I\"))" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[]\n", "['Volcano']\n" ] } ], "source": [ "# 后向\n", "test = re.compile(r'^(?<=Volcano)[a-zA-Z]+')\n", "print(test.findall(\"Volcano I\"))\n", "test = re.compile(r'^(?