{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Regular Expressions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Basic Regular Expression Patterns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ome ', 'tring', '.']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# neither S nor s\n",
    "re.compile(r'[^Ss]+').findall(\"Some strings.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['work']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# not a period\n",
    "re.compile(r'[^\\.]+').findall(\"work\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['e', '^']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Either e or ^\n",
    "re.compile(r'[e^]+').findall(\"egg, ^\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['a^b']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# inside brackets a caret that is not first is literal: matches runs of a, ^ or b\n",
    "re.compile(r'[a^b]+').findall(\"hello, a^b.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['color', 'colour']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# optional elements\n",
    "re.compile(r'colou?r').findall(\"color, colour\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['2']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# an integer\n",
    "re.compile(r'[0-9][0-9]*').findall(\"2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['2']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# an integer\n",
    "re.compile(r'[0-9]+').findall(\"2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['begin', \"beg'n\", 'begun']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# any single character\n",
    "re.compile(r'beg.n').findall(\"begin, beg'n, begun\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['dog.']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# end-of-line anchor: $ matches only at the end of the line\n",
    "# \\. means . is a literal period, not the wildcard\n",
    "re.compile(r'dog\\.$').findall(\"the dog.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['the']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# boundary\n",
    "re.compile(r'\\bthe\\b').findall(\"other, the, $they\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Disjunction, Grouping, and Precedence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['cat', 'dog']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# disjunction\n",
    "re.compile(r'cat|dog').findall(\"there are a cat and a dog.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['y', 'ies']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# precedence\n",
    "re.compile(r'gupp(y|ies)').findall(\"guppy and guppies.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Column 1 ', 'Column 2 ', 'Column 3 ', 'Column 4']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.compile(r'Column [0-9]+ *').findall(\"Column 1 Column 2 Column 3 Column 4.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Column 4', '', '']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# () groups the whole sequence so * repeats it; findall returns the group's last repetition ('' for empty matches)\n",
    "re.compile(r'(Column [0-9]+ *)*').findall(\"Column 1 Column 2 Column 3 Column 4.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['the', 'any', 'the']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sequences have higher precedence than disjunction: matches \"the\" or \"any\", never \"theny\" as a whole\n",
    "re.compile(r'the|any').findall(\"the, any, theny\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### A Simple Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['the', 'the', 'the']\n",
      "['the', 'the', 'the', 'the']\n",
      "['the']\n",
      "[' the_', ' the2']\n",
      "[('', ','), (' ', '_'), (' ', '2')]\n"
     ]
    }
   ],
   "source": [
    "# match the word \"the\"\n",
    "print(re.compile(r'the').findall(\"the, The, the_, the25\"))\n",
    "print(re.compile(r'[tT]he').findall(\"the, their, the_, the25\"))\n",
    "print(re.compile(r'\\b[tT]he\\b').findall(\"the, their, the_, the25\"))\n",
    "print(re.compile(r'[^a-zA-Z][tT]he[^a-zA-Z]').findall(\"the, their, the_, the25\"))\n",
    "print(re.compile(r'(^|[^a-zA-Z])[tT]he([^a-zA-Z]|$)').findall(\"the, their, the_, the25\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### A More Complex Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<_sre.SRE_Match object; span=(0, 6), match='$199.9'>\n",
      "<_sre.SRE_Match object; span=(15, 22), match='$199.99'>\n",
      "<_sre.SRE_Match object; span=(41, 45), match='$199'>\n",
      "<_sre.SRE_Match object; span=(59, 60), match='$'>\n",
      "<_sre.SRE_Match object; span=(74, 80), match='$199.1'>\n"
     ]
    }
   ],
   "source": [
    "p = re.compile(r'\\$[0-9]{0,3}(\\.[0-9]+)?\\b')\n",
    "for item in p.finditer(\"$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . a$199.1\"):\n",
    "    print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<_sre.SRE_Match object; span=(0, 6), match='$199.9'>\n",
      "<_sre.SRE_Match object; span=(14, 22), match=' $199.99'>\n",
      "<_sre.SRE_Match object; span=(40, 45), match=' $199'>\n",
      "<_sre.SRE_Match object; span=(58, 60), match=' $'>\n"
     ]
    }
   ],
   "source": [
    "p = re.compile(r'(^|\\W)\\$[0-9]{0,3}(\\.[0-9]+)?\\b')\n",
    "for item in p.finditer(\"$199.9, one is $199.99. and the other is $199. the last is $1999999.99 . a$199.1\"):\n",
    "    print(item)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Regular Expression Substitution, Capture Groups, and ELIZA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<_sre.SRE_Match object; span=(0, 45), match='the bigger they were, the bigger they will be'>\n"
     ]
    }
   ],
   "source": [
    "for item in re.compile(r'the (.*)er they were, the \\1er they will be').finditer(\n",
    "    \"the bigger they were, the bigger they will be but not the bigger they were, the faster they will be.\"):\n",
    "    print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<_sre.SRE_Match object; span=(0, 38), match='the faster they ran, the faster we ran'>\n"
     ]
    }
   ],
   "source": [
    "for item in re.compile(r'the (.*)er they (.*), the \\1er we \\2').finditer(\n",
    "    \"the faster they ran, the faster we ran but not the faster they ran, the faster we ate.\"):\n",
    "    print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<_sre.SRE_Match object; span=(0, 25), match='a few cats like some cats'>\n"
     ]
    }
   ],
   "source": [
    "# non-capturing\n",
    "for item in re.compile(r'(?:some|a few) (people|cats) like some \\1').finditer(\n",
    "    \"a few cats like some cats but not some cats like some a few.\"):\n",
    "    print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "abc\n",
      "abc\n"
     ]
    }
   ],
   "source": [
    "print(re.match(\"([abc])+\", \"abc\").group())\n",
    "print(re.match(\"(?:[abc])+\", \"abc\").group())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('c',)\n",
      "()\n"
     ]
    }
   ],
   "source": [
    "print(re.match(\"([abc])+\", \"abc\").groups())\n",
    "print(re.match(\"(?:[abc])+\", \"abc\").groups())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Lookahead assertions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Volcano']\n",
      "[]\n"
     ]
    }
   ],
   "source": [
    "# lookahead: positive (?=...) first, then negative (?!...)\n",
    "test = re.compile(r'^(?=Volcano)[a-zA-Z]+')\n",
    "print(test.findall(\"Volcano I\"))\n",
    "test = re.compile(r'^(?!Volcano)[a-zA-Z]+')\n",
    "print(test.findall(\"Volcano I\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]\n",
      "['Volcano']\n"
     ]
    }
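   ],
   "source": [
    "# lookbehind: positive (?<=...) first, then negative (?<!...); at ^ nothing precedes, so only the negative form matches\n",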
    "test = re.compile(r'^(?<=Volcano)[a-zA-Z]+')\n",
    "print(test.findall(\"Volcano I\"))\n",
    "test = re.compile(r'^(?<!Volcano)[a-zA-Z]+')\n",
    "print(test.findall(\"Volcano I\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}