{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# How to capture repeating subpatterns\n",
    "\n",
    "https://stackoverflow.com/a/67346603/1421907\n",
    "\n",
    "Here are the two solutions from the Stack Overflow thread linked above. The first one is based on the standard-library `re` module; the second one is based on the third-party `regex` module.\n",
    "\n",
    "Each line of the sample text holds a label, optionally followed by a parenthesised list of 2 to 4 integers, then a colon and a decimal value. When the integer list is present it is used as the coordinate instead of the label."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "import re\n",
    "\n",
    "import regex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample input: one \"label [(i, j, ...)] : value\" record per line.\n",
    "txt = r\"\"\"* DCH : 0.80000000 *\n",
    "* PYR : 100.00000000 *\n",
    "* Bond ( 1, 0) : 0.80000000 *\n",
    "* Angle ( 1, 0, 2) : 100.00000000 *\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Implementation using re\n",
    "\n",
    "With `re`, a repeated capture group only keeps its last match, so the whole parenthesised list is captured as a single string and converted with `ast.literal_eval`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# group 1: label, group 2: optional \"(i, j, ...)\" text with 2 to 4 integers, group 3: value.\n",
    "# Whitespace around the optional tuple is itself optional, so a bare label may sit\n",
    "# one space (or none) away from the colon.\n",
    "# The decimal point is escaped so that it only matches a literal \".\".\n",
    "p = re.compile(r\"\\s+(\\w+)\\s*(\\((?:\\s*\\d+,?){2,4}\\))?\\s*:\\s+(\\d+\\.\\d+)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DCH 0.8\n",
      "PYR 100.0\n",
      "(1, 0) 0.8\n",
      "(1, 0, 2) 100.0\n"
     ]
    }
   ],
   "source": [
    "for line in txt.splitlines():\n",
    "    m = p.search(line)\n",
    "    if m is None:\n",
    "        # Skip lines that do not follow the expected layout instead of\n",
    "        # failing with AttributeError on m.group().\n",
    "        continue\n",
    "    coord = m.group(1)\n",
    "    value = float(m.group(3))\n",
    "    if m.group(2):\n",
    "        # The captured text looks like \"( 1, 0)\"; literal_eval turns it into a tuple of ints.\n",
    "        coord = ast.literal_eval(m.group(2))\n",
    "    print(f\"{coord} {value}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "23.4 µs ± 667 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "# Same loop as above without the print, so only the parsing is timed.\n",
    "for line in txt.splitlines():\n",
    "    m = p.search(line)\n",
    "    if m is None:\n",
    "        continue\n",
    "    coord = m.group(1)\n",
    "    value = float(m.group(3))\n",
    "    if m.group(2):\n",
    "        coord = ast.literal_eval(m.group(2))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Implementation using regex\n",
    "\n",
    "With `regex`, `captures()` returns every match of a repeated group, so each integer can be captured individually and no `ast.literal_eval` step is needed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# group 1: label, group 2: each integer of the optional tuple (repeated capture), group 3: value.\n",
    "# Same whitespace tolerance and escaped decimal point as the re pattern.\n",
    "pre = regex.compile(r'\\s+(\\w+)\\s*(?:\\((?:\\s*(\\d+),?){2,4}\\))?\\s*:\\s+(\\d+\\.\\d+)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DCH 0.8\n",
      "PYR 100.0\n",
      "(1, 0) 0.8\n",
      "(1, 0, 2) 100.0\n"
     ]
    }
   ],
   "source": [
    "for line in txt.splitlines():\n",
    "    m = pre.search(line)\n",
    "    if m is None:\n",
    "        # Skip lines that do not follow the expected layout.\n",
    "        continue\n",
    "    coord = m.group(1)\n",
    "    value = float(m.group(3))\n",
    "    # captures(2) lists every integer matched by the repeated group; it is empty\n",
    "    # when the line has no parenthesised tuple.\n",
    "    indices = m.captures(2)\n",
    "    if indices:\n",
    "        coord = tuple(map(int, indices))\n",
    "\n",
    "    print(f\"{coord} {value}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12.4 µs ± 46.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "# Same loop as above without the print, so only the parsing is timed.\n",
    "for line in txt.splitlines():\n",
    "    m = pre.search(line)\n",
    "    if m is None:\n",
    "        continue\n",
    "    coord = m.group(1)\n",
    "    value = float(m.group(3))\n",
    "    indices = m.captures(2)\n",
    "    if indices:\n",
    "        coord = tuple(map(int, indices))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}