{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Number sanitising and mark detection\n",
    "\n",
    "Regular-expression experiments on short text samples:\n",
    "\n",
    "- `NUMBER_SANITY_RE` with `numberRepl` replaces look-alike letters inside number-like tokens by digits.\n",
    "- `MARK_PLAIN_RE` finds short digit groups (called *marks* in the code)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sanitise number-like tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Letters that are read as digits inside number-like tokens, mapped to the\n",
    "# digit each one stands for (presumably OCR look-alikes of 1 -- TODO confirm).\n",
    "EXTRA_DIGITS = {\n",
    "    \"i\": 1,\n",
    "    \"l\": 1,\n",
    "}\n",
    "\n",
    "EXTRA_DIGIT_STR = \"\".join(EXTRA_DIGITS)\n",
    "\n",
    "# Escaped form for use inside a regex character class, so that adding a key\n",
    "# such as \"]\", \"^\" or \"-\" cannot change the meaning of the class.\n",
    "EXTRA_DIGIT_CLASS = re.escape(EXTRA_DIGIT_STR)\n",
    "\n",
    "# A token delimited by word boundaries that starts and ends with a digit or a\n",
    "# look-alike letter; in between it may also contain the separators . , / -\n",
    "# The first and last character are matched separately, so a token is at least\n",
    "# two characters long and a lone letter such as \"i\" is never touched.\n",
    "NUMBER_SANITY_RE = re.compile(\n",
    "    fr\"\"\"\n",
    "    \\b\n",
    "    (?:\n",
    "        [0-9{EXTRA_DIGIT_CLASS}]\n",
    "        [0-9{EXTRA_DIGIT_CLASS}.,/-]*\n",
    "        [0-9{EXTRA_DIGIT_CLASS}]\n",
    "    )\n",
    "    \\b\n",
    "    \"\"\",\n",
    "    re.S | re.X,\n",
    ")\n",
    "\n",
    "\n",
    "def numberRepl(match):\n",
    "    \"\"\"Return the matched token with look-alike letters replaced by digits.\n",
    "\n",
    "    Meant to be passed as the replacement callback to ``NUMBER_SANITY_RE.sub``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    match : re.Match\n",
    "        The match whose full text (group 0) is rewritten.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The matched text in which every key of ``EXTRA_DIGITS`` has been\n",
    "        replaced by the string form of its value.\n",
    "    \"\"\"\n",
    "    number = match.group(0)\n",
    "    for extraDigit, value in EXTRA_DIGITS.items():\n",
    "        number = number.replace(extraDigit, str(value))\n",
    "    return number"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"\"\"\n",
    "de poederzuijker voor l32/243 Spaans de frazel van 27 lb, reekent 35 V2 percento\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "de poederzuijker voor 132/243 Spaans de frazel van 27 lb, reekent 35 V2 percento\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Bind the result to a new name so the input sample stays available and the\n",
    "# cell can be re-run without feeding on its own output.\n",
    "text_fixed = NUMBER_SANITY_RE.sub(numberRepl, text)\n",
    "print(text_fixed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Detect marks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One or more groups of one or two digits, separated by whitespace.\n",
    "# This is a plain (non-f) raw string, so the braces in {1,2} are literal.\n",
    "MARK_NUM = r\"\"\"\n",
    "    (?:\n",
    "        [0-9]{1,2}\n",
    "        (?:\n",
    "            \\s+\n",
    "            [0-9]{1,2}\n",
    "        )*\n",
    "    )\n",
    "\"\"\"\n",
    "\n",
    "# Branches, tried in this order:\n",
    "#   1. MARK_NUM anywhere\n",
    "#   2. one or two digits enclosed in the brackets ⌊ ⌋\n",
    "#   3. one or two digits directly after a lowercase letter (a-z or é)\n",
    "#   4. one or two digits after such a letter followed by ; or .\n",
    "# NOTE(review): branch 1 has no anchors or look-arounds, so it already matches\n",
    "# at every position where branches 3 and 4 could; as written those two are\n",
    "# never the branch that produces the match. Confirm whether branch 1 was\n",
    "# meant to be constrained.\n",
    "MARK_PLAIN_RE = re.compile(\n",
    "    fr\"\"\"\n",
    "    (\n",
    "        (?:\n",
    "            {MARK_NUM}\n",
    "        )\n",
    "        |\n",
    "        (?:\n",
    "            ⌊\n",
    "            [0-9]{{1,2}}\n",
    "            ⌋\n",
    "        )\n",
    "        |\n",
    "        (?:\n",
    "            (?<=[a-zé])\n",
    "            [0-9]{{1,2}}\n",
    "            \\b\n",
    "        )\n",
    "        |\n",
    "        (?:\n",
    "            (?<=[a-zé][;.])\n",
    "            [0-9]{{1,2}}\n",
    "            \\b\n",
    "        )\n",
    "    )\n",
    "    \"\"\",\n",
    "    re.S | re.X,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"\"\"van het eyland Zakynthos28,\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "match = MARK_PLAIN_RE.search(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<re.Match object; span=(24, 26), match='28'>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "match"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'28'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "match.group(0)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}