{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "phone-number-overview",
   "metadata": {},
   "source": [
    "# Matching phone numbers with spaCy's EntityRuler\n",
    "\n",
    "Two rule-based patterns that tag `PHONE_NUMBER` entities on a blank English pipeline:\n",
    "\n",
    "1. a formatted number such as `(555) 555-5555`, which the tokenizer splits into several tokens;\n",
    "2. an unformatted seven-digit number such as `5555555`, which stays a single token.\n",
    "\n",
    "A `REGEX` inside an EntityRuler pattern is tested against one token at a time, so a single regex cannot span `555-5555`. The formatted case therefore uses one pattern entry per token."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "shared-imports",
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ruler-helper",
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_entities(text, patterns, lang=\"en\"):\n",
    "    \"\"\"Run an EntityRuler built from ``patterns`` over ``text``.\n",
    "\n",
    "    A fresh blank pipeline is created on every call, so the patterns of one\n",
    "    example never leak into the next.\n",
    "\n",
    "    Args:\n",
    "        text: Raw string to analyse.\n",
    "        patterns: EntityRuler pattern dicts, each with a ``\"label\"`` and a\n",
    "            ``\"pattern\"`` key.\n",
    "        lang: Language code handed to ``spacy.blank``.\n",
    "\n",
    "    Returns:\n",
    "        The processed ``Doc``; matched spans are available as ``doc.ents``.\n",
    "    \"\"\"\n",
    "    nlp = spacy.blank(lang)\n",
    "    ruler = nlp.add_pipe(\"entity_ruler\")\n",
    "    ruler.add_patterns(patterns)\n",
    "    return nlp(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "formatted-number-notes",
   "metadata": {},
   "source": [
    "## Formatted number: one pattern entry per token\n",
    "\n",
    "The parentheses and the hyphen become tokens of their own, so the pattern lists them explicitly. The hyphen is optional (`\"OP\": \"?\"`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "smooth-consumption",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample text\n",
    "formatted_text = \"This is a sample number (555) 555-5555.\"\n",
    "\n",
    "# One entry per token: \"(\", area code, \")\", prefix, optional \"-\", line number.\n",
    "# Adapted from https://spacy.io/usage/rule-based-matching\n",
    "formatted_patterns = [\n",
    "    {\n",
    "        \"label\": \"PHONE_NUMBER\",\n",
    "        \"pattern\": [\n",
    "            {\"ORTH\": \"(\"},\n",
    "            {\"SHAPE\": \"ddd\"},\n",
    "            {\"ORTH\": \")\"},\n",
    "            {\"SHAPE\": \"ddd\"},\n",
    "            {\"ORTH\": \"-\", \"OP\": \"?\"},\n",
    "            # SHAPE stops growing after four identical characters, so pin the length too.\n",
    "            {\"SHAPE\": \"dddd\", \"LENGTH\": 4},\n",
    "        ],\n",
    "    }\n",
    "]\n",
    "\n",
    "formatted_doc = find_entities(formatted_text, formatted_patterns)\n",
    "\n",
    "# Extract entities\n",
    "for ent in formatted_doc.ents:\n",
    "    print(ent.text, ent.label_)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "plain-number-notes",
   "metadata": {},
   "source": [
    "## Unformatted number: a single-token regex\n",
    "\n",
    "Here the whole number is one token, so `REGEX` is enough. The expression is anchored at both ends; without the anchors it would also accept any token that merely contains a run of digits."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "statewide-insider",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample text\n",
    "plain_text = \"This is a sample number 5555555.\"\n",
    "\n",
    "# Raw string keeps the backslash intact; ^...$ requires exactly seven digits.\n",
    "plain_patterns = [\n",
    "    {\n",
    "        \"label\": \"PHONE_NUMBER\",\n",
    "        \"pattern\": [{\"TEXT\": {\"REGEX\": r\"^\\d{7}$\"}}],\n",
    "    }\n",
    "]\n",
    "\n",
    "plain_doc = find_entities(plain_text, plain_patterns)\n",
    "\n",
    "# Extract entities\n",
    "for ent in plain_doc.ents:\n",
    "    print(ent.text, ent.label_)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}