{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ruler-overview",
   "metadata": {},
   "source": [
    "# Rule-based entities with spaCy's `EntityRuler`\n",
    "\n",
    "This notebook shows how an `EntityRuler` interacts with the statistical `ner` component of `en_core_web_sm`:\n",
    "\n",
    "1. The model alone labels both names in the sample sentence as `PERSON`.\n",
    "2. A ruler added **after** `ner` does not change those labels.\n",
    "3. A ruler added **before** `ner` does: its matches are kept and `ner` fills in the rest.\n",
    "4. The ruler can introduce custom labels (`FILM`) and token-level patterns (`PHONE_NUMBER`).\n",
    "\n",
    "Every pipeline is built in a single cell under its own name, so each section can be re-run on its own."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "divided-bloom",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Enabling eager execution\n",
      "INFO:tensorflow:Enabling v2 tensorshape\n",
      "INFO:tensorflow:Enabling resource variables\n",
      "INFO:tensorflow:Enabling tensor equality\n",
      "INFO:tensorflow:Enabling control flow v2\n"
     ]
    }
   ],
   "source": [
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "demo-config",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model and sample inputs shared by every section below.\n",
    "MODEL_NAME = \"en_core_web_sm\"\n",
    "PLACE_TEXT = \"West Chestertenfieldville was referenced in Mr. Deeds.\"\n",
    "PHONE_TEXT = \"This is a sample number (555) 555-5555.\"\n",
    "\n",
    "# EntityRuler phrase patterns: an exact string and the label it should receive.\n",
    "GPE_PATTERN = {\"label\": \"GPE\", \"pattern\": \"West Chestertenfieldville\"}\n",
    "FILM_PATTERN = {\"label\": \"FILM\", \"pattern\": \"Mr. Deeds\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "demo-helpers",
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_entities(doc):\n",
    "    \"\"\"Print every entity in `doc` as `<text> <label>`, one per line.\"\"\"\n",
    "    for ent in doc.ents:\n",
    "        print(ent.text, ent.label_)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "baseline-heading",
   "metadata": {},
   "source": [
    "## 1. Baseline: the statistical model alone\n",
    "\n",
    "Without any rules, the model tags the place name as `PERSON` and only picks up `Deeds` from the film title."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "saved-jungle",
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp_base = spacy.load(MODEL_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "functional-championship",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "West Chestertenfieldville PERSON\n",
      "Deeds PERSON\n"
     ]
    }
   ],
   "source": [
    "show_entities(nlp_base(PLACE_TEXT))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ruler-last-heading",
   "metadata": {},
   "source": [
    "## 2. Ruler added after `ner`\n",
    "\n",
    "`add_pipe` appends to the end of the pipeline by default, so the ruler runs after `ner` has already assigned entities. The output is unchanged from the baseline, and `analyze_pipes()` confirms that `entity_ruler` comes last."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "collect-archives",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load a fresh pipeline in the same cell that adds the ruler so the cell can be\n",
    "# re-run: add_pipe raises if a component named \"entity_ruler\" already exists.\n",
    "nlp_ruler_last = spacy.load(MODEL_NAME)\n",
    "ruler_last = nlp_ruler_last.add_pipe(\"entity_ruler\")  # appended after \"ner\"\n",
    "ruler_last.add_patterns([GPE_PATTERN])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "related-nepal",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "West Chestertenfieldville PERSON\n",
      "Deeds PERSON\n"
     ]
    }
   ],
   "source": [
    "show_entities(nlp_ruler_last(PLACE_TEXT))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "military-wrapping",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'summary': {'tok2vec': {'assigns': ['doc.tensor'],\n",
       "   'requires': [],\n",
       "   'scores': [],\n",
       "   'retokenizes': False},\n",
       "  'tagger': {'assigns': ['token.tag'],\n",
       "   'requires': [],\n",
       "   'scores': ['tag_acc'],\n",
       "   'retokenizes': False},\n",
       "  'parser': {'assigns': ['token.dep',\n",
       "    'token.head',\n",
       "    'token.is_sent_start',\n",
       "    'doc.sents'],\n",
       "   'requires': [],\n",
       "   'scores': ['dep_uas',\n",
       "    'dep_las',\n",
       "    'dep_las_per_type',\n",
       "    'sents_p',\n",
       "    'sents_r',\n",
       "    'sents_f'],\n",
       "   'retokenizes': False},\n",
       "  'attribute_ruler': {'assigns': [],\n",
       "   'requires': [],\n",
       "   'scores': [],\n",
       "   'retokenizes': False},\n",
       "  'lemmatizer': {'assigns': ['token.lemma'],\n",
       "   'requires': [],\n",
       "   'scores': ['lemma_acc'],\n",
       "   'retokenizes': False},\n",
       "  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],\n",
       "   'requires': [],\n",
       "   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],\n",
       "   'retokenizes': False},\n",
       "  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],\n",
       "   'requires': [],\n",
       "   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],\n",
       "   'retokenizes': False}},\n",
       " 'problems': {'tok2vec': [],\n",
       "  'tagger': [],\n",
       "  'parser': [],\n",
       "  'attribute_ruler': [],\n",
       "  'lemmatizer': [],\n",
       "  'ner': [],\n",
       "  'entity_ruler': []},\n",
       " 'attrs': {'doc.tensor': {'assigns': ['tok2vec'], 'requires': []},\n",
       "  'token.tag': {'assigns': ['tagger'], 'requires': []},\n",
       "  'token.head': {'assigns': ['parser'], 'requires': []},\n",
       "  'token.dep': {'assigns': ['parser'], 'requires': []},\n",
       "  'doc.sents': {'assigns': ['parser'], 'requires': []},\n",
       "  'token.ent_iob': {'assigns': ['ner', 'entity_ruler'], 'requires': []},\n",
       "  'token.is_sent_start': {'assigns': ['parser'], 'requires': []},\n",
       "  'token.lemma': {'assigns': ['lemmatizer'], 'requires': []},\n",
       "  'doc.ents': {'assigns': ['ner', 'entity_ruler'], 'requires': []},\n",
       "  'token.ent_type': {'assigns': ['ner', 'entity_ruler'], 'requires': []}}}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nlp_ruler_last.analyze_pipes()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ruler-first-heading",
   "metadata": {},
   "source": [
    "## 3. Ruler added before `ner`\n",
    "\n",
    "Passing `before=\"ner\"` places the ruler ahead of the statistical component. `analyze_pipes()` now lists `entity_ruler` before `ner`, and the place name comes out as `GPE`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "competitive-water",
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp_ruler_first = spacy.load(MODEL_NAME)\n",
    "ruler_first = nlp_ruler_first.add_pipe(\"entity_ruler\", before=\"ner\")\n",
    "ruler_first.add_patterns([GPE_PATTERN])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "precious-sunset",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'summary': {'tok2vec': {'assigns': ['doc.tensor'],\n",
       "   'requires': [],\n",
       "   'scores': [],\n",
       "   'retokenizes': False},\n",
       "  'tagger': {'assigns': ['token.tag'],\n",
       "   'requires': [],\n",
       "   'scores': ['tag_acc'],\n",
       "   'retokenizes': False},\n",
       "  'parser': {'assigns': ['token.dep',\n",
       "    'token.head',\n",
       "    'token.is_sent_start',\n",
       "    'doc.sents'],\n",
       "   'requires': [],\n",
       "   'scores': ['dep_uas',\n",
       "    'dep_las',\n",
       "    'dep_las_per_type',\n",
       "    'sents_p',\n",
       "    'sents_r',\n",
       "    'sents_f'],\n",
       "   'retokenizes': False},\n",
       "  'attribute_ruler': {'assigns': [],\n",
       "   'requires': [],\n",
       "   'scores': [],\n",
       "   'retokenizes': False},\n",
       "  'lemmatizer': {'assigns': ['token.lemma'],\n",
       "   'requires': [],\n",
       "   'scores': ['lemma_acc'],\n",
       "   'retokenizes': False},\n",
       "  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],\n",
       "   'requires': [],\n",
       "   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],\n",
       "   'retokenizes': False},\n",
       "  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],\n",
       "   'requires': [],\n",
       "   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],\n",
       "   'retokenizes': False}},\n",
       " 'problems': {'tok2vec': [],\n",
       "  'tagger': [],\n",
       "  'parser': [],\n",
       "  'attribute_ruler': [],\n",
       "  'lemmatizer': [],\n",
       "  'entity_ruler': [],\n",
       "  'ner': []},\n",
       " 'attrs': {'doc.tensor': {'assigns': ['tok2vec'], 'requires': []},\n",
       "  'token.tag': {'assigns': ['tagger'], 'requires': []},\n",
       "  'token.head': {'assigns': ['parser'], 'requires': []},\n",
       "  'token.dep': {'assigns': ['parser'], 'requires': []},\n",
       "  'doc.sents': {'assigns': ['parser'], 'requires': []},\n",
       "  'token.ent_iob': {'assigns': ['entity_ruler', 'ner'], 'requires': []},\n",
       "  'token.is_sent_start': {'assigns': ['parser'], 'requires': []},\n",
       "  'token.lemma': {'assigns': ['lemmatizer'], 'requires': []},\n",
       "  'doc.ents': {'assigns': ['entity_ruler', 'ner'], 'requires': []},\n",
       "  'token.ent_type': {'assigns': ['entity_ruler', 'ner'], 'requires': []}}}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nlp_ruler_first.analyze_pipes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "electric-nickname",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "West Chestertenfieldville GPE\n",
      "Deeds PERSON\n"
     ]
    }
   ],
   "source": [
    "show_entities(nlp_ruler_first(PLACE_TEXT))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "custom-label-heading",
   "metadata": {},
   "source": [
    "## 4. Adding a custom label\n",
    "\n",
    "Ruler labels are plain strings, so a second pattern can tag the full title `Mr. Deeds` with a custom `FILM` label instead of leaving `Deeds` as `PERSON`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "amazing-speaking",
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp_film = spacy.load(MODEL_NAME)\n",
    "ruler_film = nlp_film.add_pipe(\"entity_ruler\", before=\"ner\")\n",
    "ruler_film.add_patterns([GPE_PATTERN, FILM_PATTERN])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "dietary-launch",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "West Chestertenfieldville GPE\n",
      "Mr. Deeds FILM\n"
     ]
    }
   ],
   "source": [
    "show_entities(nlp_film(PLACE_TEXT))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "token-pattern-heading",
   "metadata": {},
   "source": [
    "## 5. Token patterns on a blank pipeline\n",
    "\n",
    "A pattern can also be a list of per-token attribute dicts. This section uses `spacy.blank(\"en\")`, so the ruler is the only component that assigns entities."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "approved-intro",
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp_phone = spacy.blank(\"en\")\n",
    "ruler_phone = nlp_phone.add_pipe(\"entity_ruler\")\n",
    "ruler_phone.add_patterns([\n",
    "    {\n",
    "        \"label\": \"PHONE_NUMBER\",\n",
    "        \"pattern\": [\n",
    "            {\"ORTH\": \"(\"},            # exact token text\n",
    "            {\"SHAPE\": \"ddd\"},         # three-digit token\n",
    "            {\"ORTH\": \")\"},\n",
    "            {\"SHAPE\": \"ddd\"},\n",
    "            {\"ORTH\": \"-\", \"OP\": \"?\"},  # the hyphen is optional\n",
    "            {\"SHAPE\": \"dddd\"},        # four-digit token\n",
    "        ],\n",
    "    }\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "turkish-pottery",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(555) 555-5555 PHONE_NUMBER\n"
     ]
    }
   ],
   "source": [
    "show_entities(nlp_phone(PHONE_TEXT))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}