{ "cells": [ { "cell_type": "markdown", "id": "operational-landing", "metadata": {}, "source": [ "#
How to use the spaCy Matcher
" ] }, { "cell_type": "markdown", "id": "vital-understanding", "metadata": {}, "source": [ "
Dr. W.J.B. Mattingly
\n", "\n", "
Smithsonian Data Science Lab and United States Holocaust Memorial Museum
\n", "\n", "
August 2021
" ] }, { "cell_type": "code", "execution_count": 3, "id": "naked-lighting", "metadata": {}, "outputs": [], "source": [ "import spacy" ] }, { "cell_type": "code", "execution_count": null, "id": "legal-mechanism", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 4, "id": "described-clarity", "metadata": {}, "outputs": [], "source": [ "from spacy.matcher import Matcher" ] }, { "cell_type": "markdown", "id": "inappropriate-delicious", "metadata": {}, "source": [ "## Basic Example" ] }, { "cell_type": "code", "execution_count": 5, "id": "technical-anderson", "metadata": {}, "outputs": [], "source": [ "nlp = spacy.load(\"en_core_web_sm\")\n", "matcher = Matcher(nlp.vocab)\n", "pattern = [{\"LIKE_EMAIL\": True}]\n", "matcher.add(\"EMAIL_ADDRESS\", [pattern])\n", "doc = nlp(\"This is an email address: wmattingly@aol.com\")\n", "matches = matcher(doc)" ] }, { "cell_type": "code", "execution_count": 6, "id": "synthetic-craps", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(16571425990740197027, 6, 7)]\n" ] } ], "source": [ "print (matches)" ] }, { "cell_type": "markdown", "id": "provincial-belarus", "metadata": {}, "source": [ "Lexeme, start token, end token" ] }, { "cell_type": "code", "execution_count": 7, "id": "mounted-morning", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "EMAIL_ADDRESS\n" ] } ], "source": [ "print (nlp.vocab[matches[0][0]].text)" ] }, { "cell_type": "markdown", "id": "gentle-nitrogen", "metadata": {}, "source": [ "## Attributes Taken by Matcher" ] }, { "cell_type": "markdown", "id": "egyptian-democracy", "metadata": {}, "source": [ "* ORTH - The exact verbatim of a token (str)\n", "* TEXT - The exact verbatim of a token (str)\n", "* LOWER - The lowercase form of the token text (str)\n", "* LENGTH - The length of the token text (int)\n", "* IS_ALPHA\n", "* IS_ASCII\n", "* IS_DIGIT\n", "* IS_LOWER\n", "* IS_UPPER\n", "* IS_TITLE\n", "* IS_PUNCT\n", "* IS_SPACE\n", "* IS_STOP\n", "* IS_SENT_START\n", "* LIKE_NUM\n", "* LIKE_URL\n", "* LIKE_EMAIL\n", "* SPACY\n", "* POS\n", "* TAG\n", "* MORPH\n", "* DEP\n", "* LEMMA\n", "* SHAPE\n", "* ENT_TYPE\n", "* _ - Custom extension attributes (Dict\\[str, Any\\])\n", "* OP" ] }, { "cell_type": "markdown", "id": "synthetic-apache", "metadata": {}, "source": [ "## Applied Matcher" ] }, { "cell_type": "code", "execution_count": 9, "id": "fifth-magnitude", "metadata": {}, "outputs": [], "source": [ "with open (\"data/wiki_mlk.txt\", \"r\") as f:\n", " text = f.read()" ] }, { "cell_type": "markdown", "id": "initial-automation", "metadata": {}, "source": [ "print (text)" ] }, { "cell_type": "code", "execution_count": 11, "id": "minute-sympathy", "metadata": {}, "outputs": [], "source": [ "nlp = spacy.load(\"en_core_web_sm\")" ] }, { "cell_type": "markdown", "id": "medieval-vintage", "metadata": {}, "source": [ "## Grabbing all Proper Nouns" ] }, { "cell_type": "code", "execution_count": 33, "id": "cosmetic-combination", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "102\n", "(3232560085755078826, 0, 1) Martin\n", "(3232560085755078826, 1, 2) Luther\n", "(3232560085755078826, 2, 3) King\n", "(3232560085755078826, 3, 4) Jr.\n", "(3232560085755078826, 6, 7) Michael\n", "(3232560085755078826, 7, 8) King\n", "(3232560085755078826, 8, 9) Jr.\n", "(3232560085755078826, 10, 11) January\n", "(3232560085755078826, 14, 15) â€\n", "(3232560085755078826, 16, 17) April\n" ] } ], "source": [ "matcher = Matcher(nlp.vocab)\n", "pattern = [{\"POS\": \"PROPN\"}]\n", "matcher.add(\"PROPER_NOUNS\", [pattern])\n", "doc = nlp(text)\n", "matches = matcher(doc)\n", "print (len(matches))\n", "for match in matches[:10]:\n", " print (match, doc[match[1]:match[2]])" ] }, { "cell_type": "markdown", "id": "transparent-hughes", "metadata": {}, "source": [ "### Improving it with Multi-Word Tokens" ] }, { "cell_type": "code", "execution_count": 34, "id": "medical-method", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "175\n", "(3232560085755078826, 0, 1) Martin\n", "(3232560085755078826, 0, 2) Martin Luther\n", "(3232560085755078826, 1, 2) Luther\n", "(3232560085755078826, 0, 3) Martin Luther King\n", "(3232560085755078826, 1, 3) Luther King\n", "(3232560085755078826, 2, 3) King\n", "(3232560085755078826, 0, 4) Martin Luther King Jr.\n", "(3232560085755078826, 1, 4) Luther King Jr.\n", "(3232560085755078826, 2, 4) King Jr.\n", "(3232560085755078826, 3, 4) Jr.\n" ] } ], "source": [ "matcher = Matcher(nlp.vocab)\n", "pattern = [{\"POS\": \"PROPN\", \"OP\": \"+\"}]\n", "matcher.add(\"PROPER_NOUNS\", [pattern])\n", "doc = nlp(text)\n", "matches = matcher(doc)\n", "print (len(matches))\n", "for match in matches[:10]:\n", " print (match, doc[match[1]:match[2]])" ] }, { "cell_type": "markdown", "id": "mathematical-express", "metadata": {}, "source": [ "### Greedy Keyword Argument" ] }, { "cell_type": "code", "execution_count": 35, "id": "located-employer", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "61\n", "(3232560085755078826, 84, 89) Martin Luther King Sr.\n", "(3232560085755078826, 470, 475) Martin Luther King Jr. Day\n", "(3232560085755078826, 537, 542) Martin Luther King Jr. Memorial\n", "(3232560085755078826, 0, 4) Martin Luther King Jr.\n", "(3232560085755078826, 129, 133) Southern Christian Leadership Conference\n", "(3232560085755078826, 248, 252) Director J. Edgar Hoover\n", "(3232560085755078826, 6, 9) Michael King Jr.\n", "(3232560085755078826, 326, 329) Nobel Peace Prize\n", "(3232560085755078826, 423, 426) James Earl Ray\n", "(3232560085755078826, 464, 467) Congressional Gold Medal\n" ] } ], "source": [ "matcher = Matcher(nlp.vocab)\n", "pattern = [{\"POS\": \"PROPN\", \"OP\": \"+\"}]\n", "matcher.add(\"PROPER_NOUNS\", [pattern], greedy='LONGEST')\n", "doc = nlp(text)\n", "matches = matcher(doc)\n", "print (len(matches))\n", "for match in matches[:10]:\n", " print (match, doc[match[1]:match[2]])" ] }, { "cell_type": "markdown", "id": "overall-fitting", "metadata": {}, "source": [ "### Sorting it to Apperance" ] }, { "cell_type": "code", "execution_count": 36, "id": "blessed-begin", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "61\n", "(3232560085755078826, 0, 4) Martin Luther King Jr.\n", "(3232560085755078826, 6, 9) Michael King Jr.\n", "(3232560085755078826, 10, 11) January\n", "(3232560085755078826, 14, 15) â€\n", "(3232560085755078826, 16, 17) April\n", "(3232560085755078826, 24, 25) Baptist\n", "(3232560085755078826, 50, 51) King\n", "(3232560085755078826, 70, 72) Mahatma Gandhi\n", "(3232560085755078826, 84, 89) Martin Luther King Sr.\n", "(3232560085755078826, 90, 91) King\n" ] } ], "source": [ "matcher = Matcher(nlp.vocab)\n", "pattern = [{\"POS\": \"PROPN\", \"OP\": \"+\"}]\n", "matcher.add(\"PROPER_NOUNS\", [pattern], greedy='LONGEST')\n", "doc = nlp(text)\n", "matches = matcher(doc)\n", "matches.sort(key = lambda x: x[1])\n", "print (len(matches))\n", "for match in matches[:10]:\n", " print (match, doc[match[1]:match[2]])" ] }, { "cell_type": "markdown", "id": "aware-stupid", "metadata": {}, "source": [ "### Adding in Sequences" ] }, { "cell_type": "code", "execution_count": 37, "id": "individual-timber", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "7\n", "(3232560085755078826, 50, 52) King advanced\n", "(3232560085755078826, 90, 92) King participated\n", "(3232560085755078826, 114, 116) King led\n", "(3232560085755078826, 168, 170) King helped\n", "(3232560085755078826, 248, 253) Director J. Edgar Hoover considered\n", "(3232560085755078826, 323, 325) King won\n", "(3232560085755078826, 486, 489) United States beginning\n" ] } ], "source": [ "matcher = Matcher(nlp.vocab)\n", "pattern = [{\"POS\": \"PROPN\", \"OP\": \"+\"}, {\"POS\": \"VERB\"}]\n", "matcher.add(\"PROPER_NOUNS\", [pattern], greedy='LONGEST')\n", "doc = nlp(text)\n", "matches = matcher(doc)\n", "matches.sort(key = lambda x: x[1])\n", "print (len(matches))\n", "for match in matches[:10]:\n", " print (match, doc[match[1]:match[2]])" ] }, { "cell_type": "markdown", "id": "innovative-stylus", "metadata": {}, "source": [ "## Finding Quotes and Speakers" ] }, { "cell_type": "code", "execution_count": 38, "id": "similar-budget", "metadata": {}, "outputs": [], "source": [ "import json\n", "with open (\"data/alice.json\", \"r\") as f:\n", " data = json.load(f)" ] }, { "cell_type": "code", "execution_count": 102, "id": "available-nutrition", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,' thought Alice `without pictures or conversation?'\n" ] } ], "source": [ "text = data[0][2][0]\n", "print (text)" ] }, { "cell_type": "code", "execution_count": 103, "id": "elder-ghost", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'\n" ] } ], "source": [ "text = data[0][2][0].replace( \"`\", \"'\")\n", "print (text)" ] }, { "cell_type": "code", "execution_count": 77, "id": "young-integration", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2\n", "(3232560085755078826, 47, 58) 'and what is the use of a book,'\n", "(3232560085755078826, 60, 67) 'without pictures or conversation?'\n" ] } ], "source": [ "matcher = Matcher(nlp.vocab)\n", "pattern = [{'ORTH': \"'\"}, {'IS_ALPHA': True, \"OP\": \"+\"}, {'IS_PUNCT': True, \"OP\": \"*\"}, {'ORTH': \"'\"}]\n", "matcher.add(\"PROPER_NOUNS\", [pattern], greedy='LONGEST')\n", "doc = nlp(text)\n", "matches = matcher(doc)\n", "matches.sort(key = lambda x: x[1])\n", "print (len(matches))\n", "for match in matches[:10]:\n", " print (match, doc[match[1]:match[2]])" ] }, { "cell_type": "code", "execution_count": null, "id": "fifty-emperor", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "transsexual-pastor", "metadata": {}, "source": [ "### Find Speaker" ] }, { "cell_type": "code", "execution_count": 97, "id": "engaging-heath", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n", "(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'\n" ] } ], "source": [ "speak_lemmas = [\"think\", \"say\"]\n", "text = data[0][2][0].replace( \"`\", \"'\")\n", "matcher = Matcher(nlp.vocab)\n", "pattern1 = [{'ORTH': \"'\"}, {'IS_ALPHA': True, \"OP\": \"+\"}, {'IS_PUNCT': True, \"OP\": \"*\"}, {'ORTH': \"'\"}, {\"POS\": \"VERB\", \"LEMMA\": {\"IN\": speak_lemmas}}, {\"POS\": \"PROPN\", \"OP\": \"+\"}, {'ORTH': \"'\"}, {'IS_ALPHA': True, \"OP\": \"+\"}, {'IS_PUNCT': True, \"OP\": \"*\"}, {'ORTH': \"'\"}]\n", "matcher.add(\"PROPER_NOUNS\", [pattern1], greedy='LONGEST')\n", "doc = nlp(text)\n", "matches = matcher(doc)\n", "matches.sort(key = lambda x: x[1])\n", "print (len(matches))\n", "for match in matches[:10]:\n", " print (match, doc[match[1]:match[2]])" ] }, { "cell_type": "markdown", "id": "competent-immigration", "metadata": {}, "source": [ "### Problem with this Approach" ] }, { "cell_type": "code", "execution_count": 98, "id": "answering-payday", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n", "(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'\n", "0\n", "0\n", "0\n", "0\n", "0\n", "0\n", "0\n", "0\n", "0\n", "0\n", "0\n", "0\n", "0\n", "0\n", "0\n", "0\n" ] } ], "source": [ "for text in data[0][2]:\n", " text = text.replace(\"`\", \"'\")\n", " doc = nlp(text)\n", " matches = matcher(doc)\n", " matches.sort(key = lambda x: x[1])\n", " print (len(matches))\n", " for match in matches[:10]:\n", " print (match, doc[match[1]:match[2]])" ] }, { "cell_type": "markdown", "id": "potential-dynamics", "metadata": {}, "source": [ "### Adding More Patterns" ] }, { "cell_type": "code", "execution_count": 101, "id": "planned-locking", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n", "(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'\n", "0\n", "0\n", "0\n", "0\n", "0\n", "1\n", "(3232560085755078826, 0, 6) 'Well!' thought Alice\n", "0\n", "0\n", "0\n", "0\n", "0\n", "0\n", "0\n", "1\n", "(3232560085755078826, 57, 68) 'which certainly was not here before,' said Alice\n", "0\n", "0\n" ] } ], "source": [ "speak_lemmas = [\"think\", \"say\"]\n", "text = data[0][2][0].replace( \"`\", \"'\")\n", "matcher = Matcher(nlp.vocab)\n", "pattern1 = [{'ORTH': \"'\"}, {'IS_ALPHA': True, \"OP\": \"+\"}, {'IS_PUNCT': True, \"OP\": \"*\"}, {'ORTH': \"'\"}, {\"POS\": \"VERB\", \"LEMMA\": {\"IN\": speak_lemmas}}, {\"POS\": \"PROPN\", \"OP\": \"+\"}, {'ORTH': \"'\"}, {'IS_ALPHA': True, \"OP\": \"+\"}, {'IS_PUNCT': True, \"OP\": \"*\"}, {'ORTH': \"'\"}]\n", "pattern2 = [{'ORTH': \"'\"}, {'IS_ALPHA': True, \"OP\": \"+\"}, {'IS_PUNCT': True, \"OP\": \"*\"}, {'ORTH': \"'\"}, {\"POS\": \"VERB\", \"LEMMA\": {\"IN\": speak_lemmas}}, {\"POS\": \"PROPN\", \"OP\": \"+\"}]\n", "pattern3 = [{\"POS\": \"PROPN\", \"OP\": \"+\"},{\"POS\": \"VERB\", \"LEMMA\": {\"IN\": speak_lemmas}}, {'ORTH': \"'\"}, {'IS_ALPHA': True, \"OP\": \"+\"}, {'IS_PUNCT': True, \"OP\": \"*\"}, {'ORTH': \"'\"}]\n", "matcher.add(\"PROPER_NOUNS\", [pattern1, pattern2, pattern3], greedy='LONGEST')\n", "for text in data[0][2]:\n", " text = text.replace(\"`\", \"'\")\n", " doc = nlp(text)\n", " matches = matcher(doc)\n", " matches.sort(key = lambda x: x[1])\n", " print (len(matches))\n", " for match in matches[:10]:\n", " print (match, doc[match[1]:match[2]])" ] }, { "cell_type": "code", "execution_count": null, "id": "fantastic-auditor", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 5 }