{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "matcher-intro",
   "metadata": {},
   "source": [
    "# Rule-based matching with spaCy's `Matcher`\n",
    "\n",
    "This notebook builds token-level `Matcher` patterns of increasing complexity: e-mail addresses, runs of proper nouns, proper nouns followed by a verb, and quoted speech attributed to a speaker.\n",
    "\n",
    "- Requires the `en_core_web_sm` model and the files `data/wiki_mlk.txt` and `data/alice.json` (paths are relative to the working directory).\n",
    "- Outputs are cleared because the code changed; use *Restart Kernel and Run All* to regenerate them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "authorized-chapel",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "import spacy\n",
    "from spacy.matcher import Matcher"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "current-expert",
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIR = Path(\"data\")\n",
    "\n",
    "# Lemmas of the verbs that attribute a quotation to a speaker.\n",
    "SPEAK_LEMMAS = [\"think\", \"say\"]\n",
    "\n",
    "# Load the pipeline once; every cell below reuses it.\n",
    "nlp = spacy.load(\"en_core_web_sm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "helper-show-matches",
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_matches(matcher, doc, limit=10, sort_by_start=False):\n",
    "    \"\"\"Run `matcher` over `doc` and print a short report.\n",
    "\n",
    "    Prints the number of matches, then for the first `limit` matches the raw\n",
    "    `(match_id, start, end)` tuple followed by the matched span.\n",
    "\n",
    "    Args:\n",
    "        matcher: a spaCy `Matcher` with at least one rule added.\n",
    "        doc: the `Doc` to search.\n",
    "        limit: maximum number of matches to print.\n",
    "        sort_by_start: if True, order matches by start token index first.\n",
    "    \"\"\"\n",
    "    matches = matcher(doc)\n",
    "    if sort_by_start:\n",
    "        matches = sorted(matches, key=lambda match: match[1])\n",
    "    print(len(matches))\n",
    "    for match in matches[:limit]:\n",
    "        print(match, doc[match[1]:match[2]])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-email",
   "metadata": {},
   "source": [
    "## Matching an e-mail address\n",
    "\n",
    "A pattern is a list of token descriptions; `LIKE_EMAIL` matches any token that looks like an e-mail address."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "artistic-silicon",
   "metadata": {},
   "outputs": [],
   "source": [
    "email_pattern = [{\"LIKE_EMAIL\": True}]\n",
    "email_matcher = Matcher(nlp.vocab)\n",
    "email_matcher.add(\"EMAIL_ADDRESS\", [email_pattern])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "contrary-veteran",
   "metadata": {},
   "outputs": [],
   "source": [
    "email_doc = nlp(\"This is an email address: wmattingly@aol.com\")\n",
    "email_matches = email_matcher(email_doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "immediate-england",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each match is a (match_id, start_token, end_token) tuple.\n",
    "print(email_matches)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "white-clinton",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The match id is the hash of the rule name; the string store maps it back.\n",
    "email_match_id = email_matches[0][0]\n",
    "print(nlp.vocab.strings[email_match_id])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-proper-nouns",
   "metadata": {},
   "source": [
    "## Proper nouns in a Wikipedia extract"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "imperial-arrival",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decode explicitly as UTF-8. With the platform default (e.g. cp1252 on\n",
    "# Windows) the en dash in the text is read as mojibake, which the tagger\n",
    "# then reports as a bogus proper noun and which shifts later token indices.\n",
    "with open(DATA_DIR / \"wiki_mlk.txt\", \"r\", encoding=\"utf-8\") as f:\n",
    "    mlk_text = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "offshore-union",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(mlk_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "tight-torture",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse once; every proper-noun matcher below runs over this same Doc.\n",
    "mlk_doc = nlp(mlk_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "note-single-propn",
   "metadata": {},
   "source": [
    "A one-token pattern reports every proper noun on its own, so a name such as *Martin Luther King Jr.* comes back as four separate matches."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "executive-lecture",
   "metadata": {},
   "outputs": [],
   "source": [
    "single_propn_matcher = Matcher(nlp.vocab)\n",
    "single_propn_matcher.add(\"PROPER_NOUN\", [[{\"POS\": \"PROPN\"}]])\n",
    "show_matches(single_propn_matcher, mlk_doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "note-propn-run",
   "metadata": {},
   "source": [
    "Adding `\"OP\": \"+\"` (one or more) matches multi-token names, but also every sub-span of each run (*Martin*, *Martin Luther*, *Luther*, ...)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "liable-bargain",
   "metadata": {},
   "outputs": [],
   "source": [
    "propn_run_pattern = [{\"POS\": \"PROPN\", \"OP\": \"+\"}]\n",
    "\n",
    "propn_run_matcher = Matcher(nlp.vocab)\n",
    "propn_run_matcher.add(\"PROPER_NOUN\", [propn_run_pattern])\n",
    "show_matches(propn_run_matcher, mlk_doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "note-greedy",
   "metadata": {},
   "source": [
    "`greedy=\"LONGEST\"` keeps only the longest of any overlapping matches. The results then come back longest-first rather than in document order, so the second cell below sorts them by start token."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "informed-passport",
   "metadata": {},
   "outputs": [],
   "source": [
    "longest_propn_matcher = Matcher(nlp.vocab)\n",
    "longest_propn_matcher.add(\"PROPER_NOUNS\", [propn_run_pattern], greedy=\"LONGEST\")\n",
    "show_matches(longest_propn_matcher, mlk_doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "going-works",
   "metadata": {},
   "outputs": [],
   "source": [
    "show_matches(longest_propn_matcher, mlk_doc, sort_by_start=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "note-propn-verb",
   "metadata": {},
   "source": [
    "Patterns can chain token descriptions: a run of proper nouns immediately followed by a verb."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "buried-lebanon",
   "metadata": {},
   "outputs": [],
   "source": [
    "propn_verb_matcher = Matcher(nlp.vocab)\n",
    "propn_verb_matcher.add(\n",
    "    \"PROPER_NOUNS_VERB\",\n",
    "    [propn_run_pattern + [{\"POS\": \"VERB\"}]],\n",
    "    greedy=\"LONGEST\",\n",
    ")\n",
    "show_matches(propn_verb_matcher, mlk_doc, sort_by_start=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-alice",
   "metadata": {},
   "source": [
    "## Quoted speech in *Alice in Wonderland*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "horizontal-knife",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(DATA_DIR / \"alice.json\", \"r\", encoding=\"utf-8\") as f:\n",
    "    alice_data = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "impressive-willow",
   "metadata": {},
   "outputs": [],
   "source": [
    "# alice_data[0][2] is a sequence of text passages\n",
    "# (presumably the paragraphs of the first chapter -- confirm against the data file).\n",
    "alice_paragraphs_raw = alice_data[0][2]\n",
    "print(alice_paragraphs_raw[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "respiratory-nigeria",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The source opens quotations with a backtick and closes them with an\n",
    "# apostrophe; normalise so a single ORTH test covers both ends of a quote.\n",
    "alice_paragraphs = [paragraph.replace(\"`\", \"'\") for paragraph in alice_paragraphs_raw]\n",
    "print(alice_paragraphs[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "alice-docs",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse every passage once; the matchers below all reuse these Docs.\n",
    "alice_docs = list(nlp.pipe(alice_paragraphs))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "note-quote",
   "metadata": {},
   "source": [
    "A quotation: an apostrophe, one or more alphabetic tokens, optional punctuation, and a closing apostrophe."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "analyzed-longer",
   "metadata": {},
   "outputs": [],
   "source": [
    "quote_pattern = [\n",
    "    {\"ORTH\": \"'\"},\n",
    "    {\"IS_ALPHA\": True, \"OP\": \"+\"},\n",
    "    {\"IS_PUNCT\": True, \"OP\": \"*\"},\n",
    "    {\"ORTH\": \"'\"},\n",
    "]\n",
    "\n",
    "quote_matcher = Matcher(nlp.vocab)\n",
    "quote_matcher.add(\"QUOTE\", [quote_pattern], greedy=\"LONGEST\")\n",
    "show_matches(quote_matcher, alice_docs[0], sort_by_start=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "note-interrupted-quote",
   "metadata": {},
   "source": [
    "A quotation interrupted by its attribution: *'quote' thought Alice 'quote'*. The pattern is assembled from the quote fragment above plus a speech verb and a speaker."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "forward-spirituality",
   "metadata": {},
   "outputs": [],
   "source": [
    "speech_verb = {\"POS\": \"VERB\", \"LEMMA\": {\"IN\": SPEAK_LEMMAS}}\n",
    "speaker = {\"POS\": \"PROPN\", \"OP\": \"+\"}\n",
    "\n",
    "interrupted_quote_pattern = quote_pattern + [speech_verb, speaker] + quote_pattern\n",
    "\n",
    "interrupted_quote_matcher = Matcher(nlp.vocab)\n",
    "interrupted_quote_matcher.add(\n",
    "    \"INTERRUPTED_QUOTE\", [interrupted_quote_pattern], greedy=\"LONGEST\"\n",
    ")\n",
    "show_matches(interrupted_quote_matcher, alice_docs[0], sort_by_start=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "note-all-passages",
   "metadata": {},
   "source": [
    "Run the same matcher over every passage. Most passages report `0` because this single pattern only covers interrupted quotations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "transsexual-insight",
   "metadata": {},
   "outputs": [],
   "source": [
    "for alice_doc in alice_docs:\n",
    "    show_matches(interrupted_quote_matcher, alice_doc, sort_by_start=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "note-speech-variants",
   "metadata": {},
   "source": [
    "Registering several patterns under one rule widens coverage to the other attribution orders."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "collected-peter",
   "metadata": {},
   "outputs": [],
   "source": [
    "speech_patterns = [\n",
    "    quote_pattern + [speech_verb, speaker] + quote_pattern,  # 'quote' thought Alice 'quote'\n",
    "    quote_pattern + [speech_verb, speaker],                  # 'quote' said Alice\n",
    "    [speaker, speech_verb] + quote_pattern,                  # Alice said 'quote'\n",
    "]\n",
    "\n",
    "speech_matcher = Matcher(nlp.vocab)\n",
    "speech_matcher.add(\"SPEECH\", speech_patterns, greedy=\"LONGEST\")\n",
    "\n",
    "for alice_doc in alice_docs:\n",
    "    show_matches(speech_matcher, alice_doc, sort_by_start=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}