{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Import the English Language Model\n", "\n", "If you have not already done so, you will need to run this code to download the language model." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: en_core_web_sm==2.2.5 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (2.2.5)\n", "Requirement already satisfied: spacy>=2.2.2 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from en_core_web_sm==2.2.5) (2.2.4)\n", "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (4.44.1)\n", "Requirement already satisfied: setuptools in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (46.1.3)\n", "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.23.0)\n", "Requirement already satisfied: thinc==7.4.0 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (7.4.0)\n", "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.4.1)\n", "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from 
spacy>=2.2.2->en_core_web_sm==2.2.5) (0.6.0)\n", "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.0)\n", "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.1.3)\n", "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.2)\n", "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.2)\n", "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.3)\n", "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.2)\n", "Requirement already satisfied: numpy>=1.15.0 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.18.2)\n", "Requirement already satisfied: idna<3,>=2.5 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.9)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.25.8)\n", "Requirement already satisfied: certifi>=2017.4.17 in 
## define directory path and entity type
import os

# Build the locations with os.path.join instead of hard-coded "/" so the
# separators are right on every platform.
cwd = os.getcwd()
data_loc = os.path.join(cwd, "data")
# The empty last component makes the result end with a path separator.
output_loc = os.path.join(cwd, "output", "")
ent_type = "PERSON"

### entity type can be "PERSON", "NORP", "ORG", "GPE", etc.
### https://spacy.io/api/annotation#named-entities
# Collect every .txt file below data_loc.
allfiles = []

for root, dirs, files in os.walk(data_loc):
    for file in files:
        if file.endswith(".txt"):
            allfiles.append(os.path.join(root, file))

# os.walk yields entries in a filesystem-dependent order; sort so that
# allfiles[0] (the page analyzed below) is the same file on every run.
allfiles.sort()

print('files: %d ' % len(allfiles))

if not allfiles:
    # Fail with a readable message instead of an IndexError on allfiles[0].
    raise FileNotFoundError("no .txt files found under %s" % data_loc)

# Read the first file; the context manager closes it even if read() raises.
with codecs.open(allfiles[0], 'r', encoding='utf-8') as myfile:
    pagetext = myfile.read()


def parse(text=None, pipeline=None):
    """Run a pipeline over a text and render the result with displaCy.

    Renders the first sentence as a dependency graph, then the whole
    document with its entities highlighted.

    Args:
        text: Text to analyze. Defaults to the global ``pagetext``.
        pipeline: Callable that turns text into a Doc. Defaults to the
            global ``nlp`` as bound at call time, so re-assigning ``nlp``
            and calling ``parse()`` again uses the new pipeline.
    """
    if text is None:
        text = pagetext
    if pipeline is None:
        pipeline = nlp
    doc = pipeline(text)
    sentence_spans = list(doc.sents)
    # [0:1] rather than [0] so an empty sentence list does not raise.
    displacy.render(sentence_spans[0:1], options={'compact': True}, style="dep")
    displacy.render(doc, options={'compact': True}, style="ent")
White\tAdams\t\t\tHenry\tL\tFannie\tS\troute agent\tSouthern Railway Co\tHouse\t\n", "\n", " 327\n", " CARDINAL\n", "\n", " n Tryon
White\tAdams\t\t\tJames\t\tGertrude\t\tmanager\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t419 \n", "\n", " Elizabeth\n", " PERSON\n", "\n", " av
\n", "\n", " Black\tAdams\t\t\tJane\n", " EVENT\n", "\n", "\t\t\t\tteacher\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1021 s \n", "\n", " Church\n", "\n", " ORG\n", "\n", "Black\tAdams\t\t\tJohn\t\t\t\tlaborer\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1031 s \n", "\n", " Church\n", " ORG\n", "\n", "
White\tAdams\t\t\tJohn\tJ\t\t\tpresident\tAdams G & P Co and Char Pepsi-Cola Co\tHouse\t\n", "\n", " 309\n", " CARDINAL\n", "\n", " e \n", "\n", " 6th\n", " ORDINAL\n", "\n", "
White\tAdams\t\t\tJohn\tW\tCora\t\tconductor\tS A L Railway\tHouse\t21st nr \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "Black\tAdams\t\t\tJoseph\t\tViolet\t\tcooper\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1011 s \n", "\n", " Church\n", "\n", " ORG\n", "\n", "White\tAdams\tRev\t\t\n", "\n", " Joseph\t\n", " PERSON\n", "\n", "Q\tLeslie\t\t\t\tHouse\t1509 s Boulevard
White\tAdams\t\tMiss\tJulia\tM\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t707 n \n", "\n", " Church\n", "\n", " ORG\n", "\n", "Black\tAdams\t\t\t\n", "\n", " Kate\n", " PERSON\n", "\n", "\t\t\t\tlaundress\t\tHouse\tGroveton
White\tAdams\t\t\tLafayette\tN\t\t\tclerk\tSouthern Railway\tHouse\t327 n Tryon
White\tAdams\t\t\tLawrence\tA\t\t\tsalesman\tB S Moore & Co\tRooms\t405 s Tryon
White\tAdams\t\t\tLaurie\tA\tMargaret\tN\tmill head\t\t\n", "\n", " House\n", " ORG\n", "\n", "\tElizabetMills
\n", "\n", " Black\tAdams\t\t\t\n", " EVENT\n", "\n", "Leland\t\t\t\tporter\tJ P Stowe & Co\tHouse\t403 s Myers
Black\tAdams\t\t\tLizzie\t\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t309 1/2 w Morehead
\n", "\n", " White\tAdams\n", " ORG\n", "\n", "\t\tMiss\tLula\t\t\t\tclerk\tBelk Bros\tRooms\tY W C A
White\tAdams\t\twid (\n", "\n", " Geo O\n", " PRODUCT\n", "\n", ")\tGrace\tM\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t\n", "\n", " 601\n", " CARDINAL\n", "\n", " \n", "\n", " n College\n", " ORG\n", "\n", "
White\tAdams\t\t\tLuther\tM\tMamie\t\t\t\tHouse\tGroveton
\n", "\n", " Black\tAdams\t\t\t\n", " EVENT\n", "\n", "Major\t\t\t\twaiter\tBuford Hotel \t\t
Black\tAdams\t\t\tMattie\t\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t719 n \n", "\n", " Graham\n", " PERSON\n", "\n", " ext
\n", "\n", " White\tAdams\n", " EVENT\n", "\n", "\t\tMiss\tPattie\tV\t\t\tstenographer\t\t\n", "\n", " Boards\t708\n", " EVENT\n", "\n", " n \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "Black\tAdams\t\t\tReuben\t\t\n", "\n", " Belle\n", " PERSON\n", "\n", "\t\tlaborer\tY & B Co\tHouse\t\n", "\n", " 419\n", " CARDINAL\n", "\n", " w 2d
Black\tAdams\t\t\tRosa\t\t\t\t\t\tHouse\t714 s \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "Black\tAdams\t\t\tRufus\t\t\t\tlaborer\t\t\n", "\n", " House\n", " ORG\n", "\n", "\tGreenville
\n", "\n", " Black\tAdams\t\t\t\n", " EVENT\n", "\n", "Rufus\t\t\t\tdriver\tStand I & F Co\tHouse\tRoss Town
\n", "\n", " White\tAdams\n", " EVENT\n", "\n", "\t\tMiss\tSalie\tH\t\t\tassistant\tCarnegie Library\tHouse\t\n", "\n", " 707\n", " CARDINAL\n", "\n", " n \n", "\n", " Church\n", "\n", " ORG\n", "\n", "Black\tAdams\t\t\tViolet\t\t\t\tservant\t\t\t\n", "\n", " 508\n", " CARDINAL\n", "\n", " w Trade
White\tAdams\t\t\tWheeler\tF\tMamie\t\tmoulder\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t303 s \n", "\n", " Cedar\n", " PRODUCT\n", "\n", "
White\tAdams\t\t\tWilliam\tE\t\t\t\t\n", "\n", " The Chronicle\tRooms\n", " ORG\n", "\n", "\t300 \n", "\n", " 1/2\n", " CARDINAL\n", "\n", " s \n", "\n", " Church\n", " ORG\n", "\n", "
White\tAdcock\t\t\tJohn\tF\t\t\tmill worker\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t916 Calvine av
\n", "\n", " White\t\n", " WORK_OF_ART\n", "\n", "Adcock\t\twid (\n", "\n", " Jas M\n", " PERSON\n", "\n", ")\tMillie\tM\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t916 Calvine av
White\tAdelsheimer\t\t\tHenry\tS\tLizzie\t\tmill worker\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1216 \n", "\n", " Louise\n", " PERSON\n", "\n", " av
\n", "\n", " Black\tAdkins\t\t\tKing\t\t\t\t\n", " WORK_OF_ART\n", "\n", "laborer\t\t\t\n", "\n", " 600\n", " CARDINAL\n", "\n", " s Myers
\n", "\n", " White\t\n", " DATE\n", "\n", "Adkins \n", "\n", " Walter D\n", " PERSON\n", "\n", " (\n", "\n", " Leona E\n", " PERSON\n", "\n", ")\t\t\tWalter\tD\tLeona\tE\tlineman\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t(r) \n", "\n", " 305 e 13th\n", " TIME\n", "\n", "
\n", "\n", " Black\tAgers\n", " WORK_OF_ART\n", "\n", "\t\t\tNancy\t\t\t\tcook\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t206 Wilson
\n", "\n", " Black\tAgers\t\t\t\n", " WORK_OF_ART\n", "\n", "Sallie\t\t\t\tlaundress\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t420 Jackson
White\tAhaus\t\t\tHerman\t\t\n", "\n", " Frances\t\n", " PERSON\n", "\n", "E\ttailor\t203 w \n", "\n", " 4th\n", " ORDINAL\n", "\n", "\tHouse\t\n", "\n", " 204\n", " CARDINAL\n", "\n", " s \n", "\n", " Church\n", "\n", " ORG\n", "\n", "White\tAikel\t\t\tJoseph\t\t\t\tconfectioner\t317 e Trade\tRooms\t\n", "\n", " 225\n", " CARDINAL\n", "\n", " w Trade
White\tAiken\t\t\tGeorge\tW M\t\n", "\n", " Barbara\n", " PERSON\n", "\n", "\t\tsuperintendent\tQueen City M & G Wks\tHouse\t1120 s \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "White\tAiken\t\t\tHenry\t\t\t\t\t\tRooms\t\n", "\n", " 9\n", " CARDINAL\n", "\n", " e \n", "\n", " 3d\n", " CARDINAL\n", "\n", "
Black\tAiken\t\t\tWalter\t\t\n", "\n", " Ella\n", " PERSON\n", "\n", "\t\tlaborer\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t\n", "\n", " 600\n", " CARDINAL\n", "\n", " e \n", "\n", " 2d\n", " CARDINAL\n", "\n", " \n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "parse()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Student Exercise\n", "\n", "Analyze the results obtained above. How accurate are the recognized entities? Can you point out any reasons why certain mistakes were made\n", "by the \"out of the box\" model?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create Line-by-Line Sentence Boundaries\n", "\n", "Our directory text files contain one group of related words per line, but they aren't exactly sentences.\n", "Let's see if we can improve the NLP output by telling the pipeline that each line is a sentence of related\n", "words. The code below creates a function 'set_newline_sentences', which is added to our NLP pipeline.\n", "\n", "## Newline and Escape Characters\n", "The newline character in text-encoded files is only indirectly visible. It causes the character after it\n", "to jump to the next line when the file is printed or displayed in an editor or viewer. In programming languages you\n", "often need to create a newline character within a string, without typing a literal line-break. Instead we use\n", "an \"escape code\" to add the invisible character. Newline's escape code is '\\n'. String escape codes in most \n", "programming languages start with a '\\'; for instance, a tab character is created by placing '\\t' in a string." 
def set_newline_sentences(doc):
    """Pipeline component: make each line of the text its own sentence.

    Meant to be added before the parser, so the boundaries are already set
    when the parser runs.

    The token following any newline-bearing whitespace token is marked as a
    sentence start; every token whose boundary flag is still undecided
    (``None``) is marked as not starting a sentence.

    Args:
        doc: The Doc being processed; modified in place.

    Returns:
        The same ``doc`` object.
    """
    last_index = len(doc) - 1
    for token in doc:
        # Use "in", not "==": a run of whitespace such as tab+tab+newline
        # (a row whose last columns are empty) is a single token, and it
        # still ends the line.
        if "\n" in token.text and token.i < last_index:
            doc[token.i + 1].is_sent_start = True
        # Pin every undecided token, including newline tokens and the final
        # token, so no boundary is left open inside a line.
        if token.is_sent_start is None:
            token.is_sent_start = False
    return doc
"\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " punct\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " compound\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " compound\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " compound\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " nmod\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " intj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
White\tAdams\t\t\tHenry\tL\tFannie\tS\troute agent\tSouthern Railway Co\tHouse\t\n", "\n", " 327\n", " CARDINAL\n", "\n", " n Tryon
White\tAdams\t\t\tJames\t\tGertrude\t\tmanager\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t419 \n", "\n", " Elizabeth\n", " PERSON\n", "\n", " av
\n", "\n", " Black\tAdams\t\t\tJane\n", " EVENT\n", "\n", "\t\t\t\tteacher\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1021 s \n", "\n", " Church\n", "\n", " ORG\n", "\n", "Black\tAdams\t\t\tJohn\t\t\t\tlaborer\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1031 s \n", "\n", " Church\n", " ORG\n", "\n", "
White\tAdams\t\t\tJohn\tJ\t\t\tpresident\tAdams G & P Co and \n", "\n", " Char Pepsi-Cola Co\n", " ORG\n", "\n", "\tHouse\t\n", "\n", " 309\n", " CARDINAL\n", "\n", " e \n", "\n", " 6th\n", " ORDINAL\n", "\n", "
White\tAdams\t\t\tJohn\tW\tCora\t\tconductor\tS A L Railway\tHouse\t21st nr \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "Black\tAdams\t\t\tJoseph\t\tViolet\t\tcooper\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1011 s \n", "\n", " Church\n", "\n", " ORG\n", "\n", "White\tAdams\tRev\t\t\n", "\n", " Joseph\t\n", " PERSON\n", "\n", "Q\tLeslie\t\t\t\tHouse\t1509 s Boulevard
White\tAdams\t\tMiss\tJulia\tM\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t707 n \n", "\n", " Church\n", "\n", " ORG\n", "\n", "Black\tAdams\t\t\t\n", "\n", " Kate\n", " PERSON\n", "\n", "\t\t\t\tlaundress\t\tHouse\tGroveton
White\tAdams\t\t\tLafayette\tN\t\t\tclerk\tSouthern Railway\tHouse\t327 n Tryon
White\tAdams\t\t\tLawrence\tA\t\t\tsalesman\tB S Moore & Co\tRooms\t405 s Tryon
White\tAdams\t\t\tLaurie\tA\tMargaret\tN\tmill head\t\t\n", "\n", " House\n", " ORG\n", "\n", "\tElizabetMills
\n", "\n", " Black\tAdams\t\t\tLeland\t\t\t\t\n", " EVENT\n", "\n", "porter\tJ P Stowe & Co\tHouse\t403 s Myers
Black\tAdams\t\t\tLizzie\t\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t309 1/2 w Morehead
\n", "\n", " White\tAdams\n", " ORG\n", "\n", "\t\tMiss\tLula\t\t\t\tclerk\tBelk Bros\tRooms\tY W C A
White\tAdams\t\twid (\n", "\n", " Geo O\n", " PRODUCT\n", "\n", ")\tGrace\tM\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t\n", "\n", " 601\n", " CARDINAL\n", "\n", " \n", "\n", " n College\n", " ORG\n", "\n", "
White\tAdams\t\t\tLuther\tM\tMamie\t\t\t\tHouse\tGroveton
\n", "\n", " Black\tAdams\t\t\tMajor\n", " EVENT\n", "\n", "\t\t\t\twaiter\tBuford Hotel \t\t
Black\tAdams\t\t\tMattie\t\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t719 n \n", "\n", " Graham\n", " PERSON\n", "\n", " ext
\n", "\n", " White\tAdams\n", " EVENT\n", "\n", "\t\tMiss\tPattie\tV\t\t\tstenographer\t\t\n", "\n", " Boards\t708\n", " EVENT\n", "\n", " n \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "Black\tAdams\t\t\tReuben\t\t\n", "\n", " Belle\n", " PERSON\n", "\n", "\t\tlaborer\tY & B Co\tHouse\t\n", "\n", " 419\n", " CARDINAL\n", "\n", " w 2d
Black\tAdams\t\t\tRosa\t\t\t\t\t\tHouse\t714 s \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "Black\tAdams\t\t\tRufus\t\t\t\tlaborer\t\t\n", "\n", " House\n", " ORG\n", "\n", "\tGreenville
\n", "\n", " Black\tAdams\t\t\tRufus\n", " EVENT\n", "\n", "\t\t\t\tdriver\tStand I & F Co\tHouse\tRoss Town
\n", "\n", " White\tAdams\n", " EVENT\n", "\n", "\t\tMiss\tSalie\tH\t\t\tassistant\tCarnegie Library\tHouse\t\n", "\n", " 707\n", " CARDINAL\n", "\n", " n \n", "\n", " Church\n", "\n", " ORG\n", "\n", "Black\tAdams\t\t\tViolet\t\t\t\tservant\t\t\t\n", "\n", " 508\n", " CARDINAL\n", "\n", " w Trade
White\tAdams\t\t\tWheeler\tF\tMamie\t\tmoulder\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t303 s \n", "\n", " Cedar\n", " PRODUCT\n", "\n", "
White\tAdams\t\t\tWilliam\tE\t\t\t\t\n", "\n", " The Chronicle\tRooms\n", " ORG\n", "\n", "\t300 \n", "\n", " 1/2\n", " CARDINAL\n", "\n", " s \n", "\n", " Church\n", " ORG\n", "\n", "
White\tAdcock\t\t\tJohn\tF\t\t\tmill worker\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t916 Calvine av
\n", "\n", " White\tAdcock\n", " WORK_OF_ART\n", "\n", "\t\twid (\n", "\n", " Jas M\n", " PERSON\n", "\n", ")\tMillie\tM\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t916 Calvine av
White\tAdelsheimer\t\t\tHenry\tS\tLizzie\t\tmill worker\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1216 \n", "\n", " Louise\n", " PERSON\n", "\n", " av
\n", "\n", " Black\tAdkins\t\t\tKing\t\t\t\tlaborer\n", " WORK_OF_ART\n", "\n", "\t\t\t\n", "\n", " 600\n", " CARDINAL\n", "\n", " s Myers
\n", "\n", " White\t\n", " DATE\n", "\n", "Adkins \n", "\n", " Walter D\n", " PERSON\n", "\n", " (\n", "\n", " Leona E\n", " PERSON\n", "\n", ")\t\t\tWalter\tD\tLeona\tE\tlineman\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t(r) \n", "\n", " 305 e 13th\n", " TIME\n", "\n", "
\n", "\n", " Black\tAgers\n", " WORK_OF_ART\n", "\n", "\t\t\tNancy\t\t\t\tcook\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t206 Wilson
\n", "\n", " Black\tAgers\t\t\tSallie\n", " WORK_OF_ART\n", "\n", "\t\t\t\tlaundress\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t420 Jackson
White\tAhaus\t\t\tHerman\t\t\n", "\n", " Frances\t\n", " PERSON\n", "\n", "E\ttailor\t203 w \n", "\n", " 4th\n", " ORDINAL\n", "\n", "\tHouse\t\n", "\n", " 204\n", " CARDINAL\n", "\n", " s \n", "\n", " Church\n", "\n", " ORG\n", "\n", "White\tAikel\t\t\tJoseph\t\t\t\tconfectioner\t317 e Trade\tRooms\t\n", "\n", " 225\n", " CARDINAL\n", "\n", " w Trade
White\tAiken\t\t\tGeorge\tW M\t\n", "\n", " Barbara\n", " PERSON\n", "\n", "\t\tsuperintendent\tQueen City M & G Wks\tHouse\t1120 s \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "White\tAiken\t\t\tHenry\t\t\t\t\t\tRooms\t\n", "\n", " 9\n", " CARDINAL\n", "\n", " e \n", "\n", " 3d\n", " CARDINAL\n", "\n", "
Black\tAiken\t\t\tWalter\t\t\n", "\n", " Ella\n", " PERSON\n", "\n", "\t\tlaborer\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t\n", "\n", " 600\n", " CARDINAL\n", "\n", " e \n", "\n", " 2d\n", " CARDINAL\n", "\n", " \n", "
from spacy.pipeline import EntityRuler

# Load a clean pipeline first, so the ruler below is built from the same
# nlp object it is added to (not from the pipeline of an earlier cell).
nlp = spacy.load('en_core_web_sm')

# Token patterns: one token whose lowercase form is "black" or "white".
patterns = [
    {"label": "RACE", "pattern": [{"LOWER": "black"}]},
    {"label": "RACE", "pattern": [{"LOWER": "white"}]},
]
race_entities = EntityRuler(nlp)
race_entities.add_patterns(patterns)

# NOTE(review): presumably registers RACE with the statistical NER so it
# accepts the spans pre-set by the ruler -- confirm this is still required.
nlp.entity.add_label('RACE')
nlp.add_pipe(set_newline_sentences, before="parser")
# The ruler runs before "ner", so the RACE spans are already set when the
# statistical recognizer runs.
nlp.add_pipe(race_entities, before="ner")
"\n", "\n", " \n", " \n", " nmod\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " punct\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " compound\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " compound\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " compound\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " nmod\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " intj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\t\tHenry\tL\tFannie\tS\troute agent\tSouthern Railway Co\tHouse\t327 n Tryon
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\t\t\n", "\n", " James\n", " PERSON\n", "\n", "\t\tGertrude\t\tmanager\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t419 \n", "\n", " Elizabeth\n", " PERSON\n", "\n", " av
\n", "\n", " Black\n", " RACE\n", "\n", "\tAdams\t\t\t\n", "\n", " Jane\n", " PERSON\n", "\n", "\t\t\t\tteacher\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1021 s \n", "\n", " Church\n", "\n", " ORG\n", "\n", "\n", "\n", " Black\n", " RACE\n", "\n", "\tAdams\t\t\tJohn\t\t\t\tlaborer\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1031 s \n", "\n", " Church\n", " ORG\n", "\n", "
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\t\tJohn\tJ\t\t\tpresident\tAdams G & P Co and \n", "\n", " Char Pepsi-Cola Co\n", " ORG\n", "\n", "\tHouse\t\n", "\n", " 309\n", " CARDINAL\n", "\n", " e \n", "\n", " 6th\n", " ORDINAL\n", "\n", "
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\t\tJohn\tW\tCora\t\tconductor\tS A L Railway\tHouse\t21st nr \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "\n", "\n", " Black\n", " RACE\n", "\n", "\tAdams\t\t\t\n", "\n", " Joseph\n", " PERSON\n", "\n", "\t\tViolet\t\tcooper\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1011 s \n", "\n", " Church\n", "\n", " ORG\n", "\n", "\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\tRev\t\t\n", "\n", " Joseph\t\n", " PERSON\n", "\n", "Q\tLeslie\t\t\t\tHouse\t1509 s Boulevard
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\tMiss\tJulia\tM\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t707 n \n", "\n", " Church\n", "\n", " ORG\n", "\n", "\n", "\n", " Black\n", " RACE\n", "\n", "\tAdams\t\t\t\n", "\n", " Kate\n", " PERSON\n", "\n", "\t\t\t\tlaundress\t\tHouse\tGroveton
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\t\tLafayette\tN\t\t\tclerk\tSouthern Railway\tHouse\t327 n Tryon
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\t\tLawrence\tA\t\t\tsalesman\tB S Moore & Co\tRooms\t405 s Tryon
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\t\tLaurie\tA\tMargaret\tN\tmill head\t\t\n", "\n", " House\n", " ORG\n", "\n", "\tElizabetMills
\n", "\n", " Black\n", " RACE\n", "\n", "\tAdams\t\t\tLeland\t\t\t\tporter\tJ P Stowe & Co\tHouse\t403 s Myers
\n", "\n", " Black\n", " RACE\n", "\n", "\tAdams\t\t\tLizzie\t\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t309 1/2 w Morehead
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\tMiss\tLula\t\t\t\tclerk\tBelk Bros\tRooms\tY W C A
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\twid (\n", "\n", " Geo O\n", " PRODUCT\n", "\n", ")\tGrace\tM\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t\n", "\n", " 601\n", " CARDINAL\n", "\n", " \n", "\n", " n College\n", " ORG\n", "\n", "
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\t\tLuther\tM\tMamie\t\t\t\tHouse\tGroveton
\n", "\n", " Black\n", " RACE\n", "\n", "\tAdams\t\t\tMajor\t\t\t\twaiter\tBuford Hotel \t\t
\n", "\n", " Black\n", " RACE\n", "\n", "\tAdams\t\t\tMattie\t\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t719 n \n", "\n", " Graham\n", " PERSON\n", "\n", " ext
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\tMiss\tPattie\tV\t\t\tstenographer\t\t\n", "\n", " Boards\t708\n", " EVENT\n", "\n", " n \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "\n", "\n", " Black\n", " RACE\n", "\n", "\tAdams\t\t\tReuben\t\t\n", "\n", " Belle\n", " PERSON\n", "\n", "\t\tlaborer\tY & B Co\tHouse\t\n", "\n", " 419\n", " CARDINAL\n", "\n", " w 2d
\n", "\n", " Black\n", " RACE\n", "\n", "\tAdams\t\t\tRosa\t\t\t\t\t\tHouse\t714 s \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "\n", "\n", " Black\n", " RACE\n", "\n", "\tAdams\t\t\tRufus\t\t\t\tlaborer\t\t\n", "\n", " House\n", " ORG\n", "\n", "\tGreenville
\n", "\n", " Black\n", " RACE\n", "\n", "\tAdams\t\t\tRufus\t\t\t\tdriver\tStand I & F Co\tHouse\tRoss Town
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\tMiss\tSalie\tH\t\t\tassistant\tCarnegie Library\tHouse\t\n", "\n", " 707\n", " CARDINAL\n", "\n", " n \n", "\n", " Church\n", "\n", " ORG\n", "\n", "\n", "\n", " Black\n", " RACE\n", "\n", "\tAdams\t\t\tViolet\t\t\t\tservant\t\t\t\n", "\n", " 508\n", " CARDINAL\n", "\n", " w Trade
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\t\tWheeler\tF\t\n", "\n", " Mamie\n", " PERSON\n", "\n", "\t\tmoulder\t\tHouse\t303 s \n", "\n", " Cedar\n", " PRODUCT\n", "\n", "
\n", "\n", " White\n", " RACE\n", "\n", "\tAdams\t\t\tWilliam\tE\t\t\t\t\n", "\n", " The Chronicle\tRooms\n", " WORK_OF_ART\n", "\n", "\t300 \n", "\n", " 1/2\n", " CARDINAL\n", "\n", " s \n", "\n", " Church\n", " ORG\n", "\n", "
\n", "\n", " White\n", " RACE\n", "\n", "\tAdcock\t\t\tJohn\tF\t\t\tmill worker\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t916 Calvine av
\n", "\n", " White\n", " RACE\n", "\n", "\tAdcock\t\twid (\n", "\n", " Jas M\n", " PERSON\n", "\n", ")\tMillie\tM\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t916 Calvine av
\n", "\n", " White\n", " RACE\n", "\n", "\tAdelsheimer\t\t\tHenry\tS\tLizzie\t\tmill worker\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1216 \n", "\n", " Louise\n", " PERSON\n", "\n", " av
\n", "\n", " Black\n", " RACE\n", "\n", "\tAdkins\t\t\tKing\t\t\t\tlaborer\t\t\t\n", "\n", " 600\n", " CARDINAL\n", "\n", " s Myers
\n", "\n", " White\n", " RACE\n", "\n", "\tAdkins \n", "\n", " Walter D\n", " PERSON\n", "\n", " (\n", "\n", " Leona E\n", " PERSON\n", "\n", ")\t\t\tWalter\tD\tLeona\tE\tlineman\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t(r) \n", "\n", " 305 e 13th\n", " TIME\n", "\n", "
\n", "\n", " Black\n", " RACE\n", "\n", "\tAgers\t\t\tNancy\t\t\t\tcook\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t206 Wilson
\n", "\n", " Black\n", " RACE\n", "\n", "\tAgers\t\t\tSallie\t\t\t\tlaundress\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t420 Jackson
\n", "\n", " White\n", " RACE\n", "\n", "\tAhaus\t\t\tHerman\t\t\n", "\n", " Frances\t\n", " PERSON\n", "\n", "E\ttailor\t203 w \n", "\n", " 4th\n", " ORDINAL\n", "\n", "\tHouse\t\n", "\n", " 204\n", " CARDINAL\n", "\n", " s \n", "\n", " Church\n", "\n", " ORG\n", "\n", "\n", "\n", " White\n", " RACE\n", "\n", "\tAikel\t\t\tJoseph\t\t\t\tconfectioner\t317 e Trade\tRooms\t225 w Trade
\n", "\n", " White\n", " RACE\n", "\n", "\tAiken\t\t\tGeorge\tW M\t\n", "\n", " Barbara\n", " PERSON\n", "\n", "\t\tsuperintendent\tQueen City M & G Wks\tHouse\t1120 s \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "\n", "\n", " White\n", " RACE\n", "\n", "\tAiken\t\t\tHenry\t\t\t\t\t\tRooms\t\n", "\n", " 9\n", " CARDINAL\n", "\n", " e \n", "\n", " 3d\n", " CARDINAL\n", "\n", "
\n", "\n", " Black\n", " RACE\n", "\n", "\tAiken\t\t\t\n", "\n", " Walter\n", " PERSON\n", "\n", "\t\t\n", "\n", " Ella\n", " PERSON\n", "\n", "\t\tlaborer\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t\n", "\n", " 600\n", " CARDINAL\n", "\n", " e \n", "\n", " 2d\n", " CARDINAL\n", "\n", " \n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "parse()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [
 "from spacy.tokens import Span\n",
 "\n",
 "\n",
 "def lastname_follows_race_entities(doc):\n",
 "    \"\"\"Label the first non-whitespace token after each RACE entity as PERSON.\n",
 "\n",
 "    In the directory rows shown in this notebook the race column is followed by\n",
 "    a tab and then the last name, so that token is tagged as a PERSON entity.\n",
 "\n",
 "    Args:\n",
 "        doc: spaCy Doc whose ents already contain the RACE spans.\n",
 "\n",
 "    Returns:\n",
 "        The same Doc, with the added PERSON spans included in doc.ents.\n",
 "    \"\"\"\n",
 "    # doc.ents rejects overlapping spans, so track token positions already in use.\n",
 "    taken = {idx for ent in doc.ents for idx in range(ent.start, ent.end)}\n",
 "    new_ents = []\n",
 "    for ent in doc.ents:\n",
 "        new_ents.append(ent)\n",
 "        if ent.label_ != \"RACE\":\n",
 "            continue\n",
 "        # Step over the separator whitespace, but stop at the end of the doc\n",
 "        # or at a line break so the next row is never touched.\n",
 "        pos = ent.end\n",
 "        while pos < len(doc) and doc[pos].is_space and \"\\n\" not in doc[pos].text:\n",
 "            pos += 1\n",
 "        if pos >= len(doc) or doc[pos].is_space or pos in taken:\n",
 "            continue\n",
 "        new_ents.append(Span(doc, pos, pos + 1, label=\"PERSON\"))\n",
 "        taken.add(pos)\n",
 "    doc.ents = new_ents\n",
 "    return doc\n",
 "\n",
 "\n",
 "# Rebuild the pipeline; \"lastname\" is placed right after \"race\", which is itself added before \"ner\".\n",
 "nlp = spacy.load('en_core_web_sm')\n",
 "nlp.add_pipe(set_newline_sentences, name=\"newline\", before=\"parser\")\n",
 "nlp.entity.add_label('RACE')\n",
 "nlp.add_pipe(race_entities, name=\"race\", before=\"ner\")\n",
 "nlp.add_pipe(lastname_follows_race_entities, name=\"lastname\", after='race')"
 ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", " White\n", " PROPN\n", "\n", "\n", "\n", " \t\n", " SPACE\n", "\n", "\n", "\n", " Adams\n", " PROPN\n", "\n", "\n", "\n", " \t\t\t\n", " SPACE\n", "\n", "\n", "\n", " Henry\n", " PROPN\n", "\n", "\n", "\n", " \t\n", " SPACE\n", "\n", "\n", "\n", " L\n", " NOUN\n", "\n", "\n", "\n", " \t\n", " SPACE\n", "\n", "\n", "\n", " Fannie\n", " PROPN\n", "\n", "\n", "\n", " \t\n", " SPACE\n", "\n", "\n", "\n", " S\n", " PROPN\n", "\n", "\n", "\n", " \t\n", " SPACE\n", "\n", "\n", "\n", " route\n", " NOUN\n", "\n", "\n", "\n", " agent\n", " NOUN\n", "\n", "\n", "\n", " \t\n", " SPACE\n", "\n", "\n", "\n", " Southern\n", " PROPN\n", "\n", "\n", "\n", " Railway\n", " PROPN\n", "\n", "\n", "\n", " Co\n", " PROPN\n", "\n", "\n", "\n", " \t\n", " SPACE\n", "\n", "\n", "\n", " House\n", " PROPN\n", "\n", "\n", "\n", " \t\n", " SPACE\n", "\n", "\n", "\n", " 327\n", " NUM\n", "\n", "\n", "\n", " n\n", " CCONJ\n", "\n", "\n", "\n", " Tryon\n", " PROPN\n", "\n", "\n", "\n", " \n", "\n", " SPACE\n", "\n", "\n", 
"\n", " \n", " \n", " compound\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " nmod\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " nmod\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " punct\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " compound\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " compound\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " compound\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " nmod\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " intj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tHenry\tL\tFannie\tS\troute agent\tSouthern Railway Co\tHouse\t327 n Tryon
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tJames\t\tGertrude\t\tmanager\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t419 \n", "\n", " Elizabeth\n", " PERSON\n", "\n", " av
\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tJane\t\t\t\tteacher\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1021 s \n", "\n", " Church\n", "\n", " ORG\n", "\n", "\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tJohn\t\t\t\tlaborer\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1031 s \n", "\n", " Church\n", " ORG\n", "\n", "
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tJohn\tJ\t\t\tpresident\tAdams G & P Co and \n", "\n", " Char Pepsi-Cola Co\n", " ORG\n", "\n", "\tHouse\t\n", "\n", " 309\n", " CARDINAL\n", "\n", " e \n", "\n", " 6th\n", " ORDINAL\n", "\n", "
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tJohn\tW\tCora\t\tconductor\tS \n", "\n", " A L Railway\n", " ORG\n", "\n", "\tHouse\t21st nr \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tJoseph\t\t\n", "\n", " Violet\n", " NORP\n", "\n", "\t\tcooper\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1011 s \n", "\n", " Church\n", "\n", " ORG\n", "\n", "\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\tRev\t\t\n", "\n", " Joseph\t\n", " PERSON\n", "\n", "Q\tLeslie\t\t\t\tHouse\t1509 s Boulevard
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\tMiss\tJulia\tM\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t707 n \n", "\n", " Church\n", "\n", " ORG\n", "\n", "\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\t\n", "\n", " Kate\n", " PERSON\n", "\n", "\t\t\t\tlaundress\t\tHouse\tGroveton
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tLafayette\tN\t\t\tclerk\tSouthern Railway\tHouse\t327 n Tryon
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tLawrence\tA\t\t\tsalesman\tB S Moore & Co\tRooms\t405 s \n", "\n", " Tryon\n", " PRODUCT\n", "\n", "
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tLaurie\tA\tMargaret\tN\tmill head\t\t\n", "\n", " House\n", " ORG\n", "\n", "\tElizabetMills
\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tLeland\t\t\t\tporter\tJ P Stowe & Co\tHouse\t403 s \n", "\n", " Myers \n", "\n", " ORG\n", "\n", "\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tLizzie\t\t\t\t\t\tHouse\t309 \n", "\n", " 1/2\n", " CARDINAL\n", "\n", " w Morehead
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\tMiss\tLula\t\t\t\tclerk\tBelk Bros\tRooms\tY W C A
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\twid (\n", "\n", " Geo O\n", " PRODUCT\n", "\n", ")\tGrace\tM\t\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t\n", "\n", " 601\n", " CARDINAL\n", "\n", " \n", "\n", " n College\n", " ORG\n", "\n", "
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tLuther\tM\tMamie\t\t\t\t\n", "\n", " House\n", " ORG\n", "\n", "\tGroveton
\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tMajor\t\t\t\twaiter\tBuford Hotel \t\t
\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tMattie\t\t\t\t\t\tHouse\t719 n \n", "\n", " Graham\n", " PERSON\n", "\n", " ext
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\tMiss\tPattie\tV\t\t\tstenographer\t\t\n", "\n", " Boards\t708\n", " EVENT\n", "\n", " n \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tReuben\t\t\n", "\n", " Belle\n", " PERSON\n", "\n", "\t\tlaborer\tY & B Co\tHouse\t\n", "\n", " 419\n", " CARDINAL\n", "\n", " w 2d
\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tRosa\t\t\t\t\t\tHouse\t714 s \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tRufus\t\t\t\tlaborer\t\t\n", "\n", " House\n", " ORG\n", "\n", "\tGreenville
\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tRufus\t\t\t\tdriver\tStand I & F Co\tHouse\tRoss Town
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\tMiss\tSalie\tH\t\t\tassistant\tCarnegie Library\tHouse\t707 n \n", "\n", " Church\n", "\n", " ORG\n", "\n", "\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tViolet\t\t\t\tservant\t\t\t\n", "\n", " 508\n", " CARDINAL\n", "\n", " w Trade
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tWheeler\tF\t\n", "\n", " Mamie\n", " PERSON\n", "\n", "\t\tmoulder\t\tHouse\t303 s \n", "\n", " Cedar\n", " PRODUCT\n", "\n", "
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adams\n", " PERSON\n", "\n", "\t\t\tWilliam\tE\t\t\t\t\n", "\n", " The Chronicle\tRooms\n", " ORG\n", "\n", "\t300 \n", "\n", " 1/2\n", " CARDINAL\n", "\n", " s \n", "\n", " Church\n", " ORG\n", "\n", "
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adcock\n", " PERSON\n", "\n", "\t\t\t\n", "\n", " John\tF\t\t\t\n", " PERSON\n", "\n", "mill worker\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t916 Calvine av
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adcock\n", " PERSON\n", "\n", "\t\twid (Jas M)\tMillie\tM\t\t\t\t\tHouse\t916 Calvine av
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adelsheimer\n", " PERSON\n", "\n", "\t\t\tHenry\tS\tLizzie\t\tmill worker\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t1216 \n", "\n", " Louise\n", " PERSON\n", "\n", " av
\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Adkins\n", " PERSON\n", "\n", "\t\t\tKing\t\t\t\tlaborer\t\t\t\n", "\n", " 600\n", " CARDINAL\n", "\n", " s Myers
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Adkins\n", " PERSON\n", "\n", " Walter D (Leona E)\t\t\tWalter\tD\tLeona\tE\tlineman\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t(r) \n", "\n", " 305 e 13th\n", " TIME\n", "\n", "
\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Agers\n", " PERSON\n", "\n", "\t\t\tNancy\t\t\t\tcook\t\tHouse\t206 Wilson
\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Agers\n", " PERSON\n", "\n", "\t\t\tSallie\t\t\t\tlaundress\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t420 Jackson
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Ahaus\n", " PERSON\n", "\n", "\t\t\tHerman\t\t\n", "\n", " Frances\t\n", " PERSON\n", "\n", "E\ttailor\t203 w \n", "\n", " 4th\n", " ORDINAL\n", "\n", "\tHouse\t\n", "\n", " 204\n", " CARDINAL\n", "\n", " s \n", "\n", " Church\n", "\n", " ORG\n", "\n", "\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Aikel\n", " PERSON\n", "\n", "\t\t\tJoseph\t\t\t\tconfectioner\t317 e Trade\tRooms\t225 w Trade
\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Aiken\n", " PERSON\n", "\n", "\t\t\tGeorge\tW M\t\n", "\n", " Barbara\n", " PERSON\n", "\n", "\t\tsuperintendent\tQueen City M & G Wks\tHouse\t1120 s \n", "\n", " Caldwell \n", "\n", " ORG\n", "\n", "\n", "\n", " White\n", " RACE\n", "\n", "\t\n", "\n", " Aiken\n", " PERSON\n", "\n", "\t\t\tHenry\t\t\t\t\t\tRooms\t9 e 3d
\n", "\n", " Black\n", " RACE\n", "\n", "\t\n", "\n", " Aiken\n", " PERSON\n", "\n", "\t\t\tWalter\t\t\n", "\n", " Ella\n", " PERSON\n", "\n", "\t\tlaborer\t\t\n", "\n", " House\n", " ORG\n", "\n", "\t\n", "\n", " 600\n", " CARDINAL\n", "\n", " e \n", "\n", " 2d\n", " CARDINAL\n", "\n", " \n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "parse()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Return top entities" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [
 "MIN_FREQUENCY = 5  # a name must occur more often than this to be exported\n",
 "CSV_HEADER = ['Name', 'Frequency']\n",
 "\n",
 "\n",
 "def write_frequency_csv(path, common):\n",
 "    \"\"\"Write (key, count) pairs to a UTF-8 CSV file and return the rows written.\n",
 "\n",
 "    Args:\n",
 "        path: destination file path.\n",
 "        common: (key, count) pairs as returned by Counter.most_common().\n",
 "\n",
 "    Returns:\n",
 "        The list of [name, count] rows written below the header.\n",
 "    \"\"\"\n",
 "    # key[0] is the exported name, as before; presumably keys are tuples whose\n",
 "    # first item is the entity text -- TODO confirm where the lists are built.\n",
 "    # Plain str is written: under Python 3 bytes would appear as b'...' in the file.\n",
 "    rows = [[key[0], count] for key, count in common]\n",
 "    # newline='' lets the csv module control line endings itself.\n",
 "    with open(path, 'w', newline='', encoding='utf-8') as fo:\n",
 "        csv_writer = csv.writer(fo)\n",
 "        csv_writer.writerow(CSV_HEADER)\n",
 "        csv_writer.writerows(rows)\n",
 "    return rows\n",
 "\n",
 "\n",
 "# exist_ok keeps the cell re-runnable. Files are addressed through output_loc\n",
 "# instead of os.chdir, so the kernel's working directory is left unchanged.\n",
 "os.makedirs(output_loc, exist_ok=True)\n",
 "\n",
 "namecount = Counter(filter_entlist)\n",
 "fullnamecount = Counter(filter_entlist2)\n",
 "commonnames = [x for x in fullnamecount.most_common() if x[1] > MIN_FREQUENCY]\n",
 "commonall = [x for x in namecount.most_common() if x[1] > MIN_FREQUENCY]\n",
 "\n",
 "entities_table = write_frequency_csv(os.path.join(output_loc, \"entities_fullnames.csv\"), commonnames)\n",
 "entities_table2 = write_frequency_csv(os.path.join(output_loc, \"names_all.csv\"), commonall)\n"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }