{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Converting AIF To Pandas\n", "This notebook shows how to convert an AIDA TA1 AIF file to Pandas to make it programmer-friendly" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import os\n", "import io\n", "from IPython.display import display, HTML, Image" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Before you start\n", "All the examples used in this document read from the /aida folder to make sure that the cells can be run in an independent manner.\n", "\n", "We create the /results folder inside so you can see the results generated from each of the KGTK operations. This way if a cell produces an error, you can continue browsing the notebook." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# -p: do not fail when the folder already exists, so the notebook survives Restart & Run All\n", "!mkdir -p sample_data/aida/results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convert AIF triples to TSV KGTK format" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " .\r\n", " _:b0 .\r\n", " _:b1 .\r\n", " _:g0 .\r\n", "_:g0 .\r\n", "_:g0 \"{\\\"fileType\\\":\\\"en\\\"}\"^^ .\r\n", "_:g0 .\r\n", " .\r\n", " \"32\"^^ .\r\n", "_:g1 .\r\n" ] } ], "source": [ "!head sample_data/aida/HC00001DO.ttl.nt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Define prefixes to compress the URIs**" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2
0entityprefix_expansionhttp://www.isi.edu/gaia/entities/
1relationprefix_expansionhttp://www.isi.edu/gaia/relations/
2eventprefix_expansionhttp://www.isi.edu/gaia/events/
3rdfprefix_expansionhttp://www.w3.org/1999/02/22-rdf-syntax-ns#
4ontprefix_expansionhttps://tac.nist.gov/tracks/SM-KBP/2019/ontolo...
5rpiprefix_expansionhttp://www.rpi.edu/
6xml-schema-typeprefix_expansionhttp://www.w3.org/2001/XMLSchema#
7columbiaprefix_expansionhttp://www.columbia.edu/
8isiprefix_expansionhttp://www.isi.edu/
9isi1prefix_expansionwww.isi.edu/
10irisprefix_expansionhttp://www.usc.edu/AIDA/IRIS/Systems/
11rpiprefix_expansionhttp://www.rpi.edu/
12rpi1prefix_expansionhttp://www.rpi.edu
13rpi2prefix_expansionhttp://www.rpi.edu-projectToSingleton
14ldcprefix_expansionhttps://tac.nist.gov/tracks/SM-KBP/2019/ontolo...
\n", "
" ], "text/plain": [ " node1 label \\\n", "0 entity prefix_expansion \n", "1 relation prefix_expansion \n", "2 event prefix_expansion \n", "3 rdf prefix_expansion \n", "4 ont prefix_expansion \n", "5 rpi prefix_expansion \n", "6 xml-schema-type prefix_expansion \n", "7 columbia prefix_expansion \n", "8 isi prefix_expansion \n", "9 isi1 prefix_expansion \n", "10 iris prefix_expansion \n", "11 rpi prefix_expansion \n", "12 rpi1 prefix_expansion \n", "13 rpi2 prefix_expansion \n", "14 ldc prefix_expansion \n", "\n", " node2 \n", "0 http://www.isi.edu/gaia/entities/ \n", "1 http://www.isi.edu/gaia/relations/ \n", "2 http://www.isi.edu/gaia/events/ \n", "3 http://www.w3.org/1999/02/22-rdf-syntax-ns# \n", "4 https://tac.nist.gov/tracks/SM-KBP/2019/ontolo... \n", "5 http://www.rpi.edu/ \n", "6 http://www.w3.org/2001/XMLSchema# \n", "7 http://www.columbia.edu/ \n", "8 http://www.isi.edu/ \n", "9 www.isi.edu/ \n", "10 http://www.usc.edu/AIDA/IRIS/Systems/ \n", "11 http://www.rpi.edu/ \n", "12 http://www.rpi.edu \n", "13 http://www.rpi.edu-projectToSingleton \n", "14 https://tac.nist.gov/tracks/SM-KBP/2019/ontolo... 
" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(\"sample_data/aida/aida-namespaces.tsv\", delimiter='\\t')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Import the AIF triples**" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "!kgtk import-ntriples -i sample_data/aida/HC00001DO.ttl.nt \\\n", " --namespace-file sample_data/aida/aida-namespaces.tsv \\\n", " --namespace-id-use-uuid True \\\n", " --local-namespace-use-uuid False \\\n", " --local-namespace-prefix _ \\\n", " --newnode-use-uuid True \\\n", " / sort \\\n", " > sample_data/aida/results/HC00001DO.ttl.tsv" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rpi:NominalInformativeMention/eec532c4-dc9f-4f42-8a3c-56cd14de8c9d/HC00002Z8/2541/2557/PER\tont:system\trpi:informativejustification\r\n", "rpi:NominalInformativeMention/eec532c4-dc9f-4f42-8a3c-56cd14de8c9d/HC00002Z8/2541/2557/PER\trdf:type\tont:TextJustification\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:confidence\t_:g8310\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:endOffsetInclusive\t4539\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:sourceDocument\t\"HC00001DO\"\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:source\t\"HC00002Z8\"\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:startOffset\t4527\r\n", 
"rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:system\trpi:informativejustification\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\trdf:type\tont:TextJustification\r\n", "xml-schema-type\tprefix_expansion\t\"http://www.w3.org/2001/XMLSchema#\"\r\n" ] } ], "source": [ "!tail sample_data/aida/results/HC00001DO.ttl.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Reified information is cumbersome to work with**" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2
16723_:g10ont:confidence_:g11
16724_:g10ont:justifiedBy_:g12
16725_:g10ont:systemrpi1:
16726_:g10rdf:objectentity:c72e94f4-e4d1-45de-966f-b52cf4d6de5e
16727_:g10rdf:predicateldc:Transaction.TransferOwnership_Artifact
16728_:g10rdf:subjectevent:9100be93-931d-4ee0-89aa-50e7d06f773e
16729_:g10rdf:typerdf:Statement
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "ta1 = pd.read_csv(\"sample_data/aida/results/HC00001DO.ttl.tsv\", delimiter='\\t')\n", "display(HTML(ta1.loc[ta1.node1 =='_:g10'].to_html()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Simplify the KG\n", "\n", "**What we want is an easy-to-understand representation that is close to the diagrams that people want to see**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Undo the reification, and put the justifications as annotations on the semantic edges**" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "!kgtk unreify-rdf-statements -i sample_data/aida/results/HC00001DO.ttl.tsv \\\n", " / sort --columns 1,2 \\\n", " > sample_data/aida/results/HC00001DO.ttl.unreified.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Events now have direct edges to the role fillers (orange diamonds), the justifications are in the id object**" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2id
17054event:fd2323ad-b9c6-4b57-9228-8579b52475c8ldc:Life.Die_Placeentity:584ecaed-6832-489c-8e45-2e63a460ab90_:g3162
17055event:fd2323ad-b9c6-4b57-9228-8579b52475c8ldc:Life.Die_Victimentity:10147d53-19e3-4b20-b144-02077ba0f2ac_:g2654
17056event:fd2323ad-b9c6-4b57-9228-8579b52475c8ldc:Life.Die_Victimentity:fbf6e4a1-54e2-423c-92e2-75b2f2aab53b_:g8555
17057event:fd2323ad-b9c6-4b57-9228-8579b52475c8ont:informativeJustification_:b1233NaN
17058event:fd2323ad-b9c6-4b57-9228-8579b52475c8ont:justifiedBy_:b1113NaN
17059event:fd2323ad-b9c6-4b57-9228-8579b52475c8ont:justifiedBy_:b1366NaN
17060event:fd2323ad-b9c6-4b57-9228-8579b52475c8ont:justifiedBy_:b1367NaN
17061event:fd2323ad-b9c6-4b57-9228-8579b52475c8ont:justifiedBy_:b301NaN
17062event:fd2323ad-b9c6-4b57-9228-8579b52475c8ont:justifiedBy_:b368NaN
17063event:fd2323ad-b9c6-4b57-9228-8579b52475c8ont:justifiedBy_:b642NaN
17064event:fd2323ad-b9c6-4b57-9228-8579b52475c8ont:ldcTime_:g5041NaN
17065event:fd2323ad-b9c6-4b57-9228-8579b52475c8ont:privateData_:g5044NaN
17066event:fd2323ad-b9c6-4b57-9228-8579b52475c8ont:privateData_:g5045NaN
17067event:fd2323ad-b9c6-4b57-9228-8579b52475c8ont:systemrpi1:NaN
17068event:fd2323ad-b9c6-4b57-9228-8579b52475c8rdf:typeldc:Life.Dieisi:gaia/assertions/ef947ce3-6fe8-4f74-bf6c-30...
17069event:fd2323ad-b9c6-4b57-9228-8579b52475c8rdf:typeont:EventNaN
\n", "
" ], "text/plain": [ " node1 \\\n", "17054 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17055 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17056 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17057 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17058 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17059 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17060 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17061 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17062 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17063 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17064 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17065 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17066 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17067 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17068 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "17069 event:fd2323ad-b9c6-4b57-9228-8579b52475c8 \n", "\n", " label \\\n", "17054 ldc:Life.Die_Place \n", "17055 ldc:Life.Die_Victim \n", "17056 ldc:Life.Die_Victim \n", "17057 ont:informativeJustification \n", "17058 ont:justifiedBy \n", "17059 ont:justifiedBy \n", "17060 ont:justifiedBy \n", "17061 ont:justifiedBy \n", "17062 ont:justifiedBy \n", "17063 ont:justifiedBy \n", "17064 ont:ldcTime \n", "17065 ont:privateData \n", "17066 ont:privateData \n", "17067 ont:system \n", "17068 rdf:type \n", "17069 rdf:type \n", "\n", " node2 \\\n", "17054 entity:584ecaed-6832-489c-8e45-2e63a460ab90 \n", "17055 entity:10147d53-19e3-4b20-b144-02077ba0f2ac \n", "17056 entity:fbf6e4a1-54e2-423c-92e2-75b2f2aab53b \n", "17057 _:b1233 \n", "17058 _:b1113 \n", "17059 _:b1366 \n", "17060 _:b1367 \n", "17061 _:b301 \n", "17062 _:b368 \n", "17063 _:b642 \n", "17064 _:g5041 \n", "17065 _:g5044 \n", "17066 _:g5045 \n", "17067 rpi1: \n", "17068 ldc:Life.Die \n", "17069 ont:Event \n", "\n", " id \n", "17054 _:g3162 \n", "17055 _:g2654 \n", "17056 _:g8555 \n", "17057 NaN \n", "17058 NaN \n", "17059 NaN \n", "17060 NaN \n", "17061 NaN \n", "17062 
NaN \n", "17063 NaN \n", "17064 NaN \n", "17065 NaN \n", "17066 NaN \n", "17067 NaN \n", "17068 isi:gaia/assertions/ef947ce3-6fe8-4f74-bf6c-30... \n", "17069 NaN " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unreified = pd.read_csv(\"sample_data/aida/results/HC00001DO.ttl.unreified.tsv\", delimiter='\\t')\n", "unreified.loc[unreified.node1 == 'event:fd2323ad-b9c6-4b57-9228-8579b52475c8']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**The relations are also objects with direct links to the entities (green diamonds)**" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2id
54164relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9dldc:Physical.LocatedNear_EntityOrFillerentity:5c64e1a6-d96a-41ef-b584-2c3c30757bf4_:g6297
54165relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9dldc:Physical.LocatedNear_Placeentity:584ecaed-6832-489c-8e45-2e63a460ab90_:g530
54166relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9dont:informativeJustification_:b397NaN
54167relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9dont:justifiedBy_:b1096NaN
54168relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9dont:systemrpi1:NaN
54169relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9drdf:typeldc:Physical.LocatedNearisi:gaia/assertions/4d0acbd2-f7b3-49f8-abc4-84...
54170relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9drdf:typeont:RelationNaN
\n", "
" ], "text/plain": [ " node1 \\\n", "54164 relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9d \n", "54165 relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9d \n", "54166 relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9d \n", "54167 relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9d \n", "54168 relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9d \n", "54169 relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9d \n", "54170 relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9d \n", "\n", " label \\\n", "54164 ldc:Physical.LocatedNear_EntityOrFiller \n", "54165 ldc:Physical.LocatedNear_Place \n", "54166 ont:informativeJustification \n", "54167 ont:justifiedBy \n", "54168 ont:system \n", "54169 rdf:type \n", "54170 rdf:type \n", "\n", " node2 \\\n", "54164 entity:5c64e1a6-d96a-41ef-b584-2c3c30757bf4 \n", "54165 entity:584ecaed-6832-489c-8e45-2e63a460ab90 \n", "54166 _:b397 \n", "54167 _:b1096 \n", "54168 rpi1: \n", "54169 ldc:Physical.LocatedNear \n", "54170 ont:Relation \n", "\n", " id \n", "54164 _:g6297 \n", "54165 _:g530 \n", "54166 NaN \n", "54167 NaN \n", "54168 NaN \n", "54169 isi:gaia/assertions/4d0acbd2-f7b3-49f8-abc4-84... \n", "54170 NaN " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unreified.loc[unreified.node1 == 'relation:4b8f6334-dbc1-4186-8d9e-a04d864d9a9d']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create files to Work in TA2\n", "\n", "**We want Pandas-friendly files, having a single row for each entity, relation and event.**\n", "\n", "For initial analysis, let's remove justifications, etc." 
] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "!kgtk filter \\\n", " --invert \\\n", " -p ';ont:justifiedBy,ont:privateData,ont:system,ont:informativeJustification;' -i sample_data/aida/results/HC00001DO.ttl.unreified.tsv \\\n", " > sample_data/aida/results/HC00001DO.ttl.unreified.nojust.tsv" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rpi:NominalInformativeMention/eec532c4-dc9f-4f42-8a3c-56cd14de8c9d/HC00002Z8/2541/2557/PER\tont:system\trpi:informativejustification\t\r\n", "rpi:NominalInformativeMention/eec532c4-dc9f-4f42-8a3c-56cd14de8c9d/HC00002Z8/2541/2557/PER\trdf:type\tont:TextJustification\t\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:confidence\t_:g8310\t\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:endOffsetInclusive\t4539\t\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:source\t\"HC00002Z8\"\t\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:sourceDocument\t\"HC00001DO\"\t\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:startOffset\t4527\t\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:system\trpi:informativejustification\t\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\trdf:type\tont:TextJustification\t\r\n", "xml-schema-type\tprefix_expansion\t\"http://www.w3.org/2001/XMLSchema#\"\t\r\n" ] } ], 
"source": [ "!tail sample_data/aida/results/HC00001DO.ttl.unreified.nojust.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Split into a separate file for each of entities, relations and events**" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "!kgtk filter -p ';rdf:type;ont:Entity' -i sample_data/aida/results/HC00001DO.ttl.unreified.tsv > sample_data/aida/results/HC00001DO.entity_ids.tsv\n", "!kgtk filter -p ';rdf:type;ont:Event' -i sample_data/aida/results/HC00001DO.ttl.unreified.tsv > sample_data/aida/results/HC00001DO.event_ids.tsv\n", "!kgtk filter -p ';rdf:type;ont:Relation' -i sample_data/aida/results/HC00001DO.ttl.unreified.tsv > sample_data/aida/results/HC00001DO.relation_ids.tsv" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Get all entities from the unreified file\n", "!kgtk ifexists \\\n", " --input-keys node1 \\\n", " --filter-keys node1 \\\n", " --filter-on sample_data/aida/results/HC00001DO.entity_ids.tsv \\\n", " -i sample_data/aida/results/HC00001DO.ttl.unreified.nojust.tsv \\\n", " / sort --columns 1,2 \\\n", " > sample_data/aida/results/HC00001DO.entities.tsv\n", "\n", "# Get all events from the unreified file\n", "!kgtk ifexists \\\n", " --input-keys node1 \\\n", " --filter-keys node1 \\\n", " --filter-on sample_data/aida/results/HC00001DO.event_ids.tsv \\\n", " -i sample_data/aida/results/HC00001DO.ttl.unreified.nojust.tsv \\\n", " / sort --columns 1,2 \\\n", " > sample_data/aida/results/HC00001DO.events.tsv\n", "\n", "# Get all relations from the unreified file\n", "!kgtk ifexists \\\n", " --input-keys node1 \\\n", " --filter-keys node1 \\\n", " --filter-on sample_data/aida/results/HC00001DO.relation_ids.tsv \\\n", " -i sample_data/aida/results/HC00001DO.ttl.unreified.nojust.tsv \\\n", " / sort --columns 1,2 \\\n", " > sample_data/aida/results/HC00001DO.relations.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
"**Little hack: replace ont:hasName and ont:textValue by label**" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "!sed 's/ont:hasName/label/' sample_data/aida/results/HC00001DO.entities.tsv \\\n", " | sed 's/ont:textValue/label/' \\\n", " > sample_data/aida/results/HC00001DO.entities.renamed.tsv " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Remove the type edges as they do not provide useful info (e.g., we know, by construction, the entities file contains entities)**" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "!kgtk filter \\\n", " --invert \\\n", " -p ';;ont:Entity' -i sample_data/aida/results/HC00001DO.entities.renamed.tsv \\\n", " > sample_data/aida/results/HC00001DO.entities.notype.tsv\n", "!kgtk filter \\\n", " --invert \\\n", " -p ';;ont:Relation' -i sample_data/aida/results/HC00001DO.relations.tsv \\\n", " > sample_data/aida/results/HC00001DO.relations.notype.tsv\n", "!kgtk filter \\\n", " --invert \\\n", " -p ';;ont:Event' -i sample_data/aida/results/HC00001DO.events.tsv \\\n", " > sample_data/aida/results/HC00001DO.events.notype.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Let's make a file that has one entity per row\n", "**Start by lifting the labels into a column**" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "!kgtk lift --suppress-empty-columns True -i sample_data/aida/results/HC00001DO.entities.notype.tsv / sort > sample_data/aida/results/HC00001DO.entities.labels.tsv" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2idnode1;labelnode2;label
0_:b0ont:confidence_:g7653NaNNaNNaN
1_:b0ont:endOffsetInclusive4680NaNNaNNaN
2_:b0ont:privateData_:g7654NaNNaNNaN
3_:b0ont:sourceDocumentHC00001DONaNNaNNaN
4_:b0ont:sourceHC00002Z8NaNNaNNaN
.....................
56017rpi:NominalInformativeMention/fbc758f0-d19f-4f...ont:sourceHC00002Z8NaNNaNNaN
56018rpi:NominalInformativeMention/fbc758f0-d19f-4f...ont:startOffset4527NaNNaNNaN
56019rpi:NominalInformativeMention/fbc758f0-d19f-4f...ont:systemrpi:informativejustificationNaNNaNNaN
56020rpi:NominalInformativeMention/fbc758f0-d19f-4f...rdf:typeont:TextJustificationNaNNaNNaN
56021xml-schema-typeprefix_expansionhttp://www.w3.org/2001/XMLSchema#NaNNaNNaN
\n", "

56022 rows × 6 columns

\n", "
" ], "text/plain": [ " node1 \\\n", "0 _:b0 \n", "1 _:b0 \n", "2 _:b0 \n", "3 _:b0 \n", "4 _:b0 \n", "... ... \n", "56017 rpi:NominalInformativeMention/fbc758f0-d19f-4f... \n", "56018 rpi:NominalInformativeMention/fbc758f0-d19f-4f... \n", "56019 rpi:NominalInformativeMention/fbc758f0-d19f-4f... \n", "56020 rpi:NominalInformativeMention/fbc758f0-d19f-4f... \n", "56021 xml-schema-type \n", "\n", " label node2 id \\\n", "0 ont:confidence _:g7653 NaN \n", "1 ont:endOffsetInclusive 4680 NaN \n", "2 ont:privateData _:g7654 NaN \n", "3 ont:sourceDocument HC00001DO NaN \n", "4 ont:source HC00002Z8 NaN \n", "... ... ... ... \n", "56017 ont:source HC00002Z8 NaN \n", "56018 ont:startOffset 4527 NaN \n", "56019 ont:system rpi:informativejustification NaN \n", "56020 rdf:type ont:TextJustification NaN \n", "56021 prefix_expansion http://www.w3.org/2001/XMLSchema# NaN \n", "\n", " node1;label node2;label \n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", "... ... ... 
\n", "56017 NaN NaN \n", "56018 NaN NaN \n", "56019 NaN NaN \n", "56020 NaN NaN \n", "56021 NaN NaN \n", "\n", "[56022 rows x 6 columns]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "entities = pd.read_csv(\"sample_data/aida/results/HC00001DO.entities.labels.tsv\", delimiter='\\t')\n", "entities" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Now lift the LDC link targets into a separate column, this is a bit complicated because of the extra level of reification**" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "!kgtk lift \\\n", " --suppress-empty-columns True \\\n", " --label-value ont:linkTarget \\\n", " --lift-suffix ';temp' \\\n", " --label-file sample_data/aida/results/HC00001DO.ttl.unreified.tsv \\\n", " -i sample_data/aida/results/HC00001DO.entities.labels.tsv \\\n", " / lift \\\n", " --suppress-empty-columns True \\\n", " --label-value ont:link \\\n", " --lift-suffix ';linkTarget' \\\n", " --node2-name 'node2;temp' \\\n", " / sort \\\n", " / remove-columns -c 'node2;temp' \\\n", " > sample_data/aida/results/HC00001DO.entities.labels.linktargets.tsv" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2idnode1;labelnode2;labelnode1;tempnode1;linkTargetnode2;linkTarget
0_:b0ont:confidence_:g7653NaNNaNNaNNaNNaNNaN
1_:b0ont:endOffsetInclusive4680NaNNaNNaNNaNNaNNaN
2_:b0ont:privateData_:g7654NaNNaNNaNNaNNaNNaN
3_:b0ont:sourceDocumentHC00001DONaNNaNNaNNaNNaNNaN
4_:b0ont:sourceHC00002Z8NaNNaNNaNNaNNaNNaN
..............................
55849rpi:NominalInformativeMention/fbc758f0-d19f-4f...ont:sourceHC00002Z8NaNNaNNaNNaNNaNNaN
55850rpi:NominalInformativeMention/fbc758f0-d19f-4f...ont:startOffset4527NaNNaNNaNNaNNaNNaN
55851rpi:NominalInformativeMention/fbc758f0-d19f-4f...ont:systemrpi:informativejustificationNaNNaNNaNNaNNaNNaN
55852rpi:NominalInformativeMention/fbc758f0-d19f-4f...rdf:typeont:TextJustificationNaNNaNNaNNaNNaNNaN
55853xml-schema-typeprefix_expansionhttp://www.w3.org/2001/XMLSchema#NaNNaNNaNNaNNaNNaN
\n", "

55854 rows × 9 columns

\n", "
" ], "text/plain": [ " node1 \\\n", "0 _:b0 \n", "1 _:b0 \n", "2 _:b0 \n", "3 _:b0 \n", "4 _:b0 \n", "... ... \n", "55849 rpi:NominalInformativeMention/fbc758f0-d19f-4f... \n", "55850 rpi:NominalInformativeMention/fbc758f0-d19f-4f... \n", "55851 rpi:NominalInformativeMention/fbc758f0-d19f-4f... \n", "55852 rpi:NominalInformativeMention/fbc758f0-d19f-4f... \n", "55853 xml-schema-type \n", "\n", " label node2 id \\\n", "0 ont:confidence _:g7653 NaN \n", "1 ont:endOffsetInclusive 4680 NaN \n", "2 ont:privateData _:g7654 NaN \n", "3 ont:sourceDocument HC00001DO NaN \n", "4 ont:source HC00002Z8 NaN \n", "... ... ... ... \n", "55849 ont:source HC00002Z8 NaN \n", "55850 ont:startOffset 4527 NaN \n", "55851 ont:system rpi:informativejustification NaN \n", "55852 rdf:type ont:TextJustification NaN \n", "55853 prefix_expansion http://www.w3.org/2001/XMLSchema# NaN \n", "\n", " node1;label node2;label node1;temp node1;linkTarget node2;linkTarget \n", "0 NaN NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN NaN \n", "... ... ... ... ... ... 
\n", "55849 NaN NaN NaN NaN NaN \n", "55850 NaN NaN NaN NaN NaN \n", "55851 NaN NaN NaN NaN NaN \n", "55852 NaN NaN NaN NaN NaN \n", "55853 NaN NaN NaN NaN NaN \n", "\n", "[55854 rows x 9 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "entities = pd.read_csv(\"sample_data/aida/results/HC00001DO.entities.labels.linktargets.tsv\", delimiter='\\t')\n", "entities" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Fraction of rows with a non-null value in each column (e.g., how many rows have labels or link targets)**" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "node1 1.000\n", "label 1.000\n", "node2 1.000\n", "id 0.090\n", "node1;label 0.015\n", "node2;label 0.007\n", "node1;temp 0.005\n", "node1;linkTarget 0.006\n", "node2;linkTarget 0.004\n", "dtype: float64" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "((entities.shape[0]-entities.isnull().sum())/entities.shape[0]).round(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Distribution of node2 values (all edge targets, not only rdf:type values)**" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "rpi1: 6294\n", "ont:Confidence 4011\n", "ont:PrivateData 3391\n", "rpi:fileType 2161\n", "{\\fileType\\\":\\\"en\\\"}\" 2161\n", " ... 
\n", "_:g6019 1\n", "_:g9082 1\n", "_:g787 1\n", "_:g1093 1\n", "_:g5691 1\n", "Name: node2, Length: 13836, dtype: int64" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "entities['node2'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Add the labels of the entities to the event file**" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "!kgtk filter \\\n", " -p ';label;' -i sample_data/aida/results/HC00001DO.entities.renamed.tsv \\\n", " > sample_data/aida/results/HC00001DO.entities.renamed.labels.tsv" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "!kgtk join --left-file sample_data/aida/results/HC00001DO.events.notype.tsv --right-file sample_data/aida/results/HC00001DO.entities.renamed.labels.tsv \\\n", " --left-join \\\n", " --left-file-join-columns node2 \\\n", " --right-file-join-columns node1 \\\n", " / lift --suppress-empty-columns \\\n", " > sample_data/aida/results/HC00001DO.events.notype.entity-labels.tsv" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2idnode1;labelnode2;label
0_:b0ont:confidence_:g7653NaNNaNNaN
1_:b0ont:endOffsetInclusive4680NaNNaNNaN
2_:b0ont:privateData_:g7654NaNNaNNaN
3_:b0ont:sourceHC00002Z8NaNNaNNaN
4_:b0ont:sourceDocumentHC00001DONaNNaNNaN
5_:b0ont:startOffset4679NaNNaNNaN
6_:b0ont:systemrpi1:NaNNaNNaN
7_:b0rdf:typeont:TextJustificationNaNNaNNaN
8_:b1ont:confidence_:b1530NaNNaNNaN
9_:b1ont:endOffsetInclusive4680NaNNaNNaN
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "events = pd.read_csv(\"sample_data/aida/results/HC00001DO.events.notype.entity-labels.tsv\", delimiter='\\t')\n", "display(HTML(events[:10].to_html()))" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "entity:9911ecfc-e6b9-41a4-a488-30075e439aa8 74\n", "entity:fcb78e77-4962-4fca-977b-aea84bfa3ddd 61\n", "entity:8e97e2c0-5ed1-4ae3-81bc-f66cedd2d8e5 46\n", "entity:c32bb2f7-eb58-4612-b101-dbfcee3e84ae 43\n", "entity:79969b4c-cf9e-4eb7-8123-c7714e087454 42\n", "event:519cf108-2005-4d3d-b82c-a4309db8992e 40\n", "event:dab890d7-aa46-4e1f-9309-e7c2834a164d 38\n", "entity:5d6629ee-be36-4445-8a35-3be47b8ee97a 36\n", "entity:bb729095-2592-4e3d-aa40-cf1a48b01383 36\n", "entity:d1dcefce-badf-4948-bfcf-5d33116fa12c 35\n", "Name: node1, dtype: int64" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "events['node1'].value_counts()[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Work with clusters" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "!kgtk filter -p ';ont:clusterMember;' -i sample_data/aida/results/HC00001DO.ttl.unreified.tsv > sample_data/aida/results/HC00001DO.ttl.clusters.tsv" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "!kgtk join --left-file sample_data/aida/results/HC00001DO.ttl.clusters.tsv --right-file sample_data/aida/results/HC00001DO.entities.notype.tsv \\\n", " --left-file-join-columns node2 \\\n", " --right-file-join-columns node1 \\\n", " > sample_data/aida/results/HC00001DO.cluster.ids.entities.tsv \n", "!kgtk join --left-file sample_data/aida/results/HC00001DO.ttl.clusters.tsv --right-file sample_data/aida/results/HC00001DO.relations.notype.tsv \\\n", " --left-file-join-columns node2 \\\n", " --right-file-join-columns node1 \\\n", " > 
sample_data/aida/results/HC00001DO.cluster.ids.relations.tsv \n", "!kgtk join --left-file sample_data/aida/results/HC00001DO.ttl.clusters.tsv --right-file sample_data/aida/results/HC00001DO.events.notype.tsv \\\n", " --left-file-join-columns node2 \\\n", " --right-file-join-columns node1 \\\n", " > sample_data/aida/results/HC00001DO.cluster.ids.events.tsv " ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "!kgtk ifexists \\\n", " --input-keys node1 \\\n", " --filter-keys node1 \\\n", " --filter-on sample_data/aida/results/HC00001DO.cluster.ids.entities.tsv \\\n", " -i sample_data/aida/results/HC00001DO.ttl.unreified.tsv \\\n", " > sample_data/aida/results/HC00001DO.cluster.entities.tsv \n", "!kgtk ifexists \\\n", " --input-keys node1 \\\n", " --filter-keys node1 \\\n", " --filter-on sample_data/aida/results/HC00001DO.cluster.ids.relations.tsv \\\n", " -i sample_data/aida/results/HC00001DO.ttl.unreified.tsv \\\n", " > sample_data/aida/results/HC00001DO.cluster.relations.tsv \n", "!kgtk ifexists \\\n", " --input-keys node1 \\\n", " --filter-keys node1 \\\n", " --filter-on sample_data/aida/results/HC00001DO.cluster.ids.events.tsv \\\n", " -i sample_data/aida/results/HC00001DO.ttl.unreified.tsv \\\n", " > sample_data/aida/results/HC00001DO.cluster.events.tsv " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create an edge file with ids to load in Wikidata SPARQL and browse using SQID" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "!kgtk add-id -i sample_data/aida/results/HC00001DO.ttl.unreified.tsv > sample_data/aida/results/HC00001DO.ttl.unreified.ids.tsv" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rpi:NominalInformativeMention/eec532c4-dc9f-4f42-8a3c-56cd14de8c9d/HC00002Z8/2541/2557/PER\tont:system\trpi:informativejustification\tE51098\r\n", 
"rpi:NominalInformativeMention/eec532c4-dc9f-4f42-8a3c-56cd14de8c9d/HC00002Z8/2541/2557/PER\trdf:type\tont:TextJustification\tE51099\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:confidence\t_:g8310\tE51100\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:endOffsetInclusive\t4539\tE51101\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:source\t\"HC00002Z8\"\tE51102\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:sourceDocument\t\"HC00001DO\"\tE51103\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:startOffset\t4527\tE51104\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\tont:system\trpi:informativejustification\tE51105\r\n", "rpi:NominalInformativeMention/fbc758f0-d19f-4faa-bb06-d042f7884144/HC00002Z8/4527/4539/ORG_CommercialOrganization_NewsAgency\trdf:type\tont:TextJustification\tE51106\r\n", "xml-schema-type\tprefix_expansion\t\"http://www.w3.org/2001/XMLSchema#\"\tE51107\r\n" ] } ], "source": [ "!tail sample_data/aida/results/HC00001DO.ttl.unreified.ids.tsv" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "HC00001DO.cluster.entities.tsv\r\n", "HC00001DO.cluster.events.tsv\r\n", "HC00001DO.cluster.ids.entities.tsv\r\n", "HC00001DO.cluster.ids.events.tsv\r\n", "HC00001DO.cluster.ids.relations.tsv\r\n", "HC00001DO.cluster.relations.tsv\r\n", "HC00001DO.entities.labels.linktargets.tsv\r\n", "HC00001DO.entities.labels.tsv\r\n", 
"HC00001DO.entities.notype.tsv\r\n", "HC00001DO.entities.renamed.labels.tsv\r\n", "HC00001DO.entities.renamed.tsv\r\n", "HC00001DO.entities.tsv\r\n", "HC00001DO.entity_ids.tsv\r\n", "HC00001DO.event_ids.tsv\r\n", "HC00001DO.events.notype.entity-labels.tsv\r\n", "HC00001DO.events.notype.tsv\r\n", "HC00001DO.events.tsv\r\n", "HC00001DO.relation_ids.tsv\r\n", "HC00001DO.relations.notype.tsv\r\n", "HC00001DO.relations.tsv\r\n", "HC00001DO.ttl.clusters.tsv\r\n", "HC00001DO.ttl.tsv\r\n", "HC00001DO.ttl.unreified.ids.tsv\r\n", "HC00001DO.ttl.unreified.nojust.tsv\r\n", "HC00001DO.ttl.unreified.tsv\r\n" ] } ], "source": [ "# Show all results in results folder created in this tutorial\n", "!ls sample_data/aida/results/" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Read KGTK results into lines and directly into Pandas\n", "# lines = !kgtk filter -p ';prefix_expansion;' -i ta1/HC00001DO/HC00001DO.ttl.tsv\n", "# pd.read_csv(io.StringIO('\\n'.join(lines)), delimiter='\\t')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }