{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Use Case: A Knowledge Graph About Alcoholic Beverages\n", "We are going to build a small KG about alcoholic beverages by extracting from Wikidata the subgraph that relates to alcoholic beverages (https://www.wikidata.org/wiki/Q154)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ALIAS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/aliases.en.tsv.gz\"\n", "ALL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/all.tsv.gz\"\n", "CLAIMS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/claims.tsv.gz\"\n", "DESCRIPTION: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/descriptions.en.tsv.gz\"\n", "EXAMPLES_DIR: \"/Users/pedroszekely/Documents/GitHub/kgtk/examples\"\n", "GE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp/graph-embedding\"\n", "ISA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.isa.tsv.gz\"\n", "ITEM: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/claims.wikibase-item.tsv.gz\"\n", "KGTK_PATH: \"/Users/pedroszekely/Documents/GitHub/kgtk\"\n", "LABEL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/labels.en.tsv.gz\"\n", "OUT: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output\"\n", "P279: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.P279.tsv.gz\"\n", "P279STAR: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.P279star.tsv.gz\"\n", "PROPERTY_DATATYPES: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/metadata.property.datatypes.tsv.gz\"\n", "Q154ALIAS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/aliases.en.tsv.gz\"\n", "Q154ALL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/all.tsv.gz\"\n", "Q154CLAIMS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/claims.tsv.gz\"\n", "Q154DESCRIPTION: 
\"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/descriptions.en.tsv.gz\"\n", "Q154ISA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.isa.tsv.gz\"\n", "Q154ITEM: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/claims.wikibase-item.tsv.gz\"\n", "Q154LABEL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/labels.en.tsv.gz\"\n", "Q154P279: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.P279.tsv.gz\"\n", "Q154P279STAR: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.P279star.tsv.gz\"\n", "Q154PROPERTY_DATATYPES: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/metadata.property.datatypes.tsv.gz\"\n", "Q154QUALIFIERS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/qualifiers.tsv.gz\"\n", "Q154QUALIFIERS_TIME: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/qualifiers.time.tsv.gz\"\n", "Q154SITELINKS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/sitelinks.tsv.gz\"\n", "QUALIFIERS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/qualifiers.tsv.gz\"\n", "QUALIFIERS_TIME: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/qualifiers.time.tsv.gz\"\n", "SITELINKS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/sitelinks.tsv.gz\"\n", "STORE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/wikidata.sqlite3.miniwikidata.db\"\n", "TE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp/text-embedding\"\n", "TEMP: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp\"\n", "USECASE_DIR: \"/Users/pedroszekely/Documents/GitHub/kgtk/use-cases\"\n", "WIKIDATA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/\"\n", "kgtk: \"kgtk --debug\"\n", "kypher: \"kgtk query --graph-cache /Users/pedroszekely/Downloads/kgtk-tutorial/wikidata.sqlite3.miniwikidata.db\"\n" ] } ], "source": [ "import sys \n", "sys.path.insert(0, 'tutorial')\n", "from tutorial_setup import *" ] }, { "cell_type": "code", "execution_count": 2, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/pedroszekely/Downloads/kgtk-tutorial\n" ] } ], "source": [ "%cd {output_path}" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "!mkdir -p {output_folder}\n", "!mkdir -p {temp_folder}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 1: create a list of all descendants of `alcoholic beverage` (https://www.wikidata.org/wiki/Q154)\n", "Here is some of the information about `Q154` in Wikidata:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labellabel;labelnode2node2;label
0Q154P1151'topic\\\\'s main Wikimedia portal'@enQ61473108'Portal:Alcoholic drinks'@en
1Q154P1343'described by source'@enQ1768721'Gujin Tushu Jicheng'@en
2Q154P1424'topic\\\\'s main template'@enQ10557691'Template:Infobox alcoholic beverage'@en
3Q154P1552'has quality'@enQ1517187'bitterness'@en
4Q154P2354'has list'@enQ2468826'list of alcoholic beverages'@en
5Q154P279'subclass of'@enQ40050'drink'@en
6Q154P31'instance of'@enQ187661'carcinogen'@en
7Q154P31'instance of'@enQ8386'drug'@en
8Q154P452'industry'@enQ3150593'alcohol industry'@en
9Q154P461'opposite of'@enQ2647467'non-alcoholic beverage'@en
10Q154P910'topic\\\\'s main category'@enQ7214082'Category:Alcoholic drinks'@en
\n", "
" ], "text/plain": [ " node1 label label;label node2 \\\n", "0 Q154 P1151 'topic\\\\'s main Wikimedia portal'@en Q61473108 \n", "1 Q154 P1343 'described by source'@en Q1768721 \n", "2 Q154 P1424 'topic\\\\'s main template'@en Q10557691 \n", "3 Q154 P1552 'has quality'@en Q1517187 \n", "4 Q154 P2354 'has list'@en Q2468826 \n", "5 Q154 P279 'subclass of'@en Q40050 \n", "6 Q154 P31 'instance of'@en Q187661 \n", "7 Q154 P31 'instance of'@en Q8386 \n", "8 Q154 P452 'industry'@en Q3150593 \n", "9 Q154 P461 'opposite of'@en Q2647467 \n", "10 Q154 P910 'topic\\\\'s main category'@en Q7214082 \n", "\n", " node2;label \n", "0 'Portal:Alcoholic drinks'@en \n", "1 'Gujin Tushu Jicheng'@en \n", "2 'Template:Infobox alcoholic beverage'@en \n", "3 'bitterness'@en \n", "4 'list of alcoholic beverages'@en \n", "5 'drink'@en \n", "6 'carcinogen'@en \n", "7 'drug'@en \n", "8 'alcohol industry'@en \n", "9 'non-alcoholic beverage'@en \n", "10 'Category:Alcoholic drinks'@en " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = !$kypher -i claims -i labels \\\n", "--match 'claims: (n1:Q154)-[l {label:p}]->(n2), label: (n2)-[]->(n2_label), label: (p)-[]->(p_label)' \\\n", "--return 'n1 as node1, l.label as label, p_label as `label;label`, n2 as node2, n2_label as `node2;label`' \\\n", "--order-by 'l.label'\n", "\n", "\n", "kgtk_to_dataframe(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Wikidata uses two properties to organize entities in a hierarchy: the `instance of` property (`P31`) and the `subclass of` (`P279`) property. In many cases, the distinction between instance of and subclass of is subtle, and we find many situations in Wikidata where either one or the other is used to organize hierarchies. 
For this reason, we created a new property called `isa` that contains the union of `P31` and `P279` and stored it in the file `derived.isa.tsv`." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2
0P10isaQ18610173
1P1000isaQ18608871
2P1001isaQ15720608
3P1001isaQ22984026
\n", "
" ], "text/plain": [ " node1 label node2\n", "0 P10 isa Q18610173\n", "1 P1000 isa Q18608871\n", "2 P1001 isa Q15720608\n", "3 P1001 isa Q22984026" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = !zcat < \"$ISA\" | head -5\n", "kgtk_to_dataframe(lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To get all the alcoholic beverages, we need to get all entities that are `isa` of alcoholic beverage (`Q154`) or that are `isa` of any descendant of `Q154` in the `subclass of` (`P279`) hierarchy. The length of the chain of `P279` edges can be arbitrarily long. To support this use case, KGTK offers the `derived.P279star.tsv` file that contains edges `n1/P279star/n2` if `n1` is a descendant of `n2` on chains of `P279` edges, including chains of zero length (`n1/P279star/n1`)." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2id
0Q100000030P279starQ100000030Q100000030-P279star-Q100000030-0000
1Q100000030P279starQ1357761Q100000030-P279star-Q1357761-0000
2Q100000030P279starQ14745Q100000030-P279star-Q14745-0000
3Q100000030P279starQ14748Q100000030-P279star-Q14748-0000
\n", "
" ], "text/plain": [ " node1 label node2 id\n", "0 Q100000030 P279star Q100000030 Q100000030-P279star-Q100000030-0000\n", "1 Q100000030 P279star Q1357761 Q100000030-P279star-Q1357761-0000\n", "2 Q100000030 P279star Q14745 Q100000030-P279star-Q14745-0000\n", "3 Q100000030 P279star Q14748 Q100000030-P279star-Q14748-0000" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = !zcat < \"$P279STAR\" | head -5 \n", "kgtk_to_dataframe(lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To get all alcoholic beverages, we need to find all nodes `n1` that are connected to `Q154` with an `isa` edge and a chain of `P279` edges:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$ISA\" --as \"isa\" -i \"$P279STAR\" --as \"p279star\" -i labels \\\n", "--match 'isa: (n1)-[]->(n2), star: (n2)-[]->(n3:Q154), label: (n1)-[]->(n1l)' \\\n", "--return 'n1 as node1, n1l as `node1;label`, n3 as node2, \"isastar\" as label' \\\n", "-o \"$TEMP\"/Q154.descendant.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is a sample of alcoholic beverages in Wikidata" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node1;labelnode2label
0Q1350656'corn whiskey'@enQ154isastar
1Q20713240'Buckwheat whisky'@enQ154isastar
2Q2535077'rye whiskey'@enQ154isastar
3Q536976'Canadian whisky'@enQ154isastar
4Q7991845'wheat whiskey'@enQ154isastar
...............
3346Q7719471'The Botanist'@enQ154isastar
3347Q187155'Tanqueray'@enQ154isastar
3348Q62076228'dry gin'@enQ154isastar
3349Q7085234'Old Tom Gin'@enQ154isastar
3350Q891834'Bombay Sapphire'@enQ154isastar
\n", "

3351 rows × 4 columns

\n", "
" ], "text/plain": [ " node1 node1;label node2 label\n", "0 Q1350656 'corn whiskey'@en Q154 isastar\n", "1 Q20713240 'Buckwheat whisky'@en Q154 isastar\n", "2 Q2535077 'rye whiskey'@en Q154 isastar\n", "3 Q536976 'Canadian whisky'@en Q154 isastar\n", "4 Q7991845 'wheat whiskey'@en Q154 isastar\n", "... ... ... ... ...\n", "3346 Q7719471 'The Botanist'@en Q154 isastar\n", "3347 Q187155 'Tanqueray'@en Q154 isastar\n", "3348 Q62076228 'dry gin'@en Q154 isastar\n", "3349 Q7085234 'Old Tom Gin'@en Q154 isastar\n", "3350 Q891834 'Bombay Sapphire'@en Q154 isastar\n", "\n", "[3351 rows x 4 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = !cat \"$TEMP\"/Q154.descendant.tsv \n", "kgtk_to_dataframe(lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The computation of `Q154.descendant.tsv` can be implemented in SPARQL using the common `P31/P279*` graph pattern, but the query will time out if the result size is large. For example, the query will time out when requesting all descendants of chemical compounds, as there are over one million chemical compounds in Wikidata. The query can be easily done in KGTK." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 2: get the incoming and outgoing edges\n", "We want our graph to have the neighbors of all alcoholic beverages, so we need to get the incoming and outgoing edges.\n", "\n", "The following query gets the **outgoing** edges." 
] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "!$kypher -i claims -i \"$TEMP\"/Q154.descendant.tsv \\\n", "--match 'Q154: (n1)-[]->(), claims: (n1)-[l]->(n2)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.node1.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see that we are getting several properties for our items:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2
0Q1000737-P1435-Q17297633-53903946-0Q1000737P1435Q17297633
1Q1000737-P1454-Q460178-8ad4931b-0Q1000737P1454Q460178
2Q1000737-P159-Q16003-31e24011-0Q1000737P159Q16003
3Q1000737-P17-Q183-24107fe2-0Q1000737P17Q183
4Q1000737-P18-147fc9-667304f8-0Q1000737P18\"Marthabräuhalle 2011-04-03.jpg\"
5Q1000737-P31-Q131734-f97bd6f6-0Q1000737P31Q131734
6Q1000737-P31-Q15075508-a4c83928-0Q1000737P31Q15075508
7Q1000737-P373-689157-3110aade-0Q1000737P373\"Marthabräu\"
8Q1000737-P452-Q869095-f5d8e7a2-0Q1000737P452Q869095
\n", "
" ], "text/plain": [ " id node1 label \\\n", "0 Q1000737-P1435-Q17297633-53903946-0 Q1000737 P1435 \n", "1 Q1000737-P1454-Q460178-8ad4931b-0 Q1000737 P1454 \n", "2 Q1000737-P159-Q16003-31e24011-0 Q1000737 P159 \n", "3 Q1000737-P17-Q183-24107fe2-0 Q1000737 P17 \n", "4 Q1000737-P18-147fc9-667304f8-0 Q1000737 P18 \n", "5 Q1000737-P31-Q131734-f97bd6f6-0 Q1000737 P31 \n", "6 Q1000737-P31-Q15075508-a4c83928-0 Q1000737 P31 \n", "7 Q1000737-P373-689157-3110aade-0 Q1000737 P373 \n", "8 Q1000737-P452-Q869095-f5d8e7a2-0 Q1000737 P452 \n", "\n", " node2 \n", "0 Q17297633 \n", "1 Q460178 \n", "2 Q16003 \n", "3 Q183 \n", "4 \"Marthabräuhalle 2011-04-03.jpg\" \n", "5 Q131734 \n", "6 Q15075508 \n", "7 \"Marthabräu\" \n", "8 Q869095 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = !zcat < \"$TEMP\"/Q154.node1.tsv.gz | head \n", "kgtk_to_dataframe(lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now get the **incoming** edges:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "!$kypher -i claims -i \"$TEMP\"/Q154.descendant.tsv \\\n", "--match 'Q154: (n1)-[]->(), claims: (n3)-[l]->(n1)' \\\n", "--return 'distinct l as id, n3 as node1, l.label as label, n1 as node2' \\\n", "-o \"$TEMP\"/Q154.node2.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is a sample of the edges we are getting" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2
0Q1350656-P279-Q1007164-7e3ecba9-0Q1350656P279Q1007164
1Q20713240-P279-Q1007164-b3112260-0Q20713240P279Q1007164
2Q2535077-P279-Q1007164-b2d3684b-0Q2535077P279Q1007164
3Q536976-P279-Q1007164-8bf7467b-0Q536976P279Q1007164
4Q7991845-P279-Q1007164-18bc383a-0Q7991845P279Q1007164
5Q10337004-P186-Q10210-c56dd7ce-0Q10337004P186Q10210
6Q10429117-P31-Q10210-d342f061-0Q10429117P31Q10210
7Q1051699-P279-Q10210-65d32c67-0Q1051699P279Q10210
8Q1058259-P279-Q10210-e204554a-0Q1058259P279Q10210
\n", "
" ], "text/plain": [ " id node1 label node2\n", "0 Q1350656-P279-Q1007164-7e3ecba9-0 Q1350656 P279 Q1007164\n", "1 Q20713240-P279-Q1007164-b3112260-0 Q20713240 P279 Q1007164\n", "2 Q2535077-P279-Q1007164-b2d3684b-0 Q2535077 P279 Q1007164\n", "3 Q536976-P279-Q1007164-8bf7467b-0 Q536976 P279 Q1007164\n", "4 Q7991845-P279-Q1007164-18bc383a-0 Q7991845 P279 Q1007164\n", "5 Q10337004-P186-Q10210-c56dd7ce-0 Q10337004 P186 Q10210\n", "6 Q10429117-P31-Q10210-d342f061-0 Q10429117 P31 Q10210\n", "7 Q1051699-P279-Q10210-65d32c67-0 Q1051699 P279 Q10210\n", "8 Q1058259-P279-Q10210-e204554a-0 Q1058259 P279 Q10210" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = !zcat < \"$TEMP\"/Q154.node2.tsv.gz | head\n", "kgtk_to_dataframe(lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate the incoming and outgoing edges to put them in a single file:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.node1.tsv.gz -i \"$TEMP\"/Q154.node2.tsv.gz -o \"$TEMP\"/Q154.claims.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "See how many edges we have:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 31945 131399 1801186\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.claims.tsv.gz | wc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Summary of where we are:\n", "- Computed the list of q-nodes below alcoholic beverage\n", "- Found all incoming and outgoing edges to these q-nodes; for the new q-nodes we bring in, we have no information, we only have the q-node\n", "\n", "We have the q-nodes connected to alcoholic beverages, but so far we don't have the edges of those q-nodes. We need to go one hop out from the q-nodes that we have. 
We run a query to go one hop out from any qnode in `Q154.claims.tsv` which will use all the q-nodes in our graph, including the alcoholic beverages for which we already got outgoing edges; no harm done, as we can eliminate duplicates later." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "!$kypher -i claims -i \"$TEMP\"/Q154.claims.tsv.gz \\\n", "--match 'Q154: ()-[]->(n1), claims: (n1)-[l]->(n2)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.hop.out.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For sanity check, let's take a peek:" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2
0Q1000-P1014-693343-9664fa33-0Q1000P1014\"300262648\"
1Q1000-P1036-9bef62-f77ac5cf-0Q1000P1036\"2--6721\"
2Q1000-P1081-0d345f-3a33abf5-0Q1000P1081+0.641
3Q1000-P1081-0d345f-6da37c02-0Q1000P1081+0.641
4Q1000-P1081-1100e3-c7631769-0Q1000P1081+0.624
5Q1000-P1081-1ada51-7c71c229-0Q1000P1081+0.639
6Q1000-P1081-345681-88a99cab-0Q1000P1081+0.702
7Q1000-P1081-347db1-da0e5e03-0Q1000P1081+0.637
8Q1000-P1081-419245-b03a8b59-0Q1000P1081+0.647
\n", "
" ], "text/plain": [ " id node1 label node2\n", "0 Q1000-P1014-693343-9664fa33-0 Q1000 P1014 \"300262648\"\n", "1 Q1000-P1036-9bef62-f77ac5cf-0 Q1000 P1036 \"2--6721\"\n", "2 Q1000-P1081-0d345f-3a33abf5-0 Q1000 P1081 +0.641\n", "3 Q1000-P1081-0d345f-6da37c02-0 Q1000 P1081 +0.641\n", "4 Q1000-P1081-1100e3-c7631769-0 Q1000 P1081 +0.624\n", "5 Q1000-P1081-1ada51-7c71c229-0 Q1000 P1081 +0.639\n", "6 Q1000-P1081-345681-88a99cab-0 Q1000 P1081 +0.702\n", "7 Q1000-P1081-347db1-da0e5e03-0 Q1000 P1081 +0.637\n", "8 Q1000-P1081-419245-b03a8b59-0 Q1000 P1081 +0.647" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = !zcat < \"$TEMP\"/Q154.hop.out.tsv.gz | head \n", "kgtk_to_dataframe(lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's consolidate our edge files into one larger file. We use compact to remove duplicates and sort to keep edges for the same subject together:" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.claims.tsv.gz -i \"$TEMP\"/Q154.hop.out.tsv.gz \\\n", "/ compact \\\n", "/ sort2 \\\n", "-o \"$TEMP\"/Q154.edges.1.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "See how many edges we have:" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 159073 655180 8549211\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.edges.1.tsv.gz | wc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Take a peek:" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2
0P1389-P1855-Q1109662-9e2ef218-0P1389P1855Q1109662
1P1582-P1855-Q17329207-f4ef508d-0P1582P1855Q17329207
2P2581-P1855-Q7639844-08b3a4c7-0P2581P1855Q7639844
3P2665-P1855-Q1067702-402a80a9-0P2665P1855Q1067702
4P2665-P1855-Q170210-30d44f0b-0P2665P1855Q170210
5P5420-P1855-Q44-209cffb1-0P5420P1855Q44
6P5420-P1855-Q722338-73d7be75-0P5420P1855Q722338
7P5471-P1855-Q44-6c38949b-0P5471P1855Q44
8P6088-P1855-Q1543214-3d934541-0P6088P1855Q1543214
\n", "
" ], "text/plain": [ " id node1 label node2\n", "0 P1389-P1855-Q1109662-9e2ef218-0 P1389 P1855 Q1109662\n", "1 P1582-P1855-Q17329207-f4ef508d-0 P1582 P1855 Q17329207\n", "2 P2581-P1855-Q7639844-08b3a4c7-0 P2581 P1855 Q7639844\n", "3 P2665-P1855-Q1067702-402a80a9-0 P2665 P1855 Q1067702\n", "4 P2665-P1855-Q170210-30d44f0b-0 P2665 P1855 Q170210\n", "5 P5420-P1855-Q44-209cffb1-0 P5420 P1855 Q44\n", "6 P5420-P1855-Q722338-73d7be75-0 P5420 P1855 Q722338\n", "7 P5471-P1855-Q44-6c38949b-0 P5471 P1855 Q44\n", "8 P6088-P1855-Q1543214-3d934541-0 P6088 P1855 Q1543214" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = !zcat < \"$TEMP\"/Q154.edges.1.tsv.gz | head \n", "kgtk_to_dataframe(lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 3: get the ontology\n", "Once we have all the alcoholic beverages, we want to get the upper ontology of all the classes used, so that every class in our KG has a path to the root of the ontology. For example, first go to `drink` (`Q40050`), then to `liquid` (`Q11435`), then `fluid` (`Q102205`) and so on until we reach `entity` (`Q35120`).\n", "\n", "To do this, we need to get all the `isa` of all items in our graph, then get `P279star` so we get the list of all classes that these items descend from. Finally we need to get all the `P279` edges between them." 
] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$TEMP\"/Q154.edges.1.tsv.gz -i p279star -i isa \\\n", "--match 'Q154: (n1)-[]->(), isa: (n1)-[]->(n2), p279star: (n2)-[]->(class)' \\\n", "--return 'distinct class as node1' \\\n", "-o \"$TEMP\"/Q154.classes.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "See how many classes we have in the upper ontology for the entities in our graph:" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2791 2791 24573 /Users/pedroszekely/Downloads/kgtk-tutorial/temp/Q154.classes.tsv\n" ] } ], "source": [ "!wc \"$TEMP\"/Q154.classes.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check that `fluid` (`Q102205`) is listed in the classes:" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Q102205\n" ] } ], "source": [ "!grep Q102205 \"$TEMP\"/Q154.classes.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now use the `derived.P279.tsv` file to get the `P279` edges that connect a class to its superclass." 
] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$TEMP\"/Q154.classes.tsv -i \"$P279\" --as \"p279\" \\\n", "--match 'Q154: (class)-[]->(), p279: (class)-[l]->(super)' \\\n", "--return 'distinct l as id, class as node1, l.label as label, super as node2' \\\n", "-o \"$TEMP\"/Q154.P279.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "See how many `P279` edges are in the upper ontology; we will take care of potential duplicates at a final cleanup step:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 4428 17712 245148 /Users/pedroszekely/Downloads/kgtk-tutorial/temp/Q154.P279.tsv\n" ] } ], "source": [ "!wc \"$TEMP\"/Q154.P279.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see several q-nodes below `entity` (`Q35120`), a good indication that we computed the upper ontology correctly:" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Q488383-P279-Q35120-5fad2ad7-0 Q488383 P279 Q35120\n", "Q58415929-P279-Q35120-75659d0c-0 Q58415929 P279 Q35120\n", "Q99527517-P279-Q35120-562a6511-0 Q99527517 P279 Q35120\n", "Q16686448-P279-Q35120-674edbf9-0 Q16686448 P279 Q35120\n", "Q23958946-P279-Q35120-70a9ed90-0 Q23958946 P279 Q35120\n" ] } ], "source": [ "!grep Q35120 \"$TEMP\"/Q154.P279.tsv | head -5 | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's consolidate the edges again:" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.edges.1.tsv.gz -i \"$TEMP\"/Q154.P279.tsv \\\n", "/ compact \\\n", "/ sort2 \\\n", "-o \"$TEMP\"/Q154.edges.2.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The number of edges is growing:" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 162839 670244 8758729\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.edges.2.tsv.gz | wc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Summary:\n", "- We have the instances of alcoholic beverages\n", "- We added incoming and outgoing edges\n", "- For all the q-nodes in the previous step, we went one hop forward\n", "- We got the upper ontology" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 4: get the edges for properties\n", "The properties are also items in Wikidata, so let's collect them all and get their edges." ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$TEMP\"/Q154.edges.2.tsv.gz \\\n", "--match '()-[l {label: property}]->()' \\\n", "--return 'distinct property as node1' \\\n", "-o \"$TEMP\"/Q154.properties.tsv" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\n", "P10\n", "P1001\n", "P1004\n", "P1005\n", "P101\n", "P1014\n", "P1015\n", "P1017\n", "P1019\n" ] } ], "source": [ "!head \"$TEMP\"/Q154.properties.tsv | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's get the edges of these properties:" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "!$kypher -i claims -i \"$TEMP\"/Q154.properties.tsv \\\n", "--match 'Q154: (p)-[]->(), claims: (p)-[l]->(n2)' \\\n", "--return 'distinct l as id, p as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.properties.edges.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the resulting file, `node1` is a property and now we have data about them:" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2
0P10-P1628-32b85d-7927ece6-0P10P1628\"http://www.w3.org/2006/vcard/ns#Video\"
1P10-P1628-acf60d-b8950832-0P10P1628\"https://schema.org/video\"
2P10-P1629-Q34508-bcc39400-0P10P1629Q34508
3P10-P1659-P1651-c4068028-0P10P1659P1651
4P10-P1659-P18-5e4b9c4f-0P10P1659P18
5P10-P1659-P4238-d21d1ac0-0P10P1659P4238
6P10-P1659-P51-86aca4c5-0P10P1659P51
7P10-P1855-Q7378-555592a4-0P10P1855Q7378
8P10-P2302-Q21502404-d012aef4-0P10P2302Q21502404
\n", "
" ], "text/plain": [ " id node1 label \\\n", "0 P10-P1628-32b85d-7927ece6-0 P10 P1628 \n", "1 P10-P1628-acf60d-b8950832-0 P10 P1628 \n", "2 P10-P1629-Q34508-bcc39400-0 P10 P1629 \n", "3 P10-P1659-P1651-c4068028-0 P10 P1659 \n", "4 P10-P1659-P18-5e4b9c4f-0 P10 P1659 \n", "5 P10-P1659-P4238-d21d1ac0-0 P10 P1659 \n", "6 P10-P1659-P51-86aca4c5-0 P10 P1659 \n", "7 P10-P1855-Q7378-555592a4-0 P10 P1855 \n", "8 P10-P2302-Q21502404-d012aef4-0 P10 P2302 \n", "\n", " node2 \n", "0 \"http://www.w3.org/2006/vcard/ns#Video\" \n", "1 \"https://schema.org/video\" \n", "2 Q34508 \n", "3 P1651 \n", "4 P18 \n", "5 P4238 \n", "6 P51 \n", "7 Q7378 \n", "8 Q21502404 " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = !head \"$TEMP\"/Q154.properties.edges.tsv \n", "kgtk_to_dataframe(lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's consolidate the edges again:" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.edges.2.tsv.gz -i \"$TEMP\"/Q154.properties.edges.tsv \\\n", "/ compact \\\n", "/ sort2 \\\n", "-o \"$TEMP\"/Q154.edges.3.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The number of edges grew a bit" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 185031 763294 10102000\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.edges.3.tsv.gz | wc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Summary:\n", "- We have the instances of alcoholic beverages\n", "- We added incoming and outgoing edges\n", "- For all the q-nodes in the previous step, we went one hop forward\n", "- We got the upper ontology\n", "- And we have the edges on all the properties being used" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 5: get edges between any two q-nodes in our graph\n", "As we added q-nodes to our graph, it 
is possible that there exist edges between these q-nodes that we didn't get when doing the one hop out. To ensure completeness, we get all edges from Wikidata between any two nodes in our Q154 graph. The following query ensures that we are not missing any edges between the nodes we added to our KG. Note that this query is expensive as it needs to find edges between any two nodes. This can be done in kypher, but would be impossible in SPARQL as you would surely get a time-out." ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$TEMP\"/Q154.edges.3.tsv.gz -i \"$ITEM\" --as items \\\n", "--match 'Q154: (n1)-[]->(n2), item: (n1)-[l]->(n2)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.edges.complete.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate the files again:" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.edges.3.tsv.gz -i \"$TEMP\"/Q154.edges.complete.tsv.gz \\\n", "/ compact \\\n", "/ sort2 \\\n", "-o \"$TEMP\"/Q154.edges.4.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We now have all the edges we want:" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 185238 764122 10113585\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.edges.4.tsv.gz | wc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 6: get the labels, aliases and descriptions of all the items in our KG\n", "Before we start, let's define an environment variable to hold the final edges file so that if we change our mind later, we can update it without having to change the commands below."
] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "os.environ[\"Q154GRAPH\"] = os.environ[\"TEMP\"] + \"/Q154.edges.4.tsv.gz\"" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/pedroszekely/Downloads/kgtk-tutorial/temp/Q154.edges.4.tsv.gz\n" ] } ], "source": [ "!ls \"$Q154GRAPH\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Get the labels of the `node1` nodes" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$Q154GRAPH\" -i labels \\\n", "--match 'Q154: (n1)-[]->(), label: (n1)-[l]->(n2)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.label.node1.tsv.gz" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2
0P10-label-enP10label'video'@en
1P1001-label-enP1001label'applies to jurisdiction'@en
2P1004-label-enP1004label'MusicBrainz place ID'@en
3P1005-label-enP1005label'Portuguese National Library ID'@en
4P101-label-enP101label'field of work'@en
5P1014-label-enP1014label'Art & Architecture Thesaurus ID'@en
6P1015-label-enP1015label'BIBSYS ID'@en
7P1017-label-enP1017label'Vatican Library ID'@en
8P1019-label-enP1019label'web feed URL'@en
\n", "
" ], "text/plain": [ " id node1 label node2\n", "0 P10-label-en P10 label 'video'@en\n", "1 P1001-label-en P1001 label 'applies to jurisdiction'@en\n", "2 P1004-label-en P1004 label 'MusicBrainz place ID'@en\n", "3 P1005-label-en P1005 label 'Portuguese National Library ID'@en\n", "4 P101-label-en P101 label 'field of work'@en\n", "5 P1014-label-en P1014 label 'Art & Architecture Thesaurus ID'@en\n", "6 P1015-label-en P1015 label 'BIBSYS ID'@en\n", "7 P1017-label-en P1017 label 'Vatican Library ID'@en\n", "8 P1019-label-en P1019 label 'web feed URL'@en" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = !zcat < \"$TEMP\"/Q154.label.node1.tsv.gz | head \n", "kgtk_to_dataframe(lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the labels of the `node2` nodes" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$Q154GRAPH\" -i labels \\\n", "--match 'Q154: ()-[]->(n2), label: (n2)-[l]->(n3)' \\\n", "--return 'distinct l as id, n2 as node1, l.label as label, n3 as node2' \\\n", "-o \"$TEMP\"/Q154.label.node2.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate the two label files" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.label.node1.tsv.gz -i \"$TEMP\"/Q154.label.node2.tsv.gz \\\n", "-o \"$TEMP\"/labels.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Get the aliases of `node1` nodes" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$Q154GRAPH\" -i \"$ALIAS\" --as aliases \\\n", "--match 'Q154: (n1)-[]->(), alias: (n1)-[l]->(n2)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.alias.node1.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the aliases of `node2` nodes" ] 
}, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$Q154GRAPH\" -i aliases \\\n", "--match 'Q154: ()-[]->(n2), alias: (n2)-[l]->(n3)' \\\n", "--return 'distinct l as id, n2 as node1, l.label as label, n3 as node2' \\\n", "-o \"$TEMP\"/Q154.alias.node2.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate the two alias files" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.alias.node1.tsv.gz -i \"$TEMP\"/Q154.alias.node2.tsv.gz \\\n", "-o \"$TEMP\"/alias.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Get the descriptions of `node1` nodes" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$Q154GRAPH\" -i \"$DESCRIPTION\" --as descriptions \\\n", "--match 'Q154: (n1)-[]->(), description: (n1)-[l]->(n2)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.description.node1.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the descriptions of `node2` nodes" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$Q154GRAPH\" -i descriptions \\\n", "--match 'Q154: ()-[]->(n2), description: (n2)-[l]->(n3)' \\\n", "--return 'distinct l as id, n2 as node1, l.label as label, n3 as node2' \\\n", "-o \"$TEMP\"/Q154.description.node2.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate the two description files" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.description.node1.tsv.gz -i \"$TEMP\"/Q154.description.node2.tsv.gz \\\n", "-o \"$TEMP\"/Q154.description.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 7: get the qualifiers" ] }, { "cell_type": "code", "execution_count": 
49, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$Q154GRAPH\" -i qualifiers \\\n", "--match 'Q154: ()-[l]->(), qual: (l)-[lq]->(n2)' \\\n", "--return 'lq as id, l as node1, lq.label as label, n2 as node2' \\\n", "-o \"$OUT\"/Q154.qualifiers.tsv.gz" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2
0P10-P1855-Q7378-555592a4-0-P10-8a982d-0P10-P1855-Q7378-555592a4-0P10\"Elephants Dream (2006).webm\"
1P10-P2302-Q21502404-d012aef4-0-P1793-f4c2ed-0P10-P2302-Q21502404-d012aef4-0P1793\"(?i).+\\\\\\\\.(webm\\\\|ogv\\\\|ogg\\\\|gif)\"
2P10-P2302-Q21502404-d012aef4-0-P2316-Q21502408-0P10-P2302-Q21502404-d012aef4-0P2316Q21502408
3P10-P2302-Q21502404-d012aef4-0-P2916-cb0917-0P10-P2302-Q21502404-d012aef4-0P2916'filename with extension: webm, ogg, ogv, or g...
4P10-P2302-Q21510851-5224fe0b-0-P2306-P175-0P10-P2302-Q21510851-5224fe0b-0P2306P175
...............
93989Q997294-P421-Q6655-f4ed577c-0-P1264-Q1777301-0Q997294-P421-Q6655-f4ed577c-0P1264Q1777301
93990Q997294-P421-Q6723-7c4a7768-0-P1264-Q36669-0Q997294-P421-Q6723-7c4a7768-0P1264Q36669
93991Q997294-P443-cb16c6-94e4e274-0-P407-Q12107-0Q997294-P443-cb16c6-94e4e274-0P407Q12107
93992Q997294-P485-Q18785452-a7748618-0-P217-2aa283-0Q997294-P485-Q18785452-a7748618-0P217\"1200 E DEPOT\"
93993Q997294-P7938-Q1345936-ebcb710a-0-P580-6eae03-0Q997294-P7938-Q1345936-ebcb710a-0P580^2015-03-22T00:00:00Z/11
\n", "

93994 rows × 4 columns

\n", "
" ], "text/plain": [ " id \\\n", "0 P10-P1855-Q7378-555592a4-0-P10-8a982d-0 \n", "1 P10-P2302-Q21502404-d012aef4-0-P1793-f4c2ed-0 \n", "2 P10-P2302-Q21502404-d012aef4-0-P2316-Q21502408-0 \n", "3 P10-P2302-Q21502404-d012aef4-0-P2916-cb0917-0 \n", "4 P10-P2302-Q21510851-5224fe0b-0-P2306-P175-0 \n", "... ... \n", "93989 Q997294-P421-Q6655-f4ed577c-0-P1264-Q1777301-0 \n", "93990 Q997294-P421-Q6723-7c4a7768-0-P1264-Q36669-0 \n", "93991 Q997294-P443-cb16c6-94e4e274-0-P407-Q12107-0 \n", "93992 Q997294-P485-Q18785452-a7748618-0-P217-2aa283-0 \n", "93993 Q997294-P7938-Q1345936-ebcb710a-0-P580-6eae03-0 \n", "\n", " node1 label \\\n", "0 P10-P1855-Q7378-555592a4-0 P10 \n", "1 P10-P2302-Q21502404-d012aef4-0 P1793 \n", "2 P10-P2302-Q21502404-d012aef4-0 P2316 \n", "3 P10-P2302-Q21502404-d012aef4-0 P2916 \n", "4 P10-P2302-Q21510851-5224fe0b-0 P2306 \n", "... ... ... \n", "93989 Q997294-P421-Q6655-f4ed577c-0 P1264 \n", "93990 Q997294-P421-Q6723-7c4a7768-0 P1264 \n", "93991 Q997294-P443-cb16c6-94e4e274-0 P407 \n", "93992 Q997294-P485-Q18785452-a7748618-0 P217 \n", "93993 Q997294-P7938-Q1345936-ebcb710a-0 P580 \n", "\n", " node2 \n", "0 \"Elephants Dream (2006).webm\" \n", "1 \"(?i).+\\\\\\\\.(webm\\\\|ogv\\\\|ogg\\\\|gif)\" \n", "2 Q21502408 \n", "3 'filename with extension: webm, ogg, ogv, or g... \n", "4 P175 \n", "... ... 
\n", "93989 Q1777301 \n", "93990 Q36669 \n", "93991 Q12107 \n", "93992 \"1200 E DEPOT\" \n", "93993 ^2015-03-22T00:00:00Z/11 \n", "\n", "[93994 rows x 4 columns]" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = !zcat < \"$OUT\"/Q154.qualifiers.tsv.gz \n", "kgtk_to_dataframe(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 8: consolidate all the files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### KGTK extensions to Wikidata\n", "KGTK defines extensions to Wikidata, and we want to include those in our graph, so we download them from GitHub:" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2021-01-24 13:11:35-- https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 2617 (2.6K) [text/plain]\n", "Saving to: ‘/Users/pedroszekely/Downloads/kgtk-tutorial/temp/kgtk.properties.tsv’\n", "\n", "/Users/pedroszekely 100%[===================>] 2.56K --.-KB/s in 0s \n", "\n", "2021-01-24 13:11:40 (12.2 MB/s) - ‘/Users/pedroszekely/Downloads/kgtk-tutorial/temp/kgtk.properties.tsv’ saved [2617/2617]\n", "\n" ] } ], "source": [ "!wget https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv -O \"$TEMP\"/kgtk.properties.tsv" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2id
0isalabel\"is a\"@enisa-label-e79b73
1isaalias\"isa\"@enisa-alias-7773c5
2isadescription\"Instance or subclass relationship\"@enisa-description-0b5cdc
3isaP31Q18616576isa-P31-Q18616576
4isaP31Q28326461isa-P31-Q28326461
5isaP31Q18647519isa-P31-Q18647519
6isadata_typewikibase-itemisa-data_type-643cc9
7P279starlabel\"is a\"@enP279star-label-e79b73
8P279staralias\"isa\"@enP279star-alias-7773c5
9P279stardescription\"Instance or subclass relationship\"@enP279star-description-0b5cdc
10P279starP31Q18616576P279star-P31-Q18616576
11P279starP31Q28326461P279star-P31-Q28326461
12P279starP31Q18647519P279star-P31-Q18647519
13P279starP31Q18647521P279star-P31-Q18647521
14P279stardata_typewikibase-itemP279star-data_type-643cc9
15directed_pageranklabel\"pagerank\"@endirected_pagerank-label-d3bd07
16directed_pagerankalias\"page rank\"@endirected_pagerank-alias-9d4733
17directed_pagerankdescription\"pagerank canculated on the directed graph\"@endirected_pagerank-description-b62fff
18directed_pagerankP31Q18616576directed_pagerank-P31-Q18616576
19directed_pagerankP31Q47512165directed_pagerank-P31-Q47512165
20directed_pagerankP1629Q184316directed_pagerank-P1629-Q184316
21directed_pagerankdata_typequantitydirected_pagerank-data_type-1a7b30
22undirected_pageranklabel\"pagerank\"@enundirected_pagerank-label-d3bd07
23undirected_pagerankalias\"page rank\"@enundirected_pagerank-alias-9d4733
24undirected_pagerankdescription\"pagerank canculated on the undirected graph\"@enundirected_pagerank-description-ee8b1c
25undirected_pagerankP31Q18616576undirected_pagerank-P31-Q18616576
26undirected_pagerankP31Q47512165undirected_pagerank-P31-Q47512165
27undirected_pagerankP1629Q184316undirected_pagerank-P1629-Q184316
28undirected_pagerankdata_typequantityundirected_pagerank-data_type-1a7b30
29in_degreelabel\"in degree\"@enin_degree-label-aa295d
30in_degreealias\"degree\"@enin_degree-alias-b5846a
31in_degreedescription\"in degree of a node in a graph\"@enin_degree-description-642be1
32in_degreeP31Q18616576in_degree-P31-Q18616576
33in_degreeP31Q47512165in_degree-P31-Q47512165
34in_degreeP1629Q383444in_degree-P1629-Q383444
35in_degreedata_typequantityin_degree-data_type-1a7b30
36out_degreelabel\"out degree\"@enout_degree-label-6deeae
37out_degreealias\"degree\"@enout_degree-alias-b5846a
38out_degreedescription\"out degree of a node in a graph\"@enout_degree-description-c7e312
39out_degreeP31Q18616576out_degree-P31-Q18616576
40out_degreeP31Q47512165out_degree-P31-Q47512165
41out_degreeP1629Q383444out_degree-P1629-Q383444
42out_degreedata_typequantityout_degree-data_type-1a7b30
\n", "
" ], "text/plain": [ " node1 label \\\n", "0 isa label \n", "1 isa alias \n", "2 isa description \n", "3 isa P31 \n", "4 isa P31 \n", "5 isa P31 \n", "6 isa data_type \n", "7 P279star label \n", "8 P279star alias \n", "9 P279star description \n", "10 P279star P31 \n", "11 P279star P31 \n", "12 P279star P31 \n", "13 P279star P31 \n", "14 P279star data_type \n", "15 directed_pagerank label \n", "16 directed_pagerank alias \n", "17 directed_pagerank description \n", "18 directed_pagerank P31 \n", "19 directed_pagerank P31 \n", "20 directed_pagerank P1629 \n", "21 directed_pagerank data_type \n", "22 undirected_pagerank label \n", "23 undirected_pagerank alias \n", "24 undirected_pagerank description \n", "25 undirected_pagerank P31 \n", "26 undirected_pagerank P31 \n", "27 undirected_pagerank P1629 \n", "28 undirected_pagerank data_type \n", "29 in_degree label \n", "30 in_degree alias \n", "31 in_degree description \n", "32 in_degree P31 \n", "33 in_degree P31 \n", "34 in_degree P1629 \n", "35 in_degree data_type \n", "36 out_degree label \n", "37 out_degree alias \n", "38 out_degree description \n", "39 out_degree P31 \n", "40 out_degree P31 \n", "41 out_degree P1629 \n", "42 out_degree data_type \n", "\n", " node2 \\\n", "0 \"is a\"@en \n", "1 \"isa\"@en \n", "2 \"Instance or subclass relationship\"@en \n", "3 Q18616576 \n", "4 Q28326461 \n", "5 Q18647519 \n", "6 wikibase-item \n", "7 \"is a\"@en \n", "8 \"isa\"@en \n", "9 \"Instance or subclass relationship\"@en \n", "10 Q18616576 \n", "11 Q28326461 \n", "12 Q18647519 \n", "13 Q18647521 \n", "14 wikibase-item \n", "15 \"pagerank\"@en \n", "16 \"page rank\"@en \n", "17 \"pagerank canculated on the directed graph\"@en \n", "18 Q18616576 \n", "19 Q47512165 \n", "20 Q184316 \n", "21 quantity \n", "22 \"pagerank\"@en \n", "23 \"page rank\"@en \n", "24 \"pagerank canculated on the undirected graph\"@en \n", "25 Q18616576 \n", "26 Q47512165 \n", "27 Q184316 \n", "28 quantity \n", "29 \"in degree\"@en \n", "30 
\"degree\"@en \n", "31 \"in degree of a node in a graph\"@en \n", "32 Q18616576 \n", "33 Q47512165 \n", "34 Q383444 \n", "35 quantity \n", "36 \"out degree\"@en \n", "37 \"degree\"@en \n", "38 \"out degree of a node in a graph\"@en \n", "39 Q18616576 \n", "40 Q47512165 \n", "41 Q383444 \n", "42 quantity \n", "\n", " id \n", "0 isa-label-e79b73 \n", "1 isa-alias-7773c5 \n", "2 isa-description-0b5cdc \n", "3 isa-P31-Q18616576 \n", "4 isa-P31-Q28326461 \n", "5 isa-P31-Q18647519 \n", "6 isa-data_type-643cc9 \n", "7 P279star-label-e79b73 \n", "8 P279star-alias-7773c5 \n", "9 P279star-description-0b5cdc \n", "10 P279star-P31-Q18616576 \n", "11 P279star-P31-Q28326461 \n", "12 P279star-P31-Q18647519 \n", "13 P279star-P31-Q18647521 \n", "14 P279star-data_type-643cc9 \n", "15 directed_pagerank-label-d3bd07 \n", "16 directed_pagerank-alias-9d4733 \n", "17 directed_pagerank-description-b62fff \n", "18 directed_pagerank-P31-Q18616576 \n", "19 directed_pagerank-P31-Q47512165 \n", "20 directed_pagerank-P1629-Q184316 \n", "21 directed_pagerank-data_type-1a7b30 \n", "22 undirected_pagerank-label-d3bd07 \n", "23 undirected_pagerank-alias-9d4733 \n", "24 undirected_pagerank-description-ee8b1c \n", "25 undirected_pagerank-P31-Q18616576 \n", "26 undirected_pagerank-P31-Q47512165 \n", "27 undirected_pagerank-P1629-Q184316 \n", "28 undirected_pagerank-data_type-1a7b30 \n", "29 in_degree-label-aa295d \n", "30 in_degree-alias-b5846a \n", "31 in_degree-description-642be1 \n", "32 in_degree-P31-Q18616576 \n", "33 in_degree-P31-Q47512165 \n", "34 in_degree-P1629-Q383444 \n", "35 in_degree-data_type-1a7b30 \n", "36 out_degree-label-6deeae \n", "37 out_degree-alias-b5846a \n", "38 out_degree-description-c7e312 \n", "39 out_degree-P31-Q18616576 \n", "40 out_degree-P31-Q47512165 \n", "41 out_degree-P1629-Q383444 \n", "42 out_degree-data_type-1a7b30 " ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = !cat \"$TEMP\"/kgtk.properties.tsv \n", 
"kgtk_to_dataframe(lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Property datatype\n", "Wikidata defines data types for properties to specify the type of value expected for each property. We follow this convention, so we include the data types in our KG. \n", "\n", "First take a look at the data types defined for Wikidata properties:" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2node2;wikidatatyperank
0P10-datatypeP10datatypecommonsMedia
1P1001-datatypeP1001datatypewikibase-item
2P1003-datatypeP1003datatypeexternal-id
3P1004-datatypeP1004datatypeexternal-id
4P1005-datatypeP1005datatypeexternal-id
.....................
1503P981-datatypeP981datatypeexternal-id
1504P982-datatypeP982datatypeexternal-id
1505P984-datatypeP984datatypeexternal-id
1506P989-datatypeP989datatypecommonsMedia
1507P998-datatypeP998datatypeexternal-id
\n", "

1508 rows × 6 columns

\n", "
" ], "text/plain": [ " id node1 label node2 node2;wikidatatype rank\n", "0 P10-datatype P10 datatype commonsMedia \n", "1 P1001-datatype P1001 datatype wikibase-item \n", "2 P1003-datatype P1003 datatype external-id \n", "3 P1004-datatype P1004 datatype external-id \n", "4 P1005-datatype P1005 datatype external-id \n", "... ... ... ... ... ... ...\n", "1503 P981-datatype P981 datatype external-id \n", "1504 P982-datatype P982 datatype external-id \n", "1505 P984-datatype P984 datatype external-id \n", "1506 P989-datatype P989 datatype commonsMedia \n", "1507 P998-datatype P998 datatype external-id \n", "\n", "[1508 rows x 6 columns]" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = !zcat < \"$PROPERTY_DATATYPES\" \n", "kgtk_to_dataframe(lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We filter this file to select the data types for the properties we use in our graph. We don't care about the datatypes for properties we don't have in our graph:" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$Q154GRAPH\" -i \"$PROPERTY_DATATYPES\" \\\n", "--match 'Q154: (n1)-[]->(), property: (n1)-[l:datatype]->(n2)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.metadata.property.datatype.tsv.gz" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2
0P10-datatypeP10datatypecommonsMedia
1P1001-datatypeP1001datatypewikibase-item
2P1004-datatypeP1004datatypeexternal-id
3P1005-datatypeP1005datatypeexternal-id
4P101-datatypeP101datatypewikibase-item
...............
1072P981-datatypeP981datatypeexternal-id
1073P982-datatypeP982datatypeexternal-id
1074P984-datatypeP984datatypeexternal-id
1075P989-datatypeP989datatypecommonsMedia
1076P998-datatypeP998datatypeexternal-id
\n", "

1077 rows × 4 columns

\n", "
" ], "text/plain": [ " id node1 label node2\n", "0 P10-datatype P10 datatype commonsMedia\n", "1 P1001-datatype P1001 datatype wikibase-item\n", "2 P1004-datatype P1004 datatype external-id\n", "3 P1005-datatype P1005 datatype external-id\n", "4 P101-datatype P101 datatype wikibase-item\n", "... ... ... ... ...\n", "1072 P981-datatype P981 datatype external-id\n", "1073 P982-datatype P982 datatype external-id\n", "1074 P984-datatype P984 datatype external-id\n", "1075 P989-datatype P989 datatype commonsMedia\n", "1076 P998-datatype P998 datatype external-id\n", "\n", "[1077 rows x 4 columns]" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = !zcat < \"$TEMP\"/Q154.metadata.property.datatype.tsv.gz \n", "kgtk_to_dataframe(lines)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Produce the final `all.tsv` file" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "!$kgtk cat \\\n", "-i \"$TEMP\"/labels.tsv.gz \\\n", "-i \"$TEMP\"/alias.tsv.gz \\\n", "-i \"$TEMP\"/Q154.description.tsv.gz \\\n", "-i \"$Q154GRAPH\" \\\n", "-i \"$TEMP\"/kgtk.properties.tsv \\\n", "-i \"$TEMP\"/Q154.metadata.property.datatype.tsv.gz \\\n", "/ compact \\\n", "/ sort2 \\\n", "-o \"$OUT\"/all.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our full alcoholic beverage KG in a pandas dataframe" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2
0P10-P1628-32b85d-7927ece6-0P10P1628\"http://www.w3.org/2006/vcard/ns#Video\"
1P10-P1628-acf60d-b8950832-0P10P1628\"https://schema.org/video\"
2P10-P1629-Q34508-bcc39400-0P10P1629Q34508
3P10-P1659-P1651-c4068028-0P10P1659P1651
4P10-P1659-P18-5e4b9c4f-0P10P1659P18
...............
310588undirected_pagerank-P31-Q47512165undirected_pagerankP31Q47512165
310589undirected_pagerank-alias-9d4733undirected_pagerankalias\"page rank\"@en
310590undirected_pagerank-data_type-1a7b30undirected_pagerankdata_typequantity
310591undirected_pagerank-description-ee8b1cundirected_pagerankdescription\"pagerank canculated on the undirected graph\"@en
310592undirected_pagerank-label-d3bd07undirected_pageranklabel\"pagerank\"@en
\n", "

310593 rows × 4 columns

\n", "
" ], "text/plain": [ " id node1 \\\n", "0 P10-P1628-32b85d-7927ece6-0 P10 \n", "1 P10-P1628-acf60d-b8950832-0 P10 \n", "2 P10-P1629-Q34508-bcc39400-0 P10 \n", "3 P10-P1659-P1651-c4068028-0 P10 \n", "4 P10-P1659-P18-5e4b9c4f-0 P10 \n", "... ... ... \n", "310588 undirected_pagerank-P31-Q47512165 undirected_pagerank \n", "310589 undirected_pagerank-alias-9d4733 undirected_pagerank \n", "310590 undirected_pagerank-data_type-1a7b30 undirected_pagerank \n", "310591 undirected_pagerank-description-ee8b1c undirected_pagerank \n", "310592 undirected_pagerank-label-d3bd07 undirected_pagerank \n", "\n", " label node2 \n", "0 P1628 \"http://www.w3.org/2006/vcard/ns#Video\" \n", "1 P1628 \"https://schema.org/video\" \n", "2 P1629 Q34508 \n", "3 P1659 P1651 \n", "4 P1659 P18 \n", "... ... ... \n", "310588 P31 Q47512165 \n", "310589 alias \"page rank\"@en \n", "310590 data_type quantity \n", "310591 description \"pagerank canculated on the undirected graph\"@en \n", "310592 label \"pagerank\"@en \n", "\n", "[310593 rows x 4 columns]" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_lines = !zcat < \"$OUT\"/all.tsv.gz \n", "kgtk_to_dataframe(all_lines)" ] } ], "metadata": { "kernelspec": { "display_name": "kgtk-env", "language": "python", "name": "kgtk-env" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 }