{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Use Case: A Knowledge Graph About Alcoholic Beverages\n",
"We are going to build a small KG about alcoholic beverages by extracting from Wikidata the subgraph that relates to alcoholic beverages (https://www.wikidata.org/wiki/Q154)."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ALIAS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/aliases.en.tsv.gz\"\n",
"ALL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/all.tsv.gz\"\n",
"CLAIMS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/claims.tsv.gz\"\n",
"DESCRIPTION: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/descriptions.en.tsv.gz\"\n",
"EXAMPLES_DIR: \"/Users/pedroszekely/Documents/GitHub/kgtk/examples\"\n",
"GE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp/graph-embedding\"\n",
"ISA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.isa.tsv.gz\"\n",
"ITEM: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/claims.wikibase-item.tsv.gz\"\n",
"KGTK_PATH: \"/Users/pedroszekely/Documents/GitHub/kgtk\"\n",
"LABEL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/labels.en.tsv.gz\"\n",
"OUT: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output\"\n",
"P279: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.P279.tsv.gz\"\n",
"P279STAR: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.P279star.tsv.gz\"\n",
"PROPERTY_DATATYPES: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/metadata.property.datatypes.tsv.gz\"\n",
"Q154ALIAS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/aliases.en.tsv.gz\"\n",
"Q154ALL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/all.tsv.gz\"\n",
"Q154CLAIMS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/claims.tsv.gz\"\n",
"Q154DESCRIPTION: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/descriptions.en.tsv.gz\"\n",
"Q154ISA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.isa.tsv.gz\"\n",
"Q154ITEM: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/claims.wikibase-item.tsv.gz\"\n",
"Q154LABEL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/labels.en.tsv.gz\"\n",
"Q154P279: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.P279.tsv.gz\"\n",
"Q154P279STAR: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.P279star.tsv.gz\"\n",
"Q154PROPERTY_DATATYPES: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/metadata.property.datatypes.tsv.gz\"\n",
"Q154QUALIFIERS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/qualifiers.tsv.gz\"\n",
"Q154QUALIFIERS_TIME: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/qualifiers.time.tsv.gz\"\n",
"Q154SITELINKS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/sitelinks.tsv.gz\"\n",
"QUALIFIERS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/qualifiers.tsv.gz\"\n",
"QUALIFIERS_TIME: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/qualifiers.time.tsv.gz\"\n",
"SITELINKS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/sitelinks.tsv.gz\"\n",
"STORE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/wikidata.sqlite3.miniwikidata.db\"\n",
"TE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp/text-embedding\"\n",
"TEMP: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp\"\n",
"USECASE_DIR: \"/Users/pedroszekely/Documents/GitHub/kgtk/use-cases\"\n",
"WIKIDATA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/\"\n",
"kgtk: \"kgtk --debug\"\n",
"kypher: \"kgtk query --graph-cache /Users/pedroszekely/Downloads/kgtk-tutorial/wikidata.sqlite3.miniwikidata.db\"\n"
]
}
],
"source": [
"import sys \n",
"sys.path.insert(0, 'tutorial')\n",
"from tutorial_setup import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/Users/pedroszekely/Downloads/kgtk-tutorial\n"
]
}
],
"source": [
"%cd {output_path}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"!mkdir -p {output_folder}\n",
"!mkdir -p {temp_folder}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 1: create a list of all descendants of `alcoholic beverage` (https://www.wikidata.org/wiki/Q154)\n",
"Here is some of the information about `Q154` in Wikidata:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" label;label | \n",
" node2 | \n",
" node2;label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q154 | \n",
" P1151 | \n",
" 'topic\\\\'s main Wikimedia portal'@en | \n",
" Q61473108 | \n",
" 'Portal:Alcoholic drinks'@en | \n",
"
\n",
" \n",
" 1 | \n",
" Q154 | \n",
" P1343 | \n",
" 'described by source'@en | \n",
" Q1768721 | \n",
" 'Gujin Tushu Jicheng'@en | \n",
"
\n",
" \n",
" 2 | \n",
" Q154 | \n",
" P1424 | \n",
" 'topic\\\\'s main template'@en | \n",
" Q10557691 | \n",
" 'Template:Infobox alcoholic beverage'@en | \n",
"
\n",
" \n",
" 3 | \n",
" Q154 | \n",
" P1552 | \n",
" 'has quality'@en | \n",
" Q1517187 | \n",
" 'bitterness'@en | \n",
"
\n",
" \n",
" 4 | \n",
" Q154 | \n",
" P2354 | \n",
" 'has list'@en | \n",
" Q2468826 | \n",
" 'list of alcoholic beverages'@en | \n",
"
\n",
" \n",
" 5 | \n",
" Q154 | \n",
" P279 | \n",
" 'subclass of'@en | \n",
" Q40050 | \n",
" 'drink'@en | \n",
"
\n",
" \n",
" 6 | \n",
" Q154 | \n",
" P31 | \n",
" 'instance of'@en | \n",
" Q187661 | \n",
" 'carcinogen'@en | \n",
"
\n",
" \n",
" 7 | \n",
" Q154 | \n",
" P31 | \n",
" 'instance of'@en | \n",
" Q8386 | \n",
" 'drug'@en | \n",
"
\n",
" \n",
" 8 | \n",
" Q154 | \n",
" P452 | \n",
" 'industry'@en | \n",
" Q3150593 | \n",
" 'alcohol industry'@en | \n",
"
\n",
" \n",
" 9 | \n",
" Q154 | \n",
" P461 | \n",
" 'opposite of'@en | \n",
" Q2647467 | \n",
" 'non-alcoholic beverage'@en | \n",
"
\n",
" \n",
" 10 | \n",
" Q154 | \n",
" P910 | \n",
" 'topic\\\\'s main category'@en | \n",
" Q7214082 | \n",
" 'Category:Alcoholic drinks'@en | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label label;label node2 \\\n",
"0 Q154 P1151 'topic\\\\'s main Wikimedia portal'@en Q61473108 \n",
"1 Q154 P1343 'described by source'@en Q1768721 \n",
"2 Q154 P1424 'topic\\\\'s main template'@en Q10557691 \n",
"3 Q154 P1552 'has quality'@en Q1517187 \n",
"4 Q154 P2354 'has list'@en Q2468826 \n",
"5 Q154 P279 'subclass of'@en Q40050 \n",
"6 Q154 P31 'instance of'@en Q187661 \n",
"7 Q154 P31 'instance of'@en Q8386 \n",
"8 Q154 P452 'industry'@en Q3150593 \n",
"9 Q154 P461 'opposite of'@en Q2647467 \n",
"10 Q154 P910 'topic\\\\'s main category'@en Q7214082 \n",
"\n",
" node2;label \n",
"0 'Portal:Alcoholic drinks'@en \n",
"1 'Gujin Tushu Jicheng'@en \n",
"2 'Template:Infobox alcoholic beverage'@en \n",
"3 'bitterness'@en \n",
"4 'list of alcoholic beverages'@en \n",
"5 'drink'@en \n",
"6 'carcinogen'@en \n",
"7 'drug'@en \n",
"8 'alcohol industry'@en \n",
"9 'non-alcoholic beverage'@en \n",
"10 'Category:Alcoholic drinks'@en "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result = !$kypher -i claims -i labels \\\n",
"--match 'claims: (n1:Q154)-[l {label:p}]->(n2), label: (n2)-[]->(n2_label), label: (p)-[]->(p_label)' \\\n",
"--return 'n1 as node1, l.label as label, p_label as `label;label`, n2 as node2, n2_label as `node2;label`' \\\n",
"--order-by 'l.label'\n",
"\n",
"\n",
"kgtk_to_dataframe(result)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Wikidata uses two properties to organize entities in a hierarchy: the `instance of` (`P31`) property and the `subclass of` (`P279`) property. In many cases, the distinction between instance of and subclass of is subtle, and we find many situations in Wikidata where either one or the other is used to organize hierarchies. For this reason, we created a new property called `isa` that contains the union of `P31` and `P279`; it is stored in the file `derived.isa.tsv`."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" P10 | \n",
" isa | \n",
" Q18610173 | \n",
"
\n",
" \n",
" 1 | \n",
" P1000 | \n",
" isa | \n",
" Q18608871 | \n",
"
\n",
" \n",
" 2 | \n",
" P1001 | \n",
" isa | \n",
" Q15720608 | \n",
"
\n",
" \n",
" 3 | \n",
" P1001 | \n",
" isa | \n",
" Q22984026 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label node2\n",
"0 P10 isa Q18610173\n",
"1 P1000 isa Q18608871\n",
"2 P1001 isa Q15720608\n",
"3 P1001 isa Q22984026"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines = !zcat < \"$ISA\" | head -5\n",
"kgtk_to_dataframe(lines)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To get all the alcoholic beverages, we need to get all entities that are `isa` of alcoholic beverage (`Q154`) or that are `isa` of any descendant of `Q154` in the `subclass of` (`P279`) hierarchy. The length of the chain of `P279` edges can be arbitrarily long. To support this use case, KGTK offers the `derived.P279star.tsv` file that contains edges `n1/P279star/n2` if `n1` is a descendant of `n2` on chains of `P279` edges, including chains of zero length (`n1/P279star/n1`)."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q100000030 | \n",
" Q100000030-P279star-Q100000030-0000 | \n",
"
\n",
" \n",
" 1 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q1357761 | \n",
" Q100000030-P279star-Q1357761-0000 | \n",
"
\n",
" \n",
" 2 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q14745 | \n",
" Q100000030-P279star-Q14745-0000 | \n",
"
\n",
" \n",
" 3 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q14748 | \n",
" Q100000030-P279star-Q14748-0000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label node2 id\n",
"0 Q100000030 P279star Q100000030 Q100000030-P279star-Q100000030-0000\n",
"1 Q100000030 P279star Q1357761 Q100000030-P279star-Q1357761-0000\n",
"2 Q100000030 P279star Q14745 Q100000030-P279star-Q14745-0000\n",
"3 Q100000030 P279star Q14748 Q100000030-P279star-Q14748-0000"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines = !zcat < \"$P279STAR\" | head -5 \n",
"kgtk_to_dataframe(lines)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To get all alcoholic beverages, we need to find all nodes `n1` that are connected to `Q154` with an `isa` edge and a chain of `P279` edges:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i \"$ISA\" --as \"isa\" -i \"$P279STAR\" --as \"p279star\" -i labels \\\n",
"--match 'isa: (n1)-[]->(n2), star: (n2)-[]->(n3:Q154), label: (n1)-[]->(n1l)' \\\n",
"--return 'n1 as node1, n1l as `node1;label`, n3 as node2, \"isastar\" as label' \\\n",
"-o \"$TEMP\"/Q154.descendant.tsv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here is a sample of alcoholic beverages in Wikidata"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" node1;label | \n",
" node2 | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q1350656 | \n",
" 'corn whiskey'@en | \n",
" Q154 | \n",
" isastar | \n",
"
\n",
" \n",
" 1 | \n",
" Q20713240 | \n",
" 'Buckwheat whisky'@en | \n",
" Q154 | \n",
" isastar | \n",
"
\n",
" \n",
" 2 | \n",
" Q2535077 | \n",
" 'rye whiskey'@en | \n",
" Q154 | \n",
" isastar | \n",
"
\n",
" \n",
" 3 | \n",
" Q536976 | \n",
" 'Canadian whisky'@en | \n",
" Q154 | \n",
" isastar | \n",
"
\n",
" \n",
" 4 | \n",
" Q7991845 | \n",
" 'wheat whiskey'@en | \n",
" Q154 | \n",
" isastar | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 3346 | \n",
" Q7719471 | \n",
" 'The Botanist'@en | \n",
" Q154 | \n",
" isastar | \n",
"
\n",
" \n",
" 3347 | \n",
" Q187155 | \n",
" 'Tanqueray'@en | \n",
" Q154 | \n",
" isastar | \n",
"
\n",
" \n",
" 3348 | \n",
" Q62076228 | \n",
" 'dry gin'@en | \n",
" Q154 | \n",
" isastar | \n",
"
\n",
" \n",
" 3349 | \n",
" Q7085234 | \n",
" 'Old Tom Gin'@en | \n",
" Q154 | \n",
" isastar | \n",
"
\n",
" \n",
" 3350 | \n",
" Q891834 | \n",
" 'Bombay Sapphire'@en | \n",
" Q154 | \n",
" isastar | \n",
"
\n",
" \n",
"
\n",
"
3351 rows × 4 columns
\n",
"
"
],
"text/plain": [
" node1 node1;label node2 label\n",
"0 Q1350656 'corn whiskey'@en Q154 isastar\n",
"1 Q20713240 'Buckwheat whisky'@en Q154 isastar\n",
"2 Q2535077 'rye whiskey'@en Q154 isastar\n",
"3 Q536976 'Canadian whisky'@en Q154 isastar\n",
"4 Q7991845 'wheat whiskey'@en Q154 isastar\n",
"... ... ... ... ...\n",
"3346 Q7719471 'The Botanist'@en Q154 isastar\n",
"3347 Q187155 'Tanqueray'@en Q154 isastar\n",
"3348 Q62076228 'dry gin'@en Q154 isastar\n",
"3349 Q7085234 'Old Tom Gin'@en Q154 isastar\n",
"3350 Q891834 'Bombay Sapphire'@en Q154 isastar\n",
"\n",
"[3351 rows x 4 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines = !cat \"$TEMP\"/Q154.descendant.tsv \n",
"kgtk_to_dataframe(lines)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The computation of `Q154.descendant.tsv` can be implemented in SPARQL using the common `P31/P279*` graph pattern, but the query will time out if the result size is large. For example, the query will time out when requesting all descendants of chemical compounds, as there are over one million chemical compounds in Wikidata. The query can be easily done in KGTK."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 2: get the incoming and outgoing edges\n",
"We want our graph to have the neighbors of all alcoholic beverages, so we need to get the incoming and outgoing edges.\n",
"\n",
"The following query gets the **outgoing** edges."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i claims -i \"$TEMP\"/Q154.descendant.tsv \\\n",
"--match 'Q154: (n1)-[]->(), claims: (n1)-[l]->(n2)' \\\n",
"--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
"-o \"$TEMP\"/Q154.node1.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We see that we are getting several properties for our items:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q1000737-P1435-Q17297633-53903946-0 | \n",
" Q1000737 | \n",
" P1435 | \n",
" Q17297633 | \n",
"
\n",
" \n",
" 1 | \n",
" Q1000737-P1454-Q460178-8ad4931b-0 | \n",
" Q1000737 | \n",
" P1454 | \n",
" Q460178 | \n",
"
\n",
" \n",
" 2 | \n",
" Q1000737-P159-Q16003-31e24011-0 | \n",
" Q1000737 | \n",
" P159 | \n",
" Q16003 | \n",
"
\n",
" \n",
" 3 | \n",
" Q1000737-P17-Q183-24107fe2-0 | \n",
" Q1000737 | \n",
" P17 | \n",
" Q183 | \n",
"
\n",
" \n",
" 4 | \n",
" Q1000737-P18-147fc9-667304f8-0 | \n",
" Q1000737 | \n",
" P18 | \n",
" \"Marthabräuhalle 2011-04-03.jpg\" | \n",
"
\n",
" \n",
" 5 | \n",
" Q1000737-P31-Q131734-f97bd6f6-0 | \n",
" Q1000737 | \n",
" P31 | \n",
" Q131734 | \n",
"
\n",
" \n",
" 6 | \n",
" Q1000737-P31-Q15075508-a4c83928-0 | \n",
" Q1000737 | \n",
" P31 | \n",
" Q15075508 | \n",
"
\n",
" \n",
" 7 | \n",
" Q1000737-P373-689157-3110aade-0 | \n",
" Q1000737 | \n",
" P373 | \n",
" \"Marthabräu\" | \n",
"
\n",
" \n",
" 8 | \n",
" Q1000737-P452-Q869095-f5d8e7a2-0 | \n",
" Q1000737 | \n",
" P452 | \n",
" Q869095 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id node1 label \\\n",
"0 Q1000737-P1435-Q17297633-53903946-0 Q1000737 P1435 \n",
"1 Q1000737-P1454-Q460178-8ad4931b-0 Q1000737 P1454 \n",
"2 Q1000737-P159-Q16003-31e24011-0 Q1000737 P159 \n",
"3 Q1000737-P17-Q183-24107fe2-0 Q1000737 P17 \n",
"4 Q1000737-P18-147fc9-667304f8-0 Q1000737 P18 \n",
"5 Q1000737-P31-Q131734-f97bd6f6-0 Q1000737 P31 \n",
"6 Q1000737-P31-Q15075508-a4c83928-0 Q1000737 P31 \n",
"7 Q1000737-P373-689157-3110aade-0 Q1000737 P373 \n",
"8 Q1000737-P452-Q869095-f5d8e7a2-0 Q1000737 P452 \n",
"\n",
" node2 \n",
"0 Q17297633 \n",
"1 Q460178 \n",
"2 Q16003 \n",
"3 Q183 \n",
"4 \"Marthabräuhalle 2011-04-03.jpg\" \n",
"5 Q131734 \n",
"6 Q15075508 \n",
"7 \"Marthabräu\" \n",
"8 Q869095 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines = !zcat < \"$TEMP\"/Q154.node1.tsv.gz | head \n",
"kgtk_to_dataframe(lines)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now get the **incoming** edges:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i claims -i \"$TEMP\"/Q154.descendant.tsv \\\n",
"--match 'Q154: (n1)-[]->(), claims: (n3)-[l]->(n1)' \\\n",
"--return 'distinct l as id, n3 as node1, l.label as label, n1 as node2' \\\n",
"-o \"$TEMP\"/Q154.node2.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here is a sample of the edges we are getting"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q1350656-P279-Q1007164-7e3ecba9-0 | \n",
" Q1350656 | \n",
" P279 | \n",
" Q1007164 | \n",
"
\n",
" \n",
" 1 | \n",
" Q20713240-P279-Q1007164-b3112260-0 | \n",
" Q20713240 | \n",
" P279 | \n",
" Q1007164 | \n",
"
\n",
" \n",
" 2 | \n",
" Q2535077-P279-Q1007164-b2d3684b-0 | \n",
" Q2535077 | \n",
" P279 | \n",
" Q1007164 | \n",
"
\n",
" \n",
" 3 | \n",
" Q536976-P279-Q1007164-8bf7467b-0 | \n",
" Q536976 | \n",
" P279 | \n",
" Q1007164 | \n",
"
\n",
" \n",
" 4 | \n",
" Q7991845-P279-Q1007164-18bc383a-0 | \n",
" Q7991845 | \n",
" P279 | \n",
" Q1007164 | \n",
"
\n",
" \n",
" 5 | \n",
" Q10337004-P186-Q10210-c56dd7ce-0 | \n",
" Q10337004 | \n",
" P186 | \n",
" Q10210 | \n",
"
\n",
" \n",
" 6 | \n",
" Q10429117-P31-Q10210-d342f061-0 | \n",
" Q10429117 | \n",
" P31 | \n",
" Q10210 | \n",
"
\n",
" \n",
" 7 | \n",
" Q1051699-P279-Q10210-65d32c67-0 | \n",
" Q1051699 | \n",
" P279 | \n",
" Q10210 | \n",
"
\n",
" \n",
" 8 | \n",
" Q1058259-P279-Q10210-e204554a-0 | \n",
" Q1058259 | \n",
" P279 | \n",
" Q10210 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id node1 label node2\n",
"0 Q1350656-P279-Q1007164-7e3ecba9-0 Q1350656 P279 Q1007164\n",
"1 Q20713240-P279-Q1007164-b3112260-0 Q20713240 P279 Q1007164\n",
"2 Q2535077-P279-Q1007164-b2d3684b-0 Q2535077 P279 Q1007164\n",
"3 Q536976-P279-Q1007164-8bf7467b-0 Q536976 P279 Q1007164\n",
"4 Q7991845-P279-Q1007164-18bc383a-0 Q7991845 P279 Q1007164\n",
"5 Q10337004-P186-Q10210-c56dd7ce-0 Q10337004 P186 Q10210\n",
"6 Q10429117-P31-Q10210-d342f061-0 Q10429117 P31 Q10210\n",
"7 Q1051699-P279-Q10210-65d32c67-0 Q1051699 P279 Q10210\n",
"8 Q1058259-P279-Q10210-e204554a-0 Q1058259 P279 Q10210"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines = !zcat < \"$TEMP\"/Q154.node2.tsv.gz | head\n",
"kgtk_to_dataframe(lines)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenate the incoming and outgoing edges to put them in a single file:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"!$kgtk cat -i \"$TEMP\"/Q154.node1.tsv.gz -i \"$TEMP\"/Q154.node2.tsv.gz -o \"$TEMP\"/Q154.claims.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"See how many edges we have:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 31945 131399 1801186\n"
]
}
],
"source": [
"!zcat < \"$TEMP\"/Q154.claims.tsv.gz | wc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Summary of where we are:\n",
"- Computed the list of q-nodes below alcoholic beverage\n",
"- Found all incoming and outgoing edges to these q-nodes; for the new q-nodes we bring in, we have no information, we only have the q-node\n",
"\n",
"We have the q-nodes connected to alcoholic beverages, but so far we don't have the edges of those q-nodes. We need to go one hop out from the q-nodes that we have. We run a query to go one hop out from any q-node in `Q154.claims.tsv`, which will use all the q-nodes in our graph, including the alcoholic beverages for which we already got outgoing edges; no harm done, as we can eliminate duplicates later."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i claims -i \"$TEMP\"/Q154.claims.tsv.gz \\\n",
"--match 'Q154: ()-[]->(n1), claims: (n1)-[l]->(n2)' \\\n",
"--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
"-o \"$TEMP\"/Q154.hop.out.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sanity check, let's take a peek:"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q1000-P1014-693343-9664fa33-0 | \n",
" Q1000 | \n",
" P1014 | \n",
" \"300262648\" | \n",
"
\n",
" \n",
" 1 | \n",
" Q1000-P1036-9bef62-f77ac5cf-0 | \n",
" Q1000 | \n",
" P1036 | \n",
" \"2--6721\" | \n",
"
\n",
" \n",
" 2 | \n",
" Q1000-P1081-0d345f-3a33abf5-0 | \n",
" Q1000 | \n",
" P1081 | \n",
" +0.641 | \n",
"
\n",
" \n",
" 3 | \n",
" Q1000-P1081-0d345f-6da37c02-0 | \n",
" Q1000 | \n",
" P1081 | \n",
" +0.641 | \n",
"
\n",
" \n",
" 4 | \n",
" Q1000-P1081-1100e3-c7631769-0 | \n",
" Q1000 | \n",
" P1081 | \n",
" +0.624 | \n",
"
\n",
" \n",
" 5 | \n",
" Q1000-P1081-1ada51-7c71c229-0 | \n",
" Q1000 | \n",
" P1081 | \n",
" +0.639 | \n",
"
\n",
" \n",
" 6 | \n",
" Q1000-P1081-345681-88a99cab-0 | \n",
" Q1000 | \n",
" P1081 | \n",
" +0.702 | \n",
"
\n",
" \n",
" 7 | \n",
" Q1000-P1081-347db1-da0e5e03-0 | \n",
" Q1000 | \n",
" P1081 | \n",
" +0.637 | \n",
"
\n",
" \n",
" 8 | \n",
" Q1000-P1081-419245-b03a8b59-0 | \n",
" Q1000 | \n",
" P1081 | \n",
" +0.647 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id node1 label node2\n",
"0 Q1000-P1014-693343-9664fa33-0 Q1000 P1014 \"300262648\"\n",
"1 Q1000-P1036-9bef62-f77ac5cf-0 Q1000 P1036 \"2--6721\"\n",
"2 Q1000-P1081-0d345f-3a33abf5-0 Q1000 P1081 +0.641\n",
"3 Q1000-P1081-0d345f-6da37c02-0 Q1000 P1081 +0.641\n",
"4 Q1000-P1081-1100e3-c7631769-0 Q1000 P1081 +0.624\n",
"5 Q1000-P1081-1ada51-7c71c229-0 Q1000 P1081 +0.639\n",
"6 Q1000-P1081-345681-88a99cab-0 Q1000 P1081 +0.702\n",
"7 Q1000-P1081-347db1-da0e5e03-0 Q1000 P1081 +0.637\n",
"8 Q1000-P1081-419245-b03a8b59-0 Q1000 P1081 +0.647"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines = !zcat < \"$TEMP\"/Q154.hop.out.tsv.gz | head \n",
"kgtk_to_dataframe(lines)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's consolidate our edge files into one larger file. We use compact to remove duplicates and sort to keep edges for the same subject together:"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"!$kgtk cat -i \"$TEMP\"/Q154.claims.tsv.gz -i \"$TEMP\"/Q154.hop.out.tsv.gz \\\n",
"/ compact \\\n",
"/ sort2 \\\n",
"-o \"$TEMP\"/Q154.edges.1.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"See how many edges we have:"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 159073 655180 8549211\n"
]
}
],
"source": [
"!zcat < \"$TEMP\"/Q154.edges.1.tsv.gz | wc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Take a peek:"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" P1389-P1855-Q1109662-9e2ef218-0 | \n",
" P1389 | \n",
" P1855 | \n",
" Q1109662 | \n",
"
\n",
" \n",
" 1 | \n",
" P1582-P1855-Q17329207-f4ef508d-0 | \n",
" P1582 | \n",
" P1855 | \n",
" Q17329207 | \n",
"
\n",
" \n",
" 2 | \n",
" P2581-P1855-Q7639844-08b3a4c7-0 | \n",
" P2581 | \n",
" P1855 | \n",
" Q7639844 | \n",
"
\n",
" \n",
" 3 | \n",
" P2665-P1855-Q1067702-402a80a9-0 | \n",
" P2665 | \n",
" P1855 | \n",
" Q1067702 | \n",
"
\n",
" \n",
" 4 | \n",
" P2665-P1855-Q170210-30d44f0b-0 | \n",
" P2665 | \n",
" P1855 | \n",
" Q170210 | \n",
"
\n",
" \n",
" 5 | \n",
" P5420-P1855-Q44-209cffb1-0 | \n",
" P5420 | \n",
" P1855 | \n",
" Q44 | \n",
"
\n",
" \n",
" 6 | \n",
" P5420-P1855-Q722338-73d7be75-0 | \n",
" P5420 | \n",
" P1855 | \n",
" Q722338 | \n",
"
\n",
" \n",
" 7 | \n",
" P5471-P1855-Q44-6c38949b-0 | \n",
" P5471 | \n",
" P1855 | \n",
" Q44 | \n",
"
\n",
" \n",
" 8 | \n",
" P6088-P1855-Q1543214-3d934541-0 | \n",
" P6088 | \n",
" P1855 | \n",
" Q1543214 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id node1 label node2\n",
"0 P1389-P1855-Q1109662-9e2ef218-0 P1389 P1855 Q1109662\n",
"1 P1582-P1855-Q17329207-f4ef508d-0 P1582 P1855 Q17329207\n",
"2 P2581-P1855-Q7639844-08b3a4c7-0 P2581 P1855 Q7639844\n",
"3 P2665-P1855-Q1067702-402a80a9-0 P2665 P1855 Q1067702\n",
"4 P2665-P1855-Q170210-30d44f0b-0 P2665 P1855 Q170210\n",
"5 P5420-P1855-Q44-209cffb1-0 P5420 P1855 Q44\n",
"6 P5420-P1855-Q722338-73d7be75-0 P5420 P1855 Q722338\n",
"7 P5471-P1855-Q44-6c38949b-0 P5471 P1855 Q44\n",
"8 P6088-P1855-Q1543214-3d934541-0 P6088 P1855 Q1543214"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines = !zcat < \"$TEMP\"/Q154.edges.1.tsv.gz | head \n",
"kgtk_to_dataframe(lines)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 3: get the ontology\n",
"Once we have all the alcoholic beverages, we want to get the upper ontology of all the classes used, so that every class in our KG has a path to the root of the ontology. For example, first go to `drink` (`Q40050`), then to `liquid` (`Q11435`), then `fluid` (`Q102205`) and so on until we reach `entity` (`Q35120`).\n",
"\n",
"To do this, we need to get all the `isa` of all items in our graph, then get `P279star` so we get the list of all classes that these items descend from. Finally we need to get all the `P279` edges between them."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i \"$TEMP\"/Q154.edges.1.tsv.gz -i p279star -i isa \\\n",
"--match 'Q154: (n1)-[]->(), isa: (n1)-[]->(n2), p279star: (n2)-[]->(class)' \\\n",
"--return 'distinct class as node1' \\\n",
"-o \"$TEMP\"/Q154.classes.tsv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"See how many classes we have in the upper ontology for the entities in our graph:"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2791 2791 24573 /Users/pedroszekely/Downloads/kgtk-tutorial/temp/Q154.classes.tsv\n"
]
}
],
"source": [
"!wc \"$TEMP\"/Q154.classes.tsv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check that `fluid` (`Q102205`) is listed in the classes:"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Q102205\n"
]
}
],
"source": [
"!grep Q102205 \"$TEMP\"/Q154.classes.tsv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now use the `derived.P279.tsv` file to get the `P279` edges that connect a class to its superclass."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i \"$TEMP\"/Q154.classes.tsv -i \"$P279\" --as \"p279\" \\\n",
"--match 'Q154: (class)-[]->(), p279: (class)-[l]->(super)' \\\n",
"--return 'distinct l as id, class as node1, l.label as label, super as node2' \\\n",
"-o \"$TEMP\"/Q154.P279.tsv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"See how many `P279` edges are in the upper ontology; we will take care of potential duplicates at a final cleanup step:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 4428 17712 245148 /Users/pedroszekely/Downloads/kgtk-tutorial/temp/Q154.P279.tsv\n"
]
}
],
"source": [
"!wc \"$TEMP\"/Q154.P279.tsv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We see several q-nodes below `entity` (`Q35120`), a good indication that we computed the upper ontology correctly:"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Q488383-P279-Q35120-5fad2ad7-0 Q488383 P279 Q35120\n",
"Q58415929-P279-Q35120-75659d0c-0 Q58415929 P279 Q35120\n",
"Q99527517-P279-Q35120-562a6511-0 Q99527517 P279 Q35120\n",
"Q16686448-P279-Q35120-674edbf9-0 Q16686448 P279 Q35120\n",
"Q23958946-P279-Q35120-70a9ed90-0 Q23958946 P279 Q35120\n"
]
}
],
"source": [
"!grep Q35120 \"$TEMP\"/Q154.P279.tsv | head -5 | column -t -s $'\\t'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's consolidate the edges again:"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"!$kgtk cat -i \"$TEMP\"/Q154.edges.1.tsv.gz -i \"$TEMP\"/Q154.P279.tsv \\\n",
"/ compact \\\n",
"/ sort2 \\\n",
"-o \"$TEMP\"/Q154.edges.2.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The number of edges is growing:"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 162839 670244 8758729\n"
]
}
],
"source": [
"!zcat < \"$TEMP\"/Q154.edges.2.tsv.gz | wc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Summary:\n",
"- We have the instances of alcoholic beverages\n",
"- We added incoming and outgoing edges\n",
"- For all the q-nodes in the previous step, we went one hop forward\n",
"- We got the upper ontology"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 4: get the edges for properties\n",
"The properties are also items in Wikidata, so let's collect them all and get their edges."
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i \"$TEMP\"/Q154.edges.2.tsv.gz \\\n",
"--match '()-[l {label: property}]->()' \\\n",
"--return 'distinct property as node1' \\\n",
"-o \"$TEMP\"/Q154.properties.tsv"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"node1\n",
"P10\n",
"P1001\n",
"P1004\n",
"P1005\n",
"P101\n",
"P1014\n",
"P1015\n",
"P1017\n",
"P1019\n"
]
}
],
"source": [
"!head \"$TEMP\"/Q154.properties.tsv | column -t -s $'\\t'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's get the edges of these properties:"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i claims -i \"$TEMP\"/Q154.properties.tsv \\\n",
"--match 'Q154: (p)-[]->(), claims: (p)-[l]->(n2)' \\\n",
"--return 'distinct l as id, p as node1, l.label as label, n2 as node2' \\\n",
"-o \"$TEMP\"/Q154.properties.edges.tsv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the resulting file, `node1` is a property and now we have data about them:"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" P10-P1628-32b85d-7927ece6-0 | \n",
" P10 | \n",
" P1628 | \n",
" \"http://www.w3.org/2006/vcard/ns#Video\" | \n",
"
\n",
" \n",
" 1 | \n",
" P10-P1628-acf60d-b8950832-0 | \n",
" P10 | \n",
" P1628 | \n",
" \"https://schema.org/video\" | \n",
"
\n",
" \n",
" 2 | \n",
" P10-P1629-Q34508-bcc39400-0 | \n",
" P10 | \n",
" P1629 | \n",
" Q34508 | \n",
"
\n",
" \n",
" 3 | \n",
" P10-P1659-P1651-c4068028-0 | \n",
" P10 | \n",
" P1659 | \n",
" P1651 | \n",
"
\n",
" \n",
" 4 | \n",
" P10-P1659-P18-5e4b9c4f-0 | \n",
" P10 | \n",
" P1659 | \n",
" P18 | \n",
"
\n",
" \n",
" 5 | \n",
" P10-P1659-P4238-d21d1ac0-0 | \n",
" P10 | \n",
" P1659 | \n",
" P4238 | \n",
"
\n",
" \n",
" 6 | \n",
" P10-P1659-P51-86aca4c5-0 | \n",
" P10 | \n",
" P1659 | \n",
" P51 | \n",
"
\n",
" \n",
" 7 | \n",
" P10-P1855-Q7378-555592a4-0 | \n",
" P10 | \n",
" P1855 | \n",
" Q7378 | \n",
"
\n",
" \n",
" 8 | \n",
" P10-P2302-Q21502404-d012aef4-0 | \n",
" P10 | \n",
" P2302 | \n",
" Q21502404 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id node1 label \\\n",
"0 P10-P1628-32b85d-7927ece6-0 P10 P1628 \n",
"1 P10-P1628-acf60d-b8950832-0 P10 P1628 \n",
"2 P10-P1629-Q34508-bcc39400-0 P10 P1629 \n",
"3 P10-P1659-P1651-c4068028-0 P10 P1659 \n",
"4 P10-P1659-P18-5e4b9c4f-0 P10 P1659 \n",
"5 P10-P1659-P4238-d21d1ac0-0 P10 P1659 \n",
"6 P10-P1659-P51-86aca4c5-0 P10 P1659 \n",
"7 P10-P1855-Q7378-555592a4-0 P10 P1855 \n",
"8 P10-P2302-Q21502404-d012aef4-0 P10 P2302 \n",
"\n",
" node2 \n",
"0 \"http://www.w3.org/2006/vcard/ns#Video\" \n",
"1 \"https://schema.org/video\" \n",
"2 Q34508 \n",
"3 P1651 \n",
"4 P18 \n",
"5 P4238 \n",
"6 P51 \n",
"7 Q7378 \n",
"8 Q21502404 "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines = !head \"$TEMP\"/Q154.properties.edges.tsv \n",
"kgtk_to_dataframe(lines)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's consolidate the edges again:"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"!$kgtk cat -i \"$TEMP\"/Q154.edges.2.tsv.gz -i \"$TEMP\"/Q154.properties.edges.tsv \\\n",
"/ compact \\\n",
"/ sort2 \\\n",
"-o \"$TEMP\"/Q154.edges.3.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The number of edges grew a bit:"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 185031 763294 10102000\n"
]
}
],
"source": [
"!zcat < \"$TEMP\"/Q154.edges.3.tsv.gz | wc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Summary:\n",
"- We have the instances of alcoholic beverages\n",
"- We added incoming and outgoing edges\n",
"- For all the q-nodes in the previous step, we went one hop forward\n",
"- We got the upper ontology\n",
"- And we have the edges on all the properties being used"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 5: get edges between any two q-nodes in our graph\n",
"As we added q-nodes to our graph, it is possible that there exist edges between these q-nodes that we didn't get when doing the one hop out. To ensure completeness, we get all edges from Wikidata between any two nodes in our Q154 graph. The following query ensures that we are not missing any edges between the nodes we added to our KG. Note that this query is expensive as it needs to find edges between any two nodes. This can be done in kypher, but would be impossible in SPARQL as you would surely get a time-out."
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i \"$TEMP\"/Q154.edges.3.tsv.gz -i \"$ITEM\" --as items \\\n",
"--match 'Q154: (n1)-[]->(n2), item: (n1)-[l]->(n2)' \\\n",
"--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
"-o \"$TEMP\"/Q154.edges.complete.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenate the files again:"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"!$kgtk cat -i \"$TEMP\"/Q154.edges.3.tsv.gz -i \"$TEMP\"/Q154.edges.complete.tsv.gz \\\n",
"/ compact \\\n",
"/ sort2 \\\n",
"-o \"$TEMP\"/Q154.edges.4.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now have all the edges we want:"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 185238 764122 10113585\n"
]
}
],
"source": [
"!zcat < \"$TEMP\"/Q154.edges.4.tsv.gz | wc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 6: get the labels, aliases and descriptions of all the items in our KG\n",
"Before we start, let's define an environment variable to hold the final edges file so that if we change our mind later, we can update it without having to change the commands below."
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"os.environ[\"Q154GRAPH\"] = os.environ[\"TEMP\"] + \"/Q154.edges.4.tsv.gz\""
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/Users/pedroszekely/Downloads/kgtk-tutorial/temp/Q154.edges.4.tsv.gz\n"
]
}
],
"source": [
"!ls \"$Q154GRAPH\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Get the labels of the `node1` nodes"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i \"$Q154GRAPH\" -i labels \\\n",
"--match 'Q154: (n1)-[]->(), label: (n1)-[l]->(n2)' \\\n",
"--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
"-o \"$TEMP\"/Q154.label.node1.tsv.gz"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" P10-label-en | \n",
" P10 | \n",
" label | \n",
" 'video'@en | \n",
"
\n",
" \n",
" 1 | \n",
" P1001-label-en | \n",
" P1001 | \n",
" label | \n",
" 'applies to jurisdiction'@en | \n",
"
\n",
" \n",
" 2 | \n",
" P1004-label-en | \n",
" P1004 | \n",
" label | \n",
" 'MusicBrainz place ID'@en | \n",
"
\n",
" \n",
" 3 | \n",
" P1005-label-en | \n",
" P1005 | \n",
" label | \n",
" 'Portuguese National Library ID'@en | \n",
"
\n",
" \n",
" 4 | \n",
" P101-label-en | \n",
" P101 | \n",
" label | \n",
" 'field of work'@en | \n",
"
\n",
" \n",
" 5 | \n",
" P1014-label-en | \n",
" P1014 | \n",
" label | \n",
" 'Art & Architecture Thesaurus ID'@en | \n",
"
\n",
" \n",
" 6 | \n",
" P1015-label-en | \n",
" P1015 | \n",
" label | \n",
" 'BIBSYS ID'@en | \n",
"
\n",
" \n",
" 7 | \n",
" P1017-label-en | \n",
" P1017 | \n",
" label | \n",
" 'Vatican Library ID'@en | \n",
"
\n",
" \n",
" 8 | \n",
" P1019-label-en | \n",
" P1019 | \n",
" label | \n",
" 'web feed URL'@en | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id node1 label node2\n",
"0 P10-label-en P10 label 'video'@en\n",
"1 P1001-label-en P1001 label 'applies to jurisdiction'@en\n",
"2 P1004-label-en P1004 label 'MusicBrainz place ID'@en\n",
"3 P1005-label-en P1005 label 'Portuguese National Library ID'@en\n",
"4 P101-label-en P101 label 'field of work'@en\n",
"5 P1014-label-en P1014 label 'Art & Architecture Thesaurus ID'@en\n",
"6 P1015-label-en P1015 label 'BIBSYS ID'@en\n",
"7 P1017-label-en P1017 label 'Vatican Library ID'@en\n",
"8 P1019-label-en P1019 label 'web feed URL'@en"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines = !zcat < \"$TEMP\"/Q154.label.node1.tsv.gz | head \n",
"kgtk_to_dataframe(lines)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the labels of the `node2` nodes"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i \"$Q154GRAPH\" -i labels \\\n",
"--match 'Q154: ()-[]->(n2), label: (n2)-[l]->(n3)' \\\n",
"--return 'distinct l as id, n2 as node1, l.label as label, n3 as node2' \\\n",
"-o \"$TEMP\"/Q154.label.node2.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenate the two label files"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"!$kgtk cat -i \"$TEMP\"/Q154.label.node1.tsv.gz -i \"$TEMP\"/Q154.label.node2.tsv.gz \\\n",
"-o \"$TEMP\"/labels.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Get the aliases of `node1` nodes"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i \"$Q154GRAPH\" -i \"$ALIAS\" --as aliases \\\n",
"--match 'Q154: (n1)-[]->(), alias: (n1)-[l]->(n2)' \\\n",
"--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
"-o \"$TEMP\"/Q154.alias.node1.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the aliases of `node2` nodes"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i \"$Q154GRAPH\" -i aliases \\\n",
"--match 'Q154: ()-[]->(n2), alias: (n2)-[l]->(n3)' \\\n",
"--return 'distinct l as id, n2 as node1, l.label as label, n3 as node2' \\\n",
"-o \"$TEMP\"/Q154.alias.node2.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenate the two alias files"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"!$kgtk cat -i \"$TEMP\"/Q154.alias.node1.tsv.gz -i \"$TEMP\"/Q154.alias.node2.tsv.gz \\\n",
"-o \"$TEMP\"/alias.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Get the descriptions of `node1` nodes"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i \"$Q154GRAPH\" -i \"$DESCRIPTION\" --as descriptions \\\n",
"--match 'Q154: (n1)-[]->(), description: (n1)-[l]->(n2)' \\\n",
"--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
"-o \"$TEMP\"/Q154.description.node1.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the descriptions of `node2` nodes"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i \"$Q154GRAPH\" -i descriptions \\\n",
"--match 'Q154: ()-[]->(n2), description: (n2)-[l]->(n3)' \\\n",
"--return 'distinct l as id, n2 as node1, l.label as label, n3 as node2' \\\n",
"-o \"$TEMP\"/Q154.description.node2.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenate the two description files"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"!$kgtk cat -i \"$TEMP\"/Q154.description.node1.tsv.gz -i \"$TEMP\"/Q154.description.node2.tsv.gz \\\n",
"-o \"$TEMP\"/Q154.description.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 7: get the qualifiers"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"!$kypher -i \"$Q154GRAPH\" -i qualifiers \\\n",
"--match 'Q154: ()-[l]->(), qual: (l)-[lq]->(n2)' \\\n",
"--return 'lq as id, l as node1, lq.label as label, n2 as node2' \\\n",
"-o \"$OUT\"/Q154.qualifiers.tsv.gz"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" P10-P1855-Q7378-555592a4-0-P10-8a982d-0 | \n",
" P10-P1855-Q7378-555592a4-0 | \n",
" P10 | \n",
" \"Elephants Dream (2006).webm\" | \n",
"
\n",
" \n",
" 1 | \n",
" P10-P2302-Q21502404-d012aef4-0-P1793-f4c2ed-0 | \n",
" P10-P2302-Q21502404-d012aef4-0 | \n",
" P1793 | \n",
" \"(?i).+\\\\\\\\.(webm\\\\|ogv\\\\|ogg\\\\|gif)\" | \n",
"
\n",
" \n",
" 2 | \n",
" P10-P2302-Q21502404-d012aef4-0-P2316-Q21502408-0 | \n",
" P10-P2302-Q21502404-d012aef4-0 | \n",
" P2316 | \n",
" Q21502408 | \n",
"
\n",
" \n",
" 3 | \n",
" P10-P2302-Q21502404-d012aef4-0-P2916-cb0917-0 | \n",
" P10-P2302-Q21502404-d012aef4-0 | \n",
" P2916 | \n",
" 'filename with extension: webm, ogg, ogv, or g... | \n",
"
\n",
" \n",
" 4 | \n",
" P10-P2302-Q21510851-5224fe0b-0-P2306-P175-0 | \n",
" P10-P2302-Q21510851-5224fe0b-0 | \n",
" P2306 | \n",
" P175 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 93989 | \n",
" Q997294-P421-Q6655-f4ed577c-0-P1264-Q1777301-0 | \n",
" Q997294-P421-Q6655-f4ed577c-0 | \n",
" P1264 | \n",
" Q1777301 | \n",
"
\n",
" \n",
" 93990 | \n",
" Q997294-P421-Q6723-7c4a7768-0-P1264-Q36669-0 | \n",
" Q997294-P421-Q6723-7c4a7768-0 | \n",
" P1264 | \n",
" Q36669 | \n",
"
\n",
" \n",
" 93991 | \n",
" Q997294-P443-cb16c6-94e4e274-0-P407-Q12107-0 | \n",
" Q997294-P443-cb16c6-94e4e274-0 | \n",
" P407 | \n",
" Q12107 | \n",
"
\n",
" \n",
" 93992 | \n",
" Q997294-P485-Q18785452-a7748618-0-P217-2aa283-0 | \n",
" Q997294-P485-Q18785452-a7748618-0 | \n",
" P217 | \n",
" \"1200 E DEPOT\" | \n",
"
\n",
" \n",
" 93993 | \n",
" Q997294-P7938-Q1345936-ebcb710a-0-P580-6eae03-0 | \n",
" Q997294-P7938-Q1345936-ebcb710a-0 | \n",
" P580 | \n",
" ^2015-03-22T00:00:00Z/11 | \n",
"
\n",
" \n",
"
\n",
"
93994 rows × 4 columns
\n",
"
"
],
"text/plain": [
" id \\\n",
"0 P10-P1855-Q7378-555592a4-0-P10-8a982d-0 \n",
"1 P10-P2302-Q21502404-d012aef4-0-P1793-f4c2ed-0 \n",
"2 P10-P2302-Q21502404-d012aef4-0-P2316-Q21502408-0 \n",
"3 P10-P2302-Q21502404-d012aef4-0-P2916-cb0917-0 \n",
"4 P10-P2302-Q21510851-5224fe0b-0-P2306-P175-0 \n",
"... ... \n",
"93989 Q997294-P421-Q6655-f4ed577c-0-P1264-Q1777301-0 \n",
"93990 Q997294-P421-Q6723-7c4a7768-0-P1264-Q36669-0 \n",
"93991 Q997294-P443-cb16c6-94e4e274-0-P407-Q12107-0 \n",
"93992 Q997294-P485-Q18785452-a7748618-0-P217-2aa283-0 \n",
"93993 Q997294-P7938-Q1345936-ebcb710a-0-P580-6eae03-0 \n",
"\n",
" node1 label \\\n",
"0 P10-P1855-Q7378-555592a4-0 P10 \n",
"1 P10-P2302-Q21502404-d012aef4-0 P1793 \n",
"2 P10-P2302-Q21502404-d012aef4-0 P2316 \n",
"3 P10-P2302-Q21502404-d012aef4-0 P2916 \n",
"4 P10-P2302-Q21510851-5224fe0b-0 P2306 \n",
"... ... ... \n",
"93989 Q997294-P421-Q6655-f4ed577c-0 P1264 \n",
"93990 Q997294-P421-Q6723-7c4a7768-0 P1264 \n",
"93991 Q997294-P443-cb16c6-94e4e274-0 P407 \n",
"93992 Q997294-P485-Q18785452-a7748618-0 P217 \n",
"93993 Q997294-P7938-Q1345936-ebcb710a-0 P580 \n",
"\n",
" node2 \n",
"0 \"Elephants Dream (2006).webm\" \n",
"1 \"(?i).+\\\\\\\\.(webm\\\\|ogv\\\\|ogg\\\\|gif)\" \n",
"2 Q21502408 \n",
"3 'filename with extension: webm, ogg, ogv, or g... \n",
"4 P175 \n",
"... ... \n",
"93989 Q1777301 \n",
"93990 Q36669 \n",
"93991 Q12107 \n",
"93992 \"1200 E DEPOT\" \n",
"93993 ^2015-03-22T00:00:00Z/11 \n",
"\n",
"[93994 rows x 4 columns]"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result = !zcat < \"$OUT\"/Q154.qualifiers.tsv.gz \n",
"kgtk_to_dataframe(result)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 8: consolidate all the files"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### KGTK extensions to Wikidata\n",
"KGTK defines extensions to Wikidata, and we want to include those in our graph, so we download them from GitHub:"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2021-01-24 13:11:35-- https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 2617 (2.6K) [text/plain]\n",
"Saving to: ‘/Users/pedroszekely/Downloads/kgtk-tutorial/temp/kgtk.properties.tsv’\n",
"\n",
"/Users/pedroszekely 100%[===================>] 2.56K --.-KB/s in 0s \n",
"\n",
"2021-01-24 13:11:40 (12.2 MB/s) - ‘/Users/pedroszekely/Downloads/kgtk-tutorial/temp/kgtk.properties.tsv’ saved [2617/2617]\n",
"\n"
]
}
],
"source": [
"!wget https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv -O \"$TEMP\"/kgtk.properties.tsv"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" isa | \n",
" label | \n",
" \"is a\"@en | \n",
" isa-label-e79b73 | \n",
"
\n",
" \n",
" 1 | \n",
" isa | \n",
" alias | \n",
" \"isa\"@en | \n",
" isa-alias-7773c5 | \n",
"
\n",
" \n",
" 2 | \n",
" isa | \n",
" description | \n",
" \"Instance or subclass relationship\"@en | \n",
" isa-description-0b5cdc | \n",
"
\n",
" \n",
" 3 | \n",
" isa | \n",
" P31 | \n",
" Q18616576 | \n",
" isa-P31-Q18616576 | \n",
"
\n",
" \n",
" 4 | \n",
" isa | \n",
" P31 | \n",
" Q28326461 | \n",
" isa-P31-Q28326461 | \n",
"
\n",
" \n",
" 5 | \n",
" isa | \n",
" P31 | \n",
" Q18647519 | \n",
" isa-P31-Q18647519 | \n",
"
\n",
" \n",
" 6 | \n",
" isa | \n",
" data_type | \n",
" wikibase-item | \n",
" isa-data_type-643cc9 | \n",
"
\n",
" \n",
" 7 | \n",
" P279star | \n",
" label | \n",
" \"is a\"@en | \n",
" P279star-label-e79b73 | \n",
"
\n",
" \n",
" 8 | \n",
" P279star | \n",
" alias | \n",
" \"isa\"@en | \n",
" P279star-alias-7773c5 | \n",
"
\n",
" \n",
" 9 | \n",
" P279star | \n",
" description | \n",
" \"Instance or subclass relationship\"@en | \n",
" P279star-description-0b5cdc | \n",
"
\n",
" \n",
" 10 | \n",
" P279star | \n",
" P31 | \n",
" Q18616576 | \n",
" P279star-P31-Q18616576 | \n",
"
\n",
" \n",
" 11 | \n",
" P279star | \n",
" P31 | \n",
" Q28326461 | \n",
" P279star-P31-Q28326461 | \n",
"
\n",
" \n",
" 12 | \n",
" P279star | \n",
" P31 | \n",
" Q18647519 | \n",
" P279star-P31-Q18647519 | \n",
"
\n",
" \n",
" 13 | \n",
" P279star | \n",
" P31 | \n",
" Q18647521 | \n",
" P279star-P31-Q18647521 | \n",
"
\n",
" \n",
" 14 | \n",
" P279star | \n",
" data_type | \n",
" wikibase-item | \n",
" P279star-data_type-643cc9 | \n",
"
\n",
" \n",
" 15 | \n",
" directed_pagerank | \n",
" label | \n",
" \"pagerank\"@en | \n",
" directed_pagerank-label-d3bd07 | \n",
"
\n",
" \n",
" 16 | \n",
" directed_pagerank | \n",
" alias | \n",
" \"page rank\"@en | \n",
" directed_pagerank-alias-9d4733 | \n",
"
\n",
" \n",
" 17 | \n",
" directed_pagerank | \n",
" description | \n",
" \"pagerank canculated on the directed graph\"@en | \n",
" directed_pagerank-description-b62fff | \n",
"
\n",
" \n",
" 18 | \n",
" directed_pagerank | \n",
" P31 | \n",
" Q18616576 | \n",
" directed_pagerank-P31-Q18616576 | \n",
"
\n",
" \n",
" 19 | \n",
" directed_pagerank | \n",
" P31 | \n",
" Q47512165 | \n",
" directed_pagerank-P31-Q47512165 | \n",
"
\n",
" \n",
" 20 | \n",
" directed_pagerank | \n",
" P1629 | \n",
" Q184316 | \n",
" directed_pagerank-P1629-Q184316 | \n",
"
\n",
" \n",
" 21 | \n",
" directed_pagerank | \n",
" data_type | \n",
" quantity | \n",
" directed_pagerank-data_type-1a7b30 | \n",
"
\n",
" \n",
" 22 | \n",
" undirected_pagerank | \n",
" label | \n",
" \"pagerank\"@en | \n",
" undirected_pagerank-label-d3bd07 | \n",
"
\n",
" \n",
" 23 | \n",
" undirected_pagerank | \n",
" alias | \n",
" \"page rank\"@en | \n",
" undirected_pagerank-alias-9d4733 | \n",
"
\n",
" \n",
" 24 | \n",
" undirected_pagerank | \n",
" description | \n",
" \"pagerank canculated on the undirected graph\"@en | \n",
" undirected_pagerank-description-ee8b1c | \n",
"
\n",
" \n",
" 25 | \n",
" undirected_pagerank | \n",
" P31 | \n",
" Q18616576 | \n",
" undirected_pagerank-P31-Q18616576 | \n",
"
\n",
" \n",
" 26 | \n",
" undirected_pagerank | \n",
" P31 | \n",
" Q47512165 | \n",
" undirected_pagerank-P31-Q47512165 | \n",
"
\n",
" \n",
" 27 | \n",
" undirected_pagerank | \n",
" P1629 | \n",
" Q184316 | \n",
" undirected_pagerank-P1629-Q184316 | \n",
"
\n",
" \n",
" 28 | \n",
" undirected_pagerank | \n",
" data_type | \n",
" quantity | \n",
" undirected_pagerank-data_type-1a7b30 | \n",
"
\n",
" \n",
" 29 | \n",
" in_degree | \n",
" label | \n",
" \"in degree\"@en | \n",
" in_degree-label-aa295d | \n",
"
\n",
" \n",
" 30 | \n",
" in_degree | \n",
" alias | \n",
" \"degree\"@en | \n",
" in_degree-alias-b5846a | \n",
"
\n",
" \n",
" 31 | \n",
" in_degree | \n",
" description | \n",
" \"in degree of a node in a graph\"@en | \n",
" in_degree-description-642be1 | \n",
"
\n",
" \n",
" 32 | \n",
" in_degree | \n",
" P31 | \n",
" Q18616576 | \n",
" in_degree-P31-Q18616576 | \n",
"
\n",
" \n",
" 33 | \n",
" in_degree | \n",
" P31 | \n",
" Q47512165 | \n",
" in_degree-P31-Q47512165 | \n",
"
\n",
" \n",
" 34 | \n",
" in_degree | \n",
" P1629 | \n",
" Q383444 | \n",
" in_degree-P1629-Q383444 | \n",
"
\n",
" \n",
" 35 | \n",
" in_degree | \n",
" data_type | \n",
" quantity | \n",
" in_degree-data_type-1a7b30 | \n",
"
\n",
" \n",
" 36 | \n",
" out_degree | \n",
" label | \n",
" \"out degree\"@en | \n",
" out_degree-label-6deeae | \n",
"
\n",
" \n",
" 37 | \n",
" out_degree | \n",
" alias | \n",
" \"degree\"@en | \n",
" out_degree-alias-b5846a | \n",
"
\n",
" \n",
" 38 | \n",
" out_degree | \n",
" description | \n",
" \"out degree of a node in a graph\"@en | \n",
" out_degree-description-c7e312 | \n",
"
\n",
" \n",
" 39 | \n",
" out_degree | \n",
" P31 | \n",
" Q18616576 | \n",
" out_degree-P31-Q18616576 | \n",
"
\n",
" \n",
" 40 | \n",
" out_degree | \n",
" P31 | \n",
" Q47512165 | \n",
" out_degree-P31-Q47512165 | \n",
"
\n",
" \n",
" 41 | \n",
" out_degree | \n",
" P1629 | \n",
" Q383444 | \n",
" out_degree-P1629-Q383444 | \n",
"
\n",
" \n",
" 42 | \n",
" out_degree | \n",
" data_type | \n",
" quantity | \n",
" out_degree-data_type-1a7b30 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label \\\n",
"0 isa label \n",
"1 isa alias \n",
"2 isa description \n",
"3 isa P31 \n",
"4 isa P31 \n",
"5 isa P31 \n",
"6 isa data_type \n",
"7 P279star label \n",
"8 P279star alias \n",
"9 P279star description \n",
"10 P279star P31 \n",
"11 P279star P31 \n",
"12 P279star P31 \n",
"13 P279star P31 \n",
"14 P279star data_type \n",
"15 directed_pagerank label \n",
"16 directed_pagerank alias \n",
"17 directed_pagerank description \n",
"18 directed_pagerank P31 \n",
"19 directed_pagerank P31 \n",
"20 directed_pagerank P1629 \n",
"21 directed_pagerank data_type \n",
"22 undirected_pagerank label \n",
"23 undirected_pagerank alias \n",
"24 undirected_pagerank description \n",
"25 undirected_pagerank P31 \n",
"26 undirected_pagerank P31 \n",
"27 undirected_pagerank P1629 \n",
"28 undirected_pagerank data_type \n",
"29 in_degree label \n",
"30 in_degree alias \n",
"31 in_degree description \n",
"32 in_degree P31 \n",
"33 in_degree P31 \n",
"34 in_degree P1629 \n",
"35 in_degree data_type \n",
"36 out_degree label \n",
"37 out_degree alias \n",
"38 out_degree description \n",
"39 out_degree P31 \n",
"40 out_degree P31 \n",
"41 out_degree P1629 \n",
"42 out_degree data_type \n",
"\n",
" node2 \\\n",
"0 \"is a\"@en \n",
"1 \"isa\"@en \n",
"2 \"Instance or subclass relationship\"@en \n",
"3 Q18616576 \n",
"4 Q28326461 \n",
"5 Q18647519 \n",
"6 wikibase-item \n",
"7 \"is a\"@en \n",
"8 \"isa\"@en \n",
"9 \"Instance or subclass relationship\"@en \n",
"10 Q18616576 \n",
"11 Q28326461 \n",
"12 Q18647519 \n",
"13 Q18647521 \n",
"14 wikibase-item \n",
"15 \"pagerank\"@en \n",
"16 \"page rank\"@en \n",
"17 \"pagerank canculated on the directed graph\"@en \n",
"18 Q18616576 \n",
"19 Q47512165 \n",
"20 Q184316 \n",
"21 quantity \n",
"22 \"pagerank\"@en \n",
"23 \"page rank\"@en \n",
"24 \"pagerank canculated on the undirected graph\"@en \n",
"25 Q18616576 \n",
"26 Q47512165 \n",
"27 Q184316 \n",
"28 quantity \n",
"29 \"in degree\"@en \n",
"30 \"degree\"@en \n",
"31 \"in degree of a node in a graph\"@en \n",
"32 Q18616576 \n",
"33 Q47512165 \n",
"34 Q383444 \n",
"35 quantity \n",
"36 \"out degree\"@en \n",
"37 \"degree\"@en \n",
"38 \"out degree of a node in a graph\"@en \n",
"39 Q18616576 \n",
"40 Q47512165 \n",
"41 Q383444 \n",
"42 quantity \n",
"\n",
" id \n",
"0 isa-label-e79b73 \n",
"1 isa-alias-7773c5 \n",
"2 isa-description-0b5cdc \n",
"3 isa-P31-Q18616576 \n",
"4 isa-P31-Q28326461 \n",
"5 isa-P31-Q18647519 \n",
"6 isa-data_type-643cc9 \n",
"7 P279star-label-e79b73 \n",
"8 P279star-alias-7773c5 \n",
"9 P279star-description-0b5cdc \n",
"10 P279star-P31-Q18616576 \n",
"11 P279star-P31-Q28326461 \n",
"12 P279star-P31-Q18647519 \n",
"13 P279star-P31-Q18647521 \n",
"14 P279star-data_type-643cc9 \n",
"15 directed_pagerank-label-d3bd07 \n",
"16 directed_pagerank-alias-9d4733 \n",
"17 directed_pagerank-description-b62fff \n",
"18 directed_pagerank-P31-Q18616576 \n",
"19 directed_pagerank-P31-Q47512165 \n",
"20 directed_pagerank-P1629-Q184316 \n",
"21 directed_pagerank-data_type-1a7b30 \n",
"22 undirected_pagerank-label-d3bd07 \n",
"23 undirected_pagerank-alias-9d4733 \n",
"24 undirected_pagerank-description-ee8b1c \n",
"25 undirected_pagerank-P31-Q18616576 \n",
"26 undirected_pagerank-P31-Q47512165 \n",
"27 undirected_pagerank-P1629-Q184316 \n",
"28 undirected_pagerank-data_type-1a7b30 \n",
"29 in_degree-label-aa295d \n",
"30 in_degree-alias-b5846a \n",
"31 in_degree-description-642be1 \n",
"32 in_degree-P31-Q18616576 \n",
"33 in_degree-P31-Q47512165 \n",
"34 in_degree-P1629-Q383444 \n",
"35 in_degree-data_type-1a7b30 \n",
"36 out_degree-label-6deeae \n",
"37 out_degree-alias-b5846a \n",
"38 out_degree-description-c7e312 \n",
"39 out_degree-P31-Q18616576 \n",
"40 out_degree-P31-Q47512165 \n",
"41 out_degree-P1629-Q383444 \n",
"42 out_degree-data_type-1a7b30 "
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines = !cat \"$TEMP\"/kgtk.properties.tsv \n",
"kgtk_to_dataframe(lines)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Property datatype\n",
"Wikidata defines data types for properties to specify the type of value expected for each property. We follow this convention, so we include the data types in our KG. \n",
"\n",
"First take a look at the data types defined for Wikidata properties:"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" node2;wikidatatype | \n",
" rank | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" P10-datatype | \n",
" P10 | \n",
" datatype | \n",
" commonsMedia | \n",
" | \n",
" | \n",
"
\n",
" \n",
" 1 | \n",
" P1001-datatype | \n",
" P1001 | \n",
" datatype | \n",
" wikibase-item | \n",
" | \n",
" | \n",
"
\n",
" \n",
" 2 | \n",
" P1003-datatype | \n",
" P1003 | \n",
" datatype | \n",
" external-id | \n",
" | \n",
" | \n",
"
\n",
" \n",
" 3 | \n",
" P1004-datatype | \n",
" P1004 | \n",
" datatype | \n",
" external-id | \n",
" | \n",
" | \n",
"
\n",
" \n",
" 4 | \n",
" P1005-datatype | \n",
" P1005 | \n",
" datatype | \n",
" external-id | \n",
" | \n",
" | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 1503 | \n",
" P981-datatype | \n",
" P981 | \n",
" datatype | \n",
" external-id | \n",
" | \n",
" | \n",
"
\n",
" \n",
" 1504 | \n",
" P982-datatype | \n",
" P982 | \n",
" datatype | \n",
" external-id | \n",
" | \n",
" | \n",
"
\n",
" \n",
" 1505 | \n",
" P984-datatype | \n",
" P984 | \n",
" datatype | \n",
" external-id | \n",
" | \n",
" | \n",
"
\n",
" \n",
" 1506 | \n",
" P989-datatype | \n",
" P989 | \n",
" datatype | \n",
" commonsMedia | \n",
" | \n",
" | \n",
"
\n",
" \n",
" 1507 | \n",
" P998-datatype | \n",
" P998 | \n",
" datatype | \n",
" external-id | \n",
" | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
1508 rows × 6 columns
\n",
"
"
],
"text/plain": [
" id node1 label node2 node2;wikidatatype rank\n",
"0 P10-datatype P10 datatype commonsMedia \n",
"1 P1001-datatype P1001 datatype wikibase-item \n",
"2 P1003-datatype P1003 datatype external-id \n",
"3 P1004-datatype P1004 datatype external-id \n",
"4 P1005-datatype P1005 datatype external-id \n",
"... ... ... ... ... ... ...\n",
"1503 P981-datatype P981 datatype external-id \n",
"1504 P982-datatype P982 datatype external-id \n",
"1505 P984-datatype P984 datatype external-id \n",
"1506 P989-datatype P989 datatype commonsMedia \n",
"1507 P998-datatype P998 datatype external-id \n",
"\n",
"[1508 rows x 6 columns]"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines = !zcat < \"$PROPERTY_DATATYPES\" \n",
"kgtk_to_dataframe(lines)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We filter this file to select the data types for the properties we use in our graph. We don't care about the datatypes for properties we don't have in our graph:"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the datatype edges whose node1 also occurs as node1 of an edge in the Q154 graph.\n",
"!$kypher -i \"$Q154GRAPH\" -i \"$PROPERTY_DATATYPES\" \\\n",
"--match 'Q154: (n1)-[]->(), property: (n1)-[l:datatype]->(n2)' \\\n",
"--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
"-o \"$TEMP\"/Q154.metadata.property.datatype.tsv.gz"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" P10-datatype | \n",
" P10 | \n",
" datatype | \n",
" commonsMedia | \n",
"
\n",
" \n",
" 1 | \n",
" P1001-datatype | \n",
" P1001 | \n",
" datatype | \n",
" wikibase-item | \n",
"
\n",
" \n",
" 2 | \n",
" P1004-datatype | \n",
" P1004 | \n",
" datatype | \n",
" external-id | \n",
"
\n",
" \n",
" 3 | \n",
" P1005-datatype | \n",
" P1005 | \n",
" datatype | \n",
" external-id | \n",
"
\n",
" \n",
" 4 | \n",
" P101-datatype | \n",
" P101 | \n",
" datatype | \n",
" wikibase-item | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 1072 | \n",
" P981-datatype | \n",
" P981 | \n",
" datatype | \n",
" external-id | \n",
"
\n",
" \n",
" 1073 | \n",
" P982-datatype | \n",
" P982 | \n",
" datatype | \n",
" external-id | \n",
"
\n",
" \n",
" 1074 | \n",
" P984-datatype | \n",
" P984 | \n",
" datatype | \n",
" external-id | \n",
"
\n",
" \n",
" 1075 | \n",
" P989-datatype | \n",
" P989 | \n",
" datatype | \n",
" commonsMedia | \n",
"
\n",
" \n",
" 1076 | \n",
" P998-datatype | \n",
" P998 | \n",
" datatype | \n",
" external-id | \n",
"
\n",
" \n",
"
\n",
"
1077 rows × 4 columns
\n",
"
"
],
"text/plain": [
" id node1 label node2\n",
"0 P10-datatype P10 datatype commonsMedia\n",
"1 P1001-datatype P1001 datatype wikibase-item\n",
"2 P1004-datatype P1004 datatype external-id\n",
"3 P1005-datatype P1005 datatype external-id\n",
"4 P101-datatype P101 datatype wikibase-item\n",
"... ... ... ... ...\n",
"1072 P981-datatype P981 datatype external-id\n",
"1073 P982-datatype P982 datatype external-id\n",
"1074 P984-datatype P984 datatype external-id\n",
"1075 P989-datatype P989 datatype commonsMedia\n",
"1076 P998-datatype P998 datatype external-id\n",
"\n",
"[1077 rows x 4 columns]"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines = !zcat < \"$TEMP\"/Q154.metadata.property.datatype.tsv.gz \n",
"kgtk_to_dataframe(lines)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Produce the final `all.tsv` file"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"!$kgtk cat \\\n",
"-i \"$TEMP\"/labels.tsv.gz \\\n",
"-i \"$TEMP\"/alias.tsv.gz \\\n",
"-i \"$TEMP\"/Q154.description.tsv.gz \\\n",
"-i \"$Q154GRAPH\" \\\n",
"-i \"$TEMP\"/kgtk.properties.tsv \\\n",
"-i \"$TEMP\"/Q154.metadata.property.datatype.tsv.gz \\\n",
"/ compact \\\n",
"/ sort2 \\\n",
"-o \"$OUT\"/all.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Our full alcoholic beverage KG in a pandas dataframe"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" P10-P1628-32b85d-7927ece6-0 | \n",
" P10 | \n",
" P1628 | \n",
" \"http://www.w3.org/2006/vcard/ns#Video\" | \n",
"
\n",
" \n",
" 1 | \n",
" P10-P1628-acf60d-b8950832-0 | \n",
" P10 | \n",
" P1628 | \n",
" \"https://schema.org/video\" | \n",
"
\n",
" \n",
" 2 | \n",
" P10-P1629-Q34508-bcc39400-0 | \n",
" P10 | \n",
" P1629 | \n",
" Q34508 | \n",
"
\n",
" \n",
" 3 | \n",
" P10-P1659-P1651-c4068028-0 | \n",
" P10 | \n",
" P1659 | \n",
" P1651 | \n",
"
\n",
" \n",
" 4 | \n",
" P10-P1659-P18-5e4b9c4f-0 | \n",
" P10 | \n",
" P1659 | \n",
" P18 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 310588 | \n",
" undirected_pagerank-P31-Q47512165 | \n",
" undirected_pagerank | \n",
" P31 | \n",
" Q47512165 | \n",
"
\n",
" \n",
" 310589 | \n",
" undirected_pagerank-alias-9d4733 | \n",
" undirected_pagerank | \n",
" alias | \n",
" \"page rank\"@en | \n",
"
\n",
" \n",
" 310590 | \n",
" undirected_pagerank-data_type-1a7b30 | \n",
" undirected_pagerank | \n",
" data_type | \n",
" quantity | \n",
"
\n",
" \n",
" 310591 | \n",
" undirected_pagerank-description-ee8b1c | \n",
" undirected_pagerank | \n",
" description | \n",
" \"pagerank canculated on the undirected graph\"@en | \n",
"
\n",
" \n",
" 310592 | \n",
" undirected_pagerank-label-d3bd07 | \n",
" undirected_pagerank | \n",
" label | \n",
" \"pagerank\"@en | \n",
"
\n",
" \n",
"
\n",
"
310593 rows × 4 columns
\n",
"
"
],
"text/plain": [
" id node1 \\\n",
"0 P10-P1628-32b85d-7927ece6-0 P10 \n",
"1 P10-P1628-acf60d-b8950832-0 P10 \n",
"2 P10-P1629-Q34508-bcc39400-0 P10 \n",
"3 P10-P1659-P1651-c4068028-0 P10 \n",
"4 P10-P1659-P18-5e4b9c4f-0 P10 \n",
"... ... ... \n",
"310588 undirected_pagerank-P31-Q47512165 undirected_pagerank \n",
"310589 undirected_pagerank-alias-9d4733 undirected_pagerank \n",
"310590 undirected_pagerank-data_type-1a7b30 undirected_pagerank \n",
"310591 undirected_pagerank-description-ee8b1c undirected_pagerank \n",
"310592 undirected_pagerank-label-d3bd07 undirected_pagerank \n",
"\n",
" label node2 \n",
"0 P1628 \"http://www.w3.org/2006/vcard/ns#Video\" \n",
"1 P1628 \"https://schema.org/video\" \n",
"2 P1629 Q34508 \n",
"3 P1659 P1651 \n",
"4 P1659 P18 \n",
"... ... ... \n",
"310588 P31 Q47512165 \n",
"310589 alias \"page rank\"@en \n",
"310590 data_type quantity \n",
"310591 description \"pagerank canculated on the undirected graph\"@en \n",
"310592 label \"pagerank\"@en \n",
"\n",
"[310593 rows x 4 columns]"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the final KG file and display it as a DataFrame.\n",
"# The captured lines are named `all_lines` rather than `all`, so the Python\n",
"# builtin `all()` is not shadowed in the kernel namespace.\n",
"all_lines = !zcat < \"$OUT\"/all.tsv.gz\n",
"kgtk_to_dataframe(all_lines)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "kgtk-env",
"language": "python",
"name": "kgtk-env"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}