{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Build Graph For The Tutorial\n", "\n", "This notebook can work for any root node, the default is `Q2685` for Schwarzenegger" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import io\n", "import os\n", "import subprocess\n", "import sys\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from IPython.display import display, HTML\n", "\n", "import papermill as pm\n", "\n", "sys.path.insert(0,'../..')\n", "from kgtk.configure_kgtk_notebooks import ConfigureKGTK" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "tags": [ "parameters" ] }, "outputs": [], "source": [ "# Parameters\n", "kgtk_path = \"/Users/pedroszekely/Documents/GitHub/kgtk\"\n", "\n", "# Folder on local machine where to create the output and temporary folders\n", "input_path = \"/data3/rogers/kgtk/gd/kgtk_public_graphs/cache/wikidata-20210215/data/\"\n", "output_path = \"/data1/rogers/kgtk/tutorial/\"\n", "\n", "input_path = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/\"\n", "output_path = \"/Users/pedroszekely/Downloads/kypher/projects\"\n", "\n", "project_name = \"build-tutorial\"\n", "root = \"Q2685\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Put the root q-node in the environment variable `ROOT`" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "os.environ['ROOT'] = root" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User home: /Users/pedroszekely\n", "Current dir: /Users/pedroszekely/Documents/GitHub/kgtk/tutorial/build-kg\n", "KGTK dir: /Users/pedroszekely/Documents/GitHub/kgtk\n", "Use-cases dir: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases\n" ] } ], "source": [ "files = [\n", " \"claims\",\n", " \"item\",\n", " \"wikibase_property\",\n", " \"datatypes\",\n", " \"qualifiers\",\n", " \"p31\",\n", " \"p279\",\n", " \"p279star\",\n", " \"quantity\",\n", " \"time\",\n", " \"external_id\",\n", " \"globe_coordinate\",\n", " \"monolingualtext\",\n", " \"string\",\n", " \"label\",\n", " \"alias\",\n", " \"description\"\n", "]\n", "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", " output_path=output_path,\n", " project_name=project_name)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "os.environ['KGTK_LABEL_FILE'] = \"{}\".format(os.environ['label']) " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kgtk: kgtk\n", "GRAPH: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/\n", "STORE: /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial/wikidata.sqlite3.db\n", "USE_CASES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases\n", "OUT: /Users/pedroszekely/Downloads/kypher/projects/build-tutorial\n", "EXAMPLES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/examples\n", "TEMP: /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial\n", "kypher: kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial/wikidata.sqlite3.db\n", "claims: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.tsv.gz\n", "item: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.wikibase-item.tsv.gz\n", "wikibase_property: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.wikibase-property.tsv.gz\n", "datatypes: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//metadata.property.datatypes.tsv.gz\n", "qualifiers: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//qualifiers.tsv.gz\n", "p31: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//derived.P31.tsv.gz\n", "p279: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//derived.P279.tsv.gz\n", "p279star: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//derived.P279star.tsv.gz\n", "quantity: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.quantity.tsv.gz\n", "time: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.time.tsv.gz\n", "external_id: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.external-id.tsv.gz\n", "globe_coordinate: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.globe-coordinate.tsv.gz\n", "monolingualtext: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.monolingualtext.tsv.gz\n", "string: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.string.tsv.gz\n", "label: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//labels.en.tsv.gz\n", "alias: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//aliases.en.tsv.gz\n", "description: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//descriptions.en.tsv.gz\n" ] } ], "source": [ "ck.print_env_variables()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Define a custom location for the store when working with full Wikidata so that I can reuse it" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "os.environ['STORE'] = \"/data1/rogers/kgtk/tutorial/wikidata.sqlite3.db\"\n", "os.environ['STORE'] = \"/Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Turn on debugging for kypher" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "os.environ['kypher'] = \"kgtk --debug query --graph-cache \" + os.environ['STORE']" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db\n" ] } ], "source": [ "!echo \"$kypher\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load all my files into the kypher cache so that all graph aliases are defined" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.tsv.gz\" --as claims -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.wikibase-item.tsv.gz\" --as item -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.wikibase-property.tsv.gz\" --as wikibase_property -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//metadata.property.datatypes.tsv.gz\" --as datatypes -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//qualifiers.tsv.gz\" --as qualifiers -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//derived.P31.tsv.gz\" --as p31 -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//derived.P279.tsv.gz\" --as p279 -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//derived.P279star.tsv.gz\" --as p279star -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.quantity.tsv.gz\" --as quantity -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.time.tsv.gz\" --as time -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.external-id.tsv.gz\" --as external_id -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.globe-coordinate.tsv.gz\" --as globe_coordinate -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.monolingualtext.tsv.gz\" --as monolingualtext -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//claims.string.tsv.gz\" --as string -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//labels.en.tsv.gz\" --as label -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//aliases.en.tsv.gz\" --as alias -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data//descriptions.en.tsv.gz\" --as description --limit 3\n", "[2021-10-09 20:17:56 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_1 AS graph_1_c1\n", " LIMIT ?\n", " PARAS: [3]\n", "---------------------------------------------\n", "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", "P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\tnormal\turl\n", "P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\tnormal\turl\n", "P10-P1629-Q34508-bcc39400-0\tP10\tP1629\tQ34508\tnormal\twikibase-item\n" ] } ], "source": [ "ck.load_files_into_cache()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/pedroszekely/Downloads/kypher/projects/build-tutorial\n" ] } ], "source": [ "%cd {os.environ['OUT']}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Approach:\n", "- Select a subgraph of full Wikidata that includes people (Q5), organizations (Q43229), geographic regions (Q82794), awards (Q618779) and role (Q4897819) -- otheriwe we get male but do not get female. This graph contains all edges that connect instances of the target classes listed above. Output the graph using a single relation we call `link`.\n", "- Starting from Schwarzenegger Q2685, compute reachable nodes in the graph computed in the previous step. This step will produce the collection of nodes that will be part of the Schwarzenegger graph.\n", "- Extract from Wikidata all the edges that connect nodes from the previous step.\n", "- Extract from Wikidata the time, quantity, monolingual and string properties.\n", "- Extract from Wikidata the qualifiers for the edges computed in the previous steps.\n", "- Extract from Wikidata the labels, aliases and descriptions for the Schwarzenegger nodes." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extract a subset of Wikidata to use as the base for the Schewarzenegger graph\n", "\n", "This query takes a really long time, so don't re-execute unless you have to." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-09 20:17:56 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_6_c1.\"node1\" \"_aLias.node1\", ? \"_aLias.label\", graph_2_c2.\"node2\" \"_aLias.node2\", graph_2_c2.\"id\" \"_aLias.id\"\n", " FROM graph_2 AS graph_2_c2\n", " INNER JOIN graph_6 AS graph_6_c1, graph_6 AS graph_6_c3, graph_8 AS graph_8_c4, graph_8 AS graph_8_c5\n", " ON graph_2_c2.\"node2\" = graph_6_c3.\"node1\"\n", " AND graph_6_c1.\"node1\" = graph_2_c2.\"node1\"\n", " AND graph_6_c1.\"node2\" = graph_8_c4.\"node1\"\n", " AND graph_6_c3.\"node2\" = graph_8_c5.\"node1\"\n", " AND ((graph_8_c4.\"node2\" IN (?, ?, ?, ?, ?)) AND (graph_8_c5.\"node2\" IN (?, ?, ?, ?, ?, ?)))\n", " PARAS: ['link', 'Q11424', 'Q5', 'Q43229', 'Q82794', 'Q618779', 'Q11424', 'Q5', 'Q43229', 'Q82794', 'Q618779', 'Q4897819']\n", "---------------------------------------------\n", "CPU times: user 2min 42s, sys: 47.7 s, total: 3min 30s\n", "Wall time: 3h 51min 46s\n" ] } ], "source": [ "%%time\n", "!$kypher -i p31 -i item -i p279star \\\n", "--match ' \\\n", " p31: (n1)-[]->(n1_class), \\\n", " item: (n1)-[l]->(n2), \\\n", " p31: (n2)-[]->(n2_class), \\\n", " p279star: (n1_class)-[]->(n1_superclass), \\\n", " p279star: (n2_class)-[]->(n2_superclass)' \\\n", "--where 'n1_superclass in [\"Q11424\", \"Q5\", \"Q43229\", \"Q82794\", \"Q618779\"] and n2_superclass in [\"Q11424\", \"Q5\", \"Q43229\", \"Q82794\", \"Q618779\", \"Q4897819\"]' \\\n", "--return 'distinct n1 as node1, \"link\" as label, n2 as node2, l as id' \\\n", "-o \"$TEMP\"/item.per.org.cw.geo.award.link.tsv.gz " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the original graph there are qualifier values that we want to follow in the reachablity search. To do so, we will create `link` edges between the qualifier and the value of the statement on which the qualifier is defined." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:09:44 sqlstore]: DROP graph data table graph_18 from links\n", "[2021-10-10 00:10:56 sqlstore]: IMPORT graph directly into table graph_28 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial/item.per.org.cw.geo.award.link.tsv.gz ...\n", "[2021-10-10 00:15:10 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT graph_28_c1.\"node2\" \"_aLias.node1\", ? \"_aLias.label\", graph_5_c2.\"node2\" \"_aLias.node2\"\n", " FROM graph_28 AS graph_28_c1\n", " INNER JOIN graph_4 AS graph_4_c3, graph_5 AS graph_5_c2\n", " ON graph_28_c1.\"id\" = graph_5_c2.\"node1\"\n", " AND graph_4_c3.\"node1\" = graph_5_c2.\"label\"\n", " AND graph_4_c3.\"label\" = ?\n", " AND graph_5_c2.\"label\" = graph_4_c3.\"node1\"\n", " AND (graph_4_c3.\"node2\" IN (?))\n", " PARAS: ['link', 'datatype', 'wikibase-item']\n", "---------------------------------------------\n", "[2021-10-10 00:15:10 sqlstore]: CREATE INDEX on table graph_28 column id ...\n", "[2021-10-10 00:15:53 sqlstore]: ANALYZE INDEX on table graph_28 column id ...\n", "CPU times: user 24.5 s, sys: 7.41 s, total: 31.9 s\n", "Wall time: 38min 3s\n" ] } ], "source": [ "%%time\n", "!$kypher -i qualifiers -i datatypes -i \"$TEMP\"/item.per.org.cw.geo.award.link.tsv.gz --as links \\\n", "--match ' \\\n", " links: ()-[l]->(n2), \\\n", " qualifiers: (l)-[q {label: property}]->(qualifier), \\\n", " datatypes: (property)-[:datatype]->(datatype) \\\n", " ' \\\n", "--where 'datatype in [\"wikibase-item\"]' \\\n", "--return 'n2 as node1, \"link\" as label, qualifier as node2' \\\n", "/ add-id --id-style wikidata \\\n", "/ cat -i - -i \"$TEMP\"/item.per.org.cw.geo.award.link.tsv.gz \\\n", "-o \"$TEMP\"/item.per.org.cw.geo.award.link.qualifier.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Starting from `ROOT` traverse links forward in breadfirst mode up to a fixed number of levels to build the graph" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.84 s, sys: 1.18 s, total: 5.02 s\n", "Wall time: 6min 41s\n" ] } ], "source": [ "%%time\n", "!$kgtk reachable-nodes \\\n", " --root $ROOT \\\n", " --prop link \\\n", " --label \"reachable\" \\\n", " --selflink \\\n", " --breadth-first --depth-limit 3 \\\n", " -i \"$TEMP\"/item.per.org.cw.geo.award.link.qualifier.tsv.gz \\\n", " -o \"$TEMP\"/root.reachable.per.org.cw.geo.award.tsv.gz" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\n", "Q2685\treachable\tQ2685\n", "Q2685\treachable\tQ12158205\n", "Q2685\treachable\tQ170564\n", "Q2685\treachable\tQ1765879\n", "Q2685\treachable\tQ28754213\n", "Q2685\treachable\tQ1976616\n", "Q2685\treachable\tQ551327\n", "Q2685\treachable\tQ12325509\n", "Q2685\treachable\tQ30331794\n", "Q2685\treachable\tQ557584\n" ] } ], "source": [ "!$kgtk head -i \"$TEMP\"/root.reachable.per.org.cw.geo.award.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Index the resulting file in kypher" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:54:29 sqlstore]: DROP graph data table graph_19 from root_nodes\n", "[2021-10-10 00:54:29 sqlstore]: IMPORT graph directly into table graph_29 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial/root.reachable.per.org.cw.geo.award.tsv.gz ...\n", "[2021-10-10 00:54:29 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_29 AS graph_29_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\tlabel\tnode2\n", "Q2685\treachable\tQ2685\n", "Q2685\treachable\tQ12158205\n" ] } ], "source": [ "!$kypher -i $TEMP/root.reachable.per.org.cw.geo.award.tsv.gz --as root_nodes --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Build initial graph containing the item edges\n", "\n", "Figure out which properties are used so so that we can add them as node1s and get all the info about them." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:54:29 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT ? \"_aLias.node1\", ? \"_aLias.label\", graph_4_c3.\"node1\" \"_aLias.node2\"\n", " FROM graph_1 AS graph_1_c2\n", " INNER JOIN graph_29 AS graph_29_c1, graph_4 AS graph_4_c3\n", " ON graph_29_c1.\"node2\" = graph_1_c2.\"node1\"\n", " AND graph_4_c3.\"node1\" = graph_1_c2.\"label\"\n", " AND graph_1_c2.\"label\" = graph_4_c3.\"node1\"\n", " AND graph_4_c3.\"label\" = ?\n", " AND (graph_4_c3.\"node2\" IN (?, ?, ?, ?, ?))\n", " PARAS: ['root', 'link', 'datatype', 'wikibase-item', 'string', 'quantity', 'time', 'monolingualtext']\n", "---------------------------------------------\n", "[2021-10-10 00:54:29 sqlstore]: CREATE INDEX on table graph_29 column node2 ...\n", "[2021-10-10 00:54:29 sqlstore]: ANALYZE INDEX on table graph_29 column node2 ...\n", "CPU times: user 295 ms, sys: 95.3 ms, total: 390 ms\n", "Wall time: 30.8 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i root_nodes -i datatypes -i claims \\\n", "--match ' \\\n", " root_nodes: ()-[]->(n1), \\\n", " claims: (n1)-[l {label: property}]->(), \\\n", " datatypes: (property)-[:datatype]->(datatype) \\\n", " ' \\\n", "--where 'datatype in [\"wikibase-item\", \"string\", \"quantity\", \"time\", \"monolingualtext\"]' \\\n", "--return 'distinct \"root\" as node1, \"link\" as label, property as node2' \\\n", "-o \"$TEMP\"/root.nodes.property.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate the new nodes with the ones we found via reachability" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "!kgtk cat -i \"$TEMP\"/root.nodes.property.tsv.gz -i \"$TEMP\"/root.reachable.per.org.cw.geo.award.tsv.gz \\\n", "-o \"$TEMP\"/root.nodes.all.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Print number of nodes that we have so far for the new graph" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 42360\n" ] } ], "source": [ "!zcat < \"$TEMP\"/root.nodes.all.tsv.gz | wc -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Update the Kypher database" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:55:01 sqlstore]: DROP graph data table graph_29 from root_nodes\n", "[2021-10-10 00:55:01 sqlstore]: IMPORT graph directly into table graph_29 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial/root.nodes.all.tsv.gz ...\n", "[2021-10-10 00:55:01 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_29 AS graph_29_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\tlabel\tnode2\n", "root\tlink\tP1082\n", "root\tlink\tP112\n" ] } ], "source": [ "!$kypher -i \"$TEMP\"/root.nodes.all.tsv.gz --as root_nodes --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract the item to item edges connecting the nodes in the new graph" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:55:02 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_29_c1.\"node2\" \"_aLias.node1\", graph_2_c3.\"label\" \"_aLias.label\", graph_29_c2.\"node2\" \"_aLias.node2\", graph_2_c3.\"id\" \"_aLias.id\"\n", " FROM graph_2 AS graph_2_c3\n", " INNER JOIN graph_29 AS graph_29_c1, graph_29 AS graph_29_c2\n", " ON graph_29_c1.\"node2\" = graph_2_c3.\"node1\"\n", " AND graph_29_c2.\"node2\" = graph_2_c3.\"node2\"\n", " PARAS: []\n", "---------------------------------------------\n", "[2021-10-10 00:55:02 sqlstore]: CREATE INDEX on table graph_29 column node2 ...\n", "[2021-10-10 00:55:02 sqlstore]: ANALYZE INDEX on table graph_29 column node2 ...\n", "CPU times: user 265 ms, sys: 85.9 ms, total: 351 ms\n", "Wall time: 26.7 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i root_nodes -i item \\\n", "--match ' \\\n", " root_nodes: ()-[]->(n1), \\\n", " root_nodes: ()-[]->(n2), \\\n", " item: (n1)-[l]->(n2) \\\n", " ' \\\n", "--return 'distinct n1 as node1, l.label as label, n2 as node2, l as id' \\\n", "/ sort \\\n", "-o $OUT/root.graph.item.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Add to the kypher database" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:55:28 sqlstore]: DROP graph data table graph_23 from rootitems\n", "[2021-10-10 00:55:28 sqlstore]: IMPORT graph directly into table graph_30 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/root.graph.item.tsv.gz ...\n", "[2021-10-10 00:55:30 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_30 AS graph_30_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\tlabel\tnode2\tid\n", "P1001\tP1855\tQ11696\tP1001-P1855-Q11696-cdbf391b-0\n", "P1001\tP1855\tQ12371988\tP1001-P1855-Q12371988-12c10bc0-0\n" ] } ], "source": [ "!$kypher -i $OUT/root.graph.item.tsv.gz --as rootitems --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extract the other types of edges" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract the quantities" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:55:30 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_29_c1.\"node2\" \"_aLias.node1\", graph_9_c2.\"label\" \"_aLias.label\", graph_9_c2.\"node2\" \"_aLias.node2\", graph_9_c2.\"id\" \"_aLias.id\"\n", " FROM graph_29 AS graph_29_c1\n", " INNER JOIN graph_9 AS graph_9_c2\n", " ON graph_29_c1.\"node2\" = graph_9_c2.\"node1\"\n", " PARAS: []\n", "---------------------------------------------\n", "CPU times: user 68.4 ms, sys: 25 ms, total: 93.4 ms\n", "Wall time: 6.42 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i quantity -i root_nodes \\\n", "--match ' \\\n", " root_nodes: ()-[]->(n1), \\\n", " quantity: (n1)-[l]->(n2) \\\n", " ' \\\n", "--return 'distinct n1 as node1, l.label as label, n2 as node2, l as id' \\\n", "/ sort \\\n", "-o $OUT/root.graph.quantity.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract the time edges" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:55:37 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_29_c1.\"node2\" \"_aLias.node1\", graph_10_c2.\"label\" \"_aLias.label\", graph_10_c2.\"node2\" \"_aLias.node2\", graph_10_c2.\"id\" \"_aLias.id\"\n", " FROM graph_10 AS graph_10_c2\n", " INNER JOIN graph_29 AS graph_29_c1\n", " ON graph_29_c1.\"node2\" = graph_10_c2.\"node1\"\n", " PARAS: []\n", "---------------------------------------------\n", "CPU times: user 90.8 ms, sys: 34.1 ms, total: 125 ms\n", "Wall time: 9.25 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i time -i root_nodes \\\n", "--match ' \\\n", " root_nodes: ()-[]->(n1), \\\n", " time: (n1)-[l]->(n2) \\\n", " ' \\\n", "--return 'distinct n1 as node1, l.label as label, n2 as node2, l as id' \\\n", "/ sort \\\n", "-o $OUT/root.graph.time.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract the monolingual text edges" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:55:46 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_29_c1.\"node2\" \"_aLias.node1\", graph_13_c2.\"label\" \"_aLias.label\", graph_13_c2.\"node2\" \"_aLias.node2\", graph_13_c2.\"id\" \"_aLias.id\"\n", " FROM graph_13 AS graph_13_c2\n", " INNER JOIN graph_29 AS graph_29_c1\n", " ON graph_29_c1.\"node2\" = graph_13_c2.\"node1\"\n", " PARAS: []\n", "---------------------------------------------\n", "CPU times: user 78.1 ms, sys: 29.3 ms, total: 107 ms\n", "Wall time: 7.56 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i monolingualtext -i root_nodes \\\n", "--match ' \\\n", " root_nodes: ()-[]->(n1), \\\n", " monolingualtext: (n1)-[l]->(n2) \\\n", " ' \\\n", "--return 'distinct n1 as node1, l.label as label, n2 as node2, l as id' \\\n", "/ sort \\\n", "-o $OUT/root.graph.monolingual.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract the string edges" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:55:54 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_29_c1.\"node2\" \"_aLias.node1\", graph_14_c2.\"label\" \"_aLias.label\", graph_14_c2.\"node2\" \"_aLias.node2\", graph_14_c2.\"id\" \"_aLias.id\"\n", " FROM graph_14 AS graph_14_c2\n", " INNER JOIN graph_29 AS graph_29_c1\n", " ON graph_29_c1.\"node2\" = graph_14_c2.\"node1\"\n", " PARAS: []\n", "---------------------------------------------\n", "CPU times: user 133 ms, sys: 46.4 ms, total: 180 ms\n", "Wall time: 12.7 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i string -i root_nodes \\\n", "--match ' \\\n", " root_nodes: ()-[]->(n1), \\\n", " string: (n1)-[l]->(n2) \\\n", " ' \\\n", "--return 'distinct n1 as node1, l.label as label, n2 as node2, l as id' \\\n", "/ sort \\\n", "-o $OUT/root.graph.string.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract external identifiers NEW" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:56:07 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_29_c1.\"node2\" \"_aLias.node1\", graph_11_c2.\"label\" \"_aLias.label\", graph_11_c2.\"node2\" \"_aLias.node2\", graph_11_c2.\"id\" \"_aLias.id\"\n", " FROM graph_11 AS graph_11_c2\n", " INNER JOIN graph_29 AS graph_29_c1\n", " ON graph_29_c1.\"node2\" = graph_11_c2.\"node1\"\n", " PARAS: []\n", "---------------------------------------------\n", "CPU times: user 236 ms, sys: 78.1 ms, total: 314 ms\n", "Wall time: 23 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i external_id -i root_nodes \\\n", "--match ' \\\n", " root_nodes: ()-[]->(n1), \\\n", " external_id: (n1)-[l]->(n2) \\\n", " ' \\\n", "--return 'distinct n1 as node1, l.label as label, n2 as node2, l as id' \\\n", "/ sort \\\n", "-o $OUT/root.graph.external_ids.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Complete the graph" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Add external_ids" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:56:37 sqlstore]: DROP graph data table graph_24 from rootbase\n", "[2021-10-10 00:56:39 sqlstore]: IMPORT graph directly into table graph_24 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/root.graph.item.quantity.time.monolingual.string.tsv.gz ...\n", "[2021-10-10 00:56:43 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_24 AS graph_24_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\tlabel\tnode2\tid\n", "P1001\tP1855\tQ11696\tP1001-P1855-Q11696-cdbf391b-0\n", "P1001\tP1855\tQ12371988\tP1001-P1855-Q12371988-12c10bc0-0\n", "CPU times: user 157 ms, sys: 57.4 ms, total: 214 ms\n", "Wall time: 14.7 s\n" ] } ], "source": [ "%%time\n", "!kgtk cat \\\n", "-i $OUT/root.graph.item.tsv.gz \\\n", "-i $OUT/root.graph.quantity.tsv.gz \\\n", "-i $OUT/root.graph.time.tsv.gz \\\n", "-i $OUT/root.graph.monolingual.tsv.gz \\\n", "-i $OUT/root.graph.string.tsv.gz \\\n", "-i $OUT/root.graph.external_ids.tsv.gz \\\n", "-o $OUT/root.graph.item.quantity.time.monolingual.string.tsv.gz \n", "\n", "!$kypher -i $OUT/root.graph.item.quantity.time.monolingual.string.tsv.gz --as rootbase --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Collect all the properties" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get edges for the properties" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:56:44 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_3_c2.\"node1\" \"_aLias.node1\", graph_3_c2.\"label\" \"_aLias.label\", graph_3_c2.\"node2\" \"_aLias.node2\", graph_3_c2.\"id\" \"_aLias.id\"\n", " FROM graph_24 AS graph_24_c1\n", " INNER JOIN graph_3 AS graph_3_c2\n", " ON graph_3_c2.\"node1\" = graph_24_c1.\"label\"\n", " AND graph_24_c1.\"label\" = graph_3_c2.\"node1\"\n", " PARAS: []\n", "---------------------------------------------\n", "[2021-10-10 00:56:44 sqlstore]: CREATE INDEX on table graph_24 column label ...\n", "[2021-10-10 00:56:45 sqlstore]: ANALYZE INDEX on table graph_24 column label ...\n", "CPU times: user 39.1 ms, sys: 17.7 ms, total: 56.8 ms\n", "Wall time: 3.47 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i rootbase -i wikibase_property \\\n", "--match ' \\\n", " rootbase: ()-[l {label: property}]->(), \\\n", " wikibase_property: (property)-[lp]->(n) \\\n", " ' \\\n", "--return 'distinct property as node1, lp.label as label, n as node2, lp as id' \\\n", "/ sort \\\n", "-o $OUT/root.graph.property.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Update the base" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:57:08 sqlstore]: DROP graph data table graph_24 from rootbase\n", "[2021-10-10 00:57:09 sqlstore]: IMPORT graph directly into table graph_24 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/root.graph.item.quantity.time.monolingual.string.property.tsv.gz ...\n", "[2021-10-10 00:57:14 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_24 AS graph_24_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\tlabel\tnode2\tid\n", "P1001\tP1647\tP276\tP1001-P1647-P276-e4e44f83-0\n", "P1001\tP1659\tP1269\tP1001-P1659-P1269-785921cd-0\n", "CPU times: user 279 ms, sys: 93.9 ms, total: 373 ms\n", "Wall time: 27 s\n" ] } ], "source": [ "%%time\n", "!kgtk cat \\\n", "-i $OUT/root.graph.item.quantity.time.monolingual.string.tsv.gz \\\n", "-i $OUT/root.graph.property.tsv.gz \\\n", "/ compact \\\n", "-o $OUT/root.graph.item.quantity.time.monolingual.string.property.tsv.gz \n", "\n", "!$kypher -i $OUT/root.graph.item.quantity.time.monolingual.string.property.tsv.gz --as rootbase --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Compute qualifiers" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:57:17 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_24_c1.\"id\" \"_aLias.node1\", graph_5_c2.\"label\" \"_aLias.label\", graph_5_c2.\"node2\" \"_aLias.node2\", graph_5_c2.\"id\" \"_aLias.id\"\n", " FROM graph_24 AS graph_24_c1\n", " INNER JOIN graph_5 AS graph_5_c2\n", " ON graph_24_c1.\"id\" = graph_5_c2.\"node1\"\n", " AND graph_5_c2.\"label\" = graph_5_c2.\"label\"\n", " PARAS: []\n", "---------------------------------------------\n", "[2021-10-10 00:57:17 sqlstore]: CREATE INDEX on table graph_24 column id ...\n", "[2021-10-10 00:57:17 sqlstore]: ANALYZE INDEX on table graph_24 column id ...\n", "CPU times: user 278 ms, sys: 92.7 ms, total: 370 ms\n", "Wall time: 25.3 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i qualifiers -i rootbase \\\n", "--match ' \\\n", " rootbase: ()-[l]->(), \\\n", " qualifiers: (l)-[lq {label: property}]->(n) \\\n", " ' \\\n", "--return 'distinct l as node1, property as label, n as node2, lq as id' \\\n", "/ sort \\\n", "-o $OUT/root.graph.qualifiers.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Update the base again" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:58:07 sqlstore]: DROP graph data table graph_24 from rootbase\n", "[2021-10-10 00:58:08 sqlstore]: IMPORT graph directly into table graph_24 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/root.graph.item.quantity.time.monolingual.string.property.qualifiers.tsv.gz ...\n", "[2021-10-10 00:58:15 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_24 AS graph_24_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\tlabel\tnode2\tid\n", "P1001\tP1647\tP276\tP1001-P1647-P276-e4e44f83-0\n", "P1001\tP1659\tP1269\tP1001-P1659-P1269-785921cd-0\n", "CPU times: user 396 ms, sys: 130 ms, total: 526 ms\n", "Wall time: 36 s\n" ] } ], "source": [ "%%time\n", "!kgtk cat \\\n", "-i $OUT/root.graph.item.quantity.time.monolingual.string.property.tsv.gz \\\n", "-i $OUT/root.graph.qualifiers.tsv.gz \\\n", "/ compact \\\n", "-o $OUT/root.graph.item.quantity.time.monolingual.string.property.qualifiers.tsv.gz \n", "\n", "!$kypher -i $OUT/root.graph.item.quantity.time.monolingual.string.property.qualifiers.tsv.gz --as rootbase --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add the units" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Find all values of quantity properties, and get the units defined for them.\n", "\n", "> `kgtk_quantity_wd_units` throws an exception when it gets a quantity without units, so we have to hack around that using grep." ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:58:16 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_24_c1.\"node2\"\n", " FROM graph_24 AS graph_24_c1\n", " INNER JOIN graph_4 AS graph_4_c2\n", " ON graph_4_c2.\"node1\" = graph_24_c1.\"label\"\n", " AND graph_24_c1.\"label\" = graph_4_c2.\"node1\"\n", " AND graph_4_c2.\"label\" = ?\n", " AND (graph_4_c2.\"node2\" IN (?))\n", " PARAS: ['datatype', 'quantity']\n", "---------------------------------------------\n", "[2021-10-10 00:58:16 sqlstore]: CREATE INDEX on table graph_24 column label ...\n", "[2021-10-10 00:58:17 sqlstore]: ANALYZE INDEX on table graph_24 column label ...\n", "CPU times: user 24.2 ms, sys: 16.8 ms, total: 41 ms\n", "Wall time: 1.96 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i datatypes -i rootbase \\\n", "--match ' \\\n", " rootbase: ()-[l {label: property}]->(n2), \\\n", " datatypes: (property)-[:datatype]->(datatype) \\\n", " ' \\\n", "--where 'datatype in [\"quantity\"]' \\\n", "--return 'distinct n2' \\\n", "| grep Q > \"$TEMP\"/units.noheader.tsv\n", "\n", "!echo -e \"node1\" | cat - \"$TEMP\"/units.noheader.tsv > \"$TEMP\"/quantities.units.tsv" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:58:18 sqlstore]: DROP graph data table graph_25 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial/quantities.units.tsv\n", "[2021-10-10 00:58:18 sqlstore]: IMPORT graph directly into table graph_25 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial/quantities.units.tsv ...\n", "[2021-10-10 00:58:18 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT kgtk_quantity_wd_units(graph_25_c1.\"node1\") \"_aLias.node1\"\n", " FROM graph_25 AS graph_25_c1\n", " PARAS: []\n", "---------------------------------------------\n", "CPU times: user 8.65 ms, sys: 8.12 ms, total: 16.8 ms\n", "Wall time: 674 ms\n" ] } ], "source": [ "%%time\n", "!$kypher -i \"$TEMP\"/quantities.units.tsv \\\n", "--match '(quantity)' \\\n", "--return 'distinct kgtk_quantity_wd_units(quantity) as node1' \\\n", "-o \"$TEMP\"/units.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now that we have the units in a file, we can get all the properties we want about them" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:58:19 sqlstore]: DROP graph data table graph_26 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial/units.tsv\n", "[2021-10-10 00:58:19 sqlstore]: IMPORT graph directly into table graph_26 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial/units.tsv ...\n", "[2021-10-10 00:58:19 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_26_c1.\"node1\" \"_aLias.node1\", graph_4_c2.\"node1\" \"_aLias.label\", graph_2_c3.\"node2\" \"_aLias.node2\", graph_2_c3.\"id\" \"_aLias.id\"\n", " FROM graph_2 AS graph_2_c3\n", " INNER JOIN graph_26 AS graph_26_c1, graph_4 AS graph_4_c2\n", " ON graph_26_c1.\"node1\" = graph_2_c3.\"node1\"\n", " AND graph_4_c2.\"node1\" = graph_2_c3.\"label\"\n", " AND graph_2_c3.\"label\" = graph_4_c2.\"node1\"\n", " AND graph_4_c2.\"label\" = ?\n", " AND (graph_4_c2.\"node2\" IN (?, ?, ?, ?, ?))\n", " PARAS: ['datatype', 'wikibase-item', 'string', 'quantity', 'time', 'monolingualtext']\n", "---------------------------------------------\n", "[2021-10-10 00:58:19 sqlstore]: CREATE INDEX on table graph_26 column node1 ...\n", "[2021-10-10 00:58:19 sqlstore]: ANALYZE INDEX on table graph_26 column node1 ...\n", "[2021-10-10 00:58:46 sqlstore]: DROP graph data table graph_24 from rootbase\n", "[2021-10-10 00:58:47 sqlstore]: IMPORT graph directly into table graph_24 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/root.graph.item.quantity.time.monolingual.string.property.qualifiers.units.tsv.gz ...\n", "[2021-10-10 00:58:53 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_24 AS graph_24_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\tlabel\tnode2\tid\n", "P1001\tP1647\tP276\tP1001-P1647-P276-e4e44f83-0\n", "P1001\tP1659\tP1269\tP1001-P1659-P1269-785921cd-0\n", "CPU times: user 387 ms, sys: 127 ms, total: 514 ms\n", "Wall time: 35.7 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i \"$TEMP\"/units.tsv -i item -i datatypes \\\n", "--match ' \\\n", " units: (unit), \\\n", " datatypes: (property)-[:datatype]->(datatype), \\\n", " item: (unit)-[l {label: property}]->(n2) \\\n", " ' \\\n", "--where 'datatype in [\"wikibase-item\", \"string\", \"quantity\", \"time\", \"monolingualtext\"]' \\\n", "--return 'distinct unit as node1, property as label, n2 as node2, l as id' \\\n", "/ cat -i - -i $OUT/root.graph.item.quantity.time.monolingual.string.property.qualifiers.tsv.gz\\\n", "/ compact \\\n", "-o $OUT/root.graph.item.quantity.time.monolingual.string.property.qualifiers.units.tsv.gz \\\n", "\n", "!$kypher -i $OUT/root.graph.item.quantity.time.monolingual.string.property.qualifiers.units.tsv.gz --as rootbase --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Make sure that every q-node has at least P31 and P279\n", "need to do it twice, once for node1 and once for node2" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 00:58:54 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_24_c1.\"node1\" \"_aLias.node1\", graph_1_c2.\"label\" \"_aLias.label\", graph_1_c2.\"node2\" \"_aLias.node2\", graph_1_c2.\"id\" \"_aLias.id\"\n", " FROM graph_1 AS graph_1_c2\n", " INNER JOIN graph_24 AS graph_24_c1\n", " ON graph_24_c1.\"node1\" = graph_1_c2.\"node1\"\n", " AND graph_1_c2.\"label\" = graph_1_c2.\"label\"\n", " AND (graph_1_c2.\"label\" IN (?, ?))\n", " PARAS: ['P31', 'P279']\n", "---------------------------------------------\n", "[2021-10-10 00:58:54 sqlstore]: CREATE INDEX on table graph_24 column node1 ...\n", "[2021-10-10 00:58:55 sqlstore]: ANALYZE INDEX on table graph_24 column node1 ...\n", "CPU times: user 11.8 s, sys: 3.52 s, total: 15.4 s\n", "Wall time: 17min 41s\n" ] } ], "source": [ "%%time\n", "!$kypher -i rootbase -i claims \\\n", "--match 'rootbase: (n)-[]->(), claims: (n)-[l {label: property}]->(n2)' \\\n", "--where 'property in [\"P31\", \"P279\"]' \\\n", "--return 'distinct n as node1, property as label, n2 as node2, l as id' \\\n", "-o \"$TEMP\"/root.node1.P31.P279.tsv.gz" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 01:16:36 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_24_c1.\"node2\" \"_aLias.node1\", graph_1_c2.\"label\" \"_aLias.label\", graph_1_c2.\"node2\" \"_aLias.node2\", graph_1_c2.\"id\" \"_aLias.id\"\n", " FROM graph_1 AS graph_1_c2\n", " INNER JOIN graph_24 AS graph_24_c1\n", " ON graph_24_c1.\"node2\" = graph_1_c2.\"node1\"\n", " AND graph_1_c2.\"label\" = graph_1_c2.\"label\"\n", " AND (graph_1_c2.\"label\" IN (?, ?))\n", " PARAS: ['P31', 'P279']\n", "---------------------------------------------\n", "[2021-10-10 01:16:36 sqlstore]: CREATE INDEX on table graph_24 column node2 ...\n", "[2021-10-10 01:16:38 sqlstore]: ANALYZE INDEX on table graph_24 column node2 ...\n", "CPU times: user 12.3 s, sys: 3.73 s, total: 16 s\n", "Wall time: 18min 1s\n" ] } ], "source": [ "%%time\n", "!$kypher -i rootbase -i claims \\\n", "--match 'rootbase: ()-[]->(n), claims: (n)-[l {label: property}]->(n2)' \\\n", "--where 'property in [\"P31\", \"P279\"]' \\\n", "--return 'distinct n as node1, property as label, n2 as node2, l as id' \\\n", "-o \"$TEMP\"/root.node2.P31.P279.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Recreate the base file NEW\n", "> the output file should have the `.units` segment in the name, but I didnt' add it so that I don't have to modify all the other ocmmands\n", "> a better design for the file names would not have this problem" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 01:35:08 sqlstore]: DROP graph data table graph_24 from rootbase\n", "[2021-10-10 01:35:10 sqlstore]: IMPORT graph directly into table graph_24 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/root.graph.item.quantity.time.monolingual.string.property.qualifiers.P31.P279.tsv.gz ...\n", "[2021-10-10 01:35:17 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_24 AS graph_24_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\tlabel\tnode2\tid\n", "P10\tP31\tQ18610173\tP10-P31-Q18610173-85ef4d24-0\n", "P1000\tP31\tQ18608871\tP1000-P31-Q18608871-093affb5-0\n", "CPU times: user 438 ms, sys: 146 ms, total: 584 ms\n", "Wall time: 39.9 s\n" ] } ], "source": [ "%%time\n", "!kgtk cat \\\n", "-i $OUT/root.graph.item.quantity.time.monolingual.string.property.qualifiers.units.tsv.gz \\\n", "-i \"$TEMP\"/root.node2.P31.P279.tsv.gz \\\n", "/ compact \\\n", "-o $OUT/root.graph.item.quantity.time.monolingual.string.property.qualifiers.P31.P279.tsv.gz \n", "\n", "!$kypher -i $OUT/root.graph.item.quantity.time.monolingual.string.property.qualifiers.P31.P279.tsv.gz --as rootbase --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Incorporate all nodes up to the top of the class hierarchy\n", "When we do a breath first traversal, we may not follow enough links on the P279 hierarchy to reach the top. We need to do a full traversal on the P279 hierarchy to incorporate all the relevant classes.\n", "\n", "Approach:\n", "- Create a graph including P31 and P279 to do the traversal\n", "- Create a file of all the nodes in the Schwarzenneger file to use as roots" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 01:35:19 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_1_c1.\"node1\" \"_aLias.node1\", ? \"_aLias.label\", graph_1_c1.\"node2\" \"_aLias.node2\"\n", " FROM graph_1 AS graph_1_c1\n", " WHERE graph_1_c1.\"label\" = graph_1_c1.\"label\"\n", " AND (graph_1_c1.\"label\" IN (?, ?))\n", " PARAS: ['link', 'P31', 'P279']\n", "---------------------------------------------\n", "CPU times: user 19.3 s, sys: 5.91 s, total: 25.2 s\n", "Wall time: 28min 37s\n" ] } ], "source": [ "%%time\n", "!$kypher -i claims \\\n", "--match '(n1)-[l {label:property}]->(n2)' \\\n", "--where 'property in [\"P31\", \"P279\"]' \\\n", "--return 'distinct n1 as node1, \"link\" as label, n2 as node2' \\\n", "-o \"$TEMP\"/P31.P279.subgraph.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create the roots" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Find roots in node1\n", "\n", "> This step is including qualifier ids in node1, which makes reachable nodes have more roots than necessary. Would be nice to eliminate qualifiers here." ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 02:03:55 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_24_c1.\"node1\" \"_aLias.node1\"\n", " FROM graph_24 AS graph_24_c1\n", " PARAS: []\n", "---------------------------------------------\n", "CPU times: user 81.9 ms, sys: 34 ms, total: 116 ms\n", "Wall time: 6.93 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i rootbase \\\n", "--match '(n)-[]->()' \\\n", "--return 'distinct n as node1' \\\n", "-o \"$TEMP\"/root.node1.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Find roots in node2" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 02:04:02 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_24_c1.\"node2\" \"_aLias.node1\"\n", " FROM graph_24 AS graph_24_c1\n", " INNER JOIN graph_4 AS graph_4_c2\n", " ON graph_4_c2.\"node1\" = graph_24_c1.\"label\"\n", " AND graph_24_c1.\"label\" = graph_4_c2.\"node1\"\n", " AND graph_4_c2.\"label\" = ?\n", " AND (graph_4_c2.\"node2\" IN (?))\n", " PARAS: ['datatype', 'wikibase-item']\n", "---------------------------------------------\n", "[2021-10-10 02:04:02 sqlstore]: CREATE INDEX on table graph_24 column label ...\n", "[2021-10-10 02:04:03 sqlstore]: ANALYZE INDEX on table graph_24 column label ...\n", "CPU times: user 27.9 ms, sys: 15.7 ms, total: 43.6 ms\n", "Wall time: 2.29 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i rootbase -i datatypes \\\n", "--match ' \\\n", " rootbase: ()-[l {label: property}]->(n), \\\n", " datatypes: (property)-[:datatype]->(datatype) \\\n", " ' \\\n", "--where 'datatype in [\"wikibase-item\"]' \\\n", "--return 'distinct n as node1' \\\n", "-o \"$TEMP\"/root.node2.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Combine the two files to create all the roots" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 02:04:08 sqlstore]: DROP graph data table graph_27 from rootnode1\n", "[2021-10-10 02:04:08 sqlstore]: IMPORT graph directly into table graph_27 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial/root.nodes.tsv.gz ...\n", "[2021-10-10 02:04:09 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_27 AS graph_27_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\n", "P10\n", "P1000\n", "CPU times: user 58.9 ms, sys: 28.3 ms, total: 87.2 ms\n", "Wall time: 5.06 s\n" ] } ], "source": [ "%%time\n", "!$kgtk cat --mode=NONE -i \"$TEMP\"/root.node1.tsv.gz -i \"$TEMP\"/root.node2.tsv.gz \\\n", "/ compact --mode=NONE --columns node1 \\\n", "-o \"$TEMP\"/root.nodes.tsv.gz\n", "\n", "!$kypher -i \"$TEMP\"/root.nodes.tsv.gz --as rootnode1 --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Circumvent a problem in `reachable-nodes` where it does not accept a root file with column header `node1`" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 47.7 ms, sys: 19 ms, total: 66.7 ms\n", "Wall time: 3.78 s\n" ] } ], "source": [ "%%time\n", "!$kgtk rename-columns -i \"$TEMP\"/root.nodes.tsv.gz --output-columns id --mode=NONE \\\n", "/ compact -o \"$TEMP\"/root.roots.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Do a depth-first traversal of the P31/P279 graph using as roots all items in the Schewarzenegger graph" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 19.9 s, sys: 6.11 s, total: 26 s\n", "Wall time: 30min 11s\n" ] } ], "source": [ "%%time\n", "!$kgtk reachable-nodes \\\n", " --rootfile \"$TEMP\"/root.roots.tsv.gz \\\n", " --rootfilecolumn id \\\n", " --prop link \\\n", " --label \"reachable\" \\\n", " --selflink \\\n", " -i \"$TEMP\"/P31.P279.subgraph.tsv.gz \\\n", " -o \"$TEMP\"/P31.P279.reachable.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Deduplicate the reachable nodes file" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 642 ms, sys: 203 ms, total: 845 ms\n", "Wall time: 1min\n" ] } ], "source": [ "%%time\n", "!$kgtk remove-columns -i \"$TEMP\"/P31.P279.reachable.tsv.gz --columns node1 label \\\n", "/ rename-columns --mode=NONE --output-columns node1 \\\n", "/ compact --mode=NONE --columns node1 \\\n", "-o \"$TEMP\"/P31.P279.reachable.dedup.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Put all the reachable nodes in `rootnode1`" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 02:35:29 sqlstore]: DROP graph data table graph_27 from rootnode1\n", "[2021-10-10 02:35:29 sqlstore]: IMPORT graph directly into table graph_27 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial/root.nodes.ontology.tsv.gz ...\n", "[2021-10-10 02:35:30 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_27 AS graph_27_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\n", "P10\n", "P1000\n", "CPU times: user 53.7 ms, sys: 27.2 ms, total: 81 ms\n", "Wall time: 5.17 s\n" ] } ], "source": [ "%%time\n", "!$kgtk cat --mode=NONE \\\n", "-i \"$TEMP\"/root.nodes.tsv.gz \\\n", "-i \"$TEMP\"/P31.P279.reachable.dedup.tsv.gz \\\n", "/ compact --deduplicate --mode=NONE --columns node1 \\\n", "-o \"$TEMP\"/root.nodes.ontology.tsv.gz\n", "\n", "!$kypher -i \"$TEMP\"/root.nodes.ontology.tsv.gz --as rootnode1 --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract all P31/P279 edges from Wikidata for all the nodes in the new graph and consolidate." ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 02:35:30 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_27_c1.\"node1\" \"_aLias.node1\", graph_1_c2.\"label\" \"_aLias.label\", graph_1_c2.\"node2\" \"_aLias.node2\", graph_1_c2.\"id\" \"_aLias.id\"\n", " FROM graph_1 AS graph_1_c2\n", " INNER JOIN graph_27 AS graph_27_c1\n", " ON graph_27_c1.\"node1\" = graph_1_c2.\"node1\"\n", " AND graph_1_c2.\"label\" = graph_1_c2.\"label\"\n", " AND (graph_1_c2.\"label\" IN (?, ?))\n", " PARAS: ['P31', 'P279']\n", "---------------------------------------------\n", "[2021-10-10 02:35:30 sqlstore]: CREATE INDEX on table graph_27 column node1 ...\n", "[2021-10-10 02:35:31 sqlstore]: ANALYZE INDEX on table graph_27 column node1 ...\n", "[2021-10-10 02:52:25 sqlstore]: DROP graph data table graph_24 from rootbase\n", "[2021-10-10 02:52:28 sqlstore]: IMPORT graph directly into table graph_24 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/root.graph.item.quantity.time.monolingual.string.property.qualifiers.P31.P279.ontology.tsv.gz ...\n", "[2021-10-10 02:52:35 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_24 AS graph_24_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\tlabel\tnode2\tid\n", "P10\tP31\tQ18610173\tP10-P31-Q18610173-85ef4d24-0\n", "P1000\tP31\tQ18608871\tP1000-P31-Q18608871-093affb5-0\n", "CPU times: user 11 s, sys: 3.25 s, total: 14.2 s\n", "Wall time: 17min 5s\n" ] } ], "source": [ "%%time\n", "!$kypher -i claims -i rootnode1 \\\n", "--match ' \\\n", " rootnode1: (n1), \\\n", " claims: (n1)-[l {label:property}]->(n2) \\\n", " ' \\\n", "--where 'property in [\"P31\", \"P279\"]' \\\n", "--return 'distinct n1 as node1, property as label, n2 as node2, l as id' \\\n", "/ cat -i - -i $OUT/root.graph.item.quantity.time.monolingual.string.property.qualifiers.P31.P279.tsv.gz \\\n", "/ compact \\\n", "-o $OUT/root.graph.item.quantity.time.monolingual.string.property.qualifiers.P31.P279.ontology.tsv.gz \\\n", "\n", "!$kypher -i $OUT/root.graph.item.quantity.time.monolingual.string.property.qualifiers.P31.P279.ontology.tsv.gz --as rootbase --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I am not certain about the need for this cell, whether new nodes can appear after adding P31 and P279." ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 02:52:36 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_24_c1.\"node2\" \"_aLias.node1\"\n", " FROM graph_24 AS graph_24_c1\n", " INNER JOIN graph_4 AS graph_4_c2\n", " ON graph_4_c2.\"node1\" = graph_24_c1.\"label\"\n", " AND graph_24_c1.\"label\" = graph_4_c2.\"node1\"\n", " AND graph_4_c2.\"label\" = ?\n", " AND (graph_4_c2.\"node2\" IN (?))\n", " PARAS: ['datatype', 'wikibase-item']\n", "---------------------------------------------\n", "[2021-10-10 02:52:36 sqlstore]: CREATE INDEX on table graph_24 column label ...\n", "[2021-10-10 02:52:37 sqlstore]: ANALYZE INDEX on table graph_24 column label ...\n", "[2021-10-10 02:52:41 sqlstore]: DROP graph data table graph_27 from rootnode1\n", "[2021-10-10 02:52:41 sqlstore]: IMPORT graph directly into table graph_27 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial/root.nodes.ontology.star.tsv.gz ...\n", "[2021-10-10 02:52:42 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_27 AS graph_27_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\n", "P10\n", "P1000\n", "CPU times: user 75.7 ms, sys: 34.3 ms, total: 110 ms\n", "Wall time: 6.69 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i rootbase -i datatypes \\\n", "--match ' \\\n", " rootbase: ()-[l {label: property}]->(n), \\\n", " datatypes: (property)-[:datatype]->(datatype) \\\n", " ' \\\n", "--where 'datatype in [\"wikibase-item\"]' \\\n", "--return 'distinct n as node1' \\\n", "/ cat -i - -i \"$TEMP\"/root.nodes.ontology.tsv.gz --mode=NONE \\\n", "/ compact --mode=NONE --columns node1 \\\n", "-o \"$TEMP\"/root.nodes.ontology.star.tsv.gz \\\n", "\n", "!$kypher -i \"$TEMP\"/root.nodes.ontology.star.tsv.gz --as rootnode1 --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Include in node1 all the properties in the graph" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 02:52:43 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_24_c1.\"label\" \"_aLias.node1\"\n", " FROM graph_24 AS graph_24_c1\n", " WHERE graph_24_c1.\"label\" = graph_24_c1.\"label\"\n", " PARAS: []\n", "---------------------------------------------\n", "[2021-10-10 02:52:46 sqlstore]: DROP graph data table graph_27 from rootnode1\n", "[2021-10-10 02:52:46 sqlstore]: IMPORT graph directly into table graph_27 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/temp.build-tutorial/root.nodes.ontology.star.property.tsv.gz ...\n", "[2021-10-10 02:52:47 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_27 AS graph_27_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\n", "P10\n", "P1000\n", "CPU times: user 56.2 ms, sys: 26.8 ms, total: 83 ms\n", "Wall time: 4.77 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i rootbase \\\n", "--match ' \\\n", " rootbase: ()-[l {label: property}]->(n)' \\\n", "--return 'distinct property as node1' \\\n", "/ cat -i - -i \"$TEMP\"/root.nodes.ontology.star.tsv.gz --mode=NONE \\\n", "/ compact --mode=NONE --columns node1 \\\n", "-o \"$TEMP\"/root.nodes.ontology.star.property.tsv.gz \\\n", "\n", "!$kypher -i \"$TEMP\"/root.nodes.ontology.star.property.tsv.gz --as rootnode1 --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Add property datatypes" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 02:52:47 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_4_c2.\"node1\" \"_aLias.node1\", graph_4_c2.\"label\" \"_aLias.label\", graph_4_c2.\"node2\" \"_aLias.node2\", graph_4_c2.\"id\" \"_aLias.id\"\n", " FROM graph_24 AS graph_24_c1\n", " INNER JOIN graph_4 AS graph_4_c2\n", " ON graph_4_c2.\"node1\" = graph_24_c1.\"label\"\n", " AND graph_24_c1.\"label\" = graph_4_c2.\"node1\"\n", " AND graph_4_c2.\"label\" = ?\n", " PARAS: ['datatype']\n", "---------------------------------------------\n", "[2021-10-10 02:53:17 sqlstore]: DROP graph data table graph_24 from rootbase\n", "[2021-10-10 02:53:18 sqlstore]: IMPORT graph directly into table graph_24 from /Users/pedroszekely/Downloads/kypher/projects/build-tutorial/root.graph.item.quantity.time.monolingual.string.property.qualifiers.P31.P279.ontology.datatype.tsv.gz ...\n", "[2021-10-10 02:53:26 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_24 AS graph_24_c1\n", " LIMIT ?\n", " PARAS: [2]\n", "---------------------------------------------\n", "node1\tlabel\tnode2\tid\n", "P10\tP31\tQ18610173\tP10-P31-Q18610173-85ef4d24-0\n", "P1000\tP31\tQ18608871\tP1000-P31-Q18608871-093affb5-0\n", "CPU times: user 429 ms, sys: 142 ms, total: 571 ms\n", "Wall time: 39.3 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i datatypes -i rootbase \\\n", "--match ' \\\n", " rootbase: ()-[r {label: property}]->(), \\\n", " datatypes: (property)-[l:datatype]->(datatype) \\\n", " ' \\\n", "--return 'distinct property as node1, l.label as label, datatype as node2, l as id' \\\n", "/ cat -i - -i $OUT/root.graph.item.quantity.time.monolingual.string.property.qualifiers.P31.P279.ontology.tsv.gz \\\n", "/ compact \\\n", "-o $OUT/root.graph.item.quantity.time.monolingual.string.property.qualifiers.P31.P279.ontology.datatype.tsv.gz \\\n", "\n", "!$kypher -i root.graph.item.quantity.time.monolingual.string.property.qualifiers.P31.P279.ontology.datatype.tsv.gz --as rootbase --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Build labels, aliases and descriptions" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract the label edges" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 02:53:27 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_27_c1.\"node1\" \"_aLias.node1\", graph_15_c2.\"label\" \"_aLias.label\", graph_15_c2.\"node2\" \"_aLias.node2\", graph_15_c2.\"id\" \"_aLias.id\"\n", " FROM graph_15 AS graph_15_c2\n", " INNER JOIN graph_27 AS graph_27_c1\n", " ON graph_27_c1.\"node1\" = graph_15_c2.\"node1\"\n", " PARAS: []\n", "---------------------------------------------\n", "[2021-10-10 02:53:27 sqlstore]: CREATE INDEX on table graph_27 column node1 ...\n", "[2021-10-10 02:53:27 sqlstore]: ANALYZE INDEX on table graph_27 column node1 ...\n", "CPU times: user 230 ms, sys: 79.7 ms, total: 310 ms\n", "Wall time: 20.7 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i label -i rootnode1 \\\n", "--match ' \\\n", " rootnode1: (n1)-[]->(), \\\n", " label: (n1)-[l]->(n2) \\\n", " ' \\\n", "--return 'distinct n1 as node1, l.label as label, n2 as node2, l as id' \\\n", "/ sort \\\n", "-o $OUT/root.graph.label.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract the alias edges" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 02:53:47 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_27_c1.\"node1\" \"_aLias.node1\", graph_16_c2.\"label\" \"_aLias.label\", graph_16_c2.\"node2\" \"_aLias.node2\", graph_16_c2.\"id\" \"_aLias.id\"\n", " FROM graph_16 AS graph_16_c2\n", " INNER JOIN graph_27 AS graph_27_c1\n", " ON graph_27_c1.\"node1\" = graph_16_c2.\"node1\"\n", " PARAS: []\n", "---------------------------------------------\n", "CPU times: user 61.3 ms, sys: 22.3 ms, total: 83.6 ms\n", "Wall time: 5.4 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i alias -i rootnode1 \\\n", "--match ' \\\n", " rootnode1: (n1)-[]->(), \\\n", " alias: (n1)-[l]->(n2) \\\n", " ' \\\n", "--return 'distinct n1 as node1, l.label as label, n2 as node2, l as id' \\\n", "/ sort \\\n", "-o $OUT/root.graph.alias.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract the description edges" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 02:53:53 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_27_c1.\"node1\" \"_aLias.node1\", graph_17_c2.\"label\" \"_aLias.label\", graph_17_c2.\"node2\" \"_aLias.node2\", graph_17_c2.\"id\" \"_aLias.id\"\n", " FROM graph_17 AS graph_17_c2\n", " INNER JOIN graph_27 AS graph_27_c1\n", " ON graph_27_c1.\"node1\" = graph_17_c2.\"node1\"\n", " PARAS: []\n", "---------------------------------------------\n", "CPU times: user 243 ms, sys: 77.5 ms, total: 320 ms\n", "Wall time: 21.1 s\n" ] } ], "source": [ "%%time\n", "!$kypher -i description -i rootnode1 \\\n", "--match ' \\\n", " rootnode1: (n1)-[]->(), \\\n", " description: (n1)-[l]->(n2) \\\n", " ' \\\n", "--return 'distinct n1 as node1, l.label as label, n2 as node2, l as id' \\\n", "/ sort \\\n", "-o $OUT/root.graph.description.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Compute useful derived files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Inverses of `P279`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "> To do: need to define t`P279_` property, it's datatype, label, etc." ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-10 02:54:14 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_24_c1.\"node2\" \"_aLias.node1\", ? \"_aLias.label\", graph_24_c1.\"node1\" \"_aLias.node2\"\n", " FROM graph_24 AS graph_24_c1\n", " WHERE graph_24_c1.\"label\" = ?\n", " PARAS: ['P279_', 'P279']\n", "---------------------------------------------\n", "[2021-10-10 02:54:14 sqlstore]: CREATE INDEX on table graph_24 column label ...\n", "[2021-10-10 02:54:15 sqlstore]: ANALYZE INDEX on table graph_24 column label ...\n" ] } ], "source": [ "!$kypher -i rootbase \\\n", "--match '(n1)-[:P279]->(class)' \\\n", "--return 'distinct class as node1, \"P279_\" as label, n1 as node2' \\\n", "/ add-id --id-style wikidata \\\n", "/ sort \\\n", "-o \"$OUT\"/root.derived.P279inv.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Final files\n", "- base, includes all edges except labeles, aliases and descriptions\n", "- labels\n", "- aliases\n", "- descriptions" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 134 ms, sys: 47.2 ms, total: 181 ms\n", "Wall time: 11.9 s\n" ] } ], "source": [ "%%time\n", "!$kgtk cat \\\n", "-i \"$OUT\"/root.graph.item.quantity.time.monolingual.string.property.qualifiers.P31.P279.ontology.datatype.tsv.gz \\\n", "-i \"$OUT\"/root.graph.alias.tsv.gz \\\n", "-i \"$OUT\"/root.graph.label.tsv.gz \\\n", "-i \"$OUT\"/root.graph.description.tsv.gz \\\n", "-o \"$OUT\"/all.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 }