{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Add Derived Graphs To The Tutorial Graph\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import io\n", "import os\n", "import subprocess\n", "import sys\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from IPython.display import display, HTML\n", "\n", "import papermill as pm\n", "\n", "sys.path.insert(0,'../..')\n", "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", "\n", "from kgtk.functions import kgtk, kypher" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "tags": [ "parameters" ] }, "outputs": [], "source": [ "# Parameters\n", "\n", "kgtk_path = \"/Users/pedroszekely/Documents/GitHub/kgtk\"\n", "\n", "# Folder on local machine where to create the output and temporary folders\n", "input_path = \"/Users/pedroszekely/Downloads/kypher/projects/build-tutorial\"\n", "output_path = \"/Users/pedroszekely/Downloads/kypher/projects\"\n", "project_name = \"tutorial-derived-graphs\"\n", "tutorial_files_path = \"/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User home: /Users/amandeep\n", "Current dir: /Users/amandeep/Github/kgtk/tutorial/build-kg\n", "KGTK dir: /Users/amandeep/Github/kgtk\n", "Use-cases dir: /Users/amandeep/Github/kgtk/use-cases\n" ] } ], "source": [ "files = [\n", " \"all\"\n", "]\n", "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", " output_path=output_path,\n", " project_name=project_name)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "OUT: /Volumes/saggu-ssd/arnold-2/tutorial-derived-graphs\n", "TEMP: /Volumes/saggu-ssd/arnold-2/tutorial-derived-graphs/temp.tutorial-derived-graphs\n", 
"STORE: /Volumes/saggu-ssd/arnold-2/tutorial-derived-graphs/temp.tutorial-derived-graphs/wikidata.sqlite3.db\n", "kypher: kgtk query --graph-cache /Volumes/saggu-ssd/arnold-2/tutorial-derived-graphs/temp.tutorial-derived-graphs/wikidata.sqlite3.db\n", "GRAPH: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold\n", "USE_CASES_DIR: /Users/amandeep/Github/kgtk/use-cases\n", "kgtk: kgtk\n", "EXAMPLES_DIR: /Users/amandeep/Github/kgtk/examples\n", "all: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold/all.tsv.gz\n" ] } ], "source": [ "ck.print_env_variables()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Turn on debugging for kypher" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "os.environ['tutorial_files_path'] = tutorial_files_path\n", "os.environ['kgtk_path'] = kgtk_path\n", "os.environ['KGTK_GRAPH_CACHE'] = os.environ['STORE']\n", "os.environ['KGTK_LABEL_FILE'] = os.environ['OUT'] + \"/parts/labels.en.tsv.gz\"\n", "os.environ['KGTK_OPTION_DEBUG'] = \"true\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load all my files into the kypher cache so that all graph aliases are defined" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs/temp.tutorial-derived-graphs/wikidata.sqlite3.db -i \"/Users/pedroszekely/Downloads/kypher/projects/build-tutorial/all.tsv.gz\" --as all --limit 3\n", "[2021-10-10 11:51:26 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_27 AS graph_27_c1\n", " LIMIT ?\n", " PARAS: [3]\n", "---------------------------------------------\n", "node1\tlabel\tnode2\tid\n", "P10\tP31\tQ18610173\tP10-P31-Q18610173-85ef4d24-0\n", "P1000\tP31\tQ18608871\tP1000-P31-Q18608871-093affb5-0\n", "P1001\tP1647\tP276\tP1001-P1647-P276-e4e44f83-0\n" ] } 
], "source": [ "ck.load_files_into_cache()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs\n" ] } ], "source": [ "%cd {os.environ['OUT']}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run partition notebook\n", "\n", "We need the parts to run the Useful Files notebook" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a36e9d35693a4d3ea546ccff53b14e4b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Executing: 0%| | 0/49 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
P1545node1labelnode2id
265621Q23958852Pdirected_pagerank0.071410Q23958852-Pdirected_pagerank-79688
425512Q23960977Pdirected_pagerank0.032866Q23960977-Pdirected_pagerank-127655
148563Q35120Pdirected_pagerank0.028596Q35120-Pdirected_pagerank-44570
111924Q151885Pdirected_pagerank0.026957Q151885-Pdirected_pagerank-33578
4395Q5Pdirected_pagerank0.012807Q5-Pdirected_pagerank-1319
..................
3846366010Q207482Pdirected_pagerank0.000002Q207482-Pdirected_pagerank-115391
3846266011Q20747487Pdirected_pagerank0.000002Q20747487-Pdirected_pagerank-115388
3845566012Q20746713Pdirected_pagerank0.000002Q20746713-Pdirected_pagerank-115367
3845366013Q20746702Pdirected_pagerank0.000002Q20746702-Pdirected_pagerank-115361
6601366014Q99975457Pdirected_pagerank0.000002Q99975457-Pdirected_pagerank-198041
\n", "

66014 rows × 5 columns

\n", "" ], "text/plain": [ " P1545 node1 label node2 \\\n", "26562 1 Q23958852 Pdirected_pagerank 0.071410 \n", "42551 2 Q23960977 Pdirected_pagerank 0.032866 \n", "14856 3 Q35120 Pdirected_pagerank 0.028596 \n", "11192 4 Q151885 Pdirected_pagerank 0.026957 \n", "439 5 Q5 Pdirected_pagerank 0.012807 \n", "... ... ... ... ... \n", "38463 66010 Q207482 Pdirected_pagerank 0.000002 \n", "38462 66011 Q20747487 Pdirected_pagerank 0.000002 \n", "38455 66012 Q20746713 Pdirected_pagerank 0.000002 \n", "38453 66013 Q20746702 Pdirected_pagerank 0.000002 \n", "66013 66014 Q99975457 Pdirected_pagerank 0.000002 \n", "\n", " id \n", "26562 Q23958852-Pdirected_pagerank-79688 \n", "42551 Q23960977-Pdirected_pagerank-127655 \n", "14856 Q35120-Pdirected_pagerank-44570 \n", "11192 Q151885-Pdirected_pagerank-33578 \n", "439 Q5-Pdirected_pagerank-1319 \n", "... ... \n", "38463 Q207482-Pdirected_pagerank-115391 \n", "38462 Q20747487-Pdirected_pagerank-115388 \n", "38455 Q20746713-Pdirected_pagerank-115367 \n", "38453 Q20746702-Pdirected_pagerank-115361 \n", "66013 Q99975457-Pdirected_pagerank-198041 \n", "\n", "[66014 rows x 5 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "directed_pagerank = kgtk(\"\"\"\n", " query -i $OUT/useful_files/metadata.pagerank.directed.tsv.gz \n", " --match '(n1)-[l:Pdirected_pagerank]->(pagerank)'\n", "\"\"\")\n", "\n", "directed_pagerank_sorted = directed_pagerank.sort_values(\"node2\", ascending=False)\n", "directed_pagerank_sorted.insert(0, 'P1545', range(1, 1 + len(directed_pagerank_sorted)))\n", "directed_pagerank_sorted.to_csv(f\"{os.environ['TEMP']}/directed-pagerank.ordinal.tsv\", index=False, sep='\\t')\n", "directed_pagerank_sorted" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The temporary file looks good, next steps:\n", "- `normalize` to put the qualifiers as extra edges so the file has only `node1/label/node2/id`\n", "- `add-ids` as we want all edges to have 
ids" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"\n", " normalize -i \"$TEMP\"/directed-pagerank.ordinal.tsv\n", " / add-id --id-style wikidata \n", " -o \"$OUT\"/useful_files/metadata.pagerank.directed.ordinal.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Look at the result to confirm that we are generating the data we want." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2idnode1;labellabel;label
0Q23958852Pdirected_pagerank0.071410Q23958852-Pdirected_pagerank-79688'variable-order class'@enNaN
1Q23958852-Pdirected_pagerank-79688P15451.000000Q23958852-Pdirected_pagerank-79688-P1545-6b86b2NaN'series ordinal'@en
2Q23960977Pdirected_pagerank0.032866Q23960977-Pdirected_pagerank-127655'(meta)class'@enNaN
3Q23960977-Pdirected_pagerank-127655P15452.000000Q23960977-Pdirected_pagerank-127655-P1545-d4735eNaN'series ordinal'@en
4Q35120Pdirected_pagerank0.028596Q35120-Pdirected_pagerank-44570'entity'@enNaN
5Q35120-Pdirected_pagerank-44570P15453.000000Q35120-Pdirected_pagerank-44570-P1545-4e0740NaN'series ordinal'@en
6Q151885Pdirected_pagerank0.026957Q151885-Pdirected_pagerank-33578'concept'@enNaN
7Q151885-Pdirected_pagerank-33578P15454.000000Q151885-Pdirected_pagerank-33578-P1545-4b2277NaN'series ordinal'@en
8Q5Pdirected_pagerank0.012807Q5-Pdirected_pagerank-1319'human'@enNaN
9Q5-Pdirected_pagerank-1319P15455.000000Q5-Pdirected_pagerank-1319-P1545-ef2d12NaN'series ordinal'@en
\n", "
" ], "text/plain": [ " node1 label node2 \\\n", "0 Q23958852 Pdirected_pagerank 0.071410 \n", "1 Q23958852-Pdirected_pagerank-79688 P1545 1.000000 \n", "2 Q23960977 Pdirected_pagerank 0.032866 \n", "3 Q23960977-Pdirected_pagerank-127655 P1545 2.000000 \n", "4 Q35120 Pdirected_pagerank 0.028596 \n", "5 Q35120-Pdirected_pagerank-44570 P1545 3.000000 \n", "6 Q151885 Pdirected_pagerank 0.026957 \n", "7 Q151885-Pdirected_pagerank-33578 P1545 4.000000 \n", "8 Q5 Pdirected_pagerank 0.012807 \n", "9 Q5-Pdirected_pagerank-1319 P1545 5.000000 \n", "\n", " id \\\n", "0 Q23958852-Pdirected_pagerank-79688 \n", "1 Q23958852-Pdirected_pagerank-79688-P1545-6b86b2 \n", "2 Q23960977-Pdirected_pagerank-127655 \n", "3 Q23960977-Pdirected_pagerank-127655-P1545-d4735e \n", "4 Q35120-Pdirected_pagerank-44570 \n", "5 Q35120-Pdirected_pagerank-44570-P1545-4e0740 \n", "6 Q151885-Pdirected_pagerank-33578 \n", "7 Q151885-Pdirected_pagerank-33578-P1545-4b2277 \n", "8 Q5-Pdirected_pagerank-1319 \n", "9 Q5-Pdirected_pagerank-1319-P1545-ef2d12 \n", "\n", " node1;label label;label \n", "0 'variable-order class'@en NaN \n", "1 NaN 'series ordinal'@en \n", "2 '(meta)class'@en NaN \n", "3 NaN 'series ordinal'@en \n", "4 'entity'@en NaN \n", "5 NaN 'series ordinal'@en \n", "6 'concept'@en NaN \n", "7 NaN 'series ordinal'@en \n", "8 'human'@en NaN \n", "9 NaN 'series ordinal'@en " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " head -i \"$OUT\"/useful_files/metadata.pagerank.directed.ordinal.tsv.gz / add-labels\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Repeat the same steps for `undirected_pagerank`" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-10-12 16:36:03 sqlstore]: IMPORT graph directly into table graph_2 from 
/Volumes/saggu-ssd/arnold-2/tutorial-derived-graphs/useful_files/metadata.pagerank.undirected.tsv.gz ...\n", "[2021-10-12 16:36:04 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_2 AS graph_2_c1\n", " WHERE graph_2_c1.\"label\" = ?\n", " PARAS: ['Pundirected_pagerank']\n", "---------------------------------------------\n", "[2021-10-12 16:36:04 sqlstore]: CREATE INDEX on table graph_2 column label ...\n", "[2021-10-12 16:36:04 sqlstore]: ANALYZE INDEX on table graph_2 column label ...\n", "\n", "CPU times: user 698 ms, sys: 104 ms, total: 802 ms\n", "Wall time: 2.89 s\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
P1545node1labelnode2id
4391Q5Pundirected_pagerank0.022010Q5-Pundirected_pagerank-1319
1732Q30Pundirected_pagerank0.012919Q30-Pundirected_pagerank-521
47823Q6581097Pundirected_pagerank0.008353Q6581097-Pundirected_pagerank-14348
70974Q15221623Pundirected_pagerank0.004738Q15221623-Pundirected_pagerank-21293
13915Q1860Pundirected_pagerank0.004441Q1860-Pundirected_pagerank-4175
..................
6400566010Q7958659Pundirected_pagerank0.000003Q7958659-Pundirected_pagerank-192017
3223166011Q17021934Pundirected_pagerank0.000003Q17021934-Pundirected_pagerank-96695
4531966012Q27890917Pundirected_pagerank0.000003Q27890917-Pundirected_pagerank-135959
3544366013Q10876480Pundirected_pagerank0.000003Q10876480-Pundirected_pagerank-106331
1769766014Q1199713Pundirected_pagerank0.000003Q1199713-Pundirected_pagerank-53093
\n", "

66014 rows × 5 columns

\n", "
" ], "text/plain": [ " P1545 node1 label node2 \\\n", "439 1 Q5 Pundirected_pagerank 0.022010 \n", "173 2 Q30 Pundirected_pagerank 0.012919 \n", "4782 3 Q6581097 Pundirected_pagerank 0.008353 \n", "7097 4 Q15221623 Pundirected_pagerank 0.004738 \n", "1391 5 Q1860 Pundirected_pagerank 0.004441 \n", "... ... ... ... ... \n", "64005 66010 Q7958659 Pundirected_pagerank 0.000003 \n", "32231 66011 Q17021934 Pundirected_pagerank 0.000003 \n", "45319 66012 Q27890917 Pundirected_pagerank 0.000003 \n", "35443 66013 Q10876480 Pundirected_pagerank 0.000003 \n", "17697 66014 Q1199713 Pundirected_pagerank 0.000003 \n", "\n", " id \n", "439 Q5-Pundirected_pagerank-1319 \n", "173 Q30-Pundirected_pagerank-521 \n", "4782 Q6581097-Pundirected_pagerank-14348 \n", "7097 Q15221623-Pundirected_pagerank-21293 \n", "1391 Q1860-Pundirected_pagerank-4175 \n", "... ... \n", "64005 Q7958659-Pundirected_pagerank-192017 \n", "32231 Q17021934-Pundirected_pagerank-96695 \n", "45319 Q27890917-Pundirected_pagerank-135959 \n", "35443 Q10876480-Pundirected_pagerank-106331 \n", "17697 Q1199713-Pundirected_pagerank-53093 \n", "\n", "[66014 rows x 5 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "undirected_pagerank = kgtk(\"\"\"\n", " query -i $OUT/useful_files/metadata.pagerank.undirected.tsv.gz \n", " --match '(n1)-[l:Pundirected_pagerank]->(pagerank)'\n", "\"\"\")\n", "\n", "undirected_pagerank = undirected_pagerank.sort_values(\"node2\", ascending=False)\n", "undirected_pagerank.insert(0, 'P1545', range(1, 1 + len(undirected_pagerank)))\n", "undirected_pagerank.to_csv(f\"{os.environ['TEMP']}/undirected-pagerank.ordinal.tsv\", index=False, sep='\\t')\n", "undirected_pagerank" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"\n", " normalize -i \"$TEMP\"/undirected-pagerank.ordinal.tsv\n", " / add-id --id-style wikidata \n", " -o 
\"$OUT\"/useful_files/metadata.pagerank.undirected.ordinal.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2idnode1;labellabel;label
0Q5Pundirected_pagerank0.022010Q5-Pundirected_pagerank-1319'human'@enNaN
1Q5-Pundirected_pagerank-1319P15451.000000Q5-Pundirected_pagerank-1319-P1545-6b86b2NaN'series ordinal'@en
2Q30Pundirected_pagerank0.012919Q30-Pundirected_pagerank-521'United States of America'@enNaN
3Q30-Pundirected_pagerank-521P15452.000000Q30-Pundirected_pagerank-521-P1545-d4735eNaN'series ordinal'@en
4Q6581097Pundirected_pagerank0.008353Q6581097-Pundirected_pagerank-14348'male'@enNaN
5Q6581097-Pundirected_pagerank-14348P15453.000000Q6581097-Pundirected_pagerank-14348-P1545-4e0740NaN'series ordinal'@en
6Q15221623Pundirected_pagerank0.004738Q15221623-Pundirected_pagerank-21293'bilateral relation'@enNaN
7Q15221623-Pundirected_pagerank-21293P15454.000000Q15221623-Pundirected_pagerank-21293-P1545-4b2277NaN'series ordinal'@en
8Q1860Pundirected_pagerank0.004441Q1860-Pundirected_pagerank-4175'English'@enNaN
9Q1860-Pundirected_pagerank-4175P15455.000000Q1860-Pundirected_pagerank-4175-P1545-ef2d12NaN'series ordinal'@en
\n", "
" ], "text/plain": [ " node1 label node2 \\\n", "0 Q5 Pundirected_pagerank 0.022010 \n", "1 Q5-Pundirected_pagerank-1319 P1545 1.000000 \n", "2 Q30 Pundirected_pagerank 0.012919 \n", "3 Q30-Pundirected_pagerank-521 P1545 2.000000 \n", "4 Q6581097 Pundirected_pagerank 0.008353 \n", "5 Q6581097-Pundirected_pagerank-14348 P1545 3.000000 \n", "6 Q15221623 Pundirected_pagerank 0.004738 \n", "7 Q15221623-Pundirected_pagerank-21293 P1545 4.000000 \n", "8 Q1860 Pundirected_pagerank 0.004441 \n", "9 Q1860-Pundirected_pagerank-4175 P1545 5.000000 \n", "\n", " id \\\n", "0 Q5-Pundirected_pagerank-1319 \n", "1 Q5-Pundirected_pagerank-1319-P1545-6b86b2 \n", "2 Q30-Pundirected_pagerank-521 \n", "3 Q30-Pundirected_pagerank-521-P1545-d4735e \n", "4 Q6581097-Pundirected_pagerank-14348 \n", "5 Q6581097-Pundirected_pagerank-14348-P1545-4e0740 \n", "6 Q15221623-Pundirected_pagerank-21293 \n", "7 Q15221623-Pundirected_pagerank-21293-P1545-4b2277 \n", "8 Q1860-Pundirected_pagerank-4175 \n", "9 Q1860-Pundirected_pagerank-4175-P1545-ef2d12 \n", "\n", " node1;label label;label \n", "0 'human'@en NaN \n", "1 NaN 'series ordinal'@en \n", "2 'United States of America'@en NaN \n", "3 NaN 'series ordinal'@en \n", "4 'male'@en NaN \n", "5 NaN 'series ordinal'@en \n", "6 'bilateral relation'@en NaN \n", "7 NaN 'series ordinal'@en \n", "8 'English'@en NaN \n", "9 NaN 'series ordinal'@en " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " head -i \"$OUT\"/useful_files/metadata.pagerank.undirected.ordinal.tsv.gz / add-labels\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Deploy the tutorial files to `$tutorial_files_path`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define the files we want to have in the tutorial" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "tutorial_files_parts = [\n", " \"labels.en.tsv.gz\",\n", " 
\"aliases.en.tsv.gz\",\n", " \"descriptions.en.tsv.gz\",\n", " \"claims.external-id.tsv.gz\",\n", " \"claims.monolingualtext.tsv.gz\",\n", " \"claims.quantity.tsv.gz\",\n", " \"claims.string.tsv.gz\",\n", " \"claims.time.tsv.gz\",\n", " \"claims.wikibase-item.tsv.gz\",\n", " \"claims.wikibase-property.tsv.gz\",\n", " \"qualifiers.tsv.gz\"\n", "]\n", "\n", "tutorial_files_useful = [\n", " \"derived.P279.tsv.gz\",\n", " \"derived.P279star.tsv.gz\",\n", " \"derived.P31.tsv.gz\",\n", " \"metadata.in_degree.tsv.gz\",\n", " \"metadata.out_degree.tsv.gz\"\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Deploy the files from the partition and useful notebooks. " ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "for file in tutorial_files_parts:\n", " path = \"$OUT/parts/\" + file\n", " !cp -p {path} $tutorial_files_path\n", "\n", "for file in tutorial_files_useful:\n", " path = \"$OUT/useful_files/\" + file\n", " !cp -p {path} $tutorial_files_path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Overwrite the original pagerank files with the ones that include ordinal" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "!cp -p $OUT/useful_files/metadata.pagerank.directed.ordinal.tsv.gz $tutorial_files_path/metadata.pagerank.directed.tsv.gz\n", "!cp -p $OUT/useful_files/metadata.pagerank.undirected.ordinal.tsv.gz $tutorial_files_path/metadata.pagerank.undirected.tsv.gz " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It is important to deploy the custom KGTK properties file. Copy it using KGTK to conveniently compress the file."
] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"\n", " cat \n", " -i \"$kgtk_path\"/kgtk-properties/kgtk.properties.tsv \n", " -i \"$OUT\"/parts/metadata.property.datatypes.tsv.gz\n", " -o \"$tutorial_files_path\"/metadata.property.datatypes.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 181880\n", "-rw-r--r-- 1 pedroszekely staff 1342345 Oct 10 11:52 aliases.en.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 44564618 Oct 10 11:36 all.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 13620313 Oct 10 11:52 claims.external-id.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 1069769 Oct 10 11:52 claims.monolingualtext.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 1936951 Oct 10 11:52 claims.quantity.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 1095875 Oct 10 11:52 claims.string.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 781182 Oct 10 11:52 claims.time.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 6332200 Oct 10 11:52 claims.wikibase-item.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 97267 Oct 10 11:52 claims.wikibase-property.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 390973 Oct 10 11:53 derived.P279.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 3325552 Oct 10 11:54 derived.P279star.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 1181395 Oct 10 11:53 derived.P31.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 1339811 Oct 10 11:52 descriptions.en.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 1064283 Oct 10 11:52 labels.en.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 309510 Oct 10 11:54 metadata.in_degree.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 585326 Oct 10 11:54 metadata.out_degree.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 1879161 Oct 10 11:55 metadata.pagerank.directed.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 2198657 Oct 10 11:55 metadata.pagerank.undirected.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 22631 Oct 10 11:55 
metadata.property.datatypes.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 6009306 Oct 10 11:52 qualifiers.tsv.gz\n" ] } ], "source": [ "!ls -l \"$tutorial_files_path\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create an `all.tsv.gz` file" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 285 ms, sys: 105 ms, total: 390 ms\n", "Wall time: 24 s\n" ] } ], "source": [ "%%time\n", "all_file_path = os.environ['tutorial_files_path'] + \"/all.tsv.gz\"\n", "if os.path.exists(all_file_path):\n", " !rm {all_file_path}\n", "!kgtk cat -i \"$tutorial_files_path\"/*.tsv.gz -o {all_file_path}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Peek at the file" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2idnode2;wikidatatype
0P10alias'gif'@enP10-alias-en-282226-0NaN
1P10alias'animation'@enP10-alias-en-2f86d8-0NaN
2P10alias'media'@enP10-alias-en-c1427e-0NaN
3P10alias'trailer (Commons)'@enP10-alias-en-c61ab1-0NaN
4P1001alias'belongs to jurisdiction'@enP1001-alias-en-0dd7ce-0NaN
5P1001alias'linked to jurisdiction'@enP1001-alias-en-106818-0NaN
6P1001alias'of jurisdiction'@enP1001-alias-en-7e4abe-0NaN
7P1001alias'applied to jurisdiction'@enP1001-alias-en-89ed18-0NaN
8P1001alias'jurisdiction'@enP1001-alias-en-a524ab-0NaN
9P1001alias'valid in jurisdiction'@enP1001-alias-en-ca2e7c-0NaN
\n", "
" ], "text/plain": [ " node1 label node2 id \\\n", "0 P10 alias 'gif'@en P10-alias-en-282226-0 \n", "1 P10 alias 'animation'@en P10-alias-en-2f86d8-0 \n", "2 P10 alias 'media'@en P10-alias-en-c1427e-0 \n", "3 P10 alias 'trailer (Commons)'@en P10-alias-en-c61ab1-0 \n", "4 P1001 alias 'belongs to jurisdiction'@en P1001-alias-en-0dd7ce-0 \n", "5 P1001 alias 'linked to jurisdiction'@en P1001-alias-en-106818-0 \n", "6 P1001 alias 'of jurisdiction'@en P1001-alias-en-7e4abe-0 \n", "7 P1001 alias 'applied to jurisdiction'@en P1001-alias-en-89ed18-0 \n", "8 P1001 alias 'jurisdiction'@en P1001-alias-en-a524ab-0 \n", "9 P1001 alias 'valid in jurisdiction'@en P1001-alias-en-ca2e7c-0 \n", "\n", " node2;wikidatatype \n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "5 NaN \n", "6 NaN \n", "7 NaN \n", "8 NaN \n", "9 NaN " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " head -i \"$tutorial_files_path\"/all.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Run the KGTK validator on the new knowledge graph" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "====================================================\n", "Data lines read: 2614949\n", "Data lines passed: 2614949\n", "CPU times: user 1.34 s, sys: 426 ms, total: 1.76 s\n", "Wall time: 2min 3s\n" ] } ], "source": [ "%%time\n", "!kgtk validate -i \"$tutorial_files_path\"/all.tsv.gz \\\n", " --allow-wikidata-lq-strings True \\\n", " --ignore-minimum-year True \\\n", " --ignore-maximum-year True" ] } ], "metadata": { "kernelspec": { "display_name": "kgtk-env", "language": "python", "name": "kgtk-env" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, 
"nbformat": 4, "nbformat_minor": 4 }