{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Class Visualization\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Preamble: set up the environment and files used in the tutorial"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:15.019726Z",
"iopub.status.busy": "2021-12-29T05:43:15.019511Z",
"iopub.status.idle": "2021-12-29T05:43:17.301772Z",
"shell.execute_reply": "2021-12-29T05:43:17.301144Z",
"shell.execute_reply.started": "2021-12-29T05:43:15.019679Z"
},
"tags": []
},
"outputs": [],
"source": [
"import io\n",
"import os\n",
"import subprocess\n",
"import sys\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from IPython.display import display, HTML\n",
"\n",
"from graph_tool.all import *\n",
"\n",
"import papermill as pm\n",
"\n",
"from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n",
"from kgtk.functions import kgtk, kypher"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:17.302558Z",
"iopub.status.busy": "2021-12-29T05:43:17.302429Z",
"iopub.status.idle": "2021-12-29T05:43:17.305783Z",
"shell.execute_reply": "2021-12-29T05:43:17.305151Z",
"shell.execute_reply.started": "2021-12-29T05:43:17.302540Z"
},
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Parameters\n",
"\n",
"kgtk_path = \"/Users/pedroszekely/Documents/GitHub/kgtk\"\n",
"\n",
"# Folder on local machine where to create the output and temporary folders\n",
"input_path = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2/\"\n",
"output_path = \"/Users/pedroszekely/Downloads/kypher/projects\"\n",
"graph_cache_path = \"/Users/pedroszekely/Downloads/kypher/class-visualization.sqlite3.db\"\n",
"project_name = \"class-visualization\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Our Wikidata distribution partitions the knowledge in Wikidata into smaller files that make it possible for you to pick and choose which files you want to use. Our tutorial KG is a subset of Wikidata, and is partitioned in the same way as the full Wikidata. The following is a partial list of all the files:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:17.308422Z",
"iopub.status.busy": "2021-12-29T05:43:17.308212Z",
"iopub.status.idle": "2021-12-29T05:43:17.312824Z",
"shell.execute_reply": "2021-12-29T05:43:17.312414Z",
"shell.execute_reply.started": "2021-12-29T05:43:17.308399Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"User home: /Users/pedroszekely\n",
"Current dir: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/use-cases\n",
"KGTK dir: /Users/pedroszekely/Documents/GitHub/kgtk\n",
"Use-cases dir: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases\n"
]
}
],
"source": [
"files = [\n",
" \"p279\",\n",
" \"p279star\",\n",
" \"label\"\n",
"]\n",
"\n",
"# statistics.Pinstance_count.tsv.gz\n",
"\n",
"ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n",
"ck.configure_kgtk(input_graph_path=input_path,\n",
" output_path=output_path,\n",
" graph_cache_path=graph_cache_path,\n",
" project_name=project_name,\n",
" debug=True\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The KGTK setup command defines environment variables for all the files so that you can reuse the Jupyter notebook when you install it on your local machine."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:17.313894Z",
"iopub.status.busy": "2021-12-29T05:43:17.313762Z",
"iopub.status.idle": "2021-12-29T05:43:17.317074Z",
"shell.execute_reply": "2021-12-29T05:43:17.316628Z",
"shell.execute_reply.started": "2021-12-29T05:43:17.313876Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"USE_CASES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases\n",
"kypher: kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/class-visualization.sqlite3.db\n",
"GRAPH: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2/\n",
"kgtk: kgtk --debug\n",
"KGTK_GRAPH_CACHE: /Users/pedroszekely/Downloads/kypher/class-visualization.sqlite3.db\n",
"EXAMPLES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/examples\n",
"STORE: /Users/pedroszekely/Downloads/kypher/class-visualization.sqlite3.db\n",
"KGTK_OPTION_DEBUG: false\n",
"OUT: /Users/pedroszekely/Downloads/kypher/projects/class-visualization\n",
"TEMP: /Users/pedroszekely/Downloads/kypher/projects/class-visualization/temp.class-visualization\n",
"KGTK_LABEL_FILE: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//labels.en.tsv.gz\n",
"p279: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//derived.P279.tsv.gz\n",
"p279star: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//derived.P279star.tsv.gz\n",
"label: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//labels.en.tsv.gz\n"
]
}
],
"source": [
"ck.print_env_variables()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:17.318022Z",
"iopub.status.busy": "2021-12-29T05:43:17.317885Z",
"iopub.status.idle": "2021-12-29T05:43:20.187984Z",
"shell.execute_reply": "2021-12-29T05:43:20.187179Z",
"shell.execute_reply.started": "2021-12-29T05:43:17.318005Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/class-visualization.sqlite3.db -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//derived.P279.tsv.gz\" --as p279 -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//derived.P279star.tsv.gz\" --as p279star -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//labels.en.tsv.gz\" --as label --limit 3\n",
"[2021-12-28 21:43:20 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT *\n",
" FROM graph_1 AS graph_1_c1\n",
" LIMIT ?\n",
" PARAS: [3]\n",
"---------------------------------------------\n",
"id\tnode1\tlabel\tnode2\n",
"Q100000030-P279-Q14748-30394205-0\tQ100000030\tP279\tQ14748\n",
"Q100000058-P279-Q1622444-bd182663-0\tQ100000058\tP279\tQ1622444\n",
"Q1000032-P279-Q1813494-0aa0f1dc-0\tQ1000032\tP279\tQ1813494\n"
]
}
],
"source": [
"ck.load_files_into_cache()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:20.189425Z",
"iopub.status.busy": "2021-12-29T05:43:20.189217Z",
"iopub.status.idle": "2021-12-29T05:43:23.099815Z",
"shell.execute_reply": "2021-12-29T05:43:23.098839Z",
"shell.execute_reply.started": "2021-12-29T05:43:20.189405Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-12-28 21:43:22 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT *\n",
" FROM graph_1 AS graph_1_c1\n",
" LIMIT ?\n",
" PARAS: [5]\n",
"---------------------------------------------\n",
"id\tnode1\tlabel\tnode2\n",
"Q100000030-P279-Q14748-30394205-0\tQ100000030\tP279\tQ14748\n",
"Q100000058-P279-Q1622444-bd182663-0\tQ100000058\tP279\tQ1622444\n",
"Q1000032-P279-Q1813494-0aa0f1dc-0\tQ1000032\tP279\tQ1813494\n",
"Q1000032-P279-Q83602-482a1943-0\tQ1000032\tP279\tQ83602\n",
"Q1000039-P279-Q11555767-2dddfd86-0\tQ1000039\tP279\tQ11555767\n"
]
}
],
"source": [
"!kgtk --debug query -i p279 --idx mode:monograph --limit 5"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:23.101657Z",
"iopub.status.busy": "2021-12-29T05:43:23.101339Z",
"iopub.status.idle": "2021-12-29T05:43:25.461409Z",
"shell.execute_reply": "2021-12-29T05:43:25.460655Z",
"shell.execute_reply.started": "2021-12-29T05:43:23.101626Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-12-28 21:43:25 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT *\n",
" FROM graph_2 AS graph_2_c1\n",
" LIMIT ?\n",
" PARAS: [5]\n",
"---------------------------------------------\n",
"node1\tlabel\tnode2\tid\n",
"Q100000030\tP279star\tQ100000030\tQ100000030-P279star-Q100000030-0000\n",
"Q100000030\tP279star\tQ1357761\tQ100000030-P279star-Q1357761-0000\n",
"Q100000030\tP279star\tQ14745\tQ100000030-P279star-Q14745-0000\n",
"Q100000030\tP279star\tQ14748\tQ100000030-P279star-Q14748-0000\n",
"Q100000030\tP279star\tQ15401930\tQ100000030-P279star-Q15401930-0000\n"
]
}
],
"source": [
"!kgtk --debug query -i p279star --idx mode:monograph --limit 5"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:25.463116Z",
"iopub.status.busy": "2021-12-29T05:43:25.462861Z",
"iopub.status.idle": "2021-12-29T05:43:28.104074Z",
"shell.execute_reply": "2021-12-29T05:43:28.103009Z",
"shell.execute_reply.started": "2021-12-29T05:43:25.463088Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-12-28 21:43:27 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT *\n",
" FROM graph_3 AS graph_3_c1\n",
" LIMIT ?\n",
" PARAS: [5]\n",
"---------------------------------------------\n",
"id\tnode1\tlabel\tnode2\n",
"P10-label-en\tP10\tlabel\t'video'@en\n",
"P1000-label-en\tP1000\tlabel\t'record held'@en\n",
"P1001-label-en\tP1001\tlabel\t'applies to jurisdiction'@en\n",
"P1002-label-en\tP1002\tlabel\t'engine configuration'@en\n",
"P1003-label-en\tP1003\tlabel\t'National Library of Romania ID'@en\n"
]
}
],
"source": [
"!kgtk --debug query -i label --idx mode:monograph --limit 5"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get a list of all the classes\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-23T19:59:38.096353Z",
"iopub.status.busy": "2021-12-23T19:59:38.096121Z",
"iopub.status.idle": "2021-12-23T19:59:38.100393Z",
"shell.execute_reply": "2021-12-23T19:59:38.099645Z",
"shell.execute_reply.started": "2021-12-23T19:59:38.096330Z"
}
},
"source": [
"First get a list of all the `node1` in p279"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:28.106498Z",
"iopub.status.busy": "2021-12-29T05:43:28.105980Z",
"iopub.status.idle": "2021-12-29T05:43:36.573544Z",
"shell.execute_reply": "2021-12-29T05:43:36.572740Z",
"shell.execute_reply.started": "2021-12-29T05:43:28.106461Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" query -i p279\n",
" --match '(class)-[]->()'\n",
" --return 'distinct class as id'\n",
" -o $TEMP/p279.node1.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:36.575074Z",
"iopub.status.busy": "2021-12-29T05:43:36.574825Z",
"iopub.status.idle": "2021-12-29T05:43:36.819701Z",
"shell.execute_reply": "2021-12-29T05:43:36.818712Z",
"shell.execute_reply.started": "2021-12-29T05:43:36.575046Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2493245\n"
]
}
],
"source": [
"!zcat < $TEMP/p279.node1.tsv.gz | wc -l"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now get a list of all the node2 in p279"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:36.821835Z",
"iopub.status.busy": "2021-12-29T05:43:36.821560Z",
"iopub.status.idle": "2021-12-29T05:43:39.656109Z",
"shell.execute_reply": "2021-12-29T05:43:39.655342Z",
"shell.execute_reply.started": "2021-12-29T05:43:36.821802Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" query -i p279\n",
" --match '()-[]->(class)'\n",
" --return 'distinct class as id'\n",
" -o $TEMP/p279.node2.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:39.661726Z",
"iopub.status.busy": "2021-12-29T05:43:39.661485Z",
"iopub.status.idle": "2021-12-29T05:43:39.797443Z",
"shell.execute_reply": "2021-12-29T05:43:39.796756Z",
"shell.execute_reply.started": "2021-12-29T05:43:39.661699Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 126327\n"
]
}
],
"source": [
"!zcat < $TEMP/p279.node2.tsv.gz | wc -l"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:39.798885Z",
"iopub.status.busy": "2021-12-29T05:43:39.798598Z",
"iopub.status.idle": "2021-12-29T05:43:45.898775Z",
"shell.execute_reply": "2021-12-29T05:43:45.897950Z",
"shell.execute_reply.started": "2021-12-29T05:43:39.798860Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" ifnotexists --mode NONE \n",
" -i $TEMP/p279.node2.tsv.gz\n",
" --filter-on $TEMP/p279.node1.tsv.gz\n",
" --input-keys id\n",
" --filter-keys id\n",
" -o $TEMP/p279.classes-that-are-not-subclasses.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:45.900456Z",
"iopub.status.busy": "2021-12-29T05:43:45.900068Z",
"iopub.status.idle": "2021-12-29T05:43:46.032973Z",
"shell.execute_reply": "2021-12-29T05:43:46.032391Z",
"shell.execute_reply.started": "2021-12-29T05:43:45.900426Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 10700\n"
]
}
],
"source": [
"!zcat < $TEMP/p279.classes-that-are-not-subclasses.tsv.gz | wc -l"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenate the files to get a list of all the classes"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:46.034585Z",
"iopub.status.busy": "2021-12-29T05:43:46.034288Z",
"iopub.status.idle": "2021-12-29T05:43:52.926892Z",
"shell.execute_reply": "2021-12-29T05:43:52.926055Z",
"shell.execute_reply.started": "2021-12-29T05:43:46.034566Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" cat --mode NONE -i $TEMP/p279.node1.tsv.gz -i $TEMP/p279.classes-that-are-not-subclasses.tsv.gz\n",
" / sort --mode NONE --column id\n",
" -o $OUT/classes.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:52.928636Z",
"iopub.status.busy": "2021-12-29T05:43:52.928389Z",
"iopub.status.idle": "2021-12-29T05:43:53.110141Z",
"shell.execute_reply": "2021-12-29T05:43:53.109140Z",
"shell.execute_reply.started": "2021-12-29T05:43:52.928605Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2503944\n"
]
}
],
"source": [
"!zcat < $OUT/classes.tsv.gz | wc -l"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Measure the degree of classes"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:43:53.112571Z",
"iopub.status.busy": "2021-12-29T05:43:53.111921Z",
"iopub.status.idle": "2021-12-29T05:45:05.983870Z",
"shell.execute_reply": "2021-12-29T05:45:05.983073Z",
"shell.execute_reply.started": "2021-12-29T05:43:53.112538Z"
}
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" graph-statistics -i \"$p279\" -o $OUT/statistics.p279.tsv.gz \n",
" --compute-pagerank False \n",
" --compute-hits False \n",
" --page-rank-property Pdirected_pagerank \n",
" --vertex-in-degree-property Pindegree\n",
" --vertex-out-degree-property Poutdegree\n",
" --output-degrees True \n",
" --output-pagerank False \n",
" --output-hits False \\\n",
" --output-statistics-only \n",
" --undirected False \n",
" --log-file $TEMP/statistics.summary.txt\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:45:05.985638Z",
"iopub.status.busy": "2021-12-29T05:45:05.985353Z",
"iopub.status.idle": "2021-12-29T05:45:15.728202Z",
"shell.execute_reply": "2021-12-29T05:45:15.727311Z",
"shell.execute_reply.started": "2021-12-29T05:45:05.985611Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"sort -i $OUT/statistics.p279.tsv.gz --columns node2 --numeric --reverse -o $TEMP.p279.indegree.tsv.gz\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:45:15.729799Z",
"iopub.status.busy": "2021-12-29T05:45:15.729515Z",
"iopub.status.idle": "2021-12-29T05:47:08.424492Z",
"shell.execute_reply": "2021-12-29T05:47:08.423765Z",
"shell.execute_reply.started": "2021-12-29T05:45:15.729770Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" id | \n",
" node1;label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q20747295 | \n",
" Pindegree | \n",
" 942004 | \n",
" Q20747295-Pindegree-19626 | \n",
" 'protein-coding gene'@en | \n",
"
\n",
" \n",
" 1 | \n",
" Q8054 | \n",
" Pindegree | \n",
" 764038 | \n",
" Q8054-Pindegree-15274 | \n",
" 'protein'@en | \n",
"
\n",
" \n",
" 2 | \n",
" Q7187 | \n",
" Pindegree | \n",
" 449619 | \n",
" Q7187-Pindegree-5566 | \n",
" 'gene'@en | \n",
"
\n",
" \n",
" 3 | \n",
" Q277338 | \n",
" Pindegree | \n",
" 49936 | \n",
" Q277338-Pindegree-220748 | \n",
" 'pseudogene'@en | \n",
"
\n",
" \n",
" 4 | \n",
" Q427087 | \n",
" Pindegree | \n",
" 47843 | \n",
" Q427087-Pindegree-197396 | \n",
" 'non-coding RNA'@en | \n",
"
\n",
" \n",
" 5 | \n",
" Q382617 | \n",
" Pindegree | \n",
" 40184 | \n",
" Q382617-Pindegree-45664 | \n",
" 'mayor of a place in France'@en | \n",
"
\n",
" \n",
" 6 | \n",
" Q15113603 | \n",
" Pindegree | \n",
" 40179 | \n",
" Q15113603-Pindegree-197900 | \n",
" 'municipal councillor'@en | \n",
"
\n",
" \n",
" 7 | \n",
" Q11173 | \n",
" Pindegree | \n",
" 14255 | \n",
" Q11173-Pindegree-638 | \n",
" 'chemical compound'@en | \n",
"
\n",
" \n",
" 8 | \n",
" Q64698614 | \n",
" Pindegree | \n",
" 8832 | \n",
" Q64698614-Pindegree-2767278 | \n",
" 'pseudogenic transcript'@en | \n",
"
\n",
" \n",
" 9 | \n",
" Q201448 | \n",
" Pindegree | \n",
" 8724 | \n",
" Q201448-Pindegree-278588 | \n",
" 'transfer RNA'@en | \n",
"
\n",
" \n",
" 10 | \n",
" Q5663900 | \n",
" Pindegree | \n",
" 8011 | \n",
" Q5663900-Pindegree-46632 | \n",
" 'mayor of a place in Spain'@en | \n",
"
\n",
" \n",
" 11 | \n",
" Q11436 | \n",
" Pindegree | \n",
" 4486 | \n",
" Q11436-Pindegree-620 | \n",
" 'aircraft'@en | \n",
"
\n",
" \n",
" 12 | \n",
" Q284416 | \n",
" Pindegree | \n",
" 3392 | \n",
" Q284416-Pindegree-280684 | \n",
" 'small nucleolar RNA'@en | \n",
"
\n",
" \n",
" 13 | \n",
" Q99762605 | \n",
" Pindegree | \n",
" 3186 | \n",
" Q99762605-Pindegree-588940 | \n",
" 'mayor of a place in Romania'@en | \n",
"
\n",
" \n",
" 14 | \n",
" Q2095 | \n",
" Pindegree | \n",
" 2837 | \n",
" Q2095-Pindegree-1938 | \n",
" 'food'@en | \n",
"
\n",
" \n",
" 15 | \n",
" Q20650761 | \n",
" Pindegree | \n",
" 2459 | \n",
" Q20650761-Pindegree-5618 | \n",
" 'tender locomotive'@en | \n",
"
\n",
" \n",
" 16 | \n",
" Q1125341 | \n",
" Pindegree | \n",
" 2379 | \n",
" Q1125341-Pindegree-26474 | \n",
" 'Italian wine'@en | \n",
"
\n",
" \n",
" 17 | \n",
" Q30185 | \n",
" Pindegree | \n",
" 2350 | \n",
" Q30185-Pindegree-1190 | \n",
" 'mayor'@en | \n",
"
\n",
" \n",
" 18 | \n",
" Q483373 | \n",
" Pindegree | \n",
" 2063 | \n",
" Q483373-Pindegree-2144 | \n",
" 'electric multiple unit'@en | \n",
"
\n",
" \n",
" 19 | \n",
" Q200779 | \n",
" Pindegree | \n",
" 2027 | \n",
" Q200779-Pindegree-24990 | \n",
" 'genetic disease'@en | \n",
"
\n",
" \n",
" 20 | \n",
" Q215980 | \n",
" Pindegree | \n",
" 1843 | \n",
" Q215980-Pindegree-113320 | \n",
" 'ribosomal RNA'@en | \n",
"
\n",
" \n",
" 21 | \n",
" Q17517 | \n",
" Pindegree | \n",
" 1755 | \n",
" Q17517-Pindegree-24936 | \n",
" 'mobile phone'@en | \n",
"
\n",
" \n",
" 22 | \n",
" Q13219666 | \n",
" Pindegree | \n",
" 1748 | \n",
" Q13219666-Pindegree-23322 | \n",
" 'tennis tournament'@en | \n",
"
\n",
" \n",
" 23 | \n",
" Q2449730 | \n",
" Pindegree | \n",
" 1739 | \n",
" Q2449730-Pindegree-16150 | \n",
" 'transport protein'@en | \n",
"
\n",
" \n",
" 24 | \n",
" Q785745 | \n",
" Pindegree | \n",
" 1536 | \n",
" Q785745-Pindegree-5976 | \n",
" 'tank locomotive'@en | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label node2 id \\\n",
"0 Q20747295 Pindegree 942004 Q20747295-Pindegree-19626 \n",
"1 Q8054 Pindegree 764038 Q8054-Pindegree-15274 \n",
"2 Q7187 Pindegree 449619 Q7187-Pindegree-5566 \n",
"3 Q277338 Pindegree 49936 Q277338-Pindegree-220748 \n",
"4 Q427087 Pindegree 47843 Q427087-Pindegree-197396 \n",
"5 Q382617 Pindegree 40184 Q382617-Pindegree-45664 \n",
"6 Q15113603 Pindegree 40179 Q15113603-Pindegree-197900 \n",
"7 Q11173 Pindegree 14255 Q11173-Pindegree-638 \n",
"8 Q64698614 Pindegree 8832 Q64698614-Pindegree-2767278 \n",
"9 Q201448 Pindegree 8724 Q201448-Pindegree-278588 \n",
"10 Q5663900 Pindegree 8011 Q5663900-Pindegree-46632 \n",
"11 Q11436 Pindegree 4486 Q11436-Pindegree-620 \n",
"12 Q284416 Pindegree 3392 Q284416-Pindegree-280684 \n",
"13 Q99762605 Pindegree 3186 Q99762605-Pindegree-588940 \n",
"14 Q2095 Pindegree 2837 Q2095-Pindegree-1938 \n",
"15 Q20650761 Pindegree 2459 Q20650761-Pindegree-5618 \n",
"16 Q1125341 Pindegree 2379 Q1125341-Pindegree-26474 \n",
"17 Q30185 Pindegree 2350 Q30185-Pindegree-1190 \n",
"18 Q483373 Pindegree 2063 Q483373-Pindegree-2144 \n",
"19 Q200779 Pindegree 2027 Q200779-Pindegree-24990 \n",
"20 Q215980 Pindegree 1843 Q215980-Pindegree-113320 \n",
"21 Q17517 Pindegree 1755 Q17517-Pindegree-24936 \n",
"22 Q13219666 Pindegree 1748 Q13219666-Pindegree-23322 \n",
"23 Q2449730 Pindegree 1739 Q2449730-Pindegree-16150 \n",
"24 Q785745 Pindegree 1536 Q785745-Pindegree-5976 \n",
"\n",
" node1;label \n",
"0 'protein-coding gene'@en \n",
"1 'protein'@en \n",
"2 'gene'@en \n",
"3 'pseudogene'@en \n",
"4 'non-coding RNA'@en \n",
"5 'mayor of a place in France'@en \n",
"6 'municipal councillor'@en \n",
"7 'chemical compound'@en \n",
"8 'pseudogenic transcript'@en \n",
"9 'transfer RNA'@en \n",
"10 'mayor of a place in Spain'@en \n",
"11 'aircraft'@en \n",
"12 'small nucleolar RNA'@en \n",
"13 'mayor of a place in Romania'@en \n",
"14 'food'@en \n",
"15 'tender locomotive'@en \n",
"16 'Italian wine'@en \n",
"17 'mayor'@en \n",
"18 'electric multiple unit'@en \n",
"19 'genetic disease'@en \n",
"20 'ribosomal RNA'@en \n",
"21 'mobile phone'@en \n",
"22 'tennis tournament'@en \n",
"23 'transport protein'@en \n",
"24 'tank locomotive'@en "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -i $TEMP.p279.indegree.tsv.gz -n 25 / add-labels\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:47:08.425868Z",
"iopub.status.busy": "2021-12-29T05:47:08.425574Z",
"iopub.status.idle": "2021-12-29T05:47:36.770575Z",
"shell.execute_reply": "2021-12-29T05:47:36.769788Z",
"shell.execute_reply.started": "2021-12-29T05:47:08.425840Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q20747295 | \n",
" Pindegree | \n",
" 942004 | \n",
" Q20747295-Pindegree-19626 | \n",
"
\n",
" \n",
" 1 | \n",
" Q8054 | \n",
" Pindegree | \n",
" 764038 | \n",
" Q8054-Pindegree-15274 | \n",
"
\n",
" \n",
" 2 | \n",
" Q7187 | \n",
" Pindegree | \n",
" 449619 | \n",
" Q7187-Pindegree-5566 | \n",
"
\n",
" \n",
" 3 | \n",
" Q277338 | \n",
" Pindegree | \n",
" 49936 | \n",
" Q277338-Pindegree-220748 | \n",
"
\n",
" \n",
" 4 | \n",
" Q427087 | \n",
" Pindegree | \n",
" 47843 | \n",
" Q427087-Pindegree-197396 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 63 | \n",
" Q1183543 | \n",
" Pindegree | \n",
" 517 | \n",
" Q1183543-Pindegree-330 | \n",
"
\n",
" \n",
" 64 | \n",
" Q7368 | \n",
" Pindegree | \n",
" 516 | \n",
" Q7368-Pindegree-2150 | \n",
"
\n",
" \n",
" 65 | \n",
" Q11415564 | \n",
" Pindegree | \n",
" 512 | \n",
" Q11415564-Pindegree-656 | \n",
"
\n",
" \n",
" 66 | \n",
" Q87008012 | \n",
" Pindegree | \n",
" 501 | \n",
" Q87008012-Pindegree-5226 | \n",
"
\n",
" \n",
" 67 | \n",
" Q62927 | \n",
" Pindegree | \n",
" 501 | \n",
" Q62927-Pindegree-26744 | \n",
"
\n",
" \n",
"
\n",
"
68 rows × 4 columns
\n",
"
"
],
"text/plain": [
" node1 label node2 id\n",
"0 Q20747295 Pindegree 942004 Q20747295-Pindegree-19626\n",
"1 Q8054 Pindegree 764038 Q8054-Pindegree-15274\n",
"2 Q7187 Pindegree 449619 Q7187-Pindegree-5566\n",
"3 Q277338 Pindegree 49936 Q277338-Pindegree-220748\n",
"4 Q427087 Pindegree 47843 Q427087-Pindegree-197396\n",
".. ... ... ... ...\n",
"63 Q1183543 Pindegree 517 Q1183543-Pindegree-330\n",
"64 Q7368 Pindegree 516 Q7368-Pindegree-2150\n",
"65 Q11415564 Pindegree 512 Q11415564-Pindegree-656\n",
"66 Q87008012 Pindegree 501 Q87008012-Pindegree-5226\n",
"67 Q62927 Pindegree 501 Q62927-Pindegree-26744\n",
"\n",
"[68 rows x 4 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"\"\"\n",
" query -i $OUT/statistics.p279.tsv.gz \n",
" --match '(n1)-[eid]->(degree)' \n",
" --where 'cast(degree, int) > 500' \n",
" --order-by 'cast(degree, int) desc'\n",
"\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create list of high and low `P279` degree classes "
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T03:09:08.111294Z",
"iopub.status.busy": "2021-12-31T03:09:08.110981Z",
"iopub.status.idle": "2021-12-31T03:09:14.055689Z",
"shell.execute_reply": "2021-12-31T03:09:14.054890Z",
"shell.execute_reply.started": "2021-12-31T03:09:08.111253Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" query -i $OUT/statistics.p279.tsv.gz \n",
" --match '(n1)-[:Pindegree]->(degree)' \n",
" --where 'cast(degree, int) < 500' \n",
" --return 'n1 as node1, \"few_subclasses\" as node_type'\n",
" --order-by 'cast(degree, int) desc'\n",
" -o $OUT/class-browsing.low-degree-nodes.tsv\n",
"\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `class-browsing.low-degree-nodes.tsv` is simply a list of nodes:"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T03:09:14.057768Z",
"iopub.status.busy": "2021-12-31T03:09:14.057544Z",
"iopub.status.idle": "2021-12-31T03:09:16.630284Z",
"shell.execute_reply": "2021-12-31T03:09:16.629376Z",
"shell.execute_reply.started": "2021-12-31T03:09:14.057738Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q898273 | \n",
"
\n",
" \n",
" 1 | \n",
" Q1002954 | \n",
"
\n",
" \n",
" 2 | \n",
" Q11446 | \n",
"
\n",
" \n",
" 3 | \n",
" Q22325163 | \n",
"
\n",
" \n",
" 4 | \n",
" Q79529 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1\n",
"0 Q898273\n",
"1 Q1002954\n",
"2 Q11446\n",
"3 Q22325163\n",
"4 Q79529"
]
},
"execution_count": 183,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -n 5 -i $OUT/class-browsing.low-degree-nodes.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T03:09:16.631813Z",
"iopub.status.busy": "2021-12-31T03:09:16.631562Z",
"iopub.status.idle": "2021-12-31T03:09:19.487327Z",
"shell.execute_reply": "2021-12-31T03:09:19.486446Z",
"shell.execute_reply.started": "2021-12-31T03:09:16.631784Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" query -i $OUT/statistics.p279.tsv.gz \n",
" --match '(n1)-[:Pindegree]->(degree)' \n",
" --where 'cast(degree, int) > 499'\n",
" --return 'n1 as node1, \"many_subclasses\" as node_type'\n",
" --order-by 'cast(degree, int) desc'\n",
" -o $OUT/class-browsing.high-degree-nodes.tsv\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 189,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T03:17:03.265480Z",
"iopub.status.busy": "2021-12-31T03:17:03.265178Z",
"iopub.status.idle": "2021-12-31T03:17:06.304141Z",
"shell.execute_reply": "2021-12-31T03:17:06.303177Z",
"shell.execute_reply.started": "2021-12-31T03:17:03.265449Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" cat --use-graph-cache-envar False --mode NONE -i $OUT/class-browsing.low-degree-nodes.tsv -i $OUT/class-browsing.high-degree-nodes.tsv\n",
" -o $OUT/class-browsing.all-nodes.tsv\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 190,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T03:17:34.320154Z",
"iopub.status.busy": "2021-12-31T03:17:34.319885Z",
"iopub.status.idle": "2021-12-31T03:17:36.605149Z",
"shell.execute_reply": "2021-12-31T03:17:36.604366Z",
"shell.execute_reply.started": "2021-12-31T03:17:34.320125Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" node_type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q898273 | \n",
" few_subclasses | \n",
"
\n",
" \n",
" 1 | \n",
" Q1002954 | \n",
" few_subclasses | \n",
"
\n",
" \n",
" 2 | \n",
" Q11446 | \n",
" few_subclasses | \n",
"
\n",
" \n",
" 3 | \n",
" Q22325163 | \n",
" few_subclasses | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 node_type\n",
"0 Q898273 few_subclasses\n",
"1 Q1002954 few_subclasses\n",
"2 Q11446 few_subclasses\n",
"3 Q22325163 few_subclasses"
]
},
"execution_count": 190,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -i $OUT/class-browsing.all-nodes.tsv -n 4\")"
]
},
{
"cell_type": "code",
"execution_count": 191,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T03:18:34.732099Z",
"iopub.status.busy": "2021-12-31T03:18:34.731864Z",
"iopub.status.idle": "2021-12-31T03:18:43.863929Z",
"shell.execute_reply": "2021-12-31T03:18:43.863301Z",
"shell.execute_reply.started": "2021-12-31T03:18:34.732078Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-12-30 19:18:36 sqlstore]: IMPORT graph directly into table graph_43 from /Users/pedroszekely/Downloads/kypher/projects/class-visualization/class-browsing.all-nodes.tsv ...\n",
"[2021-12-30 19:18:42 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT *\n",
" FROM graph_43 AS graph_43_c1\n",
" LIMIT ?\n",
" PARAS: [3]\n",
"---------------------------------------------\n",
"[2021-12-30 19:18:42 sqlstore]: CREATE INDEX \"graph_43_node1_node_type_idx\" ON \"graph_43\" (\"node1\", \"node_type\")\n",
"[2021-12-30 19:18:43 sqlstore]: ANALYZE \"graph_43_node1_node_type_idx\"\n",
"node1\tnode_type\n",
"Q898273\tfew_subclasses\n",
"Q1002954\tfew_subclasses\n",
"Q11446\tfew_subclasses\n"
]
}
],
"source": [
"!kgtk --debug query -i $OUT/class-browsing.all-nodes.tsv --as browsernodes --idx index:node1,node_type --limit 3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create a P279star file that we will use for visualization.\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### First create a complete p279star file containing all classes\n",
"\n",
"First create a complete P279star file that contains all classes as our starting point. We do this because in the browser, users can click on any class."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T05:47:50.803142Z",
"iopub.status.busy": "2021-12-29T05:47:50.802648Z",
"iopub.status.idle": "2021-12-29T06:57:29.672950Z",
"shell.execute_reply": "2021-12-29T06:57:29.672140Z",
"shell.execute_reply.started": "2021-12-29T05:47:50.803099Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" reachable-nodes\n",
" --rootfile $OUT/classes.tsv.gz\n",
" --selflink \n",
" --breadth-first True\n",
" --show-distance True\n",
" --label P279star\n",
" -i \"$p279\"\n",
" -o $TEMP/derived.p279star.complete.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T06:57:29.674498Z",
"iopub.status.busy": "2021-12-29T06:57:29.674193Z",
"iopub.status.idle": "2021-12-29T06:57:32.581005Z",
"shell.execute_reply": "2021-12-29T06:57:32.580365Z",
"shell.execute_reply.started": "2021-12-29T06:57:29.674466Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" distance | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q100000030 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q14748 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q14745 | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q1357761 | \n",
" 3 | \n",
"
\n",
" \n",
" 4 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q2424752 | \n",
" 3 | \n",
"
\n",
" \n",
" 5 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q31807746 | \n",
" 3 | \n",
"
\n",
" \n",
" 6 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q8205328 | \n",
" 3 | \n",
"
\n",
" \n",
" 7 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q223557 | \n",
" 4 | \n",
"
\n",
" \n",
" 8 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q15401930 | \n",
" 4 | \n",
"
\n",
" \n",
" 9 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q28877 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label node2 distance\n",
"0 Q100000030 P279star Q100000030 0\n",
"1 Q100000030 P279star Q14748 1\n",
"2 Q100000030 P279star Q14745 2\n",
"3 Q100000030 P279star Q1357761 3\n",
"4 Q100000030 P279star Q2424752 3\n",
"5 Q100000030 P279star Q31807746 3\n",
"6 Q100000030 P279star Q8205328 3\n",
"7 Q100000030 P279star Q223557 4\n",
"8 Q100000030 P279star Q15401930 4\n",
"9 Q100000030 P279star Q28877 4"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -i $TEMP/derived.p279star.complete.tsv.gz -n 10\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The complete p279star file has only a few more edges than the default one. We should replace the original one with the complete one in any case."
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T06:57:32.582753Z",
"iopub.status.busy": "2021-12-29T06:57:32.582281Z",
"iopub.status.idle": "2021-12-29T06:58:36.326425Z",
"shell.execute_reply": "2021-12-29T06:58:36.325226Z",
"shell.execute_reply.started": "2021-12-29T06:57:32.582726Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 87773437\n"
]
}
],
"source": [
"!zcat < \"$p279star\" | wc -l"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T06:58:36.328860Z",
"iopub.status.busy": "2021-12-29T06:58:36.328552Z",
"iopub.status.idle": "2021-12-29T06:58:39.481549Z",
"shell.execute_reply": "2021-12-29T06:58:39.480285Z",
"shell.execute_reply.started": "2021-12-29T06:58:36.328831Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 87783113\n"
]
}
],
"source": [
"!zcat < $TEMP/derived.p279star.complete.tsv.gz | wc -l"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Add ids and index for use in queries. The new file has a distance column, which we index too so that we can do index queries quickly."
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T06:58:39.483647Z",
"iopub.status.busy": "2021-12-29T06:58:39.483339Z",
"iopub.status.idle": "2021-12-29T07:14:43.442208Z",
"shell.execute_reply": "2021-12-29T07:14:43.441420Z",
"shell.execute_reply.started": "2021-12-29T06:58:39.483617Z"
}
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" add-id --id-style wikidata -i $TEMP/derived.p279star.complete.tsv.gz\n",
" -o $OUT/derived.p279star.complete.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:14:43.444107Z",
"iopub.status.busy": "2021-12-29T07:14:43.443770Z",
"iopub.status.idle": "2021-12-29T07:25:14.256091Z",
"shell.execute_reply": "2021-12-29T07:25:14.255260Z",
"shell.execute_reply.started": "2021-12-29T07:14:43.444076Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-12-28 23:14:45 sqlstore]: DROP graph data table graph_5 from p279stard\n",
"[2021-12-28 23:16:30 sqlstore]: IMPORT graph directly into table graph_28 from /Users/pedroszekely/Downloads/kypher/projects/class-visualization/derived.p279star.complete.tsv.gz ...\n",
"[2021-12-28 23:22:35 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT *\n",
" FROM graph_28 AS graph_28_c1\n",
" LIMIT ?\n",
" PARAS: [3]\n",
"---------------------------------------------\n",
"[2021-12-28 23:22:35 sqlstore]: CREATE INDEX \"graph_28_node2_node1_distance_idx\" ON \"graph_28\" (\"node2\", \"node1\", \"distance\")\n",
"[2021-12-28 23:25:02 sqlstore]: ANALYZE \"graph_28_node2_node1_distance_idx\"\n",
"node1\tlabel\tnode2\tdistance\tid\n",
"Q100000030\tP279star\tQ100000030\t0\tQ100000030-P279star-Q100000030\n",
"Q100000030\tP279star\tQ14748\t1\tQ100000030-P279star-Q14748\n",
"Q100000030\tP279star\tQ14745\t2\tQ100000030-P279star-Q14745\n"
]
}
],
"source": [
"!kgtk --debug query -i $OUT/derived.p279star.complete.tsv.gz --as p279stard --idx index:node2,node1,distance --limit 3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Count the number of subclasses \n",
"We eventually want to build the subclass graph for each class, but some may be too large"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:25:14.257723Z",
"iopub.status.busy": "2021-12-29T07:25:14.257496Z",
"iopub.status.idle": "2021-12-29T07:27:12.034978Z",
"shell.execute_reply": "2021-12-29T07:27:12.033921Z",
"shell.execute_reply.started": "2021-12-29T07:25:14.257696Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" query -i p279starcomplete\n",
" --match '\n",
" (subclass)-[]->(class)'\n",
" --return 'class as node1, \"Pcount_subclasses\" as label, count(distinct subclass) as node2, class as graph'\n",
" --where 'subclass != class'\n",
" --order-by 'cast(node2, int) desc'\n",
" -o $TEMP/subclass.count.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get an overview of the file. The top classes have an enormous number of subclasses, which will cause trouble for visualization.\n",
"Also, only 126K classes with subclasses, so there are a lot of leaf classes in Wikidata.\n",
"\n",
"In the steps below we exclude the high degree classes, but that won't fix the problem as the top classes have too many subclasses anyway. Sigh. The browser will freeze and the user will be annoyed."
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:27:12.036779Z",
"iopub.status.busy": "2021-12-29T07:27:12.036491Z",
"iopub.status.idle": "2021-12-29T07:28:57.065443Z",
"shell.execute_reply": "2021-12-29T07:28:57.064720Z",
"shell.execute_reply.started": "2021-12-29T07:27:12.036749Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" graph | \n",
" node1;label | \n",
" graph;label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q35120 | \n",
" Pcount_subclasses | \n",
" 2461204 | \n",
" Q35120 | \n",
" 'entity'@en | \n",
" 'entity'@en | \n",
"
\n",
" \n",
" 1 | \n",
" Q99527517 | \n",
" Pcount_subclasses | \n",
" 2254394 | \n",
" Q99527517 | \n",
" 'collection entity'@en | \n",
" 'collection entity'@en | \n",
"
\n",
" \n",
" 2 | \n",
" Q28813620 | \n",
" Pcount_subclasses | \n",
" 1362927 | \n",
" Q28813620 | \n",
" 'set'@en | \n",
" 'set'@en | \n",
"
\n",
" \n",
" 3 | \n",
" Q16887380 | \n",
" Pcount_subclasses | \n",
" 1362452 | \n",
" Q16887380 | \n",
" 'group'@en | \n",
" 'group'@en | \n",
"
\n",
" \n",
" 4 | \n",
" Q488383 | \n",
" Pcount_subclasses | \n",
" 1286223 | \n",
" Q488383 | \n",
" 'object'@en | \n",
" 'object'@en | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 126319 | \n",
" Q99970237 | \n",
" Pcount_subclasses | \n",
" 1 | \n",
" Q99970237 | \n",
" 'anthropomorphic deer'@en | \n",
" 'anthropomorphic deer'@en | \n",
"
\n",
" \n",
" 126320 | \n",
" Q99971015 | \n",
" Pcount_subclasses | \n",
" 1 | \n",
" Q99971015 | \n",
" 'anthropomorphic cow or other cattle'@en | \n",
" 'anthropomorphic cow or other cattle'@en | \n",
"
\n",
" \n",
" 126321 | \n",
" Q99972330 | \n",
" Pcount_subclasses | \n",
" 1 | \n",
" Q99972330 | \n",
" 'video game occupation'@en | \n",
" 'video game occupation'@en | \n",
"
\n",
" \n",
" 126322 | \n",
" Q99974769 | \n",
" Pcount_subclasses | \n",
" 1 | \n",
" Q99974769 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 126323 | \n",
" Q999897 | \n",
" Pcount_subclasses | \n",
" 1 | \n",
" Q999897 | \n",
" 'middle management'@en | \n",
" 'middle management'@en | \n",
"
\n",
" \n",
"
\n",
"
126324 rows × 6 columns
\n",
"
"
],
"text/plain": [
" node1 label node2 graph \\\n",
"0 Q35120 Pcount_subclasses 2461204 Q35120 \n",
"1 Q99527517 Pcount_subclasses 2254394 Q99527517 \n",
"2 Q28813620 Pcount_subclasses 1362927 Q28813620 \n",
"3 Q16887380 Pcount_subclasses 1362452 Q16887380 \n",
"4 Q488383 Pcount_subclasses 1286223 Q488383 \n",
"... ... ... ... ... \n",
"126319 Q99970237 Pcount_subclasses 1 Q99970237 \n",
"126320 Q99971015 Pcount_subclasses 1 Q99971015 \n",
"126321 Q99972330 Pcount_subclasses 1 Q99972330 \n",
"126322 Q99974769 Pcount_subclasses 1 Q99974769 \n",
"126323 Q999897 Pcount_subclasses 1 Q999897 \n",
"\n",
" node1;label \\\n",
"0 'entity'@en \n",
"1 'collection entity'@en \n",
"2 'set'@en \n",
"3 'group'@en \n",
"4 'object'@en \n",
"... ... \n",
"126319 'anthropomorphic deer'@en \n",
"126320 'anthropomorphic cow or other cattle'@en \n",
"126321 'video game occupation'@en \n",
"126322 NaN \n",
"126323 'middle management'@en \n",
"\n",
" graph;label \n",
"0 'entity'@en \n",
"1 'collection entity'@en \n",
"2 'set'@en \n",
"3 'group'@en \n",
"4 'object'@en \n",
"... ... \n",
"126319 'anthropomorphic deer'@en \n",
"126320 'anthropomorphic cow or other cattle'@en \n",
"126321 'video game occupation'@en \n",
"126322 NaN \n",
"126323 'middle management'@en \n",
"\n",
"[126324 rows x 6 columns]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = kgtk(\"\"\"\n",
" cat -i $TEMP/subclass.count.tsv.gz / add-labels\n",
"\"\"\")\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a subset of p279 that excludes high in-degree classes in node2\n",
"\n",
"File `class-browsing.low-degree-nodes.tsv` has the class with a low number of subclasses, which we call the low degree nodes. Our low degree P279 file will have all P279 edges that arrive at a low degree class, i.e., where the superclass is a low degree class."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:28:57.067028Z",
"iopub.status.busy": "2021-12-29T07:28:57.066692Z",
"iopub.status.idle": "2021-12-29T07:29:12.667171Z",
"shell.execute_reply": "2021-12-29T07:29:12.666615Z",
"shell.execute_reply.started": "2021-12-29T07:28:57.067000Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" query -i p279 -i $OUT/class-browsing.low-degree-nodes.tsv\n",
" --match '\n",
" p279: (class)-[eid]->(superclass),\n",
" low: (superclass)'\n",
" --return 'class as node1, eid.label as label, superclass as node2, eid as id'\n",
" -o $OUT/p279.lowdegree.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:29:12.668213Z",
"iopub.status.busy": "2021-12-29T07:29:12.668090Z",
"iopub.status.idle": "2021-12-29T07:29:17.950168Z",
"shell.execute_reply": "2021-12-29T07:29:17.949332Z",
"shell.execute_reply.started": "2021-12-29T07:29:12.668196Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 3077832\n"
]
}
],
"source": [
"!zcat < \"$p279\" | wc -l"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The low degree P279 file has many fewer edges, which is expected as the high degree classes account for a lot of edges."
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:29:17.951851Z",
"iopub.status.busy": "2021-12-29T07:29:17.951555Z",
"iopub.status.idle": "2021-12-29T07:29:18.205220Z",
"shell.execute_reply": "2021-12-29T07:29:18.204410Z",
"shell.execute_reply.started": "2021-12-29T07:29:17.951822Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 633444\n"
]
}
],
"source": [
"!zcat < $OUT/p279.lowdegree.tsv.gz | wc -l"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Recompute P279star with the low degree classes\n",
"The output will be `derived.p279star.low-degree.complete.tsv.gz`\n",
"\n",
"We start at all classes, and find all superclasses for them, excluding the high degree classes."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:29:18.207450Z",
"iopub.status.busy": "2021-12-29T07:29:18.207201Z",
"iopub.status.idle": "2021-12-29T07:35:39.186557Z",
"shell.execute_reply": "2021-12-29T07:35:39.185809Z",
"shell.execute_reply.started": "2021-12-29T07:29:18.207425Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" reachable-nodes\n",
" --rootfile $OUT/classes.tsv.gz\n",
" --selflink \n",
" --breadth-first True\n",
" --show-distance True\n",
" --label P279star\n",
" -i $OUT/p279.lowdegree.tsv.gz\n",
" -o $TEMP/derived.p279star.low-degree.complete.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Add ids"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:35:39.188259Z",
"iopub.status.busy": "2021-12-29T07:35:39.187991Z",
"iopub.status.idle": "2021-12-29T07:38:20.061963Z",
"shell.execute_reply": "2021-12-29T07:38:20.061099Z",
"shell.execute_reply.started": "2021-12-29T07:35:39.188231Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" add-id --id-style wikidata -i $TEMP/derived.p279star.low-degree.complete.tsv.gz\n",
" -o $OUT/derived.p279star.low-degree.complete.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Index using node1, node2 and distance. I wonder if we should also index the id column?"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:38:20.063686Z",
"iopub.status.busy": "2021-12-29T07:38:20.063392Z",
"iopub.status.idle": "2021-12-29T07:40:26.966537Z",
"shell.execute_reply": "2021-12-29T07:40:26.965978Z",
"shell.execute_reply.started": "2021-12-29T07:38:20.063658Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-12-28 23:38:22 sqlstore]: DROP graph data table graph_11 from p279starlow\n",
"[2021-12-28 23:38:57 sqlstore]: IMPORT graph directly into table graph_30 from /Users/pedroszekely/Downloads/kypher/projects/class-visualization/derived.p279star.low-degree.complete.tsv.gz ...\n",
"[2021-12-28 23:40:01 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT *\n",
" FROM graph_30 AS graph_30_c1\n",
" LIMIT ?\n",
" PARAS: [3]\n",
"---------------------------------------------\n",
"[2021-12-28 23:40:01 sqlstore]: CREATE INDEX \"graph_30_node2_node1_distance_idx\" ON \"graph_30\" (\"node2\", \"node1\", \"distance\")\n",
"[2021-12-28 23:40:24 sqlstore]: ANALYZE \"graph_30_node2_node1_distance_idx\"\n",
"node1\tlabel\tnode2\tdistance\tid\n",
"Q100000030\tP279star\tQ100000030\t0\tQ100000030-P279star-Q100000030\n",
"Q100000030\tP279star\tQ14748\t1\tQ100000030-P279star-Q14748\n",
"Q100000030\tP279star\tQ14745\t2\tQ100000030-P279star-Q14745\n"
]
}
],
"source": [
"!kgtk --debug query -i $OUT/derived.p279star.low-degree.complete.tsv.gz --as p279starlow --idx index:node2,node1,distance --limit 3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Statistics to show in the graph\n",
"\n",
"> We are not computing the statistics file in this notebook as it is computed in the `p1963` project. \n",
"> We need the file here, so Pedro copied it from the `p1963` project and put it in the `$TEMP` folder\n",
"\n",
"File is `statistics.Pinstance_count.tsv.gz`\n"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:40:26.968247Z",
"iopub.status.busy": "2021-12-29T07:40:26.967653Z",
"iopub.status.idle": "2021-12-29T07:40:29.007057Z",
"shell.execute_reply": "2021-12-29T07:40:29.006255Z",
"shell.execute_reply.started": "2021-12-29T07:40:26.968208Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q1000017 | \n",
" Pinstance_count | \n",
" 1 | \n",
" Q1000017-Pinstance_count-6b86b2 | \n",
"
\n",
" \n",
" 1 | \n",
" Q1000091 | \n",
" Pinstance_count | \n",
" 1 | \n",
" Q1000091-Pinstance_count-6b86b2 | \n",
"
\n",
" \n",
" 2 | \n",
" Q1000156 | \n",
" Pinstance_count | \n",
" 11 | \n",
" Q1000156-Pinstance_count-4fc82b | \n",
"
\n",
" \n",
" 3 | \n",
" Q100023 | \n",
" Pinstance_count | \n",
" 1 | \n",
" Q100023-Pinstance_count-6b86b2 | \n",
"
\n",
" \n",
" 4 | \n",
" Q100026 | \n",
" Pinstance_count | \n",
" 1 | \n",
" Q100026-Pinstance_count-6b86b2 | \n",
"
\n",
" \n",
" 5 | \n",
" Q100029091 | \n",
" Pinstance_count | \n",
" 10 | \n",
" Q100029091-Pinstance_count-4a44dc | \n",
"
\n",
" \n",
" 6 | \n",
" Q1000300 | \n",
" Pinstance_count | \n",
" 2 | \n",
" Q1000300-Pinstance_count-d4735e | \n",
"
\n",
" \n",
" 7 | \n",
" Q100034524 | \n",
" Pinstance_count | \n",
" 3 | \n",
" Q100034524-Pinstance_count-4e0740 | \n",
"
\n",
" \n",
" 8 | \n",
" Q1000371 | \n",
" Pinstance_count | \n",
" 3 | \n",
" Q1000371-Pinstance_count-4e0740 | \n",
"
\n",
" \n",
" 9 | \n",
" Q100038174 | \n",
" Pinstance_count | \n",
" 11 | \n",
" Q100038174-Pinstance_count-4fc82b | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label node2 id\n",
"0 Q1000017 Pinstance_count 1 Q1000017-Pinstance_count-6b86b2\n",
"1 Q1000091 Pinstance_count 1 Q1000091-Pinstance_count-6b86b2\n",
"2 Q1000156 Pinstance_count 11 Q1000156-Pinstance_count-4fc82b\n",
"3 Q100023 Pinstance_count 1 Q100023-Pinstance_count-6b86b2\n",
"4 Q100026 Pinstance_count 1 Q100026-Pinstance_count-6b86b2\n",
"5 Q100029091 Pinstance_count 10 Q100029091-Pinstance_count-4a44dc\n",
"6 Q1000300 Pinstance_count 2 Q1000300-Pinstance_count-d4735e\n",
"7 Q100034524 Pinstance_count 3 Q100034524-Pinstance_count-4e0740\n",
"8 Q1000371 Pinstance_count 3 Q1000371-Pinstance_count-4e0740\n",
"9 Q100038174 Pinstance_count 11 Q100038174-Pinstance_count-4fc82b"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -i $TEMP/statistics.Pinstance_count.tsv.gz\")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:40:29.016568Z",
"iopub.status.busy": "2021-12-29T07:40:29.016325Z",
"iopub.status.idle": "2021-12-29T07:40:31.099738Z",
"shell.execute_reply": "2021-12-29T07:40:31.098475Z",
"shell.execute_reply.started": "2021-12-29T07:40:29.016542Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-12-28 23:40:30 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT *\n",
" FROM graph_12 AS graph_12_c1\n",
" LIMIT ?\n",
" PARAS: [5]\n",
"---------------------------------------------\n",
"node1\tlabel\tnode2\tid\n",
"Q1000017\tPinstance_count\t1\tQ1000017-Pinstance_count-6b86b2\n",
"Q1000091\tPinstance_count\t1\tQ1000091-Pinstance_count-6b86b2\n",
"Q1000156\tPinstance_count\t11\tQ1000156-Pinstance_count-4fc82b\n",
"Q100023\tPinstance_count\t1\tQ100023-Pinstance_count-6b86b2\n",
"Q100026\tPinstance_count\t1\tQ100026-Pinstance_count-6b86b2\n"
]
}
],
"source": [
"!kgtk --debug query -i $TEMP/statistics.Pinstance_count.tsv.gz --idx mode:monograph --limit 5"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compute the edge file that contains the graph we want to visualize for each class\n",
"\n",
"The edge file contains `subclass / P279 / class` edges, but we add two columns to support the visualization:\n",
"\n",
"- `graph:` is the id of a class we want to visualize. This columns allows us to quickly fetch all the edges to build the visualization of a class.\n",
"- `edge_type`: in the visualization we want to distinguish `subclass` and `superclass` edges so the viewer can easily distinguish subclasses and superclasses."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Compute the subclass edges\n",
"\n",
"For every class (the graph) we want to find all the P279 edges for subclasses of the given class. We use `class-browsing.low-degree-nodes.tsv` so that we don't include high degree classes that will blow up the browser."
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:40:31.101895Z",
"iopub.status.busy": "2021-12-29T07:40:31.101538Z",
"iopub.status.idle": "2021-12-29T07:43:33.472188Z",
"shell.execute_reply": "2021-12-29T07:43:33.470811Z",
"shell.execute_reply.started": "2021-12-29T07:40:31.101865Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(f\"\"\"\n",
" query -i p279starlow -i p279 -i $OUT/class-browsing.low-degree-nodes.tsv\n",
" --match '\n",
" p279starlow: (subclass1)-[]->(class),\n",
" p279starlow: (subclass2)-[]->(class),\n",
" low: (subclass1),\n",
" low: (subclass2),\n",
" p279: (subclass1)-[]->(subclass2)'\n",
" --return 'distinct subclass1 as node1, \"P279\" as label, subclass2 as node2, class as graph, \"subclass\" as edge_type'\n",
" -o $TEMP/all.graph.low.sub.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:43:33.474060Z",
"iopub.status.busy": "2021-12-29T07:43:33.473698Z",
"iopub.status.idle": "2021-12-29T07:43:34.561343Z",
"shell.execute_reply": "2021-12-29T07:43:34.560217Z",
"shell.execute_reply.started": "2021-12-29T07:43:33.474028Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 18213555\n"
]
}
],
"source": [
"!zcat < $TEMP/all.graph.low.sub.tsv.gz | wc -l"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We have a lot of edges because we make copies for every graph, i.e., the same edge appears in many graphs. This is annoying, but it allows us to fetch the graphs very quickly, in less than 2 seconds."
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:43:34.563594Z",
"iopub.status.busy": "2021-12-29T07:43:34.563232Z",
"iopub.status.idle": "2021-12-29T07:43:37.249049Z",
"shell.execute_reply": "2021-12-29T07:43:37.248128Z",
"shell.execute_reply.started": "2021-12-29T07:43:34.563564Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" graph | \n",
" edge_type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q100000030 | \n",
" P279 | \n",
" Q14748 | \n",
" Q14748 | \n",
" subclass | \n",
"
\n",
" \n",
" 1 | \n",
" Q100000030 | \n",
" P279 | \n",
" Q14748 | \n",
" Q14745 | \n",
" subclass | \n",
"
\n",
" \n",
" 2 | \n",
" Q100000030 | \n",
" P279 | \n",
" Q14748 | \n",
" Q1357761 | \n",
" subclass | \n",
"
\n",
" \n",
" 3 | \n",
" Q100000030 | \n",
" P279 | \n",
" Q14748 | \n",
" Q2424752 | \n",
" subclass | \n",
"
\n",
" \n",
" 4 | \n",
" Q100000030 | \n",
" P279 | \n",
" Q14748 | \n",
" Q31807746 | \n",
" subclass | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label node2 graph edge_type\n",
"0 Q100000030 P279 Q14748 Q14748 subclass\n",
"1 Q100000030 P279 Q14748 Q14745 subclass\n",
"2 Q100000030 P279 Q14748 Q1357761 subclass\n",
"3 Q100000030 P279 Q14748 Q2424752 subclass\n",
"4 Q100000030 P279 Q14748 Q31807746 subclass"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -n 5 -i $TEMP/all.graph.low.sub.tsv.gz\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Compute the superclass edges\n",
"\n",
"The superclass edges are also P279 edges, but they sit above the given class. We don't need to filter to low degree classes because we are going up the P279 hierarchy."
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:43:37.250715Z",
"iopub.status.busy": "2021-12-29T07:43:37.250451Z",
"iopub.status.idle": "2021-12-29T07:58:57.345943Z",
"shell.execute_reply": "2021-12-29T07:58:57.344236Z",
"shell.execute_reply.started": "2021-12-29T07:43:37.250685Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(f\"\"\"\n",
" query -i p279stard -i p279\n",
" --match '\n",
" p279stard: (class)-[]->(superclass1),\n",
" p279stard: (class)-[]->(superclass2),\n",
" p279: (superclass1)-[]->(superclass2)'\n",
" --return 'distinct superclass1 as node1, \"P279\" as label, superclass2 as node2, class as graph, \"superclass\" as edge_type'\n",
" -o $TEMP/all.graph.low.super.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:58:57.348250Z",
"iopub.status.busy": "2021-12-29T07:58:57.347964Z",
"iopub.status.idle": "2021-12-29T07:59:01.873930Z",
"shell.execute_reply": "2021-12-29T07:59:01.872748Z",
"shell.execute_reply.started": "2021-12-29T07:58:57.348216Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 121028861\n"
]
}
],
"source": [
"!zcat < $TEMP/all.graph.low.super.tsv.gz | wc -l"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:59:01.875811Z",
"iopub.status.busy": "2021-12-29T07:59:01.875569Z",
"iopub.status.idle": "2021-12-29T07:59:05.117542Z",
"shell.execute_reply": "2021-12-29T07:59:05.116752Z",
"shell.execute_reply.started": "2021-12-29T07:59:01.875782Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" graph | \n",
" edge_type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q95079834 | \n",
" P279 | \n",
" Q1000068 | \n",
" Q95079834 | \n",
" superclass | \n",
"
\n",
" \n",
" 1 | \n",
" Q17372279 | \n",
" P279 | \n",
" Q100026 | \n",
" Q17372279 | \n",
" superclass | \n",
"
\n",
" \n",
" 2 | \n",
" Q17372377 | \n",
" P279 | \n",
" Q100026 | \n",
" Q17372377 | \n",
" superclass | \n",
"
\n",
" \n",
" 3 | \n",
" Q17372377 | \n",
" P279 | \n",
" Q100026 | \n",
" Q17372463 | \n",
" superclass | \n",
"
\n",
" \n",
" 4 | \n",
" Q17372377 | \n",
" P279 | \n",
" Q100026 | \n",
" Q17372473 | \n",
" superclass | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label node2 graph edge_type\n",
"0 Q95079834 P279 Q1000068 Q95079834 superclass\n",
"1 Q17372279 P279 Q100026 Q17372279 superclass\n",
"2 Q17372377 P279 Q100026 Q17372377 superclass\n",
"3 Q17372377 P279 Q100026 Q17372463 superclass\n",
"4 Q17372377 P279 Q100026 Q17372473 superclass"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -n 5 -i $TEMP/all.graph.low.super.tsv.gz\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Concatenate the subclass and superclass files, and store in `$TEMP/graph.low.tsv.gz`\n",
"\n",
"We keep the file in `$TEMP` because for the final file we want to add he high degree nodes so that the user sees that they exist (we will not add the subclasses). Once we have the complete file, we will put it in `$OUT`."
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T07:59:05.119086Z",
"iopub.status.busy": "2021-12-29T07:59:05.118834Z",
"iopub.status.idle": "2021-12-29T08:04:23.557081Z",
"shell.execute_reply": "2021-12-29T08:04:23.555928Z",
"shell.execute_reply.started": "2021-12-29T07:59:05.119056Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(f\"\"\"\n",
" cat --use-graph-cache-envar False -i $TEMP/all.graph.low.sub.tsv.gz -i $TEMP/all.graph.low.super.tsv.gz\n",
" -o $TEMP/graph.low.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Index the file to allow fast queries on all columns"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T08:04:23.558788Z",
"iopub.status.busy": "2021-12-29T08:04:23.558504Z",
"iopub.status.idle": "2021-12-29T08:18:26.810653Z",
"shell.execute_reply": "2021-12-29T08:18:26.810026Z",
"shell.execute_reply.started": "2021-12-29T08:04:23.558760Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-12-29 00:04:25 sqlstore]: DROP graph data table graph_15 from graphbrowser\n",
"[2021-12-29 00:08:16 sqlstore]: IMPORT graph directly into table graph_31 from /Users/pedroszekely/Downloads/kypher/projects/class-visualization/temp.class-visualization/graph.low.tsv.gz ...\n",
"[2021-12-29 00:15:23 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT *\n",
" FROM graph_31 AS graph_31_c1\n",
" LIMIT ?\n",
" PARAS: [3]\n",
"---------------------------------------------\n",
"[2021-12-29 00:15:23 sqlstore]: CREATE INDEX \"graph_31_node1_node2_graph_edge_type_idx\" ON \"graph_31\" (\"node1\", \"node2\", \"graph\", \"edge_type\")\n",
"[2021-12-29 00:17:54 sqlstore]: ANALYZE \"graph_31_node1_node2_graph_edge_type_idx\"\n",
"node1\tlabel\tnode2\tgraph\tedge_type\n",
"Q100000030\tP279\tQ14748\tQ14748\tsubclass\n",
"Q100000030\tP279\tQ14748\tQ14745\tsubclass\n",
"Q100000030\tP279\tQ14748\tQ1357761\tsubclass\n"
]
}
],
"source": [
"!kgtk --debug query -i $TEMP/graph.low.tsv.gz --as graphbrowser --idx index:node1,node2,graph,edge_type --limit 3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compute the node file for visualization\n",
"\n",
"The node file for visualization needs the labels for the nodes, and the `graph` to pull it out quickly. We add:\n",
"\n",
"- `instance_count`: the number of direct instances of the class, as it is interesting for the user to see this information."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Extract the nodes from the edge file\n",
"\n",
"The reason to use the edge file is that we need the `graph` id. We do it in two steps, first extract `node1` and then extract `node2`"
]
},
{
"cell_type": "code",
"execution_count": 194,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T03:20:12.851665Z",
"iopub.status.busy": "2021-12-31T03:20:12.851393Z",
"iopub.status.idle": "2021-12-31T03:34:51.505030Z",
"shell.execute_reply": "2021-12-31T03:34:51.504124Z",
"shell.execute_reply.started": "2021-12-31T03:20:12.851635Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" query -i label -i $TEMP/statistics.Pinstance_count.tsv.gz -i graphbrowser -i browsernodes\n",
" --match '\n",
" graphbrowser: (c)-[{graph: graph}]->(),\n",
" browsernodes: (c)-[{node_type: nt}]->()'\n",
" --opt 'label: (c)-[]->(class_label)'\n",
" --opt 'Pinstance_count: (c)-[:Pinstance_count]->(instance_count)'\n",
" --return 'distinct c as node1, graph as graph, coalesce(instance_count,0) as instance_count, nt as node_type, class_label as label'\n",
" -o $TEMP/graph.low.node1.tsv.gz\n",
"\"\"\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is what our node file looks like:"
]
},
{
"cell_type": "code",
"execution_count": 195,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T03:34:51.507287Z",
"iopub.status.busy": "2021-12-31T03:34:51.507028Z",
"iopub.status.idle": "2021-12-31T03:34:54.628575Z",
"shell.execute_reply": "2021-12-31T03:34:54.627739Z",
"shell.execute_reply.started": "2021-12-31T03:34:51.507259Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" graph | \n",
" instance_count | \n",
" node_type | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q898273 | \n",
" Q103839965 | \n",
" 11047 | \n",
" few_subclasses | \n",
" 'protein domain'@en | \n",
"
\n",
" \n",
" 1 | \n",
" Q898273 | \n",
" Q103839987 | \n",
" 11047 | \n",
" few_subclasses | \n",
" 'protein domain'@en | \n",
"
\n",
" \n",
" 2 | \n",
" Q898273 | \n",
" Q103840002 | \n",
" 11047 | \n",
" few_subclasses | \n",
" 'protein domain'@en | \n",
"
\n",
" \n",
" 3 | \n",
" Q898273 | \n",
" Q103840059 | \n",
" 11047 | \n",
" few_subclasses | \n",
" 'protein domain'@en | \n",
"
\n",
" \n",
" 4 | \n",
" Q898273 | \n",
" Q103840066 | \n",
" 11047 | \n",
" few_subclasses | \n",
" 'protein domain'@en | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 graph instance_count node_type label\n",
"0 Q898273 Q103839965 11047 few_subclasses 'protein domain'@en\n",
"1 Q898273 Q103839987 11047 few_subclasses 'protein domain'@en\n",
"2 Q898273 Q103840002 11047 few_subclasses 'protein domain'@en\n",
"3 Q898273 Q103840059 11047 few_subclasses 'protein domain'@en\n",
"4 Q898273 Q103840066 11047 few_subclasses 'protein domain'@en"
]
},
"execution_count": 195,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -n 5 -i $TEMP/graph.low.node1.tsv.gz\")"
]
},
{
"cell_type": "code",
"execution_count": 196,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T03:34:54.630103Z",
"iopub.status.busy": "2021-12-31T03:34:54.629857Z",
"iopub.status.idle": "2021-12-31T03:53:45.913263Z",
"shell.execute_reply": "2021-12-31T03:53:45.912358Z",
"shell.execute_reply.started": "2021-12-31T03:34:54.630077Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" query -i label -i $TEMP/statistics.Pinstance_count.tsv.gz -i graphbrowser -i browsernodes\n",
" --match '\n",
" graphbrowser: ()-[{graph: graph}]->(c),\n",
" browsernodes: (c)-[{node_type: nt}]->()'\n",
" --opt 'label: (c)-[]->(class_label)'\n",
" --opt 'Pinstance_count: (c)-[:Pinstance_count]->(instance_count)'\n",
" --return 'distinct c as node1, graph as graph, coalesce(instance_count,0) as instance_count, nt as node_type, class_label as label'\n",
" -o $TEMP/graph.low.node2.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Concatenate the two node files, deduplicate and index\n",
"\n",
"To-do: try presorting the files to see whether `compact` runs faster; as it is, this command takes over 2.5 hours."
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T03:53:45.916913Z",
"iopub.status.busy": "2021-12-31T03:53:45.916591Z",
"iopub.status.idle": "2021-12-31T06:55:08.988812Z",
"shell.execute_reply": "2021-12-31T06:55:08.971399Z",
"shell.execute_reply.started": "2021-12-31T03:53:45.916855Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" cat --use-graph-cache-envar False --mode NONE -i $TEMP/graph.low.node1.tsv.gz -i $TEMP/graph.low.node2.tsv.gz\n",
" / compact --mode NONE --columns node1 graph\n",
" -o $TEMP/graph.low.node.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We only need to index on `graph` as we will not do node queries on it:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Special handling of high degree nodes"
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T06:55:09.091806Z",
"iopub.status.busy": "2021-12-31T06:55:09.091041Z",
"iopub.status.idle": "2021-12-31T06:55:13.898708Z",
"shell.execute_reply": "2021-12-31T06:55:13.897855Z",
"shell.execute_reply.started": "2021-12-31T06:55:09.091769Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q20747295 | \n",
"
\n",
" \n",
" 1 | \n",
" Q8054 | \n",
"
\n",
" \n",
" 2 | \n",
" Q7187 | \n",
"
\n",
" \n",
" 3 | \n",
" Q277338 | \n",
"
\n",
" \n",
" 4 | \n",
" Q427087 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1\n",
"0 Q20747295\n",
"1 Q8054\n",
"2 Q7187\n",
"3 Q277338\n",
"4 Q427087"
]
},
"execution_count": 198,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -n 5 -i $OUT/class-browsing.high-degree-nodes.tsv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Make a graph file with the `P279` edges where the subclass is a high degree class\n",
"\n",
"Do this only to add edges that connect to the subclasses of our target node, so `class` has to be in `$TEMP/all.graph.low.sub.tsv.gz`"
]
},
{
"cell_type": "code",
"execution_count": 199,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T06:55:13.904805Z",
"iopub.status.busy": "2021-12-31T06:55:13.904473Z",
"iopub.status.idle": "2021-12-31T06:55:16.567164Z",
"shell.execute_reply": "2021-12-31T06:55:16.565789Z",
"shell.execute_reply.started": "2021-12-31T06:55:13.904785Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-12-30 22:55:16 sqlstore]: DROP graph data table graph_33 from /Users/pedroszekely/Downloads/kypher/projects/class-visualization/class-browsing.high-degree-nodes.tsv\n",
"[2021-12-30 22:55:16 sqlstore]: IMPORT graph directly into table graph_33 from /Users/pedroszekely/Downloads/kypher/projects/class-visualization/class-browsing.high-degree-nodes.tsv ...\n",
"[2021-12-30 22:55:16 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT DISTINCT graph_33_c2.\"node1\" \"_aLias.node1\", ? \"_aLias.label\", graph_40_c1.\"node1\" \"_aLias.node2\", graph_40_c1.\"graph\" \"_aLias.graph\", ? \"_aLias.edge_type\"\n",
" FROM graph_1 AS graph_1_c3\n",
" INNER JOIN graph_33 AS graph_33_c2, graph_40 AS graph_40_c1\n",
" ON graph_33_c2.\"node1\" = graph_1_c3.\"node1\"\n",
" AND graph_40_c1.\"node1\" = graph_1_c3.\"node2\"\n",
" AND graph_40_c1.\"graph\" = graph_40_c1.\"graph\"\n",
" AND (graph_33_c2.\"node1\" != graph_40_c1.\"node1\")\n",
" PARAS: ['P279', 'subclass']\n",
"---------------------------------------------\n",
"[2021-12-30 22:55:16 sqlstore]: CREATE INDEX \"graph_33_node1_idx\" ON \"graph_33\" (\"node1\")\n",
"[2021-12-30 22:55:16 sqlstore]: ANALYZE \"graph_33_node1_idx\"\n",
"\n"
]
}
],
"source": [
"kgtk(\"\"\"\n",
" query --debug -i $OUT/class-browsing.high-degree-nodes.tsv -i p279 -i $TEMP/all.graph.low.sub.tsv.gz\n",
" --match '\n",
" low: (class)-[{graph: graph}]->(),\n",
" high: (subclass),\n",
" p279: (subclass)-[]->(class)'\n",
" --where 'subclass != class'\n",
" --return 'distinct subclass as node1, \"P279\" as label, class as node2, graph as graph, \"subclass\" as edge_type'\n",
" -o $TEMP/graph.high.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 200,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T06:55:16.570428Z",
"iopub.status.busy": "2021-12-31T06:55:16.570225Z",
"iopub.status.idle": "2021-12-31T06:55:19.060840Z",
"shell.execute_reply": "2021-12-31T06:55:19.059658Z",
"shell.execute_reply.started": "2021-12-31T06:55:16.570407Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" graph | \n",
" edge_type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q10267817 | \n",
" P279 | \n",
" Q18553442 | \n",
" Q1225194 | \n",
" subclass | \n",
"
\n",
" \n",
" 1 | \n",
" Q107715 | \n",
" P279 | \n",
" Q309314 | \n",
" Q246672 | \n",
" subclass | \n",
"
\n",
" \n",
" 2 | \n",
" Q107715 | \n",
" P279 | \n",
" Q309314 | \n",
" Q937228 | \n",
" subclass | \n",
"
\n",
" \n",
" 3 | \n",
" Q107715 | \n",
" P279 | \n",
" Q309314 | \n",
" Q7184903 | \n",
" subclass | \n",
"
\n",
" \n",
" 4 | \n",
" Q107715 | \n",
" P279 | \n",
" Q309314 | \n",
" Q35120 | \n",
" subclass | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label node2 graph edge_type\n",
"0 Q10267817 P279 Q18553442 Q1225194 subclass\n",
"1 Q107715 P279 Q309314 Q246672 subclass\n",
"2 Q107715 P279 Q309314 Q937228 subclass\n",
"3 Q107715 P279 Q309314 Q7184903 subclass\n",
"4 Q107715 P279 Q309314 Q35120 subclass"
]
},
"execution_count": 200,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -n 5 -i $TEMP/graph.high.tsv.gz\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Make a node file with the high degree nodes\n",
"\n",
"We use the edge file because we need to put the `graph` in the node file too."
]
},
{
"cell_type": "code",
"execution_count": 201,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T06:55:19.063302Z",
"iopub.status.busy": "2021-12-31T06:55:19.063050Z",
"iopub.status.idle": "2021-12-31T06:55:21.837516Z",
"shell.execute_reply": "2021-12-31T06:55:21.836580Z",
"shell.execute_reply.started": "2021-12-31T06:55:19.063273Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" query -i label -i $TEMP/statistics.Pinstance_count.tsv.gz -i $TEMP/graph.high.tsv.gz\n",
" --match 'high: (c)-[{graph: graph}]->()'\n",
" --opt 'label: (c)-[]->(class_label)'\n",
" --opt 'Pinstance_count: (c)-[:Pinstance_count]->(instance_count)'\n",
" --return 'distinct c as node1, graph as graph, coalesce(instance_count,0) as instance_count, \"many_subclasses\" as node_type, class_label as label'\n",
" -o $TEMP/graph.high.node.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 202,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T06:55:21.841239Z",
"iopub.status.busy": "2021-12-31T06:55:21.840955Z",
"iopub.status.idle": "2021-12-31T06:55:24.497776Z",
"shell.execute_reply": "2021-12-31T06:55:24.497058Z",
"shell.execute_reply.started": "2021-12-31T06:55:21.841210Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" graph | \n",
" instance_count | \n",
" node_type | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q10267817 | \n",
" Q1225194 | \n",
" 1 | \n",
" many_subclasses | \n",
" 'autosomal recessive disease'@en | \n",
"
\n",
" \n",
" 1 | \n",
" Q107715 | \n",
" Q246672 | \n",
" 93 | \n",
" many_subclasses | \n",
" 'physical quantity'@en | \n",
"
\n",
" \n",
" 2 | \n",
" Q107715 | \n",
" Q937228 | \n",
" 93 | \n",
" many_subclasses | \n",
" 'physical quantity'@en | \n",
"
\n",
" \n",
" 3 | \n",
" Q107715 | \n",
" Q7184903 | \n",
" 93 | \n",
" many_subclasses | \n",
" 'physical quantity'@en | \n",
"
\n",
" \n",
" 4 | \n",
" Q107715 | \n",
" Q35120 | \n",
" 93 | \n",
" many_subclasses | \n",
" 'physical quantity'@en | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 graph instance_count node_type \\\n",
"0 Q10267817 Q1225194 1 many_subclasses \n",
"1 Q107715 Q246672 93 many_subclasses \n",
"2 Q107715 Q937228 93 many_subclasses \n",
"3 Q107715 Q7184903 93 many_subclasses \n",
"4 Q107715 Q35120 93 many_subclasses \n",
"\n",
" label \n",
"0 'autosomal recessive disease'@en \n",
"1 'physical quantity'@en \n",
"2 'physical quantity'@en \n",
"3 'physical quantity'@en \n",
"4 'physical quantity'@en "
]
},
"execution_count": 202,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -n 5 -i $TEMP/graph.high.node.tsv.gz\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Just to make sure, count the number of subclasses of one of our supposedly high degree nodes: it looks innocent with only one instance, but it does indeed have many subclasses."
]
},
{
"cell_type": "code",
"execution_count": 203,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T06:55:24.517353Z",
"iopub.status.busy": "2021-12-31T06:55:24.517102Z",
"iopub.status.idle": "2021-12-31T06:55:27.081133Z",
"shell.execute_reply": "2021-12-31T06:55:27.080295Z",
"shell.execute_reply.started": "2021-12-31T06:55:24.517329Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count(DISTINCT graph_1_c1.\"node1\") | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1097 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count(DISTINCT graph_1_c1.\"node1\")\n",
"0 1097"
]
},
"execution_count": 203,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"query -i p279 --match '(subclass)-[]->(:Q10267817)' --return 'count(distinct subclass)'\")"
]
},
{
"cell_type": "code",
"execution_count": 204,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T06:55:27.083260Z",
"iopub.status.busy": "2021-12-31T06:55:27.082938Z",
"iopub.status.idle": "2021-12-31T06:55:29.543565Z",
"shell.execute_reply": "2021-12-31T06:55:29.542954Z",
"shell.execute_reply.started": "2021-12-31T06:55:27.083231Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count(DISTINCT graph_1_c1.\"node1\") | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2350 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count(DISTINCT graph_1_c1.\"node1\")\n",
"0 2350"
]
},
"execution_count": 204,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"query -i p279 --match '(subclass)-[]->(:Q30185)' --return 'count(distinct subclass)'\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Augment the low degree edge and node files with the high degree info"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenating without deduplication is sufficient as the files cannot have duplicate edges or nodes."
]
},
{
"cell_type": "code",
"execution_count": 225,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T16:25:04.014190Z",
"iopub.status.busy": "2021-12-31T16:25:04.013972Z",
"iopub.status.idle": "2021-12-31T16:30:22.602854Z",
"shell.execute_reply": "2021-12-31T16:30:22.601637Z",
"shell.execute_reply.started": "2021-12-31T16:25:04.014167Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" cat --use-graph-cache-envar False -i $TEMP/graph.high.tsv.gz -i $TEMP/graph.low.tsv.gz\n",
" -o $OUT/class-visualization.edge.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 206,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T07:06:41.114279Z",
"iopub.status.busy": "2021-12-31T07:06:41.113987Z",
"iopub.status.idle": "2021-12-31T07:06:43.937936Z",
"shell.execute_reply": "2021-12-31T07:06:43.937152Z",
"shell.execute_reply.started": "2021-12-31T07:06:41.114250Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" graph | \n",
" edge_type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q10267817 | \n",
" P279 | \n",
" Q18553442 | \n",
" Q1225194 | \n",
" subclass | \n",
"
\n",
" \n",
" 1 | \n",
" Q107715 | \n",
" P279 | \n",
" Q309314 | \n",
" Q246672 | \n",
" subclass | \n",
"
\n",
" \n",
" 2 | \n",
" Q107715 | \n",
" P279 | \n",
" Q309314 | \n",
" Q937228 | \n",
" subclass | \n",
"
\n",
" \n",
" 3 | \n",
" Q107715 | \n",
" P279 | \n",
" Q309314 | \n",
" Q7184903 | \n",
" subclass | \n",
"
\n",
" \n",
" 4 | \n",
" Q107715 | \n",
" P279 | \n",
" Q309314 | \n",
" Q35120 | \n",
" subclass | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label node2 graph edge_type\n",
"0 Q10267817 P279 Q18553442 Q1225194 subclass\n",
"1 Q107715 P279 Q309314 Q246672 subclass\n",
"2 Q107715 P279 Q309314 Q937228 subclass\n",
"3 Q107715 P279 Q309314 Q7184903 subclass\n",
"4 Q107715 P279 Q309314 Q35120 subclass"
]
},
"execution_count": 206,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -n 5 -i $OUT/class-visualization.edge.tsv.gz\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Index the file for query using the `graph` column:"
]
},
{
"cell_type": "code",
"execution_count": 207,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T07:06:43.939700Z",
"iopub.status.busy": "2021-12-31T07:06:43.939509Z",
"iopub.status.idle": "2021-12-31T07:21:29.714900Z",
"shell.execute_reply": "2021-12-31T07:21:29.713832Z",
"shell.execute_reply.started": "2021-12-31T07:06:43.939676Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"node1\tlabel\tnode2\tgraph\tedge_type\n",
"Q10267817\tP279\tQ18553442\tQ1225194\tsubclass\n",
"Q107715\tP279\tQ309314\tQ246672\tsubclass\n",
"Q107715\tP279\tQ309314\tQ937228\tsubclass\n"
]
}
],
"source": [
"!kgtk query -i $OUT/class-visualization.edge.tsv.gz --as classvizedge --idx index:graph --limit 3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenate the node files:"
]
},
{
"cell_type": "code",
"execution_count": 226,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T16:30:22.605793Z",
"iopub.status.busy": "2021-12-31T16:30:22.605511Z",
"iopub.status.idle": "2021-12-31T16:33:45.325717Z",
"shell.execute_reply": "2021-12-31T16:33:45.324843Z",
"shell.execute_reply.started": "2021-12-31T16:30:22.605761Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" cat --use-graph-cache-envar False --mode NONE -i $TEMP/graph.high.node.tsv.gz -i $TEMP/graph.low.node.tsv.gz\n",
" -o $TEMP/class-visualization.node.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Add a tooltip with meaningful information"
]
},
{
"cell_type": "code",
"execution_count": 228,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T16:46:30.875509Z",
"iopub.status.busy": "2021-12-31T16:46:30.875231Z",
"iopub.status.idle": "2021-12-31T17:03:52.239600Z",
"shell.execute_reply": "2021-12-31T17:03:52.236869Z",
"shell.execute_reply.started": "2021-12-31T16:46:30.875480Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" query -i $TEMP/class-visualization.node.tsv.gz\n",
" --match '(node)-[{graph: g, instance_count: ic, node_type: nt, label: l}]->()'\n",
" --return 'distinct\n",
" node as node1, g as graph, ic as instance_count, nt as node_type, l as label,\n",
" printf(\"%s (%s)
instance count: %s
node type: %s\", kgtk_lqstring_text(l), node, cast(ic, int), nt) as tooltip'\n",
" -o $OUT/class-visualization.node.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 229,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T17:03:52.246880Z",
"iopub.status.busy": "2021-12-31T17:03:52.246529Z",
"iopub.status.idle": "2021-12-31T17:03:55.111637Z",
"shell.execute_reply": "2021-12-31T17:03:55.110862Z",
"shell.execute_reply.started": "2021-12-31T17:03:52.246849Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" graph | \n",
" instance_count | \n",
" node_type | \n",
" label | \n",
" tooltip | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q10267817 | \n",
" Q1225194 | \n",
" 1 | \n",
" many_subclasses | \n",
" 'autosomal recessive disease'@en | \n",
" autosomal recessive disease (Q10267817)<BR/>in... | \n",
"
\n",
" \n",
" 1 | \n",
" Q107715 | \n",
" Q246672 | \n",
" 93 | \n",
" many_subclasses | \n",
" 'physical quantity'@en | \n",
" physical quantity (Q107715)<BR/>instance count... | \n",
"
\n",
" \n",
" 2 | \n",
" Q107715 | \n",
" Q937228 | \n",
" 93 | \n",
" many_subclasses | \n",
" 'physical quantity'@en | \n",
" physical quantity (Q107715)<BR/>instance count... | \n",
"
\n",
" \n",
" 3 | \n",
" Q107715 | \n",
" Q7184903 | \n",
" 93 | \n",
" many_subclasses | \n",
" 'physical quantity'@en | \n",
" physical quantity (Q107715)<BR/>instance count... | \n",
"
\n",
" \n",
" 4 | \n",
" Q107715 | \n",
" Q35120 | \n",
" 93 | \n",
" many_subclasses | \n",
" 'physical quantity'@en | \n",
" physical quantity (Q107715)<BR/>instance count... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 graph instance_count node_type \\\n",
"0 Q10267817 Q1225194 1 many_subclasses \n",
"1 Q107715 Q246672 93 many_subclasses \n",
"2 Q107715 Q937228 93 many_subclasses \n",
"3 Q107715 Q7184903 93 many_subclasses \n",
"4 Q107715 Q35120 93 many_subclasses \n",
"\n",
" label \\\n",
"0 'autosomal recessive disease'@en \n",
"1 'physical quantity'@en \n",
"2 'physical quantity'@en \n",
"3 'physical quantity'@en \n",
"4 'physical quantity'@en \n",
"\n",
" tooltip \n",
"0 autosomal recessive disease (Q10267817)
in... \n",
"1 physical quantity (Q107715)
instance count... \n",
"2 physical quantity (Q107715)
instance count... \n",
"3 physical quantity (Q107715)
instance count... \n",
"4 physical quantity (Q107715)
instance count... "
]
},
"execution_count": 229,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -n 5 -i $OUT/class-visualization.node.tsv.gz\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Index the file for query using the `graph` column:"
]
},
{
"cell_type": "code",
"execution_count": 230,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T17:03:55.113296Z",
"iopub.status.busy": "2021-12-31T17:03:55.113107Z",
"iopub.status.idle": "2021-12-31T17:31:20.264932Z",
"shell.execute_reply": "2021-12-31T17:31:20.264132Z",
"shell.execute_reply.started": "2021-12-31T17:03:55.113277Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"node1\tgraph\tinstance_count\tnode_type\tlabel\ttooltip\n",
"Q10267817\tQ1225194\t1\tmany_subclasses\t'autosomal recessive disease'@en\tautosomal recessive disease (Q10267817)
instance count: 1
node type: many_subclasses\n",
"Q107715\tQ246672\t93\tmany_subclasses\t'physical quantity'@en\tphysical quantity (Q107715)
instance count: 93
node type: many_subclasses\n",
"Q107715\tQ937228\t93\tmany_subclasses\t'physical quantity'@en\tphysical quantity (Q107715)
instance count: 93
node type: many_subclasses\n"
]
}
],
"source": [
"!kgtk query -i $OUT/class-visualization.node.tsv.gz --as classviznode --idx index:graph --limit 3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Temporary: we need this file for my current version of visualize because it needs labels in the edge file; the new version can have the labels in the node file."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Test creation of the node file:"
]
},
{
"cell_type": "code",
"execution_count": 236,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T18:05:24.418050Z",
"iopub.status.busy": "2021-12-31T18:05:24.417795Z",
"iopub.status.idle": "2021-12-31T18:05:26.475411Z",
"shell.execute_reply": "2021-12-31T18:05:26.474730Z",
"shell.execute_reply.started": "2021-12-31T18:05:24.418026Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" graph | \n",
" instance_count | \n",
" node_type | \n",
" label | \n",
" tooltip | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q11019 | \n",
" Q1420 | \n",
" 99 | \n",
" few_subclasses | \n",
" 'machine'@en | \n",
" machine (Q11019)<BR/>instance count: 99<BR/>no... | \n",
"
\n",
" \n",
" 1 | \n",
" Q1183543 | \n",
" Q1420 | \n",
" 198 | \n",
" many_subclasses | \n",
" 'device'@en | \n",
" device (Q1183543)<BR/>instance count: 198<BR/>... | \n",
"
\n",
" \n",
" 2 | \n",
" Q1301433 | \n",
" Q1420 | \n",
" 17 | \n",
" few_subclasses | \n",
" 'land vehicle'@en | \n",
" land vehicle (Q1301433)<BR/>instance count: 17... | \n",
"
\n",
" \n",
" 3 | \n",
" Q1420 | \n",
" Q1420 | \n",
" 862 | \n",
" many_subclasses | \n",
" 'motor car'@en | \n",
" motor car (Q1420)<BR/>instance count: 862<BR/>... | \n",
"
\n",
" \n",
" 4 | \n",
" Q15401930 | \n",
" Q1420 | \n",
" 12 | \n",
" few_subclasses | \n",
" 'product'@en | \n",
" product (Q15401930)<BR/>instance count: 12<BR/... | \n",
"
\n",
" \n",
" 5 | \n",
" Q15618781 | \n",
" Q1420 | \n",
" 29 | \n",
" few_subclasses | \n",
" 'wheeled vehicle'@en | \n",
" wheeled vehicle (Q15618781)<BR/>instance count... | \n",
"
\n",
" \n",
" 6 | \n",
" Q16686448 | \n",
" Q1420 | \n",
" 24 | \n",
" few_subclasses | \n",
" 'artificial entity'@en | \n",
" artificial entity (Q16686448)<BR/>instance cou... | \n",
"
\n",
" \n",
" 7 | \n",
" Q16798631 | \n",
" Q1420 | \n",
" 389 | \n",
" few_subclasses | \n",
" 'equipment'@en | \n",
" equipment (Q16798631)<BR/>instance count: 389<... | \n",
"
\n",
" \n",
" 8 | \n",
" Q223557 | \n",
" Q1420 | \n",
" 110 | \n",
" few_subclasses | \n",
" 'physical object'@en | \n",
" physical object (Q223557)<BR/>instance count: ... | \n",
"
\n",
" \n",
" 9 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" 412 | \n",
" few_subclasses | \n",
" 'product'@en | \n",
" product (Q2424752)<BR/>instance count: 412<BR/... | \n",
"
\n",
" \n",
" 10 | \n",
" Q28877 | \n",
" Q1420 | \n",
" 11 | \n",
" few_subclasses | \n",
" 'goods'@en | \n",
" goods (Q28877)<BR/>instance count: 11<BR/>node... | \n",
"
\n",
" \n",
" 11 | \n",
" Q3245975 | \n",
" Q1420 | \n",
" 1 | \n",
" few_subclasses | \n",
" 'finished good'@en | \n",
" finished good (Q3245975)<BR/>instance count: 1... | \n",
"
\n",
" \n",
" 12 | \n",
" Q337060 | \n",
" Q1420 | \n",
" 38 | \n",
" few_subclasses | \n",
" 'perceptible object'@en | \n",
" perceptible object (Q337060)<BR/>instance coun... | \n",
"
\n",
" \n",
" 13 | \n",
" Q35120 | \n",
" Q1420 | \n",
" 34 | \n",
" few_subclasses | \n",
" 'entity'@en | \n",
" entity (Q35120)<BR/>instance count: 34<BR/>nod... | \n",
"
\n",
" \n",
" 14 | \n",
" Q35825432 | \n",
" Q1420 | \n",
" 2 | \n",
" few_subclasses | \n",
" 'converter'@en | \n",
" converter (Q35825432)<BR/>instance count: 2<BR... | \n",
"
\n",
" \n",
" 15 | \n",
" Q39546 | \n",
" Q1420 | \n",
" 1029 | \n",
" many_subclasses | \n",
" 'tool'@en | \n",
" tool (Q39546)<BR/>instance count: 1029<BR/>nod... | \n",
"
\n",
" \n",
" 16 | \n",
" Q42889 | \n",
" Q1420 | \n",
" 114 | \n",
" few_subclasses | \n",
" 'vehicle'@en | \n",
" vehicle (Q42889)<BR/>instance count: 114<BR/>n... | \n",
"
\n",
" \n",
" 17 | \n",
" Q4406616 | \n",
" Q1420 | \n",
" 322 | \n",
" few_subclasses | \n",
" 'concrete object'@en | \n",
" concrete object (Q4406616)<BR/>instance count:... | \n",
"
\n",
" \n",
" 18 | \n",
" Q488383 | \n",
" Q1420 | \n",
" 701 | \n",
" few_subclasses | \n",
" 'object'@en | \n",
" object (Q488383)<BR/>instance count: 701<BR/>n... | \n",
"
\n",
" \n",
" 19 | \n",
" Q6671777 | \n",
" Q1420 | \n",
" 29 | \n",
" few_subclasses | \n",
" 'structure'@en | \n",
" structure (Q6671777)<BR/>instance count: 29<BR... | \n",
"
\n",
" \n",
" 20 | \n",
" Q752870 | \n",
" Q1420 | \n",
" 8 | \n",
" few_subclasses | \n",
" 'motor vehicle'@en | \n",
" motor vehicle (Q752870)<BR/>instance count: 8<... | \n",
"
\n",
" \n",
" 21 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" 52 | \n",
" few_subclasses | \n",
" 'artificial physical object'@en | \n",
" artificial physical object (Q8205328)<BR/>inst... | \n",
"
\n",
" \n",
" 22 | \n",
" Q9158768 | \n",
" Q1420 | \n",
" 63 | \n",
" few_subclasses | \n",
" 'storage'@en | \n",
" storage (Q9158768)<BR/>instance count: 63<BR/>... | \n",
"
\n",
" \n",
" 23 | \n",
" Q987767 | \n",
" Q1420 | \n",
" 282 | \n",
" few_subclasses | \n",
" 'container'@en | \n",
" container (Q987767)<BR/>instance count: 282<BR... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 graph instance_count node_type \\\n",
"0 Q11019 Q1420 99 few_subclasses \n",
"1 Q1183543 Q1420 198 many_subclasses \n",
"2 Q1301433 Q1420 17 few_subclasses \n",
"3 Q1420 Q1420 862 many_subclasses \n",
"4 Q15401930 Q1420 12 few_subclasses \n",
"5 Q15618781 Q1420 29 few_subclasses \n",
"6 Q16686448 Q1420 24 few_subclasses \n",
"7 Q16798631 Q1420 389 few_subclasses \n",
"8 Q223557 Q1420 110 few_subclasses \n",
"9 Q2424752 Q1420 412 few_subclasses \n",
"10 Q28877 Q1420 11 few_subclasses \n",
"11 Q3245975 Q1420 1 few_subclasses \n",
"12 Q337060 Q1420 38 few_subclasses \n",
"13 Q35120 Q1420 34 few_subclasses \n",
"14 Q35825432 Q1420 2 few_subclasses \n",
"15 Q39546 Q1420 1029 many_subclasses \n",
"16 Q42889 Q1420 114 few_subclasses \n",
"17 Q4406616 Q1420 322 few_subclasses \n",
"18 Q488383 Q1420 701 few_subclasses \n",
"19 Q6671777 Q1420 29 few_subclasses \n",
"20 Q752870 Q1420 8 few_subclasses \n",
"21 Q8205328 Q1420 52 few_subclasses \n",
"22 Q9158768 Q1420 63 few_subclasses \n",
"23 Q987767 Q1420 282 few_subclasses \n",
"\n",
" label \\\n",
"0 'machine'@en \n",
"1 'device'@en \n",
"2 'land vehicle'@en \n",
"3 'motor car'@en \n",
"4 'product'@en \n",
"5 'wheeled vehicle'@en \n",
"6 'artificial entity'@en \n",
"7 'equipment'@en \n",
"8 'physical object'@en \n",
"9 'product'@en \n",
"10 'goods'@en \n",
"11 'finished good'@en \n",
"12 'perceptible object'@en \n",
"13 'entity'@en \n",
"14 'converter'@en \n",
"15 'tool'@en \n",
"16 'vehicle'@en \n",
"17 'concrete object'@en \n",
"18 'object'@en \n",
"19 'structure'@en \n",
"20 'motor vehicle'@en \n",
"21 'artificial physical object'@en \n",
"22 'storage'@en \n",
"23 'container'@en \n",
"\n",
" tooltip \n",
"0 machine (Q11019)
instance count: 99
no... \n",
"1 device (Q1183543)
instance count: 198
... \n",
"2 land vehicle (Q1301433)
instance count: 17... \n",
"3 motor car (Q1420)
instance count: 862
... \n",
"4 product (Q15401930)
instance count: 12
instance count... \n",
"6 artificial entity (Q16686448)
instance cou... \n",
"7 equipment (Q16798631)
instance count: 389<... \n",
"8 physical object (Q223557)
instance count: ... \n",
"9 product (Q2424752)
instance count: 412
instance count: 11
node... \n",
"11 finished good (Q3245975)
instance count: 1... \n",
"12 perceptible object (Q337060)
instance coun... \n",
"13 entity (Q35120)
instance count: 34
nod... \n",
"14 converter (Q35825432)
instance count: 2instance count: 1029
nod... \n",
"16 vehicle (Q42889)
instance count: 114
n... \n",
"17 concrete object (Q4406616)
instance count:... \n",
"18 object (Q488383)
instance count: 701
n... \n",
"19 structure (Q6671777)
instance count: 29instance count: 8<... \n",
"21 artificial physical object (Q8205328)
inst... \n",
"22 storage (Q9158768)
instance count: 63
... \n",
"23 container (Q987767)
instance count: 282()'\n",
"\"\"\")"
]
},
{
"cell_type": "raw",
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T14:13:01.391259Z",
"iopub.status.busy": "2021-12-29T14:13:01.391031Z",
"iopub.status.idle": "2021-12-29T14:16:00.942753Z",
"shell.execute_reply": "2021-12-29T14:16:00.941721Z",
"shell.execute_reply.started": "2021-12-29T14:13:01.391236Z"
}
},
"source": [
" kgtk(f\"\"\"\n",
" query -i classvizedgetest\n",
" --match '(class)-[{{label: property, graph: \"{root}\", edge_type: edge_type}}]->(superclass)'\n",
" -o $TEMP/browser/{root}.graph.low.tsv\n",
" \"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test creation of visualizations"
]
},
{
"cell_type": "code",
"execution_count": 234,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T18:00:44.743349Z",
"iopub.status.busy": "2021-12-31T18:00:44.743062Z",
"iopub.status.idle": "2021-12-31T18:01:54.519639Z",
"shell.execute_reply": "2021-12-31T18:01:54.518704Z",
"shell.execute_reply.started": "2021-12-31T18:00:44.743323Z"
},
"tags": []
},
"outputs": [],
"source": [
"# Root classes whose graphs are exported for the visualization tests.\n",
"roots = [\n",
"    \"Q11424\",\n",
"    \"Q391342\",\n",
"    \"Q1420\",\n",
"    \"Q1107\",\n",
"    \"Q889821\",\n",
"    \"Q1549591\",\n",
"    \"Q188724\",\n",
"    \"Q946808\",\n",
"    \"Q33999\",\n",
"    \"Q483501\",\n",
"    \"Q2221906\",\n",
"    \"Q144\",\n",
"    \"Q516021\",\n",
"    \"Q10494269\",\n",
"]\n",
"\n",
"# For every root, write the edges and then the nodes whose `graph` value is\n",
"# that root into per-root files under $TEMP/browser.\n",
"for root in roots:\n",
"    edge_query = f\"\"\"\n",
"        query -i classvizedgetest\n",
"        --match '(class)-[{{label: property, graph: \"{root}\", edge_type: edge_type}}]->(superclass)'\n",
"        -o $TEMP/browser/{root}.graph.low.tsv\n",
"    \"\"\"\n",
"    node_query = f\"\"\"\n",
"        query -i classviznode\n",
"        --match '(class)-[{{graph: \"{root}\", instance_count: instance_count, label: label}}]->()'\n",
"        -o $TEMP/browser/{root}.node.graph.low.tsv\n",
"    \"\"\"\n",
"\n",
"    kgtk(edge_query)\n",
"    kgtk(node_query)\n",
"\n",
"    # Disabled: render the per-root edge file as a force graph.\n",
"    # kgtk(f\"\"\"\n",
"    #     visualize-force-graph -i $TEMP/browser/{root}.graph.low.tsv\n",
"    #     --direction arrow\n",
"    #     -o $TEMP/browser/{root}.graph.low.html\n",
"    # \"\"\")"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"    # Render the per-root edge/node files as an HTML force graph.\n",
"    # Inputs are read from $TEMP/browser, the directory the export loop writes to.\n",
"    kgtk(f\"\"\"\n",
"        visualize-force-graph -i $TEMP/browser/{root}.graph.low.tsv --node-file $TEMP/browser/{root}.node.graph.low.tsv\n",
"            --direction arrow\n",
"            --node-size-column instance_count\n",
"            --node-size-minimum 2.0\n",
"            --node-size-maximum 8.0\n",
"            --node-size-default 1.0\n",
"            --node-size-scale log\n",
"            --node-color-column node_type\n",
"            --node-color-style categorical\n",
"            --edge-color-column edge_type\n",
"            --edge-color-style categorical\n",
"            -o $TEMP/browser/{root}.graph.low.html\n",
"    \"\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tests for individual files"
]
},
{
"cell_type": "code",
"execution_count": 224,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T16:19:46.509942Z",
"iopub.status.busy": "2021-12-31T16:19:46.509616Z",
"iopub.status.idle": "2021-12-31T16:20:38.883353Z",
"shell.execute_reply": "2021-12-31T16:20:38.882501Z",
"shell.execute_reply.started": "2021-12-31T16:19:46.509911Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" graph | \n",
" instance_count | \n",
" node_type | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q11019 | \n",
" Q1420 | \n",
" 99 | \n",
" few_subclasses | \n",
" 'machine'@en | \n",
"
\n",
" \n",
" 1 | \n",
" Q1183543 | \n",
" Q1420 | \n",
" 198 | \n",
" many_subclasses | \n",
" 'device'@en | \n",
"
\n",
" \n",
" 2 | \n",
" Q1301433 | \n",
" Q1420 | \n",
" 17 | \n",
" few_subclasses | \n",
" 'land vehicle'@en | \n",
"
\n",
" \n",
" 3 | \n",
" Q1420 | \n",
" Q1420 | \n",
" 862 | \n",
" many_subclasses | \n",
" 'motor car'@en | \n",
"
\n",
" \n",
" 4 | \n",
" Q15401930 | \n",
" Q1420 | \n",
" 12 | \n",
" few_subclasses | \n",
" 'product'@en | \n",
"
\n",
" \n",
" 5 | \n",
" Q15618781 | \n",
" Q1420 | \n",
" 29 | \n",
" few_subclasses | \n",
" 'wheeled vehicle'@en | \n",
"
\n",
" \n",
" 6 | \n",
" Q16686448 | \n",
" Q1420 | \n",
" 24 | \n",
" few_subclasses | \n",
" 'artificial entity'@en | \n",
"
\n",
" \n",
" 7 | \n",
" Q16798631 | \n",
" Q1420 | \n",
" 389 | \n",
" few_subclasses | \n",
" 'equipment'@en | \n",
"
\n",
" \n",
" 8 | \n",
" Q223557 | \n",
" Q1420 | \n",
" 110 | \n",
" few_subclasses | \n",
" 'physical object'@en | \n",
"
\n",
" \n",
" 9 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" 412 | \n",
" few_subclasses | \n",
" 'product'@en | \n",
"
\n",
" \n",
" 10 | \n",
" Q28877 | \n",
" Q1420 | \n",
" 11 | \n",
" few_subclasses | \n",
" 'goods'@en | \n",
"
\n",
" \n",
" 11 | \n",
" Q3245975 | \n",
" Q1420 | \n",
" 1 | \n",
" few_subclasses | \n",
" 'finished good'@en | \n",
"
\n",
" \n",
" 12 | \n",
" Q337060 | \n",
" Q1420 | \n",
" 38 | \n",
" few_subclasses | \n",
" 'perceptible object'@en | \n",
"
\n",
" \n",
" 13 | \n",
" Q35120 | \n",
" Q1420 | \n",
" 34 | \n",
" few_subclasses | \n",
" 'entity'@en | \n",
"
\n",
" \n",
" 14 | \n",
" Q35825432 | \n",
" Q1420 | \n",
" 2 | \n",
" few_subclasses | \n",
" 'converter'@en | \n",
"
\n",
" \n",
" 15 | \n",
" Q39546 | \n",
" Q1420 | \n",
" 1029 | \n",
" many_subclasses | \n",
" 'tool'@en | \n",
"
\n",
" \n",
" 16 | \n",
" Q42889 | \n",
" Q1420 | \n",
" 114 | \n",
" few_subclasses | \n",
" 'vehicle'@en | \n",
"
\n",
" \n",
" 17 | \n",
" Q4406616 | \n",
" Q1420 | \n",
" 322 | \n",
" few_subclasses | \n",
" 'concrete object'@en | \n",
"
\n",
" \n",
" 18 | \n",
" Q488383 | \n",
" Q1420 | \n",
" 701 | \n",
" few_subclasses | \n",
" 'object'@en | \n",
"
\n",
" \n",
" 19 | \n",
" Q6671777 | \n",
" Q1420 | \n",
" 29 | \n",
" few_subclasses | \n",
" 'structure'@en | \n",
"
\n",
" \n",
" 20 | \n",
" Q752870 | \n",
" Q1420 | \n",
" 8 | \n",
" few_subclasses | \n",
" 'motor vehicle'@en | \n",
"
\n",
" \n",
" 21 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" 52 | \n",
" few_subclasses | \n",
" 'artificial physical object'@en | \n",
"
\n",
" \n",
" 22 | \n",
" Q9158768 | \n",
" Q1420 | \n",
" 63 | \n",
" few_subclasses | \n",
" 'storage'@en | \n",
"
\n",
" \n",
" 23 | \n",
" Q987767 | \n",
" Q1420 | \n",
" 282 | \n",
" few_subclasses | \n",
" 'container'@en | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 graph instance_count node_type \\\n",
"0 Q11019 Q1420 99 few_subclasses \n",
"1 Q1183543 Q1420 198 many_subclasses \n",
"2 Q1301433 Q1420 17 few_subclasses \n",
"3 Q1420 Q1420 862 many_subclasses \n",
"4 Q15401930 Q1420 12 few_subclasses \n",
"5 Q15618781 Q1420 29 few_subclasses \n",
"6 Q16686448 Q1420 24 few_subclasses \n",
"7 Q16798631 Q1420 389 few_subclasses \n",
"8 Q223557 Q1420 110 few_subclasses \n",
"9 Q2424752 Q1420 412 few_subclasses \n",
"10 Q28877 Q1420 11 few_subclasses \n",
"11 Q3245975 Q1420 1 few_subclasses \n",
"12 Q337060 Q1420 38 few_subclasses \n",
"13 Q35120 Q1420 34 few_subclasses \n",
"14 Q35825432 Q1420 2 few_subclasses \n",
"15 Q39546 Q1420 1029 many_subclasses \n",
"16 Q42889 Q1420 114 few_subclasses \n",
"17 Q4406616 Q1420 322 few_subclasses \n",
"18 Q488383 Q1420 701 few_subclasses \n",
"19 Q6671777 Q1420 29 few_subclasses \n",
"20 Q752870 Q1420 8 few_subclasses \n",
"21 Q8205328 Q1420 52 few_subclasses \n",
"22 Q9158768 Q1420 63 few_subclasses \n",
"23 Q987767 Q1420 282 few_subclasses \n",
"\n",
" label \n",
"0 'machine'@en \n",
"1 'device'@en \n",
"2 'land vehicle'@en \n",
"3 'motor car'@en \n",
"4 'product'@en \n",
"5 'wheeled vehicle'@en \n",
"6 'artificial entity'@en \n",
"7 'equipment'@en \n",
"8 'physical object'@en \n",
"9 'product'@en \n",
"10 'goods'@en \n",
"11 'finished good'@en \n",
"12 'perceptible object'@en \n",
"13 'entity'@en \n",
"14 'converter'@en \n",
"15 'tool'@en \n",
"16 'vehicle'@en \n",
"17 'concrete object'@en \n",
"18 'object'@en \n",
"19 'structure'@en \n",
"20 'motor vehicle'@en \n",
"21 'artificial physical object'@en \n",
"22 'storage'@en \n",
"23 'container'@en "
]
},
"execution_count": 224,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows of graph.low.node.tsv.gz whose `graph` property is Q1420, ordered by node.\n",
"low_node_rows_cmd = \"\"\"\n",
"    query -i $TEMP/graph.low.node.tsv.gz\n",
"        --match '(node)-[{graph: \"Q1420\"}]->()'\n",
"        --order-by 'node'\n",
"\"\"\"\n",
"kgtk(low_node_rows_cmd)"
]
},
{
"cell_type": "code",
"execution_count": 223,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T16:19:33.684922Z",
"iopub.status.busy": "2021-12-31T16:19:33.684648Z",
"iopub.status.idle": "2021-12-31T16:19:36.275324Z",
"shell.execute_reply": "2021-12-31T16:19:36.274525Z",
"shell.execute_reply.started": "2021-12-31T16:19:33.684894Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" graph | \n",
" instance_count | \n",
" node_type | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [node1, graph, instance_count, node_type, label]\n",
"Index: []"
]
},
"execution_count": 223,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows of graph.high.node.tsv.gz whose `graph` property is Q1420, ordered by node.\n",
"high_node_rows_cmd = \"\"\"\n",
"    query -i $TEMP/graph.high.node.tsv.gz\n",
"        --match '(node)-[{graph: \"Q1420\"}]->()'\n",
"        --order-by 'node'\n",
"\"\"\"\n",
"kgtk(high_node_rows_cmd)"
]
},
{
"cell_type": "code",
"execution_count": 227,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T16:33:45.327442Z",
"iopub.status.busy": "2021-12-31T16:33:45.327041Z",
"iopub.status.idle": "2021-12-31T16:45:39.288687Z",
"shell.execute_reply": "2021-12-31T16:45:39.287966Z",
"shell.execute_reply.started": "2021-12-31T16:33:45.327414Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" graph | \n",
" instance_count | \n",
" node_type | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q11019 | \n",
" Q1420 | \n",
" 99 | \n",
" few_subclasses | \n",
" 'machine'@en | \n",
"
\n",
" \n",
" 1 | \n",
" Q1183543 | \n",
" Q1420 | \n",
" 198 | \n",
" many_subclasses | \n",
" 'device'@en | \n",
"
\n",
" \n",
" 2 | \n",
" Q1301433 | \n",
" Q1420 | \n",
" 17 | \n",
" few_subclasses | \n",
" 'land vehicle'@en | \n",
"
\n",
" \n",
" 3 | \n",
" Q1420 | \n",
" Q1420 | \n",
" 862 | \n",
" many_subclasses | \n",
" 'motor car'@en | \n",
"
\n",
" \n",
" 4 | \n",
" Q15401930 | \n",
" Q1420 | \n",
" 12 | \n",
" few_subclasses | \n",
" 'product'@en | \n",
"
\n",
" \n",
" 5 | \n",
" Q15618781 | \n",
" Q1420 | \n",
" 29 | \n",
" few_subclasses | \n",
" 'wheeled vehicle'@en | \n",
"
\n",
" \n",
" 6 | \n",
" Q16686448 | \n",
" Q1420 | \n",
" 24 | \n",
" few_subclasses | \n",
" 'artificial entity'@en | \n",
"
\n",
" \n",
" 7 | \n",
" Q16798631 | \n",
" Q1420 | \n",
" 389 | \n",
" few_subclasses | \n",
" 'equipment'@en | \n",
"
\n",
" \n",
" 8 | \n",
" Q223557 | \n",
" Q1420 | \n",
" 110 | \n",
" few_subclasses | \n",
" 'physical object'@en | \n",
"
\n",
" \n",
" 9 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" 412 | \n",
" few_subclasses | \n",
" 'product'@en | \n",
"
\n",
" \n",
" 10 | \n",
" Q28877 | \n",
" Q1420 | \n",
" 11 | \n",
" few_subclasses | \n",
" 'goods'@en | \n",
"
\n",
" \n",
" 11 | \n",
" Q3245975 | \n",
" Q1420 | \n",
" 1 | \n",
" few_subclasses | \n",
" 'finished good'@en | \n",
"
\n",
" \n",
" 12 | \n",
" Q337060 | \n",
" Q1420 | \n",
" 38 | \n",
" few_subclasses | \n",
" 'perceptible object'@en | \n",
"
\n",
" \n",
" 13 | \n",
" Q35120 | \n",
" Q1420 | \n",
" 34 | \n",
" few_subclasses | \n",
" 'entity'@en | \n",
"
\n",
" \n",
" 14 | \n",
" Q35825432 | \n",
" Q1420 | \n",
" 2 | \n",
" few_subclasses | \n",
" 'converter'@en | \n",
"
\n",
" \n",
" 15 | \n",
" Q39546 | \n",
" Q1420 | \n",
" 1029 | \n",
" many_subclasses | \n",
" 'tool'@en | \n",
"
\n",
" \n",
" 16 | \n",
" Q42889 | \n",
" Q1420 | \n",
" 114 | \n",
" few_subclasses | \n",
" 'vehicle'@en | \n",
"
\n",
" \n",
" 17 | \n",
" Q4406616 | \n",
" Q1420 | \n",
" 322 | \n",
" few_subclasses | \n",
" 'concrete object'@en | \n",
"
\n",
" \n",
" 18 | \n",
" Q488383 | \n",
" Q1420 | \n",
" 701 | \n",
" few_subclasses | \n",
" 'object'@en | \n",
"
\n",
" \n",
" 19 | \n",
" Q6671777 | \n",
" Q1420 | \n",
" 29 | \n",
" few_subclasses | \n",
" 'structure'@en | \n",
"
\n",
" \n",
" 20 | \n",
" Q752870 | \n",
" Q1420 | \n",
" 8 | \n",
" few_subclasses | \n",
" 'motor vehicle'@en | \n",
"
\n",
" \n",
" 21 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" 52 | \n",
" few_subclasses | \n",
" 'artificial physical object'@en | \n",
"
\n",
" \n",
" 22 | \n",
" Q9158768 | \n",
" Q1420 | \n",
" 63 | \n",
" few_subclasses | \n",
" 'storage'@en | \n",
"
\n",
" \n",
" 23 | \n",
" Q987767 | \n",
" Q1420 | \n",
" 282 | \n",
" few_subclasses | \n",
" 'container'@en | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 graph instance_count node_type \\\n",
"0 Q11019 Q1420 99 few_subclasses \n",
"1 Q1183543 Q1420 198 many_subclasses \n",
"2 Q1301433 Q1420 17 few_subclasses \n",
"3 Q1420 Q1420 862 many_subclasses \n",
"4 Q15401930 Q1420 12 few_subclasses \n",
"5 Q15618781 Q1420 29 few_subclasses \n",
"6 Q16686448 Q1420 24 few_subclasses \n",
"7 Q16798631 Q1420 389 few_subclasses \n",
"8 Q223557 Q1420 110 few_subclasses \n",
"9 Q2424752 Q1420 412 few_subclasses \n",
"10 Q28877 Q1420 11 few_subclasses \n",
"11 Q3245975 Q1420 1 few_subclasses \n",
"12 Q337060 Q1420 38 few_subclasses \n",
"13 Q35120 Q1420 34 few_subclasses \n",
"14 Q35825432 Q1420 2 few_subclasses \n",
"15 Q39546 Q1420 1029 many_subclasses \n",
"16 Q42889 Q1420 114 few_subclasses \n",
"17 Q4406616 Q1420 322 few_subclasses \n",
"18 Q488383 Q1420 701 few_subclasses \n",
"19 Q6671777 Q1420 29 few_subclasses \n",
"20 Q752870 Q1420 8 few_subclasses \n",
"21 Q8205328 Q1420 52 few_subclasses \n",
"22 Q9158768 Q1420 63 few_subclasses \n",
"23 Q987767 Q1420 282 few_subclasses \n",
"\n",
" label \n",
"0 'machine'@en \n",
"1 'device'@en \n",
"2 'land vehicle'@en \n",
"3 'motor car'@en \n",
"4 'product'@en \n",
"5 'wheeled vehicle'@en \n",
"6 'artificial entity'@en \n",
"7 'equipment'@en \n",
"8 'physical object'@en \n",
"9 'product'@en \n",
"10 'goods'@en \n",
"11 'finished good'@en \n",
"12 'perceptible object'@en \n",
"13 'entity'@en \n",
"14 'converter'@en \n",
"15 'tool'@en \n",
"16 'vehicle'@en \n",
"17 'concrete object'@en \n",
"18 'object'@en \n",
"19 'structure'@en \n",
"20 'motor vehicle'@en \n",
"21 'artificial physical object'@en \n",
"22 'storage'@en \n",
"23 'container'@en "
]
},
"execution_count": 227,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows of class-visualization.node.tsv.gz whose `graph` property is Q1420, ordered by node.\n",
"classviz_node_rows_cmd = \"\"\"\n",
"    query -i $TEMP/class-visualization.node.tsv.gz\n",
"        --match '(node)-[{graph: \"Q1420\"}]->()'\n",
"        --order-by 'node'\n",
"\"\"\"\n",
"kgtk(classviz_node_rows_cmd)"
]
},
{
"cell_type": "code",
"execution_count": 232,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T17:34:29.568315Z",
"iopub.status.busy": "2021-12-31T17:34:29.567994Z",
"iopub.status.idle": "2021-12-31T17:34:32.435182Z",
"shell.execute_reply": "2021-12-31T17:34:32.434584Z",
"shell.execute_reply.started": "2021-12-31T17:34:29.568280Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" graph | \n",
" instance_count | \n",
" node_type | \n",
" label | \n",
" tooltip | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q11019 | \n",
" Q1420 | \n",
" 99 | \n",
" few_subclasses | \n",
" 'machine'@en | \n",
" machine (Q11019)<BR/>instance count: 99<BR/>no... | \n",
"
\n",
" \n",
" 1 | \n",
" Q1183543 | \n",
" Q1420 | \n",
" 198 | \n",
" many_subclasses | \n",
" 'device'@en | \n",
" device (Q1183543)<BR/>instance count: 198<BR/>... | \n",
"
\n",
" \n",
" 2 | \n",
" Q1301433 | \n",
" Q1420 | \n",
" 17 | \n",
" few_subclasses | \n",
" 'land vehicle'@en | \n",
" land vehicle (Q1301433)<BR/>instance count: 17... | \n",
"
\n",
" \n",
" 3 | \n",
" Q1420 | \n",
" Q1420 | \n",
" 862 | \n",
" many_subclasses | \n",
" 'motor car'@en | \n",
" motor car (Q1420)<BR/>instance count: 862<BR/>... | \n",
"
\n",
" \n",
" 4 | \n",
" Q15401930 | \n",
" Q1420 | \n",
" 12 | \n",
" few_subclasses | \n",
" 'product'@en | \n",
" product (Q15401930)<BR/>instance count: 12<BR/... | \n",
"
\n",
" \n",
" 5 | \n",
" Q15618781 | \n",
" Q1420 | \n",
" 29 | \n",
" few_subclasses | \n",
" 'wheeled vehicle'@en | \n",
" wheeled vehicle (Q15618781)<BR/>instance count... | \n",
"
\n",
" \n",
" 6 | \n",
" Q16686448 | \n",
" Q1420 | \n",
" 24 | \n",
" few_subclasses | \n",
" 'artificial entity'@en | \n",
" artificial entity (Q16686448)<BR/>instance cou... | \n",
"
\n",
" \n",
" 7 | \n",
" Q16798631 | \n",
" Q1420 | \n",
" 389 | \n",
" few_subclasses | \n",
" 'equipment'@en | \n",
" equipment (Q16798631)<BR/>instance count: 389<... | \n",
"
\n",
" \n",
" 8 | \n",
" Q223557 | \n",
" Q1420 | \n",
" 110 | \n",
" few_subclasses | \n",
" 'physical object'@en | \n",
" physical object (Q223557)<BR/>instance count: ... | \n",
"
\n",
" \n",
" 9 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" 412 | \n",
" few_subclasses | \n",
" 'product'@en | \n",
" product (Q2424752)<BR/>instance count: 412<BR/... | \n",
"
\n",
" \n",
" 10 | \n",
" Q28877 | \n",
" Q1420 | \n",
" 11 | \n",
" few_subclasses | \n",
" 'goods'@en | \n",
" goods (Q28877)<BR/>instance count: 11<BR/>node... | \n",
"
\n",
" \n",
" 11 | \n",
" Q3245975 | \n",
" Q1420 | \n",
" 1 | \n",
" few_subclasses | \n",
" 'finished good'@en | \n",
" finished good (Q3245975)<BR/>instance count: 1... | \n",
"
\n",
" \n",
" 12 | \n",
" Q337060 | \n",
" Q1420 | \n",
" 38 | \n",
" few_subclasses | \n",
" 'perceptible object'@en | \n",
" perceptible object (Q337060)<BR/>instance coun... | \n",
"
\n",
" \n",
" 13 | \n",
" Q35120 | \n",
" Q1420 | \n",
" 34 | \n",
" few_subclasses | \n",
" 'entity'@en | \n",
" entity (Q35120)<BR/>instance count: 34<BR/>nod... | \n",
"
\n",
" \n",
" 14 | \n",
" Q35825432 | \n",
" Q1420 | \n",
" 2 | \n",
" few_subclasses | \n",
" 'converter'@en | \n",
" converter (Q35825432)<BR/>instance count: 2<BR... | \n",
"
\n",
" \n",
" 15 | \n",
" Q39546 | \n",
" Q1420 | \n",
" 1029 | \n",
" many_subclasses | \n",
" 'tool'@en | \n",
" tool (Q39546)<BR/>instance count: 1029<BR/>nod... | \n",
"
\n",
" \n",
" 16 | \n",
" Q42889 | \n",
" Q1420 | \n",
" 114 | \n",
" few_subclasses | \n",
" 'vehicle'@en | \n",
" vehicle (Q42889)<BR/>instance count: 114<BR/>n... | \n",
"
\n",
" \n",
" 17 | \n",
" Q4406616 | \n",
" Q1420 | \n",
" 322 | \n",
" few_subclasses | \n",
" 'concrete object'@en | \n",
" concrete object (Q4406616)<BR/>instance count:... | \n",
"
\n",
" \n",
" 18 | \n",
" Q488383 | \n",
" Q1420 | \n",
" 701 | \n",
" few_subclasses | \n",
" 'object'@en | \n",
" object (Q488383)<BR/>instance count: 701<BR/>n... | \n",
"
\n",
" \n",
" 19 | \n",
" Q6671777 | \n",
" Q1420 | \n",
" 29 | \n",
" few_subclasses | \n",
" 'structure'@en | \n",
" structure (Q6671777)<BR/>instance count: 29<BR... | \n",
"
\n",
" \n",
" 20 | \n",
" Q752870 | \n",
" Q1420 | \n",
" 8 | \n",
" few_subclasses | \n",
" 'motor vehicle'@en | \n",
" motor vehicle (Q752870)<BR/>instance count: 8<... | \n",
"
\n",
" \n",
" 21 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" 52 | \n",
" few_subclasses | \n",
" 'artificial physical object'@en | \n",
" artificial physical object (Q8205328)<BR/>inst... | \n",
"
\n",
" \n",
" 22 | \n",
" Q9158768 | \n",
" Q1420 | \n",
" 63 | \n",
" few_subclasses | \n",
" 'storage'@en | \n",
" storage (Q9158768)<BR/>instance count: 63<BR/>... | \n",
"
\n",
" \n",
" 23 | \n",
" Q987767 | \n",
" Q1420 | \n",
" 282 | \n",
" few_subclasses | \n",
" 'container'@en | \n",
" container (Q987767)<BR/>instance count: 282<BR... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 graph instance_count node_type \\\n",
"0 Q11019 Q1420 99 few_subclasses \n",
"1 Q1183543 Q1420 198 many_subclasses \n",
"2 Q1301433 Q1420 17 few_subclasses \n",
"3 Q1420 Q1420 862 many_subclasses \n",
"4 Q15401930 Q1420 12 few_subclasses \n",
"5 Q15618781 Q1420 29 few_subclasses \n",
"6 Q16686448 Q1420 24 few_subclasses \n",
"7 Q16798631 Q1420 389 few_subclasses \n",
"8 Q223557 Q1420 110 few_subclasses \n",
"9 Q2424752 Q1420 412 few_subclasses \n",
"10 Q28877 Q1420 11 few_subclasses \n",
"11 Q3245975 Q1420 1 few_subclasses \n",
"12 Q337060 Q1420 38 few_subclasses \n",
"13 Q35120 Q1420 34 few_subclasses \n",
"14 Q35825432 Q1420 2 few_subclasses \n",
"15 Q39546 Q1420 1029 many_subclasses \n",
"16 Q42889 Q1420 114 few_subclasses \n",
"17 Q4406616 Q1420 322 few_subclasses \n",
"18 Q488383 Q1420 701 few_subclasses \n",
"19 Q6671777 Q1420 29 few_subclasses \n",
"20 Q752870 Q1420 8 few_subclasses \n",
"21 Q8205328 Q1420 52 few_subclasses \n",
"22 Q9158768 Q1420 63 few_subclasses \n",
"23 Q987767 Q1420 282 few_subclasses \n",
"\n",
" label \\\n",
"0 'machine'@en \n",
"1 'device'@en \n",
"2 'land vehicle'@en \n",
"3 'motor car'@en \n",
"4 'product'@en \n",
"5 'wheeled vehicle'@en \n",
"6 'artificial entity'@en \n",
"7 'equipment'@en \n",
"8 'physical object'@en \n",
"9 'product'@en \n",
"10 'goods'@en \n",
"11 'finished good'@en \n",
"12 'perceptible object'@en \n",
"13 'entity'@en \n",
"14 'converter'@en \n",
"15 'tool'@en \n",
"16 'vehicle'@en \n",
"17 'concrete object'@en \n",
"18 'object'@en \n",
"19 'structure'@en \n",
"20 'motor vehicle'@en \n",
"21 'artificial physical object'@en \n",
"22 'storage'@en \n",
"23 'container'@en \n",
"\n",
" tooltip \n",
"0 machine (Q11019)
instance count: 99
no... \n",
"1 device (Q1183543)
instance count: 198
... \n",
"2 land vehicle (Q1301433)
instance count: 17... \n",
"3 motor car (Q1420)
instance count: 862
... \n",
"4 product (Q15401930)
instance count: 12
instance count... \n",
"6 artificial entity (Q16686448)
instance cou... \n",
"7 equipment (Q16798631)
instance count: 389<... \n",
"8 physical object (Q223557)
instance count: ... \n",
"9 product (Q2424752)
instance count: 412
instance count: 11
node... \n",
"11 finished good (Q3245975)
instance count: 1... \n",
"12 perceptible object (Q337060)
instance coun... \n",
"13 entity (Q35120)
instance count: 34
nod... \n",
"14 converter (Q35825432)
instance count: 2instance count: 1029
nod... \n",
"16 vehicle (Q42889)
instance count: 114
n... \n",
"17 concrete object (Q4406616)
instance count:... \n",
"18 object (Q488383)
instance count: 701
n... \n",
"19 structure (Q6671777)
instance count: 29instance count: 8<... \n",
"21 artificial physical object (Q8205328)
inst... \n",
"22 storage (Q9158768)
instance count: 63
... \n",
"23 container (Q987767)
instance count: 282()'\n",
" --order-by 'node'\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 214,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T08:29:21.356605Z",
"iopub.status.busy": "2021-12-31T08:29:21.356378Z",
"iopub.status.idle": "2021-12-31T08:29:34.399092Z",
"shell.execute_reply": "2021-12-31T08:29:34.398417Z",
"shell.execute_reply.started": "2021-12-31T08:29:21.356580Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" graph | \n",
" edge_type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q11019 | \n",
" P279 | \n",
" Q1183543 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 1 | \n",
" Q11019 | \n",
" P279 | \n",
" Q39546 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 2 | \n",
" Q1183543 | \n",
" P279 | \n",
" Q16686448 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 3 | \n",
" Q1183543 | \n",
" P279 | \n",
" Q16798631 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 4 | \n",
" Q1183543 | \n",
" P279 | \n",
" Q39546 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 5 | \n",
" Q1301433 | \n",
" P279 | \n",
" Q42889 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 6 | \n",
" Q1420 | \n",
" P279 | \n",
" Q752870 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 7 | \n",
" Q15401930 | \n",
" P279 | \n",
" Q488383 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 8 | \n",
" Q15618781 | \n",
" P279 | \n",
" Q1301433 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 9 | \n",
" Q16686448 | \n",
" P279 | \n",
" Q35120 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 10 | \n",
" Q16798631 | \n",
" P279 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 11 | \n",
" Q16798631 | \n",
" P279 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 12 | \n",
" Q223557 | \n",
" P279 | \n",
" Q4406616 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 13 | \n",
" Q2424752 | \n",
" P279 | \n",
" Q15401930 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 14 | \n",
" Q2424752 | \n",
" P279 | \n",
" Q28877 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 15 | \n",
" Q2424752 | \n",
" P279 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 16 | \n",
" Q28877 | \n",
" P279 | \n",
" Q337060 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 17 | \n",
" Q3245975 | \n",
" P279 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 18 | \n",
" Q337060 | \n",
" P279 | \n",
" Q223557 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 19 | \n",
" Q35825432 | \n",
" P279 | \n",
" Q35120 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 20 | \n",
" Q39546 | \n",
" P279 | \n",
" Q16798631 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 21 | \n",
" Q39546 | \n",
" P279 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 22 | \n",
" Q39546 | \n",
" P279 | \n",
" Q35825432 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 23 | \n",
" Q42889 | \n",
" P279 | \n",
" Q11019 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 24 | \n",
" Q42889 | \n",
" P279 | \n",
" Q3245975 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 25 | \n",
" Q42889 | \n",
" P279 | \n",
" Q987767 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 26 | \n",
" Q4406616 | \n",
" P279 | \n",
" Q488383 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 27 | \n",
" Q488383 | \n",
" P279 | \n",
" Q35120 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 28 | \n",
" Q6671777 | \n",
" P279 | \n",
" Q35120 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 29 | \n",
" Q752870 | \n",
" P279 | \n",
" Q15618781 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 30 | \n",
" Q8205328 | \n",
" P279 | \n",
" Q16686448 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 31 | \n",
" Q8205328 | \n",
" P279 | \n",
" Q223557 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 32 | \n",
" Q9158768 | \n",
" P279 | \n",
" Q6671777 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 33 | \n",
" Q987767 | \n",
" P279 | \n",
" Q39546 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 34 | \n",
" Q987767 | \n",
" P279 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 35 | \n",
" Q987767 | \n",
" P279 | \n",
" Q9158768 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label node2 graph edge_type\n",
"0 Q11019 P279 Q1183543 Q1420 superclass\n",
"1 Q11019 P279 Q39546 Q1420 superclass\n",
"2 Q1183543 P279 Q16686448 Q1420 superclass\n",
"3 Q1183543 P279 Q16798631 Q1420 superclass\n",
"4 Q1183543 P279 Q39546 Q1420 superclass\n",
"5 Q1301433 P279 Q42889 Q1420 superclass\n",
"6 Q1420 P279 Q752870 Q1420 superclass\n",
"7 Q15401930 P279 Q488383 Q1420 superclass\n",
"8 Q15618781 P279 Q1301433 Q1420 superclass\n",
"9 Q16686448 P279 Q35120 Q1420 superclass\n",
"10 Q16798631 P279 Q2424752 Q1420 superclass\n",
"11 Q16798631 P279 Q8205328 Q1420 superclass\n",
"12 Q223557 P279 Q4406616 Q1420 superclass\n",
"13 Q2424752 P279 Q15401930 Q1420 superclass\n",
"14 Q2424752 P279 Q28877 Q1420 superclass\n",
"15 Q2424752 P279 Q8205328 Q1420 superclass\n",
"16 Q28877 P279 Q337060 Q1420 superclass\n",
"17 Q3245975 P279 Q2424752 Q1420 superclass\n",
"18 Q337060 P279 Q223557 Q1420 superclass\n",
"19 Q35825432 P279 Q35120 Q1420 superclass\n",
"20 Q39546 P279 Q16798631 Q1420 superclass\n",
"21 Q39546 P279 Q2424752 Q1420 superclass\n",
"22 Q39546 P279 Q35825432 Q1420 superclass\n",
"23 Q42889 P279 Q11019 Q1420 superclass\n",
"24 Q42889 P279 Q3245975 Q1420 superclass\n",
"25 Q42889 P279 Q987767 Q1420 superclass\n",
"26 Q4406616 P279 Q488383 Q1420 superclass\n",
"27 Q488383 P279 Q35120 Q1420 superclass\n",
"28 Q6671777 P279 Q35120 Q1420 superclass\n",
"29 Q752870 P279 Q15618781 Q1420 superclass\n",
"30 Q8205328 P279 Q16686448 Q1420 superclass\n",
"31 Q8205328 P279 Q223557 Q1420 superclass\n",
"32 Q9158768 P279 Q6671777 Q1420 superclass\n",
"33 Q987767 P279 Q39546 Q1420 superclass\n",
"34 Q987767 P279 Q8205328 Q1420 superclass\n",
"35 Q987767 P279 Q9158768 Q1420 superclass"
]
},
"execution_count": 214,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows of the `graphbrowser` input whose `graph` property is Q1420, ordered by node.\n",
"graphbrowser_rows_cmd = \"\"\"\n",
"    query -i graphbrowser\n",
"        --match '(node)-[{graph: \"Q1420\"}]->()'\n",
"        --order-by 'node'\n",
"\"\"\"\n",
"kgtk(graphbrowser_rows_cmd)"
]
},
{
"cell_type": "code",
"execution_count": 215,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T08:29:34.401024Z",
"iopub.status.busy": "2021-12-31T08:29:34.400813Z",
"iopub.status.idle": "2021-12-31T08:29:36.392851Z",
"shell.execute_reply": "2021-12-31T08:29:36.392273Z",
"shell.execute_reply.started": "2021-12-31T08:29:34.401000Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" graph | \n",
" edge_type | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [node1, label, node2, graph, edge_type]\n",
"Index: []"
]
},
"execution_count": 215,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows of graph.high.tsv.gz whose `graph` property is Q1420, ordered by node.\n",
"high_edge_rows_cmd = \"\"\"\n",
"    query -i $TEMP/graph.high.tsv.gz\n",
"        --match '(node)-[{graph: \"Q1420\"}]->()'\n",
"        --order-by 'node'\n",
"\"\"\"\n",
"kgtk(high_edge_rows_cmd)"
]
},
{
"cell_type": "code",
"execution_count": 216,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T08:29:36.394019Z",
"iopub.status.busy": "2021-12-31T08:29:36.393858Z",
"iopub.status.idle": "2021-12-31T08:30:07.925774Z",
"shell.execute_reply": "2021-12-31T08:30:07.924705Z",
"shell.execute_reply.started": "2021-12-31T08:29:36.393997Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" graph | \n",
" edge_type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q11019 | \n",
" P279 | \n",
" Q1183543 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 1 | \n",
" Q11019 | \n",
" P279 | \n",
" Q39546 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 2 | \n",
" Q1183543 | \n",
" P279 | \n",
" Q16686448 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 3 | \n",
" Q1183543 | \n",
" P279 | \n",
" Q16798631 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 4 | \n",
" Q1183543 | \n",
" P279 | \n",
" Q39546 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 5 | \n",
" Q1301433 | \n",
" P279 | \n",
" Q42889 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 6 | \n",
" Q1420 | \n",
" P279 | \n",
" Q752870 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 7 | \n",
" Q15401930 | \n",
" P279 | \n",
" Q488383 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 8 | \n",
" Q15618781 | \n",
" P279 | \n",
" Q1301433 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 9 | \n",
" Q16686448 | \n",
" P279 | \n",
" Q35120 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 10 | \n",
" Q16798631 | \n",
" P279 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 11 | \n",
" Q16798631 | \n",
" P279 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 12 | \n",
" Q223557 | \n",
" P279 | \n",
" Q4406616 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 13 | \n",
" Q2424752 | \n",
" P279 | \n",
" Q15401930 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 14 | \n",
" Q2424752 | \n",
" P279 | \n",
" Q28877 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 15 | \n",
" Q2424752 | \n",
" P279 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 16 | \n",
" Q28877 | \n",
" P279 | \n",
" Q337060 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 17 | \n",
" Q3245975 | \n",
" P279 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 18 | \n",
" Q337060 | \n",
" P279 | \n",
" Q223557 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 19 | \n",
" Q35825432 | \n",
" P279 | \n",
" Q35120 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 20 | \n",
" Q39546 | \n",
" P279 | \n",
" Q16798631 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 21 | \n",
" Q39546 | \n",
" P279 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 22 | \n",
" Q39546 | \n",
" P279 | \n",
" Q35825432 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 23 | \n",
" Q42889 | \n",
" P279 | \n",
" Q11019 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 24 | \n",
" Q42889 | \n",
" P279 | \n",
" Q3245975 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 25 | \n",
" Q42889 | \n",
" P279 | \n",
" Q987767 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 26 | \n",
" Q4406616 | \n",
" P279 | \n",
" Q488383 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 27 | \n",
" Q488383 | \n",
" P279 | \n",
" Q35120 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 28 | \n",
" Q6671777 | \n",
" P279 | \n",
" Q35120 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 29 | \n",
" Q752870 | \n",
" P279 | \n",
" Q15618781 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 30 | \n",
" Q8205328 | \n",
" P279 | \n",
" Q16686448 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 31 | \n",
" Q8205328 | \n",
" P279 | \n",
" Q223557 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 32 | \n",
" Q9158768 | \n",
" P279 | \n",
" Q6671777 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 33 | \n",
" Q987767 | \n",
" P279 | \n",
" Q39546 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 34 | \n",
" Q987767 | \n",
" P279 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 35 | \n",
" Q987767 | \n",
" P279 | \n",
" Q9158768 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label node2 graph edge_type\n",
"0 Q11019 P279 Q1183543 Q1420 superclass\n",
"1 Q11019 P279 Q39546 Q1420 superclass\n",
"2 Q1183543 P279 Q16686448 Q1420 superclass\n",
"3 Q1183543 P279 Q16798631 Q1420 superclass\n",
"4 Q1183543 P279 Q39546 Q1420 superclass\n",
"5 Q1301433 P279 Q42889 Q1420 superclass\n",
"6 Q1420 P279 Q752870 Q1420 superclass\n",
"7 Q15401930 P279 Q488383 Q1420 superclass\n",
"8 Q15618781 P279 Q1301433 Q1420 superclass\n",
"9 Q16686448 P279 Q35120 Q1420 superclass\n",
"10 Q16798631 P279 Q2424752 Q1420 superclass\n",
"11 Q16798631 P279 Q8205328 Q1420 superclass\n",
"12 Q223557 P279 Q4406616 Q1420 superclass\n",
"13 Q2424752 P279 Q15401930 Q1420 superclass\n",
"14 Q2424752 P279 Q28877 Q1420 superclass\n",
"15 Q2424752 P279 Q8205328 Q1420 superclass\n",
"16 Q28877 P279 Q337060 Q1420 superclass\n",
"17 Q3245975 P279 Q2424752 Q1420 superclass\n",
"18 Q337060 P279 Q223557 Q1420 superclass\n",
"19 Q35825432 P279 Q35120 Q1420 superclass\n",
"20 Q39546 P279 Q16798631 Q1420 superclass\n",
"21 Q39546 P279 Q2424752 Q1420 superclass\n",
"22 Q39546 P279 Q35825432 Q1420 superclass\n",
"23 Q42889 P279 Q11019 Q1420 superclass\n",
"24 Q42889 P279 Q3245975 Q1420 superclass\n",
"25 Q42889 P279 Q987767 Q1420 superclass\n",
"26 Q4406616 P279 Q488383 Q1420 superclass\n",
"27 Q488383 P279 Q35120 Q1420 superclass\n",
"28 Q6671777 P279 Q35120 Q1420 superclass\n",
"29 Q752870 P279 Q15618781 Q1420 superclass\n",
"30 Q8205328 P279 Q16686448 Q1420 superclass\n",
"31 Q8205328 P279 Q223557 Q1420 superclass\n",
"32 Q9158768 P279 Q6671777 Q1420 superclass\n",
"33 Q987767 P279 Q39546 Q1420 superclass\n",
"34 Q987767 P279 Q8205328 Q1420 superclass\n",
"35 Q987767 P279 Q9158768 Q1420 superclass"
]
},
"execution_count": 216,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"\"\"\n",
" query -i $TEMP/graph.low.tsv.gz\n",
" --match '(node)-[{graph: \"Q1420\"}]->()'\n",
" --order-by 'node'\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 217,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T08:30:07.928220Z",
"iopub.status.busy": "2021-12-31T08:30:07.927917Z",
"iopub.status.idle": "2021-12-31T08:30:29.665757Z",
"shell.execute_reply": "2021-12-31T08:30:29.665176Z",
"shell.execute_reply.started": "2021-12-31T08:30:07.928191Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" graph | \n",
" edge_type | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [node1, label, node2, graph, edge_type]\n",
"Index: []"
]
},
"execution_count": 217,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"\"\"\n",
" query -i $TEMP/all.graph.low.sub.tsv.gz\n",
" --match '(node)-[{graph: \"Q1420\"}]->()'\n",
" --order-by 'node'\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 218,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T08:30:29.668385Z",
"iopub.status.busy": "2021-12-31T08:30:29.668199Z",
"iopub.status.idle": "2021-12-31T08:32:04.988275Z",
"shell.execute_reply": "2021-12-31T08:32:04.987475Z",
"shell.execute_reply.started": "2021-12-31T08:30:29.668368Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" graph | \n",
" edge_type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q11019 | \n",
" P279 | \n",
" Q1183543 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 1 | \n",
" Q11019 | \n",
" P279 | \n",
" Q39546 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 2 | \n",
" Q1183543 | \n",
" P279 | \n",
" Q16686448 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 3 | \n",
" Q1183543 | \n",
" P279 | \n",
" Q16798631 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 4 | \n",
" Q1183543 | \n",
" P279 | \n",
" Q39546 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 5 | \n",
" Q1301433 | \n",
" P279 | \n",
" Q42889 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 6 | \n",
" Q1420 | \n",
" P279 | \n",
" Q752870 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 7 | \n",
" Q15401930 | \n",
" P279 | \n",
" Q488383 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 8 | \n",
" Q15618781 | \n",
" P279 | \n",
" Q1301433 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 9 | \n",
" Q16686448 | \n",
" P279 | \n",
" Q35120 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 10 | \n",
" Q16798631 | \n",
" P279 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 11 | \n",
" Q16798631 | \n",
" P279 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 12 | \n",
" Q223557 | \n",
" P279 | \n",
" Q4406616 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 13 | \n",
" Q2424752 | \n",
" P279 | \n",
" Q15401930 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 14 | \n",
" Q2424752 | \n",
" P279 | \n",
" Q28877 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 15 | \n",
" Q2424752 | \n",
" P279 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 16 | \n",
" Q28877 | \n",
" P279 | \n",
" Q337060 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 17 | \n",
" Q3245975 | \n",
" P279 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 18 | \n",
" Q337060 | \n",
" P279 | \n",
" Q223557 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 19 | \n",
" Q35825432 | \n",
" P279 | \n",
" Q35120 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 20 | \n",
" Q39546 | \n",
" P279 | \n",
" Q16798631 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 21 | \n",
" Q39546 | \n",
" P279 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 22 | \n",
" Q39546 | \n",
" P279 | \n",
" Q35825432 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 23 | \n",
" Q42889 | \n",
" P279 | \n",
" Q11019 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 24 | \n",
" Q42889 | \n",
" P279 | \n",
" Q3245975 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 25 | \n",
" Q42889 | \n",
" P279 | \n",
" Q987767 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 26 | \n",
" Q4406616 | \n",
" P279 | \n",
" Q488383 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 27 | \n",
" Q488383 | \n",
" P279 | \n",
" Q35120 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 28 | \n",
" Q6671777 | \n",
" P279 | \n",
" Q35120 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 29 | \n",
" Q752870 | \n",
" P279 | \n",
" Q15618781 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 30 | \n",
" Q8205328 | \n",
" P279 | \n",
" Q16686448 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 31 | \n",
" Q8205328 | \n",
" P279 | \n",
" Q223557 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 32 | \n",
" Q9158768 | \n",
" P279 | \n",
" Q6671777 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 33 | \n",
" Q987767 | \n",
" P279 | \n",
" Q39546 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 34 | \n",
" Q987767 | \n",
" P279 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
" 35 | \n",
" Q987767 | \n",
" P279 | \n",
" Q9158768 | \n",
" Q1420 | \n",
" superclass | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label node2 graph edge_type\n",
"0 Q11019 P279 Q1183543 Q1420 superclass\n",
"1 Q11019 P279 Q39546 Q1420 superclass\n",
"2 Q1183543 P279 Q16686448 Q1420 superclass\n",
"3 Q1183543 P279 Q16798631 Q1420 superclass\n",
"4 Q1183543 P279 Q39546 Q1420 superclass\n",
"5 Q1301433 P279 Q42889 Q1420 superclass\n",
"6 Q1420 P279 Q752870 Q1420 superclass\n",
"7 Q15401930 P279 Q488383 Q1420 superclass\n",
"8 Q15618781 P279 Q1301433 Q1420 superclass\n",
"9 Q16686448 P279 Q35120 Q1420 superclass\n",
"10 Q16798631 P279 Q2424752 Q1420 superclass\n",
"11 Q16798631 P279 Q8205328 Q1420 superclass\n",
"12 Q223557 P279 Q4406616 Q1420 superclass\n",
"13 Q2424752 P279 Q15401930 Q1420 superclass\n",
"14 Q2424752 P279 Q28877 Q1420 superclass\n",
"15 Q2424752 P279 Q8205328 Q1420 superclass\n",
"16 Q28877 P279 Q337060 Q1420 superclass\n",
"17 Q3245975 P279 Q2424752 Q1420 superclass\n",
"18 Q337060 P279 Q223557 Q1420 superclass\n",
"19 Q35825432 P279 Q35120 Q1420 superclass\n",
"20 Q39546 P279 Q16798631 Q1420 superclass\n",
"21 Q39546 P279 Q2424752 Q1420 superclass\n",
"22 Q39546 P279 Q35825432 Q1420 superclass\n",
"23 Q42889 P279 Q11019 Q1420 superclass\n",
"24 Q42889 P279 Q3245975 Q1420 superclass\n",
"25 Q42889 P279 Q987767 Q1420 superclass\n",
"26 Q4406616 P279 Q488383 Q1420 superclass\n",
"27 Q488383 P279 Q35120 Q1420 superclass\n",
"28 Q6671777 P279 Q35120 Q1420 superclass\n",
"29 Q752870 P279 Q15618781 Q1420 superclass\n",
"30 Q8205328 P279 Q16686448 Q1420 superclass\n",
"31 Q8205328 P279 Q223557 Q1420 superclass\n",
"32 Q9158768 P279 Q6671777 Q1420 superclass\n",
"33 Q987767 P279 Q39546 Q1420 superclass\n",
"34 Q987767 P279 Q8205328 Q1420 superclass\n",
"35 Q987767 P279 Q9158768 Q1420 superclass"
]
},
"execution_count": 218,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"\"\"\n",
" query -i $TEMP/all.graph.low.super.tsv.gz\n",
" --match '(node)-[{graph: \"Q1420\"}]->()'\n",
" --order-by 'node'\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 219,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-31T08:32:04.990543Z",
"iopub.status.busy": "2021-12-31T08:32:04.990275Z",
"iopub.status.idle": "2021-12-31T08:43:25.333732Z",
"shell.execute_reply": "2021-12-31T08:43:25.333041Z",
"shell.execute_reply.started": "2021-12-31T08:32:04.990525Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" graph | \n",
" instance_count | \n",
" node_type | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q11019 | \n",
" Q1420 | \n",
" 99 | \n",
" few_subclasses | \n",
" 'machine'@en | \n",
"
\n",
" \n",
" 1 | \n",
" Q1183543 | \n",
" Q1420 | \n",
" 198 | \n",
" many_subclasses | \n",
" 'device'@en | \n",
"
\n",
" \n",
" 2 | \n",
" Q1301433 | \n",
" Q1420 | \n",
" 17 | \n",
" few_subclasses | \n",
" 'land vehicle'@en | \n",
"
\n",
" \n",
" 3 | \n",
" Q1420 | \n",
" Q1420 | \n",
" 862 | \n",
" many_subclasses | \n",
" 'motor car'@en | \n",
"
\n",
" \n",
" 4 | \n",
" Q15401930 | \n",
" Q1420 | \n",
" 12 | \n",
" few_subclasses | \n",
" 'product'@en | \n",
"
\n",
" \n",
" 5 | \n",
" Q15618781 | \n",
" Q1420 | \n",
" 29 | \n",
" few_subclasses | \n",
" 'wheeled vehicle'@en | \n",
"
\n",
" \n",
" 6 | \n",
" Q16686448 | \n",
" Q1420 | \n",
" 24 | \n",
" few_subclasses | \n",
" 'artificial entity'@en | \n",
"
\n",
" \n",
" 7 | \n",
" Q16798631 | \n",
" Q1420 | \n",
" 389 | \n",
" few_subclasses | \n",
" 'equipment'@en | \n",
"
\n",
" \n",
" 8 | \n",
" Q223557 | \n",
" Q1420 | \n",
" 110 | \n",
" few_subclasses | \n",
" 'physical object'@en | \n",
"
\n",
" \n",
" 9 | \n",
" Q2424752 | \n",
" Q1420 | \n",
" 412 | \n",
" few_subclasses | \n",
" 'product'@en | \n",
"
\n",
" \n",
" 10 | \n",
" Q28877 | \n",
" Q1420 | \n",
" 11 | \n",
" few_subclasses | \n",
" 'goods'@en | \n",
"
\n",
" \n",
" 11 | \n",
" Q3245975 | \n",
" Q1420 | \n",
" 1 | \n",
" few_subclasses | \n",
" 'finished good'@en | \n",
"
\n",
" \n",
" 12 | \n",
" Q337060 | \n",
" Q1420 | \n",
" 38 | \n",
" few_subclasses | \n",
" 'perceptible object'@en | \n",
"
\n",
" \n",
" 13 | \n",
" Q35120 | \n",
" Q1420 | \n",
" 34 | \n",
" few_subclasses | \n",
" 'entity'@en | \n",
"
\n",
" \n",
" 14 | \n",
" Q35825432 | \n",
" Q1420 | \n",
" 2 | \n",
" few_subclasses | \n",
" 'converter'@en | \n",
"
\n",
" \n",
" 15 | \n",
" Q39546 | \n",
" Q1420 | \n",
" 1029 | \n",
" many_subclasses | \n",
" 'tool'@en | \n",
"
\n",
" \n",
" 16 | \n",
" Q42889 | \n",
" Q1420 | \n",
" 114 | \n",
" few_subclasses | \n",
" 'vehicle'@en | \n",
"
\n",
" \n",
" 17 | \n",
" Q4406616 | \n",
" Q1420 | \n",
" 322 | \n",
" few_subclasses | \n",
" 'concrete object'@en | \n",
"
\n",
" \n",
" 18 | \n",
" Q488383 | \n",
" Q1420 | \n",
" 701 | \n",
" few_subclasses | \n",
" 'object'@en | \n",
"
\n",
" \n",
" 19 | \n",
" Q6671777 | \n",
" Q1420 | \n",
" 29 | \n",
" few_subclasses | \n",
" 'structure'@en | \n",
"
\n",
" \n",
" 20 | \n",
" Q752870 | \n",
" Q1420 | \n",
" 8 | \n",
" few_subclasses | \n",
" 'motor vehicle'@en | \n",
"
\n",
" \n",
" 21 | \n",
" Q8205328 | \n",
" Q1420 | \n",
" 52 | \n",
" few_subclasses | \n",
" 'artificial physical object'@en | \n",
"
\n",
" \n",
" 22 | \n",
" Q9158768 | \n",
" Q1420 | \n",
" 63 | \n",
" few_subclasses | \n",
" 'storage'@en | \n",
"
\n",
" \n",
" 23 | \n",
" Q987767 | \n",
" Q1420 | \n",
" 282 | \n",
" few_subclasses | \n",
" 'container'@en | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 graph instance_count node_type \\\n",
"0 Q11019 Q1420 99 few_subclasses \n",
"1 Q1183543 Q1420 198 many_subclasses \n",
"2 Q1301433 Q1420 17 few_subclasses \n",
"3 Q1420 Q1420 862 many_subclasses \n",
"4 Q15401930 Q1420 12 few_subclasses \n",
"5 Q15618781 Q1420 29 few_subclasses \n",
"6 Q16686448 Q1420 24 few_subclasses \n",
"7 Q16798631 Q1420 389 few_subclasses \n",
"8 Q223557 Q1420 110 few_subclasses \n",
"9 Q2424752 Q1420 412 few_subclasses \n",
"10 Q28877 Q1420 11 few_subclasses \n",
"11 Q3245975 Q1420 1 few_subclasses \n",
"12 Q337060 Q1420 38 few_subclasses \n",
"13 Q35120 Q1420 34 few_subclasses \n",
"14 Q35825432 Q1420 2 few_subclasses \n",
"15 Q39546 Q1420 1029 many_subclasses \n",
"16 Q42889 Q1420 114 few_subclasses \n",
"17 Q4406616 Q1420 322 few_subclasses \n",
"18 Q488383 Q1420 701 few_subclasses \n",
"19 Q6671777 Q1420 29 few_subclasses \n",
"20 Q752870 Q1420 8 few_subclasses \n",
"21 Q8205328 Q1420 52 few_subclasses \n",
"22 Q9158768 Q1420 63 few_subclasses \n",
"23 Q987767 Q1420 282 few_subclasses \n",
"\n",
" label \n",
"0 'machine'@en \n",
"1 'device'@en \n",
"2 'land vehicle'@en \n",
"3 'motor car'@en \n",
"4 'product'@en \n",
"5 'wheeled vehicle'@en \n",
"6 'artificial entity'@en \n",
"7 'equipment'@en \n",
"8 'physical object'@en \n",
"9 'product'@en \n",
"10 'goods'@en \n",
"11 'finished good'@en \n",
"12 'perceptible object'@en \n",
"13 'entity'@en \n",
"14 'converter'@en \n",
"15 'tool'@en \n",
"16 'vehicle'@en \n",
"17 'concrete object'@en \n",
"18 'object'@en \n",
"19 'structure'@en \n",
"20 'motor vehicle'@en \n",
"21 'artificial physical object'@en \n",
"22 'storage'@en \n",
"23 'container'@en "
]
},
"execution_count": 219,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"\"\"\n",
" query -i $TEMP/graph.low.node.tsv.gz\n",
" --match '(node)-[{graph: \"Q1420\"}]->()'\n",
" --order-by 'node'\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"### In progress: Trim the subclasses based on the levels\n",
"\n",
"The idea is to also trim the graph based on the number of levels. This may be difficult, as I think some small graphs may have many levels, while other graphs may become large with just a small number of levels."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is our starting point:"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T14:17:24.935299Z",
"iopub.status.busy": "2021-12-29T14:17:24.935039Z",
"iopub.status.idle": "2021-12-29T14:17:27.057601Z",
"shell.execute_reply": "2021-12-29T14:17:27.056689Z",
"shell.execute_reply.started": "2021-12-29T14:17:24.935267Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" distance | \n",
" id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q100000030 | \n",
" 0 | \n",
" Q100000030-P279star-Q100000030 | \n",
"
\n",
" \n",
" 1 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q14748 | \n",
" 1 | \n",
" Q100000030-P279star-Q14748 | \n",
"
\n",
" \n",
" 2 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q14745 | \n",
" 2 | \n",
" Q100000030-P279star-Q14745 | \n",
"
\n",
" \n",
" 3 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q1357761 | \n",
" 3 | \n",
" Q100000030-P279star-Q1357761 | \n",
"
\n",
" \n",
" 4 | \n",
" Q100000030 | \n",
" P279star | \n",
" Q2424752 | \n",
" 3 | \n",
" Q100000030-P279star-Q2424752 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label node2 distance id\n",
"0 Q100000030 P279star Q100000030 0 Q100000030-P279star-Q100000030\n",
"1 Q100000030 P279star Q14748 1 Q100000030-P279star-Q14748\n",
"2 Q100000030 P279star Q14745 2 Q100000030-P279star-Q14745\n",
"3 Q100000030 P279star Q1357761 3 Q100000030-P279star-Q1357761\n",
"4 Q100000030 P279star Q2424752 3 Q100000030-P279star-Q2424752"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"head -i $OUT/derived.p279star.complete.tsv.gz -n 5\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's look at the distribution of distances: the number of edges at each distance, sorted by decreasing count."
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T14:17:27.059230Z",
"iopub.status.busy": "2021-12-29T14:17:27.058956Z",
"iopub.status.idle": "2021-12-29T14:18:51.300383Z",
"shell.execute_reply": "2021-12-29T14:18:51.299311Z",
"shell.execute_reply.started": "2021-12-29T14:17:27.059198Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" distance | \n",
" count | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6 | \n",
" 14920344 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" 12395081 | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" 12068280 | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" 11432165 | \n",
"
\n",
" \n",
" 4 | \n",
" 7 | \n",
" 8660425 | \n",
"
\n",
" \n",
" 5 | \n",
" 2 | \n",
" 6976960 | \n",
"
\n",
" \n",
" 6 | \n",
" 8 | \n",
" 6681393 | \n",
"
\n",
" \n",
" 7 | \n",
" 9 | \n",
" 4448827 | \n",
"
\n",
" \n",
" 8 | \n",
" 1 | \n",
" 3077658 | \n",
"
\n",
" \n",
" 9 | \n",
" 0 | \n",
" 2503943 | \n",
"
\n",
" \n",
" 10 | \n",
" 10 | \n",
" 1873495 | \n",
"
\n",
" \n",
" 11 | \n",
" 11 | \n",
" 1159780 | \n",
"
\n",
" \n",
" 12 | \n",
" 12 | \n",
" 781940 | \n",
"
\n",
" \n",
" 13 | \n",
" 13 | \n",
" 362901 | \n",
"
\n",
" \n",
" 14 | \n",
" 14 | \n",
" 216027 | \n",
"
\n",
" \n",
" 15 | \n",
" 15 | \n",
" 119855 | \n",
"
\n",
" \n",
" 16 | \n",
" 16 | \n",
" 55762 | \n",
"
\n",
" \n",
" 17 | \n",
" 17 | \n",
" 27343 | \n",
"
\n",
" \n",
" 18 | \n",
" 18 | \n",
" 12478 | \n",
"
\n",
" \n",
" 19 | \n",
" 19 | \n",
" 5166 | \n",
"
\n",
" \n",
" 20 | \n",
" 20 | \n",
" 2427 | \n",
"
\n",
" \n",
" 21 | \n",
" 21 | \n",
" 659 | \n",
"
\n",
" \n",
" 22 | \n",
" 22 | \n",
" 188 | \n",
"
\n",
" \n",
" 23 | \n",
" 23 | \n",
" 15 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" distance count\n",
"0 6 14920344\n",
"1 4 12395081\n",
"2 5 12068280\n",
"3 3 11432165\n",
"4 7 8660425\n",
"5 2 6976960\n",
"6 8 6681393\n",
"7 9 4448827\n",
"8 1 3077658\n",
"9 0 2503943\n",
"10 10 1873495\n",
"11 11 1159780\n",
"12 12 781940\n",
"13 13 362901\n",
"14 14 216027\n",
"15 15 119855\n",
"16 16 55762\n",
"17 17 27343\n",
"18 18 12478\n",
"19 19 5166\n",
"20 20 2427\n",
"21 21 659\n",
"22 22 188\n",
"23 23 15"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kgtk(\"\"\"\n",
" query -i p279starcomplete\n",
" --match '(class)-[eid {distance: d}]->(superclass)'\n",
" --return 'distinct d as distance, count(eid) as count'\n",
" --order-by 'cast(count, int) desc'\n",
"\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
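"Using the P279star data, count for each class its distinct subclasses (excluding the class itself) that lie within a maximum distance K. Note: this text and the output file name (`subclass.count.d10.tsv.gz`) suggest K=10, but the query below reads from `p279stard` rather than `p279starcomplete` and filters with `d < 9`; confirm the intended input and cutoff."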
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T14:18:51.303245Z",
"iopub.status.busy": "2021-12-29T14:18:51.302746Z",
"iopub.status.idle": "2021-12-29T14:20:52.297428Z",
"shell.execute_reply": "2021-12-29T14:20:52.296441Z",
"shell.execute_reply.started": "2021-12-29T14:18:51.303199Z"
},
"tags": []
},
"outputs": [],
"source": [
"kgtk(\"\"\"\n",
" query -i p279stard\n",
" --match '(subclass)-[eid {distance: d}]->(class)'\n",
" --return 'class as node1, \"Pcount_subclasses\" as label, count(distinct subclass) as node2'\n",
" --where 'subclass != class and d < 9'\n",
" --order-by 'cast(node2, int) desc'\n",
" -o $TEMP/subclass.count.d10.tsv.gz\n",
"\"\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`kgtk add-labels` takes a very long time to run, so we only add labels to the first 20 lines of the file, which is sorted by decreasing subclass count."
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-29T14:20:52.299701Z",
"iopub.status.busy": "2021-12-29T14:20:52.299403Z",
"iopub.status.idle": "2021-12-29T14:22:33.279199Z",
"shell.execute_reply": "2021-12-29T14:22:33.277680Z",
"shell.execute_reply.started": "2021-12-29T14:20:52.299672Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"zcat: error writing to output: Broken pipe\n",
"| node1 | label | node2 | node1;label |\n",
"| --------- | ----------------- | ------- | --------------------------- |\n",
"| Q35120 | Pcount_subclasses | 2366995 | 'entity'@en |\n",
"| Q99527517 | Pcount_subclasses | 1440970 | 'collection entity'@en |\n",
"| Q16887380 | Pcount_subclasses | 1326944 | 'group'@en |\n",
"| Q20937557 | Pcount_subclasses | 1255680 | 'series'@en |\n",
"| Q28813620 | Pcount_subclasses | 1226806 | 'set'@en |\n",
"| Q488383 | Pcount_subclasses | 1185270 | 'object'@en |\n",
"| Q4406616 | Pcount_subclasses | 1144700 | 'concrete object'@en |\n",
"| Q223557 | Pcount_subclasses | 1136457 | 'physical object'@en |\n",
"| Q6671777 | Pcount_subclasses | 1110651 | 'structure'@en |\n",
"| Q58415929 | Pcount_subclasses | 1091001 | 'spatio-temporal entity'@en |\n",
"| Q219858 | Pcount_subclasses | 1056942 | 'zone'@en |\n",
"| Q50365914 | Pcount_subclasses | 1056855 | 'biological region'@en |\n",
"| Q97669203 | Pcount_subclasses | 1007358 | 'molecular conformation'@en |\n",
"| Q15712714 | Pcount_subclasses | 1007317 | 'biomolecular structure'@en |\n",
"| Q3511065 | Pcount_subclasses | 1007234 | 'biological sequence'@en |\n",
"| Q7187 | Pcount_subclasses | 1004629 | 'gene'@en |\n",
"| Q3771876 | Pcount_subclasses | 1004622 | 'nucleic acid structure'@en |\n",
"| Q37500013 | Pcount_subclasses | 1004619 | 'primary structure'@en |\n",
"| Q863908 | Pcount_subclasses | 1004579 | 'nucleic acid sequence'@en |\n"
]
}
],
"source": [
"!zcat < $TEMP/subclass.count.d10.tsv.gz | head -20 | kgtk add-labels / table"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "kgtk",
"language": "python",
"name": "kgtk"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}