{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Class Visualization\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preamble: set up the environment and files used in the tutorial" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:15.019726Z", "iopub.status.busy": "2021-12-29T05:43:15.019511Z", "iopub.status.idle": "2021-12-29T05:43:17.301772Z", "shell.execute_reply": "2021-12-29T05:43:17.301144Z", "shell.execute_reply.started": "2021-12-29T05:43:15.019679Z" }, "tags": [] }, "outputs": [], "source": [ "import io\n", "import os\n", "import subprocess\n", "import sys\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from IPython.display import display, HTML\n", "\n", "from graph_tool.all import *\n", "\n", "import papermill as pm\n", "\n", "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", "from kgtk.functions import kgtk, kypher" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:17.302558Z", "iopub.status.busy": "2021-12-29T05:43:17.302429Z", "iopub.status.idle": "2021-12-29T05:43:17.305783Z", "shell.execute_reply": "2021-12-29T05:43:17.305151Z", "shell.execute_reply.started": "2021-12-29T05:43:17.302540Z" }, "tags": [ "parameters" ] }, "outputs": [], "source": [ "# Parameters\n", "\n", "kgtk_path = \"/Users/pedroszekely/Documents/GitHub/kgtk\"\n", "\n", "# Folder on local machine where to create the output and temporary folders\n", "input_path = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2/\"\n", "output_path = \"/Users/pedroszekely/Downloads/kypher/projects\"\n", "graph_cache_path = \"/Users/pedroszekely/Downloads/kypher/class-visualization.sqlite3.db\"\n", "project_name = \"class-visualization\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our Wikidata distribution partitions the knowledge in Wikidata into 
smaller files that make it possible for you to pick and choose which files you want to use. Our tutorial KG is a subset of Wikidata, and is partitioned in the same way as the full Wikidata. The following is a partial list of all the files:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:17.308422Z", "iopub.status.busy": "2021-12-29T05:43:17.308212Z", "iopub.status.idle": "2021-12-29T05:43:17.312824Z", "shell.execute_reply": "2021-12-29T05:43:17.312414Z", "shell.execute_reply.started": "2021-12-29T05:43:17.308399Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User home: /Users/pedroszekely\n", "Current dir: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/use-cases\n", "KGTK dir: /Users/pedroszekely/Documents/GitHub/kgtk\n", "Use-cases dir: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases\n" ] } ], "source": [ "files = [\n", " \"p279\",\n", " \"p279star\",\n", " \"label\"\n", "]\n", "\n", "# statistics.Pinstance_count.tsv.gz\n", "\n", "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", " output_path=output_path,\n", " graph_cache_path=graph_cache_path,\n", " project_name=project_name,\n", " debug=True\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The KGTK setup command defines environment variables for all the files so that you can reuse the Jupyter notebook when you install it on your local machine." 
] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:17.313894Z", "iopub.status.busy": "2021-12-29T05:43:17.313762Z", "iopub.status.idle": "2021-12-29T05:43:17.317074Z", "shell.execute_reply": "2021-12-29T05:43:17.316628Z", "shell.execute_reply.started": "2021-12-29T05:43:17.313876Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "USE_CASES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases\n", "kypher: kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/class-visualization.sqlite3.db\n", "GRAPH: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2/\n", "kgtk: kgtk --debug\n", "KGTK_GRAPH_CACHE: /Users/pedroszekely/Downloads/kypher/class-visualization.sqlite3.db\n", "EXAMPLES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/examples\n", "STORE: /Users/pedroszekely/Downloads/kypher/class-visualization.sqlite3.db\n", "KGTK_OPTION_DEBUG: false\n", "OUT: /Users/pedroszekely/Downloads/kypher/projects/class-visualization\n", "TEMP: /Users/pedroszekely/Downloads/kypher/projects/class-visualization/temp.class-visualization\n", "KGTK_LABEL_FILE: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//labels.en.tsv.gz\n", "p279: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//derived.P279.tsv.gz\n", "p279star: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//derived.P279star.tsv.gz\n", "label: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//labels.en.tsv.gz\n" ] } ], "source": [ "ck.print_env_variables()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:17.318022Z", "iopub.status.busy": "2021-12-29T05:43:17.317885Z", "iopub.status.idle": "2021-12-29T05:43:20.187984Z", "shell.execute_reply": "2021-12-29T05:43:20.187179Z", "shell.execute_reply.started": 
"2021-12-29T05:43:17.318005Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/class-visualization.sqlite3.db -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//derived.P279.tsv.gz\" --as p279 -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//derived.P279star.tsv.gz\" --as p279star -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//labels.en.tsv.gz\" --as label --limit 3\n", "[2021-12-28 21:43:20 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_1 AS graph_1_c1\n", " LIMIT ?\n", " PARAS: [3]\n", "---------------------------------------------\n", "id\tnode1\tlabel\tnode2\n", "Q100000030-P279-Q14748-30394205-0\tQ100000030\tP279\tQ14748\n", "Q100000058-P279-Q1622444-bd182663-0\tQ100000058\tP279\tQ1622444\n", "Q1000032-P279-Q1813494-0aa0f1dc-0\tQ1000032\tP279\tQ1813494\n" ] } ], "source": [ "ck.load_files_into_cache()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:20.189425Z", "iopub.status.busy": "2021-12-29T05:43:20.189217Z", "iopub.status.idle": "2021-12-29T05:43:23.099815Z", "shell.execute_reply": "2021-12-29T05:43:23.098839Z", "shell.execute_reply.started": "2021-12-29T05:43:20.189405Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-12-28 21:43:22 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_1 AS graph_1_c1\n", " LIMIT ?\n", " PARAS: [5]\n", "---------------------------------------------\n", "id\tnode1\tlabel\tnode2\n", "Q100000030-P279-Q14748-30394205-0\tQ100000030\tP279\tQ14748\n", "Q100000058-P279-Q1622444-bd182663-0\tQ100000058\tP279\tQ1622444\n", "Q1000032-P279-Q1813494-0aa0f1dc-0\tQ1000032\tP279\tQ1813494\n", 
"Q1000032-P279-Q83602-482a1943-0\tQ1000032\tP279\tQ83602\n", "Q1000039-P279-Q11555767-2dddfd86-0\tQ1000039\tP279\tQ11555767\n" ] } ], "source": [ "!kgtk --debug query -i p279 --idx mode:monograph --limit 5" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:23.101657Z", "iopub.status.busy": "2021-12-29T05:43:23.101339Z", "iopub.status.idle": "2021-12-29T05:43:25.461409Z", "shell.execute_reply": "2021-12-29T05:43:25.460655Z", "shell.execute_reply.started": "2021-12-29T05:43:23.101626Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-12-28 21:43:25 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_2 AS graph_2_c1\n", " LIMIT ?\n", " PARAS: [5]\n", "---------------------------------------------\n", "node1\tlabel\tnode2\tid\n", "Q100000030\tP279star\tQ100000030\tQ100000030-P279star-Q100000030-0000\n", "Q100000030\tP279star\tQ1357761\tQ100000030-P279star-Q1357761-0000\n", "Q100000030\tP279star\tQ14745\tQ100000030-P279star-Q14745-0000\n", "Q100000030\tP279star\tQ14748\tQ100000030-P279star-Q14748-0000\n", "Q100000030\tP279star\tQ15401930\tQ100000030-P279star-Q15401930-0000\n" ] } ], "source": [ "!kgtk --debug query -i p279star --idx mode:monograph --limit 5" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:25.463116Z", "iopub.status.busy": "2021-12-29T05:43:25.462861Z", "iopub.status.idle": "2021-12-29T05:43:28.104074Z", "shell.execute_reply": "2021-12-29T05:43:28.103009Z", "shell.execute_reply.started": "2021-12-29T05:43:25.463088Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-12-28 21:43:27 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_3 AS graph_3_c1\n", " LIMIT ?\n", " PARAS: [5]\n", "---------------------------------------------\n", 
"id\tnode1\tlabel\tnode2\n", "P10-label-en\tP10\tlabel\t'video'@en\n", "P1000-label-en\tP1000\tlabel\t'record held'@en\n", "P1001-label-en\tP1001\tlabel\t'applies to jurisdiction'@en\n", "P1002-label-en\tP1002\tlabel\t'engine configuration'@en\n", "P1003-label-en\tP1003\tlabel\t'National Library of Romania ID'@en\n" ] } ], "source": [ "!kgtk --debug query -i label --idx mode:monograph --limit 5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get a list of all the classes\n" ] }, { "cell_type": "markdown", "metadata": { "execution": { "iopub.execute_input": "2021-12-23T19:59:38.096353Z", "iopub.status.busy": "2021-12-23T19:59:38.096121Z", "iopub.status.idle": "2021-12-23T19:59:38.100393Z", "shell.execute_reply": "2021-12-23T19:59:38.099645Z", "shell.execute_reply.started": "2021-12-23T19:59:38.096330Z" } }, "source": [ "First get a list of all the `node1` values in p279, i.e., every class that is a subclass of some other class" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:28.106498Z", "iopub.status.busy": "2021-12-29T05:43:28.105980Z", "iopub.status.idle": "2021-12-29T05:43:36.573544Z", "shell.execute_reply": "2021-12-29T05:43:36.572740Z", "shell.execute_reply.started": "2021-12-29T05:43:28.106461Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i p279\n", " --match '(class)-[]->()'\n", " --return 'distinct class as id'\n", " -o $TEMP/p279.node1.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:36.575074Z", "iopub.status.busy": "2021-12-29T05:43:36.574825Z", "iopub.status.idle": "2021-12-29T05:43:36.819701Z", "shell.execute_reply": "2021-12-29T05:43:36.818712Z", "shell.execute_reply.started": "2021-12-29T05:43:36.575046Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2493245\n" ] } ], "source": [ "!zcat < $TEMP/p279.node1.tsv.gz | wc -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [
"Now get a list of all the `node2` values in p279, i.e., every class that has at least one subclass" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:36.821835Z", "iopub.status.busy": "2021-12-29T05:43:36.821560Z", "iopub.status.idle": "2021-12-29T05:43:39.656109Z", "shell.execute_reply": "2021-12-29T05:43:39.655342Z", "shell.execute_reply.started": "2021-12-29T05:43:36.821802Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i p279\n", " --match '()-[]->(class)'\n", " --return 'distinct class as id'\n", " -o $TEMP/p279.node2.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:39.661726Z", "iopub.status.busy": "2021-12-29T05:43:39.661485Z", "iopub.status.idle": "2021-12-29T05:43:39.797443Z", "shell.execute_reply": "2021-12-29T05:43:39.796756Z", "shell.execute_reply.started": "2021-12-29T05:43:39.661699Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 126327\n" ] } ], "source": [ "!zcat < $TEMP/p279.node2.tsv.gz | wc -l" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:39.798885Z", "iopub.status.busy": "2021-12-29T05:43:39.798598Z", "iopub.status.idle": "2021-12-29T05:43:45.898775Z", "shell.execute_reply": "2021-12-29T05:43:45.897950Z", "shell.execute_reply.started": "2021-12-29T05:43:39.798860Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " ifnotexists --mode NONE \n", " -i $TEMP/p279.node2.tsv.gz\n", " --filter-on $TEMP/p279.node1.tsv.gz\n", " --input-keys id\n", " --filter-keys id\n", " -o $TEMP/p279.classes-that-are-not-subclasses.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:45.900456Z", "iopub.status.busy": "2021-12-29T05:43:45.900068Z", "iopub.status.idle": "2021-12-29T05:43:46.032973Z", "shell.execute_reply":
"2021-12-29T05:43:46.032391Z", "shell.execute_reply.started": "2021-12-29T05:43:45.900426Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 10700\n" ] } ], "source": [ "!zcat < $TEMP/p279.classes-that-are-not-subclasses.tsv.gz | wc -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate the `node1` list with the `node2` classes that never appear as `node1` (computed above with `ifnotexists`) to get a list of all the classes" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:46.034585Z", "iopub.status.busy": "2021-12-29T05:43:46.034288Z", "iopub.status.idle": "2021-12-29T05:43:52.926892Z", "shell.execute_reply": "2021-12-29T05:43:52.926055Z", "shell.execute_reply.started": "2021-12-29T05:43:46.034566Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " cat --mode NONE -i $TEMP/p279.node1.tsv.gz -i $TEMP/p279.classes-that-are-not-subclasses.tsv.gz\n", " / sort --mode NONE --column id\n", " -o $OUT/classes.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:52.928636Z", "iopub.status.busy": "2021-12-29T05:43:52.928389Z", "iopub.status.idle": "2021-12-29T05:43:53.110141Z", "shell.execute_reply": "2021-12-29T05:43:53.109140Z", "shell.execute_reply.started": "2021-12-29T05:43:52.928605Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2503944\n" ] } ], "source": [ "!zcat < $OUT/classes.tsv.gz | wc -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Measure the degree of classes" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:43:53.112571Z", "iopub.status.busy": "2021-12-29T05:43:53.111921Z", "iopub.status.idle": "2021-12-29T05:45:05.983870Z", "shell.execute_reply": "2021-12-29T05:45:05.983073Z", "shell.execute_reply.started": "2021-12-29T05:43:53.112538Z" } }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " 
graph-statistics -i \"$p279\" -o $OUT/statistics.p279.tsv.gz \n", " --compute-pagerank False \n", " --compute-hits False \n", " --page-rank-property Pdirected_pagerank \n", " --vertex-in-degree-property Pindegree\n", " --vertex-out-degree-property Poutdegree\n", " --output-degrees True \n", " --output-pagerank False \n", " --output-hits False \\\n", " --output-statistics-only \n", " --undirected False \n", " --log-file $TEMP/statistics.summary.txt\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:45:05.985638Z", "iopub.status.busy": "2021-12-29T05:45:05.985353Z", "iopub.status.idle": "2021-12-29T05:45:15.728202Z", "shell.execute_reply": "2021-12-29T05:45:15.727311Z", "shell.execute_reply.started": "2021-12-29T05:45:05.985611Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"sort -i $OUT/statistics.p279.tsv.gz --columns node2 --numeric --reverse -o $TEMP.p279.indegree.tsv.gz\")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:45:15.729799Z", "iopub.status.busy": "2021-12-29T05:45:15.729515Z", "iopub.status.idle": "2021-12-29T05:47:08.424492Z", "shell.execute_reply": "2021-12-29T05:47:08.423765Z", "shell.execute_reply.started": "2021-12-29T05:45:15.729770Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2idnode1;label
0Q20747295Pindegree942004Q20747295-Pindegree-19626'protein-coding gene'@en
1Q8054Pindegree764038Q8054-Pindegree-15274'protein'@en
2Q7187Pindegree449619Q7187-Pindegree-5566'gene'@en
3Q277338Pindegree49936Q277338-Pindegree-220748'pseudogene'@en
4Q427087Pindegree47843Q427087-Pindegree-197396'non-coding RNA'@en
5Q382617Pindegree40184Q382617-Pindegree-45664'mayor of a place in France'@en
6Q15113603Pindegree40179Q15113603-Pindegree-197900'municipal councillor'@en
7Q11173Pindegree14255Q11173-Pindegree-638'chemical compound'@en
8Q64698614Pindegree8832Q64698614-Pindegree-2767278'pseudogenic transcript'@en
9Q201448Pindegree8724Q201448-Pindegree-278588'transfer RNA'@en
10Q5663900Pindegree8011Q5663900-Pindegree-46632'mayor of a place in Spain'@en
11Q11436Pindegree4486Q11436-Pindegree-620'aircraft'@en
12Q284416Pindegree3392Q284416-Pindegree-280684'small nucleolar RNA'@en
13Q99762605Pindegree3186Q99762605-Pindegree-588940'mayor of a place in Romania'@en
14Q2095Pindegree2837Q2095-Pindegree-1938'food'@en
15Q20650761Pindegree2459Q20650761-Pindegree-5618'tender locomotive'@en
16Q1125341Pindegree2379Q1125341-Pindegree-26474'Italian wine'@en
17Q30185Pindegree2350Q30185-Pindegree-1190'mayor'@en
18Q483373Pindegree2063Q483373-Pindegree-2144'electric multiple unit'@en
19Q200779Pindegree2027Q200779-Pindegree-24990'genetic disease'@en
20Q215980Pindegree1843Q215980-Pindegree-113320'ribosomal RNA'@en
21Q17517Pindegree1755Q17517-Pindegree-24936'mobile phone'@en
22Q13219666Pindegree1748Q13219666-Pindegree-23322'tennis tournament'@en
23Q2449730Pindegree1739Q2449730-Pindegree-16150'transport protein'@en
24Q785745Pindegree1536Q785745-Pindegree-5976'tank locomotive'@en
\n", "
" ], "text/plain": [ " node1 label node2 id \\\n", "0 Q20747295 Pindegree 942004 Q20747295-Pindegree-19626 \n", "1 Q8054 Pindegree 764038 Q8054-Pindegree-15274 \n", "2 Q7187 Pindegree 449619 Q7187-Pindegree-5566 \n", "3 Q277338 Pindegree 49936 Q277338-Pindegree-220748 \n", "4 Q427087 Pindegree 47843 Q427087-Pindegree-197396 \n", "5 Q382617 Pindegree 40184 Q382617-Pindegree-45664 \n", "6 Q15113603 Pindegree 40179 Q15113603-Pindegree-197900 \n", "7 Q11173 Pindegree 14255 Q11173-Pindegree-638 \n", "8 Q64698614 Pindegree 8832 Q64698614-Pindegree-2767278 \n", "9 Q201448 Pindegree 8724 Q201448-Pindegree-278588 \n", "10 Q5663900 Pindegree 8011 Q5663900-Pindegree-46632 \n", "11 Q11436 Pindegree 4486 Q11436-Pindegree-620 \n", "12 Q284416 Pindegree 3392 Q284416-Pindegree-280684 \n", "13 Q99762605 Pindegree 3186 Q99762605-Pindegree-588940 \n", "14 Q2095 Pindegree 2837 Q2095-Pindegree-1938 \n", "15 Q20650761 Pindegree 2459 Q20650761-Pindegree-5618 \n", "16 Q1125341 Pindegree 2379 Q1125341-Pindegree-26474 \n", "17 Q30185 Pindegree 2350 Q30185-Pindegree-1190 \n", "18 Q483373 Pindegree 2063 Q483373-Pindegree-2144 \n", "19 Q200779 Pindegree 2027 Q200779-Pindegree-24990 \n", "20 Q215980 Pindegree 1843 Q215980-Pindegree-113320 \n", "21 Q17517 Pindegree 1755 Q17517-Pindegree-24936 \n", "22 Q13219666 Pindegree 1748 Q13219666-Pindegree-23322 \n", "23 Q2449730 Pindegree 1739 Q2449730-Pindegree-16150 \n", "24 Q785745 Pindegree 1536 Q785745-Pindegree-5976 \n", "\n", " node1;label \n", "0 'protein-coding gene'@en \n", "1 'protein'@en \n", "2 'gene'@en \n", "3 'pseudogene'@en \n", "4 'non-coding RNA'@en \n", "5 'mayor of a place in France'@en \n", "6 'municipal councillor'@en \n", "7 'chemical compound'@en \n", "8 'pseudogenic transcript'@en \n", "9 'transfer RNA'@en \n", "10 'mayor of a place in Spain'@en \n", "11 'aircraft'@en \n", "12 'small nucleolar RNA'@en \n", "13 'mayor of a place in Romania'@en \n", "14 'food'@en \n", "15 'tender locomotive'@en \n", "16 'Italian wine'@en \n", "17 
'mayor'@en \n", "18 'electric multiple unit'@en \n", "19 'genetic disease'@en \n", "20 'ribosomal RNA'@en \n", "21 'mobile phone'@en \n", "22 'tennis tournament'@en \n", "23 'transport protein'@en \n", "24 'tank locomotive'@en " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -i $TEMP.p279.indegree.tsv.gz -n 25 / add-labels\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:47:08.425868Z", "iopub.status.busy": "2021-12-29T05:47:08.425574Z", "iopub.status.idle": "2021-12-29T05:47:36.770575Z", "shell.execute_reply": "2021-12-29T05:47:36.769788Z", "shell.execute_reply.started": "2021-12-29T05:47:08.425840Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2id
0Q20747295Pindegree942004Q20747295-Pindegree-19626
1Q8054Pindegree764038Q8054-Pindegree-15274
2Q7187Pindegree449619Q7187-Pindegree-5566
3Q277338Pindegree49936Q277338-Pindegree-220748
4Q427087Pindegree47843Q427087-Pindegree-197396
...............
63Q1183543Pindegree517Q1183543-Pindegree-330
64Q7368Pindegree516Q7368-Pindegree-2150
65Q11415564Pindegree512Q11415564-Pindegree-656
66Q87008012Pindegree501Q87008012-Pindegree-5226
67Q62927Pindegree501Q62927-Pindegree-26744
\n", "

68 rows × 4 columns

\n", "
" ], "text/plain": [ " node1 label node2 id\n", "0 Q20747295 Pindegree 942004 Q20747295-Pindegree-19626\n", "1 Q8054 Pindegree 764038 Q8054-Pindegree-15274\n", "2 Q7187 Pindegree 449619 Q7187-Pindegree-5566\n", "3 Q277338 Pindegree 49936 Q277338-Pindegree-220748\n", "4 Q427087 Pindegree 47843 Q427087-Pindegree-197396\n", ".. ... ... ... ...\n", "63 Q1183543 Pindegree 517 Q1183543-Pindegree-330\n", "64 Q7368 Pindegree 516 Q7368-Pindegree-2150\n", "65 Q11415564 Pindegree 512 Q11415564-Pindegree-656\n", "66 Q87008012 Pindegree 501 Q87008012-Pindegree-5226\n", "67 Q62927 Pindegree 501 Q62927-Pindegree-26744\n", "\n", "[68 rows x 4 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $OUT/statistics.p279.tsv.gz \n", " --match '(n1)-[eid]->(degree)' \n", " --where 'cast(degree, int) > 500' \n", " --order-by 'cast(degree, int) desc'\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create list of high and low `P279` degree classes " ] }, { "cell_type": "code", "execution_count": 182, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T03:09:08.111294Z", "iopub.status.busy": "2021-12-31T03:09:08.110981Z", "iopub.status.idle": "2021-12-31T03:09:14.055689Z", "shell.execute_reply": "2021-12-31T03:09:14.054890Z", "shell.execute_reply.started": "2021-12-31T03:09:08.111253Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i $OUT/statistics.p279.tsv.gz \n", " --match '(n1)-[:Pindegree]->(degree)' \n", " --where 'cast(degree, int) < 500' \n", " --return 'n1 as node1, \"few_subclasses\" as node_type'\n", " --order-by 'cast(degree, int) desc'\n", " -o $OUT/class-browsing.low-degree-nodes.tsv\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `class-browsing.low-degree-nodes.tsv` is simply a list of nodes:" ] }, { "cell_type": "code", "execution_count": 183, "metadata": { "execution": { "iopub.execute_input": 
"2021-12-31T03:09:14.057768Z", "iopub.status.busy": "2021-12-31T03:09:14.057544Z", "iopub.status.idle": "2021-12-31T03:09:16.630284Z", "shell.execute_reply": "2021-12-31T03:09:16.629376Z", "shell.execute_reply.started": "2021-12-31T03:09:14.057738Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1
0Q898273
1Q1002954
2Q11446
3Q22325163
4Q79529
\n", "
" ], "text/plain": [ " node1\n", "0 Q898273\n", "1 Q1002954\n", "2 Q11446\n", "3 Q22325163\n", "4 Q79529" ] }, "execution_count": 183, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -n 5 -i $OUT/class-browsing.low-degree-nodes.tsv\")" ] }, { "cell_type": "code", "execution_count": 184, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T03:09:16.631813Z", "iopub.status.busy": "2021-12-31T03:09:16.631562Z", "iopub.status.idle": "2021-12-31T03:09:19.487327Z", "shell.execute_reply": "2021-12-31T03:09:19.486446Z", "shell.execute_reply.started": "2021-12-31T03:09:16.631784Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i $OUT/statistics.p279.tsv.gz \n", " --match '(n1)-[:Pindegree]->(degree)' \n", " --where 'cast(degree, int) > 499'\n", " --return 'n1 as node1, \"many_subclasses\" as node_type'\n", " --order-by 'cast(degree, int) desc'\n", " -o $OUT/class-browsing.high-degree-nodes.tsv\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 189, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T03:17:03.265480Z", "iopub.status.busy": "2021-12-31T03:17:03.265178Z", "iopub.status.idle": "2021-12-31T03:17:06.304141Z", "shell.execute_reply": "2021-12-31T03:17:06.303177Z", "shell.execute_reply.started": "2021-12-31T03:17:03.265449Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " cat --use-graph-cache-envar False --mode NONE -i $OUT/class-browsing.low-degree-nodes.tsv -i $OUT/class-browsing.high-degree-nodes.tsv\n", " -o $OUT/class-browsing.all-nodes.tsv\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 190, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T03:17:34.320154Z", "iopub.status.busy": "2021-12-31T03:17:34.319885Z", "iopub.status.idle": "2021-12-31T03:17:36.605149Z", "shell.execute_reply": "2021-12-31T03:17:36.604366Z", 
"shell.execute_reply.started": "2021-12-31T03:17:34.320125Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node_type
0Q898273few_subclasses
1Q1002954few_subclasses
2Q11446few_subclasses
3Q22325163few_subclasses
\n", "
" ], "text/plain": [ " node1 node_type\n", "0 Q898273 few_subclasses\n", "1 Q1002954 few_subclasses\n", "2 Q11446 few_subclasses\n", "3 Q22325163 few_subclasses" ] }, "execution_count": 190, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -i $OUT/class-browsing.all-nodes.tsv -n 4\")" ] }, { "cell_type": "code", "execution_count": 191, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T03:18:34.732099Z", "iopub.status.busy": "2021-12-31T03:18:34.731864Z", "iopub.status.idle": "2021-12-31T03:18:43.863929Z", "shell.execute_reply": "2021-12-31T03:18:43.863301Z", "shell.execute_reply.started": "2021-12-31T03:18:34.732078Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-12-30 19:18:36 sqlstore]: IMPORT graph directly into table graph_43 from /Users/pedroszekely/Downloads/kypher/projects/class-visualization/class-browsing.all-nodes.tsv ...\n", "[2021-12-30 19:18:42 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_43 AS graph_43_c1\n", " LIMIT ?\n", " PARAS: [3]\n", "---------------------------------------------\n", "[2021-12-30 19:18:42 sqlstore]: CREATE INDEX \"graph_43_node1_node_type_idx\" ON \"graph_43\" (\"node1\", \"node_type\")\n", "[2021-12-30 19:18:43 sqlstore]: ANALYZE \"graph_43_node1_node_type_idx\"\n", "node1\tnode_type\n", "Q898273\tfew_subclasses\n", "Q1002954\tfew_subclasses\n", "Q11446\tfew_subclasses\n" ] } ], "source": [ "!kgtk --debug query -i $OUT/class-browsing.all-nodes.tsv --as browsernodes --idx index:node1,node_type --limit 3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create a P279star file that we will use for visualization.\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### First create a complete p279star file containing all classes\n", "\n", "First create a complete P279star file that contains all classes as our starting point. 
We do this because in the browser, users can click on any class." ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T05:47:50.803142Z", "iopub.status.busy": "2021-12-29T05:47:50.802648Z", "iopub.status.idle": "2021-12-29T06:57:29.672950Z", "shell.execute_reply": "2021-12-29T06:57:29.672140Z", "shell.execute_reply.started": "2021-12-29T05:47:50.803099Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " reachable-nodes\n", " --rootfile $OUT/classes.tsv.gz\n", " --selflink \n", " --breadth-first True\n", " --show-distance True\n", " --label P279star\n", " -i \"$p279\"\n", " -o $TEMP/derived.p279star.complete.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T06:57:29.674498Z", "iopub.status.busy": "2021-12-29T06:57:29.674193Z", "iopub.status.idle": "2021-12-29T06:57:32.581005Z", "shell.execute_reply": "2021-12-29T06:57:32.580365Z", "shell.execute_reply.started": "2021-12-29T06:57:29.674466Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2distance
0Q100000030P279starQ1000000300
1Q100000030P279starQ147481
2Q100000030P279starQ147452
3Q100000030P279starQ13577613
4Q100000030P279starQ24247523
5Q100000030P279starQ318077463
6Q100000030P279starQ82053283
7Q100000030P279starQ2235574
8Q100000030P279starQ154019304
9Q100000030P279starQ288774
\n", "
" ], "text/plain": [ " node1 label node2 distance\n", "0 Q100000030 P279star Q100000030 0\n", "1 Q100000030 P279star Q14748 1\n", "2 Q100000030 P279star Q14745 2\n", "3 Q100000030 P279star Q1357761 3\n", "4 Q100000030 P279star Q2424752 3\n", "5 Q100000030 P279star Q31807746 3\n", "6 Q100000030 P279star Q8205328 3\n", "7 Q100000030 P279star Q223557 4\n", "8 Q100000030 P279star Q15401930 4\n", "9 Q100000030 P279star Q28877 4" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -i $TEMP/derived.p279star.complete.tsv.gz -n 10\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The complete p279star file has only a few more edges than the default one. We should replace the original one with the complete one in any case." ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T06:57:32.582753Z", "iopub.status.busy": "2021-12-29T06:57:32.582281Z", "iopub.status.idle": "2021-12-29T06:58:36.326425Z", "shell.execute_reply": "2021-12-29T06:58:36.325226Z", "shell.execute_reply.started": "2021-12-29T06:57:32.582726Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 87773437\n" ] } ], "source": [ "!zcat < \"$p279star\" | wc -l" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T06:58:36.328860Z", "iopub.status.busy": "2021-12-29T06:58:36.328552Z", "iopub.status.idle": "2021-12-29T06:58:39.481549Z", "shell.execute_reply": "2021-12-29T06:58:39.480285Z", "shell.execute_reply.started": "2021-12-29T06:58:36.328831Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 87783113\n" ] } ], "source": [ "!zcat < $TEMP/derived.p279star.complete.tsv.gz | wc -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Add ids and index for use in queries. 
The new file has a distance column, which we index too so that we can do index queries quickly." ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T06:58:39.483647Z", "iopub.status.busy": "2021-12-29T06:58:39.483339Z", "iopub.status.idle": "2021-12-29T07:14:43.442208Z", "shell.execute_reply": "2021-12-29T07:14:43.441420Z", "shell.execute_reply.started": "2021-12-29T06:58:39.483617Z" } }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " add-id --id-style wikidata -i $TEMP/derived.p279star.complete.tsv.gz\n", " -o $OUT/derived.p279star.complete.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:14:43.444107Z", "iopub.status.busy": "2021-12-29T07:14:43.443770Z", "iopub.status.idle": "2021-12-29T07:25:14.256091Z", "shell.execute_reply": "2021-12-29T07:25:14.255260Z", "shell.execute_reply.started": "2021-12-29T07:14:43.444076Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-12-28 23:14:45 sqlstore]: DROP graph data table graph_5 from p279stard\n", "[2021-12-28 23:16:30 sqlstore]: IMPORT graph directly into table graph_28 from /Users/pedroszekely/Downloads/kypher/projects/class-visualization/derived.p279star.complete.tsv.gz ...\n", "[2021-12-28 23:22:35 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_28 AS graph_28_c1\n", " LIMIT ?\n", " PARAS: [3]\n", "---------------------------------------------\n", "[2021-12-28 23:22:35 sqlstore]: CREATE INDEX \"graph_28_node2_node1_distance_idx\" ON \"graph_28\" (\"node2\", \"node1\", \"distance\")\n", "[2021-12-28 23:25:02 sqlstore]: ANALYZE \"graph_28_node2_node1_distance_idx\"\n", "node1\tlabel\tnode2\tdistance\tid\n", "Q100000030\tP279star\tQ100000030\t0\tQ100000030-P279star-Q100000030\n", "Q100000030\tP279star\tQ14748\t1\tQ100000030-P279star-Q14748\n", 
"Q100000030\tP279star\tQ14745\t2\tQ100000030-P279star-Q14745\n" ] } ], "source": [ "!kgtk --debug query -i $OUT/derived.p279star.complete.tsv.gz --as p279stard --idx index:node2,node1,distance --limit 3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Count the number of subclasses \n", "We eventually want to build the subclass graph for each class, but some may be too large." ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:25:14.257723Z", "iopub.status.busy": "2021-12-29T07:25:14.257496Z", "iopub.status.idle": "2021-12-29T07:27:12.034978Z", "shell.execute_reply": "2021-12-29T07:27:12.033921Z", "shell.execute_reply.started": "2021-12-29T07:25:14.257696Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i p279starcomplete\n", " --match '\n", " (subclass)-[]->(class)'\n", " --return 'class as node1, \"Pcount_subclasses\" as label, count(distinct subclass) as node2, class as graph'\n", " --where 'subclass != class'\n", " --order-by 'cast(node2, int) desc'\n", " -o $TEMP/subclass.count.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get an overview of the file. The top classes have an enormous number of subclasses, which will cause trouble for visualization.\n", "Also, only 126K classes have subclasses, so there are a lot of leaf classes in Wikidata.\n", "\n", "In the steps below we exclude the high degree classes, but that won't fix the problem as the top classes have too many subclasses anyway. Sigh. The browser will freeze and the user will be annoyed."
] }, { "cell_type": "code", "execution_count": 31, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:27:12.036779Z", "iopub.status.busy": "2021-12-29T07:27:12.036491Z", "iopub.status.idle": "2021-12-29T07:28:57.065443Z", "shell.execute_reply": "2021-12-29T07:28:57.064720Z", "shell.execute_reply.started": "2021-12-29T07:27:12.036749Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2graphnode1;labelgraph;label
0Q35120Pcount_subclasses2461204Q35120'entity'@en'entity'@en
1Q99527517Pcount_subclasses2254394Q99527517'collection entity'@en'collection entity'@en
2Q28813620Pcount_subclasses1362927Q28813620'set'@en'set'@en
3Q16887380Pcount_subclasses1362452Q16887380'group'@en'group'@en
4Q488383Pcount_subclasses1286223Q488383'object'@en'object'@en
.....................
126319Q99970237Pcount_subclasses1Q99970237'anthropomorphic deer'@en'anthropomorphic deer'@en
126320Q99971015Pcount_subclasses1Q99971015'anthropomorphic cow or other cattle'@en'anthropomorphic cow or other cattle'@en
126321Q99972330Pcount_subclasses1Q99972330'video game occupation'@en'video game occupation'@en
126322Q99974769Pcount_subclasses1Q99974769NaNNaN
126323Q999897Pcount_subclasses1Q999897'middle management'@en'middle management'@en
\n", "

126324 rows × 6 columns

\n", "
" ], "text/plain": [ " node1 label node2 graph \\\n", "0 Q35120 Pcount_subclasses 2461204 Q35120 \n", "1 Q99527517 Pcount_subclasses 2254394 Q99527517 \n", "2 Q28813620 Pcount_subclasses 1362927 Q28813620 \n", "3 Q16887380 Pcount_subclasses 1362452 Q16887380 \n", "4 Q488383 Pcount_subclasses 1286223 Q488383 \n", "... ... ... ... ... \n", "126319 Q99970237 Pcount_subclasses 1 Q99970237 \n", "126320 Q99971015 Pcount_subclasses 1 Q99971015 \n", "126321 Q99972330 Pcount_subclasses 1 Q99972330 \n", "126322 Q99974769 Pcount_subclasses 1 Q99974769 \n", "126323 Q999897 Pcount_subclasses 1 Q999897 \n", "\n", " node1;label \\\n", "0 'entity'@en \n", "1 'collection entity'@en \n", "2 'set'@en \n", "3 'group'@en \n", "4 'object'@en \n", "... ... \n", "126319 'anthropomorphic deer'@en \n", "126320 'anthropomorphic cow or other cattle'@en \n", "126321 'video game occupation'@en \n", "126322 NaN \n", "126323 'middle management'@en \n", "\n", " graph;label \n", "0 'entity'@en \n", "1 'collection entity'@en \n", "2 'set'@en \n", "3 'group'@en \n", "4 'object'@en \n", "... ... \n", "126319 'anthropomorphic deer'@en \n", "126320 'anthropomorphic cow or other cattle'@en \n", "126321 'video game occupation'@en \n", "126322 NaN \n", "126323 'middle management'@en \n", "\n", "[126324 rows x 6 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = kgtk(\"\"\"\n", " cat -i $TEMP/subclass.count.tsv.gz / add-labels\n", "\"\"\")\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a subset of p279 that excludes high in-degree classes in node2\n", "\n", "File `class-browsing.low-degree-nodes.tsv` lists the classes with a low number of subclasses, which we call the low degree nodes. Our low degree P279 file will have all P279 edges that arrive at a low degree class, i.e., where the superclass is a low degree class."
] }, { "cell_type": "code", "execution_count": 32, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:28:57.067028Z", "iopub.status.busy": "2021-12-29T07:28:57.066692Z", "iopub.status.idle": "2021-12-29T07:29:12.667171Z", "shell.execute_reply": "2021-12-29T07:29:12.666615Z", "shell.execute_reply.started": "2021-12-29T07:28:57.067000Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i p279 -i $OUT/class-browsing.low-degree-nodes.tsv\n", " --match '\n", " p279: (class)-[eid]->(superclass),\n", " low: (superclass)'\n", " --return 'class as node1, eid.label as label, superclass as node2, eid as id'\n", " -o $OUT/p279.lowdegree.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:29:12.668213Z", "iopub.status.busy": "2021-12-29T07:29:12.668090Z", "iopub.status.idle": "2021-12-29T07:29:17.950168Z", "shell.execute_reply": "2021-12-29T07:29:17.949332Z", "shell.execute_reply.started": "2021-12-29T07:29:12.668196Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 3077832\n" ] } ], "source": [ "!zcat < \"$p279\" | wc -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The low degree P279 file has many fewer edges, which is expected as the high degree classes account for a lot of edges." 
] }, { "cell_type": "code", "execution_count": 34, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:29:17.951851Z", "iopub.status.busy": "2021-12-29T07:29:17.951555Z", "iopub.status.idle": "2021-12-29T07:29:18.205220Z", "shell.execute_reply": "2021-12-29T07:29:18.204410Z", "shell.execute_reply.started": "2021-12-29T07:29:17.951822Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 633444\n" ] } ], "source": [ "!zcat < $OUT/p279.lowdegree.tsv.gz | wc -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Recompute P279star with the low degree classes\n", "The output will be `derived.p279star.low-degree.complete.tsv.gz`\n", "\n", "We start at all classes, and find all superclasses for them, excluding the high degree classes." ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:29:18.207450Z", "iopub.status.busy": "2021-12-29T07:29:18.207201Z", "iopub.status.idle": "2021-12-29T07:35:39.186557Z", "shell.execute_reply": "2021-12-29T07:35:39.185809Z", "shell.execute_reply.started": "2021-12-29T07:29:18.207425Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " reachable-nodes\n", " --rootfile $OUT/classes.tsv.gz\n", " --selflink \n", " --breadth-first True\n", " --show-distance True\n", " --label P279star\n", " -i $OUT/p279.lowdegree.tsv.gz\n", " -o $TEMP/derived.p279star.low-degree.complete.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Add ids" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:35:39.188259Z", "iopub.status.busy": "2021-12-29T07:35:39.187991Z", "iopub.status.idle": "2021-12-29T07:38:20.061963Z", "shell.execute_reply": "2021-12-29T07:38:20.061099Z", "shell.execute_reply.started": "2021-12-29T07:35:39.188231Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " add-id --id-style wikidata -i 
$TEMP/derived.p279star.low-degree.complete.tsv.gz\n", " -o $OUT/derived.p279star.low-degree.complete.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Index using node1, node2 and distance. I wonder if we should also index the id column?" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:38:20.063686Z", "iopub.status.busy": "2021-12-29T07:38:20.063392Z", "iopub.status.idle": "2021-12-29T07:40:26.966537Z", "shell.execute_reply": "2021-12-29T07:40:26.965978Z", "shell.execute_reply.started": "2021-12-29T07:38:20.063658Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-12-28 23:38:22 sqlstore]: DROP graph data table graph_11 from p279starlow\n", "[2021-12-28 23:38:57 sqlstore]: IMPORT graph directly into table graph_30 from /Users/pedroszekely/Downloads/kypher/projects/class-visualization/derived.p279star.low-degree.complete.tsv.gz ...\n", "[2021-12-28 23:40:01 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_30 AS graph_30_c1\n", " LIMIT ?\n", " PARAS: [3]\n", "---------------------------------------------\n", "[2021-12-28 23:40:01 sqlstore]: CREATE INDEX \"graph_30_node2_node1_distance_idx\" ON \"graph_30\" (\"node2\", \"node1\", \"distance\")\n", "[2021-12-28 23:40:24 sqlstore]: ANALYZE \"graph_30_node2_node1_distance_idx\"\n", "node1\tlabel\tnode2\tdistance\tid\n", "Q100000030\tP279star\tQ100000030\t0\tQ100000030-P279star-Q100000030\n", "Q100000030\tP279star\tQ14748\t1\tQ100000030-P279star-Q14748\n", "Q100000030\tP279star\tQ14745\t2\tQ100000030-P279star-Q14745\n" ] } ], "source": [ "!kgtk --debug query -i $OUT/derived.p279star.low-degree.complete.tsv.gz --as p279starlow --idx index:node2,node1,distance --limit 3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Statistics to show in the graph\n", "\n", "> We are not computing the statistics file in 
this notebook as it is computed in the `p1963` project. \n", "> We need the file here, so Pedro copied it from the `p1963` project and put it in the `$TEMP` folder\n", "\n", "File is `statistics.Pinstance_count.tsv.gz`\n" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:40:26.968247Z", "iopub.status.busy": "2021-12-29T07:40:26.967653Z", "iopub.status.idle": "2021-12-29T07:40:29.007057Z", "shell.execute_reply": "2021-12-29T07:40:29.006255Z", "shell.execute_reply.started": "2021-12-29T07:40:26.968208Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2id
0Q1000017Pinstance_count1Q1000017-Pinstance_count-6b86b2
1Q1000091Pinstance_count1Q1000091-Pinstance_count-6b86b2
2Q1000156Pinstance_count11Q1000156-Pinstance_count-4fc82b
3Q100023Pinstance_count1Q100023-Pinstance_count-6b86b2
4Q100026Pinstance_count1Q100026-Pinstance_count-6b86b2
5Q100029091Pinstance_count10Q100029091-Pinstance_count-4a44dc
6Q1000300Pinstance_count2Q1000300-Pinstance_count-d4735e
7Q100034524Pinstance_count3Q100034524-Pinstance_count-4e0740
8Q1000371Pinstance_count3Q1000371-Pinstance_count-4e0740
9Q100038174Pinstance_count11Q100038174-Pinstance_count-4fc82b
\n", "
" ], "text/plain": [ " node1 label node2 id\n", "0 Q1000017 Pinstance_count 1 Q1000017-Pinstance_count-6b86b2\n", "1 Q1000091 Pinstance_count 1 Q1000091-Pinstance_count-6b86b2\n", "2 Q1000156 Pinstance_count 11 Q1000156-Pinstance_count-4fc82b\n", "3 Q100023 Pinstance_count 1 Q100023-Pinstance_count-6b86b2\n", "4 Q100026 Pinstance_count 1 Q100026-Pinstance_count-6b86b2\n", "5 Q100029091 Pinstance_count 10 Q100029091-Pinstance_count-4a44dc\n", "6 Q1000300 Pinstance_count 2 Q1000300-Pinstance_count-d4735e\n", "7 Q100034524 Pinstance_count 3 Q100034524-Pinstance_count-4e0740\n", "8 Q1000371 Pinstance_count 3 Q1000371-Pinstance_count-4e0740\n", "9 Q100038174 Pinstance_count 11 Q100038174-Pinstance_count-4fc82b" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -i $TEMP/statistics.Pinstance_count.tsv.gz\")" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:40:29.016568Z", "iopub.status.busy": "2021-12-29T07:40:29.016325Z", "iopub.status.idle": "2021-12-29T07:40:31.099738Z", "shell.execute_reply": "2021-12-29T07:40:31.098475Z", "shell.execute_reply.started": "2021-12-29T07:40:29.016542Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-12-28 23:40:30 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_12 AS graph_12_c1\n", " LIMIT ?\n", " PARAS: [5]\n", "---------------------------------------------\n", "node1\tlabel\tnode2\tid\n", "Q1000017\tPinstance_count\t1\tQ1000017-Pinstance_count-6b86b2\n", "Q1000091\tPinstance_count\t1\tQ1000091-Pinstance_count-6b86b2\n", "Q1000156\tPinstance_count\t11\tQ1000156-Pinstance_count-4fc82b\n", "Q100023\tPinstance_count\t1\tQ100023-Pinstance_count-6b86b2\n", "Q100026\tPinstance_count\t1\tQ100026-Pinstance_count-6b86b2\n" ] } ], "source": [ "!kgtk --debug query -i $TEMP/statistics.Pinstance_count.tsv.gz --idx mode:monograph 
--limit 5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Compute the edge file that contains the graph we want to visualize for each class\n", "\n", "The edge file contains `subclass / P279 / class` edges, but we add two columns to support the visualization:\n", "\n", "- `graph`: is the id of a class we want to visualize. This column allows us to quickly fetch all the edges to build the visualization of a class.\n", "- `edge_type`: in the visualization we want to distinguish `subclass` and `superclass` edges so the viewer can easily distinguish subclasses and superclasses." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Compute the subclass edges\n", "\n", "For every class (the graph) we want to find all the P279 edges for subclasses of the given class. We use `class-browsing.low-degree-nodes.tsv` so that we don't include high degree classes that will blow up the browser." ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:40:31.101895Z", "iopub.status.busy": "2021-12-29T07:40:31.101538Z", "iopub.status.idle": "2021-12-29T07:43:33.472188Z", "shell.execute_reply": "2021-12-29T07:43:33.470811Z", "shell.execute_reply.started": "2021-12-29T07:40:31.101865Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(f\"\"\"\n", " query -i p279starlow -i p279 -i $OUT/class-browsing.low-degree-nodes.tsv\n", " --match '\n", " p279starlow: (subclass1)-[]->(class),\n", " p279starlow: (subclass2)-[]->(class),\n", " low: (subclass1),\n", " low: (subclass2),\n", " p279: (subclass1)-[]->(subclass2)'\n", " --return 'distinct subclass1 as node1, \"P279\" as label, subclass2 as node2, class as graph, \"subclass\" as edge_type'\n", " -o $TEMP/all.graph.low.sub.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:43:33.474060Z", "iopub.status.busy": "2021-12-29T07:43:33.473698Z", "iopub.status.idle":
"2021-12-29T07:43:34.561343Z", "shell.execute_reply": "2021-12-29T07:43:34.560217Z", "shell.execute_reply.started": "2021-12-29T07:43:33.474028Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 18213555\n" ] } ], "source": [ "!zcat < $TEMP/all.graph.low.sub.tsv.gz | wc -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have a lot of edges because we make copies for every graph, i.e., the same edge appears in many graphs. This is annoying, but it allows us to fetch the graphs very quickly, in less than 2 seconds." ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:43:34.563594Z", "iopub.status.busy": "2021-12-29T07:43:34.563232Z", "iopub.status.idle": "2021-12-29T07:43:37.249049Z", "shell.execute_reply": "2021-12-29T07:43:37.248128Z", "shell.execute_reply.started": "2021-12-29T07:43:34.563564Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2graphedge_type
0Q100000030P279Q14748Q14748subclass
1Q100000030P279Q14748Q14745subclass
2Q100000030P279Q14748Q1357761subclass
3Q100000030P279Q14748Q2424752subclass
4Q100000030P279Q14748Q31807746subclass
\n", "
" ], "text/plain": [ " node1 label node2 graph edge_type\n", "0 Q100000030 P279 Q14748 Q14748 subclass\n", "1 Q100000030 P279 Q14748 Q14745 subclass\n", "2 Q100000030 P279 Q14748 Q1357761 subclass\n", "3 Q100000030 P279 Q14748 Q2424752 subclass\n", "4 Q100000030 P279 Q14748 Q31807746 subclass" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -n 5 -i $TEMP/all.graph.low.sub.tsv.gz\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Compute the superclass edges\n", "\n", "The superclass edges are also P279 edges, but they sit above the given class. We don't need to filter to low degree classes because we are going up the P279 hierarchy." ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:43:37.250715Z", "iopub.status.busy": "2021-12-29T07:43:37.250451Z", "iopub.status.idle": "2021-12-29T07:58:57.345943Z", "shell.execute_reply": "2021-12-29T07:58:57.344236Z", "shell.execute_reply.started": "2021-12-29T07:43:37.250685Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(f\"\"\"\n", " query -i p279stard -i p279\n", " --match '\n", " p279stard: (class)-[]->(superclass1),\n", " p279stard: (class)-[]->(superclass2),\n", " p279: (superclass1)-[]->(superclass2)'\n", " --return 'distinct superclass1 as node1, \"P279\" as label, superclass2 as node2, class as graph, \"superclass\" as edge_type'\n", " -o $TEMP/all.graph.low.super.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:58:57.348250Z", "iopub.status.busy": "2021-12-29T07:58:57.347964Z", "iopub.status.idle": "2021-12-29T07:59:01.873930Z", "shell.execute_reply": "2021-12-29T07:59:01.872748Z", "shell.execute_reply.started": "2021-12-29T07:58:57.348216Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 121028861\n" ] } ], "source": [ "!zcat < 
$TEMP/all.graph.low.super.tsv.gz | wc -l" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:59:01.875811Z", "iopub.status.busy": "2021-12-29T07:59:01.875569Z", "iopub.status.idle": "2021-12-29T07:59:05.117542Z", "shell.execute_reply": "2021-12-29T07:59:05.116752Z", "shell.execute_reply.started": "2021-12-29T07:59:01.875782Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2graphedge_type
0Q95079834P279Q1000068Q95079834superclass
1Q17372279P279Q100026Q17372279superclass
2Q17372377P279Q100026Q17372377superclass
3Q17372377P279Q100026Q17372463superclass
4Q17372377P279Q100026Q17372473superclass
\n", "
" ], "text/plain": [ " node1 label node2 graph edge_type\n", "0 Q95079834 P279 Q1000068 Q95079834 superclass\n", "1 Q17372279 P279 Q100026 Q17372279 superclass\n", "2 Q17372377 P279 Q100026 Q17372377 superclass\n", "3 Q17372377 P279 Q100026 Q17372463 superclass\n", "4 Q17372377 P279 Q100026 Q17372473 superclass" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -n 5 -i $TEMP/all.graph.low.super.tsv.gz\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concatenate the subclass and superclass files, and store in `$TEMP/graph.low.tsv.gz`\n", "\n", "We keep the file in `$TEMP` because for the final file we want to add the high degree nodes so that the user sees that they exist (we will not add the subclasses). Once we have the complete file, we will put it in `$OUT`." ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T07:59:05.119086Z", "iopub.status.busy": "2021-12-29T07:59:05.118834Z", "iopub.status.idle": "2021-12-29T08:04:23.557081Z", "shell.execute_reply": "2021-12-29T08:04:23.555928Z", "shell.execute_reply.started": "2021-12-29T07:59:05.119056Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(f\"\"\"\n", " cat --use-graph-cache-envar False -i $TEMP/all.graph.low.sub.tsv.gz -i $TEMP/all.graph.low.super.tsv.gz\n", " -o $TEMP/graph.low.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Index the file to allow fast queries on all columns" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T08:04:23.558788Z", "iopub.status.busy": "2021-12-29T08:04:23.558504Z", "iopub.status.idle": "2021-12-29T08:18:26.810653Z", "shell.execute_reply": "2021-12-29T08:18:26.810026Z", "shell.execute_reply.started": "2021-12-29T08:04:23.558760Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-12-29 00:04:25 sqlstore]:
DROP graph data table graph_15 from graphbrowser\n", "[2021-12-29 00:08:16 sqlstore]: IMPORT graph directly into table graph_31 from /Users/pedroszekely/Downloads/kypher/projects/class-visualization/temp.class-visualization/graph.low.tsv.gz ...\n", "[2021-12-29 00:15:23 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_31 AS graph_31_c1\n", " LIMIT ?\n", " PARAS: [3]\n", "---------------------------------------------\n", "[2021-12-29 00:15:23 sqlstore]: CREATE INDEX \"graph_31_node1_node2_graph_edge_type_idx\" ON \"graph_31\" (\"node1\", \"node2\", \"graph\", \"edge_type\")\n", "[2021-12-29 00:17:54 sqlstore]: ANALYZE \"graph_31_node1_node2_graph_edge_type_idx\"\n", "node1\tlabel\tnode2\tgraph\tedge_type\n", "Q100000030\tP279\tQ14748\tQ14748\tsubclass\n", "Q100000030\tP279\tQ14748\tQ14745\tsubclass\n", "Q100000030\tP279\tQ14748\tQ1357761\tsubclass\n" ] } ], "source": [ "!kgtk --debug query -i $TEMP/graph.low.tsv.gz --as graphbrowser --idx index:node1,node2,graph,edge_type --limit 3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Compute the node file for visualization\n", "\n", "The node file for visualization needs the labels for the nodes, and the `graph` to pull it out quickly. We add:\n", "\n", "- `instance_count`: the number of direct instances of the class, as it is interesting for the user to see this information." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Extract the nodes from the edge file\n", "\n", "The reason to use the edge file is that we need the `graph` id. 
We do it in two steps, first extract `node1` and then extract `node2`" ] }, { "cell_type": "code", "execution_count": 194, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T03:20:12.851665Z", "iopub.status.busy": "2021-12-31T03:20:12.851393Z", "iopub.status.idle": "2021-12-31T03:34:51.505030Z", "shell.execute_reply": "2021-12-31T03:34:51.504124Z", "shell.execute_reply.started": "2021-12-31T03:20:12.851635Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i label -i $TEMP/statistics.Pinstance_count.tsv.gz -i graphbrowser -i browsernodes\n", " --match '\n", " graphbrowser: (c)-[{graph: graph}]->(),\n", " browsernodes: (c)-[{node_type: nt}]->()'\n", " --opt 'label: (c)-[]->(class_label)'\n", " --opt 'Pinstance_count: (c)-[:Pinstance_count]->(instance_count)'\n", " --return 'distinct c as node1, graph as graph, coalesce(instance_count,0) as instance_count, nt as node_type, class_label as label'\n", " -o $TEMP/graph.low.node1.tsv.gz\n", "\"\"\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is what our node file looks like:" ] }, { "cell_type": "code", "execution_count": 195, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T03:34:51.507287Z", "iopub.status.busy": "2021-12-31T03:34:51.507028Z", "iopub.status.idle": "2021-12-31T03:34:54.628575Z", "shell.execute_reply": "2021-12-31T03:34:54.627739Z", "shell.execute_reply.started": "2021-12-31T03:34:51.507259Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1graphinstance_countnode_typelabel
0Q898273Q10383996511047few_subclasses'protein domain'@en
1Q898273Q10383998711047few_subclasses'protein domain'@en
2Q898273Q10384000211047few_subclasses'protein domain'@en
3Q898273Q10384005911047few_subclasses'protein domain'@en
4Q898273Q10384006611047few_subclasses'protein domain'@en
\n", "
" ], "text/plain": [ " node1 graph instance_count node_type label\n", "0 Q898273 Q103839965 11047 few_subclasses 'protein domain'@en\n", "1 Q898273 Q103839987 11047 few_subclasses 'protein domain'@en\n", "2 Q898273 Q103840002 11047 few_subclasses 'protein domain'@en\n", "3 Q898273 Q103840059 11047 few_subclasses 'protein domain'@en\n", "4 Q898273 Q103840066 11047 few_subclasses 'protein domain'@en" ] }, "execution_count": 195, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -n 5 -i $TEMP/graph.low.node1.tsv.gz\")" ] }, { "cell_type": "code", "execution_count": 196, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T03:34:54.630103Z", "iopub.status.busy": "2021-12-31T03:34:54.629857Z", "iopub.status.idle": "2021-12-31T03:53:45.913263Z", "shell.execute_reply": "2021-12-31T03:53:45.912358Z", "shell.execute_reply.started": "2021-12-31T03:34:54.630077Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i label -i $TEMP/statistics.Pinstance_count.tsv.gz -i graphbrowser -i browsernodes\n", " --match '\n", " graphbrowser: ()-[{graph: graph}]->(c),\n", " browsernodes: (c)-[{node_type: nt}]->()'\n", " --opt 'label: (c)-[]->(class_label)'\n", " --opt 'Pinstance_count: (c)-[:Pinstance_count]->(instance_count)'\n", " --return 'distinct c as node1, graph as graph, coalesce(instance_count,0) as instance_count, nt as node_type, class_label as label'\n", " -o $TEMP/graph.low.node2.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concatenate the two node files, deduplicate and index\n", "\n", "To-do: try presorting the files to see if `compact` runs faster; as it is, this command takes over 2.5 hours." ] }, { "cell_type": "code", "execution_count": 197, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T03:53:45.916913Z", "iopub.status.busy": "2021-12-31T03:53:45.916591Z", "iopub.status.idle": "2021-12-31T06:55:08.988812Z", "shell.execute_reply": 
"2021-12-31T06:55:08.971399Z", "shell.execute_reply.started": "2021-12-31T03:53:45.916855Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " cat --use-graph-cache-envar False --mode NONE -i $TEMP/graph.low.node1.tsv.gz -i $TEMP/graph.low.node2.tsv.gz\n", " / compact --mode NONE --columns node1 graph\n", " -o $TEMP/graph.low.node.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We only need to index on `graph`, as we will not do node queries on this file; the index is built further below, once the final node file has been written." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Special handling of high degree nodes" ] }, { "cell_type": "code", "execution_count": 198, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T06:55:09.091806Z", "iopub.status.busy": "2021-12-31T06:55:09.091041Z", "iopub.status.idle": "2021-12-31T06:55:13.898708Z", "shell.execute_reply": "2021-12-31T06:55:13.897855Z", "shell.execute_reply.started": "2021-12-31T06:55:09.091769Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1
0Q20747295
1Q8054
2Q7187
3Q277338
4Q427087
\n", "
" ], "text/plain": [ " node1\n", "0 Q20747295\n", "1 Q8054\n", "2 Q7187\n", "3 Q277338\n", "4 Q427087" ] }, "execution_count": 198, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -n 5 -i $OUT/class-browsing.high-degree-nodes.tsv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Make a graph file with the `P279` edges where the subclass is a high degree class\n", "\n", "Do this only to add edges that connect to the subclasses of our target node, so `class` has to be in `$TEMP/all.graph.low.sub.tsv.gz`" ] }, { "cell_type": "code", "execution_count": 199, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T06:55:13.904805Z", "iopub.status.busy": "2021-12-31T06:55:13.904473Z", "iopub.status.idle": "2021-12-31T06:55:16.567164Z", "shell.execute_reply": "2021-12-31T06:55:16.565789Z", "shell.execute_reply.started": "2021-12-31T06:55:13.904785Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-12-30 22:55:16 sqlstore]: DROP graph data table graph_33 from /Users/pedroszekely/Downloads/kypher/projects/class-visualization/class-browsing.high-degree-nodes.tsv\n", "[2021-12-30 22:55:16 sqlstore]: IMPORT graph directly into table graph_33 from /Users/pedroszekely/Downloads/kypher/projects/class-visualization/class-browsing.high-degree-nodes.tsv ...\n", "[2021-12-30 22:55:16 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_33_c2.\"node1\" \"_aLias.node1\", ? \"_aLias.label\", graph_40_c1.\"node1\" \"_aLias.node2\", graph_40_c1.\"graph\" \"_aLias.graph\", ? 
\"_aLias.edge_type\"\n", " FROM graph_1 AS graph_1_c3\n", " INNER JOIN graph_33 AS graph_33_c2, graph_40 AS graph_40_c1\n", " ON graph_33_c2.\"node1\" = graph_1_c3.\"node1\"\n", " AND graph_40_c1.\"node1\" = graph_1_c3.\"node2\"\n", " AND graph_40_c1.\"graph\" = graph_40_c1.\"graph\"\n", " AND (graph_33_c2.\"node1\" != graph_40_c1.\"node1\")\n", " PARAS: ['P279', 'subclass']\n", "---------------------------------------------\n", "[2021-12-30 22:55:16 sqlstore]: CREATE INDEX \"graph_33_node1_idx\" ON \"graph_33\" (\"node1\")\n", "[2021-12-30 22:55:16 sqlstore]: ANALYZE \"graph_33_node1_idx\"\n", "\n" ] } ], "source": [ "kgtk(\"\"\"\n", " query --debug -i $OUT/class-browsing.high-degree-nodes.tsv -i p279 -i $TEMP/all.graph.low.sub.tsv.gz\n", " --match '\n", " low: (class)-[{graph: graph}]->(),\n", " high: (subclass),\n", " p279: (subclass)-[]->(class)'\n", " --where 'subclass != class'\n", " --return 'distinct subclass as node1, \"P279\" as label, class as node2, graph as graph, \"subclass\" as edge_type'\n", " -o $TEMP/graph.high.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 200, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T06:55:16.570428Z", "iopub.status.busy": "2021-12-31T06:55:16.570225Z", "iopub.status.idle": "2021-12-31T06:55:19.060840Z", "shell.execute_reply": "2021-12-31T06:55:19.059658Z", "shell.execute_reply.started": "2021-12-31T06:55:16.570407Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2graphedge_type
0Q10267817P279Q18553442Q1225194subclass
1Q107715P279Q309314Q246672subclass
2Q107715P279Q309314Q937228subclass
3Q107715P279Q309314Q7184903subclass
4Q107715P279Q309314Q35120subclass
\n", "
" ], "text/plain": [ " node1 label node2 graph edge_type\n", "0 Q10267817 P279 Q18553442 Q1225194 subclass\n", "1 Q107715 P279 Q309314 Q246672 subclass\n", "2 Q107715 P279 Q309314 Q937228 subclass\n", "3 Q107715 P279 Q309314 Q7184903 subclass\n", "4 Q107715 P279 Q309314 Q35120 subclass" ] }, "execution_count": 200, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -n 5 -i $TEMP/graph.high.tsv.gz\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Make a node file with the high degree nodes\n", "\n", "We use the edge file because we need to put the `graph` in the node file too." ] }, { "cell_type": "code", "execution_count": 201, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T06:55:19.063302Z", "iopub.status.busy": "2021-12-31T06:55:19.063050Z", "iopub.status.idle": "2021-12-31T06:55:21.837516Z", "shell.execute_reply": "2021-12-31T06:55:21.836580Z", "shell.execute_reply.started": "2021-12-31T06:55:19.063273Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i label -i $TEMP/statistics.Pinstance_count.tsv.gz -i $TEMP/graph.high.tsv.gz\n", " --match 'high: (c)-[{graph: graph}]->()'\n", " --opt 'label: (c)-[]->(class_label)'\n", " --opt 'Pinstance_count: (c)-[:Pinstance_count]->(instance_count)'\n", " --return 'distinct c as node1, graph as graph, coalesce(instance_count,0) as instance_count, \"many_subclasses\" as node_type, class_label as label'\n", " -o $TEMP/graph.high.node.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 202, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T06:55:21.841239Z", "iopub.status.busy": "2021-12-31T06:55:21.840955Z", "iopub.status.idle": "2021-12-31T06:55:24.497776Z", "shell.execute_reply": "2021-12-31T06:55:24.497058Z", "shell.execute_reply.started": "2021-12-31T06:55:21.841210Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1graphinstance_countnode_typelabel
0Q10267817Q12251941many_subclasses'autosomal recessive disease'@en
1Q107715Q24667293many_subclasses'physical quantity'@en
2Q107715Q93722893many_subclasses'physical quantity'@en
3Q107715Q718490393many_subclasses'physical quantity'@en
4Q107715Q3512093many_subclasses'physical quantity'@en
\n", "
" ], "text/plain": [ " node1 graph instance_count node_type \\\n", "0 Q10267817 Q1225194 1 many_subclasses \n", "1 Q107715 Q246672 93 many_subclasses \n", "2 Q107715 Q937228 93 many_subclasses \n", "3 Q107715 Q7184903 93 many_subclasses \n", "4 Q107715 Q35120 93 many_subclasses \n", "\n", " label \n", "0 'autosomal recessive disease'@en \n", "1 'physical quantity'@en \n", "2 'physical quantity'@en \n", "3 'physical quantity'@en \n", "4 'physical quantity'@en " ] }, "execution_count": 202, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -n 5 -i $TEMP/graph.high.node.tsv.gz\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Just to make sure, count the number of subclasses of one of our supposedly high degree nodes: it looks innocent with only one instance, but it does indeed have many subclasses." ] }, { "cell_type": "code", "execution_count": 203, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T06:55:24.517353Z", "iopub.status.busy": "2021-12-31T06:55:24.517102Z", "iopub.status.idle": "2021-12-31T06:55:27.081133Z", "shell.execute_reply": "2021-12-31T06:55:27.080295Z", "shell.execute_reply.started": "2021-12-31T06:55:24.517329Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count(DISTINCT graph_1_c1.\"node1\")
01097
\n", "
" ], "text/plain": [ " count(DISTINCT graph_1_c1.\"node1\")\n", "0 1097" ] }, "execution_count": 203, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"query -i p279 --match '(subclass)-[]->(:Q10267817)' --return 'count(distinct subclass)'\")" ] }, { "cell_type": "code", "execution_count": 204, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T06:55:27.083260Z", "iopub.status.busy": "2021-12-31T06:55:27.082938Z", "iopub.status.idle": "2021-12-31T06:55:29.543565Z", "shell.execute_reply": "2021-12-31T06:55:29.542954Z", "shell.execute_reply.started": "2021-12-31T06:55:27.083231Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count(DISTINCT graph_1_c1.\"node1\")
02350
\n", "
" ], "text/plain": [ " count(DISTINCT graph_1_c1.\"node1\")\n", "0 2350" ] }, "execution_count": 204, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"query -i p279 --match '(subclass)-[]->(:Q30185)' --return 'count(distinct subclass)'\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Augment the low degree edge and node files with the high degree info" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenating without deduplication is sufficient as the files cannot have duplicate edges or nodes." ] }, { "cell_type": "code", "execution_count": 225, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T16:25:04.014190Z", "iopub.status.busy": "2021-12-31T16:25:04.013972Z", "iopub.status.idle": "2021-12-31T16:30:22.602854Z", "shell.execute_reply": "2021-12-31T16:30:22.601637Z", "shell.execute_reply.started": "2021-12-31T16:25:04.014167Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " cat --use-graph-cache-envar False -i $TEMP/graph.high.tsv.gz -i $TEMP/graph.low.tsv.gz\n", " -o $OUT/class-visualization.edge.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 206, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T07:06:41.114279Z", "iopub.status.busy": "2021-12-31T07:06:41.113987Z", "iopub.status.idle": "2021-12-31T07:06:43.937936Z", "shell.execute_reply": "2021-12-31T07:06:43.937152Z", "shell.execute_reply.started": "2021-12-31T07:06:41.114250Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2graphedge_type
0Q10267817P279Q18553442Q1225194subclass
1Q107715P279Q309314Q246672subclass
2Q107715P279Q309314Q937228subclass
3Q107715P279Q309314Q7184903subclass
4Q107715P279Q309314Q35120subclass
\n", "
" ], "text/plain": [ " node1 label node2 graph edge_type\n", "0 Q10267817 P279 Q18553442 Q1225194 subclass\n", "1 Q107715 P279 Q309314 Q246672 subclass\n", "2 Q107715 P279 Q309314 Q937228 subclass\n", "3 Q107715 P279 Q309314 Q7184903 subclass\n", "4 Q107715 P279 Q309314 Q35120 subclass" ] }, "execution_count": 206, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -n 5 -i $OUT/class-visualization.edge.tsv.gz\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Index the file for query using the `graph` column:" ] }, { "cell_type": "code", "execution_count": 207, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T07:06:43.939700Z", "iopub.status.busy": "2021-12-31T07:06:43.939509Z", "iopub.status.idle": "2021-12-31T07:21:29.714900Z", "shell.execute_reply": "2021-12-31T07:21:29.713832Z", "shell.execute_reply.started": "2021-12-31T07:06:43.939676Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tgraph\tedge_type\n", "Q10267817\tP279\tQ18553442\tQ1225194\tsubclass\n", "Q107715\tP279\tQ309314\tQ246672\tsubclass\n", "Q107715\tP279\tQ309314\tQ937228\tsubclass\n" ] } ], "source": [ "!kgtk query -i $OUT/class-visualization.edge.tsv.gz --as classvizedge --idx index:graph --limit 3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate the node files:" ] }, { "cell_type": "code", "execution_count": 226, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T16:30:22.605793Z", "iopub.status.busy": "2021-12-31T16:30:22.605511Z", "iopub.status.idle": "2021-12-31T16:33:45.325717Z", "shell.execute_reply": "2021-12-31T16:33:45.324843Z", "shell.execute_reply.started": "2021-12-31T16:30:22.605761Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " cat --use-graph-cache-envar False --mode NONE -i $TEMP/graph.high.node.tsv.gz -i $TEMP/graph.low.node.tsv.gz\n", " -o $TEMP/class-visualization.node.tsv.gz\n", "\"\"\")" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "Add a tooltip with meaningful information" ] }, { "cell_type": "code", "execution_count": 228, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T16:46:30.875509Z", "iopub.status.busy": "2021-12-31T16:46:30.875231Z", "iopub.status.idle": "2021-12-31T17:03:52.239600Z", "shell.execute_reply": "2021-12-31T17:03:52.236869Z", "shell.execute_reply.started": "2021-12-31T16:46:30.875480Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i $TEMP/class-visualization.node.tsv.gz\n", " --match '(node)-[{graph: g, instance_count: ic, node_type: nt, label: l}]->()'\n", " --return 'distinct\n", " node as node1, g as graph, ic as instance_count, nt as node_type, l as label,\n", " printf(\"%s (%s)
instance count: %s
node type: %s\", kgtk_lqstring_text(l), node, cast(ic, int), nt) as tooltip'\n", " -o $OUT/class-visualization.node.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 229, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T17:03:52.246880Z", "iopub.status.busy": "2021-12-31T17:03:52.246529Z", "iopub.status.idle": "2021-12-31T17:03:55.111637Z", "shell.execute_reply": "2021-12-31T17:03:55.110862Z", "shell.execute_reply.started": "2021-12-31T17:03:52.246849Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1graphinstance_countnode_typelabeltooltip
0Q10267817Q12251941many_subclasses'autosomal recessive disease'@enautosomal recessive disease (Q10267817)<BR/>in...
1Q107715Q24667293many_subclasses'physical quantity'@enphysical quantity (Q107715)<BR/>instance count...
2Q107715Q93722893many_subclasses'physical quantity'@enphysical quantity (Q107715)<BR/>instance count...
3Q107715Q718490393many_subclasses'physical quantity'@enphysical quantity (Q107715)<BR/>instance count...
4Q107715Q3512093many_subclasses'physical quantity'@enphysical quantity (Q107715)<BR/>instance count...
\n", "
" ], "text/plain": [ " node1 graph instance_count node_type \\\n", "0 Q10267817 Q1225194 1 many_subclasses \n", "1 Q107715 Q246672 93 many_subclasses \n", "2 Q107715 Q937228 93 many_subclasses \n", "3 Q107715 Q7184903 93 many_subclasses \n", "4 Q107715 Q35120 93 many_subclasses \n", "\n", " label \\\n", "0 'autosomal recessive disease'@en \n", "1 'physical quantity'@en \n", "2 'physical quantity'@en \n", "3 'physical quantity'@en \n", "4 'physical quantity'@en \n", "\n", " tooltip \n", "0 autosomal recessive disease (Q10267817)
in... \n", "1 physical quantity (Q107715)
instance count... \n", "2 physical quantity (Q107715)
instance count... \n", "3 physical quantity (Q107715)
instance count... \n", "4 physical quantity (Q107715)
instance count... " ] }, "execution_count": 229, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -n 5 -i $OUT/class-visualization.node.tsv.gz\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Index the file for query using the `graph` column:" ] }, { "cell_type": "code", "execution_count": 230, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T17:03:55.113296Z", "iopub.status.busy": "2021-12-31T17:03:55.113107Z", "iopub.status.idle": "2021-12-31T17:31:20.264932Z", "shell.execute_reply": "2021-12-31T17:31:20.264132Z", "shell.execute_reply.started": "2021-12-31T17:03:55.113277Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tgraph\tinstance_count\tnode_type\tlabel\ttooltip\n", "Q10267817\tQ1225194\t1\tmany_subclasses\t'autosomal recessive disease'@en\tautosomal recessive disease (Q10267817)
instance count: 1
node type: many_subclasses\n", "Q107715\tQ246672\t93\tmany_subclasses\t'physical quantity'@en\tphysical quantity (Q107715)
instance count: 93
node type: many_subclasses\n", "Q107715\tQ937228\t93\tmany_subclasses\t'physical quantity'@en\tphysical quantity (Q107715)
instance count: 93
node type: many_subclasses\n" ] } ], "source": [ "!kgtk query -i $OUT/class-visualization.node.tsv.gz --as classviznode --idx index:graph --limit 3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Temporary: we need this file because my current version of `visualize` requires labels in the edge file; the new version can take the labels from the node file." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Test creation of the node file:" ] }, { "cell_type": "code", "execution_count": 236, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T18:05:24.418050Z", "iopub.status.busy": "2021-12-31T18:05:24.417795Z", "iopub.status.idle": "2021-12-31T18:05:26.475411Z", "shell.execute_reply": "2021-12-31T18:05:26.474730Z", "shell.execute_reply.started": "2021-12-31T18:05:24.418026Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1graphinstance_countnode_typelabeltooltip
0Q11019Q142099few_subclasses'machine'@enmachine (Q11019)<BR/>instance count: 99<BR/>no...
1Q1183543Q1420198many_subclasses'device'@endevice (Q1183543)<BR/>instance count: 198<BR/>...
2Q1301433Q142017few_subclasses'land vehicle'@enland vehicle (Q1301433)<BR/>instance count: 17...
3Q1420Q1420862many_subclasses'motor car'@enmotor car (Q1420)<BR/>instance count: 862<BR/>...
4Q15401930Q142012few_subclasses'product'@enproduct (Q15401930)<BR/>instance count: 12<BR/...
5Q15618781Q142029few_subclasses'wheeled vehicle'@enwheeled vehicle (Q15618781)<BR/>instance count...
6Q16686448Q142024few_subclasses'artificial entity'@enartificial entity (Q16686448)<BR/>instance cou...
7Q16798631Q1420389few_subclasses'equipment'@enequipment (Q16798631)<BR/>instance count: 389<...
8Q223557Q1420110few_subclasses'physical object'@enphysical object (Q223557)<BR/>instance count: ...
9Q2424752Q1420412few_subclasses'product'@enproduct (Q2424752)<BR/>instance count: 412<BR/...
10Q28877Q142011few_subclasses'goods'@engoods (Q28877)<BR/>instance count: 11<BR/>node...
11Q3245975Q14201few_subclasses'finished good'@enfinished good (Q3245975)<BR/>instance count: 1...
12Q337060Q142038few_subclasses'perceptible object'@enperceptible object (Q337060)<BR/>instance coun...
13Q35120Q142034few_subclasses'entity'@enentity (Q35120)<BR/>instance count: 34<BR/>nod...
14Q35825432Q14202few_subclasses'converter'@enconverter (Q35825432)<BR/>instance count: 2<BR...
15Q39546Q14201029many_subclasses'tool'@entool (Q39546)<BR/>instance count: 1029<BR/>nod...
16Q42889Q1420114few_subclasses'vehicle'@envehicle (Q42889)<BR/>instance count: 114<BR/>n...
17Q4406616Q1420322few_subclasses'concrete object'@enconcrete object (Q4406616)<BR/>instance count:...
18Q488383Q1420701few_subclasses'object'@enobject (Q488383)<BR/>instance count: 701<BR/>n...
19Q6671777Q142029few_subclasses'structure'@enstructure (Q6671777)<BR/>instance count: 29<BR...
20Q752870Q14208few_subclasses'motor vehicle'@enmotor vehicle (Q752870)<BR/>instance count: 8<...
21Q8205328Q142052few_subclasses'artificial physical object'@enartificial physical object (Q8205328)<BR/>inst...
22Q9158768Q142063few_subclasses'storage'@enstorage (Q9158768)<BR/>instance count: 63<BR/>...
23Q987767Q1420282few_subclasses'container'@encontainer (Q987767)<BR/>instance count: 282<BR...
\n", "
" ], "text/plain": [ " node1 graph instance_count node_type \\\n", "0 Q11019 Q1420 99 few_subclasses \n", "1 Q1183543 Q1420 198 many_subclasses \n", "2 Q1301433 Q1420 17 few_subclasses \n", "3 Q1420 Q1420 862 many_subclasses \n", "4 Q15401930 Q1420 12 few_subclasses \n", "5 Q15618781 Q1420 29 few_subclasses \n", "6 Q16686448 Q1420 24 few_subclasses \n", "7 Q16798631 Q1420 389 few_subclasses \n", "8 Q223557 Q1420 110 few_subclasses \n", "9 Q2424752 Q1420 412 few_subclasses \n", "10 Q28877 Q1420 11 few_subclasses \n", "11 Q3245975 Q1420 1 few_subclasses \n", "12 Q337060 Q1420 38 few_subclasses \n", "13 Q35120 Q1420 34 few_subclasses \n", "14 Q35825432 Q1420 2 few_subclasses \n", "15 Q39546 Q1420 1029 many_subclasses \n", "16 Q42889 Q1420 114 few_subclasses \n", "17 Q4406616 Q1420 322 few_subclasses \n", "18 Q488383 Q1420 701 few_subclasses \n", "19 Q6671777 Q1420 29 few_subclasses \n", "20 Q752870 Q1420 8 few_subclasses \n", "21 Q8205328 Q1420 52 few_subclasses \n", "22 Q9158768 Q1420 63 few_subclasses \n", "23 Q987767 Q1420 282 few_subclasses \n", "\n", " label \\\n", "0 'machine'@en \n", "1 'device'@en \n", "2 'land vehicle'@en \n", "3 'motor car'@en \n", "4 'product'@en \n", "5 'wheeled vehicle'@en \n", "6 'artificial entity'@en \n", "7 'equipment'@en \n", "8 'physical object'@en \n", "9 'product'@en \n", "10 'goods'@en \n", "11 'finished good'@en \n", "12 'perceptible object'@en \n", "13 'entity'@en \n", "14 'converter'@en \n", "15 'tool'@en \n", "16 'vehicle'@en \n", "17 'concrete object'@en \n", "18 'object'@en \n", "19 'structure'@en \n", "20 'motor vehicle'@en \n", "21 'artificial physical object'@en \n", "22 'storage'@en \n", "23 'container'@en \n", "\n", " tooltip \n", "0 machine (Q11019)
instance count: 99
no... \n", "1 device (Q1183543)
instance count: 198
... \n", "2 land vehicle (Q1301433)
instance count: 17... \n", "3 motor car (Q1420)
instance count: 862
... \n", "4 product (Q15401930)
instance count: 12
instance count... \n", "6 artificial entity (Q16686448)
instance cou... \n", "7 equipment (Q16798631)
instance count: 389<... \n", "8 physical object (Q223557)
instance count: ... \n", "9 product (Q2424752)
instance count: 412
instance count: 11
node... \n", "11 finished good (Q3245975)
instance count: 1... \n", "12 perceptible object (Q337060)
instance coun... \n", "13 entity (Q35120)
instance count: 34
nod... \n", "14 converter (Q35825432)
instance count: 2instance count: 1029
nod... \n", "16 vehicle (Q42889)
instance count: 114
n... \n", "17 concrete object (Q4406616)
instance count:... \n", "18 object (Q488383)
instance count: 701
n... \n", "19 structure (Q6671777)
instance count: 29instance count: 8<... \n", "21 artificial physical object (Q8205328)
inst... \n", "22 storage (Q9158768)
instance count: 63
... \n", "23 container (Q987767)
instance count: 282()'\n", "\"\"\")" ] }, { "cell_type": "raw", "metadata": { "execution": { "iopub.execute_input": "2021-12-29T14:13:01.391259Z", "iopub.status.busy": "2021-12-29T14:13:01.391031Z", "iopub.status.idle": "2021-12-29T14:16:00.942753Z", "shell.execute_reply": "2021-12-29T14:16:00.941721Z", "shell.execute_reply.started": "2021-12-29T14:13:01.391236Z" } }, "source": [ " kgtk(f\"\"\"\n", " query -i classvizedgetest\n", " --match '(class)-[{{label: property, graph: \"{root}\", edge_type: edge_type}}]->(superclass)'\n", " -o $TEMP/browser/{root}.graph.low.tsv\n", " \"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test creation of visualizations" ] }, { "cell_type": "code", "execution_count": 234, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T18:00:44.743349Z", "iopub.status.busy": "2021-12-31T18:00:44.743062Z", "iopub.status.idle": "2021-12-31T18:01:54.519639Z", "shell.execute_reply": "2021-12-31T18:01:54.518704Z", "shell.execute_reply.started": "2021-12-31T18:00:44.743323Z" }, "tags": [] }, "outputs": [], "source": [ "roots = [\n", " \"Q11424\",\n", " \"Q391342\",\n", " \"Q1420\",\n", " \"Q1107\",\n", " \"Q889821\",\n", " \"Q1549591\",\n", " \"Q188724\",\n", " \"Q946808\",\n", " \"Q33999\",\n", " \"Q483501\",\n", " \"Q2221906\",\n", " \"Q144\",\n", " \"Q516021\",\n", " \"Q10494269\"\n", "]\n", "\n", "for root in roots:\n", " kgtk(f\"\"\"\n", " query -i classvizedgetest\n", " --match '(class)-[{{label: property, graph: \"{root}\", edge_type: edge_type}}]->(superclass)'\n", " -o $TEMP/browser/{root}.graph.low.tsv\n", " \"\"\")\n", "\n", " kgtk(f\"\"\"\n", " query -i classviznode\n", " --match '(class)-[{{graph: \"{root}\", instance_count: instance_count, label: label}}]->()'\n", " -o $TEMP/browser/{root}.node.graph.low.tsv\n", " \"\"\")\n", "\n", " # kgtk(f\"\"\"\n", " # visualize-force-graph -i $TEMP/browser/{root}.graph.low.tsv\n", " # --direction arrow\n", " # -o $TEMP/browser/{root}.graph.low.html\n", " # 
\"\"\")" ] }, { "cell_type": "raw", "metadata": {}, "source": [ " kgtk(f\"\"\"\n", " visualize-force-graph -i $TEMP/{root}.graph.low.tsv --node-file $TEMP/{root}.node.graph.low.tsv \n", " --direction arrow\n", " --node-size-column instance_count \n", " --node-size-minimum 2.0 \n", " --node-size-maximum 8.0 \n", " --node-size-default 1.0 \n", " --node-size-scale log \n", " --node-color-column node_type\n", " --node-color-scale categorical\n", " --edge-color-column edge_type \n", " --edge-color-style categorical \n", " -o $TEMP/browser/{root}.graph.low.html\n", " \"\"\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tests for individual files" ] }, { "cell_type": "code", "execution_count": 224, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T16:19:46.509942Z", "iopub.status.busy": "2021-12-31T16:19:46.509616Z", "iopub.status.idle": "2021-12-31T16:20:38.883353Z", "shell.execute_reply": "2021-12-31T16:20:38.882501Z", "shell.execute_reply.started": "2021-12-31T16:19:46.509911Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1graphinstance_countnode_typelabel
0Q11019Q142099few_subclasses'machine'@en
1Q1183543Q1420198many_subclasses'device'@en
2Q1301433Q142017few_subclasses'land vehicle'@en
3Q1420Q1420862many_subclasses'motor car'@en
4Q15401930Q142012few_subclasses'product'@en
5Q15618781Q142029few_subclasses'wheeled vehicle'@en
6Q16686448Q142024few_subclasses'artificial entity'@en
7Q16798631Q1420389few_subclasses'equipment'@en
8Q223557Q1420110few_subclasses'physical object'@en
9Q2424752Q1420412few_subclasses'product'@en
10Q28877Q142011few_subclasses'goods'@en
11Q3245975Q14201few_subclasses'finished good'@en
12Q337060Q142038few_subclasses'perceptible object'@en
13Q35120Q142034few_subclasses'entity'@en
14Q35825432Q14202few_subclasses'converter'@en
15Q39546Q14201029many_subclasses'tool'@en
16Q42889Q1420114few_subclasses'vehicle'@en
17Q4406616Q1420322few_subclasses'concrete object'@en
18Q488383Q1420701few_subclasses'object'@en
19Q6671777Q142029few_subclasses'structure'@en
20Q752870Q14208few_subclasses'motor vehicle'@en
21Q8205328Q142052few_subclasses'artificial physical object'@en
22Q9158768Q142063few_subclasses'storage'@en
23Q987767Q1420282few_subclasses'container'@en
\n", "
" ], "text/plain": [ " node1 graph instance_count node_type \\\n", "0 Q11019 Q1420 99 few_subclasses \n", "1 Q1183543 Q1420 198 many_subclasses \n", "2 Q1301433 Q1420 17 few_subclasses \n", "3 Q1420 Q1420 862 many_subclasses \n", "4 Q15401930 Q1420 12 few_subclasses \n", "5 Q15618781 Q1420 29 few_subclasses \n", "6 Q16686448 Q1420 24 few_subclasses \n", "7 Q16798631 Q1420 389 few_subclasses \n", "8 Q223557 Q1420 110 few_subclasses \n", "9 Q2424752 Q1420 412 few_subclasses \n", "10 Q28877 Q1420 11 few_subclasses \n", "11 Q3245975 Q1420 1 few_subclasses \n", "12 Q337060 Q1420 38 few_subclasses \n", "13 Q35120 Q1420 34 few_subclasses \n", "14 Q35825432 Q1420 2 few_subclasses \n", "15 Q39546 Q1420 1029 many_subclasses \n", "16 Q42889 Q1420 114 few_subclasses \n", "17 Q4406616 Q1420 322 few_subclasses \n", "18 Q488383 Q1420 701 few_subclasses \n", "19 Q6671777 Q1420 29 few_subclasses \n", "20 Q752870 Q1420 8 few_subclasses \n", "21 Q8205328 Q1420 52 few_subclasses \n", "22 Q9158768 Q1420 63 few_subclasses \n", "23 Q987767 Q1420 282 few_subclasses \n", "\n", " label \n", "0 'machine'@en \n", "1 'device'@en \n", "2 'land vehicle'@en \n", "3 'motor car'@en \n", "4 'product'@en \n", "5 'wheeled vehicle'@en \n", "6 'artificial entity'@en \n", "7 'equipment'@en \n", "8 'physical object'@en \n", "9 'product'@en \n", "10 'goods'@en \n", "11 'finished good'@en \n", "12 'perceptible object'@en \n", "13 'entity'@en \n", "14 'converter'@en \n", "15 'tool'@en \n", "16 'vehicle'@en \n", "17 'concrete object'@en \n", "18 'object'@en \n", "19 'structure'@en \n", "20 'motor vehicle'@en \n", "21 'artificial physical object'@en \n", "22 'storage'@en \n", "23 'container'@en " ] }, "execution_count": 224, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $TEMP/graph.low.node.tsv.gz\n", " --match '(node)-[{graph: \"Q1420\"}]->()'\n", " --order-by 'node'\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 223, "metadata": { "execution": { 
"iopub.execute_input": "2021-12-31T16:19:33.684922Z", "iopub.status.busy": "2021-12-31T16:19:33.684648Z", "iopub.status.idle": "2021-12-31T16:19:36.275324Z", "shell.execute_reply": "2021-12-31T16:19:36.274525Z", "shell.execute_reply.started": "2021-12-31T16:19:33.684894Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1graphinstance_countnode_typelabel
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [node1, graph, instance_count, node_type, label]\n", "Index: []" ] }, "execution_count": 223, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $TEMP/graph.high.node.tsv.gz\n", " --match '(node)-[{graph: \"Q1420\"}]->()'\n", " --order-by 'node'\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 227, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T16:33:45.327442Z", "iopub.status.busy": "2021-12-31T16:33:45.327041Z", "iopub.status.idle": "2021-12-31T16:45:39.288687Z", "shell.execute_reply": "2021-12-31T16:45:39.287966Z", "shell.execute_reply.started": "2021-12-31T16:33:45.327414Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1graphinstance_countnode_typelabel
0Q11019Q142099few_subclasses'machine'@en
1Q1183543Q1420198many_subclasses'device'@en
2Q1301433Q142017few_subclasses'land vehicle'@en
3Q1420Q1420862many_subclasses'motor car'@en
4Q15401930Q142012few_subclasses'product'@en
5Q15618781Q142029few_subclasses'wheeled vehicle'@en
6Q16686448Q142024few_subclasses'artificial entity'@en
7Q16798631Q1420389few_subclasses'equipment'@en
8Q223557Q1420110few_subclasses'physical object'@en
9Q2424752Q1420412few_subclasses'product'@en
10Q28877Q142011few_subclasses'goods'@en
11Q3245975Q14201few_subclasses'finished good'@en
12Q337060Q142038few_subclasses'perceptible object'@en
13Q35120Q142034few_subclasses'entity'@en
14Q35825432Q14202few_subclasses'converter'@en
15Q39546Q14201029many_subclasses'tool'@en
16Q42889Q1420114few_subclasses'vehicle'@en
17Q4406616Q1420322few_subclasses'concrete object'@en
18Q488383Q1420701few_subclasses'object'@en
19Q6671777Q142029few_subclasses'structure'@en
20Q752870Q14208few_subclasses'motor vehicle'@en
21Q8205328Q142052few_subclasses'artificial physical object'@en
22Q9158768Q142063few_subclasses'storage'@en
23Q987767Q1420282few_subclasses'container'@en
\n", "
" ], "text/plain": [ " node1 graph instance_count node_type \\\n", "0 Q11019 Q1420 99 few_subclasses \n", "1 Q1183543 Q1420 198 many_subclasses \n", "2 Q1301433 Q1420 17 few_subclasses \n", "3 Q1420 Q1420 862 many_subclasses \n", "4 Q15401930 Q1420 12 few_subclasses \n", "5 Q15618781 Q1420 29 few_subclasses \n", "6 Q16686448 Q1420 24 few_subclasses \n", "7 Q16798631 Q1420 389 few_subclasses \n", "8 Q223557 Q1420 110 few_subclasses \n", "9 Q2424752 Q1420 412 few_subclasses \n", "10 Q28877 Q1420 11 few_subclasses \n", "11 Q3245975 Q1420 1 few_subclasses \n", "12 Q337060 Q1420 38 few_subclasses \n", "13 Q35120 Q1420 34 few_subclasses \n", "14 Q35825432 Q1420 2 few_subclasses \n", "15 Q39546 Q1420 1029 many_subclasses \n", "16 Q42889 Q1420 114 few_subclasses \n", "17 Q4406616 Q1420 322 few_subclasses \n", "18 Q488383 Q1420 701 few_subclasses \n", "19 Q6671777 Q1420 29 few_subclasses \n", "20 Q752870 Q1420 8 few_subclasses \n", "21 Q8205328 Q1420 52 few_subclasses \n", "22 Q9158768 Q1420 63 few_subclasses \n", "23 Q987767 Q1420 282 few_subclasses \n", "\n", " label \n", "0 'machine'@en \n", "1 'device'@en \n", "2 'land vehicle'@en \n", "3 'motor car'@en \n", "4 'product'@en \n", "5 'wheeled vehicle'@en \n", "6 'artificial entity'@en \n", "7 'equipment'@en \n", "8 'physical object'@en \n", "9 'product'@en \n", "10 'goods'@en \n", "11 'finished good'@en \n", "12 'perceptible object'@en \n", "13 'entity'@en \n", "14 'converter'@en \n", "15 'tool'@en \n", "16 'vehicle'@en \n", "17 'concrete object'@en \n", "18 'object'@en \n", "19 'structure'@en \n", "20 'motor vehicle'@en \n", "21 'artificial physical object'@en \n", "22 'storage'@en \n", "23 'container'@en " ] }, "execution_count": 227, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $TEMP/class-visualization.node.tsv.gz\n", " --match '(node)-[{graph: \"Q1420\"}]->()'\n", " --order-by 'node'\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 232, "metadata": { 
"execution": { "iopub.execute_input": "2021-12-31T17:34:29.568315Z", "iopub.status.busy": "2021-12-31T17:34:29.567994Z", "iopub.status.idle": "2021-12-31T17:34:32.435182Z", "shell.execute_reply": "2021-12-31T17:34:32.434584Z", "shell.execute_reply.started": "2021-12-31T17:34:29.568280Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1graphinstance_countnode_typelabeltooltip
0Q11019Q142099few_subclasses'machine'@enmachine (Q11019)<BR/>instance count: 99<BR/>no...
1Q1183543Q1420198many_subclasses'device'@endevice (Q1183543)<BR/>instance count: 198<BR/>...
2Q1301433Q142017few_subclasses'land vehicle'@enland vehicle (Q1301433)<BR/>instance count: 17...
3Q1420Q1420862many_subclasses'motor car'@enmotor car (Q1420)<BR/>instance count: 862<BR/>...
4Q15401930Q142012few_subclasses'product'@enproduct (Q15401930)<BR/>instance count: 12<BR/...
5Q15618781Q142029few_subclasses'wheeled vehicle'@enwheeled vehicle (Q15618781)<BR/>instance count...
6Q16686448Q142024few_subclasses'artificial entity'@enartificial entity (Q16686448)<BR/>instance cou...
7Q16798631Q1420389few_subclasses'equipment'@enequipment (Q16798631)<BR/>instance count: 389<...
8Q223557Q1420110few_subclasses'physical object'@enphysical object (Q223557)<BR/>instance count: ...
9Q2424752Q1420412few_subclasses'product'@enproduct (Q2424752)<BR/>instance count: 412<BR/...
10Q28877Q142011few_subclasses'goods'@engoods (Q28877)<BR/>instance count: 11<BR/>node...
11Q3245975Q14201few_subclasses'finished good'@enfinished good (Q3245975)<BR/>instance count: 1...
12Q337060Q142038few_subclasses'perceptible object'@enperceptible object (Q337060)<BR/>instance coun...
13Q35120Q142034few_subclasses'entity'@enentity (Q35120)<BR/>instance count: 34<BR/>nod...
14Q35825432Q14202few_subclasses'converter'@enconverter (Q35825432)<BR/>instance count: 2<BR...
15Q39546Q14201029many_subclasses'tool'@entool (Q39546)<BR/>instance count: 1029<BR/>nod...
16Q42889Q1420114few_subclasses'vehicle'@envehicle (Q42889)<BR/>instance count: 114<BR/>n...
17Q4406616Q1420322few_subclasses'concrete object'@enconcrete object (Q4406616)<BR/>instance count:...
18Q488383Q1420701few_subclasses'object'@enobject (Q488383)<BR/>instance count: 701<BR/>n...
19Q6671777Q142029few_subclasses'structure'@enstructure (Q6671777)<BR/>instance count: 29<BR...
20Q752870Q14208few_subclasses'motor vehicle'@enmotor vehicle (Q752870)<BR/>instance count: 8<...
21Q8205328Q142052few_subclasses'artificial physical object'@enartificial physical object (Q8205328)<BR/>inst...
22Q9158768Q142063few_subclasses'storage'@enstorage (Q9158768)<BR/>instance count: 63<BR/>...
23Q987767Q1420282few_subclasses'container'@encontainer (Q987767)<BR/>instance count: 282<BR...
\n", "
" ], "text/plain": [ " node1 graph instance_count node_type \\\n", "0 Q11019 Q1420 99 few_subclasses \n", "1 Q1183543 Q1420 198 many_subclasses \n", "2 Q1301433 Q1420 17 few_subclasses \n", "3 Q1420 Q1420 862 many_subclasses \n", "4 Q15401930 Q1420 12 few_subclasses \n", "5 Q15618781 Q1420 29 few_subclasses \n", "6 Q16686448 Q1420 24 few_subclasses \n", "7 Q16798631 Q1420 389 few_subclasses \n", "8 Q223557 Q1420 110 few_subclasses \n", "9 Q2424752 Q1420 412 few_subclasses \n", "10 Q28877 Q1420 11 few_subclasses \n", "11 Q3245975 Q1420 1 few_subclasses \n", "12 Q337060 Q1420 38 few_subclasses \n", "13 Q35120 Q1420 34 few_subclasses \n", "14 Q35825432 Q1420 2 few_subclasses \n", "15 Q39546 Q1420 1029 many_subclasses \n", "16 Q42889 Q1420 114 few_subclasses \n", "17 Q4406616 Q1420 322 few_subclasses \n", "18 Q488383 Q1420 701 few_subclasses \n", "19 Q6671777 Q1420 29 few_subclasses \n", "20 Q752870 Q1420 8 few_subclasses \n", "21 Q8205328 Q1420 52 few_subclasses \n", "22 Q9158768 Q1420 63 few_subclasses \n", "23 Q987767 Q1420 282 few_subclasses \n", "\n", " label \\\n", "0 'machine'@en \n", "1 'device'@en \n", "2 'land vehicle'@en \n", "3 'motor car'@en \n", "4 'product'@en \n", "5 'wheeled vehicle'@en \n", "6 'artificial entity'@en \n", "7 'equipment'@en \n", "8 'physical object'@en \n", "9 'product'@en \n", "10 'goods'@en \n", "11 'finished good'@en \n", "12 'perceptible object'@en \n", "13 'entity'@en \n", "14 'converter'@en \n", "15 'tool'@en \n", "16 'vehicle'@en \n", "17 'concrete object'@en \n", "18 'object'@en \n", "19 'structure'@en \n", "20 'motor vehicle'@en \n", "21 'artificial physical object'@en \n", "22 'storage'@en \n", "23 'container'@en \n", "\n", " tooltip \n", "0 machine (Q11019)
instance count: 99
no... \n", "1 device (Q1183543)
instance count: 198
... \n", "2 land vehicle (Q1301433)
instance count: 17... \n", "3 motor car (Q1420)
instance count: 862
... \n", "4 product (Q15401930)
instance count: 12
instance count... \n", "6 artificial entity (Q16686448)
instance cou... \n", "7 equipment (Q16798631)
instance count: 389<... \n", "8 physical object (Q223557)
instance count: ... \n", "9 product (Q2424752)
instance count: 412
instance count: 11
node... \n", "11 finished good (Q3245975)
instance count: 1... \n", "12 perceptible object (Q337060)
instance coun... \n", "13 entity (Q35120)
instance count: 34
nod... \n", "14 converter (Q35825432)
instance count: 2instance count: 1029
nod... \n", "16 vehicle (Q42889)
instance count: 114
n... \n", "17 concrete object (Q4406616)
instance count:... \n", "18 object (Q488383)
instance count: 701
n... \n", "19 structure (Q6671777)
instance count: 29instance count: 8<... \n", "21 artificial physical object (Q8205328)
inst... \n", "22 storage (Q9158768)
instance count: 63
... \n", "23 container (Q987767)
instance count: 282()'\n", " --order-by 'node'\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 214, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T08:29:21.356605Z", "iopub.status.busy": "2021-12-31T08:29:21.356378Z", "iopub.status.idle": "2021-12-31T08:29:34.399092Z", "shell.execute_reply": "2021-12-31T08:29:34.398417Z", "shell.execute_reply.started": "2021-12-31T08:29:21.356580Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2graphedge_type
0Q11019P279Q1183543Q1420superclass
1Q11019P279Q39546Q1420superclass
2Q1183543P279Q16686448Q1420superclass
3Q1183543P279Q16798631Q1420superclass
4Q1183543P279Q39546Q1420superclass
5Q1301433P279Q42889Q1420superclass
6Q1420P279Q752870Q1420superclass
7Q15401930P279Q488383Q1420superclass
8Q15618781P279Q1301433Q1420superclass
9Q16686448P279Q35120Q1420superclass
10Q16798631P279Q2424752Q1420superclass
11Q16798631P279Q8205328Q1420superclass
12Q223557P279Q4406616Q1420superclass
13Q2424752P279Q15401930Q1420superclass
14Q2424752P279Q28877Q1420superclass
15Q2424752P279Q8205328Q1420superclass
16Q28877P279Q337060Q1420superclass
17Q3245975P279Q2424752Q1420superclass
18Q337060P279Q223557Q1420superclass
19Q35825432P279Q35120Q1420superclass
20Q39546P279Q16798631Q1420superclass
21Q39546P279Q2424752Q1420superclass
22Q39546P279Q35825432Q1420superclass
23Q42889P279Q11019Q1420superclass
24Q42889P279Q3245975Q1420superclass
25Q42889P279Q987767Q1420superclass
26Q4406616P279Q488383Q1420superclass
27Q488383P279Q35120Q1420superclass
28Q6671777P279Q35120Q1420superclass
29Q752870P279Q15618781Q1420superclass
30Q8205328P279Q16686448Q1420superclass
31Q8205328P279Q223557Q1420superclass
32Q9158768P279Q6671777Q1420superclass
33Q987767P279Q39546Q1420superclass
34Q987767P279Q8205328Q1420superclass
35Q987767P279Q9158768Q1420superclass
\n", "
" ], "text/plain": [ " node1 label node2 graph edge_type\n", "0 Q11019 P279 Q1183543 Q1420 superclass\n", "1 Q11019 P279 Q39546 Q1420 superclass\n", "2 Q1183543 P279 Q16686448 Q1420 superclass\n", "3 Q1183543 P279 Q16798631 Q1420 superclass\n", "4 Q1183543 P279 Q39546 Q1420 superclass\n", "5 Q1301433 P279 Q42889 Q1420 superclass\n", "6 Q1420 P279 Q752870 Q1420 superclass\n", "7 Q15401930 P279 Q488383 Q1420 superclass\n", "8 Q15618781 P279 Q1301433 Q1420 superclass\n", "9 Q16686448 P279 Q35120 Q1420 superclass\n", "10 Q16798631 P279 Q2424752 Q1420 superclass\n", "11 Q16798631 P279 Q8205328 Q1420 superclass\n", "12 Q223557 P279 Q4406616 Q1420 superclass\n", "13 Q2424752 P279 Q15401930 Q1420 superclass\n", "14 Q2424752 P279 Q28877 Q1420 superclass\n", "15 Q2424752 P279 Q8205328 Q1420 superclass\n", "16 Q28877 P279 Q337060 Q1420 superclass\n", "17 Q3245975 P279 Q2424752 Q1420 superclass\n", "18 Q337060 P279 Q223557 Q1420 superclass\n", "19 Q35825432 P279 Q35120 Q1420 superclass\n", "20 Q39546 P279 Q16798631 Q1420 superclass\n", "21 Q39546 P279 Q2424752 Q1420 superclass\n", "22 Q39546 P279 Q35825432 Q1420 superclass\n", "23 Q42889 P279 Q11019 Q1420 superclass\n", "24 Q42889 P279 Q3245975 Q1420 superclass\n", "25 Q42889 P279 Q987767 Q1420 superclass\n", "26 Q4406616 P279 Q488383 Q1420 superclass\n", "27 Q488383 P279 Q35120 Q1420 superclass\n", "28 Q6671777 P279 Q35120 Q1420 superclass\n", "29 Q752870 P279 Q15618781 Q1420 superclass\n", "30 Q8205328 P279 Q16686448 Q1420 superclass\n", "31 Q8205328 P279 Q223557 Q1420 superclass\n", "32 Q9158768 P279 Q6671777 Q1420 superclass\n", "33 Q987767 P279 Q39546 Q1420 superclass\n", "34 Q987767 P279 Q8205328 Q1420 superclass\n", "35 Q987767 P279 Q9158768 Q1420 superclass" ] }, "execution_count": 214, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i graphbrowser\n", " --match '(node)-[{graph: \"Q1420\"}]->()'\n", " --order-by 'node'\n", "\"\"\")" ] }, { "cell_type": "code", 
"execution_count": 215, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T08:29:34.401024Z", "iopub.status.busy": "2021-12-31T08:29:34.400813Z", "iopub.status.idle": "2021-12-31T08:29:36.392851Z", "shell.execute_reply": "2021-12-31T08:29:36.392273Z", "shell.execute_reply.started": "2021-12-31T08:29:34.401000Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2graphedge_type
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [node1, label, node2, graph, edge_type]\n", "Index: []" ] }, "execution_count": 215, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $TEMP/graph.high.tsv.gz\n", " --match '(node)-[{graph: \"Q1420\"}]->()'\n", " --order-by 'node'\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 216, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T08:29:36.394019Z", "iopub.status.busy": "2021-12-31T08:29:36.393858Z", "iopub.status.idle": "2021-12-31T08:30:07.925774Z", "shell.execute_reply": "2021-12-31T08:30:07.924705Z", "shell.execute_reply.started": "2021-12-31T08:29:36.393997Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2graphedge_type
0Q11019P279Q1183543Q1420superclass
1Q11019P279Q39546Q1420superclass
2Q1183543P279Q16686448Q1420superclass
3Q1183543P279Q16798631Q1420superclass
4Q1183543P279Q39546Q1420superclass
5Q1301433P279Q42889Q1420superclass
6Q1420P279Q752870Q1420superclass
7Q15401930P279Q488383Q1420superclass
8Q15618781P279Q1301433Q1420superclass
9Q16686448P279Q35120Q1420superclass
10Q16798631P279Q2424752Q1420superclass
11Q16798631P279Q8205328Q1420superclass
12Q223557P279Q4406616Q1420superclass
13Q2424752P279Q15401930Q1420superclass
14Q2424752P279Q28877Q1420superclass
15Q2424752P279Q8205328Q1420superclass
16Q28877P279Q337060Q1420superclass
17Q3245975P279Q2424752Q1420superclass
18Q337060P279Q223557Q1420superclass
19Q35825432P279Q35120Q1420superclass
20Q39546P279Q16798631Q1420superclass
21Q39546P279Q2424752Q1420superclass
22Q39546P279Q35825432Q1420superclass
23Q42889P279Q11019Q1420superclass
24Q42889P279Q3245975Q1420superclass
25Q42889P279Q987767Q1420superclass
26Q4406616P279Q488383Q1420superclass
27Q488383P279Q35120Q1420superclass
28Q6671777P279Q35120Q1420superclass
29Q752870P279Q15618781Q1420superclass
30Q8205328P279Q16686448Q1420superclass
31Q8205328P279Q223557Q1420superclass
32Q9158768P279Q6671777Q1420superclass
33Q987767P279Q39546Q1420superclass
34Q987767P279Q8205328Q1420superclass
35Q987767P279Q9158768Q1420superclass
\n", "
" ], "text/plain": [ " node1 label node2 graph edge_type\n", "0 Q11019 P279 Q1183543 Q1420 superclass\n", "1 Q11019 P279 Q39546 Q1420 superclass\n", "2 Q1183543 P279 Q16686448 Q1420 superclass\n", "3 Q1183543 P279 Q16798631 Q1420 superclass\n", "4 Q1183543 P279 Q39546 Q1420 superclass\n", "5 Q1301433 P279 Q42889 Q1420 superclass\n", "6 Q1420 P279 Q752870 Q1420 superclass\n", "7 Q15401930 P279 Q488383 Q1420 superclass\n", "8 Q15618781 P279 Q1301433 Q1420 superclass\n", "9 Q16686448 P279 Q35120 Q1420 superclass\n", "10 Q16798631 P279 Q2424752 Q1420 superclass\n", "11 Q16798631 P279 Q8205328 Q1420 superclass\n", "12 Q223557 P279 Q4406616 Q1420 superclass\n", "13 Q2424752 P279 Q15401930 Q1420 superclass\n", "14 Q2424752 P279 Q28877 Q1420 superclass\n", "15 Q2424752 P279 Q8205328 Q1420 superclass\n", "16 Q28877 P279 Q337060 Q1420 superclass\n", "17 Q3245975 P279 Q2424752 Q1420 superclass\n", "18 Q337060 P279 Q223557 Q1420 superclass\n", "19 Q35825432 P279 Q35120 Q1420 superclass\n", "20 Q39546 P279 Q16798631 Q1420 superclass\n", "21 Q39546 P279 Q2424752 Q1420 superclass\n", "22 Q39546 P279 Q35825432 Q1420 superclass\n", "23 Q42889 P279 Q11019 Q1420 superclass\n", "24 Q42889 P279 Q3245975 Q1420 superclass\n", "25 Q42889 P279 Q987767 Q1420 superclass\n", "26 Q4406616 P279 Q488383 Q1420 superclass\n", "27 Q488383 P279 Q35120 Q1420 superclass\n", "28 Q6671777 P279 Q35120 Q1420 superclass\n", "29 Q752870 P279 Q15618781 Q1420 superclass\n", "30 Q8205328 P279 Q16686448 Q1420 superclass\n", "31 Q8205328 P279 Q223557 Q1420 superclass\n", "32 Q9158768 P279 Q6671777 Q1420 superclass\n", "33 Q987767 P279 Q39546 Q1420 superclass\n", "34 Q987767 P279 Q8205328 Q1420 superclass\n", "35 Q987767 P279 Q9158768 Q1420 superclass" ] }, "execution_count": 216, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $TEMP/graph.low.tsv.gz\n", " --match '(node)-[{graph: \"Q1420\"}]->()'\n", " --order-by 'node'\n", "\"\"\")" ] }, { "cell_type": "code", 
"execution_count": 217, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T08:30:07.928220Z", "iopub.status.busy": "2021-12-31T08:30:07.927917Z", "iopub.status.idle": "2021-12-31T08:30:29.665757Z", "shell.execute_reply": "2021-12-31T08:30:29.665176Z", "shell.execute_reply.started": "2021-12-31T08:30:07.928191Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2graphedge_type
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [node1, label, node2, graph, edge_type]\n", "Index: []" ] }, "execution_count": 217, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $TEMP/all.graph.low.sub.tsv.gz\n", " --match '(node)-[{graph: \"Q1420\"}]->()'\n", " --order-by 'node'\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 218, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T08:30:29.668385Z", "iopub.status.busy": "2021-12-31T08:30:29.668199Z", "iopub.status.idle": "2021-12-31T08:32:04.988275Z", "shell.execute_reply": "2021-12-31T08:32:04.987475Z", "shell.execute_reply.started": "2021-12-31T08:30:29.668368Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2graphedge_type
0Q11019P279Q1183543Q1420superclass
1Q11019P279Q39546Q1420superclass
2Q1183543P279Q16686448Q1420superclass
3Q1183543P279Q16798631Q1420superclass
4Q1183543P279Q39546Q1420superclass
5Q1301433P279Q42889Q1420superclass
6Q1420P279Q752870Q1420superclass
7Q15401930P279Q488383Q1420superclass
8Q15618781P279Q1301433Q1420superclass
9Q16686448P279Q35120Q1420superclass
10Q16798631P279Q2424752Q1420superclass
11Q16798631P279Q8205328Q1420superclass
12Q223557P279Q4406616Q1420superclass
13Q2424752P279Q15401930Q1420superclass
14Q2424752P279Q28877Q1420superclass
15Q2424752P279Q8205328Q1420superclass
16Q28877P279Q337060Q1420superclass
17Q3245975P279Q2424752Q1420superclass
18Q337060P279Q223557Q1420superclass
19Q35825432P279Q35120Q1420superclass
20Q39546P279Q16798631Q1420superclass
21Q39546P279Q2424752Q1420superclass
22Q39546P279Q35825432Q1420superclass
23Q42889P279Q11019Q1420superclass
24Q42889P279Q3245975Q1420superclass
25Q42889P279Q987767Q1420superclass
26Q4406616P279Q488383Q1420superclass
27Q488383P279Q35120Q1420superclass
28Q6671777P279Q35120Q1420superclass
29Q752870P279Q15618781Q1420superclass
30Q8205328P279Q16686448Q1420superclass
31Q8205328P279Q223557Q1420superclass
32Q9158768P279Q6671777Q1420superclass
33Q987767P279Q39546Q1420superclass
34Q987767P279Q8205328Q1420superclass
35Q987767P279Q9158768Q1420superclass
\n", "
" ], "text/plain": [ " node1 label node2 graph edge_type\n", "0 Q11019 P279 Q1183543 Q1420 superclass\n", "1 Q11019 P279 Q39546 Q1420 superclass\n", "2 Q1183543 P279 Q16686448 Q1420 superclass\n", "3 Q1183543 P279 Q16798631 Q1420 superclass\n", "4 Q1183543 P279 Q39546 Q1420 superclass\n", "5 Q1301433 P279 Q42889 Q1420 superclass\n", "6 Q1420 P279 Q752870 Q1420 superclass\n", "7 Q15401930 P279 Q488383 Q1420 superclass\n", "8 Q15618781 P279 Q1301433 Q1420 superclass\n", "9 Q16686448 P279 Q35120 Q1420 superclass\n", "10 Q16798631 P279 Q2424752 Q1420 superclass\n", "11 Q16798631 P279 Q8205328 Q1420 superclass\n", "12 Q223557 P279 Q4406616 Q1420 superclass\n", "13 Q2424752 P279 Q15401930 Q1420 superclass\n", "14 Q2424752 P279 Q28877 Q1420 superclass\n", "15 Q2424752 P279 Q8205328 Q1420 superclass\n", "16 Q28877 P279 Q337060 Q1420 superclass\n", "17 Q3245975 P279 Q2424752 Q1420 superclass\n", "18 Q337060 P279 Q223557 Q1420 superclass\n", "19 Q35825432 P279 Q35120 Q1420 superclass\n", "20 Q39546 P279 Q16798631 Q1420 superclass\n", "21 Q39546 P279 Q2424752 Q1420 superclass\n", "22 Q39546 P279 Q35825432 Q1420 superclass\n", "23 Q42889 P279 Q11019 Q1420 superclass\n", "24 Q42889 P279 Q3245975 Q1420 superclass\n", "25 Q42889 P279 Q987767 Q1420 superclass\n", "26 Q4406616 P279 Q488383 Q1420 superclass\n", "27 Q488383 P279 Q35120 Q1420 superclass\n", "28 Q6671777 P279 Q35120 Q1420 superclass\n", "29 Q752870 P279 Q15618781 Q1420 superclass\n", "30 Q8205328 P279 Q16686448 Q1420 superclass\n", "31 Q8205328 P279 Q223557 Q1420 superclass\n", "32 Q9158768 P279 Q6671777 Q1420 superclass\n", "33 Q987767 P279 Q39546 Q1420 superclass\n", "34 Q987767 P279 Q8205328 Q1420 superclass\n", "35 Q987767 P279 Q9158768 Q1420 superclass" ] }, "execution_count": 218, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $TEMP/all.graph.low.super.tsv.gz\n", " --match '(node)-[{graph: \"Q1420\"}]->()'\n", " --order-by 'node'\n", "\"\"\")" ] }, { "cell_type": 
"code", "execution_count": 219, "metadata": { "execution": { "iopub.execute_input": "2021-12-31T08:32:04.990543Z", "iopub.status.busy": "2021-12-31T08:32:04.990275Z", "iopub.status.idle": "2021-12-31T08:43:25.333732Z", "shell.execute_reply": "2021-12-31T08:43:25.333041Z", "shell.execute_reply.started": "2021-12-31T08:32:04.990525Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1graphinstance_countnode_typelabel
0Q11019Q142099few_subclasses'machine'@en
1Q1183543Q1420198many_subclasses'device'@en
2Q1301433Q142017few_subclasses'land vehicle'@en
3Q1420Q1420862many_subclasses'motor car'@en
4Q15401930Q142012few_subclasses'product'@en
5Q15618781Q142029few_subclasses'wheeled vehicle'@en
6Q16686448Q142024few_subclasses'artificial entity'@en
7Q16798631Q1420389few_subclasses'equipment'@en
8Q223557Q1420110few_subclasses'physical object'@en
9Q2424752Q1420412few_subclasses'product'@en
10Q28877Q142011few_subclasses'goods'@en
11Q3245975Q14201few_subclasses'finished good'@en
12Q337060Q142038few_subclasses'perceptible object'@en
13Q35120Q142034few_subclasses'entity'@en
14Q35825432Q14202few_subclasses'converter'@en
15Q39546Q14201029many_subclasses'tool'@en
16Q42889Q1420114few_subclasses'vehicle'@en
17Q4406616Q1420322few_subclasses'concrete object'@en
18Q488383Q1420701few_subclasses'object'@en
19Q6671777Q142029few_subclasses'structure'@en
20Q752870Q14208few_subclasses'motor vehicle'@en
21Q8205328Q142052few_subclasses'artificial physical object'@en
22Q9158768Q142063few_subclasses'storage'@en
23Q987767Q1420282few_subclasses'container'@en
\n", "
" ], "text/plain": [ " node1 graph instance_count node_type \\\n", "0 Q11019 Q1420 99 few_subclasses \n", "1 Q1183543 Q1420 198 many_subclasses \n", "2 Q1301433 Q1420 17 few_subclasses \n", "3 Q1420 Q1420 862 many_subclasses \n", "4 Q15401930 Q1420 12 few_subclasses \n", "5 Q15618781 Q1420 29 few_subclasses \n", "6 Q16686448 Q1420 24 few_subclasses \n", "7 Q16798631 Q1420 389 few_subclasses \n", "8 Q223557 Q1420 110 few_subclasses \n", "9 Q2424752 Q1420 412 few_subclasses \n", "10 Q28877 Q1420 11 few_subclasses \n", "11 Q3245975 Q1420 1 few_subclasses \n", "12 Q337060 Q1420 38 few_subclasses \n", "13 Q35120 Q1420 34 few_subclasses \n", "14 Q35825432 Q1420 2 few_subclasses \n", "15 Q39546 Q1420 1029 many_subclasses \n", "16 Q42889 Q1420 114 few_subclasses \n", "17 Q4406616 Q1420 322 few_subclasses \n", "18 Q488383 Q1420 701 few_subclasses \n", "19 Q6671777 Q1420 29 few_subclasses \n", "20 Q752870 Q1420 8 few_subclasses \n", "21 Q8205328 Q1420 52 few_subclasses \n", "22 Q9158768 Q1420 63 few_subclasses \n", "23 Q987767 Q1420 282 few_subclasses \n", "\n", " label \n", "0 'machine'@en \n", "1 'device'@en \n", "2 'land vehicle'@en \n", "3 'motor car'@en \n", "4 'product'@en \n", "5 'wheeled vehicle'@en \n", "6 'artificial entity'@en \n", "7 'equipment'@en \n", "8 'physical object'@en \n", "9 'product'@en \n", "10 'goods'@en \n", "11 'finished good'@en \n", "12 'perceptible object'@en \n", "13 'entity'@en \n", "14 'converter'@en \n", "15 'tool'@en \n", "16 'vehicle'@en \n", "17 'concrete object'@en \n", "18 'object'@en \n", "19 'structure'@en \n", "20 'motor vehicle'@en \n", "21 'artificial physical object'@en \n", "22 'storage'@en \n", "23 'container'@en " ] }, "execution_count": 219, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $TEMP/graph.low.node.tsv.gz\n", " --match '(node)-[{graph: \"Q1420\"}]->()'\n", " --order-by 'node'\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], 
"source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "### In progress: Trim the subclasses based on the levels\n", "\n", "The idea is to also trim the graph based on the number of levels, this may be difficult as I think some small graphs may have lots of levels, and some graphs may become large with just a small number of levels." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is our starting point:" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T14:17:24.935299Z", "iopub.status.busy": "2021-12-29T14:17:24.935039Z", "iopub.status.idle": "2021-12-29T14:17:27.057601Z", "shell.execute_reply": "2021-12-29T14:17:27.056689Z", "shell.execute_reply.started": "2021-12-29T14:17:24.935267Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2distanceid
0Q100000030P279starQ1000000300Q100000030-P279star-Q100000030
1Q100000030P279starQ147481Q100000030-P279star-Q14748
2Q100000030P279starQ147452Q100000030-P279star-Q14745
3Q100000030P279starQ13577613Q100000030-P279star-Q1357761
4Q100000030P279starQ24247523Q100000030-P279star-Q2424752
\n", "
" ], "text/plain": [ " node1 label node2 distance id\n", "0 Q100000030 P279star Q100000030 0 Q100000030-P279star-Q100000030\n", "1 Q100000030 P279star Q14748 1 Q100000030-P279star-Q14748\n", "2 Q100000030 P279star Q14745 2 Q100000030-P279star-Q14745\n", "3 Q100000030 P279star Q1357761 3 Q100000030-P279star-Q1357761\n", "4 Q100000030 P279star Q2424752 3 Q100000030-P279star-Q2424752" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -i $OUT/derived.p279star.complete.tsv.gz -n 5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's look at the distribution of distances" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T14:17:27.059230Z", "iopub.status.busy": "2021-12-29T14:17:27.058956Z", "iopub.status.idle": "2021-12-29T14:18:51.300383Z", "shell.execute_reply": "2021-12-29T14:18:51.299311Z", "shell.execute_reply.started": "2021-12-29T14:17:27.059198Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
distancecount
0614920344
1412395081
2512068280
3311432165
478660425
526976960
686681393
794448827
813077658
902503943
10101873495
11111159780
1212781940
1313362901
1414216027
1515119855
161655762
171727343
181812478
19195166
20202427
2121659
2222188
232315
\n", "
" ], "text/plain": [ " distance count\n", "0 6 14920344\n", "1 4 12395081\n", "2 5 12068280\n", "3 3 11432165\n", "4 7 8660425\n", "5 2 6976960\n", "6 8 6681393\n", "7 9 4448827\n", "8 1 3077658\n", "9 0 2503943\n", "10 10 1873495\n", "11 11 1159780\n", "12 12 781940\n", "13 13 362901\n", "14 14 216027\n", "15 15 119855\n", "16 16 55762\n", "17 17 27343\n", "18 18 12478\n", "19 19 5166\n", "20 20 2427\n", "21 21 659\n", "22 22 188\n", "23 23 15" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i p279starcomplete\n", " --match '(class)-[eid {distance: d}]->(superclass)'\n", " --return 'distinct d as distance, count(eid) as count'\n", " --order-by 'cast(count, int) desc'\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Filter the `p279starcomplete` file to keep only the subclasses with distance < K=10" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T14:18:51.303245Z", "iopub.status.busy": "2021-12-29T14:18:51.302746Z", "iopub.status.idle": "2021-12-29T14:20:52.297428Z", "shell.execute_reply": "2021-12-29T14:20:52.296441Z", "shell.execute_reply.started": "2021-12-29T14:18:51.303199Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i p279stard\n", " --match '(subclass)-[eid {distance: d}]->(class)'\n", " --return 'class as node1, \"Pcount_subclasses\" as label, count(distinct subclass) as node2'\n", " --where 'subclass != class and d < 9'\n", " --order-by 'cast(node2, int) desc'\n", " -o $TEMP/subclass.count.d10.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`kgtk add-labels` drives me crazy, as it takes sooooo long." 
] }, { "cell_type": "code", "execution_count": 73, "metadata": { "execution": { "iopub.execute_input": "2021-12-29T14:20:52.299701Z", "iopub.status.busy": "2021-12-29T14:20:52.299403Z", "iopub.status.idle": "2021-12-29T14:22:33.279199Z", "shell.execute_reply": "2021-12-29T14:22:33.277680Z", "shell.execute_reply.started": "2021-12-29T14:20:52.299672Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "zcat: error writing to output: Broken pipe\n", "| node1 | label | node2 | node1;label |\n", "| --------- | ----------------- | ------- | --------------------------- |\n", "| Q35120 | Pcount_subclasses | 2366995 | 'entity'@en |\n", "| Q99527517 | Pcount_subclasses | 1440970 | 'collection entity'@en |\n", "| Q16887380 | Pcount_subclasses | 1326944 | 'group'@en |\n", "| Q20937557 | Pcount_subclasses | 1255680 | 'series'@en |\n", "| Q28813620 | Pcount_subclasses | 1226806 | 'set'@en |\n", "| Q488383 | Pcount_subclasses | 1185270 | 'object'@en |\n", "| Q4406616 | Pcount_subclasses | 1144700 | 'concrete object'@en |\n", "| Q223557 | Pcount_subclasses | 1136457 | 'physical object'@en |\n", "| Q6671777 | Pcount_subclasses | 1110651 | 'structure'@en |\n", "| Q58415929 | Pcount_subclasses | 1091001 | 'spatio-temporal entity'@en |\n", "| Q219858 | Pcount_subclasses | 1056942 | 'zone'@en |\n", "| Q50365914 | Pcount_subclasses | 1056855 | 'biological region'@en |\n", "| Q97669203 | Pcount_subclasses | 1007358 | 'molecular conformation'@en |\n", "| Q15712714 | Pcount_subclasses | 1007317 | 'biomolecular structure'@en |\n", "| Q3511065 | Pcount_subclasses | 1007234 | 'biological sequence'@en |\n", "| Q7187 | Pcount_subclasses | 1004629 | 'gene'@en |\n", "| Q3771876 | Pcount_subclasses | 1004622 | 'nucleic acid structure'@en |\n", "| Q37500013 | Pcount_subclasses | 1004619 | 'primary structure'@en |\n", "| Q863908 | Pcount_subclasses | 1004579 | 'nucleic acid sequence'@en |\n" ] } ], "source": [ "!zcat < $TEMP/subclass.count.d10.tsv.gz | head -20 | 
kgtk add-labels / table" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "kgtk", "language": "python", "name": "kgtk" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 4 }