{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Profiling To Support The Browser\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preamble: set up the environment and files used in the tutorial" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2021-12-12T04:06:35.455863Z", "iopub.status.busy": "2021-12-12T04:06:35.455528Z", "iopub.status.idle": "2021-12-12T04:06:38.105977Z", "shell.execute_reply": "2021-12-12T04:06:38.105152Z", "shell.execute_reply.started": "2021-12-12T04:06:35.455800Z" }, "tags": [] }, "outputs": [], "source": [ "import io\n", "import os\n", "import subprocess\n", "import sys\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from IPython.display import display, HTML\n", "\n", "from graph_tool.all import *\n", "\n", "import papermill as pm\n", "\n", "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", "from kgtk.functions import kgtk, kypher" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2021-12-12T04:06:38.108114Z", "iopub.status.busy": "2021-12-12T04:06:38.107561Z", "iopub.status.idle": "2021-12-12T04:06:38.111493Z", "shell.execute_reply": "2021-12-12T04:06:38.110982Z", "shell.execute_reply.started": "2021-12-12T04:06:38.108075Z" }, "tags": [ "parameters" ] }, "outputs": [], "source": [ "# Parameters\n", "\n", "kgtk_path = \"/Users/pedroszekely/Documents/GitHub/kgtk\"\n", "\n", "# Folder on local machine where to create the output and temporary folders\n", "input_path = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/\"\n", "input_path = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2/\"\n", "output_path = \"/Users/pedroszekely/Downloads/kypher/projects\"\n", "graph_cache_path = \"/Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db\"\n", "project_name = \"browser-profiling\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our Wikidata distribution partitions the knowledge in Wikidata into smaller files that make it possible for you to pick and choose which files you want to use. Our tutorial KG is a subset of Wikidata, and is partitioned in the same way as the full Wikidata. The following is a partial list of all the files:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2021-12-12T04:06:38.780937Z", "iopub.status.busy": "2021-12-12T04:06:38.780747Z", "iopub.status.idle": "2021-12-12T04:06:38.785867Z", "shell.execute_reply": "2021-12-12T04:06:38.785361Z", "shell.execute_reply.started": "2021-12-12T04:06:38.780920Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User home: /Users/pedroszekely\n", "Current dir: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/use-cases\n", "KGTK dir: /Users/pedroszekely/Documents/GitHub/kgtk\n", "Use-cases dir: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases\n" ] } ], "source": [ "files = [\n", " \"claims\",\n", " \"item\",\n", " \"p279star\",\n", " \"label\"\n", "]\n", "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", " output_path=output_path,\n", " graph_cache_path=graph_cache_path,\n", " project_name=project_name,\n", " debug=True\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The KGTK setup command defines environment variables for all the files so that you can reuse the Jupyter notebook when you install it on your local machine." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2021-12-12T04:06:40.129461Z", "iopub.status.busy": "2021-12-12T04:06:40.129256Z", "iopub.status.idle": "2021-12-12T04:06:40.132852Z", "shell.execute_reply": "2021-12-12T04:06:40.132431Z", "shell.execute_reply.started": "2021-12-12T04:06:40.129443Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GRAPH: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2/\n", "OUT: /Users/pedroszekely/Downloads/kypher/projects/browser-profiling\n", "kgtk: kgtk --debug\n", "kypher: kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db\n", "KGTK_GRAPH_CACHE: /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db\n", "KGTK_LABEL_FILE: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//labels.en.tsv.gz\n", "STORE: /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db\n", "KGTK_OPTION_DEBUG: false\n", "USE_CASES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases\n", "TEMP: /Users/pedroszekely/Downloads/kypher/projects/browser-profiling/temp.browser-profiling\n", "EXAMPLES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/examples\n", "claims: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//claims.tsv.gz\n", "item: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//claims.wikibase-item.tsv.gz\n", "p279star: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//derived.P279star.tsv.gz\n", "label: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//labels.en.tsv.gz\n" ] } ], "source": [ "ck.print_env_variables()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2021-12-12T04:06:45.349466Z", "iopub.status.busy": "2021-12-12T04:06:45.349214Z", "iopub.status.idle": "2021-12-12T05:09:05.724144Z", "shell.execute_reply": "2021-12-12T05:09:05.722696Z", "shell.execute_reply.started": "2021-12-12T04:06:45.349447Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//claims.tsv.gz\" --as claims -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//claims.wikibase-item.tsv.gz\" --as item -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//derived.P279star.tsv.gz\" --as p279star -i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//labels.en.tsv.gz\" --as label --limit 3\n", "[2021-12-11 20:06:46 sqlstore]: IMPORT graph directly into table graph_4 from /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2/claims.tsv.gz ...\n", "[2021-12-11 21:09:05 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT *\n", " FROM graph_4 AS graph_4_c1\n", " LIMIT ?\n", " PARAS: [3]\n", "---------------------------------------------\n", "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", "P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\tnormal\turl\n", "P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\tnormal\turl\n", "P10-P1629-Q34508-bcc39400-0\tP10\tP1629\tQ34508\tnormal\twikibase-item\n" ] } ], "source": [ "ck.load_files_into_cache()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# best indexing for this notebook\n", "# !kgtk --debug query -i claims --idx mode:graph -i p279star --idx mode:monograph" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2021-11-30T18:18:15.352846Z", "iopub.status.busy": "2021-11-30T18:18:15.352532Z", "iopub.status.idle": "2021-11-30T18:18:15.356311Z", "shell.execute_reply": "2021-11-30T18:18:15.355711Z", "shell.execute_reply.started": "2021-11-30T18:18:15.352809Z" }, "tags": [] }, "outputs": [], "source": [ "# how to. drop existing indices to create new ones\n", "# !kgtk --debug query -i claims --idx mode:clear mode:graph -i p279star --idx mode:clear mode:monograph --limit 5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Profile incoming edges\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 19, "metadata": { "execution": { "iopub.execute_input": "2021-11-30T18:54:47.930137Z", "iopub.status.busy": "2021-11-30T18:54:47.929845Z", "iopub.status.idle": "2021-11-30T19:09:18.606185Z", "shell.execute_reply": "2021-11-30T19:09:18.605127Z", "shell.execute_reply.started": "2021-11-30T18:54:47.930105Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i item\n", " --match '(n1)-[edgeid {label: property}]->(n2)'\n", " --return 'distinct n2 as node1, \"Pincoming_statement_count\" as label, property as node2, count(edgeid) as P1114'\n", " -o $OUT/incoming.property.count.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 26, "metadata": { "execution": { "iopub.execute_input": "2021-11-30T20:35:32.434104Z", "iopub.status.busy": "2021-11-30T20:35:32.433579Z", "iopub.status.idle": "2021-11-30T20:35:37.721978Z", "shell.execute_reply": "2021-11-30T20:35:37.721134Z", "shell.execute_reply.started": "2021-11-30T20:35:32.434073Z" }, "tags": [] }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i $OUT/incoming.property.count.tsv.gz\n", " --match '(n1)-[e {P1114: quantity}]->(property)'\n", " --where 'cast(quantity, int) > 25'\n", " --order-by 'n1, cast(quantity, int) desc'\n", " -o $OUT/incoming.property.count.25.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "execution": { "iopub.execute_input": "2021-12-01T00:15:01.718006Z", "iopub.status.busy": "2021-12-01T00:15:01.717504Z", "iopub.status.idle": "2021-12-01T00:15:01.907188Z", "shell.execute_reply": "2021-12-01T00:15:01.906572Z", "shell.execute_reply.started": "2021-12-01T00:15:01.717977Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 342816\n" ] } ], "source": [ "!zcat < $OUT/incoming.property.count.25.tsv.gz | wc -l" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 45, "metadata": { "execution": { "iopub.execute_input": "2021-12-01T04:52:58.020933Z", "iopub.status.busy": "2021-12-01T04:52:58.020660Z", "iopub.status.idle": "2021-12-01T04:54:11.117036Z", "shell.execute_reply": "2021-12-01T04:54:11.116459Z", "shell.execute_reply.started": "2021-12-01T04:52:58.020907Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
propertytotalproperty;label
0P13153811'located in the administrative territorial ent...
1P17119082'parent taxon'@en
2P36118820'part of'@en
3P68417790'ortholog'@en
4P3113658'instance of'@en
5P73412763'family name'@en
6P16112532'cast member'@en
7P1911403'place of birth'@en
8P7359824'given name'@en
9P548519'member of sports team'@en
10P2767476'location'@en
11P7036932'found in taxon'@en
12P1086131'employer'@en
13P695799'educated at'@en
14P505707'author'@en
15P6824772'biological process'@en
16P1704445'creator'@en
17P1284382'regulates (molecular biology)'@en
18P5274358'has part'@en
19P1664339'award received'@en
20P393927'position held'@en
21P203839'place of death'@en
22P2793648'subclass of'@en
23P9713442'category combines topics'@en
24P1753355'performer'@en
\n", "
" ], "text/plain": [ " property total property;label\n", "0 P131 53811 'located in the administrative territorial ent...\n", "1 P171 19082 'parent taxon'@en\n", "2 P361 18820 'part of'@en\n", "3 P684 17790 'ortholog'@en\n", "4 P31 13658 'instance of'@en\n", "5 P734 12763 'family name'@en\n", "6 P161 12532 'cast member'@en\n", "7 P19 11403 'place of birth'@en\n", "8 P735 9824 'given name'@en\n", "9 P54 8519 'member of sports team'@en\n", "10 P276 7476 'location'@en\n", "11 P703 6932 'found in taxon'@en\n", "12 P108 6131 'employer'@en\n", "13 P69 5799 'educated at'@en\n", "14 P50 5707 'author'@en\n", "15 P682 4772 'biological process'@en\n", "16 P170 4445 'creator'@en\n", "17 P128 4382 'regulates (molecular biology)'@en\n", "18 P527 4358 'has part'@en\n", "19 P166 4339 'award received'@en\n", "20 P39 3927 'position held'@en\n", "21 P20 3839 'place of death'@en\n", "22 P279 3648 'subclass of'@en\n", "23 P971 3442 'category combines topics'@en\n", "24 P175 3355 'performer'@en" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $OUT/incoming.property.count.25.tsv.gz\n", " --match '(n1)-[e {P1114: quantity}]->(property)'\n", " --return 'property as property, count(distinct n1) as total'\n", " --order-by 'cast(total, int) desc'\n", " --limit 25\n", " / add-labels\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 28, "metadata": { "execution": { "iopub.execute_input": "2021-11-30T20:37:11.302868Z", "iopub.status.busy": "2021-11-30T20:37:11.302658Z", "iopub.status.idle": "2021-11-30T20:38:23.969482Z", "shell.execute_reply": "2021-11-30T20:38:23.968888Z", "shell.execute_reply.started": "2021-11-30T20:37:11.302844Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2P1114node1;labelnode2;label
0Q100Pincoming_statement_countP194141'Boston'@en'place of birth'@en
1Q100Pincoming_statement_countP201649'Boston'@en'place of death'@en
2Q100Pincoming_statement_countP2911572'Boston'@en'place of publication'@en
3Q100Pincoming_statement_countP9371305'Boston'@en'work location'@en
4Q100Pincoming_statement_countP1591026'Boston'@en'headquarters location'@en
5Q100Pincoming_statement_countP1311006'Boston'@en'located in the administrative territorial ent...
6Q100Pincoming_statement_countP551355'Boston'@en'residence'@en
7Q100Pincoming_statement_countP1071336'Boston'@en'location of creation'@en
8Q100Pincoming_statement_countP740259'Boston'@en'location of formation'@en
9Q100Pincoming_statement_countP840195'Boston'@en'narrative location'@en
\n", "
" ], "text/plain": [ " node1 label node2 P1114 node1;label \\\n", "0 Q100 Pincoming_statement_count P19 4141 'Boston'@en \n", "1 Q100 Pincoming_statement_count P20 1649 'Boston'@en \n", "2 Q100 Pincoming_statement_count P291 1572 'Boston'@en \n", "3 Q100 Pincoming_statement_count P937 1305 'Boston'@en \n", "4 Q100 Pincoming_statement_count P159 1026 'Boston'@en \n", "5 Q100 Pincoming_statement_count P131 1006 'Boston'@en \n", "6 Q100 Pincoming_statement_count P551 355 'Boston'@en \n", "7 Q100 Pincoming_statement_count P1071 336 'Boston'@en \n", "8 Q100 Pincoming_statement_count P740 259 'Boston'@en \n", "9 Q100 Pincoming_statement_count P840 195 'Boston'@en \n", "\n", " node2;label \n", "0 'place of birth'@en \n", "1 'place of death'@en \n", "2 'place of publication'@en \n", "3 'work location'@en \n", "4 'headquarters location'@en \n", "5 'located in the administrative territorial ent... \n", "6 'residence'@en \n", "7 'location of creation'@en \n", "8 'location of formation'@en \n", "9 'narrative location'@en " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -i $OUT/incoming.property.count.25.tsv.gz / add-labels\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 31, "metadata": { "execution": { "iopub.execute_input": "2021-11-30T20:40:37.257076Z", "iopub.status.busy": "2021-11-30T20:40:37.256805Z", "iopub.status.idle": "2021-11-30T20:41:48.783776Z", "shell.execute_reply": "2021-11-30T20:41:48.783233Z", "shell.execute_reply.started": "2021-11-30T20:40:37.257048Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2P1114node1;labelnode2;label
0Q76Pincoming_statement_countP50302'Barack Obama'@en'author'@en
1Q76Pincoming_statement_countP991109'Barack Obama'@en'successful candidate'@en
2Q76Pincoming_statement_countP726106'Barack Obama'@en'candidate'@en
3Q76Pincoming_statement_countP92147'Barack Obama'@en'main subject'@en
4Q76Pincoming_statement_countP189137'Barack Obama'@en'signatory'@en
5Q76Pincoming_statement_countP16134'Barack Obama'@en'cast member'@en
6Q76Pincoming_statement_countP13833'Barack Obama'@en'named after'@en
7Q76Pincoming_statement_countP82330'Barack Obama'@en'speaker'@en
\n", "
" ], "text/plain": [ " node1 label node2 P1114 node1;label \\\n", "0 Q76 Pincoming_statement_count P50 302 'Barack Obama'@en \n", "1 Q76 Pincoming_statement_count P991 109 'Barack Obama'@en \n", "2 Q76 Pincoming_statement_count P726 106 'Barack Obama'@en \n", "3 Q76 Pincoming_statement_count P921 47 'Barack Obama'@en \n", "4 Q76 Pincoming_statement_count P1891 37 'Barack Obama'@en \n", "5 Q76 Pincoming_statement_count P161 34 'Barack Obama'@en \n", "6 Q76 Pincoming_statement_count P138 33 'Barack Obama'@en \n", "7 Q76 Pincoming_statement_count P823 30 'Barack Obama'@en \n", "\n", " node2;label \n", "0 'author'@en \n", "1 'successful candidate'@en \n", "2 'candidate'@en \n", "3 'main subject'@en \n", "4 'signatory'@en \n", "5 'cast member'@en \n", "6 'named after'@en \n", "7 'speaker'@en " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $OUT/incoming.property.count.25.tsv.gz\n", " --match '(entity)-[e {P1114: quantity}]->(property)'\n", " --where 'entity = \"Q76\"'\n", " / add-labels\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 32, "metadata": { "execution": { "iopub.execute_input": "2021-11-30T22:26:28.892450Z", "iopub.status.busy": "2021-11-30T22:26:28.892071Z", "iopub.status.idle": "2021-11-30T23:41:55.673234Z", "shell.execute_reply": "2021-11-30T23:41:55.670781Z", "shell.execute_reply.started": "2021-11-30T22:26:28.892417Z" } }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i claims\n", " --match '(n1)-[edgeid {label: property}]->(n2)'\n", " --return 'distinct n1 as node1, \"Poutgoing_statement_count\" as label, property as node2, count(edgeid) as P1114'\n", " -o $OUT/outgoing.property.count.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 33, "metadata": { "execution": { "iopub.execute_input": "2021-11-30T23:41:55.679990Z", "iopub.status.busy": "2021-11-30T23:41:55.679603Z", "iopub.status.idle": "2021-12-01T00:11:16.305617Z", "shell.execute_reply": "2021-12-01T00:11:16.304637Z", "shell.execute_reply.started": "2021-11-30T23:41:55.679959Z" } }, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i $OUT/outgoing.property.count.tsv.gz\n", " --match '(n1)-[e {P1114: quantity}]->(property)'\n", " --where 'cast(quantity, int) > 25'\n", " --order-by 'n1, cast(quantity, int) desc'\n", " -o $OUT/outgoing.property.count.25.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "execution": { "iopub.execute_input": "2021-12-01T00:15:31.303570Z", "iopub.status.busy": "2021-12-01T00:15:31.303067Z", "iopub.status.idle": "2021-12-01T00:15:31.439252Z", "shell.execute_reply": "2021-12-01T00:15:31.438571Z", "shell.execute_reply.started": "2021-12-01T00:15:31.303542Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 104727\n" ] } ], "source": [ "!zcat < $OUT/outgoing.property.count.25.tsv.gz | wc -l" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 46, "metadata": { "execution": { "iopub.execute_input": "2021-12-01T04:54:11.118546Z", "iopub.status.busy": "2021-12-01T04:54:11.118317Z", "iopub.status.idle": "2021-12-01T04:55:21.397524Z", "shell.execute_reply": "2021-12-01T04:55:21.396974Z", "shell.execute_reply.started": "2021-12-01T04:54:11.118525Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
propertytotalproperty;label
0P68417782'ortholog'@en
1P108717018'Elo rating'@en
2P5288996'catalog code'@en
3P1508950'contains administrative territorial entity'@en
4P1615075'cast member'@en
5P18434018'taxon common name'@en
6P6823726'biological process'@en
7P5273171'has part'@en
8P10822777'population'@en
9P1282343'regulates (molecular biology)'@en
10P3482111'software version identifier'@en
11P7032000'found in taxon'@en
12P13441787'participant in'@en
13P7101303'participant'@en
14P14781268'has immediate cause'@en
15P38721256'patronage'@en
16P1851159'doctoral student'@en
17P61531082'research site'@en
18P639988'RefSeq RNA ID'@en
19P460935'said to be the same as'@en
20P166878'award received'@en
21P3373872'sibling'@en
22P704845'Ensembl transcript ID'@en
23P637724'RefSeq protein ID'@en
24P638589'PDB structure ID'@en
\n", "
" ], "text/plain": [ " property total property;label\n", "0 P684 17782 'ortholog'@en\n", "1 P1087 17018 'Elo rating'@en\n", "2 P528 8996 'catalog code'@en\n", "3 P150 8950 'contains administrative territorial entity'@en\n", "4 P161 5075 'cast member'@en\n", "5 P1843 4018 'taxon common name'@en\n", "6 P682 3726 'biological process'@en\n", "7 P527 3171 'has part'@en\n", "8 P1082 2777 'population'@en\n", "9 P128 2343 'regulates (molecular biology)'@en\n", "10 P348 2111 'software version identifier'@en\n", "11 P703 2000 'found in taxon'@en\n", "12 P1344 1787 'participant in'@en\n", "13 P710 1303 'participant'@en\n", "14 P1478 1268 'has immediate cause'@en\n", "15 P3872 1256 'patronage'@en\n", "16 P185 1159 'doctoral student'@en\n", "17 P6153 1082 'research site'@en\n", "18 P639 988 'RefSeq RNA ID'@en\n", "19 P460 935 'said to be the same as'@en\n", "20 P166 878 'award received'@en\n", "21 P3373 872 'sibling'@en\n", "22 P704 845 'Ensembl transcript ID'@en\n", "23 P637 724 'RefSeq protein ID'@en\n", "24 P638 589 'PDB structure ID'@en" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $OUT/outgoing.property.count.25.tsv.gz\n", " --match '(n1)-[e {P1114: quantity}]->(property)'\n", " --return 'property as property, count(distinct n1) as total'\n", " --order-by 'cast(total, int) desc'\n", " --limit 25\n", " / add-labels\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 34, "metadata": { "execution": { "iopub.execute_input": "2021-12-01T00:11:16.307235Z", "iopub.status.busy": "2021-12-01T00:11:16.306983Z", "iopub.status.idle": "2021-12-01T00:13:17.232647Z", "shell.execute_reply": "2021-12-01T00:13:17.231875Z", "shell.execute_reply.started": "2021-12-01T00:11:16.307210Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2P1114node1;labelnode2;label
0P1533Poutgoing_statement_countP230226'family name identical to this given name'@en'property constraint'@en
1P154Poutgoing_statement_countP230228'logo image'@en'property constraint'@en
2P18Poutgoing_statement_countP165930'image'@en'see also'@en
3P2586Poutgoing_statement_countP230228'INSEE department code'@en'property constraint'@en
4P3171Poutgoing_statement_countP165930'International Olympic Committee athlete ID'@en'see also'@en
5P4839Poutgoing_statement_countP226485'Wolfram Language entity code'@en'mix'n'match catalog ID'@en
6P5086Poutgoing_statement_countP230245'FIPS 5-2 alpha code (US states)'@en'property constraint'@en
7P5087Poutgoing_statement_countP230229'FIPS 5-2 numeric code (US states)'@en'property constraint'@en
8P5209Poutgoing_statement_countP185552'ISO 3950 code'@en'Wikidata property example'@en
9P553Poutgoing_statement_countP165950'website account on'@en'see also'@en
\n", "
" ], "text/plain": [ " node1 label node2 P1114 \\\n", "0 P1533 Poutgoing_statement_count P2302 26 \n", "1 P154 Poutgoing_statement_count P2302 28 \n", "2 P18 Poutgoing_statement_count P1659 30 \n", "3 P2586 Poutgoing_statement_count P2302 28 \n", "4 P3171 Poutgoing_statement_count P1659 30 \n", "5 P4839 Poutgoing_statement_count P2264 85 \n", "6 P5086 Poutgoing_statement_count P2302 45 \n", "7 P5087 Poutgoing_statement_count P2302 29 \n", "8 P5209 Poutgoing_statement_count P1855 52 \n", "9 P553 Poutgoing_statement_count P1659 50 \n", "\n", " node1;label \\\n", "0 'family name identical to this given name'@en \n", "1 'logo image'@en \n", "2 'image'@en \n", "3 'INSEE department code'@en \n", "4 'International Olympic Committee athlete ID'@en \n", "5 'Wolfram Language entity code'@en \n", "6 'FIPS 5-2 alpha code (US states)'@en \n", "7 'FIPS 5-2 numeric code (US states)'@en \n", "8 'ISO 3950 code'@en \n", "9 'website account on'@en \n", "\n", " node2;label \n", "0 'property constraint'@en \n", "1 'property constraint'@en \n", "2 'see also'@en \n", "3 'property constraint'@en \n", "4 'see also'@en \n", "5 'mix'n'match catalog ID'@en \n", "6 'property constraint'@en \n", "7 'property constraint'@en \n", "8 'Wikidata property example'@en \n", "9 'see also'@en " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"head -i $OUT/outgoing.property.count.25.tsv.gz / add-labels\")" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "execution": { "iopub.execute_input": "2021-12-01T00:22:17.553437Z", "iopub.status.busy": "2021-12-01T00:22:17.552945Z", "iopub.status.idle": "2021-12-01T00:23:35.644547Z", "shell.execute_reply": "2021-12-01T00:23:35.643996Z", "shell.execute_reply.started": "2021-12-01T00:22:17.553409Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2P1114node1;labelnode2;label
0Q99Poutgoing_statement_countP293661'California'@en'language used'@en
1Q99Poutgoing_statement_countP15058'California'@en'contains administrative territorial entity'@en
2Q99Poutgoing_statement_countP108239'California'@en'population'@en
\n", "
" ], "text/plain": [ " node1 label node2 P1114 node1;label \\\n", "0 Q99 Poutgoing_statement_count P2936 61 'California'@en \n", "1 Q99 Poutgoing_statement_count P150 58 'California'@en \n", "2 Q99 Poutgoing_statement_count P1082 39 'California'@en \n", "\n", " node2;label \n", "0 'language used'@en \n", "1 'contains administrative territorial entity'@en \n", "2 'population'@en " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $OUT/outgoing.property.count.25.tsv.gz\n", " --match '(entity)-[e {P1114: quantity}]->(property)'\n", " --where 'entity = \"Q99\"'\n", " / add-labels\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "kgtk", "language": "python", "name": "kgtk" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 4 }