{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Playground" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "tags": [ "parameters" ] }, "outputs": [], "source": [ "# Parameters\n", "wikidata_home = \"/Users/pedroszekely/Downloads/kypher\"\n", "wikidata_parts_folder = \"/Users/pedroszekely/Downloads/kypher/useful_wikidata_files\"\n", "home = \"/Users/pedroszekely/Downloads/kypher\"\n", "cache_folder = \"/Users/pedroszekely/Downloads/kypher\"\n", "output_folder = \"/Users/pedroszekely/Downloads/scratch\"\n", "delete_database = \"no\"" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import io\n", "import os\n", "import subprocess\n", "import sys\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import altair as alt\n", "\n", "# from IPython.display import display, HTML, Image\n", "# from pandas_profiling import ProfileReport" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set up environment variables and folders that we need" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# folder containing wikidata broken down into smaller files.\n", "os.environ['WIKIDATA_PARTS'] = wikidata_parts_folder\n", "# path of folder where the wikidata parts folder is stored.\n", "os.environ['WIKIDATA_HOME'] = wikidata_home\n", "os.environ['KYPHER'] = home\n", "os.environ['OUT'] = output_folder\n", "# kgtk command to run\n", "os.environ['kgtk'] = \"kgtk\"\n", "os.environ['kgtk'] = \"time kgtk --debug\"\n", "# absolute path of the db\n", "os.environ['STORE'] = \"{}/Q44.wikidata.sqlite3.db\".format(cache_folder)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/pedroszekely/Downloads/kypher\n" ] } ], "source": [ "cd $home" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def run_command(cmd, substitution_dictionary = {}):\n", " \"\"\"Run a templetized command.\"\"\"\n", " for k, v in substitution_dictionary.items():\n", " cmd = cmd.replace(k, v)\n", " \n", " print(cmd)\n", " output = subprocess.run([cmd], shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n", " print(output.stdout)\n", " print(output.stderr)\n", " #print(output.returncode)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# data = pd.read_csv(os.environ['OUT']+\"/test.tsv\", delimiter='\\t')" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "def bar_chart(data, x_column, y_column):\n", " \"\"\"Construct a simple bar chart with two properties\"\"\"\n", " bars = alt.Chart(data).mark_bar().encode(\n", " y=alt.Y(y_column, sort='-x'),\n", " x=x_column\n", " )\n", "\n", " text = bars.mark_text(\n", " align='left',\n", " baseline='middle',\n", " dx=3 # Nudges text to right so it doesn't appear on top of the bar\n", " ).encode(\n", " text=x_column\n", " )\n", "\n", " return (bars + text)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Remove a list of Qnodes from a KG\n", "\n", "Let's try to remove country from the Q44 KG" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[90mid\u001b[39m Q6256\n", "\u001b[42mLabel\u001b[49m country\n", "\u001b[44mDescription\u001b[49m distinct region in geography; a broad term that can include political divisions or regions associated with distinct political characteristics\n", "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mpolitical territorial entity \u001b[90m(Q1048835)\u001b[39m\n", "\n", "\u001b[90mid\u001b[39m Q112099\n", "\u001b[42mLabel\u001b[49m island nation\n", "\u001b[44mDescription\u001b[49m state whose primary territory consists of one or more islands or parts of islands\n", "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mstate \u001b[90m(Q7275)\u001b[39m | country \u001b[90m(Q6256)\u001b[39m\n", "\n", "\u001b[90mid\u001b[39m Q20181813\n", "\u001b[42mLabel\u001b[49m colonial power\n", "\u001b[44mDescription\u001b[49m country that controls colonies\n", "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39msovereign state \u001b[90m(Q3624078)\u001b[39m\n" ] } ], "source": [ "!wd u Q6256 Q112099 Q20181813" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Find all the countries in the Q44 KG" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-18 22:29:34 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_1_c1.\"node1\" \"node1\", graph_1_c1.\"node2\" \"node2\"\n", " FROM graph_1 AS graph_1_c1, graph_2 AS graph_2_c2\n", " WHERE graph_1_c1.\"label\"=?\n", " AND graph_2_c2.\"label\"=?\n", " AND graph_2_c2.\"node2\"=?\n", " AND graph_1_c1.\"node2\"=graph_2_c2.\"node1\"\n", " PARAS: ['P31', 'P279star', 'Q6256']\n", "---------------------------------------------\n", " 0.94 real 0.55 user 0.16 sys\n" ] } ], "source": [ "!$kgtk query -i $KYPHER/Q44/Q44.part.wikibase-item.tsv.gz -i $KYPHER/Q44/Q44.P279star.tsv.gz --graph-cache $STORE \\\n", "--match 'Q44: (n1)-[:P31]->(n2), P279star: (n2)-[:P279star]->(:Q6256)' \\\n", "--return 'distinct n1 as node1, n2 as node2' \\\n", "> $OUT/Q44.items.remove.tsv" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 162 324 2169 /Users/pedroszekely/Downloads/scratch/Q44.items.remove.tsv\n" ] } ], "source": [ "!wc $OUT/Q44.items.remove.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have 162 countries to remove, let's get all their edges that are present in Q44.\n", "\n", "To do this we need to scan all the partitions, so we need to concatenate them first." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m416\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:39\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.alias.en.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m490\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:39\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.description.en.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m456\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:39\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.label.en.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m1.3\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:39\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.P279star.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m22\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.commonsMedia.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m95\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.external-id.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m938\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.geo-shape.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m6.4\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.globe-coordinate.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m62\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.math.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m38\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.monolingualtext.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m74\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.musical-notation.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m205\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.quantity.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m14\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.string.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m3.9\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.time.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m5.7\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.url.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m71\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.wikibase-form.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m158\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.wikibase-item.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m239\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:38\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.part.wikibase-property.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m77\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:39\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.properties.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m1.7\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m16 Oct 22:39\u001b[0m \u001b[36m/Users/pedroszekely/Downloads/kypher/Q44/\u001b[31mQ44.statistics.tsv.gz\u001b[0m\n" ] } ], "source": [ "!exa -l $KYPHER/Q44/*.tsv.gz" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1.70 real 1.41 user 0.14 sys\n" ] } ], "source": [ "!$kgtk cat \\\n", "-i $KYPHER/Q44/Q44.part.*.tsv.gz \\\n", "-i $KYPHER/Q44/Q44.alias.en.tsv.gz \\\n", "-i $KYPHER/Q44/Q44.description.en.tsv.gz \\\n", "-i $KYPHER/Q44/Q44.label.en.tsv.gz \\\n", "> $OUT/Q44.all.edges.tsv" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-18 22:29:48 sqlstore]: DROP graph data table graph_3 from /Users/pedroszekely/Downloads/scratch/Q44.items.remove.tsv\n", "[2020-10-18 22:29:48 sqlstore]: IMPORT graph directly into table graph_12 from /Users/pedroszekely/Downloads/scratch/Q44.items.remove.tsv ...\n", "[2020-10-18 22:29:48 sqlstore]: DROP graph data table graph_5 from /Users/pedroszekely/Downloads/scratch/Q44.all.edges.tsv\n", "[2020-10-18 22:29:48 sqlstore]: IMPORT graph directly into table graph_13 from /Users/pedroszekely/Downloads/scratch/Q44.all.edges.tsv ...\n", "[2020-10-18 22:29:49 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_13_c2.\"node1\" \"node1\", graph_13_c2.\"label\" \"label\", graph_13_c2.\"node2\" \"node2\", graph_13_c2.\"id\" \"id\"\n", " FROM graph_12 AS graph_12_c1, graph_13 AS graph_13_c2\n", " WHERE graph_12_c1.\"node1\"=graph_13_c2.\"node1\"\n", " ORDER BY graph_13_c2.\"id\" ASC\n", " PARAS: []\n", "---------------------------------------------\n", "[2020-10-18 22:29:49 sqlstore]: CREATE INDEX on table graph_13 column node1 ...\n", "[2020-10-18 22:29:49 sqlstore]: ANALYZE INDEX on table graph_13 column node1 ...\n", "[2020-10-18 22:29:49 sqlstore]: CREATE INDEX on table graph_12 column node1 ...\n", "[2020-10-18 22:29:49 sqlstore]: ANALYZE INDEX on table graph_12 column node1 ...\n", " 2.39 real 2.14 user 0.37 sys\n" ] } ], "source": [ "!$kgtk query -i $OUT/Q44.items.remove.tsv -i $OUT/Q44.all.edges.tsv --graph-cache $STORE \\\n", "--match 'remove: (n1)-[]->(), all: (n1)-[l]->(n2)' \\\n", "--return 'distinct n1 as node1, l.label as label, n2 as node2, l as id' \\\n", "--order-by l \\\n", "-o $OUT/Q44.edges.remove.tsv" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tid\n", "Q1011\tP1036\t\"2--6658\"\tQ1011-P1036-1\n", "Q1011\tP1081\t+0.572\tQ1011-P1081-1\n", "Q1011\tP1081\t+0.585\tQ1011-P1081-10\n", "Q1011\tP1081\t+0.589\tQ1011-P1081-11\n", "Q1011\tP1081\t+0.592\tQ1011-P1081-12\n", "Q1011\tP1081\t+0.598\tQ1011-P1081-13\n", "Q1011\tP1081\t+0.613\tQ1011-P1081-14\n", "Q1011\tP1081\t+0.619\tQ1011-P1081-15\n", "Q1011\tP1081\t+0.625\tQ1011-P1081-16\n" ] } ], "source": [ "!head $OUT/Q44.edges.remove.tsv" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1.58 real 1.40 user 0.13 sys\n" ] } ], "source": [ "!$kgtk ifnotexists -i $OUT/Q44.all.edges.tsv --filter-on $OUT/Q44.edges.remove.tsv \\\n", "> $OUT/Q44.trimmed.edges.tsv" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 163405 /Users/pedroszekely/Downloads/scratch/Q44.all.edges.tsv\n", " 55382 /Users/pedroszekely/Downloads/scratch/Q44.edges.remove.tsv\n", " 162 /Users/pedroszekely/Downloads/scratch/Q44.items.remove.tsv\n", " 4988 /Users/pedroszekely/Downloads/scratch/Q44.items.tsv\n", " 360 /Users/pedroszekely/Downloads/scratch/Q44.language.distribution.tsv\n", " 2779 /Users/pedroszekely/Downloads/scratch/Q44.trimmed.clusters.tsv\n", " 107147 /Users/pedroszekely/Downloads/scratch/Q44.trimmed.edges.tsv\n", " 4988 /Users/pedroszekely/Downloads/scratch/Q44.trimmed.edges.wikibase-item.tsv\n" ] } ], "source": [ "%%bash\n", "for f in `ls $OUT/Q44*`\n", "do\n", " wc -l $f\n", "done" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id\tnode1\tlabel\tnode2\n", "Q1000597-P18-1\tQ1000597\tP18\t\"Town Hall - geograph.org.uk - 352398.jpg\"\n", "Q1017471-P18-1\tQ1017471\tP18\t\"Bush beer.jpg\"\n", "Q1020773-P18-1\tQ1020773\tP18\t\"Tecate 003.jpg\"\n", "Q1026242-P154-1\tQ1026242\tP154\t\"Calanda Bierdeckel.jpg\"\n", "Q10304159-P154-1\tQ10304159\tP154\t\"Itaipava-logo.gif\"\n", "Q1035257-P18-1\tQ1035257\tP18\t\"Carapils.jpg\"\n", "Q10507704-P18-1\tQ10507704\tP18\t\"Gotlandsdricka.jpg\"\n", "Q1050906-P18-1\tQ1050906\tP18\t\"Abtbier 12.jpg\"\n", "Q1056430-P18-1\tQ1056430\tP18\t\"Cerveceria.cuahotemoc.monterrey.ncs.jpg\"\n" ] } ], "source": [ "!head $OUT/Q44.trimmed.edges.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Let's compute connected components on the results to see if we created islands" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First we need to create a file with only wikibase-item edges and calculate the connected components on this file" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-18 22:30:07 sqlstore]: DROP graph data table graph_7 from /Users/pedroszekely/Downloads/scratch/Q44.trimmed.edges.tsv\n", "[2020-10-18 22:30:07 sqlstore]: IMPORT graph directly into table graph_14 from /Users/pedroszekely/Downloads/scratch/Q44.trimmed.edges.tsv ...\n", "[2020-10-18 22:30:08 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT graph_14_c1.\"id\", graph_14_c1.\"node1\", graph_14_c1.\"label\", graph_14_c1.\"node2\"\n", " FROM graph_1 AS graph_1_c2, graph_14 AS graph_14_c1\n", " WHERE graph_14_c1.\"node1\"=graph_1_c2.\"node1\"\n", " AND graph_14_c1.\"node2\"=graph_1_c2.\"node2\"\n", " PARAS: []\n", "---------------------------------------------\n", "[2020-10-18 22:30:08 sqlstore]: CREATE INDEX on table graph_14 column node1 ...\n", "[2020-10-18 22:30:08 sqlstore]: ANALYZE INDEX on table graph_14 column node1 ...\n", "[2020-10-18 22:30:08 sqlstore]: CREATE INDEX on table graph_14 column node2 ...\n", "[2020-10-18 22:30:08 sqlstore]: ANALYZE INDEX on table graph_14 column node2 ...\n", " 1.91 real 1.68 user 0.32 sys\n" ] } ], "source": [ "!$kgtk query -i $OUT/Q44.trimmed.edges.tsv -i $KYPHER/Q44/Q44.part.wikibase-item.tsv.gz --graph-cache $STORE \\\n", "--match 'trimmed: (n1)-[l]->(n2), wikibase: (n1)-[]-(n2)' \\\n", "--return 'l, n1, l.label, n2'\\\n", "-o $OUT/Q44.trimmed.edges.wikibase-item.tsv" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2.20 real 0.73 user 0.19 sys\n" ] } ], "source": [ "!$kgtk connected-components -i $OUT/Q44.trimmed.edges.wikibase-item.tsv \\\n", "--undirected \\\n", "--minimum-cluster-size 1 \\\n", "--cluster-name-method numbered \\\n", "-o $OUT/Q44.trimmed.clusters.tsv " ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2779 8337 84897 /Users/pedroszekely/Downloads/scratch/Q44.trimmed.clusters.tsv\n" ] } ], "source": [ "!wc $OUT/Q44.trimmed.clusters.tsv " ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\n", "Q100\tconnected_component\t0\n", "Q1000115\tconnected_component\t0\n", "Q100019\tconnected_component\t0\n", "Q10002\tconnected_component\t0\n", "Q10002198\tconnected_component\t0\n", "Q1000597\tconnected_component\t0\n", "Q100188\tconnected_component\t0\n", "Q10023446\tconnected_component\t0\n", "Q10072884\tconnected_component\t0\n" ] } ], "source": [ "!head $OUT/Q44.trimmed.clusters.tsv " ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-18 22:30:17 sqlstore]: DROP graph data table graph_8 from /Users/pedroszekely/Downloads/scratch/Q44.trimmed.clusters.tsv\n", "[2020-10-18 22:30:17 sqlstore]: IMPORT graph directly into table graph_15 from /Users/pedroszekely/Downloads/scratch/Q44.trimmed.clusters.tsv ...\n", "[2020-10-18 22:30:17 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_15_c1.\"node2\", count(DISTINCT graph_15_c1.\"node1\")\n", " FROM graph_15 AS graph_15_c1\n", " GROUP BY graph_15_c1.\"node2\"\n", " ORDER BY count(DISTINCT graph_15_c1.\"node1\") DESC\n", " LIMIT ?\n", " PARAS: [10]\n", "---------------------------------------------\n", "node2\tcount(DISTINCT graph_15_c1.\"node1\")\n", "0\t2778\n", " 0.75 real 0.58 user 0.14 sys\n" ] } ], "source": [ "!$kgtk query -i $OUT/Q44.trimmed.clusters.tsv --graph-cache $STORE \\\n", "--match '(n1)-[]-(n2)' \\\n", "--return 'distinct n2, count(distinct n1)' \\\n", "--order-by 'count(distinct n1) desc' \\\n", "--limit 10" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There is a single cluster, this is what we wanted" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Count instances of all classes" ] }, { "cell_type": "code", "execution_count": 169, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-18 17:39:23 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_9_c1.\"node2\" \"class\", ? \"label\", count(DISTINCT graph_9_c1.\"node1\") \"count\"\n", " FROM graph_9 AS graph_9_c1\n", " WHERE graph_9_c1.\"label\"=?\n", " GROUP BY class, label\n", " ORDER BY count(DISTINCT graph_9_c1.\"node1\") DESC\n", " PARAS: ['P1114', 'P31']\n", "---------------------------------------------\n", " 363.60 real 128.95 user 92.06 sys\n" ] } ], "source": [ "!$kgtk query -i $WIKIDATA_PARTS/all.P31.tsv.gz --graph-cache $STORE \\\n", "--match '(n1)-[:P31]-(n2)' \\\n", "--return 'distinct n2 as class, \"P1114\" as label, count(distinct n1) as count' \\\n", "--order-by 'count(distinct n1) desc' \\\n", "-o $OUT/custom.all.P31.count.tsv" ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "class\tlabel\tcount\n", "Q13442814\tP1114\t35933550\n", "Q5\tP1114\t8064154\n", "Q523\tP1114\t3297566\n", "Q16521\tP1114\t2745073\n", "Q318\tP1114\t2102876\n", "Q7318358\tP1114\t2068766\n", "Q7187\tP1114\t1196161\n", "Q11173\tP1114\t1063060\n", "Q8054\tP1114\t979961\n" ] } ], "source": [ "!head $OUT/custom.all.P31.count.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Argh, the column names are not standard for an edge file. Kypher does not allow me to name the columns the way I want as `node1, label, node2`.\n", "We need to rename the columns and add an `id` column, otherwise Kypher is unhappy" ] }, { "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1.89 real 2.35 user 0.40 sys\n" ] } ], "source": [ "!$kgtk \\\n", " rename-columns --mode NONE -i $OUT/custom.all.P31.count.tsv --output-columns node1 label node2 \\\n", " / add-id --mode NONE --id-style node1-label-node2 \\\n", " > $OUT/all.P31.count.tsv" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1 label node2 id\n", "Q13442814 P1114 35933550 Q13442814-P1114-35933550\n", "Q5 P1114 8064154 Q5-P1114-8064154\n", "Q523 P1114 3297566 Q523-P1114-3297566\n", "Q16521 P1114 2745073 Q16521-P1114-2745073\n", "Q318 P1114 2102876 Q318-P1114-2102876\n", "Q7318358 P1114 2068766 Q7318358-P1114-2068766\n", "Q7187 P1114 1196161 Q7187-P1114-1196161\n", "Q11173 P1114 1063060 Q11173-P1114-1063060\n", "Q8054 P1114 979961 Q8054-P1114-979961\n" ] } ], "source": [ "!head $OUT/all.P31.count.tsv | column -t -s $'\\t' " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Add labels to the file so we can see the names of the classes" ] }, { "cell_type": "code", "execution_count": 189, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-18 17:59:55 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_11_c2.\"node1\" \"node1\", graph_10_c1.\"label\" \"label\", graph_10_c1.\"node2\" \"node2\", graph_10_c1.\"id\" \"id\", graph_11_c2.\"node2\" \"node1;label\"\n", " FROM graph_10 AS graph_10_c1, graph_11 AS graph_11_c2\n", " WHERE graph_11_c2.\"label\"=?\n", " AND graph_10_c1.\"node1\"=graph_11_c2.\"node1\"\n", " AND (kgtk_lqstring_lang_suffix(graph_11_c2.\"node2\") = ?)\n", " ORDER BY CAST(graph_10_c1.\"node2\" AS int) DESC\n", " PARAS: ['label', 'en']\n", "---------------------------------------------\n", " 2.13 real 1.53 user 0.57 sys\n" ] } ], "source": [ "!$kgtk query -i $OUT/all.P31.count.tsv -i $WIKIDATA_PARTS/part.label.en.tsv.gz --graph-cache $STORE \\\n", "--match 'P31: (n1)-[l]-(n2), label: (n1)-[:label]->(label)' \\\n", "--where 'label.kgtk_lqstring_lang_suffix = \"en\"' \\\n", "--return 'distinct n1 as node1, l.label as label, n2 as node2, l as id, label as `node1;label`' \\\n", "--order-by 'cast(n2, int) desc' \\\n", "-o $OUT/all.P31.count.labeled.tsv" ] }, { "cell_type": "code", "execution_count": 192, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1 label node2 id node1;label\n", "Q13442814 P1114 35933550 Q13442814-P1114-35933550 'scholarly article'@en\n", "Q5 P1114 8064154 Q5-P1114-8064154 'human'@en\n", "Q523 P1114 3297566 Q523-P1114-3297566 'star'@en\n", "Q16521 P1114 2745073 Q16521-P1114-2745073 'taxon'@en\n", "Q318 P1114 2102876 Q318-P1114-2102876 'galaxy'@en\n", "Q7318358 P1114 2068766 Q7318358-P1114-2068766 'review article'@en\n", "Q7187 P1114 1196161 Q7187-P1114-1196161 'gene'@en\n", "Q11173 P1114 1063060 Q11173-P1114-1063060 'chemical compound'@en\n", "Q8054 P1114 979961 Q8054-P1114-979961 'protein'@en\n", "Q486972 P1114 609721 Q486972-P1114-609721 'human settlement'@en\n", "Q13100073 P1114 588509 Q13100073-P1114-588509 'village-level division in China'@en\n", "Q8502 P1114 525197 Q8502-P1114-525197 'mountain'@en\n", "Q871232 P1114 512908 Q871232-P1114-512908 'editorial'@en\n", "Q3305213 P1114 453407 Q3305213-P1114-453407 'painting'@en\n", "Q4022 P1114 401203 Q4022-P1114-401203 'river'@en\n", "Q79007 P1114 395456 Q79007-P1114-395456 'street'@en\n", "Q1931185 P1114 354162 Q1931185-P1114-354162 'astronomical radio source'@en\n", "Q30612 P1114 343340 Q30612-P1114-343340 'clinical trial'@en\n", "Q101352 P1114 341799 Q101352-P1114-341799 'family name'@en\n", "Q54050 P1114 324642 Q54050-P1114-324642 'hill'@en\n", "Q13433827 P1114 316560 Q13433827-P1114-316560 'encyclopedic article'@en\n", "Q1457376 P1114 292038 Q1457376-P1114-292038 'eclipsing binary star'@en\n", "Q2668072 P1114 286740 Q2668072-P1114-286740 'collection'@en\n", "Q2247863 P1114 285465 Q2247863-P1114-285465 'high proper-motion star'@en\n", "Q532 P1114 276670 Q532-P1114-276670 'village'@en\n", "Q23397 P1114 258572 Q23397-P1114-258572 'lake'@en\n", "Q3863 P1114 247663 Q3863-P1114-247663 'asteroid'@en\n", "Q11424 P1114 246951 Q11424-P1114-246951 'film'@en\n", "Q41176 P1114 221594 Q41176-P1114-221594 'building'@en\n", "Q482994 P1114 219632 Q482994-P1114-219632 'album'@en\n", "Q17633526 P1114 196297 Q17633526-P1114-196297 'Wikinews article'@en\n", "Q3947 P1114 191987 Q3947-P1114-191987 'house'@en\n", "Q47150325 P1114 189235 Q47150325-P1114-189235 'calendar day of a given year'@en\n", "Q16970 P1114 188597 Q16970-P1114-188597 'church building'@en\n", "Q2782326 P1114 186902 Q2782326-P1114-186902 'case report'@en\n", "Q18593264 P1114 182635 Q18593264-P1114-182635 'item of collection or exhibition'@en\n", "Q355304 P1114 174885 Q355304-P1114-174885 'watercourse'@en\n", "Q4830453 P1114 174234 Q4830453-P1114-174234 'business'@en\n", "Q83373 P1114 168509 Q83373-P1114-168509 'quasar'@en\n", "Q9842 P1114 160945 Q9842-P1114-160945 'primary school'@en\n", "Q7725634 P1114 155848 Q7725634-P1114-155848 'literary work'@en\n", "Q23442 P1114 148531 Q23442-P1114-148531 'island'@en\n", "Q27020041 P1114 145360 Q27020041-P1114-145360 'sports season'@en\n", "Q56436498 P1114 145300 Q56436498-P1114-145300 'village in India'@en\n", "Q2154519 P1114 144638 Q2154519-P1114-144638 'astrophysical X-ray source'@en\n", "Q811979 P1114 142346 Q811979-P1114-142346 'architectural structure'@en\n", "Q61443690 P1114 129184 Q61443690-P1114-129184 'branch post office'@en\n", "Q49008 P1114 127165 Q49008-P1114-127165 'prime number'@en\n", "Q3331189 P1114 124850 Q3331189-P1114-124850 'version, edition, or translation'@en\n" ] } ], "source": [ "!head -50 $OUT/all.P31.count.labeled.tsv | column -t -s $'\\t' " ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "!head -20 $OUT/all.P31.count.labeled.tsv > $OUT/temp.top-classes.50.tsv" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv(os.environ['OUT']+\"/temp.top-classes.50.tsv\", delimiter='\\t')\n", "bar_chart('node2', 'node1;label')" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1443196924 7311117754 80501996248\n" ] } ], "source": [ "!gzcat $WIKIDATA_PARTS/almost.all.edges.tsv.gz | wc" ] }, { "cell_type": "code", "execution_count": 195, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[37mstar\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ523\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •227\u001b[39m\u001b[36m ×3296581\u001b[39m\n", "\u001b[2m├──\u001b[22m\u001b[37mastronomical object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ6999\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •88\u001b[39m\u001b[36m ×25626\u001b[39m\u001b[31m ↑\u001b[39m\n", "\u001b[2m│ ├──\u001b[22m\u001b[37mphysical object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ223557\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •46\u001b[39m\u001b[36m ×105\u001b[39m\n", "\u001b[2m│ │ └──\u001b[22m\u001b[37mconcrete object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ4406616\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •11\u001b[39m\u001b[36m ×313\u001b[39m\n", "\u001b[2m│ │ └──\u001b[22m\u001b[37mobject\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ488383\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •39\u001b[39m\u001b[36m ×697\u001b[39m\u001b[31m ↑\u001b[39m\n", "\u001b[2m│ │ └──\u001b[22m\u001b[37mentity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ35120\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •46\u001b[39m\u001b[36m ×31\u001b[39m\u001b[31m ↑\u001b[39m\n", "\u001b[2m│ ├──\u001b[22m\u001b[37mspace object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ4235019\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •3\u001b[39m\n", "\u001b[2m│ │ ╘══\u001b[22m\u001b[37mobject\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ488383\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •39\u001b[39m\u001b[36m ×697\u001b[39m\u001b[31m ↑ …\u001b[39m\n", "\u001b[2m│ └──\u001b[22m\u001b[37mlocation\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ17334923\u001b[39m\u001b[2m)\u001b[22m\u001b[36m ×196\u001b[39m\n", "\u001b[2m│ └──\u001b[22m\u001b[37mgeographic entity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ27096213\u001b[39m\u001b[2m)\u001b[22m\u001b[36m ×53\u001b[39m\n", "\u001b[2m│ └──\u001b[22m\u001b[37mspatial entity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ58416391\u001b[39m\u001b[2m)\u001b[22m\n", "\u001b[2m│ └──\u001b[22m\u001b[37mspatio-temporal entity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ58415929\u001b[39m\u001b[2m)\u001b[22m\n", "\u001b[2m│ ╘══\u001b[22m\u001b[37mentity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ35120\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •46\u001b[39m\u001b[36m ×31\u001b[39m\u001b[31m ↑ …\u001b[39m\n", "\u001b[2m└──\u001b[22m\u001b[37mfusor\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ1027098\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •12\u001b[39m\n", "\u001b[2m ╘══\u001b[22m\u001b[37mastronomical object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ6999\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •88\u001b[39m\u001b[36m ×25626\u001b[39m\u001b[31m ↑ …\u001b[39m\n" ] } ], "source": [ "!wdtaxonomy -r Q523 Q318" ] }, { "cell_type": "code", "execution_count": 196, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[37mgalaxy\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ318\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •164\u001b[39m\u001b[36m ×2101227\u001b[39m\n", "\u001b[2m└──\u001b[22m\u001b[37mdeep-sky object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ249389\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •27\u001b[39m\u001b[36m ×1\u001b[39m\n", "\u001b[2m └──\u001b[22m\u001b[37mastronomical object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ6999\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •88\u001b[39m\u001b[36m ×25626\u001b[39m\n", "\u001b[2m ├──\u001b[22m\u001b[37mphysical object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ223557\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •46\u001b[39m\u001b[36m ×105\u001b[39m\n", "\u001b[2m │ └──\u001b[22m\u001b[37mconcrete object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ4406616\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •11\u001b[39m\u001b[36m ×313\u001b[39m\n", "\u001b[2m │ └──\u001b[22m\u001b[37mobject\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ488383\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •39\u001b[39m\u001b[36m ×697\u001b[39m\u001b[31m ↑\u001b[39m\n", "\u001b[2m │ └──\u001b[22m\u001b[37mentity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ35120\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •46\u001b[39m\u001b[36m ×31\u001b[39m\u001b[31m ↑\u001b[39m\n", "\u001b[2m ├──\u001b[22m\u001b[37mspace object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ4235019\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •3\u001b[39m\n", "\u001b[2m │ ╘══\u001b[22m\u001b[37mobject\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ488383\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •39\u001b[39m\u001b[36m ×697\u001b[39m\u001b[31m ↑ …\u001b[39m\n", "\u001b[2m └──\u001b[22m\u001b[37mlocation\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ17334923\u001b[39m\u001b[2m)\u001b[22m\u001b[36m ×196\u001b[39m\n", "\u001b[2m └──\u001b[22m\u001b[37mgeographic entity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ27096213\u001b[39m\u001b[2m)\u001b[22m\u001b[36m ×53\u001b[39m\n", "\u001b[2m └──\u001b[22m\u001b[37mspatial entity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ58416391\u001b[39m\u001b[2m)\u001b[22m\n", "\u001b[2m └──\u001b[22m\u001b[37mspatio-temporal entity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ58415929\u001b[39m\u001b[2m)\u001b[22m\n", "\u001b[2m ╘══\u001b[22m\u001b[37mentity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ35120\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •46\u001b[39m\u001b[36m ×31\u001b[39m\u001b[31m ↑ …\u001b[39m\n" ] } ], "source": [ "!wdtaxonomy -r Q318" ] }, { "cell_type": "code", "execution_count": 201, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[37minner planet\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ3504248\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •16\u001b[39m\u001b[36m ×3\u001b[39m\n", "\u001b[2m├──\u001b[22m\u001b[37mterrestrial planet\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ128207\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •78\u001b[39m\u001b[36m ×12\u001b[39m\n", "\u001b[2m│ └──\u001b[22m\u001b[37mplanet\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ634\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •226\u001b[39m\u001b[31m ↑\u001b[39m\n", "\u001b[2m│ ├──\u001b[22m\u001b[37mplanemo\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ400144\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •20\u001b[39m\n", "\u001b[2m│ │ ├──\u001b[22m\u001b[37msubstellar object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ3132741\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •15\u001b[39m\u001b[36m ×3\u001b[39m\u001b[31m ↑\u001b[39m\n", "\u001b[2m│ │ │ └──\u001b[22m\u001b[37mastronomical object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ6999\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •88\u001b[39m\u001b[36m ×25626\u001b[39m\u001b[31m ↑↑\u001b[39m\n", "\u001b[2m│ │ │ ├──\u001b[22m\u001b[37mphysical object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ223557\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •46\u001b[39m\u001b[36m ×105\u001b[39m\n", "\u001b[2m│ │ │ │ └──\u001b[22m\u001b[37mconcrete object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ4406616\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •11\u001b[39m\u001b[36m ×313\u001b[39m\n", "\u001b[2m│ │ │ │ └──\u001b[22m\u001b[37mobject\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ488383\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •39\u001b[39m\u001b[36m ×697\u001b[39m\u001b[31m ↑\u001b[39m\n", "\u001b[2m│ │ │ │ └──\u001b[22m\u001b[37mentity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ35120\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •46\u001b[39m\u001b[36m ×31\u001b[39m\u001b[31m ↑\u001b[39m\n", "\u001b[2m│ │ │ ├──\u001b[22m\u001b[37mspace object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ4235019\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •3\u001b[39m\n", "\u001b[2m│ │ │ │ ╘══\u001b[22m\u001b[37mobject\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ488383\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •39\u001b[39m\u001b[36m ×697\u001b[39m\u001b[31m ↑ …\u001b[39m\n", "\u001b[2m│ │ │ └──\u001b[22m\u001b[37mlocation\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ17334923\u001b[39m\u001b[2m)\u001b[22m\u001b[36m ×196\u001b[39m\n", "\u001b[2m│ │ │ └──\u001b[22m\u001b[37mgeographic entity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ27096213\u001b[39m\u001b[2m)\u001b[22m\u001b[36m ×53\u001b[39m\n", "\u001b[2m│ │ │ └──\u001b[22m\u001b[37mspatial entity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ58416391\u001b[39m\u001b[2m)\u001b[22m\n", "\u001b[2m│ │ │ └──\u001b[22m\u001b[37mspatio-temporal entity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ58415929\u001b[39m\u001b[2m)\u001b[22m\n", "\u001b[2m│ │ │ ╘══\u001b[22m\u001b[37mentity\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ35120\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •46\u001b[39m\u001b[36m ×31\u001b[39m\u001b[31m ↑ …\u001b[39m\n", "\u001b[2m│ │ └──\u001b[22m\u001b[37mplanetary body\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ16873378\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •3\u001b[39m\u001b[31m ↑\u001b[39m\n", "\u001b[2m│ │ ╞══\u001b[22m\u001b[37msubstellar object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ3132741\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •15\u001b[39m\u001b[36m ×3\u001b[39m\u001b[31m ↑ …\u001b[39m\n", "\u001b[2m│ │ └──\u001b[22m\u001b[37msecondary body\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ15731960\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •3\u001b[39m\n", "\u001b[2m│ │ ╘══\u001b[22m\u001b[37mastronomical object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ6999\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •88\u001b[39m\u001b[36m ×25626\u001b[39m\u001b[31m ↑↑ …\u001b[39m\n", "\u001b[2m│ ╘══\u001b[22m\u001b[37mplanetary body\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ16873378\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •3\u001b[39m\u001b[31m ↑ …\u001b[39m\n", "\u001b[2m├──\u001b[22m\u001b[37mplanet of the Solar System\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ13205267\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •3\u001b[39m\n", "\u001b[2m│ ╘══\u001b[22m\u001b[37mplanet\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ634\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •226\u001b[39m\u001b[31m ↑ …\u001b[39m\n", "\u001b[2m└──\u001b[22m\u001b[37mobject of the inner Solar System\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ30249972\u001b[39m\u001b[2m)\u001b[22m\n", "\u001b[2m ╘══\u001b[22m\u001b[37mastronomical object\u001b[39m\u001b[2m (\u001b[22m\u001b[32mQ6999\u001b[39m\u001b[2m)\u001b[22m\u001b[33m •88\u001b[39m\u001b[36m ×25626\u001b[39m\u001b[31m ↑↑ …\u001b[39m\n" ] } ], "source": [ "!wdtaxonomy -r Q3504248" ] }, { "cell_type": "code", "execution_count": 134, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m333\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m28 Sep 15:16\u001b[0m \u001b[31mall.isa.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m522\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m26 Sep 21:54\u001b[0m \u001b[31mall.P31.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m542\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m26 Sep 22:16\u001b[0m \u001b[31mall.P31_P279.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m20\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m26 Sep 21:57\u001b[0m \u001b[31mall.P279.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m541\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m28 Sep 12:18\u001b[0m \u001b[31mall.P279star.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m111\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m25 Sep 16:22\u001b[0m \u001b[31mpart.alias.en.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m772\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m25 Sep 9:10\u001b[0m \u001b[31mpart.alias.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m109\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m26 Sep 0:31\u001b[0m \u001b[31mpart.commonsMedia.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m1.0\u001b[0m\u001b[32mG\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m25 Sep 17:19\u001b[0m \u001b[31mpart.description.en.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m21\u001b[0m\u001b[32mG\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m25 Sep 13:58\u001b[0m \u001b[31mpart.description.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m2.0\u001b[0m\u001b[32mG\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m26 Sep 0:21\u001b[0m \u001b[31mpart.external-id.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m424\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m26 Sep 1:19\u001b[0m \u001b[31mpart.geo-shape.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m127\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m26 Sep 0:42\u001b[0m \u001b[31mpart.globe-coordinate.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m2.4\u001b[0m\u001b[32mG\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m25 Sep 16:11\u001b[0m \u001b[31mpart.label.en.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m6.0\u001b[0m\u001b[32mG\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m25 Sep 1:08\u001b[0m \u001b[31mpart.label.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m417\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m25 Sep 22:22\u001b[0m \u001b[31mpart.math.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m1.8\u001b[0m\u001b[32mG\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m26 Sep 0:59\u001b[0m \u001b[31mpart.monolingualtext.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m16\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m26 Sep 1:09\u001b[0m \u001b[31mpart.musical-notation.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m1.2\u001b[0m\u001b[32mG\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m25 Sep 22:54\u001b[0m \u001b[31mpart.quantity.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m3.3\u001b[0m\u001b[32mG\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m25 Sep 23:51\u001b[0m \u001b[31mpart.string.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m443\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m25 Sep 20:48\u001b[0m \u001b[31mpart.time.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m484\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m26 Sep 21:26\u001b[0m \u001b[31mpart.type.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m106\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m26 Sep 1:38\u001b[0m \u001b[31mpart.url.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m4.2\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m25 Sep 22:31\u001b[0m \u001b[31mpart.wikibase-form.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m4.0\u001b[0m\u001b[32mG\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m25 Sep 22:12\u001b[0m \u001b[31mpart.wikibase-item.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m254\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m26 Sep 1:28\u001b[0m \u001b[31mpart.wikibase-property.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m264\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m25 Sep 20:27\u001b[0m \u001b[31mpart.wikidatatype.distribution.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-------\u001b[0m \u001b[1;32m906\u001b[0m\u001b[32mM\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m26 Sep 9:45\u001b[0m \u001b[31mpart.wikipedia-sitelink.tsv.gz\u001b[0m\n", ".\u001b[1;33mr\u001b[31mw\u001b[0m\u001b[38;5;244m-\u001b[33mr\u001b[38;5;244m--\u001b[33mr\u001b[38;5;244m--\u001b[0m \u001b[1;32m1.2\u001b[0m\u001b[32mk\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m28 Sep 18:52\u001b[0m rename.sh\n", "\u001b[1;34md\u001b[33mr\u001b[31mw\u001b[32mx\u001b[0m\u001b[38;5;244m------\u001b[0m@ \u001b[38;5;244m-\u001b[0m \u001b[1;33mpedroszekely\u001b[0m \u001b[34m28 Sep 15:33\u001b[0m \u001b[1;34msample\u001b[0m\n" ] } ], "source": [ "!exa -l $WIKIDATA_PARTS/" ] }, { "cell_type": "code", "execution_count": 142, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "P1114 quantity \u001b[90mnumber of instances of this subject\u001b[39m\n", "P17 country \u001b[90msovereign state of this item (not to be used for human beings)\u001b[39m\n", "P27 country of citizenship \u001b[90mthe object is a country that recognizes the subject as its citizen\u001b[39m\n", "P1001 applies to jurisdiction \u001b[90mthe item (an institution, law, public office ...) or statement belongs to or has power over or applies to the value (a territorial jurisdiction: a country, state, municipality, ...)\u001b[39m\n", "P495 country of origin \u001b[90mcountry of origin of this item (creative work, food, phrase, product, etc.)\u001b[39m\n", "P1532 country for sport \u001b[90mcountry a person or a team represents when playing a sport\u001b[39m\n", "P36 capital \u001b[90mprimary city of a country, province, state or other type of administrative territorial entity\u001b[39m\n", "P1376 capital of \u001b[90mcountry, state, department, canton or other administrative division of which the municipality is the governmental seat\u001b[39m\n", "P2196 students count \u001b[90mnumber of students of any type in an educational organization\u001b[39m\n", "P8047 country of registry \u001b[90mcountry where a ship is or has been registered\u001b[39m\n", "P3005 valid in place \u001b[90mplace where a statement is valid\u001b[39m\n", "P474 country calling code \u001b[90midentifier for a country - dialed on phone after the international dialing prefix (precede value by +)\u001b[39m\n", "P297 ISO 3166-1 alpha-2 code \u001b[90midentifier for a country in two-letter format per ISO 3166-1\u001b[39m\n", "P299 ISO 3166-1 numeric code \u001b[90midentifier for a country in numeric format per ISO 3166-1\u001b[39m\n", "P298 ISO 3166-1 alpha-3 code \u001b[90midentifier for a country in three-letter format per ISO 3166-1\u001b[39m\n", "P5978 classifier \u001b[90mclassifier word used with this sense\u001b[39m\n", "P507 Swedish county code \u001b[90midentifier for a county in Sweden \"länskod\" (two-digit)\u001b[39m\n", "P131 located in the administrative territorial entity \u001b[90mthe item is located on the territory of the following administrative entity. Use P276 (location) for specifying locations that are non-administrative places and for items about events\u001b[39m\n" ] } ], "source": [ "!wd f -t p count" ] }, { "cell_type": "code", "execution_count": 138, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " Usage: wd-search [options]