{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Creating a subset of Wikidata\n", "\n", "This notebook illustrates how to create a subset of Wikidata. We use as an example https://www.wikidata.org/wiki/Q11173 (chemical compound)\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [ "parameters" ] }, "outputs": [], "source": [ "# Parameters\n", "wikidata_home = \"/Users/pedroszekely/Downloads/kypher\"\n", "wikidata_file = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v3/all.tsv.gz\"\n", "wikidata_file = \"all.10.tsv.gz\"\n", "wikidata_parts_folder = \"/Users/pedroszekely/Downloads/kypher/useful_wikidata_files\"\n", "#wikidata_parts_folder = \"/Users/pedroszekely/Downloads/kypher/output.all.10\"\n", "home = \"/Users/pedroszekely/Downloads/kypher\"\n", "cache_folder = \"/Users/pedroszekely/Downloads/kypher\"\n", "output_folder = \"/Users/pedroszekely/Downloads/scratch\"\n", "delete_database = \"yes\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import io\n", "import os\n", "import subprocess\n", "import sys\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import altair as alt\n", "\n", "# from IPython.display import display, HTML, Image\n", "# from pandas_profiling import ProfileReport" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set up environment variables and folders that we need" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# folder containing wikidata broken down into smaller files.\n", "os.environ['WIKIDATA_PARTS'] = wikidata_parts_folder\n", "# path of folder where the wikidata parts folder is stored.\n", "os.environ['WIKIDATA_HOME'] = wikidata_home\n", "os.environ['KYPHER'] = home\n", "os.environ['OUT'] = output_folder\n", "# kgtk command to run\n", "os.environ['kgtk'] = \"kgtk\"\n", "os.environ['kgtk'] = \"time kgtk --debug\"\n", "# absolute path of the db\n", "os.environ['STORE'] = \"{}/wikidata.sqlite3.db\".format(cache_folder)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/pedroszekely/Downloads/kypher\n" ] } ], "source": [ "cd $home" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "def bar_chart(data, x_column, y_column):\n", " \"\"\"Construct a simple bar chart with two properties\"\"\"\n", " bars = alt.Chart(data).mark_bar().encode(\n", " y=alt.Y(y_column, sort='-x'),\n", " x=x_column\n", " )\n", "\n", " text = bars.mark_text(\n", " align='left',\n", " baseline='middle',\n", " dx=3 # Nudges text to right so it doesn't appear on top of the bar\n", " ).encode(\n", " text=x_column\n", " )\n", "\n", " return (bars + text)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "def run_command(cmd, substitution_dictionary = {}):\n", " \"\"\"Run a templetized command.\"\"\"\n", " for k, v in substitution_dictionary.items():\n", " cmd = cmd.replace(k, v)\n", " \n", " print(cmd)\n", " output = subprocess.run([cmd], shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n", " print(output.stdout)\n", " print(output.stderr)\n", " #print(output.returncode)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-20 22:48:09 sqlstore]: IMPORT graph directly into table graph_3 from /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/part.wikibase-item.tsv.gz ...\n", "^C\n", "\n", "Keyboard interrupt in query -i /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/part.wikibase-item.tsv.gz -i /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/part.label.en.tsv.gz --graph-cache /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db --match item: (n1)-[l:P452]->(n2:Q507443), label: (n1)-[:label]->(label) --return distinct n1 as pharamceutical_company, label as name --where label.kgtk_lqstring_lang_suffix = \"en\" --order-by label --limit 10.\n", " 45.47 real 74.94 user 1.90 sys\n" ] } ], "source": [ "!$kgtk query -i $WIKIDATA_PARTS/part.wikibase-item.tsv.gz -i $WIKIDATA_PARTS/part.label.en.tsv.gz --graph-cache $STORE \\\n", "--match 'item: (n1)-[l:P452]->(n2:Q507443), label: (n1)-[:label]->(label)' \\\n", "--return 'distinct n1 as pharamceutical_company, label as name' \\\n", "--where 'label.kgtk_lqstring_lang_suffix = \"en\"' \\\n", "--order-by 'label' \\\n", "--limit 10" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-24 18:53:48 sqlstore]: IMPORT graph directly into table graph_15 from /Users/pedroszekely/Downloads/kypher/Q44/Q44.part.wikibase-item.tsv.gz ...\n", "[2020-10-24 18:53:48 sqlstore]: IMPORT graph directly into table graph_16 from /Users/pedroszekely/Downloads/kypher/Q44/Q44.label.en.tsv.gz ...\n", "[2020-10-24 18:53:48 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_16_c2.\"node2\" \"property\", graph_16_c2.\"node1\" \"property\", count(graph_15_c1.\"node1\") \"count\"\n", " FROM graph_15 AS graph_15_c1, graph_16 AS graph_16_c2\n", " WHERE graph_15_c1.\"label\"=graph_16_c2.\"node1\"\n", " AND graph_16_c2.\"label\"=?\n", " AND graph_15_c1.\"label\"=graph_16_c2.\"node1\"\n", " AND (kgtk_lqstring_lang_suffix(graph_16_c2.\"node2\") = ?)\n", " GROUP BY property, property\n", " ORDER BY count(graph_15_c1.\"node1\") DESC\n", " LIMIT ?\n", " PARAS: ['label', 'en', 20]\n", "---------------------------------------------\n", "[2020-10-24 18:53:48 sqlstore]: CREATE INDEX on table graph_15 column label ...\n", "[2020-10-24 18:53:48 sqlstore]: ANALYZE INDEX on table graph_15 column label ...\n", "[2020-10-24 18:53:48 sqlstore]: CREATE INDEX on table graph_16 column node1 ...\n", "[2020-10-24 18:53:48 sqlstore]: ANALYZE INDEX on table graph_16 column node1 ...\n", "[2020-10-24 18:53:48 sqlstore]: CREATE INDEX on table graph_16 column label ...\n", "[2020-10-24 18:53:48 sqlstore]: ANALYZE INDEX on table graph_16 column label ...\n", " 1.36 real 1.11 user 0.23 sys\n" ] } ], "source": [ "!$kgtk query -i $KYPHER/Q44/Q44.part.wikibase-item.tsv.gz -i $KYPHER/Q44/Q44.label.en.tsv.gz --graph-cache $STORE \\\n", "--match 'item: (n1)-[l {label: llab}]->(n2), label: (llab)-[:label]->(name)' \\\n", "--return 'distinct name as property, llab as property, count(n1) as count' \\\n", "--where 'name.kgtk_lqstring_lang_suffix = \"en\"' \\\n", "--order-by 'count(n1) desc' \\\n", "--limit 20 \\\n", "-o $OUT/test.tsv " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "import io\n", "import pandas\n", "import subprocess\n", "\n", "def shell_df(command, shell=False, **kwargs):\n", " \"\"\"\n", " Takes a shell command as a string and and reads the result into a Pandas DataFrame.\n", " \n", " Additional keyword arguments are passed through to pandas.read_csv.\n", " \n", " :param command: a shell command that returns tabular data\n", " :type command: str\n", " :param shell: passed to subprocess.Popen\n", " :type shell: bool\n", " \n", " :return: a pandas dataframe\n", " :rtype: :class:`pandas.dataframe`\n", " \"\"\"\n", " proc = subprocess.Popen(command, \n", " shell=shell,\n", " stdout=subprocess.PIPE, \n", " stderr=subprocess.PIPE)\n", " output, error = proc.communicate()\n", " \n", " if proc.returncode == 0:\n", " if error:\n", " print(error.decode())\n", " with io.StringIO(output.decode()) as buffer:\n", " return pandas.read_csv(buffer, **kwargs)\n", " else:\n", " message = (\"Shell command returned non-zero exit status: {0}\\n\\n\"\n", " \"Command was:\\n{1}\\n\\n\"\n", " \"Standard error was:\\n{2}\")\n", " raise IOError(message.format(proc.returncode, command, error.decode()))" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "command = \"kgtk --debug query -i $KYPHER/Q44/Q44.part.wikibase-item.tsv.gz -i $KYPHER/Q44/Q44.label.en.tsv.gz --graph-cache $STORE \\\n", "--match 'item: (n1)-[l {label: llab}]->(n2), label: (llab)-[:label]->(name)' \\\n", "--return 'distinct name as property, llab as property, count(n1) as count' \\\n", "--where 'name.kgtk_lqstring_lang_suffix = \\\"en\\\"' \\\n", "--order-by 'count(n1) desc' \\\n", "--limit 20\"" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-24 19:22:07 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_16_c2.\"node2\" \"property\", graph_15_c1.\"label\" \"property\", count(graph_15_c1.\"node1\") \"count\"\n", " FROM graph_15 AS graph_15_c1, graph_16 AS graph_16_c2\n", " WHERE graph_15_c1.\"label\"=graph_15_c1.\"label\"\n", " AND graph_16_c2.\"label\"=?\n", " AND graph_15_c1.\"label\"=graph_16_c2.\"node1\"\n", " AND (kgtk_lqstring_lang_suffix(graph_16_c2.\"node2\") = ?)\n", " GROUP BY property, property\n", " ORDER BY count(graph_15_c1.\"node1\") DESC\n", " LIMIT ?\n", " PARAS: ['label', 'en', 20]\n", "---------------------------------------------\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
propertyproperty.1count
0'diplomatic relation'@enP5303434
1'language used'@enP29363235
2'member of'@enP4632111
3'contains administrative territorial entity'@enP1501864
4'country'@enP17824
5'shares border with'@enP47564
6'head of government'@enP6562
7'described by source'@enP1343420
8'head of state'@enP35344
9'located in time zone'@enP421319
10'owner of'@enP1830310
11'public holiday'@enP832282
12'twinned administrative body'@enP190235
13'topic\\\\\\\\\\\\\\\\'s main category'@enP910228
14'industry'@enP452218
15'manufacturer'@enP176188
16'official language'@enP37182
17'ethnic group'@enP172173
18'electrical plug type'@enP2853148
19'located in or next to body of water'@enP206144
\n", "
" ], "text/plain": [ " property property.1 count\n", "0 'diplomatic relation'@en P530 3434\n", "1 'language used'@en P2936 3235\n", "2 'member of'@en P463 2111\n", "3 'contains administrative territorial entity'@en P150 1864\n", "4 'country'@en P17 824\n", "5 'shares border with'@en P47 564\n", "6 'head of government'@en P6 562\n", "7 'described by source'@en P1343 420\n", "8 'head of state'@en P35 344\n", "9 'located in time zone'@en P421 319\n", "10 'owner of'@en P1830 310\n", "11 'public holiday'@en P832 282\n", "12 'twinned administrative body'@en P190 235\n", "13 'topic\\\\\\\\\\\\\\\\'s main category'@en P910 228\n", "14 'industry'@en P452 218\n", "15 'manufacturer'@en P176 188\n", "16 'official language'@en P37 182\n", "17 'ethnic group'@en P172 173\n", "18 'electrical plug type'@en P2853 148\n", "19 'located in or next to body of water'@en P206 144" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "shell_df(command, shell=True, sep='\\t')" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bar_chart(_, 'count', 'property')" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "Q318: galaxy\n", "Q11173: chemical compound\n", "Q5: human \n", "Q7187: gene\n", "Q28885102: pharmaceutical product" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "graphs = [\"Q318\", \"Q11173\", \"Q5\", \"Q7187\", \"Q28885102\"]\n", "\n", "command = \"time papermill /Users/pedroszekely/Documents/GitHub/kgtk/examples/Example8\\ -\\ Wikidata\\ Subset.ipynb $OUT.GRAPH.out.ipynb \\\n", "-p wikidata_home /Users/pedroszekely/Downloads/kypher \\\n", "-p wikidata_file wikidata-20200803-all-edges.tsv.gz \\\n", "-p wikidata_parts_folder /Users/pedroszekely/Downloads/kypher/useful_wikidata_files \\\n", "-p subset_name GRAPH \\\n", "-p home /Users/pedroszekely/Downloads/kypher \\\n", "-p cache_folder /Users/pedroszekely/Downloads/kypher \\\n", "-p delete_database no\"\n", "\n", "\n", "# for g in graphs:\n", "# run_command(command, {\"GRAPH\": g})" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-18 21:34:09 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_25_c2.\"node2\" \"node1\"\n", " FROM graph_25 AS graph_25_c2, graph_5 AS graph_5_c1\n", " WHERE graph_25_c2.\"label\"=?\n", " AND graph_5_c1.\"label\"=graph_5_c1.\"label\"\n", " AND graph_5_c1.\"node2\"=?\n", " AND graph_25_c2.\"node1\"=graph_5_c1.\"label\"\n", " AND (kgtk_lqstring_lang_suffix(graph_25_c2.\"node2\") = ?)\n", " LIMIT ?\n", " PARAS: ['label', 'Q20978643', 'en', 10]\n", "---------------------------------------------\n", "node1\n", "'instance of'@en\n", "'subclass of'@en\n", "'facet of'@en\n", "'opposite of'@en\n", "'uses'@en\n", " 0.67 real 0.53 user 0.12 sys\n" ] } ], "source": [ "!$kgtk query -i $WIKIDATA_PARTS/part.wikibase-item.tsv.gz -i $WIKIDATA_PARTS/part.label.en.tsv.gz --graph-cache $STORE \\\n", " --match 'item: ()-[l {label: p}]->(:Q20978643), label: (p)-[:label]->(label)' \\\n", " --return 'distinct label as node1' \\\n", " --where 'label.kgtk_lqstring_lang_suffix = \"en\"' \\\n", " --limit 10" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-17 19:51:40 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT kgtk_lqstring_lang(graph_111_c2.\"node2\") \"langauge\", count(DISTINCT graph_111_c2.\"node2\") \"count\"\n", " FROM graph_111 AS graph_111_c2, graph_5 AS graph_5_c1\n", " WHERE graph_111_c2.\"label\"=?\n", " AND graph_111_c2.\"node1\"=graph_5_c1.\"node1\"\n", " GROUP BY langauge\n", " ORDER BY count(kgtk_lqstring_lang(graph_111_c2.\"node2\")) DESC\n", " PARAS: ['label']\n", "---------------------------------------------\n", "^C\n" ] } ], "source": [ "# Only useful when first argument is a separate list of nodes\n", "!$kgtk query -i $WIKIDATA_PARTS/part.wikibase-item.tsv.gz -i $WIKIDATA_PARTS/part.label.tsv.gz --graph-cache $STORE \\\n", "--match 'item: (n1)-[]->(), label: (n1)-[:label]->(label)' \\\n", "--return 'distinct kgtk_lqstring_lang(label) as langauge, count(distinct label) as count' \\\n", "--order-by 'count(kgtk_lqstring_lang(label)) desc' \\\n", "-o $OUT/language.label.distribution.tsv" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-17 10:32:38 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT kgtk_lqstring_lang(graph_111_c2.\"node2\") \"langauge\", count(DISTINCT graph_111_c2.\"node2\") \"count\"\n", " FROM graph_111 AS graph_111_c2, graph_99 AS graph_99_c1\n", " WHERE graph_111_c2.\"label\"=?\n", " AND graph_111_c2.\"node1\"=graph_99_c1.\"node1\"\n", " GROUP BY langauge\n", " ORDER BY count(kgtk_lqstring_lang(graph_111_c2.\"node2\")) DESC\n", " PARAS: ['label']\n", "---------------------------------------------\n", " 20.09 real 19.80 user 0.25 sys\n" ] } ], "source": [ "!$kgtk query -i $KYPHER/Q44/Q44.part.wikibase-item.tsv.gz -i $WIKIDATA_PARTS/part.label.tsv.gz --graph-cache $STORE \\\n", "--match 'Q44: (n1)-[]->(), label: (n1)-[:label]->(label)' \\\n", "--return 'distinct kgtk_lqstring_lang(label) as langauge, count(distinct label) as count' \\\n", "--order-by 'count(kgtk_lqstring_lang(label)) desc' \\\n", "-o $OUT/Q44.language.distribution.tsv" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "langauge\tcount\n", "zh\t1142\n", "en\t861\n", "de\t541\n", "sr\t321\n", "gom\t173\n", "pt\t362\n", "be\t250\n", "crh\t150\n", "nds\t188\n" ] } ], "source": [ "!head $OUT/Q44.language.distribution.tsv" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-17 20:08:21 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT kgtk_lqstring_lang(graph_111_c1.\"node2\") \"langauge\", count(DISTINCT graph_111_c1.\"node2\") \"count\"\n", " FROM graph_111 AS graph_111_c1\n", " WHERE graph_111_c1.\"label\"=?\n", " GROUP BY langauge\n", " ORDER BY count(kgtk_lqstring_lang(graph_111_c1.\"node2\")) DESC\n", " PARAS: ['label']\n", "---------------------------------------------\n", " 54265.51 real 3271.07 user 50061.03 sys\n" ] } ], "source": [ "!$kgtk query -i $WIKIDATA_PARTS/part.label.tsv.gz --graph-cache $STORE \\\n", "--match '(n1)-[:label]->(label)' \\\n", "--return 'distinct kgtk_lqstring_lang(label) as langauge, count(distinct label) as count' \\\n", "--order-by 'count(kgtk_lqstring_lang(label)) desc' \\\n", "-o $OUT/language.label.distribution.tsv" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "langauge\tcount\n", "en\t72063244\n", "nl\t47885659\n", "de\t12428668\n", "ast\t12238864\n", "fr\t11285204\n", "es\t10692133\n", "zh\t8090879\n", "pt\t7586786\n", "it\t7364609\n" ] } ], "source": [ "!head -10 $OUT/language.label.distribution.tsv" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 396 791 4127 /Users/pedroszekely/Downloads/scratch/language.label.distribution.tsv\n" ] } ], "source": [ "!wc $OUT/language.label.distribution.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Remove large classes from Wikidata" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First create a file with all the edges we need for doing a test" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 8035.64 real 7898.06 user 79.77 sys\n" ] } ], "source": [ "!$kgtk cat \\\n", "-i $WIKIDATA_PARTS/part.alias.en.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.commonsMedia.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.description.en.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.external-id.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.geo-shape.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.globe-coordinate.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.label.en.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.math.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.monolingualtext.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.musical-notation.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.quantity.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.string.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.time.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.type.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.url.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.wikibase-form.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.wikibase-item.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.wikibase-property.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.wikidatatype.distribution.tsv.gz \\\n", "-i $WIKIDATA_PARTS/part.wikipedia-sitelink.tsv.gz \\\n", "| gzip > $WIKIDATA_PARTS/almost.all.edges.tsv.gz" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "Get a list of q-nodes that we want to remove\n", "Q13442814 P1114 35933550 Q13442814-P1114-35933550 'scholarly article'@en\n", "Q523 P1114 3297566 Q523-P1114-3297566 'star'@en\n", "*not removed Q16521 P1114 2745073 Q16521-P1114-2745073 'taxon'@en \n", "Q318 P1114 2102876 Q318-P1114-2102876 'galaxy'@en\n", "Q7318358 P1114 2068766 Q7318358-P1114-2068766 'review article'@en\n", "Q7187 P1114 1196161 Q7187-P1114-1196161 'gene'@en\n", "Q11173 P1114 1063060 Q11173-P1114-1063060 'chemical compound'@en\n", "Q8054 P1114 979961 Q8054-P1114-979961 'protein'@en" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make a list of the Q-nodes that we want to remove" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-18 23:49:28 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_5_c1.\"node1\" \"node1\", graph_2_c2.\"node1\" \"node2\"\n", " FROM graph_2 AS graph_2_c2, graph_5 AS graph_5_c1\n", " WHERE graph_2_c2.\"label\"=?\n", " AND graph_5_c1.\"label\"=?\n", " AND graph_2_c2.\"node1\"=graph_5_c1.\"node2\"\n", " AND (graph_2_c2.\"node2\" IN (?, ?, ?, ?, ?, ?, ?))\n", " PARAS: ['P279star', 'P31', 'Q13442814', 'Q523', 'Q318', 'Q7318358', 'Q7187', 'Q11173', 'Q8054']\n", "---------------------------------------------\n", " 1006.83 real 234.25 user 275.94 sys\n" ] } ], "source": [ "!$kgtk query -i $WIKIDATA_PARTS/part.wikibase-item.tsv.gz -i $WIKIDATA_PARTS/all.P279star.tsv.gz --graph-cache $STORE \\\n", "--match 'item: (n1)-[:P31]->(n2), P279star: (n2)-[:P279star]->(q)' \\\n", "--where \"q in ['Q13442814', 'Q523', 'Q318', 'Q7318358', 'Q7187', 'Q11173', 'Q8054']\" \\\n", "--return 'distinct n1 as node1, n2 as node2' \\\n", "> $OUT/temp.items.remove.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Need to make a list of classes to remove: if we remove we should remove all the subclasses too" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 48643331 97286662 928980995 /Users/pedroszekely/Downloads/scratch/temp.items.remove.tsv\n" ] } ], "source": [ "!wc $OUT/temp.items.remove.tsv " ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tnode2\n", "Q65225360\tQ101487\n", "Q221307\tQ101487\n", "Q413421\tQ101487\n", "Q415750\tQ101487\n", "Q411073\tQ101487\n", "Q416972\tQ101487\n", "Q905731\tQ101487\n", "Q49081089\tQ101487\n", "Q382897\tQ101487\n" ] } ], "source": [ "!head $OUT/temp.items.remove.tsv " ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-19 08:24:17 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_2_c1.\"node1\" \"node1\", graph_2_c1.\"node2\" \"node2\"\n", " FROM graph_2 AS graph_2_c1\n", " WHERE graph_2_c1.\"label\"=?\n", " AND (graph_2_c1.\"node2\" IN (?, ?, ?, ?, ?, ?, ?))\n", " PARAS: ['P279star', 'Q13442814', 'Q523', 'Q318', 'Q7318358', 'Q7187', 'Q11173', 'Q8054']\n", "---------------------------------------------\n", " 54.97 real 13.30 user 13.09 sys\n" ] } ], "source": [ "!$kgtk query -i $WIKIDATA_PARTS/all.P279star.tsv.gz --graph-cache $STORE \\\n", "--match '(n1)-[:P279star]->(q)' \\\n", "--where \"q in ['Q13442814', 'Q523', 'Q318', 'Q7318358', 'Q7187', 'Q11173', 'Q8054']\" \\\n", "--return 'distinct n1 as node1, q as node2' \\\n", "> $OUT/temp.subclasses.remove.tsv" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2654438 5308876 43316342 /Users/pedroszekely/Downloads/scratch/temp.subclasses.remove.tsv\n" ] } ], "source": [ "!wc $OUT/temp.subclasses.remove.tsv" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tnode2\n", "Q1000726\tQ11173\n", "Q1010629\tQ11173\n", "Q1013950\tQ11173\n", "Q101487\tQ11173\n", "Q101497\tQ11173\n", "Q1018145\tQ11173\n", "Q1018211\tQ11173\n", "Q1018722\tQ11173\n", "Q1018754\tQ11173\n" ] } ], "source": [ "!head $OUT/temp.subclasses.remove.tsv" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[90mid\u001b[39m Q1000726\n", "\u001b[42mLabel\u001b[49m native strength\n", "\u001b[44mDescription\u001b[49m organisches, makromolekulares Polysaccharid\n", "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mpolysaccharides \u001b[90m(Q134219)\u001b[39m\n", "\n", "\u001b[90mid\u001b[39m Q1018754\n", "\u001b[42mLabel\u001b[49m diazoles\n", "\u001b[44mDescription\u001b[49m class of chemical compounds which contain five-membered aromatic ring with two nitrogen heteroatoms\n", "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39m structural class of chemical compounds \u001b[90m(Q47154513)\u001b[39m\n", "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39morganonitrogen heterocyclic compound \u001b[90m(Q72084374)\u001b[39m | heteroarene \u001b[90m(Q907447)\u001b[39m\n", "\n", "\u001b[90mid\u001b[39m Q98066085\n", "\u001b[42mLabel\u001b[49m surface protease GP63 (pseudogene), putative\n", "\u001b[44mDescription\u001b[49m بروتين في تريبانوسوما كروزية\n", "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39m protein \u001b[90m(Q8054)\u001b[39m\n", "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mpseudogenic transcript \u001b[90m(Q64698614)\u001b[39m\n", "\n", "\u001b[90mid\u001b[39m Q982914\n", "\u001b[42mLabel\u001b[49m Hexokinase\n", "\u001b[44mDescription\u001b[49m class of enzymes\n", "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39m enzyme family \u001b[90m(Q67015883)\u001b[39m\n", "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mphosphotransferase \u001b[90m(Q3381520)\u001b[39m\n" ] } ], "source": [ "!wd u Q1000726 Q1018754 Q98066085 Q982914" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get all the items we want to remove" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 201.60 real 197.55 user 2.73 sys\n" ] } ], "source": [ "!$kgtk cat --mode NONE -i $OUT/temp.items.remove.tsv -i $OUT/temp.subclasses.remove.tsv > $OUT/temp.things.remove.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get all the edges we want to remove" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-19 11:34:14 sqlstore]: IMPORT graph directly into table graph_1 from /Users/pedroszekely/Downloads/scratch/temp.things.remove.tsv ...\n", "[2020-10-19 11:35:37 sqlstore]: IMPORT graph directly into table graph_2 from /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/almost.all.edges.tsv.gz ...\n", "[2020-10-19 13:28:58 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_2_c2.\"node1\" \"node1\", graph_2_c2.\"label\" \"label\", graph_2_c2.\"node2\" \"node2\", graph_2_c2.\"id\" \"id\"\n", " FROM graph_1 AS graph_1_c1, graph_2 AS graph_2_c2\n", " WHERE graph_1_c1.\"node1\"=graph_2_c2.\"node1\"\n", " ORDER BY graph_2_c2.\"id\" ASC\n", " PARAS: []\n", "---------------------------------------------\n", "[2020-10-19 13:28:58 sqlstore]: CREATE INDEX on table graph_2 column node1 ...\n", "[2020-10-19 14:10:58 sqlstore]: ANALYZE INDEX on table graph_2 column node1 ...\n", "[2020-10-19 14:13:40 sqlstore]: CREATE INDEX on table graph_1 column node1 ...\n", "[2020-10-19 14:14:20 sqlstore]: ANALYZE INDEX on table graph_1 column node1 ...\n", " 28337.09 real 23020.91 user 4318.42 sys\n" ] } ], "source": [ "!$kgtk query -i $OUT/temp.things.remove.tsv -i $WIKIDATA_PARTS/almost.all.edges.tsv.gz --graph-cache $STORE \\\n", "--match 'remove: (n1)-[]->(), all: (n1)-[l]->(n2)' \\\n", "--return 'distinct n1 as node1, l.label as label, n2 as node2, l as id' \\\n", "--order-by l \\\n", "-o $OUT/temp.edges.remove.tsv.gz" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 984873984 5242393011 54970910630\n" ] } ], "source": [ "!gzcat $OUT/temp.edges.remove.tsv.gz | wc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We must sort the `almost.all.edges` file because ifnotexists will run out of memory if the files are not sorted. Note that we don't need to sort the `temp.edges.remove` because the `query` command has an `--order-by` clause." ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 7377.98 real 6833.53 user 647.69 sys\n" ] } ], "source": [ "!$kgtk sort2 \\\n", " --columns id node1 label node2 \\\n", " -X \"--buffer-size 50% --parallel 4 -T $OUT/sort\" \\\n", " -i $WIKIDATA_PARTS/almost.all.edges.tsv.gz \\\n", "| gzip > $WIKIDATA_PARTS/almost.all.edges.sorted.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now remove the edges" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KgtkIfEfexists version: 2020-10-20T00:17:59.814324+00:00#EbuHEPUZTUwEzyNtkR5BuhxPXaSCQze2GEwE595ETRtwEUcrAgPganWGSJiuEW0a3Y1DWNRvQxqgHe+vlColrw==\n", "Opening the input file: /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/almost.all.edges.sorted.tsv.gz\n", "KgtkReader: File_path.suffix: .gz\n", "KgtkReader: reading gzip /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/almost.all.edges.sorted.tsv.gz\n", "header: id\tnode1\tlabel\tnode2\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", "KgtkReader: Reading an edge file.\n", "Opening the filter input file: /Users/pedroszekely/Downloads/scratch/temp.edges.remove.tsv.gz\n", "KgtkReader: File_path.suffix: .gz\n", "KgtkReader: reading gzip /Users/pedroszekely/Downloads/scratch/temp.edges.remove.tsv.gz\n", "header: node1\tlabel\tnode2\tid\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: Special columns: node1=0 label=1 node2=2 id=3\n", "KgtkReader: Reading an edge file.\n", "Opening the output file: /Users/pedroszekely/Downloads/scratch/wikidata.minus.Q13442814.Q523.Q318.Q7318358.Q7187.Q11173.Q8054.edges.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing gzip /Users/pedroszekely/Downloads/scratch/wikidata.minus.Q13442814.Q523.Q318.Q7318358.Q7187.Q11173.Q8054.edges.tsv.gz\n", "header: id\tnode1\tlabel\tnode2\n", "Opening the reject file: /Users/pedroszekely/Downloads/scratch/rejected.edges.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing gzip /Users/pedroszekely/Downloads/scratch/rejected.edges.tsv.gz\n", "header: id\tnode1\tlabel\tnode2\n", "Processing presorted files.\n", "Exception in thread background thread for pid 44042:\n", "Traceback (most recent call last):\n", " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/threading.py\", line 926, in _bootstrap_inner\n", " self.run()\n", " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/threading.py\", line 870, in run\n", " self._target(*self._args, **self._kwargs)\n", " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 1662, in wrap\n", " fn(*args, **kwargs)\n", " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 2606, in background_thread\n", " handle_exit_code(exit_code)\n", " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 2304, in fn\n", " return self.command.handle_command_exit_code(exit_code)\n", " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 877, in handle_command_exit_code\n", " raise exc\n", "sh.ErrorReturnCode_1: \n", "\n", " RAN: /usr/local/bin/pv '-d 44039'\n", "\n", " STDOUT:\n", "\n", "\n", " STDERR:\n", "\n", "\n", "Read 1443196923 records, accepted 462394639 records, rejected 980802284 records using 984873983 filter records.\n", "Timing: elapsed=4:56:45.249203 CPU=4:55:48.657906 ( 99.7%): ifnotexists --verbose -i /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/almost.all.edges.sorted.tsv.gz --filter-on /Users/pedroszekely/Downloads/scratch/temp.edges.remove.tsv.gz --presorted --input-keys id node1 label node2 --filter-keys id node1 label node2 -o /Users/pedroszekely/Downloads/scratch/wikidata.minus.Q13442814.Q523.Q318.Q7318358.Q7187.Q11173.Q8054.edges.tsv.gz --reject-file /Users/pedroszekely/Downloads/scratch/rejected.edges.tsv.gz\n", " 17805.73 real 17682.21 user 67.15 sys\n" ] } ], "source": [ "!$kgtk --timing --progress \\\n", " ifnotexists --verbose \\\n", " -i $WIKIDATA_PARTS/almost.all.edges.sorted.tsv.gz \\\n", " --filter-on $OUT/temp.edges.remove.tsv.gz \\\n", " --presorted \\\n", " --input-keys id node1 label node2 \\\n", " --filter-keys id node1 label node2 \\\n", " -o $OUT/wikidata.minus.Q13442814.Q523.Q318.Q7318358.Q7187.Q11173.Q8054.edges.tsv.gz \\\n", " --reject-file $OUT/rejected.edges.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "See how many edges we have now" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 462394640 2123743245 26042700951\n" ] } ], "source": [ "!gzcat $OUT/wikidata.minus.Q13442814.Q523.Q318.Q7318358.Q7187.Q11173.Q8054.edges.tsv.gz | wc" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "kgtk", "language": "python", "name": "kgtk" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.8" } }, "nbformat": 4, "nbformat_minor": 4 }