{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Creating a subset of Wikidata\n", "\n", "This notebook illustrates how to create a subset of Wikidata. We use as an example https://www.wikidata.org/wiki/Q11173 (chemical compound)\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [ "parameters" ] }, "outputs": [], "source": [ "# Parameters\n", "wikidata_home = \"/Users/pedroszekely/Downloads/kypher\"\n", "wikidata_file = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v3/all.tsv.gz\"\n", "wikidata_file = \"all.10.tsv.gz\"\n", "wikidata_parts_folder = \"/Users/pedroszekely/Downloads/kypher/useful_wikidata_files\"\n", "#wikidata_parts_folder = \"/Users/pedroszekely/Downloads/kypher/output.all.10\"\n", "home = \"/Users/pedroszekely/Downloads/kypher\"\n", "cache_folder = \"/Users/pedroszekely/Downloads/kypher\"\n", "output_folder = \"/Users/pedroszekely/Downloads/scratch\"\n", "delete_database = \"yes\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import io\n", "import os\n", "import subprocess\n", "import sys\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import altair as alt\n", "\n", "# from IPython.display import display, HTML, Image\n", "# from pandas_profiling import ProfileReport" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set up environment variables and folders that we need" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# folder containing wikidata broken down into smaller files.\n", "os.environ['WIKIDATA_PARTS'] = wikidata_parts_folder\n", "# path of folder where the wikidata parts folder is stored.\n", "os.environ['WIKIDATA_HOME'] = wikidata_home\n", "os.environ['KYPHER'] = home\n", "os.environ['OUT'] = output_folder\n", "# kgtk command to run\n", "os.environ['kgtk'] = \"kgtk\"\n", "os.environ['kgtk'] = \"time kgtk --debug\"\n", "# absolute path of the db\n", "os.environ['STORE'] = \"{}/wikidata.sqlite3.db\".format(cache_folder)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/pedroszekely/Downloads/kypher\n" ] } ], "source": [ "cd $home" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "def bar_chart(data, x_column, y_column):\n", " \"\"\"Construct a simple bar chart with two properties\"\"\"\n", " bars = alt.Chart(data).mark_bar().encode(\n", " y=alt.Y(y_column, sort='-x'),\n", " x=x_column\n", " )\n", "\n", " text = bars.mark_text(\n", " align='left',\n", " baseline='middle',\n", " dx=3 # Nudges text to right so it doesn't appear on top of the bar\n", " ).encode(\n", " text=x_column\n", " )\n", "\n", " return (bars + text)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "def run_command(cmd, substitution_dictionary = {}):\n", " \"\"\"Run a templetized command.\"\"\"\n", " for k, v in substitution_dictionary.items():\n", " cmd = cmd.replace(k, v)\n", " \n", " print(cmd)\n", " output = subprocess.run([cmd], shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n", " print(output.stdout)\n", " print(output.stderr)\n", " #print(output.returncode)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-20 22:48:09 sqlstore]: IMPORT graph directly into table graph_3 from /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/part.wikibase-item.tsv.gz ...\n", "^C\n", "\n", "Keyboard interrupt in query -i /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/part.wikibase-item.tsv.gz -i /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/part.label.en.tsv.gz --graph-cache /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db --match item: (n1)-[l:P452]->(n2:Q507443), label: (n1)-[:label]->(label) --return distinct n1 as pharamceutical_company, label as name --where label.kgtk_lqstring_lang_suffix = \"en\" --order-by label --limit 10.\n", " 45.47 real 74.94 user 1.90 sys\n" ] } ], "source": [ "!$kgtk query -i $WIKIDATA_PARTS/part.wikibase-item.tsv.gz -i $WIKIDATA_PARTS/part.label.en.tsv.gz --graph-cache $STORE \\\n", "--match 'item: (n1)-[l:P452]->(n2:Q507443), label: (n1)-[:label]->(label)' \\\n", "--return 'distinct n1 as pharamceutical_company, label as name' \\\n", "--where 'label.kgtk_lqstring_lang_suffix = \"en\"' \\\n", "--order-by 'label' \\\n", "--limit 10" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-24 18:53:48 sqlstore]: IMPORT graph directly into table graph_15 from /Users/pedroszekely/Downloads/kypher/Q44/Q44.part.wikibase-item.tsv.gz ...\n", "[2020-10-24 18:53:48 sqlstore]: IMPORT graph directly into table graph_16 from /Users/pedroszekely/Downloads/kypher/Q44/Q44.label.en.tsv.gz ...\n", "[2020-10-24 18:53:48 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_16_c2.\"node2\" \"property\", graph_16_c2.\"node1\" \"property\", count(graph_15_c1.\"node1\") \"count\"\n", " FROM graph_15 AS graph_15_c1, graph_16 AS graph_16_c2\n", " WHERE graph_15_c1.\"label\"=graph_16_c2.\"node1\"\n", " AND graph_16_c2.\"label\"=?\n", " AND graph_15_c1.\"label\"=graph_16_c2.\"node1\"\n", " AND (kgtk_lqstring_lang_suffix(graph_16_c2.\"node2\") = ?)\n", " GROUP BY property, property\n", " ORDER BY count(graph_15_c1.\"node1\") DESC\n", " LIMIT ?\n", " PARAS: ['label', 'en', 20]\n", "---------------------------------------------\n", "[2020-10-24 18:53:48 sqlstore]: CREATE INDEX on table graph_15 column label ...\n", "[2020-10-24 18:53:48 sqlstore]: ANALYZE INDEX on table graph_15 column label ...\n", "[2020-10-24 18:53:48 sqlstore]: CREATE INDEX on table graph_16 column node1 ...\n", "[2020-10-24 18:53:48 sqlstore]: ANALYZE INDEX on table graph_16 column node1 ...\n", "[2020-10-24 18:53:48 sqlstore]: CREATE INDEX on table graph_16 column label ...\n", "[2020-10-24 18:53:48 sqlstore]: ANALYZE INDEX on table graph_16 column label ...\n", " 1.36 real 1.11 user 0.23 sys\n" ] } ], "source": [ "!$kgtk query -i $KYPHER/Q44/Q44.part.wikibase-item.tsv.gz -i $KYPHER/Q44/Q44.label.en.tsv.gz --graph-cache $STORE \\\n", "--match 'item: (n1)-[l {label: llab}]->(n2), label: (llab)-[:label]->(name)' \\\n", "--return 'distinct name as property, llab as property, count(n1) as count' \\\n", "--where 'name.kgtk_lqstring_lang_suffix = \"en\"' \\\n", "--order-by 'count(n1) desc' \\\n", "--limit 20 \\\n", "-o $OUT/test.tsv " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "import io\n", "import pandas\n", "import subprocess\n", "\n", "def shell_df(command, shell=False, **kwargs):\n", " \"\"\"\n", " Takes a shell command as a string and and reads the result into a Pandas DataFrame.\n", " \n", " Additional keyword arguments are passed through to pandas.read_csv.\n", " \n", " :param command: a shell command that returns tabular data\n", " :type command: str\n", " :param shell: passed to subprocess.Popen\n", " :type shell: bool\n", " \n", " :return: a pandas dataframe\n", " :rtype: :class:`pandas.dataframe`\n", " \"\"\"\n", " proc = subprocess.Popen(command, \n", " shell=shell,\n", " stdout=subprocess.PIPE, \n", " stderr=subprocess.PIPE)\n", " output, error = proc.communicate()\n", " \n", " if proc.returncode == 0:\n", " if error:\n", " print(error.decode())\n", " with io.StringIO(output.decode()) as buffer:\n", " return pandas.read_csv(buffer, **kwargs)\n", " else:\n", " message = (\"Shell command returned non-zero exit status: {0}\\n\\n\"\n", " \"Command was:\\n{1}\\n\\n\"\n", " \"Standard error was:\\n{2}\")\n", " raise IOError(message.format(proc.returncode, command, error.decode()))" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "command = \"kgtk --debug query -i $KYPHER/Q44/Q44.part.wikibase-item.tsv.gz -i $KYPHER/Q44/Q44.label.en.tsv.gz --graph-cache $STORE \\\n", "--match 'item: (n1)-[l {label: llab}]->(n2), label: (llab)-[:label]->(name)' \\\n", "--return 'distinct name as property, llab as property, count(n1) as count' \\\n", "--where 'name.kgtk_lqstring_lang_suffix = \\\"en\\\"' \\\n", "--order-by 'count(n1) desc' \\\n", "--limit 20\"" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2020-10-24 19:22:07 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_16_c2.\"node2\" \"property\", graph_15_c1.\"label\" \"property\", count(graph_15_c1.\"node1\") \"count\"\n", " FROM graph_15 AS graph_15_c1, graph_16 AS graph_16_c2\n", " WHERE graph_15_c1.\"label\"=graph_15_c1.\"label\"\n", " AND graph_16_c2.\"label\"=?\n", " AND graph_15_c1.\"label\"=graph_16_c2.\"node1\"\n", " AND (kgtk_lqstring_lang_suffix(graph_16_c2.\"node2\") = ?)\n", " GROUP BY property, property\n", " ORDER BY count(graph_15_c1.\"node1\") DESC\n", " LIMIT ?\n", " PARAS: ['label', 'en', 20]\n", "---------------------------------------------\n", "\n" ] }, { "data": { "text/html": [ "
\n", " | property | \n", "property.1 | \n", "count | \n", "
---|---|---|---|
0 | \n", "'diplomatic relation'@en | \n", "P530 | \n", "3434 | \n", "
1 | \n", "'language used'@en | \n", "P2936 | \n", "3235 | \n", "
2 | \n", "'member of'@en | \n", "P463 | \n", "2111 | \n", "
3 | \n", "'contains administrative territorial entity'@en | \n", "P150 | \n", "1864 | \n", "
4 | \n", "'country'@en | \n", "P17 | \n", "824 | \n", "
5 | \n", "'shares border with'@en | \n", "P47 | \n", "564 | \n", "
6 | \n", "'head of government'@en | \n", "P6 | \n", "562 | \n", "
7 | \n", "'described by source'@en | \n", "P1343 | \n", "420 | \n", "
8 | \n", "'head of state'@en | \n", "P35 | \n", "344 | \n", "
9 | \n", "'located in time zone'@en | \n", "P421 | \n", "319 | \n", "
10 | \n", "'owner of'@en | \n", "P1830 | \n", "310 | \n", "
11 | \n", "'public holiday'@en | \n", "P832 | \n", "282 | \n", "
12 | \n", "'twinned administrative body'@en | \n", "P190 | \n", "235 | \n", "
13 | \n", "'topic\\\\\\\\\\\\\\\\'s main category'@en | \n", "P910 | \n", "228 | \n", "
14 | \n", "'industry'@en | \n", "P452 | \n", "218 | \n", "
15 | \n", "'manufacturer'@en | \n", "P176 | \n", "188 | \n", "
16 | \n", "'official language'@en | \n", "P37 | \n", "182 | \n", "
17 | \n", "'ethnic group'@en | \n", "P172 | \n", "173 | \n", "
18 | \n", "'electrical plug type'@en | \n", "P2853 | \n", "148 | \n", "
19 | \n", "'located in or next to body of water'@en | \n", "P206 | \n", "144 | \n", "