{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 6: partition the files to follow the conventions KGTK uses for Wikidata" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ALIAS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/aliases.en.tsv.gz\"\n", "ALL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/all.tsv.gz\"\n", "CLAIMS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/claims.tsv.gz\"\n", "DESCRIPTION: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/descriptions.en.tsv.gz\"\n", "EXAMPLES_DIR: \"/Users/pedroszekely/Documents/GitHub/kgtk/examples\"\n", "GE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp/graph-embedding\"\n", "ISA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.isa.tsv.gz\"\n", "ITEM: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/claims.wikibase-item.tsv.gz\"\n", "KGTK_PATH: \"/Users/pedroszekely/Documents/GitHub/kgtk\"\n", "LABEL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/labels.en.tsv.gz\"\n", "OUT: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output\"\n", "P279: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.P279.tsv.gz\"\n", "P279STAR: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.P279star.tsv.gz\"\n", "PROPERTY_DATATYPES: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/metadata.property.datatypes.tsv.gz\"\n", "Q154ALIAS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/aliases.en.tsv.gz\"\n", "Q154ALL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/all.tsv.gz\"\n", "Q154CLAIMS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/claims.tsv.gz\"\n", "Q154DESCRIPTION: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/descriptions.en.tsv.gz\"\n", "Q154ISA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.isa.tsv.gz\"\n", 
"Q154ITEM: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/claims.wikibase-item.tsv.gz\"\n", "Q154LABEL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/labels.en.tsv.gz\"\n", "Q154P279: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.P279.tsv.gz\"\n", "Q154P279STAR: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.P279star.tsv.gz\"\n", "Q154PROPERTY_DATATYPES: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/metadata.property.datatypes.tsv.gz\"\n", "Q154QUALIFIERS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/qualifiers.tsv.gz\"\n", "Q154QUALIFIERS_TIME: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/qualifiers.time.tsv.gz\"\n", "Q154SITELINKS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/sitelinks.tsv.gz\"\n", "QUALIFIERS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/qualifiers.tsv.gz\"\n", "QUALIFIERS_TIME: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/qualifiers.time.tsv.gz\"\n", "SITELINKS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/sitelinks.tsv.gz\"\n", "STORE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/wikidata.sqlite3.miniwikidata.db\"\n", "TE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp/text-embedding\"\n", "TEMP: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp\"\n", "USECASE_DIR: \"/Users/pedroszekely/Documents/GitHub/kgtk/use-cases\"\n", "WIKIDATA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/\"\n", "kgtk: \"kgtk --debug\"\n", "kypher: \"kgtk query --graph-cache /Users/pedroszekely/Downloads/kgtk-tutorial/wikidata.sqlite3.miniwikidata.db\"\n" ] } ], "source": [ "import sys \n", "sys.path.insert(0, 'tutorial')\n", "from tutorial_setup import *\n", "from generate_report import run" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/pedroszekely/Downloads/kgtk-tutorial\n" ] } ], "source": [ 
"%cd {output_path}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We'll use the partition-wikidata notebook to complete this step. This notebook expects an input file that includes all edges and qualifiers together. We also need to specify a directory where partitioned files should be created, and a directory where temporary files can be sent (this should be different from our temp directory as the partition notebook will clear any existing files in this folder)." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "!mkdir -p $OUT/parts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Combine the main edges with the qualifiers" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "!$kgtk cat -i $OUT/all.tsv.gz -i $OUT/Q154.qualifiers.tsv.gz -o $TEMP/all_and_qualifiers.tsv.gz" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id\tnode1\tlabel\tnode2\n", "P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\n", "P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\n", "P10-P1629-Q34508-bcc39400-0\tP10\tP1629\tQ34508\n", "P10-P1659-P1651-c4068028-0\tP10\tP1659\tP1651\n", "P10-P1659-P18-5e4b9c4f-0\tP10\tP1659\tP18\n", "P10-P1659-P4238-d21d1ac0-0\tP10\tP1659\tP4238\n", "P10-P1659-P51-86aca4c5-0\tP10\tP1659\tP51\n", "P10-P1855-Q7378-555592a4-0\tP10\tP1855\tQ7378\n", "P10-P2302-Q21502404-d012aef4-0\tP10\tP2302\tQ21502404\n", "zcat: error writing to output: Broken pipe\n" ] } ], "source": [ "!zcat < $TEMP/all_and_qualifiers.tsv.gz | head" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8852c21cabe64f209d795f18ceb39283", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Executing: 0%| | 0/49 [00:00, ?cell/s]" ] 
}, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pm.execute_notebook(\n", " os.environ[\"EXAMPLES_DIR\"] + \"/partition-wikidata.ipynb\",\n", " os.environ[\"TEMP\"] + \"/partition-wikidata.out.ipynb\",\n", " parameters=dict(\n", " wikidata_input_path = os.environ[\"TEMP\"] + \"/all_and_qualifiers.tsv.gz\",\n", " wikidata_parts_path = os.environ[\"OUT\"] + \"/parts\",\n", " temp_folder_path = os.environ[\"OUT\"] + \"/parts/temp\",\n", " sort_extras = \"--buffer-size 30% --temporary-directory $OUT/parts/temp\",\n", " verbose = False\n", " )\n", ")\n", ";" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The partition-wikidata notebook created the following partitioned KGTK files:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "aliases.en.tsv.gz metadata.property.datatypes.tsv.gz\n", "aliases.tsv.gz metadata.types.tsv.gz\n", "all.tsv.gz qualifiers.commonsMedia.tsv.gz\n", "claims.commonsMedia.tsv.gz qualifiers.external-id.tsv.gz\n", "claims.external-id.tsv.gz qualifiers.geo-shape.tsv.gz\n", "claims.geo-shape.tsv.gz qualifiers.globe-coordinate.tsv.gz\n", "claims.globe-coordinate.tsv.gz qualifiers.math.tsv.gz\n", "claims.math.tsv.gz qualifiers.monolingualtext.tsv.gz\n", "claims.monolingualtext.tsv.gz qualifiers.musical-notation.tsv.gz\n", "claims.musical-notation.tsv.gz qualifiers.quantity.tsv.gz\n", "claims.other.tsv.gz qualifiers.string.tsv.gz\n", "claims.quantity.tsv.gz qualifiers.tabular-data.tsv.gz\n", "claims.string.tsv.gz qualifiers.time.tsv.gz\n", "claims.tabular-data.tsv.gz qualifiers.tsv.gz\n", "claims.time.tsv.gz qualifiers.url.tsv.gz\n", "claims.tsv.gz qualifiers.wikibase-form.tsv.gz\n", "claims.url.tsv.gz qualifiers.wikibase-item.tsv.gz\n", "claims.wikibase-form.tsv.gz qualifiers.wikibase-lexeme.tsv.gz\n", 
"claims.wikibase-item.tsv.gz qualifiers.wikibase-property.tsv.gz\n", "claims.wikibase-lexeme.tsv.gz qualifiers.wikibase-sense.tsv.gz\n", "claims.wikibase-property.tsv.gz sitelinks.en.tsv.gz\n", "claims.wikibase-sense.tsv.gz sitelinks.qualifiers.en.tsv.gz\n", "descriptions.en.tsv.gz sitelinks.qualifiers.tsv.gz\n", "descriptions.tsv.gz sitelinks.tsv.gz\n", "labels.en.tsv.gz \u001b[34mtemp\u001b[m\u001b[m\n", "labels.tsv.gz\n" ] } ], "source": [ "!ls $OUT/parts" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "count(DISTINCT graph_20_c1.\"node1\")\n", "15513\n" ] } ], "source": [ "!$kypher -i $OUT/parts/claims.tsv.gz \\\n", "--match '(n1)-[]->()' \\\n", "--return 'count(distinct n1)'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 7: run the Wikidata Useful Files notebook" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f56046e4ba58403fbe0fb0d6913e0701", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Executing: 0%| | 0/96 [00:00, ?cell/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pm.execute_notebook(\n", " os.environ[\"USECASE_DIR\"] + \"/Wikidata Useful Files.ipynb\",\n", " os.environ[\"TEMP\"] + \"/Wikidata Useful Files Out.ipynb\",\n", " parameters=dict(\n", " output_path = os.environ[\"OUT\"],\n", " output_folder = \"useful_files\",\n", " temp_folder = \"temp.useful_files\",\n", " wiki_root_folder = os.environ[\"OUT\"] + \"/parts/\",\n", " cache_path = os.environ[\"OUT\"] + \"/temp.useful_files\",\n", " languages = 'en',\n", " compute_pagerank = True,\n", " delete_database = True\n", " )\n", ")\n", ";" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The Wikidata Useful Files notebook created the following files in `$OUT/useful_files`:" ] }, { 
"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 15904\n", "-rw-r--r-- 1 pedroszekely staff 628K Jan 24 13:13 aliases.en.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 148K Jan 24 13:13 derived.P279.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 1.7M Jan 24 13:14 derived.P279star.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 176K Jan 24 13:13 derived.P31.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 99K Jan 24 13:14 derived.isa.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 678K Jan 24 13:13 descriptions.en.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 627K Jan 24 13:13 labels.en.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 205K Jan 24 13:14 metadata.in_degree.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 114K Jan 24 13:14 metadata.out_degree.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 1.0M Jan 24 13:14 metadata.pagerank.directed.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 1.1M Jan 24 13:14 metadata.pagerank.undirected.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 1.2K Jan 24 13:14 statistics.in_degree.distribution.tsv\n", "-rw-r--r-- 1 pedroszekely staff 3.3K Jan 24 13:15 statistics.out_degree.distribution.tsv\n" ] } ], "source": [ "!ls -lh $OUT/useful_files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Look at the distribution of out degrees" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | out_degree | \n", "count | \n", "label | \n", "
---|---|---|---|
0 | \n", "1 | \n", "6206 | \n", "count | \n", "
1 | \n", "2 | \n", "3820 | \n", "count | \n", "
2 | \n", "3 | \n", "761 | \n", "count | \n", "
3 | \n", "4 | \n", "460 | \n", "count | \n", "
4 | \n", "5 | \n", "424 | \n", "count | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
276 | \n", "1140 | \n", "1 | \n", "count | \n", "
277 | \n", "1246 | \n", "1 | \n", "count | \n", "
278 | \n", "1256 | \n", "1 | \n", "count | \n", "
279 | \n", "1356 | \n", "1 | \n", "count | \n", "
280 | \n", "1535 | \n", "1 | \n", "count | \n", "
281 rows × 3 columns
\n", "