{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 6: partition the files to follow the conventions KGTK uses for Wikidata" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ALIAS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/aliases.en.tsv.gz\"\n", "ALL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/all.tsv.gz\"\n", "CLAIMS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/claims.tsv.gz\"\n", "DESCRIPTION: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/descriptions.en.tsv.gz\"\n", "EXAMPLES_DIR: \"/Users/pedroszekely/Documents/GitHub/kgtk/examples\"\n", "GE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp/graph-embedding\"\n", "ISA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.isa.tsv.gz\"\n", "ITEM: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/claims.wikibase-item.tsv.gz\"\n", "KGTK_PATH: \"/Users/pedroszekely/Documents/GitHub/kgtk\"\n", "LABEL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/labels.en.tsv.gz\"\n", "OUT: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output\"\n", "P279: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.P279.tsv.gz\"\n", "P279STAR: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.P279star.tsv.gz\"\n", "PROPERTY_DATATYPES: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/metadata.property.datatypes.tsv.gz\"\n", "Q154ALIAS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/aliases.en.tsv.gz\"\n", "Q154ALL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/all.tsv.gz\"\n", "Q154CLAIMS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/claims.tsv.gz\"\n", "Q154DESCRIPTION: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/descriptions.en.tsv.gz\"\n", "Q154ISA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.isa.tsv.gz\"\n", 
"Q154ITEM: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/claims.wikibase-item.tsv.gz\"\n", "Q154LABEL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/labels.en.tsv.gz\"\n", "Q154P279: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.P279.tsv.gz\"\n", "Q154P279STAR: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.P279star.tsv.gz\"\n", "Q154PROPERTY_DATATYPES: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/metadata.property.datatypes.tsv.gz\"\n", "Q154QUALIFIERS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/qualifiers.tsv.gz\"\n", "Q154QUALIFIERS_TIME: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/qualifiers.time.tsv.gz\"\n", "Q154SITELINKS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/sitelinks.tsv.gz\"\n", "QUALIFIERS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/qualifiers.tsv.gz\"\n", "QUALIFIERS_TIME: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/qualifiers.time.tsv.gz\"\n", "SITELINKS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/sitelinks.tsv.gz\"\n", "STORE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/wikidata.sqlite3.miniwikidata.db\"\n", "TE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp/text-embedding\"\n", "TEMP: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp\"\n", "USECASE_DIR: \"/Users/pedroszekely/Documents/GitHub/kgtk/use-cases\"\n", "WIKIDATA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/\"\n", "kgtk: \"kgtk --debug\"\n", "kypher: \"kgtk query --graph-cache /Users/pedroszekely/Downloads/kgtk-tutorial/wikidata.sqlite3.miniwikidata.db\"\n" ] } ], "source": [ "import sys \n", "sys.path.insert(0, 'tutorial')\n", "from tutorial_setup import *\n", "from generate_report import run" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/pedroszekely/Downloads/kgtk-tutorial\n" ] } ], "source": [ 
"%cd {output_path}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We'll use the partition-wikidata notebook to complete this step. This notebook expects a single input file that contains all edges and qualifiers together. We also need to specify a directory where the partitioned files should be created, and a directory where temporary files can be written. The temporary directory must be different from our own temp directory, because the partition notebook deletes any existing files in that folder." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "!mkdir -p $OUT/parts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Combine the main edges with the qualifiers into a single file:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "!$kgtk cat -i $OUT/all.tsv.gz -i $OUT/Q154.qualifiers.tsv.gz -o $TEMP/all_and_qualifiers.tsv.gz" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id\tnode1\tlabel\tnode2\n", "P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\n", "P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\n", "P10-P1629-Q34508-bcc39400-0\tP10\tP1629\tQ34508\n", "P10-P1659-P1651-c4068028-0\tP10\tP1659\tP1651\n", "P10-P1659-P18-5e4b9c4f-0\tP10\tP1659\tP18\n", "P10-P1659-P4238-d21d1ac0-0\tP10\tP1659\tP4238\n", "P10-P1659-P51-86aca4c5-0\tP10\tP1659\tP51\n", "P10-P1855-Q7378-555592a4-0\tP10\tP1855\tQ7378\n", "P10-P2302-Q21502404-d012aef4-0\tP10\tP2302\tQ21502404\n", "zcat: error writing to output: Broken pipe\n" ] } ], "source": [ "!zcat < $TEMP/all_and_qualifiers.tsv.gz | head" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8852c21cabe64f209d795f18ceb39283", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Executing: 0%| | 0/49 [00:00()' \\\n", 
"--return 'count(distinct n1)'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 7 Run Useful files Notebook" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f56046e4ba58403fbe0fb0d6913e0701", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Executing: 0%| | 0/96 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
out_degreecountlabel
016206count
123820count
23761count
34460count
45424count
............
27611401count
27712461count
27812561count
27913561count
28015351count
\n", "

281 rows × 3 columns

\n", "" ], "text/plain": [ " out_degree count label\n", "0 1 6206 count\n", "1 2 3820 count\n", "2 3 761 count\n", "3 4 460 count\n", "4 5 424 count\n", ".. ... ... ...\n", "276 1140 1 count\n", "277 1246 1 count\n", "278 1256 1 count\n", "279 1356 1 count\n", "280 1535 1 count\n", "\n", "[281 rows x 3 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_table(os.environ['OUT']+'/useful_files/statistics.out_degree.distribution.tsv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 8: Run the Knowledge Graph Profiler" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0f674b5bacb64645a41b6f2d93319a28", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Executing: 0%| | 0/76 [00:00