{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Generate triples parameter\n",
"\n",
"#kgtk_path takes in the directory which contains the kgtk subgraph\n",
"kgtk_path = '/Users/amandeep/Documents/kypher/wikidata_os_v5'\n",
"kgtk_file_name = 'all_and_qualifiers.sorted.tsv.gz'\n",
"triple_filename= 'all.ttl'\n",
"triple_generation_log = 'triple_generation_log.txt'\n",
"properties_file_path = f'{kgtk_path}/all.metadata.property.datatypes.tsv.gz'\n",
"\n",
"# Load triples to blazegraph\n",
"wikibase_ui_port = '10001'\n",
"wikibase_sparql_port = '10002'\n",
"wikibase_proxy_port = '10003'\n",
"wikibase_qs_port = '10005'\n",
"wikibase_volume = '.'\n",
"docker_name = 'blazegraphpipeline'\n",
"create_new = True\n",
"stop_docker = \"No\"\n",
"blazegraph_image = 'wikibase/wdqs:0.3.10'\n",
"ttl_path = ''\n",
"query_service_name = 'ISI SPARQL Query Service'\n",
"\n",
"#Parameterize whether you want to run just the generate_wikidata_triples part or loading to blazegraph part\n",
"gen_triples = True\n",
"load_triples = True\n",
"\n",
"#Create new image\n",
"create_image = True\n",
"image_tag = 'blazegraph_image'\n",
"dockerfile_path = './0.3.10/'"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import subprocess\n",
"import gzip\n",
"import subprocess\n",
"import socket\n",
"import sys\n",
"import shutil\n",
"import time\n",
"import glob\n",
"import json\n",
"from IPython.display import display, Markdown, HTML\n",
"from pathlib import Path\n",
"\n",
"# The docker volume always lives inside the KGTK directory; this replaces the\n",
"# wikibase_volume value assigned in the parameters cell.\n",
"wikibase_volume = f'{kgtk_path}/docker_volume'\n",
"# Create the volume directory (and any missing parents); an existing directory is fine.\n",
"Path(wikibase_volume).mkdir(exist_ok=True, parents=True)\n",
"\n",
"# Full path of the KGTK edge file that is converted to triples.\n",
"input_file_path = f'{kgtk_path}/{kgtk_file_name}'"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"Utility class to print stuff in Bold.\n",
"'''\n",
"class color:\n",
" PURPLE = '\\033[95m'\n",
" CYAN = '\\033[96m'\n",
" DARKCYAN = '\\033[36m'\n",
" BLUE = '\\033[94m'\n",
" GREEN = '\\033[92m'\n",
" YELLOW = '\\033[93m'\n",
" RED = '\\033[91m'\n",
" BOLD = '\\033[1m'\n",
" UNDERLINE = '\\033[4m'\n",
" END = '\\033[0m'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate Wikidata triples"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m------------Head of the KGTK edge file-------------\u001b[0m\n",
"\n",
"id\tnode1\tlabel\tnode2\n",
"P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\n",
"P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\n",
"P10-P1629-Q34508-bcc39400-0\tP10\tP1629\tQ34508\n",
"P10-P1659-P1651-c4068028-0\tP10\tP1659\tP1651\n",
"P10-P1659-P18-5e4b9c4f-0\tP10\tP1659\tP18\n",
"P10-P1659-P4238-d21d1ac0-0\tP10\tP1659\tP4238\n",
"P10-P1659-P51-86aca4c5-0\tP10\tP1659\tP51\n",
"P10-P1855-Q15075950-7eff6d65-0\tP10\tP1855\tQ15075950\n",
"P10-P1855-Q15075950-7eff6d65-0-P10-54b214-0\tP10-P1855-Q15075950-7eff6d65-0\tP10\t\"Smoorverliefd 12 september.webm\"\n",
"P10-P1855-Q15075950-7eff6d65-0-P3831-Q622550-0\tP10-P1855-Q15075950-7eff6d65-0\tP3831\tQ622550\n",
"P10-P1855-Q69063653-c8cdb04c-0\tP10\tP1855\tQ69063653\n",
"P10-P1855-Q69063653-c8cdb04c-0-P10-6fb08f-0\tP10-P1855-Q69063653-c8cdb04c-0\tP10\t\"Couch Commander.webm\"\n",
"P10-P1855-Q7378-555592a4-0\tP10\tP1855\tQ7378\n",
"P10-P1855-Q7378-555592a4-0-P10-8a982d-0\tP10-P1855-Q7378-555592a4-0\tP10\t\"Elephants Dream (2006).webm\"\n",
"P10-P2302-Q21502404-d012aef4-0\tP10\tP2302\tQ21502404\n",
"P10-P2302-Q21502404-d012aef4-0-P1793-f4c2ed-0\tP10-P2302-Q21502404-d012aef4-0\tP1793\t\"(?i).+\\\\\\\\.(webm\\\\|ogv\\\\|ogg\\\\|gif)\"\n",
"P10-P2302-Q21502404-d012aef4-0-P2316-Q21502408-0\tP10-P2302-Q21502404-d012aef4-0\tP2316\tQ21502408\n",
"P10-P2302-Q21502404-d012aef4-0-P2916-cb0917-0\tP10-P2302-Q21502404-d012aef4-0\tP2916\t'filename with extension: webm, ogg, ogv, or gif (case insensitive)'@en\n",
"P10-P2302-Q21510851-5224fe0b-0\tP10\tP2302\tQ21510851\n",
"gzcat: error writing to output: Broken pipe\n",
"gzcat: /Users/amandeep/Documents/kypher/wikidata_os_v5/all_and_qualifiers.sorted.tsv.gz: uncompress failed\n",
"gzip: can't stat: triple_output_save_path (triple_output_save_path): No such file or directory\n",
"\n",
"\u001b[1mThe triple file is generated and saved at:\u001b[0m /Users/amandeep/Documents/kypher/wikidata_os_v5/all.ttl.gz\n",
"\n",
"\u001b[1m------------Head of the triple file-------------\u001b[0m\n",
"\n",
"gzcat: can't stat: /Users/amandeep/Documents/kypher/wikidata_os_v5/all.ttl.gz (/Users/amandeep/Documents/kypher/wikidata_os_v5/all.ttl.gz.gz): No such file or directory\n"
]
}
],
"source": [
"##generate_wikidata_triples\n",
"#Run only generate triples\n",
"'''\n",
"Generate triples aligned to the Wikidata schema. The work below runs only when gen_triples is True.\n",
"\n",
"1. The first 20 lines of the gzipped KGTK edge file (input_file_path) are printed as a sanity check.\n",
"\n",
"2. The KGTK generate_wikidata_triples command is run on that file. It takes the properties file path\n",
"as a parameter. The properties file should have the data_type mentioned for each of the properties\n",
"used in the KGTK edge file.\n",
"\n",
"3. The command's output is redirected to kgtk_path/triple_filename and its log is written to\n",
"kgtk_path/triple_generation_log. This cell does not compress the generated file.\n",
"'''\n",
"\n",
"\n",
"def _print_gz_head(gz_path, n_lines=20):\n",
"    \"\"\"Print the first n_lines lines of the gzip-compressed text file gz_path.\n",
"\n",
"    The file is read with the gzip module rather than piping gzcat into head, so the\n",
"    preview works where gzcat is not installed and produces no broken-pipe errors.\n",
"    Undecodable bytes are replaced instead of raising.\n",
"    \"\"\"\n",
"    with gzip.open(gz_path, 'rt', encoding='utf-8', errors='replace') as gz_file:\n",
"        for line_number, line in enumerate(gz_file):\n",
"            if line_number >= n_lines:\n",
"                break\n",
"            print(line, end='')\n",
"\n",
"\n",
"if gen_triples:\n",
"    print(color.BOLD + '------------Head of the KGTK edge file-------------' + color.END)\n",
"    print()\n",
"    _print_gz_head(input_file_path)\n",
"\n",
"    triple_output_save_path = os.path.join(kgtk_path, triple_filename)  # Name of the output triple file\n",
"    log_save_path = os.path.join(kgtk_path, triple_generation_log)  # Name of the log file\n",
"\n",
"    # Generate the triples. Paths are double-quoted so that directories containing spaces survive the shell.\n",
"    # NOTE(review): -gz yes presumably tells kgtk that the input is gzip-compressed -- confirm against the kgtk docs.\n",
"    !kgtk generate_wikidata_triples -i \"$input_file_path\" \\\n",
"        -ap alias -lp label -dp description \\\n",
"        -pf \"$properties_file_path\" \\\n",
"        -n 1000 \\\n",
"        --debug \\\n",
"        -gt yes -gz yes -w yes \\\n",
"        -log \"$log_save_path\" > \"$triple_output_save_path\"\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\u001b[1mThe triple file is generated and saved at:\u001b[0m /Users/amandeep/Documents/kypher/wikidata_os_v5/all.ttl.gz\n",
"\n",
"\u001b[1m------------Head of the triple file-------------\u001b[0m\n",
"\n",
"@prefix wikibase:
COMMIT: totalElapsed=57155ms, commitTime=1611257027095, mutationCount=2007140
File wikidump-000000002.ttl.gz not found, terminating\\n'\n" ] } ], "source": [ "# Run only load triples\n", "'''\n", "1. This cell is used to load a given triple file to blazegraph triple store.\n", "\n", "2. It will run only if the parameter only_load_triples is set to True\n", "'''\n", "if (gen_triples and load_triples) or load_triples:\n", " if gen_triples:\n", " ttl_path = triple_path\n", " print(color.BOLD + '------------Log output of loading the triple file to Blazegraph-------------' + color.END)\n", " print()\n", " loader_obj = BlazegraphLoad(ttl_path,wikibase_ui_port,wikibase_sparql_port,wikibase_proxy_port,wikibase_qs_port,\n", " wikibase_volume,create_new,docker_name,stop_docker,blazegraph_image,query_service_name)\n", " loader_obj.driver_fn()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Sparql Endpoint" ], "text/plain": [ "