{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using KGTK query to do interesting queries in Wikidata\n",
    "This notebook shows use cases of interesting queries on Wikidata that can be done using the KGTK query command (aka Kypher), and that cannot be done using the public Wikidata SPARQL endpoint\n",
    "\n",
    "The notebook has a preamble to set up environment variables to access the relevant files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Parameters\n",
    "\n",
    "# Folder on local machine where to create the output and temporary folders\n",
    "output_path = \"/Users/pedroszekely/Downloads/kypher\"\n",
    "\n",
    "# The names of the output and temporary folders\n",
    "output_folder = \"wd-workshop\"\n",
    "temp_folder = \"temp.wd-workshop\"\n",
    "\n",
    "# The location of input Wikidata files\n",
    "wikidata_folder = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/\"\n",
    "# wikidata_folder = \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/\"\n",
    "# The wikidata_os files can be downloaded from https://drive.google.com/drive/folders/1V6oAQKmwQ4LJnrBai-uv5gHWphFSCt50?usp=sharing\n",
    "\n",
    "wikidata_dbpedia_folder = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-dbpedia\"\n",
    "\n",
    "# Location of the cache database for kypher\n",
    "cache_path = \"/Users/pedroszekely/Downloads/kypher/temp.novartis\"\n",
    "# cache_path = \"/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db\"\n",
    "# Whether to delete the cache database\n",
    "delete_database = False\n",
    "\n",
    "# shortcuts to commands\n",
    "kgtk = \"time kgtk --debug\"\n",
    "# kgtk = \"kgtk --debug\"\n",
    "# kgtk = \"kgtk\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "import os\n",
    "import subprocess\n",
    "import sys\n",
    "import time\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import altair as alt\n",
    "\n",
    "import papermill as pm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_time = round(time.time())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ALIAS: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/aliases.en.tsv.gz\"\n",
      "CLAIMS: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/claims.tsv.gz\"\n",
      "DESCRIPTION: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/descriptions.en.tsv.gz\"\n",
      "DWD_ISA: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/derived.dwd_isa.tsv.gz\"\n",
      "EXAMPLES_DIR: \"/Users/pedroszekely/Documents/GitHub/kgtk-at-2021-wikidata-workshop\"\n",
      "EXTERNAL_ID: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/claims.external-id.tsv.gz\"\n",
      "ISA: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/derived.isa.tsv.gz\"\n",
      "ITEM: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/claims.wikibase-item.tsv.gz\"\n",
      "LABEL: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/labels.en.tsv.gz\"\n",
      "OUT: \"/Users/pedroszekely/Downloads/kypher/wd-workshop\"\n",
      "P279: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/derived.P279.tsv.gz\"\n",
      "P279STAR: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/derived.P279star.tsv.gz\"\n",
      "P31: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/derived.P31.tsv.gz\"\n",
      "PROPERTY_DATATYPES: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/metadata.property.datatypes.tsv.gz\"\n",
      "QUALIFIERS: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/qualifiers.tsv.gz\"\n",
      "QUALIFIERS_TIME: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/qualifiers.time.tsv.gz\"\n",
      "QUANTITY: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/claims.quantity.tsv.gz\"\n",
      "SITELINKS: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/sitelinks.tsv.gz\"\n",
      "STORE: \"/Users/pedroszekely/Downloads/kypher/temp.novartis/wikidata.sqlite3.db\"\n",
      "TEMP: \"/Users/pedroszekely/Downloads/kypher/temp.wd-workshop\"\n",
      "TIME: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/claims.time.tsv.gz\"\n",
      "WD2DB: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-dbpedia/wikidata_to_dbpedia_edge_file.tsv.gz\"\n",
      "WIKIDATA: \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/\"\n",
      "kgtk: \"time kgtk --debug\"\n",
      "kypher: \"kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/temp.novartis/wikidata.sqlite3.db\"\n"
     ]
    }
   ],
   "source": [
    "# The names of files in the KGTK Wikidata distribution that we will use in this notebook.\n",
    "file_names = {\n",
    "    \"claims\": \"claims.tsv.gz\",\n",
    "    \"quantity\": \"claims.quantity.tsv.gz\",\n",
    "    \"time\": \"claims.time.tsv.gz\",\n",
    "    \"label\": \"labels.en.tsv.gz\",\n",
    "    \"alias\": \"aliases.en.tsv.gz\",\n",
    "    \"description\": \"descriptions.en.tsv.gz\",\n",
    "    \"item\": \"claims.wikibase-item.tsv.gz\",\n",
    "    \"external_id\": \"claims.external-id.tsv.gz\",\n",
    "    \"qualifiers\": \"qualifiers.tsv.gz\",\n",
    "    \"sitelinks\": \"sitelinks.tsv.gz\",\n",
    "    \"qualifiers_time\": \"qualifiers.time.tsv.gz\",\n",
    "    \"property_datatypes\": \"metadata.property.datatypes.tsv.gz\",\n",
    "    \"isa\": \"derived.isa.tsv.gz\",\n",
    "    \"p279star\": \"derived.P279star.tsv.gz\",\n",
    "    \"p279\": \"derived.P279.tsv.gz\",\n",
    "    \"p31\": \"derived.P31.tsv.gz\",\n",
    "    \"dwd_isa\": \"derived.dwd_isa.tsv.gz\"\n",
    "}\n",
    "\n",
    "# Define environment variables holding the full paths to the files, because the shell\n",
    "# commands (`!` cells) in this notebook refer to them as $NAME.\n",
    "# The names are collected in a list so they can be printed at the end of the cell.\n",
    "kgtk_environment_variables = []\n",
    "\n",
    "os.environ['WIKIDATA'] = wikidata_folder\n",
    "kgtk_environment_variables.append('WIKIDATA')\n",
    "\n",
    "# One variable per input file, named after the upper-cased key (e.g. \"label\" -> $LABEL).\n",
    "# os.path.join works whether or not wikidata_folder ends with a path separator.\n",
    "for key, value in file_names.items():\n",
    "    variable = key.upper()\n",
    "    os.environ[variable] = os.path.join(wikidata_folder, value)\n",
    "    kgtk_environment_variables.append(variable)\n",
    "\n",
    "# Build the Wikidata-to-DBpedia mapping path from the `wikidata_dbpedia_folder` parameter\n",
    "# instead of a hard-coded absolute path, so that overriding the parameter takes effect.\n",
    "os.environ[\"WD2DB\"] = os.path.join(wikidata_dbpedia_folder, \"wikidata_to_dbpedia_edge_file.tsv.gz\")\n",
    "kgtk_environment_variables.append(\"WD2DB\")\n",
    "\n",
    "\n",
    "# KGTK creates a SQLite database to index the knowledge graph.\n",
    "# Use the explicit cache location when one is given, otherwise fall back to the temporary folder.\n",
    "if cache_path:\n",
    "    os.environ['STORE'] = \"{}/wikidata.sqlite3.db\".format(cache_path)\n",
    "else:\n",
    "    os.environ['STORE'] = \"{}/{}/wikidata.sqlite3.db\".format(output_path, temp_folder)\n",
    "kgtk_environment_variables.append('STORE')\n",
    "\n",
    "# We will create many temporary files, so set up a folder for outputs and one for the temporary files.\n",
    "os.environ['TEMP'] = \"{}/{}\".format(output_path, temp_folder)\n",
    "os.environ['OUT'] = \"{}/{}\".format(output_path, output_folder)\n",
    "kgtk_environment_variables.append('TEMP')\n",
    "kgtk_environment_variables.append('OUT')\n",
    "\n",
    "# Environment variables with shortcuts to the commands we use often\n",
    "os.environ['kgtk'] = kgtk\n",
    "# Use for debugging, but careful as it causes import to dataframes to break\n",
    "# os.environ['kypher'] = \"time kgtk --debug query --graph-cache \" + os.environ['STORE']\n",
    "os.environ['kypher'] = \"kgtk query --graph-cache \" + os.environ['STORE']\n",
    "kgtk_environment_variables.append('kgtk')\n",
    "kgtk_environment_variables.append('kypher')\n",
    "\n",
    "# We'll save the current working directory so we can call into other example notebooks later\n",
    "os.environ[\"EXAMPLES_DIR\"] = os.getcwd()\n",
    "kgtk_environment_variables.append('EXAMPLES_DIR')\n",
    "\n",
    "# Print every variable we defined, in alphabetical order, as a record of the configuration.\n",
    "kgtk_environment_variables.sort()\n",
    "for variable in kgtk_environment_variables:\n",
    "    print(\"{}: \\\"{}\\\"\".format(variable, os.environ[variable]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/pedroszekely/Downloads/kypher\n"
     ]
    }
   ],
   "source": [
    "%cd {output_path}"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "!$kgtk add-id -i \"$OUT\"/wikidata_infobox_raw.tsv.gz --id-style wikidata -o \"$OUT\"/wikidata_infobox.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import the relevant files into the Kypher index and define shortcuts (aliases) for them to make the queries nicer to write."
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "!$kypher \\\n",
    "-i \"$ITEM\" --as items \\\n",
    "-i \"$TIME\" --as time \\\n",
    "-i \"$P31\" --as p31 \\\n",
    "-i \"$P279\" --as p279 \\\n",
    "-i \"$LABEL\" --as labels \\\n",
    "-i \"$ALIAS\" --as aliases \\\n",
    "-i \"$P279STAR\" --as p279star \\\n",
    "-i \"$QUALIFIERS\" --as qualifiers \\\n",
    "-i \"$DESCRIPTION\" --as descriptions \\\n",
    "-i \"$EXTERNAL_ID\" --as external_ids \\\n",
    "-i \"$WD2DB\" --as wd2db \\\n",
    "--limit 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n",
      "P10-P1629-Q34508-bcc39400-0\tP10\tP1629\tQ34508\tnormal\twikibase-item\n",
      "P10-P1855-Q15075950-7eff6d65-0\tP10\tP1855\tQ15075950\tnormal\twikibase-item\n",
      "P10-P1855-Q4504-a69d2c73-0\tP10\tP1855\tQ4504\tnormal\twikibase-item\n",
      "P10-P1855-Q69063653-c8cdb04c-0\tP10\tP1855\tQ69063653\tnormal\twikibase-item\n",
      "P10-P1855-Q7378-555592a4-0\tP10\tP1855\tQ7378\tnormal\twikibase-item\n",
      "P10-P2302-Q21502404-d012aef4-0\tP10\tP2302\tQ21502404\tnormal\twikibase-item\n",
      "P10-P2302-Q21510851-5224fe0b-0\tP10\tP2302\tQ21510851\tnormal\twikibase-item\n",
      "P10-P2302-Q21510852-dde2f0ce-0\tP10\tP2302\tQ21510852\tnormal\twikibase-item\n",
      "P10-P2302-Q52004125-d0288d06-0\tP10\tP2302\tQ52004125\tnormal\twikibase-item\n",
      "P10-P2302-Q53869507-974ce3b1-0\tP10\tP2302\tQ53869507\tnormal\twikibase-item\n"
     ]
    }
   ],
   "source": [
    "!$kypher \\\n",
    "-i \"$ITEM\" --as items \\\n",
    "-i \"$TIME\" --as time \\\n",
    "-i \"$P31\" --as p31 \\\n",
    "-i \"$P279\" --as p279 \\\n",
    "-i \"$LABEL\" --as labels \\\n",
    "-i \"$P279STAR\" --as p279star \\\n",
    "-i \"$EXTERNAL_ID\" --as external_ids \\\n",
    "-i \"$OUT\"/ulan.tsv --as ulan \\\n",
    "--limit 10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Retrieve large amounts of data from Wikidata\n",
    "\n",
    "John is doing research on the popularity of first names to improve his entity resolution algorithm for people. He sees that Wikidata contains about 9 million people, so he wants to get the distribution of counts of first names from Wikidata. He writes a SPARQL query, but it times out, so he downloads the Wikidata KGTK files on his laptop and writes a kypher query. The query retrieves all instances of human (Q5), gets their first names using the P735 property and returns the counts.\n",
    "\n",
    "John thinks he will want to do additional analysis on the data, so chooses standard KGTK names for the headers to generate the data as a KGTK graph that then he can use as input to other KGTK commands."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 6.21 s, sys: 1.7 s, total: 7.91 s\n",
      "Wall time: 8min 17s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# compare to SPARQL\n",
    "# paper: first names\n",
    "!$kypher -i items -i p31 -i labels \\\n",
    "--match '\\\n",
    "    p31: (person)-[]->(:Q5), \\\n",
    "    items: (person)-[:P735]->(given_name), \\\n",
    "    labels: (given_name)-[]->(given_name_label)' \\\n",
    "--return 'distinct given_name as node1, count(given_name) as node2, given_name_label as `node1;label`, \"count_names\" as label' \\\n",
    "--order-by 'node2 desc' \\\n",
    "-o \"$OUT\"/given-names.tsv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   53253  216696 1988754 /Users/pedroszekely/Downloads/kypher/wd-workshop/given-names.tsv\n"
     ]
    }
   ],
   "source": [
    "!wc \"$OUT\"/given-names.tsv "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "John takes a peek at the file to make sure he got the headers correctly: an edge from the q-node to the count, using `count_names` as the property, and including the `label` of `node1` so he can read the data. John sees that his name is by far the most popular name in Wikidata, and gets the information he needs to fine tune his entity resolution algorithms."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1      node2   node1;label   label\n",
      "Q4925477   120416  'John'@en     count_names\n",
      "Q12344159  74235   'William'@en  count_names\n",
      "Q4927937   59298   'Robert'@en   count_names\n",
      "Q16428906  57107   'Thomas'@en   count_names\n",
      "Q677191    52568   'James'@en    count_names\n",
      "Q18057751  49005   'David'@en    count_names\n",
      "Q2958359   44735   'Charles'@en  count_names\n",
      "Q2793400   40987   'Peter'@en    count_names\n",
      "Q1249148   40149   'Richard'@en  count_names\n"
     ]
    }
   ],
   "source": [
    "!head \"$OUT\"/given-names.tsv | column -ts $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "John gets curious and wants to know whether the popularity of names depends on time, so he modifies his query to partition the data by people's year of birth."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i items -i time -i p31 -i labels \\\n",
    "--match '\\\n",
    "    p31: (person)-[]->(:Q5), \\\n",
    "    items: (person)-[:P735]->(given_name), \\\n",
    "    time: (person)-[:P569]->(date_of_birth), \\\n",
    "    labels: (given_name)-[]->(given_name_label)' \\\n",
    "--return 'distinct given_name as node1, kgtk_date_year(date_of_birth) as year, count(given_name) as node2, given_name_label as `node1;label`, \"count_names_yearly\" as label' \\\n",
    "--order-by 'given_name, cast(year, integer), node2 desc' \\\n",
    "-o \"$OUT\"/given-names.year.tsv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  882179 4431755 42410662 /Users/pedroszekely/Downloads/kypher/wd-workshop/given-names.year.tsv\n"
     ]
    }
   ],
   "source": [
    "!wc \"$OUT\"/given-names.year.tsv "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "John takes a quick peek at the file to verify that the headers are correct."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1     year  node2  node1;label      label\n",
      "Q1000387  1798  1      'Ferdinanda'@en  count_names_yearly\n",
      "Q1000387  1849  1      'Ferdinanda'@en  count_names_yearly\n",
      "Q1000387  1868  1      'Ferdinanda'@en  count_names_yearly\n",
      "Q1000387  1870  1      'Ferdinanda'@en  count_names_yearly\n"
     ]
    }
   ],
   "source": [
    "!head -5 \"$OUT\"/given-names.year.tsv | column -ts $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "John heard anecdotally that Jessica had become a popular name in the late 90s and greps for Jessica in the file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Q630846\t1995\t58\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t1996\t41\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t1997\t27\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t1998\t23\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t1999\t23\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t2000\t51\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t2001\t18\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t2002\t18\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t2003\t11\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t2004\t6\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t2005\t2\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t2009\t1\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t2011\t1\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t2014\t1\t'Jessica'@en\tcount_names_yearly\n",
      "Q630846\t2016\t2\t'Jessica'@en\tcount_names_yearly\n"
     ]
    }
   ],
   "source": [
    "!grep \"'Jessica'\" \"$OUT\"/given-names.year.tsv | tail -15"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "John realizes that he needs to normalize the counts of names by the number of people born in each year. He wonders whether he can do it in one kypher query, but takes the easy way out and writes a simple query to get the counts of people born each year. He can do this faster than he can think of a complex query to get the final result in one go."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i time -i p31 \\\n",
    "--match ' \\\n",
    "    p31: (person)-[]->(:Q5), \\\n",
    "    time: (person)-[:P569]->(date_of_birth)' \\\n",
    "--return 'kgtk_date_year(date_of_birth) as node1, count(person) as node2, \"count_people_born\" as label' \\\n",
    "-o \"$TEMP\"/human.count.year.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "John is happy that KGTK accepts literals as subjects of triples because here the subjects (`node1`) are years."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1\tnode2\tlabel\n",
      "1\t105\tcount_people_born\n",
      "2\t5\tcount_people_born\n",
      "3\t9\tcount_people_born\n",
      "4\t8\tcount_people_born\n",
      "5\t11\tcount_people_born\n",
      "6\t10\tcount_people_born\n",
      "7\t7\tcount_people_born\n",
      "8\t5\tcount_people_born\n",
      "9\t9\tcount_people_born\n"
     ]
    }
   ],
   "source": [
    "!head \"$TEMP\"/human.count.year.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "John knows he is almost there. He needs to get the names from the `given-names.year.tsv` file, and needs to pick out the year from the qualifier he put on the edge using the syntax to get the attributes of edges `[r {year: the_year}]`. He computes the fraction of people with each name and multiplies by 10,000 so that the numbers are not so tiny and easier to read. John also gets the labels of the q-nodes from the attribute he put on `node1` so that he doesn't have to join with the `labels.tsv` file again."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i \"$OUT\"/given-names.year.tsv -i \"$TEMP\"/human.count.year.tsv \\\n",
    "--match ' \\\n",
    "    names: (given_name {label: given_name_label})-[r {year: the_year}]->(count_names), \\\n",
    "    year: (the_year)-[]->(count_people)' \\\n",
    "--return 'given_name as node1, \"normalized_count_names_yearly\" as label, cast(count_names, float) * 10000 / cast(count_people, float) as node2, the_year as year, given_name_label as `node1;label`' \\\n",
    "--order-by 'given_name, cast(the_year, integer), node2 desc' \\\n",
    "-o \"$OUT\"/given-names.year.normalized.tsv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1     label                          node2               year  node1;label\n",
      "Q1000387  normalized_count_names_yearly  2.207505518763797   1798  'Ferdinanda'@en\n",
      "Q1000387  normalized_count_names_yearly  1.1767474699929394  1849  'Ferdinanda'@en\n",
      "Q1000387  normalized_count_names_yearly  0.6827336655970506  1868  'Ferdinanda'@en\n",
      "Q1000387  normalized_count_names_yearly  0.6334726973267453  1870  'Ferdinanda'@en\n",
      "Q1000387  normalized_count_names_yearly  0.536711034778875   1888  'Ferdinanda'@en\n",
      "Q1000433  normalized_count_names_yearly  1.0892059688487092  1852  'Bud'@en\n",
      "Q1000433  normalized_count_names_yearly  0.9004141905276427  1858  'Bud'@en\n",
      "Q1000433  normalized_count_names_yearly  0.6082355087890031  1881  'Bud'@en\n",
      "Q1000433  normalized_count_names_yearly  0.5845218611176058  1882  'Bud'@en\n"
     ]
    }
   ],
   "source": [
    "!head \"$OUT\"/given-names.year.normalized.tsv | column -ts $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "John greps the normalized file again. Jessica was not a popular name in the 60s and began to get popular in the late 70s. John satisfied his curiosity. The popularity of names is time dependent, but for now, John will work to integrate the aggregate data into his entity resolution algorithm."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Q630846\tnormalized_count_names_yearly\t1.5782828282828283\t1960\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t1.6427682992654478\t1961\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t0.6936416184971098\t1962\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t2.2975301550832854\t1963\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t2.7612232218872963\t1964\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t3.534651365553644\t1965\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t3.7685601587820012\t1966\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t2.2576760987357014\t1967\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t3.7844383893430216\t1968\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t4.271034846619601\t1969\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t5.083022704168078\t1970\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t8.008208413623965\t1971\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t5.099569086412198\t1972\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t6.0163750032697685\t1973\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t5.754944020089986\t1974\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t5.728569940631185\t1975\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t9.301089556205154\t1976\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t10.779346771585644\t1977\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t12.160190239420636\t1978\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t10.66564568178089\t1979\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t15.91069146300112\t1980\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t15.169066137128357\t1981\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t17.050067658998646\t1982\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t17.04045734388742\t1983\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t14.809126810004388\t1984\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t18.211965533175675\t1985\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t16.454134101192924\t1986\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t15.175359712230215\t1987\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t21.840097312838658\t1988\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t19.4208031073285\t1989\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t16.48898365316276\t1990\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t15.925029859430985\t1991\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t20.21772939346812\t1992\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t17.561880778024452\t1993\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t20.131709147985124\t1994\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t21.767686245074124\t1995\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t16.636234530330697\t1996\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t11.965433192998006\t1997\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t12.055139158236805\t1998\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t14.090547080806223\t1999\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t9.757217471158812\t2000\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t17.07455890722823\t2001\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t22.304832713754646\t2002\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t21.202775636083267\t2003\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t17.846519928613922\t2004\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t12.106537530266344\t2005\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t15.19756838905775\t2009\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t20.74688796680498\t2011\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t25.188916876574307\t2014\t'Jessica'@en\n",
      "Q630846\tnormalized_count_names_yearly\t64.72491909385113\t2016\t'Jessica'@en\n"
     ]
    }
   ],
   "source": [
    "!grep \"'Jessica'\" \"$OUT\"/given-names.year.normalized.tsv | tail -50"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Analytics on full Wikidata\n",
    "\n",
    "Jessica is working with John on the entity resolution algorithm and her job is to use the number of instances of each class in Wikidata as a feature. The query that Jessica needs to write is simple as she just needs to count the number of instances of each class, summing up over the instances of all subclasses. She knows that there are over 1 million classes in Wikidata (entities with a P279 property), so she knows it will not run on the public SPARQL endpoint. Jessica gets the SQLite database from John so that she does not have to wait the 2 or so hours to load it on her laptop, writes the query and goes for lunch as she knows it will take a while for it to run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 1s, sys: 17.6 s, total: 1min 19s\n",
      "Wall time: 1h 22min 58s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# compare to SPARQL\n",
    "# paper: instances\n",
    "!$kypher -i p31 -i p279star \\\n",
    "--match '\\\n",
    "    p31: (entity)-[]->(class), \\\n",
    "    p279star: (class)-[]->(super_class)' \\\n",
    "--return 'distinct super_class as node1, count(distinct entity) as node2, \"entity_count\" as label' \\\n",
    "--order-by 'node2 desc, node1' \\\n",
    "-o \"$OUT\"/class.count.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When Jessica comes back from lunch, the file is ready. It contains data for 75K classes; she figures that the other classes don't have instances."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   75195  225585 1863081\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$OUT\"/class.count.tsv.gz | wc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1\tnode2\tlabel\n",
      "Q35120\t88859643\tentity_count\n",
      "Q99527517\t74418826\tentity_count\n",
      "Q488383\t73704542\tentity_count\n",
      "Q28813620\t68227171\tentity_count\n",
      "zcat: error writing to output: Broken pipe\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$OUT\"/class.count.tsv.gz | head -5 "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Jessica is curious about the data, so she writes a query to get the counts of different classes of film (Q11424). Jessica had been working with John, so she learned the trick to use the standard names for column headings so that she can use the output of previous queries as new graphs. She shudders to think that if she were using SPARQL she would have had to set up a new Wikidata SPARQL endpoint to be able to load her personal data in it, and to be extremely careful to not make a mistake because deleting the data would have been a chore. Jessica had watched John make several mistakes when he was building the files for the names. John had simply fixed the queries and re-run the other queries that depended on the data he had just fixed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1      node1;label               node2\n",
      "Q11424     'film'@en                 314889\n",
      "Q24862     'short film'@en           33733\n",
      "Q506240    'television film'@en      17310\n",
      "Q226730    'silent film'@en          17131\n",
      "Q20667187  'silent short film'@en    16302\n",
      "Q202866    'animated film'@en        9019\n",
      "Q17517379  'animated short film'@en  4100\n",
      "Q10590726  'video album'@en          1931\n",
      "Q24869     'feature film'@en         1643\n",
      "Q430525    'concert film'@en         1319\n",
      "CPU times: user 18.1 ms, sys: 13.1 ms, total: 31.2 ms\n",
      "Wall time: 1.49 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# compare to SPARQL\n",
    "# paper: film instances\n",
    "!$kypher -i p279star -i labels -i \"$OUT\"/class.count.tsv.gz \\\n",
    "--match ' \\\n",
    "    p279star: (class)-[]->(:Q11424), \\\n",
    "    count: (class)-[]->(count), \\\n",
    "    labels: (class)-[]->(class_label)' \\\n",
    "--return 'class as node1, class_label as `node1;label`, count as node2' \\\n",
    "--order-by 'cast(count, integer) desc' \\\n",
    "--limit 10 \\\n",
    "| column -ts $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Jessica now has the statistics she needs to work on her feature for the entity resolution algorithm. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract new graphs from Wikidata\n",
    "\n",
    "Bill is working on a project to find networks of researchers working on specific topics. He wants to use publication data to find relationships among authors using publications. Bill knows that he can get lots of publication data from Pubmed or Microsoft Academic graph, but wants to give Wikidata a try as he heard that Wikidata has close to 40 million publications, and that in Wikidata publications have links to other entities such as main subjects.\n",
    "\n",
    "Bill decides that the simplest experiment to try first is to build a network of authors of publications in Wikidata: he wants to create a graph of people in Wikidata who authored papers, to put a link between two people if the coauthored a paper, and to add a qualifier with the count of papers they coauthored. He knows the computation is expensive as there are 40ish million papers in Wikidata, so the network will be large. He doesn't even try to write a SPARQL query because he knows it will time out. Bill downloads the KGTK files and decides to write his first query using only 2019 data so he doesn't have to wait so long if he makes a mistake."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### First do it for 2019 to debug the query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i p31 -i p279star -i items -i time -i labels \\\n",
    "--match '\\\n",
    "    p31: (pub)-[]->(class), \\\n",
    "    p279star: (class)-[]->(:Q591041), \\\n",
    "    time: (pub)-[:P577]->(pub_date), \\\n",
    "    items: (pub)-[:P50]->(author1), \\\n",
    "    items: (pub)-[:P50]->(author2), \\\n",
    "    labels: (author1)-[]->(author1_label)' \\\n",
    "--where 'author1 < author2 and kgtk_date_year(pub_date) = 2019' \\\n",
    "--return 'distinct author1 as node1, \"Pcoauthor\" as label, author2 as node2, count(distinct pub) as count_publications, author1_label as `node1;label`' \\\n",
    "--order-by 'count_publications desc' \\\n",
    "-o \"$TEMP\"/coauthors.2019.id.tsv.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1\tlabel\tnode2\tcount_publications\tnode1;label\n",
      "Q104625960\tPcoauthor\tQ104626213\t117\t'Secundino López Puente'@en\n",
      "Q104625960\tPcoauthor\tQ42121517\t117\t'Secundino López Puente'@en\n",
      "Q104625960\tPcoauthor\tQ46702124\t117\t'Secundino López Puente'@en\n",
      "Q104625960\tPcoauthor\tQ57221019\t117\t'Secundino López Puente'@en\n",
      "Q104625960\tPcoauthor\tQ57235422\t117\t'Secundino López Puente'@en\n",
      "Q104625960\tPcoauthor\tQ62593499\t117\t'Secundino López Puente'@en\n",
      "Q104625960\tPcoauthor\tQ62607742\t117\t'Secundino López Puente'@en\n",
      "Q104625960\tPcoauthor\tQ80042771\t117\t'Secundino López Puente'@en\n",
      "Q104626213\tPcoauthor\tQ42121517\t117\t'Roberto Edoardo Villa'@en\n",
      "zcat: error writing to output: Broken pipe\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/coauthors.2019.id.tsv.gz | head"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Bill wants to sanity check his data so he looks up the first person in Google Scholar and finds that Secundino López Puente has many publications in 2019. Looks like the query is working fine."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build the network for all authors\n",
    "\n",
    "Bill removes the year restriction and runs the query for the full data. The query for a single year took close to 10 minutes, so Bill decides to leave the query running overnight."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# compare to SPARQL\n",
    "!$kypher -i p31 -i p279star -i items \\\n",
    "--match '\\\n",
    "    p31: (pub)-[]->(class), \\\n",
    "    p279star: (class)-[]->(:Q591041), \\\n",
    "    items: (pub)-[:P50]->(author1), \\\n",
    "    items: (pub)-[:P50]->(author2)' \\\n",
    "--where 'author1 < author2' \\\n",
    "--return 'distinct author1 as node1, \"Pcoauthor\" as label, author2 as node2, count(distinct pub) as count_publications' \\\n",
    "--order-by 'count_publications desc' \\\n",
    "-o \"$TEMP\"/coauthors.tsv.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "zcat: error writing to output: Broken pipe\n",
      "node1      label      node2      count_publications\n",
      "Q67650927  Pcoauthor  Q84519428  1705\n",
      "Q92189676  Pcoauthor  Q92232927  1666\n",
      "Q92189676  Pcoauthor  Q92470745  1659\n",
      "Q92232927  Pcoauthor  Q92602887  1653\n",
      "Q92232927  Pcoauthor  Q92470745  1650\n",
      "Q92189676  Pcoauthor  Q92602887  1647\n",
      "Q92470745  Pcoauthor  Q92602887  1631\n",
      "Q67732460  Pcoauthor  Q92602887  1618\n",
      "Q67732460  Pcoauthor  Q92189676  1616\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/coauthors.tsv.gz | head | column -ts $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build a network of authors who authored papers about cancer\n",
    "\n",
    "Bill is interested in cancer research, so he wants to build the same network but using only the papers about cancer. He knows Wikidata has an extensive class hiearchy, so he writes a query to peek at the hierarchy below the q-node for cancer.\n",
    "He writes a query to retrieve subclasses of cancer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1\tnode2\n",
      "Q101541302\t'pulmonary artery intimal sarcoma'@en\n",
      "Q101541613\t'rectal small cell carcinoma'@en\n",
      "Q101541672\t'CIC-DUX4 sarcoma'@en\n",
      "Q101541689\t'colorectal large cell neuroendocrine carcinoma'@en\n",
      "Q1016605\t'Burkitt lymphoma'@en\n",
      "Q102258467\t'diffuse gastric cancer'@en\n",
      "Q102293219\t'luminal breast carcinoma B'@en\n",
      "Q102293292\t'skin meningioma'@en\n",
      "Q102293358\t'breast implant-associated anaplastic large cell lymphoma'@en\n",
      "Q102293373\t'salivary gland mucinous adenocarcinoma'@en\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i p279star -i labels \\\n",
    "--match '\\\n",
    "    p279star: (cancer_type)-[]->(:Q12078), \\\n",
    "    labels: (cancer_type)-[]->(cancer_type_label)' \\\n",
    "--return 'cancer_type as node1, cancer_type_label as node2' \\\n",
    "--limit 10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The results are promising, so Bill now incorporates the query for types of cancer into the query for building the coauthor network. He just needs to get the main subject of the paper using the `P921` property and test that the main subject is a subclass of cancer. He expects the query to be much faster because now it has strong restriction, so he gives it a try."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.83 s, sys: 509 ms, total: 2.34 s\n",
      "Wall time: 2min 37s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# compare to SPARQL\n",
    "!$kypher -i p31 -i p279star -i items -i labels \\\n",
    "--match '\\\n",
    "    p31: (pub)-[]->(class), \\\n",
    "    p279star: (class)-[]->(:Q591041), \\\n",
    "    items: (pub)-[:P50]->(author1), \\\n",
    "    items: (pub)-[:P50]->(author2), \\\n",
    "    items: (pub)-[:P921]->(cancer_type), \\\n",
    "    p279star: (cancer_type)-[]->(:Q12078), \\\n",
    "    labels: (author1)-[]->(author1_label), \\\n",
    "    labels: (author2)-[]->(author2_label)' \\\n",
    "--where 'author1 < author2' \\\n",
    "--return 'distinct author1 as node1, \"Pcoauthor\" as label, author2 as node2, count(distinct pub) as count_publications, author1_label as `node1;label`, author2_label as `node2;label`' \\\n",
    "--order-by 'count_publications desc' \\\n",
    "-o \"$TEMP\"/coauthors.cancer.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The query takes less than a minute and produces a network with close to half a million edges. Bill takes a peek to see what is in it, and now wonders whether he could have written the query in SPARQL and run it on the public SPARQL endpoint."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  228007 1959466 17105798\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/coauthors.cancer.tsv.gz | wc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node_x     relation   node_y     count_publications  node1;label                node2;label\n",
      "Q60320900  Pcoauthor  Q60394812  396                 'Jorge Eduardo Cortes'@en  'Hagop Kantarjian'@en\n",
      "Q60394812  Pcoauthor  Q66370727  236                 'Hagop Kantarjian'@en      'Susan O\\'Brien'@en\n",
      "zcat: Q40614280  Pcoauthor  Q60394812  186                 'Farhad Ravandi'@en        'Hagop Kantarjian'@en\n",
      "Q60394812  Pcoauthor  Q66385413  180                 'Hagop Kantarjian'@en      'Guillermo Garcia-Manero'@en\n",
      "Q60320900  Pcoauthor  Q66370727  172                 'Jorge Eduardo Cortes'@en  'Susan O\\'Brien'@en\n",
      "Q28958315  Pcoauthor  Q42748966  150                 'Hermann Brenner'@en       'Michael Hoffmeister'@en\n",
      "Q60394812  Pcoauthor  Q64026412  150                 'Hagop Kantarjian'@en      'Stefan Faderl'@en\n",
      "Q60394812  Pcoauthor  Q66370888  150                 'Hagop Kantarjian'@en      'Elias Jabbour'@en\n",
      "error writing to outputQ60320900  Pcoauthor  Q66385413  146                 'Jorge Eduardo Cortes'@en  'Guillermo Garcia-Manero'@en\n",
      ": Broken pipe\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/coauthors.cancer.tsv.gz | head | column -ts $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Bill puts the first two names in Google and finds that they are famous and have publshied a lot together. Bill is happy to have a network with close to half a million edges that he can use to do interesting analyses."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Combining Wikidata with external files\n",
    "\n",
    "Abigail is working on a cultural heritage project, collaborating with the Getty Research Institute who gave her a file with 27 thousand ULAN identifiers. Abigail has a database indexed using VIAF identifiers, and wants to map her ULAN identifiers to VIAF identifiers so that she can use her database. She puts one of the ULAN identifiers in the Wikidata search box and discovers that Wikidata has both ULAN and VIAF identifiers for many artists. Abigail knows a little bit of SPARQL and easity figures out that it is easy to write a query to retrieve the VIAF identifier given a ULAN identifier. Her solution would require sending 27,000 queries to Wikidata, which would involve writing a Python script."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   27415   27415  356389 /Users/pedroszekely/Downloads/kypher/wd-workshop/ulan.tsv\n"
     ]
    }
   ],
   "source": [
    "!wc \"$OUT\"/ulan.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Her colleague Bill tells her that she can easily solve the problem using KGTK query. The only thing she needs to do is to rename the header of her file with identifiers to `node1` and write a Kypher query."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 149 ms, sys: 49 ms, total: 198 ms\n",
      "Wall time: 11.8 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# paper: ULAN ids\n",
    "# compare to SPARQL\n",
    "!$kypher -i items -i external_ids -i labels -i ulan \\\n",
    "--match '\\\n",
    "    ulan: (ulan_id)-[]->(), \\\n",
    "    external_ids: (viaf_id)<-[:P214]-(artist)-[:P245]->(ulan_id), \\\n",
    "    labels: (artist)-[]->(artist_label)' \\\n",
    "--return 'artist as node1, viaf_id as `node1;P214`, ulan_id as `node1;P245`, artist_label as `node1;label`' \\\n",
    "-o \"$OUT\"/ulan-to-viaf.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Abigail is thrilled to see that the query ran in less than 30 seconds and is curious to see the results. She got matches for 8,116 ULAN ids, which means that now she can get a lot of data from her database to do her analysis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    8116   42730  443909 /Users/pedroszekely/Downloads/kypher/wd-workshop/ulan-to-viaf.tsv\n"
     ]
    }
   ],
   "source": [
    "!wc \"$OUT\"/ulan-to-viaf.tsv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1       node1;P214   node1;P245   node1;label\n",
      "Q1000596    \"20822441\"   \"500072302\"  'Noémi Ferenczy'@en\n",
      "Q1001063    \"96418002\"   \"500099612\"  'Olga Fialka'@en\n",
      "Q100156272  \"309815799\"  \"500335625\"  'Gloria López Córdova'@en\n",
      "Q100249806  \"184467129\"  \"500040990\"  'Alice Denniston Laughlin'@en\n",
      "Q100250000  \"63899160\"   \"500034511\"  'Shirley L. Bolton'@en\n",
      "Q100278786  \"309815915\"  \"500336052\"  'Winifred Casson'@en\n",
      "Q100323915  \"95510425\"   \"500332031\"  'Claudia Müller'@en\n",
      "Q100348403  \"95887586\"   \"500033567\"  'Priscilla Kepner Sage'@en\n",
      "Q100377312  \"233761\"     \"500288751\"  'Cristina Castel-Branco'@en\n"
     ]
    }
   ],
   "source": [
    "!head \"$OUT\"/ulan-to-viaf.tsv | column -ts $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Combining Wikidata with DBpedia"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After mining her VIAF database, Abigail realizes that there is a lot of interesting data in Wikipedia infoboxes that she would like to use in her analysis. She hears from a colleague that DBpedia extracts data from Wikipedia infoboxes. She is curious whether Wikidata already has most of this data. She browses the pages for some artists in Wikipedia and sees that the Wikipedia infoboxes have interesting information that she may want to include in her dataset.\n",
    "\n",
    "Abigail downloads the DBpedia infobox data in RDF format and uses KGTK to convert it into KGTK format and to substitute the DBpedia URIs with Wikidata Q-nodes. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 99251608\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$OUT\"/wikidata_infobox.tsv.gz | wc -l"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Abigail sees that there are almost 100 million edges in the DBpedia infobox graph, so she first adds the dataset to the kypher index.\n",
 This operation">
    "> This operation is similar to loading the triples into a triple store to enable running queries. In kypher it is not necessary to explicitly load the file as kypher will automatically load the file and build indices the first time the file is used. Abigail is doing it separately as she is curious to see how long it takes to load the file in kypher."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1\tlabel\tnode2\tid\n",
      "nodemxZbyK2VRrGoaxfdLmyLxw-1\tdbpedia:structured_value\t\"2019-08-07\"\tnodemxZbyK2VRrGoaxfdLmyLxw-1-dbpedia:structured_value-944f0e\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$OUT\"/wikidata_infobox.tsv.gz --as infobox \\\n",
    "--limit 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Abigail first wants to see which properties are available for the people she has in the ULAN file. She constructs a query to count the number of statements for each property for the artists in her ULAN file.\n",
    "\n",
 This query">
    "> This query combines Wikidata with two external sources, her ULAN identifiers and DBpedia infoboxes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1                  node2\n",
      "property:birthPlace    3725\n",
      "property:name          3208\n",
      "property:birthDate     2738\n",
      "property:nationality   2464\n",
      "property:spouse        2265\n",
      "property:deathPlace    1967\n",
      "property:deathDate     1955\n",
      "property:field         1645\n",
      "property:caption       1494\n",
      "property:birthName     1139\n",
      "property:knownFor      1065\n",
      "property:education     972\n",
      "property:training      924\n",
      "property:movement      803\n",
      "property:occupation    785\n",
      "property:awards        687\n",
      "property:almaMater     557\n",
      "property:works         308\n",
      "property:children      278\n",
      "property:notableWorks  273\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i infobox -i ulan -i external_ids \\\n",
    "--match '\\\n",
    "    ulan: (ulan_id)-[]->(), \\\n",
    "    external_ids: (artist)-[:P245]->(ulan_id), \\\n",
    "    infobox: (artist)-[l]->()' \\\n",
    "--return 'l.label as node1, count(distinct l) as node2' \\\n",
    "--order-by 'node2 desc' \\\n",
    "--limit 20 \\\n",
    "| column -ts $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Abigail is interested in the information about spouses as she is thinking of doing an analysis on the occupation of spouses. She constructs a query to retrieve the spouse statements already present in Wikidata for her ULAN artists."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i items -i ulan -i external_ids -i labels \\\n",
    "--match '\\\n",
    "    ulan: (ulan_id)-[]->(), \\\n",
    "    external_ids: (artist)-[:P245]->(ulan_id), \\\n",
    "    items: (artist)-[l:P26]->(spouse)' \\\n",
    "--return 'artist as node1, l.label as label, spouse as node2' \\\n",
    "-o \"$OUT\"/spouses.ulan.wikidata.tsv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    1342    4026   29557 /Users/pedroszekely/Downloads/kypher/wd-workshop/spouses.ulan.wikidata.tsv\n"
     ]
    }
   ],
   "source": [
    "!wc \"$OUT\"/spouses.ulan.wikidata.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "She also constructs a query to count the spouse statements of ULAN artists in the DBpedia dataset an converts the DBpedia property to `P26`, the Wikidata property."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i items -i infobox -i ulan -i external_ids -i labels \\\n",
    "--match '\\\n",
    "    ulan: (ulan_id)-[]->(), \\\n",
    "    external_ids: (artist)-[:P245]->(ulan_id), \\\n",
    "    infobox: (artist)-[:`property:spouse`]->(spouse)' \\\n",
    "--return 'artist as node1, \"P26\" as label, spouse as node2' \\\n",
    "-o \"$OUT\"/spouses.ulan.dbpedia.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Abigail is encouraged as there are almost 1,000 additional statements in DBpedia that she may be able to import into her dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    2270    8022   57656 /Users/pedroszekely/Downloads/kypher/wd-workshop/spouses.ulan.dbpedia.tsv\n"
     ]
    }
   ],
   "source": [
    "!wc \"$OUT\"/spouses.ulan.dbpedia.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Abigail worries that some of the spouses may be strings rather than entities as she has seen Wikipedia infoboxes where some values are links and others are strings. She uses the regex feature in Kypher to count the number of spouses that are Wikidata q-nodes. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count_spouses_with_qnodes\n",
      "449\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$OUT\"/spouses.ulan.dbpedia.tsv \\\n",
    "--match '()-[]->(spouse)' \\\n",
    "--where 'spouse =~ \"^Q[0-9]+\"' \\\n",
    "--return 'count(distinct spouse) as count_spouses_with_qnodes'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Abigail is disappointed to see that only 449 are q-nodes, so uses grep to see what else is in the file. She sees that the DBpedia data is noisy as there are empty strings, numbers that look like dates and entities that do not correspond to Wikidata entities. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1\tlabel\tnode2\n",
      "Q235186\tP26\t''@en\n",
      "Q235186\tP26\tnodemxZbyK2VRrGoaxfdLmyLxw-3223237\n",
      "Q235186\tP26\tnodemxZbyK2VRrGoaxfdLmyLxw-3223238\n",
      "Q466241\tP26\t''@en\n",
      "Q466241\tP26\t1985\n",
      "Q466241\tP26\t'Patrick Robyn'@en\n",
      "Q534385\tP26\t''@en\n",
      "Q534385\tP26\t2010\n",
      "Q3816460\tP26\t''@en\n"
     ]
    }
   ],
   "source": [
    "!grep -v '\\tQ[0-9]' \"$OUT\"/spouses.ulan.dbpedia.tsv | head"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Abigail does not want to take on what looks like a difficult data cleaning and entity linking job, so she keeps the 449 clean entities and puts them in `spouses.ulan.dbpedia.qnodes.tsv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "!grep '\\tQ[0-9]' \"$OUT\"/spouses.ulan.dbpedia.tsv > \"$OUT\"/spouses.ulan.dbpedia.qnodes.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Abigail wants to see how many artists have spouse statements in both Wikidata and DBpedia, so she adds another clause to the query and sees that only 359 are in both, so she can get about 100 new statements from DBpedia. While not a lot, she will diff the two files and keep the new statements (not shown in this notebook)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count(DISTINCT graph_12_c3.\"id\")\n",
      "359\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i items -i infobox -i ulan -i external_ids -i labels \\\n",
    "--match '\\\n",
    "    ulan: (ulan_id)-[]->(), \\\n",
    "    external_ids: (artist)-[:P245]->(ulan_id), \\\n",
    "    infobox: (artist)-[l:`property:spouse`]->(spouse), \\\n",
    "    items: (artist)-[:P26]->(spouse), \\\n",
    "    labels: (spouse)-[]->(spouse_label)' \\\n",
    "--return 'count(distinct l)' "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### spouse statements in DBpedia\n",
    "Abigail gets curious wether it is worth working on a project to augment Wikidata with spouse statements from DBpedia. Now she knows that she needs to focus on the ones that are mapped to q-nodes, so writes a query to fetch all the spouse statements from DBpedia. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   41211\n",
      "CPU times: user 49.8 ms, sys: 23.1 ms, total: 72.9 ms\n",
      "Wall time: 3.62 s\n"
     ]
    }
   ],
   "source": [
    "# paper: dbpedia spouses\n",
    "!$kypher -i infobox -i p31 \\\n",
    "--match ' \\\n",
    "    infobox: (artist)-[:`property:spouse`]->(spouse)' \\\n",
    "--where 'spouse =~ \"^Q[0-9]+\"' \\\n",
    "--return 'artist as node1, \"P26\" as label, spouse as node2' \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Abigail wants to make sure to get clean data, so she adds a constraint to verify that the q-nodes she gets from DBpedia are instances of `Q5`  (human) in Wikidata."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.56 s, sys: 714 ms, total: 3.27 s\n",
      "Wall time: 3min 26s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# paper: dbpedia spouses\n",
    "!$kypher -i infobox -i p31 -i labels \\\n",
    "--match ' \\\n",
    "    infobox: (artist)-[:`property:spouse`]->(spouse), \\\n",
    "    p31: (spouse)-[]->(:Q5)' \\\n",
    "--opt 'labels: (spouse)-[:label]->(spouse_label)' \\\n",
    "--return 'artist as node1, \"P26\" as label, spouse as node2, spouse_label as `node2;label`' \\\n",
    "-o \"$OUT\"/spouses.dbpedia.qnodes.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Abigail sees that testing that the spouses are instances of `Q5` reduces the number of statements, so some of the URIs from DBpedia are either incorrect or incorrectly mapped to Wikidata. She wants to play it safe, so keeps the data that verifies that the spouses are human."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   39778 /Users/pedroszekely/Downloads/kypher/wd-workshop/spouses.dbpedia.qnodes.tsv\n"
     ]
    }
   ],
   "source": [
    "!wc -l \"$OUT\"/spouses.dbpedia.qnodes.tsv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1     label  node2     node2;label\n",
      "Q268177   P26    Q1000505  'Bud Lee'@en\n",
      "Q673856   P26    Q1000682  'Fernando Carrillo'@en\n",
      "Q1325720  P26    Q1000874  'Thomas Montacute, 4th Earl of Salisbury'@en\n",
      "Q264908   P26    Q1001     'Mahatma Gandhi'@en\n",
      "Q8250426  P26    Q10011    'Jiajing Emperor of Ming'@en\n",
      "Q3784373  P26    Q10011    'Jiajing Emperor of Ming'@en\n",
      "Q4834024  P26    Q1001114  'Buddy Baker'@en\n",
      "Q1707377  P26    Q1001130  'Buddy Bregman'@en\n",
      "Q33941    P26    Q1001933  'Helena'@en\n"
     ]
    }
   ],
   "source": [
    "!head \"$OUT\"/spouses.dbpedia.qnodes.tsv | column -ts $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Abigail also gets all the spouse statements from Wikidata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i items \\\n",
    "--match '(artist)-[l:P26]->(spouse)' \\\n",
    "--return 'artist as node1, l.label as label, spouse as node2' \\\n",
    "-o \"$OUT\"/spouses.wikidata.tsv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  681174 /Users/pedroszekely/Downloads/kypher/wd-workshop/spouses.wikidata.tsv\n"
     ]
    }
   ],
   "source": [
    "!wc -l \"$OUT\"/spouses.wikidata.tsv "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Abigail sees that there is a significant difference, so writes a query to identify the common statements in both datasets and sees that there are about clean 7,000 spouse statements in DBpedia that are not present in Wikidata. It will be easy to add them using Wikidata quick statements, a project that she will try to do later.\n",
    "> Kypher can run queries over the two new graphs created in the previous queries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   32453\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$OUT\"/spouses.wikidata.tsv  -i \"$OUT\"/spouses.dbpedia.qnodes.tsv \\\n",
    "--match '\\\n",
    "    wikidata: (person)-[]->(spouse), \\\n",
    "    dbpedia: (person)-[]->(spouse)' \\\n",
    "--return 'distinct person as person, spouse as spouse' \\\n",
    "| wc -l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validate property constraints in Wikidata"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Amir and Sarah are starting a project to find constraint violations in Wikidata. They find that constraints are associated with properties using the `P2302` (property constraint) property. There are over 44,000 constraints, so finding violations is a dauting task as many constraints apply to a very large number of statements."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count\n",
      "44552\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i items \\\n",
    "--match '(property)-[l:P2302]->(constraint)' \\\n",
    "--return 'count(distinct l) as count' "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sarah refines the query to print the count for each type of constraint, and they see that 9 constraint types are significantly more popular than the others. Amir and Sarah decide to focus on the `value type constraint`, as this is the constraint that checks that the value of a statement belongs to specific classes. This constraint is defined for 964 properties, so it is worth working on."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "constraint_label                                        count\n",
      "'item requires statement constraint'@en                 7576\n",
      "'allowed entity types constraint'@en                    6401\n",
      "'format constraint'@en                                  6042\n",
      "'distinct values constraint'@en                         5513\n",
      "'single value constraint'@en                            5453\n",
      "'type constraint'@en                                    5070\n",
      "'property scope constraint'@en                          2908\n",
      "'conflicts-with constraint'@en                          1275\n",
      "'value type constraint'@en                              964\n",
      "'allowed qualifiers constraint'@en                      573\n",
      "'allowed units constraint'@en                           483\n",
      "'required qualifier constraint'@en                      391\n",
      "'range constraint'@en                                   327\n",
      "'value requires statement constraint'@en                320\n",
      "'citation needed constraint'@en                         284\n",
      "'one-of constraint'@en                                  156\n",
      "'integer constraint'@en                                 145\n",
      "'contemporary constraint'@en                            124\n",
      "'inverse constraint'@en                                 110\n",
      "'single-best-value constraint'@en                       101\n",
      "'none of constraint'@en                                 74\n",
      "'no bounds constraint'@en                               74\n",
      "'Commons link constraint'@en                            73\n",
      "'symmetric constraint'@en                               44\n",
      "'multi-value constraint'@en                             27\n",
      "'lexeme requires language constraint'@en                25\n",
      "'difference within range constraint'@en                 9\n",
      "'lexeme requires lexical category constraint'@en        5\n",
      "'one-of qualifier value property constraint'@en         4\n",
      "'lexeme value requires lexical category constraint'@en  1\n"
     ]
    }
   ],
   "source": [
    "# Count distinct P2302 edges per constraint label, most frequent first.\n",
    "# Only the items and labels inputs are referenced by the match clause,\n",
    "# so no other input is passed to the command.\n",
    "!$kypher -i items -i labels \\\n",
    "--match ' \\\n",
    "    items: (property)-[l:P2302]->(constraint), \\\n",
    "    labels: (constraint)-[]->(constraint_label)' \\\n",
    "--return 'constraint_label as constraint_label, count(distinct l) as count' \\\n",
    "--order-by 'count desc' \\\n",
    "| column -ts $'\\t'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Graph Cache:\n",
      "DB file: /Users/pedroszekely/Downloads/kypher/temp.novartis/wikidata.sqlite3.db\n",
      "  size:  142.07 GB   \tfree:  0 Bytes   \tmodified:  2021-07-25 21:24:24\n",
      "\n",
      "KGTK File Information:\n",
      "/Users/pedroszekely/Downloads/kypher/temp.wd-workshop/human.count.year.tsv:\n",
      "  size:  50.79 KB   \tmodified:  2021-07-25 19:00:52   \tgraph:  graph_17\n",
      "/Users/pedroszekely/Downloads/kypher/wd-workshop/class.count.tsv.gz:\n",
      "  size:  322.36 KB   \tmodified:  2021-07-25 20:24:03   \tgraph:  graph_18\n",
      "/Users/pedroszekely/Downloads/kypher/wd-workshop/given-names.year.tsv:\n",
      "  size:  40.45 MB   \tmodified:  2021-07-25 18:57:56   \tgraph:  graph_16\n",
      "/Users/pedroszekely/Downloads/kypher/wd-workshop/spouses.dbpedia.qnodes.tsv:\n",
      "  size:  826.70 KB   \tmodified:  2021-07-25 21:23:33   \tgraph:  graph_15\n",
      "/Users/pedroszekely/Downloads/kypher/wd-workshop/spouses.ulan.dbpedia.tsv:\n",
      "  size:  56.30 KB   \tmodified:  2021-07-25 21:21:12   \tgraph:  graph_19\n",
      "/Users/pedroszekely/Downloads/kypher/wd-workshop/spouses.wikidata.tsv:\n",
      "  size:  15.26 MB   \tmodified:  2021-07-25 21:24:16   \tgraph:  graph_20\n",
      "external_ids:\n",
      "  size:  3.59 GB   \tmodified:  2021-03-11 06:19:15   \tgraph:  graph_7\n",
      "infobox:\n",
      "  size:  1.46 GB   \tmodified:  2021-07-24 11:42:08   \tgraph:  graph_12\n",
      "items:\n",
      "  size:  7.47 GB   \tmodified:  2021-02-26 08:03:09   \tgraph:  graph_1\n",
      "labels:\n",
      "  size:  2.09 GB   \tmodified:  2021-03-15 17:22:24   \tgraph:  graph_5\n",
      "p279:\n",
      "  size:  37.88 MB   \tmodified:  2021-03-10 20:10:45   \tgraph:  graph_4\n",
      "p279star:\n",
      "  size:  529.33 MB   \tmodified:  2021-03-12 01:04:52   \tgraph:  graph_6\n",
      "p31:\n",
      "  size:  1.09 GB   \tmodified:  2021-04-18 13:13:37   \tgraph:  graph_3\n",
      "time:\n",
      "  size:  809.00 MB   \tmodified:  2021-02-26 04:49:21   \tgraph:  graph_2\n",
      "ulan:\n",
      "  size:  348.04 KB   \tmodified:  2021-07-18 09:31:25   \tgraph:  graph_8\n",
      "\n",
      "Graph Table Information:\n",
      "graph_1:\n",
      "  size:  69.23 GB   \tcreated:  2021-07-25 00:00:58\n",
      "  header:  ['id', 'node1', 'label', 'node2', 'rank', 'node2;wikidatatype']\n",
      "graph_12:\n",
      "  size:  16.15 GB   \tcreated:  2021-07-25 04:49:13\n",
      "  header:  ['node1', 'label', 'node2', 'id']\n",
      "graph_15:\n",
      "  size:  2.28 MB   \tcreated:  2021-07-25 21:24:23\n",
      "  header:  ['node1', 'label', 'node2']\n",
      "graph_16:\n",
      "  size:  46.83 MB   \tcreated:  2021-07-25 19:00:59\n",
      "  header:  ['node1', 'year', 'node2', 'node1;label', 'label']\n",
      "graph_17:\n",
      "  size:  96.00 KB   \tcreated:  2021-07-25 19:00:59\n",
      "  header:  ['node1', 'node2', 'label']\n",
      "graph_18:\n",
      "  size:  3.50 MB   \tcreated:  2021-07-25 20:24:07\n",
      "  header:  ['node1', 'node2', 'label']\n",
      "graph_19:\n",
      "  size:  76.00 KB   \tcreated:  2021-07-25 21:21:16\n",
      "  header:  ['node1', 'label', 'node2']\n",
      "graph_2:\n",
      "  size:  6.13 GB   \tcreated:  2021-07-25 00:06:53\n",
      "  header:  ['id', 'node1', 'label', 'node2', 'rank', 'node2;wikidatatype']\n",
      "graph_20:\n",
      "  size:  43.06 MB   \tcreated:  2021-07-25 21:24:22\n",
      "  header:  ['node1', 'label', 'node2']\n",
      "graph_3:\n",
      "  size:  8.80 GB   \tcreated:  2021-07-25 00:13:14\n",
      "  header:  ['id', 'node1', 'label', 'node2']\n",
      "graph_4:\n",
      "  size:  191.14 MB   \tcreated:  2021-07-25 00:13:39\n",
      "  header:  ['id', 'node1', 'label', 'node2']\n",
      "graph_5:\n",
      "  size:  9.03 GB   \tcreated:  2021-07-25 00:22:44\n",
      "  header:  ['id', 'node1', 'label', 'node2']\n",
      "graph_6:\n",
      "  size:  8.69 GB   \tcreated:  2021-07-25 00:29:11\n",
      "  header:  ['node1', 'label', 'node2', 'id']\n",
      "graph_7:\n",
      "  size:  23.77 GB   \tcreated:  2021-07-25 00:47:29\n",
      "  header:  ['id', 'node1', 'label', 'node2', 'rank', 'node2;wikidatatype']\n",
      "graph_8:\n",
      "  size:  1012.00 KB   \tcreated:  2021-07-25 00:47:29\n",
      "  header:  ['node1']\n"
     ]
    }
   ],
   "source": [
    "!$kypher --show-cache"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Took 164.55 minutes to run the notebook from start to end'"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Elapsed time since start_time (set earlier in the notebook), shown in minutes.\n",
    "f\"Took {(round(time.time()) - start_time) / 60:.2f} minutes to run the notebook from start to end\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "kgtk-env",
   "language": "python",
   "name": "kgtk-env"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}