{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parameters\n",
    "# Every value a reader may need to adapt to their own machine is defined in this cell.\n",
    "\n",
    "# Folder on local machine where to create the output and temporary folders\n",
    "output_path = \"/Users/pedroszekely/Downloads/kypher\"\n",
    "\n",
    "# The names of the output and temporary folders\n",
    "output_folder = \"wikidata_os_v5\"\n",
    "temp_folder = \"temp.wikidata_os_v5\"\n",
    "\n",
    "# The location of input Wikidata files (keep the trailing slash)\n",
    "# The wikidata_os files can be downloaded from https://drive.google.com/drive/folders/1V6oAQKmwQ4LJnrBai-uv5gHWphFSCt50?usp=sharing\n",
    "wikidata_folder = \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/\"\n",
    "# Alternative location, previously assigned and then immediately overwritten:\n",
    "# wikidata_folder = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/\"\n",
    "\n",
    "# Location of the cache database for kypher\n",
    "cache_path = \"/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4\"\n",
    "\n",
    "# Whether to delete the cache database\n",
    "delete_database = False\n",
    "\n",
    "# shortcuts to commands\n",
    "kgtk = \"time kgtk --debug\"\n",
    "# kgtk = \"kgtk --debug\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# KGTK Tutorial\n",
    "\n",
    "Beer sites:\n",
    "- https://www.realbeer.com/edu/health/calories.php\n",
    "- http://getdrunknotfat.com/alcohol-content-of-beer/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "import os\n",
    "import subprocess\n",
    "import sys\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import altair as alt\n",
    "\n",
    "import papermill as pm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ALIAS: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/aliases.en.tsv.gz\"\n",
      "CLAIMS: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/claims.tsv.gz\"\n",
      "DESCRIPTION: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/descriptions.en.tsv.gz\"\n",
      "ISA: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.isa.tsv.gz\"\n",
      "ITEM: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/claims.wikibase-item.tsv.gz\"\n",
      "LABEL: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/labels.en.tsv.gz\"\n",
      "OUT: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v5\"\n",
      "P279: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.P279.tsv.gz\"\n",
      "P279STAR: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.P279star.tsv.gz\"\n",
      "PROPERTY_DATATYPES: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/metadata.property.datatypes.tsv.gz\"\n",
      "QUALIFIERS: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/qualifiers.tsv.gz\"\n",
      "QUALIFIERS_TIME: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/qualifiers.time.tsv.gz\"\n",
      "SITELINKS: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/sitelinks.tsv.gz\"\n",
      "STORE: \"/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db\"\n",
      "TEMP: \"/Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5\"\n",
      "WIKIDATA: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/\"\n",
      "kgtk: \"time kgtk --debug\"\n",
      "kypher: \"time kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db\"\n"
     ]
    }
   ],
   "source": [
    "# The names of files in the KGTK Wikidata distribution that we will use in this notebook.\n",
    "file_names = {\n",
    "    \"claims\": \"claims.tsv.gz\",\n",
    "    \"label\": \"labels.en.tsv.gz\",\n",
    "    \"alias\": \"aliases.en.tsv.gz\",\n",
    "    \"description\": \"descriptions.en.tsv.gz\",\n",
    "    \"item\": \"claims.wikibase-item.tsv.gz\",\n",
    "    \"qualifiers\": \"qualifiers.tsv.gz\",\n",
    "    \"sitelinks\": \"sitelinks.tsv.gz\",\n",
    "    \"qualifiers_time\": \"qualifiers.time.tsv.gz\",\n",
    "    \"property_datatypes\": \"metadata.property.datatypes.tsv.gz\",\n",
    "    \"isa\": \"derived.isa.tsv.gz\",\n",
    "    \"p279star\": \"derived.P279star.tsv.gz\",\n",
    "    \"p279\": \"derived.P279.tsv.gz\"\n",
    "}\n",
    "\n",
    "# We will define environment variables to hold the full paths to the files as we will use them in the shell commands.\n",
    "# The list records every variable set here so that they can be printed at the end of the cell.\n",
    "kgtk_environment_variables = []\n",
    "\n",
    "os.environ['WIKIDATA'] = wikidata_folder\n",
    "kgtk_environment_variables.append('WIKIDATA')\n",
    "\n",
    "# One variable per file, named after the upper-cased key (e.g. CLAIMS, LABEL).\n",
    "# os.path.join keeps the path valid whether or not wikidata_folder ends with a path separator.\n",
    "for key, value in file_names.items():\n",
    "    variable = key.upper()\n",
    "    os.environ[variable] = os.path.join(wikidata_folder, value)\n",
    "    kgtk_environment_variables.append(variable)\n",
    "\n",
    "# KGTK creates a SQLite database to index the knowledge graph.\n",
    "# It lives in cache_path when one is configured, otherwise in the temporary folder.\n",
    "if cache_path:\n",
    "    os.environ['STORE'] = os.path.join(cache_path, \"wikidata.sqlite3.db\")\n",
    "else:\n",
    "    os.environ['STORE'] = os.path.join(output_path, temp_folder, \"wikidata.sqlite3.db\")\n",
    "kgtk_environment_variables.append('STORE')\n",
    "\n",
    "# We will create many temporary files, so set up a folder for outputs and one for the temporary files.\n",
    "os.environ['TEMP'] = os.path.join(output_path, temp_folder)\n",
    "os.environ['OUT'] = os.path.join(output_path, output_folder)\n",
    "kgtk_environment_variables.append('TEMP')\n",
    "kgtk_environment_variables.append('OUT')\n",
    "\n",
    "# Environment variables with shortcuts to the commands we use often\n",
    "os.environ['kgtk'] = kgtk\n",
    "os.environ['kypher'] = \"time kgtk query --graph-cache \" + os.environ['STORE']\n",
    "# Alternative with debug output, previously assigned and then immediately overwritten:\n",
    "# os.environ['kypher'] = \"time kgtk --debug query --graph-cache \" + os.environ['STORE']\n",
    "\n",
    "kgtk_environment_variables.append('kgtk')\n",
    "kgtk_environment_variables.append('kypher')\n",
    "\n",
    "# Show the resulting configuration in alphabetical order\n",
    "kgtk_environment_variables.sort()\n",
    "for variable in kgtk_environment_variables:\n",
    "    print(\"{}: \\\"{}\\\"\".format(variable, os.environ[variable]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/pedroszekely/Downloads/kypher\n"
     ]
    }
   ],
   "source": [
    "%cd {output_path}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mkdir: wikidata_os_v5: File exists\n",
      "mkdir: temp.wikidata_os_v5: File exists\n"
     ]
    }
   ],
   "source": [
    "# -p makes this cell safe to re-run: folders that already exist are left untouched\n",
    "# instead of producing a \"File exists\" error.\n",
    "!mkdir -p {output_folder}\n",
    "!mkdir -p {temp_folder}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Wikidata in KGTK\n",
    "KGTK has the ability to import a Wikidata JSON dump and convert it to the KGTK representation to make it easy to process the full Wikidata KG on a laptop. There are 86 files which include all the information available in the Wikidata dump and files containing commonly used information derived from the dump. We partitioned the files because in most use cases you only need to use a subset of the files.\n",
    "\n",
    "The files are very large. `claims.tsv` (23GB compressed) contains all the statements in the Wikidata dump, `qualifiers.tsv` contains the qualifiers of those edges, and `labels.en.tsv`, `aliases.en.tsv` and `descriptions.en.tsv` contain the English labels, aliases and descriptions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-rw-r--r--  1 pedroszekely  staff    68M Nov 16 08:07 /Users/pedroszekely/Downloads/kypher/wikidata_os_v1/aliases.en.tsv.gz\n",
      "-rw-r--r--  1 pedroszekely  staff   4.7G Nov 16 08:05 /Users/pedroszekely/Downloads/kypher/wikidata_os_v1/claims.tsv.gz\n",
      "-rw-r--r--  1 pedroszekely  staff   269M Nov 16 08:08 /Users/pedroszekely/Downloads/kypher/wikidata_os_v1/descriptions.en.tsv.gz\n",
      "-rw-r--r--  1 pedroszekely  staff   376M Nov 16 08:06 /Users/pedroszekely/Downloads/kypher/wikidata_os_v1/labels.en.tsv.gz\n",
      "-rw-r--r--  1 pedroszekely  staff   662M Nov 16 08:43 /Users/pedroszekely/Downloads/kypher/wikidata_os_v1/qualifiers.tsv.gz\n"
     ]
    }
   ],
   "source": [
    "!ls -lh \"$CLAIMS\" \"$QUALIFIERS\" \"$LABEL\" \"$ALIAS\" \"$DESCRIPTION\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`claims.tsv` contains many edges:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 254135077 1578463882 20285305033\n",
      "\n",
      "real\t1m19.657s\n",
      "user\t2m12.459s\n",
      "sys\t0m8.915s\n"
     ]
    }
   ],
   "source": [
    "!time zcat < \"$CLAIMS\" | wc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# KGTK Data Model\n",
    "The KGTK data model is a generalization of RDF and property graphs, inspired by the Wikidata data model. In KGTK, a KG is represented using TSV files with four columns: three columns to store the subject, predicate and object of a triple, and a fourth column to store an identifier for the triple. By convention, we use the heading `id` for the identifier, `node1` for the subject, `node2` for the object and `label` for the predicate, as it labels the edge between `node1` and `node2`. The order of the columns is arbitrary.\n",
    "\n",
    "All KGTK files must include the required `id`, `node1`, `label` and `node2` columns, and can contain additional columns to store additional information about an edge or the nodes in the edge. We will explain the details after we discuss *qualifiers*.\n",
    "Let's take a look at the first few lines of the `claims.tsv` file. We see the four required columns and two additional columns that the Wikidata import includes to facilitate processing of the `claims` file using custom scripts. The `rank` column records the Wikidata rank of a statement, and the `node2;wikidatatype` records the Wikidata type of the value in the `node2` column."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Claims"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "zcat: error writing to output: Broken pipe\n",
      "id                              node1  label  node2                                    rank    node2;wikidatatype\n",
      "P10-P1628-32b85d-7927ece6-0     P10    P1628  \"http://www.w3.org/2006/vcard/ns#Video\"  normal  url\n",
      "P10-P1628-acf60d-b8950832-0     P10    P1628  \"https://schema.org/video\"               normal  url\n",
      "P10-P1629-Q34508-bcc39400-0     P10    P1629  Q34508                                   normal  wikibase-item\n",
      "P10-P1659-P1651-c4068028-0      P10    P1659  P1651                                    normal  wikibase-property\n",
      "P10-P1659-P18-5e4b9c4f-0        P10    P1659  P18                                      normal  wikibase-property\n",
      "P10-P1659-P4238-d21d1ac0-0      P10    P1659  P4238                                    normal  wikibase-property\n",
      "P10-P1659-P51-86aca4c5-0        P10    P1659  P51                                      normal  wikibase-property\n",
      "P10-P1855-Q15075950-7eff6d65-0  P10    P1855  Q15075950                                normal  wikibase-item\n",
      "P10-P1855-Q69063653-c8cdb04c-0  P10    P1855  Q69063653                                normal  wikibase-item\n"
     ]
    }
   ],
   "source": [
    "# head exits after the first lines and closes the pipe; discard zcat's resulting\n",
    "# broken-pipe complaint so that it does not clutter the table.\n",
    "!zcat < \"$CLAIMS\" 2>/dev/null | head | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Wikidata uses numbers to identify items and properties. We can use the `wd` utility (https://github.com/maxlath/wikibase-cli) to understand the first few lines. The second line states that the `P10` property in Wikidata has an equivalent property in another ontology. Notice that each edge has a distinct id. These ids are unique identifiers for statements (the format of the id can be arbitrary, but we assigned ids so that sorting files by id arranges the information so that all edges about a subject are consecutive)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[90mid\u001b[39m P10\n",
      "\u001b[42mLabel\u001b[49m video\n",
      "\u001b[44mDescription\u001b[49m relevant video. For images, use the property P18. For film trailers, qualify with \"object has role\" (P3831)=\"trailer\" (Q622550)\n",
      "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39mWikidata property to link to Commons \u001b[90m(Q18610173)\u001b[39m\n",
      "\n",
      "\u001b[90mid\u001b[39m P1628\n",
      "\u001b[42mLabel\u001b[49m equivalent property\n",
      "\u001b[44mDescription\u001b[49m equivalent property in other ontologies (use in statements on properties, use property URI)\n",
      "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39mWikidata metaproperty for ontology mapping \u001b[90m(Q42842547)\u001b[39m\n",
      "\n",
      "\u001b[90mid\u001b[39m P1629\n",
      "\u001b[42mLabel\u001b[49m subject item of this property\n",
      "\u001b[44mDescription\u001b[49m relationship represented by the property\n",
      "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39mWikidata property for property documentation \u001b[90m(Q19820110)\u001b[39m\n"
     ]
    }
   ],
   "source": [
    "!wd u P10 P1628 P1629"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's look at a more meaningful example. `Q31` (https://www.wikidata.org/wiki/Q31) is the Wikidata item about Belgium. We will use the KGTK query to fetch edges about Belgium. `$kypher` is a shortcut to the `kgtk query` command where in addition we pass in the location of the SQLite database we are using to store the files. KGTK queries use Cypher syntax (https://neo4j.com/developer/cypher/): the following simple query retrieves 10 edges where `node1` is `Q31`, the q-node for Belgium. The results include an edge with `label` `P1036` (Dewey Decimal Classification) and several edges with label `P1081` (human development index)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     2046.59 real      2931.96 user       189.44 sys\n",
      "id                           node1  label  node2     rank    node2;wikidatatype\n",
      "Q31-P1036-c4e1ad-df86eeb8-0  Q31    P1036  \"2--493\"  normal  external-id\n",
      "Q31-P1081-02c2ed-033524b0-0  Q31    P1081  +0.866    normal  quantity\n",
      "Q31-P1081-02c2ed-7971505b-0  Q31    P1081  +0.866    normal  quantity\n",
      "Q31-P1081-068470-c1c63b8d-0  Q31    P1081  +0.889    normal  quantity\n",
      "Q31-P1081-068470-ddac01e0-0  Q31    P1081  +0.889    normal  quantity\n",
      "Q31-P1081-144738-c1851cdc-0  Q31    P1081  +0.905    normal  quantity\n",
      "Q31-P1081-175742-c07ac1c8-0  Q31    P1081  +0.888    normal  quantity\n",
      "Q31-P1081-19636d-c08dd8a8-0  Q31    P1081  +0.896    normal  quantity\n",
      "Q31-P1081-1efc03-433a7a4d-0  Q31    P1081  +0.913    normal  quantity\n",
      "Q31-P1081-1f8602-ddac530d-0  Q31    P1081  +0.852    normal  quantity\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$CLAIMS\" \\\n",
    "--match '(:Q31)-[]-()' \\\n",
    "--limit 10 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The output of the command above is hard to read because we are seeing the numeric Wikidata identifiers. To make the output more readable, we need to look up the labels of the Wikidata nodes. This information is in the `labels.en.tsv` file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "zcat: error writing to output: Broken pipe\n",
      "id              node1  label  node2\n",
      "P10-label-en    P10    label  'video'@en\n",
      "P1000-label-en  P1000  label  'record held'@en\n",
      "P1001-label-en  P1001  label  'applies to jurisdiction'@en\n",
      "P1002-label-en  P1002  label  'engine configuration'@en\n",
      "P1003-label-en  P1003  label  'National Library of Romania ID'@en\n",
      "P1004-label-en  P1004  label  'MusicBrainz place ID'@en\n",
      "P1005-label-en  P1005  label  'Portuguese National Library ID'@en\n",
      "P1006-label-en  P1006  label  'Nationale Thesaurus voor Auteurs ID'@en\n",
      "P1007-label-en  P1007  label  'Lattes Platform number'@en\n"
     ]
    }
   ],
   "source": [
    "# head exits after the first lines and closes the pipe; discard zcat's resulting\n",
    "# broken-pipe complaint so that it does not clutter the table.\n",
    "!zcat < \"$LABEL\" 2>/dev/null | head | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "KGTK accepts multiple files as input, and can do a join to retrieve the label for each property. When using multiple files, it is necessary to tag each clause with the file that provides the data for the clause. For example, the first clause is tagged with `claim` as the word `claim` is part of the file name. The variable `property` is used to connect the two clauses."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      629.00 real       531.54 user       102.50 sys\n",
      "id                           node1  label  node2     label;label\n",
      "Q31-P1036-c4e1ad-df86eeb8-0  Q31    P1036  \"2--493\"  'Dewey Decimal Classification'@en\n",
      "Q31-P1081-02c2ed-033524b0-0  Q31    P1081  +0.866    'Human Development Index'@en\n",
      "Q31-P1081-02c2ed-7971505b-0  Q31    P1081  +0.866    'Human Development Index'@en\n",
      "Q31-P1081-068470-c1c63b8d-0  Q31    P1081  +0.889    'Human Development Index'@en\n",
      "Q31-P1081-068470-ddac01e0-0  Q31    P1081  +0.889    'Human Development Index'@en\n",
      "Q31-P1081-144738-c1851cdc-0  Q31    P1081  +0.905    'Human Development Index'@en\n",
      "Q31-P1081-175742-c07ac1c8-0  Q31    P1081  +0.888    'Human Development Index'@en\n",
      "Q31-P1081-19636d-c08dd8a8-0  Q31    P1081  +0.896    'Human Development Index'@en\n",
      "Q31-P1081-1efc03-433a7a4d-0  Q31    P1081  +0.913    'Human Development Index'@en\n",
      "Q31-P1081-1f8602-ddac530d-0  Q31    P1081  +0.852    'Human Development Index'@en\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$CLAIMS\" -i \"$LABEL\" \\\n",
    "--match 'claim: (n1:Q31)-[l {label: property}]-(n2), label: (property)-[:label]->(property_label)' \\\n",
    "--return 'l as id, n1 as node1, property as label, n2 as node2, property_label as `label;label`' \\\n",
    "--limit 10 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's look at the heads of state of Belgium recorded in property `P35`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      687.06 real       391.17 user       144.53 sys\n",
      "id                            node1  label  node2      node2;label\n",
      "Q31-P35-Q1079522-c82ed584-0   Q31    P35    Q1079522   'Erasme Louis Surlet de Chokier'@en\n",
      "Q31-P35-Q12967-f2b9aaf3-0     Q31    P35    Q12967     'Leopold II of Belgium'@en\n",
      "Q31-P35-Q12971-2088471b-0     Q31    P35    Q12971     'Leopold I of Belgium'@en\n",
      "Q31-P35-Q12973-31c1b700-0     Q31    P35    Q12973     'Leopold III of Belgium'@en\n",
      "Q31-P35-Q12976-f3e8a567-0     Q31    P35    Q12976     'Baudouin I of Belgium'@en\n",
      "Q31-P35-Q155004-619ba603-0    Q31    P35    Q155004    'Philippe I of Belgium'@en\n",
      "Q31-P35-Q3911-137f01fe-0      Q31    P35    Q3911      'Albert II of Belgium'@en\n",
      "Q31-P35-Q445553-7599749f-0    Q31    P35    Q445553    'Prince Charles, Count of Flanders'@en\n",
      "Q31-P35-Q55008046-725dce40-0  Q31    P35    Q55008046  'Albert I of Belgium'@en\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$CLAIMS\" -i \"$LABEL\" \\\n",
    "--match 'claims: (n1:Q31)-[l:P35]->(n2), labels: (n2)-[:label]->(n2_label)' \\\n",
    "--return 'l as id, n1 as node1, l.label as label, n2 as node2, n2_label as `node2;label`' \\\n",
    "--limit 10 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Qualifiers\n",
    "Qualifiers provide additional information about the claims stated in the edges. For `P1081` the qualifiers tell us the year, and for head of state the qualifiers provide information about the period of time and position held by the head of state. The qualifiers can be retrieved using the identifiers of the edges. Let's retrieve the qualifiers associated with the edge for the first head of state (Erasme Louis). To do so, we use the identifier of the edge (`Q31-P35-Q1079522-c82ed584-0`) as `node1` in the `qualifiers.tsv` file. We get three edges, meaning that the edge `Q31/P35/Q1079522` has three qualifiers. Note that the qualifier edges are the same as any other edge in KGTK, having `id`, `node1`, `label` and `node2` columns:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      407.47 real       576.47 user        28.22 sys\n",
      "id                                         node1                        label  node2                     node2;wikidatatype\n",
      "Q31-P35-Q1079522-c82ed584-0-P39-Q477406-0  Q31-P35-Q1079522-c82ed584-0  P39    Q477406                   wikibase-item\n",
      "Q31-P35-Q1079522-c82ed584-0-P580-106076-0  Q31-P35-Q1079522-c82ed584-0  P580   ^1831-02-25T00:00:00Z/11  time\n",
      "Q31-P35-Q1079522-c82ed584-0-P582-774519-0  Q31-P35-Q1079522-c82ed584-0  P582   ^1831-07-20T00:00:00Z/11  time\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$QUALIFIERS\" \\\n",
    "--match '(n1:`Q31-P35-Q1079522-c82ed584-0`)-[l]->(n2)' \\\n",
    "--limit 10 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's make them readable: the following query combines the patterns of the previous two queries to retrieve the labels of the property and node2. The query omits the identifier of the qualifier edges to save space. Also, the headers of the two additional columns can be arbitrary, i.e., you can name them whatever you want; the names used follow a KGTK convention that enables KGTK to automatically parse the output, which is useful if we want to use the output as an input to another KGTK command. The word before the `;` refers to one of the standard columns, and the name after the `;` refers to a property of that element. In this example, we used `label` as the column contains the label of the entity."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       52.73 real        28.95 user         9.37 sys\n",
      "node1                        label  node2                     label;label\n",
      "Q31-P35-Q1079522-c82ed584-0  P39    Q477406                   'position held'@en\n",
      "Q31-P35-Q1079522-c82ed584-0  P580   ^1831-02-25T00:00:00Z/11  'start time'@en\n",
      "Q31-P35-Q1079522-c82ed584-0  P582   ^1831-07-20T00:00:00Z/11  'end time'@en\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$QUALIFIERS\" -i \"$LABEL\" \\\n",
    "--match 'qual: (n1:`Q31-P35-Q1079522-c82ed584-0`)-[l {label: property}]->(n2), labels: (property)-[:label]->(property_label)' \\\n",
    "--return 'n1 as node1, property as label, n2 as node2, property_label as `label;label`' \\\n",
    "--limit 10 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's put all the values of `P35` in a file, which we will conveniently name `Q31.P35.tsv`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        0.88 real         0.57 user         0.16 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$CLAIMS\" \\\n",
    "--match '(n1:Q31)-[l:P35]->(n2)' \\\n",
    "--return 'l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
    "-o \"$TEMP\"/Q31.P35.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we are going to combine the `P35` edges of Belgium with the qualifiers. To do this we will run a query that uses the edges that we stored in `Q31.P35.tsv`, and retrieve the qualifiers for each of those edges; the result of our query will be the qualifier edges of the head of state edges. To union the qualifier edges with the claim edges, we feed the output of the query to the `cat` command (concatenate), and then feed the output to the `sort2` command to sort the edges. The first 12 lines of the result (the header and 11 edges) are shown below. We see a claim edge followed by the qualifiers defined for it.\n",
    "\n",
    "This snippet illustrates that KGTK commands can be chained using the `/` chain operator to compose more complex workflows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id                                         node1                        label  node2\n",
      "Q31-P35-Q1079522-c82ed584-0                Q31                          P35    Q1079522\n",
      "Q31-P35-Q1079522-c82ed584-0-P39-Q477406-0  Q31-P35-Q1079522-c82ed584-0  P39    Q477406\n",
      "Q31-P35-Q1079522-c82ed584-0-P580-106076-0  Q31-P35-Q1079522-c82ed584-0  P580   ^1831-02-25T00:00:00Z/11\n",
      "Q31-P35-Q1079522-c82ed584-0-P582-774519-0  Q31-P35-Q1079522-c82ed584-0  P582   ^1831-07-20T00:00:00Z/11\n",
      "Q31-P35-Q12967-f2b9aaf3-0                  Q31                          P35    Q12967\n",
      "Q31-P35-Q12967-f2b9aaf3-0-P39-Q13592862-0  Q31-P35-Q12967-f2b9aaf3-0    P39    Q13592862\n",
      "Q31-P35-Q12967-f2b9aaf3-0-P580-f29037-0    Q31-P35-Q12967-f2b9aaf3-0    P580   ^1865-12-17T00:00:00Z/11\n",
      "Q31-P35-Q12967-f2b9aaf3-0-P582-136f02-0    Q31-P35-Q12967-f2b9aaf3-0    P582   ^1909-12-17T00:00:00Z/11\n",
      "Q31-P35-Q12971-2088471b-0                  Q31                          P35    Q12971\n",
      "Q31-P35-Q12971-2088471b-0-P39-Q13592862-0  Q31-P35-Q12971-2088471b-0    P39    Q13592862\n",
      "Q31-P35-Q12971-2088471b-0-P580-a35d41-0    Q31-P35-Q12971-2088471b-0    P580   ^1831-06-04T00:00:00Z/11\n",
      "        1.61 real         2.27 user         0.55 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$QUALIFIERS\" -i \"$TEMP\"/Q31.P35.tsv \\\n",
    "--match 'P35: ()-[l]->(), qual: (l)-[lq]->(n2)' \\\n",
    "--return 'lq as id, l as node1, lq.label as label, n2 as node2' \\\n",
    "/ cat -i - -i \"$TEMP\"/Q31.P35.tsv \\\n",
    "/ sort2 \\\n",
    "| head -12 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Summary\n",
    "\n",
    "- KGTK represents graphs in TSV files with standard columns `id`, `node1`, `label` and `node2`\n",
    "- It is possible to include arbitrary additional columns in KGTK files\n",
    "- The identifier of an edge can be used as a node in another edge enabling the representation of edges about edges\n",
    "- KGTK provides a powerful query command based on Cypher as well as a host of other commands, type `kgtk --help` to see the list of commands."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use Case: A Knowledge Graph About Alcoholic Beverages\n",
    "We are going to build a small KG about alcoholic beverages by extracting from Wikidata the subgraph that relates to alcoholic beverages (https://www.wikidata.org/wiki/Q154)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 1: create a list of all descendants of `alcoholic beverage` (https://www.wikidata.org/wiki/Q154)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[90mid\u001b[39m Q154\n",
      "\u001b[42mLabel\u001b[49m alcoholic beverage\n",
      "\u001b[44mDescription\u001b[49m drink containing alcohols, typically ethanol\n",
      "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39m drug \u001b[90m(Q8386)\u001b[39m | carcinogen \u001b[90m(Q187661)\u001b[39m\n",
      "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mdrink \u001b[90m(Q40050)\u001b[39m\n"
     ]
    }
   ],
   "source": [
    "!wd u Q154"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Wikidata uses two properties to organize entities in a hierarchy: the `instance of` property (`P31`) and the `subclass of` (`P279`) property. In many cases, the distinction between instance of and subclass of is subtle, and we find many situations in Wikidata where either one or the other is used to organize hierarchies. For this reason, we created a new property called `isa` that contains the union of `P31` and `P279` and stored it in the file `derived.isa.tsv`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1\tlabel\tnode2\n",
      "P10\tisa\tQ18610173\n",
      "P1000\tisa\tQ18608871\n",
      "P1001\tisa\tQ15720608\n",
      "P1001\tisa\tQ22984026\n",
      "zcat: error writing to output: Broken pipe\n"
     ]
    }
   ],
   "source": [
    "# head exits after the first lines and closes the pipe; discard zcat's resulting\n",
    "# broken-pipe complaint so that it does not clutter the output.\n",
    "!zcat < \"$ISA\" 2>/dev/null | head -5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To get all the alcoholic beverages, we need to get all entities that are `isa` of alcoholic beverage (`Q154`) or that are `isa` of any descendant of `Q154` in the `subclass of` (`P279`) hierarchy. The length of the chain of `P279` edges can be arbitrarily long. To support this use case, KGTK offers the `derived.P279star.tsv` file that contains edges `n1/P279star/n2` if `n1` is a descendant of `n2` on chains of `P279` edges, including chains of zero length (`n1/P279star/n1`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1     label     node2     id\n",
      "zcat: Q1000032  P279star  Q1000032  Q1000032-P279star-Q1000032-0000\n",
      "Q1000032  P279star  Q1150070  Q1000032-P279star-Q1150070-0000\n",
      "Q1000032  P279star  Q1190554  Q1000032-P279star-Q1190554-0000\n",
      "Q1000032  P279star  Q133500   Q1000032-P279star-Q133500-0000\n",
      "error writing to output: Broken pipe\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$P279STAR\" | head -5 | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To get all alcoholic beverages, we need to find all nodes `n1` that are connected to `Q154` with an `isa` edge and a chain of `P279` edges:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      285.63 real       381.88 user        22.08 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$ISA\" -i \"$P279STAR\" -i \"$LABEL\" \\\n",
    "--match 'isa: (n1)-[]->(n2), star: (n2)-[]->(n3:Q154), label: (n1)-[]->(n1l)' \\\n",
    "--return 'n1 as node1, n1l as `node1;label`, n3 as node2, \"isastar\" as label' \\\n",
    "-o \"$TEMP\"/Q154.descendant.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here is a sample of alcoholic beverages in Wikidata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1      node1;label                  node2  label\n",
      "Q1350656   'Corn whiskey'@en            Q154   isastar\n",
      "Q20713240  'Buckwheat whisky'@en        Q154   isastar\n",
      "Q2535077   'Rye Whiskey'@en             Q154   isastar\n",
      "Q536976    'Canadian whisky'@en         Q154   isastar\n",
      "Q7991845   'Wheat whiskey'@en           Q154   isastar\n",
      "Q10429117  'Beyaz'@en                   Q154   isastar\n",
      "Q1069954   'Prosecco'@en                Q154   isastar\n",
      "Q1094850   'Clairette du Languedoc'@en  Q154   isastar\n",
      "Q1135592   'Cortese di Gavi'@en         Q154   isastar\n"
     ]
    }
   ],
   "source": [
    "!head \"$TEMP\"/Q154.descendant.tsv | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And the total number:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    3251   16116  133341 /Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/Q154.descendant.tsv\n"
     ]
    }
   ],
   "source": [
    "!wc \"$TEMP\"/Q154.descendant.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The computation of `Q154.descendant.tsv` can be implemented in SPARQL using the common `P31/P279*` graph pattern, but the query will time out if the result size is large. For example, the query will time out when requesting all descendants of chemical compounds, as there are over one million chemical compounds in Wikidata. The query can be easily done in KGTK."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2: get the incoming and outgoing edges\n",
    "We want our graph to have the neighbors of all alcoholic beverages, so we need to get the incoming and outgoing edges.\n",
    "\n",
    "The following query gets the outgoing edges."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        2.35 real         0.84 user         0.41 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$CLAIMS\" -i \"$TEMP\"/Q154.descendant.tsv \\\n",
    "--match 'Q154: (n1)-[]->(), claims: (n1)-[l]->(n2)' \\\n",
    "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
    "-o \"$TEMP\"/Q154.node1.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We see that we are getting several properties for our items:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id                                   node1     label  node2\n",
      "Q1000737-P1435-Q17297633-53903946-0  Q1000737  P1435  Q17297633\n",
      "Q1000737-P1454-Q460178-8ad4931b-0    Q1000737  P1454  Q460178\n",
      "Q1000737-P159-Q16003-31e24011-0      Q1000737  P159   Q16003\n",
      "Q1000737-P17-Q183-24107fe2-0         Q1000737  P17    Q183\n",
      "Q1000737-P18-147fc9-667304f8-0       Q1000737  P18    \"Marthabräuhalle 2011-04-03.jpg\"\n",
      "Q1000737-P31-Q131734-f97bd6f6-0      Q1000737  P31    Q131734\n",
      "Q1000737-P31-Q15075508-a4c83928-0    Q1000737  P31    Q15075508\n",
      "Q1000737-P373-689157-3110aade-0      Q1000737  P373   \"Marthabräu\"\n",
      "Q1000737-P452-Q869095-f5d8e7a2-0     Q1000737  P452   Q869095\n",
      "zcat: error writing to output: Broken pipe\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/Q154.node1.tsv.gz | head | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now get the incoming edges:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        2.00 real         0.75 user         0.36 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$CLAIMS\" -i \"$TEMP\"/Q154.descendant.tsv \\\n",
    "--match 'Q154: (n1)-[]->(), claims: (n3)-[l]->(n1)' \\\n",
    "--return 'distinct l as id, n3 as node1, l.label as label, n1 as node2' \\\n",
    "-o \"$TEMP\"/Q154.node2.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here is a sample of the edges we are getting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "zcat: id                                  node1      label  node2\n",
      "Q1350656-P279-Q1007164-7e3ecba9-0   Q1350656   P279   Q1007164\n",
      "error writing to outputQ20713240-P279-Q1007164-b3112260-0  Q20713240  P279   Q1007164\n",
      ": Q2535077-P279-Q1007164-b2d3684b-0   Q2535077   P279   Q1007164\n",
      "Broken pipe\n",
      "Q536976-P279-Q1007164-8bf7467b-0    Q536976    P279   Q1007164\n",
      "Q7991845-P279-Q1007164-18bc383a-0   Q7991845   P279   Q1007164\n",
      "Q10337004-P186-Q10210-c56dd7ce-0    Q10337004  P186   Q10210\n",
      "Q10429117-P31-Q10210-d342f061-0     Q10429117  P31    Q10210\n",
      "Q1051699-P279-Q10210-65d32c67-0     Q1051699   P279   Q10210\n",
      "Q1058259-P279-Q10210-e204554a-0     Q1058259   P279   Q10210\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/Q154.node2.tsv.gz | head | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Concatenate the incoming and outgoing edges to put them in a single file:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        0.96 real         0.84 user         0.11 sys\n"
     ]
    }
   ],
   "source": [
    "!$kgtk cat -i \"$TEMP\"/Q154.node1.tsv.gz -i \"$TEMP\"/Q154.node2.tsv.gz -o \"$TEMP\"/Q154.claims.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have about 28,000 edges:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   28142  116045 1584824\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/Q154.claims.tsv.gz | wc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Summary of where we are:\n",
    "- Computed the list of entities below alcoholic beverage\n",
    "- Found all incoming and outgoing edges to these entities; for the new entities we bring in, we have no information, we only have the q-node\n",
    "\n",
    "Not having any information about the entities connected to the alcoholic beverages is limiting, so let's get their outgoing edges. We run the query with `Q154.claims.tsv.gz`, which will use all the entities in our graph, including the alcoholic beverages for which we already got outgoing edges; no harm done, as we can eliminate duplicates later."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        5.27 real         3.61 user         0.51 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$CLAIMS\" -i \"$TEMP\"/Q154.claims.tsv.gz \\\n",
    "--match 'Q154: ()-[]->(n1), claims: (n1)-[l]->(n2)' \\\n",
    "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
    "-o \"$TEMP\"/Q154.hop.out.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a sanity check, let's take a peek:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id                             node1  label  node2\n",
      "Q1000-P1036-9bef62-f77ac5cf-0  Q1000  P1036  \"2--6721\"\n",
      "Q1000-P1081-0d345f-3a33abf5-0  Q1000  P1081  +0.641\n",
      "Q1000-P1081-0d345f-6da37c02-0  Q1000  P1081  +0.641\n",
      "Q1000-P1081-1100e3-c7631769-0  Q1000  P1081  +0.624\n",
      "Q1000-P1081-1ada51-7c71c229-0  Q1000  P1081  +0.639\n",
      "Q1000-P1081-345681-88a99cab-0  Q1000  P1081  +0.702\n",
      "Q1000-P1081-347db1-da0e5e03-0  Q1000  P1081  +0.637\n",
      "Q1000-P1081-419245-b03a8b59-0  Q1000  P1081  +0.647\n",
      "Q1000-P1081-419245-f8cd58e8-0  Q1000  P1081  +0.647\n",
      "zcat: error writing to output: Broken pipe\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/Q154.hop.out.tsv.gz | head | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's consolidate our edge files into one larger file. We use compact to remove duplicates and sort to keep edges for the same subject together:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        4.65 real         6.28 user         0.63 sys\n"
     ]
    }
   ],
   "source": [
    "!$kgtk cat -i \"$TEMP\"/Q154.claims.tsv.gz -i \"$TEMP\"/Q154.hop.out.tsv.gz \\\n",
    "/ compact \\\n",
    "/ sort2 \\\n",
    "-o \"$TEMP\"/Q154.edges.1.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we have over 165,000 edges:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  165133  678398 8868474\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/Q154.edges.1.tsv.gz | wc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Take a peek:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id                                node1  label  node2\n",
      "P1389-P1855-Q1109662-9e2ef218-0   P1389  P1855  Q1109662\n",
      "P1582-P1855-Q17329207-f4ef508d-0  P1582  P1855  Q17329207\n",
      "P2581-P1855-Q7639844-08b3a4c7-0   P2581  P1855  Q7639844\n",
      "P2665-P1855-Q1067702-402a80a9-0   P2665  P1855  Q1067702\n",
      "P2665-P1855-Q170210-30d44f0b-0    P2665  P1855  Q170210\n",
      "P5420-P1855-Q44-209cffb1-0        P5420  P1855  Q44\n",
      "P5420-P1855-Q722338-73d7be75-0    P5420  P1855  Q722338\n",
      "P6088-P1855-Q1543214-3d934541-0   P6088  P1855  Q1543214\n",
      "P6088-P1855-Q4626-4ed65964-0      P6088  P1855  Q4626\n",
      "zcat: error writing to output: Broken pipe\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/Q154.edges.1.tsv.gz | head | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once we have all the alcoholic beverages, we want to get the upper ontology of all the classes used, so that every class in our KG has a path to the root of the ontology. For example, first go to `drink` (`Q40050`), then to `liquid` (`Q11435`), then `fluid` (`Q102205`) and so on until we reach `entity` (`Q35120`).\n",
    "\n",
    "To do this, we need to get all the `isa` of all items in our graph, then get `P279star` so we get the list of all classes that these items descend from. Finally we need to get all the `P279` edges between them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       12.28 real         9.14 user         0.95 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$TEMP\"/Q154.edges.1.tsv.gz -i \"$P279STAR\" -i \"$ISA\" \\\n",
    "--match 'Q154: (n1)-[]->(), isa: (n1)-[]->(n2), P279: (n2)-[]->(class)' \\\n",
    "--return 'distinct class as node1' \\\n",
    "-o \"$TEMP\"/Q154.classes.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have almost 3,000 classes in the upper ontology for the entities in our graph:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    2846    2846   24939 /Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/Q154.classes.tsv\n"
     ]
    }
   ],
   "source": [
    "!wc \"$TEMP\"/Q154.classes.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now use the `derived.P279.tsv` file to get the `P279` edges that connect a class to its superclass."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        4.16 real         5.91 user         0.34 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$TEMP\"/Q154.classes.tsv -i \"$P279\" \\\n",
    "--match 'Q154: (class)-[]->(), P279: (class)-[l]->(super)' \\\n",
    "--return 'distinct l as id, class as node1, l.label as label, super as node2' \\\n",
    "-o \"$TEMP\"/Q154.P279.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We get close to 5,000 `P279` edges in the upper ontology; we will take care of potential duplicates at a final cleanup step:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    4517   18068  249492 /Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/Q154.P279.tsv\n"
     ]
    }
   ],
   "source": [
    "!wc \"$TEMP\"/Q154.P279.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We see several q-nodes below `entity` (`Q35120`), a good indication that we computed the upper ontology correctly:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Q16686448-P279-Q35120-674edbf9-0  Q16686448  P279  Q35120\n",
      "Q35120-P279-25b964-0520e300-0     Q35120     P279  novalue\n",
      "Q58415929-P279-Q35120-75659d0c-0  Q58415929  P279  Q35120\n",
      "Q23958946-P279-Q35120-70a9ed90-0  Q23958946  P279  Q35120\n",
      "Q488383-P279-Q35120-5fad2ad7-0    Q488383    P279  Q35120\n"
     ]
    }
   ],
   "source": [
    "!grep Q35120 \"$TEMP\"/Q154.P279.tsv | head -5 | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's consolidate the edges again:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        4.41 real         5.94 user         0.59 sys\n"
     ]
    }
   ],
   "source": [
    "!$kgtk cat -i \"$TEMP\"/Q154.edges.1.tsv.gz -i \"$TEMP\"/Q154.P279.tsv \\\n",
    "/ compact \\\n",
    "/ sort2 \\\n",
    "-o \"$TEMP\"/Q154.edges.2.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have almost 170,000 edges:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  169047  694054 9085731\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/Q154.edges.2.tsv.gz | wc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Summary:\n",
    "- We have the instances of alcoholic beverages\n",
    "- We added incoming and outgoing edges\n",
    "- For the outgoing edges, we went one hop forward\n",
    "- We got the upper ontology\n",
    "\n",
    "The properties are also items in Wikidata, so let's collect them all and get their edges."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        1.53 real         1.83 user         0.19 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$TEMP\"/Q154.edges.2.tsv.gz \\\n",
    "--match '()-[l {label: property}]->()' \\\n",
    "--return 'distinct property as node1' \\\n",
    "-o \"$TEMP\"/Q154.properties.tsv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1\n",
      "P10\n",
      "P1001\n",
      "P1003\n",
      "P1004\n",
      "P1005\n",
      "P1006\n",
      "P101\n",
      "P1014\n",
      "P1015\n"
     ]
    }
   ],
   "source": [
    "!head \"$TEMP\"/Q154.properties.tsv | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's get the edges of these properties:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        0.99 real         0.67 user         0.18 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$CLAIMS\" -i \"$TEMP\"/Q154.properties.tsv \\\n",
    "--match 'Q154: (p)-[]->(), claims: (p)-[l]->(n2)' \\\n",
    "--return 'distinct l as id, p as node1, l.label as label, n2 as node2' \\\n",
    "-o \"$TEMP\"/Q154.properties.edges.tsv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Take a peek. The output looks like what we had before because the file is sorted; let's proceed:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id                              node1  label  node2\n",
      "P10-P1628-32b85d-7927ece6-0     P10    P1628  \"http://www.w3.org/2006/vcard/ns#Video\"\n",
      "P10-P1628-acf60d-b8950832-0     P10    P1628  \"https://schema.org/video\"\n",
      "P10-P1629-Q34508-bcc39400-0     P10    P1629  Q34508\n",
      "P10-P1659-P1651-c4068028-0      P10    P1659  P1651\n",
      "P10-P1659-P18-5e4b9c4f-0        P10    P1659  P18\n",
      "P10-P1659-P4238-d21d1ac0-0      P10    P1659  P4238\n",
      "P10-P1659-P51-86aca4c5-0        P10    P1659  P51\n",
      "P10-P1855-Q15075950-7eff6d65-0  P10    P1855  Q15075950\n",
      "P10-P1855-Q69063653-c8cdb04c-0  P10    P1855  Q69063653\n"
     ]
    }
   ],
   "source": [
    "!head \"$TEMP\"/Q154.properties.edges.tsv | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's consolidate the edges again:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        5.03 real         6.84 user         0.64 sys\n"
     ]
    }
   ],
   "source": [
    "!$kgtk cat -i \"$TEMP\"/Q154.edges.2.tsv.gz -i \"$TEMP\"/Q154.properties.edges.tsv \\\n",
    "/ compact \\\n",
    "/ sort2 \\\n",
    "-o \"$TEMP\"/Q154.edges.3.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The number of edges grew a bit, to about 197,500:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  197521  811687 10791930\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/Q154.edges.3.tsv.gz | wc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Summary:\n",
    "- We have the instances of alcoholic beverages\n",
    "- We added incoming and outgoing edges\n",
    "- For the outgoing edges, we went one hop forward\n",
    "- We got the upper ontology\n",
    "- And we have the edges on all the properties being used\n",
    "\n",
    "We will stop adding nodes to the KG at this time, and proceed to add the labels for all the nodes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 3: get the labels, aliases and descriptions of all the items in our KG\n",
    "Before we start, let's define an environment variable to hold the final edges file so that if we change our mind later, we can update it without having to change the commands below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ[\"Q154GRAPH\"] = os.environ[\"TEMP\"] + \"/Q154.edges.3.tsv.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/Q154.edges.3.tsv.gz\n"
     ]
    }
   ],
   "source": [
    "!ls \"$Q154GRAPH\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get the labels of the `node1` nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        3.18 real         2.44 user         0.45 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$Q154GRAPH\" -i \"$LABEL\" \\\n",
    "--match 'Q154: (n1)-[]-(), label: (n1)-[l]->(n2)' \\\n",
    "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
    "-o \"$TEMP\"/Q154.label.node1.tsv.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id              node1  label  node2\n",
      "P10-label-en    P10    label  'video'@en\n",
      "P1001-label-en  P1001  label  'applies to jurisdiction'@en\n",
      "P1003-label-en  P1003  label  'National Library of Romania ID'@en\n",
      "P1004-label-en  P1004  label  'MusicBrainz place ID'@en\n",
      "P1005-label-en  P1005  label  'Portuguese National Library ID'@en\n",
      "P1006-label-en  P1006  label  'Nationale Thesaurus voor Auteurs ID'@en\n",
      "P101-label-en   P101   label  'field of work'@en\n",
      "P1014-label-en  P1014  label  'Getty AAT ID'@en\n",
      "P1015-label-en  P1015  label  'NORAF ID'@en\n",
      "zcat: error writing to output: Broken pipe\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/Q154.label.node1.tsv.gz | head | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get the labels of the `node2` nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       46.12 real        34.97 user         6.91 sys\n"
     ]
    }
   ],
   "source": [
    "# Labels of edge targets: bind the label value to its own variable (n2label).\n",
    "# Reusing n2 on both ends of the label edge would only match self-loops and yield no labels.\n",
    "!$kypher -i \"$Q154GRAPH\" -i \"$LABEL\" \\\n",
    "--match 'Q154: ()-[]->(n2), label: (n2)-[l]->(n2label)' \\\n",
    "--return 'distinct l as id, n2 as node1, l.label as label, n2label as node2' \\\n",
    "-o \"$TEMP\"/Q154.label.node2.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Concatenate the two label files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        1.10 real         0.75 user         0.18 sys\n"
     ]
    }
   ],
   "source": [
    "!$kgtk cat -i \"$TEMP\"/Q154.label.node1.tsv.gz -i \"$TEMP\"/Q154.label.node2.tsv.gz \\\n",
    "-o \"$TEMP\"/labels.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get the aliases of `node1` nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       24.79 real        37.19 user         1.39 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$Q154GRAPH\" -i \"$ALIAS\" \\\n",
    "--match 'Q154: (n1)-[]-(), alias: (n1)-[l]->(n2)' \\\n",
    "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
    "-o \"$TEMP\"/Q154.alias.node1.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get the aliases of `node2` nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        5.75 real         0.84 user         0.33 sys\n"
     ]
    }
   ],
   "source": [
    "# Aliases of edge targets: bind the alias value to its own variable (n2alias).\n",
    "# Reusing n2 on both ends of the alias edge would only match self-loops and yield no aliases.\n",
    "!$kypher -i \"$Q154GRAPH\" -i \"$ALIAS\" \\\n",
    "--match 'Q154: ()-[]->(n2), alias: (n2)-[l]->(n2alias)' \\\n",
    "--return 'distinct l as id, n2 as node1, l.label as label, n2alias as node2' \\\n",
    "-o \"$TEMP\"/Q154.alias.node2.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Concatenate the two alias files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        0.91 real         0.72 user         0.14 sys\n"
     ]
    }
   ],
   "source": [
    "!$kgtk cat -i \"$TEMP\"/Q154.alias.node1.tsv.gz -i \"$TEMP\"/Q154.alias.node2.tsv.gz \\\n",
    "-o \"$TEMP\"/alias.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get the descriptions of `node1` nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      202.73 real       287.06 user        13.62 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$Q154GRAPH\" -i \"$DESCRIPTION\" \\\n",
    "--match 'Q154: (n1)-[]-(), description: (n1)-[l]->(n2)' \\\n",
    "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
    "-o \"$TEMP\"/Q154.description.node1.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get the descriptions of `node2` nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       40.84 real        31.30 user         7.57 sys\n"
     ]
    }
   ],
   "source": [
    "# Descriptions of edge targets: bind the description value to its own variable (n2desc).\n",
    "# Reusing n2 on both ends of the description edge would only match self-loops and yield no descriptions.\n",
    "!$kypher -i \"$Q154GRAPH\" -i \"$DESCRIPTION\" \\\n",
    "--match 'Q154: ()-[]->(n2), description: (n2)-[l]->(n2desc)' \\\n",
    "--return 'distinct l as id, n2 as node1, l.label as label, n2desc as node2' \\\n",
    "-o \"$TEMP\"/Q154.description.node2.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Concatenate the two description files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        0.81 real         0.65 user         0.12 sys\n"
     ]
    }
   ],
   "source": [
    "!$kgtk cat -i \"$TEMP\"/Q154.description.node1.tsv.gz -i \"$TEMP\"/Q154.description.node2.tsv.gz \\\n",
    "-o \"$TEMP\"/description.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 4: get the qualifiers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        5.36 real         2.23 user         0.79 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$Q154GRAPH\" -i \"$QUALIFIERS\" \\\n",
    "--match 'Q154: ()-[l]->(), qual: (l)-[lq]->(n2)' \\\n",
    "--return 'lq as id, l as node1, lq.label as label, n2 as node2' \\\n",
    "-o \"$OUT\"/qualifiers.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Take a peek at the qualifier edges:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "zcat: error writing to output: Broken pipe\n",
      "id                                                node1                           label  node2\n",
      "P10-P1855-Q15075950-7eff6d65-0-P10-54b214-0       P10-P1855-Q15075950-7eff6d65-0  P10    \"Smoorverliefd 12 september.webm\"\n",
      "P10-P1855-Q15075950-7eff6d65-0-P3831-Q622550-0    P10-P1855-Q15075950-7eff6d65-0  P3831  Q622550\n",
      "P10-P1855-Q69063653-c8cdb04c-0-P10-6fb08f-0       P10-P1855-Q69063653-c8cdb04c-0  P10    \"Couch Commander.webm\"\n",
      "P10-P1855-Q7378-555592a4-0-P10-8a982d-0           P10-P1855-Q7378-555592a4-0      P10    \"Elephants Dream (2006).webm\"\n",
      "P10-P2302-Q21502404-d012aef4-0-P1793-f4c2ed-0     P10-P2302-Q21502404-d012aef4-0  P1793  \"(?i).+\\\\\\\\.(webm\\\\|ogv\\\\|ogg\\\\|gif)\"\n",
      "P10-P2302-Q21502404-d012aef4-0-P2316-Q21502408-0  P10-P2302-Q21502404-d012aef4-0  P2316  Q21502408\n",
      "P10-P2302-Q21502404-d012aef4-0-P2916-cb0917-0     P10-P2302-Q21502404-d012aef4-0  P2916  'filename with extension: webm, ogg, ogv, or gif (case insensitive)'@en\n",
      "P10-P2302-Q21510851-5224fe0b-0-P2306-P175-0       P10-P2302-Q21510851-5224fe0b-0  P2306  P175\n",
      "P10-P2302-Q21510851-5224fe0b-0-P2306-P180-0       P10-P2302-Q21510851-5224fe0b-0  P2306  P180\n"
     ]
    }
   ],
   "source": [
    "# Peek at the qualifier edges; the kypher query above writes them to $OUT/qualifiers.tsv.gz.\n",
    "!zcat < \"$OUT\"/qualifiers.tsv.gz | head | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  109816  446163 10639203\n"
     ]
    }
   ],
   "source": [
    "# Count the qualifier edges; the kypher query above writes them to $OUT/qualifiers.tsv.gz.\n",
    "!zcat < \"$OUT\"/qualifiers.tsv.gz | wc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 5: consolidate all the files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2020-12-13 16:52:20--  https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 3108 (3.0K) [text/plain]\n",
      "Saving to: ‘/Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/kgtk.properties.tsv’\n",
      "\n",
      "/Users/pedroszekely 100%[===================>]   3.04K  --.-KB/s    in 0s      \n",
      "\n",
      "2020-12-13 16:52:22 (17.0 MB/s) - ‘/Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/kgtk.properties.tsv’ saved [3108/3108]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv -O \"$TEMP\"/kgtk.properties.tsv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1     label        node2                                   id\n",
      "isa       label        \"is a\"@en                               isa-label-\"is a\"@en-0000\n",
      "isa       alias        \"isa\"@en                                isa-alias-\"isa\"@en-0000\n",
      "isa       description  \"Instance or subclass relationship\"@en  isa-description-\"Instance or subclass relationship\"@en-0000\n",
      "isa       P31          Q18616576                               isa-P31-Q18616576-0000\n",
      "isa       P31          Q28326461                               isa-P31-Q28326461-0000\n",
      "isa       P31          Q18647519                               isa-P31-Q18647519-0000\n",
      "isa       data_type    wikibase-item                           isa-data_type-item-0000\n",
      "P279star  label        \"is a\"@en                               P279star-label-\"is a\"@en-0000\n",
      "P279star  alias        \"isa\"@en                                P279star-alias-\"isa\"@en-0000\n"
     ]
    }
   ],
   "source": [
    "!head \"$TEMP\"/kgtk.properties.tsv | column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id\tnode1\tlabel\tnode2\n",
      "P10-datatype\tP10\tdatatype\tcommonsMedia\n",
      "P1000-datatype\tP1000\tdatatype\twikibase-item\n",
      "P1001-datatype\tP1001\tdatatype\twikibase-item\n",
      "P1002-datatype\tP1002\tdatatype\twikibase-item\n",
      "P1003-datatype\tP1003\tdatatype\texternal-id\n",
      "P1004-datatype\tP1004\tdatatype\texternal-id\n",
      "P1005-datatype\tP1005\tdatatype\texternal-id\n",
      "P1006-datatype\tP1006\tdatatype\texternal-id\n",
      "P1007-datatype\tP1007\tdatatype\texternal-id\n",
      "zcat: error writing to output: Broken pipe\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$PROPERTY_DATATYPES\" | head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        0.80 real         0.62 user         0.15 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$Q154GRAPH\" -i \"$PROPERTY_DATATYPES\" \\\n",
    "--match 'Q15: (n1)-[]->(), property: (n1)-[l:datatype]->(n2)' \\\n",
    "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n",
    "-o \"$TEMP\"/Q15.metadata.property.datatype.tsv.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        5.44 real         7.13 user         0.67 sys\n"
     ]
    }
   ],
   "source": [
    "!$kgtk cat \\\n",
    "-i \"$TEMP\"/Q154.label.node2.tsv.gz \\\n",
    "-i \"$TEMP\"/alias.tsv.gz \\\n",
    "-i \"$TEMP\"/description.tsv.gz \\\n",
    "-i \"$TEMP\"/Q154.edges.3.tsv.gz \\\n",
    "-i \"$TEMP\"/kgtk.properties.tsv \\\n",
    "-i \"$TEMP\"/Q15.metadata.property.datatype.tsv.gz \\\n",
    "/ compact \\\n",
    "/ sort2 \\\n",
    "-o \"$OUT\"/all.tsv.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  218110  955849 12264507\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$OUT\"/all.tsv.gz | wc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 6: partition the files to follow the conventions KGTK uses for Wikidata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Stop here: everything below is Pedro's scratchpad and will be deleted later"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cleanup\n",
    "\n",
    "Remove `novalue` and `somevalue`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      510.97 real       166.66 user       197.67 sys\n",
      "id                           node1  label  node2     rank       node2;wikidatatype  id                                         node1                        label  node2                    node2;wikidatatype\n",
      "Q65-P1082-02e70e-ea7734b4-0  Q65    P1082  +3792621  normal     quantity            Q65-P1082-02e70e-ea7734b4-0-P585-6e6a88-0  Q65-P1082-02e70e-ea7734b4-0  P585   ^2010-00-00T00:00:00Z/9  time\n",
      "Q65-P1082-2c08e1-12f0f95e-0  Q65    P1082  +5728     normal     quantity            Q65-P1082-2c08e1-12f0f95e-0-P585-4ab039-0  Q65-P1082-2c08e1-12f0f95e-0  P585   ^1870-00-00T00:00:00Z/9  time\n",
      "Q65-P1082-418d5a-b540356a-0  Q65    P1082  +319198   normal     quantity            Q65-P1082-418d5a-b540356a-0-P585-6efbd3-0  Q65-P1082-418d5a-b540356a-0  P585   ^1910-00-00T00:00:00Z/9  time\n",
      "Q65-P1082-808058-b69b4060-0  Q65    P1082  +50395    normal     quantity            Q65-P1082-808058-b69b4060-0-P585-b45c46-0  Q65-P1082-808058-b69b4060-0  P585   ^1890-00-00T00:00:00Z/9  time\n",
      "Q65-P1082-982d82-a3b6b816-0  Q65    P1082  +3976322  preferred  quantity            Q65-P1082-982d82-a3b6b816-0-P585-cd3f49-0  Q65-P1082-982d82-a3b6b816-0  P585   ^2016-00-00T00:00:00Z/9  time\n",
      "Q65-P1082-a403b5-8ac2d57f-0  Q65    P1082  +1610     normal     quantity            Q65-P1082-a403b5-8ac2d57f-0-P585-4b4a18-0  Q65-P1082-a403b5-8ac2d57f-0  P585   ^1850-00-00T00:00:00Z/9  time\n",
      "Q65-P1082-aa27be-2612ff2a-0  Q65    P1082  +102479   normal     quantity            Q65-P1082-aa27be-2612ff2a-0-P585-662d67-0  Q65-P1082-aa27be-2612ff2a-0  P585   ^1900-00-00T00:00:00Z/9  time\n",
      "Q65-P1082-b41a90-8b71e469-0  Q65    P1082  +11183    normal     quantity            Q65-P1082-b41a90-8b71e469-0-P585-211752-0  Q65-P1082-b41a90-8b71e469-0  P585   ^1880-00-00T00:00:00Z/9  time\n",
      "Q65-P1082-c0f75e-73a2c23f-0  Q65    P1082  +3990456  normal     quantity            Q65-P1082-c0f75e-73a2c23f-0-P585-364baf-0  Q65-P1082-c0f75e-73a2c23f-0  P585   ^2018-00-00T00:00:00Z/9  time\n",
      "Q65-P1082-d6a4d0-28aeb70a-0  Q65    P1082  +4385     normal     quantity            Q65-P1082-d6a4d0-28aeb70a-0-P585-a9ad71-0  Q65-P1082-d6a4d0-28aeb70a-0  P585   ^1860-00-00T00:00:00Z/9  time\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$CLAIMS\" -i \"$QUALIFIERS\" \\\n",
    "--match 'claim: (n1:Q65)-[l]-(n2), qual: (l)-[ql]->(qn2)' \\\n",
    "--limit 10 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$$kypher -i \"{wikidata_folder}/derived.P279.tsv.gz\" \\\n",
    "--match '(n1)-[]-()' \\\n",
    "--return 'count(distinct n1)' \\\n",
    "--limit 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$$kypher -i \"{wikidata_folder}/derived.P279star.tsv.gz\" -i \"{wikidata_folder}/labels.en.tsv.gz\" \\\n",
    "--match 'P279star: (n1)-[]-(:Q18518465), label: (n1)-[]->(label)' \\\n",
    "--return 'n1 as class, label as name' \\\n",
    "--limit 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'quals_time' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-60-0b2813d448e6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mquals_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'quals_time' is not defined"
     ]
    }
   ],
   "source": [
    "quals_time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "claims"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%env CLAIMS={claims}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%env CLAIMS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%env ST=\"/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%env kypher"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id\tnode1\tlabel\tnode2\tlang\n",
      "Q2860568-addl_wikipedia_sitelink-93c252-0\tQ2860568\taddl_wikipedia_sitelink\thttp://commonswiki.org/wiki/Category:Archives_of_American_Art\ten\n",
      "Q2860568-wikipedia_sitelink-0d01d2-0\tQ2860568\twikipedia_sitelink\thttp://es.wikipedia.org/wiki/Archivos_de_arte_estadounidense\tes\n",
      "Q2860568-wikipedia_sitelink-14b314-0\tQ2860568\twikipedia_sitelink\thttp://fr.wikipedia.org/wiki/Archives_of_American_Art\tfr\n",
      "Q2860568-wikipedia_sitelink-8e7449-0\tQ2860568\twikipedia_sitelink\thttp://ca.wikipedia.org/wiki/Arxius_d'Art_Americà\tca\n",
      "Q2860568-wikipedia_sitelink-9e4854-0\tQ2860568\twikipedia_sitelink\thttp://en.wikipedia.org/wiki/Archives_of_American_Art\ten\n",
      "Q2860568-wikipedia_sitelink-c1e42a-0\tQ2860568\twikipedia_sitelink\thttp://la.wikipedia.org/wiki/Tabulae_Artis_Americanae\tla\n",
      "Q2860568-wikipedia_sitelink-c68de4-0\tQ2860568\twikipedia_sitelink\thttp://pl.wikipedia.org/wiki/Archives_of_American_Art\tpl\n",
      "     2694.67 real      3603.61 user       285.78 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$SITELINKS\" \\\n",
    "--match '(n1:Q2860568)-[l]->(n2)' \\\n",
    "--limit 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i \"$SITELINKS\" \\\n",
    "--match '(n1)-[l {lang: \"en\"}]->(n2)' \\\n",
    "--return 'n2 as wikipedia, count(n1) as n1_count' \\\n",
    "--order-by 'n1_count desc' \\\n",
    "-o \"$TEMP\"/sitelinks.count.en.tsv.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i \"$SITELINKS\" \\\n",
    "--match '(n1)-[l:`wikipedia_sitelink` {lang: language}]->(n2)' \\\n",
    "--return 'n2 as wikipedia, count(n1) as n1_count' \\\n",
    "--order-by 'n1_count desc' \\\n",
    "-o \"$TEMP\"/sitelinks.count.tsv.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i \"$SITELINKS\" \\\n",
    "--match '(n1)-[l:`wikipedia_sitelink` {lang: language}]->(n2)' \\\n",
    "--return 'n1 as qnode, language, count(n2) as n1_count' \\\n",
    "--order-by 'n1_count desc' \\\n",
    "-o \"$TEMP\"/sitelinks.qnode.count.tsv.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     2872.01 real      1038.30 user       693.38 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$SITELINKS\" \\\n",
    "--match '(n1)-[l:`wikipedia_sitelink` {lang: language, label:lab}]->(n2), (l)-[:`sitelink-site`]->(site)' \\\n",
    "--return 'n1 as node1, lab as label, n2 as node2, language as language, site as site' \\\n",
    "-o \"$TEMP\"/sitelinks.wikipedia.tsv.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1\tlabel\tnode2\tlanguage\tsite\n",
      "Q1\twikipedia_sitelink\thttp://oc.wikipedia.org/wiki/Univèrs\toc\tocwiki\n",
      "Q1\twikipedia_sitelink\thttp://cdo.wikipedia.org/wiki/Ṳ̄-dêu\tcdo\tcdowiki\n",
      "Q1\twikipedia_sitelink\thttp://ml.wikipedia.org/wiki/പ്രപഞ്ചം\tml\tmlwiki\n",
      "Q1\twikipedia_sitelink\thttp://si.wikipedia.org/wiki/විශ්වය\tsi\tsiwiki\n",
      "Q1\twikipedia_sitelink\thttp://bxr.wikipedia.org/wiki/Оршолон\tbxr\tbxrwiki\n",
      "Q1\twikipedia_sitelink\thttp://jam.wikipedia.org/wiki/Yunivoers\tjam\tjamwiki\n",
      "Q1\twikipedia_sitelink\thttp://hr.wikipedia.org/wiki/Svemir\thr\thrwiki\n",
      "Q1\twikipedia_sitelink\thttp://chr.wikipedia.org/wiki/ᎦᎸᎶᎯ_ᎦᎸᎾᏗ\tchr\tchrwiki\n",
      "Q1\twikipedia_sitelink\thttp://pfl.wikipedia.org/wiki/Weltall\tpfl\tpflwiki\n",
      "zcat: error writing to output: Broken pipe\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/sitelinks.wikipedia.tsv.gz | head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "qnode\tlang\tn1_count\n",
      "Q13107716\t\t2\n",
      "Q14357839\t\t2\n",
      "Q15098140\t\t2\n",
      "Q15116966\t\t2\n",
      "Q15117218\t\t2\n",
      "Q15117391\t\t2\n",
      "Q15379728\t\t2\n",
      "Q15506579\t\t2\n",
      "Q16748603\t\t2\n",
      "Q16830095\t\t2\n",
      "Q17121869\t\t2\n",
      "Q17347205\t\t2\n",
      "Q17347215\t\t2\n",
      "Q17347224\t\t2\n",
      "Q17347230\t\t2\n",
      "Q20962109\t\t2\n",
      "Q21451097\t\t2\n",
      "Q25714577\t\t2\n",
      "Q26905045\t\t2\n",
      "Q26905108\t\t2\n",
      "Q4375196\t\t2\n",
      "Q48010913\t\t2\n",
      "Q4847311\t\t2\n",
      "Q5296\t\t2\n",
      "Q5453037\t\t2\n",
      "Q56528363\t\t2\n",
      "Q56528384\t\t2\n",
      "Q58832772\t\t2\n",
      "Q7253814\t\t2\n",
      "Q7348344\t\t2\n",
      "Q1\taf\t1\n",
      "Q1\tak\t1\n",
      "Q1\tals\t1\n",
      "Q1\tam\t1\n",
      "Q1\tan\t1\n",
      "Q1\tar\t1\n",
      "Q1\tarc\t1\n",
      "Q1\tarz\t1\n",
      "Q1\tas\t1\n",
      "Q1\tast\t1\n",
      "Q1\taz\t1\n",
      "Q1\tba\t1\n",
      "Q1\tbar\t1\n",
      "Q1\tbat-smg\t1\n",
      "Q1\tbe\t1\n",
      "Q1\tbe-x-old\t1\n",
      "Q1\tbg\t1\n",
      "Q1\tbh\t1\n",
      "Q1\tbn\t1\n",
      "Q1\tbr\t1\n",
      "Q1\tbs\t1\n",
      "Q1\tbxr\t1\n",
      "Q1\tca\t1\n",
      "Q1\tcdo\t1\n",
      "Q1\tce\t1\n",
      "Q1\tchr\t1\n",
      "Q1\tckb\t1\n",
      "Q1\tcs\t1\n",
      "Q1\tcsb\t1\n",
      "Q1\tcv\t1\n",
      "Q1\tcy\t1\n",
      "Q1\tda\t1\n",
      "Q1\tde\t1\n",
      "Q1\tdiq\t1\n",
      "Q1\tdsb\t1\n",
      "Q1\tel\t1\n",
      "Q1\ten\t1\n",
      "Q1\teo\t1\n",
      "Q1\tes\t1\n",
      "Q1\tet\t1\n",
      "Q1\teu\t1\n",
      "Q1\text\t1\n",
      "Q1\tfa\t1\n",
      "Q1\tfi\t1\n",
      "Q1\tfj\t1\n",
      "Q1\tfo\t1\n",
      "Q1\tfr\t1\n",
      "Q1\tfrr\t1\n",
      "Q1\tfy\t1\n",
      "Q1\tga\t1\n",
      "Q1\tgcr\t1\n",
      "Q1\tgl\t1\n",
      "Q1\tgn\t1\n",
      "Q1\tgu\t1\n",
      "Q1\thak\t1\n",
      "Q1\the\t1\n",
      "Q1\thi\t1\n",
      "Q1\thif\t1\n",
      "Q1\thr\t1\n",
      "Q1\tht\t1\n",
      "Q1\thu\t1\n",
      "Q1\thy\t1\n",
      "Q1\thyw\t1\n",
      "Q1\tia\t1\n",
      "Q1\tid\t1\n",
      "Q1\tilo\t1\n",
      "Q1\tinh\t1\n",
      "Q1\tio\t1\n",
      "Q1\tis\t1\n",
      "zcat: error writing to output: Broken pipe\n"
     ]
    }
   ],
   "source": [
    "!zcat < \"$TEMP\"/sitelinks.qnode.count.tsv.gz | head -100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "k = f\"{kgtk} query --graph-cache {cache_path}/wikidata.sqlite3.db\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i \"$CLAIMS\" -i \"$QUALIFIERS\" \\\n",
    "--match 'claims: (n1:Q30)-[l {label: property}]->(n2), qual: (l)-[q]->(t {wikidatatype: \"time\"})' \\\n",
    "--return 'distinct n1, property as label, n2 as node2, q.label as qualifier, kgtk_date_and_time(t) as time, l as id' \\\n",
    "--order-by 'n1, property, qualifier, time desc' \\\n",
    "--limit 100 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "kgtk query --debug --graph-cache /Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db \\\n",
    "-i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz\" \\\n",
    "-i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/qualifiers.tsv.gz\" \\\n",
    "--match 'claims: (n1:Q30)-[l {label: property}]->(n2), qual: (l)-[q]->(t {wikidatatype: \"time\"})' \\\n",
    "--return 'distinct n1, property as label, n2 as node2, q.label as qualifier, kgtk_date_and_time(t) as time, l as id' \\\n",
    "--order-by 'n1, property, qualifier, time desc' \\\n",
    "--limit 100 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher \\\n",
    "-i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz\" \\\n",
    "-i \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/qualifiers.tsv.gz\" \\\n",
    "--match 'claims: (n1:Q30)-[l {label: property}]->(n2), qual: (l)-[q]->(t {wikidatatype: \"time\"})' \\\n",
    "--return 'distinct n1, property as label, n2 as node2, q.label as qualifier, kgtk_date_and_time(t) as time' \\\n",
    "--order-by 'n1, property, qualifier, time desc' \\\n",
    "--limit 100 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i \"$CLAIMS\" -i \"$QUALIFIERS\" \\\n",
    "--match 'claims: (n1:Q1431229)-[l]->(n2)' \\\n",
    "--limit 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "quals_time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!zcat < {quals_time} | grep 'Q1431229'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ[\"PYWIKIBOT_DIR\"] = \"/Users/pedroszekely/Documents/GitHub/core\"\n",
    "os.environ[\"PYWIKIBOT_DIR\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pprint\n",
    "\n",
    "import pywikibot\n",
    "import pywikibot.data.api as api\n",
    "\n",
    "# Fetch the pageviews of one Wikidata item through the MediaWiki API.\n",
    "# Example of the equivalent URL (shown for Q42; this cell queries Q216916):\n",
    "# https://www.wikidata.org/w/api.php?action=query&titles=Q42&prop=pageviews\n",
    "\n",
    "site = pywikibot.Site(\"wikidata\", \"wikidata\")\n",
    "repo = site.data_repository()\n",
    "item = pywikibot.ItemPage(repo, \"Q216916\")\n",
    "\n",
    "query_parameters = {\n",
    "    \"action\": \"query\",\n",
    "    \"titles\": item,\n",
    "    \"prop\": \"pageviews\",\n",
    "}\n",
    "req = api.Request(site=site, parameters=query_parameters)\n",
    "\n",
    "# The response is keyed by page id (as a string) under query -> pages.\n",
    "response = req.submit()\n",
    "pages = response[\"query\"][\"pages\"]\n",
    "pprint.pprint(pages[str(item.pageid)][\"pageviews\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i \"$CLAIMS\" -o \"$TEMP\"/new.metadata.out_degree.tsv.gz \\\n",
    "--match '(n1)-[l]->()' \\\n",
    "--return 'distinct n1 as node1, count(l) as node2, \"out_degree\" as label' \\\n",
    "--order-by 'n1 desc'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i \"$CLAIMS\" -o \"$TEMP\"/new.metadata.in_degree.tsv.gz \\\n",
    "--match '()-[l]->(n2 {`wikidatatype`:\"wikibase-item\"})' \\\n",
    "--return 'distinct n2 as node1, count(distinct l) as node2, \"in_degree\" as label' \\\n",
    "--order-by 'n2'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db\n"
     ]
    }
   ],
   "source": [
    "!echo \"$kypher\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "label\tcount\tnode2\n",
      "P882\t2905\t'FIPS 6-4 (US counties)'@en\n",
      "P5736\t2904\t'Minor Planet Center body ID'@en\n",
      "P4683\t2866\t'National Gallery of Art artwork ID'@en\n",
      "P8286\t2864\t'Olympedia athlete ID'@en\n",
      "P698\t2862\t'PubMed ID'@en\n",
      "P7263\t2817\t'Prime Pages ID'@en\n",
      "P374\t2707\t'INSEE municipality code'@en\n",
      "P6018\t2654\t'SeaLifeBase ID'@en\n",
      "P4129\t2462\t'Cinema Treasures ID'@en\n",
      "P3064\t2330\t'LepIndex ID'@en\n",
      "P1415\t2301\t'Oxford Dictionary of National Biography ID'@en\n",
      "P815\t2226\t'ITIS TSN'@en\n",
      "P830\t2165\t'Encyclopedia of Life ID'@en\n",
      "P354\t2123\t'HGNC ID'@en\n",
      "P359\t2072\t'Rijksmonument ID'@en\n",
      "P7202\t2046\t'Belgian Species List ID'@en\n",
      "P1970\t2046\t'MovieMeter film ID'@en\n",
      "P7224\t2024\t'Insects (Insecta) of the World ID'@en\n",
      "P351\t1974\t'Entrez Gene ID'@en\n",
      "P4381\t1932\t'Soccerdonna player ID'@en\n",
      "P932\t1927\t'PMCID'@en\n",
      "P3151\t1911\t'iNaturalist taxon ID'@en\n",
      "P3138\t1830\t'OFDb ID'@en\n",
      "P5573\t1825\t'archINFORM location ID'@en\n",
      "P2603\t1821\t'Kinopoisk film ID'@en\n",
      "P2574\t1760\t'National-Football-Teams.com player ID'@en\n",
      "P8422\t1698\t'EHESS ID of a French commune'@en\n",
      "P2163\t1697\t'FAST ID'@en\n",
      "P772\t1671\t'INE municipality code'@en\n",
      "P2840\t1638\t'NSC number'@en\n",
      "P3143\t1595\t'elFilm film ID'@en\n",
      "P4327\t1512\t'BHL bibliography ID'@en\n",
      "P6736\t1494\t'Drobné památky ID'@en\n",
      "P2529\t1451\t'ČSFD film ID'@en\n",
      "P8351\t1423\t'vglist video game ID'@en\n",
      "P685\t1401\t'NCBI taxonomy ID'@en\n",
      "P5263\t1393\t'Czech NDOP taxon ID'@en\n",
      "P1600\t1343\t'Inventari del Patrimoni Arquitectònic de Catalunya code'@en\n",
      "P650\t1338\t'RKDartists ID'@en\n",
      "P2334\t1327\t'Swedish Film Database film ID'@en\n",
      "P838\t1302\t'BioLib taxon ID'@en\n",
      "P5739\t1300\t'Pontificia Università della Santa Croce ID'@en\n",
      "P3302\t1247\t'Open Media Database film ID'@en\n",
      "P2446\t1242\t'Transfermarkt player ID'@en\n",
      "P3844\t1221\t'Deutsche Synchronkartei film ID'@en\n",
      "P2605\t1201\t'ČSFD person ID'@en\n",
      "P1156\t1174\t'Scopus Source ID'@en\n",
      "P5731\t1155\t'Angelicum ID'@en\n",
      "P1225\t1139\t'U.S. National Archives Identifier'@en\n",
      "P5383\t1110\t'archINFORM project ID'@en\n",
      "       19.99 real         2.36 user         4.37 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$IDS\" -i \"$LABEL\" -i \"/Users/pedroszekely/Downloads/fips-large.tsv\" \\\n",
    "--match 'fips: (fips)-[]->(), external: (n1)-[l {label: p}]->(fips), label: (p)-[]->(p_label)' \\\n",
    "--return 'p, count(p) as count, p_label' \\\n",
    "--order-by 'count desc' \\\n",
    "--limit 50"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node1\tnode1\tnode2\tnode2\tnode1\n",
      "P882\tQ156168\t'FIPS 6-4 (US counties)'@en\t'Autauga County'@en\t\"01001\"\n",
      "P882\tQ156163\t'FIPS 6-4 (US counties)'@en\t'Baldwin County'@en\t\"01003\"\n",
      "P882\tQ109437\t'FIPS 6-4 (US counties)'@en\t'Barbour County'@en\t\"01005\"\n",
      "P882\tQ461204\t'FIPS 6-4 (US counties)'@en\t'Bibb County'@en\t\"01007\"\n",
      "P882\tQ111250\t'FIPS 6-4 (US counties)'@en\t'Blount County'@en\t\"01009\"\n",
      "P882\tQ111259\t'FIPS 6-4 (US counties)'@en\t'Bullock County'@en\t\"01011\"\n",
      "P882\tQ108871\t'FIPS 6-4 (US counties)'@en\t'Butler County'@en\t\"01013\"\n",
      "P882\tQ108856\t'FIPS 6-4 (US counties)'@en\t'Calhoun County'@en\t\"01015\"\n",
      "P882\tQ111280\t'FIPS 6-4 (US counties)'@en\t'Chambers County'@en\t\"01017\"\n",
      "P882\tQ108832\t'FIPS 6-4 (US counties)'@en\t'Cherokee County'@en\t\"01019\"\n",
      "P882\tQ111266\t'FIPS 6-4 (US counties)'@en\t'Chilton County'@en\t\"01021\"\n",
      "P882\tQ111254\t'FIPS 6-4 (US counties)'@en\t'Choctaw County'@en\t\"01023\"\n",
      "P882\tQ111273\t'FIPS 6-4 (US counties)'@en\t'Clarke County'@en\t\"01025\"\n",
      "P882\tQ156570\t'FIPS 6-4 (US counties)'@en\t'Clay County'@en\t\"01027\"\n",
      "P882\tQ327080\t'FIPS 6-4 (US counties)'@en\t'Cleburne County'@en\t\"01029\"\n",
      "P882\tQ485660\t'FIPS 6-4 (US counties)'@en\t'Coffee County'@en\t\"01031\"\n",
      "P882\tQ487731\t'FIPS 6-4 (US counties)'@en\t'Colbert County'@en\t\"01033\"\n",
      "P882\tQ487716\t'FIPS 6-4 (US counties)'@en\t'Conecuh County'@en\t\"01035\"\n",
      "P882\tQ487738\t'FIPS 6-4 (US counties)'@en\t'Coosa County'@en\t\"01037\"\n",
      "P882\tQ487725\t'FIPS 6-4 (US counties)'@en\t'Covington County'@en\t\"01039\"\n",
      "P882\tQ488831\t'FIPS 6-4 (US counties)'@en\t'Crenshaw County'@en\t\"01041\"\n",
      "P882\tQ188204\t'FIPS 6-4 (US counties)'@en\t'Cullman County'@en\t\"01043\"\n",
      "P882\tQ488840\t'FIPS 6-4 (US counties)'@en\t'Dale County'@en\t\"01045\"\n",
      "P882\tQ488847\t'FIPS 6-4 (US counties)'@en\t'Dallas County'@en\t\"01047\"\n",
      "P882\tQ494626\t'FIPS 6-4 (US counties)'@en\t'DeKalb County'@en\t\"01049\"\n",
      "P882\tQ494630\t'FIPS 6-4 (US counties)'@en\t'Elmore County'@en\t\"01051\"\n",
      "P882\tQ487744\t'FIPS 6-4 (US counties)'@en\t'Escambia County'@en\t\"01053\"\n",
      "P882\tQ493951\t'FIPS 6-4 (US counties)'@en\t'Etowah County'@en\t\"01055\"\n",
      "P882\tQ493957\t'FIPS 6-4 (US counties)'@en\t'Fayette County'@en\t\"01057\"\n",
      "P882\tQ488892\t'FIPS 6-4 (US counties)'@en\t'Franklin County'@en\t\"01059\"\n",
      "P882\tQ494620\t'FIPS 6-4 (US counties)'@en\t'Geneva County'@en\t\"01061\"\n",
      "P882\tQ493709\t'FIPS 6-4 (US counties)'@en\t'Greene County'@en\t\"01063\"\n",
      "P882\tQ501147\t'FIPS 6-4 (US counties)'@en\t'Hale County'@en\t\"01065\"\n",
      "P882\tQ501000\t'FIPS 6-4 (US counties)'@en\t'Henry County'@en\t\"01067\"\n",
      "P882\tQ496292\t'FIPS 6-4 (US counties)'@en\t'Houston County'@en\t\"01069\"\n",
      "P882\tQ366959\t'FIPS 6-4 (US counties)'@en\t'Jackson County'@en\t\"01071\"\n",
      "P882\tQ112271\t'FIPS 6-4 (US counties)'@en\t'Jefferson County'@en\t\"01073\"\n",
      "P882\tQ505317\t'FIPS 6-4 (US counties)'@en\t'Lamar County'@en\t\"01075\"\n",
      "P882\tQ261672\t'FIPS 6-4 (US counties)'@en\t'Lauderdale County'@en\t\"01077\"\n",
      "P882\tQ502737\t'FIPS 6-4 (US counties)'@en\t'Lawrence County'@en\t\"01079\"\n",
      "P882\tQ501055\t'FIPS 6-4 (US counties)'@en\t'Lee County'@en\t\"01081\"\n",
      "P882\tQ501108\t'FIPS 6-4 (US counties)'@en\t'Limestone County'@en\t\"01083\"\n",
      "P882\tQ503461\t'FIPS 6-4 (US counties)'@en\t'Lowndes County'@en\t\"01085\"\n",
      "P882\tQ502777\t'FIPS 6-4 (US counties)'@en\t'Macon County'@en\t\"01087\"\n",
      "P882\tQ493715\t'FIPS 6-4 (US counties)'@en\t'Madison County'@en\t\"01089\"\n",
      "P882\tQ501074\t'FIPS 6-4 (US counties)'@en\t'Marengo County'@en\t\"01091\"\n",
      "P882\tQ502739\t'FIPS 6-4 (US counties)'@en\t'Marion County'@en\t\"01093\"\n",
      "P882\tQ502925\t'FIPS 6-4 (US counties)'@en\t'Marshall County'@en\t\"01095\"\n",
      "P882\tQ495738\t'FIPS 6-4 (US counties)'@en\t'Mobile County'@en\t\"01097\"\n",
      "P882\tQ501060\t'FIPS 6-4 (US counties)'@en\t'Monroe County'@en\t\"01099\"\n",
      "P882\tQ502784\t'FIPS 6-4 (US counties)'@en\t'Montgomery County'@en\t\"01101\"\n",
      "P882\tQ137828\t'FIPS 6-4 (US counties)'@en\t'Morgan County'@en\t\"01103\"\n",
      "P882\tQ253538\t'FIPS 6-4 (US counties)'@en\t'Perry County'@en\t\"01105\"\n",
      "P882\tQ949766\t'FIPS 6-4 (US counties)'@en\t'Pickens County'@en\t\"01107\"\n",
      "P882\tQ492888\t'FIPS 6-4 (US counties)'@en\t'Pike County'@en\t\"01109\"\n",
      "P882\tQ502743\t'FIPS 6-4 (US counties)'@en\t'Randolph County'@en\t\"01111\"\n",
      "P882\tQ503329\t'FIPS 6-4 (US counties)'@en\t'Russell County'@en\t\"01113\"\n",
      "P882\tQ503451\t'FIPS 6-4 (US counties)'@en\t'St. Clair County'@en\t\"01115\"\n",
      "P882\tQ501084\t'FIPS 6-4 (US counties)'@en\t'Shelby County'@en\t\"01117\"\n",
      "P882\tQ501051\t'FIPS 6-4 (US counties)'@en\t'Sumter County'@en\t\"01119\"\n",
      "P882\tQ302918\t'FIPS 6-4 (US counties)'@en\t'Talladega County'@en\t\"01121\"\n",
      "P882\tQ512787\t'FIPS 6-4 (US counties)'@en\t'Tallapoosa County'@en\t\"01123\"\n",
      "P882\tQ503877\t'FIPS 6-4 (US counties)'@en\t'Tuscaloosa County'@en\t\"01125\"\n",
      "P882\tQ506291\t'FIPS 6-4 (US counties)'@en\t'Walker County'@en\t\"01127\"\n",
      "P882\tQ501157\t'FIPS 6-4 (US counties)'@en\t'Washington County'@en\t\"01129\"\n",
      "P882\tQ503081\t'FIPS 6-4 (US counties)'@en\t'Wilcox County'@en\t\"01131\"\n",
      "P882\tQ503088\t'FIPS 6-4 (US counties)'@en\t'Winston County'@en\t\"01133\"\n",
      "P882\tQ504371\t'FIPS 6-4 (US counties)'@en\t'Aleutians East Borough'@en\t\"02013\"\n",
      "P882\tQ185533\t'FIPS 6-4 (US counties)'@en\t'Aleutians West Census Area'@en\t\"02016\"\n",
      "P882\tQ39450\t'FIPS 6-4 (US counties)'@en\t'Anchorage'@en\t\"02020\"\n",
      "P882\tQ49297981\t'FIPS 6-4 (US counties)'@en\t'Anchorage Municipality'@en\t\"02020\"\n",
      "P882\tQ500312\t'FIPS 6-4 (US counties)'@en\t'Bethel Census Area'@en\t\"02050\"\n",
      "P882\tQ501130\t'FIPS 6-4 (US counties)'@en\t'Bristol Bay Borough'@en\t\"02060\"\n",
      "P882\tQ179950\t'FIPS 6-4 (US counties)'@en\t'Denali Borough'@en\t\"02068\"\n",
      "P882\tQ277728\t'FIPS 6-4 (US counties)'@en\t'Dillingham Census Area'@en\t\"02070\"\n",
      "P882\tQ512901\t'FIPS 6-4 (US counties)'@en\t'Fairbanks North Star Borough'@en\t\"02090\"\n",
      "P882\tQ512981\t'FIPS 6-4 (US counties)'@en\t'Haines Borough'@en\t\"02100\"\n",
      "P882\tQ500827\t'FIPS 6-4 (US counties)'@en\t'Hoonah–Angoon Census Area'@en\t\"02105\"\n",
      "P882\tQ29445\t'FIPS 6-4 (US counties)'@en\t'Juneau'@en\t\"02110\"\n",
      "P882\tQ512713\t'FIPS 6-4 (US counties)'@en\t'Kenai Peninsula Borough'@en\t\"02122\"\n",
      "P882\tQ506064\t'FIPS 6-4 (US counties)'@en\t'Ketchikan Gateway Borough'@en\t\"02130\"\n",
      "P882\tQ514093\t'FIPS 6-4 (US counties)'@en\t'Kodiak Island Borough'@en\t\"02150\"\n",
      "P882\tQ379474\t'FIPS 6-4 (US counties)'@en\t'Kusilvak Census Area'@en\t\"02158\"\n",
      "P882\tQ511679\t'FIPS 6-4 (US counties)'@en\t'Lake and Peninsula Borough'@en\t\"02164\"\n",
      "P882\tQ512925\t'FIPS 6-4 (US counties)'@en\t'Matanuska-Susitna Borough'@en\t\"02170\"\n",
      "P882\tQ503023\t'FIPS 6-4 (US counties)'@en\t'Nome Census Area'@en\t\"02180\"\n",
      "P882\tQ511806\t'FIPS 6-4 (US counties)'@en\t'North Slope Borough'@en\t\"02185\"\n",
      "P882\tQ511723\t'FIPS 6-4 (US counties)'@en\t'Northwest Arctic Borough'@en\t\"02188\"\n",
      "P882\tQ25408755\t'FIPS 6-4 (US counties)'@en\t'Petersburg Borough'@en\t\"02195\"\n",
      "P882\tQ503028\t'FIPS 6-4 (US counties)'@en\t'Petersburg Census Area'@en\t\"02195\"\n",
      "P882\tQ18120072\t'FIPS 6-4 (US counties)'@en\t'Prince of Wales–Hyder Census Area'@en\t\"02198\"\n",
      "P882\tQ79804\t'FIPS 6-4 (US counties)'@en\t'Sitka'@en\t\"02220\"\n",
      "P882\tQ615975\t'FIPS 6-4 (US counties)'@en\t'Skagway'@en\t\"02230\"\n",
      "P882\tQ500845\t'FIPS 6-4 (US counties)'@en\t'Southeast Fairbanks Census Area'@en\t\"02240\"\n",
      "P882\tQ508618\t'FIPS 6-4 (US counties)'@en\t'Valdez–Cordova Census Area'@en\t\"02261\"\n",
      "P882\tQ43983\t'FIPS 6-4 (US counties)'@en\t'Wrangell'@en\t\"02275\"\n",
      "P882\tQ487681\t'FIPS 6-4 (US counties)'@en\t'Yakutat'@en\t\"02282\"\n",
      "P882\tQ500818\t'FIPS 6-4 (US counties)'@en\t'Yukon–Koyukuk Census Area'@en\t\"02290\"\n",
      "P882\tQ58771\t'FIPS 6-4 (US counties)'@en\t'Apache County'@en\t\"04001\"\n",
      "P882\tQ58774\t'FIPS 6-4 (US counties)'@en\t'Cochise County'@en\t\"04003\"\n",
      "P882\tQ58684\t'FIPS 6-4 (US counties)'@en\t'Coconino County'@en\t\"04005\"\n",
      "P882\tQ58686\t'FIPS 6-4 (US counties)'@en\t'Gila County'@en\t\"04007\"\n",
      "P882\tQ58692\t'FIPS 6-4 (US counties)'@en\t'Graham County'@en\t\"04009\"\n",
      "P882\tQ58683\t'FIPS 6-4 (US counties)'@en\t'Greenlee County'@en\t\"04011\"\n",
      "P882\tQ58759\t'FIPS 6-4 (US counties)'@en\t'La Paz County'@en\t\"04012\"\n",
      "P882\tQ58691\t'FIPS 6-4 (US counties)'@en\t'Maricopa County'@en\t\"04013\"\n",
      "P882\tQ58696\t'FIPS 6-4 (US counties)'@en\t'Mohave County'@en\t\"04015\"\n",
      "P882\tQ58694\t'FIPS 6-4 (US counties)'@en\t'Navajo County'@en\t\"04017\"\n",
      "P882\tQ58688\t'FIPS 6-4 (US counties)'@en\t'Pima County'@en\t\"04019\"\n",
      "P882\tQ58712\t'FIPS 6-4 (US counties)'@en\t'Pinal County'@en\t\"04021\"\n",
      "P882\tQ58689\t'FIPS 6-4 (US counties)'@en\t'Santa Cruz County'@en\t\"04023\"\n",
      "P882\tQ58711\t'FIPS 6-4 (US counties)'@en\t'Yavapai County'@en\t\"04025\"\n",
      "P882\tQ58698\t'FIPS 6-4 (US counties)'@en\t'Yuma County'@en\t\"04027\"\n",
      "P882\tQ61414\t'FIPS 6-4 (US counties)'@en\t'Arkansas County'@en\t\"05001\"\n",
      "P882\tQ61026\t'FIPS 6-4 (US counties)'@en\t'Ashley County'@en\t\"05003\"\n",
      "P882\tQ61086\t'FIPS 6-4 (US counties)'@en\t'Baxter County'@en\t\"05005\"\n",
      "P882\tQ61020\t'FIPS 6-4 (US counties)'@en\t'Benton County'@en\t\"05007\"\n",
      "P882\tQ61010\t'FIPS 6-4 (US counties)'@en\t'Boone County'@en\t\"05009\"\n",
      "P882\tQ61024\t'FIPS 6-4 (US counties)'@en\t'Bradley County'@en\t\"05011\"\n",
      "P882\tQ61461\t'FIPS 6-4 (US counties)'@en\t'Calhoun County'@en\t\"05013\"\n",
      "P882\tQ61216\t'FIPS 6-4 (US counties)'@en\t'Carroll County'@en\t\"05015\"\n",
      "P882\tQ61458\t'FIPS 6-4 (US counties)'@en\t'Chicot County'@en\t\"05017\"\n",
      "P882\tQ61200\t'FIPS 6-4 (US counties)'@en\t'Clark County'@en\t\"05019\"\n",
      "P882\tQ61330\t'FIPS 6-4 (US counties)'@en\t'Clay County'@en\t\"05021\"\n",
      "P882\tQ61039\t'FIPS 6-4 (US counties)'@en\t'Cleburne County'@en\t\"05023\"\n",
      "P882\tQ61032\t'FIPS 6-4 (US counties)'@en\t'Cleveland County'@en\t\"05025\"\n",
      "P882\tQ61358\t'FIPS 6-4 (US counties)'@en\t'Columbia County'@en\t\"05027\"\n",
      "P882\tQ61352\t'FIPS 6-4 (US counties)'@en\t'Conway County'@en\t\"05029\"\n",
      "P882\tQ61354\t'FIPS 6-4 (US counties)'@en\t'Craighead County'@en\t\"05031\"\n",
      "P882\tQ61005\t'FIPS 6-4 (US counties)'@en\t'Crawford County'@en\t\"05033\"\n",
      "P882\tQ61346\t'FIPS 6-4 (US counties)'@en\t'Crittenden County'@en\t\"05035\"\n",
      "P882\tQ61036\t'FIPS 6-4 (US counties)'@en\t'Cross County'@en\t\"05037\"\n",
      "P882\tQ61012\t'FIPS 6-4 (US counties)'@en\t'Dallas County'@en\t\"05039\"\n",
      "P882\tQ61029\t'FIPS 6-4 (US counties)'@en\t'Desha County'@en\t\"05041\"\n",
      "P882\tQ61478\t'FIPS 6-4 (US counties)'@en\t'Drew County'@en\t\"05043\"\n",
      "P882\tQ61468\t'FIPS 6-4 (US counties)'@en\t'Faulkner County'@en\t\"05045\"\n",
      "P882\tQ61084\t'FIPS 6-4 (US counties)'@en\t'Franklin County'@en\t\"05047\"\n",
      "P882\tQ61007\t'FIPS 6-4 (US counties)'@en\t'Fulton County'@en\t\"05049\"\n",
      "P882\tQ61077\t'FIPS 6-4 (US counties)'@en\t'Garland County'@en\t\"05051\"\n",
      "        0.81 real         0.58 user         0.16 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$IDS\" -i \"$LABEL\" -i \"/Users/pedroszekely/Downloads/fips-sample.tsv\" \\\n",
    "--match 'fips: (fips)-[]->(), external: (n1)-[l:P882 {label: p}]->(fips), label: (p)-[]->(p_label), label: (n1)-[]->(q_label)' \\\n",
    "--return 'p, n1, p_label, q_label, fips' "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.external-id.tsv.gz\n"
     ]
    }
   ],
   "source": [
    "!echo \"$IDS\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ['IDS'] = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.external-id.tsv.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ['TIME'] = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.time.tsv.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        4.95 real         3.58 user         0.59 sys\n",
      "node1      node1;label                   class  node2;year\n",
      "Q11285199  'Ayusan'@en                   Q726   2010\n",
      "Q11290535  'Epiphaneia'@en               Q726   2010\n",
      "Q11296691  'Kizuna'@en                   Q726   2010\n",
      "Q11297901  'Kingsbarns'@en               Q726   2010\n",
      "Q11343357  'Meisho Mambo'@en             Q726   2010\n",
      "Q11350241  'Logotype'@en                 Q726   2010\n",
      "Q11351036  'Robe Tissage'@en             Q726   2010\n",
      "Q11576902  'Nao Tamura'@en               Q5     2010\n",
      "Q12495326  'Louis, Duke of Burgundy'@en  Q5     2010\n",
      "Q12516585  'Suhel Fahmi'@en              Q5     2010\n",
      "Q12981960  'Orb'@en                      Q726   2010\n",
      "Q13512747  'Vahideh Nazeri'@en           Q5     2010\n",
      "Q15052027  'Trêve'@en                    Q726   2010\n",
      "Q16335413  'Max Alan Shatto'@en          Q5     2010\n",
      "Q16515807  'Nami Havelková'@en           Q5     2010\n",
      "Q16727999  'Chinawoman'@en               Q5     2010\n",
      "Q16889222  'Oxbow'@en                    Q726   2010\n",
      "Q16950986  'Beholder'@en                 Q726   2010\n",
      "Q16963128  'Winsili'@en                  Q726   2010\n",
      "Q16971546  'Shamus Award'@en             Q726   2010\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$TIME\" -i \"$LABEL\" -i \"$ISA\" \\\n",
    "--match 'time: (n1)-[l:P569]->(n2), label: (n1)-[]->(lab), isa: (n1)-[]->(class)' \\\n",
    "--return 'n1 as node1, lab as `node1;label`, class as class, kgtk_date_year(n2) as `node2;year`' \\\n",
    "--where 'kgtk_date_year(n2) = 2010' \\\n",
    "--limit 20 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n",
      "Q16515807-P106-Q33999-285a55e8-0\tQ16515807\tP106\tQ33999\tnormal\twikibase-item\n",
      "Q16515807-P106-Q970153-f9c11847-0\tQ16515807\tP106\tQ970153\tnormal\twikibase-item\n",
      "Q16515807-P1477-69fe1d-f8504ec5-0\tQ16515807\tP1477\t'Natálie Miroslava Havelková'@cs\tnormal\tmonolingualtext\n",
      "Q16515807-P19-Q155993-9c796f27-0\tQ16515807\tP19\tQ155993\tnormal\twikibase-item\n",
      "Q16515807-P21-Q6581072-70378435-0\tQ16515807\tP21\tQ6581072\tnormal\twikibase-item\n",
      "Q16515807-P2605-8cb85f-9a0573db-0\tQ16515807\tP2605\t\"292876\"\tnormal\texternal-id\n",
      "Q16515807-P27-Q213-98d068e5-0\tQ16515807\tP27\tQ213\tnormal\twikibase-item\n",
      "Q16515807-P31-Q5-3aba8c99-0\tQ16515807\tP31\tQ5\tnormal\twikibase-item\n",
      "Q16515807-P569-42a69c-36932550-0\tQ16515807\tP569\t^2010-00-00T00:00:00Z/7\tnormal\ttime\n",
      "Q16515807-P735-Q28732407-65ef2f48-0\tQ16515807\tP735\tQ28732407\tnormal\twikibase-item\n",
      "Q16515807-P735-Q923005-d5e0f80d-0\tQ16515807\tP735\tQ923005\tnormal\twikibase-item\n",
      "        0.99 real         0.55 user         0.18 sys\n"
     ]
    }
   ],
   "source": [
    "!$kypher -i \"$CLAIMS\" \\\n",
    "--match '(n1:Q16515807)-[l]-(n2)'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "kgtk",
   "language": "python",
   "name": "kgtk"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}