{ "cells": [ { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [], "source": [ "# Parameters\n", "\n", "# Folder on local machine where to create the output and temporary folders\n", "output_path = \"/Users/pedroszekely/Downloads/kypher\"\n", "\n", "# The names of the output and temporary folders\n", "output_folder = \"wikidata_os_v5\"\n", "temp_folder = \"temp.wikidata_os_v5\"\n", "\n", "# The location of input Wikidata files\n", "wikidata_folder = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/\"\n", "wikidata_folder = \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/\"\n", "# The wikidata_os files can be downloaded from https://drive.google.com/drive/folders/1V6oAQKmwQ4LJnrBai-uv5gHWphFSCt50?usp=sharing\n", "\n", "# Location of the cache database for kypher\n", "cache_path = \"/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4\"\n", "\n", "# Whether to delete the cache database\n", "delete_database = False\n", "\n", "# shortcuts to commands\n", "kgtk = \"time kgtk --debug\"\n", "# kgtk = \"kgtk --debug\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# KGTK Tutorial\n", "\n", "Beer sites:\n", "- https://www.realbeer.com/edu/health/calories.php\n", "- http://getdrunknotfat.com/alcohol-content-of-beer/" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [], "source": [ "import io\n", "import os\n", "import subprocess\n", "import sys\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import altair as alt\n", "\n", "import papermill as pm" ] }, { "cell_type": "code", "execution_count": 157, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ALIAS: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/aliases.en.tsv.gz\"\n", "CLAIMS: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/claims.tsv.gz\"\n", "DESCRIPTION: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/descriptions.en.tsv.gz\"\n", "ISA: 
\"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.isa.tsv.gz\"\n", "ITEM: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/claims.wikibase-item.tsv.gz\"\n", "LABEL: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/labels.en.tsv.gz\"\n", "OUT: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v5\"\n", "P279: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.P279.tsv.gz\"\n", "P279STAR: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/derived.P279star.tsv.gz\"\n", "PROPERTY_DATATYPES: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/metadata.property.datatypes.tsv.gz\"\n", "QUALIFIERS: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/qualifiers.tsv.gz\"\n", "QUALIFIERS_TIME: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/qualifiers.time.tsv.gz\"\n", "SITELINKS: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/sitelinks.tsv.gz\"\n", "STORE: \"/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db\"\n", "TEMP: \"/Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5\"\n", "WIKIDATA: \"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/\"\n", "kgtk: \"time kgtk --debug\"\n", "kypher: \"time kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db\"\n" ] } ], "source": [ "# The names of files in the KGTK Wikidata distirbution that we will use in this notebook.\n", "file_names = {\n", " \"claims\": \"claims.tsv.gz\",\n", " \"label\": \"labels.en.tsv.gz\",\n", " \"alias\": \"aliases.en.tsv.gz\",\n", " \"description\": \"descriptions.en.tsv.gz\",\n", " \"item\": \"claims.wikibase-item.tsv.gz\",\n", " \"qualifiers\": \"qualifiers.tsv.gz\",\n", " \"sitelinks\": \"sitelinks.tsv.gz\",\n", " \"qualifiers_time\": \"qualifiers.time.tsv.gz\",\n", " \"property_datatypes\": \"metadata.property.datatypes.tsv.gz\",\n", " \"isa\": \"derived.isa.tsv.gz\",\n", " \"p279star\": \"derived.P279star.tsv.gz\",\n", " \"p279\": 
\"derived.P279.tsv.gz\"\n", "}\n", "\n", "# We will define environment variables to hold the full paths to the files as we will use them in the shell commands\n", "kgtk_environment_variables = []\n", "\n", "os.environ['WIKIDATA'] = wikidata_folder\n", "kgtk_environment_variables.append('WIKIDATA')\n", "\n", "for key, value in file_names.items():\n", " variable = key.upper()\n", " os.environ[variable] = wikidata_folder + value\n", " kgtk_environment_variables.append(variable)\n", " \n", "# KGTK creates a SQLite database to index the knowledge graph.\n", "if cache_path:\n", " os.environ['STORE'] = \"{}/wikidata.sqlite3.db\".format(cache_path)\n", "else:\n", " os.environ['STORE'] = \"{}/{}/wikidata.sqlite3.db\".format(output_path, temp_folder)\n", "kgtk_environment_variables.append('STORE')\n", "\n", "# We will create many temporary files, so set up a folder for outputs and one for the temporary files.\n", "os.environ['TEMP'] = \"{}/{}\".format(output_path, temp_folder) \n", "os.environ['OUT'] = \"{}/{}\".format(output_path, output_folder) \n", "kgtk_environment_variables.append('TEMP')\n", "kgtk_environment_variables.append('OUT')\n", "\n", "# Envronment variables with shortcuts to the commands we use often\n", "os.environ['kgtk'] = kgtk\n", "os.environ['kypher'] = \"time kgtk --debug query --graph-cache \" + os.environ['STORE']\n", "os.environ['kypher'] = \"time kgtk query --graph-cache \" + os.environ['STORE']\n", "\n", "\n", "kgtk_environment_variables.append('kgtk')\n", "kgtk_environment_variables.append('kypher')\n", "\n", "kgtk_environment_variables.sort()\n", "for variable in kgtk_environment_variables:\n", " print(\"{}: \\\"{}\\\"\".format(variable, os.environ[variable]))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/pedroszekely/Downloads/kypher\n" ] } ], "source": [ "%cd {output_path}" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": 
[], "source": [ "!mkdir -p {output_folder}\n", "!mkdir -p {temp_folder}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Wikidata in KGTK\n", "KGTK has the ability to import a Wikidata JSON dump and convert it to the KGTK representation to make it easy to process the full Wikidata KG on a laptop. There are 86 files which include all the information available in the Wikidata dump and files containing commonly used information derived from the dump. We partitioned the files because in most use cases you only need to use a subset of the files.\n", "\n", "The files are very large. `claims.tsv` (23GB compressed) contains all the statements in the Wikidata dump, `qualifiers.tsv` contains the qualifiers of those edges, and `labels.en.tsv`, `aliases.en.tsv` and `descriptions.en.tsv` contain the English labels, aliases and descriptions." ] }, { "cell_type": "code", "execution_count": 126, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-r--r-- 1 pedroszekely staff 68M Nov 16 08:07 /Users/pedroszekely/Downloads/kypher/wikidata_os_v1/aliases.en.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 4.7G Nov 16 08:05 /Users/pedroszekely/Downloads/kypher/wikidata_os_v1/claims.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 269M Nov 16 08:08 /Users/pedroszekely/Downloads/kypher/wikidata_os_v1/descriptions.en.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 376M Nov 16 08:06 /Users/pedroszekely/Downloads/kypher/wikidata_os_v1/labels.en.tsv.gz\n", "-rw-r--r-- 1 pedroszekely staff 662M Nov 16 08:43 /Users/pedroszekely/Downloads/kypher/wikidata_os_v1/qualifiers.tsv.gz\n" ] } ], "source": [ "!ls -lh \"$CLAIMS\" \"$QUALIFIERS\" \"$LABEL\" \"$ALIAS\" \"$DESCRIPTION\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`claims.tsv` contains many edges:" ] }, { "cell_type": "code", "execution_count": 7, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 254135077 1578463882 20285305033\n", "\n", "real\t1m19.657s\n", "user\t2m12.459s\n", "sys\t0m8.915s\n" ] } ], "source": [ "!time zcat < \"$CLAIMS\" | wc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# KGTK Data Model\n", "The KGTK data model is a generalization of RDF and property graphs, inspired by the Wikidata data model. In KGTK, a KG is represented using TSV files with four columns: three columns to store the subject, predicate and object of a triple, and a fourth column to store an identifier for the triple. By convention, we use the heading `id` for the identifier, `node1` for the subject, `node2` for the object and `label` for the predicate, as it labels the edge between `node1` and `node2`. The order of the columns is arbitrary.\n", "\n", "All KGTK files must include the required `id`, `node1`, `label` and `node2` columns, and can contain additional columns to store addtional information about an edge or the nodes in the edge. We will explain the details after we discuss *qualifiers*.\n", "Let's take a look at the first few lines of the `claims.tsv` file. We see the four required columns and two additional columns that the Wikidata import includes to facilitate processing of the `claims` file using custom scripts. The `rank` column records the Wikidata rank of a statement, and the `node2;wikidatatype` records the Wikidata type of the value in the `node2` column." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Claims" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "zcat: error writing to output: Broken pipe\n", "id node1 label node2 rank node2;wikidatatype\n", "P10-P1628-32b85d-7927ece6-0 P10 P1628 \"http://www.w3.org/2006/vcard/ns#Video\" normal url\n", "P10-P1628-acf60d-b8950832-0 P10 P1628 \"https://schema.org/video\" normal url\n", "P10-P1629-Q34508-bcc39400-0 P10 P1629 Q34508 normal wikibase-item\n", "P10-P1659-P1651-c4068028-0 P10 P1659 P1651 normal wikibase-property\n", "P10-P1659-P18-5e4b9c4f-0 P10 P1659 P18 normal wikibase-property\n", "P10-P1659-P4238-d21d1ac0-0 P10 P1659 P4238 normal wikibase-property\n", "P10-P1659-P51-86aca4c5-0 P10 P1659 P51 normal wikibase-property\n", "P10-P1855-Q15075950-7eff6d65-0 P10 P1855 Q15075950 normal wikibase-item\n", "P10-P1855-Q69063653-c8cdb04c-0 P10 P1855 Q69063653 normal wikibase-item\n" ] } ], "source": [ "!zcat < \"$CLAIMS\" | head | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Wikidata uses numbers to identify items and properties. We can use the `wd` utility (https://github.com/maxlath/wikibase-cli) to understand the first few lines. The second line states that the `P10` property in Wikidata has an equivalent property in another ontology. Notice that each edge has a distinct id. These ids are unique identifiers for statements (the format of the id can be arbitrary, but we assigned ids so that sorting files by id arranges the information so that all edges about a subject are consecutive)." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[90mid\u001b[39m P10\n", "\u001b[42mLabel\u001b[49m video\n", "\u001b[44mDescription\u001b[49m relevant video. For images, use the property P18. 
For film trailers, qualify with \"object has role\" (P3831)=\"trailer\" (Q622550)\n", "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39mWikidata property to link to Commons \u001b[90m(Q18610173)\u001b[39m\n", "\n", "\u001b[90mid\u001b[39m P1628\n", "\u001b[42mLabel\u001b[49m equivalent property\n", "\u001b[44mDescription\u001b[49m equivalent property in other ontologies (use in statements on properties, use property URI)\n", "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39mWikidata metaproperty for ontology mapping \u001b[90m(Q42842547)\u001b[39m\n", "\n", "\u001b[90mid\u001b[39m P1629\n", "\u001b[42mLabel\u001b[49m subject item of this property\n", "\u001b[44mDescription\u001b[49m relationship represented by the property\n", "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39mWikidata property for property documentation \u001b[90m(Q19820110)\u001b[39m\n" ] } ], "source": [ "!wd u P10 P1628 P1629" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's look at a more meaningful example. `Q31` (https://www.wikidata.org/wiki/Q31) is the Wikidata item about Belgium. We will use the KGTK query to fetch edges about Belgium. `$kypher` is a shortcut to the `kgtk query` command where in addition we pass in the location of the SQLite database we are using to store the files. KGTK queries use Cypher syntax (https://neo4j.com/developer/cypher/): the following simple query retrieves 10 edges where `node1` is `Q31`, the q-node for Belgium. The results include an edge with `label` `P1036` (Dewey Decimal Classification) and several edges with label `P1081` (human development index)." 
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2046.59 real 2931.96 user 189.44 sys\n", "id node1 label node2 rank node2;wikidatatype\n", "Q31-P1036-c4e1ad-df86eeb8-0 Q31 P1036 \"2--493\" normal external-id\n", "Q31-P1081-02c2ed-033524b0-0 Q31 P1081 +0.866 normal quantity\n", "Q31-P1081-02c2ed-7971505b-0 Q31 P1081 +0.866 normal quantity\n", "Q31-P1081-068470-c1c63b8d-0 Q31 P1081 +0.889 normal quantity\n", "Q31-P1081-068470-ddac01e0-0 Q31 P1081 +0.889 normal quantity\n", "Q31-P1081-144738-c1851cdc-0 Q31 P1081 +0.905 normal quantity\n", "Q31-P1081-175742-c07ac1c8-0 Q31 P1081 +0.888 normal quantity\n", "Q31-P1081-19636d-c08dd8a8-0 Q31 P1081 +0.896 normal quantity\n", "Q31-P1081-1efc03-433a7a4d-0 Q31 P1081 +0.913 normal quantity\n", "Q31-P1081-1f8602-ddac530d-0 Q31 P1081 +0.852 normal quantity\n" ] } ], "source": [ "!$kypher -i \"$CLAIMS\" \\\n", "--match '(:Q31)-[]-()' \\\n", "--limit 10 \\\n", "| column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The output of the command above is hard to read because we are seeing the numeric Wikidata identifiers. To make the output more readable, we need to look up the labels of the Wikidata nodes. This information is in the `labels.en.tsv` file." 
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "zcat: error writing to output: Broken pipe\n", "id node1 label node2\n", "P10-label-en P10 label 'video'@en\n", "P1000-label-en P1000 label 'record held'@en\n", "P1001-label-en P1001 label 'applies to jurisdiction'@en\n", "P1002-label-en P1002 label 'engine configuration'@en\n", "P1003-label-en P1003 label 'National Library of Romania ID'@en\n", "P1004-label-en P1004 label 'MusicBrainz place ID'@en\n", "P1005-label-en P1005 label 'Portuguese National Library ID'@en\n", "P1006-label-en P1006 label 'Nationale Thesaurus voor Auteurs ID'@en\n", "P1007-label-en P1007 label 'Lattes Platform number'@en\n" ] } ], "source": [ "!zcat < \"$LABEL\" | head | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "KGTK accepts multiple files as input and can do a join to retrieve the label for each property. When using multiple files, it is necessary to tag each clause with the file that provides the data for the clause. For example, the first clause is tagged with `claim` as the word `claim` is part of the file name. The variable `property` is used to connect the two clauses." 
] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 629.00 real 531.54 user 102.50 sys\n", "id node1 label node2 label;label\n", "Q31-P1036-c4e1ad-df86eeb8-0 Q31 P1036 \"2--493\" 'Dewey Decimal Classification'@en\n", "Q31-P1081-02c2ed-033524b0-0 Q31 P1081 +0.866 'Human Development Index'@en\n", "Q31-P1081-02c2ed-7971505b-0 Q31 P1081 +0.866 'Human Development Index'@en\n", "Q31-P1081-068470-c1c63b8d-0 Q31 P1081 +0.889 'Human Development Index'@en\n", "Q31-P1081-068470-ddac01e0-0 Q31 P1081 +0.889 'Human Development Index'@en\n", "Q31-P1081-144738-c1851cdc-0 Q31 P1081 +0.905 'Human Development Index'@en\n", "Q31-P1081-175742-c07ac1c8-0 Q31 P1081 +0.888 'Human Development Index'@en\n", "Q31-P1081-19636d-c08dd8a8-0 Q31 P1081 +0.896 'Human Development Index'@en\n", "Q31-P1081-1efc03-433a7a4d-0 Q31 P1081 +0.913 'Human Development Index'@en\n", "Q31-P1081-1f8602-ddac530d-0 Q31 P1081 +0.852 'Human Development Index'@en\n" ] } ], "source": [ "!$kypher -i \"$CLAIMS\" -i \"$LABEL\" \\\n", "--match 'claim: (n1:Q31)-[l {label: property}]-(n2), label: (property)-[:label]->(property_label)' \\\n", "--return 'l as id, n1 as node1, property as label, n2 as node2, property_label as `label;label`' \\\n", "--limit 10 \\\n", "| column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's look at the heads of state of Belgium recorded in property `P35`:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 687.06 real 391.17 user 144.53 sys\n", "id node1 label node2 node2;label\n", "Q31-P35-Q1079522-c82ed584-0 Q31 P35 Q1079522 'Erasme Louis Surlet de Chokier'@en\n", "Q31-P35-Q12967-f2b9aaf3-0 Q31 P35 Q12967 'Leopold II of Belgium'@en\n", "Q31-P35-Q12971-2088471b-0 Q31 P35 Q12971 'Leopold I of Belgium'@en\n", "Q31-P35-Q12973-31c1b700-0 Q31 P35 Q12973 'Leopold III of Belgium'@en\n", 
"Q31-P35-Q12976-f3e8a567-0 Q31 P35 Q12976 'Baudouin I of Belgium'@en\n", "Q31-P35-Q155004-619ba603-0 Q31 P35 Q155004 'Philippe I of Belgium'@en\n", "Q31-P35-Q3911-137f01fe-0 Q31 P35 Q3911 'Albert II of Belgium'@en\n", "Q31-P35-Q445553-7599749f-0 Q31 P35 Q445553 'Prince Charles, Count of Flanders'@en\n", "Q31-P35-Q55008046-725dce40-0 Q31 P35 Q55008046 'Albert I of Belgium'@en\n" ] } ], "source": [ "!$kypher -i \"$CLAIMS\" -i \"$LABEL\" \\\n", "--match 'claims: (n1:Q31)-[l:P35]->(n2), labels: (n2)-[:label]->(n2_label)' \\\n", "--return 'l as id, n1 as node1, l.label as label, n2 as node2, n2_label as `node2;label`' \\\n", "--limit 10 \\\n", "| column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Qualifiers\n", "Qualifiers provide additional information about the claims stated in the edges. For `P1081` the qualifiers tell use the year, and for head of state the qualifiers provide information about the period of time and position held by the head of state. The qualifiers can be retrieved using the identifiers of the edges. Let's retrieve the qualifiers associated with the edge for the first head of state (Erasme Louis). To do so, we use the identifier of the edge (`Q31-P35-Q1079522-c82ed584-0`) as `node1` in the `qualifiers.tsv` file. We get three edges, meaning that the edge `Q31/P35/Q1079522` has three qualifiers. 
Note that the qualifier edges are the same as any other edge in KGTK, having `id`, `node1`, `label` and `node2` columns:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 407.47 real 576.47 user 28.22 sys\n", "id node1 label node2 node2;wikidatatype\n", "Q31-P35-Q1079522-c82ed584-0-P39-Q477406-0 Q31-P35-Q1079522-c82ed584-0 P39 Q477406 wikibase-item\n", "Q31-P35-Q1079522-c82ed584-0-P580-106076-0 Q31-P35-Q1079522-c82ed584-0 P580 ^1831-02-25T00:00:00Z/11 time\n", "Q31-P35-Q1079522-c82ed584-0-P582-774519-0 Q31-P35-Q1079522-c82ed584-0 P582 ^1831-07-20T00:00:00Z/11 time\n" ] } ], "source": [ "!$kypher -i \"$QUALIFIERS\" \\\n", "--match '(n1:`Q31-P35-Q1079522-c82ed584-0`)-[l]->(n2)' \\\n", "--limit 10 \\\n", "| column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's make them readable: the following query combines the patterns of the previous two queries to retrieve the labels of the property and node2. The query omits the identifier of the qualifier edges to save space. Also, the headers of the two additional columns can be arbitrary, i.e., you can name them whatever you want; the names used follow a KGTK convention that enables KGTK to automatically parse the output, which is useful if we want to use the output as an input to another KGTK command. The word before the `;` refers to one of the standard columns, and the name after the `;` refers to a property of that element. In this example, we used `label` as the column contains the label of the entity." 
] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 52.73 real 28.95 user 9.37 sys\n", "node1 label node2 label;label\n", "Q31-P35-Q1079522-c82ed584-0 P39 Q477406 'position held'@en\n", "Q31-P35-Q1079522-c82ed584-0 P580 ^1831-02-25T00:00:00Z/11 'start time'@en\n", "Q31-P35-Q1079522-c82ed584-0 P582 ^1831-07-20T00:00:00Z/11 'end time'@en\n" ] } ], "source": [ "!$kypher -i \"$QUALIFIERS\" -i \"$LABEL\" \\\n", "--match 'qual: (n1:`Q31-P35-Q1079522-c82ed584-0`)-[l {label: property}]->(n2), labels: (property)-[:label]->(property_label)' \\\n", "--return 'n1 as node1, property as label, n2 as node2, property_label as `label;label`' \\\n", "--limit 10 \\\n", "| column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's put all the values of `P35` in a file, which we will conveniently name `Q31.P35.tsv`" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.88 real 0.57 user 0.16 sys\n" ] } ], "source": [ "!$kypher -i \"$CLAIMS\" \\\n", "--match '(n1:Q31)-[l:P35]->(n2)' \\\n", "--return 'l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q31.P35.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we are going to combine the `P35` edges of Belgium with the qualifiers. To do this we will run a query that uses the edges that we stored in `Q31.P35.tsv`, and retrieve the qualifiers for each of those edges; the result of our query will be the qualifier edges of the head of state edges. To union the qualifier edges with the claim edges, we feed the output of the query to the `cat` command (concatenate), and then feed the output to the `sort2` command to sort the edges. The first 12 edges are shown below. 
We see a claim edge followed by the qualifiers defined for it.\n", "\n", "This snippet illustrates that KGTK commands can be chained using the `/` chain operator to compose more complex workflows." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id node1 label node2\n", "Q31-P35-Q1079522-c82ed584-0 Q31 P35 Q1079522\n", "Q31-P35-Q1079522-c82ed584-0-P39-Q477406-0 Q31-P35-Q1079522-c82ed584-0 P39 Q477406\n", "Q31-P35-Q1079522-c82ed584-0-P580-106076-0 Q31-P35-Q1079522-c82ed584-0 P580 ^1831-02-25T00:00:00Z/11\n", "Q31-P35-Q1079522-c82ed584-0-P582-774519-0 Q31-P35-Q1079522-c82ed584-0 P582 ^1831-07-20T00:00:00Z/11\n", "Q31-P35-Q12967-f2b9aaf3-0 Q31 P35 Q12967\n", "Q31-P35-Q12967-f2b9aaf3-0-P39-Q13592862-0 Q31-P35-Q12967-f2b9aaf3-0 P39 Q13592862\n", "Q31-P35-Q12967-f2b9aaf3-0-P580-f29037-0 Q31-P35-Q12967-f2b9aaf3-0 P580 ^1865-12-17T00:00:00Z/11\n", "Q31-P35-Q12967-f2b9aaf3-0-P582-136f02-0 Q31-P35-Q12967-f2b9aaf3-0 P582 ^1909-12-17T00:00:00Z/11\n", "Q31-P35-Q12971-2088471b-0 Q31 P35 Q12971\n", "Q31-P35-Q12971-2088471b-0-P39-Q13592862-0 Q31-P35-Q12971-2088471b-0 P39 Q13592862\n", "Q31-P35-Q12971-2088471b-0-P580-a35d41-0 Q31-P35-Q12971-2088471b-0 P580 ^1831-06-04T00:00:00Z/11\n", " 1.61 real 2.27 user 0.55 sys\n" ] } ], "source": [ "!$kypher -i \"$QUALIFIERS\" -i \"$TEMP\"/Q31.P35.tsv \\\n", "--match 'P35: ()-[l]->(), qual: (l)-[lq]->(n2)' \\\n", "--return 'lq as id, l as node1, lq.label as label, n2 as node2' \\\n", "/ cat -i - -i \"$TEMP\"/Q31.P35.tsv \\\n", "/ sort2 \\\n", "| head -12 \\\n", "| column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Summary\n", "\n", "- KGTK represents graphs in TSV files with standard columns `id`, `node1`, `label` and `node2`\n", "- It is possible to include arbitrary additional columns in KGTK files\n", "- The identifier of an edge can be used as a node in another edge enabling the representation of edges about edges\n", 
"- KGTK provides a powerful query command based on Cypher as well as a host of other commands, type `kgtk --help` to see the list of commands." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Use Case: A Knowledge Graph About Alocholic Beverages\n", "We are going to build a small KG about alcoholoc beverages by extracting from Wikidata the subgraph that relates to alcoholic beverages (https://www.wikidata.org/wiki/Q154)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 1: create a list of all descendants of `alcoholic beverage` (https://www.wikidata.org/wiki/Q154)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[90mid\u001b[39m Q154\n", "\u001b[42mLabel\u001b[49m alcoholic beverage\n", "\u001b[44mDescription\u001b[49m drink containing alcohols, typically ethanol\n", "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39m drug \u001b[90m(Q8386)\u001b[39m | carcinogen \u001b[90m(Q187661)\u001b[39m\n", "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mdrink \u001b[90m(Q40050)\u001b[39m\n" ] } ], "source": [ "!wd u Q154" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Wikidata uses two properties to organize entities in a hierarchy: the `instance of` property (`P31`) and the `subclass of` (`P279`) property. In many cases, the distinction between instance of and subclass of is subtle, and we find many situations in Wikidata where either one or the other is used to organize hierarchies. 
For this reason, we created a new property called `isa` that contains the union of `P31` and `P279` and stored it in the file `derived.isa.tsv`" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\n", "P10\tisa\tQ18610173\n", "P1000\tisa\tQ18608871\n", "P1001\tisa\tQ15720608\n", "P1001\tisa\tQ22984026\n", "zcat: error writing to output: Broken pipe\n" ] } ], "source": [ "!zcat < \"$ISA\" | head -5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To get all the alcoholic beverages, we need to get all entities that are `isa` of alcoholic beverage (`Q154`) or that are `isa` of any descendant of `Q154` in the `subclass of` (`P279`) hierarchy. The length of the chain of `P279` edges can be arbitrarily long. To support this use case, KGTK offers the `derived.P279star.tsv` file that contains edges `n1/P279star/n2` if `n1` is a descendant of `n2` on chains of `P279` edges, including chains of zero length (`n1/P279star/n1`)." 
] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1 label node2 id\n", "zcat: Q1000032 P279star Q1000032 Q1000032-P279star-Q1000032-0000\n", "Q1000032 P279star Q1150070 Q1000032-P279star-Q1150070-0000\n", "Q1000032 P279star Q1190554 Q1000032-P279star-Q1190554-0000\n", "Q1000032 P279star Q133500 Q1000032-P279star-Q133500-0000\n", "error writing to output: Broken pipe\n" ] } ], "source": [ "!zcat < \"$P279STAR\" | head -5 | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To get all alcoholic beverages, we need to find all nodes `n1` that are connected to `Q154` with an `isa` edge and a chain of `P279` edges:" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 285.63 real 381.88 user 22.08 sys\n" ] } ], "source": [ "!$kypher -i \"$ISA\" -i \"$P279STAR\" -i \"$LABEL\" \\\n", "--match 'isa: (n1)-[]->(n2), star: (n2)-[]->(n3:Q154), label: (n1)-[]->(n1l)' \\\n", "--return 'n1 as node1, n1l as `node1;label`, n3 as node2, \"isastar\" as label' \\\n", "-o \"$TEMP\"/Q154.descendant.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is a sample of alcoholic beverages in Wikidata" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1 node1;label node2 label\n", "Q1350656 'Corn whiskey'@en Q154 isastar\n", "Q20713240 'Buckwheat whisky'@en Q154 isastar\n", "Q2535077 'Rye Whiskey'@en Q154 isastar\n", "Q536976 'Canadian whisky'@en Q154 isastar\n", "Q7991845 'Wheat whiskey'@en Q154 isastar\n", "Q10429117 'Beyaz'@en Q154 isastar\n", "Q1069954 'Prosecco'@en Q154 isastar\n", "Q1094850 'Clairette du Languedoc'@en Q154 isastar\n", "Q1135592 'Cortese di Gavi'@en Q154 isastar\n" ] } ], "source": [ "!head \"$TEMP\"/Q154.descendant.tsv | column -t -s $'\\t'" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "An the total number:" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 3251 16116 133341 /Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/Q154.descendant.tsv\n" ] } ], "source": [ "!wc \"$TEMP\"/Q154.descendant.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The computation of `Q154.descendant.tsv` can be implemented in SPARQL using the common `P31/P279*` graph pattern, but the query will time out if the result size is large. For example, the query will time out when requesting all descendants of chemical compounds, as there are over one million chemical compounds in Wikidata. The query can be easily done in KGTK." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 2: get the incoming and outgoing edges\n", "We want out graph to have the neighbors of all alcoholic beverages, so we need to get the incoming and outgoing edges.\n", "\n", "The following query gets the outgoing edges." 
] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2.35 real 0.84 user 0.41 sys\n" ] } ], "source": [ "!$kypher -i \"$CLAIMS\" -i \"$TEMP\"/Q154.descendant.tsv \\\n", "--match 'Q154: (n1)-[]->(), claims: (n1)-[l]->(n2)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.node1.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see that we are getting several properties for our items:" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id node1 label node2\n", "Q1000737-P1435-Q17297633-53903946-0 Q1000737 P1435 Q17297633\n", "Q1000737-P1454-Q460178-8ad4931b-0 Q1000737 P1454 Q460178\n", "Q1000737-P159-Q16003-31e24011-0 Q1000737 P159 Q16003\n", "Q1000737-P17-Q183-24107fe2-0 Q1000737 P17 Q183\n", "Q1000737-P18-147fc9-667304f8-0 Q1000737 P18 \"Marthabräuhalle 2011-04-03.jpg\"\n", "Q1000737-P31-Q131734-f97bd6f6-0 Q1000737 P31 Q131734\n", "Q1000737-P31-Q15075508-a4c83928-0 Q1000737 P31 Q15075508\n", "Q1000737-P373-689157-3110aade-0 Q1000737 P373 \"Marthabräu\"\n", "Q1000737-P452-Q869095-f5d8e7a2-0 Q1000737 P452 Q869095\n", "zcat: error writing to output: Broken pipe\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.node1.tsv.gz | head | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now get the incoming edges:" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2.00 real 0.75 user 0.36 sys\n" ] } ], "source": [ "!$kypher -i \"$CLAIMS\" -i \"$TEMP\"/Q154.descendant.tsv \\\n", "--match 'Q154: (n1)-[]->(), claims: (n3)-[l]->(n1)' \\\n", "--return 'distinct l as id, n3 as node1, l.label as label, n1 as node2' \\\n", "-o \"$TEMP\"/Q154.node2.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is a 
sample of the edges we are getting" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "zcat: id node1 label node2\n", "Q1350656-P279-Q1007164-7e3ecba9-0 Q1350656 P279 Q1007164\n", "error writing to outputQ20713240-P279-Q1007164-b3112260-0 Q20713240 P279 Q1007164\n", ": Q2535077-P279-Q1007164-b2d3684b-0 Q2535077 P279 Q1007164\n", "Broken pipe\n", "Q536976-P279-Q1007164-8bf7467b-0 Q536976 P279 Q1007164\n", "Q7991845-P279-Q1007164-18bc383a-0 Q7991845 P279 Q1007164\n", "Q10337004-P186-Q10210-c56dd7ce-0 Q10337004 P186 Q10210\n", "Q10429117-P31-Q10210-d342f061-0 Q10429117 P31 Q10210\n", "Q1051699-P279-Q10210-65d32c67-0 Q1051699 P279 Q10210\n", "Q1058259-P279-Q10210-e204554a-0 Q1058259 P279 Q10210\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.node2.tsv.gz | head | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate the incoming and outgoing edges to put them in a single file:" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.96 real 0.84 user 0.11 sys\n" ] } ], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.node1.tsv.gz -i \"$TEMP\"/Q154.node2.tsv.gz -o \"$TEMP\"/Q154.claims.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have over 28,000 edges:" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 28142 116045 1584824\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.claims.tsv.gz | wc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Summary of where we are:\n", "- Computed the list of entities below alcoholic beverage\n", "- Found all incoming and outgoing edges to these entities; for the new entities we bring in, we have no information, we only have the q-node\n", "\n", "Not having any information about the entities connected to the alcoholic 
beverages is limiting, so let's get their outgoing edges. We run the query with `Q154.claims.tsv.gz` which will use all the entities in our graph, including the alcoholic beverages for which we already got outgoing edges; no harm done, as we can eliminate duplicates later." ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 5.27 real 3.61 user 0.51 sys\n" ] } ], "source": [ "!$kypher -i \"$CLAIMS\" -i \"$TEMP\"/Q154.claims.tsv.gz \\\n", "--match 'Q154: ()-[]->(n1), claims: (n1)-[l]->(n2)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.hop.out.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As a sanity check, let's take a peek:" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id node1 label node2\n", "Q1000-P1036-9bef62-f77ac5cf-0 Q1000 P1036 \"2--6721\"\n", "Q1000-P1081-0d345f-3a33abf5-0 Q1000 P1081 +0.641\n", "Q1000-P1081-0d345f-6da37c02-0 Q1000 P1081 +0.641\n", "Q1000-P1081-1100e3-c7631769-0 Q1000 P1081 +0.624\n", "Q1000-P1081-1ada51-7c71c229-0 Q1000 P1081 +0.639\n", "Q1000-P1081-345681-88a99cab-0 Q1000 P1081 +0.702\n", "Q1000-P1081-347db1-da0e5e03-0 Q1000 P1081 +0.637\n", "Q1000-P1081-419245-b03a8b59-0 Q1000 P1081 +0.647\n", "Q1000-P1081-419245-f8cd58e8-0 Q1000 P1081 +0.647\n", "zcat: error writing to output: Broken pipe\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.hop.out.tsv.gz | head | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's consolidate our edge files into one larger file. 
We use compact to remove duplicates and sort to keep edges for the same subject together:" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 4.65 real 6.28 user 0.63 sys\n" ] } ], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.claims.tsv.gz -i \"$TEMP\"/Q154.hop.out.tsv.gz \\\n", "/ compact \\\n", "/ sort2 \\\n", "-o \"$TEMP\"/Q154.edges.1.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we have over 165,000 edges:" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 165133 678398 8868474\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.edges.1.tsv.gz | wc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Take a peek:" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id node1 label node2\n", "P1389-P1855-Q1109662-9e2ef218-0 P1389 P1855 Q1109662\n", "P1582-P1855-Q17329207-f4ef508d-0 P1582 P1855 Q17329207\n", "P2581-P1855-Q7639844-08b3a4c7-0 P2581 P1855 Q7639844\n", "P2665-P1855-Q1067702-402a80a9-0 P2665 P1855 Q1067702\n", "P2665-P1855-Q170210-30d44f0b-0 P2665 P1855 Q170210\n", "P5420-P1855-Q44-209cffb1-0 P5420 P1855 Q44\n", "P5420-P1855-Q722338-73d7be75-0 P5420 P1855 Q722338\n", "P6088-P1855-Q1543214-3d934541-0 P6088 P1855 Q1543214\n", "P6088-P1855-Q4626-4ed65964-0 P6088 P1855 Q4626\n", "zcat: error writing to output: Broken pipe\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.edges.1.tsv.gz | head | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Once we have all the alcoholic beverages, we want to get the upper ontology of all the classes used, so that every class in our KG has a path to the root of the ontology. 
For example, first go to `drink` (`Q40050`), then to `liquid` (`Q11435`), then `fluid` (`Q102205`) and so on until we reach `entity` (`Q35120`).\n", "\n", "To do this, we need to get all the `isa` of all items in our graph, then get `P279star` so we get the list of all classes that these items descend from. Finally we need to get all the `P279` edges between them." ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 12.28 real 9.14 user 0.95 sys\n" ] } ], "source": [ "!$kypher -i \"$TEMP\"/Q154.edges.1.tsv.gz -i \"$P279STAR\" -i \"$ISA\" \\\n", "--match 'Q154: (n1)-[]->(), isa: (n1)-[]->(n2), P279: (n2)-[]->(class)' \\\n", "--return 'distinct class as node1' \\\n", "-o \"$TEMP\"/Q154.classes.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have almost 3,000 classes in the upper ontology for the entities in our graph:" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2846 2846 24939 /Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/Q154.classes.tsv\n" ] } ], "source": [ "!wc \"$TEMP\"/Q154.classes.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now use the `derived.P279.tsv` file to get the `P279` edges that connect a class to its superclass." 
] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 4.16 real 5.91 user 0.34 sys\n" ] } ], "source": [ "!$kypher -i \"$TEMP\"/Q154.classes.tsv -i \"$P279\" \\\n", "--match 'Q154: (class)-[]->(), P279: (class)-[l]->(super)' \\\n", "--return 'distinct l as id, class as node1, l.label as label, super as node2' \\\n", "-o \"$TEMP\"/Q154.P279.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We get close to 5,000 `P279` edges in the upper ontology; we will take care of potential duplicates at a final cleanup step:" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 4517 18068 249492 /Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/Q154.P279.tsv\n" ] } ], "source": [ "!wc \"$TEMP\"/Q154.P279.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see several q-nodes below `entity` (`Q35120`), a good indication that we computed the upper ontology correctly:" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Q16686448-P279-Q35120-674edbf9-0 Q16686448 P279 Q35120\n", "Q35120-P279-25b964-0520e300-0 Q35120 P279 novalue\n", "Q58415929-P279-Q35120-75659d0c-0 Q58415929 P279 Q35120\n", "Q23958946-P279-Q35120-70a9ed90-0 Q23958946 P279 Q35120\n", "Q488383-P279-Q35120-5fad2ad7-0 Q488383 P279 Q35120\n" ] } ], "source": [ "!grep Q35120 \"$TEMP\"/Q154.P279.tsv | head -5 | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's consolidate the edges again:" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 4.41 real 5.94 user 0.59 sys\n" ] } ], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.edges.1.tsv.gz -i \"$TEMP\"/Q154.P279.tsv \\\n", "/ compact \\\n", "/ sort2 \\\n", "-o 
\"$TEMP\"/Q154.edges.2.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have over 175,000 edges:" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 169047 694054 9085731\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.edges.2.tsv.gz | wc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Summary:\n", "- We have the instances of alcoholic beverages\n", "- We added incoming and outgoing edges\n", "- For the outgoing edges, we went one hop forward\n", "- We got the upper ontology\n", "\n", "The properties are also items in Wikidata, so let's collect them all and get their edges." ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1.53 real 1.83 user 0.19 sys\n" ] } ], "source": [ "!$kypher -i \"$TEMP\"/Q154.edges.2.tsv.gz \\\n", "--match '()-[l {label: property}]->()' \\\n", "--return 'distinct property as node1' \\\n", "-o \"$TEMP\"/Q154.properties.tsv" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\n", "P10\n", "P1001\n", "P1003\n", "P1004\n", "P1005\n", "P1006\n", "P101\n", "P1014\n", "P1015\n" ] } ], "source": [ "!head \"$TEMP\"/Q154.properties.tsv | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's get the edges of these properties:" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.99 real 0.67 user 0.18 sys\n" ] } ], "source": [ "!$kypher -i \"$CLAIMS\" -i \"$TEMP\"/Q154.properties.tsv \\\n", "--match 'Q154: (p)-[]->(), claims: (p)-[l]->(n2)' \\\n", "--return 'distinct l as id, p as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.properties.edges.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Take a peek, 
looks like what we had before as the file is sorted, let's proceed:" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id node1 label node2\n", "P10-P1628-32b85d-7927ece6-0 P10 P1628 \"http://www.w3.org/2006/vcard/ns#Video\"\n", "P10-P1628-acf60d-b8950832-0 P10 P1628 \"https://schema.org/video\"\n", "P10-P1629-Q34508-bcc39400-0 P10 P1629 Q34508\n", "P10-P1659-P1651-c4068028-0 P10 P1659 P1651\n", "P10-P1659-P18-5e4b9c4f-0 P10 P1659 P18\n", "P10-P1659-P4238-d21d1ac0-0 P10 P1659 P4238\n", "P10-P1659-P51-86aca4c5-0 P10 P1659 P51\n", "P10-P1855-Q15075950-7eff6d65-0 P10 P1855 Q15075950\n", "P10-P1855-Q69063653-c8cdb04c-0 P10 P1855 Q69063653\n" ] } ], "source": [ "!head \"$TEMP\"/Q154.properties.edges.tsv | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's consolidate the edges again:" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 5.03 real 6.84 user 0.64 sys\n" ] } ], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.edges.2.tsv.gz -i \"$TEMP\"/Q154.properties.edges.tsv \\\n", "/ compact \\\n", "/ sort2 \\\n", "-o \"$TEMP\"/Q154.edges.3.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The number of edges grew a bit to over 197,000:" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 197521 811687 10791930\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.edges.3.tsv.gz | wc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Summary:\n", "- We have the instances of alcoholic beverages\n", "- We added incoming and outgoing edges\n", "- For the outgoing edges, we went one hop forward\n", "- We got the upper ontology\n", "- And we have the edges on all the properties being used\n", "\n", "We will stop adding nodes to the KG at this time, and proceed to add the labels 
for all the nodes." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 3: get the labels, aliases and descriptions of all the items in our KG\n", "Before we start, let's define an environment variable to hold the final edges file so that if we change our mind later, we can update it without having to change the commands below." ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "os.environ[\"Q154GRAPH\"] = os.environ[\"TEMP\"] + \"/Q154.edges.3.tsv.gz\"" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/Q154.edges.3.tsv.gz\n" ] } ], "source": [ "!ls \"$Q154GRAPH\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the labels of the `node1` nodes" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 3.18 real 2.44 user 0.45 sys\n" ] } ], "source": [ "!$kypher -i \"$Q154GRAPH\" -i \"$LABEL\" \\\n", "--match 'Q154: (n1)-[]-(), label: (n1)-[l]->(n2)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.label.node1.tsv.gz" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id node1 label node2\n", "P10-label-en P10 label 'video'@en\n", "P1001-label-en P1001 label 'applies to jurisdiction'@en\n", "P1003-label-en P1003 label 'National Library of Romania ID'@en\n", "P1004-label-en P1004 label 'MusicBrainz place ID'@en\n", "P1005-label-en P1005 label 'Portuguese National Library ID'@en\n", "P1006-label-en P1006 label 'Nationale Thesaurus voor Auteurs 
ID'@en\n", "P101-label-en P101 label 'field of work'@en\n", "P1014-label-en P1014 label 'Getty AAT ID'@en\n", "P1015-label-en P1015 label 'NORAF ID'@en\n", "zcat: error writing to output: Broken pipe\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.label.node1.tsv.gz | head | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the labels of the `node2` nodes" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 46.12 real 34.97 user 6.91 sys\n" ] } ], "source": [ "!$kypher -i \"$Q154GRAPH\" -i \"$LABEL\" \\\n", "--match 'Q154: ()-[]-(n2), label: (n2)-[l]->(n2)' \\\n", "--return 'distinct l as id, n2 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.label.node2.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate the two label files" ] }, { "cell_type": "code", "execution_count": 142, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1.10 real 0.75 user 0.18 sys\n" ] } ], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.label.node1.tsv.gz -i \"$TEMP\"/Q154.label.node2.tsv.gz \\\n", "-o \"$TEMP\"/labels.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the aliases of `node1` nodes" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 24.79 real 37.19 user 1.39 sys\n" ] } ], "source": [ "!$kypher -i \"$Q154GRAPH\" -i \"$ALIAS\" \\\n", "--match 'Q154: (n1)-[]-(), alias: (n1)-[l]->(n2)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.alias.node1.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the aliases of `node2` nodes" ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 5.75 real 0.84 user 0.33 sys\n" ] } ], 
"source": [ "!$kypher -i \"$Q154GRAPH\" -i \"$ALIAS\" \\\n", "--match 'Q154: ()-[]-(n2), alias: (n2)-[l]->(n2)' \\\n", "--return 'distinct l as id, n2 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.alias.node2.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Concatenate the two alias files" ] }, { "cell_type": "code", "execution_count": 143, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.91 real 0.72 user 0.14 sys\n" ] } ], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.alias.node1.tsv.gz -i \"$TEMP\"/Q154.alias.node2.tsv.gz \\\n", "-o \"$TEMP\"/alias.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the descriptions of `node1` nodes" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 202.73 real 287.06 user 13.62 sys\n" ] } ], "source": [ "!$kypher -i \"$Q154GRAPH\" -i \"$DESCRIPTION\" \\\n", "--match 'Q154: (n1)-[]-(), description: (n1)-[l]->(n2)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.description.node1.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the descriptions of `node2` nodes" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 40.84 real 31.30 user 7.57 sys\n" ] } ], "source": [ "!$kypher -i \"$Q154GRAPH\" -i \"$DESCRIPTION\" \\\n", "--match 'Q154: ()-[]-(n2), description: (n2)-[l]->(n2)' \\\n", "--return 'distinct l as id, n2 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q154.description.node2.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate the two description files" ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.81 real 
0.65 user 0.12 sys\n" ] } ], "source": [ "!$kgtk cat -i \"$TEMP\"/Q154.description.node1.tsv.gz -i \"$TEMP\"/Q154.description.node2.tsv.gz \\\n", "-o \"$TEMP\"/description.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 4: get the qualifiers" ] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 5.36 real 2.23 user 0.79 sys\n" ] } ], "source": [ "!$kypher -i \"$Q154GRAPH\" -i \"$QUALIFIERS\" \\\n", "--match 'Q154: ()-[l]->(), qual: (l)-[lq]->(n2)' \\\n", "--return 'lq as id, l as node1, lq.label as label, n2 as node2' \\\n", "-o \"$OUT\"/qualifiers.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "zcat: error writing to output: Broken pipe\n", "id node1 label node2\n", "P10-P1855-Q15075950-7eff6d65-0-P10-54b214-0 P10-P1855-Q15075950-7eff6d65-0 P10 \"Smoorverliefd 12 september.webm\"\n", "P10-P1855-Q15075950-7eff6d65-0-P3831-Q622550-0 P10-P1855-Q15075950-7eff6d65-0 P3831 Q622550\n", "P10-P1855-Q69063653-c8cdb04c-0-P10-6fb08f-0 P10-P1855-Q69063653-c8cdb04c-0 P10 \"Couch Commander.webm\"\n", "P10-P1855-Q7378-555592a4-0-P10-8a982d-0 P10-P1855-Q7378-555592a4-0 P10 \"Elephants Dream (2006).webm\"\n", "P10-P2302-Q21502404-d012aef4-0-P1793-f4c2ed-0 P10-P2302-Q21502404-d012aef4-0 P1793 \"(?i).+\\\\\\\\.(webm\\\\|ogv\\\\|ogg\\\\|gif)\"\n", "P10-P2302-Q21502404-d012aef4-0-P2316-Q21502408-0 P10-P2302-Q21502404-d012aef4-0 P2316 Q21502408\n", "P10-P2302-Q21502404-d012aef4-0-P2916-cb0917-0 P10-P2302-Q21502404-d012aef4-0 P2916 'filename with extension: webm, ogg, ogv, or gif (case insensitive)'@en\n", "P10-P2302-Q21510851-5224fe0b-0-P2306-P175-0 P10-P2302-Q21510851-5224fe0b-0 P2306 P175\n", "P10-P2302-Q21510851-5224fe0b-0-P2306-P180-0 P10-P2302-Q21510851-5224fe0b-0 P2306 P180\n" ] } ], "source": [ "!zcat < 
\"$TEMP\"/Q154.qualifiers.tsv.gz | head | column -t -s $'\\t'" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 109816 446163 10639203\n" ] } ], "source": [ "!zcat < \"$TEMP\"/Q154.qualifiers.tsv.gz | wc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 5: consolidate all the files" ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2020-12-13 16:52:20-- https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 3108 (3.0K) [text/plain]\n", "Saving to: ‘/Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/kgtk.properties.tsv’\n", "\n", "/Users/pedroszekely 100%[===================>] 3.04K --.-KB/s in 0s \n", "\n", "2020-12-13 16:52:22 (17.0 MB/s) - ‘/Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/kgtk.properties.tsv’ saved [3108/3108]\n", "\n" ] } ], "source": [ "!wget https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv -O \"$TEMP\"/kgtk.properties.tsv" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1 label node2 id\n", "isa label \"is a\"@en isa-label-\"is a\"@en-0000\n", "isa alias \"isa\"@en isa-alias-\"isa\"@en-0000\n", "isa description \"Instance or subclass relationship\"@en isa-description-\"Instance or subclass relationship\"@en-0000\n", "isa P31 Q18616576 isa-P31-Q18616576-0000\n", "isa P31 Q28326461 isa-P31-Q28326461-0000\n", "isa P31 Q18647519 isa-P31-Q18647519-0000\n", "isa data_type wikibase-item 
isa-data_type-item-0000\n", "P279star label \"is a\"@en P279star-label-\"is a\"@en-0000\n", "P279star alias \"isa\"@en P279star-alias-\"isa\"@en-0000\n" ] } ], "source": [ "!head \"$TEMP\"/kgtk.properties.tsv | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id\tnode1\tlabel\tnode2\n", "P10-datatype\tP10\tdatatype\tcommonsMedia\n", "P1000-datatype\tP1000\tdatatype\twikibase-item\n", "P1001-datatype\tP1001\tdatatype\twikibase-item\n", "P1002-datatype\tP1002\tdatatype\twikibase-item\n", "P1003-datatype\tP1003\tdatatype\texternal-id\n", "P1004-datatype\tP1004\tdatatype\texternal-id\n", "P1005-datatype\tP1005\tdatatype\texternal-id\n", "P1006-datatype\tP1006\tdatatype\texternal-id\n", "P1007-datatype\tP1007\tdatatype\texternal-id\n", "zcat: error writing to output: Broken pipe\n" ] } ], "source": [ "!zcat < \"$PROPERTY_DATATYPES\" | head" ] }, { "cell_type": "code", "execution_count": 151, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.80 real 0.62 user 0.15 sys\n" ] } ], "source": [ "!$kypher -i \"$Q154GRAPH\" -i \"$PROPERTY_DATATYPES\" \\\n", "--match 'Q15: (n1)-[]->(), property: (n1)-[l:datatype]->(n2)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$TEMP\"/Q15.metadata.property.datatype.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 5.44 real 7.13 user 0.67 sys\n" ] } ], "source": [ "!$kgtk cat \\\n", "-i \"$TEMP\"/Q154.label.node2.tsv.gz \\\n", "-i \"$TEMP\"/alias.tsv.gz \\\n", "-i \"$TEMP\"/description.tsv.gz \\\n", "-i 
\"$TEMP\"/Q154.edges.3.tsv.gz \\\n", "-i \"$TEMP\"/kgtk.properties.tsv \\\n", "-i \"$TEMP\"/Q15.metadata.property.datatype.tsv.gz \\\n", "/ compact \\\n", "/ sort2 \\\n", "-o \"$OUT\"/all.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 218110 955849 12264507\n" ] } ], "source": [ "!zcat < \"$OUT\"/all.tsv.gz | wc" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 6: partition the files to follow the conventions KGTK uses for Wikidata" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Stop here: the stuff below is Pedro's scratchpad, will be deleted later" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cleanup\n", "\n", "Remove `novalue` and `somevalue`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 510.97 real 166.66 user 197.67 sys\n", "id node1 label node2 rank node2;wikidatatype id node1 label node2 node2;wikidatatype\n", "Q65-P1082-02e70e-ea7734b4-0 Q65 P1082 +3792621 normal quantity Q65-P1082-02e70e-ea7734b4-0-P585-6e6a88-0 Q65-P1082-02e70e-ea7734b4-0 P585 ^2010-00-00T00:00:00Z/9 time\n", "Q65-P1082-2c08e1-12f0f95e-0 Q65 P1082 +5728 normal quantity Q65-P1082-2c08e1-12f0f95e-0-P585-4ab039-0 Q65-P1082-2c08e1-12f0f95e-0 P585 ^1870-00-00T00:00:00Z/9 time\n", "Q65-P1082-418d5a-b540356a-0 Q65 P1082 +319198 normal quantity Q65-P1082-418d5a-b540356a-0-P585-6efbd3-0 
Q65-P1082-418d5a-b540356a-0 P585 ^1910-00-00T00:00:00Z/9 time\n", "Q65-P1082-808058-b69b4060-0 Q65 P1082 +50395 normal quantity Q65-P1082-808058-b69b4060-0-P585-b45c46-0 Q65-P1082-808058-b69b4060-0 P585 ^1890-00-00T00:00:00Z/9 time\n", "Q65-P1082-982d82-a3b6b816-0 Q65 P1082 +3976322 preferred quantity Q65-P1082-982d82-a3b6b816-0-P585-cd3f49-0 Q65-P1082-982d82-a3b6b816-0 P585 ^2016-00-00T00:00:00Z/9 time\n", "Q65-P1082-a403b5-8ac2d57f-0 Q65 P1082 +1610 normal quantity Q65-P1082-a403b5-8ac2d57f-0-P585-4b4a18-0 Q65-P1082-a403b5-8ac2d57f-0 P585 ^1850-00-00T00:00:00Z/9 time\n", "Q65-P1082-aa27be-2612ff2a-0 Q65 P1082 +102479 normal quantity Q65-P1082-aa27be-2612ff2a-0-P585-662d67-0 Q65-P1082-aa27be-2612ff2a-0 P585 ^1900-00-00T00:00:00Z/9 time\n", "Q65-P1082-b41a90-8b71e469-0 Q65 P1082 +11183 normal quantity Q65-P1082-b41a90-8b71e469-0-P585-211752-0 Q65-P1082-b41a90-8b71e469-0 P585 ^1880-00-00T00:00:00Z/9 time\n", "Q65-P1082-c0f75e-73a2c23f-0 Q65 P1082 +3990456 normal quantity Q65-P1082-c0f75e-73a2c23f-0-P585-364baf-0 Q65-P1082-c0f75e-73a2c23f-0 P585 ^2018-00-00T00:00:00Z/9 time\n", "Q65-P1082-d6a4d0-28aeb70a-0 Q65 P1082 +4385 normal quantity Q65-P1082-d6a4d0-28aeb70a-0-P585-a9ad71-0 Q65-P1082-d6a4d0-28aeb70a-0 P585 ^1860-00-00T00:00:00Z/9 time\n" ] } ], "source": [ "!$kypher -i \"$CLAIMS\" -i \"$QUALIFIERS\" \\\n", "--match 'claim: (n1:Q65)-[l]-(n2), qual: (l)-[ql]->(qn2)' \\\n", "--limit 10 \\\n", "| column -t -s $'\\t'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Errno 2] No such file or directory: '/Users/pedroszekely/Downloads/kypher/{wos}/derived.P279.tsv.gz'\n", "\n", " 1.08 real 0.59 user 0.19 sys\n" ] } ], "source": [ "!$kypher -i {wos}/derived.P279.tsv.gz \\\n", "--match '(n1)-[]-()' 
\\\n", "--return 'count(distinct n1)' \\\n", "--limit 10" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/bin/bash: {kypher}: command not found\n" ] } ], "source": [ "!{kypher} -i {wos}/derived.P279star.tsv.gz -i {wos}/labels.en.tsv.gz \\\n", "--match 'P279star: (n1)-[]-(:Q18518465), label: (n1)-[]->(label)' \\\n", "--return 'n1 as class, label as name' \\\n", "--limit 10" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'quals_time' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mquals_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 'quals_time' is not defined" ] } ], "source": [ "quals_time" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "claims" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%env CLAIMS={claims}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%env CLAIMS" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%env ST=\"/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%env kypher" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 120, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id\tnode1\tlabel\tnode2\tlang\n", "Q2860568-addl_wikipedia_sitelink-93c252-0\tQ2860568\taddl_wikipedia_sitelink\thttp://commonswiki.org/wiki/Category:Archives_of_American_Art\ten\n", "Q2860568-wikipedia_sitelink-0d01d2-0\tQ2860568\twikipedia_sitelink\thttp://es.wikipedia.org/wiki/Archivos_de_arte_estadounidense\tes\n", "Q2860568-wikipedia_sitelink-14b314-0\tQ2860568\twikipedia_sitelink\thttp://fr.wikipedia.org/wiki/Archives_of_American_Art\tfr\n", "Q2860568-wikipedia_sitelink-8e7449-0\tQ2860568\twikipedia_sitelink\thttp://ca.wikipedia.org/wiki/Arxius_d'Art_Americà\tca\n", "Q2860568-wikipedia_sitelink-9e4854-0\tQ2860568\twikipedia_sitelink\thttp://en.wikipedia.org/wiki/Archives_of_American_Art\ten\n", "Q2860568-wikipedia_sitelink-c1e42a-0\tQ2860568\twikipedia_sitelink\thttp://la.wikipedia.org/wiki/Tabulae_Artis_Americanae\tla\n", "Q2860568-wikipedia_sitelink-c68de4-0\tQ2860568\twikipedia_sitelink\thttp://pl.wikipedia.org/wiki/Archives_of_American_Art\tpl\n", " 2694.67 real 3603.61 user 285.78 sys\n" ] } ], "source": [ "!$kypher -i \"$SITELINKS\" \\\n", "--match '(n1:Q2860568)-[l]->(n2)' \\\n", "--limit 10" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$SITELINKS\" \\\n", "--match '(n1)-[l {lang: \"en\"}]->(n2)' \\\n", "--return 'n2 as wikipedia, count(n1) as n1_count' \\\n", "--order-by 'n1_count desc' \\\n", "-o $TEMP/sitelinks.count.en.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$SITELINKS\" \\\n", "--match '(n1)-[l:`wikipedia_sitelink` {lang: language}]->(n2)' \\\n", "--return 'n2 as wikipedia, count(n1) as n1_count' \\\n", "--order-by 'n1_count desc' \\\n", "-o $TEMP/sitelinks.count.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$SITELINKS\" \\\n", "--match 
'(n1)-[l:`wikipedia_sitelink` {lang: language}]->(n2)' \\\n", "--return 'n1 as qnode, language, count(n2) as n1_count' \\\n", "--order-by 'n1_count desc' \\\n", "-o $TEMP/sitelinks.qnode.count.tsv.gz" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2872.01 real 1038.30 user 693.38 sys\n" ] } ], "source": [ "!$kypher -i \"$SITELINKS\" \\\n", "--match '(n1)-[l:`wikipedia_sitelink` {lang: language, label:lab}]->(n2), (l)-[:`sitelink-site`]->(site)' \\\n", "--return 'n1 as node1, lab as label, n2 as node2, language as language, site as site' \\\n", "-o $TEMP/sitelinks.wikipedia.tsv.gz" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tlanguage\tsite\n", "Q1\twikipedia_sitelink\thttp://oc.wikipedia.org/wiki/Univèrs\toc\tocwiki\n", "Q1\twikipedia_sitelink\thttp://cdo.wikipedia.org/wiki/Ṳ̄-dêu\tcdo\tcdowiki\n", "Q1\twikipedia_sitelink\thttp://ml.wikipedia.org/wiki/പ്രപഞ്ചം\tml\tmlwiki\n", "Q1\twikipedia_sitelink\thttp://si.wikipedia.org/wiki/විශ්වය\tsi\tsiwiki\n", "Q1\twikipedia_sitelink\thttp://bxr.wikipedia.org/wiki/Оршолон\tbxr\tbxrwiki\n", "Q1\twikipedia_sitelink\thttp://jam.wikipedia.org/wiki/Yunivoers\tjam\tjamwiki\n", "Q1\twikipedia_sitelink\thttp://hr.wikipedia.org/wiki/Svemir\thr\thrwiki\n", "Q1\twikipedia_sitelink\thttp://chr.wikipedia.org/wiki/ᎦᎸᎶᎯ_ᎦᎸᎾᏗ\tchr\tchrwiki\n", "Q1\twikipedia_sitelink\thttp://pfl.wikipedia.org/wiki/Weltall\tpfl\tpflwiki\n", "zcat: error writing to output: Broken pipe\n" ] } ], "source": [ "!zcat < $TEMP/sitelinks.wikipedia.tsv.gz | head" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "qnode\tlang\tn1_count\n", "Q13107716\t\t2\n", "Q14357839\t\t2\n", "Q15098140\t\t2\n", "Q15116966\t\t2\n", "Q15117218\t\t2\n", "Q15117391\t\t2\n", "Q15379728\t\t2\n", 
"Q15506579\t\t2\n", "Q16748603\t\t2\n", "Q16830095\t\t2\n", "Q17121869\t\t2\n", "Q17347205\t\t2\n", "Q17347215\t\t2\n", "Q17347224\t\t2\n", "Q17347230\t\t2\n", "Q20962109\t\t2\n", "Q21451097\t\t2\n", "Q25714577\t\t2\n", "Q26905045\t\t2\n", "Q26905108\t\t2\n", "Q4375196\t\t2\n", "Q48010913\t\t2\n", "Q4847311\t\t2\n", "Q5296\t\t2\n", "Q5453037\t\t2\n", "Q56528363\t\t2\n", "Q56528384\t\t2\n", "Q58832772\t\t2\n", "Q7253814\t\t2\n", "Q7348344\t\t2\n", "Q1\taf\t1\n", "Q1\tak\t1\n", "Q1\tals\t1\n", "Q1\tam\t1\n", "Q1\tan\t1\n", "Q1\tar\t1\n", "Q1\tarc\t1\n", "Q1\tarz\t1\n", "Q1\tas\t1\n", "Q1\tast\t1\n", "Q1\taz\t1\n", "Q1\tba\t1\n", "Q1\tbar\t1\n", "Q1\tbat-smg\t1\n", "Q1\tbe\t1\n", "Q1\tbe-x-old\t1\n", "Q1\tbg\t1\n", "Q1\tbh\t1\n", "Q1\tbn\t1\n", "Q1\tbr\t1\n", "Q1\tbs\t1\n", "Q1\tbxr\t1\n", "Q1\tca\t1\n", "Q1\tcdo\t1\n", "Q1\tce\t1\n", "Q1\tchr\t1\n", "Q1\tckb\t1\n", "Q1\tcs\t1\n", "Q1\tcsb\t1\n", "Q1\tcv\t1\n", "Q1\tcy\t1\n", "Q1\tda\t1\n", "Q1\tde\t1\n", "Q1\tdiq\t1\n", "Q1\tdsb\t1\n", "Q1\tel\t1\n", "Q1\ten\t1\n", "Q1\teo\t1\n", "Q1\tes\t1\n", "Q1\tet\t1\n", "Q1\teu\t1\n", "Q1\text\t1\n", "Q1\tfa\t1\n", "Q1\tfi\t1\n", "Q1\tfj\t1\n", "Q1\tfo\t1\n", "Q1\tfr\t1\n", "Q1\tfrr\t1\n", "Q1\tfy\t1\n", "Q1\tga\t1\n", "Q1\tgcr\t1\n", "Q1\tgl\t1\n", "Q1\tgn\t1\n", "Q1\tgu\t1\n", "Q1\thak\t1\n", "Q1\the\t1\n", "Q1\thi\t1\n", "Q1\thif\t1\n", "Q1\thr\t1\n", "Q1\tht\t1\n", "Q1\thu\t1\n", "Q1\thy\t1\n", "Q1\thyw\t1\n", "Q1\tia\t1\n", "Q1\tid\t1\n", "Q1\tilo\t1\n", "Q1\tinh\t1\n", "Q1\tio\t1\n", "Q1\tis\t1\n", "zcat: error writing to output: Broken pipe\n" ] } ], "source": [ "!zcat < $TEMP/sitelinks.qnode.count.tsv.gz | head -100" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "k" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "k = \"time kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db\"" ] }, { "cell_type": "code", 
"execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Claims on Q30 joined to their time-valued qualifiers, printed as an aligned table.\n",
    "!$kypher -i \"$CLAIMS\" -i \"$QUALS\" \\\n",
    "--match 'claims: (n1:Q30)-[l {label: property}]->(n2), qual: (l)-[q]->(t {wikidatatype: \"time\"})' \\\n",
    "--return 'distinct n1, property as label, n2 as node2, q.label as qualifier, kgtk_date_and_time(t) as time, l as id' \\\n",
    "--order-by 'n1, property, qualifier, time desc' \\\n",
    "--limit 100 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "# Same query as the cell above, run through the bash cell magic with an explicit kgtk command line.\n",
    "# Inputs come from the CLAIMS and QUALS environment variables instead of machine-specific literal paths.\n",
    "# NOTE(review): the graph-cache path is still a literal; it matches what the notebook echoes for $kypher.\n",
    "kgtk query --debug --graph-cache /Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db \\\n",
    "-i \"$CLAIMS\" \\\n",
    "-i \"$QUALS\" \\\n",
    "--match 'claims: (n1:Q30)-[l {label: property}]->(n2), qual: (l)-[q]->(t {wikidatatype: \"time\"})' \\\n",
    "--return 'distinct n1, property as label, n2 as node2, q.label as qualifier, kgtk_date_and_time(t) as time, l as id' \\\n",
    "--order-by 'n1, property, qualifier, time desc' \\\n",
    "--limit 100 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Variant of the query above that omits the edge id column from the result.\n",
    "!$kypher \\\n",
    "-i \"$CLAIMS\" \\\n",
    "-i \"$QUALS\" \\\n",
    "--match 'claims: (n1:Q30)-[l {label: property}]->(n2), qual: (l)-[q]->(t {wikidatatype: \"time\"})' \\\n",
    "--return 'distinct n1, property as label, n2 as node2, q.label as qualifier, kgtk_date_and_time(t) as time' \\\n",
    "--order-by 'n1, property, qualifier, time desc' \\\n",
    "--limit 100 \\\n",
    "| column -t -s $'\\t'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Runs the configured query command with no arguments.\n",
    "!$kypher"
   ]
  },
  {
   "cell_type": "code", 
"execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the shell variables that the other query cells rely on: a brace reference such as {kypher}\n",
    "# reaches bash unexpanded when no Python variable of that name exists, and the quoted form\n",
    "# keeps input paths containing spaces as single arguments.\n",
    "!$kypher -i \"$CLAIMS\" -i \"$QUALS\" \\\n",
    "--match 'claims: (n1:Q1431229)-[l]->(n2)' \\\n",
    "--limit 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): quals_time is not assigned in any cell visible here; define it before running this.\n",
    "quals_time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quote the interpolated path so that a folder name containing spaces stays a single shell word.\n",
    "!zcat < \"{quals_time}\" | grep 'Q1431229'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): machine-specific path; presumably the directory pywikibot uses for its configuration - confirm.\n",
    "os.environ[\"PYWIKIBOT_DIR\"] = \"/Users/pedroszekely/Documents/GitHub/core\"\n",
    "os.environ[\"PYWIKIBOT_DIR\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Using API calls to get pageviews\n",
    "import pprint  # pretty-prints the nested API response\n",
    "\n",
    "import pywikibot\n",
    "import pywikibot.data.api as api\n",
    "\n",
    "site = pywikibot.Site(\"wikidata\", \"wikidata\")\n",
    "repo = site.data_repository()\n",
    "item = pywikibot.ItemPage(repo, \"Q216916\")\n",
    "\n",
    "# The same kind of request written as a URL (for Q42):\n",
    "# https://www.wikidata.org/w/api.php?action=query&titles=Q42&prop=pageviews\n",
    "req = api.Request(\n",
    "    site=site,\n",
    "    parameters={\n",
    "        \"action\": \"query\",\n",
    "        \"titles\": item,\n",
    "        \"prop\": \"pageviews\",\n",
    "    },\n",
    ")\n",
    "\n",
    "# The entry for the item is looked up under query -> pages by its page id, as a string key.\n",
    "pages = req.submit()[\"query\"][\"pages\"]\n",
    "pprint.pprint(pages[str(item.pageid)][\"pageviews\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the outgoing edges of every node1 and write them as out_degree edges.\n",
    "!$kypher -i \"$CLAIMS\" -o $TEMP/new.metadata.out_degree.tsv.gz \\\n",
    "--match '(n1)-[l]->()' \\\n",
    "--return 'distinct n1 as node1, count(l) as node2, \"out_degree\" as label' \\\n",
    "--order-by 'n1 desc'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!$kypher -i \"$CLAIMS\" -o $TEMP/new.metadata.in_degree.tsv.gz \\\n",
    "--match '()-[l]->(n2 {`wikidatatype`:\"wikibase-item\"})' \\\n",
    "--return 'distinct n2 as 
node1, count(distinct l) as node2, \"in_degree\" as label' \\\n", "--order-by 'n2'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "time kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db\n" ] } ], "source": [ "!echo \"$kypher\"" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "label\tcount\tnode2\n", "P882\t2905\t'FIPS 6-4 (US counties)'@en\n", "P5736\t2904\t'Minor Planet Center body ID'@en\n", "P4683\t2866\t'National Gallery of Art artwork ID'@en\n", "P8286\t2864\t'Olympedia athlete ID'@en\n", "P698\t2862\t'PubMed ID'@en\n", "P7263\t2817\t'Prime Pages ID'@en\n", "P374\t2707\t'INSEE municipality code'@en\n", "P6018\t2654\t'SeaLifeBase ID'@en\n", "P4129\t2462\t'Cinema Treasures ID'@en\n", "P3064\t2330\t'LepIndex ID'@en\n", "P1415\t2301\t'Oxford Dictionary of National Biography ID'@en\n", "P815\t2226\t'ITIS TSN'@en\n", "P830\t2165\t'Encyclopedia of Life ID'@en\n", "P354\t2123\t'HGNC ID'@en\n", "P359\t2072\t'Rijksmonument ID'@en\n", "P7202\t2046\t'Belgian Species List ID'@en\n", "P1970\t2046\t'MovieMeter film ID'@en\n", "P7224\t2024\t'Insects (Insecta) of the World ID'@en\n", "P351\t1974\t'Entrez Gene ID'@en\n", "P4381\t1932\t'Soccerdonna player ID'@en\n", "P932\t1927\t'PMCID'@en\n", "P3151\t1911\t'iNaturalist taxon ID'@en\n", "P3138\t1830\t'OFDb ID'@en\n", "P5573\t1825\t'archINFORM location ID'@en\n", "P2603\t1821\t'Kinopoisk film ID'@en\n", "P2574\t1760\t'National-Football-Teams.com player ID'@en\n", "P8422\t1698\t'EHESS ID of a French commune'@en\n", "P2163\t1697\t'FAST ID'@en\n", "P772\t1671\t'INE municipality code'@en\n", "P2840\t1638\t'NSC 
number'@en\n", "P3143\t1595\t'elFilm film ID'@en\n", "P4327\t1512\t'BHL bibliography ID'@en\n", "P6736\t1494\t'Drobné památky ID'@en\n", "P2529\t1451\t'ČSFD film ID'@en\n", "P8351\t1423\t'vglist video game ID'@en\n", "P685\t1401\t'NCBI taxonomy ID'@en\n", "P5263\t1393\t'Czech NDOP taxon ID'@en\n", "P1600\t1343\t'Inventari del Patrimoni Arquitectònic de Catalunya code'@en\n", "P650\t1338\t'RKDartists ID'@en\n", "P2334\t1327\t'Swedish Film Database film ID'@en\n", "P838\t1302\t'BioLib taxon ID'@en\n", "P5739\t1300\t'Pontificia Università della Santa Croce ID'@en\n", "P3302\t1247\t'Open Media Database film ID'@en\n", "P2446\t1242\t'Transfermarkt player ID'@en\n", "P3844\t1221\t'Deutsche Synchronkartei film ID'@en\n", "P2605\t1201\t'ČSFD person ID'@en\n", "P1156\t1174\t'Scopus Source ID'@en\n", "P5731\t1155\t'Angelicum ID'@en\n", "P1225\t1139\t'U.S. National Archives Identifier'@en\n", "P5383\t1110\t'archINFORM project ID'@en\n", " 19.99 real 2.36 user 4.37 sys\n" ] } ], "source": [ "!$kypher -i \"$IDS\" -i \"$LABEL\" -i \"/Users/pedroszekely/Downloads/fips-large.tsv\" \\\n", "--match 'fips: (fips)-[]->(), external: (n1)-[l {label: p}]->(fips), label: (p)-[]->(p_label)' \\\n", "--return 'p, count(p) as count, p_label' \\\n", "--order-by 'count desc' \\\n", "--limit 50" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tnode1\tnode2\tnode2\tnode1\n", "P882\tQ156168\t'FIPS 6-4 (US counties)'@en\t'Autauga County'@en\t\"01001\"\n", "P882\tQ156163\t'FIPS 6-4 (US counties)'@en\t'Baldwin County'@en\t\"01003\"\n", "P882\tQ109437\t'FIPS 6-4 (US counties)'@en\t'Barbour County'@en\t\"01005\"\n", "P882\tQ461204\t'FIPS 6-4 (US counties)'@en\t'Bibb County'@en\t\"01007\"\n", "P882\tQ111250\t'FIPS 6-4 (US counties)'@en\t'Blount County'@en\t\"01009\"\n", "P882\tQ111259\t'FIPS 6-4 (US counties)'@en\t'Bullock County'@en\t\"01011\"\n", "P882\tQ108871\t'FIPS 6-4 (US counties)'@en\t'Butler 
County'@en\t\"01013\"\n", "P882\tQ108856\t'FIPS 6-4 (US counties)'@en\t'Calhoun County'@en\t\"01015\"\n", "P882\tQ111280\t'FIPS 6-4 (US counties)'@en\t'Chambers County'@en\t\"01017\"\n", "P882\tQ108832\t'FIPS 6-4 (US counties)'@en\t'Cherokee County'@en\t\"01019\"\n", "P882\tQ111266\t'FIPS 6-4 (US counties)'@en\t'Chilton County'@en\t\"01021\"\n", "P882\tQ111254\t'FIPS 6-4 (US counties)'@en\t'Choctaw County'@en\t\"01023\"\n", "P882\tQ111273\t'FIPS 6-4 (US counties)'@en\t'Clarke County'@en\t\"01025\"\n", "P882\tQ156570\t'FIPS 6-4 (US counties)'@en\t'Clay County'@en\t\"01027\"\n", "P882\tQ327080\t'FIPS 6-4 (US counties)'@en\t'Cleburne County'@en\t\"01029\"\n", "P882\tQ485660\t'FIPS 6-4 (US counties)'@en\t'Coffee County'@en\t\"01031\"\n", "P882\tQ487731\t'FIPS 6-4 (US counties)'@en\t'Colbert County'@en\t\"01033\"\n", "P882\tQ487716\t'FIPS 6-4 (US counties)'@en\t'Conecuh County'@en\t\"01035\"\n", "P882\tQ487738\t'FIPS 6-4 (US counties)'@en\t'Coosa County'@en\t\"01037\"\n", "P882\tQ487725\t'FIPS 6-4 (US counties)'@en\t'Covington County'@en\t\"01039\"\n", "P882\tQ488831\t'FIPS 6-4 (US counties)'@en\t'Crenshaw County'@en\t\"01041\"\n", "P882\tQ188204\t'FIPS 6-4 (US counties)'@en\t'Cullman County'@en\t\"01043\"\n", "P882\tQ488840\t'FIPS 6-4 (US counties)'@en\t'Dale County'@en\t\"01045\"\n", "P882\tQ488847\t'FIPS 6-4 (US counties)'@en\t'Dallas County'@en\t\"01047\"\n", "P882\tQ494626\t'FIPS 6-4 (US counties)'@en\t'DeKalb County'@en\t\"01049\"\n", "P882\tQ494630\t'FIPS 6-4 (US counties)'@en\t'Elmore County'@en\t\"01051\"\n", "P882\tQ487744\t'FIPS 6-4 (US counties)'@en\t'Escambia County'@en\t\"01053\"\n", "P882\tQ493951\t'FIPS 6-4 (US counties)'@en\t'Etowah County'@en\t\"01055\"\n", "P882\tQ493957\t'FIPS 6-4 (US counties)'@en\t'Fayette County'@en\t\"01057\"\n", "P882\tQ488892\t'FIPS 6-4 (US counties)'@en\t'Franklin County'@en\t\"01059\"\n", "P882\tQ494620\t'FIPS 6-4 (US counties)'@en\t'Geneva County'@en\t\"01061\"\n", "P882\tQ493709\t'FIPS 6-4 (US counties)'@en\t'Greene 
County'@en\t\"01063\"\n", "P882\tQ501147\t'FIPS 6-4 (US counties)'@en\t'Hale County'@en\t\"01065\"\n", "P882\tQ501000\t'FIPS 6-4 (US counties)'@en\t'Henry County'@en\t\"01067\"\n", "P882\tQ496292\t'FIPS 6-4 (US counties)'@en\t'Houston County'@en\t\"01069\"\n", "P882\tQ366959\t'FIPS 6-4 (US counties)'@en\t'Jackson County'@en\t\"01071\"\n", "P882\tQ112271\t'FIPS 6-4 (US counties)'@en\t'Jefferson County'@en\t\"01073\"\n", "P882\tQ505317\t'FIPS 6-4 (US counties)'@en\t'Lamar County'@en\t\"01075\"\n", "P882\tQ261672\t'FIPS 6-4 (US counties)'@en\t'Lauderdale County'@en\t\"01077\"\n", "P882\tQ502737\t'FIPS 6-4 (US counties)'@en\t'Lawrence County'@en\t\"01079\"\n", "P882\tQ501055\t'FIPS 6-4 (US counties)'@en\t'Lee County'@en\t\"01081\"\n", "P882\tQ501108\t'FIPS 6-4 (US counties)'@en\t'Limestone County'@en\t\"01083\"\n", "P882\tQ503461\t'FIPS 6-4 (US counties)'@en\t'Lowndes County'@en\t\"01085\"\n", "P882\tQ502777\t'FIPS 6-4 (US counties)'@en\t'Macon County'@en\t\"01087\"\n", "P882\tQ493715\t'FIPS 6-4 (US counties)'@en\t'Madison County'@en\t\"01089\"\n", "P882\tQ501074\t'FIPS 6-4 (US counties)'@en\t'Marengo County'@en\t\"01091\"\n", "P882\tQ502739\t'FIPS 6-4 (US counties)'@en\t'Marion County'@en\t\"01093\"\n", "P882\tQ502925\t'FIPS 6-4 (US counties)'@en\t'Marshall County'@en\t\"01095\"\n", "P882\tQ495738\t'FIPS 6-4 (US counties)'@en\t'Mobile County'@en\t\"01097\"\n", "P882\tQ501060\t'FIPS 6-4 (US counties)'@en\t'Monroe County'@en\t\"01099\"\n", "P882\tQ502784\t'FIPS 6-4 (US counties)'@en\t'Montgomery County'@en\t\"01101\"\n", "P882\tQ137828\t'FIPS 6-4 (US counties)'@en\t'Morgan County'@en\t\"01103\"\n", "P882\tQ253538\t'FIPS 6-4 (US counties)'@en\t'Perry County'@en\t\"01105\"\n", "P882\tQ949766\t'FIPS 6-4 (US counties)'@en\t'Pickens County'@en\t\"01107\"\n", "P882\tQ492888\t'FIPS 6-4 (US counties)'@en\t'Pike County'@en\t\"01109\"\n", "P882\tQ502743\t'FIPS 6-4 (US counties)'@en\t'Randolph County'@en\t\"01111\"\n", "P882\tQ503329\t'FIPS 6-4 (US counties)'@en\t'Russell 
County'@en\t\"01113\"\n", "P882\tQ503451\t'FIPS 6-4 (US counties)'@en\t'St. Clair County'@en\t\"01115\"\n", "P882\tQ501084\t'FIPS 6-4 (US counties)'@en\t'Shelby County'@en\t\"01117\"\n", "P882\tQ501051\t'FIPS 6-4 (US counties)'@en\t'Sumter County'@en\t\"01119\"\n", "P882\tQ302918\t'FIPS 6-4 (US counties)'@en\t'Talladega County'@en\t\"01121\"\n", "P882\tQ512787\t'FIPS 6-4 (US counties)'@en\t'Tallapoosa County'@en\t\"01123\"\n", "P882\tQ503877\t'FIPS 6-4 (US counties)'@en\t'Tuscaloosa County'@en\t\"01125\"\n", "P882\tQ506291\t'FIPS 6-4 (US counties)'@en\t'Walker County'@en\t\"01127\"\n", "P882\tQ501157\t'FIPS 6-4 (US counties)'@en\t'Washington County'@en\t\"01129\"\n", "P882\tQ503081\t'FIPS 6-4 (US counties)'@en\t'Wilcox County'@en\t\"01131\"\n", "P882\tQ503088\t'FIPS 6-4 (US counties)'@en\t'Winston County'@en\t\"01133\"\n", "P882\tQ504371\t'FIPS 6-4 (US counties)'@en\t'Aleutians East Borough'@en\t\"02013\"\n", "P882\tQ185533\t'FIPS 6-4 (US counties)'@en\t'Aleutians West Census Area'@en\t\"02016\"\n", "P882\tQ39450\t'FIPS 6-4 (US counties)'@en\t'Anchorage'@en\t\"02020\"\n", "P882\tQ49297981\t'FIPS 6-4 (US counties)'@en\t'Anchorage Municipality'@en\t\"02020\"\n", "P882\tQ500312\t'FIPS 6-4 (US counties)'@en\t'Bethel Census Area'@en\t\"02050\"\n", "P882\tQ501130\t'FIPS 6-4 (US counties)'@en\t'Bristol Bay Borough'@en\t\"02060\"\n", "P882\tQ179950\t'FIPS 6-4 (US counties)'@en\t'Denali Borough'@en\t\"02068\"\n", "P882\tQ277728\t'FIPS 6-4 (US counties)'@en\t'Dillingham Census Area'@en\t\"02070\"\n", "P882\tQ512901\t'FIPS 6-4 (US counties)'@en\t'Fairbanks North Star Borough'@en\t\"02090\"\n", "P882\tQ512981\t'FIPS 6-4 (US counties)'@en\t'Haines Borough'@en\t\"02100\"\n", "P882\tQ500827\t'FIPS 6-4 (US counties)'@en\t'Hoonah–Angoon Census Area'@en\t\"02105\"\n", "P882\tQ29445\t'FIPS 6-4 (US counties)'@en\t'Juneau'@en\t\"02110\"\n", "P882\tQ512713\t'FIPS 6-4 (US counties)'@en\t'Kenai Peninsula Borough'@en\t\"02122\"\n", "P882\tQ506064\t'FIPS 6-4 (US counties)'@en\t'Ketchikan 
Gateway Borough'@en\t\"02130\"\n", "P882\tQ514093\t'FIPS 6-4 (US counties)'@en\t'Kodiak Island Borough'@en\t\"02150\"\n", "P882\tQ379474\t'FIPS 6-4 (US counties)'@en\t'Kusilvak Census Area'@en\t\"02158\"\n", "P882\tQ511679\t'FIPS 6-4 (US counties)'@en\t'Lake and Peninsula Borough'@en\t\"02164\"\n", "P882\tQ512925\t'FIPS 6-4 (US counties)'@en\t'Matanuska-Susitna Borough'@en\t\"02170\"\n", "P882\tQ503023\t'FIPS 6-4 (US counties)'@en\t'Nome Census Area'@en\t\"02180\"\n", "P882\tQ511806\t'FIPS 6-4 (US counties)'@en\t'North Slope Borough'@en\t\"02185\"\n", "P882\tQ511723\t'FIPS 6-4 (US counties)'@en\t'Northwest Arctic Borough'@en\t\"02188\"\n", "P882\tQ25408755\t'FIPS 6-4 (US counties)'@en\t'Petersburg Borough'@en\t\"02195\"\n", "P882\tQ503028\t'FIPS 6-4 (US counties)'@en\t'Petersburg Census Area'@en\t\"02195\"\n", "P882\tQ18120072\t'FIPS 6-4 (US counties)'@en\t'Prince of Wales–Hyder Census Area'@en\t\"02198\"\n", "P882\tQ79804\t'FIPS 6-4 (US counties)'@en\t'Sitka'@en\t\"02220\"\n", "P882\tQ615975\t'FIPS 6-4 (US counties)'@en\t'Skagway'@en\t\"02230\"\n", "P882\tQ500845\t'FIPS 6-4 (US counties)'@en\t'Southeast Fairbanks Census Area'@en\t\"02240\"\n", "P882\tQ508618\t'FIPS 6-4 (US counties)'@en\t'Valdez–Cordova Census Area'@en\t\"02261\"\n", "P882\tQ43983\t'FIPS 6-4 (US counties)'@en\t'Wrangell'@en\t\"02275\"\n", "P882\tQ487681\t'FIPS 6-4 (US counties)'@en\t'Yakutat'@en\t\"02282\"\n", "P882\tQ500818\t'FIPS 6-4 (US counties)'@en\t'Yukon–Koyukuk Census Area'@en\t\"02290\"\n", "P882\tQ58771\t'FIPS 6-4 (US counties)'@en\t'Apache County'@en\t\"04001\"\n", "P882\tQ58774\t'FIPS 6-4 (US counties)'@en\t'Cochise County'@en\t\"04003\"\n", "P882\tQ58684\t'FIPS 6-4 (US counties)'@en\t'Coconino County'@en\t\"04005\"\n", "P882\tQ58686\t'FIPS 6-4 (US counties)'@en\t'Gila County'@en\t\"04007\"\n", "P882\tQ58692\t'FIPS 6-4 (US counties)'@en\t'Graham County'@en\t\"04009\"\n", "P882\tQ58683\t'FIPS 6-4 (US counties)'@en\t'Greenlee County'@en\t\"04011\"\n", "P882\tQ58759\t'FIPS 6-4 (US 
counties)'@en\t'La Paz County'@en\t\"04012\"\n", "P882\tQ58691\t'FIPS 6-4 (US counties)'@en\t'Maricopa County'@en\t\"04013\"\n", "P882\tQ58696\t'FIPS 6-4 (US counties)'@en\t'Mohave County'@en\t\"04015\"\n", "P882\tQ58694\t'FIPS 6-4 (US counties)'@en\t'Navajo County'@en\t\"04017\"\n", "P882\tQ58688\t'FIPS 6-4 (US counties)'@en\t'Pima County'@en\t\"04019\"\n", "P882\tQ58712\t'FIPS 6-4 (US counties)'@en\t'Pinal County'@en\t\"04021\"\n", "P882\tQ58689\t'FIPS 6-4 (US counties)'@en\t'Santa Cruz County'@en\t\"04023\"\n", "P882\tQ58711\t'FIPS 6-4 (US counties)'@en\t'Yavapai County'@en\t\"04025\"\n", "P882\tQ58698\t'FIPS 6-4 (US counties)'@en\t'Yuma County'@en\t\"04027\"\n", "P882\tQ61414\t'FIPS 6-4 (US counties)'@en\t'Arkansas County'@en\t\"05001\"\n", "P882\tQ61026\t'FIPS 6-4 (US counties)'@en\t'Ashley County'@en\t\"05003\"\n", "P882\tQ61086\t'FIPS 6-4 (US counties)'@en\t'Baxter County'@en\t\"05005\"\n", "P882\tQ61020\t'FIPS 6-4 (US counties)'@en\t'Benton County'@en\t\"05007\"\n", "P882\tQ61010\t'FIPS 6-4 (US counties)'@en\t'Boone County'@en\t\"05009\"\n", "P882\tQ61024\t'FIPS 6-4 (US counties)'@en\t'Bradley County'@en\t\"05011\"\n", "P882\tQ61461\t'FIPS 6-4 (US counties)'@en\t'Calhoun County'@en\t\"05013\"\n", "P882\tQ61216\t'FIPS 6-4 (US counties)'@en\t'Carroll County'@en\t\"05015\"\n", "P882\tQ61458\t'FIPS 6-4 (US counties)'@en\t'Chicot County'@en\t\"05017\"\n", "P882\tQ61200\t'FIPS 6-4 (US counties)'@en\t'Clark County'@en\t\"05019\"\n", "P882\tQ61330\t'FIPS 6-4 (US counties)'@en\t'Clay County'@en\t\"05021\"\n", "P882\tQ61039\t'FIPS 6-4 (US counties)'@en\t'Cleburne County'@en\t\"05023\"\n", "P882\tQ61032\t'FIPS 6-4 (US counties)'@en\t'Cleveland County'@en\t\"05025\"\n", "P882\tQ61358\t'FIPS 6-4 (US counties)'@en\t'Columbia County'@en\t\"05027\"\n", "P882\tQ61352\t'FIPS 6-4 (US counties)'@en\t'Conway County'@en\t\"05029\"\n", "P882\tQ61354\t'FIPS 6-4 (US counties)'@en\t'Craighead County'@en\t\"05031\"\n", "P882\tQ61005\t'FIPS 6-4 (US counties)'@en\t'Crawford 
County'@en\t\"05033\"\n", "P882\tQ61346\t'FIPS 6-4 (US counties)'@en\t'Crittenden County'@en\t\"05035\"\n", "P882\tQ61036\t'FIPS 6-4 (US counties)'@en\t'Cross County'@en\t\"05037\"\n", "P882\tQ61012\t'FIPS 6-4 (US counties)'@en\t'Dallas County'@en\t\"05039\"\n", "P882\tQ61029\t'FIPS 6-4 (US counties)'@en\t'Desha County'@en\t\"05041\"\n", "P882\tQ61478\t'FIPS 6-4 (US counties)'@en\t'Drew County'@en\t\"05043\"\n", "P882\tQ61468\t'FIPS 6-4 (US counties)'@en\t'Faulkner County'@en\t\"05045\"\n", "P882\tQ61084\t'FIPS 6-4 (US counties)'@en\t'Franklin County'@en\t\"05047\"\n", "P882\tQ61007\t'FIPS 6-4 (US counties)'@en\t'Fulton County'@en\t\"05049\"\n", "P882\tQ61077\t'FIPS 6-4 (US counties)'@en\t'Garland County'@en\t\"05051\"\n", " 0.81 real 0.58 user 0.16 sys\n" ] } ], "source": [ "!$kypher -i \"$IDS\" -i \"$LABEL\" -i \"/Users/pedroszekely/Downloads/fips-sample.tsv\" \\\n", "--match 'fips: (fips)-[]->(), external: (n1)-[l:P882 {label: p}]->(fips), label: (p)-[]->(p_label), label: (n1)-[]->(q_label)' \\\n", "--return 'p, n1, p_label, q_label, fips' " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.external-id.tsv.gz\n" ] } ], "source": [ "!echo \"$IDS\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "os.environ['IDS'] = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.external-id.tsv.gz\"" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [], "source": [ "os.environ['TIME'] = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.time.tsv.gz\"" ] }, { "cell_type": "code", "execution_count": 
null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 4.95 real 3.58 user 0.59 sys\n", "node1 node1;label class node2;year\n", "Q11285199 'Ayusan'@en Q726 2010\n", "Q11290535 'Epiphaneia'@en Q726 2010\n", "Q11296691 'Kizuna'@en Q726 2010\n", "Q11297901 'Kingsbarns'@en Q726 2010\n", "Q11343357 'Meisho Mambo'@en Q726 2010\n", "Q11350241 'Logotype'@en Q726 2010\n", "Q11351036 'Robe Tissage'@en Q726 2010\n", "Q11576902 'Nao Tamura'@en Q5 2010\n", "Q12495326 'Louis, Duke of Burgundy'@en Q5 2010\n", "Q12516585 'Suhel Fahmi'@en Q5 2010\n", "Q12981960 'Orb'@en Q726 2010\n", "Q13512747 'Vahideh Nazeri'@en Q5 2010\n", "Q15052027 'Trêve'@en Q726 2010\n", "Q16335413 'Max Alan Shatto'@en Q5 2010\n", "Q16515807 'Nami Havelková'@en Q5 2010\n", "Q16727999 'Chinawoman'@en Q5 2010\n", "Q16889222 'Oxbow'@en Q726 2010\n", "Q16950986 'Beholder'@en Q726 2010\n", "Q16963128 'Winsili'@en Q726 2010\n", "Q16971546 'Shamus Award'@en Q726 2010\n" ] } ], "source": [ "!$kypher -i \"$TIME\" -i \"$LABEL\" -i \"$ISA\" \\\n", "--match 'time: (n1)-[l:P569]->(n2), label: (n1)-[]->(lab), isa: (n1)-[]->(class)' \\\n", "--return 'n1 as node1, lab as `node1;label`, class as class, kgtk_date_year(n2) as `node2;year`' \\\n", "--where 'kgtk_date_year(n2) = 2010' \\\n", "--limit 20 \\\n", "| column -t -s $'\\t'" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", "Q16515807-P106-Q33999-285a55e8-0\tQ16515807\tP106\tQ33999\tnormal\twikibase-item\n", "Q16515807-P106-Q970153-f9c11847-0\tQ16515807\tP106\tQ970153\tnormal\twikibase-item\n", "Q16515807-P1477-69fe1d-f8504ec5-0\tQ16515807\tP1477\t'Natálie Miroslava Havelková'@cs\tnormal\tmonolingualtext\n", "Q16515807-P19-Q155993-9c796f27-0\tQ16515807\tP19\tQ155993\tnormal\twikibase-item\n", 
"Q16515807-P21-Q6581072-70378435-0\tQ16515807\tP21\tQ6581072\tnormal\twikibase-item\n", "Q16515807-P2605-8cb85f-9a0573db-0\tQ16515807\tP2605\t\"292876\"\tnormal\texternal-id\n", "Q16515807-P27-Q213-98d068e5-0\tQ16515807\tP27\tQ213\tnormal\twikibase-item\n", "Q16515807-P31-Q5-3aba8c99-0\tQ16515807\tP31\tQ5\tnormal\twikibase-item\n", "Q16515807-P569-42a69c-36932550-0\tQ16515807\tP569\t^2010-00-00T00:00:00Z/7\tnormal\ttime\n", "Q16515807-P735-Q28732407-65ef2f48-0\tQ16515807\tP735\tQ28732407\tnormal\twikibase-item\n", "Q16515807-P735-Q923005-d5e0f80d-0\tQ16515807\tP735\tQ923005\tnormal\twikibase-item\n", " 0.99 real 0.55 user 0.18 sys\n" ] } ], "source": [ "!$kypher -i \"$CLAIMS\" \\\n", "--match '(n1:Q16515807)-[l]-(n2)'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "kgtk", "language": "python", "name": "kgtk" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.8" } }, "nbformat": 4, "nbformat_minor": 4 }