# Add Derived Graphs To The Tutorial Graph



In [1]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd
from IPython.display import display, HTML

import papermill as pm

sys.path.insert(0,'../..')
from kgtk.configure_kgtk_notebooks import ConfigureKGTK

from kgtk.functions import kgtk, kypher

In [2]:
# Parameters
# NOTE(review): presumably this is the papermill "parameters" cell, so the
# values below are only defaults — confirm the cell tag.

# Root of the local KGTK checkout; also used later to locate
# kgtk-properties/kgtk.properties.tsv.
kgtk_path = "/Users/pedroszekely/Documents/GitHub/kgtk"

# input_path: folder that holds the input graph (all.tsv.gz).
# output_path / project_name: passed to configure_kgtk, which sets up the
# output and temporary folders for this project.
input_path = "/Users/pedroszekely/Downloads/kypher/projects/build-tutorial"
output_path = "/Users/pedroszekely/Downloads/kypher/projects"
project_name = "tutorial-derived-graphs"
# Destination folder the finished tutorial files are copied to.
tutorial_files_path = "/Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/datasets/arnold"

In [3]:
# Only the combined graph is requested; the printed environment further down
# lists it under the name `all`.
files = ["all"]

# Build the notebook configuration from the parameters cell.
ck = ConfigureKGTK(files, kgtk_path=kgtk_path)
ck.configure_kgtk(
    input_graph_path=input_path,
    output_path=output_path,
    project_name=project_name,
)

User home: /Users/amandeep
Current dir: /Users/amandeep/Github/kgtk/tutorial/build-kg
KGTK dir: /Users/amandeep/Github/kgtk
Use-cases dir: /Users/amandeep/Github/kgtk/use-cases


In [4]:
# Show the environment variables (OUT, TEMP, STORE, ...) that later cells
# reference through os.environ and through `$VAR` in shell commands.
ck.print_env_variables()

OUT: /Volumes/saggu-ssd/arnold-2/tutorial-derived-graphs
TEMP: /Volumes/saggu-ssd/arnold-2/tutorial-derived-graphs/temp.tutorial-derived-graphs
STORE: /Volumes/saggu-ssd/arnold-2/tutorial-derived-graphs/temp.tutorial-derived-graphs/wikidata.sqlite3.db
kypher: kgtk query --graph-cache /Volumes/saggu-ssd/arnold-2/tutorial-derived-graphs/temp.tutorial-derived-graphs/wikidata.sqlite3.db
GRAPH: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold
USE_CASES_DIR: /Users/amandeep/Github/kgtk/use-cases
kgtk: kgtk
EXAMPLES_DIR: /Users/amandeep/Github/kgtk/examples
all: /Volumes/saggu-ssd/kgtk-tutorial-files/datasets/arnold/all.tsv.gz


Turn on debugging for kypher

In [5]:
# Export everything in one step so the shell commands and kgtk invocations
# below can refer to these values as $tutorial_files_path, $kgtk_path, etc.
os.environ.update({
    'tutorial_files_path': tutorial_files_path,
    'kgtk_path': kgtk_path,
    # Reuse the store that configure_kgtk already set up as the graph cache.
    'KGTK_GRAPH_CACHE': os.environ['STORE'],
    # English labels produced by the partition step.
    'KGTK_LABEL_FILE': os.environ['OUT'] + "/parts/labels.en.tsv.gz",
    # Turns on the debug output (SQL translations) seen in later cells.
    'KGTK_OPTION_DEBUG': "true",
})

Load all my files into the kypher cache so that all graph aliases are defined

In [6]:
# Import the configured input file(s) into the graph cache; the echoed command
# below registers all.tsv.gz under the alias `all`.
ck.load_files_into_cache()

kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs/temp.tutorial-derived-graphs/wikidata.sqlite3.db -i "/Users/pedroszekely/Downloads/kypher/projects/build-tutorial/all.tsv.gz" --as all  --limit 3
[2021-10-10 11:51:26 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_27 AS graph_27_c1
     LIMIT ?
  PARAS: [3]
---------------------------------------------
node1	label	node2	id
P10	P31	Q18610173	P10-P31-Q18610173-85ef4d24-0
P1000	P31	Q18608871	P1000-P31-Q18608871-093affb5-0
P1001	P1647	P276	P1001-P1647-P276-e4e44f83-0


In [7]:
# Make the project output folder the working directory for the rest of the notebook.
%cd {os.environ['OUT']}

/Users/pedroszekely/Downloads/kypher/projects/tutorial-derived-graphs


## Run partition notebook

We need the parts to run the Useful Files notebook

In [8]:
# Execute the partition notebook with papermill: it reads the combined graph
# and writes the per-datatype files into $OUT/parts. The executed copy of the
# notebook is kept in $TEMP for inspection.
pm.execute_notebook(
    os.environ["EXAMPLES_DIR"] + "/partition-wikidata.ipynb",
    os.environ["TEMP"] + "/partition-wikidata.out.ipynb",
    parameters=dict(
        wikidata_input_path = input_path + "/all.tsv.gz",
        wikidata_parts_path = os.environ["OUT"] + "/parts",
        temp_folder_path = os.environ["OUT"] + "/parts/temp",
        # $OUT is left unexpanded on purpose: it is resolved by the shell
        # inside the executed notebook.
        sort_extras = "--buffer-size 30% --temporary-directory $OUT/parts/temp",
        verbose = False,
        gzip_command = 'gzip'
    )
);
# The trailing `;` on the call suppresses the echo of the returned notebook
# object. A `;` on a line of its own is not valid Python; IPython only accepts
# it because a leading `;` is its auto-quote escape, which is why the cell
# used to display ''.

Executing:   0%|          | 0/49 [00:00<?, ?cell/s]

''

Show the files after partition

In [9]:
# List the files produced by the partition notebook.
!ls $OUT/parts

aliases.en.tsv.gz                   metadata.property.datatypes.tsv.gz
aliases.tsv.gz                      metadata.types.tsv.gz
all.tsv.gz                          qualifiers.commonsMedia.tsv.gz
claims.commonsMedia.tsv.gz          qualifiers.external-id.tsv.gz
claims.external-id.tsv.gz           qualifiers.geo-shape.tsv.gz
claims.geo-shape.tsv.gz             qualifiers.globe-coordinate.tsv.gz
claims.globe-coordinate.tsv.gz      qualifiers.math.tsv.gz
claims.math.tsv.gz                  qualifiers.monolingualtext.tsv.gz
claims.monolingualtext.tsv.gz       qualifiers.musical-notation.tsv.gz
claims.musical-notation.tsv.gz      qualifiers.quantity.tsv.gz
claims.other.tsv.gz                 qualifiers.string.tsv.gz
claims.quantity.tsv.gz              qualifiers.tabular-data.tsv.gz
claims.string.tsv.gz                qualifiers.time.tsv.gz
claims.tabular-data.tsv.gz          qualifiers.tsv.gz
claims.time.tsv.gz                  qualifiers.url.tsv.gz
claims.tsv.gz                       quali

Deploy the parts to `$tutorial_files_path`

## Run useful files notebook

In [9]:
# Execute the "Wikidata Useful Files" notebook with papermill on the
# partitioned files in $OUT/parts. Its results are written to
# $OUT/useful_files (output_path + project_name).
pm.execute_notebook(
    os.environ["USE_CASES_DIR"] + "/Wikidata Useful Files.ipynb",
    os.environ["TEMP"] + "/Wikidata Useful Files Out.ipynb",
    parameters=dict(
        output_path = os.environ["OUT"],
        project_name = "useful_files",
        kgtk_path = kgtk_path,
        input_path = os.environ["OUT"] + "/parts",
        files = 'claims,label,label_all,alias,alias_all,description,description_all,item',
        # Share this notebook's graph cache with the executed notebook.
        graph_cache_path = os.environ['STORE'],
        languages = 'en',
        # Pagerank and degree files are needed by the cells below; HITS and
        # the table-linker files are not part of the tutorial data.
        compute_pagerank = True,
        compute_degrees = True,
        compute_hits = False,
        compute_table_linker_files = False,
        debug = "false"
    )
);
# The trailing `;` on the call suppresses the echo of the returned notebook
# object. A `;` on a line of its own is not valid Python; IPython only accepts
# it because a leading `;` is its auto-quote escape, which is why the cell
# used to display ''.

Executing:   0%|          | 0/151 [00:00<?, ?cell/s]

''

In [10]:
# List the files produced by the useful-files notebook.
!ls -l $OUT/useful_files

total 98312
-rw-r--r--   1 amandeep  staff   1344059 Oct 12 16:32 aliases.en.tsv.gz
-rw-r--r--   1 amandeep  staff    390973 Oct 12 16:32 derived.P279.tsv.gz
-rw-r--r--   1 amandeep  staff   3325552 Oct 12 16:33 derived.P279star.tsv.gz
-rw-r--r--   1 amandeep  staff   1181395 Oct 12 16:32 derived.P31.tsv.gz
-rw-r--r--   1 amandeep  staff  12814647 Oct 12 16:34 derived.P31P279star.tsv.gz
-rw-r--r--   1 amandeep  staff    499128 Oct 12 16:33 derived.isa.tsv.gz
-rw-r--r--   1 amandeep  staff  15435743 Oct 12 16:33 derived.isastar.tsv.gz
-rw-r--r--   1 amandeep  staff   1341047 Oct 12 16:32 descriptions.en.tsv.gz
-rw-r--r--   1 amandeep  staff   1065321 Oct 12 16:32 labels.en.tsv.gz
-rw-r--r--   1 amandeep  staff    309510 Oct 12 16:35 metadata.in_degree.tsv.gz
-rw-r--r--   1 amandeep  staff    585326 Oct 12 16:34 metadata.out_degree.tsv.gz
-rw-r--r--   1 amandeep  staff   2192335 Oct 12 16:34 metadata.pagerank.directed.tsv.gz
-rw-r--r--   1 amandeep  staff   2426762 Oct 12 16:34 metadata.

## Enhance pagerank files to include ordinal

Approach:
- Load the `directed_pagerank` edges from the metadata file into a dataframe (using kypher because `cat` is somehow broken, sigh)
- Sort the file by pagerank descending
- Add a new column with header `P1545` (ordinal) and store the ranks in this column
- Store the result in a temporary file.

In [11]:
%%time
directed_pagerank = kgtk("""
    query -i $OUT/useful_files/metadata.pagerank.directed.tsv.gz 
    --match '(n1)-[l:Pdirected_pagerank]->(pagerank)'
""")

directed_pagerank_sorted = directed_pagerank.sort_values("node2", ascending=False)
directed_pagerank_sorted.insert(0, 'P1545', range(1, 1 + len(directed_pagerank_sorted)))
directed_pagerank_sorted.to_csv(f"{os.environ['TEMP']}/directed-pagerank.ordinal.tsv", index=False, sep='\t')
directed_pagerank_sorted

[2021-10-12 16:35:42 sqlstore]: IMPORT graph directly into table graph_1 from /Volumes/saggu-ssd/arnold-2/tutorial-derived-graphs/useful_files/metadata.pagerank.directed.tsv.gz ...
[2021-10-12 16:35:43 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_1 AS graph_1_c1
     WHERE graph_1_c1."label" = ?
  PARAS: ['Pdirected_pagerank']
---------------------------------------------
[2021-10-12 16:35:43 sqlstore]: CREATE INDEX on table graph_1 column label ...
[2021-10-12 16:35:43 sqlstore]: ANALYZE INDEX on table graph_1 column label ...

CPU times: user 658 ms, sys: 99.8 ms, total: 758 ms
Wall time: 2.81 s


Unnamed: 0,P1545,node1,label,node2,id
26562,1,Q23958852,Pdirected_pagerank,0.071410,Q23958852-Pdirected_pagerank-79688
42551,2,Q23960977,Pdirected_pagerank,0.032866,Q23960977-Pdirected_pagerank-127655
14856,3,Q35120,Pdirected_pagerank,0.028596,Q35120-Pdirected_pagerank-44570
11192,4,Q151885,Pdirected_pagerank,0.026957,Q151885-Pdirected_pagerank-33578
439,5,Q5,Pdirected_pagerank,0.012807,Q5-Pdirected_pagerank-1319
...,...,...,...,...,...
38463,66010,Q207482,Pdirected_pagerank,0.000002,Q207482-Pdirected_pagerank-115391
38462,66011,Q20747487,Pdirected_pagerank,0.000002,Q20747487-Pdirected_pagerank-115388
38455,66012,Q20746713,Pdirected_pagerank,0.000002,Q20746713-Pdirected_pagerank-115367
38453,66013,Q20746702,Pdirected_pagerank,0.000002,Q20746702-Pdirected_pagerank-115361


The temporary file looks good, next steps:
- `normalize` to put the qualifiers as extra edges so the file has only `node1/label/node2/id`
- `add-id`, as we want all edges to have ids

In [12]:
# Turn the extra P1545 column into separate qualifier edges (normalize), give
# every edge an id (add-id), and write the result next to the other useful files.
kgtk("""
    normalize -i "$TEMP"/directed-pagerank.ordinal.tsv
    / add-id --id-style wikidata 
    -o "$OUT"/useful_files/metadata.pagerank.directed.ordinal.tsv.gz
""")

Look at the result to confirm that we are generating the data we want.

In [13]:
# Preview the first edges of the generated file with labels added.
kgtk("""
    head -i "$OUT"/useful_files/metadata.pagerank.directed.ordinal.tsv.gz / add-labels
""")

Unnamed: 0,node1,label,node2,id,node1;label,label;label
0,Q23958852,Pdirected_pagerank,0.07141,Q23958852-Pdirected_pagerank-79688,'variable-order class'@en,
1,Q23958852-Pdirected_pagerank-79688,P1545,1.0,Q23958852-Pdirected_pagerank-79688-P1545-6b86b2,,'series ordinal'@en
2,Q23960977,Pdirected_pagerank,0.032866,Q23960977-Pdirected_pagerank-127655,'(meta)class'@en,
3,Q23960977-Pdirected_pagerank-127655,P1545,2.0,Q23960977-Pdirected_pagerank-127655-P1545-d4735e,,'series ordinal'@en
4,Q35120,Pdirected_pagerank,0.028596,Q35120-Pdirected_pagerank-44570,'entity'@en,
5,Q35120-Pdirected_pagerank-44570,P1545,3.0,Q35120-Pdirected_pagerank-44570-P1545-4e0740,,'series ordinal'@en
6,Q151885,Pdirected_pagerank,0.026957,Q151885-Pdirected_pagerank-33578,'concept'@en,
7,Q151885-Pdirected_pagerank-33578,P1545,4.0,Q151885-Pdirected_pagerank-33578-P1545-4b2277,,'series ordinal'@en
8,Q5,Pdirected_pagerank,0.012807,Q5-Pdirected_pagerank-1319,'human'@en,
9,Q5-Pdirected_pagerank-1319,P1545,5.0,Q5-Pdirected_pagerank-1319-P1545-ef2d12,,'series ordinal'@en


Repeat the same steps for `undirected_pagerank`

In [14]:
%%time
undirected_pagerank = kgtk("""
    query -i $OUT/useful_files/metadata.pagerank.undirected.tsv.gz 
    --match '(n1)-[l:Pundirected_pagerank]->(pagerank)'
""")

undirected_pagerank = undirected_pagerank.sort_values("node2", ascending=False)
undirected_pagerank.insert(0, 'P1545', range(1, 1 + len(undirected_pagerank)))
undirected_pagerank.to_csv(f"{os.environ['TEMP']}/undirected-pagerank.ordinal.tsv", index=False, sep='\t')
undirected_pagerank

[2021-10-12 16:36:03 sqlstore]: IMPORT graph directly into table graph_2 from /Volumes/saggu-ssd/arnold-2/tutorial-derived-graphs/useful_files/metadata.pagerank.undirected.tsv.gz ...
[2021-10-12 16:36:04 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_2 AS graph_2_c1
     WHERE graph_2_c1."label" = ?
  PARAS: ['Pundirected_pagerank']
---------------------------------------------
[2021-10-12 16:36:04 sqlstore]: CREATE INDEX on table graph_2 column label ...
[2021-10-12 16:36:04 sqlstore]: ANALYZE INDEX on table graph_2 column label ...

CPU times: user 698 ms, sys: 104 ms, total: 802 ms
Wall time: 2.89 s


Unnamed: 0,P1545,node1,label,node2,id
439,1,Q5,Pundirected_pagerank,0.022010,Q5-Pundirected_pagerank-1319
173,2,Q30,Pundirected_pagerank,0.012919,Q30-Pundirected_pagerank-521
4782,3,Q6581097,Pundirected_pagerank,0.008353,Q6581097-Pundirected_pagerank-14348
7097,4,Q15221623,Pundirected_pagerank,0.004738,Q15221623-Pundirected_pagerank-21293
1391,5,Q1860,Pundirected_pagerank,0.004441,Q1860-Pundirected_pagerank-4175
...,...,...,...,...,...
64005,66010,Q7958659,Pundirected_pagerank,0.000003,Q7958659-Pundirected_pagerank-192017
32231,66011,Q17021934,Pundirected_pagerank,0.000003,Q17021934-Pundirected_pagerank-96695
45319,66012,Q27890917,Pundirected_pagerank,0.000003,Q27890917-Pundirected_pagerank-135959
35443,66013,Q10876480,Pundirected_pagerank,0.000003,Q10876480-Pundirected_pagerank-106331


In [15]:
# Turn the extra P1545 column into separate qualifier edges (normalize), give
# every edge an id (add-id), and write the result next to the other useful files.
kgtk("""
    normalize -i "$TEMP"/undirected-pagerank.ordinal.tsv
    / add-id --id-style wikidata 
    -o "$OUT"/useful_files/metadata.pagerank.undirected.ordinal.tsv.gz
""")

In [16]:
# Preview the first edges of the generated file with labels added.
kgtk("""
    head -i "$OUT"/useful_files/metadata.pagerank.undirected.ordinal.tsv.gz / add-labels
""")

Unnamed: 0,node1,label,node2,id,node1;label,label;label
0,Q5,Pundirected_pagerank,0.02201,Q5-Pundirected_pagerank-1319,'human'@en,
1,Q5-Pundirected_pagerank-1319,P1545,1.0,Q5-Pundirected_pagerank-1319-P1545-6b86b2,,'series ordinal'@en
2,Q30,Pundirected_pagerank,0.012919,Q30-Pundirected_pagerank-521,'United States of America'@en,
3,Q30-Pundirected_pagerank-521,P1545,2.0,Q30-Pundirected_pagerank-521-P1545-d4735e,,'series ordinal'@en
4,Q6581097,Pundirected_pagerank,0.008353,Q6581097-Pundirected_pagerank-14348,'male'@en,
5,Q6581097-Pundirected_pagerank-14348,P1545,3.0,Q6581097-Pundirected_pagerank-14348-P1545-4e0740,,'series ordinal'@en
6,Q15221623,Pundirected_pagerank,0.004738,Q15221623-Pundirected_pagerank-21293,'bilateral relation'@en,
7,Q15221623-Pundirected_pagerank-21293,P1545,4.0,Q15221623-Pundirected_pagerank-21293-P1545-4b2277,,'series ordinal'@en
8,Q1860,Pundirected_pagerank,0.004441,Q1860-Pundirected_pagerank-4175,'English'@en,
9,Q1860-Pundirected_pagerank-4175,P1545,5.0,Q1860-Pundirected_pagerank-4175-P1545-ef2d12,,'series ordinal'@en


## Deploy the tutorial files to `$tutorial_files_path`

Define the files we want to have in the tutorial

In [18]:
# Files taken from the partition output ($OUT/parts). Every entry shares the
# ".tsv.gz" suffix, so only the distinguishing stem is spelled out.
tutorial_files_parts = [
    f"{stem}.tsv.gz"
    for stem in (
        "labels.en",
        "aliases.en",
        "descriptions.en",
        "claims.external-id",
        "claims.monolingualtext",
        "claims.quantity",
        "claims.string",
        "claims.time",
        "claims.wikibase-item",
        "claims.wikibase-property",
        "qualifiers",
    )
]

# Files taken from the useful-files output ($OUT/useful_files). The pagerank
# files are not listed here because they are deployed separately.
tutorial_files_useful = [
    f"{stem}.tsv.gz"
    for stem in (
        "derived.P279",
        "derived.P279star",
        "derived.P31",
        "metadata.in_degree",
        "metadata.out_degree",
    )
]

Deploy the files from the partition and useful notebooks. 

In [19]:
for file in tutorial_files_parts:
    path = "$OUT/parts/" + file
    !cp -p {path} $tutorial_files_path

for file in tutorial_files_useful:
    path = "$OUT/useful_files/" + file
    !cp -p {path} $tutorial_files_path

Overwrite the original pagerank files with the ones that include ordinal

In [20]:
!cp -p $OUT/useful_files/metadata.pagerank.directed.ordinal.tsv.gz $tutorial_files_path/metadata.pagerank.directed.tsv.gz
!cp -p $OUT/useful_files/metadata.pagerank.undirected.ordinal.tsv.gz $tutorial_files_path/metadata.pagerank.undirected.tsv.gz 

It is important to deploy the custom KGTK properties file. Copy it using KGTK to conveniently compress the file.

In [21]:
# Concatenate the custom KGTK properties with the property datatypes from the
# partition step and write the combined, gzip-compressed file to the tutorial folder.
kgtk("""
    cat 
        -i "$kgtk_path"/kgtk-properties/kgtk.properties.tsv 
        -i "$OUT"/parts/metadata.property.datatypes.tsv.gz
        -o "$tutorial_files_path"/metadata.property.datatypes.tsv.gz
""")

In [22]:
# List what has been deployed to the tutorial folder so far.
!ls -l "$tutorial_files_path"

total 181880
-rw-r--r--  1 pedroszekely  staff   1342345 Oct 10 11:52 aliases.en.tsv.gz
-rw-r--r--  1 pedroszekely  staff  44564618 Oct 10 11:36 all.tsv.gz
-rw-r--r--  1 pedroszekely  staff  13620313 Oct 10 11:52 claims.external-id.tsv.gz
-rw-r--r--  1 pedroszekely  staff   1069769 Oct 10 11:52 claims.monolingualtext.tsv.gz
-rw-r--r--  1 pedroszekely  staff   1936951 Oct 10 11:52 claims.quantity.tsv.gz
-rw-r--r--  1 pedroszekely  staff   1095875 Oct 10 11:52 claims.string.tsv.gz
-rw-r--r--  1 pedroszekely  staff    781182 Oct 10 11:52 claims.time.tsv.gz
-rw-r--r--  1 pedroszekely  staff   6332200 Oct 10 11:52 claims.wikibase-item.tsv.gz
-rw-r--r--  1 pedroszekely  staff     97267 Oct 10 11:52 claims.wikibase-property.tsv.gz
-rw-r--r--  1 pedroszekely  staff    390973 Oct 10 11:53 derived.P279.tsv.gz
-rw-r--r--  1 pedroszekely  staff   3325552 Oct 10 11:54 derived.P279star.tsv.gz
-rw-r--r--  1 pedroszekely  staff   1181395 Oct 10 11:53 derived.P31.tsv.gz
-rw-r--r--  1 pedroszekely  staf

Create an `all.tsv.gz` file

In [23]:
%%time
all_file_path = os.environ['tutorial_files_path'] + "/all.tsv.gz"
if os.path.exists(all_file_path):
    !rm {all_file_path}
!kgtk cat -i "$tutorial_files_path"/*.tsv.gz -o {all_file_path}

CPU times: user 285 ms, sys: 105 ms, total: 390 ms
Wall time: 24 s


Peek at the file

In [24]:
kgtk("""
    head -i "$tutorial_files_path"/all.tsv.gz
""")

Unnamed: 0,node1,label,node2,id,node2;wikidatatype
0,P10,alias,'gif'@en,P10-alias-en-282226-0,
1,P10,alias,'animation'@en,P10-alias-en-2f86d8-0,
2,P10,alias,'media'@en,P10-alias-en-c1427e-0,
3,P10,alias,'trailer (Commons)'@en,P10-alias-en-c61ab1-0,
4,P1001,alias,'belongs to jurisdiction'@en,P1001-alias-en-0dd7ce-0,
5,P1001,alias,'linked to jurisdiction'@en,P1001-alias-en-106818-0,
6,P1001,alias,'of jurisdiction'@en,P1001-alias-en-7e4abe-0,
7,P1001,alias,'applied to jurisdiction'@en,P1001-alias-en-89ed18-0,
8,P1001,alias,'jurisdiction'@en,P1001-alias-en-a524ab-0,
9,P1001,alias,'valid in jurisdiction'@en,P1001-alias-en-ca2e7c-0,


Run the KGTK validator on the new knowledge graph

In [25]:
%%time
# Run the KGTK validator over the combined file.
# NOTE(review): the three flags presumably relax the checks on
# language-qualified strings and on the allowed year range — confirm against
# the `kgtk validate` documentation.
!kgtk validate -i "$tutorial_files_path"/all.tsv.gz \
    --allow-wikidata-lq-strings True \
    --ignore-minimum-year True \
    --ignore-maximum-year True


Data lines read: 2614949
Data lines passed: 2614949
CPU times: user 1.34 s, sys: 426 ms, total: 1.76 s
Wall time: 2min 3s
