# Profiling To Support The Browser



### Preamble: set up the environment and files used in the tutorial

In [1]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd
from IPython.display import display, HTML

from graph_tool.all import *

import papermill as pm

from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher

In [2]:
# Parameters
# NOTE(review): these defaults are absolute paths on the author's machine;
# override them when running the notebook elsewhere (papermill is imported
# above, so this is presumably the papermill parameters cell - confirm).

# Location of the local KGTK checkout.
kgtk_path = "/Users/pedroszekely/Documents/GitHub/kgtk"

# Folder containing the input Wikidata graph files.
# Superseded dataset, kept for reference only (it used to be assigned and then
# immediately overwritten):
#   "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215/data/"
input_path = "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2/"

# Folder on local machine where to create the output and temporary folders
output_path = "/Users/pedroszekely/Downloads/kypher/projects"

# SQLite graph cache file used by the kypher queries.
graph_cache_path = "/Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db"

# Name of the sub-folder created under output_path for this notebook's results.
project_name = "browser-profiling"

Our Wikidata distribution partitions the knowledge in Wikidata into smaller files, so you can pick and choose which files you want to use. Our tutorial KG is a subset of Wikidata and is partitioned in the same way as the full Wikidata. The following cell lists the subset of files used in this notebook:

In [3]:
# Aliases of the graph files this notebook registers with KGTK.
files = ["claims", "item", "p279star", "label"]

# Build the configuration helper, then point it at the input, output and
# graph-cache locations defined in the parameters cell.
ck = ConfigureKGTK(files, kgtk_path=kgtk_path)
ck.configure_kgtk(
    input_graph_path=input_path,
    output_path=output_path,
    graph_cache_path=graph_cache_path,
    project_name=project_name,
    debug=True,
)

User home: /Users/pedroszekely
Current dir: /Users/pedroszekely/Documents/GitHub/kgtk-tutorial-files/use-cases
KGTK dir: /Users/pedroszekely/Documents/GitHub/kgtk
Use-cases dir: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases


The KGTK setup command defines environment variables for all the files so that you can reuse the Jupyter notebook when you install it on your local machine.

In [4]:
# Print the environment variables created by the configuration step: the
# GRAPH/OUT/TEMP folders, the kgtk/kypher command shortcuts, and one variable
# per file alias (claims, item, ...), as shown in the output below.
ck.print_env_variables()

GRAPH: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2/
OUT: /Users/pedroszekely/Downloads/kypher/projects/browser-profiling
kgtk: kgtk --debug
kypher: kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db
KGTK_GRAPH_CACHE: /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db
KGTK_LABEL_FILE: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//labels.en.tsv.gz
STORE: /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db
KGTK_OPTION_DEBUG: false
USE_CASES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases
TEMP: /Users/pedroszekely/Downloads/kypher/projects/browser-profiling/temp.browser-profiling
EXAMPLES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/examples
claims: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//claims.tsv.gz
item: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//claims.wikibase-item.tsv.gz
p279star: /Volumes/GoogleDri

In [5]:
# Load the configured files into the graph cache. Per the debug output below,
# this runs a single kypher query over all file aliases with --limit 3.
# In the logged run the claims import alone took about an hour
# (20:06 -> 21:09), so expect this cell to be slow on a cold cache.
ck.load_files_into_cache()

kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db -i "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//claims.tsv.gz" --as claims  -i "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//claims.wikibase-item.tsv.gz" --as item  -i "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//derived.P279star.tsv.gz" --as p279star  -i "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2//labels.en.tsv.gz" --as label  --limit 3
[2021-12-11 20:06:46 sqlstore]: IMPORT graph directly into table graph_4 from /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20210215-dwd-v2/claims.tsv.gz ...
[2021-12-11 21:09:05 query]: SQL Translation:
---------------------------------------------
  SELECT *
     FROM graph_4 AS graph_4_c1
     LIMIT ?
  PARAS: [3]
---------------------------------------------
id	node1	label	node2	rank	node2;wikidatatype
P10-P1628-32b85d-7927ece

In [None]:
# best indexing for this notebook
# !kgtk --debug query -i claims --idx mode:graph -i p279star --idx mode:monograph

In [13]:
# How to drop existing indices in order to create new ones
# !kgtk --debug query -i claims --idx mode:clear mode:graph -i p279star --idx mode:clear mode:monograph --limit 5

### Profile incoming edges


In [19]:
# Count incoming statements per (target node, property).
# Input is the `item` alias (claims.wikibase-item.tsv.gz per the environment
# listing above). Each result row is written in KGTK edge form:
#   node1 = target node n2, label = "Pincoming_statement_count",
#   node2 = property, extra column P1114 = count(edgeid).
# The result is saved to $OUT/incoming.property.count.tsv.gz for the cells below.
kgtk("""
    query -i item
        --match '(n1)-[edgeid {label: property}]->(n2)'
        --return 'distinct n2 as node1, "Pincoming_statement_count" as label, property as node2, count(edgeid) as P1114'
    -o $OUT/incoming.property.count.tsv.gz
""")

In [26]:
# Keep only the (node, property) pairs whose incoming count (column P1114)
# is greater than 25, ordered by node and then by descending count.
# cast(quantity, int) makes both the comparison and the ordering numeric.
# The filtered edges are saved to $OUT/incoming.property.count.25.tsv.gz.
kgtk("""
    query -i $OUT/incoming.property.count.tsv.gz
        --match '(n1)-[e {P1114: quantity}]->(property)'
        --where 'cast(quantity, int) > 25'
        --order-by 'n1, cast(quantity, int) desc'
    -o $OUT/incoming.property.count.25.tsv.gz
""")

In [35]:
# Size check: number of lines in the filtered incoming-count file.
# wc -l counts every line of the decompressed stream, so the figure
# presumably includes the TSV header line.
!zcat < $OUT/incoming.property.count.25.tsv.gz | wc -l

  342816


In [45]:
# For each property, count the distinct nodes that have more than 25 incoming
# statements with that property, and show the 25 properties with the highest
# counts. add-labels appends a label column for the property identifiers.
kgtk("""
    query -i $OUT/incoming.property.count.25.tsv.gz
        --match '(n1)-[e {P1114: quantity}]->(property)'
        --return 'property as property, count(distinct n1) as total'
        --order-by 'cast(total, int) desc'
        --limit 25
    / add-labels
""")

Unnamed: 0,property,total,property;label
0,P131,53811,'located in the administrative territorial ent...
1,P171,19082,'parent taxon'@en
2,P361,18820,'part of'@en
3,P684,17790,'ortholog'@en
4,P31,13658,'instance of'@en
5,P734,12763,'family name'@en
6,P161,12532,'cast member'@en
7,P19,11403,'place of birth'@en
8,P735,9824,'given name'@en
9,P54,8519,'member of sports team'@en


In [28]:
# Peek at the first rows of the filtered incoming counts, with labels added
# for the node and property identifiers.
kgtk("head -i $OUT/incoming.property.count.25.tsv.gz / add-labels")

Unnamed: 0,node1,label,node2,P1114,node1;label,node2;label
0,Q100,Pincoming_statement_count,P19,4141,'Boston'@en,'place of birth'@en
1,Q100,Pincoming_statement_count,P20,1649,'Boston'@en,'place of death'@en
2,Q100,Pincoming_statement_count,P291,1572,'Boston'@en,'place of publication'@en
3,Q100,Pincoming_statement_count,P937,1305,'Boston'@en,'work location'@en
4,Q100,Pincoming_statement_count,P159,1026,'Boston'@en,'headquarters location'@en
5,Q100,Pincoming_statement_count,P131,1006,'Boston'@en,'located in the administrative territorial ent...
6,Q100,Pincoming_statement_count,P551,355,'Boston'@en,'residence'@en
7,Q100,Pincoming_statement_count,P1071,336,'Boston'@en,'location of creation'@en
8,Q100,Pincoming_statement_count,P740,259,'Boston'@en,'location of formation'@en
9,Q100,Pincoming_statement_count,P840,195,'Boston'@en,'narrative location'@en


In [31]:
# Example lookup: incoming property counts (> 25) for a single entity,
# Q76 (labelled 'Barack Obama' in the output below).
kgtk("""
    query -i $OUT/incoming.property.count.25.tsv.gz
        --match '(entity)-[e {P1114: quantity}]->(property)'
        --where 'entity = "Q76"'
    / add-labels
""")

Unnamed: 0,node1,label,node2,P1114,node1;label,node2;label
0,Q76,Pincoming_statement_count,P50,302,'Barack Obama'@en,'author'@en
1,Q76,Pincoming_statement_count,P991,109,'Barack Obama'@en,'successful candidate'@en
2,Q76,Pincoming_statement_count,P726,106,'Barack Obama'@en,'candidate'@en
3,Q76,Pincoming_statement_count,P921,47,'Barack Obama'@en,'main subject'@en
4,Q76,Pincoming_statement_count,P1891,37,'Barack Obama'@en,'signatory'@en
5,Q76,Pincoming_statement_count,P161,34,'Barack Obama'@en,'cast member'@en
6,Q76,Pincoming_statement_count,P138,33,'Barack Obama'@en,'named after'@en
7,Q76,Pincoming_statement_count,P823,30,'Barack Obama'@en,'speaker'@en


In [32]:
# --- Profile outgoing edges ---
# Count outgoing statements per (source node, property).
# Unlike the incoming profile, which reads the `item` alias, this reads the
# full `claims` alias (claims.tsv.gz per the environment listing above).
# Each result row is written in KGTK edge form:
#   node1 = source node n1, label = "Poutgoing_statement_count",
#   node2 = property, extra column P1114 = count(edgeid).
# The result is saved to $OUT/outgoing.property.count.tsv.gz for the cells below.
kgtk("""
    query -i claims
        --match '(n1)-[edgeid {label: property}]->(n2)'
        --return 'distinct n1 as node1, "Poutgoing_statement_count" as label, property as node2, count(edgeid) as P1114'
    -o $OUT/outgoing.property.count.tsv.gz
""")

In [33]:
# Keep only the (node, property) pairs whose outgoing count (column P1114)
# is greater than 25, ordered by node and then by descending count.
# cast(quantity, int) makes both the comparison and the ordering numeric.
# The filtered edges are saved to $OUT/outgoing.property.count.25.tsv.gz.
kgtk("""
    query -i $OUT/outgoing.property.count.tsv.gz
        --match '(n1)-[e {P1114: quantity}]->(property)'
        --where 'cast(quantity, int) > 25'
        --order-by 'n1, cast(quantity, int) desc'
    -o $OUT/outgoing.property.count.25.tsv.gz
""")

In [36]:
# Size check: number of lines in the filtered outgoing-count file.
# wc -l counts every line of the decompressed stream, so the figure
# presumably includes the TSV header line.
!zcat < $OUT/outgoing.property.count.25.tsv.gz | wc -l

  104727


In [46]:
# For each property, count the distinct nodes that have more than 25 outgoing
# statements with that property, and show the 25 properties with the highest
# counts. add-labels appends a label column for the property identifiers.
kgtk("""
    query -i $OUT/outgoing.property.count.25.tsv.gz
        --match '(n1)-[e {P1114: quantity}]->(property)'
        --return 'property as property, count(distinct n1) as total'
        --order-by 'cast(total, int) desc'
        --limit 25
    / add-labels
""")

Unnamed: 0,property,total,property;label
0,P684,17782,'ortholog'@en
1,P1087,17018,'Elo rating'@en
2,P528,8996,'catalog code'@en
3,P150,8950,'contains administrative territorial entity'@en
4,P161,5075,'cast member'@en
5,P1843,4018,'taxon common name'@en
6,P682,3726,'biological process'@en
7,P527,3171,'has part'@en
8,P1082,2777,'population'@en
9,P128,2343,'regulates (molecular biology)'@en


In [34]:
# Peek at the first rows of the filtered outgoing counts, with labels added
# for the node and property identifiers.
kgtk("head -i $OUT/outgoing.property.count.25.tsv.gz / add-labels")

Unnamed: 0,node1,label,node2,P1114,node1;label,node2;label
0,P1533,Poutgoing_statement_count,P2302,26,'family name identical to this given name'@en,'property constraint'@en
1,P154,Poutgoing_statement_count,P2302,28,'logo image'@en,'property constraint'@en
2,P18,Poutgoing_statement_count,P1659,30,'image'@en,'see also'@en
3,P2586,Poutgoing_statement_count,P2302,28,'INSEE department code'@en,'property constraint'@en
4,P3171,Poutgoing_statement_count,P1659,30,'International Olympic Committee athlete ID'@en,'see also'@en
5,P4839,Poutgoing_statement_count,P2264,85,'Wolfram Language entity code'@en,'mix'n'match catalog ID'@en
6,P5086,Poutgoing_statement_count,P2302,45,'FIPS 5-2 alpha code (US states)'@en,'property constraint'@en
7,P5087,Poutgoing_statement_count,P2302,29,'FIPS 5-2 numeric code (US states)'@en,'property constraint'@en
8,P5209,Poutgoing_statement_count,P1855,52,'ISO 3950 code'@en,'Wikidata property example'@en
9,P553,Poutgoing_statement_count,P1659,50,'website account on'@en,'see also'@en


In [40]:
# Example lookup: outgoing property counts (> 25) for a single entity,
# Q99 (labelled 'California' in the output below).
kgtk("""
    query -i $OUT/outgoing.property.count.25.tsv.gz
        --match '(entity)-[e {P1114: quantity}]->(property)'
        --where 'entity = "Q99"'
    / add-labels
""")

Unnamed: 0,node1,label,node2,P1114,node1;label,node2;label
0,Q99,Poutgoing_statement_count,P2936,61,'California'@en,'language used'@en
1,Q99,Poutgoing_statement_count,P150,58,'California'@en,'contains administrative territorial entity'@en
2,Q99,Poutgoing_statement_count,P1082,39,'California'@en,'population'@en
