# WD-AMC dataset generation in KGTK
In the following representations: qualifiers, Standard Reification and N-ary Relationships

## Setting up KGTK and loading data

In [32]:
import io
import os
import subprocess
import sys
import csv
import pandas as pd

from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher

In [33]:
# Parameters

# Folder containing the input KGTK files (claims, labels, qualifiers, ...)
input_path = "/data02/ana_iglesias/data/subset/parts"
# Folder on local machine where to create the output and temporary folders
output_path = "/data02/ana_iglesias/data/subset"
# Name of the project sub-folder created under output_path
project_name = "reframings"

In [34]:
# Logical names of the input files to register; each becomes an environment
# variable / graph alias pointing at the corresponding file under input_path.
files = [
    "all",
    "alias",
    "claims",
    "description",
    "label",
    "datatypes",
    "qualifiers"
]

# Configure the KGTK environment (paths, graph cache, $TEMP, $OUT, ...) for this project.
ck = ConfigureKGTK(files)
ck.configure_kgtk(input_graph_path=input_path,
                  output_path=output_path,
                  project_name=project_name)


User home: /data02/ana_iglesias
Current dir: /data02/ana_iglesias/data/subset
KGTK dir: /data02/ana_iglesias/data
Use-cases dir: /data02/ana_iglesias/data/use-cases


In [35]:
# Print the environment variables set up by ConfigureKGTK ($GRAPH, $TEMP, $OUT, file aliases, ...).
ck.print_env_variables()

kypher: kgtk query --graph-cache /data02/ana_iglesias/data/subset/reframings/temp.reframings/wikidata.sqlite3.db
GRAPH: /data02/ana_iglesias/data/subset/parts
KGTK_GRAPH_CACHE: /data02/ana_iglesias/data/subset/reframings/temp.reframings/wikidata.sqlite3.db
KGTK_LABEL_FILE: /data02/ana_iglesias/data/subset/parts/labels.en.tsv.gz
kgtk: kgtk
TEMP: /data02/ana_iglesias/data/subset/reframings/temp.reframings
EXAMPLES_DIR: /data02/ana_iglesias/data/examples
USE_CASES_DIR: /data02/ana_iglesias/data/use-cases
OUT: /data02/ana_iglesias/data/subset/reframings
STORE: /data02/ana_iglesias/data/subset/reframings/temp.reframings/wikidata.sqlite3.db
KGTK_OPTION_DEBUG: false
all: /data02/ana_iglesias/data/subset/parts/all.tsv.gz
alias: /data02/ana_iglesias/data/subset/parts/aliases.en.tsv.gz
claims: /data02/ana_iglesias/data/subset/parts/claims.tsv.gz
description: /data02/ana_iglesias/data/subset/parts/descriptions.en.tsv.gz
label: /data02/ana_iglesias/data/subset/parts/labels.en.tsv.gz
datatypes: /da

In [36]:
%%time
# Import every configured input file into the Kypher graph cache under its alias.
ck.load_files_into_cache()

kgtk query --graph-cache /data02/ana_iglesias/data/subset/reframings/temp.reframings/wikidata.sqlite3.db -i "/data02/ana_iglesias/data/subset/parts/all.tsv.gz" --as all  -i "/data02/ana_iglesias/data/subset/parts/aliases.en.tsv.gz" --as alias  -i "/data02/ana_iglesias/data/subset/parts/claims.tsv.gz" --as claims  -i "/data02/ana_iglesias/data/subset/parts/descriptions.en.tsv.gz" --as description  -i "/data02/ana_iglesias/data/subset/parts/labels.en.tsv.gz" --as label  -i "/data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz" --as datatypes  -i "/data02/ana_iglesias/data/subset/parts/qualifiers.tsv.gz" --as qualifiers  --limit 3
node1	label	node2	id	node2;wikidatatype
P10	P1628	"http://www.w3.org/2006/vcard/ns#Video"	P10-P1628-32b85d-7927ece6-0	url
P10	P1628	"https://schema.org/video"	P10-P1628-acf60d-b8950832-0	url
P10	P1629	Q34508	P10-P1629-Q34508-bcc39400-0	wikibase-item
CPU times: user 3.34 ms, sys: 9.93 ms, total: 13.3 ms
Wall time: 25.9 s


## Creating base claims 

Removing the claims of human nodes (and of the other entity types listed in the query below) from the `claims` file and saving the remainder as `claims_base.tsv`. The disjoint set, containing only the removed claims, is saved to `reframingclaims-temp.tsv`.

In [6]:
# Select every claim whose subject is an instance (P31) of one of the listed
# entity types, skipping the pagerank/degree pseudo-properties.
kgtk("""
    query -i claims
        --match '(s)-[t:P31]->(type),
                 (s)-[p]->(o)'
        --where 'p.label != "Pdirected_pagerank" and 
                 p.label != "Pundirected_pagerank" and 
                 p.label != "Pout_degree" and 
                 p.label != "Pin_degree" and
                 type IN ["Q5", "Q229390", "Q24869", "Q11424", "Q581714","Q11425","Q29168811","Q5398426","Q526877","Q19020","Q15773347","Q15773317"]'
        --return 's, p.label, o, p'
        -o $TEMP/reframingclaims-temp.tsv
""")

# Register the result in the graph cache under the alias `reframingclaimstemp`.
kgtk("query -i $TEMP/reframingclaims-temp.tsv --as reframingclaimstemp")
# Read the file back from the configured $TEMP folder instead of a hard-coded
# absolute path, so the cell follows the parameters set at the top of the notebook.
reframing_claims = pd.read_csv(os.path.join(os.environ["TEMP"], "reframingclaims-temp.tsv"), sep="\t")
reframing_claims

Unnamed: 0,node1,label,node2,id
0,Q100292318,P1040,Q24578312,Q100292318-P1040-Q24578312-fa7dc50b-0
1,Q100292318,P1258,m/the_addams_family_2,Q100292318-P1258-841eb3-3f6f8f58-0
2,Q100292318,P1265,278122,Q100292318-P1265-d7a0f2-29dfe293-0
3,Q100292318,P136,Q157443,Q100292318-P136-Q157443-ca2b6c26-0
4,Q100292318,P136,Q28968258,Q100292318-P136-Q28968258-47a1de5a-0
...,...,...,...,...
1921154,Q97365172,P86,Q1740191,Q97365172-P86-Q1740191-38c6c027-0
1921155,Q97365172,P8687,+12773,Q97365172-P8687-1e0b17-c5647aa0-0
1921156,Q97365172,P8687,+53434,Q97365172-P8687-a1b4dd-667276a6-0
1921157,Q97365172,P9751,umc.cmc.jzpcwzmyd6h9eaadrakph6ta,Q97365172-P9751-1b0487-17916163-0


In [7]:
# Keep only the claims whose id does NOT appear among the reframing claims.
kgtk("""ifnotexists --input-file claims \
               --filter-file reframingclaimstemp \
               --input-keys 'id' \
               --filter-keys 'id' \
               -o $TEMP/claims_base.tsv""")

# Register the base claims in the graph cache under the alias `base_claims`.
kgtk("query -i $TEMP/claims_base.tsv --as base_claims")
# Read the file back from the configured $TEMP folder instead of a hard-coded
# absolute path, so the cell follows the parameters set at the top of the notebook.
base_claims = pd.read_csv(os.path.join(os.environ["TEMP"], "claims_base.tsv"), sep="\t")
base_claims


Unnamed: 0,id,node1,label,node2,node2;wikidatatype
0,P10-P1628-32b85d-7927ece6-0,P10,P1628,http://www.w3.org/2006/vcard/ns#Video,url
1,P10-P1628-acf60d-b8950832-0,P10,P1628,https://schema.org/video,url
2,P10-P1629-Q34508-bcc39400-0,P10,P1629,Q34508,wikibase-item
3,P10-P1630-53947a-fbe9093e-0,P10,P1630,https://commons.wikimedia.org/wiki/File:$1,string
4,P10-P1659-P1651-c4068028-0,P10,P1659,P1651,wikibase-property
...,...,...,...,...,...
2337030,Q99998027-P31-Q11483816-d1dfd1de-0,Q99998027,P31,Q11483816,wikibase-item
2337031,Q99998027-P664-Q617433-3d360b0b-0,Q99998027,P664,Q617433,wikibase-item
2337032,Q99999126-P31-Q11407181-5b3cc2ad-0,Q99999126,P31,Q11407181,wikibase-item
2337033,Q99999126-P31-Q11483816-46ec0a53-0,Q99999126,P31,Q11483816,wikibase-item


### Adding single claims of humans, the ones that don't have qualifiers attached


Qualifiers that apply to the claims being reframed are saved to `reframing_qualifiers.tsv`, to reduce the dataset size.


In [8]:
%%time
kgtk("""
    query -i reframingclaimstemp -i qualifiers
        --match 'reframingclaimstemp: ()-[qs]->(),
                 qualifiers: (qs)-[qp]->(qo)'
        --return 'qs as node1, qp.label as label, qo as node2, qp as id'
        -o $TEMP/reframing_qualifiers.tsv
""")

kgtk("query -i $TEMP/reframing_qualifiers.tsv --as reframingquals")
reframing_quals = pd.read_csv("/data02/ana_iglesias/data/subset/reframings/temp.reframings/reframing_qualifiers.tsv", sep="\t")
reframing_quals

CPU times: user 4.75 s, sys: 1.52 s, total: 6.27 s
Wall time: 16.8 s


Unnamed: 0,node1,label,node2,id
0,Q1000118-P1441-Q28146833-e939a5a7-0,P175,Q28556723,Q1000118-P1441-Q28146833-e939a5a7-0-P175-Q2855...
1,Q1000118-P345-ce5234-73bca1aa-0,P2241,Q44374960,Q1000118-P345-ce5234-73bca1aa-0-P2241-Q44374960-0
2,Q1000118-P6262-169bd9-1227c261-0,P1810,Peter Pettigrew,Q1000118-P6262-169bd9-1227c261-0-P1810-6f074a-0
3,Q1000118-P6262-169bd9-1227c261-0,P407,Q809,Q1000118-P6262-169bd9-1227c261-0-P407-Q809-0
4,Q1000118-P6262-169bd9-1227c261-0,P9675,286,Q1000118-P6262-169bd9-1227c261-0-P9675-ca871a-0
...,...,...,...,...
420679,Q999960-P10527-Q50920401-c2581644-0,P1810,"Cochet, Jean-Laurent",Q999960-P10527-Q50920401-c2581644-0-P1810-5c66...
420680,Q999960-P166-Q10855271-52323a81-0,P585,^2006-01-01T00:00:00Z/9,Q999960-P166-Q10855271-52323a81-0-P585-cf2407-0
420681,Q999960-P166-Q13452531-0ed37b2c-0,P585,^2012-01-01T00:00:00Z/9,Q999960-P166-Q13452531-0ed37b2c-0-P585-979d4e-0
420682,Q999960-P166-Q3405661-d2f16aa9-0,P585,^1984-01-01T00:00:00Z/9,Q999960-P166-Q3405661-d2f16aa9-0-P585-a649f8-0


In [9]:
# Split the reframing claims in two: those whose id appears as node1 of a
# qualifier edge go to reframingclaims.tsv; the rest (claims without
# qualifiers) go to the reject file human_single_claims.tsv.
kgtk("""ifexists --input-file reframingclaimstemp \
               --filter-file reframingquals \
               --input-keys 'id' \
               --filter-keys 'node1' \
               -o $TEMP/reframingclaims.tsv \
               --reject-file $TEMP/human_single_claims.tsv""")

In [10]:
# Register the qualified claims in the graph cache under the alias `reframingclaims` and display them.
kgtk("query -i $TEMP/reframingclaims.tsv --as reframingclaims")

Unnamed: 0,node1,label,node2,id
0,Q100292318,P1552,Q27834579,Q100292318-P1552-Q27834579-87b20a63-0
1,Q100292318,P1552,Q27847754,Q100292318-P1552-Q27847754-d80110c0-0
2,Q100292318,P1651,k1UNQFEUsPg,Q100292318-P1651-cb1243-138008d8-0
3,Q100292318,P1657,Q18665334,Q100292318-P1657-Q18665334-dc007cbc-0
4,Q100292318,P1981,Q20644795,Q100292318-P1981-Q20644795-8508e9f8-0
...,...,...,...,...
226432,Q97365172,P725,Q837676,Q97365172-P725-Q837676-2c91e4c2-0
226433,Q97365172,P725,Q965261,Q97365172-P725-Q965261-40d9e1b3-0
226434,Q97365172,P856,https://www.starwars.com/series/star-wars-the-...,Q97365172-P856-efdce4-8d456f9b-0
226435,Q97365172,P8687,+12773,Q97365172-P8687-1e0b17-c5647aa0-0


In [11]:
# Drop the datatype column from the base claims and append the unqualified
# claims that were set aside in human_single_claims.tsv.
kgtk("""remove-columns -i $TEMP/claims_base.tsv \
                       --columns 'node2;wikidatatype' \
        / cat -i - \
              -i $TEMP/human_single_claims.tsv \
              -o $TEMP/kbaseclaims.tsv""") 

# Register the combined file in the graph cache under the alias `kbaseclaims` and display it.
kgtk("query -i $TEMP/kbaseclaims.tsv --as kbaseclaims")

Unnamed: 0,id,node1,label,node2
0,P10-P1628-32b85d-7927ece6-0,P10,P1628,http://www.w3.org/2006/vcard/ns#Video
1,P10-P1628-acf60d-b8950832-0,P10,P1628,https://schema.org/video
2,P10-P1629-Q34508-bcc39400-0,P10,P1629,Q34508
3,P10-P1630-53947a-fbe9093e-0,P10,P1630,https://commons.wikimedia.org/wiki/File:$1
4,P10-P1659-P1651-c4068028-0,P10,P1659,P1651
...,...,...,...,...
4031755,Q97365172-P8411-Q17480853-32949b92-0,Q97365172,P8411,Q17480853
4031756,Q97365172-P8411-Q2775969-7f0bc6e5-0,Q97365172,P8411,Q2775969
4031757,Q97365172-P86-Q1740191-38c6c027-0,Q97365172,P86,Q1740191
4031758,Q97365172-P9751-1b0487-17916163-0,Q97365172,P9751,umc.cmc.jzpcwzmyd6h9eaadrakph6ta


### Adding datatypes to added properties for SR and events
From file `added-property-datatypes.tsv`, that looks like:

| node1      | label    | node2         | id                  |
|------------|----------|---------------|---------------------|
| Psubject   | datatype | wikibase-item | Psubject-datatype   |
| Ppredicate | datatype | wikibase-item | Ppredicate-datatype |
| Pobject    | datatype | wikibase-item | Pobject-datatype    |
| Phas_event | datatype | wikibase-item | Pevent-datatype     |

Add these rows to the file `metadata.property.datatypes.tsv.gz`, after removing its unnecessary column `node2;wikidatatype`.

In [21]:
# Drop the datatype column from the property-datatypes file and prepend the
# datatype rows of the newly introduced properties; the result goes to $OUT.
kgtk("remove-columns -i datatypes \
                    --columns 'node2;wikidatatype' \
     / cat -i $GRAPH/added-property-datatypes.tsv \
           -i - \
           -o $OUT/metadata.property.datatypes.tsv.gz") 

## Standard reification

### SR claims

Reframing into Standard Reification approach

In [37]:
%%time
# For every claim that has at least one qualifier, emit five edges (--multi 5)
# hanging from a statement node whose id is the claim id with dashes removed:
# its type (P31 Q3539534), subject, predicate, object, and each qualifier.
kgtk("""
    query -i claims -i qualifiers --multi 5
        --match 'claims: (subject)-[p]->(value),
                 qualifiers: (p)-[qual]->(qual_value)'
        --return 'replace(p,"-","") as node1, printf("P31") as label, printf("Q3539534") as node2,
                  replace(p,"-","") as node1, printf("Psubject") as label, subject as node2,
                  replace(p,"-","") as node1, printf("Ppredicate") as label, p.label as node2,
                  replace(p,"-","") as node1, printf("Pobject") as label, value as node2,
                  replace(p,"-","") as node1, qual.label as label, qual_value as node2'
        / deduplicate
        / add-id --id-style wikidata -o $TEMP/sr_added_claims.tsv.gz
""")

# Display the generated edges.
kgtk("query -i $TEMP/sr_added_claims.tsv.gz")

CPU times: user 30.1 s, sys: 8.73 s, total: 38.9 s
Wall time: 2min 19s


Unnamed: 0,node1,label,node2,id
0,Q1000115P1343Q6023581694a9be0,P31,Q3539534,Q1000115P1343Q6023581694a9be0-P31-Q3539534
1,Q1000115P1343Q6023581694a9be0,P805,Q24514151,Q1000115P1343Q6023581694a9be0-P805-Q24514151
2,Q1000115P1343Q6023581694a9be0,Pobject,Q602358,Q1000115P1343Q6023581694a9be0-Pobject-Q602358
3,Q1000115P1343Q6023581694a9be0,Ppredicate,P1343,Q1000115P1343Q6023581694a9be0-Ppredicate-P1343
4,Q1000115P1343Q6023581694a9be0,Psubject,Q1000115,Q1000115P1343Q6023581694a9be0-Psubject-Q1000115
...,...,...,...,...
3991818,Q99P86724e9ef1a5c7bbb50,P1810,California,Q99P86724e9ef1a5c7bbb50-P1810-965e75
3991819,Q99P86724e9ef1a5c7bbb50,P31,Q3539534,Q99P86724e9ef1a5c7bbb50-P31-Q3539534
3991820,Q99P86724e9ef1a5c7bbb50,Pobject,1259995120016912384,Q99P86724e9ef1a5c7bbb50-Pobject-4e9ef1
3991821,Q99P86724e9ef1a5c7bbb50,Ppredicate,P8672,Q99P86724e9ef1a5c7bbb50-Ppredicate-P8672


In [170]:
# Count the distinct statement nodes created by the reification.
kgtk("query -i $TEMP/sr_added_claims.tsv.gz --match '(stm)-[:P31]->(:Q3539534)' --return 'count(distinct stm) as count'")

Unnamed: 0,count
0,671897


Extracting the claims that must be deleted to avoid duplication of information and appending to the reframed claims

In [38]:
# Collect the original claims that carry qualifiers: these were reified above
# and must be removed from the base claims to avoid duplicated information.
kgtk("""query -i claims -i qualifiers 
        --match 'claims: (subject)-[p]->(value),
                 qualifiers: (p)-[qual]->(qual_value)'
        --return 'subject as node1, p.label as label, value as node2, p as id'
        / deduplicate -o $TEMP/d_srclaims.tsv.gz
""")

In [39]:
# Base claims = all claims whose id is not among the reified (qualified) ones.
kgtk("""ifnotexists --input-file claims \
               --filter-file $TEMP/d_srclaims.tsv.gz \
               --input-keys 'id' \
               --filter-keys 'id' \
               -o $TEMP/sr_baseclaims.tsv.gz
""")

# Final SR claims = base claims + reified statement edges.
kgtk("""cat -i $TEMP/sr_baseclaims.tsv.gz -i $TEMP/sr_added_claims.tsv.gz 
        / deduplicate -o $OUT/sr_claims.tsv.gz""")

In [10]:

# Peek at the first rows of the SR claims file.
!kgtk head -i $OUT/sr_claims.tsv.gz


node1	label	node2	id	node2;wikidatatype
P10	P1628	"http://www.w3.org/2006/vcard/ns#Video"	P10-P1628-32b85d-7927ece6-0	url
P10	P1628	"https://schema.org/video"	P10-P1628-acf60d-b8950832-0	url
P10	P1629	Q34508	P10-P1629-Q34508-bcc39400-0	wikibase-item
P10	P1630	"https://commons.wikimedia.org/wiki/File:$1"	P10-P1630-53947a-fbe9093e-0	string
P10	P1659	P1651	P10-P1659-P1651-c4068028-0	wikibase-property
P10	P1659	P18	P10-P1659-P18-5e4b9c4f-0	wikibase-property
P10	P1659	P4238	P10-P1659-P4238-d21d1ac0-0	wikibase-property
P10	P1659	P51	P10-P1659-P51-86aca4c5-0	wikibase-property
P10	P1855	Q15075950	P10-P1855-Q15075950-7eff6d65-0	wikibase-item
P10	P1855	Q4504	P10-P1855-Q4504-a69d2c73-0	wikibase-item


### Browser files: label, alias and description

#### Label
Take user-friendly labels instead of ids to form the labels of the statements, add the labels from the original dataset, and save to `sr_labels.en.tsv.gz`. Also add the new labels:

| node1      | label | node2                | id                 |
|------------|-------|----------------------|--------------------|
| Psubject   | label | 'subject'@en         | Psubject-label-0   |
| Ppredicate | label | 'predicate'@en       | Ppredicate-label-0 |
| Pobject    | label | 'object'@en          | Pobject-label-0    |
| Phas_event | label | 'has event'@en       | Phas_event-label-0 |
| Q3539534   | label | 'semantic triple'@en | Q3539534-label-0   |
| QEvent     | label | 'Event'@en           | QEvent-label-0     |

In [40]:
# Build an English label "Statement for <subject> - <predicate> - <object>" for
# every statement node whose subject, predicate and object all have a label,
# then prepend the hand-written labels from extra-labels.tsv.
kgtk("""
    query -i $OUT/sr_claims.tsv.gz -i label 
        --match '          (dummy_s)-[:Psubject]->(subject),
                           (dummy_s)-[:Ppredicate]->(pred),
                           (dummy_s)-[:Pobject]->(object),
                 label: (subject)-[:label]->(s_label),
                        (pred)-[:label]->(p_label),
                        (object)-[:label]->(o_label)'
        --return 'dummy_s as node1, printf("label") as label, 
        concat(kgtk_stringify(concat("Statement for ", kgtk_lqstring_text(s_label), " - ", kgtk_lqstring_text(p_label), " - ", kgtk_lqstring_text(o_label))), "@en") as node2'
        / deduplicate
        / add-id --id-style wikidata 
        / cat -i $GRAPH/extra-labels.tsv -i - -o $TEMP/sr_added_labels.en.tsv.gz
""") 

In [41]:
## statements with objects that are not nodes
# i.e. statement edges whose node1 did not receive a label in the previous
# step (no label was found for their object).
kgtk("""ifnotexists --input-file $TEMP/sr_added_claims.tsv.gz \
               --filter-file $TEMP/sr_added_labels.en.tsv.gz \
               --input-keys 'node1' \
               --filter-keys 'node1' \
               -o $TEMP/sr_nonnode_claims.tsv.gz
""")

In [42]:
# Label the remaining statements using the raw (unstringified) object value in
# place of an object label, and append them to the labels built so far.
kgtk ("""query -i $TEMP/sr_nonnode_claims.tsv.gz -i label \
        --match '          (dummy_s)-[:Psubject]->(subject), \
                           (dummy_s)-[:Ppredicate]->(pred), \
                           (dummy_s)-[:Pobject]->(object), \
                 label: (subject)-[:label]->(s_label), \
                        (pred)-[:label]->(p_label)' \
        --return 'dummy_s as node1, printf("label") as label, concat(kgtk_stringify(concat("Statement for ", kgtk_lqstring_text(s_label), " - ", kgtk_lqstring_text(p_label), " - ", kgtk_unstringify(object))), "@en") as node2' \
        / deduplicate \
        / add-id --id-style wikidata 
        / cat -i $TEMP/sr_added_labels.en.tsv.gz -i - -o $TEMP/sr_complete_added_labels.en.tsv.gz
""")

In [43]:
# Final SR labels = newly created statement labels + original labels.
kgtk("""
    cat -i $TEMP/sr_complete_added_labels.en.tsv.gz -i label -o  $OUT/sr_labels.en.tsv.gz
""")

In [10]:
!wc -l $OUT/sr_labels.en.tsv.gz

98688 /data02/ana_iglesias/data/subset/reframings/sr_labels.en.tsv.gz


#### Description
From the label additions created above, replace `label` with `description`, add the descriptions from the original dataset and save to `sr_descriptions.en.tsv.gz`.

In [44]:
# Reuse every added label as a description edge for the same node.
kgtk("""
    query -i $TEMP/sr_complete_added_labels.en.tsv.gz
    --match '(s)-[:label]->(o)'
    --return 's as node1, "description" as label, o as node2'
    / deduplicate
    / add-id --id-style wikidata -o $TEMP/sr_added_descriptions.en.tsv.gz
""")

In [45]:
# Final SR descriptions = added descriptions + original descriptions.
kgtk("""
    cat -i $TEMP/sr_added_descriptions.en.tsv.gz -i description -o  $OUT/sr_descriptions.en.tsv.gz
""")

#### Alias

In [46]:
# Reuse every added label as an alias edge for the same node.
kgtk("""
    query -i $TEMP/sr_complete_added_labels.en.tsv.gz
    --match '(s)-[:label]->(o)'
    --return 's as node1, "alias" as label, o as node2'
    / deduplicate
    / add-id --id-style wikidata -o $TEMP/sr_added_aliases.en.tsv.gz
""")

In [47]:
# Final SR aliases = added aliases + original aliases.
kgtk("""
    cat -i $TEMP/sr_added_aliases.en.tsv.gz -i alias -o  $OUT/sr_aliases.en.tsv.gz
""")

### Creating clean output files into subfolder

In [48]:
# Make sure the output sub-folder exists before writing into it; the shell
# redirections of the following cell also rely on it.
os.makedirs(os.path.join(os.environ["OUT"], "sr"), exist_ok=True)

# Write a clean copy of each SR file (datatype column removed, duplicates
# dropped) into $OUT/sr. One parameterised command replaces four copy-pasted
# calls; the order (aliases, claims, descriptions, labels) is unchanged.
for sr_file in ("aliases.en", "claims", "descriptions.en", "labels.en"):
    kgtk(f"""remove-columns -i $OUT/sr_{sr_file}.tsv.gz 
                    --columns 'node2;wikidatatype' 
        / deduplicate -o $OUT/sr/{sr_file}.tsv.gz
""")

In [49]:
# Write uncompressed copies in which the double quotes delimiting the
# language-qualified strings (after a tab, and before @en) become single quotes.
!zcat $OUT/sr/labels.en.tsv.gz | sed -e "s/\t\"/\t'/g" | sed -e "s/\"@en/'@en/g" > $OUT/sr/labels.en.tsv
!zcat $OUT/sr/aliases.en.tsv.gz | sed -e "s/\t\"/\t'/g" | sed -e "s/\"@en/'@en/g" > $OUT/sr/aliases.en.tsv
!zcat $OUT/sr/descriptions.en.tsv.gz | sed -e "s/\t\"/\t'/g" | sed -e "s/\"@en/'@en/g" > $OUT/sr/descriptions.en.tsv

## N-ary relationships: events 

### Events claims

#### Human events 

In [50]:
%%time

## Single events (point in time)

# reframing claims
kgtk("""query -i claims -i qualifiers -i label --multi 4
        --match 'claims: (human)-[:P31]->(:Q5),
                         (human)-[p]->(value)'
        --opt 'qualifiers: (p)-[qual]->(qual_value)'
        --return 'human as node1, printf("Phas_event") as label, concat(replace(p,"-",""),"Event") as node2,
                  concat(replace(p,"-",""),"Event") as node1, qual.label as label, qual_value as node2,
                  concat(replace(p,"-",""),"Event") as node1, p.label as label, value as node2,
                  concat(replace(p,"-",""),"Event") as node1, printf("P31") as label, printf("QEvent") as node2'
        / deduplicate
        / add-id --id-style wikidata -o $TEMP/singleclaims_nofilter.tsv.gz
""")
# intermediary file bc was not filtering with --where
kgtk("""query -i $TEMP/singleclaims_nofilter.tsv.gz --multi 2
        --match '(human)-[:Phas_event]->(event),
                         (event)-[p]->(pv),
                         (event)-[pred]->(val)'
        --where 'p.label IN ["P40", "P166", "P1411", "P184", "P1344","P27", "P69", "P551", "P463", "P26", "P106", "P39", "P108"]'
        --return 'human as node1, printf("Phas_event") as label, event as node2,
                  event as node1, pred.label as label, val as node1'
        / deduplicate
        / add-id --id-style wikidata -o $TEMP/singleclaims.tsv.gz
""")

# temporal file to allow creation of labels
kgtk("""query -i claims -i qualifiers --multi 3 
              --match 'claims: (human)-[:P31]->(:Q5),
                               (human)-[p]->(value)'
              --where 'p.label IN ["P40", "P166", "P1411", "P184", "P1344","P27", "P69", "P551", "P463", "P26", "P106", "P39", "P108"]'
              --return 'concat(replace(p,"-",""),"Event") as node1, printf("s") as label, human as node2,
                        concat(replace(p,"-",""),"Event") as node1, printf("p") as label, p.label as node2,
                        concat(replace(p,"-",""),"Event") as node1, printf("o") as label, value as node2'
              -o $TEMP/evlabel_temp.tsv.gz""")

# Label creation
kgtk("""query -i $TEMP/evlabel_temp.tsv.gz -i label
        --match '      (id)-[:s]->(snode),
                       (id)-[:p]->(pnode),
                       (id)-[:o]->(onode),
                 label: (snode)-[:label]->(slabel),
                        (pnode)-[:label]->(plabel),
                        (onode)-[:label]->(olabel)'
        --return 'id as node1, printf("label") as label, 
            concat(kgtk_stringify(concat("Event of ", kgtk_lqstring_text(slabel), " - ", kgtk_lqstring_text(plabel), " - ", kgtk_lqstring_text(olabel))), "@en") as node2'
        / deduplicate
        / add-id --id-style wikidata -o $TEMP/singlelabels.tsv.gz
""")

CPU times: user 115 ms, sys: 119 ms, total: 234 ms
Wall time: 3min 35s


#### Birth and death events

In [51]:
%%time
## BIRTH

#birth_properties = ["P19", "P3373", "P22", "P25", "P3448", "P21", "P172"]

## For birth date (P569)
# One BirthEvent node per birth-date claim of a human, carrying the date, the
# optional birth-related properties listed in the --where (which here applies
# to the --opt clause it follows), and the type QEvent.
kgtk(""" 
     query -i claims --multi 4
        --match '(human)-[:P31]->(:Q5),
                 (human)-[p:P569]->(birth_date)'
        --opt   '(human)-[p_birth]->(birth)'
        --where 'p_birth.label IN ["P19", "P3373", "P22", "P25", "P3448", "P21", "P172"]'
        --return 'human as node1, printf("Phas_event") as label, concat(replace(p,"-",""),"BirthEvent") as node2,
                  concat(replace(p,"-",""),"BirthEvent") as node1, p.label as label, birth_date as node2,
                  concat(replace(p,"-",""),"BirthEvent") as node1, p_birth.label as label, birth as node2,
                  concat(replace(p,"-",""),"BirthEvent") as node1, printf("P31") as label, printf("QEvent") as node2'
        
     / deduplicate
     / add-id --id-style wikidata -o $TEMP/birthclaims.tsv.gz
""")

# Birth labels: "Birth event of <human label>" for each BirthEvent node.
kgtk(""" 
     query -i claims -i label 
        --match 'claims: (human)-[:P31]->(:Q5),
                         (human)-[p:P569]->(birth_date),
                 label: (human)-[:label]->(hlabel)'
        --return 'concat(replace(p,"-",""),"BirthEvent") as node1, printf("label") as label, 
            concat(kgtk_stringify(concat("Birth event of ", kgtk_lqstring_text(hlabel))), "@en") as node2'
        
     / deduplicate
     / add-id --id-style wikidata -o $TEMP/birthlabels.tsv.gz
""")


CPU times: user 12.7 ms, sys: 50.9 ms, total: 63.7 ms
Wall time: 11.9 s


In [52]:
%%time

## DEATH
#death_properties = ["P1196", "P20", "P509"]

## For death date (P570)
kgtk(""" 
     query -i claims --multi 4
        --match '(human)-[:P31]->(:Q5),
                 (human)-[p:P570]->(death_date)'
        --opt   '(human)-[p_death]->(death)'
        --where 'p_death.label IN ["P1196", "P20", "P509"]'
        --return 'human as node1, printf("Phas_event") as label, concat(replace(p,"-",""),"DeathEvent") as node2,
                  concat(replace(p,"-",""),"DeathEvent") as node1, p.label as label, death_date as node2,
                  concat(replace(p,"-",""),"DeathEvent") as node1, p_death.label as label, death as node2,
                  concat(replace(p,"-",""),"DeathEvent") as node1, printf("P31") as label, printf("QEvent") as node2'        
     / deduplicate 
     / add-id --id-style wikidata -o $TEMP/deathclaims.tsv.gz
""")

# death labels

kgtk(""" 
     query -i claims -i label 
        --match 'claims: (human)-[:P31]->(:Q5),
                         (human)-[p:P570]->(birth_date),
                 label: (human)-[:label]->(hlabel)'
        --return 'concat(replace(p,"-",""),"DeathEvent") as node1, printf("label") as label, 
            concat(kgtk_stringify(concat("Death event of ", kgtk_lqstring_text(hlabel))), "@en") as node2'
     / deduplicate
     / add-id --id-style wikidata -o $TEMP/deathlabels.tsv.gz
""")



CPU times: user 10.5 ms, sys: 45.2 ms, total: 55.7 ms
Wall time: 7.06 s


#### Awards

In [53]:
# For the two selected awards, turn every winner claim (P1346) that has a
# point-in-time qualifier (P585) into a "Winner<year>" node that carries the
# winner, the date and every qualifier of the claim.
kgtk(""" 
     query -i claims -i qualifiers --multi 4
        --match 'claims: (award)-[p:P1346]->(winner),
                 qualifiers: (p)-[ptime:P585]->(year),
                                 (p)-[qual]->(qvalue)'
        --where 'award IN ["Q103618","Q103916"]'
        --return 'award as node1, p.label as label, concat(replace(p,"-",""),"Winner",kgtk_date_year(year)) as node2,
                  concat(replace(p,"-",""),"Winner",kgtk_date_year(year)) as node1, p.label as label, winner as node2,
                  concat(replace(p,"-",""),"Winner",kgtk_date_year(year)) as node1, ptime.label as label, year as node2,
                  concat(replace(p,"-",""),"Winner",kgtk_date_year(year)) as node1, qual.label as label, qvalue as node2' 
     / deduplicate 
     / add-id --id-style wikidata -o $TEMP/awardclaims.tsv.gz
""")

# Award labels: "<award label> <year>" for each Winner node.
kgtk(""" 
     query -i claims -i qualifiers -i label
        --match 'claims: (award)-[p:P1346]->(winner),
                 qualifiers: (p)-[ptime:P585]->(year),
                 label: (award)-[:label]->(alabel)'
        --where 'award IN ["Q103618","Q103916"]'
        --return 'concat(replace(p,"-",""),"Winner",kgtk_date_year(year)) as node1, printf("label") as label, 
            concat(kgtk_stringify(concat(kgtk_lqstring_text(alabel)," ",kgtk_date_year(year))), "@en") as node2'
     / deduplicate 
     / add-id --id-style wikidata -o $TEMP/awardlabels.tsv.gz
""")


#### Movies and tv series

In [54]:
# Turn every cast-member claim (P161) of the selected film/series types into a
# "CastMember" node carrying the cast member and the optional qualifiers.
# The type filter is placed right after --match: a --where that follows --opt
# only restricts the optional clause, so it would leave P161 claims of every
# other type in the output.
kgtk(""" 
     query -i claims -i qualifiers --multi 3
        --match 'claims: (movie)-[:P31]->(type),
                         (movie)-[p:P161]->(cast)'
        --where 'type IN ["Q229390", "Q24869", "Q11424","Q5398426","Q526877"]'
        --opt 'qualifiers: (p)-[qual]->(qvalue)'
        --return 'movie as node1, p.label as label, concat(replace(p,"-",""),"CastMember",cast) as node2,
                  concat(replace(p,"-",""),"CastMember",cast) as node1, p.label as label, cast as node2,
                  concat(replace(p,"-",""),"CastMember",cast) as node1, qual.label as label, qvalue as node2' 
     / deduplicate 
     / add-id --id-style wikidata -o $TEMP/movieclaims.tsv.gz
""")

# Labels: "Cast member <cast label> of <movie label>" for each CastMember node.
# (`-i label` is passed once; it was duplicated.)
kgtk(""" 
     query -i claims -i label
        --match 'claims: (movie)-[:P31]->(type),
                         (movie)-[p:P161]->(cast),
                 label: (cast)-[:label]->(clabel),
                        (movie)-[:label]->(mlabel)'
        --where 'type IN ["Q229390", "Q24869", "Q11424","Q5398426","Q526877"]'
        --return 'concat(replace(p,"-",""),"CastMember",cast) as node1, printf("label") as label, 
            concat(kgtk_stringify(concat("Cast member ", kgtk_lqstring_text(clabel), " of ", kgtk_lqstring_text(mlabel))), "@en") as node2'
     / deduplicate 
     / add-id --id-style wikidata -o $TEMP/movielabels.tsv.gz
""")


In [55]:
## animation movies/tv series with 'voice actor' instead of 'cast member'
# Turn every voice-actor claim (P725) of the selected animation types into a
# "VoiceActor" node carrying the actor and the optional qualifiers (read from
# the `reframingquals` alias registered earlier in this notebook).
# The type filter is placed right after --match: a --where that follows --opt
# only restricts the optional clause, so it would leave P725 claims of every
# other type in the output.
kgtk(""" 
     query -i claims -i reframingquals --multi 3
        --match 'claims: (movie)-[:P31]->(type),
                         (movie)-[p:P725]->(cast)'
        --where 'type IN ["Q581714","Q11425","Q29168811"]'
        --opt 'reframingquals: (p)-[qual]->(qvalue)'
        --return 'movie as node1, p.label as label, concat(replace(p,"-",""),"VoiceActor",cast) as node2,
                  concat(replace(p,"-",""),"VoiceActor",cast) as node1, p.label as label, cast as node2,
                  concat(replace(p,"-",""),"VoiceActor",cast) as node1, qual.label as label, qvalue as node2' 
     / deduplicate 
     / add-id --id-style wikidata -o $TEMP/animationclaims.tsv.gz
""")

# Labels: "Voice actor <actor label> of <movie label>" for each VoiceActor node.
# (`-i label` is passed once; it was duplicated.)
kgtk(""" 
     query -i claims -i label
        --match 'claims: (movie)-[:P31]->(type),
                         (movie)-[p:P725]->(cast),
                 label: (cast)-[:label]->(clabel),
                        (movie)-[:label]->(mlabel)'
        --where 'type IN ["Q581714","Q11425","Q29168811"]'
        --return 'concat(replace(p,"-",""),"VoiceActor",cast) as node1, printf("label") as label, 
            concat(kgtk_stringify(concat("Voice actor ", kgtk_lqstring_text(clabel), " of ", kgtk_lqstring_text(mlabel))), "@en") as node2'
     / deduplicate 
     / add-id --id-style wikidata -o $TEMP/animationlabels.tsv.gz
""")

#### Characters

In [56]:
# Turn every performer claim (P175) of the selected character types into a
# "CharacterPerformer" node carrying the performer and the optional qualifiers.
# The type filter is placed right after --match: a --where that follows --opt
# only restricts the optional clause, so it would leave P175 claims of every
# other type in the output.
kgtk(""" 
     query -i claims -i qualifiers --multi 3
        --match 'claims: (char)-[:P31]->(type),
                         (char)-[p:P175]->(cast)'
        --where 'type IN ["Q15773347","Q15773317"]'
        --opt 'qualifiers: (p)-[qual]->(qvalue)'
        --return 'char as node1, p.label as label, concat(replace(p,"-",""),"CharacterPerformer",cast) as node2,
                  concat(replace(p,"-",""),"CharacterPerformer",cast) as node1, p.label as label, cast as node2,
                  concat(replace(p,"-",""),"CharacterPerformer",cast) as node1, qual.label as label, qvalue as node2' 
     / deduplicate 
     / add-id --id-style wikidata -o $TEMP/charclaims.tsv.gz
""")

# Labels: "<character label> Character performer <performer label>".
kgtk(""" 
     query -i claims -i label 
        --match 'claims: (char)-[:P31]->(type),
                         (char)-[p:P175]->(cast),
                 label: (cast)-[:label]->(clabel),
                        (char)-[:label]->(mlabel)'
        --where 'type IN ["Q15773347","Q15773317"]'
        --return 'concat(replace(p,"-",""),"CharacterPerformer",cast) as node1, printf("label") as label, 
            concat(kgtk_stringify(concat(kgtk_lqstring_text(mlabel), " Character performer ", kgtk_lqstring_text(clabel))), "@en") as node2'
     / deduplicate 
     / add-id --id-style wikidata -o $TEMP/charlabels.tsv.gz
""")


In [153]:
# Peek at the first rows of the character labels file.
!kgtk head -i $TEMP/charlabels.tsv.gz

node1	label	node2	id
Q1000118P175Q28556723c6ece1a70CharacterPerformerQ28556723	label	"Peter Pettigrew Character performer Zachary David"@en	Q1000118P175Q28556723c6ece1a70CharacterPerformerQ28556723-label-113a30
Q1000118P175Q287824bce710040CharacterPerformerQ287824	label	"Peter Pettigrew Character performer Timothy Spall"@en	Q1000118P175Q287824bce710040CharacterPerformerQ287824-label-aebb06
Q101069011P175Q232477854517aa0CharacterPerformerQ232477	label	"Countess Helena Andrenyi Character performer Jacqueline Bisset"@en	Q101069011P175Q232477854517aa0CharacterPerformerQ232477-label-5ebb6b
Q101069011P175Q26011378cab6a710CharacterPerformerQ2601137	label	"Countess Helena Andrenyi Character performer Elena Satine"@en	Q101069011P175Q26011378cab6a710CharacterPerformerQ2601137-label-077817
Q101069011P175Q902478140ebef9d0CharacterPerformerQ9024781	label	"Countess Helena Andrenyi Character performer Lucy Boynton"@en	Q101069011P175Q902478140ebef9d0CharacterPerformerQ9024781-label-2e9c85
Q10106

#### Creating claims file

In [57]:
# Concatenate all the event claim files produced above into a single file.
kgtk(""" cat -i $TEMP/singleclaims.tsv.gz 
             -i $TEMP/birthclaims.tsv.gz 
             -i $TEMP/deathclaims.tsv.gz 
             -i $TEMP/awardclaims.tsv.gz 
             -i $TEMP/movieclaims.tsv.gz 
             -i $TEMP/animationclaims.tsv.gz 
             -i $TEMP/charclaims.tsv.gz 
             -o $TEMP/event_added_claims.tsv.gz
""")

In [58]:
# Collect, per category, the original claims that were reframed as events and
# therefore have to be removed from the base claims.

# Humans: the single-event, birth and death properties.
kgtk("""query -i claims 
        --match '(human)-[:P31]->(:Q5),
                 (human)-[p]->(value)'
        --where 'p.label IN ["P40", "P166", "P1411", "P184", "P1344","P27", "P69", "P551", "P463", "P26", "P106", "P39", "P108","P19", "P3373", "P22", "P25", "P3448", "P21", "P172","P1196", "P20", "P509", "P569","P570"]'
        --return 'human as node1, p.label as label, value as node2, p as id'
        / deduplicate -o $TEMP/d_humanclaims.tsv.gz""")

# Awards: winner claims (P1346) of the two selected awards.
kgtk("""query -i claims 
        --match '(award)-[p:P1346]->(value)'
        --where 'award IN ["Q103618","Q103916"]'
        --return 'award as node1, p.label as label, value as node2, p as id'
        / deduplicate -o $TEMP/d_awardclaims.tsv.gz""")

# Movies / TV series: cast-member claims (P161).
kgtk("""query -i claims 
        --match '(movie)-[:P31]->(type),
                 (movie)-[p:P161]->(value)'
        --where 'type IN ["Q229390", "Q24869", "Q11424","Q5398426","Q526877"]'
        --return 'movie as node1, p.label as label, value as node2, p as id'
        / deduplicate -o $TEMP/d_movieclaims.tsv.gz""")

# Animation: voice-actor claims (P725).
kgtk("""query -i claims 
        --match '(movie)-[:P31]->(type),
                 (movie)-[p:P725]->(value)'
        --where 'type IN ["Q581714","Q11425","Q29168811"]'
        --return 'movie as node1, p.label as label, value as node2, p as id'
        / deduplicate -o $TEMP/d_animationclaims.tsv.gz""")

# Characters: performer claims (P175).
kgtk("""query -i claims 
        --match '(char)-[:P31]->(type),
                 (char)-[p:P175]->(value)'
        --where 'type IN ["Q15773347","Q15773317"]'
        --return 'char as node1, p.label as label, value as node2, p as id'
        / deduplicate -o $TEMP/d_charclaims.tsv.gz""")


In [59]:
# Concatenate the five d_* selections into a single file,
# $TEMP/d_ev_claims.tsv.gz, which the next cell uses as the filter file.
kgtk(""" cat -i $TEMP/d_humanclaims.tsv.gz
             -i $TEMP/d_awardclaims.tsv.gz
             -i $TEMP/d_movieclaims.tsv.gz
             -i $TEMP/d_animationclaims.tsv.gz
             -i $TEMP/d_charclaims.tsv.gz
             -o $TEMP/d_ev_claims.tsv.gz
""")


In [60]:
# Keep only the original claims whose edge id does NOT appear in
# $TEMP/d_ev_claims.tsv.gz (the claims selected above for replacement).
# `ifnotexists` is a file-based command, so the original claims are passed as
# the `$claims` file path exported by ConfigureKGTK; the bare name `claims` is
# a graph-cache alias that only `kgtk query` resolves.
kgtk("""ifnotexists --input-file $claims \
               --filter-file $TEMP/d_ev_claims.tsv.gz \
               --input-keys 'id' \
               --filter-keys 'id' \
               -o $TEMP/ev_baseclaims.tsv.gz
""")

# Merge the remaining base claims with the added claims and de-duplicate the
# result into the final claims file, $OUT/ev_claims.tsv.gz.
kgtk("""cat -i $TEMP/ev_baseclaims.tsv.gz -i $TEMP/event_added_claims.tsv.gz
        / deduplicate -o $OUT/ev_claims.tsv.gz""")

### Browser files: label, alias and description

#### Label
Use the user-friendly labels, rather than the ids, to build the labels of the statements, add the labels from the original dataset, and save the result to `ev_labels.en.tsv.gz`.

In [61]:
# Concatenate the extra labels shipped with the input graph and the per-pattern
# label files found in $TEMP into one file of added labels,
# $TEMP/event_added_labels.tsv.gz.
kgtk(""" cat -i $GRAPH/extra-labels.tsv
             -i $TEMP/singlelabels.tsv.gz 
             -i $TEMP/birthlabels.tsv.gz 
             -i $TEMP/deathlabels.tsv.gz 
             -i $TEMP/awardlabels.tsv.gz 
             -i $TEMP/movielabels.tsv.gz 
             -i $TEMP/animationlabels.tsv.gz 
             -i $TEMP/charlabels.tsv.gz 
             -o $TEMP/event_added_labels.tsv.gz
""") 

In [62]:
# Append the labels of the original dataset to the added labels and write the
# final label file, $OUT/ev_labels.en.tsv.gz.
# `cat` reads files, not graph-cache aliases, so the original labels are passed
# as the `$label` file path exported by ConfigureKGTK.
kgtk("""
    cat -i $TEMP/event_added_labels.tsv.gz -i $label -o $OUT/ev_labels.en.tsv.gz
""")

#### Description
From the label additions created above, replace `label` with `description`, add the descriptions from the original dataset, and save the result to `ev_descriptions.en.tsv.gz`.

In [63]:
# Re-emit every added label edge as a `description` edge (same node1 and text),
# de-duplicate, and give each new edge a Wikidata-style id.
kgtk("""
    query -i $TEMP/event_added_labels.tsv.gz
    --match '(s)-[:label]->(o)'
    --return 's as node1, "description" as label, o as node2'
    / deduplicate
    / add-id --id-style wikidata -o $TEMP/ev_added_descriptions.en.tsv.gz
""")

# Append the descriptions of the original dataset and write the final file,
# $OUT/ev_descriptions.en.tsv.gz.
# `cat` reads files, not graph-cache aliases, so the original descriptions are
# passed as the `$description` file path exported by ConfigureKGTK.
kgtk("""
    cat -i $TEMP/ev_added_descriptions.en.tsv.gz -i $description -o $OUT/ev_descriptions.en.tsv.gz
""")

#### Alias

In [64]:
# Re-emit every added label edge as an `alias` edge (same node1 and text),
# de-duplicate, and give each new edge a Wikidata-style id.
kgtk("""
    query -i $TEMP/event_added_labels.tsv.gz
    --match '(s)-[:label]->(o)'
    --return 's as node1, "alias" as label, o as node2'
    / deduplicate
    / add-id --id-style wikidata -o $TEMP/ev_added_aliases.en.tsv.gz
""")

# Append the aliases of the original dataset and write the final file,
# $OUT/ev_aliases.en.tsv.gz.
# `cat` reads files, not graph-cache aliases, so the original aliases are
# passed as the `$alias` file path exported by ConfigureKGTK.
kgtk("""
    cat -i $TEMP/ev_added_aliases.en.tsv.gz -i $alias -o $OUT/ev_aliases.en.tsv.gz
""")

### Creating clean output files into subfolder

In [65]:
# Write the cleaned, final files into the `events` subfolder of $OUT.
# The folder is created up front so that the cell also works on a fresh run;
# this is a no-op when the folder already exists.
os.makedirs(os.path.join(os.environ["OUT"], "events"), exist_ok=True)

# Every ev_* file goes through the same pipeline: drop the
# `node2;wikidatatype` column, de-duplicate, and store the result under the
# plain file name expected inside $OUT/events.
for file_stem in ("aliases.en", "claims", "descriptions.en", "labels.en"):
    kgtk(f"""remove-columns -i $OUT/ev_{file_stem}.tsv.gz
                        --columns 'node2;wikidatatype'
            / deduplicate -o $OUT/events/{file_stem}.tsv.gz
    """)

In [66]:
!zcat $OUT/events/labels.en.tsv.gz | sed -e "s/\t\"/\t'/g" | sed -e "s/\"@en/'@en/g" > $OUT/events/labels.en.tsv
!zcat $OUT/events/aliases.en.tsv.gz | sed -e "s/\t\"/\t'/g" | sed -e "s/\"@en/'@en/g" > $OUT/events/aliases.en.tsv
!zcat $OUT/events/descriptions.en.tsv.gz | sed -e "s/\t\"/\t'/g" | sed -e "s/\"@en/'@en/g" > $OUT/events/descriptions.en.tsv