# Creating a subset of Wikidata

This notebook illustrates how to create a subset of Wikidata. We use as an example https://www.wikidata.org/wiki/Q11173 (chemical compound)



In [1]:
# Parameters
wikidata_home = "/Users/pedroszekely/Downloads/kypher"
wikidata_file = "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v3/all.tsv.gz"
wikidata_file = "all.10.tsv.gz"
wikidata_parts_folder = "/Users/pedroszekely/Downloads/kypher/useful_wikidata_files"
#wikidata_parts_folder = "/Users/pedroszekely/Downloads/kypher/output.all.10"
home = "/Users/pedroszekely/Downloads/kypher"
cache_folder = "/Users/pedroszekely/Downloads/kypher"
output_folder = "/Users/pedroszekely/Downloads/scratch"
delete_database = "yes"

In [2]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd

import altair as alt

# from IPython.display import display, HTML, Image
# from pandas_profiling import ProfileReport

### Set up environment variables and folders that we need

In [3]:
# folder containing wikidata broken down into smaller files.
os.environ['WIKIDATA_PARTS'] = wikidata_parts_folder
# path of folder where the wikidata parts folder is stored.
os.environ['WIKIDATA_HOME'] = wikidata_home
os.environ['KYPHER'] = home
os.environ['OUT'] = output_folder
# kgtk command to run
os.environ['kgtk'] = "kgtk"
os.environ['kgtk'] = "time kgtk --debug"
# absolute path of the db
os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(cache_folder)

In [4]:
cd $home

/Users/pedroszekely/Downloads/kypher


In [64]:
def bar_chart(data, x_column, y_column):
    """Construct a simple bar chart with two properties"""
    bars = alt.Chart(data).mark_bar().encode(
        y=alt.Y(y_column, sort='-x'),
        x=x_column
    )

    text = bars.mark_text(
        align='left',
        baseline='middle',
        dx=3  # Nudges text to right so it doesn't appear on top of the bar
    ).encode(
        text=x_column
    )

    return (bars + text)

In [65]:
def run_command(cmd, substitution_dictionary = {}):
    """Run a templetized command."""
    for k, v in substitution_dictionary.items():
        cmd = cmd.replace(k, v)
    
    print(cmd)
    output = subprocess.run([cmd], shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print(output.stdout)
    print(output.stderr)
    #print(output.returncode)

In [6]:
!$kgtk query -i $WIKIDATA_PARTS/part.wikibase-item.tsv.gz -i $WIKIDATA_PARTS/part.label.en.tsv.gz --graph-cache $STORE \
--match 'item: (n1)-[l:P452]->(n2:Q507443), label: (n1)-[:label]->(label)' \
--return 'distinct n1 as pharamceutical_company, label as name' \
--where 'label.kgtk_lqstring_lang_suffix = "en"' \
--order-by 'label' \
--limit 10

[2020-10-20 22:48:09 sqlstore]: IMPORT graph directly into table graph_3 from /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/part.wikibase-item.tsv.gz ...
^C

Keyboard interrupt in query -i /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/part.wikibase-item.tsv.gz -i /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/part.label.en.tsv.gz --graph-cache /Users/pedroszekely/Downloads/kypher/wikidata.sqlite3.db --match item: (n1)-[l:P452]->(n2:Q507443), label: (n1)-[:label]->(label) --return distinct n1 as pharamceutical_company, label as name --where label.kgtk_lqstring_lang_suffix = "en" --order-by label --limit 10.
       45.47 real        74.94 user         1.90 sys


In [6]:
!$kgtk query -i $KYPHER/Q44/Q44.part.wikibase-item.tsv.gz -i $KYPHER/Q44/Q44.label.en.tsv.gz --graph-cache $STORE \
--match 'item: (n1)-[l {label: llab}]->(n2), label: (llab)-[:label]->(name)' \
--return 'distinct name as property, llab as property, count(n1) as count' \
--where 'name.kgtk_lqstring_lang_suffix = "en"' \
--order-by 'count(n1) desc' \
--limit 20 \
-o $OUT/test.tsv 

[2020-10-24 18:53:48 sqlstore]: IMPORT graph directly into table graph_15 from /Users/pedroszekely/Downloads/kypher/Q44/Q44.part.wikibase-item.tsv.gz ...
[2020-10-24 18:53:48 sqlstore]: IMPORT graph directly into table graph_16 from /Users/pedroszekely/Downloads/kypher/Q44/Q44.label.en.tsv.gz ...
[2020-10-24 18:53:48 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_16_c2."node2" "property", graph_16_c2."node1" "property", count(graph_15_c1."node1") "count"
     FROM graph_15 AS graph_15_c1, graph_16 AS graph_16_c2
     WHERE graph_15_c1."label"=graph_16_c2."node1"
     AND graph_16_c2."label"=?
     AND graph_15_c1."label"=graph_16_c2."node1"
     AND (kgtk_lqstring_lang_suffix(graph_16_c2."node2") = ?)
     GROUP BY property, property
     ORDER BY count(graph_15_c1."node1") DESC
     LIMIT ?
  PARAS: ['label', 'en', 20]
---------------------------------------------
[2020-10-24 18:53:48 sqlstore]: CREATE INDEX on table graph_15 column labe

In [50]:
import io
import pandas
import subprocess

def shell_df(command, shell=False, **kwargs):
    """
    Takes a shell command as a string and and reads the result into a Pandas DataFrame.
    
    Additional keyword arguments are passed through to pandas.read_csv.
    
    :param command: a shell command that returns tabular data
    :type command: str
    :param shell: passed to subprocess.Popen
    :type shell: bool
    
    :return: a pandas dataframe
    :rtype: :class:`pandas.dataframe`
    """
    proc = subprocess.Popen(command, 
                            shell=shell,
                            stdout=subprocess.PIPE, 
                            stderr=subprocess.PIPE)
    output, error = proc.communicate()
    
    if proc.returncode == 0:
        if error:
            print(error.decode())
        with io.StringIO(output.decode()) as buffer:
            return pandas.read_csv(buffer, **kwargs)
    else:
        message = ("Shell command returned non-zero exit status: {0}\n\n"
                   "Command was:\n{1}\n\n"
                   "Standard error was:\n{2}")
        raise IOError(message.format(proc.returncode, command, error.decode()))

In [58]:
command = "kgtk --debug query -i $KYPHER/Q44/Q44.part.wikibase-item.tsv.gz -i $KYPHER/Q44/Q44.label.en.tsv.gz --graph-cache $STORE \
--match 'item: (n1)-[l {label: llab}]->(n2), label: (llab)-[:label]->(name)' \
--return 'distinct name as property, llab as property, count(n1) as count' \
--where 'name.kgtk_lqstring_lang_suffix = \"en\"' \
--order-by 'count(n1) desc' \
--limit 20"

In [66]:
shell_df(command, shell=True, sep='\t')

[2020-10-24 19:22:07 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_16_c2."node2" "property", graph_15_c1."label" "property", count(graph_15_c1."node1") "count"
     FROM graph_15 AS graph_15_c1, graph_16 AS graph_16_c2
     WHERE graph_15_c1."label"=graph_15_c1."label"
     AND graph_16_c2."label"=?
     AND graph_15_c1."label"=graph_16_c2."node1"
     AND (kgtk_lqstring_lang_suffix(graph_16_c2."node2") = ?)
     GROUP BY property, property
     ORDER BY count(graph_15_c1."node1") DESC
     LIMIT ?
  PARAS: ['label', 'en', 20]
---------------------------------------------



Unnamed: 0,property,property.1,count
0,'diplomatic relation'@en,P530,3434
1,'language used'@en,P2936,3235
2,'member of'@en,P463,2111
3,'contains administrative territorial entity'@en,P150,1864
4,'country'@en,P17,824
5,'shares border with'@en,P47,564
6,'head of government'@en,P6,562
7,'described by source'@en,P1343,420
8,'head of state'@en,P35,344
9,'located in time zone'@en,P421,319


In [67]:
bar_chart(_, 'count', 'property')

In [10]:
graphs = ["Q318", "Q11173", "Q5", "Q7187", "Q28885102"]

command = "time papermill /Users/pedroszekely/Documents/GitHub/kgtk/examples/Example8\ -\ Wikidata\ Subset.ipynb $OUT.GRAPH.out.ipynb \
-p wikidata_home /Users/pedroszekely/Downloads/kypher \
-p wikidata_file wikidata-20200803-all-edges.tsv.gz \
-p wikidata_parts_folder /Users/pedroszekely/Downloads/kypher/useful_wikidata_files \
-p subset_name GRAPH \
-p home /Users/pedroszekely/Downloads/kypher \
-p cache_folder /Users/pedroszekely/Downloads/kypher \
-p delete_database no"


# for g in graphs:
#    run_command(command, {"GRAPH": g})

In [11]:
!$kgtk query -i $WIKIDATA_PARTS/part.wikibase-item.tsv.gz -i $WIKIDATA_PARTS/part.label.en.tsv.gz --graph-cache $STORE \
    --match 'item: ()-[l {label: p}]->(:Q20978643), label: (p)-[:label]->(label)' \
    --return 'distinct label as node1' \
    --where 'label.kgtk_lqstring_lang_suffix = "en"' \
    --limit 10

[2020-10-18 21:34:09 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_25_c2."node2" "node1"
     FROM graph_25 AS graph_25_c2, graph_5 AS graph_5_c1
     WHERE graph_25_c2."label"=?
     AND graph_5_c1."label"=graph_5_c1."label"
     AND graph_5_c1."node2"=?
     AND graph_25_c2."node1"=graph_5_c1."label"
     AND (kgtk_lqstring_lang_suffix(graph_25_c2."node2") = ?)
     LIMIT ?
  PARAS: ['label', 'Q20978643', 'en', 10]
---------------------------------------------
node1
'instance of'@en
'subclass of'@en
'facet of'@en
'opposite of'@en
'uses'@en
        0.67 real         0.53 user         0.12 sys


In [7]:
# Only useful when first argument is a separate list of nodes
!$kgtk query -i $WIKIDATA_PARTS/part.wikibase-item.tsv.gz -i $WIKIDATA_PARTS/part.label.tsv.gz --graph-cache $STORE \
--match 'item: (n1)-[]->(), label: (n1)-[:label]->(label)' \
--return 'distinct kgtk_lqstring_lang(label) as langauge, count(distinct label) as count' \
--order-by 'count(kgtk_lqstring_lang(label)) desc' \
-o $OUT/language.label.distribution.tsv

[2020-10-17 19:51:40 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT kgtk_lqstring_lang(graph_111_c2."node2") "langauge", count(DISTINCT graph_111_c2."node2") "count"
     FROM graph_111 AS graph_111_c2, graph_5 AS graph_5_c1
     WHERE graph_111_c2."label"=?
     AND graph_111_c2."node1"=graph_5_c1."node1"
     GROUP BY langauge
     ORDER BY count(kgtk_lqstring_lang(graph_111_c2."node2")) DESC
  PARAS: ['label']
---------------------------------------------
^C


In [34]:
!$kgtk query -i $KYPHER/Q44/Q44.part.wikibase-item.tsv.gz -i $WIKIDATA_PARTS/part.label.tsv.gz --graph-cache $STORE \
--match 'Q44: (n1)-[]->(), label: (n1)-[:label]->(label)' \
--return 'distinct kgtk_lqstring_lang(label) as langauge, count(distinct label) as count' \
--order-by 'count(kgtk_lqstring_lang(label)) desc' \
-o $OUT/Q44.language.distribution.tsv

[2020-10-17 10:32:38 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT kgtk_lqstring_lang(graph_111_c2."node2") "langauge", count(DISTINCT graph_111_c2."node2") "count"
     FROM graph_111 AS graph_111_c2, graph_99 AS graph_99_c1
     WHERE graph_111_c2."label"=?
     AND graph_111_c2."node1"=graph_99_c1."node1"
     GROUP BY langauge
     ORDER BY count(kgtk_lqstring_lang(graph_111_c2."node2")) DESC
  PARAS: ['label']
---------------------------------------------
       20.09 real        19.80 user         0.25 sys


In [35]:
!head $OUT/Q44.language.distribution.tsv

langauge	count
zh	1142
en	861
de	541
sr	321
gom	173
pt	362
be	250
crh	150
nds	188


In [8]:
!$kgtk query -i $WIKIDATA_PARTS/part.label.tsv.gz --graph-cache $STORE \
--match '(n1)-[:label]->(label)' \
--return 'distinct kgtk_lqstring_lang(label) as langauge, count(distinct label) as count' \
--order-by 'count(kgtk_lqstring_lang(label)) desc' \
-o $OUT/language.label.distribution.tsv

[2020-10-17 20:08:21 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT kgtk_lqstring_lang(graph_111_c1."node2") "langauge", count(DISTINCT graph_111_c1."node2") "count"
     FROM graph_111 AS graph_111_c1
     WHERE graph_111_c1."label"=?
     GROUP BY langauge
     ORDER BY count(kgtk_lqstring_lang(graph_111_c1."node2")) DESC
  PARAS: ['label']
---------------------------------------------
    54265.51 real      3271.07 user     50061.03 sys


In [12]:
!head -10 $OUT/language.label.distribution.tsv

langauge	count
en	72063244
nl	47885659
de	12428668
ast	12238864
fr	11285204
es	10692133
zh	8090879
pt	7586786
it	7364609


In [10]:
!wc $OUT/language.label.distribution.tsv

     396     791    4127 /Users/pedroszekely/Downloads/scratch/language.label.distribution.tsv


## Remove large classes from Wikidata

First create a file with all the edges we need for doing a test

In [12]:
!$kgtk cat \
-i $WIKIDATA_PARTS/part.alias.en.tsv.gz \
-i $WIKIDATA_PARTS/part.commonsMedia.tsv.gz \
-i $WIKIDATA_PARTS/part.description.en.tsv.gz \
-i $WIKIDATA_PARTS/part.external-id.tsv.gz \
-i $WIKIDATA_PARTS/part.geo-shape.tsv.gz \
-i $WIKIDATA_PARTS/part.globe-coordinate.tsv.gz \
-i $WIKIDATA_PARTS/part.label.en.tsv.gz \
-i $WIKIDATA_PARTS/part.math.tsv.gz \
-i $WIKIDATA_PARTS/part.monolingualtext.tsv.gz \
-i $WIKIDATA_PARTS/part.musical-notation.tsv.gz \
-i $WIKIDATA_PARTS/part.quantity.tsv.gz \
-i $WIKIDATA_PARTS/part.string.tsv.gz \
-i $WIKIDATA_PARTS/part.time.tsv.gz \
-i $WIKIDATA_PARTS/part.type.tsv.gz \
-i $WIKIDATA_PARTS/part.url.tsv.gz \
-i $WIKIDATA_PARTS/part.wikibase-form.tsv.gz \
-i $WIKIDATA_PARTS/part.wikibase-item.tsv.gz \
-i $WIKIDATA_PARTS/part.wikibase-property.tsv.gz \
-i $WIKIDATA_PARTS/part.wikidatatype.distribution.tsv.gz \
-i $WIKIDATA_PARTS/part.wikipedia-sitelink.tsv.gz \
| gzip > $WIKIDATA_PARTS/almost.all.edges.tsv.gz

     8035.64 real      7898.06 user        79.77 sys


Make a list of the Q-nodes that we want to remove

In [13]:
!$kgtk query -i $WIKIDATA_PARTS/part.wikibase-item.tsv.gz -i $WIKIDATA_PARTS/all.P279star.tsv.gz --graph-cache $STORE \
--match 'item: (n1)-[:P31]->(n2), P279star: (n2)-[:P279star]->(q)' \
--where "q in ['Q13442814', 'Q523', 'Q318', 'Q7318358', 'Q7187', 'Q11173', 'Q8054']"  \
--return 'distinct n1 as node1, n2 as node2' \
> $OUT/temp.items.remove.tsv

[2020-10-18 23:49:28 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_5_c1."node1" "node1", graph_2_c2."node1" "node2"
     FROM graph_2 AS graph_2_c2, graph_5 AS graph_5_c1
     WHERE graph_2_c2."label"=?
     AND graph_5_c1."label"=?
     AND graph_2_c2."node1"=graph_5_c1."node2"
     AND (graph_2_c2."node2" IN (?, ?, ?, ?, ?, ?, ?))
  PARAS: ['P279star', 'P31', 'Q13442814', 'Q523', 'Q318', 'Q7318358', 'Q7187', 'Q11173', 'Q8054']
---------------------------------------------
     1006.83 real       234.25 user       275.94 sys


Need to make a list of classes to remove: if we remove we should remove all the subclasses too

In [16]:
!wc $OUT/temp.items.remove.tsv 

 48643331 97286662 928980995 /Users/pedroszekely/Downloads/scratch/temp.items.remove.tsv


In [23]:
!head $OUT/temp.items.remove.tsv 

node1	node2
Q65225360	Q101487
Q221307	Q101487
Q413421	Q101487
Q415750	Q101487
Q411073	Q101487
Q416972	Q101487
Q905731	Q101487
Q49081089	Q101487
Q382897	Q101487


In [17]:
!$kgtk query -i $WIKIDATA_PARTS/all.P279star.tsv.gz --graph-cache $STORE \
--match '(n1)-[:P279star]->(q)' \
--where "q in ['Q13442814', 'Q523', 'Q318', 'Q7318358', 'Q7187', 'Q11173', 'Q8054']"  \
--return 'distinct n1 as node1, q as node2' \
> $OUT/temp.subclasses.remove.tsv

[2020-10-19 08:24:17 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_2_c1."node1" "node1", graph_2_c1."node2" "node2"
     FROM graph_2 AS graph_2_c1
     WHERE graph_2_c1."label"=?
     AND (graph_2_c1."node2" IN (?, ?, ?, ?, ?, ?, ?))
  PARAS: ['P279star', 'Q13442814', 'Q523', 'Q318', 'Q7318358', 'Q7187', 'Q11173', 'Q8054']
---------------------------------------------
       54.97 real        13.30 user        13.09 sys


In [18]:
!wc $OUT/temp.subclasses.remove.tsv

 2654438 5308876 43316342 /Users/pedroszekely/Downloads/scratch/temp.subclasses.remove.tsv


In [24]:
!head $OUT/temp.subclasses.remove.tsv

node1	node2
Q1000726	Q11173
Q1010629	Q11173
Q1013950	Q11173
Q101487	Q11173
Q101497	Q11173
Q1018145	Q11173
Q1018211	Q11173
Q1018722	Q11173
Q1018754	Q11173


In [22]:
!wd u Q1000726 Q1018754 Q98066085 Q982914

[90mid[39m Q1000726
[42mLabel[49m native strength
[44mDescription[49m organisches, makromolekulares Polysaccharid
[30m[47msubclass of[49m[39m [90m(P279)[39m[90m: [39mpolysaccharides [90m(Q134219)[39m

[90mid[39m Q1018754
[42mLabel[49m diazoles
[44mDescription[49m class of chemical compounds which contain five-membered aromatic ring with two nitrogen heteroatoms
[30m[47minstance of[49m[39m [90m(P31)[39m[90m: [39m structural class of chemical compounds [90m(Q47154513)[39m
[30m[47msubclass of[49m[39m [90m(P279)[39m[90m: [39morganonitrogen heterocyclic compound [90m(Q72084374)[39m | heteroarene [90m(Q907447)[39m

[90mid[39m Q98066085
[42mLabel[49m surface protease GP63 (pseudogene), putative
[44mDescription[49m بروتين في تريبانوسوما كروزية
[30m[47minstance of[49m[39m [90m(P31)[39m[90m: [39m protein [90m(Q8054)[39m
[30m[47msubclass of[49m[39m [90m(P279)[39m[90m: [39mpseudogenic transcript [90m(Q64698614)[39m

[90mid[39m

Get all the items we want to remove

In [25]:
!$kgtk cat --mode NONE -i $OUT/temp.items.remove.tsv  -i $OUT/temp.subclasses.remove.tsv >  $OUT/temp.things.remove.tsv

      201.60 real       197.55 user         2.73 sys


Get all the edges we want to remove

In [6]:
!$kgtk query -i $OUT/temp.things.remove.tsv -i $WIKIDATA_PARTS/almost.all.edges.tsv.gz --graph-cache $STORE \
--match 'remove: (n1)-[]->(), all: (n1)-[l]->(n2)' \
--return 'distinct n1 as node1, l.label as label, n2 as node2, l as id' \
--order-by l \
-o $OUT/temp.edges.remove.tsv.gz

[2020-10-19 11:34:14 sqlstore]: IMPORT graph directly into table graph_1 from /Users/pedroszekely/Downloads/scratch/temp.things.remove.tsv ...
[2020-10-19 11:35:37 sqlstore]: IMPORT graph directly into table graph_2 from /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/almost.all.edges.tsv.gz ...
[2020-10-19 13:28:58 query]: SQL Translation:
---------------------------------------------
  SELECT DISTINCT graph_2_c2."node1" "node1", graph_2_c2."label" "label", graph_2_c2."node2" "node2", graph_2_c2."id" "id"
     FROM graph_1 AS graph_1_c1, graph_2 AS graph_2_c2
     WHERE graph_1_c1."node1"=graph_2_c2."node1"
     ORDER BY graph_2_c2."id" ASC
  PARAS: []
---------------------------------------------
[2020-10-19 13:28:58 sqlstore]: CREATE INDEX on table graph_2 column node1 ...
[2020-10-19 14:10:58 sqlstore]: ANALYZE INDEX on table graph_2 column node1 ...
[2020-10-19 14:13:40 sqlstore]: CREATE INDEX on table graph_1 column node1 ...
[2020-10-19 14:14:20 sqlstore]: ANALYZE IND

In [11]:
!gzcat $OUT/temp.edges.remove.tsv.gz | wc

 984873984 5242393011 54970910630


We must sort the `almost.all.edges` file because ifnotexists will run out of memory if the files are not sorted. Note that we don't need to sort the `temp.edges.remove` because the `query` command has an `--order-by` clause.

In [22]:
!$kgtk sort2 \
    --columns id node1 label node2 \
    -X "--buffer-size 50% --parallel 4 -T $OUT/sort" \
    -i $WIKIDATA_PARTS/almost.all.edges.tsv.gz \
| gzip > $WIKIDATA_PARTS/almost.all.edges.sorted.tsv.gz

     7377.98 real      6833.53 user       647.69 sys


Now remove the edges

In [9]:
!$kgtk --timing --progress \
     ifnotexists --verbose \
     -i $WIKIDATA_PARTS/almost.all.edges.sorted.tsv.gz \
     --filter-on $OUT/temp.edges.remove.tsv.gz \
     --presorted \
     --input-keys id node1 label node2 \
     --filter-keys id node1 label node2 \
     -o $OUT/wikidata.minus.Q13442814.Q523.Q318.Q7318358.Q7187.Q11173.Q8054.edges.tsv.gz \
     --reject-file $OUT/rejected.edges.tsv.gz

KgtkIfEfexists version: 2020-10-20T00:17:59.814324+00:00#EbuHEPUZTUwEzyNtkR5BuhxPXaSCQze2GEwE595ETRtwEUcrAgPganWGSJiuEW0a3Y1DWNRvQxqgHe+vlColrw==
Opening the input file: /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/almost.all.edges.sorted.tsv.gz
KgtkReader: File_path.suffix: .gz
KgtkReader: reading gzip /Users/pedroszekely/Downloads/kypher/useful_wikidata_files/almost.all.edges.sorted.tsv.gz
header: id	node1	label	node2
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=1 label=2 node2=3 id=0
KgtkReader: Reading an edge file.
Opening the filter input file: /Users/pedroszekely/Downloads/scratch/temp.edges.remove.tsv.gz
KgtkReader: File_path.suffix: .gz
KgtkReader: reading gzip /Users/pedroszekely/Downloads/scratch/temp.edges.remove.tsv.gz
header: node1	label	node2	id
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=0 label=1 node2=2 id=3
KgtkReader: Reading an edge file.
Opening the output file: /Users/pedroszekel

See how many edges we have now

In [10]:
!gzcat $OUT/wikidata.minus.Q13442814.Q523.Q318.Q7318358.Q7187.Q11173.Q8054.edges.tsv.gz | wc

 462394640 2123743245 26042700951
