This notebook collects labels, aliases and descriptions for node1, property and node2 in a given KGTK edge file.

Parameters are set up in the first cell so that we can run this notebook in batch mode. Example invocation command:

```
papermill Example-9-Find-Labels-Aliases-and-Descriptions-for-a-KGTK-edge-file.ipynb example9.out.ipynb \
-p wlf /Users/amandeep/Github/kgtk/data/all.label.en.tsv.gz \
-p waf /Users/amandeep/Github/kgtk/data/all.alias.en.tsv.gz \
-p wdf /Users/amandeep/Github/kgtk/data/all.description.en.tsv.gz \
-p inf /Users/amandeep/Github/kgtk/data/statements/valid_edges.tsv \
-p out_folder /Users/amandeep/Github/kgtk/data/statements/output \
-p out_file valid_lad.tsv \
-p delete_database no \
-p run_node1 yes \
-p run_property yes \
-p run_node2 yes
```

To print a help message and exit:

```
papermill --help-notebook Example-9-Find-Labels-Aliases-and-Descriptions-for-a-KGTK-edge-file.ipynb
```

In [1]:
# Notebook parameters; papermill overrides them with `-p <name> <value>`.
# The trailing comment on each line is the help text reported by `papermill --help-notebook`.
wlf: str = '/Users/amandeep/Github/kgtk/data/all.label.en.tsv.gz'  # path to wiki labels file
waf: str = '/Users/amandeep/Github/kgtk/data/all.alias.en.tsv.gz'  # path to wiki alias file
wdf: str = '/Users/amandeep/Github/kgtk/data/all.description.en.tsv.gz'  # path to wiki description file
inf: str = '/Users/amandeep/Github/kgtk/data/statements/valid_edges.tsv'  # path to input kgtk edge file
out_folder: str = '/Users/amandeep/Github/kgtk/data/statements/output'  # output folder path
out_file: str = 'valid_lad.tsv'  # output file name
delete_database: str = 'no'  # 'yes' deletes the previous database before running
run_node1: str = 'yes'  # 'yes' finds labels, aliases and descriptions for node1
run_property: str = 'yes'  # 'yes' finds labels, aliases and descriptions for property
run_node2: str = 'yes'  # 'yes' finds labels, aliases and descriptions for node2
 

In [2]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd

### Setup Environment Variables

In [3]:
# Export every input twice: the full path under KEY and the bare file name
# (text after the last '/') under KEY_NAME, for use by the shell cells below.
for env_key, file_path in (
    ('INPUT_FILE', inf),
    ('WIKI_LABEL', wlf),
    ('WIKI_ALIAS', waf),
    ('WIKI_DESCRIPTION', wdf),
):
    os.environ[env_key] = file_path
    os.environ[f'{env_key}_NAME'] = file_path.rsplit('/', 1)[-1]

# Output location and the kgtk invocation prefix.
os.environ['OUT'] = out_folder
os.environ['OUT_FILE'] = out_file
os.environ['kgtk'] = "time kgtk --debug"

# Scratch folder inside the output folder; the graph cache database lives in it.
temp_dir = f'{out_folder}/temp'
os.environ['temp'] = temp_dir
os.environ['STORE'] = f"{temp_dir}/wikidata.sqlite3.db"

In [4]:
# Create the scratch folder that receives the intermediate files and the graph cache.
# Done in Python rather than `!mkdir -p $temp` so the result does not depend on how
# the user's shell splits an unquoted variable (e.g. a path containing spaces).
# exist_ok=True keeps the `mkdir -p` behaviour: missing parents are created and an
# already existing folder is not an error.
os.makedirs(os.environ['temp'], exist_ok=True)

In [None]:
# Optionally start from a fresh graph cache. The message is printed only after the
# file has really been removed, and a missing database is reported instead of
# producing an `rm` error.
if delete_database and delete_database.lower() == "yes":
    store_path = os.environ['STORE']
    if os.path.isfile(store_path):
        os.remove(store_path)
        print("Deleted database")
    else:
        print("No database to delete at {}".format(store_path))

In [6]:
def format_command(cmd, substitution_dictionary=None):
    """Fill in a templated command string, print it, and return it.

    Every occurrence of each key of ``substitution_dictionary`` found in ``cmd``
    is replaced by the matching value, following the mapping's iteration order.
    The command is only formatted here; running it is left to the caller.

    Args:
        cmd: command template containing the placeholder keys.
        substitution_dictionary: mapping of placeholder text to replacement
            text. ``None`` (the default) or an empty mapping leaves ``cmd``
            unchanged.

    Returns:
        The command string after all substitutions.
    """
    # A None default avoids sharing one mutable dict between calls and lets
    # callers pass None explicitly.
    for placeholder, replacement in (substitution_dictionary or {}).items():
        cmd = cmd.replace(placeholder, replacement)

    print(cmd)
    return cmd

In [7]:
# Print the exported settings, one per line, so the run log records what was used.
for shown_variable in ('kgtk', 'WIKI_LABEL', 'INPUT_FILE', 'INPUT_FILE_NAME', 'WIKI_LABEL_NAME', 'STORE'):
    print(os.environ.get(shown_variable, ''))

time kgtk --debug
/Users/amandeep/Github/kgtk/data/all.label.en.tsv.gz
/Users/amandeep/Github/kgtk/data/statements/valid_edges.tsv
valid_edges.tsv
all.label.en.tsv.gz
/Users/amandeep/Github/kgtk/data/statements/output/temp/wikidata.sqlite3.db


### Step 1: find labels for `node1` in the file

In [11]:
replace_dict = {
 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),
 'WIKI_LABEL_NAME': '`{}`'.format(os.environ['WIKI_LABEL_NAME'])
}

command = "time kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE \
 --graph-cache $STORE \
 -o $temp/node1_labels.tsv \
 --match 'INPUT_FILE_NAME: (n1)-[]->(), WIKI_LABEL_NAME: (n1)-[l:label]->(n2)' \
 --where 'n2.kgtk_lqstring_lang_suffix = \"en\"' \
 --return 'distinct n1 as node1, l.label, n2' \
 --order-by 'n1, l.label, n2'"
cmd = format_command(command, substitution_dictionary=replace_dict)
if run_node1 and run_node1.lower() == 'yes':
 !$cmd

$kgtk query -i $WIKI_LABEL -i $INPUT_FILE --graph-cache $STORE -o $temp/node1_labels.tsv --match '`valid_edges.tsv`: (n1)-[]->(), `all.label.en.tsv.gz`: (n1)-[l:label]->(n2)' --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'

[2020-10-15 17:29:45 sqlstore]: IMPORT graph directly into table graph_1 from /Users/amandeep/Github/kgtk/data/all.label.en.tsv.gz ...
[2020-10-15 17:38:43 sqlstore]: IMPORT graph directly into table graph_2 from /Users/amandeep/Github/kgtk/data/statements/valid_edges.tsv ...
[2020-10-15 17:38:43 query]: SQL Translation:
---------------------------------------------
 SELECT DISTINCT graph_2_c1."node1" "node1", graph_1_c2."label", graph_1_c2."node2"
 FROM graph_1 AS graph_1_c2, graph_2 AS graph_2_c1
 WHERE graph_1_c2."label"=?
 AND graph_1_c2."node1"=graph_2_c1."node1"
 ORDER BY graph_2_c1."node1" ASC, graph_1_c2."label" ASC, graph_1_c2."node2" ASC
 PARAS: ['label']
---------------------------------------------
[2020-10-15 17:38:43 sqlstore]

### Step 2: find aliases for `node1` in the file

In [19]:
replace_dict = {
 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),
 'WIKI_ALIAS_NAME': '`{}`'.format(os.environ['WIKI_ALIAS_NAME'])
}
command = "time kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE \
 --graph-cache $STORE \
 -o $temp/node1_aliases.tsv \
 --match 'INPUT_FILE_NAME: (n1)-[]->(), WIKI_ALIAS_NAME: (n1)-[l:alias]->(n2)' \
 --where 'n2.kgtk_lqstring_lang_suffix = \"en\"' \
 --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'"
cmd = format_command(command, substitution_dictionary=replace_dict)
if run_node1 and run_node1.lower() == 'yes':
 !$cmd

time kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE -o $temp/node1_aliases.tsv --match '`valid_edges.tsv`: (n1)-[]->(), `all.alias.en.tsv.gz`: (n1)-[l:alias]->(n2)' --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'
[2020-10-15 17:44:11 query]: SQL Translation:
---------------------------------------------
 SELECT DISTINCT graph_2_c1."node1" "node1", graph_3_c2."label", graph_3_c2."node2"
 FROM graph_2 AS graph_2_c1, graph_3 AS graph_3_c2
 WHERE graph_3_c2."label"=?
 AND graph_2_c1."node1"=graph_3_c2."node1"
 ORDER BY graph_2_c1."node1" ASC, graph_3_c2."label" ASC, graph_3_c2."node2" ASC
 PARAS: ['alias']
---------------------------------------------
kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE -o $temp/node1_aliases.tsv 0.72s user 0.11s system 99% cpu 0.837 total


### Step 3: find descriptions for `node1` in the file

In [8]:
replace_dict = {
 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),
 'WIKI_DESCRIPTION_NAME': '`{}`'.format(os.environ['WIKI_DESCRIPTION_NAME'])
}

command = "time kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE \
 --graph-cache $STORE \
 -o $temp/node1_descriptions.tsv \
 --match 'INPUT_FILE_NAME: (n1)-[]->(), WIKI_DESCRIPTION_NAME: (n1)-[l:description]->(n2)' \
 --where 'n2.kgtk_lqstring_lang_suffix = \"en\"' \
 --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'"

cmd = format_command(command, substitution_dictionary=replace_dict)
if run_node1 and run_node1.lower() == 'yes':
 !$cmd

time kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE --graph-cache $STORE -o $temp/node1_descriptions.tsv --match '`valid_edges.tsv`: (n1)-[]->(), `all.description.en.tsv.gz`: (n1)-[l:description]->(n2)' --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'
[2020-10-15 17:48:38 sqlstore]: IMPORT graph directly into table graph_3 from /Users/amandeep/Github/kgtk/data/all.description.en.tsv.gz ...
[2020-10-15 18:00:45 query]: SQL Translation:
---------------------------------------------
 SELECT DISTINCT graph_2_c1."node1" "node1", graph_3_c2."label", graph_3_c2."node2"
 FROM graph_2 AS graph_2_c1, graph_3 AS graph_3_c2
 WHERE graph_3_c2."label"=?
 AND graph_2_c1."node1"=graph_3_c2."node1"
 ORDER BY graph_2_c1."node1" ASC, graph_3_c2."label" ASC, graph_3_c2."node2" ASC
 PARAS: ['description']
---------------------------------------------
[2020-10-15 18:00:45 sqlstore]: CREATE INDEX on table graph_3 column label ...
[2020-10-15 18:02:20 sqlstore]: ANALYZE INDEX 

### Step 4: find labels for `node2` in the file

In [9]:
replace_dict = {
 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),
 'WIKI_LABEL_NAME': '`{}`'.format(os.environ['WIKI_LABEL_NAME'])
}

command = "time kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE \
 --graph-cache $STORE \
 -o $temp/node2_labels.tsv \
 --match 'INPUT_FILE_NAME: ()-[]->(n1), WIKI_LABEL_NAME: (n1)-[l:label]->(n2)' \
 --where 'n2.kgtk_lqstring_lang_suffix = \"en\"' \
 --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'"

cmd = format_command(command, substitution_dictionary=replace_dict)

if run_node2 and run_node2.lower() == 'yes':
 !$cmd

time kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE --graph-cache $STORE -o $temp/node2_labels.tsv --match '`valid_edges.tsv`: ()-[]->(n1), `all.label.en.tsv.gz`: (n1)-[l:label]->(n2)' --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'
[2020-10-15 18:04:41 query]: SQL Translation:
---------------------------------------------
 SELECT DISTINCT graph_2_c1."node2" "node1", graph_1_c2."label", graph_1_c2."node2"
 FROM graph_1 AS graph_1_c2, graph_2 AS graph_2_c1
 WHERE graph_1_c2."label"=?
 AND graph_1_c2."node1"=graph_2_c1."node2"
 ORDER BY graph_2_c1."node2" ASC, graph_1_c2."label" ASC, graph_1_c2."node2" ASC
 PARAS: ['label']
---------------------------------------------
[2020-10-15 18:04:41 sqlstore]: CREATE INDEX on table graph_2 column node2 ...
[2020-10-15 18:04:41 sqlstore]: ANALYZE INDEX on table graph_2 column node2 ...
kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE --graph-cache $STORE -o 0.93s user 0.49s system 38% cpu 3.663 total


### Step 5: find aliases for `node2` in the file

In [10]:
replace_dict = {
 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),
 'WIKI_ALIAS_NAME': '`{}`'.format(os.environ['WIKI_ALIAS_NAME'])
}

command = "time kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE \
 --graph-cache $STORE \
 -o $temp/node2_aliases.tsv \
 --match 'INPUT_FILE_NAME: ()-[]->(n1), WIKI_ALIAS_NAME: (n1)-[l:alias]->(n2)' \
 --where 'n2.kgtk_lqstring_lang_suffix = \"en\"' \
 --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'"

cmd = format_command(command, substitution_dictionary=replace_dict)
if run_node2 and run_node2.lower() == 'yes':
 !$cmd

time kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE --graph-cache $STORE -o $temp/node2_aliases.tsv --match '`valid_edges.tsv`: ()-[]->(n1), `all.alias.en.tsv.gz`: (n1)-[l:alias]->(n2)' --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'
[2020-10-15 18:05:22 sqlstore]: IMPORT graph directly into table graph_4 from /Users/amandeep/Github/kgtk/data/all.alias.en.tsv.gz ...
[2020-10-15 18:05:55 query]: SQL Translation:
---------------------------------------------
 SELECT DISTINCT graph_2_c1."node2" "node1", graph_4_c2."label", graph_4_c2."node2"
 FROM graph_2 AS graph_2_c1, graph_4 AS graph_4_c2
 WHERE graph_4_c2."label"=?
 AND graph_2_c1."node2"=graph_4_c2."node1"
 ORDER BY graph_2_c1."node2" ASC, graph_4_c2."label" ASC, graph_4_c2."node2" ASC
 PARAS: ['alias']
---------------------------------------------
[2020-10-15 18:05:55 sqlstore]: CREATE INDEX on table graph_4 column label ...
[2020-10-15 18:05:58 sqlstore]: ANALYZE INDEX on table graph_4 column label ...
[

### Step 6: find descriptions for `node2` in the file

In [11]:
replace_dict = {
 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),
 'WIKI_DESCRIPTION_NAME': '`{}`'.format(os.environ['WIKI_DESCRIPTION_NAME'])
}

command = "time kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE \
 --graph-cache $STORE \
 -o $temp/node2_descriptions.tsv \
 --match 'INPUT_FILE_NAME: ()-[]->(n1), WIKI_DESCRIPTION_NAME: (n1)-[l:description]->(n2)' \
 --where 'n2.kgtk_lqstring_lang_suffix = \"en\"' \
 --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'"

cmd = format_command(command, substitution_dictionary=replace_dict)
if run_node2 and run_node2.lower() == 'yes':
 !$cmd

time kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE --graph-cache $STORE -o $temp/node2_descriptions.tsv --match '`valid_edges.tsv`: ()-[]->(n1), `all.description.en.tsv.gz`: (n1)-[l:description]->(n2)' --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'
[2020-10-15 18:06:36 query]: SQL Translation:
---------------------------------------------
 SELECT DISTINCT graph_2_c1."node2" "node1", graph_3_c2."label", graph_3_c2."node2"
 FROM graph_2 AS graph_2_c1, graph_3 AS graph_3_c2
 WHERE graph_3_c2."label"=?
 AND graph_2_c1."node2"=graph_3_c2."node1"
 ORDER BY graph_2_c1."node2" ASC, graph_3_c2."label" ASC, graph_3_c2."node2" ASC
 PARAS: ['description']
---------------------------------------------
kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE --graph-cache $STORE - 0.75s user 0.26s system 59% cpu 1.708 total


### Step 7: find labels for `property` in the file

In [12]:
replace_dict = {
 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),
 'WIKI_LABEL_NAME': '`{}`'.format(os.environ['WIKI_LABEL_NAME'])
}

command = "time kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE \
 --graph-cache $STORE \
 -o $temp/property_labels.tsv \
 --match 'INPUT_FILE_NAME: ()-[id {label: prop}]->(), WIKI_LABEL_NAME: (prop)-[l:label]->(n2)' \
 --where 'n2.kgtk_lqstring_lang_suffix = \"en\"' \
 --return 'distinct prop as node1, l.label, n2' \
 --order-by 'prop, l.label, n2'"

cmd = format_command(command, substitution_dictionary=replace_dict)
if run_property and run_property.lower() == 'yes':
 !$cmd

time kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE --graph-cache $STORE -o $temp/property_labels.tsv --match '`valid_edges.tsv`: ()-[id {label: prop}]->(), `all.label.en.tsv.gz`: (prop)-[l:label]->(n2)' --return 'distinct prop as node1, l.label, n2' --order-by 'prop, l.label, n2'
[2020-10-15 18:07:01 query]: SQL Translation:
---------------------------------------------
 SELECT DISTINCT graph_1_c2."node1" "node1", graph_1_c2."label", graph_1_c2."node2"
 FROM graph_1 AS graph_1_c2, graph_2 AS graph_2_c1
 WHERE graph_1_c2."label"=?
 AND graph_2_c1."label"=graph_1_c2."node1"
 AND graph_1_c2."node1"=graph_2_c1."label"
 ORDER BY graph_1_c2."node1" ASC, graph_1_c2."label" ASC, graph_1_c2."node2" ASC
 PARAS: ['label']
---------------------------------------------
[2020-10-15 18:07:01 sqlstore]: CREATE INDEX on table graph_2 column label ...
[2020-10-15 18:07:01 sqlstore]: ANALYZE INDEX on table graph_2 column label ...
kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE --graph-cache $STORE -

### Step 8: find aliases for `property` in the file


In [13]:
replace_dict = {
 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),
 'WIKI_ALIAS_NAME': '`{}`'.format(os.environ['WIKI_ALIAS_NAME'])
}

command = "time kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE \
 --graph-cache $STORE \
 -o $temp/property_aliases.tsv \
 --match 'INPUT_FILE_NAME: ()-[id {label: prop}]->(), WIKI_ALIAS_NAME: (prop)-[l:alias]->(n2)' \
 --where 'n2.kgtk_lqstring_lang_suffix = \"en\"' \
 --return 'distinct prop as node1, l.label, n2' \
 --order-by 'prop, l.label, n2'"
cmd = format_command(command, substitution_dictionary=replace_dict)
if run_property and run_property.lower() == 'yes':
 !$cmd

time kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE --graph-cache $STORE -o $temp/property_aliases.tsv --match '`valid_edges.tsv`: ()-[id {label: prop}]->(), `all.alias.en.tsv.gz`: (prop)-[l:alias]->(n2)' --return 'distinct prop as node1, l.label, n2' --order-by 'prop, l.label, n2'
[2020-10-15 18:07:33 query]: SQL Translation:
---------------------------------------------
 SELECT DISTINCT graph_4_c2."node1" "node1", graph_4_c2."label", graph_4_c2."node2"
 FROM graph_2 AS graph_2_c1, graph_4 AS graph_4_c2
 WHERE graph_2_c1."label"=graph_4_c2."node1"
 AND graph_4_c2."label"=?
 AND graph_2_c1."label"=graph_4_c2."node1"
 ORDER BY graph_4_c2."node1" ASC, graph_4_c2."label" ASC, graph_4_c2."node2" ASC
 PARAS: ['alias']
---------------------------------------------
kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE --graph-cache $STORE -o 1.04s user 0.09s system 99% cpu 1.145 total


### Step 9: find descriptions for `property` in the file

In [14]:
replace_dict = {
 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),
 'WIKI_DESCRIPTION_NAME': '`{}`'.format(os.environ['WIKI_DESCRIPTION_NAME'])
}

command = "time kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE \
 --graph-cache $STORE \
 -o $temp/property_descriptions.tsv \
 --match 'INPUT_FILE_NAME: ()-[id {label: prop}]->(), WIKI_DESCRIPTION_NAME: (prop)-[l:description]->(n2)' \
 --where 'n2.kgtk_lqstring_lang_suffix = \"en\"' \
 --return 'distinct prop as node1, l.label, n2' \
 --order-by 'prop, l.label, n2'"

cmd = format_command(command, substitution_dictionary=replace_dict)
if run_property and run_property.lower() == 'yes':
 !$cmd

time kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE --graph-cache $STORE -o $temp/property_descriptions.tsv --match '`valid_edges.tsv`: ()-[id {label: prop}]->(), `all.description.en.tsv.gz`: (prop)-[l:description]->(n2)' --return 'distinct prop as node1, l.label, n2' --order-by 'prop, l.label, n2'
[2020-10-15 18:07:51 query]: SQL Translation:
---------------------------------------------
 SELECT DISTINCT graph_2_c1."label" "node1", graph_3_c2."label", graph_3_c2."node2"
 FROM graph_2 AS graph_2_c1, graph_3 AS graph_3_c2
 WHERE graph_2_c1."label"=graph_2_c1."label"
 AND graph_3_c2."label"=?
 AND graph_2_c1."label"=graph_3_c2."node1"
 ORDER BY graph_2_c1."label" ASC, graph_3_c2."label" ASC, graph_3_c2."node2" ASC
 PARAS: ['description']
---------------------------------------------
kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE --graph-cache $STORE - 0.66s user 0.09s system 93% cpu 0.802 total


### Step 10: cat all the files into one file

In [16]:
file_list = []
if run_node1 and run_node1.lower() == 'yes':
 file_list.append('{}/node1_labels.tsv'.format(os.environ['temp']))
 file_list.append('{}/node1_aliases.tsv'.format(os.environ['temp']))
 file_list.append('{}/node1_descriptions.tsv'.format(os.environ['temp']))

if run_node2 and run_node2.lower() == 'yes':
 file_list.append('{}/node2_labels.tsv'.format(os.environ['temp']))
 file_list.append('{}/node2_aliases.tsv'.format(os.environ['temp']))
 file_list.append('{}/node2_descriptions.tsv'.format(os.environ['temp']))

if run_property and run_property.lower() == 'yes':
 file_list.append('{}/property_labels.tsv'.format(os.environ['temp']))
 file_list.append('{}/property_aliases.tsv'.format(os.environ['temp']))
 file_list.append('{}/property_descriptions.tsv'.format(os.environ['temp']))

f_list = ' '.join(file_list).strip()
print(f_list)
kgtk_cat_command = 'kgtk cat -i {} -o $temp/all_labels_aliases_descriptions_duplicates.tsv'.format(f_list)

!$kgtk_cat_command

### Step 11: sort and compact 

In [None]:
!kgtk sort2 -i $temp/all_labels_aliases_descriptions_duplicates.tsv \
 -o $temp/all_labels_aliases_descriptions_sorted.tsv

!kgtk compact -i $temp/all_labels_aliases_descriptions_sorted.tsv \
 -o $temp/all_labels_aliases_descriptions_sorted_dedup.tsv

### Step 12: add ids

In [18]:
!kgtk add-id -i $temp/all_labels_aliases_descriptions_sorted_dedup.tsv \
 -o $OUT/$OUT_FILE \
 --id-style node1-label-num