# Import Wikidata

This notebook assumes the file `latest-all.json.bz2` has already been [downloaded](https://dumps.wikimedia.org/wikidatawiki/entities/) and stored in the folder given by `input_path` in the cell marked `# Parameters`.

You can also download the `gz` version; in that case, update the variable `wikidata_json_file` with the correct file name.

In [1]:
import os

from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher

In [2]:
# Parameters

# Local folders: where the Wikidata dump is read from, and where the
# output and temporary folders are created.
input_path = "/data02/ana_iglesias/data"
output_path = "/data02/ana_iglesias/data"
project_name = "import-wikidata"

# KGTK checkout location and the name of the dump file to import.
kgtk_path = "/data02/ana_iglesias/Github/kgtk"
wikidata_json_file = "latest-all.json.bz2"

# Sort executable exported as SORT_COMMAND; the alternative is "gsort".
sort_command = "sort"

In [3]:
# An empty file list is handed to ConfigureKGTK; the notebook only needs the
# environment (folders and variables) that configure_kgtk sets up.
files = []

ck = ConfigureKGTK(files, kgtk_path=kgtk_path)
ck.configure_kgtk(
    project_name=project_name,
    output_path=output_path,
    input_graph_path=input_path,
)

User home: /data02/ana_iglesias
Current dir: /data02/ana_iglesias
KGTK dir: /data02/ana_iglesias/Github/kgtk
Use-cases dir: /data02/ana_iglesias/Github/kgtk/use-cases


In [4]:
# Show the environment variables available after configuration (GRAPH, OUT,
# TEMP, ...); the shell cells below refer to them as ${NAME}.
ck.print_env_variables()

KGTK_LABEL_FILE: /data02/ana_iglesias/data/labels.en.tsv.gz
kgtk: kgtk
KGTK_OPTION_DEBUG: false
GRAPH: /data02/ana_iglesias/data
KGTK_GRAPH_CACHE: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db
kypher: kgtk query --graph-cache /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db
OUT: /data02/ana_iglesias/data/import-wikidata
TEMP: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata
EXAMPLES_DIR: /data02/ana_iglesias/Github/kgtk/examples
STORE: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db
USE_CASES_DIR: /data02/ana_iglesias/Github/kgtk/use-cases


## Define environment variables (run this step as-is; no changes required)

In [11]:
# Environment variables read by the `!kgtk ...` shell cells of this notebook.
os.environ.update({
    # Location of the Wikidata JSON dump inside the GRAPH folder.
    "WIKIDATA_ALL_JSON": f"{os.environ['GRAPH']}/{wikidata_json_file}",

    # Work file extensions
    "UNSORTED_KGTK": "unsorted.tsv.gz",
    "SORTED_KGTK": "tsv.gz",

    # Use mgzip in some cases?
    "USE_MGZIP": "TRUE",

    # Select one of the following gzip implementations:
    # GZIP_CMD=bzip
    "GZIP_CMD": "pigz",

    # Some common flags:
    # KGTK_FLAGS="--debug --timing --progress --progress-tty `tty`"
    "KGTK_FLAGS": "--debug --timing",
    "VERBOSE": "--verbose",
    "SORT_EXTRAS": f"--parallel 6 --buffer-size 50% -T {os.environ['TEMP']}",

    # Sort executable chosen in the Parameters cell.
    "SORT_COMMAND": sort_command,
})

# The Wikidata datatypes:
WIKIDATATYPES = [
    "commonsMedia",
    "external-id",
    "geo-shape",
    "globe-coordinate",
    "math",
    "monolingualtext",
    "musical-notation",
    "quantity",
    "string",
    "tabular-data",
    "time",
    "url",
    "wikibase-form",
    "wikibase-item",
    "wikibase-lexeme",
    "wikibase-property",
    "wikibase-sense",
    "other",
]

# The wikidata import split files to be sorted:
WIKIDATA_IMPORT_SPLIT_FILES = [
    "claims",
    "claims.badvalue",
    "claims.novalue",
    "claims.somevalue",
    "qualifiers",
    "qualifiers.badvalue",
    "qualifiers.badvalueClaims",
    "qualifiers.novalue",
    "qualifiers.novalueClaims",
    "qualifiers.somevalue",
    "qualifiers.somevalueClaims",
    "aliases",
    "aliases.en",
    "descriptions",
    "descriptions.en",
    "labels",
    "labels.en",
    "sitelinks",
    "sitelinks.en",
    "sitelinks.en.qualifiers",
    "sitelinks.qualifiers",
    "metadata.node",
    "metadata.property.datatypes",
    "metadata.types",
]

## Run the `import-wikidata` command

**NOTE**:
This command is set to import only English labels/aliases/descriptions, as controlled by the parameters `--all-languages False` and `--lang en`.

If you wish to import all languages, simply set `--all-languages True`.

In [None]:
# Run `kgtk import-wikidata` on the dump named by WIKIDATA_ALL_JSON.
# Every output file is written to TEMP with the UNSORTED_KGTK extension:
# the node file, the raw claims and qualifiers, the invalid ("badvalue")
# claims and qualifiers, and the split alias / description / label /
# datatype / entry-type / sitelink files.
# Console output is copied to TEMP/import-split-wikidata.log through `tee`.
# NOTE(review): `--procs 12` looks sized for the original machine; adjust to
# the local CPU count if needed.
!kgtk ${KGTK_FLAGS} \
 import-wikidata \
 -i ${WIKIDATA_ALL_JSON} \
 --node-file ${TEMP}/metadata.node.${UNSORTED_KGTK} \
 --minimal-edge-file ${TEMP}/claims.raw.${UNSORTED_KGTK} \
 --minimal-qual-file ${TEMP}/qualifiers.raw.${UNSORTED_KGTK} \
 --invalid-edge-file ${TEMP}/claims.badvalue.${UNSORTED_KGTK} \
 --invalid-qual-file ${TEMP}/qualifiers.badvalue.${UNSORTED_KGTK} \
 --node-file-id-only \
 --explode-values False \
 --all-languages False \
 --lang en \
 --alias-edges True \
 --split-alias-file ${TEMP}/aliases.${UNSORTED_KGTK} \
 --split-en-alias-file ${TEMP}/aliases.en.${UNSORTED_KGTK} \
 --description-edges True \
 --split-description-file ${TEMP}/descriptions.${UNSORTED_KGTK} \
 --split-en-description-file ${TEMP}/descriptions.en.${UNSORTED_KGTK} \
 --label-edges True \
 --split-label-file ${TEMP}/labels.${UNSORTED_KGTK} \
 --split-en-label-file ${TEMP}/labels.en.${UNSORTED_KGTK} \
 --datatype-edges True \
 --split-datatype-file ${TEMP}/metadata.property.datatypes.${UNSORTED_KGTK} \
 --entry-type-edges True \
 --split-type-file ${TEMP}/metadata.types.${UNSORTED_KGTK} \
 --sitelink-edges True \
 --sitelink-verbose-edges True \
 --split-sitelink-file ${TEMP}/sitelinks.raw.${UNSORTED_KGTK} \
 --split-en-sitelink-file ${TEMP}/sitelinks.en.raw.${UNSORTED_KGTK} \
 --value-hash-width 6 \
 --claim-id-hash-width 8 \
 --use-kgtkwriter True \
 --use-mgzip-for-input False \
 --use-mgzip-for-output False \
 --use-shm True \
 --procs 12 \
 --mapper-batch-size 5 \
 --max-size-per-mapper-queue 3 \
 --single-mapper-queue True \
 --collect-results True \
 --collect-seperately True\
 --collector-batch-size 5 \
 --collector-queue-per-proc-size 3 \
 --progress-interval 500000 \
 --clean \
 --allow-end-of-day False \
 --repair-month-or-day-zero \
 --minimum-valid-year 1 \
 --maximum-valid-year 9999 \
 --validate-fromisoformat \
 --repair-lax-coordinates \
 --allow-language-suffixes \
 --allow-wikidata-lq-strings \
 | tee ${TEMP}/import-split-wikidata.log


kgtk import-wikidata version: 2021-11-17T01:38:17.437678+00:00#9z/aARcXhiV2hPdyVXjAREcpZwh2MawWFp6numz8GZBCtAg2WypLYAFpHjP43k97Zj8VHVaoel0oEit9KHXH0w==
Starting main process (pid 40232).
Processing.
Processing wikidata file /data02/ana_iglesias/data/latest-all.json.bz2
Decompressing (bz2)
Creating the collector queue.
The collector node queue has been created (maxsize=36).
Creating the node_collector.
Creating the node collector process.
Starting the node collector process.
Started the node collector process.
The node collector is starting (pid 40309).
The collector edge queue has been created (maxsize=36).
Creating the edge_collector.
Creating the edge collector process.
Starting the edge collector process.
Started the edge collector process.
The edge collector is starting (pid 40310).
The collector qual queue has been created (maxsize=36).
Creating the qual_collector.
Creating the qual collector process.
Starting the qual collector process.
Started the qual collector process.
The qua


*** Qualifier collision #1 detected for Q380373-P26-Q1141121-48bebee4-0-P580-2e184a (^1294-00-00T00:00:00Z/9)

*** Qualifier collision #1 detected for Q380868-P26-Q384941-46f6240f-0-P580-4b742f (^1533-08-25T00:00:00Z/11)

*** Qualifier collision #1 detected for Q453771-P26-Q443876-84acba5b-0-P580-84a26a (^1446-00-00T00:00:00Z/9)

*** Qualifier collision #1 detected for Q477343-P26-Q3374718-c7014aa0-0-P580-a95d2d (^1573-10-27T00:00:00Z/11)

*** Qualifier collision #1 detected for Q1834423-P26-Q322841-6c85598c-0-P580-876067 (^1559-06-16T00:00:00Z/11)

*** Qualifier collision #1 detected for Q3007367-P26-Q430782-f64d3af2-0-P580-5b468d (^1555-02-07T00:00:00Z/11)

*** Sitelink collision #1 detected for Q4299475-wikipedia_sitelink-d13454 (https://.wikipedia.org/wiki/Template:Bot)

*** Sitelink collision #1 detected for Q4847311-wikipedia_sitelink-69c00a (https://.wikipedia.org/wiki/Template:Delete)

*** Sitelink collision #1 detected for Q5406510-wikipedia_sitelink-97590b (https://.wikipedi


*** Qualifier collision #1 detected for Q203647-P26-Q2284422-aed54bb0-0-P580-246002 (^1045-00-00T00:00:00Z/9)

*** Qualifier collision #1 detected for Q232137-P26-Q41847-0dcc4fd6-0-P580-fe3abc (^0956-00-00T00:00:00Z/9)

*** Qualifier collision #1 detected for Q263474-P26-Q3044-90d4ea9f-0-P580-9b0b8a (^0770-00-00T00:00:00Z/9)

*** Qualifier collision #1 detected for Q333359-P26-Q3052486-9cbd9d9e-0-P580-355ae9 (^0960-00-00T00:00:00Z/9)

*** Qualifier collision #1 detected for Q454810-P26-Q702209-74f88753-0-P580-e60df9 (^1476-08-25T00:00:00Z/11)

*** Qualifier collision #1 detected for Q672446-P26-Q2912335-f19e5091-0-P580-93d3bd (^1447-12-14T00:00:00Z/11)

*** Qualifier collision #1 detected for Q674931-P26-Q19601994-f7d507fb-0-P580-9b41a5 (^1222-00-00T00:00:00Z/9)

*** Qualifier collision #1 detected for Q694351-P26-Q329555-c88da6e5-0-P580-15a1f0 (^1381-09-02T00:00:00Z/11)

*** Qualifier collision #1 detected for Q702602-P26-Q79176-0f28ed9a-0-P580-676c21 (^1431-06-03T00:00:00Z/11)

*** 


*** Qualifier collision #1 detected for Q241797-P26-Q7731-b7834ae7-0-P580-a01064 (^1671-02-01T00:00:00Z/11)

*** Qualifier collision #1 detected for Q262059-P26-Q187312-c501aba2-0-P580-7e48ad (^1302-00-00T00:00:00Z/9)

*** Qualifier collision #1 detected for Q267483-P26-Q57920-80635ac2-0-P580-c0fc4c (^1570-01-08T00:00:00Z/11)

*** Qualifier collision #1 detected for Q271799-P26-Q169319-a97c2304-0-P580-0d082c (^1523-12-11T00:00:00Z/11)

*** Qualifier collision #1 detected for Q326738-P26-Q684224-2df6ee20-0-P580-ff2137 (^1524-01-17T00:00:00Z/11)

*** Qualifier collision #1 detected for Q327572-P26-Q68952-ae5f6316-0-P580-5906e2 (^1563-05-10T00:00:00Z/11)

*** Qualifier collision #1 detected for Q374210-P26-Q4768218-c9e0eacd-0-P580-ef8382 (^1571-12-19T00:00:00Z/11)

*** Qualifier collision #1 detected for Q384941-P26-Q380868-4ca9581a-0-P580-4b742f (^1533-08-25T00:00:00Z/11)

*** Qualifier collision #1 detected for Q536174-P26-Q551752-c9a99a5e-0-P580-16c9b2 (^1229-00-00T00:00:00Z/9)

*** Q


*** Qualifier collision #1 detected for Q122794-P26-Q430950-e085ea2d-0-P580-94ae3a (^1577-10-20T00:00:00Z/11)

*** Qualifier collision #1 detected for Q124682-P26-Q337057-4fb67536-0-P580-db5ec5 (^1389-08-17T00:00:00Z/11)

*** Qualifier collision #1 detected for Q130005-P26-Q259564-c738415f-0-P580-a4a595 (^1045-01-23T00:00:00Z/11)

*** Qualifier collision #1 detected for Q132545-P26-Q131552-2fbc7eb5-0-P580-e56690 (^1533-10-28T00:00:00Z/11)

*** Qualifier collision #1 detected for Q134452-P26-Q201143-a2079e30-0-P580-7c0e43 (^1491-12-06T00:00:00Z/11)

*** Qualifier collision #1 detected for Q160349-P26-Q154064-ec5ff971-0-P580-7a7cba (^1385-07-17T00:00:00Z/11)

*** Qualifier collision #1 detected for Q220845-P26-Q936976-0f99833d-0-P580-5eeb19 (^1572-08-18T00:00:00Z/11)

*** Qualifier collision #1 detected for Q234257-P26-Q170398-56a0eb9a-0-P580-850b4d (^1816-01-24T00:00:00Z/11)

*** Qualifier collision #1 detected for Q259564-P26-Q130005-bd5ab415-0-P580-a4a595 (^1045-01-23T00:00:00Z/11)




*** Qualifier collision #1 detected for Q172203-P26-Q229419-b442326a-0-P580-a50c51 (^1262-05-28T00:00:00Z/11)

*** Qualifier collision #1 detected for Q174964-P26-Q231798-bd2d3d6b-0-P580-dc0f7a (^1322-09-21T00:00:00Z/11)

*** Qualifier collision #1 detected for Q202566-P26-Q688471-440b6399-0-P580-283d12 (^1531-09-20T00:00:00Z/11)

*** Qualifier collision #1 detected for Q232801-P26-Q721680-fa26b14e-0-P580-70598b (^1473-00-00T00:00:00Z/9)

*** Qualifier collision #1 detected for Q270234-P26-Q210569-6b693078-0-P580-f6928a (^1446-06-20T00:00:00Z/11)

*** Sitelink collision #1 detected for Q274740-wikipedia_sitelink-d71f3f (https://tl.wikipedia.org/wiki/Beaurevoir)

*** Qualifier collision #1 detected for Q325824-P26-Q547225-762b0607-0-P580-df29d7 (^1467-00-00T00:00:00Z/9)

*** Qualifier collision #1 detected for Q441394-P26-Q57161-47bffbac-0-P580-77780b (^1308-00-00T00:00:00Z/9)

*** Qualifier collision #1 detected for Q454769-P26-Q76956-91d862f6-0-P580-981a99 (^1245-00-00T00:00:00Z/9)



5000000 lines processed by processor 6
The qual collector called 10000000 times: 0 nrows, 0 erows, 190433714 qrows, 0 invalid erows, 0 invalid qrows
The node collector called 12000000 times: 60000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows
The edge collector called 12000000 times: 0 nrows, 973473705 erows, 0 qrows, 0 invalid erows, 0 invalid qrows
5000000 lines processed by processor 11
The description collector called 11500000 times: 0 nrows, 48252977 erows, 0 qrows, 0 invalid erows, 0 invalid qrows
5000000 lines processed by processor 3
5000000 lines processed by processor 0
5000000 lines processed by processor 5
5000000 lines processed by processor 9
5000000 lines processed by processor 4

*** Sitelink collision #1 detected for Q7164-wikipedia_sitelink-65f45f (https://tr.wikipedia.org/wiki/Dünya_Bankası)

*** Qualifier collision #1 detected for Q7731-P26-Q259907-7f7cc241-0-P580-8d5052 (^1648-01-26T00:00:00Z/11)

*** Qualifier collision #1 detected for Q7731-P26-Q24

## Split `somevalue` and `novalue` from `claims.raw.unsorted.tsv.gz`

In [None]:
# Split the raw claims in one `kgtk filter` pass. Each `--pattern` is paired
# with the `-o` file that follows it, so option order matters:
#   ";; novalue"   -> claims.novalue
#   ";; somevalue" -> claims.somevalue
# (presumably the pattern tests the node2 column - confirm against the
# kgtk filter documentation). Edges matching neither pattern go to the
# reject file, claims.<UNSORTED_KGTK>.
!kgtk ${KGTK_FLAGS} \
 filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \
 --input-file ${TEMP}/claims.raw.${UNSORTED_KGTK} \
 --first-match-only \
 --pattern ";; novalue" -o ${TEMP}/claims.novalue.${UNSORTED_KGTK} \
 --pattern ";; somevalue" -o ${TEMP}/claims.somevalue.${UNSORTED_KGTK} \
 --reject-file ${TEMP}/claims.${UNSORTED_KGTK} \
 | tee ${TEMP}/split-claims-missing-values.log

## Split `somevalue` and `novalue` from `qualifiers.raw.unsorted.tsv.gz`

In [None]:
# Split the raw qualifiers with a four-stage pipeline; each stage passes its
# rejected rows to the next stage through `--reject-file -`:
#   1. filter:   ";; novalue" -> qualifiers.novalue,
#                ";; somevalue" -> qualifiers.somevalue
#   2. ifexists: node1 found among the ids of claims.novalue
#                -> qualifiers.novalueClaims
#   3. ifexists: node1 found among the ids of claims.somevalue
#                -> qualifiers.somevalueClaims
#   4. ifexists: node1 found among the ids of claims.badvalue
#                -> qualifiers.badvalueClaims
# Whatever is left is written to qualifiers.<UNSORTED_KGTK>.
# Reads claims.novalue / claims.somevalue / claims.badvalue from TEMP, so the
# import and claims-splitting cells must have been run first.
!kgtk ${KGTK_FLAGS} \
 filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \
 --input-file ${TEMP}/qualifiers.raw.${UNSORTED_KGTK} \
 --first-match-only \
 --pattern ";; novalue" -o ${TEMP}/qualifiers.novalue.${UNSORTED_KGTK} \
 --pattern ";; somevalue" -o ${TEMP}/qualifiers.somevalue.${UNSORTED_KGTK} \
 --reject-file - \
 / ifexists ${VERBOSE} \
 --input-keys node1 \
 --filter-file ${TEMP}/claims.novalue.${UNSORTED_KGTK} \
 --filter-keys id \
 --output-file ${TEMP}/qualifiers.novalueClaims.${UNSORTED_KGTK} \
 --reject-file - \
 / ifexists ${VERBOSE} \
 --input-keys node1 \
 --filter-file ${TEMP}/claims.somevalue.${UNSORTED_KGTK} \
 --filter-keys id \
 --output-file ${TEMP}/qualifiers.somevalueClaims.${UNSORTED_KGTK} \
 --reject-file - \
 / ifexists ${VERBOSE} \
 --input-keys node1 \
 --filter-file ${TEMP}/claims.badvalue.${UNSORTED_KGTK} \
 --filter-keys id \
 --output-file ${TEMP}/qualifiers.badvalueClaims.${UNSORTED_KGTK} \
 --reject-file ${TEMP}/qualifiers.${UNSORTED_KGTK} \
 | tee ${TEMP}/split-qualifiers-missing-values.log

## Split `sitelinks.raw.unsorted.tsv.gz`

In [None]:
# Separate the sitelink qualifier edges from the sitelink edges: rows matching
# the pattern on sitelink-badge / sitelink-language / sitelink-site /
# sitelink-title go to sitelinks.qualifiers, all other rows go to sitelinks.
# (presumably the pattern tests the label column - confirm against the
# kgtk filter documentation).
!kgtk ${KGTK_FLAGS} \
 filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \
 --input-file ${TEMP}/sitelinks.raw.${UNSORTED_KGTK} \
 --pattern "; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;" \
 --output-file ${TEMP}/sitelinks.qualifiers.${UNSORTED_KGTK} \
 --reject-file ${TEMP}/sitelinks.${UNSORTED_KGTK} \
 | tee ${TEMP}/split-sitelink-qualifiers.log

## Split `sitelinks.en.raw.unsorted.tsv.gz`

In [None]:
# Same split as for sitelinks.raw, applied to the English sitelinks: rows
# matching the sitelink-badge / sitelink-language / sitelink-site /
# sitelink-title pattern go to sitelinks.en.qualifiers, all other rows go to
# sitelinks.en.
!kgtk ${KGTK_FLAGS} \
 filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \
 --input-file ${TEMP}/sitelinks.en.raw.${UNSORTED_KGTK} \
 --pattern "; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;" \
 --output-file ${TEMP}/sitelinks.en.qualifiers.${UNSORTED_KGTK} \
 --reject-file ${TEMP}/sitelinks.en.${UNSORTED_KGTK} \
 | tee ${TEMP}/split-sitelink-en-qualifiers.log

## Sort the files from `TEMP` to `OUT` folder

In [None]:
for TARGET in WIKIDATA_IMPORT_SPLIT_FILES:
 print(f"Sort the {TARGET} file.")
 input_file = f"{os.environ['TEMP']}/{TARGET}.{os.environ['UNSORTED_KGTK']}"
 output_file = f"{os.environ['OUT']}/{TARGET}.{os.environ['SORTED_KGTK']}"
 logfile = f"{os.environ['TEMP']}/{TARGET}-sorted.log"
 sort_command = f"""kgtk {os.environ['KGTK_FLAGS']} \
 sort {os.environ['VERBOSE']} \
 --input-file {input_file} \
 --output-file {output_file} \
 --gzip-command {os.environ['GZIP_CMD']} \
 --sort-command {os.environ['SORT_COMMAND']} \
 --extra '{os.environ['SORT_EXTRAS']}' | tee {logfile}"""
 !$sort_command


## Build the `all.tsv.gz` file

In [None]:
# Concatenate the unsorted claims, qualifiers, aliases, descriptions, labels,
# sitelinks, sitelink qualifiers, entry types and property datatypes from TEMP,
# sort the result and write it to OUT/all.<SORTED_KGTK>.
# `--sort-command` is passed explicitly so the sort executable chosen in the
# Parameters cell (e.g. gsort) is honoured here, as in the per-file sort loop.
!kgtk ${KGTK_FLAGS} \
 cat ${VERBOSE} --use-mgzip=${USE_MGZIP} \
 --input-file ${TEMP}/claims.${UNSORTED_KGTK} \
 --input-file ${TEMP}/qualifiers.${UNSORTED_KGTK} \
 --input-file ${TEMP}/aliases.${UNSORTED_KGTK} \
 --input-file ${TEMP}/descriptions.${UNSORTED_KGTK} \
 --input-file ${TEMP}/labels.${UNSORTED_KGTK} \
 --input-file ${TEMP}/sitelinks.${UNSORTED_KGTK} \
 --input-file ${TEMP}/sitelinks.qualifiers.${UNSORTED_KGTK} \
 --input-file ${TEMP}/metadata.types.${UNSORTED_KGTK} \
 --input-file ${TEMP}/metadata.property.datatypes.${UNSORTED_KGTK} \
 / sort ${VERBOSE} \
 --gzip-command ${GZIP_CMD} \
 --sort-command ${SORT_COMMAND} \
 --extra "${SORT_EXTRAS}" \
 --output-file ${OUT}/all.${SORTED_KGTK} \
| tee ${TEMP}/build-all-edges.log

## Subset

In [5]:
# Subset step 1: keep the edges of all.tsv.gz that match an entry of
# qnodes.tsv (ifexists default key columns are used).
# qnodes.tsv is read from the input folder (GRAPH) rather than from a
# hard-coded absolute path, so the cell follows the `input_path` parameter.
!kgtk ifexists --input-file ${OUT}/all.tsv.gz \
 --filter-on ${GRAPH}/qnodes.tsv \
 -o ${TEMP}/all.sub.filtered.tsv.gz

In [6]:
# Subset step 2: keep the edges of all.tsv.gz whose node1 equals the id of an
# edge selected in step 1 (presumably the qualifiers of those edges).
!kgtk ifexists --input-file ${OUT}/all.tsv.gz \
 --filter-file ${TEMP}/all.sub.filtered.tsv.gz \
 --input-keys 'node1' \
 --filter-keys 'id' \
 -o ${TEMP}/all.qual.filtered.tsv.gz

In [7]:
# Subset step 3: keep the edges of all.tsv.gz whose node1 equals a label
# (property) used by the edges selected in step 1, i.e. the edges that
# describe those properties.
!kgtk ifexists --input-file ${OUT}/all.tsv.gz \
 --filter-file ${TEMP}/all.sub.filtered.tsv.gz \
 --input-keys 'node1' \
 --filter-keys 'label' \
 -o ${TEMP}/all.prop-sub.filtered.tsv.gz

In [8]:
# Subset step 4: keep the edges of all.tsv.gz whose node1 equals a label
# (property) used by the edges selected in step 2, i.e. the edges that
# describe the properties appearing in all.qual.filtered.
!kgtk ifexists --input-file ${OUT}/all.tsv.gz \
 --filter-file ${TEMP}/all.qual.filtered.tsv.gz \
 --input-keys 'node1' \
 --filter-keys 'label' \
 -o ${TEMP}/all.prop-qual.filtered.tsv.gz

In [12]:
# Concatenate the four filtered files produced by the subset steps, sort the
# result and write it to OUT/all.subset.<SORTED_KGTK>.
# `--sort-command` is passed explicitly so the sort executable chosen in the
# Parameters cell (e.g. gsort) is honoured here, as in the per-file sort loop.
!kgtk ${KGTK_FLAGS} \
 cat ${VERBOSE} --use-mgzip=${USE_MGZIP} \
 --input-file ${TEMP}/all.sub.filtered.tsv.gz \
 --input-file ${TEMP}/all.qual.filtered.tsv.gz \
 --input-file ${TEMP}/all.prop-sub.filtered.tsv.gz \
 --input-file ${TEMP}/all.prop-qual.filtered.tsv.gz \
 / sort ${VERBOSE} \
 --gzip-command ${GZIP_CMD} \
 --sort-command ${SORT_COMMAND} \
 --extra "${SORT_EXTRAS}" \
 --output-file ${OUT}/all.subset.${SORTED_KGTK} \
| tee ${TEMP}/build-subset.log

Starting kgtkcat pid=22692
Opening the 4 input files.
Opening file 1: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.sub.filtered.tsv.gz
input format: kgtk
Using KGTK_GRAPH_CACHE='/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'
Using the sort command 'sort'
header pipe: read_fd=4 write_fd=5
sort options pipe: read_fd=6 write_fd=7
gzip output file: '/data02/ana_iglesias/data/import-wikidata/all.subset.tsv.gz'
sort command: { IFS= read -r header ; { printf "%s\n" "$header" >&5 ; } ; printf "%s\n" "$header" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '	' $options ; } | pigz - > '/data02/ana_iglesias/data/import-wikidata/all.subset.tsv.gz'
Graph cache '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.sub.filtered.tsv.gz' not found in the cache.
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkR