In [17]:
# Parameters
# Every value is kept as a string because the cells below splice these names
# straight into `!` shell command lines.

# Input edge file and the folders that receive the partitioned output.
wikidata_input_path = '/data02/ana_iglesias/data/subset/all.tsv.gz'
wikidata_parts_path = '/data02/ana_iglesias/data/subset/parts'
temp_folder_path = '%s/temp' % wikidata_parts_path

# External commands and the options shared by every kgtk invocation.
gzip_command = 'gzip'
kgtk_command = 'time kgtk'
kgtk_options = '--debug --timing'
kgtk_extension = 'tsv.gz'

# Boolean-like switches, compared or passed through as text.
presorted = 'False'
use_mgzip = 'True'
verbose = 'True'

# Extra arguments forwarded to the sort steps via --extra; the temporary
# directory is pointed at the temp folder defined above.
sort_extras = ' '.join((
    '--parallel 24',
    '--buffer-size 30%',
    '--temporary-directory ' + temp_folder_path,
))

In [18]:
# Echo the effective parameter values, one "name = repr(value)" line each, so
# the configuration used for this run is recorded in the notebook output.
print('\n'.join(
    '%s = %r' % name_and_value
    for name_and_value in (
        ('wikidata_input_path', wikidata_input_path),
        ('wikidata_parts_path', wikidata_parts_path),
        ('temp_folder_path', temp_folder_path),
        ('gzip_command', gzip_command),
        ('kgtk_command', kgtk_command),
        ('kgtk_options', kgtk_options),
        ('kgtk_extension', kgtk_extension),
        ('presorted', presorted),
        ('sort_extras', sort_extras),
        ('use_mgzip', use_mgzip),
        ('verbose', verbose),
    )
))

wikidata_input_path = '/data02/ana_iglesias/data/subset/all.tsv.gz'
wikidata_parts_path = '/data02/ana_iglesias/data/subset/parts'
temp_folder_path = '/data02/ana_iglesias/data/subset/parts/temp'
gzip_command = 'gzip'
kgtk_command = 'time kgtk'
kgtk_options = '--debug --timing'
kgtk_extension = 'tsv.gz'
presorted = 'False'
sort_extras = '--parallel 24 --buffer-size 30% --temporary-directory /data02/ana_iglesias/data/subset/parts/temp'
use_mgzip = 'True'
verbose = 'True'


### Create working folders and empty them

In [4]:
!mkdir {wikidata_parts_path}
!mkdir {temp_folder_path}

mkdir: cannot create directory '/data02/ana_iglesias/data/subset/parts': File exists
mkdir: cannot create directory '/data02/ana_iglesias/data/subset/parts/temp': File exists


In [5]:
!rm {wikidata_parts_path}/*.tsv {wikidata_parts_path}/*.tsv.gz
!rm {temp_folder_path}/*.tsv {temp_folder_path}/*.tsv.gz

rm: cannot remove '/data02/ana_iglesias/data/subset/parts/*.tsv': No such file or directory
rm: cannot remove '/data02/ana_iglesias/data/subset/parts/temp/*.tsv': No such file or directory


### Sort the Input Data Unless Presorted
Sort the input data file by (id, node1, label, node2).
This may take a while.

In [6]:
if presorted.lower() == "true": 
 print('Using a presorted input file %s.' % repr(wikidata_input_path))
 partition_input_file = wikidata_input_path 
else: 
 print('Sorting the input file %s.' % repr(wikidata_input_path))
 partition_input_file = wikidata_parts_path + '/all.' + kgtk_extension 
 !{kgtk_command} {kgtk_options} sort --verbose={verbose} --gzip-command={gzip_command} \
 --input-file {wikidata_input_path} \
 --output-file {partition_input_file} \
 --columns id node1 label node2 \
 --extra "{sort_extras}"

Sorting the input file '/data02/ana_iglesias/data/subset/all.tsv.gz'.
Using the sort command 'sort'
header pipe: read_fd=4 write_fd=5
sort options pipe: read_fd=6 write_fd=7
gzip output file: '/data02/ana_iglesias/data/subset/parts/all.tsv.gz'
sort command: { IFS= read -r header ; { printf "%s\n" "$header" >&5 ; } ; printf "%s\n" "$header" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '	' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/all.tsv.gz'
gunzip input file: '/data02/ana_iglesias/data/subset/all.tsv.gz'
full command: gzip -dc '/data02/ana_iglesias/data/subset/all.tsv.gz' | { IFS= read -r header ; { printf "%s\n" "$header" >&5 ; } ; printf "%s\n" "$header" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '	' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/all.tsv.gz'
Monitoring the cat command (pid=28206).
Running the sort script (pid=28209).
Reading the KGTK input file header line with KgtkReader
input format: kgtk
KgtkReader: OK to use the fast r

### Getting rid of 'rank' and 'lang' columns

In [7]:
!{kgtk_command} {kgtk_options} remove-columns -i {wikidata_parts_path}/all.{kgtk_extension} \
 --columns rank lang \
 -o {wikidata_parts_path}/all.min.{kgtk_extension}
!rm {wikidata_parts_path}/all.{kgtk_extension}
!mv {wikidata_parts_path}/all.min.{kgtk_extension} {wikidata_parts_path}/all.{kgtk_extension}

Timing: elapsed=0:03:28.170880 CPU=0:03:34.469851 (103.0%): remove-columns -i /data02/ana_iglesias/data/subset/parts/all.tsv.gz --columns rank lang -o /data02/ana_iglesias/data/subset/parts/all.min.tsv.gz

real	3m28.341s
user	3m29.001s
sys	0m5.796s


### Partition the Claims, Qualifiers, and Entity Data
Split out the entity data (alias, description, label, and sitelinks) and additional metadata (datatype, type). Separate the qualifiers from the claims.


In [8]:
# Split partition_input_file by edge label with `kgtk filter`.
# Each -p pattern is followed by the -o file that receives its matches:
# datatype, alias, description, label, sitelink, sitelink-qualifier and type
# edges each go to their own file under wikidata_parts_path.
# Rows matching none of the patterns (the claims and qualifiers) are written
# to the --reject-file in the temp folder for the next steps.
# NOTE(review): --first-match-only presumably sends a row to the first
# matching pattern only, which would make the pattern order significant.
!{kgtk_command} {kgtk_options} filter --verbose={verbose} --use-mgzip={use_mgzip} --first-match-only \
 --input-file {partition_input_file} \
 -p '; datatype ;' -o {wikidata_parts_path}/metadata.property.datatypes.{kgtk_extension} \
 -p '; alias ;' -o {wikidata_parts_path}/aliases.{kgtk_extension} \
 -p '; description ;' -o {wikidata_parts_path}/descriptions.{kgtk_extension} \
 -p '; label ;' -o {wikidata_parts_path}/labels.{kgtk_extension} \
 -p '; addl_wikipedia_sitelink,wikipedia_sitelink ;' \
 -o {wikidata_parts_path}/sitelinks.{kgtk_extension} \
 -p '; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;' \
 -o {wikidata_parts_path}/sitelinks.qualifiers.{kgtk_extension} \
 -p '; type ;' -o {wikidata_parts_path}/metadata.types.{kgtk_extension} \
 --reject-file {temp_folder_path}/claims-and-qualifiers.sorted-by-id.{kgtk_extension}

kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==
Opening the input file: /data02/ana_iglesias/data/subset/parts/all.tsv.gz
input format: kgtk
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/all.tsv.gz
header: node1	label	node2	id	node2;wikidatatype
node1 column found, this is a KGTK edge file
KgtkReader: is_edge_file=True is_node_file=False
KgtkReader: Special columns: node1=0 label=1 node2=2 id=3
KgtkReader: Reading a kgtk file using the fast path.
Opening the output file: /data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz
File_path.suffix: .gz
KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz
header: node1	label	node2	id	node2;wikidatatype
Opening the output file: /data02/ana_iglesias/data/subset/

### Sort the claims and qualifiers on Node1
Sort the combined claims and qualifiers file by the node1 column.
This may take a while.

In [9]:
# Re-sort the combined claims-and-qualifiers file (the reject file of the
# filter step) on the node1 column, writing a second copy to the temp folder.
# The id-sorted original is kept; the split step reads both files.
# sort_extras is forwarded to the sort via --extra.
!{kgtk_command} {kgtk_options} sort2 --verbose={verbose} --gzip-command={gzip_command} \
 --input-file {temp_folder_path}/claims-and-qualifiers.sorted-by-id.{kgtk_extension} \
 --output-file {temp_folder_path}/claims-and-qualifiers.sorted-by-node1.{kgtk_extension}\
 --columns node1 \
 --extra "{sort_extras}"

Using the sort command 'sort'
header pipe: read_fd=4 write_fd=5
sort options pipe: read_fd=6 write_fd=7
gzip output file: '/data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-node1.tsv.gz'
sort command: { IFS= read -r header ; { printf "%s\n" "$header" >&5 ; } ; printf "%s\n" "$header" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '	' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-node1.tsv.gz'
gunzip input file: '/data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-id.tsv.gz'
full command: gzip -dc '/data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-id.tsv.gz' | { IFS= read -r header ; { printf "%s\n" "$header" >&5 ; } ; printf "%s\n" "$header" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '	' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-node1.tsv.gz'
Monitoring the cat command (pid=28976).
Running the s

### Split the claims and qualifiers
If row A's node1 value matches some other row's id value, then row A is a qualifier.

In [10]:
# Separate qualifiers from claims with `kgtk ifexists`.
# The node1 key of each input row (node1-sorted copy) is looked up among the
# id keys of the filter file (id-sorted copy): rows with a match are written
# to the qualifiers file, all others to the claims file (--reject-file).
# --presorted is passed because both files were sorted on their key columns
# by the preceding steps.
!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \
 --input-file {temp_folder_path}/claims-and-qualifiers.sorted-by-node1.{kgtk_extension} \
 --filter-file {temp_folder_path}/claims-and-qualifiers.sorted-by-id.{kgtk_extension} \
 --output-file {temp_folder_path}/qualifiers.sorted-by-node1.{kgtk_extension}\
 --reject-file {temp_folder_path}/claims.sorted-by-node1.{kgtk_extension}\
 --input-keys node1 \
 --filter-keys id

KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==
Opening the input file: /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-node1.tsv.gz
input format: kgtk
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-node1.tsv.gz
header: node1	label	node2	id	node2;wikidatatype
node1 column found, this is a KGTK edge file
KgtkReader: is_edge_file=True is_node_file=False
KgtkReader: Special columns: node1=0 label=1 node2=2 id=3
KgtkReader: Reading a kgtk file using the fast path.
Opening the filter input file: /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-id.tsv.gz
input format: kgtk
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data02/

### Sort the claims by ID
Sort the split claims by id, node1, label, node2.
This may take a while.

In [11]:
# Sort the split-out claims by (id, node1, label, node2).
# The result stays in the temp folder as claims.no-datatype.<ext>; it is the
# input of the datatype lift step.
!{kgtk_command} {kgtk_options} sort2 --verbose={verbose} --gzip-command={gzip_command} \
 --input-file {temp_folder_path}/claims.sorted-by-node1.{kgtk_extension} \
 --output-file {temp_folder_path}/claims.no-datatype.{kgtk_extension}\
 --columns id node1 label node2 \
 --extra "{sort_extras}"

Using the sort command 'sort'
header pipe: read_fd=4 write_fd=5
sort options pipe: read_fd=6 write_fd=7
gzip output file: '/data02/ana_iglesias/data/subset/parts/temp/claims.no-datatype.tsv.gz'
sort command: { IFS= read -r header ; { printf "%s\n" "$header" >&5 ; } ; printf "%s\n" "$header" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '	' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/temp/claims.no-datatype.tsv.gz'
gunzip input file: '/data02/ana_iglesias/data/subset/parts/temp/claims.sorted-by-node1.tsv.gz'
full command: gzip -dc '/data02/ana_iglesias/data/subset/parts/temp/claims.sorted-by-node1.tsv.gz' | { IFS= read -r header ; { printf "%s\n" "$header" >&5 ; } ; printf "%s\n" "$header" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '	' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/temp/claims.no-datatype.tsv.gz'
Monitoring the cat command (pid=29216).
Running the sort script (pid=29219).
Reading the KGTK input file header line with KgtkReader
i

### Merge the Wikidata Property Datatypes into the claims
Merge the Wikidata Property Datatypes into the claims row as node2;wikidatatype. This column will be used to partition the claims by Wikidata Property Datatype in a later step. If the claims file already has a node2;wikidatatype column, lift only when that column has an empty value.


In [12]:
# Attach each claim's property datatype with `kgtk lift`.
# The claim's label column is looked up in metadata.property.datatypes.<ext>
# (edges whose label is `datatype`) and the result is written to the
# 'node2;wikidatatype' column of the final claims file in wikidata_parts_path.
# --overwrite False: an existing non-empty 'node2;wikidatatype' value is kept.
!{kgtk_command} {kgtk_options} lift --verbose={verbose} --use-mgzip={use_mgzip} \
 --input-file {temp_folder_path}/claims.no-datatype.{kgtk_extension} \
 --columns-to-lift label \
 --overwrite False \
 --label-file {wikidata_parts_path}/metadata.property.datatypes.{kgtk_extension}\
 --label-value datatype \
 --output-file {wikidata_parts_path}/claims.{kgtk_extension}\
 --columns-to-write 'node2;wikidatatype'

Opening the input file: /data02/ana_iglesias/data/subset/parts/temp/claims.no-datatype.tsv.gz
input format: kgtk
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/temp/claims.no-datatype.tsv.gz
header: node1	label	node2	id	node2;wikidatatype
node1 column found, this is a KGTK edge file
KgtkReader: is_edge_file=True is_node_file=False
KgtkReader: Special columns: node1=0 label=1 node2=2 id=3
KgtkReader: Reading a kgtk file using the fast path.
Opening the label file: /data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz
input format: kgtk
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz
header: node1	label	node2	id	node2;wikidatatype
node1 column found, this is a KGTK edge file
KgtkReader: is_edge_file=True is_node_file=F

### Sort the qualifiers by ID
Sort the split qualifiers by id, node1, label, node2.
This may take a while.

In [13]:
# Sort the split-out qualifiers by (id, node1, label, node2) and write the
# final qualifiers file directly into wikidata_parts_path.
!{kgtk_command} {kgtk_options} sort2 --verbose={verbose} --gzip-command={gzip_command} \
 --input-file {temp_folder_path}/qualifiers.sorted-by-node1.{kgtk_extension} \
 --output-file {wikidata_parts_path}/qualifiers.{kgtk_extension}\
 --columns id node1 label node2 \
 --extra "{sort_extras}"

Using the sort command 'sort'
header pipe: read_fd=4 write_fd=5
sort options pipe: read_fd=6 write_fd=7
gzip output file: '/data02/ana_iglesias/data/subset/parts/qualifiers.tsv.gz'
sort command: { IFS= read -r header ; { printf "%s\n" "$header" >&5 ; } ; printf "%s\n" "$header" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '	' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/qualifiers.tsv.gz'
gunzip input file: '/data02/ana_iglesias/data/subset/parts/temp/qualifiers.sorted-by-node1.tsv.gz'
full command: gzip -dc '/data02/ana_iglesias/data/subset/parts/temp/qualifiers.sorted-by-node1.tsv.gz' | { IFS= read -r header ; { printf "%s\n" "$header" >&5 ; } ; printf "%s\n" "$header" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '	' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/qualifiers.tsv.gz'
Monitoring the cat command (pid=29530).
Running the sort script (pid=29533).
Reading the KGTK input file header line with KgtkReader
input format: kgtk
KgtkReader: O

### Partition the claims by Wikidata Property Datatype
Wikidata has two names for each Wikidata property datatype: the name that appears in the JSON dump file, and the name that appears in the TTL dump file. `kgtk import-wikidata` currently imports rows from Wikidata JSON dump files, and these are the names that appear below.

The `part.other` file catches any records that have an unknown Wikidata property datatype. Additional Wikidata property datatypes may occur when processing from certain Wikidata extensions.

In [14]:
# Partition the claims on the 'node2;wikidatatype' column (selected as the
# filter's object field with --obj).
# Only `wikibase-item` claims get their own file here; every other claim is
# written to claims.other.<ext> through --reject-file.
!{kgtk_command} {kgtk_options} filter --verbose={verbose} --use-mgzip={use_mgzip} --first-match-only \
 --input-file {wikidata_parts_path}/claims.{kgtk_extension} \
 --obj 'node2;wikidatatype' \
 -p ';; wikibase-item' -o {wikidata_parts_path}/claims.wikibase-item.{kgtk_extension} \
 --reject-file {wikidata_parts_path}/claims.other.{kgtk_extension}

kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==
Opening the input file: /data02/ana_iglesias/data/subset/parts/claims.tsv.gz
input format: kgtk
KgtkReader: OK to use the fast read path.
KgtkReader: File_path.suffix: .gz
KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/claims.tsv.gz
header: node1	label	node2	id	node2;wikidatatype
node1 column found, this is a KGTK edge file
KgtkReader: is_edge_file=True is_node_file=False
KgtkReader: Special columns: node1=0 label=1 node2=2 id=3
KgtkReader: Reading a kgtk file using the fast path.
Opening the output file: /data02/ana_iglesias/data/subset/parts/claims.wikibase-item.tsv.gz
File_path.suffix: .gz
KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/claims.wikibase-item.tsv.gz
header: node1	label	node2	id	node2;wikidatatype
Opening the reject file: /data02/ana_iglesias/data/subset/parts/cl

## Compute Pagerank

In [21]:
# Compute PageRank with `kgtk graph-statistics`, treating the graph as
# undirected (--undirected True). Only the PageRank output is enabled: HITS
# and degree computation/output are switched off, and the scores are emitted
# under the property name Pundirected_pagerank.
# NOTE(review): the input and output are hard-coded absolute paths under
# .../subset/reframings/events, not derived from the notebook parameters and
# not produced by any cell shown here; only the log file uses temp_folder_path.
# NOTE(review): --debug is passed here and is also part of kgtk_options;
# --use-mgzip is hard-coded to True instead of using {use_mgzip}.
!{kgtk_command} {kgtk_options} graph-statistics \
 --debug \
 -i /data02/ana_iglesias/data/subset/reframings/events/claims.tsv.gz \
 -o /data02/ana_iglesias/data/subset/reframings/events/metadata.pagerank.undirected.tsv.gz \
 --compute-pagerank True \
 --compute-hits False \
 --page-rank-property Pundirected_pagerank \
 --use-mgzip True \
 --mgzip-threads 12 \
 --output-degrees False \
 --output-pagerank True \
 --output-hits False \
 --output-statistics-only \
 --undirected True \
 --log-file {temp_folder_path}/metadata.pagerank.undirected.summary.txt


	Using the fallback 'C' locale.
Timing: elapsed=0:01:43.923291 CPU=0:03:05.149310 (178.2%): graph-statistics -i /data02/ana_iglesias/data/subset/reframings/events/claims.tsv.gz -o /data02/ana_iglesias/data/subset/reframings/events/metadata.pagerank.undirected.tsv.gz --compute-pagerank True --compute-hits False --page-rank-property Pundirected_pagerank --use-mgzip True --mgzip-threads 12 --output-degrees False --output-pagerank True --output-hits False --output-statistics-only --undirected True --log-file /data02/ana_iglesias/data/subset/parts/temp/metadata.pagerank.undirected.summary.txt

real	1m44.361s
user	2m59.282s
sys	0m6.365s
