{ "cells": [ { "cell_type": "code", "execution_count": 17, "id": "dd0ec03b", "metadata": {}, "outputs": [], "source": [ "# Parameters\n", "wikidata_input_path = '/data02/ana_iglesias/data/subset/all.tsv.gz'\n", "wikidata_parts_path = '/data02/ana_iglesias/data/subset/parts'\n", "temp_folder_path = wikidata_parts_path + '/temp'\n", "gzip_command = 'gzip'\n", "kgtk_command = 'time kgtk'\n", "kgtk_options = '--debug --timing'\n", "kgtk_extension = 'tsv.gz'\n", "presorted = 'False'\n", "sort_extras = '--parallel 24 --buffer-size 30% --temporary-directory ' + temp_folder_path\n", "use_mgzip = 'True'\n", "verbose = 'True'" ] }, { "cell_type": "code", "execution_count": 18, "id": "ca529b52", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "wikidata_input_path = '/data02/ana_iglesias/data/subset/all.tsv.gz'\n", "wikidata_parts_path = '/data02/ana_iglesias/data/subset/parts'\n", "temp_folder_path = '/data02/ana_iglesias/data/subset/parts/temp'\n", "gzip_command = 'gzip'\n", "kgtk_command = 'time kgtk'\n", "kgtk_options = '--debug --timing'\n", "kgtk_extension = 'tsv.gz'\n", "presorted = 'False'\n", "sort_extras = '--parallel 24 --buffer-size 30% --temporary-directory /data02/ana_iglesias/data/subset/parts/temp'\n", "use_mgzip = 'True'\n", "verbose = 'True'\n" ] } ], "source": [ "# Echo every notebook parameter as `name = repr(value)` so the run log records\n", "# the exact configuration that was used.\n", "parameter_names = (\n", "    'wikidata_input_path', 'wikidata_parts_path', 'temp_folder_path',\n", "    'gzip_command', 'kgtk_command', 'kgtk_options', 'kgtk_extension',\n", "    'presorted', 'sort_extras', 'use_mgzip', 'verbose',\n", ")\n", "for parameter_name in parameter_names:\n", "    print('%s = %s' % (parameter_name, repr(globals()[parameter_name])))" ] }, { "cell_type": "markdown", "id": "6d78f37e", "metadata": {}, "source": [
"### Create working folders and empty them" ] }, { "cell_type": "code", "execution_count": 4, "id": "8637f4c0", "metadata": {}, "outputs": [], "source": [ "# -p: create missing parent folders and succeed quietly when the folder already\n", "# exists, so the notebook can be re-run without 'File exists' errors.\n", "!mkdir -p {wikidata_parts_path}\n", "!mkdir -p {temp_folder_path}" ] }, { "cell_type": "code", "execution_count": 5, "id": "7a12e480", "metadata": {}, "outputs": [], "source": [ "# -f: a glob that matches nothing (folder already clean) is not an error.\n", "!rm -f {wikidata_parts_path}/*.tsv {wikidata_parts_path}/*.tsv.gz\n", "!rm -f {temp_folder_path}/*.tsv {temp_folder_path}/*.tsv.gz" ] }, { "cell_type": "markdown", "id": "ef1c87af", "metadata": {}, "source": [ "### Sort the Input Data Unless Presorted\n", "Sort the input data file by (id, node1, label, node2).\n", "This may take a while."
] }, { "cell_type": "code", "execution_count": 6, "id": "b39f48bb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sorting the input file '/data02/ana_iglesias/data/subset/all.tsv.gz'.\n", "Using the sort command 'sort'\n", "header pipe: read_fd=4 write_fd=5\n", "sort options pipe: read_fd=6 write_fd=7\n", "gzip output file: '/data02/ana_iglesias/data/subset/parts/all.tsv.gz'\n", "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/all.tsv.gz'\n", "gunzip input file: '/data02/ana_iglesias/data/subset/all.tsv.gz'\n", "full command: gzip -dc '/data02/ana_iglesias/data/subset/all.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/all.tsv.gz'\n", "Monitoring the cat command (pid=28206).\n", "Running the sort script (pid=28209).\n", "Reading the KGTK input file header line with KgtkReader\n", "input format: kgtk\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: reading file descriptor 4\n", "header: node1\tlabel\tnode2\tid\trank\tnode2;wikidatatype\tlang\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=0 label=1 node2=2 id=3\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "KGTK header: node1 label node2 id rank node2;wikidatatype lang\n", "sort options: --parallel 24 --buffer-size 30% --temporary-directory /data02/ana_iglesias/data/subset/parts/temp -k 4,4 -k 1,1 -k 2,2 -k 3,3\n", "\n", "Waiting for the sort command to complete.\n", "\n", "\n", "Done reading the input file\n", "Monitoring the sort command (pid=28214)\n", "Cleanup.\n", "Timing: elapsed=0:00:52.436506 
CPU=0:00:06.980607 ( 13.3%): sort --verbose=True --gzip-command=gzip --input-file /data02/ana_iglesias/data/subset/all.tsv.gz --output-file /data02/ana_iglesias/data/subset/parts/all.tsv.gz --columns id node1 label node2 --extra --parallel 24 --buffer-size 30% --temporary-directory /data02/ana_iglesias/data/subset/parts/temp\n", "\n", "real\t0m52.646s\n", "user\t1m5.163s\n", "sys\t0m10.588s\n" ] } ], "source": [ "if presorted.lower() == \"true\": \n", " print('Using a presorted input file %s.' % repr(wikidata_input_path))\n", " partition_input_file = wikidata_input_path \n", "else: \n", " print('Sorting the input file %s.' % repr(wikidata_input_path))\n", " partition_input_file = wikidata_parts_path + '/all.' + kgtk_extension \n", " !{kgtk_command} {kgtk_options} sort --verbose={verbose} --gzip-command={gzip_command} \\\n", " --input-file {wikidata_input_path} \\\n", " --output-file {partition_input_file} \\\n", " --columns id node1 label node2 \\\n", " --extra \"{sort_extras}\"" ] }, { "cell_type": "markdown", "id": "eb873ee6", "metadata": {}, "source": [ "### Getting rid of 'rank' and 'lang' columns" ] }, { "cell_type": "code", "execution_count": 7, "id": "4b259a2f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Timing: elapsed=0:03:28.170880 CPU=0:03:34.469851 (103.0%): remove-columns -i /data02/ana_iglesias/data/subset/parts/all.tsv.gz --columns rank lang -o /data02/ana_iglesias/data/subset/parts/all.min.tsv.gz\n", "\n", "real\t3m28.341s\n", "user\t3m29.001s\n", "sys\t0m5.796s\n" ] } ], "source": [ "# Strip the columns from whichever file the sort step selected: the raw input\n", "# when presorted (nothing is written to the parts folder in that case),\n", "# otherwise the freshly sorted copy in the parts folder.\n", "trimmed_input_file = wikidata_parts_path + '/all.' + kgtk_extension\n", "untrimmed_input_file = wikidata_input_path if presorted.lower() == \"true\" else trimmed_input_file\n", "!{kgtk_command} {kgtk_options} remove-columns -i {untrimmed_input_file} \\\n", " --columns rank lang \\\n", " -o {wikidata_parts_path}/all.min.{kgtk_extension}\n", "# mv -f swaps the trimmed file into place without deleting anything first, so a\n", "# failed remove-columns run leaves the existing data (and a presorted input) intact.\n", "!mv -f {wikidata_parts_path}/all.min.{kgtk_extension} {trimmed_input_file}\n", "# The partition step reads partition_input_file; point it at the trimmed file.\n", "partition_input_file = trimmed_input_file" ] }, { "cell_type": "markdown", "id": "e681dd14", "metadata": {}, "source": [ "### Partition the
Claims, Qualifiers, and Entity Data\n", "Split out the entity data (alias, description, label, and sitelinks) and additional metadata (datatype, type). Separate the qualifiers from the claims.\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "ef790a57", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", "Opening the input file: /data02/ana_iglesias/data/subset/parts/all.tsv.gz\n", "input format: kgtk\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: File_path.suffix: .gz\n", "KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/all.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=0 label=1 node2=2 id=3\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "Opening the output file: /data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "Opening the output file: /data02/ana_iglesias/data/subset/parts/aliases.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/aliases.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "Opening the output file: /data02/ana_iglesias/data/subset/parts/descriptions.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/descriptions.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "Opening the output file: 
/data02/ana_iglesias/data/subset/parts/labels.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/labels.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "Opening the output file: /data02/ana_iglesias/data/subset/parts/sitelinks.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/sitelinks.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "Opening the output file: /data02/ana_iglesias/data/subset/parts/sitelinks.qualifiers.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/sitelinks.qualifiers.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "Opening the output file: /data02/ana_iglesias/data/subset/parts/metadata.types.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/metadata.types.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "Opening the reject file: /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-id.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-id.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "Applying a dispatched multiple-output predicate filter\n", "Read 15770685 rows, rejected 5640395 rows, wrote 10130290 rows.\n", "Closing output files.\n", "KgtkWriter: closing the output file\n", "KgtkWriter: closing the output file\n", "KgtkWriter: closing the output file\n", "KgtkWriter: closing the output file\n", "KgtkWriter: closing the output file\n", "KgtkWriter: closing the output file\n", "KgtkWriter: closing the output file\n", "KgtkWriter: closing the output file\n", "All output files have been closed.\n", "Timing: elapsed=0:01:25.414823 CPU=0:03:43.054031 
(261.1%): filter --verbose=True --use-mgzip=True --first-match-only --input-file /data02/ana_iglesias/data/subset/parts/all.tsv.gz -p ; datatype ; -o /data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz -p ; alias ; -o /data02/ana_iglesias/data/subset/parts/aliases.tsv.gz -p ; description ; -o /data02/ana_iglesias/data/subset/parts/descriptions.tsv.gz -p ; label ; -o /data02/ana_iglesias/data/subset/parts/labels.tsv.gz -p ; addl_wikipedia_sitelink,wikipedia_sitelink ; -o /data02/ana_iglesias/data/subset/parts/sitelinks.tsv.gz -p ; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ; -o /data02/ana_iglesias/data/subset/parts/sitelinks.qualifiers.tsv.gz -p ; type ; -o /data02/ana_iglesias/data/subset/parts/metadata.types.tsv.gz --reject-file /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-id.tsv.gz\n", "\n", "real\t1m25.639s\n", "user\t3m36.211s\n", "sys\t0m7.142s\n" ] } ], "source": [ "!{kgtk_command} {kgtk_options} filter --verbose={verbose} --use-mgzip={use_mgzip} --first-match-only \\\n", " --input-file {partition_input_file} \\\n", " -p '; datatype ;' -o {wikidata_parts_path}/metadata.property.datatypes.{kgtk_extension} \\\n", " -p '; alias ;' -o {wikidata_parts_path}/aliases.{kgtk_extension} \\\n", " -p '; description ;' -o {wikidata_parts_path}/descriptions.{kgtk_extension} \\\n", " -p '; label ;' -o {wikidata_parts_path}/labels.{kgtk_extension} \\\n", " -p '; addl_wikipedia_sitelink,wikipedia_sitelink ;' \\\n", " -o {wikidata_parts_path}/sitelinks.{kgtk_extension} \\\n", " -p '; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;' \\\n", " -o {wikidata_parts_path}/sitelinks.qualifiers.{kgtk_extension} \\\n", " -p '; type ;' -o {wikidata_parts_path}/metadata.types.{kgtk_extension} \\\n", " --reject-file {temp_folder_path}/claims-and-qualifiers.sorted-by-id.{kgtk_extension}" ] }, { "cell_type": "markdown", "id": "4ce54ab6", "metadata": {}, "source": [ "### Sort the claims and qualifiers on 
Node1\n", "Sort the combined claims and qualifiers file by the node1 column.\n", "This may take a while." ] }, { "cell_type": "code", "execution_count": 9, "id": "52fd266e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using the sort command 'sort'\n", "header pipe: read_fd=4 write_fd=5\n", "sort options pipe: read_fd=6 write_fd=7\n", "gzip output file: '/data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-node1.tsv.gz'\n", "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-node1.tsv.gz'\n", "gunzip input file: '/data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-id.tsv.gz'\n", "full command: gzip -dc '/data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-id.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-node1.tsv.gz'\n", "Monitoring the cat command (pid=28976).\n", "Running the sort script (pid=28979).\n", "Reading the KGTK input file header line with KgtkReader\n", "input format: kgtk\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: reading file descriptor 4\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=0 label=1 node2=2 id=3\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "KGTK header: node1 label node2 id node2;wikidatatype\n", "sort options: --parallel 24 --buffer-size 30% --temporary-directory /data02/ana_iglesias/data/subset/parts/temp -k 1,1\n", "\n",
"Waiting for the sort command to complete.\n", "\n", "\n", "Done reading the input file\n", "Monitoring the sort command (pid=28984)\n", "Cleanup.\n", "Timing: elapsed=0:00:19.446438 CPU=0:00:06.843942 ( 35.2%): sort2 --verbose=True --gzip-command=gzip --input-file /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-id.tsv.gz --output-file /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-node1.tsv.gz --columns node1 --extra --parallel 24 --buffer-size 30% --temporary-directory /data02/ana_iglesias/data/subset/parts/temp\n", "\n", "real\t0m19.655s\n", "user\t0m25.513s\n", "sys\t0m6.486s\n" ] } ], "source": [ "!{kgtk_command} {kgtk_options} sort2 --verbose={verbose} --gzip-command={gzip_command} \\\n", " --input-file {temp_folder_path}/claims-and-qualifiers.sorted-by-id.{kgtk_extension} \\\n", " --output-file {temp_folder_path}/claims-and-qualifiers.sorted-by-node1.{kgtk_extension}\\\n", " --columns node1 \\\n", " --extra \"{sort_extras}\"" ] }, { "cell_type": "markdown", "id": "376784de", "metadata": {}, "source": [ "### Split the claims and qualifiers\n", "If row A's node1 value matches some other row's id value, then row A is a qualifier."
] }, { "cell_type": "code", "execution_count": 10, "id": "3b4b3aaa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", "Opening the input file: /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-node1.tsv.gz\n", "input format: kgtk\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: File_path.suffix: .gz\n", "KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-node1.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=0 label=1 node2=2 id=3\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "Opening the filter input file: /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-id.tsv.gz\n", "input format: kgtk\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: File_path.suffix: .gz\n", "KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-id.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=0 label=1 node2=2 id=3\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "Input key columns: node1\n", "Filter key columns: id\n", "Opening the output file: /data02/ana_iglesias/data/subset/parts/temp/qualifiers.sorted-by-node1.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/temp/qualifiers.sorted-by-node1.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "Opening the reject file: 
/data02/ana_iglesias/data/subset/parts/temp/claims.sorted-by-node1.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/temp/claims.sorted-by-node1.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "Processing presorted files.\n", "Read 5640395 input records, accepted 1175703 records, rejected 4464692 records.\n", "Read 5640395 filter records, 704046 found matching input records, 4936348 did not find matches.\n", "KgtkWriter: closing the output file\n", "KgtkWriter: closing the output file\n", "Timing: elapsed=0:00:42.330478 CPU=0:01:23.824565 (198.0%): ifexists --verbose=True --use-mgzip=True --presorted --input-file /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-node1.tsv.gz --filter-file /data02/ana_iglesias/data/subset/parts/temp/claims-and-qualifiers.sorted-by-id.tsv.gz --output-file /data02/ana_iglesias/data/subset/parts/temp/qualifiers.sorted-by-node1.tsv.gz --reject-file /data02/ana_iglesias/data/subset/parts/temp/claims.sorted-by-node1.tsv.gz --input-keys node1 --filter-keys id\n", "\n", "real\t0m42.583s\n", "user\t1m18.135s\n", "sys\t0m5.959s\n" ] } ], "source": [ "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", " --input-file {temp_folder_path}/claims-and-qualifiers.sorted-by-node1.{kgtk_extension} \\\n", " --filter-file {temp_folder_path}/claims-and-qualifiers.sorted-by-id.{kgtk_extension} \\\n", " --output-file {temp_folder_path}/qualifiers.sorted-by-node1.{kgtk_extension}\\\n", " --reject-file {temp_folder_path}/claims.sorted-by-node1.{kgtk_extension}\\\n", " --input-keys node1 \\\n", " --filter-keys id" ] }, { "cell_type": "markdown", "id": "50afac84", "metadata": {}, "source": [ "### Sort the claims by ID\n", "Sort the split claims by id, node1, label, node2.\n", "This may take a while." 
] }, { "cell_type": "code", "execution_count": 11, "id": "029f52e5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using the sort command 'sort'\n", "header pipe: read_fd=4 write_fd=5\n", "sort options pipe: read_fd=6 write_fd=7\n", "gzip output file: '/data02/ana_iglesias/data/subset/parts/temp/claims.no-datatype.tsv.gz'\n", "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/temp/claims.no-datatype.tsv.gz'\n", "gunzip input file: '/data02/ana_iglesias/data/subset/parts/temp/claims.sorted-by-node1.tsv.gz'\n", "full command: gzip -dc '/data02/ana_iglesias/data/subset/parts/temp/claims.sorted-by-node1.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/temp/claims.no-datatype.tsv.gz'\n", "Monitoring the cat command (pid=29216).\n", "Running the sort script (pid=29219).\n", "Reading the KGTK input file header line with KgtkReader\n", "input format: kgtk\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: reading file descriptor 4\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=0 label=1 node2=2 id=3\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "KGTK header: node1 label node2 id node2;wikidatatype\n", "sort options: --parallel 24 --buffer-size 30% --temporary-directory /data02/ana_iglesias/data/subset/parts/temp -k 4,4 -k 1,1 -k 2,2 -k 3,3\n", "\n", "Waiting for the sort command to complete.\n", "\n", "\n", "Done reading the input file\n", "Monitoring the sort command (pid=29224)\n", "Cleanup.\n", "Timing: 
elapsed=0:00:14.796727 CPU=0:00:06.854193 ( 46.3%): sort2 --verbose=True --gzip-command=gzip --input-file /data02/ana_iglesias/data/subset/parts/temp/claims.sorted-by-node1.tsv.gz --output-file /data02/ana_iglesias/data/subset/parts/temp/claims.no-datatype.tsv.gz --columns id node1 label node2 --extra --parallel 24 --buffer-size 30% --temporary-directory /data02/ana_iglesias/data/subset/parts/temp\n", "\n", "real\t0m14.992s\n", "user\t0m19.598s\n", "sys\t0m5.913s\n" ] } ], "source": [ "!{kgtk_command} {kgtk_options} sort2 --verbose={verbose} --gzip-command={gzip_command} \\\n", " --input-file {temp_folder_path}/claims.sorted-by-node1.{kgtk_extension} \\\n", " --output-file {temp_folder_path}/claims.no-datatype.{kgtk_extension}\\\n", " --columns id node1 label node2 \\\n", " --extra \"{sort_extras}\"" ] }, { "cell_type": "markdown", "id": "9c5ab8b0", "metadata": {}, "source": [ "### Merge the Wikidata Property Datatypes into the claims\n", "Merge the Wikidata Property Datatypes into the claims row as node2;wikidatatype. This column will be used to partition the claims by Wikidata Property Datatype in a later step.
If the claims file already has a node2;wikidatatype column, lift only when that column has an empty value.\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "e3a25b47", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Opening the input file: /data02/ana_iglesias/data/subset/parts/temp/claims.no-datatype.tsv.gz\n", "input format: kgtk\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: File_path.suffix: .gz\n", "KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/temp/claims.no-datatype.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=0 label=1 node2=2 id=3\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "Opening the label file: /data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz\n", "input format: kgtk\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: File_path.suffix: .gz\n", "KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=0 label=1 node2=2 id=3\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "Lifting with in-memory buffering.\n", "Loading labels from the label file.\n", "The label file is an edge file, defaulting to the node1 column (or alias) for the match column.\n", "The label file is an edge file, defaulting to the node2 column (or alias) for the value column.\n", "Loading labels from /data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz\n", "label_match_column_idx=0 (node1).\n", "label_select_column_idx=1 (label).\n", "label_value_column_idx=2 (node2).\n", 
"label_select_column_value='datatype'.\n", "Opening the output file: /data02/ana_iglesias/data/subset/parts/claims.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/claims.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "Writing output records\n", "Read 4464692 non-label input records, 0 skipped.\n", "6567 labels were found, 1 matched.\n", "Wrote 4464692 output records, 33 modified.\n", "KgtkWriter: closing the output file\n", "Timing: elapsed=0:00:28.979939 CPU=0:00:56.493679 (194.9%): lift --verbose=True --use-mgzip=True --input-file /data02/ana_iglesias/data/subset/parts/temp/claims.no-datatype.tsv.gz --columns-to-lift label --overwrite False --label-file /data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz --label-value datatype --output-file /data02/ana_iglesias/data/subset/parts/claims.tsv.gz --columns-to-write node2;wikidatatype\n", "\n", "real\t0m29.191s\n", "user\t0m51.629s\n", "sys\t0m5.107s\n" ] } ], "source": [ "!{kgtk_command} {kgtk_options} lift --verbose={verbose} --use-mgzip={use_mgzip} \\\n", " --input-file {temp_folder_path}/claims.no-datatype.{kgtk_extension} \\\n", " --columns-to-lift label \\\n", " --overwrite False \\\n", " --label-file {wikidata_parts_path}/metadata.property.datatypes.{kgtk_extension}\\\n", " --label-value datatype \\\n", " --output-file {wikidata_parts_path}/claims.{kgtk_extension}\\\n", " --columns-to-write 'node2;wikidatatype'" ] }, { "cell_type": "markdown", "id": "d2d9c486", "metadata": {}, "source": [ "### Sort the qualifiers by ID\n", "Sort the split qualifiers by id, node1, label, node2.\n", "This may take a while." 
] }, { "cell_type": "code", "execution_count": 13, "id": "c32febea", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using the sort command 'sort'\n", "header pipe: read_fd=4 write_fd=5\n", "sort options pipe: read_fd=6 write_fd=7\n", "gzip output file: '/data02/ana_iglesias/data/subset/parts/qualifiers.tsv.gz'\n", "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/qualifiers.tsv.gz'\n", "gunzip input file: '/data02/ana_iglesias/data/subset/parts/temp/qualifiers.sorted-by-node1.tsv.gz'\n", "full command: gzip -dc '/data02/ana_iglesias/data/subset/parts/temp/qualifiers.sorted-by-node1.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | gzip - > '/data02/ana_iglesias/data/subset/parts/qualifiers.tsv.gz'\n", "Monitoring the cat command (pid=29530).\n", "Running the sort script (pid=29533).\n", "Reading the KGTK input file header line with KgtkReader\n", "input format: kgtk\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: reading file descriptor 4\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=0 label=1 node2=2 id=3\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "KGTK header: node1 label node2 id node2;wikidatatype\n", "sort options: --parallel 24 --buffer-size 30% --temporary-directory /data02/ana_iglesias/data/subset/parts/temp -k 4,4 -k 1,1 -k 2,2 -k 3,3\n", "\n", "Waiting for the sort command to complete.\n", "\n", "\n", "Done reading the input file\n", "Monitoring the sort command (pid=29538)\n", "Cleanup.\n", "Timing: elapsed=0:00:05.864913 CPU=0:00:06.885400 
(117.4%): sort2 --verbose=True --gzip-command=gzip --input-file /data02/ana_iglesias/data/subset/parts/temp/qualifiers.sorted-by-node1.tsv.gz --output-file /data02/ana_iglesias/data/subset/parts/qualifiers.tsv.gz --columns id node1 label node2 --extra --parallel 24 --buffer-size 30% --temporary-directory /data02/ana_iglesias/data/subset/parts/temp\n", "\n", "real\t0m6.057s\n", "user\t0m8.712s\n", "sys\t0m5.072s\n" ] } ], "source": [ "!{kgtk_command} {kgtk_options} sort2 --verbose={verbose} --gzip-command={gzip_command} \\\n", " --input-file {temp_folder_path}/qualifiers.sorted-by-node1.{kgtk_extension} \\\n", " --output-file {wikidata_parts_path}/qualifiers.{kgtk_extension}\\\n", " --columns id node1 label node2 \\\n", " --extra \"{sort_extras}\"" ] }, { "cell_type": "markdown", "id": "e914a0d0", "metadata": {}, "source": [ "### Partition the claims by Wikidata Property Datatype\n", "Wikidata has two names for each Wikidata property datatype: the name that appears in the JSON dump file, and the name that appears in the TTL dump file. `kgtk import-wikidata` currently imports rows from Wikidata JSON dump files, and these are the names that appear below.\n", "\n", "The `claims.other` file catches any records that have an unknown Wikidata property datatype. Additional Wikidata property datatypes may occur when processing from certain Wikidata extensions."
] }, { "cell_type": "code", "execution_count": 14, "id": "d884f14c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", "Opening the input file: /data02/ana_iglesias/data/subset/parts/claims.tsv.gz\n", "input format: kgtk\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: File_path.suffix: .gz\n", "KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/claims.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=0 label=1 node2=2 id=3\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "Opening the output file: /data02/ana_iglesias/data/subset/parts/claims.wikibase-item.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/claims.wikibase-item.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "Opening the reject file: /data02/ana_iglesias/data/subset/parts/claims.other.tsv.gz\n", "File_path.suffix: .gz\n", "KgtkWriter: writing mgzip with 3 threads: /data02/ana_iglesias/data/subset/parts/claims.other.tsv.gz\n", "header: node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "Applying a single object filter\n", "Read 4464692 rows, rejected 2546088 rows, wrote 1918604 rows.\n", "Closing output files.\n", "KgtkWriter: closing the output file\n", "KgtkWriter: closing the output file\n", "All output files have been closed.\n", "Timing: elapsed=0:00:35.284359 CPU=0:00:55.771441 (158.1%): filter --verbose=True --use-mgzip=True --first-match-only --input-file /data02/ana_iglesias/data/subset/parts/claims.tsv.gz --obj node2;wikidatatype -p ;; wikibase-item -o 
/data02/ana_iglesias/data/subset/parts/claims.wikibase-item.tsv.gz --reject-file /data02/ana_iglesias/data/subset/parts/claims.other.tsv.gz\n", "\n", "real\t0m35.495s\n", "user\t0m50.419s\n", "sys\t0m5.607s\n" ] } ], "source": [ "!{kgtk_command} {kgtk_options} filter --verbose={verbose} --use-mgzip={use_mgzip} --first-match-only \\\n", " --input-file {wikidata_parts_path}/claims.{kgtk_extension} \\\n", " --obj 'node2;wikidatatype' \\\n", " -p ';; wikibase-item' -o {wikidata_parts_path}/claims.wikibase-item.{kgtk_extension} \\\n", " --reject-file {wikidata_parts_path}/claims.other.{kgtk_extension}" ] }, { "cell_type": "markdown", "id": "29225561", "metadata": {}, "source": [ "## Compute Pagerank" ] }, { "cell_type": "code", "execution_count": 21, "id": "1893a965", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "(kgtk:6592): Gtk-\u001b[1;33mWARNING\u001b[0m **: \u001b[34m18:46:16.731\u001b[0m: Locale not supported by C library.\n", "\tUsing the fallback 'C' locale.\n", "Timing: elapsed=0:01:43.923291 CPU=0:03:05.149310 (178.2%): graph-statistics -i /data02/ana_iglesias/data/subset/reframings/events/claims.tsv.gz -o /data02/ana_iglesias/data/subset/reframings/events/metadata.pagerank.undirected.tsv.gz --compute-pagerank True --compute-hits False --page-rank-property Pundirected_pagerank --use-mgzip True --mgzip-threads 12 --output-degrees False --output-pagerank True --output-hits False --output-statistics-only --undirected True --log-file /data02/ana_iglesias/data/subset/parts/temp/metadata.pagerank.undirected.summary.txt\n", "\n", "real\t1m44.361s\n", "user\t2m59.282s\n", "sys\t0m6.365s\n" ] } ], "source": [ "# The input graph and the PageRank output live outside the parts folder, so\n", "# name the three paths once here instead of burying them in the command.\n", "pagerank_input_path = '/data02/ana_iglesias/data/subset/reframings/events/claims.tsv.gz'\n", "pagerank_output_path = '/data02/ana_iglesias/data/subset/reframings/events/metadata.pagerank.undirected.tsv.gz'\n", "pagerank_log_path = temp_folder_path + '/metadata.pagerank.undirected.summary.txt'\n", "!{kgtk_command} {kgtk_options} graph-statistics \\\n", " --debug \\\n", " -i {pagerank_input_path} \\\n", " -o {pagerank_output_path} \\\n", " --compute-pagerank True \\\n", " --compute-hits False \\\n",
" --page-rank-property Pundirected_pagerank \\\n", " --use-mgzip True \\\n", " --mgzip-threads 12 \\\n", " --output-degrees False \\\n", " --output-pagerank True \\\n", " --output-hits False \\\n", " --output-statistics-only \\\n", " --undirected True \\\n", " --log-file {pagerank_log_path}" ] }, { "cell_type": "code", "execution_count": null, "id": "af5984df", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 5 }