{ "cells": [ { "cell_type": "markdown", "id": "0929e7bb-51f8-4ab2-ad69-07130a2e368e", "metadata": {}, "source": [ "# Import Wikidata" ] }, { "cell_type": "markdown", "id": "6303f2a2-babb-4a1b-9ab1-cc08bf4771ee", "metadata": {}, "source": [ "This notebook assumes the file `latest-all.json.bz2` is already [downloaded](https://dumps.wikimedia.org/wikidatawiki/entities/) and stored in the `input_path` in the cell marked as #Parameters.\n", "\n", "You can download the `gz` version as well; in that case, update the variable `wikidata_json_file` with the correct file name." ] }, { "cell_type": "code", "execution_count": 1, "id": "c5f9d560-8293-4dec-9667-f7e08c6ccf52", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", "from kgtk.functions import kgtk, kypher" ] }, { "cell_type": "code", "execution_count": 2, "id": "1a6cc50d-2a13-4eca-95be-486767de63ec", "metadata": { "tags": [ "parameters" ] }, "outputs": [], "source": [ "# Parameters\n", "\n", "# Folder on local machine where to create the output and temporary folders\n", "input_path = \"/data02/ana_iglesias/data\"\n", "output_path = \"/data02/ana_iglesias/data\"\n", "project_name = \"import-wikidata\"\n", "\n", "kgtk_path = \"/data02/ana_iglesias/Github/kgtk\"\n", "wikidata_json_file = \"latest-all.json.bz2\"\n", "# sort_command = 'gsort'\n", "sort_command = 'sort'" ] }, { "cell_type": "code", "execution_count": 3, "id": "d56ac16c-ba43-4810-8760-2a0755bfbd5f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User home: /data02/ana_iglesias\n", "Current dir: /data02/ana_iglesias\n", "KGTK dir: /data02/ana_iglesias/Github/kgtk\n", "Use-cases dir: /data02/ana_iglesias/Github/kgtk/use-cases\n" ] } ], "source": [ "files = []\n", "\n", "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", " output_path=output_path,\n", " project_name=project_name)" ] }, { "cell_type": "code", 
"execution_count": 4, "id": "046a0b40-c0c1-4e9f-9b36-afcfac05edfe", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KGTK_LABEL_FILE: /data02/ana_iglesias/data/labels.en.tsv.gz\n", "kgtk: kgtk\n", "KGTK_OPTION_DEBUG: false\n", "GRAPH: /data02/ana_iglesias/data\n", "KGTK_GRAPH_CACHE: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db\n", "kypher: kgtk query --graph-cache /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db\n", "OUT: /data02/ana_iglesias/data/import-wikidata\n", "TEMP: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata\n", "EXAMPLES_DIR: /data02/ana_iglesias/Github/kgtk/examples\n", "STORE: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db\n", "USE_CASES_DIR: /data02/ana_iglesias/Github/kgtk/use-cases\n" ] } ], "source": [ "ck.print_env_variables()" ] }, { "cell_type": "markdown", "id": "4c9ec80b-48fe-48cc-8984-6cf94e69c2d0", "metadata": {}, "source": [ "## Define some ENV Variables, users can simply run this step, no changes required" ] }, { "cell_type": "code", "execution_count": 11, "id": "48fb958a-ff91-4360-b73f-3a136797f056", "metadata": {}, "outputs": [], "source": [ "os.environ['WIKIDATA_ALL_JSON'] = f\"{os.environ['GRAPH']}/{wikidata_json_file}\"\n", "\n", "# Work file extensions\n", "os.environ['UNSORTED_KGTK'] = \"unsorted.tsv.gz\"\n", "os.environ['SORTED_KGTK'] = \"tsv.gz\"\n", "\n", "# Use mgzip in some cases?\n", "os.environ['USE_MGZIP'] = \"TRUE\"\n", "\n", "# Select one of the following gzip implementations:\n", "# GZIP_CMD=bzip\n", "os.environ['GZIP_CMD'] = \"pigz\"\n", "\n", "\n", "# Some common flags:\n", "#KGTK_FLAGS=\"--debug --timing --progress --progress-tty `tty`\"\n", "os.environ['KGTK_FLAGS'] = \"--debug --timing\"\n", "os.environ['VERBOSE'] = \"--verbose\"\n", "os.environ['SORT_EXTRAS'] = f\"--parallel 6 --buffer-size 50% -T {os.environ['TEMP']}\"\n", "\n", "# The Wikidata 
datatypes:\n", "WIKIDATATYPES = [ \n", " \"commonsMedia\",\n", " \"external-id\",\n", " \"geo-shape\",\n", " \"globe-coordinate\",\n", " \"math\",\n", " \"monolingualtext\",\n", " \"musical-notation\",\n", " \"quantity\",\n", " \"string\",\n", " \"tabular-data\",\n", " \"time\",\n", " \"url\",\n", " \"wikibase-form\",\n", " \"wikibase-item\",\n", " \"wikibase-lexeme\",\n", " \"wikibase-property\",\n", " \"wikibase-sense\",\n", " \"other\"\n", " ]\n", "\n", "# The wikidata import split files to be sorted:\n", "WIKIDATA_IMPORT_SPLIT_FILES = [ \"claims\",\n", "\t\"claims.badvalue\",\n", "\t\"claims.novalue\",\n", "\t\"claims.somevalue\",\n", "\t\"qualifiers\",\n", "\t\"qualifiers.badvalue\",\n", "\t\"qualifiers.badvalueClaims\",\n", "\t\"qualifiers.novalue\",\n", "\t\"qualifiers.novalueClaims\",\n", "\t\"qualifiers.somevalue\",\n", "\t\"qualifiers.somevalueClaims\",\n", "\t\"aliases\",\n", "\t\"aliases.en\",\n", "\t\"descriptions\",\n", "\t\"descriptions.en\",\n", "\t\"labels\",\n", "\t\"labels.en\",\n", "\t\"sitelinks\",\n", "\t\"sitelinks.en\",\n", "\t\"sitelinks.en.qualifiers\",\n", "\t\"sitelinks.qualifiers\",\n", "\t\"metadata.node\",\n", "\t\"metadata.property.datatypes\",\n", "\t\"metadata.types\"]\n", "\n", "\n", "os.environ['SORT_COMMAND'] = sort_command" ] }, { "cell_type": "markdown", "id": "5b91bcd0-50f0-415a-903b-f9da7394a4f4", "metadata": {}, "source": [ "## Run the `import-wikidata` command" ] }, { "cell_type": "markdown", "id": "926b6689-9fae-4bb9-b5c8-c2daead96f4f", "metadata": {}, "source": [ "**NOTE**:\n", "This command is set to import only English labels/aliases/descriptions, controlled by parameters `--all-languages False` and `--lang en`.\n", "\n", "If you wish to import all languages, simply set `--all-languages True`." 
] }, { "cell_type": "code", "execution_count": null, "id": "df8647da-650a-44a6-9d19-290c64765e31", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kgtk import-wikidata version: 2021-11-17T01:38:17.437678+00:00#9z/aARcXhiV2hPdyVXjAREcpZwh2MawWFp6numz8GZBCtAg2WypLYAFpHjP43k97Zj8VHVaoel0oEit9KHXH0w==\n", "Starting main process (pid 40232).\n", "Processing.\n", "Processing wikidata file /data02/ana_iglesias/data/latest-all.json.bz2\n", "Decompressing (bz2)\n", "Creating the collector queue.\n", "The collector node queue has been created (maxsize=36).\n", "Creating the node_collector.\n", "Creating the node collector process.\n", "Starting the node collector process.\n", "Started the node collector process.\n", "The node collector is starting (pid 40309).\n", "The collector edge queue has been created (maxsize=36).\n", "Creating the edge_collector.\n", "Creating the edge collector process.\n", "Starting the edge collector process.\n", "Started the edge collector process.\n", "The edge collector is starting (pid 40310).\n", "The collector qual queue has been created (maxsize=36).\n", "Creating the qual_collector.\n", "Creating the qual collector process.\n", "Starting the qual collector process.\n", "Started the qual collector process.\n", "The qual collector is starting (pid 40311).\n", "The collector invalid edge queue has been created (maxsize=36).\n", "Creating the invalid_edge_collector.\n", "Creating the invalid edge collector process.\n", "Starting the invalid edge collector process.\n", "Started the invalid edge collector process.\n", "The invalid edge collector is starting (pid 40312).\n", "The collector invalid qual queue has been created (maxsize=36).\n", "Creating the invalid_qual_collector.\n", "Creating the invalid qual collector process.\n", "Starting the invalid qual collector process.\n", "Started the invalid qual collector process.\n", "The invalid qual collector is starting (pid 40313).\n", "The collector 
description queue has been created (maxsize=36).\n", "Creating the description collector.\n", "Creating the description collector process.\n", "Starting the description collector process.\n", "Started the description collector process.\n", "The description collector is starting (pid 40314).\n", "The collector sitelink queue has been created (maxsize=36).\n", "Creating the sitelink collector.\n", "Creating the sitelink collector process.\n", "Starting the sitelink collector process.\n", "Started the sitelink collector process.\n", "Sending the node header to the collector.\n", "Sent the node header to the collector.\n", "Sending the minimal edge file header to the collector.\n", "Sent the minimal edge file header to the collector.\n", "Sending the alias file header to the collector.\n", "Sent the alias file header to the collector.\n", "Sending the English alias file header to the collector.\n", "Sent the English alias file header to the collector.\n", "Sending the datatype file header to the collector.\n", "Sent the datatype file header to the collector.\n", "Sending the description file header to the collector.\n", "Opening the node file in the node collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/metadata.node.unsorted.tsv.gz\n", "Sent the description file header to the collector.\n", "Sending the English description file header to the collector.\n", "Sent the English description file header to the collector.\n", "Sending the label file header to the collector.\n", "Sent the label file header to the collector.\n", "Sending the English label file header to the collector.\n", "Opening the minimal edge file in the edge collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/claims.raw.unsorted.tsv.gz\n", "Sent the English label file header to the collector.\n", "Sending the sitelink file header to the collector.\n", "Sent the sitelink file header to the collector.\n", "Sending the English 
sitelink file header to the collector.\n", "Sent the English sitelink file header to the collector.\n", "Sending the entry type file header to the collector.\n", "Sent the entry type file header to the collector.\n", "Sending the minimal invalid edge header to the collector.\n", "Sent the minimal invalid edge header to the collector.\n", "Sending the minimal qual file header to the collector.\n", "Sent the minimal qual file header to the collector.\n", "Sending the minimal invalid qual header to the collector.\n", "Opening the description file in the description collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/descriptions.unsorted.tsv.gz\n", "Sent the minimal invalid qual header to the collector.\n", "Creating parallel processor for /data02/ana_iglesias/data/latest-all.json.bz2\n", "Opening the qual file in the invalid qual collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/qualifiers.badvalue.unsorted.tsv.gz\n", "Opening the invalid edge file in the invalid edge collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/claims.badvalue.unsorted.tsv.gz\n", "Opening the minimal qual file in the qual collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/qualifiers.raw.unsorted.tsv.gz\n", "The sitelink collector is starting (pid 40315).\n", "Opening the wikipedia_sitelink file in the sitelink collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/sitelinks.raw.unsorted.tsv.gz\n", "Opening the alias file in the edge collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/aliases.unsorted.tsv.gz\n", "Opening the English description file in the description collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/descriptions.en.unsorted.tsv.gz\n", "Opening the English alias file in the edge collector with KgtkWriter: 
/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/aliases.en.unsorted.tsv.gz\n", "Opening the datatype file in the edge collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/metadata.property.datatypes.unsorted.tsv.gz\n", "Opening the English wikipedia_sitelink file in the sitelink collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/sitelinks.en.raw.unsorted.tsv.gz\n", "Opening the label file in the edge collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/labels.unsorted.tsv.gz\n", "Opening the English label file in the edge collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/labels.en.unsorted.tsv.gz\n", "Opening the type file in the edge collector with KgtkWriter: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/metadata.types.unsorted.tsv.gz\n", "Start parallel processing\n", "Starting worker process 0 (pid 40316).\n", "Starting worker process 1 (pid 40317).\n", "Starting worker process 2 (pid 40318).\n", "Starting worker process 3 (pid 40319).\n", "Starting worker process 4 (pid 40320).\n", "Starting worker process 5 (pid 40321).\n", "Starting worker process 6 (pid 40322).\n", "Starting worker process 7 (pid 40329).\n", "Starting worker process 8 (pid 40330).\n", "Starting worker process 9 (pid 40331).\n", "Starting worker process 10 (pid 40332).\n", "Starting worker process 11 (pid 40333).\n", "\n", "*** Sitelink collision #1 detected for Q5056-wikipedia_sitelink-88b48d (https://.wikipedia.org/wiki/Template:Support)\n", "\n", "*** Qualifier collision #1 detected for Q37062-P26-Q2028843-b2e6740f-0-P580-6f4356 (^1411-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q61814-P26-Q66516-1fa99291-0-P580-d435a1 (^1502-04-10T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q62481-P26-Q2086776-87b8910e-0-P580-360391 (^1561-10-12T00:00:00Z/11)\n", "\n", "*** 
Qualifier collision #1 detected for Q70789-P26-Q935411-28987fd8-0-P580-941716 (^1463-05-10T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q89405-P26-Q101877-d20a377b-0-P580-2b9eed (^1560-07-01T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q150611-P26-Q233335-575116d2-0-P580-29c809 (^1521-05-25T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q165284-P26-Q353-84a8ff47-0-P580-776c43 (^1200-05-23T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q169992-P26-Q235487-0e315055-0-P580-7a47d9 (^1332-07-28T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q183698-P26-Q256222-4322595e-0-P580-1fecee (^1684-01-09T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q235447-P26-Q161958-0d89305f-0-P580-52c362 (^1406-10-26T00:00:00Z/11)\n", "\n", "*** Sitelink collision #1 detected for Q238931-wikipedia_sitelink-5113a0 (https://fr.wikipedia.org/wiki/Oblys_de_Manguistaou)\n", "\n", "*** Qualifier collision #1 detected for Q256222-P26-Q183698-415fc5b0-0-P580-1fecee (^1684-01-09T00:00:00Z/11)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "*** Qualifier collision #1 detected for Q380373-P26-Q1141121-48bebee4-0-P580-2e184a (^1294-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q380868-P26-Q384941-46f6240f-0-P580-4b742f (^1533-08-25T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q453771-P26-Q443876-84acba5b-0-P580-84a26a (^1446-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q477343-P26-Q3374718-c7014aa0-0-P580-a95d2d (^1573-10-27T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q1834423-P26-Q322841-6c85598c-0-P580-876067 (^1559-06-16T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q3007367-P26-Q430782-f64d3af2-0-P580-5b468d (^1555-02-07T00:00:00Z/11)\n", "\n", "*** Sitelink collision #1 detected for Q4299475-wikipedia_sitelink-d13454 
(https://.wikipedia.org/wiki/Template:Bot)\n", "\n", "*** Sitelink collision #1 detected for Q4847311-wikipedia_sitelink-69c00a (https://.wikipedia.org/wiki/Template:Delete)\n", "\n", "*** Sitelink collision #1 detected for Q5406510-wikipedia_sitelink-97590b (https://.wikipedia.org/wiki/Template:=)\n", "\n", "*** Sitelink collision #1 detected for Q5412328-wikipedia_sitelink-95b8fc (https://.wikipedia.org/wiki/Template:Trim)\n", "\n", "*** Sitelink collision #1 detected for Q5621274-wikipedia_sitelink-190234 (https://.wikipedia.org/wiki/Template:Column-count)\n", "\n", "*** Sitelink collision #1 detected for Q5882248-wikipedia_sitelink-20b8c9 (https://.wikipedia.org/wiki/Template:Documentation_subpage)\n", "\n", "*** Sitelink collision #1 detected for Q6906791-wikipedia_sitelink-1fc2b9 (https://.wikipedia.org/wiki/Template:Side_box)\n", "\n", "*** Sitelink collision #1 detected for Q7192108-wikipedia_sitelink-aa2579 (https://.wikipedia.org/wiki/Category:Pages_with_script_errors)\n", "\n", "*** Qualifier collision #1 detected for Q7529231-P26-Q6792225-896048a6-0-P580-5a896b (^1508-00-00T00:00:00Z/9)\n", "\n", "*** Sitelink collision #1 detected for Q9737782-wikipedia_sitelink-0f7f7f (https://.wikipedia.org/wiki/Category:User_templates)\n", "\n", "*** Sitelink collision #1 detected for Q10560270-wikipedia_sitelink-efe081 (https://.wikipedia.org/wiki/Template:Under_construction)\n", "\n", "*** Qualifier collision #1 detected for Q13058108-P159-Q1354-267a1462-0-P625-cb2660 (@23.728063/90.419591)\n", "\n", "*** Sitelink collision #1 detected for Q13156670-wikipedia_sitelink-58d6d4 (https://.wikipedia.org/wiki/Template:Interwiki_redirect)\n", "\n", "*** Sitelink collision #1 detected for Q14511701-wikipedia_sitelink-75778a (https://.wikipedia.org/wiki/Template:TemplateData_header)\n", "\n", "*** Sitelink collision #1 detected for Q14635514-wikipedia_sitelink-01112f (https://.wikipedia.org/wiki/Template:Reply_to)\n", "\n", "*** Sitelink collision #1 detected for 
Q7253814-wikipedia_sitelink-ca80bf (https://.wikipedia.org/wiki/Module:String)\n", "\n", "*** Sitelink collision #1 detected for Q7348344-wikipedia_sitelink-7a5626 (https://.wikipedia.org/wiki/Module:Coordinates)\n", "\n", "*** Sitelink collision #1 detected for Q15818920-wikipedia_sitelink-709464 (https://.wikipedia.org/wiki/Template:Autoarchive_resolved_section)\n", "The node collector called 500000 times: 2500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 500000 times: 0 nrows, 30705989 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The description collector called 500000 times: 0 nrows, 2093414 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The sitelink collector called 500000 times: 0 nrows, 34262121 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q29053400-wikipedia_sitelink-7a9744 (https://.wikipedia.org/wiki/Category:Pages_with_template_loops)\n", "\n", "*** Sitelink collision #1 detected for Q32859338-wikipedia_sitelink-9d571a (https://.wikipedia.org/wiki/Category:Archive_templates)\n", "The qual collector called 500000 times: 0 nrows, 0 erows, 7689591 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Qualifier collision #1 detected for Q43398278-P7209-Q66424109-ff08dae7-0-P585-ad17a5 (^1661-07-00T00:00:00Z/10)\n", "The node collector called 1000000 times: 5000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 1000000 times: 0 nrows, 87824765 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The description collector called 1000000 times: 0 nrows, 4427959 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "500000 lines processed by processor 8\n", "500000 lines processed by processor 10\n", "500000 lines processed by processor 1\n", "500000 lines processed by processor 5\n", "500000 lines processed by processor 9\n", "500000 lines processed by processor 7\n", "500000 lines processed by processor 
0\n", "500000 lines processed by processor 4\n", "500000 lines processed by processor 2\n", "500000 lines processed by processor 3\n", "500000 lines processed by processor 6\n", "500000 lines processed by processor 11\n", "\n", "*** Qualifier collision #1 detected for Q55579391-P26-Q121846-1952d1ff-0-P580-cae35d (^1284-00-00T00:00:00Z/9)\n", "The qual collector called 1000000 times: 0 nrows, 0 erows, 18909863 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q58832772-wikipedia_sitelink-eb1155 (https://.wikipedia.org/wiki/Module:LangSwitch)\n", "The node collector called 1500000 times: 7500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 1500000 times: 0 nrows, 130275339 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The description collector called 1500000 times: 0 nrows, 6479974 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The qual collector called 1500000 times: 0 nrows, 0 erows, 28087953 qrows, 0 invalid erows, 0 invalid qrows\n", "The node collector called 2000000 times: 10000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 2000000 times: 0 nrows, 168060400 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q90722487-wikipedia_sitelink-5b05a6 (https://.wikipedia.org/wiki/Category:Pages_using_deprecated_source_tags)\n", "The description collector called 2000000 times: 0 nrows, 8490836 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q99735928-wikipedia_sitelink-e4f78d (https://.wikipedia.org/wiki/Template:BCP47)\n", "The sitelink collector called 1000000 times: 0 nrows, 41047128 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q109249671-wikipedia_sitelink-b04570 (https://.wikipedia.org/wiki/Template:None)\n", "1000000 lines processed by processor 10\n", "1000000 lines 
processed by processor 7\n", "1000000 lines processed by processor 1\n", "1000000 lines processed by processor 8\n", "1000000 lines processed by processor 0\n", "1000000 lines processed by processor 5\n", "1000000 lines processed by processor 9\n", "The qual collector called 2000000 times: 0 nrows, 0 erows, 38102824 qrows, 0 invalid erows, 0 invalid qrows\n", "1000000 lines processed by processor 6\n", "1000000 lines processed by processor 2\n", "1000000 lines processed by processor 11\n", "1000000 lines processed by processor 4\n", "1000000 lines processed by processor 3\n", "\n", "*** Qualifier collision #1 detected for Q8058-P26-Q254085-4eab60ab-0-P580-8df26d (^1436-06-24T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q8384-P26-Q70590-6edd7354-0-P580-e23c66 (^1305-09-23T00:00:00Z/11)\n", "\n", "*** Sitelink collision #1 detected for Q9076-wikipedia_sitelink-a73704 (https://vec.wikipedia.org/wiki/Brandizzo)\n", "\n", "*** Qualifier collision #1 detected for Q13167-P348-99b09e-08cc7a6d-0-P577-07f6e3 (^2016-07-12T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q57161-P26-Q441394-f0d02358-0-P580-77780b (^1308-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q57161-P26-Q467019-4ee33344-0-P580-9268e9 (^1324-02-25T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q74019-P26-Q540767-c098df36-0-P580-5774e5 (^1422-07-25T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q121130-P26-Q119431-af2d7776-0-P580-10c067 (^1197-05-25T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q151587-P26-Q7996-4448a491-0-P580-62d46c (^1572-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q153319-P26-Q57852-a80af489-0-P580-56d3ba (^1725-06-01T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q154998-P26-Q234549-77a9d927-0-P580-45ce34 (^1525-10-29T00:00:00Z/11)\n", "\n", "*** Sitelink collision #1 detected for Q168254-wikipedia_sitelink-01f14b 
(https://ru.wikipedia.org/wiki/Рудольф_I_(король_Чехии))\n", "\n", "*** Qualifier collision #1 detected for Q184868-P26-Q390071-b34d3d54-0-P580-90dfde (^1680-07-18T00:00:00Z/11)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "*** Qualifier collision #1 detected for Q203647-P26-Q2284422-aed54bb0-0-P580-246002 (^1045-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q232137-P26-Q41847-0dcc4fd6-0-P580-fe3abc (^0956-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q263474-P26-Q3044-90d4ea9f-0-P580-9b0b8a (^0770-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q333359-P26-Q3052486-9cbd9d9e-0-P580-355ae9 (^0960-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q454810-P26-Q702209-74f88753-0-P580-e60df9 (^1476-08-25T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q672446-P26-Q2912335-f19e5091-0-P580-93d3bd (^1447-12-14T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q674931-P26-Q19601994-f7d507fb-0-P580-9b41a5 (^1222-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q694351-P26-Q329555-c88da6e5-0-P580-15a1f0 (^1381-09-02T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q702602-P26-Q79176-0f28ed9a-0-P580-676c21 (^1431-06-03T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q719501-P26-Q69462-4f695a08-0-P580-79dbc8 (^1512-07-06T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q903384-P159-Q1490-327fab97-0-P625-1c8a39 (@35.6895/139.692)\n", "\n", "*** Qualifier collision #1 detected for Q1381324-P26-Q535528-9a0e7ede-0-P580-c9b00e (^1221-06-19T00:00:00Z/11)\n", "The node collector called 2500000 times: 12500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 2500000 times: 0 nrows, 203335752 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Qualifier collision #1 detected for Q3139317-P159-Q9799-e865820c-0-P625-be8120 
(@50.8802/5.9595)\n", "\n", "*** Qualifier collision #1 detected for Q4115450-P159-Q191204-8fdad044-0-P625-a98823 (@35.569778/45.352163)\n", "\n", "*** Sitelink collision #1 detected for Q5411705-wikipedia_sitelink-e80085 (https://.wikipedia.org/wiki/Template:Clear)\n", "\n", "*** Sitelink collision #1 detected for Q5459259-wikipedia_sitelink-15f705 (https://.wikipedia.org/wiki/Template:Center)\n", "\n", "*** Sitelink collision #1 detected for Q5622198-wikipedia_sitelink-b0ead5 (https://.wikipedia.org/wiki/Template:Done)\n", "\n", "*** Sitelink collision #1 detected for Q5646673-wikipedia_sitelink-6bd5a0 (https://.wikipedia.org/wiki/Template:Pp-template)\n", "\n", "*** Sitelink collision #1 detected for Q6063221-wikipedia_sitelink-3210bb (https://.wikipedia.org/wiki/Template:Mbox)\n", "\n", "*** Sitelink collision #1 detected for Q6133158-wikipedia_sitelink-63b782 (https://.wikipedia.org/wiki/Template:@)\n", "The description collector called 2500000 times: 0 nrows, 10484843 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Qualifier collision #1 detected for Q6867218-P159-Q9268849-32a831ad-0-P625-51c420 (@52.223817/21.005108)\n", "\n", "*** Sitelink collision #1 detected for Q7008653-wikipedia_sitelink-06da49 (https://cs.wikipedia.org/wiki/Kategorie:Čínští_astronomové)\n", "\n", "*** Sitelink collision #1 detected for Q8386460-wikipedia_sitelink-dea037 (https://.wikipedia.org/wiki/Category:Documentation_subpages_without_corresponding_pages)\n", "\n", "*** Qualifier collision #1 detected for Q9061646-P39-Q84701409-5a714518-0-P580-3e1e37 (^1116-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q9061646-P39-Q84701409-5a714518-0-P582-ac0fb1 (^1154-00-00T00:00:00Z/9)\n", "\n", "*** Sitelink collision #1 detected for Q16748603-wikipedia_sitelink-bd3560 (https://.wikipedia.org/wiki/Module:No_globals)\n", "\n", "*** Sitelink collision #1 detected for Q17347205-wikipedia_sitelink-ed8d08 
(https://.wikipedia.org/wiki/Module:Category_handler/config)\n", "The sitelink collector called 1500000 times: 0 nrows, 74381419 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q23706330-wikipedia_sitelink-0b1301 (https://.wikipedia.org/wiki/Category:Pages_with_maps)\n", "The edge collector called 3000000 times: 0 nrows, 231396569 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The node collector called 3000000 times: 15000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Qualifier collision #1 detected for Q26878179-P26-Q55169081-de1c53f2-0-P580-01b412 (^1571-09-08T00:00:00Z/11)\n", "The description collector called 3000000 times: 0 nrows, 12596548 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The qual collector called 2500000 times: 0 nrows, 0 erows, 45264827 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q35831526-wikipedia_sitelink-40a1da (https://pl.wikipedia.org/wiki/Pierre_Kunde)\n", "The node collector called 3500000 times: 17500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 3500000 times: 0 nrows, 289972422 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "1500000 lines processed by processor 10\n", "1500000 lines processed by processor 7\n", "1500000 lines processed by processor 1\n", "1500000 lines processed by processor 8\n", "1500000 lines processed by processor 5\n", "1500000 lines processed by processor 0\n", "1500000 lines processed by processor 6\n", "1500000 lines processed by processor 9\n", "1500000 lines processed by processor 2\n", "1500000 lines processed by processor 11\n", "The description collector called 3500000 times: 0 nrows, 14969549 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "1500000 lines processed by processor 4\n", "1500000 lines processed by processor 3\n", "\n", "*** Qualifier collision #1 detected for 
Q54902946-P26-Q31191593-fb18c102-0-P580-d109bb (^1560-12-15T00:00:00Z/11)\n", "\n", "*** Sitelink collision #1 detected for Q56528384-wikipedia_sitelink-fc9f9d (https://.wikipedia.org/wiki/Module:I18n/date)\n", "The qual collector called 3000000 times: 0 nrows, 0 erows, 55513650 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Qualifier collision #1 detected for Q56582849-P26-Q72922-06e7a6cd-0-P580-c16f56 (^1499-01-21T00:00:00Z/11)\n", "The node collector called 4000000 times: 20000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 4000000 times: 0 nrows, 330848394 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Qualifier collision #1 detected for Q75458516-P26-Q7324457-79a267cb-0-P580-221dc5 (^1568-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q75552257-P26-Q75552262-2af17717-0-P580-04284b (^1556-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q75552262-P26-Q75552257-6fa3779f-0-P580-04284b (^1556-00-00T00:00:00Z/9)\n", "The description collector called 4000000 times: 0 nrows, 16990392 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The qual collector called 3500000 times: 0 nrows, 0 erows, 65653637 qrows, 0 invalid erows, 0 invalid qrows\n", "The node collector called 4500000 times: 22500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 4500000 times: 0 nrows, 369860822 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The sitelink collector called 2000000 times: 0 nrows, 82036402 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The description collector called 4500000 times: 0 nrows, 18920080 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "2000000 lines processed by processor 10\n", "2000000 lines processed by processor 7\n", "2000000 lines processed by processor 1\n", "2000000 lines processed by processor 8\n", "The qual collector called 4000000 times: 0 nrows, 0 erows, 76261288 qrows, 0 
invalid erows, 0 invalid qrows\n", "2000000 lines processed by processor 2\n", "2000000 lines processed by processor 6\n", "2000000 lines processed by processor 0\n", "2000000 lines processed by processor 5\n", "2000000 lines processed by processor 9\n", "2000000 lines processed by processor 11\n", "2000000 lines processed by processor 3\n", "2000000 lines processed by processor 4\n", "\n", "*** Qualifier collision #1 detected for Q40433-P26-Q463669-cd43ed58-0-P580-480b99 (^1550-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q57654-P26-Q154041-8d52292f-0-P580-3b3df4 (^1572-07-20T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q66270-P26-Q325505-28bc872e-0-P580-08d4a0 (^1478-05-29T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q66516-P26-Q61814-43ebfd75-0-P580-d435a1 (^1502-04-10T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q101877-P26-Q89405-6e0cba4d-0-P580-2b9eed (^1560-07-01T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q155167-P26-Q269586-cc56bab6-0-P580-c54274 (^1334-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q168669-P26-Q193658-6bff08d2-0-P580-e8a3ec (^0939-00-00T00:00:00Z/9)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "*** Qualifier collision #1 detected for Q241797-P26-Q7731-b7834ae7-0-P580-a01064 (^1671-02-01T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q262059-P26-Q187312-c501aba2-0-P580-7e48ad (^1302-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q267483-P26-Q57920-80635ac2-0-P580-c0fc4c (^1570-01-08T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q271799-P26-Q169319-a97c2304-0-P580-0d082c (^1523-12-11T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q326738-P26-Q684224-2df6ee20-0-P580-ff2137 (^1524-01-17T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q327572-P26-Q68952-ae5f6316-0-P580-5906e2 
(^1563-05-10T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q374210-P26-Q4768218-c9e0eacd-0-P580-ef8382 (^1571-12-19T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q384941-P26-Q380868-4ca9581a-0-P580-4b742f (^1533-08-25T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q536174-P26-Q551752-c9a99a5e-0-P580-16c9b2 (^1229-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q574718-P26-Q21153658-ffa49040-0-P580-03dd18 (^1319-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q623188-P26-Q553289-7323bb58-0-P580-d8d288 (^1090-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q947423-P26-Q5358431-95b068e2-0-P580-6b2ce5 (^1152-00-00T00:00:00Z/9)\n", "\n", "*** Sitelink collision #1 detected for Q948719-wikipedia_sitelink-ca8671 (https://arz.wikipedia.org/wiki/كنيسه_سان_انطولين_دى_توك)\n", "\n", "*** Qualifier collision #1 detected for Q2039358-P26-Q13474657-3f305fc3-0-P580-593f4e (^1558-00-00T00:00:00Z/9)\n", "The node collector called 5000000 times: 25000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 5000000 times: 0 nrows, 404401006 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q6705618-wikipedia_sitelink-499e91 (https://.wikipedia.org/wiki/Template:Autotranslate)\n", "\n", "*** Sitelink collision #1 detected for Q7221363-wikipedia_sitelink-ba6288 (https://.wikipedia.org/wiki/Category:Lua-based_templates)\n", "\n", "*** Qualifier collision #1 detected for Q7324457-P26-Q75567328-84b7c804-0-P580-4e6c67 (^1553-11-24T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q7324457-P26-Q75458516-fee1a551-0-P580-221dc5 (^1568-00-00T00:00:00Z/9)\n", "\n", "*** Sitelink collision #1 detected for Q9400517-wikipedia_sitelink-f64eb2 (https://en.wikipedia.org/wiki/Category:1822_in_the_arts)\n", "\n", "*** Sitelink collision #1 detected for 
Q10350561-wikipedia_sitelink-49c2ca (https://.wikipedia.org/wiki/Template:Lua)\n", "The description collector called 5000000 times: 0 nrows, 20939252 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q15116966-wikipedia_sitelink-36dfd1 (https://.wikipedia.org/wiki/Module:Message_box)\n", "\n", "*** Sitelink collision #1 detected for Q15212145-wikipedia_sitelink-0d0079 (https://.wikipedia.org/wiki/Template:LangSwitch)\n", "\n", "*** Sitelink collision #1 detected for Q17121869-wikipedia_sitelink-cb0eff (https://.wikipedia.org/wiki/Module:Lua_banner)\n", "The sitelink collector called 2500000 times: 0 nrows, 114406926 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Qualifier collision #1 detected for Q21153658-P26-Q574718-39f28f24-0-P580-03dd18 (^1319-00-00T00:00:00Z/9)\n", "\n", "*** Sitelink collision #1 detected for Q26856839-wikipedia_sitelink-661263 (https://.wikipedia.org/wiki/Template:Languages)\n", "\n", "*** Sitelink collision #1 detected for Q26905108-wikipedia_sitelink-c81953 (https://.wikipedia.org/wiki/Module:I18n/complex_date)\n", "The node collector called 5500000 times: 27500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 5500000 times: 0 nrows, 431246050 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The qual collector called 4500000 times: 0 nrows, 0 erows, 83285392 qrows, 0 invalid erows, 0 invalid qrows\n", "The description collector called 5500000 times: 0 nrows, 23133997 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "2500000 lines processed by processor 10\n", "2500000 lines processed by processor 1\n", "2500000 lines processed by processor 7\n", "2500000 lines processed by processor 8\n", "2500000 lines processed by processor 2\n", "2500000 lines processed by processor 6\n", "The node collector called 6000000 times: 30000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 6000000 
times: 0 nrows, 492235400 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "2500000 lines processed by processor 0\n", "2500000 lines processed by processor 5\n", "2500000 lines processed by processor 9\n", "2500000 lines processed by processor 3\n", "2500000 lines processed by processor 11\n", "2500000 lines processed by processor 4\n", "The qual collector called 5000000 times: 0 nrows, 0 erows, 93446268 qrows, 0 invalid erows, 0 invalid qrows\n", "The description collector called 6000000 times: 0 nrows, 25373519 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The node collector called 6500000 times: 32500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 6500000 times: 0 nrows, 531578519 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The qual collector called 5500000 times: 0 nrows, 0 erows, 103540865 qrows, 0 invalid erows, 0 invalid qrows\n", "The description collector called 6500000 times: 0 nrows, 27420419 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The sitelink collector called 3000000 times: 0 nrows, 123201789 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The node collector called 7000000 times: 35000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 7000000 times: 0 nrows, 571413703 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "3000000 lines processed by processor 10\n", "3000000 lines processed by processor 1\n", "3000000 lines processed by processor 8\n", "3000000 lines processed by processor 2\n", "The qual collector called 6000000 times: 0 nrows, 0 erows, 114459872 qrows, 0 invalid erows, 0 invalid qrows\n", "3000000 lines processed by processor 7\n", "3000000 lines processed by processor 6\n", "3000000 lines processed by processor 0\n", "3000000 lines processed by processor 5\n", "3000000 lines processed by processor 9\n", "3000000 lines processed by processor 3\n", "3000000 lines processed by processor 11\n", "3000000 lines 
processed by processor 4\n", "The description collector called 7000000 times: 0 nrows, 29368459 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q1084-wikipedia_sitelink-ac9eb4 (https://ks.wikipedia.org/wiki/ناوُت)\n", "\n", "*** Qualifier collision #1 detected for Q3044-P26-Q263474-631d88d0-0-P580-9b0b8a (^0770-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q3044-P26-Q261866-27b1ed09-0-P580-3fbd66 (^0794-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q27932-P26-Q287503-29306074-0-P580-11c3a9 (^1237-04-25T00:00:00Z/11)\n", "\n", "*** Sitelink collision #1 detected for Q35073-wikipedia_sitelink-c68d59 (https://kn.wikipedia.org/wiki/ಅಮೇರಿಕ_ಸಂಯುಕ್ತ_ಸಂಸ್ಥಾನದ_ರಾಷ್ಟ್ರಪತಿಗಳ_ಪಟ್ಟಿ)\n", "\n", "*** Qualifier collision #1 detected for Q38370-P26-Q80823-ae3ce4e4-0-P580-c9d352 (^1533-01-25T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q38370-P26-Q182637-8103e2ff-0-P580-7524c3 (^1536-05-30T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q38370-P26-Q57126-cb76b09d-0-P580-c55b0a (^1540-01-06T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q38370-P26-Q188926-259757b1-0-P580-3301d6 (^1540-07-28T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q38370-P26-Q192943-4b53adeb-0-P580-1ea2b6 (^1543-07-12T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q60563-P26-Q2915743-f5fdee07-0-P580-cade68 (^1169-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q65946-P26-Q462536-89b54878-0-P580-48f754 (^1407-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q66888-P26-Q3721846-b7243730-0-P580-4e0bc1 (^1571-01-09T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q66888-P26-Q23771111-34bc78ba-0-P580-713f01 (^1560-03-03T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q68304-P26-Q539111-dfcad6f4-0-P580-d0edbb (^1545-05-17T00:00:00Z/11)\n", "\n", "*** 
Qualifier collision #1 detected for Q95627-P26-Q354945-873a167d-0-P580-3b86a9 (^1276-11-24T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q121846-P26-Q55579391-afdbc2b3-0-P580-cae35d (^1284-00-00T00:00:00Z/9)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "*** Qualifier collision #1 detected for Q122794-P26-Q430950-e085ea2d-0-P580-94ae3a (^1577-10-20T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q124682-P26-Q337057-4fb67536-0-P580-db5ec5 (^1389-08-17T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q130005-P26-Q259564-c738415f-0-P580-a4a595 (^1045-01-23T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q132545-P26-Q131552-2fbc7eb5-0-P580-e56690 (^1533-10-28T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q134452-P26-Q201143-a2079e30-0-P580-7c0e43 (^1491-12-06T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q160349-P26-Q154064-ec5ff971-0-P580-7a7cba (^1385-07-17T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q220845-P26-Q936976-0f99833d-0-P580-5eeb19 (^1572-08-18T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q234257-P26-Q170398-56a0eb9a-0-P580-850b4d (^1816-01-24T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q259564-P26-Q130005-bd5ab415-0-P580-a4a595 (^1045-01-23T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q325583-P26-Q527486-704144b1-0-P580-f981af (^1577-05-19T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q326449-P26-Q23682783-85a9914e-0-P580-13178a (^1736-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q440132-P26-Q506527-db15118a-0-P580-2bef25 (^1524-11-06T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q469389-P26-Q1924994-36c61689-0-P580-017942 (^1377-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q658714-P26-Q20498980-28fdf4a6-0-P580-0672b7 (^1409-01-30T00:00:00Z/11)\n", "\n", "*** 
Qualifier collision #1 detected for Q682736-P26-Q68285-f3f03090-0-P580-eae385 (^1460-11-19T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q684224-P26-Q326738-18c31ccf-0-P580-ff2137 (^1524-01-17T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q684276-P26-Q61576937-87fbff2c-0-P580-2b5632 (^1217-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q703249-P26-Q1309296-09047836-0-P580-a97c74 (^1228-00-00T00:00:00Z/9)\n", "\n", "*** Sitelink collision #1 detected for Q960849-wikipedia_sitelink-286520 (https://fa.wikipedia.org/wiki/دپرس،_میزوری)\n", "\n", "*** Qualifier collision #1 detected for Q2028843-P26-Q37062-259ae253-0-P580-6f4356 (^1411-00-00T00:00:00Z/9)\n", "\n", "*** Sitelink collision #1 detected for Q2606511-wikipedia_sitelink-cef725 (https://sl.wikipedia.org/wiki/Libanonci)\n", "\n", "*** Sitelink collision #1 detected for Q3926105-wikipedia_sitelink-6d7f54 (https://.wikipedia.org/wiki/Template:Userbox)\n", "\n", "*** Sitelink collision #1 detected for Q3950914-wikipedia_sitelink-363c58 (https://es.wikipedia.org/wiki/Huevo_militar_de_acero)\n", "The node collector called 7500000 times: 37500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 7500000 times: 0 nrows, 605287885 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Qualifier collision #1 detected for Q6244699-P26-Q76366716-754c9057-0-P580-d7261a (^1579-04-27T00:00:00Z/11)\n", "\n", "*** Sitelink collision #1 detected for Q6483751-wikipedia_sitelink-8325f7 (https://it.wikipedia.org/wiki/Categoria:Cardinali)\n", "\n", "*** Sitelink collision #1 detected for Q7448333-wikipedia_sitelink-2dc780 (https://hr.wikipedia.org/wiki/Kategorija:Nenečki_autonomni_okrug)\n", "\n", "*** Sitelink collision #1 detected for Q8219368-wikipedia_sitelink-0ed7bb (https://.wikipedia.org/wiki/Category:Table_templates)\n", "\n", "*** Sitelink collision #1 detected for Q15117391-wikipedia_sitelink-f282a4 
(https://.wikipedia.org/wiki/Module:Message_box/configuration)\n", "The sitelink collector called 3500000 times: 0 nrows, 153766884 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The description collector called 7500000 times: 0 nrows, 31408266 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q25713407-wikipedia_sitelink-ce2b95 (https://.wikipedia.org/wiki/Template:CURRENTCONTENTLANGUAGE)\n", "The qual collector called 6500000 times: 0 nrows, 0 erows, 121180930 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q28058315-wikipedia_sitelink-5f24f5 (https://en.wikipedia.org/wiki/Christian_Guiffroy)\n", "The node collector called 8000000 times: 40000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 8000000 times: 0 nrows, 633304695 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "3500000 lines processed by processor 10\n", "3500000 lines processed by processor 1\n", "The description collector called 8000000 times: 0 nrows, 33708602 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "3500000 lines processed by processor 2\n", "3500000 lines processed by processor 8\n", "3500000 lines processed by processor 7\n", "3500000 lines processed by processor 6\n", "3500000 lines processed by processor 0\n", "3500000 lines processed by processor 3\n", "3500000 lines processed by processor 5\n", "3500000 lines processed by processor 11\n", "3500000 lines processed by processor 9\n", "3500000 lines processed by processor 4\n", "The node collector called 8500000 times: 42500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 8500000 times: 0 nrows, 692814980 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The qual collector called 7000000 times: 0 nrows, 0 erows, 131310519 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for 
Q64944842-wikipedia_sitelink-498ca7 (https://.wikipedia.org/wiki/Module:Portal_navigation)\n", "\n", "*** Qualifier collision #1 detected for Q65617406-P26-Q265478-6faeca05-0-P580-dc9c16 (^1884-05-30T00:00:00Z/11)\n", "The sitelink collector called 4000000 times: 0 nrows, 164403765 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The description collector called 8500000 times: 0 nrows, 35790747 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The node collector called 9000000 times: 45000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 9000000 times: 0 nrows, 732292927 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The qual collector called 7500000 times: 0 nrows, 0 erows, 141580707 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Qualifier collision #1 detected for Q76366716-P26-Q6244699-b5d45f0b-0-P580-d7261a (^1579-04-27T00:00:00Z/11)\n", "The description collector called 9000000 times: 0 nrows, 37831424 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The node collector called 9500000 times: 47500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 9500000 times: 0 nrows, 773047193 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "4000000 lines processed by processor 10\n", "4000000 lines processed by processor 1\n", "4000000 lines processed by processor 2\n", "4000000 lines processed by processor 8\n", "4000000 lines processed by processor 6\n", "4000000 lines processed by processor 7\n", "The qual collector called 8000000 times: 0 nrows, 0 erows, 152535707 qrows, 0 invalid erows, 0 invalid qrows\n", "4000000 lines processed by processor 11\n", "4000000 lines processed by processor 5\n", "4000000 lines processed by processor 3\n", "4000000 lines processed by processor 0\n", "\n", "*** Sitelink collision #1 detected for Q102226589-wikipedia_sitelink-9c8625 (https://.wikipedia.org/wiki/Template:User_mnw)\n", "4000000 lines processed by processor 
9\n", "4000000 lines processed by processor 4\n", "\n", "*** Sitelink collision #1 detected for Q3740-wikipedia_sitelink-95c040 (https://.wikipedia.org/wiki/Category:Templates)\n", "\n", "*** Qualifier collision #1 detected for Q41847-P26-Q232137-573ea212-0-P580-fe3abc (^0956-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q60211-P26-Q264709-a5d5e20b-0-P580-7f1413 (^1564-12-17T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q60386-P26-Q157776-b68a50b9-0-P580-e55fcf (^1478-09-06T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q70828-P26-Q110845-78948fbb-0-P580-189c4f (^1282-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q79176-P26-Q702602-bcda292d-0-P580-676c21 (^1431-06-03T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q119050-P26-Q26882160-c09f6014-0-P580-78cd0b (^1567-01-13T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q120365-P26-Q69620-c54a3667-0-P580-05429a (^1116-07-13T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q167782-P26-Q231794-aef59aa3-0-P580-7e2e98 (^1350-04-08T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q170586-P26-Q231742-11295529-0-P580-90c66e (^1313-07-00T00:00:00Z/10)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "*** Qualifier collision #1 detected for Q172203-P26-Q229419-b442326a-0-P580-a50c51 (^1262-05-28T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q174964-P26-Q231798-bd2d3d6b-0-P580-dc0f7a (^1322-09-21T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q202566-P26-Q688471-440b6399-0-P580-283d12 (^1531-09-20T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q232801-P26-Q721680-fa26b14e-0-P580-70598b (^1473-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q270234-P26-Q210569-6b693078-0-P580-f6928a (^1446-06-20T00:00:00Z/11)\n", "\n", "*** Sitelink collision #1 detected for 
Q274740-wikipedia_sitelink-d71f3f (https://tl.wikipedia.org/wiki/Beaurevoir)\n", "\n", "*** Qualifier collision #1 detected for Q325824-P26-Q547225-762b0607-0-P580-df29d7 (^1467-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q441394-P26-Q57161-47bffbac-0-P580-77780b (^1308-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q454769-P26-Q76956-91d862f6-0-P580-981a99 (^1245-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q455201-P26-Q152148-9eb66558-0-P580-e1de94 (^1389-05-02T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q479538-P26-Q98010-40ca7cda-0-P580-31ff5b (^1582-11-10T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q513315-P26-Q87066-cd6b2f7c-0-P580-3f638b (^1551-03-10T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q513315-P26-Q70019-5c7fa382-0-P580-f9548c (^1558-08-01T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q547225-P26-Q325824-31db3890-0-P580-df29d7 (^1467-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q567378-P26-Q434771-205319b2-0-P580-4d06ab (^1509-11-20T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q571597-P26-Q327750-b0a44162-0-P580-f16789 (^1555-09-10T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q687028-P26-Q2334373-525f829d-0-P580-a6af64 (^1556-02-16T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q1070853-P26-Q2467970-c7d5f6fa-0-P580-d71f7b (^1358-09-04T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q1141121-P26-Q380373-eeba5d95-0-P580-2e184a (^1294-00-00T00:00:00Z/9)\n", "\n", "*** Sitelink collision #1 detected for Q1773344-wikipedia_sitelink-139861 (https://tr.wikipedia.org/wiki/Flor_da_Serra_do_Sul)\n", "\n", "*** Qualifier collision #1 detected for Q1916706-P26-Q80714-35bbccc5-0-P580-d3fce7 (^1109-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q2049198-P26-Q63291-be046904-0-P580-f3b88a 
(^1372-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q2334373-P26-Q687028-b6b9f398-0-P580-a6af64 (^1556-02-16T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q2334373-P26-Q328693-3f939052-0-P580-0d15c6 (^1543-08-26T00:00:00Z/11)\n", "The description collector called 9500000 times: 0 nrows, 39826758 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q3926051-wikipedia_sitelink-ed0ce0 (https://.wikipedia.org/wiki/Template:Tl)\n", "\n", "*** Sitelink collision #1 detected for Q4481730-wikipedia_sitelink-07edc3 (https://.wikipedia.org/wiki/Template:Tracked)\n", "\n", "*** Sitelink collision #1 detected for Q4989282-wikipedia_sitelink-dac0bb (https://.wikipedia.org/wiki/Category:Pages_with_broken_file_links)\n", "\n", "*** Sitelink collision #1 detected for Q5070586-wikipedia_sitelink-b5b81d (https://.wikipedia.org/wiki/Template:Shortcut)\n", "\n", "*** Sitelink collision #1 detected for Q6027565-wikipedia_sitelink-9eb9b0 (https://.wikipedia.org/wiki/Template:Tag)\n", "\n", "*** Sitelink collision #1 detected for Q6068612-wikipedia_sitelink-e74015 (https://.wikipedia.org/wiki/Template:Talk_archive)\n", "The node collector called 10000000 times: 50000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 10000000 times: 0 nrows, 805704462 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q6648427-wikipedia_sitelink-9f95d5 (https://lb.wikipedia.org/wiki/Kategorie:Gebuer_39_v._Chr.)\n", "\n", "*** Qualifier collision #1 detected for Q6940461-P159-Q61302-c45d5aa7-0-P625-dc88d7 (@28.6386/-106.0756)\n", "\n", "*** Sitelink collision #1 detected for Q7145341-wikipedia_sitelink-ba378c (https://vi.wikipedia.org/wiki/Thể_loại:Hải_chiến)\n", "\n", "*** Sitelink collision #1 detected for Q7643575-wikipedia_sitelink-10e2ca (https://.wikipedia.org/wiki/Template:Colon)\n", "\n", "*** Qualifier collision #1 
detected for Q9150575-P26-Q679083-79dd46a6-0-P580-5d5db4 (^1320-00-00T00:00:00Z/9)\n", "The sitelink collector called 4500000 times: 0 nrows, 192543120 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Qualifier collision #1 detected for Q13972091-P26-Q75389849-5b19ecc3-0-P1319-532ed8 (^1509-07-04T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q13972091-P26-Q6469914-b9869239-0-P1319-839147 (^1520-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q15281133-P26-Q75240211-8a7057f8-0-P580-97ad08 (^1526-07-20T00:00:00Z/11)\n", "\n", "*** Sitelink collision #1 detected for Q15605797-wikipedia_sitelink-40fef6 (https://.wikipedia.org/wiki/Module:List)\n", "\n", "*** Sitelink collision #1 detected for Q17347230-wikipedia_sitelink-2f8d40 (https://.wikipedia.org/wiki/Module:Category_handler/blacklist)\n", "\n", "*** Qualifier collision #1 detected for Q20202663-P26-Q299612-893fda0a-0-P580-b50376 (^1080-00-00T00:00:00Z/9)\n", "\n", "*** Sitelink collision #1 detected for Q20819962-wikipedia_sitelink-03a379 (https://.wikipedia.org/wiki/Module:Fallback)\n", "The qual collector called 8500000 times: 0 nrows, 0 erows, 159450296 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Qualifier collision #1 detected for Q26877297-P26-Q542751-a70d423c-0-P580-d584ea (^1488-02-17T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q27031232-P26-Q55217321-0fe60a4f-0-P580-7606e7 (^1280-00-00T00:00:00Z/9)\n", "The description collector called 10000000 times: 0 nrows, 41925034 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The node collector called 10500000 times: 52500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 10500000 times: 0 nrows, 833352982 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "4500000 lines processed by processor 10\n", "4500000 lines processed by processor 1\n", "4500000 lines processed by processor 2\n", "4500000 lines processed by processor 8\n", 
"4500000 lines processed by processor 6\n", "4500000 lines processed by processor 7\n", "4500000 lines processed by processor 11\n", "\n", "*** Qualifier collision #1 detected for Q44191792-P26-Q54862322-4c83e8e6-0-P580-a27cd1 (^1567-08-21T00:00:00Z/11)\n", "4500000 lines processed by processor 3\n", "4500000 lines processed by processor 0\n", "4500000 lines processed by processor 5\n", "4500000 lines processed by processor 9\n", "4500000 lines processed by processor 4\n", "The qual collector called 9000000 times: 0 nrows, 0 erows, 169386569 qrows, 0 invalid erows, 0 invalid qrows\n", "The description collector called 10500000 times: 0 nrows, 44262604 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The node collector called 11000000 times: 55000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 11000000 times: 0 nrows, 894279381 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The sitelink collector called 5000000 times: 0 nrows, 205212568 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Qualifier collision #1 detected for Q64506586-P26-Q262726-b670dee9-0-P580-4a9d3d (^1298-00-00T00:00:00Z/9)\n", "The description collector called 11000000 times: 0 nrows, 46327925 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The qual collector called 9500000 times: 0 nrows, 0 erows, 179469241 qrows, 0 invalid erows, 0 invalid qrows\n", "The node collector called 11500000 times: 57500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 11500000 times: 0 nrows, 932948176 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Qualifier collision #1 detected for Q75395291-P26-Q76157640-e3d697ee-0-P580-54254d (^1578-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q75420332-P26-Q208922-3b5559ee-0-P580-447dca (^1559-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q76157640-P26-Q75395291-d75eee5e-0-P580-54254d 
(^1578-00-00T00:00:00Z/9)\n", "5000000 lines processed by processor 10\n", "5000000 lines processed by processor 1\n", "5000000 lines processed by processor 2\n", "5000000 lines processed by processor 8\n", "5000000 lines processed by processor 7\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "5000000 lines processed by processor 6\n", "The qual collector called 10000000 times: 0 nrows, 0 erows, 190433714 qrows, 0 invalid erows, 0 invalid qrows\n", "The node collector called 12000000 times: 60000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 12000000 times: 0 nrows, 973473705 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "5000000 lines processed by processor 11\n", "The description collector called 11500000 times: 0 nrows, 48252977 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "5000000 lines processed by processor 3\n", "5000000 lines processed by processor 0\n", "5000000 lines processed by processor 5\n", "5000000 lines processed by processor 9\n", "5000000 lines processed by processor 4\n", "\n", "*** Sitelink collision #1 detected for Q7164-wikipedia_sitelink-65f45f (https://tr.wikipedia.org/wiki/Dünya_Bankası)\n", "\n", "*** Qualifier collision #1 detected for Q7731-P26-Q259907-7f7cc241-0-P580-8d5052 (^1648-01-26T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q7731-P26-Q241797-ff9269a2-0-P580-a01064 (^1671-02-01T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q57920-P26-Q267483-a2460de3-0-P580-c0fc4c (^1570-01-08T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q62483-P26-Q229286-18b62769-0-P580-bebb21 (^1541-06-14T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q62483-P26-Q261905-fc01d066-0-P580-7aecc7 (^1546-07-18T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q69334-P26-Q2419674-1dc5e587-0-P580-e1ff18 (^1183-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for 
Q72922-P26-Q56582849-77ca7313-0-P580-c16f56 (^1499-01-21T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q76956-P26-Q454769-cf7fc40d-0-P580-981a99 (^1245-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q86055-P26-Q24661944-f75c4596-0-P580-54820b (^1472-10-19T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q168664-P26-Q15193-1b533b05-0-P580-a310ca (^1793-10-09T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q276526-P26-Q10855916-3e70b907-0-P580-f18c2a (^1392-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q287503-P26-Q316828-d4637da7-0-P580-9879f5 (^1261-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q316831-P26-Q238609-208f7dcc-0-P580-92ae06 (^1153-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q316831-P26-Q450971-656d5797-0-P580-5ed4f3 (^1177-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q336754-P26-Q2084307-30a93eb5-0-P580-b520a9 (^1318-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q370902-P26-Q75289133-2d7df0e9-0-P580-83a193 (^1275-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q430782-P26-Q3007367-9502d33f-0-P580-5b468d (^1555-02-07T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q443876-P26-Q453771-bbc80f51-0-P580-84a26a (^1446-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q563792-P26-Q4958342-a85e5b57-0-P580-acfb1b (^1391-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q588852-P26-Q58514-55e81240-0-P580-ae0480 (^1514-10-09T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q651948-P26-Q9165680-bf5d7e43-0-P580-a08da9 (^1396-03-06T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q679083-P26-Q9150575-c56910ae-0-P580-5d5db4 (^1320-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q936976-P26-Q220845-281a5972-0-P580-5eeb19 
(^1572-08-18T00:00:00Z/11)\n", "\n", "*** Qualifier collision #1 detected for Q1166728-P26-Q1494018-db61e006-0-P580-3550f9 (^1285-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q1494018-P26-Q1166728-5c17988d-0-P580-3550f9 (^1285-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q1524640-P26-Q166853-40fa3891-0-P580-515f76 (^1375-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q2465095-P26-Q1167368-4ffb7291-0-P580-e64863 (^1257-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q2834394-P39-Q84701409-f487718d-0-P580-ac0fb1 (^1154-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q2834394-P39-Q84701409-f487718d-0-P582-35fc60 (^1173-00-00T00:00:00Z/9)\n", "\n", "*** Qualifier collision #1 detected for Q2844574-P793-Q2238935-db1dea90-0-P585-ab2ece (^1350-00-00T00:00:00Z/7)\n", "\n", "*** Sitelink collision #1 detected for Q4608595-wikipedia_sitelink-83747b (https://.wikipedia.org/wiki/Template:Documentation)\n", "\n", "*** Sitelink collision #1 detected for Q5611978-wikipedia_sitelink-a20fba (https://.wikipedia.org/wiki/Template:Welcome)\n", "\n", "*** Sitelink collision #1 detected for Q6117427-wikipedia_sitelink-31e0a3 (https://.wikipedia.org/wiki/Template:Multicol-end)\n", "\n", "*** Qualifier collision #1 detected for Q6129540-P106-Q25393460-4c72cbac-0-P580-9eefc6 (^1552-07-17T00:00:00Z/11)\n", "\n", "*** Sitelink collision #1 detected for Q6330737-wikipedia_sitelink-b995ff (https://.wikipedia.org/wiki/Category:Template_documentation_pages)\n", "\n", "*** Sitelink collision #1 detected for Q6426831-wikipedia_sitelink-e94d3d (https://.wikipedia.org/wiki/Template:Edit_filter_warning)\n", "\n", "*** Sitelink collision #1 detected for Q7605021-wikipedia_sitelink-0626c0 (https://.wikipedia.org/wiki/Template:Comment)\n", "The node collector called 12500000 times: 62500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 12500000 
times: 0 nrows, 1005847985 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q8353981-wikipedia_sitelink-2a54e3 (https://.wikipedia.org/wiki/Template:Multicol)\n", "The sitelink collector called 5500000 times: 0 nrows, 231223828 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The description collector called 12000000 times: 0 nrows, 50310499 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q15098140-wikipedia_sitelink-3d8060 (https://.wikipedia.org/wiki/Module:Yesno)\n", "\n", "*** Sitelink collision #1 detected for Q15117218-wikipedia_sitelink-36ace4 (https://.wikipedia.org/wiki/Module:Category_handler)\n", "\n", "*** Sitelink collision #1 detected for Q15506579-wikipedia_sitelink-01bc80 (https://.wikipedia.org/wiki/Module:Documentation/config)\n", "\n", "*** Sitelink collision #1 detected for Q8244473-wikipedia_sitelink-c1fedd (https://.wikipedia.org/wiki/Module:InfoboxImage)\n", "The qual collector called 10500000 times: 0 nrows, 0 erows, 197550975 qrows, 0 invalid erows, 0 invalid qrows\n", "\n", "*** Sitelink collision #1 detected for Q20962109-wikipedia_sitelink-502976 (https://.wikipedia.org/wiki/Module:ISOdate)\n", "\n", "*** Sitelink collision #1 detected for Q22910717-wikipedia_sitelink-8e101e (https://.wikipedia.org/wiki/Template:Sandbox_other)\n", "\n", "*** Sitelink collision #1 detected for Q25714577-wikipedia_sitelink-59e42c (https://.wikipedia.org/wiki/Module:WikidataIB)\n", "\n", "*** Qualifier collision #1 detected for Q26877285-P26-Q828710-08b99587-0-P580-c30f0a (^1566-02-16T00:00:00Z/11)\n", "The node collector called 13000000 times: 65000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The edge collector called 13000000 times: 0 nrows, 1034260545 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", "The description collector called 12500000 times: 0 nrows, 52422958 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", 
"5500000 lines processed by processor 10\n", "5500000 lines processed by processor 1\n", "5500000 lines processed by processor 2\n", "5500000 lines processed by processor 7\n", "5500000 lines processed by processor 8\n", "5500000 lines processed by processor 6\n", "5500000 lines processed by processor 11\n", "5500000 lines processed by processor 3\n" ] } ], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " import-wikidata \\\n", " -i ${WIKIDATA_ALL_JSON} \\\n", " --node-file ${TEMP}/metadata.node.${UNSORTED_KGTK} \\\n", " --minimal-edge-file ${TEMP}/claims.raw.${UNSORTED_KGTK} \\\n", " --minimal-qual-file ${TEMP}/qualifiers.raw.${UNSORTED_KGTK} \\\n", " --invalid-edge-file ${TEMP}/claims.badvalue.${UNSORTED_KGTK} \\\n", " --invalid-qual-file ${TEMP}/qualifiers.badvalue.${UNSORTED_KGTK} \\\n", " --node-file-id-only \\\n", " --explode-values False \\\n", " --all-languages False \\\n", " --lang en \\\n", " --alias-edges True \\\n", " --split-alias-file ${TEMP}/aliases.${UNSORTED_KGTK} \\\n", " --split-en-alias-file ${TEMP}/aliases.en.${UNSORTED_KGTK} \\\n", " --description-edges True \\\n", " --split-description-file ${TEMP}/descriptions.${UNSORTED_KGTK} \\\n", " --split-en-description-file ${TEMP}/descriptions.en.${UNSORTED_KGTK} \\\n", " --label-edges True \\\n", " --split-label-file ${TEMP}/labels.${UNSORTED_KGTK} \\\n", " --split-en-label-file ${TEMP}/labels.en.${UNSORTED_KGTK} \\\n", " --datatype-edges True \\\n", " --split-datatype-file ${TEMP}/metadata.property.datatypes.${UNSORTED_KGTK} \\\n", " --entry-type-edges True \\\n", " --split-type-file ${TEMP}/metadata.types.${UNSORTED_KGTK} \\\n", " --sitelink-edges True \\\n", " --sitelink-verbose-edges True \\\n", " --split-sitelink-file ${TEMP}/sitelinks.raw.${UNSORTED_KGTK} \\\n", " --split-en-sitelink-file ${TEMP}/sitelinks.en.raw.${UNSORTED_KGTK} \\\n", " --value-hash-width 6 \\\n", " --claim-id-hash-width 8 \\\n", " --use-kgtkwriter True \\\n", " --use-mgzip-for-input False \\\n", " --use-mgzip-for-output False 
\\\n", " --use-shm True \\\n", " --procs 12 \\\n", " --mapper-batch-size 5 \\\n", " --max-size-per-mapper-queue 3 \\\n", " --single-mapper-queue True \\\n", " --collect-results True \\\n", " --collect-seperately True\\\n", " --collector-batch-size 5 \\\n", " --collector-queue-per-proc-size 3 \\\n", " --progress-interval 500000 \\\n", " --clean \\\n", " --allow-end-of-day False \\\n", " --repair-month-or-day-zero \\\n", " --minimum-valid-year 1 \\\n", " --maximum-valid-year 9999 \\\n", " --validate-fromisoformat \\\n", " --repair-lax-coordinates \\\n", " --allow-language-suffixes \\\n", " --allow-wikidata-lq-strings \\\n", " | tee ${TEMP}/import-split-wikidata.log\n" ] }, { "cell_type": "markdown", "id": "3d4a8e15-8826-4d50-83c9-9a17346eb206", "metadata": {}, "source": [ "## Split `somevalue` and `novalue` from `claims.raw.unsorted.tsv.gz`" ] }, { "cell_type": "code", "execution_count": null, "id": "6aad9a3c-c27b-4858-802b-b633c40dbb5d", "metadata": {}, "outputs": [], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \\\n", " --input-file ${TEMP}/claims.raw.${UNSORTED_KGTK} \\\n", " --first-match-only \\\n", " --pattern \";; novalue\" -o ${TEMP}/claims.novalue.${UNSORTED_KGTK} \\\n", " --pattern \";; somevalue\" -o ${TEMP}/claims.somevalue.${UNSORTED_KGTK} \\\n", " --reject-file ${TEMP}/claims.${UNSORTED_KGTK} \\\n", " | tee ${TEMP}/split-claims-missing-values.log" ] }, { "cell_type": "markdown", "id": "d7d9a090-de83-4e9f-84d9-93fb49f486b8", "metadata": {}, "source": [ "## Split `somevalue` and `novalue` from `qualifiers.raw.unsorted.tsv.gz`" ] }, { "cell_type": "code", "execution_count": null, "id": "7538ad9d-018a-45ea-95f8-46467e68affe", "metadata": {}, "outputs": [], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \\\n", " --input-file ${TEMP}/qualifiers.raw.${UNSORTED_KGTK} \\\n", " --first-match-only \\\n", " --pattern \";; novalue\" -o ${TEMP}/qualifiers.novalue.${UNSORTED_KGTK} \\\n", " --pattern
\";; somevalue\" -o ${TEMP}/qualifiers.somevalue.${UNSORTED_KGTK} \\\n", " --reject-file - \\\n", " / ifexists ${VERBOSE} \\\n", " --input-keys node1 \\\n", " --filter-file ${TEMP}/claims.novalue.${UNSORTED_KGTK} \\\n", " --filter-keys id \\\n", " --output-file ${TEMP}/qualifiers.novalueClaims.${UNSORTED_KGTK} \\\n", " --reject-file - \\\n", " / ifexists ${VERBOSE} \\\n", " --input-keys node1 \\\n", " --filter-file ${TEMP}/claims.somevalue.${UNSORTED_KGTK} \\\n", " --filter-keys id \\\n", " --output-file ${TEMP}/qualifiers.somevalueClaims.${UNSORTED_KGTK} \\\n", " --reject-file - \\\n", " / ifexists ${VERBOSE} \\\n", " --input-keys node1 \\\n", " --filter-file ${TEMP}/claims.badvalue.${UNSORTED_KGTK} \\\n", " --filter-keys id \\\n", " --output-file ${TEMP}/qualifiers.badvalueClaims.${UNSORTED_KGTK} \\\n", " --reject-file ${TEMP}/qualifiers.${UNSORTED_KGTK} \\\n", " | tee ${TEMP}/split-qualifiers-missing-values.log" ] }, { "cell_type": "markdown", "id": "d6f67ef6-4570-4786-b41f-a1355fb63981", "metadata": {}, "source": [ "## Split `sitelinks.raw.unsorted.tsv.gz`" ] }, { "cell_type": "code", "execution_count": null, "id": "261a84b8-c671-4134-afaa-12af4c4a7762", "metadata": {}, "outputs": [], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \\\n", " --input-file ${TEMP}/sitelinks.raw.${UNSORTED_KGTK} \\\n", " --pattern \"; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;\" \\\n", " --output-file ${TEMP}/sitelinks.qualifiers.${UNSORTED_KGTK} \\\n", " --reject-file ${TEMP}/sitelinks.${UNSORTED_KGTK} \\\n", " | tee ${TEMP}/split-sitelink-qualifiers.log" ] }, { "cell_type": "markdown", "id": "e7d55217-89c7-42e9-b01b-aaf3e0426234", "metadata": {}, "source": [ "## Split `sitelinks.en.raw.unsorted.tsv.gz`" ] }, { "cell_type": "code", "execution_count": null, "id": "9bce9d69-6032-4e2f-aa04-12da7998d508", "metadata": {}, "outputs": [], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \\\n", " 
--input-file ${TEMP}/sitelinks.en.raw.${UNSORTED_KGTK} \\\n", " --pattern \"; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;\" \\\n", " --output-file ${TEMP}/sitelinks.en.qualifiers.${UNSORTED_KGTK} \\\n", " --reject-file ${TEMP}/sitelinks.en.${UNSORTED_KGTK} \\\n", " | tee ${TEMP}/split-sitelink-en-qualifiers.log" ] }, { "cell_type": "markdown", "id": "9275fecc-98db-435c-863a-7f4d780f64c9", "metadata": {}, "source": [ "## Sort the files from `TEMP` to `OUT` folder" ] }, { "cell_type": "code", "execution_count": null, "id": "90b70419-1894-4dcb-954d-5b83d4c80d48", "metadata": {}, "outputs": [], "source": [ "# Sort every split file from TEMP into OUT, logging each run to TEMP/<target>-sorted.log.\n", "# The per-file command is stored in `kgtk_sort_command` so that the notebook parameter\n", "# `sort_command` (the name of the sort binary) is not overwritten when this cell runs.\n", "for TARGET in WIKIDATA_IMPORT_SPLIT_FILES:\n", "    print(f\"Sort the {TARGET} file.\")\n", "    input_file = f\"{os.environ['TEMP']}/{TARGET}.{os.environ['UNSORTED_KGTK']}\"\n", "    output_file = f\"{os.environ['OUT']}/{TARGET}.{os.environ['SORTED_KGTK']}\"\n", "    logfile = f\"{os.environ['TEMP']}/{TARGET}-sorted.log\"\n", "    # Backslash-newline inside the f-string joins the pieces into a single shell line.\n", "    kgtk_sort_command = f\"\"\"kgtk {os.environ['KGTK_FLAGS']} \\\n", "        sort {os.environ['VERBOSE']} \\\n", "        --input-file {input_file} \\\n", "        --output-file {output_file} \\\n", "        --gzip-command {os.environ['GZIP_CMD']} \\\n", "        --sort-command {os.environ['SORT_COMMAND']} \\\n", "        --extra '{os.environ['SORT_EXTRAS']}' | tee {logfile}\"\"\"\n", "    !$kgtk_sort_command\n" ] }, { "cell_type": "markdown", "id": "22ea12ca-5cb1-4e51-82ae-6cb733f4a555", "metadata": {}, "source": [ "## Build the `all.tsv.gz` file" ] }, { "cell_type": "code", "execution_count": null, "id": "47560fa3-d87e-4840-800e-ecdf7d1d4341", "metadata": {}, "outputs": [], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " cat ${VERBOSE} --use-mgzip=${USE_MGZIP} \\\n", " --input-file ${TEMP}/claims.${UNSORTED_KGTK} \\\n", " --input-file ${TEMP}/qualifiers.${UNSORTED_KGTK} \\\n", " --input-file ${TEMP}/aliases.${UNSORTED_KGTK} \\\n", " --input-file ${TEMP}/descriptions.${UNSORTED_KGTK} \\\n", " --input-file ${TEMP}/labels.${UNSORTED_KGTK} \\\n", " --input-file ${TEMP}/sitelinks.${UNSORTED_KGTK}
\\\n", " --input-file ${TEMP}/sitelinks.qualifiers.${UNSORTED_KGTK} \\\n", " --input-file ${TEMP}/metadata.types.${UNSORTED_KGTK} \\\n", " --input-file ${TEMP}/metadata.property.datatypes.${UNSORTED_KGTK} \\\n", " / sort ${VERBOSE} \\\n", " --gzip-command ${GZIP_CMD} \\\n", " --extra \"${SORT_EXTRAS}\" \\\n", " --output-file ${OUT}/all.${SORTED_KGTK} \\\n", "| tee ${TEMP}/build-all-edges.log" ] }, { "cell_type": "markdown", "id": "aba89f00", "metadata": {}, "source": [ "## Subset" ] }, { "cell_type": "code", "execution_count": 5, "id": "69fef4d8", "metadata": {}, "outputs": [], "source": [ "# Keep the rows of all.tsv.gz that match the node list in GRAPH/qnodes.tsv\n", "# (no explicit keys are given, so kgtk's default join keys are used).\n", "!kgtk ifexists --input-file ${OUT}/all.tsv.gz \\\n", " --filter-on ${GRAPH}/qnodes.tsv \\\n", " -o ${TEMP}/all.sub.filtered.tsv.gz" ] }, { "cell_type": "code", "execution_count": 6, "id": "94e81a56", "metadata": {}, "outputs": [], "source": [ "# Keep the rows whose node1 equals the id of a row in all.sub.filtered.tsv.gz.\n", "!kgtk ifexists --input-file ${OUT}/all.tsv.gz \\\n", " --filter-file ${TEMP}/all.sub.filtered.tsv.gz \\\n", " --input-keys 'node1' \\\n", " --filter-keys 'id' \\\n", " -o ${TEMP}/all.qual.filtered.tsv.gz" ] }, { "cell_type": "code", "execution_count": 7, "id": "f305d054", "metadata": {}, "outputs": [], "source": [ "# Keep the rows whose node1 appears as a label in all.sub.filtered.tsv.gz.\n", "!kgtk ifexists --input-file ${OUT}/all.tsv.gz \\\n", " --filter-file ${TEMP}/all.sub.filtered.tsv.gz \\\n", " --input-keys 'node1' \\\n", " --filter-keys 'label' \\\n", " -o ${TEMP}/all.prop-sub.filtered.tsv.gz" ] }, { "cell_type": "code", "execution_count": 8, "id": "341ae643", "metadata": {}, "outputs": [], "source": [ "# Keep the rows whose node1 appears as a label in all.qual.filtered.tsv.gz.\n", "!kgtk ifexists --input-file ${OUT}/all.tsv.gz \\\n", " --filter-file ${TEMP}/all.qual.filtered.tsv.gz \\\n", " --input-keys 'node1' \\\n", " --filter-keys 'label' \\\n", " -o ${TEMP}/all.prop-qual.filtered.tsv.gz" ] }, { "cell_type": "code", "execution_count": 12, "id": "beee355c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting kgtkcat pid=22692\n", "Opening the 4 input files.\n", "Opening file 1:
/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.sub.filtered.tsv.gz\n", "input format: kgtk\n", "Using KGTK_GRAPH_CACHE='/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", "Using the sort command 'sort'\n", "header pipe: read_fd=4 write_fd=5\n", "sort options pipe: read_fd=6 write_fd=7\n", "gzip output file: '/data02/ana_iglesias/data/import-wikidata/all.subset.tsv.gz'\n", "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data02/ana_iglesias/data/import-wikidata/all.subset.tsv.gz'\n", "Graph cache '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.sub.filtered.tsv.gz' not found in the cache.\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: File_path.suffix: .gz\n", "KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.sub.filtered.tsv.gz\n", "Running the sort script (pid=22833).\n", "Reading the KGTK input file header line with KgtkReader\n", "input format: kgtk\n", "Using KGTK_GRAPH_CACHE='/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", "Graph cache '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: reading file descriptor 4\n", "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\tlang\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "The output file will be an edge file.\n", "Mapping the 7 column names in 
/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.sub.filtered.tsv.gz.\n", "Opening file 2: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.qual.filtered.tsv.gz\n", "input format: kgtk\n", "Using KGTK_GRAPH_CACHE='/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", "Graph cache '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.qual.filtered.tsv.gz' not found in the cache.\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: File_path.suffix: .gz\n", "KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.qual.filtered.tsv.gz\n", "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\tlang\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "Mapping the 7 column names in /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.qual.filtered.tsv.gz.\n", "Opening file 3: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-sub.filtered.tsv.gz\n", "input format: kgtk\n", "Using KGTK_GRAPH_CACHE='/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", "Graph cache '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-sub.filtered.tsv.gz' not found in the cache.\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: File_path.suffix: .gz\n", "KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-sub.filtered.tsv.gz\n", "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\tlang\n", "node1 column found, this is a 
KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "Mapping the 7 column names in /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-sub.filtered.tsv.gz.\n", "Opening file 4: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-qual.filtered.tsv.gz\n", "input format: kgtk\n", "Using KGTK_GRAPH_CACHE='/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", "Graph cache '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-qual.filtered.tsv.gz' not found in the cache.\n", "KgtkReader: OK to use the fast read path.\n", "KgtkReader: File_path.suffix: .gz\n", "KgtkReader: reading mgzip with 3 threads: /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-qual.filtered.tsv.gz\n", "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\tlang\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "Mapping the 7 column names in /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-qual.filtered.tsv.gz.\n", "There are 7 merged columns.\n", "Using the system commands for fast copies.\n", "The total file size (292730734) meets the minimum for fast copies (10000).\n", "system command: \"( gzip --decompress --stdout '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.sub.filtered.tsv.gz' && gzip --decompress --stdout '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.qual.filtered.tsv.gz' | tail -n +2 && gzip --decompress --stdout 
'/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-sub.filtered.tsv.gz' | tail -n +2 && gzip --decompress --stdout '/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-qual.filtered.tsv.gz' | tail -n +2 )\"\n", "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\tlang\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: is_edge_file=True is_node_file=False\n", "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", "KgtkReader: Reading a kgtk file using the fast path.\n", "KGTK header: id node1 label node2 rank node2;wikidatatype lang\n", "sort options: --parallel 6 --buffer-size 50% -T /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", "\n", "Waiting for the sort command to complete.\n", "\n", "\n", "Running the cat script (pid=22863).\n", "\n", "Waiting for the cat command to complete.\n", "\n", "Timing: elapsed=0:00:10.477162 CPU=0:00:04.786855 ( 45.7%): cat --verbose --use-mgzip=TRUE --input-file /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.sub.filtered.tsv.gz --input-file /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.qual.filtered.tsv.gz --input-file /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-sub.filtered.tsv.gz --input-file /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-qual.filtered.tsv.gz\n", "Cleanup.\n", "Timing: elapsed=0:00:15.628176 CPU=0:00:04.074626 ( 26.1%): sort --verbose --gzip-command pigz --extra --parallel 6 --buffer-size 50% -T /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata --output-file /data02/ana_iglesias/data/import-wikidata/all.subset.tsv.gz\n", "Timing: elapsed=0:00:16.431546 CPU=0:00:06.860200 ( 41.8%): cat --verbose --use-mgzip=TRUE --input-file /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.sub.filtered.tsv.gz --input-file 
/data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.qual.filtered.tsv.gz --input-file /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-sub.filtered.tsv.gz --input-file /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata/all.prop-qual.filtered.tsv.gz / sort --verbose --gzip-command pigz --extra --parallel 6 --buffer-size 50% -T /data02/ana_iglesias/data/import-wikidata/temp.import-wikidata --output-file /data02/ana_iglesias/data/import-wikidata/all.subset.tsv.gz\n" ] } ], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " cat ${VERBOSE} --use-mgzip=${USE_MGZIP} \\\n", " --input-file ${TEMP}/all.sub.filtered.tsv.gz \\\n", " --input-file ${TEMP}/all.qual.filtered.tsv.gz \\\n", " --input-file ${TEMP}/all.prop-sub.filtered.tsv.gz \\\n", " --input-file ${TEMP}/all.prop-qual.filtered.tsv.gz \\\n", " / sort ${VERBOSE} \\\n", " --gzip-command ${GZIP_CMD} \\\n", " --extra \"${SORT_EXTRAS}\" \\\n", " --output-file ${OUT}/all.subset.${SORTED_KGTK} \\\n", "| tee ${TEMP}/build-subset.log" ] }, { "cell_type": "code", "execution_count": null, "id": "b105f781-0a64-4f43-a780-82cbf9dedad8", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 5 }