{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "f0vqPVwV4PLp" }, "source": [ "# WD-AMC dataset generation in KGTK\n", "This notebook generates the WD-AMC dataset with KGTK in the following three representations: qualifiers, Standard Reification, and N-ary Relationships." ] }, { "cell_type": "markdown", "metadata": { "id": "uaMp3Z644ZMS" }, "source": [ "## Setting up KGTK and loading data" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "executionInfo": { "elapsed": 796, "status": "ok", "timestamp": 1663628829174, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "aEvTuC6N4eUi" }, "outputs": [], "source": [ "import io\n", "import os\n", "import subprocess\n", "import sys\n", "import csv\n", "import pandas as pd\n", "\n", "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", "from kgtk.functions import kgtk, kypher" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "executionInfo": { "elapsed": 5, "status": "ok", "timestamp": 1663628829175, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "5hwj2XmI4lt4" }, "outputs": [], "source": [ "# Parameters\n", "\n", "# Folder on local machine where to create the output and temporary folders\n", "input_path = \"/data02/ana_iglesias/data/subset/parts\"\n", "output_path = \"/data02/ana_iglesias/data/subset\"\n", "project_name = \"reframings\"" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 8555, "status": "ok", "timestamp": 1663628837725, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "YnbIYphc4mWX", "outputId": "bfedaa34-df01-4e68-c193-4b13280a0046" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User home: /data02/ana_iglesias\n", "Current dir: /data02/ana_iglesias/data/subset\n", "KGTK dir: /data02/ana_iglesias/data\n", "Use-cases dir: 
/data02/ana_iglesias/data/use-cases\n" ] } ], "source": [ "files = [\n", " \"all\",\n", " \"alias\",\n", " \"claims\",\n", " \"description\",\n", " \"label\",\n", " \"datatypes\",\n", " \"qualifiers\"\n", "]\n", "\n", "ck = ConfigureKGTK(files)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", " output_path=output_path,\n", " project_name=project_name)\n" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 4, "status": "ok", "timestamp": 1663628838043, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "mKGSbCVY782a", "outputId": "963a6cd7-6b93-449e-b99b-a2aac201b43f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kypher: kgtk query --graph-cache /data02/ana_iglesias/data/subset/reframings/temp.reframings/wikidata.sqlite3.db\n", "GRAPH: /data02/ana_iglesias/data/subset/parts\n", "KGTK_GRAPH_CACHE: /data02/ana_iglesias/data/subset/reframings/temp.reframings/wikidata.sqlite3.db\n", "KGTK_LABEL_FILE: /data02/ana_iglesias/data/subset/parts/labels.en.tsv.gz\n", "kgtk: kgtk\n", "TEMP: /data02/ana_iglesias/data/subset/reframings/temp.reframings\n", "EXAMPLES_DIR: /data02/ana_iglesias/data/examples\n", "USE_CASES_DIR: /data02/ana_iglesias/data/use-cases\n", "OUT: /data02/ana_iglesias/data/subset/reframings\n", "STORE: /data02/ana_iglesias/data/subset/reframings/temp.reframings/wikidata.sqlite3.db\n", "KGTK_OPTION_DEBUG: false\n", "all: /data02/ana_iglesias/data/subset/parts/all.tsv.gz\n", "alias: /data02/ana_iglesias/data/subset/parts/aliases.en.tsv.gz\n", "claims: /data02/ana_iglesias/data/subset/parts/claims.tsv.gz\n", "description: /data02/ana_iglesias/data/subset/parts/descriptions.en.tsv.gz\n", "label: /data02/ana_iglesias/data/subset/parts/labels.en.tsv.gz\n", "datatypes: /data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz\n", "qualifiers: 
/data02/ana_iglesias/data/subset/parts/qualifiers.tsv.gz\n" ] } ], "source": [ "ck.print_env_variables()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 22557, "status": "ok", "timestamp": 1663628860598, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "H-7s0o2p7_FU", "outputId": "7dadbab1-7a68-41f4-c85b-1b9366840898" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kgtk query --graph-cache /data02/ana_iglesias/data/subset/reframings/temp.reframings/wikidata.sqlite3.db -i \"/data02/ana_iglesias/data/subset/parts/all.tsv.gz\" --as all -i \"/data02/ana_iglesias/data/subset/parts/aliases.en.tsv.gz\" --as alias -i \"/data02/ana_iglesias/data/subset/parts/claims.tsv.gz\" --as claims -i \"/data02/ana_iglesias/data/subset/parts/descriptions.en.tsv.gz\" --as description -i \"/data02/ana_iglesias/data/subset/parts/labels.en.tsv.gz\" --as label -i \"/data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz\" --as datatypes -i \"/data02/ana_iglesias/data/subset/parts/qualifiers.tsv.gz\" --as qualifiers --limit 3\n", "node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "P10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\tP10-P1628-32b85d-7927ece6-0\turl\n", "P10\tP1628\t\"https://schema.org/video\"\tP10-P1628-acf60d-b8950832-0\turl\n", "P10\tP1629\tQ34508\tP10-P1629-Q34508-bcc39400-0\twikibase-item\n", "CPU times: user 3.34 ms, sys: 9.93 ms, total: 13.3 ms\n", "Wall time: 25.9 s\n" ] } ], "source": [ "%%time\n", "ck.load_files_into_cache()" ] }, { "cell_type": "markdown", "metadata": { "id": "ErD74LCh8KqV" }, "source": [ "## Creating base claims " ] }, { "cell_type": "markdown", "metadata": { "id": "qfcLTxVtHtFP" }, "source": [ "Removing human nodes from the `claims` file and saving it as `claims_base.tsv`. 
The complementary file, containing only the human claims, is saved to `reframingclaims.tsv`." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "executionInfo": { "elapsed": 13548, "status": "ok", "timestamp": 1663628934162, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "DjeEii-9a1AX", "outputId": "8de8c4df-2445-45ce-b526-593269c7f863" }, "outputs": [ { "data": { "text/html": [ "
\n", " | node1 | \n", "label | \n", "node2 | \n", "id | \n", "
---|---|---|---|---|
0 | \n", "Q100292318 | \n", "P1040 | \n", "Q24578312 | \n", "Q100292318-P1040-Q24578312-fa7dc50b-0 | \n", "
1 | \n", "Q100292318 | \n", "P1258 | \n", "m/the_addams_family_2 | \n", "Q100292318-P1258-841eb3-3f6f8f58-0 | \n", "
2 | \n", "Q100292318 | \n", "P1265 | \n", "278122 | \n", "Q100292318-P1265-d7a0f2-29dfe293-0 | \n", "
3 | \n", "Q100292318 | \n", "P136 | \n", "Q157443 | \n", "Q100292318-P136-Q157443-ca2b6c26-0 | \n", "
4 | \n", "Q100292318 | \n", "P136 | \n", "Q28968258 | \n", "Q100292318-P136-Q28968258-47a1de5a-0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1921154 | \n", "Q97365172 | \n", "P86 | \n", "Q1740191 | \n", "Q97365172-P86-Q1740191-38c6c027-0 | \n", "
1921155 | \n", "Q97365172 | \n", "P8687 | \n", "+12773 | \n", "Q97365172-P8687-1e0b17-c5647aa0-0 | \n", "
1921156 | \n", "Q97365172 | \n", "P8687 | \n", "+53434 | \n", "Q97365172-P8687-a1b4dd-667276a6-0 | \n", "
1921157 | \n", "Q97365172 | \n", "P9751 | \n", "umc.cmc.jzpcwzmyd6h9eaadrakph6ta | \n", "Q97365172-P9751-1b0487-17916163-0 | \n", "
1921158 | \n", "Q97365172 | \n", "P9821 | \n", "7125 | \n", "Q97365172-P9821-85ea86-6c81ab56-0 | \n", "
1921159 rows × 4 columns
\n", "\n", " | id | \n", "node1 | \n", "label | \n", "node2 | \n", "node2;wikidatatype | \n", "
---|---|---|---|---|---|
0 | \n", "P10-P1628-32b85d-7927ece6-0 | \n", "P10 | \n", "P1628 | \n", "http://www.w3.org/2006/vcard/ns#Video | \n", "url | \n", "
1 | \n", "P10-P1628-acf60d-b8950832-0 | \n", "P10 | \n", "P1628 | \n", "https://schema.org/video | \n", "url | \n", "
2 | \n", "P10-P1629-Q34508-bcc39400-0 | \n", "P10 | \n", "P1629 | \n", "Q34508 | \n", "wikibase-item | \n", "
3 | \n", "P10-P1630-53947a-fbe9093e-0 | \n", "P10 | \n", "P1630 | \n", "https://commons.wikimedia.org/wiki/File:$1 | \n", "string | \n", "
4 | \n", "P10-P1659-P1651-c4068028-0 | \n", "P10 | \n", "P1659 | \n", "P1651 | \n", "wikibase-property | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
2337030 | \n", "Q99998027-P31-Q11483816-d1dfd1de-0 | \n", "Q99998027 | \n", "P31 | \n", "Q11483816 | \n", "wikibase-item | \n", "
2337031 | \n", "Q99998027-P664-Q617433-3d360b0b-0 | \n", "Q99998027 | \n", "P664 | \n", "Q617433 | \n", "wikibase-item | \n", "
2337032 | \n", "Q99999126-P31-Q11407181-5b3cc2ad-0 | \n", "Q99999126 | \n", "P31 | \n", "Q11407181 | \n", "wikibase-item | \n", "
2337033 | \n", "Q99999126-P31-Q11483816-46ec0a53-0 | \n", "Q99999126 | \n", "P31 | \n", "Q11483816 | \n", "wikibase-item | \n", "
2337034 | \n", "Q99999126-P664-Q41506-a0b5bdfa-0 | \n", "Q99999126 | \n", "P664 | \n", "Q41506 | \n", "wikibase-item | \n", "
2337035 rows × 5 columns
\n", "\n", " | node1 | \n", "label | \n", "node2 | \n", "id | \n", "
---|---|---|---|---|
0 | \n", "Q1000118-P1441-Q28146833-e939a5a7-0 | \n", "P175 | \n", "Q28556723 | \n", "Q1000118-P1441-Q28146833-e939a5a7-0-P175-Q2855... | \n", "
1 | \n", "Q1000118-P345-ce5234-73bca1aa-0 | \n", "P2241 | \n", "Q44374960 | \n", "Q1000118-P345-ce5234-73bca1aa-0-P2241-Q44374960-0 | \n", "
2 | \n", "Q1000118-P6262-169bd9-1227c261-0 | \n", "P1810 | \n", "Peter Pettigrew | \n", "Q1000118-P6262-169bd9-1227c261-0-P1810-6f074a-0 | \n", "
3 | \n", "Q1000118-P6262-169bd9-1227c261-0 | \n", "P407 | \n", "Q809 | \n", "Q1000118-P6262-169bd9-1227c261-0-P407-Q809-0 | \n", "
4 | \n", "Q1000118-P6262-169bd9-1227c261-0 | \n", "P9675 | \n", "286 | \n", "Q1000118-P6262-169bd9-1227c261-0-P9675-ca871a-0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
420679 | \n", "Q999960-P10527-Q50920401-c2581644-0 | \n", "P1810 | \n", "Cochet, Jean-Laurent | \n", "Q999960-P10527-Q50920401-c2581644-0-P1810-5c66... | \n", "
420680 | \n", "Q999960-P166-Q10855271-52323a81-0 | \n", "P585 | \n", "^2006-01-01T00:00:00Z/9 | \n", "Q999960-P166-Q10855271-52323a81-0-P585-cf2407-0 | \n", "
420681 | \n", "Q999960-P166-Q13452531-0ed37b2c-0 | \n", "P585 | \n", "^2012-01-01T00:00:00Z/9 | \n", "Q999960-P166-Q13452531-0ed37b2c-0-P585-979d4e-0 | \n", "
420682 | \n", "Q999960-P166-Q3405661-d2f16aa9-0 | \n", "P585 | \n", "^1984-01-01T00:00:00Z/9 | \n", "Q999960-P166-Q3405661-d2f16aa9-0-P585-a649f8-0 | \n", "
420683 | \n", "Q999960-P166-Q3405863-ec957bcb-0 | \n", "P585 | \n", "^1975-01-01T00:00:00Z/9 | \n", "Q999960-P166-Q3405863-ec957bcb-0-P585-572da6-0 | \n", "
420684 rows × 4 columns
\n", "\n", " | node1 | \n", "label | \n", "node2 | \n", "id | \n", "
---|---|---|---|---|
0 | \n", "Q100292318 | \n", "P1552 | \n", "Q27834579 | \n", "Q100292318-P1552-Q27834579-87b20a63-0 | \n", "
1 | \n", "Q100292318 | \n", "P1552 | \n", "Q27847754 | \n", "Q100292318-P1552-Q27847754-d80110c0-0 | \n", "
2 | \n", "Q100292318 | \n", "P1651 | \n", "k1UNQFEUsPg | \n", "Q100292318-P1651-cb1243-138008d8-0 | \n", "
3 | \n", "Q100292318 | \n", "P1657 | \n", "Q18665334 | \n", "Q100292318-P1657-Q18665334-dc007cbc-0 | \n", "
4 | \n", "Q100292318 | \n", "P1981 | \n", "Q20644795 | \n", "Q100292318-P1981-Q20644795-8508e9f8-0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
226432 | \n", "Q97365172 | \n", "P725 | \n", "Q837676 | \n", "Q97365172-P725-Q837676-2c91e4c2-0 | \n", "
226433 | \n", "Q97365172 | \n", "P725 | \n", "Q965261 | \n", "Q97365172-P725-Q965261-40d9e1b3-0 | \n", "
226434 | \n", "Q97365172 | \n", "P856 | \n", "https://www.starwars.com/series/star-wars-the-... | \n", "Q97365172-P856-efdce4-8d456f9b-0 | \n", "
226435 | \n", "Q97365172 | \n", "P8687 | \n", "+12773 | \n", "Q97365172-P8687-1e0b17-c5647aa0-0 | \n", "
226436 | \n", "Q97365172 | \n", "P8687 | \n", "+53434 | \n", "Q97365172-P8687-a1b4dd-667276a6-0 | \n", "
226437 rows × 4 columns
\n", "\n", " | id | \n", "node1 | \n", "label | \n", "node2 | \n", "
---|---|---|---|---|
0 | \n", "P10-P1628-32b85d-7927ece6-0 | \n", "P10 | \n", "P1628 | \n", "http://www.w3.org/2006/vcard/ns#Video | \n", "
1 | \n", "P10-P1628-acf60d-b8950832-0 | \n", "P10 | \n", "P1628 | \n", "https://schema.org/video | \n", "
2 | \n", "P10-P1629-Q34508-bcc39400-0 | \n", "P10 | \n", "P1629 | \n", "Q34508 | \n", "
3 | \n", "P10-P1630-53947a-fbe9093e-0 | \n", "P10 | \n", "P1630 | \n", "https://commons.wikimedia.org/wiki/File:$1 | \n", "
4 | \n", "P10-P1659-P1651-c4068028-0 | \n", "P10 | \n", "P1659 | \n", "P1651 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
4031755 | \n", "Q97365172-P8411-Q17480853-32949b92-0 | \n", "Q97365172 | \n", "P8411 | \n", "Q17480853 | \n", "
4031756 | \n", "Q97365172-P8411-Q2775969-7f0bc6e5-0 | \n", "Q97365172 | \n", "P8411 | \n", "Q2775969 | \n", "
4031757 | \n", "Q97365172-P86-Q1740191-38c6c027-0 | \n", "Q97365172 | \n", "P86 | \n", "Q1740191 | \n", "
4031758 | \n", "Q97365172-P9751-1b0487-17916163-0 | \n", "Q97365172 | \n", "P9751 | \n", "umc.cmc.jzpcwzmyd6h9eaadrakph6ta | \n", "
4031759 | \n", "Q97365172-P9821-85ea86-6c81ab56-0 | \n", "Q97365172 | \n", "P9821 | \n", "7125 | \n", "
4031760 rows × 4 columns
\n", "\n", " | node1 | \n", "label | \n", "node2 | \n", "id | \n", "
---|---|---|---|---|
0 | \n", "Q1000115P1343Q6023581694a9be0 | \n", "P31 | \n", "Q3539534 | \n", "Q1000115P1343Q6023581694a9be0-P31-Q3539534 | \n", "
1 | \n", "Q1000115P1343Q6023581694a9be0 | \n", "P805 | \n", "Q24514151 | \n", "Q1000115P1343Q6023581694a9be0-P805-Q24514151 | \n", "
2 | \n", "Q1000115P1343Q6023581694a9be0 | \n", "Pobject | \n", "Q602358 | \n", "Q1000115P1343Q6023581694a9be0-Pobject-Q602358 | \n", "
3 | \n", "Q1000115P1343Q6023581694a9be0 | \n", "Ppredicate | \n", "P1343 | \n", "Q1000115P1343Q6023581694a9be0-Ppredicate-P1343 | \n", "
4 | \n", "Q1000115P1343Q6023581694a9be0 | \n", "Psubject | \n", "Q1000115 | \n", "Q1000115P1343Q6023581694a9be0-Psubject-Q1000115 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
3991818 | \n", "Q99P86724e9ef1a5c7bbb50 | \n", "P1810 | \n", "California | \n", "Q99P86724e9ef1a5c7bbb50-P1810-965e75 | \n", "
3991819 | \n", "Q99P86724e9ef1a5c7bbb50 | \n", "P31 | \n", "Q3539534 | \n", "Q99P86724e9ef1a5c7bbb50-P31-Q3539534 | \n", "
3991820 | \n", "Q99P86724e9ef1a5c7bbb50 | \n", "Pobject | \n", "1259995120016912384 | \n", "Q99P86724e9ef1a5c7bbb50-Pobject-4e9ef1 | \n", "
3991821 | \n", "Q99P86724e9ef1a5c7bbb50 | \n", "Ppredicate | \n", "P8672 | \n", "Q99P86724e9ef1a5c7bbb50-Ppredicate-P8672 | \n", "
3991822 | \n", "Q99P86724e9ef1a5c7bbb50 | \n", "Psubject | \n", "Q99 | \n", "Q99P86724e9ef1a5c7bbb50-Psubject-Q99 | \n", "
3991823 rows × 4 columns
\n", "\n", " | count | \n", "
---|---|
0 | \n", "671897 | \n", "