{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "f0vqPVwV4PLp" }, "source": [ "# WD-AMC dataset generation in KGTK\n", "In the following representations: qualifiers, Standard Reification and N-ary Relationships" ] }, { "cell_type": "markdown", "metadata": { "id": "uaMp3Z644ZMS" }, "source": [ "## Setting up KGTK and loading data" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "executionInfo": { "elapsed": 796, "status": "ok", "timestamp": 1663628829174, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "aEvTuC6N4eUi" }, "outputs": [], "source": [ "import io\n", "import os\n", "import subprocess\n", "import sys\n", "import csv\n", "import pandas as pd\n", "\n", "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", "from kgtk.functions import kgtk, kypher" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "executionInfo": { "elapsed": 5, "status": "ok", "timestamp": 1663628829175, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "5hwj2XmI4lt4" }, "outputs": [], "source": [ "# Parameters\n", "\n", "# Folder on local machine where to create the output and temporary folders\n", "input_path = \"/data02/ana_iglesias/data/subset/parts\"\n", "output_path = \"/data02/ana_iglesias/data/subset\"\n", "project_name = \"reframings\"" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 8555, "status": "ok", "timestamp": 1663628837725, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "YnbIYphc4mWX", "outputId": "bfedaa34-df01-4e68-c193-4b13280a0046" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User home: /data02/ana_iglesias\n", "Current dir: /data02/ana_iglesias/data/subset\n", "KGTK dir: /data02/ana_iglesias/data\n", "Use-cases dir: 
/data02/ana_iglesias/data/use-cases\n" ] } ], "source": [ "files = [\n", " \"all\",\n", " \"alias\",\n", " \"claims\",\n", " \"description\",\n", " \"label\",\n", " \"datatypes\",\n", " \"qualifiers\"\n", "]\n", "\n", "ck = ConfigureKGTK(files)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", " output_path=output_path,\n", " project_name=project_name)\n" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 4, "status": "ok", "timestamp": 1663628838043, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "mKGSbCVY782a", "outputId": "963a6cd7-6b93-449e-b99b-a2aac201b43f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kypher: kgtk query --graph-cache /data02/ana_iglesias/data/subset/reframings/temp.reframings/wikidata.sqlite3.db\n", "GRAPH: /data02/ana_iglesias/data/subset/parts\n", "KGTK_GRAPH_CACHE: /data02/ana_iglesias/data/subset/reframings/temp.reframings/wikidata.sqlite3.db\n", "KGTK_LABEL_FILE: /data02/ana_iglesias/data/subset/parts/labels.en.tsv.gz\n", "kgtk: kgtk\n", "TEMP: /data02/ana_iglesias/data/subset/reframings/temp.reframings\n", "EXAMPLES_DIR: /data02/ana_iglesias/data/examples\n", "USE_CASES_DIR: /data02/ana_iglesias/data/use-cases\n", "OUT: /data02/ana_iglesias/data/subset/reframings\n", "STORE: /data02/ana_iglesias/data/subset/reframings/temp.reframings/wikidata.sqlite3.db\n", "KGTK_OPTION_DEBUG: false\n", "all: /data02/ana_iglesias/data/subset/parts/all.tsv.gz\n", "alias: /data02/ana_iglesias/data/subset/parts/aliases.en.tsv.gz\n", "claims: /data02/ana_iglesias/data/subset/parts/claims.tsv.gz\n", "description: /data02/ana_iglesias/data/subset/parts/descriptions.en.tsv.gz\n", "label: /data02/ana_iglesias/data/subset/parts/labels.en.tsv.gz\n", "datatypes: /data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz\n", "qualifiers: 
/data02/ana_iglesias/data/subset/parts/qualifiers.tsv.gz\n" ] } ], "source": [ "ck.print_env_variables()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 22557, "status": "ok", "timestamp": 1663628860598, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "H-7s0o2p7_FU", "outputId": "7dadbab1-7a68-41f4-c85b-1b9366840898" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kgtk query --graph-cache /data02/ana_iglesias/data/subset/reframings/temp.reframings/wikidata.sqlite3.db -i \"/data02/ana_iglesias/data/subset/parts/all.tsv.gz\" --as all -i \"/data02/ana_iglesias/data/subset/parts/aliases.en.tsv.gz\" --as alias -i \"/data02/ana_iglesias/data/subset/parts/claims.tsv.gz\" --as claims -i \"/data02/ana_iglesias/data/subset/parts/descriptions.en.tsv.gz\" --as description -i \"/data02/ana_iglesias/data/subset/parts/labels.en.tsv.gz\" --as label -i \"/data02/ana_iglesias/data/subset/parts/metadata.property.datatypes.tsv.gz\" --as datatypes -i \"/data02/ana_iglesias/data/subset/parts/qualifiers.tsv.gz\" --as qualifiers --limit 3\n", "node1\tlabel\tnode2\tid\tnode2;wikidatatype\n", "P10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\tP10-P1628-32b85d-7927ece6-0\turl\n", "P10\tP1628\t\"https://schema.org/video\"\tP10-P1628-acf60d-b8950832-0\turl\n", "P10\tP1629\tQ34508\tP10-P1629-Q34508-bcc39400-0\twikibase-item\n", "CPU times: user 3.34 ms, sys: 9.93 ms, total: 13.3 ms\n", "Wall time: 25.9 s\n" ] } ], "source": [ "%%time\n", "ck.load_files_into_cache()" ] }, { "cell_type": "markdown", "metadata": { "id": "ErD74LCh8KqV" }, "source": [ "## Creating base claims " ] }, { "cell_type": "markdown", "metadata": { "id": "qfcLTxVtHtFP" }, "source": [ "Removing human nodes from the `claims` file and saving it as `claims_base.tsv`. 
The disjoint file, containing only human claims, is saved to `reframingclaims.tsv`" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "executionInfo": { "elapsed": 13548, "status": "ok", "timestamp": 1663628934162, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "DjeEii-9a1AX", "outputId": "8de8c4df-2445-45ce-b526-593269c7f863" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2id
0Q100292318P1040Q24578312Q100292318-P1040-Q24578312-fa7dc50b-0
1Q100292318P1258m/the_addams_family_2Q100292318-P1258-841eb3-3f6f8f58-0
2Q100292318P1265278122Q100292318-P1265-d7a0f2-29dfe293-0
3Q100292318P136Q157443Q100292318-P136-Q157443-ca2b6c26-0
4Q100292318P136Q28968258Q100292318-P136-Q28968258-47a1de5a-0
...............
1921154Q97365172P86Q1740191Q97365172-P86-Q1740191-38c6c027-0
1921155Q97365172P8687+12773Q97365172-P8687-1e0b17-c5647aa0-0
1921156Q97365172P8687+53434Q97365172-P8687-a1b4dd-667276a6-0
1921157Q97365172P9751umc.cmc.jzpcwzmyd6h9eaadrakph6taQ97365172-P9751-1b0487-17916163-0
1921158Q97365172P98217125Q97365172-P9821-85ea86-6c81ab56-0
\n", "

1921159 rows × 4 columns

\n", "
" ], "text/plain": [ " node1 label node2 \\\n", "0 Q100292318 P1040 Q24578312 \n", "1 Q100292318 P1258 m/the_addams_family_2 \n", "2 Q100292318 P1265 278122 \n", "3 Q100292318 P136 Q157443 \n", "4 Q100292318 P136 Q28968258 \n", "... ... ... ... \n", "1921154 Q97365172 P86 Q1740191 \n", "1921155 Q97365172 P8687 +12773 \n", "1921156 Q97365172 P8687 +53434 \n", "1921157 Q97365172 P9751 umc.cmc.jzpcwzmyd6h9eaadrakph6ta \n", "1921158 Q97365172 P9821 7125 \n", "\n", " id \n", "0 Q100292318-P1040-Q24578312-fa7dc50b-0 \n", "1 Q100292318-P1258-841eb3-3f6f8f58-0 \n", "2 Q100292318-P1265-d7a0f2-29dfe293-0 \n", "3 Q100292318-P136-Q157443-ca2b6c26-0 \n", "4 Q100292318-P136-Q28968258-47a1de5a-0 \n", "... ... \n", "1921154 Q97365172-P86-Q1740191-38c6c027-0 \n", "1921155 Q97365172-P8687-1e0b17-c5647aa0-0 \n", "1921156 Q97365172-P8687-a1b4dd-667276a6-0 \n", "1921157 Q97365172-P9751-1b0487-17916163-0 \n", "1921158 Q97365172-P9821-85ea86-6c81ab56-0 \n", "\n", "[1921159 rows x 4 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Collect every claim of the entities whose P31 type is in the list below,\n", "# leaving out the pagerank / degree labels, and write them to $TEMP.\n", "kgtk(\"\"\"\n", " query -i claims\n", " --match '(s)-[t:P31]->(type),\n", " (s)-[p]->(o)'\n", " --where 'p.label != \"Pdirected_pagerank\" and \n", " p.label != \"Pundirected_pagerank\" and \n", " p.label != \"Pout_degree\" and \n", " p.label != \"Pin_degree\" and\n", " type IN [\"Q5\", \"Q229390\", \"Q24869\", \"Q11424\", \"Q581714\",\"Q11425\",\"Q29168811\",\"Q5398426\",\"Q526877\",\"Q19020\",\"Q15773347\",\"Q15773317\"]'\n", " --return 's, p.label, o, p'\n", " -o $TEMP/reframingclaims-temp.tsv\n", "\"\"\")\n", "\n", "# Make the result available to later queries under the alias reframingclaimstemp.\n", "kgtk(\"query -i $TEMP/reframingclaims-temp.tsv --as reframingclaimstemp\")\n", "\n", "# Read the TSV back through the TEMP environment variable (the folder kgtk expands $TEMP to)\n", "# instead of a machine-specific absolute path, so the cell follows the parameters cell.\n", "reframing_claims = pd.read_csv(os.path.join(os.environ[\"TEMP\"], \"reframingclaims-temp.tsv\"), sep=\"\\t\")\n", "reframing_claims" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "executionInfo": { 
"elapsed": 60038, "status": "ok", "timestamp": 1663628920621, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "h30g-PS58R2g", "outputId": "48019370-d10e-4154-e986-5a6b1a94b8a6" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2node2;wikidatatype
0P10-P1628-32b85d-7927ece6-0P10P1628http://www.w3.org/2006/vcard/ns#Videourl
1P10-P1628-acf60d-b8950832-0P10P1628https://schema.org/videourl
2P10-P1629-Q34508-bcc39400-0P10P1629Q34508wikibase-item
3P10-P1630-53947a-fbe9093e-0P10P1630https://commons.wikimedia.org/wiki/File:$1string
4P10-P1659-P1651-c4068028-0P10P1659P1651wikibase-property
..................
2337030Q99998027-P31-Q11483816-d1dfd1de-0Q99998027P31Q11483816wikibase-item
2337031Q99998027-P664-Q617433-3d360b0b-0Q99998027P664Q617433wikibase-item
2337032Q99999126-P31-Q11407181-5b3cc2ad-0Q99999126P31Q11407181wikibase-item
2337033Q99999126-P31-Q11483816-46ec0a53-0Q99999126P31Q11483816wikibase-item
2337034Q99999126-P664-Q41506-a0b5bdfa-0Q99999126P664Q41506wikibase-item
\n", "

2337035 rows × 5 columns

\n", "
" ], "text/plain": [ " id node1 label \\\n", "0 P10-P1628-32b85d-7927ece6-0 P10 P1628 \n", "1 P10-P1628-acf60d-b8950832-0 P10 P1628 \n", "2 P10-P1629-Q34508-bcc39400-0 P10 P1629 \n", "3 P10-P1630-53947a-fbe9093e-0 P10 P1630 \n", "4 P10-P1659-P1651-c4068028-0 P10 P1659 \n", "... ... ... ... \n", "2337030 Q99998027-P31-Q11483816-d1dfd1de-0 Q99998027 P31 \n", "2337031 Q99998027-P664-Q617433-3d360b0b-0 Q99998027 P664 \n", "2337032 Q99999126-P31-Q11407181-5b3cc2ad-0 Q99999126 P31 \n", "2337033 Q99999126-P31-Q11483816-46ec0a53-0 Q99999126 P31 \n", "2337034 Q99999126-P664-Q41506-a0b5bdfa-0 Q99999126 P664 \n", "\n", " node2 node2;wikidatatype \n", "0 http://www.w3.org/2006/vcard/ns#Video url \n", "1 https://schema.org/video url \n", "2 Q34508 wikibase-item \n", "3 https://commons.wikimedia.org/wiki/File:$1 string \n", "4 P1651 wikibase-property \n", "... ... ... \n", "2337030 Q11483816 wikibase-item \n", "2337031 Q617433 wikibase-item \n", "2337032 Q11407181 wikibase-item \n", "2337033 Q11483816 wikibase-item \n", "2337034 Q41506 wikibase-item \n", "\n", "[2337035 rows x 5 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Keep only the claims whose id does not appear in reframingclaimstemp.\n", "kgtk(\"\"\"ifnotexists --input-file claims \\\n", " --filter-file reframingclaimstemp \\\n", " --input-keys 'id' \\\n", " --filter-keys 'id' \\\n", " -o $TEMP/claims_base.tsv\"\"\")\n", "\n", "kgtk(\"query -i $TEMP/claims_base.tsv --as base_claims\")\n", "\n", "# Read the TSV back through the TEMP environment variable (the folder kgtk expands $TEMP to)\n", "# instead of a machine-specific absolute path, so the cell follows the parameters cell.\n", "base_claims = pd.read_csv(os.path.join(os.environ[\"TEMP\"], \"claims_base.tsv\"), sep=\"\\t\")\n", "base_claims\n" ] }, { "cell_type": "markdown", "metadata": { "id": "dUM-1jc9O42O" }, "source": [ "### Adding single claims of humans, the ones that don't have qualifiers attached\n" ] }, { "cell_type": "markdown", "metadata": { "id": "YKAGDsiXeImD" }, "source": [ "Qualifiers that apply to human nodes saved to `reframing_qualifiers.tsv`, to reduce the dataset size.\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { 
"colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "executionInfo": { "elapsed": 5073, "status": "ok", "timestamp": 1663628939232, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "0V7-T0LXcYMl", "outputId": "1b2add22-88d9-4fad-b882-bb784c088060" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 4.75 s, sys: 1.52 s, total: 6.27 s\n", "Wall time: 16.8 s\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2id
0Q1000118-P1441-Q28146833-e939a5a7-0P175Q28556723Q1000118-P1441-Q28146833-e939a5a7-0-P175-Q2855...
1Q1000118-P345-ce5234-73bca1aa-0P2241Q44374960Q1000118-P345-ce5234-73bca1aa-0-P2241-Q44374960-0
2Q1000118-P6262-169bd9-1227c261-0P1810Peter PettigrewQ1000118-P6262-169bd9-1227c261-0-P1810-6f074a-0
3Q1000118-P6262-169bd9-1227c261-0P407Q809Q1000118-P6262-169bd9-1227c261-0-P407-Q809-0
4Q1000118-P6262-169bd9-1227c261-0P9675286Q1000118-P6262-169bd9-1227c261-0-P9675-ca871a-0
...............
420679Q999960-P10527-Q50920401-c2581644-0P1810Cochet, Jean-LaurentQ999960-P10527-Q50920401-c2581644-0-P1810-5c66...
420680Q999960-P166-Q10855271-52323a81-0P585^2006-01-01T00:00:00Z/9Q999960-P166-Q10855271-52323a81-0-P585-cf2407-0
420681Q999960-P166-Q13452531-0ed37b2c-0P585^2012-01-01T00:00:00Z/9Q999960-P166-Q13452531-0ed37b2c-0-P585-979d4e-0
420682Q999960-P166-Q3405661-d2f16aa9-0P585^1984-01-01T00:00:00Z/9Q999960-P166-Q3405661-d2f16aa9-0-P585-a649f8-0
420683Q999960-P166-Q3405863-ec957bcb-0P585^1975-01-01T00:00:00Z/9Q999960-P166-Q3405863-ec957bcb-0-P585-572da6-0
\n", "

420684 rows × 4 columns

\n", "
" ], "text/plain": [ " node1 label node2 \\\n", "0 Q1000118-P1441-Q28146833-e939a5a7-0 P175 Q28556723 \n", "1 Q1000118-P345-ce5234-73bca1aa-0 P2241 Q44374960 \n", "2 Q1000118-P6262-169bd9-1227c261-0 P1810 Peter Pettigrew \n", "3 Q1000118-P6262-169bd9-1227c261-0 P407 Q809 \n", "4 Q1000118-P6262-169bd9-1227c261-0 P9675 286 \n", "... ... ... ... \n", "420679 Q999960-P10527-Q50920401-c2581644-0 P1810 Cochet, Jean-Laurent \n", "420680 Q999960-P166-Q10855271-52323a81-0 P585 ^2006-01-01T00:00:00Z/9 \n", "420681 Q999960-P166-Q13452531-0ed37b2c-0 P585 ^2012-01-01T00:00:00Z/9 \n", "420682 Q999960-P166-Q3405661-d2f16aa9-0 P585 ^1984-01-01T00:00:00Z/9 \n", "420683 Q999960-P166-Q3405863-ec957bcb-0 P585 ^1975-01-01T00:00:00Z/9 \n", "\n", " id \n", "0 Q1000118-P1441-Q28146833-e939a5a7-0-P175-Q2855... \n", "1 Q1000118-P345-ce5234-73bca1aa-0-P2241-Q44374960-0 \n", "2 Q1000118-P6262-169bd9-1227c261-0-P1810-6f074a-0 \n", "3 Q1000118-P6262-169bd9-1227c261-0-P407-Q809-0 \n", "4 Q1000118-P6262-169bd9-1227c261-0-P9675-ca871a-0 \n", "... ... \n", "420679 Q999960-P10527-Q50920401-c2581644-0-P1810-5c66... 
\n", "420680 Q999960-P166-Q10855271-52323a81-0-P585-cf2407-0 \n", "420681 Q999960-P166-Q13452531-0ed37b2c-0-P585-979d4e-0 \n", "420682 Q999960-P166-Q3405661-d2f16aa9-0-P585-a649f8-0 \n", "420683 Q999960-P166-Q3405863-ec957bcb-0-P585-572da6-0 \n", "\n", "[420684 rows x 4 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "# Pull the qualifier edges whose node1 is the id of a claim in reframingclaimstemp.\n", "kgtk(\"\"\"\n", " query -i reframingclaimstemp -i qualifiers\n", " --match 'reframingclaimstemp: ()-[qs]->(),\n", " qualifiers: (qs)-[qp]->(qo)'\n", " --return 'qs as node1, qp.label as label, qo as node2, qp as id'\n", " -o $TEMP/reframing_qualifiers.tsv\n", "\"\"\")\n", "\n", "kgtk(\"query -i $TEMP/reframing_qualifiers.tsv --as reframingquals\")\n", "\n", "# Read the TSV back through the TEMP environment variable (the folder kgtk expands $TEMP to)\n", "# instead of a machine-specific absolute path, so the cell follows the parameters cell.\n", "reframing_quals = pd.read_csv(os.path.join(os.environ[\"TEMP\"], \"reframing_qualifiers.tsv\"), sep=\"\\t\")\n", "reframing_quals" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"ifexists --input-file reframingclaimstemp \\\n", " --filter-file reframingquals \\\n", " --input-keys 'id' \\\n", " --filter-keys 'node1' \\\n", " -o $TEMP/reframingclaims.tsv \\\n", " --reject-file $TEMP/human_single_claims.tsv\"\"\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2id
0Q100292318P1552Q27834579Q100292318-P1552-Q27834579-87b20a63-0
1Q100292318P1552Q27847754Q100292318-P1552-Q27847754-d80110c0-0
2Q100292318P1651k1UNQFEUsPgQ100292318-P1651-cb1243-138008d8-0
3Q100292318P1657Q18665334Q100292318-P1657-Q18665334-dc007cbc-0
4Q100292318P1981Q20644795Q100292318-P1981-Q20644795-8508e9f8-0
...............
226432Q97365172P725Q837676Q97365172-P725-Q837676-2c91e4c2-0
226433Q97365172P725Q965261Q97365172-P725-Q965261-40d9e1b3-0
226434Q97365172P856https://www.starwars.com/series/star-wars-the-...Q97365172-P856-efdce4-8d456f9b-0
226435Q97365172P8687+12773Q97365172-P8687-1e0b17-c5647aa0-0
226436Q97365172P8687+53434Q97365172-P8687-a1b4dd-667276a6-0
\n", "

226437 rows × 4 columns

\n", "
" ], "text/plain": [ " node1 label node2 \\\n", "0 Q100292318 P1552 Q27834579 \n", "1 Q100292318 P1552 Q27847754 \n", "2 Q100292318 P1651 k1UNQFEUsPg \n", "3 Q100292318 P1657 Q18665334 \n", "4 Q100292318 P1981 Q20644795 \n", "... ... ... ... \n", "226432 Q97365172 P725 Q837676 \n", "226433 Q97365172 P725 Q965261 \n", "226434 Q97365172 P856 https://www.starwars.com/series/star-wars-the-... \n", "226435 Q97365172 P8687 +12773 \n", "226436 Q97365172 P8687 +53434 \n", "\n", " id \n", "0 Q100292318-P1552-Q27834579-87b20a63-0 \n", "1 Q100292318-P1552-Q27847754-d80110c0-0 \n", "2 Q100292318-P1651-cb1243-138008d8-0 \n", "3 Q100292318-P1657-Q18665334-dc007cbc-0 \n", "4 Q100292318-P1981-Q20644795-8508e9f8-0 \n", "... ... \n", "226432 Q97365172-P725-Q837676-2c91e4c2-0 \n", "226433 Q97365172-P725-Q965261-40d9e1b3-0 \n", "226434 Q97365172-P856-efdce4-8d456f9b-0 \n", "226435 Q97365172-P8687-1e0b17-c5647aa0-0 \n", "226436 Q97365172-P8687-a1b4dd-667276a6-0 \n", "\n", "[226437 rows x 4 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"query -i $TEMP/reframingclaims.tsv --as reframingclaims\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "executionInfo": { "elapsed": 59151, "status": "ok", "timestamp": 1663635450821, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "fY5TwatlPB8O", "outputId": "e0b19df4-9b8b-458b-b425-68a01ddfad09" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2
0P10-P1628-32b85d-7927ece6-0P10P1628http://www.w3.org/2006/vcard/ns#Video
1P10-P1628-acf60d-b8950832-0P10P1628https://schema.org/video
2P10-P1629-Q34508-bcc39400-0P10P1629Q34508
3P10-P1630-53947a-fbe9093e-0P10P1630https://commons.wikimedia.org/wiki/File:$1
4P10-P1659-P1651-c4068028-0P10P1659P1651
...............
4031755Q97365172-P8411-Q17480853-32949b92-0Q97365172P8411Q17480853
4031756Q97365172-P8411-Q2775969-7f0bc6e5-0Q97365172P8411Q2775969
4031757Q97365172-P86-Q1740191-38c6c027-0Q97365172P86Q1740191
4031758Q97365172-P9751-1b0487-17916163-0Q97365172P9751umc.cmc.jzpcwzmyd6h9eaadrakph6ta
4031759Q97365172-P9821-85ea86-6c81ab56-0Q97365172P98217125
\n", "

4031760 rows × 4 columns

\n", "
" ], "text/plain": [ " id node1 label \\\n", "0 P10-P1628-32b85d-7927ece6-0 P10 P1628 \n", "1 P10-P1628-acf60d-b8950832-0 P10 P1628 \n", "2 P10-P1629-Q34508-bcc39400-0 P10 P1629 \n", "3 P10-P1630-53947a-fbe9093e-0 P10 P1630 \n", "4 P10-P1659-P1651-c4068028-0 P10 P1659 \n", "... ... ... ... \n", "4031755 Q97365172-P8411-Q17480853-32949b92-0 Q97365172 P8411 \n", "4031756 Q97365172-P8411-Q2775969-7f0bc6e5-0 Q97365172 P8411 \n", "4031757 Q97365172-P86-Q1740191-38c6c027-0 Q97365172 P86 \n", "4031758 Q97365172-P9751-1b0487-17916163-0 Q97365172 P9751 \n", "4031759 Q97365172-P9821-85ea86-6c81ab56-0 Q97365172 P9821 \n", "\n", " node2 \n", "0 http://www.w3.org/2006/vcard/ns#Video \n", "1 https://schema.org/video \n", "2 Q34508 \n", "3 https://commons.wikimedia.org/wiki/File:$1 \n", "4 P1651 \n", "... ... \n", "4031755 Q17480853 \n", "4031756 Q2775969 \n", "4031757 Q1740191 \n", "4031758 umc.cmc.jzpcwzmyd6h9eaadrakph6ta \n", "4031759 7125 \n", "\n", "[4031760 rows x 4 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"remove-columns -i $TEMP/claims_base.tsv \\\n", " --columns 'node2;wikidatatype' \\\n", " / cat -i - \\\n", " -i $TEMP/human_single_claims.tsv \\\n", " -o $TEMP/kbaseclaims.tsv\"\"\") \n", "\n", "kgtk(\"query -i $TEMP/kbaseclaims.tsv --as kbaseclaims\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Adding datatypes to added properties for SR and events\n", "From file `added-property-datatypes.tsv`, which looks like:\n", "\n", "| node1 | label | node2 | id |\n", "|------------|----------|---------------|---------------------|\n", "| Psubject | datatype | wikibase-item | Psubject-datatype |\n", "| Ppredicate | datatype | wikibase-item | Ppredicate-datatype |\n", "| Pobject | datatype | wikibase-item | Pobject-datatype |\n", "| Phas_event | datatype | wikibase-item | Pevent-datatype |\n", "\n", "Add this to the file `metadata.property.datatypes.tsv.gz`. 
To this file, remove unnecessary column `node2;wikidatatype`." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "kgtk(\"remove-columns -i datatypes \\\n", " --columns 'node2;wikidatatype' \\\n", " / cat -i $GRAPH/added-property-datatypes.tsv \\\n", " -i - \\\n", " -o $OUT/metadata.property.datatypes.tsv.gz\") " ] }, { "cell_type": "markdown", "metadata": { "id": "FzThSRGCYeZK" }, "source": [ "## Standard reification" ] }, { "cell_type": "markdown", "metadata": { "id": "vWNTn1Ya146H" }, "source": [ "### SR claims" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Reframing into Standard Reification approach" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "executionInfo": { "elapsed": 15516, "status": "ok", "timestamp": 1663635562517, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "EPnpridrY-jh", "outputId": "a0362aff-d765-470b-af57-12a6f745f6c5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 30.1 s, sys: 8.73 s, total: 38.9 s\n", "Wall time: 2min 19s\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2id
0Q1000115P1343Q6023581694a9be0P31Q3539534Q1000115P1343Q6023581694a9be0-P31-Q3539534
1Q1000115P1343Q6023581694a9be0P805Q24514151Q1000115P1343Q6023581694a9be0-P805-Q24514151
2Q1000115P1343Q6023581694a9be0PobjectQ602358Q1000115P1343Q6023581694a9be0-Pobject-Q602358
3Q1000115P1343Q6023581694a9be0PpredicateP1343Q1000115P1343Q6023581694a9be0-Ppredicate-P1343
4Q1000115P1343Q6023581694a9be0PsubjectQ1000115Q1000115P1343Q6023581694a9be0-Psubject-Q1000115
...............
3991818Q99P86724e9ef1a5c7bbb50P1810CaliforniaQ99P86724e9ef1a5c7bbb50-P1810-965e75
3991819Q99P86724e9ef1a5c7bbb50P31Q3539534Q99P86724e9ef1a5c7bbb50-P31-Q3539534
3991820Q99P86724e9ef1a5c7bbb50Pobject1259995120016912384Q99P86724e9ef1a5c7bbb50-Pobject-4e9ef1
3991821Q99P86724e9ef1a5c7bbb50PpredicateP8672Q99P86724e9ef1a5c7bbb50-Ppredicate-P8672
3991822Q99P86724e9ef1a5c7bbb50PsubjectQ99Q99P86724e9ef1a5c7bbb50-Psubject-Q99
\n", "

3991823 rows × 4 columns

\n", "
" ], "text/plain": [ " node1 label node2 \\\n", "0 Q1000115P1343Q6023581694a9be0 P31 Q3539534 \n", "1 Q1000115P1343Q6023581694a9be0 P805 Q24514151 \n", "2 Q1000115P1343Q6023581694a9be0 Pobject Q602358 \n", "3 Q1000115P1343Q6023581694a9be0 Ppredicate P1343 \n", "4 Q1000115P1343Q6023581694a9be0 Psubject Q1000115 \n", "... ... ... ... \n", "3991818 Q99P86724e9ef1a5c7bbb50 P1810 California \n", "3991819 Q99P86724e9ef1a5c7bbb50 P31 Q3539534 \n", "3991820 Q99P86724e9ef1a5c7bbb50 Pobject 1259995120016912384 \n", "3991821 Q99P86724e9ef1a5c7bbb50 Ppredicate P8672 \n", "3991822 Q99P86724e9ef1a5c7bbb50 Psubject Q99 \n", "\n", " id \n", "0 Q1000115P1343Q6023581694a9be0-P31-Q3539534 \n", "1 Q1000115P1343Q6023581694a9be0-P805-Q24514151 \n", "2 Q1000115P1343Q6023581694a9be0-Pobject-Q602358 \n", "3 Q1000115P1343Q6023581694a9be0-Ppredicate-P1343 \n", "4 Q1000115P1343Q6023581694a9be0-Psubject-Q1000115 \n", "... ... \n", "3991818 Q99P86724e9ef1a5c7bbb50-P1810-965e75 \n", "3991819 Q99P86724e9ef1a5c7bbb50-P31-Q3539534 \n", "3991820 Q99P86724e9ef1a5c7bbb50-Pobject-4e9ef1 \n", "3991821 Q99P86724e9ef1a5c7bbb50-Ppredicate-P8672 \n", "3991822 Q99P86724e9ef1a5c7bbb50-Psubject-Q99 \n", "\n", "[3991823 rows x 4 columns]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "kgtk(\"\"\"\n", " query -i claims -i qualifiers --multi 5\n", " --match 'claims: (subject)-[p]->(value),\n", " qualifiers: (p)-[qual]->(qual_value)'\n", " --return 'replace(p,\"-\",\"\") as node1, printf(\"P31\") as label, printf(\"Q3539534\") as node2,\n", " replace(p,\"-\",\"\") as node1, printf(\"Psubject\") as label, subject as node2,\n", " replace(p,\"-\",\"\") as node1, printf(\"Ppredicate\") as label, p.label as node2,\n", " replace(p,\"-\",\"\") as node1, printf(\"Pobject\") as label, value as node2,\n", " replace(p,\"-\",\"\") as node1, qual.label as label, qual_value as node2'\n", " / deduplicate\n", " / add-id --id-style wikidata -o 
$TEMP/sr_added_claims.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"query -i $TEMP/sr_added_claims.tsv.gz\")" ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
0671897
\n", "
" ], "text/plain": [ " count\n", "0 671897" ] }, "execution_count": 170, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"query -i $TEMP/sr_added_claims.tsv.gz --match '(stm)-[:P31]->(:Q3539534)' --return 'count(distinct stm) as count'\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extracting the claims that must be deleted to avoid duplication of information and appending to the reframed claims" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"query -i claims -i qualifiers \n", " --match 'claims: (subject)-[p]->(value),\n", " qualifiers: (p)-[qual]->(qual_value)'\n", " --return 'subject as node1, p.label as label, value as node2, p as id'\n", " / deduplicate -o $TEMP/d_srclaims.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"ifnotexists --input-file claims \\\n", " --filter-file $TEMP/d_srclaims.tsv.gz \\\n", " --input-keys 'id' \\\n", " --filter-keys 'id' \\\n", " -o $TEMP/sr_baseclaims.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\"cat -i $TEMP/sr_baseclaims.tsv.gz -i $TEMP/sr_added_claims.tsv.gz \n", " / deduplicate -o $OUT/sr_claims.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 9151, "status": "ok", "timestamp": 1663635571662, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "5YaBQHdUu3nZ", "outputId": "217909dd-789b-4671-d76b-d345e55976cc" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tid\tnode2;wikidatatype\r\n", "P10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\tP10-P1628-32b85d-7927ece6-0\turl\r\n", "P10\tP1628\t\"https://schema.org/video\"\tP10-P1628-acf60d-b8950832-0\turl\r\n", "P10\tP1629\tQ34508\tP10-P1629-Q34508-bcc39400-0\twikibase-item\r\n", 
"P10\tP1630\t\"https://commons.wikimedia.org/wiki/File:$1\"\tP10-P1630-53947a-fbe9093e-0\tstring\r\n", "P10\tP1659\tP1651\tP10-P1659-P1651-c4068028-0\twikibase-property\r\n", "P10\tP1659\tP18\tP10-P1659-P18-5e4b9c4f-0\twikibase-property\r\n", "P10\tP1659\tP4238\tP10-P1659-P4238-d21d1ac0-0\twikibase-property\r\n", "P10\tP1659\tP51\tP10-P1659-P51-86aca4c5-0\twikibase-property\r\n", "P10\tP1855\tQ15075950\tP10-P1855-Q15075950-7eff6d65-0\twikibase-item\r\n", "P10\tP1855\tQ4504\tP10-P1855-Q4504-a69d2c73-0\twikibase-item\r\n" ] } ], "source": [ "\n", "!kgtk head -i $OUT/sr_claims.tsv.gz\n" ] }, { "cell_type": "markdown", "metadata": { "id": "MQM_AQnQ2HB2" }, "source": [ "### Browser files: label, alias and description" ] }, { "cell_type": "markdown", "metadata": { "id": "AE-8KwGeEn7N" }, "source": [ "#### Label\n", "Take user-friendly labels instead of ids to form the labels of the statements, add labels from original dataset, and save to `sr_labels.tsv`. Also add the new labels:\n", "\n", "| node1 | label | node2 | id |\n", "|------------|-------|----------------------|--------------------|\n", "| Psubject | label | 'subject'@en | Psubject-label-0 |\n", "| Ppredicate | label | 'predicate'@en | Ppredicate-label-0 |\n", "| Pobject | label | 'object'@en | Pobject-label-0 |\n", "| Phas_event | label | 'has event'@en | Phas_event-label-0 |\n", "| Q3539534 | label | 'semantic triple'@en | Q3539534-label-0 |\n", "| QEvent | label | 'Event'@en | QEvent-label-0 |" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i $OUT/sr_claims.tsv.gz -i label \n", " --match ' (dummy_s)-[:Psubject]->(subject),\n", " (dummy_s)-[:Ppredicate]->(pred),\n", " (dummy_s)-[:Pobject]->(object),\n", " label: (subject)-[:label]->(s_label),\n", " (pred)-[:label]->(p_label),\n", " (object)-[:label]->(o_label)'\n", " --return 'dummy_s as node1, printf(\"label\") as label, \n", " concat(kgtk_stringify(concat(\"Statement for \", 
kgtk_lqstring_text(s_label), \" - \", kgtk_lqstring_text(p_label), \" - \", kgtk_lqstring_text(o_label))), \"@en\") as node2'\n", " / deduplicate\n", " / add-id --id-style wikidata \n", " / cat -i $GRAPH/extra-labels.tsv -i - -o $TEMP/sr_added_labels.en.tsv.gz\n", "\"\"\") " ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "## statements with objects that are not nodes\n", "kgtk(\"\"\"ifnotexists --input-file $TEMP/sr_added_claims.tsv.gz \\\n", " --filter-file $TEMP/sr_added_labels.en.tsv.gz \\\n", " --input-keys 'node1' \\\n", " --filter-keys 'node1' \\\n", " -o $TEMP/sr_nonnode_claims.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "kgtk (\"\"\"query -i $TEMP/sr_nonnode_claims.tsv.gz -i label \\\n", " --match ' (dummy_s)-[:Psubject]->(subject), \\\n", " (dummy_s)-[:Ppredicate]->(pred), \\\n", " (dummy_s)-[:Pobject]->(object), \\\n", " label: (subject)-[:label]->(s_label), \\\n", " (pred)-[:label]->(p_label)' \\\n", " --return 'dummy_s as node1, printf(\"label\") as label, concat(kgtk_stringify(concat(\"Statement for \", kgtk_lqstring_text(s_label), \" - \", kgtk_lqstring_text(p_label), \" - \", kgtk_unstringify(object))), \"@en\") as node2' \\\n", " / deduplicate \\\n", " / add-id --id-style wikidata \n", " / cat -i $TEMP/sr_added_labels.en.tsv.gz -i - -o $TEMP/sr_complete_added_labels.en.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"\n", " cat -i $TEMP/sr_complete_added_labels.en.tsv.gz -i label -o $OUT/sr_labels.en.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "98688 /data02/ana_iglesias/data/subset/reframings/sr_labels.en.tsv.gz\r\n" ] } ], "source": [ "!wc -l $OUT/sr_labels.en.tsv.gz" ] }, { "cell_type": "markdown", "metadata": { "id": "5n5Fd530ErQx" }, 
"source": [ "#### Description\n", "From the labels additions created above, replace `label` for `description`, add descriptions from original dataset and save to `sr_desc.tsv`." ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i $TEMP/sr_complete_added_labels.en.tsv.gz\n", " --match '(s)-[:label]->(o)'\n", " --return 's as node1, \"description\" as label, o as node2'\n", " / deduplicate\n", " / add-id --id-style wikidata -o $TEMP/sr_added_descriptions.en.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"\n", " cat -i $TEMP/sr_added_descriptions.en.tsv.gz -i description -o $OUT/sr_descriptions.en.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": { "id": "OtP1F-_mEv62" }, "source": [ "#### Alias" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i $TEMP/sr_complete_added_labels.en.tsv.gz\n", " --match '(s)-[:label]->(o)'\n", " --return 's as node1, \"alias\" as label, o as node2'\n", " / deduplicate\n", " / add-id --id-style wikidata -o $TEMP/sr_added_aliases.en.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"\n", " cat -i $TEMP/sr_added_aliases.en.tsv.gz -i alias -o $OUT/sr_aliases.en.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Creating clean output files into subfolder" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"remove-columns -i $OUT/sr_aliases.en.tsv.gz \n", " --columns 'node2;wikidatatype' \n", " / deduplicate -o $OUT/sr/aliases.en.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\"remove-columns -i $OUT/sr_claims.tsv.gz \n", " --columns 'node2;wikidatatype' \n", " / deduplicate -o $OUT/sr/claims.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\"remove-columns -i 
$OUT/sr_descriptions.en.tsv.gz \n", " --columns 'node2;wikidatatype' \n", " / deduplicate -o $OUT/sr/descriptions.en.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\"remove-columns -i $OUT/sr_labels.en.tsv.gz \n", " --columns 'node2;wikidatatype' \n", " / deduplicate -o $OUT/sr/labels.en.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "!zcat $OUT/sr/labels.en.tsv.gz | sed -e \"s/\\t\\\"/\\t'/g\" | sed -e \"s/\\\"@en/'@en/g\" > $OUT/sr/labels.en.tsv\n", "!zcat $OUT/sr/aliases.en.tsv.gz | sed -e \"s/\\t\\\"/\\t'/g\" | sed -e \"s/\\\"@en/'@en/g\" > $OUT/sr/aliases.en.tsv\n", "!zcat $OUT/sr/descriptions.en.tsv.gz | sed -e \"s/\\t\\\"/\\t'/g\" | sed -e \"s/\\\"@en/'@en/g\" > $OUT/sr/descriptions.en.tsv" ] }, { "cell_type": "markdown", "metadata": { "id": "Je1J9NU0ZZ09" }, "source": [ "## N-ary relationships: events " ] }, { "cell_type": "markdown", "metadata": { "id": "uFBW_f4zZZ0-" }, "source": [ "### Events claims" ] }, { "cell_type": "markdown", "metadata": { "id": "mJgcuVZ4A438" }, "source": [ "#### Human events " ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 115 ms, sys: 119 ms, total: 234 ms\n", "Wall time: 3min 35s\n" ] } ], "source": [ "%%time\n", "\n", "## Single events (point in time)\n", "\n", "# reframing claims\n", "kgtk(\"\"\"query -i claims -i qualifiers -i label --multi 4\n", " --match 'claims: (human)-[:P31]->(:Q5),\n", " (human)-[p]->(value)'\n", " --opt 'qualifiers: (p)-[qual]->(qual_value)'\n", " --return 'human as node1, printf(\"Phas_event\") as label, concat(replace(p,\"-\",\"\"),\"Event\") as node2,\n", " concat(replace(p,\"-\",\"\"),\"Event\") as node1, qual.label as label, qual_value as node2,\n", " concat(replace(p,\"-\",\"\"),\"Event\") as node1, p.label as label, value as node2,\n", " concat(replace(p,\"-\",\"\"),\"Event\") as node1, printf(\"P31\") as label, 
printf(\"QEvent\") as node2'\n", " / deduplicate\n", " / add-id --id-style wikidata -o $TEMP/singleclaims_nofilter.tsv.gz\n", "\"\"\")\n", "# intermediate file because it was not filtering with --where\n", "kgtk(\"\"\"query -i $TEMP/singleclaims_nofilter.tsv.gz --multi 2\n", " --match '(human)-[:Phas_event]->(event),\n", " (event)-[p]->(pv),\n", " (event)-[pred]->(val)'\n", " --where 'p.label IN [\"P40\", \"P166\", \"P1411\", \"P184\", \"P1344\",\"P27\", \"P69\", \"P551\", \"P463\", \"P26\", \"P106\", \"P39\", \"P108\"]'\n", " --return 'human as node1, printf(\"Phas_event\") as label, event as node2,\n", " event as node1, pred.label as label, val as node2'\n", " / deduplicate\n", " / add-id --id-style wikidata -o $TEMP/singleclaims.tsv.gz\n", "\"\"\")\n", "\n", "# temporary file to allow creation of labels\n", "kgtk(\"\"\"query -i claims -i qualifiers --multi 3 \n", " --match 'claims: (human)-[:P31]->(:Q5),\n", " (human)-[p]->(value)'\n", " --where 'p.label IN [\"P40\", \"P166\", \"P1411\", \"P184\", \"P1344\",\"P27\", \"P69\", \"P551\", \"P463\", \"P26\", \"P106\", \"P39\", \"P108\"]'\n", " --return 'concat(replace(p,\"-\",\"\"),\"Event\") as node1, printf(\"s\") as label, human as node2,\n", " concat(replace(p,\"-\",\"\"),\"Event\") as node1, printf(\"p\") as label, p.label as node2,\n", " concat(replace(p,\"-\",\"\"),\"Event\") as node1, printf(\"o\") as label, value as node2'\n", " -o $TEMP/evlabel_temp.tsv.gz\"\"\")\n", "\n", "# Label creation\n", "kgtk(\"\"\"query -i $TEMP/evlabel_temp.tsv.gz -i label\n", " --match ' (id)-[:s]->(snode),\n", " (id)-[:p]->(pnode),\n", " (id)-[:o]->(onode),\n", " label: (snode)-[:label]->(slabel),\n", " (pnode)-[:label]->(plabel),\n", " (onode)-[:label]->(olabel)'\n", " --return 'id as node1, printf(\"label\") as label, \n", " concat(kgtk_stringify(concat(\"Event of \", kgtk_lqstring_text(slabel), \" - \", kgtk_lqstring_text(plabel), \" - \", kgtk_lqstring_text(olabel))), \"@en\") as node2'\n", " / deduplicate\n", " / add-id --id-style 
wikidata -o $TEMP/singlelabels.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": { "id": "IepbPqu5A-_p" }, "source": [ "#### Birth and death events" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 32298, "status": "ok", "timestamp": 1663636885899, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "CQ5-5pyHBIIf", "outputId": "48506bf7-294d-4165-a7d9-3e3ac23aa8a5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 12.7 ms, sys: 50.9 ms, total: 63.7 ms\n", "Wall time: 11.9 s\n" ] } ], "source": [ "%%time\n", "## BIRTH\n", "\n", "#birth_properties = [\"P19\", \"P3373\", \"P22\", \"P25\", \"P3448\", \"P21\", \"P172\"]\n", "\n", "## For birth date (P569)\n", "kgtk(\"\"\" \n", " query -i claims --multi 4\n", " --match '(human)-[:P31]->(:Q5),\n", " (human)-[p:P569]->(birth_date)'\n", " --opt '(human)-[p_birth]->(birth)'\n", " --where 'p_birth.label IN [\"P19\", \"P3373\", \"P22\", \"P25\", \"P3448\", \"P21\", \"P172\"]'\n", " --return 'human as node1, printf(\"Phas_event\") as label, concat(replace(p,\"-\",\"\"),\"BirthEvent\") as node2,\n", " concat(replace(p,\"-\",\"\"),\"BirthEvent\") as node1, p.label as label, birth_date as node2,\n", " concat(replace(p,\"-\",\"\"),\"BirthEvent\") as node1, p_birth.label as label, birth as node2,\n", " concat(replace(p,\"-\",\"\"),\"BirthEvent\") as node1, printf(\"P31\") as label, printf(\"QEvent\") as node2'\n", " \n", " / deduplicate\n", " / add-id --id-style wikidata -o $TEMP/birthclaims.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\" \n", " query -i claims -i label \n", " --match 'claims: (human)-[:P31]->(:Q5),\n", " (human)-[p:P569]->(birth_date),\n", " label: (human)-[:label]->(hlabel)'\n", " --return 'concat(replace(p,\"-\",\"\"),\"BirthEvent\") as node1, printf(\"label\") as label, \n", " concat(kgtk_stringify(concat(\"Birth 
event of \", kgtk_lqstring_text(hlabel))), \"@en\") as node2'\n", " \n", " / deduplicate\n", " / add-id --id-style wikidata -o $TEMP/birthlabels.tsv.gz\n", "\"\"\")\n" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 18017, "status": "ok", "timestamp": 1663636903907, "user": { "displayName": "Ana Iglesias Molina", "userId": "04261502950676662263" }, "user_tz": 420 }, "id": "9UjIUaodCzZB", "outputId": "23859e88-15df-47d9-f839-055ca5b1fa63" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 10.5 ms, sys: 45.2 ms, total: 55.7 ms\n", "Wall time: 7.06 s\n" ] } ], "source": [ "%%time\n", "\n", "## DEATH\n", "#death_properties = [\"P1196\", \"P20\", \"P509\"]\n", "\n", "## For death date (P570)\n", "kgtk(\"\"\" \n", " query -i claims --multi 4\n", " --match '(human)-[:P31]->(:Q5),\n", " (human)-[p:P570]->(death_date)'\n", " --opt '(human)-[p_death]->(death)'\n", " --where 'p_death.label IN [\"P1196\", \"P20\", \"P509\"]'\n", " --return 'human as node1, printf(\"Phas_event\") as label, concat(replace(p,\"-\",\"\"),\"DeathEvent\") as node2,\n", " concat(replace(p,\"-\",\"\"),\"DeathEvent\") as node1, p.label as label, death_date as node2,\n", " concat(replace(p,\"-\",\"\"),\"DeathEvent\") as node1, p_death.label as label, death as node2,\n", " concat(replace(p,\"-\",\"\"),\"DeathEvent\") as node1, printf(\"P31\") as label, printf(\"QEvent\") as node2' \n", " / deduplicate \n", " / add-id --id-style wikidata -o $TEMP/deathclaims.tsv.gz\n", "\"\"\")\n", "\n", "# death labels\n", "\n", "kgtk(\"\"\" \n", " query -i claims -i label \n", " --match 'claims: (human)-[:P31]->(:Q5),\n", " (human)-[p:P570]->(death_date),\n", " label: (human)-[:label]->(hlabel)'\n", " --return 'concat(replace(p,\"-\",\"\"),\"DeathEvent\") as node1, printf(\"label\") as label, \n", " concat(kgtk_stringify(concat(\"Death event of \", kgtk_lqstring_text(hlabel))), \"@en\") 
as node2'\n", " / deduplicate\n", " / add-id --id-style wikidata -o $TEMP/deathlabels.tsv.gz\n", "\"\"\")\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Awards" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\" \n", " query -i claims -i qualifiers --multi 4\n", " --match 'claims: (award)-[p:P1346]->(winner),\n", " qualifiers: (p)-[ptime:P585]->(year),\n", " (p)-[qual]->(qvalue)'\n", " --where 'award IN [\"Q103618\",\"Q103916\"]'\n", " --return 'award as node1, p.label as label, concat(replace(p,\"-\",\"\"),\"Winner\",kgtk_date_year(year)) as node2,\n", " concat(replace(p,\"-\",\"\"),\"Winner\",kgtk_date_year(year)) as node1, p.label as label, winner as node2,\n", " concat(replace(p,\"-\",\"\"),\"Winner\",kgtk_date_year(year)) as node1, ptime.label as label, year as node2,\n", " concat(replace(p,\"-\",\"\"),\"Winner\",kgtk_date_year(year)) as node1, qual.label as label, qvalue as node2' \n", " / deduplicate \n", " / add-id --id-style wikidata -o $TEMP/awardclaims.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\" \n", " query -i claims -i qualifiers -i label\n", " --match 'claims: (award)-[p:P1346]->(winner),\n", " qualifiers: (p)-[ptime:P585]->(year),\n", " label: (award)-[:label]->(alabel)'\n", " --where 'award IN [\"Q103618\",\"Q103916\"]'\n", " --return 'concat(replace(p,\"-\",\"\"),\"Winner\",kgtk_date_year(year)) as node1, printf(\"label\") as label, \n", " concat(kgtk_stringify(concat(kgtk_lqstring_text(alabel),\" \",kgtk_date_year(year))), \"@en\") as node2'\n", " / deduplicate \n", " / add-id --id-style wikidata -o $TEMP/awardlabels.tsv.gz\n", "\"\"\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Movies and tv series" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\" \n", " query -i claims -i qualifiers --multi 3\n", " --match 'claims: (movie)-[:P31]->(type),\n", " (movie)-[p:P161]->(cast)'\n", " --opt 
'qualifiers: (p)-[qual]->(qvalue)'\n", " --where 'type IN [\"Q229390\", \"Q24869\", \"Q11424\",\"Q5398426\",\"Q526877\"]'\n", " --return 'movie as node1, p.label as label, concat(replace(p,\"-\",\"\"),\"CastMember\",cast) as node2,\n", " concat(replace(p,\"-\",\"\"),\"CastMember\",cast) as node1, p.label as label, cast as node2,\n", " concat(replace(p,\"-\",\"\"),\"CastMember\",cast) as node1, qual.label as label, qvalue as node2' \n", " / deduplicate \n", " / add-id --id-style wikidata -o $TEMP/movieclaims.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\" \n", " query -i claims -i label -i label\n", " --match 'claims: (movie)-[:P31]->(type),\n", " (movie)-[p:P161]->(cast),\n", " label: (cast)-[:label]->(clabel),\n", " (movie)-[:label]->(mlabel)'\n", " --where 'type IN [\"Q229390\", \"Q24869\", \"Q11424\",\"Q5398426\",\"Q526877\"]'\n", " --return 'concat(replace(p,\"-\",\"\"),\"CastMember\",cast) as node1, printf(\"label\") as label, \n", " concat(kgtk_stringify(concat(\"Cast member \", kgtk_lqstring_text(clabel), \" of \", kgtk_lqstring_text(mlabel))), \"@en\") as node2'\n", " / deduplicate \n", " / add-id --id-style wikidata -o $TEMP/movielabels.tsv.gz\n", "\"\"\")\n" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "## animation movies/tv series with 'voice actor' instead of 'cast member'\n", "kgtk(\"\"\" \n", " query -i claims -i reframingquals --multi 3\n", " --match 'claims: (movie)-[:P31]->(type),\n", " (movie)-[p:P725]->(cast)'\n", " --opt 'reframingquals: (p)-[qual]->(qvalue)'\n", " --where 'type IN [\"Q581714\",\"Q11425\",\"Q29168811\"]'\n", " --return 'movie as node1, p.label as label, concat(replace(p,\"-\",\"\"),\"VoiceActor\",cast) as node2,\n", " concat(replace(p,\"-\",\"\"),\"VoiceActor\",cast) as node1, p.label as label, cast as node2,\n", " concat(replace(p,\"-\",\"\"),\"VoiceActor\",cast) as node1, qual.label as label, qvalue as node2' \n", " / deduplicate \n", " / add-id --id-style wikidata -o 
$TEMP/animationclaims.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\" \n", " query -i claims -i label -i label\n", " --match 'claims: (movie)-[:P31]->(type),\n", " (movie)-[p:P725]->(cast),\n", " label: (cast)-[:label]->(clabel),\n", " (movie)-[:label]->(mlabel)'\n", " --where 'type IN [\"Q581714\",\"Q11425\",\"Q29168811\"]'\n", " --return 'concat(replace(p,\"-\",\"\"),\"VoiceActor\",cast) as node1, printf(\"label\") as label, \n", " concat(kgtk_stringify(concat(\"Voice actor \", kgtk_lqstring_text(clabel), \" of \", kgtk_lqstring_text(mlabel))), \"@en\") as node2'\n", " / deduplicate \n", " / add-id --id-style wikidata -o $TEMP/animationlabels.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Characters" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\" \n", " query -i claims -i qualifiers --multi 3\n", " --match 'claims: (char)-[:P31]->(type),\n", " (char)-[p:P175]->(cast)'\n", " --opt 'qualifiers: (p)-[qual]->(qvalue)'\n", " --where 'type IN [\"Q15773347\",\"Q15773317\"]'\n", " --return 'char as node1, p.label as label, concat(replace(p,\"-\",\"\"),\"CharacterPerformer\",cast) as node2,\n", " concat(replace(p,\"-\",\"\"),\"CharacterPerformer\",cast) as node1, p.label as label, cast as node2,\n", " concat(replace(p,\"-\",\"\"),\"CharacterPerformer\",cast) as node1, qual.label as label, qvalue as node2' \n", " / deduplicate \n", " / add-id --id-style wikidata -o $TEMP/charclaims.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\" \n", " query -i claims -i label \n", " --match 'claims: (char)-[:P31]->(type),\n", " (char)-[p:P175]->(cast),\n", " label: (cast)-[:label]->(clabel),\n", " (char)-[:label]->(mlabel)'\n", " --where 'type IN [\"Q15773347\",\"Q15773317\"]'\n", " --return 'concat(replace(p,\"-\",\"\"),\"CharacterPerformer\",cast) as node1, printf(\"label\") as label, \n", " concat(kgtk_stringify(concat(kgtk_lqstring_text(mlabel), \" Character performer \", 
kgtk_lqstring_text(clabel))), \"@en\") as node2'\n", " / deduplicate \n", " / add-id --id-style wikidata -o $TEMP/charlabels.tsv.gz\n", "\"\"\")\n" ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tid\r\n", "Q1000118P175Q28556723c6ece1a70CharacterPerformerQ28556723\tlabel\t\"Peter Pettigrew Character performer Zachary David\"@en\tQ1000118P175Q28556723c6ece1a70CharacterPerformerQ28556723-label-113a30\r\n", "Q1000118P175Q287824bce710040CharacterPerformerQ287824\tlabel\t\"Peter Pettigrew Character performer Timothy Spall\"@en\tQ1000118P175Q287824bce710040CharacterPerformerQ287824-label-aebb06\r\n", "Q101069011P175Q232477854517aa0CharacterPerformerQ232477\tlabel\t\"Countess Helena Andrenyi Character performer Jacqueline Bisset\"@en\tQ101069011P175Q232477854517aa0CharacterPerformerQ232477-label-5ebb6b\r\n", "Q101069011P175Q26011378cab6a710CharacterPerformerQ2601137\tlabel\t\"Countess Helena Andrenyi Character performer Elena Satine\"@en\tQ101069011P175Q26011378cab6a710CharacterPerformerQ2601137-label-077817\r\n", "Q101069011P175Q902478140ebef9d0CharacterPerformerQ9024781\tlabel\t\"Countess Helena Andrenyi Character performer Lucy Boynton\"@en\tQ101069011P175Q902478140ebef9d0CharacterPerformerQ9024781-label-2e9c85\r\n", "Q101069438P175Q39666dab89e370CharacterPerformerQ39666\tlabel\t\"Pilar Estravados Character performer Penélope Cruz\"@en\tQ101069438P175Q39666dab89e370CharacterPerformerQ39666-label-c7b369\r\n", "Q101069445P175Q2306267c005b240CharacterPerformerQ230626\tlabel\t\"Princess Natalia Dragomiroff Character performer Wendy Hiller\"@en\tQ101069445P175Q2306267c005b240CharacterPerformerQ230626-label-3e804e\r\n", "Q101069445P175Q2698350baae7b70CharacterPerformerQ269835\tlabel\t\"Princess Natalia Dragomiroff Character performer Eileen Atkins\"@en\tQ101069445P175Q2698350baae7b70CharacterPerformerQ269835-label-3f699e\r\n", 
"Q101069445P175Q280545b3a36ee0CharacterPerformerQ28054\tlabel\t\"Princess Natalia Dragomiroff Character performer Judi Dench\"@en\tQ101069445P175Q280545b3a36ee0CharacterPerformerQ28054-label-0ec500\r\n", "Q101071356P175Q159778402e85100CharacterPerformerQ159778\tlabel\t\"Caroline Hubbard Character performer Michelle Pfeiffer\"@en\tQ101071356P175Q159778402e85100CharacterPerformerQ159778-label-206b7e\r\n" ] } ], "source": [ "!kgtk head -i $TEMP/charlabels.tsv.gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Creating claims file" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\" cat -i $TEMP/singleclaims.tsv.gz \n", " -i $TEMP/birthclaims.tsv.gz \n", " -i $TEMP/deathclaims.tsv.gz \n", " -i $TEMP/awardclaims.tsv.gz \n", " -i $TEMP/movieclaims.tsv.gz \n", " -i $TEMP/animationclaims.tsv.gz \n", " -i $TEMP/charclaims.tsv.gz \n", " -o $TEMP/event_added_claims.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"query -i claims \n", " --match '(human)-[:P31]->(:Q5),\n", " (human)-[p]->(value)'\n", " --where 'p.label IN [\"P40\", \"P166\", \"P1411\", \"P184\", \"P1344\",\"P27\", \"P69\", \"P551\", \"P463\", \"P26\", \"P106\", \"P39\", \"P108\",\"P19\", \"P3373\", \"P22\", \"P25\", \"P3448\", \"P21\", \"P172\",\"P1196\", \"P20\", \"P509\", \"P569\",\"P570\"]'\n", " --return 'human as node1, p.label as label, value as node2, p as id'\n", " / deduplicate -o $TEMP/d_humanclaims.tsv.gz\"\"\")\n", "\n", "kgtk(\"\"\"query -i claims \n", " --match '(award)-[p:P1346]->(value)'\n", " --where 'award IN [\"Q103618\",\"Q103916\"]'\n", " --return 'award as node1, p.label as label, value as node2, p as id'\n", " / deduplicate -o $TEMP/d_awardclaims.tsv.gz\"\"\")\n", "\n", "kgtk(\"\"\"query -i claims \n", " --match '(movie)-[:P31]->(type),\n", " (movie)-[p:P161]->(value)'\n", " --where 'type IN [\"Q229390\", \"Q24869\", 
\"Q11424\",\"Q5398426\",\"Q526877\"]'\n", " --return 'movie as node1, p.label as label, value as node2, p as id'\n", " / deduplicate -o $TEMP/d_movieclaims.tsv.gz\"\"\")\n", "\n", "kgtk(\"\"\"query -i claims \n", " --match '(movie)-[:P31]->(type),\n", " (movie)-[p:P725]->(value)'\n", " --where 'type IN [\"Q581714\",\"Q11425\",\"Q29168811\"]'\n", " --return 'movie as node1, p.label as label, value as node2, p as id'\n", " / deduplicate -o $TEMP/d_animationclaims.tsv.gz\"\"\")\n", "\n", "kgtk(\"\"\"query -i claims \n", " --match '(char)-[:P31]->(type),\n", " (char)-[p:P175]->(value)'\n", " --where 'type IN [\"Q15773347\",\"Q15773317\"]'\n", " --return 'char as node1, p.label as label, value as node2, p as id'\n", " / deduplicate -o $TEMP/d_charclaims.tsv.gz\"\"\")\n" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\" cat -i $TEMP/d_humanclaims.tsv.gz\n", " -i $TEMP/d_awardclaims.tsv.gz\n", " -i $TEMP/d_movieclaims.tsv.gz\n", " -i $TEMP/d_animationclaims.tsv.gz\n", " -i $TEMP/d_charclaims.tsv.gz\n", " -o $TEMP/d_ev_claims.tsv.gz\n", "\"\"\")\n" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"ifnotexists --input-file claims \\\n", " --filter-file $TEMP/d_ev_claims.tsv.gz \\\n", " --input-keys 'id' \\\n", " --filter-keys 'id' \\\n", " -o $TEMP/ev_baseclaims.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\"cat -i $TEMP/ev_baseclaims.tsv.gz -i $TEMP/event_added_claims.tsv.gz\n", " / deduplicate -o $OUT/ev_claims.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": { "id": "-4nyCAqXZZ0-" }, "source": [ "### Browser files: label, alias and description" ] }, { "cell_type": "markdown", "metadata": { "id": "Kxy0Fls_ZZ0-" }, "source": [ "#### Label\n", "Take user-friendly labels instead of ids to form the labels of the events, add labels from original dataset, and save to `ev_labels.en.tsv.gz`." 
] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\" cat -i $GRAPH/extra-labels.tsv\n", " -i $TEMP/singlelabels.tsv.gz \n", " -i $TEMP/birthlabels.tsv.gz \n", " -i $TEMP/deathlabels.tsv.gz \n", " -i $TEMP/awardlabels.tsv.gz \n", " -i $TEMP/movielabels.tsv.gz \n", " -i $TEMP/animationlabels.tsv.gz \n", " -i $TEMP/charlabels.tsv.gz \n", " -o $TEMP/event_added_labels.tsv.gz\n", "\"\"\") " ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"\n", " cat -i $TEMP/event_added_labels.tsv.gz -i label -o $OUT/ev_labels.en.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": { "id": "FlU59Q6gZZ0-" }, "source": [ "#### Description\n", "From the label additions created above, replace `label` with `description`, add the descriptions from the original dataset, and save to `ev_descriptions.en.tsv.gz`." ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i $TEMP/event_added_labels.tsv.gz\n", " --match '(s)-[:label]->(o)'\n", " --return 's as node1, \"description\" as label, o as node2'\n", " / deduplicate\n", " / add-id --id-style wikidata -o $TEMP/ev_added_descriptions.en.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\"\n", " cat -i $TEMP/ev_added_descriptions.en.tsv.gz -i description -o $OUT/ev_descriptions.en.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", "metadata": { "id": "1Kl_FyuoZZ0_" }, "source": [ "#### Alias" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"\n", " query -i $TEMP/event_added_labels.tsv.gz\n", " --match '(s)-[:label]->(o)'\n", " --return 's as node1, \"alias\" as label, o as node2'\n", " / deduplicate\n", " / add-id --id-style wikidata -o $TEMP/ev_added_aliases.en.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\"\n", " cat -i $TEMP/ev_added_aliases.en.tsv.gz -i alias -o $OUT/ev_aliases.en.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "### Creating clean output files into subfolder" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"remove-columns -i $OUT/ev_aliases.en.tsv.gz \n", " --columns 'node2;wikidatatype' \n", " / deduplicate -o $OUT/events/aliases.en.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\"remove-columns -i $OUT/ev_claims.tsv.gz \n", " --columns 'node2;wikidatatype' \n", " / deduplicate -o $OUT/events/claims.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\"remove-columns -i $OUT/ev_descriptions.en.tsv.gz \n", " --columns 'node2;wikidatatype' \n", " / deduplicate -o $OUT/events/descriptions.en.tsv.gz\n", "\"\"\")\n", "\n", "kgtk(\"\"\"remove-columns -i $OUT/ev_labels.en.tsv.gz \n", " --columns 'node2;wikidatatype' \n", " / deduplicate -o $OUT/events/labels.en.tsv.gz\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "!zcat $OUT/events/labels.en.tsv.gz | sed -e \"s/\\t\\\"/\\t'/g\" | sed -e \"s/\\\"@en/'@en/g\" > $OUT/events/labels.en.tsv\n", "!zcat $OUT/events/aliases.en.tsv.gz | sed -e \"s/\\t\\\"/\\t'/g\" | sed -e \"s/\\\"@en/'@en/g\" > $OUT/events/aliases.en.tsv\n", "!zcat $OUT/events/descriptions.en.tsv.gz | sed -e \"s/\\t\\\"/\\t'/g\" | sed -e \"s/\\\"@en/'@en/g\" > $OUT/events/descriptions.en.tsv" ] } ], "metadata": { "colab": { "authorship_tag": "ABX9TyO9uXmnAzFx3Jatr8duushN", "collapsed_sections": [ "uaMp3Z644ZMS", "FzThSRGCYeZK", "tO4k2o1UFM8O" ], "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 1 }