{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import io\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", "from kgtk.functions import kgtk, kypher" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "# Parameters\n", "\n", "input_path = \"/data/amandeep/wikidata-20211027-dwd-v3\"\n", "output_path = \"/data/amandeep/wikidata-20211027-dwd-v3\"\n", "\n", "project_name = \"geography-files\"\n", "\n", "files = 'label,item,monolingualtext,external_id,p279star,p31'\n", "debug=False" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Turn the comma-separated `files` parameter into a list of file keys.\n", "# Guarded so that re-running this cell does not fail on the already-split list;\n", "# surrounding blanks are dropped so 'label, item' works as well as 'label,item'.\n", "files = [name.strip() for name in files.split(',')] if isinstance(files, str) else list(files)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User home: /nas/home/amandeep\n", "Current dir: /data/amandeep/github/kgtk-notebooks/use-cases\n", "KGTK dir: /data/amandeep/github/kgtk-notebooks\n", "Use-cases dir: /data/amandeep/github/kgtk-notebooks/use-cases\n" ] } ], "source": [ "ck = ConfigureKGTK(files)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", " output_path=output_path,\n", " project_name=project_name)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kypher: kgtk query --graph-cache /data/amandeep/wikidata-20211027-dwd-v3/geography-files/temp.geography-files/wikidata.sqlite3.db\n", "KGTK_LABEL_FILE: /data/amandeep/wikidata-20211027-dwd-v3/labels.en.tsv.gz\n", "GRAPH: /data/amandeep/wikidata-20211027-dwd-v3\n", "USE_CASES_DIR: /data/amandeep/github/kgtk-notebooks/use-cases\n", "TEMP: /data/amandeep/wikidata-20211027-dwd-v3/geography-files/temp.geography-files\n", "EXAMPLES_DIR: /data/amandeep/github/kgtk-notebooks/examples\n", "kgtk: kgtk\n", "STORE: 
/data/amandeep/wikidata-20211027-dwd-v3/geography-files/temp.geography-files/wikidata.sqlite3.db\n", "OUT: /data/amandeep/wikidata-20211027-dwd-v3/geography-files\n", "KGTK_OPTION_DEBUG: false\n", "KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20211027-dwd-v3/geography-files/temp.geography-files/wikidata.sqlite3.db\n", "label: /data/amandeep/wikidata-20211027-dwd-v3/labels.en.tsv.gz\n", "item: /data/amandeep/wikidata-20211027-dwd-v3/claims.wikibase-item.tsv.gz\n", "monolingualtext: /data/amandeep/wikidata-20211027-dwd-v3/claims.monolingualtext.tsv.gz\n", "external_id: /data/amandeep/wikidata-20211027-dwd-v3/claims.external-id.tsv.gz\n", "p279star: /data/amandeep/wikidata-20211027-dwd-v3/derived.P279star.tsv.gz\n", "p31: /data/amandeep/wikidata-20211027-dwd-v3/derived.P31.tsv.gz\n" ] } ], "source": [ "ck.print_env_variables()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kgtk query --graph-cache /data/amandeep/wikidata-20211027-dwd-v3/geography-files/temp.geography-files/wikidata.sqlite3.db -i \"/data/amandeep/wikidata-20211027-dwd-v3/labels.en.tsv.gz\" --as label -i \"/data/amandeep/wikidata-20211027-dwd-v3/claims.wikibase-item.tsv.gz\" --as item -i \"/data/amandeep/wikidata-20211027-dwd-v3/claims.monolingualtext.tsv.gz\" --as monolingualtext -i \"/data/amandeep/wikidata-20211027-dwd-v3/claims.external-id.tsv.gz\" --as external_id -i \"/data/amandeep/wikidata-20211027-dwd-v3/derived.P279star.tsv.gz\" --as p279star -i \"/data/amandeep/wikidata-20211027-dwd-v3/derived.P31.tsv.gz\" --as p31 --limit 3\n", "id\tnode1\tlabel\tnode2\tlang\trank\tnode2;wikidatatype\n", "P10-label-en\tP10\tlabel\t'video'@en\ten\t\t\n", "P1000-label-en\tP1000\tlabel\t'record held'@en\ten\t\t\n", "P10000-label-en\tP10000\tlabel\t'Research Vocabularies Australia ID'@en\ten\t\t\n" ] } ], "source": [ "ck.load_files_into_cache()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Build Files To 
Reason About Geography" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `P131star`: map each node1 in `P131` to all of its admins up the `P131` chain\n", "\n", "Removes historical admins, including historical countries.\n", "\n", "| node1 | label | node2 | id |\n", "| -- | -- | -- | -- |\n", "| Q1000398 | P131star | Q12694 | Q1000398-P131star-Q12694 |\n", "| Q1001008 | P131star | Q12589 | Q1001008-P131star-Q12589 |\n", "| Q1001499 | P131star | Q214 | Q1001499-P131star-Q214 |\n", "| Q1001995 | P131star | Q1001995 | Q1001995-P131star-Q1001995 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract the graph of `P131` so that we give a smaller graph to `reachable-nodes`, and remove historical admins.\n", "\n", "**TODO:** the query below copies every `P131` edge and applies no filter for historical admins; confirm where that removal is supposed to happen." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"--debug query --gc $STORE -i item\n", " --match '(n1)-[l:P131]->(n2)' \n", " --return 'distinct n1 as node1, l.label as label, n2 as node2'\n", " --order-by 'n1'\n", " / add-id --id-style wikidata\n", " -o \"$TEMP\"/P131.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Index `P131` in kypher under the alias `p131`" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tid\n", "P2618\tP131\tQ3206\tP2618-P131-Q3206\n", "P2621\tP131\tQ21\tP2621-P131-Q21\n" ] } ], "source": [ "kypher(\"\"\" -i \"$TEMP\"/P131.tsv.gz --as p131 --limit 2\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get every node1 of the `P131` relation; these are the roots used to compute `P131star`" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "kypher(\"\"\" -i p131\n", " --match 'p131: (n1)-[]->(n2)'\n", " --return 'distinct n1 as id' \n", " -o \"$TEMP\"/p131.node1.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compute `P131star`, which maps every node1 in P131 to all the admin locations 
that can be reached from it." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"reachable-nodes\n", " --rootfile \"$TEMP\"/p131.node1.tsv.gz\n", " --rootfilecolumn id\n", " --label \"P131star\"\n", " --selflink\n", " -i \"$TEMP\"/P131.tsv.gz\n", " / add-id --id-style wikidata\n", " / sort\n", " -o $OUT/derived.P131star.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Index `p131star` in kypher" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tid\n", "P2618\tP131star\tP2618\tP2618-P131star-P2618\n", "P2618\tP131star\tQ3206\tP2618-P131star-Q3206\n" ] } ], "source": [ "kypher(\"\"\" -i $OUT/derived.P131star.tsv.gz --as p131star --limit 2\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Test: get all the admins of Pasadena: Q485176" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "# Debug-only sanity check: list the admins of Pasadena (Q485176) together with their classes.\n", "# Calls the imported kypher() helper, as the other cells do, rather than `!$kypher`:\n", "# IPython expands `$kypher` to the Python object of that name before the shell sees it.\n", "# The graph handles are `label:` so that they match the `-i label` input declared here.\n", "if debug:\n", "    pasadena_admins = kypher(\"\"\" -i p131star -i label -i p31\n", "        --match '\n", "            p131star: (:Q485176)-[]->(admin),\n", "            p31: (admin)-[]->(admin_class)'\n", "        --opt 'label: (admin)-[]->(admin_label)'\n", "        --opt 'label: (admin_class)-[]->(admin_class_label)'\n", "        --return 'distinct admin as admin, admin_label as admin_label, admin_class as admin_class, admin_class_label as admin_class_label'\n", "        --order-by 'admin_class_label'\"\"\")\n", "    # kypher() may hand the result back instead of printing it; show it in that case.\n", "    if pasadena_admins is not None:\n", "        display(pasadena_admins)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `derived.P131admin2` map each human settlement to its admin2 `Q13220204`\n", "\n", "**TODO:** the sample rows below are identical to the example table of the `derived.P131admin1` section and still carry the `P131admin1` label; regenerate them from `derived.P131admin2.tsv.gz`.\n", "\n", "| node1 | label | node2 | id |\n", "| -- | -- | -- | -- |\n", "| Q100252368 | P131admin1 | Q1588 | Q100252368-P131admin1-Q1588 |\n", "| Q1005394 | P131admin1 | Q54171 | Q1005394-P131admin1-Q54171 |\n", "| Q100923 | P131admin1 | Q1263 | Q100923-P131admin1-Q1263 |\n", "| Q101111580 | P131admin1 | Q34800 | 
Q101111580-P131admin1-Q34800 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make a list of all human settlements" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "kypher(\"\"\" -i p31 -i p279star\n", " --match '\n", " p31: (human_settlement)-[]->(class),\n", " p279star: (class)-[]->(:Q486972)'\n", " --return 'distinct human_settlement as node1'\n", " -o \"$TEMP\"/human_settlement.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Index human_settlement" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\n", "Q104214562\n", "Q105923404\n" ] } ], "source": [ "kypher(\"\"\" -i \"$TEMP\"/human_settlement.tsv.gz --as settlement --limit 2\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The admin2, admin1 and country files are built by the same query; only the edge label, the target class and the output file differ, so the query lives in one helper." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def build_p131_class_edges(edge_label, class_qnode, out_file):\n", "    \"\"\"Write `$OUT/<out_file>`: one `edge_label` edge from each node1 of `p131star`\n", "    to every admin whose `P31` class reaches `class_qnode` through `p279star`.\n", "\n", "    edge_label  -- value written to the `label` column, e.g. \"P131admin2\"\n", "    class_qnode -- Q-node of the target class, e.g. \"Q13220204\"\n", "    out_file    -- file name created under `$OUT`\n", "    Returns whatever `kgtk()` returns for the command.\n", "    \"\"\"\n", "    return kgtk(f\"\"\"--debug query --gc $STORE -i p31 -i p131star -i p279star\n", "        --match '\n", "            p131star: (x)-[]->(admin),\n", "            p31: (admin)-[]->(admin_class),\n", "            p279star: (admin_class)-[]->(:{class_qnode})'\n", "        --return 'distinct x as node1, \"{edge_label}\" as label, admin as node2'\n", "        / add-id --id-style wikidata\n", "        / sort\n", "        -o \"$OUT\"/{out_file}\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pick out all the admins whose `P31` class reaches admin2 `Q13220204` through `p279star`" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "build_p131_class_edges(\"P131admin2\", \"Q13220204\", \"derived.P131admin2.tsv.gz\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `derived.P131admin1` map each human settlement to its admin1 `Q10864048`\n", "\n", "| node1 | label | node2 | id |\n", "| -- | -- | -- | -- |\n", "| Q100252368 | P131admin1 | Q1588 | Q100252368-P131admin1-Q1588 |\n", "| Q1005394 | P131admin1 | Q54171 | Q1005394-P131admin1-Q54171 |\n", "| Q100923 | P131admin1 | Q1263 | Q100923-P131admin1-Q1263 |\n", "| Q101111580 | P131admin1 | Q34800 | Q101111580-P131admin1-Q34800 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pick out the admins whose `P31` class 
reaches admin1 `Q10864048` through `p279star`" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "build_p131_class_edges(\"P131admin1\", \"Q10864048\", \"derived.P131admin1.tsv.gz\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `derived.P131country` map each settlement to its country\n", "Removes historical country\n", "\n", "| node1 | label | node2 | id |\n", "| -- | -- | -- | -- |\n", "| Q1003172 | P131country | Q45 | Q1003172-P131country-Q45 |\n", "| Q100701578 | P131country | Q678 | Q100701578-P131country-Q678 |\n", "| Q1010068 | P131country | Q219 | Q1010068-P131country-Q219 |\n", "| Q101218885 | P131country | Q822 | Q101218885-P131country-Q822 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pick out all the admins whose `P31` class reaches country `Q6256` through `p279star`" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "build_p131_class_edges(\"P131country\", \"Q6256\", \"derived.P131country.tsv.gz\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `derived.P131.admin`" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/data/amandeep/wikidata-20211027-dwd-v3/geography-files/derived.P131admin1.tsv.gz\n", "/data/amandeep/wikidata-20211027-dwd-v3/geography-files/derived.P131admin2.tsv.gz\n", 
"/data/amandeep/wikidata-20211027-dwd-v3/geography-files/derived.P131country.tsv.gz\n", "/data/amandeep/wikidata-20211027-dwd-v3/geography-files/derived.P131star.tsv.gz\n" ] } ], "source": [ "!ls \"$OUT\"/derived.P131*.tsv.gz" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"cat\n", " -i \"$OUT\"/derived.P131admin1.tsv.gz\n", " -i \"$OUT\"/derived.P131admin2.tsv.gz\n", " -i \"$OUT\"/derived.P131country.tsv.gz\n", " -o \"$OUT\"/derived.P131.admin.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Index `derived.P131.admin`" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tid\n", "P2618\tP131admin1\tQ3206\tP2618-P131admin1-Q3206\n", "P2621\tP131admin1\tQ21\tP2621-P131admin1-Q21\n" ] } ], "source": [ "kypher(\"\"\" -i \"$OUT\"/derived.P131.admin.tsv.gz --as p131admin --limit 2\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Test" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Something to worry about: some countries have end times:" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-09-19 22:10:14 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT graph_1_c1.\"node1\", graph_4_c4.\"node2\", graph_4_c5.\"node2\", graph_7_c3.\"node2\"\n", " FROM graph_1 AS graph_1_c1\n", " INNER JOIN graph_1 AS graph_1_c2, graph_4 AS graph_4_c4, graph_4 AS graph_4_c5, graph_7 AS graph_7_c3\n", " ON graph_1_c1.\"node1\" = graph_1_c2.\"node1\"\n", " AND graph_1_c1.\"node1\" = graph_4_c4.\"node1\"\n", " AND graph_1_c2.\"id\" = graph_7_c3.\"node1\"\n", " AND graph_1_c2.\"node2\" = graph_4_c5.\"node1\"\n", " AND graph_1_c1.\"node2\" = ?\n", " AND graph_7_c3.\"label\" = ?\n", " LIMIT ?\n", " PARAS: ['Q6256', 'P582', 20]\n", 
"---------------------------------------------\n", "node1\tnode2\tnode2\tnode2\n", "Q1000\t'Gabon'@en\t'French colonial empire'@en\t^1960-01-01T00:00:00Z/9\n", "Q1029\t'Mozambique'@en\t'overseas province of Portugal'@en\t^1975-06-24T00:00:00Z/11\n", "Q1146786\t'Señorío of Cuzcatlán'@en\t'country'@en\t^1528-01-01T00:00:00Z/9\n", "Q1155700\t'Rattanakosin Kingdom'@en\t'country'@en\t^1932-06-24T00:00:00Z/11\n", "Q12548\t'Holy Roman Empire'@en\t'sovereign state'@en\t^1806-01-01T00:00:00Z/9\n", "Q139377\t'Kingdom of Aksum'@en\t'realm'@en\t^0960-01-01T00:00:00Z/9\n", "Q139377\t'Kingdom of Aksum'@en\t'country'@en\t^0960-01-01T00:00:00Z/9\n", "Q142\t'France'@en\t'colonial power'@en\t^1960-01-01T00:00:00Z/9\n", "Q16\t'Canada'@en\t'dominion of the British Empire'@en\t^1931-01-01T00:00:00Z/9\n", "Q172640\t'North Vietnam'@en\t'self-proclaimed state'@en\t^1954-01-01T00:00:00Z/9\n", "Q172640\t'North Vietnam'@en\t'sovereign state'@en\t^1976-01-01T00:00:00Z/9\n", "Q180573\t'South Vietnam'@en\t'sovereign state'@en\t^1975-01-01T00:00:00Z/9\n", "Q189988\t'Central African Empire'@en\t'sovereign state'@en\t^1979-09-20T00:00:00Z/11\n", "Q213353\t'Cisalpine Republic'@en\t'country'@en\t^1802-01-01T00:00:00Z/9\n", "Q2741089\t'Greater Republic of Central America'@en\t'country'@en\t^1898-08-27T00:00:00Z/11\n", "Q30\t'United States of America'@en\t'historical unrecognized state'@en\t^1784-05-12T00:00:00Z/11\n", "Q31\t'Belgium'@en\t'colonial power'@en\t^1960-01-01T00:00:00Z/9\n", "Q3167772\t'Kingdom of Polonnaruwa'@en\t'country'@en\t^1310-01-01T00:00:00Z/9\n", "Q31747\t'Irish Free State'@en\t'country'@en\t^1937-12-29T00:00:00Z/11\n", "Q39\t'Switzerland'@en\t'confederation'@en\t^1815-08-07T00:00:00Z/11\n", " 0.96 real 0.77 user 0.15 sys\n" ] } ], "source": [ "!$kypher -i p31 -i labels -i qualifiers \\\n", "--match ' \\\n", " p31: (country)-[]->(:Q6256), \\\n", " p31: (country)-[r]->(class), \\\n", " qualifiers: (r)-[:P582]->(endtime), \\\n", " labels: (country)-[]->(country_label), \\\n", " 
labels: (class)-[]->(class_label)' \\\n", "--return 'country, country_label, class_label, endtime' \\\n", "--limit 20" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Build `settlement-to-country`: a `P17` edge from each settlement to its country (an instance of `Q6256`), keeping only settlements whose class is not `Q486972` itself but reaches it through `p279star`" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"--debug query --gc $STORE -i p31 -i item -i p279star\n", " --match '\n", " p31: (country)-[:P31]->(:Q6256),\n", " item: (settlement)-[:P17]->(country),\n", " p31: (settlement)-[:P31]->(a_settlement_subclass),\n", " p279star: (a_settlement_subclass)-[]->(:Q486972)'\n", " --where 'a_settlement_subclass != \"Q486972\"'\n", " --return 'settlement as node1, \"P17\" as label, country as node2' \n", " / compact -i - --deduplicate --build-id --id-style wikidata\n", " -o \"$TEMP\"/settlement-to-country.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | node1 | \n", "label | \n", "node2 | \n", "id | \n", "
---|---|---|---|---|
0 | \n", "Q100 | \n", "P17 | \n", "Q30 | \n", "Q100-P17-Q30 | \n", "
1 | \n", "Q100000 | \n", "P17 | \n", "Q55 | \n", "Q100000-P17-Q55 | \n", "
2 | \n", "Q1000003 | \n", "P17 | \n", "Q142 | \n", "Q1000003-P17-Q142 | \n", "