{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import io\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", "from kgtk.functions import kgtk, kypher" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "# Parameters\n", "\n", "input_path = \"/data/amandeep/wikidata-20211027-dwd-v3\"\n", "output_path = \"/data/amandeep/wikidata-20211027-dwd-v3\"\n", "\n", "project_name = \"geography-files\"\n", "\n", "files = 'label,item,monolingualtext,external_id,p279star,p31'\n", "debug=False" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Turn the comma-separated string from the parameters cell into a list.\n", "# Guarded so that re-running this cell, once `files` is already a list,\n", "# does not raise AttributeError.\n", "if isinstance(files, str):\n", "    files = files.split(',')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User home: /nas/home/amandeep\n", "Current dir: /data/amandeep/github/kgtk-notebooks/use-cases\n", "KGTK dir: /data/amandeep/github/kgtk-notebooks\n", "Use-cases dir: /data/amandeep/github/kgtk-notebooks/use-cases\n" ] } ], "source": [ "ck = ConfigureKGTK(files)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", " output_path=output_path,\n", " project_name=project_name)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kypher: kgtk query --graph-cache /data/amandeep/wikidata-20211027-dwd-v3/geography-files/temp.geography-files/wikidata.sqlite3.db\n", "KGTK_LABEL_FILE: /data/amandeep/wikidata-20211027-dwd-v3/labels.en.tsv.gz\n", "GRAPH: /data/amandeep/wikidata-20211027-dwd-v3\n", "USE_CASES_DIR: /data/amandeep/github/kgtk-notebooks/use-cases\n", "TEMP: /data/amandeep/wikidata-20211027-dwd-v3/geography-files/temp.geography-files\n", "EXAMPLES_DIR: /data/amandeep/github/kgtk-notebooks/examples\n", "kgtk: kgtk\n", "STORE: 
/data/amandeep/wikidata-20211027-dwd-v3/geography-files/temp.geography-files/wikidata.sqlite3.db\n", "OUT: /data/amandeep/wikidata-20211027-dwd-v3/geography-files\n", "KGTK_OPTION_DEBUG: false\n", "KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20211027-dwd-v3/geography-files/temp.geography-files/wikidata.sqlite3.db\n", "label: /data/amandeep/wikidata-20211027-dwd-v3/labels.en.tsv.gz\n", "item: /data/amandeep/wikidata-20211027-dwd-v3/claims.wikibase-item.tsv.gz\n", "monolingualtext: /data/amandeep/wikidata-20211027-dwd-v3/claims.monolingualtext.tsv.gz\n", "external_id: /data/amandeep/wikidata-20211027-dwd-v3/claims.external-id.tsv.gz\n", "p279star: /data/amandeep/wikidata-20211027-dwd-v3/derived.P279star.tsv.gz\n", "p31: /data/amandeep/wikidata-20211027-dwd-v3/derived.P31.tsv.gz\n" ] } ], "source": [ "ck.print_env_variables()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kgtk query --graph-cache /data/amandeep/wikidata-20211027-dwd-v3/geography-files/temp.geography-files/wikidata.sqlite3.db -i \"/data/amandeep/wikidata-20211027-dwd-v3/labels.en.tsv.gz\" --as label -i \"/data/amandeep/wikidata-20211027-dwd-v3/claims.wikibase-item.tsv.gz\" --as item -i \"/data/amandeep/wikidata-20211027-dwd-v3/claims.monolingualtext.tsv.gz\" --as monolingualtext -i \"/data/amandeep/wikidata-20211027-dwd-v3/claims.external-id.tsv.gz\" --as external_id -i \"/data/amandeep/wikidata-20211027-dwd-v3/derived.P279star.tsv.gz\" --as p279star -i \"/data/amandeep/wikidata-20211027-dwd-v3/derived.P31.tsv.gz\" --as p31 --limit 3\n", "id\tnode1\tlabel\tnode2\tlang\trank\tnode2;wikidatatype\n", "P10-label-en\tP10\tlabel\t'video'@en\ten\t\t\n", "P1000-label-en\tP1000\tlabel\t'record held'@en\ten\t\t\n", "P10000-label-en\tP10000\tlabel\t'Research Vocabularies Australia ID'@en\ten\t\t\n" ] } ], "source": [ "ck.load_files_into_cache()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Build Files To 
Reason About Geography" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `P131star`: map each node1 in `P131` to all of its admins up the chain of `P131`\n", "\n", "Removes historical admins, including historical countries (intended - see the TODO below).\n", "\n", "| node1 | label | node2 | id |\n", "| -- | -- | -- | -- |\n", "| Q1000398 | P131star | Q12694 | Q1000398-P131star-Q12694 |\n", "| Q1001008 | P131star | Q12589 | Q1001008-P131star-Q12589 |\n", "| Q1001499 | P131star | Q214 | Q1001499-P131star-Q214 |\n", "| Q1001995 | P131star | Q1001995 | Q1001995-P131star-Q1001995 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extract the graph of `P131` so that we give a smaller graph to `reachable-nodes`, and remove historical admins.\n", "\n", "**TODO (review):** the query below copies every `P131` edge of `item` and applies no rank or end-time filter, so nothing is removed at this step - confirm whether the input claims file is already filtered." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"--debug query --gc $STORE -i item\n", " --match '(n1)-[l:P131]->(n2)' \n", " --return 'distinct n1 as node1, l.label as label, n2 as node2'\n", " --order-by 'n1'\n", " / add-id --id-style wikidata\n", " -o \"$TEMP\"/P131.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Index `P131.tsv.gz` in kypher under the alias `p131`" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tid\n", "P2618\tP131\tQ3206\tP2618-P131-Q3206\n", "P2621\tP131\tQ21\tP2621-P131-Q21\n" ] } ], "source": [ "kypher(\"\"\" -i \"$TEMP\"/P131.tsv.gz --as p131 --limit 2\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get all node1 in the `P131` relation, as we will use them as roots for `P131star`" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "kypher(\"\"\" -i p131\n", " --match 'p131: (n1)-[]->(n2)'\n", " --return 'distinct n1 as id' \n", " -o \"$TEMP\"/p131.node1.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compute `P131star`, which maps every node1 in P131 to all the admin locations 
that can be reached from it." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"reachable-nodes\n", " --rootfile \"$TEMP\"/p131.node1.tsv.gz\n", " --rootfilecolumn id\n", " --label \"P131star\"\n", " --selflink\n", " -i \"$TEMP\"/P131.tsv.gz\n", " / add-id --id-style wikidata\n", " / sort\n", " -o \"$OUT\"/derived.P131star.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Index `p131star` in kypher" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tid\n", "P2618\tP131star\tP2618\tP2618-P131star-P2618\n", "P2618\tP131star\tQ3206\tP2618-P131star-Q3206\n" ] } ], "source": [ "kypher(\"\"\" -i \"$OUT\"/derived.P131star.tsv.gz --as p131star --limit 2\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Test: get all the admins of Pasadena: Q485176" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "# Debug-only sanity check: list the admins of Pasadena with their classes.\n", "# The graph names used in the clauses must be aliases loaded with -i (here: p131star, label, p31).\n", "if debug:\n", " !$kypher -i p131star -i label -i p31 \\\n", " --match ' \\\n", " p131star: (:Q485176)-[]->(admin), \\\n", " p31: (admin)-[]->(admin_class)' \\\n", " --opt 'label: (admin)-[]->(admin_label)' \\\n", " --opt 'label: (admin_class)-[]->(admin_class_label)' \\\n", " --return 'distinct admin as admin, admin_label as admin_label, admin_class as admin_class, admin_class_label as admin_class_label' \\\n", " --order-by 'admin_class_label' \\\n", " | column -t -s $'\\t'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `derived.P131admin2` map each human settlement to its admin2 `Q13220204`\n", "\n", "**NOTE (review):** the sample rows below carry the label `P131admin1`, whereas the query in this section emits `P131admin2`; the sample needs to be regenerated.\n", "\n", "| node1 | label | node2 | id |\n", "| -- | -- | -- | -- |\n", "| Q100252368 | P131admin1 | Q1588 | Q100252368-P131admin1-Q1588 |\n", "| Q1005394 | P131admin1 | Q54171 | Q1005394-P131admin1-Q54171 |\n", "| Q100923 | P131admin1 | Q1263 | Q100923-P131admin1-Q1263 |\n", "| Q101111580 | P131admin1 | Q34800 | 
Q101111580-P131admin1-Q34800 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make a list of all human settlements: every node with a `P31` class that reaches `Q486972` through `p279star`" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "kypher(\"\"\" -i p31 -i p279star\n", " --match '\n", " p31: (human_settlement)-[]->(class),\n", " p279star: (class)-[]->(:Q486972)'\n", " --return 'distinct human_settlement as node1'\n", " -o \"$TEMP\"/human_settlement.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Index `human_settlement.tsv.gz` in kypher under the alias `settlement`" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\n", "Q104214562\n", "Q105923404\n" ] } ], "source": [ "kypher(\"\"\" -i \"$TEMP\"/human_settlement.tsv.gz --as settlement --limit 2\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pick out all the admins that are below admin2 `Q13220204`: for every node1 of `p131star`, keep the reachable admins whose class reaches `Q13220204` through `p279star`.\n", "\n", "**NOTE (review):** the query starts from every node1 of `p131star`; it does not restrict node1 to the `settlement` list indexed above." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"--debug query --gc $STORE -i p31 -i p131star -i p279star\n", " --match '\n", " p131star: (x)-[]->(admin),\n", " p31: (admin)-[]->(admin_class),\n", " p279star: (admin_class)-[]->(:Q13220204)'\n", " --return 'distinct x as node1, \"P131admin2\" as label, admin as node2'\n", " / add-id --id-style wikidata\n", " / sort\n", " -o \"$OUT\"/derived.P131admin2.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `derived.P131admin1` map each human settlement to its admin1 `Q10864048`\n", "\n", "| node1 | label | node2 | id |\n", "| -- | -- | -- | -- |\n", "| Q100252368 | P131admin1 | Q1588 | Q100252368-P131admin1-Q1588 |\n", "| Q1005394 | P131admin1 | Q54171 | Q1005394-P131admin1-Q54171 |\n", "| Q100923 | P131admin1 | Q1263 | Q100923-P131admin1-Q1263 |\n", "| Q101111580 | P131admin1 | Q34800 | Q101111580-P131admin1-Q34800 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pick out the admins that are 
below admin1 `Q10864048`" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"--debug query --gc $STORE -i p31 -i p131star -i p279star\n", " --match '\n", " p131star: (x)-[]->(admin),\n", " p31: (admin)-[]->(admin_class),\n", " p279star: (admin_class)-[]->(:Q10864048)'\n", " --return 'distinct x as node1, \"P131admin1\" as label, admin as node2'\n", " / add-id --id-style wikidata\n", " / sort\n", " -o \"$OUT\"/derived.P131admin1.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `derived.P131country` map each settlement to its country\n", "Removes historical country\n", "\n", "**NOTE (review):** the query below has no rank or end-time filter; every reachable admin whose class reaches `Q6256` is kept, so historical countries are not removed at this step - TODO.\n", "\n", "| node1 | label | node2 | id |\n", "| -- | -- | -- | -- |\n", "| Q1003172 | P131country | Q45 | Q1003172-P131country-Q45 |\n", "| Q100701578 | P131country | Q678 | Q100701578-P131country-Q678 |\n", "| Q1010068 | P131country | Q219 | Q1010068-P131country-Q219 |\n", "| Q101218885 | P131country | Q822 | Q101218885-P131country-Q822 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pick out all the admins whose class reaches country `Q6256` through `p279star`" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"--debug query --gc $STORE -i p31 -i p131star -i p279star\n", " --match '\n", " p131star: (x)-[]->(admin),\n", " p31: (admin)-[]->(admin_class),\n", " p279star: (admin_class)-[]->(:Q6256)'\n", " --return 'distinct x as node1, \"P131country\" as label, admin as node2'\n", " / add-id --id-style wikidata\n", " / sort\n", " -o \"$OUT\"/derived.P131country.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `derived.P131.admin`" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/data/amandeep/wikidata-20211027-dwd-v3/geography-files/derived.P131admin1.tsv.gz\n", "/data/amandeep/wikidata-20211027-dwd-v3/geography-files/derived.P131admin2.tsv.gz\n", 
"/data/amandeep/wikidata-20211027-dwd-v3/geography-files/derived.P131country.tsv.gz\n", "/data/amandeep/wikidata-20211027-dwd-v3/geography-files/derived.P131star.tsv.gz\n" ] } ], "source": [ "!ls \"$OUT\"/derived.P131*.tsv.gz" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"cat\n", " -i \"$OUT\"/derived.P131admin1.tsv.gz\n", " -i \"$OUT\"/derived.P131admin2.tsv.gz\n", " -i \"$OUT\"/derived.P131country.tsv.gz\n", " -o \"$OUT\"/derived.P131.admin.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Index `derived.P131.admin`" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tid\n", "P2618\tP131admin1\tQ3206\tP2618-P131admin1-Q3206\n", "P2621\tP131admin1\tQ21\tP2621-P131admin1-Q21\n" ] } ], "source": [ "!$kypher -i \"$OUT\"/derived.P131.admin.tsv.gz --as p131admin --limit 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Test" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Something to worry about: some countries have end times:" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-09-19 22:10:14 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT graph_1_c1.\"node1\", graph_4_c4.\"node2\", graph_4_c5.\"node2\", graph_7_c3.\"node2\"\n", " FROM graph_1 AS graph_1_c1\n", " INNER JOIN graph_1 AS graph_1_c2, graph_4 AS graph_4_c4, graph_4 AS graph_4_c5, graph_7 AS graph_7_c3\n", " ON graph_1_c1.\"node1\" = graph_1_c2.\"node1\"\n", " AND graph_1_c1.\"node1\" = graph_4_c4.\"node1\"\n", " AND graph_1_c2.\"id\" = graph_7_c3.\"node1\"\n", " AND graph_1_c2.\"node2\" = graph_4_c5.\"node1\"\n", " AND graph_1_c1.\"node2\" = ?\n", " AND graph_7_c3.\"label\" = ?\n", " LIMIT ?\n", " PARAS: ['Q6256', 'P582', 20]\n", 
"---------------------------------------------\n", "node1\tnode2\tnode2\tnode2\n", "Q1000\t'Gabon'@en\t'French colonial empire'@en\t^1960-01-01T00:00:00Z/9\n", "Q1029\t'Mozambique'@en\t'overseas province of Portugal'@en\t^1975-06-24T00:00:00Z/11\n", "Q1146786\t'Señorío of Cuzcatlán'@en\t'country'@en\t^1528-01-01T00:00:00Z/9\n", "Q1155700\t'Rattanakosin Kingdom'@en\t'country'@en\t^1932-06-24T00:00:00Z/11\n", "Q12548\t'Holy Roman Empire'@en\t'sovereign state'@en\t^1806-01-01T00:00:00Z/9\n", "Q139377\t'Kingdom of Aksum'@en\t'realm'@en\t^0960-01-01T00:00:00Z/9\n", "Q139377\t'Kingdom of Aksum'@en\t'country'@en\t^0960-01-01T00:00:00Z/9\n", "Q142\t'France'@en\t'colonial power'@en\t^1960-01-01T00:00:00Z/9\n", "Q16\t'Canada'@en\t'dominion of the British Empire'@en\t^1931-01-01T00:00:00Z/9\n", "Q172640\t'North Vietnam'@en\t'self-proclaimed state'@en\t^1954-01-01T00:00:00Z/9\n", "Q172640\t'North Vietnam'@en\t'sovereign state'@en\t^1976-01-01T00:00:00Z/9\n", "Q180573\t'South Vietnam'@en\t'sovereign state'@en\t^1975-01-01T00:00:00Z/9\n", "Q189988\t'Central African Empire'@en\t'sovereign state'@en\t^1979-09-20T00:00:00Z/11\n", "Q213353\t'Cisalpine Republic'@en\t'country'@en\t^1802-01-01T00:00:00Z/9\n", "Q2741089\t'Greater Republic of Central America'@en\t'country'@en\t^1898-08-27T00:00:00Z/11\n", "Q30\t'United States of America'@en\t'historical unrecognized state'@en\t^1784-05-12T00:00:00Z/11\n", "Q31\t'Belgium'@en\t'colonial power'@en\t^1960-01-01T00:00:00Z/9\n", "Q3167772\t'Kingdom of Polonnaruwa'@en\t'country'@en\t^1310-01-01T00:00:00Z/9\n", "Q31747\t'Irish Free State'@en\t'country'@en\t^1937-12-29T00:00:00Z/11\n", "Q39\t'Switzerland'@en\t'confederation'@en\t^1815-08-07T00:00:00Z/11\n", " 0.96 real 0.77 user 0.15 sys\n" ] } ], "source": [ "# Countries whose P31 statement carries an end time (P582).\n", "# NOTE(review): this needs a `qualifiers` alias, which is not among the `files`\n", "# configured at the top of the notebook - add it there before running this cell.\n", "# The label graph is loaded under the alias `label`.\n", "!$kypher -i p31 -i label -i qualifiers \\\n", "--match ' \\\n", " p31: (country)-[]->(:Q6256), \\\n", " p31: (country)-[r]->(class), \\\n", " qualifiers: (r)-[:P582]->(endtime), \\\n", " label: (country)-[]->(country_label), \\\n", " 
label: (class)-[]->(class_label)' \\\n", "--return 'country, country_label, class_label, endtime' \\\n", "--limit 20" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Build `settlement-to-country`: one `P17` edge for every item whose `P17` value is an instance of country `Q6256` and whose own class reaches human settlement `Q486972` through `p279star` (the `--where` clause drops matches whose class is `Q486972` itself)." ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"--debug query --gc $STORE -i p31 -i item -i p279star\n", " --match '\n", " p31: (country)-[:P31]->(:Q6256),\n", " item: (settlement)-[:P17]->(country),\n", " p31: (settlement)-[:P31]->(a_settlement_subclass),\n", " p279star: (a_settlement_subclass)-[]->(:Q486972)'\n", " --where 'a_settlement_subclass != \"Q486972\"'\n", " --return 'settlement as node1, \"P17\" as label, country as node2' \n", " / compact -i - --deduplicate --build-id --id-style wikidata\n", " -o \"$TEMP\"/settlement-to-country.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2id
0Q100P17Q30Q100-P17-Q30
1Q100000P17Q55Q100000-P17-Q55
2Q1000003P17Q142Q1000003-P17-Q142
\n", "
" ], "text/plain": [ " node1 label node2 id\n", "0 Q100 P17 Q30 Q100-P17-Q30\n", "1 Q100000 P17 Q55 Q100000-P17-Q55\n", "2 Q1000003 P17 Q142 Q1000003-P17-Q142" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Index settlement-to-country in kypher under the alias `settlements`.\n", "kypher(\"\"\"-i \"$TEMP\"/settlement-to-country.tsv.gz --as settlements --limit 3\"\"\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# Settlements whose direct P131 parent has a class that reaches admin1 (Q10864048) through p279star.\n", "kypher(\"\"\"-i settlements -i item -i p31 -i p279star\n", " --match '\n", " settlements: (settlement)-[]->(),\n", " item: (settlement)-[:P131]->(admin),\n", " p31: (admin)-[]->(admin_class),\n", " p279star: (admin_class)-[]->(:Q10864048)'\n", " --return 'distinct settlement as node1, \"P131admin1\" as label, admin as node2'\n", " -o \"$TEMP\"/derived.settlement.P131admin1.direct.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "# Same as the direct case, but the admin1 is reached through one intermediate P131 hop (admin_a).\n", "kypher(\"\"\" -i settlements -i item -i p31 -i p279star\n", " --match '\n", " settlements: (settlement)-[]->(),\n", " item: (settlement)-[:P131]->(admin_a)-[:P131]->(admin),\n", " p31: (admin)-[]->(admin_class),\n", " p279star: (admin_class)-[]->(:Q10864048)'\n", " --return 'distinct settlement as node1, \"P131admin1\" as label, admin as node2'\n", " -o \"$TEMP\"/derived.settlement.P131admin1.2hop.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# Union of the direct and 2-hop files, de-duplicated and sorted.\n", "kgtk(\"\"\"cat \n", " -i \"$TEMP\"/derived.settlement.P131admin1.direct.tsv.gz\n", " -i \"$TEMP\"/derived.settlement.P131admin1.2hop.tsv.gz\n", " / compact -i - --deduplicate --build-id --id-style wikidata\n", " / sort\n", " -o \"$OUT\"/derived.settlement.P131admin1.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tid\n", "Q100\tP131admin1\tQ771\tQ100-P131admin1-Q771\n", "Q100000\tP131admin1\tQ1093\tQ100000-P131admin1-Q1093\n", 
"Q1000003\tP131admin1\tQ16987\tQ1000003-P131admin1-Q16987\n" ] } ], "source": [ "# Index the settlement-to-admin1 file in kypher under the alias `settlementadmin1`.\n", "kypher(\"\"\" -i \"$OUT\"/derived.settlement.P131admin1.tsv.gz --as settlementadmin1 --limit 3\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Aliases for cities" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `derived.alias.settlement.admin1.full.tsv`\n", "\n", "Use the label of the admin1: each alias has the form `<settlement label>, <admin1 label>`.\n", "\n", "| node1 | label | node2 | id |\n", "| -- | -- | -- | -- |\n", "| Q100349105 | alias | 'Colonnacce Roman villa, Lazio'@en | Q100349105-alias-44bd19 |\n", "| Q1007434 | alias | 'Frangovo, Struga Municipality'@en | Q1007434-alias-1a7cbb |\n", "| Q101071309 | alias | '50-76 Stryiska Street, Lviv, Lwów Voivodeship'@en | Q101071309-alias-6c36ec |\n", "| Q1012440 | alias | 'Ranong, Ranong'@en | Q1012440-alias-2af390 |" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"--debug query --gc $STORE -i settlementadmin1 -i label\n", " --match '\n", " settlementadmin1: (settlement)-[]->(admin1),\n", " label: (settlement)-[]->(settlement_label),\n", " label: (admin1)-[]->(admin1_label)'\n", " --return 'distinct settlement as node1, \"alias\" as label,\n", " kgtk_stringify(printf(\"%s, %s\", kgtk_lqstring_text(settlement_label), kgtk_lqstring_text(admin1_label))) as `node2;kgtk:text`,\n", " \"en\" as `node2;kgtk:language`,\n", " \"language_qualified_string\" as `node2;kgtk:data_type`'\n", " / implode -i - --mode NONE --types language_qualified_string --without language_suffix --remove-prefixed-columns True\n", " / add-id --id-style wikidata\n", " / sort\n", " -o \"$OUT\"/derived.alias.settlement.admin1.full.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `derived.alias.settlement.admin1.short.tsv`\n", "Use the short name (`P1813`) of the admin as the label\n", "\n", "**NOTE (review):** the query that builds this file applies no language filter to the `P1813` value, yet every alias is tagged `en` (see `C.-B.` in the sample below).\n", "\n", "| node1 | label | node2 | id |\n", "| -- | -- | -- | -- |\n", "| Q1012017 | alias | 'Plainfield, IL'@en | Q1012017-alias-28be4a |\n", "| Q1020218 | 
alias | 'Princeton, C.-B.'@en | Q1020218-alias-f54afb |\n", "| Q1029922 | alias | 'Camp Pendleton North, Cal.'@en | Q1029922-alias-3876ac |\n", "| Q1052502 | alias | 'Cedar Grove, IN'@en | Q1052502-alias-b1d922 |" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# The P1813 value is used whatever its language; the alias is always tagged `en`.\n", "kgtk(\"\"\"--debug query --gc $STORE -i settlementadmin1 -i monolingualtext -i label\n", " --match '\n", " settlementadmin1: (settlement)-[]->(admin1),\n", " label: (settlement)-[]->(settlement_label),\n", " monolingualtext: (admin1)-[:P1813]->(admin1_short_label)'\n", " --return 'distinct settlement as node1, \"alias\" as label,\n", " kgtk_stringify(printf(\"%s, %s\", kgtk_lqstring_text(settlement_label), kgtk_lqstring_text(admin1_short_label))) as `node2;kgtk:text`,\n", " \"en\" as `node2;kgtk:language`,\n", " \"language_qualified_string\" as `node2;kgtk:data_type`'\n", " / implode -i - --mode NONE --types language_qualified_string --without language_suffix --remove-prefixed-columns True\n", " / add-id --id-style wikidata\n", " / sort\n", " -o \"$OUT\"/derived.alias.settlement.admin1.short.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `derived.alias.settlement.admin1.iso.tsv`\n", "\n", "Use the ISO code (`P300`) of the admin as the label, after removing the country part of the code: the query keeps the code from its 4th character on, which drops e.g. the `US-` of `US-CA`.\n", "\n", "| node1 | label | node2 | id |\n", "| -- | -- | -- | -- |\n", "| Q1003425 | alias | 'Fourneaux, P'@en | Q1003425-alias-f75f90 |\n", "| Q1007427 | alias | 'La Lande-de-Fronsac, B'@en | Q1007427-alias-0eaf01 |\n", "| Q101073 | alias | 'Foresto Sparso, 25'@en | Q101073-alias-b6bf2f |\n", "| Q1012377 | alias | 'Fabryczna, LB'@en | Q1012377-alias-e30d31 |" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# substr(..., 4) keeps the P300 value from its 4th character on.\n", "kgtk(\"\"\"--debug query --gc $STORE -i settlementadmin1 -i external_id -i label\n", " --match '\n", " settlementadmin1: (settlement)-[]->(admin1),\n", " label: 
(settlement)-[]->(settlement_label),\n", " external_id: (admin1)-[:P300]->(iso_code)'\n", " --return 'distinct settlement as node1, \"alias\" as label,\n", " kgtk_stringify(printf(\"%s, %s\", kgtk_lqstring_text(settlement_label), substr(kgtk_unstringify(iso_code), 4))) as `node2;kgtk:text`,\n", " \"en\" as `node2;kgtk:language`,\n", " \"language_qualified_string\" as `node2;kgtk:data_type`'\n", " / implode -i - --mode NONE --types language_qualified_string --without language_suffix --remove-prefixed-columns True\n", " / add-id --id-style wikidata\n", " / sort \n", " -o \"$OUT\"/derived.alias.settlement.admin1.iso.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `derived.alias.city.country.tsv`\n", "\n", "Produce \"city, country\" aliases for big cities\n", "\n", "| node1 | label | node2 | id |\n", "| -- | -- | -- | -- |\n", "| Q1023481 | alias | 'Ocilla, United States of America'@en | Q1023481-alias-51eee8 |\n", "| Q1160417 | alias | 'Lébény, Hungary'@en | Q1160417-alias-e1447f |\n", "| Q1341 | alias | 'Tolyatti, Soviet Union'@en | Q1341-alias-e387a3 |\n", "| Q151920 | alias | 'Tiberias, Mandatory Palestine'@en | Q151920-alias-0fb665 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Produce city/country aliases for all cities (Q515) and all countries. Every `P17` value of a city is used, with no filter on historical countries (see the `Soviet Union` row in the sample above)." ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "# Q515: city\n", "# printf takes exactly the two labels; a stray third argument copied from the\n", "# substr(..., 4) call of the ISO cell has been removed.\n", "kgtk(\"\"\"--debug query --gc $STORE -i p31 -i p279star -i item -i label\n", " --match ' \n", " p31: (city)-[]->(city_class),\n", " p279star: (city_class)-[]->(:Q515),\n", " item: (city)-[:P17]->(country),\n", " label: (city)-[]->(city_label),\n", " label: (country)-[]->(country_label)'\n", " --return 'distinct city as node1, \"alias\" as label,\n", " kgtk_stringify(printf(\"%s, %s\", kgtk_lqstring_text(city_label), kgtk_lqstring_text(country_label))) as `node2;kgtk:text`,\n", " \"en\" as `node2;kgtk:language`,\n", " \"language_qualified_string\" as 
`node2;kgtk:data_type`'\n", " / implode -i - --mode NONE --types language_qualified_string --without language_suffix --remove-prefixed-columns True\n", " / add-id --id-style wikidata\n", " / sort \n", " -o \"$OUT\"/derived.alias.city.country.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `derived.alias.city.us.tsv`\n", "\n", "For the US, produce special files that use `USA` and `United States` in addition to the official name `United States of America`\n", "\n", "| node1 | label | node2 | id |\n", "| -- | -- | -- | -- |\n", "| Q100 | alias | 'Boston, United States'@en | Q100-alias-d454d3 |\n", "| Q100 | alias | 'Boston, USA'@en | Q100-alias-fd19d4 |\n", "| Q1000030 | alias | 'Orange City, USA'@en | Q1000030-alias-63cb74 |\n", "| Q1000030 | alias | 'Orange City, United States'@en | Q1000030-alias-a00c53 |\n", "| Q1000065 | alias | 'Neosho, USA'@en | Q1000065-alias-3631c0 |\n", "| Q1000065 | alias | 'Neosho, United States'@en | Q1000065-alias-6be460 |" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# Aliases of the form <city label>, USA for cities (Q515) whose P17 value is Q30.\n", "kgtk(\"\"\"--debug query --gc $STORE -i p31 -i p279star -i item -i label\n", " --match '\n", " p31: (city)-[]->(city_class),\n", " p279star: (city_class)-[]->(:Q515),\n", " item: (city)-[:P17]->(:Q30),\n", " label: (city)-[]->(city_label)'\n", " --return 'distinct city as node1, \"alias\" as label,\n", " kgtk_stringify(printf(\"%s, USA\", kgtk_lqstring_text(city_label))) as `node2;kgtk:text`,\n", " \"en\" as `node2;kgtk:language`,\n", " \"language_qualified_string\" as `node2;kgtk:data_type`'\n", " / implode -i - --mode NONE --types language_qualified_string --without language_suffix --remove-prefixed-columns True\n", " / add-id --id-style wikidata\n", " / sort \n", " -o \"$TEMP\"/derived.alias.city.USA.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "# Same match as the USA cell; only the suffix (United States) and the output file differ.\n", "kgtk(\"\"\"--debug query --gc $STORE -i p31 -i p279star -i item -i label\n", " 
--match ' \n", " p31: (city)-[]->(city_class), \n", " p279star: (city_class)-[]->(:Q515), \n", " item: (city)-[:P17]->(:Q30), \n", " label: (city)-[]->(city_label)' \n", " --return 'distinct city as node1, \"alias\" as label,\n", " kgtk_stringify(printf(\"%s, United States\", kgtk_lqstring_text(city_label))) as `node2;kgtk:text`,\n", " \"en\" as `node2;kgtk:language`, \n", " \"language_qualified_string\" as `node2;kgtk:data_type`'\n", " / implode -i - --mode NONE --types language_qualified_string --without language_suffix --remove-prefixed-columns True\n", " / add-id --id-style wikidata\n", " / sort\n", " -o \"$TEMP\"/derived.alias.city.united_states.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Combine the two US files (`USA` and `United States` suffixes) into one sorted file, as it might be useful by itself" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"cat\n", " -i \"$TEMP\"/derived.alias.city.USA.tsv.gz\n", " -i \"$TEMP\"/derived.alias.city.united_states.tsv.gz\n", " / sort\n", " -o \"$OUT\"/derived.alias.city.us.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `derived.alias.settlement.tsv` \n", "Combined, de-duplicated file of all aliases for cities: the admin1 iso, short and full files, the city/country file and the US file" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"cat\n", " -i \"$OUT\"/derived.alias.settlement.admin1.iso.tsv.gz \n", " -i \"$OUT\"/derived.alias.settlement.admin1.short.tsv.gz \n", " -i \"$OUT\"/derived.alias.settlement.admin1.full.tsv.gz \n", " -i \"$OUT\"/derived.alias.city.country.tsv.gz \n", " -i \"$OUT\"/derived.alias.city.us.tsv.gz \n", " / compact -i - --deduplicate --build-id --id-style wikidata\n", " -o \"$OUT\"/derived.alias.settlement.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "====================================================\n", "Data lines read: 1473081\n", "Data 
lines passed: 1473081\n" ] } ], "source": [ "!$kgtk validate -i \"$OUT\"/derived.alias.settlement.tsv.gz" ] } ], "metadata": { "kernelspec": { "display_name": "kgtk-env", "language": "python", "name": "kgtk-env" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 4 }