{ "cells": [ { "cell_type": "markdown", "id": "54a5847d-547a-4757-bd9d-3bcf85b7a049", "metadata": {}, "source": [ "# Table Linker Files" ] }, { "cell_type": "markdown", "id": "2fd7da29-e129-44d4-89d3-c92d85106e6d", "metadata": {}, "source": [ "Build the files required for Table linker Elasticsearch Index.\n", "\n", "You'll need to run the [Wikidata Useful Files](https://github.com/usc-isi-i2/kgtk/blob/master/use-cases/Wikidata%20Useful%20Files.ipynb) prior to running this notebook as files created by that notebook will be used here." ] }, { "cell_type": "code", "execution_count": 3, "id": "df52c3e3-f460-48f7-adf5-6a565643683d", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "\n", "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", "from kgtk.functions import kgtk, kypher" ] }, { "cell_type": "code", "execution_count": null, "id": "c2131b4f-42fa-4710-8c40-858bd8dd29a3", "metadata": { "tags": [ "Parameters" ] }, "outputs": [], "source": [ "# Parameters\n", "\n", "# Folder on local machine where to create the output and temporary folders\n", "input_path = \"/Volumes/saggu-ssd/wikidata-2021-10-27\"\n", "output_path = \"/Volumes/saggu-ssd/wikidata-2021-10-27\"\n", "project_name = \"table-linker-files\"\n", "\n", "graph_cache_path = None" ] }, { "cell_type": "code", "execution_count": null, "id": "ef0816fe-4c9f-4fec-b652-01bd2a090452", "metadata": {}, "outputs": [], "source": [ "files = [\n", " \"label\",\n", " \"alias\",\n", " \"item\",\n", " \"p279star\",\n", " \"claims\",\n", " \"isa\",\n", " \"isastar\",\n", " \"all\"\n", "]\n", "ck = ConfigureKGTK(files)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", " output_path=output_path,\n", " project_name=project_name,\n", " graph_cache_path=graph_cache_path)" ] }, { "cell_type": "code", "execution_count": null, "id": "f67482ed-9dca-4666-9289-99b4b2517e26", "metadata": {}, "outputs": [], "source": [ "ck.print_env_variables()" ] }, { "cell_type": "code", "execution_count": 
null, "id": "ef29c088-ba92-4807-9008-fe9de04d8b83", "metadata": {}, "outputs": [], "source": [ "%%time\n", "if graph_cache_path is None:\n", " ck.load_files_into_cache()" ] }, { "cell_type": "markdown", "id": "cba57f78-f59e-4e0d-89f3-6426d189ad81", "metadata": {}, "source": [ "## Create DWD ISA (Variant of IS A)" ] }, { "cell_type": "code", "execution_count": null, "id": "6ab48d10-31c1-4bdd-a388-bc89777a01fd", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"filter -i \"$item\" -p '; P31, P279, P106, P39 ;' -o \"$TEMP\"/derived.P31_39_106_279.1.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "a4cc88ba-e1f0-4cfa-b081-bfb2ec37318e", "metadata": {}, "outputs": [], "source": [ "with open(os.environ['TEMP'] + '/custom-edges.tsv', 'w') as fp:\n", " fp.write(\"node1\\tlabel\\tnode2\\n\")\n", " fp.write(\"Q215627\\tdwd_isa\\tQ5\\n\") # person dwd_isa human\n", " fp.write(\"Q12737077\\tdwd_isa\\tQ5\\n\") # occupation dwd_isa human (perhaps controversial)\n", " fp.write(\"Q5\\tdwd_isa_\\tQ215627\\n\") # inverse\n", " fp.write(\"Q5\\tdwd_isa_\\tQ12737077\\n\") # inverse\n", "fp.close()" ] }, { "cell_type": "code", "execution_count": null, "id": "7f1255bf-5044-4bc7-89e2-9e84559bb05b", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"cat -i \"$TEMP\"/derived.P31_39_106_279.1.tsv.gz\n", " -i \"$TEMP\"/custom-edges.tsv \n", " -o \"$OUT\"/derived.dwd_isa.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "0f733bc0-9fa1-4bc8-be8b-84236b3392d8", "metadata": {}, "outputs": [], "source": [ "!zcat < \"$OUT\"/derived.dwd_isa.tsv.gz | wc -l" ] }, { "cell_type": "markdown", "id": "4141f8e7-fd02-4799-a9e5-3999a034a59d", "metadata": {}, "source": [ "## Compute TF IDF : Class and Property count files" ] }, { "cell_type": "markdown", "id": "9235f87b-9565-47bb-b59f-f01371365d40", "metadata": {}, "source": [ "### Class Counts" ] }, { "cell_type": "code", "execution_count": null, "id": "26f3b853-d7eb-4d28-a6e7-740bcced2887", 
"metadata": {}, "outputs": [], "source": [ "!$kypher -i p279star -i \"$OUT\"/derived.dwd_isa.tsv.gz \\\n", " --match 'dwd_isa: (n1)-[]->(class), p279star: (class)-[]->(super_class)' \\\n", " --return 'distinct class as node1, \"P31_39_106_279star\" as label, super_class as node2' \\\n", " --order-by 'node1, label, node2' \\\n", " / add-id --id-style wikidata \\\n", " -o \"$OUT\"/derived.P31_39_106_279star.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "d2097b0b-b65e-4fe1-9cba-a2ccfb149768", "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$OUT\"/derived.P31_39_106_279star.tsv.gz --as dwd_isa_star -i \"$OUT\"/derived.dwd_isa.tsv.gz --as P31_39_106_279 \\\n", " --match 'P31_39_106_279: (n1)-[]->(class), dwd_isa_star: (class)-[]->(super_class)' \\\n", " --return 'distinct super_class as node1, count(distinct n1) as node2, \"P31_39_106_279_count\" as label' \\\n", " --order-by 'node1, label, node2' \\\n", " -o \"$OUT\"/derived.dwd.count.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "8123a717-11c1-4244-85af-9ff794d61a76", "metadata": {}, "outputs": [], "source": [ "!$kypher -i dwd_isa_star -i P31_39_106_279 -i \"$OUT\"/derived.dwd.count.tsv.gz \\\n", " --match 'P31_39_106_279: (n1)-[]->(class), dwd_isa_star: (class)-[]->(super_class), count: (super_class)-[]->(count)' \\\n", " --return 'distinct n1 as node1, \"class_count\" as label, printf(\"%s:%s\", super_class, count) as node2' \\\n", " --order-by 'node1, label, node2' \\\n", " -o \"$TEMP\"/dwd_isa_class_count.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "27b3dd5a-2910-4cf9-ba73-31f8e3c0d489", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"sort -i \"$TEMP\"/dwd_isa_class_count.tsv.gz \n", " -X \"--parallel 8 --buffer-size 60%\" \n", " -o \"$TEMP\"/dwd_isa_class_count.sorted.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "85e8bd91-f5a3-49c3-b521-b6c03807072c", "metadata": {}, "outputs": [], "source": [ 
"kgtk(\"\"\"compact -i \"$TEMP\"/dwd_isa_class_count.sorted.tsv.gz \n", " --mode=NONE \n", " --columns node1 label \n", " --presorted True \n", " -o \"$OUT\"/dwd_isa_class_count.compact.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "id": "807e58e2-9e78-4636-975b-84e83f720b73", "metadata": {}, "source": [ "### Property Counts" ] }, { "cell_type": "markdown", "id": "7f105983-accd-480b-9d26-42fc1326135b", "metadata": {}, "source": [ "#### For each property get the number of node1 that it occurs in" ] }, { "cell_type": "code", "execution_count": null, "id": "0c8f60b0-45b5-4249-b193-5ca61d0cfb37", "metadata": {}, "outputs": [], "source": [ "!$kypher -i claims \\\n", " --match '(n1)-[l {label: property}]->()' \\\n", " --return 'distinct property as node1, count(distinct n1) as node2, \"nodes_count\" as label' \\\n", " -o \"$TEMP\"/property.count.tsv.gz" ] }, { "cell_type": "markdown", "id": "01942fbf-2cdf-4327-a0de-63d761463a4b", "metadata": {}, "source": [ "#### For each item, list the properties it has" ] }, { "cell_type": "code", "execution_count": null, "id": "19ab8f29-0d84-473a-9597-e2444bf7e307", "metadata": {}, "outputs": [], "source": [ "!$kypher -i claims \\\n", " --match '(n1)-[l {label: property}]->()' \\\n", " --return 'distinct n1 as node1, property as node2, \"property\" as label' \\\n", " -o \"$TEMP\"/item.property.tsv.gz" ] }, { "cell_type": "markdown", "id": "468ce4f8-7df8-43a8-9cb5-e19f0ac02d43", "metadata": {}, "source": [ "#### Combine the property and the counts into one column" ] }, { "cell_type": "code", "execution_count": null, "id": "43915a02-2fd2-4ed7-ae95-88931c1203f8", "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$TEMP\"/property.count.tsv.gz -i \"$TEMP\"/item.property.tsv.gz \\\n", " --match 'count: (property)-[]->(count), item: (n1)-[]->(property)' \\\n", " --return 'distinct n1 as node1, \"property_count\" as label, printf(\"%s:%s\", property, count) as node2' \\\n", " --order-by 'node1, label, node2' \\\n", " -o 
\"$TEMP\"/item.property.count.tsv.gz" ] }, { "cell_type": "markdown", "id": "7edc269e-bd0e-4079-8fe3-e54d4f8684e3", "metadata": {}, "source": [ "#### Put all the property/count pairs in one row for each node" ] }, { "cell_type": "code", "execution_count": null, "id": "e5bade10-94a0-4001-8a52-65f9bac3008a", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"sort -i \"$TEMP\"/item.property.count.tsv.gz \n", " --sort-command gsort \n", " -X \"--parallel 8 --buffer-size 60%\" \n", " -o \"$TEMP\"/item.property.count.sorted.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "bd80736e-c4c0-4162-88bf-d24545e773bd", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"compact -i \"$TEMP\"/item.property.count.sorted.tsv.gz \n", " --mode=NONE \n", " --columns node1 label \n", " --presorted True \n", " -o \"$OUT\"/item.property.count.compact.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "id": "495d08d0-2242-4c82-be88-3d463d292544", "metadata": {}, "source": [ "## Compute Property Values (context) File" ] }, { "cell_type": "markdown", "id": "c8a4485e-7d27-44c2-95c6-ee36fff12be4", "metadata": {}, "source": [ "### Collecting all the properties that we do not need in the wikibase items\n", "\n", "For wikibase-item properties, the context will include the label of the value of the property.\n", "\n", "The follwing query creates context information for all wikibase-item properties. 
We need to trim the set of properties to include only the ones that give us useful values:\n", "\n", "- exclude P31, P279 as this info is unlikely to be useful as context\n", "- exclude P793, P47, P1830, P190, P1549 (these properties are likely to give confusing contexts, this is just a small sample)\n", "- exclude all properties that are is-a of P31/P279star of\n", "    - Q19820110 (Wikidata property for property documentation)\n", "    - Q18667213 (Wikidata property about Wikimedia categories)\n", "    - Q51118703 (Wikidata property about Wikimedia templates)\n", "    - Q51118821 (Wikidata property about Wikimedia entities)\n", "    - Q18608359 (Wikidata property to indicate a source)\n", "- exclude properties that are subproperty of (note: some of the following may already be excluded by the above exclusions):\n", "    - P1455 (list of works) \n", "    - P2354 (has list)" ] }, { "cell_type": "code", "execution_count": null, "id": "26c0ad70-50f9-4166-bc01-6d8cab0e6454", "metadata": {}, "outputs": [], "source": [ "!wd u P31 P279 P793 P47 P1830 P190 P1549" ] }, { "cell_type": "code", "execution_count": null, "id": "535a5119-e576-4e1f-ac20-22833fc3f91b", "metadata": {}, "outputs": [], "source": [ "# Always write the fixed exclusion list: a later `cat` cell reads properties.to.remove.tsv,\n", "# and no flag controlling this step is defined anywhere in the notebook.\n", "df = pd.DataFrame()\n", "df['node1'] = [\"P31\", \"P279\", \"P793\", \"P47\", \"P1830\", \"P190\", \"P1549\"]\n", "# Tab separator to match the .tsv extension (identical bytes for this single-column frame)\n", "df.to_csv(f\"{os.environ['TEMP']}/properties.to.remove.tsv\", sep=\"\\t\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "31ff1088-ed5b-4f01-89bf-833780bc52d4", "metadata": {}, "outputs": [], "source": [ "# NOTE(review): the markdown above also lists Q18608359, which is not included here - confirm which is intended.\n", "# NOTE(review): the kypher query further down hard-codes these same four classes instead of reading this variable.\n", "isa_classes = \"Q19820110,Q18667213,Q51118703,Q51118821\"" ] }, { "cell_type": "code", "execution_count": null, "id": "0a9e8870-84a3-4e05-be95-90892137699e", "metadata": {}, "outputs": [], "source": [ "!wd u Q19820110 Q18667213 Q51118703 Q51118821" ] }, { "cell_type": "markdown", "id": "312e4b18-61dc-4d36-a77d-67b11ce86c00", "metadata": {}, "source": [ "Make sure the following files are present in the input path. 
These files for DWD v2 can be downloaded from https://drive.google.com/drive/folders/1AyKSLjt5OmTZvEvi4cp2lJt7FIvFKvzG" ] }, { "cell_type": "code", "execution_count": null, "id": "7b302ef4-9129-45ad-8bda-efffc77142f6", "metadata": {}, "outputs": [], "source": [ "# Each of these files is read from $GRAPH by a later cell; stop early if one is missing.\n", "required_graph_files = [\n", "    \"derived.P131.admin.tsv.gz\",\n", "    \"derived.alias.settlement.admin1.tsv.gz\",\n", "    \"derived.alias.settlement.admin1.full.tsv.gz\",\n", "    \"derived.alias.settlement.admin1.iso.tsv.gz\",\n", "    \"derived.alias.city.country.tsv.gz\",\n", "    \"derived.alias.city.us.tsv.gz\",\n", "    \"derived.alias.settlement.tsv.gz\",\n", "]\n", "for file_name in required_graph_files:\n", "    assert os.path.exists(f'{os.environ[\"GRAPH\"]}/{file_name}'), f\"{file_name} is missing from the input graph path\"" ] }, { "cell_type": "code", "execution_count": null, "id": "a148e0b5-cefe-4050-8ed1-0429c03b9f50", "metadata": {}, "outputs": [], "source": [ "!$kypher -i isa \\\n", " --match '(n1)-[]->(n2)' \\\n", " --where 'n2 in [\"Q19820110\", \"Q18667213\", \"Q51118703\", \"Q51118821\"]' \\\n", " --return 'n1' \\\n", " -o $TEMP/isa.properties.remove.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "6d57f3e0-d9e8-4d8a-a915-07131ffbde82", "metadata": {}, "outputs": [], "source": [ "!$kypher -i $TEMP/isa.properties.remove.tsv.gz \\\n", " --match 
'(n1)-[]->()' \\\n", " --where \"substr(n1,1,1)='P'\" \\\n", " --return 'distinct n1' \\\n", " -o $TEMP/isa.properties.remove.distinct.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "86a5ebd2-7e25-4e01-ab8a-60eb32661642", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"cat -i \"$TEMP\"/isa.properties.remove.distinct.tsv.gz \n", " -i \"$TEMP\"/properties.to.remove.tsv\n", " -o \"$TEMP\"/properties.remove.tsv.gz --mode=NONE\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "3393c93d-abb6-4f81-8dc0-7974b3a7ab4d", "metadata": {}, "outputs": [], "source": [ "# Combine the item edges with the P131 admin edges; the next query registers the result as table_linker_item.\n", "# The closing shell quote is kept away from the closing triple quote so the Python string ends where intended.\n", "kgtk(\"\"\"cat -i $item\n", " -i \"$GRAPH\"/derived.P131.admin.tsv.gz \n", " -o \"$TEMP\"/derived.table-linker.items.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "664fd6cc-6da6-46a5-8de9-704baa4ef3f8", "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$TEMP/derived.table-linker.items.tsv.gz\" --as table_linker_item \\\n", " --match 'item: (n1)-[l {label: property}]->()' \\\n", " --return 'distinct property as node1' \\\n", " -o \"$TEMP\"/all.properties.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "8b91b474-b62b-44e4-9c42-3f7cecdfdb45", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"ifnotexists -i \"$TEMP\"/all.properties.tsv.gz\n", " --filter-on \"$TEMP\"/properties.remove.tsv.gz \n", " --input-keys node1 \n", " --filter-keys node1 \n", " --mode=NONE \n", " -o \"$TEMP\"/final.properties.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "916ca3f2-d1c3-4d9e-bce3-ddf4e6f65c94", "metadata": {}, "outputs": [], "source": [ "!$kypher -i claims \\\n", " --match 'claims: (n1)-[l {label: property}]->(n2 {wikidatatype:\"time\"})' \\\n", " --return 'n1 as node1, \"context\" as label, printf(\"d\\\"%s\\\":%s\", kgtk_date_date(n2), property) as node2' \\\n", " -o $TEMP/context.time.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "06b01aa3-1551-4175-9a45-3d2c2f636818", 
"metadata": {}, "outputs": [], "source": [ "!zcat $TEMP/context.time.tsv.gz | head" ] }, { "cell_type": "code", "execution_count": null, "id": "5ca4f033-7485-4358-9fce-671f04a29788", "metadata": {}, "outputs": [], "source": [ "!$kypher -i table_linker_item -i label -i $TEMP/final.properties.tsv.gz --as final_props \\\n", " --match 'item: (n1)-[l {label: property}]->(n2), label: (n2)-[]->(lab), final_props: (property)' \\\n", " --return 'n1 as node1, \"context\" as label, printf(\"i%s:%s:%s\", lower(kgtk_lqstring_text_string(lab)), property, n2) as node2' \\\n", " --where 'n1 != n2' \\\n", " -o \"$TEMP\"/context.labels.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "f0d0d8ec-eb44-482b-8887-894950c6507c", "metadata": {}, "outputs": [], "source": [ "!zcat < \"$TEMP\"/context.labels.tsv.gz | head" ] }, { "cell_type": "code", "execution_count": null, "id": "c7a7cf47-0186-4bbe-81ba-b818695f85d5", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"cat -i alias \n", " -i \"$GRAPH/derived.alias.settlement.admin1.tsv.gz\" \n", " -i \"$GRAPH/derived.alias.settlement.admin1.full.tsv.gz\" \n", " -i \"$GRAPH/derived.alias.settlement.admin1.iso.tsv.gz\"\n", " -i \"$GRAPH/derived.alias.city.country.tsv.gz\" \n", " -i \"$GRAPH/derived.alias.city.us.tsv.gz\" \n", " -i \"$GRAPH/derived.alias.settlement.tsv.gz\" \n", " -o \"$TEMP/derived.location.aliases.tsv.gz\"\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "df18416b-b693-4bb2-bf82-8a0a27766c6c", "metadata": {}, "outputs": [], "source": [ "!$kypher -i table_linker_item -i \"$TEMP/derived.location.aliases.tsv.gz\" -i final_props \\\n", " --match 'item: (n1)-[l {label: property}]->(n2), aliases: (n2)-[]->(alias), final_props: (property)' \\\n", " --return 'n1 as node1, \"context\" as label, printf(\"i%s:%s:%s\", lower(kgtk_lqstring_text_string(alias)), property, n2) as node2' \\\n", " --where 'n1 != n2' \\\n", " -o \"$TEMP\"/context.aliases.tsv.gz" ] }, { "cell_type": "code", 
"execution_count": null, "id": "00c201cc-6dae-43f6-8af4-6a83163474d8", "metadata": {}, "outputs": [], "source": [ "!zcat < \"$TEMP\"/context.aliases.tsv.gz | head" ] }, { "cell_type": "code", "execution_count": null, "id": "24be0c68-e82d-464d-9fca-3c93d17d7187", "metadata": {}, "outputs": [], "source": [ "!$kypher -i claims \\\n", " --match 'claims: (n1)-[l {label: property}]->(n2 {wikidatatype:\"external-id\"})' \\\n", " --return 'n1 as node1, \"context\" as label, printf(\"e%s:%s\", n2, property) as node2' \\\n", " -o \"$TEMP\"/context.external_id.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "c57d3589-1b68-4936-a3df-02f877484e65", "metadata": {}, "outputs": [], "source": [ "!zcat < \"$TEMP\"/context.external_id.tsv.gz | head" ] }, { "cell_type": "code", "execution_count": null, "id": "1a80a10e-ebee-40fe-9d08-7c6b3dc4c13a", "metadata": {}, "outputs": [], "source": [ "!$kypher -i claims \\\n", " --match 'claims: (n1)-[l {label: property}]->(n2 {wikidatatype:\"quantity\"})' \\\n", " --return 'n1 as node1, \"context\" as label, printf(\"q\\\"%s\\\":%s\",kgtk_quantity_number(n2), property) as node2' \\\n", " -o \"$TEMP\"/context.quantity.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "24079f0a-89bf-4066-a378-b5a732a3a25e", "metadata": {}, "outputs": [], "source": [ "!zcat < \"$TEMP\"/context.quantity.tsv.gz | head" ] }, { "cell_type": "code", "execution_count": null, "id": "51364f86-e4b2-468b-8664-1196f0da67c3", "metadata": {}, "outputs": [], "source": [ "!$kypher -i claims \\\n", " --match 'claims: (n1)-[l {label: property}]->(n2 {wikidatatype:\"monolingualtext\"})' \\\n", " --return 'n1 as node1, \"context\" as label, printf(\"m%s:%s\",lower(kgtk_lqstring_text_string(n2)), property) as node2' \\\n", " --where 'n2.kgtk_lqstring_lang = \"en\"' \\\n", " -o \"$TEMP\"/context.monolingualtext.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "147a1116-97ca-496f-aab6-5a1d501b92fc", "metadata": {}, "outputs": [], 
"source": [ "!zcat < \"$TEMP\"/context.monolingualtext.tsv.gz | head" ] }, { "cell_type": "code", "execution_count": null, "id": "dd2839bb-c629-4a86-ad38-9baff1ea4256", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"cat -i \"$TEMP'/context.external_id.tsv.gz \n", " \"$TEMP\"/context.quantity.tsv.gz \n", " \"$TEMP\"/context.aliases.tsv.gz \n", " \"$TEMP\"/context.labels.tsv.gz \n", " \"$TEMP\"/context.time.tsv.gz \n", " \"$TEMP\"/context.monolingualtext.tsv.gz \n", " -o \"$TEMP\"/context.all.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "44e999ee-8aef-45bc-826a-680315adf162", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"sort --columns node1 \n", " -i \"$TEMP\"/context.all.tsv.gz \n", " -X \"--parallel 8 --buffer-size 60%\" \n", " -o \"$TEMP\"/context.all.sorted.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "caa03264-1580-4c3b-88e7-8f77cbb568cb", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"compact -i \"$TEMP\"/context.all.sorted.tsv.gz\n", " --mode=NONE \n", " --columns node1 label\n", " --presorted True \n", " -o \"$OUT\"/table_linker.qnode.property.values.tsv.gz\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "e7c51b3b-4080-41a6-9636-01c45b5e98c4", "metadata": {}, "outputs": [], "source": [ "!zcat < \"$OUT\"/table_linker.qnode.property.values.tsv.gz | head" ] }, { "cell_type": "markdown", "id": "6d7c1270-b136-4f57-9199-c93941d69443", "metadata": {}, "source": [ "## ComplEx Graph Embeddings" ] }, { "cell_type": "code", "execution_count": null, "id": "8b7a58f2-cef0-4187-af54-5a3639c3f9c4", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"graph-embeddings --verbose -i $item \n", " -o $OUT/wikidatadwd.complEx.graph-embeddings.tsv.gz\n", " --retain_temporary_data True \n", " --operator ComplEx \n", " --workers 24 \n", " --log \"$TEMP\"/ge.complex.log \n", " -T \"$TEMP\"\n", " -ot kgtk\n", " -e 600\"\"\")" ] }, { "cell_type": "markdown", "id": 
"4151b50e-63d3-4bf5-9cdc-81d30cf72913", "metadata": {}, "source": [ "The augmentation files required for the next cell can be downloaded from Google Drive,\n", "https://drive.google.com/drive/u/1/folders/1qbbgjo7pddMdDvQzOSeSaL6lYwj_f5gi" ] }, { "cell_type": "code", "execution_count": null, "id": "ef67e807-d98d-40c5-8307-d5252478139b", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"cat -i \"$GRAPH\"/all.tsv.gz \n", " -i \"$GRAPH\"/metadata.property.datatypes.tsv.gz \n", " -i \"$GRAPH\"/metadata.pagerank.undirected.tsv.gz \n", " -i \"$GRAPH\"/metadata.pagerank.directed.tsv.gz \n", " -i \"$GRAPH\"/augmentation.wikipedia.anchors.tsv.gz \n", " -i \"$GRAPH\"/augmentation.wikipedia.tables.anchors.tsv.gz \n", " -i \"$GRAPH\"/augmentation.wikipedia.redirect.tsv.gz \n", " -i \"$OUT\"/item.property.count.compact.tsv.gz \n", " -i \"$OUT\"/dwd_isa_class_count.compact.tsv.gz \n", " -i \"$OUT\"/wikidatadwd.complEx.graph-embeddings.tsv.gz\n", " -i \"$OUT\"/derived.dwd_isa.tsv.gz \n", " -i \"$OUT\"/table_linker.qnode.property.values.tsv.gz \n", " -i \"$GRAPH\"/derived.isastar.tsv.gz \n", " -o \"$TEMP\"/wikidata.dwd.table-linker.index.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "43b59a85-887f-4570-a305-b2d6fb826fa3", "metadata": {}, "outputs": [], "source": [ "kgtk(f\"\"\"sort -i \"$TEMP\"/wikidata.dwd.table-linker.index.tsv.gz \n", " --columns node1 \n", " --extra '--parallel 24 --buffer-size 30% --temporary-directory ' + {os.environ['TEMP']} \n", " -o \"$OUT\"/wikidata.dwd.table-linker.index.sorted.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "id": "b1c47dda-172d-4db6-882b-4d8bb17ca59a", "metadata": {}, "source": [ "### Create the `metadata.property.datatypes.augmented.tsv.gz` file" ] }, { "cell_type": "code", "execution_count": null, "id": "662a59ef-4ce9-4fe1-93a0-1b79719d4013", "metadata": {}, "outputs": [], "source": [ "!curl https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv -o $TEMP/kgtk-properties.tsv" 
] }, { "cell_type": "code", "execution_count": null, "id": "0fa3492d-5745-4658-98a8-6032a58c97fa", "metadata": {}, "outputs": [], "source": [ "!$kgtk filter -p \";data_type;\" -i $TEMP/kgtk-properties.tsv -o $TEMP/kgtk-properties.datatype.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "6419517b-0907-47c2-8b62-e3a0dfd5d07d", "metadata": {}, "outputs": [], "source": [ "!$kgtk cat -i $TEMP/kgtk-properties.datatype.tsv.gz $OUT/metadata.property.datatypes.tsv.gz -o $OUT/metadata.property.datatypes.augmented.tsv.gz" ] }, { "cell_type": "markdown", "id": "51d6be1e-529d-446b-80be-9314fa0f9aba", "metadata": {}, "source": [ "### Build JSON Lines Files to be loaded into ElasticSearch" ] }, { "cell_type": "code", "execution_count": null, "id": "a0f6be7a-1a76-4a42-9b2e-459221f15462", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"build-kgtk-search-input --input-file \"$OUT\"/wikidata.dwd.table-linker.index.sorted.tsv.gz\n", "--output-file \"$OUT\"/wikidata.dwd.table-linker.index.sorted.jl \n", "--label-properties label \n", "--alias-properties alias \n", "--extra-alias-properties P1448,P1705,P1477,P1810,P742,P1449 \n", "--description-properties description \n", "--pagerank-properties Pundirected_pagerank \n", "--mapping-file \"$OUT\"/wikidata_dwd.v2.table-linker.json \n", "--property-datatype-file \"$OUT\"/metadata.property.datatypes.augmented.tsv.gz\"\"\")" ] }, { "cell_type": "markdown", "id": "4c9c3441-2402-4b28-b100-50255d42ccce", "metadata": {}, "source": [ "#### Move all the files to input folder" ] }, { "cell_type": "code", "execution_count": null, "id": "7eb5fe6c-2d3c-4491-9277-15b756829d92", "metadata": {}, "outputs": [], "source": [ "!mv $OUT/* $GRAPH/" ] } ], "metadata": { "kernelspec": { "display_name": "kgtk-env", "language": "python", "name": "kgtk-env" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", 
"pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }