{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook collect labels, aliases and descriptions for node1, property and node2 in a given KGTK edge file\n", "\n", "Parameters are set up in the first cell so that we can run this notebook in batch mode. Example invocation command:\n", "\n", "```\n", "papermill Example-9-Find-Labels-Aliases-and-Descriptions-for-a-KGTK-edge-file.ipynb example9.out.ipynb \\\n", "-p wlf /Users/amandeep/Github/kgtk/data/all.label.en.tsv.gz \\\n", "-p waf /Users/amandeep/Github/kgtk/data/all.alias.en.tsv.gz \\\n", "-p wdf /Users/amandeep/Github/kgtk/data/all.description.en.tsv.gz \\\n", "-p inf /Users/amandeep/Github/kgtk/data/statements/valid_edges.tsv \\\n", "-p out_folder /Users/amandeep/Github/kgtk/data/statements/output \\\n", "-p out_file valid_lad.tsv \\\n", "-p delete_database no \\\n", "-p run_node1 yes \\\n", "-p run_property yes \\\n", "-p run_node2 yes\n", "```\n", "\n", "To print a help message and exit:\n", "\n", "```\n", "papermill --help-notebook Example-9-Find-Labels-Aliases-and-Descriptions-for-a-KGTK-edge-file.ipynb\n", "```" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [ "parameters" ] }, "outputs": [], "source": [ "# path to wiki labels file\n", "wlf:str='/Users/amandeep/Github/kgtk/data/all.label.en.tsv.gz'\n", "# path to wiki alias file\n", "waf:str='/Users/amandeep/Github/kgtk/data/all.alias.en.tsv.gz'\n", "# path to wiki description file\n", "wdf:str='/Users/amandeep/Github/kgtk/data/all.description.en.tsv.gz'\n", "# path to input kgtk edge file\n", "inf:str='/Users/amandeep/Github/kgtk/data/statements/valid_edges.tsv'\n", "# output folder path\n", "out_folder:str='/Users/amandeep/Github/kgtk/data/statements/output'\n", "# output file name\n", "out_file:str='valid_lad.tsv'\n", "# delete previous database\n", "delete_database:str = 'no'\n", "# find labels, aliases and descriptions for node1\n", "run_node1:str='yes'\n", "# find labels, aliases and descriptions for property\n", "run_property:str='yes'\n", "# find labels, aliases and descriptions for node2\n", "run_node2:str='yes'\n", " " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import io\n", "import os\n", "import subprocess\n", "import sys\n", "\n", "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setup Environment Variables" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "os.environ['INPUT_FILE'] = inf\n", "os.environ['INPUT_FILE_NAME'] = inf.split('/')[-1]\n", "\n", "os.environ['WIKI_LABEL'] = wlf\n", "os.environ['WIKI_LABEL_NAME'] = wlf.split('/')[-1]\n", "\n", "os.environ['WIKI_ALIAS'] = waf\n", "os.environ['WIKI_ALIAS_NAME'] = waf.split('/')[-1]\n", "\n", "os.environ['WIKI_DESCRIPTION'] = wdf\n", "os.environ['WIKI_DESCRIPTION_NAME'] = wdf.split('/')[-1]\n", "\n", "os.environ['OUT'] = out_folder\n", "os.environ['OUT_FILE'] = out_file\n", "os.environ['kgtk'] = \"time kgtk --debug\"\n", "\n", "os.environ['temp'] = f'{out_folder}/temp'\n", "os.environ['STORE'] = \"{}/wikidata.sqlite3.db\".format(os.environ['temp'])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# create temp folder to be deleted later\n", "# -p option creates the path if it does not exist\n", "!mkdir -p $temp" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if delete_database and delete_database.lower() == \"yes\":\n", " print(\"Deleted database\")\n", " !rm $STORE" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def format_command(cmd, substitution_dictionary = {}):\n", " \"\"\"Run a templetized command.\"\"\"\n", " for k, v in substitution_dictionary.items():\n", " cmd = cmd.replace(k, v)\n", " \n", " print(cmd)\n", " return cmd" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "time kgtk --debug\n", "/Users/amandeep/Github/kgtk/data/all.label.en.tsv.gz\n", "/Users/amandeep/Github/kgtk/data/statements/valid_edges.tsv\n", "valid_edges.tsv\n", "all.label.en.tsv.gz\n", "/Users/amandeep/Github/kgtk/data/statements/output/temp/wikidata.sqlite3.db\n" ] } ], "source": [ "!echo $kgtk\n", "!echo $WIKI_LABEL\n", "!echo $INPUT_FILE\n", "!echo $INPUT_FILE_NAME\n", "!echo $WIKI_LABEL_NAME\n", "!echo $STORE" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 1: find labels for `node1` in the file" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "$kgtk query -i $WIKI_LABEL -i $INPUT_FILE --graph-cache $STORE -o $temp/node1_labels.tsv --match '`valid_edges.tsv`: (n1)-[]->(), `all.label.en.tsv.gz`: (n1)-[l:label]->(n2)' --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'\n", "\n", "[2020-10-15 17:29:45 sqlstore]: IMPORT graph directly into table graph_1 from /Users/amandeep/Github/kgtk/data/all.label.en.tsv.gz ...\n", "[2020-10-15 17:38:43 sqlstore]: IMPORT graph directly into table graph_2 from /Users/amandeep/Github/kgtk/data/statements/valid_edges.tsv ...\n", "[2020-10-15 17:38:43 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_2_c1.\"node1\" \"node1\", graph_1_c2.\"label\", graph_1_c2.\"node2\"\n", " FROM graph_1 AS graph_1_c2, graph_2 AS graph_2_c1\n", " WHERE graph_1_c2.\"label\"=?\n", " AND graph_1_c2.\"node1\"=graph_2_c1.\"node1\"\n", " ORDER BY graph_2_c1.\"node1\" ASC, graph_1_c2.\"label\" ASC, graph_1_c2.\"node2\" ASC\n", " PARAS: ['label']\n", "---------------------------------------------\n", "[2020-10-15 17:38:43 sqlstore]: CREATE INDEX on table graph_1 column label ...\n", "[2020-10-15 17:39:41 sqlstore]: ANALYZE INDEX on table graph_1 column label ...\n", "[2020-10-15 17:39:46 sqlstore]: CREATE INDEX on table graph_1 column node1 ...\n", "[2020-10-15 17:40:31 sqlstore]: ANALYZE INDEX on table graph_1 column node1 ...\n", "[2020-10-15 17:40:36 sqlstore]: CREATE INDEX on table graph_2 column node1 ...\n", "[2020-10-15 17:40:36 sqlstore]: ANALYZE INDEX on table graph_2 column node1 ...\n", " 654.45 real 973.74 user 39.26 sys\n", "\n" ] } ], "source": [ "replace_dict = {\n", " 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),\n", " 'WIKI_LABEL_NAME': '`{}`'.format(os.environ['WIKI_LABEL_NAME'])\n", "}\n", "\n", "command = \"time kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE \\\n", " --graph-cache $STORE \\\n", " -o $temp/node1_labels.tsv \\\n", " --match 'INPUT_FILE_NAME: (n1)-[]->(), WIKI_LABEL_NAME: (n1)-[l:label]->(n2)' \\\n", " --where 'n2.kgtk_lqstring_lang_suffix = \\\"en\\\"' \\\n", " --return 'distinct n1 as node1, l.label, n2' \\\n", " --order-by 'n1, l.label, n2'\"\n", "cmd = format_command(command, substitution_dictionary=replace_dict)\n", "if run_node1 and run_node1.lower() == 'yes':\n", " !$cmd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 2: find aliases for `node1` in the file" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "time kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE -o $temp/node1_aliases.tsv --match '`valid_edges.tsv`: (n1)-[]->(), `all.alias.en.tsv.gz`: (n1)-[l:alias]->(n2)' --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'\n", "[2020-10-15 17:44:11 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_2_c1.\"node1\" \"node1\", graph_3_c2.\"label\", graph_3_c2.\"node2\"\n", " FROM graph_2 AS graph_2_c1, graph_3 AS graph_3_c2\n", " WHERE graph_3_c2.\"label\"=?\n", " AND graph_2_c1.\"node1\"=graph_3_c2.\"node1\"\n", " ORDER BY graph_2_c1.\"node1\" ASC, graph_3_c2.\"label\" ASC, graph_3_c2.\"node2\" ASC\n", " PARAS: ['alias']\n", "---------------------------------------------\n", "kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE -o $temp/node1_aliases.tsv 0.72s user 0.11s system 99% cpu 0.837 total\n" ] } ], "source": [ "replace_dict = {\n", " 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),\n", " 'WIKI_ALIAS_NAME': '`{}`'.format(os.environ['WIKI_ALIAS_NAME'])\n", "}\n", "command = \"time kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE \\\n", " --graph-cache $STORE \\\n", " -o $temp/node1_aliases.tsv \\\n", " --match 'INPUT_FILE_NAME: (n1)-[]->(), WIKI_ALIAS_NAME: (n1)-[l:alias]->(n2)' \\\n", " --where 'n2.kgtk_lqstring_lang_suffix = \\\"en\\\"' \\\n", " --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'\"\n", "cmd = format_command(command, substitution_dictionary=replace_dict)\n", "if run_node1 and run_node1.lower() == 'yes':\n", " !$cmd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 3: find descriptions for `node1` in the file" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "time kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE --graph-cache $STORE -o $temp/node1_descriptions.tsv --match '`valid_edges.tsv`: (n1)-[]->(), `all.description.en.tsv.gz`: (n1)-[l:description]->(n2)' --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'\n", "[2020-10-15 17:48:38 sqlstore]: IMPORT graph directly into table graph_3 from /Users/amandeep/Github/kgtk/data/all.description.en.tsv.gz ...\n", "[2020-10-15 18:00:45 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_2_c1.\"node1\" \"node1\", graph_3_c2.\"label\", graph_3_c2.\"node2\"\n", " FROM graph_2 AS graph_2_c1, graph_3 AS graph_3_c2\n", " WHERE graph_3_c2.\"label\"=?\n", " AND graph_2_c1.\"node1\"=graph_3_c2.\"node1\"\n", " ORDER BY graph_2_c1.\"node1\" ASC, graph_3_c2.\"label\" ASC, graph_3_c2.\"node2\" ASC\n", " PARAS: ['description']\n", "---------------------------------------------\n", "[2020-10-15 18:00:45 sqlstore]: CREATE INDEX on table graph_3 column label ...\n", "[2020-10-15 18:02:20 sqlstore]: ANALYZE INDEX on table graph_3 column label ...\n", "[2020-10-15 18:02:27 sqlstore]: CREATE INDEX on table graph_3 column node1 ...\n", "[2020-10-15 18:03:53 sqlstore]: ANALYZE INDEX on table graph_3 column node1 ...\n", "kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE --graph-cache $STORE - 1305.90s user 62.33s system 147% cpu 15:25.23 total\n" ] } ], "source": [ "replace_dict = {\n", " 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),\n", " 'WIKI_DESCRIPTION_NAME': '`{}`'.format(os.environ['WIKI_DESCRIPTION_NAME'])\n", "}\n", "\n", "command = \"time kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE \\\n", " --graph-cache $STORE \\\n", " -o $temp/node1_descriptions.tsv \\\n", " --match 'INPUT_FILE_NAME: (n1)-[]->(), WIKI_DESCRIPTION_NAME: (n1)-[l:description]->(n2)' \\\n", " --where 'n2.kgtk_lqstring_lang_suffix = \\\"en\\\"' \\\n", " --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'\"\n", "\n", "cmd = format_command(command, substitution_dictionary=replace_dict)\n", "if run_node1 and run_node1.lower() == 'yes':\n", " !$cmd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 4: find labels for `node2` in the file" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "time kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE --graph-cache $STORE -o $temp/node2_labels.tsv --match '`valid_edges.tsv`: ()-[]->(n1), `all.label.en.tsv.gz`: (n1)-[l:label]->(n2)' --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'\n", "[2020-10-15 18:04:41 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_2_c1.\"node2\" \"node1\", graph_1_c2.\"label\", graph_1_c2.\"node2\"\n", " FROM graph_1 AS graph_1_c2, graph_2 AS graph_2_c1\n", " WHERE graph_1_c2.\"label\"=?\n", " AND graph_1_c2.\"node1\"=graph_2_c1.\"node2\"\n", " ORDER BY graph_2_c1.\"node2\" ASC, graph_1_c2.\"label\" ASC, graph_1_c2.\"node2\" ASC\n", " PARAS: ['label']\n", "---------------------------------------------\n", "[2020-10-15 18:04:41 sqlstore]: CREATE INDEX on table graph_2 column node2 ...\n", "[2020-10-15 18:04:41 sqlstore]: ANALYZE INDEX on table graph_2 column node2 ...\n", "kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE --graph-cache $STORE -o 0.93s user 0.49s system 38% cpu 3.663 total\n" ] } ], "source": [ "replace_dict = {\n", " 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),\n", " 'WIKI_LABEL_NAME': '`{}`'.format(os.environ['WIKI_LABEL_NAME'])\n", "}\n", "\n", "command = \"time kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE \\\n", " --graph-cache $STORE \\\n", " -o $temp/node2_labels.tsv \\\n", " --match 'INPUT_FILE_NAME: ()-[]->(n1), WIKI_LABEL_NAME: (n1)-[l:label]->(n2)' \\\n", " --where 'n2.kgtk_lqstring_lang_suffix = \\\"en\\\"' \\\n", " --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'\"\n", "\n", "cmd = format_command(command, substitution_dictionary=replace_dict)\n", "\n", "if run_node2 and run_node2.lower() == 'yes':\n", " !$cmd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 5: find aliases for `node2` in the file" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "time kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE --graph-cache $STORE -o $temp/node2_aliases.tsv --match '`valid_edges.tsv`: ()-[]->(n1), `all.alias.en.tsv.gz`: (n1)-[l:alias]->(n2)' --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'\n", "[2020-10-15 18:05:22 sqlstore]: IMPORT graph directly into table graph_4 from /Users/amandeep/Github/kgtk/data/all.alias.en.tsv.gz ...\n", "[2020-10-15 18:05:55 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_2_c1.\"node2\" \"node1\", graph_4_c2.\"label\", graph_4_c2.\"node2\"\n", " FROM graph_2 AS graph_2_c1, graph_4 AS graph_4_c2\n", " WHERE graph_4_c2.\"label\"=?\n", " AND graph_2_c1.\"node2\"=graph_4_c2.\"node1\"\n", " ORDER BY graph_2_c1.\"node2\" ASC, graph_4_c2.\"label\" ASC, graph_4_c2.\"node2\" ASC\n", " PARAS: ['alias']\n", "---------------------------------------------\n", "[2020-10-15 18:05:55 sqlstore]: CREATE INDEX on table graph_4 column label ...\n", "[2020-10-15 18:05:58 sqlstore]: ANALYZE INDEX on table graph_4 column label ...\n", "[2020-10-15 18:05:59 sqlstore]: CREATE INDEX on table graph_4 column node1 ...\n", "[2020-10-15 18:06:02 sqlstore]: ANALYZE INDEX on table graph_4 column node1 ...\n", "kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE --graph-cache $STORE -o 64.51s user 2.26s system 158% cpu 42.042 total\n" ] } ], "source": [ "replace_dict = {\n", " 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),\n", " 'WIKI_ALIAS_NAME': '`{}`'.format(os.environ['WIKI_ALIAS_NAME'])\n", "}\n", "\n", "command = \"time kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE \\\n", " --graph-cache $STORE \\\n", " -o $temp/node2_aliases.tsv \\\n", " --match 'INPUT_FILE_NAME: ()-[]->(n1), WIKI_ALIAS_NAME: (n1)-[l:alias]->(n2)' \\\n", " --where 'n2.kgtk_lqstring_lang_suffix = \\\"en\\\"' \\\n", " --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'\"\n", "\n", "cmd = format_command(command, substitution_dictionary=replace_dict)\n", "if run_node2 and run_node2.lower() == 'yes':\n", " !$cmd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 6: find descriptions for `node2` in the file" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "time kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE --graph-cache $STORE -o $temp/node2_descriptions.tsv --match '`valid_edges.tsv`: ()-[]->(n1), `all.description.en.tsv.gz`: (n1)-[l:description]->(n2)' --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'\n", "[2020-10-15 18:06:36 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_2_c1.\"node2\" \"node1\", graph_3_c2.\"label\", graph_3_c2.\"node2\"\n", " FROM graph_2 AS graph_2_c1, graph_3 AS graph_3_c2\n", " WHERE graph_3_c2.\"label\"=?\n", " AND graph_2_c1.\"node2\"=graph_3_c2.\"node1\"\n", " ORDER BY graph_2_c1.\"node2\" ASC, graph_3_c2.\"label\" ASC, graph_3_c2.\"node2\" ASC\n", " PARAS: ['description']\n", "---------------------------------------------\n", "kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE --graph-cache $STORE - 0.75s user 0.26s system 59% cpu 1.708 total\n" ] } ], "source": [ "replace_dict = {\n", " 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),\n", " 'WIKI_DESCRIPTION_NAME': '`{}`'.format(os.environ['WIKI_DESCRIPTION_NAME'])\n", "}\n", "\n", "command = \"time kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE \\\n", " --graph-cache $STORE \\\n", " -o $temp/node2_descriptions.tsv \\\n", " --match 'INPUT_FILE_NAME: ()-[]->(n1), WIKI_DESCRIPTION_NAME: (n1)-[l:description]->(n2)' \\\n", " --where 'n2.kgtk_lqstring_lang_suffix = \\\"en\\\"' \\\n", " --return 'distinct n1 as node1, l.label, n2' --order-by 'n1, l.label, n2'\"\n", "\n", "cmd = format_command(command, substitution_dictionary=replace_dict)\n", "if run_node2 and run_node2.lower() == 'yes':\n", " !$cmd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 7: find labels for `property` in the file" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "time kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE --graph-cache $STORE -o $temp/property_labels.tsv --match '`valid_edges.tsv`: ()-[id {label: prop}]->(), `all.label.en.tsv.gz`: (prop)-[l:label]->(n2)' --return 'distinct prop as node1, l.label, n2' --order-by 'prop, l.label, n2'\n", "[2020-10-15 18:07:01 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_1_c2.\"node1\" \"node1\", graph_1_c2.\"label\", graph_1_c2.\"node2\"\n", " FROM graph_1 AS graph_1_c2, graph_2 AS graph_2_c1\n", " WHERE graph_1_c2.\"label\"=?\n", " AND graph_2_c1.\"label\"=graph_1_c2.\"node1\"\n", " AND graph_1_c2.\"node1\"=graph_2_c1.\"label\"\n", " ORDER BY graph_1_c2.\"node1\" ASC, graph_1_c2.\"label\" ASC, graph_1_c2.\"node2\" ASC\n", " PARAS: ['label']\n", "---------------------------------------------\n", "[2020-10-15 18:07:01 sqlstore]: CREATE INDEX on table graph_2 column label ...\n", "[2020-10-15 18:07:01 sqlstore]: ANALYZE INDEX on table graph_2 column label ...\n", "kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE --graph-cache $STORE -o 0.68s user 0.09s system 90% cpu 0.840 total\n" ] } ], "source": [ "replace_dict = {\n", " 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),\n", " 'WIKI_LABEL_NAME': '`{}`'.format(os.environ['WIKI_LABEL_NAME'])\n", "}\n", "\n", "command = \"time kgtk --debug query -i $WIKI_LABEL -i $INPUT_FILE \\\n", " --graph-cache $STORE \\\n", " -o $temp/property_labels.tsv \\\n", " --match 'INPUT_FILE_NAME: ()-[id {label: prop}]->(), WIKI_LABEL_NAME: (prop)-[l:label]->(n2)' \\\n", " --where 'n2.kgtk_lqstring_lang_suffix = \\\"en\\\"' \\\n", " --return 'distinct prop as node1, l.label, n2' \\\n", " --order-by 'prop, l.label, n2'\"\n", "\n", "cmd = format_command(command, substitution_dictionary=replace_dict)\n", "if run_property and run_property.lower() == 'yes':\n", " !$cmd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 8: find aliases for `property` in the file\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "time kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE --graph-cache $STORE -o $temp/property_aliases.tsv --match '`valid_edges.tsv`: ()-[id {label: prop}]->(), `all.alias.en.tsv.gz`: (prop)-[l:alias]->(n2)' --return 'distinct prop as node1, l.label, n2' --order-by 'prop, l.label, n2'\n", "[2020-10-15 18:07:33 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_4_c2.\"node1\" \"node1\", graph_4_c2.\"label\", graph_4_c2.\"node2\"\n", " FROM graph_2 AS graph_2_c1, graph_4 AS graph_4_c2\n", " WHERE graph_2_c1.\"label\"=graph_4_c2.\"node1\"\n", " AND graph_4_c2.\"label\"=?\n", " AND graph_2_c1.\"label\"=graph_4_c2.\"node1\"\n", " ORDER BY graph_4_c2.\"node1\" ASC, graph_4_c2.\"label\" ASC, graph_4_c2.\"node2\" ASC\n", " PARAS: ['alias']\n", "---------------------------------------------\n", "kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE --graph-cache $STORE -o 1.04s user 0.09s system 99% cpu 1.145 total\n" ] } ], "source": [ "replace_dict = {\n", " 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),\n", " 'WIKI_ALIAS_NAME': '`{}`'.format(os.environ['WIKI_ALIAS_NAME'])\n", "}\n", "\n", "command = \"time kgtk --debug query -i $WIKI_ALIAS -i $INPUT_FILE \\\n", " --graph-cache $STORE \\\n", " -o $temp/property_aliases.tsv \\\n", " --match 'INPUT_FILE_NAME: ()-[id {label: prop}]->(), WIKI_ALIAS_NAME: (prop)-[l:alias]->(n2)' \\\n", " --where 'n2.kgtk_lqstring_lang_suffix = \\\"en\\\"' \\\n", " --return 'distinct prop as node1, l.label, n2' \\\n", " --order-by 'prop, l.label, n2'\"\n", "cmd = format_command(command, substitution_dictionary=replace_dict)\n", "if run_property and run_property.lower() == 'yes':\n", " !$cmd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 9: find descriptions for `property` in the file" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "time kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE --graph-cache $STORE -o $temp/property_descriptions.tsv --match '`valid_edges.tsv`: ()-[id {label: prop}]->(), `all.description.en.tsv.gz`: (prop)-[l:description]->(n2)' --return 'distinct prop as node1, l.label, n2' --order-by 'prop, l.label, n2'\n", "[2020-10-15 18:07:51 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT DISTINCT graph_2_c1.\"label\" \"node1\", graph_3_c2.\"label\", graph_3_c2.\"node2\"\n", " FROM graph_2 AS graph_2_c1, graph_3 AS graph_3_c2\n", " WHERE graph_2_c1.\"label\"=graph_2_c1.\"label\"\n", " AND graph_3_c2.\"label\"=?\n", " AND graph_2_c1.\"label\"=graph_3_c2.\"node1\"\n", " ORDER BY graph_2_c1.\"label\" ASC, graph_3_c2.\"label\" ASC, graph_3_c2.\"node2\" ASC\n", " PARAS: ['description']\n", "---------------------------------------------\n", "kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE --graph-cache $STORE - 0.66s user 0.09s system 93% cpu 0.802 total\n" ] } ], "source": [ "replace_dict = {\n", " 'INPUT_FILE_NAME': '`{}`'.format(os.environ['INPUT_FILE_NAME']),\n", " 'WIKI_DESCRIPTION_NAME': '`{}`'.format(os.environ['WIKI_DESCRIPTION_NAME'])\n", "}\n", "\n", "command = \"time kgtk --debug query -i $WIKI_DESCRIPTION -i $INPUT_FILE \\\n", " --graph-cache $STORE \\\n", " -o $temp/property_descriptions.tsv \\\n", " --match 'INPUT_FILE_NAME: ()-[id {label: prop}]->(), WIKI_DESCRIPTION_NAME: (prop)-[l:description]->(n2)' \\\n", " --where 'n2.kgtk_lqstring_lang_suffix = \\\"en\\\"' \\\n", " --return 'distinct prop as node1, l.label, n2' \\\n", " --order-by 'prop, l.label, n2'\"\n", "\n", "cmd = format_command(command, substitution_dictionary=replace_dict)\n", "if run_property and run_property.lower() == 'yes':\n", " !$cmd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 10: cat all the files into one file" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "file_list = []\n", "if run_node1 and run_node1.lower() == 'yes':\n", " file_list.append('{}/node1_labels.tsv'.format(os.environ['temp']))\n", " file_list.append('{}/node1_aliases.tsv'.format(os.environ['temp']))\n", " file_list.append('{}/node1_descriptions.tsv'.format(os.environ['temp']))\n", "\n", "if run_node2 and run_node2.lower() == 'yes':\n", " file_list.append('{}/node2_labels.tsv'.format(os.environ['temp']))\n", " file_list.append('{}/node2_aliases.tsv'.format(os.environ['temp']))\n", " file_list.append('{}/node2_descriptions.tsv'.format(os.environ['temp']))\n", "\n", "if run_property and run_property.lower() == 'yes':\n", " file_list.append('{}/property_labels.tsv'.format(os.environ['temp']))\n", " file_list.append('{}/property_aliases.tsv'.format(os.environ['temp']))\n", " file_list.append('{}/property_descriptions.tsv'.format(os.environ['temp']))\n", "\n", "f_list = ' '.join(file_list).strip()\n", "print(f_list)\n", "kgtk_cat_command = 'kgtk cat -i {} -o $temp/all_labels_aliases_descriptions_duplicates.tsv'.format(f_list)\n", "\n", "!$kgtk_cat_command" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 11: sort and compact " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!kgtk sort2 -i $temp/all_labels_aliases_descriptions_duplicates.tsv \\\n", " -o $temp/all_labels_aliases_descriptions_sorted.tsv\n", "\n", "!kgtk compact -i $temp/all_labels_aliases_descriptions_sorted.tsv \\\n", " -o $temp/all_labels_aliases_descriptions_sorted_dedup.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Step 12: add ids" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "!kgtk add-id -i $temp/all_labels_aliases_descriptions_sorted_dedup.tsv \\\n", " -o $OUT/$OUT_FILE \\\n", " --id-style node1-label-num" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { "display_name": "kgtk_env", "language": "python", "name": "kgtk_env" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 4 }