{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Generate KGTK Person Abbreviations file\n", "This notebook relates to [KGTK Issue 260](https://github.com/usc-isi-i2/kgtk/issues/260)\n", "\n", "Example command to run the notebook using papermill:\n", "\n", "```papermill abbreviate_human_labels.ipynb abbr_output.ipynb -p data_folder /Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/kgtk_edge_files \\\n", " -p wikidata_item_filename claims.wikibase-item.tsv.gz \\\n", " -p wikidata_label_filename labels.en.tsv.gz \\\n", " -p wikidata_alias_filename aliases.en.tsv.gz```" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "tags": [ "parameters" ] }, "outputs": [], "source": [ "# Parameters\n", "data_folder = '/data/amandeep/wikidata-20210215-dwd'\n", "output_folder = 'derived_files_for_es'\n", "wikidata_item_filename = 'claims.wikibase-item.tsv.gz'\n", "wikidata_label_filename = 'labels.en.tsv.gz'\n", "wikidata_alias_filename = 'aliases.en.tsv.gz'\n", "output_file_name = 'derived.Q5.abbreviations.tsv.gz'" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import gzip\n", "import shutil\n", "from collections import defaultdict\n", "import json\n", "import ast\n", "import time" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "os.environ['FILE'] = data_folder\n", "os.environ['OUT'] = f'{data_folder}/{output_folder}'\n", "os.environ['GRAPH_CACHE'] = '/data/amandeep/temp.wikidata-20210215-dwd/wikidata.sqlite3.db'\n", "os.environ['CLAIMS'] = f'{data_folder}/parts/{wikidata_item_filename}'\n", "os.environ['LABELS'] = f'{data_folder}/{wikidata_label_filename}'\n", "os.environ['ALIASES'] = f'{data_folder}/{wikidata_alias_filename}'\n", "os.environ['OUTPUT_FILE'] = output_file_name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Filter Human Labels" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/data/amandeep/wikidata-20210215-dwd/parts/claims.wikibase-item.tsv.gz\n", "/data/amandeep/wikidata-20210215-dwd/labels.en.tsv.gz\n", "/data/amandeep/wikidata-20210215-dwd/derived_files_for_es/human_label_edge.tsv.gz\n", "/data/amandeep/temp.wikidata-20210215-dwd/wikidata.sqlite3.db\n" ] } ], "source": [ "!echo {wikibase_item_file} \n", "!echo {label_file}\n", "!echo {human_label_output}\n", "!echo $GRAPH_CACHE" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-03-10 15:24:48 sqlstore]: IMPORT graph directly into table graph_6 from /data/amandeep/wikidata-20210215-dwd/parts/claims.wikibase-item.tsv.gz ...\n", "[2021-03-10 15:30:33 sqlstore]: IMPORT graph directly into table graph_7 from /data/amandeep/wikidata-20210215-dwd/labels.en.tsv.gz ...\n", "[2021-03-10 15:31:22 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT graph_6_c1.\"node1\", graph_7_c2.\"label\", graph_7_c2.\"node2\"\n", " FROM graph_6 AS graph_6_c1, graph_7 AS graph_7_c2\n", " WHERE graph_6_c1.\"label\"=?\n", " AND graph_6_c1.\"node2\"=?\n", " AND graph_6_c1.\"node1\"=graph_7_c2.\"node1\"\n", " PARAS: ['P31', 'Q5']\n", "---------------------------------------------\n", "[2021-03-10 15:31:22 sqlstore]: CREATE INDEX on table graph_6 column node2 ...\n", "[2021-03-10 15:34:57 sqlstore]: ANALYZE INDEX on table graph_6 column node2 ...\n", "[2021-03-10 15:35:08 sqlstore]: CREATE INDEX on table graph_6 column node1 ...\n", "[2021-03-10 15:37:09 sqlstore]: ANALYZE INDEX on table graph_6 column node1 ...\n", "[2021-03-10 15:37:21 sqlstore]: CREATE INDEX on table graph_6 column label ...\n", "[2021-03-10 15:39:54 sqlstore]: ANALYZE INDEX on table graph_6 column label ...\n", "[2021-03-10 15:40:04 sqlstore]: CREATE INDEX on table graph_7 column node1 ...\n", "[2021-03-10 15:40:23 sqlstore]: ANALYZE INDEX on table graph_7 column node1 ...\n" ] } ], "source": [ "!kgtk --debug query --graph-cache \"$GRAPH_CACHE\" \\\n", "-i \"$CLAIMS\" \\\n", "-i \"$LABELS\" \\\n", "-o \"$OUT/human_label_edge.tsv.gz\" \\\n", "--match 'claims: (n1)-[:P31]->(:Q5), labels: (n1)-[l]->(n2)' \\\n", "--return 'n1, l.label, n2'" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\r\n", "Q10000001\tlabel\t'Tatyana Kolotilshchikova'@en\r\n", "Q1000002\tlabel\t'Claus Hammel'@en\r\n", "Q1000005\tlabel\t'Karel Matěj Čapek-Chod'@en\r\n", "Q1000006\tlabel\t'Florian Eichinger'@en\r\n", "Q100000811\tlabel\t'Paul Mohr'@en\r\n", "Q100000814\tlabel\t'Walther Reichardt'@en\r\n", "Q100000817\tlabel\t'Michael Flemisch'@en\r\n", "Q100000831\tlabel\t'Otto Kramer'@en\r\n", "Q100000832\tlabel\t'Maria Magdalena Lebreton'@en\r\n", "\r\n", "gzip: stdout: Broken pipe\r\n" ] } ], "source": [ "!zcat \"$OUT/human_label_edge.tsv.gz\" | head" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-03-10 15:48:15 sqlstore]: IMPORT graph directly into table graph_8 from /data/amandeep/wikidata-20210215-dwd/aliases.en.tsv.gz ...\n", "[2021-03-10 15:48:25 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT graph_8_c2.\"node1\", graph_8_c2.\"label\", graph_8_c2.\"node2\"\n", " FROM graph_6 AS graph_6_c1, graph_8 AS graph_8_c2\n", " WHERE graph_6_c1.\"label\"=?\n", " AND graph_6_c1.\"node2\"=?\n", " AND graph_6_c1.\"node1\"=graph_8_c2.\"node1\"\n", " PARAS: ['P31', 'Q5']\n", "---------------------------------------------\n", "[2021-03-10 15:48:25 sqlstore]: CREATE INDEX on table graph_8 column node1 ...\n", "[2021-03-10 15:48:27 sqlstore]: ANALYZE INDEX on table graph_8 column node1 ...\n" ] } ], "source": [ "!kgtk --debug query --graph-cache \"$GRAPH_CACHE\" \\\n", "-i \"$CLAIMS\" \\\n", "-i \"$ALIASES\" \\\n", "-o \"$OUT/human_alias_edge.tsv.gz\" \\\n", "--match 'claims: (n1)-[:P31]->(:Q5), aliases: (n1)-[l]->(n2)' \\\n", "--return 'n1, l.label, n2'" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\r\n", "Q10000001\talias\t'Tatyana Serafimovna Kolotilshchikova'@en\r\n", "Q1000005\talias\t'Karel Matej Capek-Chod'@en\r\n", "Q100000832\talias\t'Marie Madeleine Lebreton'@en\r\n", "Q100000832\talias\t'M. M. Lebreton'@en\r\n", "Q100001260\talias\t'Hendrikus Johannes Rigters'@en\r\n", "Q1000023\talias\t'Emmi Agathe Karola Margarete Wiltraut Rupp-von Brünneck'@en\r\n", "Q1000051\talias\t'Joseph Christopher O\\'Mahoney'@en\r\n", "Q1000051\talias\t'J. Christopher O\\'Mahoney'@en\r\n", "Q1000051\talias\t'Joseph O\\'Mahoney'@en\r\n", "\r\n", "gzip: stdout: Broken pipe\r\n" ] } ], "source": [ "!zcat \"$OUT/human_alias_edge.tsv.gz\" | head" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concat the human label and alias file" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "!kgtk cat -i \"$OUT/human_label_edge.tsv.gz\" \"$OUT/human_alias_edge.tsv.gz\" \\\n", "/ sort -c node1 -o \"$OUT/human_label_alias.tsv.gz\"\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Functions to generate Abbreviations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Algorithm for generating the person abbreviations:\n", "Abbreviations for label property:\n", " * If the label has 2 words, abbreviate the first word. \n", " * If label has more than two words(eg. Michael Jeffrey Jordan), then:
\n", " a) Generate Abbreviation for all words leading upto the end word. (eg. M. J. Jordan)
\n", " b) Generate Abbreviations for the middle words. (eg. Michael J. Jordan).
\n", " c) If the generated abbreviations are present in the alias, then leave them.
\n", "\t\n", "Abbreviations for alias property:\n", "\n", "Alias may have new words other than the words present in the label. For new words present at the start and end leave them as it is. Generate abbreviations for new words in the middle." ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "def generate_abbreviations(name_split,word_index):\n", " '''\n", " Helper function to generate the abbreviation.\n", " Input: name_split: List of the words in a name\n", " Output: Abbreviated Name\n", " '''\n", " abbr_label = ''\n", " if word_index is None:\n", " for word in name_split[:-1]:\n", " abbr_label += word[0].upper() + '.' + ' '\n", " abbr_label += name_split[-1]\n", " if len(name_split) >= 2:\n", " abbr_label_end = name_split[-1] + ',' + ' '\n", " for word in name_split[:-1]:\n", " abbr_label_end += word[0].upper() + '.' + ' '\n", " \n", " return abbr_label, abbr_label_end.strip()\n", " \n", " return abbr_label, None\n", " else:\n", " for i in range(len(name_split) - 1):\n", " if i != word_index:\n", " abbr_label += name_split[i] + ' '\n", " else:\n", " abbr_label += name_split[i][0].upper() + '.' + ' '\n", " abbr_label += name_split[-1]\n", " return abbr_label" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "def abbreviate_human_labels(human_label_file,output_file):\n", " '''\n", " Traverses the concatenated human labels and aliases, creates the abbreviations for the labels and aliases\n", " '''\n", " with gzip.open(human_label_file,'rt') as file:\n", " prev = None\n", " lines_to_write = list()\n", "\n", " first_line = file.readline().replace('\\n','').replace('\\r','')\n", " columns = first_line.split('\\t')\n", " prop_index = columns.index('label')\n", " node1_index = columns.index(\"node1\")\n", "# id_index = columns.index(\"id\")\n", " node2_index = columns.index(\"node2\")\n", " flag = False\n", " st = time.time()\n", " for i,line in enumerate(file):\n", " if i%1000000 == 0:\n", " print(f\"DONE {i} rows!\",) \n", "# print(\"Time taken for {} is {}\".format(i,time.time() - st))\n", "# print(\"Previous Qnode is:\",prev)\n", " vals = line.split('\\t')\n", " prop_label = vals[prop_index]\n", " node1 = vals[node1_index]\n", " id_val = 'id'\n", " node2 = vals[node2_index]\n", " if node1.startswith('Q'):\n", " if prev is None:\n", " prev = node1\n", " abbr_dict = defaultdict(set)\n", " alias_dict = defaultdict(set)\n", " label_dict = defaultdict(list)\n", " \n", " if not prev.strip() == node1.strip():\n", " if len(label_dict[prev]) == 0:\n", " prev = node1\n", " continue\n", " node_label_list = label_dict[prev][0].split()\n", " abbr_str, abbr_str_end = generate_abbreviations(node_label_list,None)\n", " abbr_dict[prev].add(abbr_str)\n", " if abbr_str_end is not None:\n", " abbr_dict[prev].add(abbr_str_end)\n", " if len(node_label_list) > 2:\n", " for i in range(1,len(node_label_list) - 1):\n", " abbr_str = generate_abbreviations(node_label_list,i)\n", " abbr_dict[prev].add(abbr_str)\n", "\n", " #alias\n", " if prev in alias_dict:\n", " for alias in alias_dict[prev]:\n", " node_alias_split = alias.split()\n", " #check if first and last word of label and alias are the same. Generate abbreviation \n", " #for new middle words\n", " if node_alias_split[0] == node_label_list[0]:\n", " abbr_str, abbr_str_end = generate_abbreviations(node_alias_split,None)\n", " abbr_dict[prev].add(abbr_str)\n", " if abbr_str_end is not None:\n", " abbr_dict[prev].add(abbr_str_end)\n", " \n", " if len(node_alias_split) > 2:\n", " for i in range(1,len(node_alias_split) - 1):\n", " abbr_str = generate_abbreviations(node_alias_split,i)\n", " abbr_dict[prev].add(abbr_str)\n", " continue\n", "\n", " if node_alias_split[0] != node_label_list[0]:\n", " if len(node_alias_split) > 2:\n", " for i in range(1,len(node_alias_split) - 1):\n", " abbr_str = generate_abbreviations(node_alias_split,i)\n", " abbr_dict[prev].add(abbr_str)\n", "\n", "\n", "\n", " #unique abbreviation edges to write\n", " for lab in abbr_dict[prev]:\n", " if prev in alias_dict:\n", " if lab in alias_dict[prev]:\n", " continue\n", " lines_to_write.append(prev + '\\t' + 'abbreviated_name' + '\\t' + \"\\'\" + lab + \"\\'\" +'@en')\n", " prev = node1\n", " \n", " if prev.strip() == node1.strip():\n", " if prop_label == 'alias':\n", " alias_dict[node1].add(ast.literal_eval(node2.split('@en')[0]))\n", " \n", " if prop_label == 'label':\n", " label_dict[node1].append(ast.literal_eval(node2.split('@en')[0]))\n", " \n", " \n", " if len(lines_to_write) > 100000:\n", " with gzip.open(output_file,'a') as writer:\n", " if flag == False:\n", " header = first_line + '\\n'\n", " writer.write(header.encode('utf8'))\n", " flag = True\n", " \n", " writer.write('\\n'.join(lines_to_write).encode('utf8'))\n", " writer.write('\\n'.encode('utf8')) \n", " lines_to_write = list()\n", " \n", " for lab in abbr_dict[prev]:\n", " lines_to_write.append(prev + '\\t' + 'abbreviated_name' + '\\t' + \"\\'\" + lab + \"\\'\" +'@en')\n", " #print(lines_to_write) \n", " if len(lines_to_write) > 0:\n", " #print(lines_to_write)\n", " with gzip.open(output_file,'a') as writer:\n", " if flag == False:\n", " header = first_line + '\\n'\n", " writer.write(header.encode('utf8'))\n", " flag = True\n", " writer.write('\\n'.join(lines_to_write).encode('utf8'))\n", " writer.write('\\n'.encode('utf8')) " ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DONE 0 rows!\n", "DONE 1000000 rows!\n", "DONE 2000000 rows!\n", "DONE 3000000 rows!\n", "DONE 4000000 rows!\n", "DONE 5000000 rows!\n", "DONE 6000000 rows!\n", "DONE 7000000 rows!\n", "DONE 8000000 rows!\n", "DONE 9000000 rows!\n" ] } ], "source": [ "abbreviate_human_labels(f'{data_folder}/{output_folder}/human_label_alias.tsv.gz', \n", " f'{data_folder}/{output_folder}/derived.Q5.abbreviations.1.tsv.gz')" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "!kgtk add-id -i \"$OUT/derived.Q5.abbreviations.1.tsv.gz\" \\\n", "--id-style wikidata \\\n", "-o \"$OUT/$OUTPUT_FILE\"" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tid\r\n", "Q10000001\tabbreviated_name\t'T. Kolotilshchikova'@en\tQ10000001-abbreviated_name-5d05f1\r\n", "Q10000001\tabbreviated_name\t'Kolotilshchikova, T.'@en\tQ10000001-abbreviated_name-ee909d\r\n", "Q10000001\tabbreviated_name\t'Tatyana S. Kolotilshchikova'@en\tQ10000001-abbreviated_name-5106b9\r\n", "Q10000001\tabbreviated_name\t'T. S. Kolotilshchikova'@en\tQ10000001-abbreviated_name-021bee\r\n", "Q10000001\tabbreviated_name\t'Kolotilshchikova, T. S.'@en\tQ10000001-abbreviated_name-5d9384\r\n", "Q1000002\tabbreviated_name\t'C. Hammel'@en\tQ1000002-abbreviated_name-90b023\r\n", "Q1000002\tabbreviated_name\t'Hammel, C.'@en\tQ1000002-abbreviated_name-2ab003\r\n", "Q1000005\tabbreviated_name\t'K. M. Čapek-Chod'@en\tQ1000005-abbreviated_name-6123f0\r\n", "Q1000005\tabbreviated_name\t'Čapek-Chod, K. M.'@en\tQ1000005-abbreviated_name-9781fd\r\n", "\r\n", "gzip: stdout: Broken pipe\r\n" ] } ], "source": [ "!zcat \"$OUT/$OUTPUT_FILE\" | head" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### CleanUp Temporary files" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "!rm \"$OUT/human_label_edge.tsv.gz\"\n", "!rm \"$OUT/human_alias_edge.tsv.gz\"\n", "!rm \"$OUT/human_label_alias.tsv.gz\"\n", "!rm \"$OUT/derived.Q5.abbreviations.1.tsv.gz\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.8" } }, "nbformat": 4, "nbformat_minor": 4 }