{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Generate WikiTable Anchors\n", "\n", "This notebook relates to [KGTK Issue# 264](https://github.com/usc-isi-i2/kgtk/issues/264)\n", "\n", "The Wikitable data is present [here](https://drive.google.com/drive/folders/1dvHwiKt_YbAEIThSZRhu2-dU1ISzy8rW?usp=sharing). The data is present in the ```step_1``` folder in the above link\n", "\n", "Example Command to run using papermill:\n", "```\n", "papermill generate_wikitable_anchors.ipynb gen_anchor_output.ipynb -p file_dir /Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/KGTK_issue_264/ \\\n", " -p kgtk_files_dir /Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/kgtk_edge_files/ \\\n", " -p sitelinks_filename sitelinks.en.tsv.gz```" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [ "parameters" ] }, "outputs": [], "source": [ "# Parameters\n", "# file_dir: Path of the step_1 folder which has the WikiTable corpus\n", "file_dir = '/Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/KGTK_issue_264/'\n", "\n", "#kgtk_files_dir: Path where the KGTK files are present(specifically the sitelinks.en.tsv.gz)\n", "kgtk_files_dir = '/Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/kgtk_edge_files/'\n", "\n", "#sitelinks_filename\n", "sitelinks_filename = 'sitelinks.en.tsv.gz'\n", "\n", "#Is the Wikitable Corpus processed\n", "wikitable_processed = False\n", "\n", "# processed wikitable filename. I keep the default name as augmentation.wikipedia.tables.anchors.meta.tsv.gz\n", "wikitable_processed_filename = 'augmentation.wikipedia.tables.anchors.meta.tsv.gz'" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import json\n", "import gzip\n", "import glob\n", "import os\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#Initialize variables\n", "wikitable_corpus_dir = os.path.join(file_dir, 'step_1')\n", "sitelinks_file = os.path.join(kgtk_files_dir, sitelinks_filename)\n", "temp_wiki_anchor = os.path.join(kgtk_files_dir, 'augmentation.wikipedia.tables.anchors.meta.tsv.gz')\n", "output_wikitable_anchor = os.path.join(kgtk_files_dir,'augmentation.wikipedia.tables.anchors.temp3.tsv.gz')\n", "sorted_wikitable_anchor = os.path.join(kgtk_files_dir,'augmentation.wikipedia.tables.anchors.sorted3.tsv')\n", "unique_anchor_edges = os.path.join(kgtk_files_dir,'augmentation.wikipedia.tables.anchors.unique.tsv')\n", "final_output = os.path.join(kgtk_files_dir,'augmentation.wikipedia.tables.anchors.tsv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Process WikiTable Corpus" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def gen_wikianchors(wiki_corpus, temp_file):\n", " flag = False\n", " lines_to_write = []\n", " #print(wiki_corpus)\n", " files = glob.glob(wiki_corpus+'/*.gz')\n", " #print(files)\n", " for file in files:\n", " with gzip.open(file,'rt') as reader:\n", " f = reader.readlines()\n", "\n", " for text in f:\n", " json_obj = json.loads(text)\n", "\n", " for anchor in json_obj['rows']:\n", " for anc in anchor['cells']:\n", " val = anc['value']\n", " if len(anc['links']) > 0:\n", " for link in anc['links']:\n", " href = 'http://en.wikipedia.org' + link['href']\n", " anchor_text = val[int(link['start']):int(link['end'])]\n", " if len(anchor_text.strip()) > 0:\n", " lines_to_write.append(href + '\\t' + 'wikipedia_table_anchor' + '\\t' + \"\\'\" + anchor_text + \"\\'\" + '@en')\n", "\n", " if len(lines_to_write) > 100000:\n", " with gzip.open(temp_file,'a') as writer:\n", " if flag == False:\n", " header = 'node1'+ '\\t'+'label' +'\\t'+'node2' + '\\n'\n", " writer.write(header.encode('utf8'))\n", " flag = True\n", "\n", " writer.write('\\n'.join(lines_to_write).encode('utf8'))\n", " lines_to_write = list()\n", "\n", " if len(lines_to_write) > 0:\n", " #print(lines_to_write)\n", " with gzip.open(temp_file,'a') as writer:\n", " if flag == False:\n", " header = 'node1'+ '\\t'+'label' +'\\t'+'node2' + '\\n'\n", " writer.write(header.encode('utf8'))\n", " flag = True\n", " writer.write('\\n'.join(lines_to_write).encode('utf8'))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "if not(wikitable_processed):\n", " \n", " gen_wikianchors(wikitable_corpus_dir, temp_wiki_anchor)\n", "else:\n", " temp_wiki_anchor = os.path.join(kgtk_files_dir, wikitable_processed_filename)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2
0http://en.wikipedia.orghttps://commons.wikimed...wikipedia_table_anchor'Salies-du-Salat'@en
1http://en.wikipedia.org/wiki/El_Pasowikipedia_table_anchor'El Paso'@en
2http://en.wikipedia.org/wiki/Texaswikipedia_table_anchor'TX'@en
3http://en.wikipedia.org/wiki/El_Paso_County_Co...wikipedia_table_anchor'El Paso County Coliseum'@en
4http://en.wikipedia.org/wiki/Brad_Parscalewikipedia_table_anchor'Brad Parscale'@en
5http://en.wikipedia.org/wiki/John_Cornynwikipedia_table_anchor'John Cornyn'@en
6http://en.wikipedia.org/wiki/Lance_Berkmanwikipedia_table_anchor'Lance Berkman'@en
7http://en.wikipedia.org/wiki/Ted_Cruzwikipedia_table_anchor'Ted Cruz'@en
8http://en.wikipedia.org/wiki/Donald_Trump_Jr.wikipedia_table_anchor'Donald Trump Jr.'@en
9http://en.wikipedia.org/wiki/Grand_Rapidswikipedia_table_anchor'Grand Rapids'@en
\n", "
" ], "text/plain": [ " node1 label \\\n", "0 http://en.wikipedia.orghttps://commons.wikimed... wikipedia_table_anchor \n", "1 http://en.wikipedia.org/wiki/El_Paso wikipedia_table_anchor \n", "2 http://en.wikipedia.org/wiki/Texas wikipedia_table_anchor \n", "3 http://en.wikipedia.org/wiki/El_Paso_County_Co... wikipedia_table_anchor \n", "4 http://en.wikipedia.org/wiki/Brad_Parscale wikipedia_table_anchor \n", "5 http://en.wikipedia.org/wiki/John_Cornyn wikipedia_table_anchor \n", "6 http://en.wikipedia.org/wiki/Lance_Berkman wikipedia_table_anchor \n", "7 http://en.wikipedia.org/wiki/Ted_Cruz wikipedia_table_anchor \n", "8 http://en.wikipedia.org/wiki/Donald_Trump_Jr. wikipedia_table_anchor \n", "9 http://en.wikipedia.org/wiki/Grand_Rapids wikipedia_table_anchor \n", "\n", " node2 \n", "0 'Salies-du-Salat'@en \n", "1 'El Paso'@en \n", "2 'TX'@en \n", "3 'El Paso County Coliseum'@en \n", "4 'Brad Parscale'@en \n", "5 'John Cornyn'@en \n", "6 'Lance Berkman'@en \n", "7 'Ted Cruz'@en \n", "8 'Donald Trump Jr.'@en \n", "9 'Grand Rapids'@en " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(temp_wiki_anchor,sep = '\\t', nrows = 10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Kypher Query \n", "\n", "Join the sitelinks file with the processed WikiTable Corpus file" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shell 0.00s user 0.00s system 11% cpu 0.004 total\r\n", "children 0.00s user 0.00s system 0% cpu 0.004 total\r\n" ] } ], "source": [ "!time | kgtk query -i $sitelinks_file -i $temp_wiki_anchor -o $output_wikitable_anchor \\\n", " --match 'g: (x)-[r:wikipedia_sitelink]->(y), w: (y)-[t:wikipedia_table_anchor]->(c)' \\\n", " --return 'r, x, t.label, c as node2'" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2
0Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
1Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'universe'@en
2Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'universe'@en
3Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'universe'@en
4Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'universe'@en
5Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'universe'@en
6Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'universe'@en
7Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
8Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
9Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
\n", "
" ], "text/plain": [ " id node1 label node2\n", "0 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n", "1 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'universe'@en\n", "2 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'universe'@en\n", "3 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'universe'@en\n", "4 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'universe'@en\n", "5 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'universe'@en\n", "6 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'universe'@en\n", "7 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n", "8 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n", "9 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(output_wikitable_anchor,sep = '\\t', nrows = 10)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "!kgtk sort2 -i $output_wikitable_anchor -c node1 -o $sorted_wikitable_anchor" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2
0Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
1Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
2Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
3Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
4Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
5Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
6Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
7Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
8Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
9Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
\n", "
" ], "text/plain": [ " id node1 label node2\n", "0 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n", "1 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n", "2 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n", "3 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n", "4 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n", "5 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n", "6 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n", "7 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n", "8 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n", "9 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(sorted_wikitable_anchor,sep = '\\t',nrows = 10)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "!uniq $sorted_wikitable_anchor > $unique_anchor_edges" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!kgtk add-id -i $unique_anchor_edges --id-style wikidata --overwrite True -o $final_output" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "!gzip $final_output" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1labelnode2
0Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'Universe'@en
1Q1-wikipedia_sitelink-5e459a-0Q1wikipedia_table_anchor'universe'@en
2Q100-wikipedia_sitelink-c612f2-0Q100wikipedia_table_anchor''@en
3Q100-wikipedia_sitelink-c612f2-0Q100wikipedia_table_anchor'American namesake'@en
4Q100-wikipedia_sitelink-c612f2-0Q100wikipedia_table_anchor'Boston (MA)'@en
5Q100-wikipedia_sitelink-c612f2-0Q100wikipedia_table_anchor'Boston Revolution'@en
6Q100-wikipedia_sitelink-c612f2-0Q100wikipedia_table_anchor'Boston Round Robin'@en
7Q100-wikipedia_sitelink-c612f2-0Q100wikipedia_table_anchor'Boston'@en
8Q100-wikipedia_sitelink-c612f2-0Q100wikipedia_table_anchor'Boston, MA'@en
9Q100-wikipedia_sitelink-c612f2-0Q100wikipedia_table_anchor'Boston, MA, USA'@en
\n", "
" ], "text/plain": [ " id node1 label \\\n", "0 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor \n", "1 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor \n", "2 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n", "3 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n", "4 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n", "5 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n", "6 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n", "7 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n", "8 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n", "9 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n", "\n", " node2 \n", "0 'Universe'@en \n", "1 'universe'@en \n", "2 ''@en \n", "3 'American namesake'@en \n", "4 'Boston (MA)'@en \n", "5 'Boston Revolution'@en \n", "6 'Boston Round Robin'@en \n", "7 'Boston'@en \n", "8 'Boston, MA'@en \n", "9 'Boston, MA, USA'@en " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(final_output + '.gz', sep = '\\t', nrows = 10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### CleanUp" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "os.remove(sorted_wikitable_anchor)\n", "os.remove(unique_anchor_edges)" ] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 }