{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate WikiTable Anchors\n",
"\n",
"This notebook relates to [KGTK Issue# 264](https://github.com/usc-isi-i2/kgtk/issues/264)\n",
"\n",
"The Wikitable data is present [here](https://drive.google.com/drive/folders/1dvHwiKt_YbAEIThSZRhu2-dU1ISzy8rW?usp=sharing). The data is present in the ```step_1``` folder in the above link\n",
"\n",
"Example Command to run using papermill:\n",
"```\n",
"papermill generate_wikitable_anchors.ipynb gen_anchor_output.ipynb -p file_dir /Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/KGTK_issue_264/ \\\n",
" -p kgtk_files_dir /Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/kgtk_edge_files/ \\\n",
" -p sitelinks_filename sitelinks.en.tsv.gz```"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Parameters\n",
"# file_dir: Path of the step_1 folder which has the WikiTable corpus\n",
"file_dir = '/Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/KGTK_issue_264/'\n",
"\n",
"#kgtk_files_dir: Path where the KGTK files are present(specifically the sitelinks.en.tsv.gz)\n",
"kgtk_files_dir = '/Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/kgtk_edge_files/'\n",
"\n",
"#sitelinks_filename\n",
"sitelinks_filename = 'sitelinks.en.tsv.gz'\n",
"\n",
"#Is the Wikitable Corpus processed\n",
"wikitable_processed = False\n",
"\n",
"# processed wikitable filename. I keep the default name as augmentation.wikipedia.tables.anchors.meta.tsv.gz\n",
"wikitable_processed_filename = 'augmentation.wikipedia.tables.anchors.meta.tsv.gz'"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import gzip\n",
"import glob\n",
"import os\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"#Initialize variables\n",
"wikitable_corpus_dir = os.path.join(file_dir, 'step_1')\n",
"sitelinks_file = os.path.join(kgtk_files_dir, sitelinks_filename)\n",
"temp_wiki_anchor = os.path.join(kgtk_files_dir, 'augmentation.wikipedia.tables.anchors.meta.tsv.gz')\n",
"output_wikitable_anchor = os.path.join(kgtk_files_dir,'augmentation.wikipedia.tables.anchors.temp3.tsv.gz')\n",
"sorted_wikitable_anchor = os.path.join(kgtk_files_dir,'augmentation.wikipedia.tables.anchors.sorted3.tsv')\n",
"unique_anchor_edges = os.path.join(kgtk_files_dir,'augmentation.wikipedia.tables.anchors.unique.tsv')\n",
"final_output = os.path.join(kgtk_files_dir,'augmentation.wikipedia.tables.anchors.tsv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Process WikiTable Corpus"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def gen_wikianchors(wiki_corpus, temp_file):\n",
" flag = False\n",
" lines_to_write = []\n",
" #print(wiki_corpus)\n",
" files = glob.glob(wiki_corpus+'/*.gz')\n",
" #print(files)\n",
" for file in files:\n",
" with gzip.open(file,'rt') as reader:\n",
" f = reader.readlines()\n",
"\n",
" for text in f:\n",
" json_obj = json.loads(text)\n",
"\n",
" for anchor in json_obj['rows']:\n",
" for anc in anchor['cells']:\n",
" val = anc['value']\n",
" if len(anc['links']) > 0:\n",
" for link in anc['links']:\n",
" href = 'http://en.wikipedia.org' + link['href']\n",
" anchor_text = val[int(link['start']):int(link['end'])]\n",
" if len(anchor_text.strip()) > 0:\n",
" lines_to_write.append(href + '\\t' + 'wikipedia_table_anchor' + '\\t' + \"\\'\" + anchor_text + \"\\'\" + '@en')\n",
"\n",
" if len(lines_to_write) > 100000:\n",
" with gzip.open(temp_file,'a') as writer:\n",
" if flag == False:\n",
" header = 'node1'+ '\\t'+'label' +'\\t'+'node2' + '\\n'\n",
" writer.write(header.encode('utf8'))\n",
" flag = True\n",
"\n",
" writer.write('\\n'.join(lines_to_write).encode('utf8'))\n",
" lines_to_write = list()\n",
"\n",
" if len(lines_to_write) > 0:\n",
" #print(lines_to_write)\n",
" with gzip.open(temp_file,'a') as writer:\n",
" if flag == False:\n",
" header = 'node1'+ '\\t'+'label' +'\\t'+'node2' + '\\n'\n",
" writer.write(header.encode('utf8'))\n",
" flag = True\n",
" writer.write('\\n'.join(lines_to_write).encode('utf8'))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"if not(wikitable_processed):\n",
" \n",
" gen_wikianchors(wikitable_corpus_dir, temp_wiki_anchor)\n",
"else:\n",
" temp_wiki_anchor = os.path.join(kgtk_files_dir, wikitable_processed_filename)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" http://en.wikipedia.orghttps://commons.wikimed... | \n",
" wikipedia_table_anchor | \n",
" 'Salies-du-Salat'@en | \n",
"
\n",
" \n",
" 1 | \n",
" http://en.wikipedia.org/wiki/El_Paso | \n",
" wikipedia_table_anchor | \n",
" 'El Paso'@en | \n",
"
\n",
" \n",
" 2 | \n",
" http://en.wikipedia.org/wiki/Texas | \n",
" wikipedia_table_anchor | \n",
" 'TX'@en | \n",
"
\n",
" \n",
" 3 | \n",
" http://en.wikipedia.org/wiki/El_Paso_County_Co... | \n",
" wikipedia_table_anchor | \n",
" 'El Paso County Coliseum'@en | \n",
"
\n",
" \n",
" 4 | \n",
" http://en.wikipedia.org/wiki/Brad_Parscale | \n",
" wikipedia_table_anchor | \n",
" 'Brad Parscale'@en | \n",
"
\n",
" \n",
" 5 | \n",
" http://en.wikipedia.org/wiki/John_Cornyn | \n",
" wikipedia_table_anchor | \n",
" 'John Cornyn'@en | \n",
"
\n",
" \n",
" 6 | \n",
" http://en.wikipedia.org/wiki/Lance_Berkman | \n",
" wikipedia_table_anchor | \n",
" 'Lance Berkman'@en | \n",
"
\n",
" \n",
" 7 | \n",
" http://en.wikipedia.org/wiki/Ted_Cruz | \n",
" wikipedia_table_anchor | \n",
" 'Ted Cruz'@en | \n",
"
\n",
" \n",
" 8 | \n",
" http://en.wikipedia.org/wiki/Donald_Trump_Jr. | \n",
" wikipedia_table_anchor | \n",
" 'Donald Trump Jr.'@en | \n",
"
\n",
" \n",
" 9 | \n",
" http://en.wikipedia.org/wiki/Grand_Rapids | \n",
" wikipedia_table_anchor | \n",
" 'Grand Rapids'@en | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" node1 label \\\n",
"0 http://en.wikipedia.orghttps://commons.wikimed... wikipedia_table_anchor \n",
"1 http://en.wikipedia.org/wiki/El_Paso wikipedia_table_anchor \n",
"2 http://en.wikipedia.org/wiki/Texas wikipedia_table_anchor \n",
"3 http://en.wikipedia.org/wiki/El_Paso_County_Co... wikipedia_table_anchor \n",
"4 http://en.wikipedia.org/wiki/Brad_Parscale wikipedia_table_anchor \n",
"5 http://en.wikipedia.org/wiki/John_Cornyn wikipedia_table_anchor \n",
"6 http://en.wikipedia.org/wiki/Lance_Berkman wikipedia_table_anchor \n",
"7 http://en.wikipedia.org/wiki/Ted_Cruz wikipedia_table_anchor \n",
"8 http://en.wikipedia.org/wiki/Donald_Trump_Jr. wikipedia_table_anchor \n",
"9 http://en.wikipedia.org/wiki/Grand_Rapids wikipedia_table_anchor \n",
"\n",
" node2 \n",
"0 'Salies-du-Salat'@en \n",
"1 'El Paso'@en \n",
"2 'TX'@en \n",
"3 'El Paso County Coliseum'@en \n",
"4 'Brad Parscale'@en \n",
"5 'John Cornyn'@en \n",
"6 'Lance Berkman'@en \n",
"7 'Ted Cruz'@en \n",
"8 'Donald Trump Jr.'@en \n",
"9 'Grand Rapids'@en "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_csv(temp_wiki_anchor,sep = '\\t', nrows = 10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Kypher Query \n",
"\n",
"Join the sitelinks file with the processed WikiTable Corpus file"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shell 0.00s user 0.00s system 11% cpu 0.004 total\r\n",
"children 0.00s user 0.00s system 0% cpu 0.004 total\r\n"
]
}
],
"source": [
"!time | kgtk query -i $sitelinks_file -i $temp_wiki_anchor -o $output_wikitable_anchor \\\n",
" --match 'g: (x)-[r:wikipedia_sitelink]->(y), w: (y)-[t:wikipedia_table_anchor]->(c)' \\\n",
" --return 'r, x, t.label, c as node2'"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
" 1 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'universe'@en | \n",
"
\n",
" \n",
" 2 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'universe'@en | \n",
"
\n",
" \n",
" 3 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'universe'@en | \n",
"
\n",
" \n",
" 4 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'universe'@en | \n",
"
\n",
" \n",
" 5 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'universe'@en | \n",
"
\n",
" \n",
" 6 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'universe'@en | \n",
"
\n",
" \n",
" 7 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
" 8 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
" 9 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id node1 label node2\n",
"0 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n",
"1 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'universe'@en\n",
"2 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'universe'@en\n",
"3 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'universe'@en\n",
"4 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'universe'@en\n",
"5 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'universe'@en\n",
"6 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'universe'@en\n",
"7 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n",
"8 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n",
"9 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_csv(output_wikitable_anchor,sep = '\\t', nrows = 10)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"!kgtk sort2 -i $output_wikitable_anchor -c node1 -o $sorted_wikitable_anchor"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
" 1 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
" 2 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
" 3 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
" 4 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
" 5 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
" 6 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
" 7 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
" 8 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
" 9 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id node1 label node2\n",
"0 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n",
"1 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n",
"2 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n",
"3 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n",
"4 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n",
"5 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n",
"6 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n",
"7 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n",
"8 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en\n",
"9 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor 'Universe'@en"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_csv(sorted_wikitable_anchor,sep = '\\t',nrows = 10)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"!uniq $sorted_wikitable_anchor > $unique_anchor_edges"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kgtk add-id -i $unique_anchor_edges --id-style wikidata --overwrite True -o $final_output"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"!gzip $final_output"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'Universe'@en | \n",
"
\n",
" \n",
" 1 | \n",
" Q1-wikipedia_sitelink-5e459a-0 | \n",
" Q1 | \n",
" wikipedia_table_anchor | \n",
" 'universe'@en | \n",
"
\n",
" \n",
" 2 | \n",
" Q100-wikipedia_sitelink-c612f2-0 | \n",
" Q100 | \n",
" wikipedia_table_anchor | \n",
" ''@en | \n",
"
\n",
" \n",
" 3 | \n",
" Q100-wikipedia_sitelink-c612f2-0 | \n",
" Q100 | \n",
" wikipedia_table_anchor | \n",
" 'American namesake'@en | \n",
"
\n",
" \n",
" 4 | \n",
" Q100-wikipedia_sitelink-c612f2-0 | \n",
" Q100 | \n",
" wikipedia_table_anchor | \n",
" 'Boston (MA)'@en | \n",
"
\n",
" \n",
" 5 | \n",
" Q100-wikipedia_sitelink-c612f2-0 | \n",
" Q100 | \n",
" wikipedia_table_anchor | \n",
" 'Boston Revolution'@en | \n",
"
\n",
" \n",
" 6 | \n",
" Q100-wikipedia_sitelink-c612f2-0 | \n",
" Q100 | \n",
" wikipedia_table_anchor | \n",
" 'Boston Round Robin'@en | \n",
"
\n",
" \n",
" 7 | \n",
" Q100-wikipedia_sitelink-c612f2-0 | \n",
" Q100 | \n",
" wikipedia_table_anchor | \n",
" 'Boston'@en | \n",
"
\n",
" \n",
" 8 | \n",
" Q100-wikipedia_sitelink-c612f2-0 | \n",
" Q100 | \n",
" wikipedia_table_anchor | \n",
" 'Boston, MA'@en | \n",
"
\n",
" \n",
" 9 | \n",
" Q100-wikipedia_sitelink-c612f2-0 | \n",
" Q100 | \n",
" wikipedia_table_anchor | \n",
" 'Boston, MA, USA'@en | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id node1 label \\\n",
"0 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor \n",
"1 Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_table_anchor \n",
"2 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n",
"3 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n",
"4 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n",
"5 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n",
"6 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n",
"7 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n",
"8 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n",
"9 Q100-wikipedia_sitelink-c612f2-0 Q100 wikipedia_table_anchor \n",
"\n",
" node2 \n",
"0 'Universe'@en \n",
"1 'universe'@en \n",
"2 ''@en \n",
"3 'American namesake'@en \n",
"4 'Boston (MA)'@en \n",
"5 'Boston Revolution'@en \n",
"6 'Boston Round Robin'@en \n",
"7 'Boston'@en \n",
"8 'Boston, MA'@en \n",
"9 'Boston, MA, USA'@en "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_csv(final_output + '.gz', sep = '\\t', nrows = 10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### CleanUp"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"os.remove(sorted_wikitable_anchor)\n",
"os.remove(unique_anchor_edges)"
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}