## Generate WikiTable Anchors

This notebook relates to [KGTK Issue# 264](https://github.com/usc-isi-i2/kgtk/issues/264)

The Wikitable data is present [here](https://drive.google.com/drive/folders/1dvHwiKt_YbAEIThSZRhu2-dU1ISzy8rW?usp=sharing). The data is present in the ```step_1``` folder in the above link

Example Command to run using papermill:
```
papermill generate_wikitable_anchors.ipynb gen_anchor_output.ipynb -p file_dir /Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/KGTK_issue_264/ \
                                                                   -p kgtk_files_dir /Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/kgtk_edge_files/ \
                                                                   -p sitelinks_filename sitelinks.en.tsv.gz```

In [1]:
# Parameters
# file_dir: Path of the step_1 folder which has the WikiTable corpus
file_dir = '/Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/KGTK_issue_264/'

#kgtk_files_dir: Path where the KGTK files are present(specifically the sitelinks.en.tsv.gz)
kgtk_files_dir = '/Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/kgtk_edge_files/'

#sitelinks_filename
sitelinks_filename = 'sitelinks.en.tsv.gz'

#Is the Wikitable Corpus processed
wikitable_processed = False

# processed wikitable filename. I keep the default name as augmentation.wikipedia.tables.anchors.meta.tsv.gz
wikitable_processed_filename = 'augmentation.wikipedia.tables.anchors.meta.tsv.gz'

In [17]:
import json
import gzip
import glob
import os
import pandas as pd

In [5]:
#Initialize variables
wikitable_corpus_dir = os.path.join(file_dir, 'step_1')
sitelinks_file = os.path.join(kgtk_files_dir, sitelinks_filename)
temp_wiki_anchor = os.path.join(kgtk_files_dir, 'augmentation.wikipedia.tables.anchors.meta.tsv.gz')
output_wikitable_anchor = os.path.join(kgtk_files_dir,'augmentation.wikipedia.tables.anchors.temp3.tsv.gz')
sorted_wikitable_anchor = os.path.join(kgtk_files_dir,'augmentation.wikipedia.tables.anchors.sorted3.tsv')
unique_anchor_edges = os.path.join(kgtk_files_dir,'augmentation.wikipedia.tables.anchors.unique.tsv')
final_output = os.path.join(kgtk_files_dir,'augmentation.wikipedia.tables.anchors.tsv')

### Process WikiTable Corpus

In [14]:
def gen_wikianchors(wiki_corpus, temp_file):
    flag = False
    lines_to_write = []
    #print(wiki_corpus)
    files = glob.glob(wiki_corpus+'/*.gz')
    #print(files)
    for file in files:
        with gzip.open(file,'rt') as reader:
            f = reader.readlines()

            for text in f:
                json_obj = json.loads(text)

                for anchor in json_obj['rows']:
                    for anc in anchor['cells']:
                        val = anc['value']
                        if len(anc['links']) > 0:
                            for link in anc['links']:
                                href = 'http://en.wikipedia.org' + link['href']
                                anchor_text = val[int(link['start']):int(link['end'])]
                                if len(anchor_text.strip()) > 0:
                                    lines_to_write.append(href + '\t' + 'wikipedia_table_anchor' + '\t' + "\'" + anchor_text + "\'" + '@en')

                if len(lines_to_write) > 100000:
                     with gzip.open(temp_file,'a') as writer:
                            if flag == False:
                                header = 'node1'+ '\t'+'label' +'\t'+'node2' + '\n'
                                writer.write(header.encode('utf8'))
                                flag = True

                            writer.write('\n'.join(lines_to_write).encode('utf8'))
                            lines_to_write = list()

            if len(lines_to_write) > 0:
                    #print(lines_to_write)
                    with gzip.open(temp_file,'a') as writer:
                        if flag == False:
                            header = 'node1'+ '\t'+'label' +'\t'+'node2' + '\n'
                            writer.write(header.encode('utf8'))
                            flag = True
                        writer.write('\n'.join(lines_to_write).encode('utf8'))

In [15]:
if not(wikitable_processed):
    
    gen_wikianchors(wikitable_corpus_dir, temp_wiki_anchor)
else:
    temp_wiki_anchor = os.path.join(kgtk_files_dir, wikitable_processed_filename)

In [18]:
pd.read_csv(temp_wiki_anchor,sep = '\t', nrows = 10)

Unnamed: 0,node1,label,node2
0,http://en.wikipedia.orghttps://commons.wikimed...,wikipedia_table_anchor,'Salies-du-Salat'@en
1,http://en.wikipedia.org/wiki/El_Paso,wikipedia_table_anchor,'El Paso'@en
2,http://en.wikipedia.org/wiki/Texas,wikipedia_table_anchor,'TX'@en
3,http://en.wikipedia.org/wiki/El_Paso_County_Co...,wikipedia_table_anchor,'El Paso County Coliseum'@en
4,http://en.wikipedia.org/wiki/Brad_Parscale,wikipedia_table_anchor,'Brad Parscale'@en
5,http://en.wikipedia.org/wiki/John_Cornyn,wikipedia_table_anchor,'John Cornyn'@en
6,http://en.wikipedia.org/wiki/Lance_Berkman,wikipedia_table_anchor,'Lance Berkman'@en
7,http://en.wikipedia.org/wiki/Ted_Cruz,wikipedia_table_anchor,'Ted Cruz'@en
8,http://en.wikipedia.org/wiki/Donald_Trump_Jr.,wikipedia_table_anchor,'Donald Trump Jr.'@en
9,http://en.wikipedia.org/wiki/Grand_Rapids,wikipedia_table_anchor,'Grand Rapids'@en


### Kypher Query 

Join the sitelinks file with the processed WikiTable Corpus file

In [19]:
!time | kgtk query -i $sitelinks_file -i $temp_wiki_anchor -o $output_wikitable_anchor \
        --match 'g: (x)-[r:wikipedia_sitelink]->(y), w: (y)-[t:wikipedia_table_anchor]->(c)' \
        --return 'r, x, t.label, c as node2'

shell  0.00s user 0.00s system 11% cpu 0.004 total
children  0.00s user 0.00s system 0% cpu 0.004 total


In [20]:
pd.read_csv(output_wikitable_anchor,sep = '\t', nrows = 10)

Unnamed: 0,id,node1,label,node2
0,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en
1,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'universe'@en
2,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'universe'@en
3,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'universe'@en
4,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'universe'@en
5,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'universe'@en
6,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'universe'@en
7,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en
8,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en
9,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en


In [22]:
!kgtk sort2 -i $output_wikitable_anchor -c node1 -o $sorted_wikitable_anchor

In [23]:
pd.read_csv(sorted_wikitable_anchor,sep = '\t',nrows = 10)

Unnamed: 0,id,node1,label,node2
0,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en
1,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en
2,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en
3,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en
4,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en
5,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en
6,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en
7,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en
8,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en
9,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en


In [24]:
!uniq $sorted_wikitable_anchor > $unique_anchor_edges

In [None]:
!kgtk add-id -i $unique_anchor_edges --id-style wikidata --overwrite True -o $final_output

In [25]:
!gzip $final_output

In [27]:
pd.read_csv(final_output + '.gz', sep = '\t', nrows = 10)

Unnamed: 0,id,node1,label,node2
0,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'Universe'@en
1,Q1-wikipedia_sitelink-5e459a-0,Q1,wikipedia_table_anchor,'universe'@en
2,Q100-wikipedia_sitelink-c612f2-0,Q100,wikipedia_table_anchor,''@en
3,Q100-wikipedia_sitelink-c612f2-0,Q100,wikipedia_table_anchor,'American namesake'@en
4,Q100-wikipedia_sitelink-c612f2-0,Q100,wikipedia_table_anchor,'Boston (MA)'@en
5,Q100-wikipedia_sitelink-c612f2-0,Q100,wikipedia_table_anchor,'Boston Revolution'@en
6,Q100-wikipedia_sitelink-c612f2-0,Q100,wikipedia_table_anchor,'Boston Round Robin'@en
7,Q100-wikipedia_sitelink-c612f2-0,Q100,wikipedia_table_anchor,'Boston'@en
8,Q100-wikipedia_sitelink-c612f2-0,Q100,wikipedia_table_anchor,"'Boston, MA'@en"
9,Q100-wikipedia_sitelink-c612f2-0,Q100,wikipedia_table_anchor,"'Boston, MA, USA'@en"


### CleanUp

In [None]:

os.remove(sorted_wikitable_anchor)
os.remove(unique_anchor_edges)