## Edge File with DBPedia links

This notebook relates to [KGTK issue 259](https://github.com/usc-isi-i2/kgtk/issues/259)

Example command to run the notebook:

```papermill DBPedia_links.ipynb DBPedia_output.ipynb```

In [16]:
#kgtk_files_dir: Path where the KGTK files are present(specifically the sitelinks.en.tsv.gz)
kgtk_files_dir = '/Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/kgtk_edge_files/'

#sitelinks_filename
sitelinks_filename = 'sitelinks.tsv.gz'

#Is the sitelinks file processed to extract all the wiki links
sitelinks_processed = False

#Name of the file generated after sitelinks file is processed or if it is already processed
processed_sitelinks = 'sitelinks.wikipedia1.tsv.gz'

In [17]:
import json
import gzip
import glob
import os
import pandas as pd
import time

In [18]:
os.environ["SITELINKS"] = os.path.join(kgtk_files_dir, sitelinks_filename)
os.environ["WIKIPEDIA_TEMP"] = os.path.join(kgtk_files_dir,processed_sitelinks)
os.environ['WIKIPEDIA'] = os.path.join(kgtk_files_dir,'sitelinks.wikipedia1.id.tsv.gz')
output_file = os.path.join(kgtk_files_dir,'derived.dbpedia.links.wihout.id.tsv.gz')
final_output_file = os.path.join(kgtk_files_dir,'derived.dbpedia.links.tsv.gz')

In [4]:
if not(sitelinks_processed):
    !kgtk query -i $SITELINKS \
    --match '(n1)-[l:`wikipedia_sitelink` {lang: language, label:lab}]->(n2)' \
    --return 'n1 as node1, lab as label, n2 as node2, language as language' \
    -o $WIKIPEDIA_TEMP

In [19]:
pd.read_csv(os.getenv("WIKIPEDIA_TEMP"),sep = '\t', nrows = 10)

Unnamed: 0,node1,label,node2,language
0,Q1,wikipedia_sitelink,http://oc.wikipedia.org/wiki/Univèrs,oc
1,Q1,wikipedia_sitelink,http://cdo.wikipedia.org/wiki/Ṳ̄-dêu,cdo
2,Q1,wikipedia_sitelink,http://ml.wikipedia.org/wiki/പ്രപഞ്ചം,ml
3,Q1,wikipedia_sitelink,http://si.wikipedia.org/wiki/විශ්වය,si
4,Q1,wikipedia_sitelink,http://bxr.wikipedia.org/wiki/Оршолон,bxr
5,Q1,wikipedia_sitelink,http://jam.wikipedia.org/wiki/Yunivoers,jam
6,Q1,wikipedia_sitelink,http://hr.wikipedia.org/wiki/Svemir,hr
7,Q1,wikipedia_sitelink,http://chr.wikipedia.org/wiki/ᎦᎸᎶᎯ_ᎦᎸᎾᏗ,chr
8,Q1,wikipedia_sitelink,http://pfl.wikipedia.org/wiki/Weltall,pfl
9,Q1,wikipedia_sitelink,http://sv.wikipedia.org/wiki/Universum,sv


In [6]:
!kgtk add-id -i $WIKIPEDIA_TEMP --id-style wikidata -o $WIKIPEDIA

In [20]:
pd.read_csv(os.getenv("WIKIPEDIA"),sep = '\t', nrows = 10)

Unnamed: 0,node1,label,node2,language,id
0,Q1,wikipedia_sitelink,http://oc.wikipedia.org/wiki/Univèrs,oc,Q1-wikipedia_sitelink-017715
1,Q1,wikipedia_sitelink,http://cdo.wikipedia.org/wiki/Ṳ̄-dêu,cdo,Q1-wikipedia_sitelink-0753b0
2,Q1,wikipedia_sitelink,http://ml.wikipedia.org/wiki/പ്രപഞ്ചം,ml,Q1-wikipedia_sitelink-0881b4
3,Q1,wikipedia_sitelink,http://si.wikipedia.org/wiki/විශ්වය,si,Q1-wikipedia_sitelink-09275b
4,Q1,wikipedia_sitelink,http://bxr.wikipedia.org/wiki/Оршолон,bxr,Q1-wikipedia_sitelink-0befb2
5,Q1,wikipedia_sitelink,http://jam.wikipedia.org/wiki/Yunivoers,jam,Q1-wikipedia_sitelink-0cf75e
6,Q1,wikipedia_sitelink,http://hr.wikipedia.org/wiki/Svemir,hr,Q1-wikipedia_sitelink-0d76a2
7,Q1,wikipedia_sitelink,http://chr.wikipedia.org/wiki/ᎦᎸᎶᎯ_ᎦᎸᎾᏗ,chr,Q1-wikipedia_sitelink-0e47e2
8,Q1,wikipedia_sitelink,http://pfl.wikipedia.org/wiki/Weltall,pfl,Q1-wikipedia_sitelink-0e9074
9,Q1,wikipedia_sitelink,http://sv.wikipedia.org/wiki/Universum,sv,Q1-wikipedia_sitelink-102a19


In [21]:
def create_dbpedia_link(wiki_link,lang):
    wiki_link_list = wiki_link.split('/')
    if lang == "en":
        dbpedia_link = "http://dbpedia.org/resource/" + wiki_link_list[-1]
    
    else:
        dbpedia_link = "http://" + lang + '.dbpedia.org/resource/' + wiki_link_list[-1]
    
    return dbpedia_link
    

def dbpedia_edge_file(wikipedia_file,output_file):
    with gzip.open(wikipedia_file,'rt') as file:
        prev = None
        lines_to_write = list()
        first_line = file.readline().replace('\n','').replace('\r','')
        write_first_line = 'id' + '\t' + 'node1' + '\t' + 'label' + '\t' + 'node2' 
        columns = first_line.split('\t')
        print(columns)
        prop_index = columns.index('label')
        node1_index = columns.index("node1")
        id_index = columns.index("id")
        node2_index = columns.index("node2")
        lang_index = columns.index("language")
        flag = False
        st = time.time()
        
        for i,line in enumerate(file):
            if i%100000 == 0:
                print("Time taken for {} is {}".format(i,time.time() - st))
                print("Previous Qnode is:",prev)
            
            vals = line.split('\t')
            prop_label = vals[prop_index]
            node1 = vals[node1_index]
            id_val = vals[id_index].strip('\n')
            node2 = vals[node2_index]
            lang = vals[lang_index]
            
            dbpedia_link = create_dbpedia_link(node2,lang)
            
            lines_to_write.append(id_val + '\t' + node1 + '\t' + 'dbpedia_link' + '\t' + dbpedia_link + '\n')
            lines_to_write.append('\t' + id_val + '\t' + 'P424' + '\t' + lang  + '\n')
            
            if len(lines_to_write) > 100000:
                with gzip.open(output_file,'a') as writer:
                    if flag == False:
                        header = write_first_line + '\n'
                        writer.write(header.encode('utf8'))
                        flag = True
                        
                    writer.write(''.join(lines_to_write).encode('utf8'))
                    #writer.write('\n'.encode('utf8')) 
                    lines_to_write = list()
        if len(lines_to_write) > 0:
            #print(lines_to_write)
            with gzip.open(output_file,'a') as writer:
                if flag == False:
                    header = write_first_line + '\n'
                    writer.write(header.encode('utf8'))
                    flag = True
                writer.write(''.join(lines_to_write).encode('utf8'))
                #writer.write('\n'.encode('utf8'))

dbpedia_edge_file(os.getenv("WIKIPEDIA"),output_file)

['node1', 'label', 'node2', 'language', 'id']
Time taken for 0 is 6.9141387939453125e-06
Previous Qnode is: None
Time taken for 100000 is 0.8449301719665527
Previous Qnode is: None
Time taken for 200000 is 2.0532779693603516
Previous Qnode is: None
Time taken for 300000 is 3.250784158706665
Previous Qnode is: None
Time taken for 400000 is 4.442262172698975
Previous Qnode is: None
Time taken for 500000 is 5.612215042114258
Previous Qnode is: None
Time taken for 600000 is 6.799299001693726
Previous Qnode is: None
Time taken for 700000 is 7.991290807723999
Previous Qnode is: None
Time taken for 800000 is 9.355199098587036
Previous Qnode is: None
Time taken for 900000 is 10.611602067947388
Previous Qnode is: None
Time taken for 1000000 is 11.871217966079712
Previous Qnode is: None
Time taken for 1100000 is 13.197750091552734
Previous Qnode is: None
Time taken for 1200000 is 14.401841878890991
Previous Qnode is: None
Time taken for 1300000 is 15.567928791046143
Previous Qnode is: None
Time 

Time taken for 12000000 is 145.2825150489807
Previous Qnode is: None
Time taken for 12100000 is 146.4973168373108
Previous Qnode is: None
Time taken for 12200000 is 147.6632468700409
Previous Qnode is: None
Time taken for 12300000 is 148.973571062088
Previous Qnode is: None
Time taken for 12400000 is 150.33698415756226
Previous Qnode is: None
Time taken for 12500000 is 151.62766218185425
Previous Qnode is: None
Time taken for 12600000 is 152.8912389278412
Previous Qnode is: None
Time taken for 12700000 is 154.28635692596436
Previous Qnode is: None
Time taken for 12800000 is 155.59295105934143
Previous Qnode is: None
Time taken for 12900000 is 156.9116780757904
Previous Qnode is: None
Time taken for 13000000 is 158.2199900150299
Previous Qnode is: None
Time taken for 13100000 is 159.47843313217163
Previous Qnode is: None
Time taken for 13200000 is 160.76241207122803
Previous Qnode is: None
Time taken for 13300000 is 162.02077198028564
Previous Qnode is: None
Time taken for 13400000 is 1

Time taken for 23800000 is 297.93142104148865
Previous Qnode is: None
Time taken for 23900000 is 299.1232249736786
Previous Qnode is: None
Time taken for 24000000 is 300.3963232040405
Previous Qnode is: None
Time taken for 24100000 is 301.71597504615784
Previous Qnode is: None
Time taken for 24200000 is 303.1009109020233
Previous Qnode is: None
Time taken for 24300000 is 304.34784483909607
Previous Qnode is: None
Time taken for 24400000 is 305.61386799812317
Previous Qnode is: None
Time taken for 24500000 is 306.89065980911255
Previous Qnode is: None
Time taken for 24600000 is 308.14564418792725
Previous Qnode is: None
Time taken for 24700000 is 309.4140338897705
Previous Qnode is: None
Time taken for 24800000 is 310.6506690979004
Previous Qnode is: None
Time taken for 24900000 is 311.87549805641174
Previous Qnode is: None
Time taken for 25000000 is 313.20977306365967
Previous Qnode is: None
Time taken for 25100000 is 314.6065537929535
Previous Qnode is: None
Time taken for 25200000 is

Time taken for 35700000 is 449.22428011894226
Previous Qnode is: None
Time taken for 35800000 is 450.59958600997925
Previous Qnode is: None
Time taken for 35900000 is 451.98899006843567
Previous Qnode is: None
Time taken for 36000000 is 453.32234716415405
Previous Qnode is: None
Time taken for 36100000 is 454.627592086792
Previous Qnode is: None
Time taken for 36200000 is 455.89421916007996
Previous Qnode is: None
Time taken for 36300000 is 457.2095561027527
Previous Qnode is: None
Time taken for 36400000 is 458.460568189621
Previous Qnode is: None
Time taken for 36500000 is 459.7277190685272
Previous Qnode is: None
Time taken for 36600000 is 460.9438829421997
Previous Qnode is: None
Time taken for 36700000 is 462.1432650089264
Previous Qnode is: None
Time taken for 36800000 is 463.35320496559143
Previous Qnode is: None
Time taken for 36900000 is 464.5342421531677
Previous Qnode is: None
Time taken for 37000000 is 465.72650814056396
Previous Qnode is: None
Time taken for 37100000 is 46

Time taken for 47600000 is 594.7213799953461
Previous Qnode is: None
Time taken for 47700000 is 595.9149131774902
Previous Qnode is: None
Time taken for 47800000 is 597.1261200904846
Previous Qnode is: None
Time taken for 47900000 is 598.3319089412689
Previous Qnode is: None
Time taken for 48000000 is 599.526437997818
Previous Qnode is: None
Time taken for 48100000 is 600.760908126831
Previous Qnode is: None
Time taken for 48200000 is 601.9848830699921
Previous Qnode is: None
Time taken for 48300000 is 603.2123260498047
Previous Qnode is: None
Time taken for 48400000 is 604.4257249832153
Previous Qnode is: None
Time taken for 48500000 is 605.6557750701904
Previous Qnode is: None
Time taken for 48600000 is 606.8692400455475
Previous Qnode is: None
Time taken for 48700000 is 608.0654320716858
Previous Qnode is: None
Time taken for 48800000 is 609.2737710475922
Previous Qnode is: None
Time taken for 48900000 is 610.5252859592438
Previous Qnode is: None
Time taken for 49000000 is 611.75217

Time taken for 59500000 is 739.2276680469513
Previous Qnode is: None
Time taken for 59600000 is 740.442360162735
Previous Qnode is: None
Time taken for 59700000 is 741.6496458053589
Previous Qnode is: None
Time taken for 59800000 is 742.8656630516052
Previous Qnode is: None
Time taken for 59900000 is 744.050265789032
Previous Qnode is: None
Time taken for 60000000 is 745.2600898742676
Previous Qnode is: None
Time taken for 60100000 is 746.4602167606354
Previous Qnode is: None
Time taken for 60200000 is 747.6180560588837
Previous Qnode is: None
Time taken for 60300000 is 748.802463054657
Previous Qnode is: None
Time taken for 60400000 is 749.9692358970642
Previous Qnode is: None
Time taken for 60500000 is 751.1622998714447
Previous Qnode is: None
Time taken for 60600000 is 752.3710432052612
Previous Qnode is: None
Time taken for 60700000 is 753.5850188732147
Previous Qnode is: None
Time taken for 60800000 is 754.7838101387024
Previous Qnode is: None
Time taken for 60900000 is 755.986749

In [22]:
!kgtk add-id --id-style wikidata -i $output_file -o $final_output_file

In [23]:
pd.read_csv(final_output_file,sep = '\t',nrows = 10)

Unnamed: 0,id,node1,label,node2
0,Q1-wikipedia_sitelink-017715,Q1,dbpedia_link,http://oc.dbpedia.org/resource/Univèrs
1,Q1-wikipedia_sitelink-017715-P424-5f751e,Q1-wikipedia_sitelink-017715,P424,oc
2,Q1-wikipedia_sitelink-0753b0,Q1,dbpedia_link,http://cdo.dbpedia.org/resource/Ṳ̄-dêu
3,Q1-wikipedia_sitelink-0753b0-P424-da62be,Q1-wikipedia_sitelink-0753b0,P424,cdo
4,Q1-wikipedia_sitelink-0881b4,Q1,dbpedia_link,http://ml.dbpedia.org/resource/പ്രപഞ്ചം
5,Q1-wikipedia_sitelink-0881b4-P424-5d58d4,Q1-wikipedia_sitelink-0881b4,P424,ml
6,Q1-wikipedia_sitelink-09275b,Q1,dbpedia_link,http://si.dbpedia.org/resource/විශ්වය
7,Q1-wikipedia_sitelink-09275b-P424-97a62a,Q1-wikipedia_sitelink-09275b,P424,si
8,Q1-wikipedia_sitelink-0befb2,Q1,dbpedia_link,http://bxr.dbpedia.org/resource/Оршолон
9,Q1-wikipedia_sitelink-0befb2-P424-b7d00c,Q1-wikipedia_sitelink-0befb2,P424,bxr


## CleanUp

In [None]:
os.remove(output_file)