In [1]:
import pandas as pd
import gzip
import csv
import re

## Create the file `labels.en.years.tsv.gz`

Get all instances of the class "calendar year" (Q3186692):
1. kgtk query -i parts/claims.wikibase-item.tsv.gz --graph-cache $GRAPH_CACHE --match '(n)-[id:P31]->(:Q3186692)' -o parts/claims.years.2.tsv.gz

Get all instances of the class "year" (Q577):

2. kgtk --debug query -i parts/claims.wikibase-item.tsv.gz --graph-cache $GRAPH_CACHE --match '(n)-[:P31]->(:Q577)' -o parts/claims.years.1.tsv.gz

Get all instances of the class "common year" (Q235729):

3. kgtk --debug query -i parts/claims.wikibase-item.tsv.gz --graph-cache $GRAPH_CACHE --match '(n)-[:P31]->(:Q235729)' -o parts/claims.years.3.tsv.gz

Get all instances of the class "leap year" (Q19828):

4. kgtk --debug query -i parts/claims.wikibase-item.tsv.gz --graph-cache $GRAPH_CACHE --match '(n)-[:P31]->(:Q19828)' -o parts/claims.years.4.tsv.gz

Get all instances of the class "century leap year" (Q3311614):

5. kgtk --debug query -i parts/claims.wikibase-item.tsv.gz --graph-cache $GRAPH_CACHE --match '(n)-[:P31]->(:Q3311614)' -o parts/claims.years.5.tsv.gz

6. kgtk cat -i claims.years.1.tsv.gz -i claims.years.2.tsv.gz -i claims.years.3.tsv.gz -i claims.years.4.tsv.gz -i claims.years.5.tsv.gz -o claims.years.tsv.gz

7. kgtk --debug query -i claims.years.tsv.gz \
-i labels.en.tsv.gz \
--graph-cache $GRAPH_CACHE \
--match 'claims: (n1)-[l]->(), labels: (n1)-[m]->(n2)' \
--return 'n1, m.label, n2' -o labels.en.years.tsv.gz

## Create the file `claims.time.year.tsv.gz`

kgtk query --graph-cache $GRAPH_CACHE \
-i claims.time.tsv.gz \
-o claims.time.year.tsv.gz \
--match '(s)-[i]->(o)' \
--where 'kgtk_date_precision(o) >= 9' \
--return 's, i.label, o as node2_full, kgtk_date_year(o) as node2'

In [2]:
# Input: time claims reduced to year granularity (node1, label, node2_full, node2).
claims_time_file = 'claims.time.year.tsv.gz'
# Input: English labels of the Wikidata year items (node1, label, node2).
years_label_file = 'labels.en.years.tsv.gz'

In [3]:
# Preview the first rows of the claims file. `head` closes the pipe early, which
# is why gzcat reports "Broken pipe" after the rows are printed.
!gzcat {claims_time_file} | head

node1	label	node2_full	node2
P1841	P580	^2016-01-01T00:00:00Z/9	2016
P2847	P2669	^2019-04-02T00:00:00Z/11	2019
P3284	P576	^2016-12-13T00:00:00Z/11	2016
P370	P571	^2013-03-29T00:00:00Z/11	2013
P6107	P580	^2006-03-01T00:00:00Z/10	2006
Q100	P571	^1630-09-07T00:00:00Z/11	1630
Q1000	P571	^1960-01-01T00:00:00Z/9	1960
Q10000	P571	^2001-06-19T00:00:00Z/11	2001
Q1000000	P580	^2010-01-25T00:00:00Z/11	2010
gzcat: error writing to output: Broken pipe
gzcat: claims.time.year.tsv.gz: uncompress failed


In [4]:
# Preview the first rows of the year labels file. `head` closes the pipe early,
# which is why gzcat reports "Broken pipe" after the rows are printed.
!gzcat {years_label_file} | head

node1	label	node2
Q100594618	label	'3022'@en
Q102149476	label	'2384'@en
Q103831815	label	'2688'@en
Q103838319	label	'2691'@en
Q1044959	label	'2064'@en
Q1044959	label	'2064'@en
Q1044976	label	'2063'@en
Q1046496	label	'2062'@en
Q1062045	label	'Yang Fire Horse'@en
gzcat: error writing to output: Broken pipe
gzcat: labels.en.years.tsv.gz: uncompress failed


In [5]:
# Synthetic year nodes built by create_year_qnode, keyed by the year string.
year_qnodes = {}
# Synthetic "<property>-Year" definitions built by create_time_properties,
# keyed by the original property id.
year_properties = {}
# Year string -> Wikidata qnode; replaced later by create_wiki_years_dict().
year_labels = {}
# Years found in the claims for which the lookup has no Wikidata year item.
_not_present_years = set()

In [6]:
# P155: follows
# P156: followed by

def create_year_qnode(year, year_dict):
    """Return the node description for a year, creating a synthetic one if needed.

    Args:
        year: Year as a string (surrounding whitespace is ignored).
        year_dict: Mapping of year string -> existing Wikidata qnode.

    Returns:
        ``{'qnode': ..., 'wiki_year': True}`` when ``year`` is in ``year_dict``.
        Otherwise the entry stored in the module-level ``year_qnodes`` registry
        (created on first use), with ``wiki_year`` False and the keys
        ``qnode``, ``P31``, ``P155``, ``P156`` and ``label``.

    Raises:
        ValueError: If ``year`` is not in ``year_dict`` and is not an integer.

    Side effects:
        Adds ``year`` to ``_not_present_years`` and may add an entry to
        ``year_qnodes`` when the year is not in ``year_dict``.
    """
    year = str(year.strip())

    # Known Wikidata years need no arithmetic, so answer them first.
    if year in year_dict:
        return {'qnode': year_dict[year], 'wiki_year': True}

    # Parse before touching the registries so a bad value leaves them unchanged.
    year_number = int(year)

    _not_present_years.add(year)
    if year not in year_qnodes:

        def neighbour_qnode(number):
            # Link to the existing Wikidata item when the adjacent year has
            # one; only fall back to the synthetic id when it does not.
            key = str(number)
            return year_dict.get(key, f'Q{key}-Year')

        year_qnodes[year] = {
            'qnode': f"Q{year}-Year",
            'P31': 'Q3186692',
            'P155': neighbour_qnode(year_number - 1),
            'P156': neighbour_qnode(year_number + 1),
            # The label is stored already wrapped in double quotes.
            'label': f'\"{year}\"',
            'wiki_year': False
        }
    return year_qnodes[year]

In [7]:
def create_time_properties(property):
    """Return the "<property>-Year" definition for a property id.

    The id is converted to a string and stripped. The definition is created
    on first use and cached in the module-level ``year_properties`` registry,
    so repeated calls return the same dict object.

    Args:
        property: Property id, e.g. ``'P580'``.

    Returns:
        Dict with the keys ``P31``, ``pnode`` and ``label``.
    """
    key = str(property).strip()

    # Guard clause: reuse the cached definition when there is one.
    if key in year_properties:
        return year_properties[key]

    definition = {
        'P31': 'Q18636219',
        'pnode': f'{key}-Year',
        'label': f'\"{key}-Year\"',
    }
    year_properties[key] = definition
    return definition

In [8]:
def min_qnode(qnodes_list):
    """Return the qnode with the smallest numeric id, formatted as ``Q<number>``.

    Each item is expected to be a one-character prefix followed by an integer
    (e.g. ``'Q42'``). The result is rebuilt from the integer, so leading zeros
    are dropped.
    """
    smallest = min(int(qnode[1:]) for qnode in qnodes_list)
    return 'Q' + str(smallest)

In [9]:
year_regex = r'(\d*)'
def create_wiki_years_dict(years_label_file):
    """Build a ``year string -> qnode`` lookup from a gzipped labels TSV.

    Only rows whose third column is a language-qualified label (contains
    ``@``) and whose text parses as an integer are used. When several qnodes
    carry the same year label, ``min_qnode`` picks the one with the smallest
    numeric id.

    Args:
        years_label_file: Path to a gzip-compressed TSV with the columns
            node1, label, node2.

    Returns:
        Dict mapping ``str(year)`` to a qnode id.
    """
    label_by_qnode = {}
    # The context manager closes the gzip stream even if parsing fails.
    with gzip.open(years_label_file, 'rt', encoding='utf-8') as f:
        for line in f:
            # Any line mentioning 'node1' is treated as the header row.
            if 'node1' in line:
                continue
            vals = line.strip().split('\t')
            # Blank or truncated lines have no label column to read.
            if len(vals) < 3:
                continue
            qnode = vals[0].strip()
            q_label = vals[2].strip()
            if '@' not in q_label:
                continue
            # Drop the language suffix and the single quotes around the text.
            _label = q_label.split('@')[0].replace("'", "")
            try:
                label_by_qnode[qnode] = int(_label)
            except ValueError:
                # Non-numeric labels (e.g. 'Yang Fire Horse') are not years.
                continue

    qnodes_by_year = {}
    for qnode, year in label_by_qnode.items():
        qnodes_by_year.setdefault(year, []).append(qnode)

    # Years with the most candidate qnodes come first. This only affects the
    # ordering of the returned dict, not its contents.
    ordered = sorted(qnodes_by_year.items(), key=lambda item: len(item[1]), reverse=True)
    return {str(year): min_qnode(qnodes) for year, qnodes in ordered}

In [10]:
# Build the year -> Wikidata qnode lookup from the year labels file.
year_labels = create_wiki_years_dict(years_label_file)

In [11]:
# Rewrite every time claim as an edge `node1 -[<property>-Year]-> <year node>`.
# The year node is the Wikidata item when the lookup has one, otherwise a
# synthetic node registered by create_year_qnode.
output = []
# The context manager closes the gzip stream once the file has been read.
with gzip.open(claims_time_file, 'rt') as f:
    for line in f:
        # Skip the header row.
        if 'node1' in line and 'node2' in line:
            continue
        vals = line.strip().split('\t')
        assert len(vals) == 4, f'expected 4 columns, got {len(vals)}: {line!r}'
        year = vals[3]
        # Named `prop` so the builtin `property` is not shadowed at module scope.
        prop = vals[1]
        y_qnode = create_year_qnode(year, year_labels)['qnode']
        y_pnode = create_time_properties(prop)['pnode']
        output.append({
            'node1': vals[0],
            'label': y_pnode,
            'node2': y_qnode,
        })


In [12]:
# Report how many claim years are missing from the year lookup, and which ones.
print(len(_not_present_years), list(_not_present_years))

186 ['2286', '3443', '2760', '2543', '2510', '2396', '2385', '9822', '3523', '4004', '5355', '2266', '2566', '3606', '2563', '2986', '4301', '2459', '2269', '2518', '2366', '3870', '6969', '2267', '3718', '2341', '3841', '2387', '2371', '2864', '3190', '2305', '3576', '2365', '2646', '2774', '3016', '2425', '2370', '2315', '2913', '2399', '2989', '2358', '7962', '3998', '4401', '2367', '2763', '4226', '8888', '2470', '2468', '2738', '3179', '2393', '2319', '5079', '3049', '3494', '2362', '2260', '2355', '2731', '2356', '2608', '2904', '3483', '2276', '2303', '3955', '3454', '2622', '3056', '2633', '2265', '2359', '3993', '2893', '2293', '2841', '2552', '2250', '3031', '2980', '2457', '2490', '3599', '2897', '1000', '3550', '2452', '3425', '2282', '2506', '2375', '2318', '2499', '2524', '3009', '3982', '2987', '4000', '2364', '2332', '2348', '2655', '2484', '2716', '3237', '2333', '2485', '2974', '2964', '2374', '3173', '2323', '2203', '2340', '2551', '2233', '4126', '2336', '2532', '35

In [13]:
# Number of rewritten claim edges.
len(output)

14463714

In [14]:
# One row per rewritten edge, with the columns node1, label, node2.
df = pd.DataFrame(output)

In [15]:
# Intermediate file: tab-separated edges without ids (ids are added by kgtk next).
df.to_csv('claims.time.year.augmented.1.tsv.gz', index=False, sep='\t')

In [16]:
# Add edge ids (id style `wikidata`) to the rewritten claims.
!kgtk add-id --id-style wikidata -i 'claims.time.year.augmented.1.tsv.gz' -o 'claims.time.year.augmented.2.tsv.gz'

In [17]:
# Edge accumulators for the synthetic year nodes and their labels.
_years = []
_years_labels = []
# Edge accumulators for the synthetic "<property>-Year" properties and their labels.
_properties = []
_properties_labels = []
# Keys copied from each year entry into edges: P31, P155 (follows), P156 (followed by).
yp = ['P31', 'P155', 'P156']
# Keys copied from each property entry into edges.
ypp = ['P31']

In [18]:
# Write the metadata edges (P31 / P155 / P156) and the labels of the synthetic
# year nodes. The lists are rebuilt from scratch so that re-running this cell
# does not append duplicate rows.
_years = []
_years_labels = []
for year_node in year_qnodes.values():
    # Only synthetic years need metadata; Wikidata years already have theirs.
    if year_node['wiki_year']:
        continue
    for p in yp:
        _years.append({
            'node1': year_node['qnode'],
            'label': p,
            'node2': year_node[p]
        })
    _years_labels.append({
        'node1': year_node['qnode'],
        'label': 'label',
        # create_year_qnode stores the label already wrapped in double quotes;
        # quoting it again would write ""<year>"" to the file.
        'node2': year_node['label']
    })
# Explicit columns keep the header row even when there are no synthetic years.
df_y = pd.DataFrame(_years, columns=['node1', 'label', 'node2'])
df_yl = pd.DataFrame(_years_labels, columns=['node1', 'label', 'node2'])
df_y.to_csv('metadata.augmented.time.years.1.tsv.gz', index=False, sep='\t')
# QUOTE_NONE writes the quoted labels verbatim instead of CSV-escaping them.
df_yl.to_csv('metadata.augmented.labels.time.years.1.tsv.gz', index=False, sep='\t', quoting=csv.QUOTE_NONE)

In [19]:
# Add edge ids (id style `wikidata`) to the year metadata and year label files.
!kgtk add-id --id-style wikidata -i 'metadata.augmented.time.years.1.tsv.gz' -o 'metadata.augmented.time.years.tsv.gz'
!kgtk add-id --id-style wikidata -i 'metadata.augmented.labels.time.years.1.tsv.gz' -o 'metadata.augmented.labels.time.years.tsv.gz'

In [20]:
# Collect the metadata edges and labels of the "<property>-Year" properties,
# then write both tables as gzipped TSV files.
for prop_info in year_properties.values():
    pnode = prop_info['pnode']
    _properties.extend(
        {'node1': pnode, 'label': key, 'node2': prop_info[key]}
        for key in ypp
    )
    _properties_labels.append(
        {'node1': pnode, 'label': 'label', 'node2': prop_info['label']}
    )

df_py = pd.DataFrame(_properties)
df_pyl = pd.DataFrame(_properties_labels)
df_py.to_csv(
    'metadata.augmented.properties.time.years.1.tsv.gz',
    index=False,
    sep='\t',
)
# QUOTE_NONE writes the quoted labels verbatim instead of CSV-escaping them.
df_pyl.to_csv(
    'metadata.augmented.labels.properties.time.years.1.tsv.gz',
    index=False,
    sep='\t',
    quoting=csv.QUOTE_NONE,
)

In [21]:
# Add edge ids (id style `wikidata`) to the property metadata and property label files.
!kgtk add-id --id-style wikidata -i 'metadata.augmented.properties.time.years.1.tsv.gz' -o 'metadata.augmented.properties.time.years.tsv.gz'
!kgtk add-id --id-style wikidata -i 'metadata.augmented.labels.properties.time.years.1.tsv.gz' -o 'metadata.augmented.labels.properties.time.years.tsv.gz'

In [22]:
# concatenate to create the augmented file

In [23]:
# Combine the rewritten claims with the year and property metadata edges.
# The two label files are not part of this concatenation.
!kgtk cat -i 'claims.time.year.augmented.2.tsv.gz' \
-i 'metadata.augmented.time.years.tsv.gz' \
-i 'metadata.augmented.properties.time.years.tsv.gz' \
-o 'claims.time.year.augmented.tsv.gz'

In [24]:
# delete intermediate files

In [25]:
# Remove the id-less `.1` files and the `.2` claims file; the final outputs are kept.
!rm 'claims.time.year.augmented.1.tsv.gz' \
'claims.time.year.augmented.2.tsv.gz'\
'metadata.augmented.labels.properties.time.years.1.tsv.gz' \
'metadata.augmented.labels.time.years.1.tsv.gz' \
'metadata.augmented.properties.time.years.1.tsv.gz' \
'metadata.augmented.time.years.1.tsv.gz'

In [26]:
# cat this file with claims.wikibase-item.tsv.gz

In [None]:
# Concatenate the augmented year claims with the item claims and sort the result.
# The leading `!` is required: without it the cell is parsed as Python and fails.
# `/` is the kgtk pipeline separator that feeds `cat` into `sort`.
!kgtk cat -i claims.time.year.augmented.tsv.gz \
-i claims.wikibase-item.tsv.gz \
/ sort --extra '--parallel 24 --buffer-size 30% --temporary-directory /tmp' -o claims.wikibase-item.augmented.tsv.gz