import json import sys from nltk.corpus import wordnet as wn from datetime import datetime from collections import OrderedDict import os wn_synsets={'killing': {'killing.n.02', 'kill.v.01'}, 'injuring': {'injure.v.01', 'injured.a.01'}, 'fire_burning': {'fire.n.01'}, 'job_firing': {'displace.v.03'}} # the set of synsets that correspond to the question's event type, according to the task description def extract_keywords_and_time(qdata): """ Extract the values given for the event properties (time, location, participant) in the question, as keywords. """ keywords=set() time=None for a in ["location", "time", "participant"]: if a in qdata: raw_value=list(qdata[a].values())[0] if a=="time": #time is not a keyword but is processed differently time=qdata[a] elif a=="location": # locations are preprocessed to remove the root of the URI keywords.add(raw_value.split('/')[-1].replace('_', ' ')) else: keywords.add(raw_value) return keywords, time def get_event_lemmas(event_type): """ Obtain the event lemmas based on the event type in the question. This is done by first mapping the event type to its synsets in WordNet, and then extracting the lemmas of those WordNet synsets. """ syns = set(wn_synsets[event_type]) syns = [wn.synset(i) for i in syns] lemmas=set() for s in syns: lemmas |=set([l.replace('_', ' ') for l in s.lemma_names()]) return lemmas def get_et_for_lemmas(etypes={'killing', 'injuring'}): """ Obtain event types that a lemma refers to. """ lemma2et={} for etype in etypes: lemmas=get_event_lemmas(etype) for lemma in lemmas: lemma2et[lemma]=etype return lemma2et def time_fits(dct, qtime): """ Check if the DCT fits the question time. """ if qtime is None: return True dt = datetime.strptime(dct, '%Y-%m-%d') dct_day = dt.strftime("%d/%m/%Y") dct_month = dt.strftime("%m/%Y") dct_year = dt.strftime("%Y") for granularity in qtime: if granularity=='day': qdt=datetime.strptime(qtime[granularity], '%d/%m/%Y') qdt_day = qdt.strftime("%d/%m/%Y") return qdt_day==dct_day elif granularity=='month': qdt=datetime.strptime(qtime[granularity], '%m/%Y') qdt_month = qdt.strftime("%m/%Y") return qdt_month==dct_month elif granularity=='year': qdt=datetime.strptime(qtime[granularity], '%Y') qdt_year = qdt.strftime("%Y") return qdt_year==dct_year else: continue def process_question(qdata, conll_data, answer_is_one=False): """ Process a question and produce an answer (numerical answer and set of documents). A document belongs to the set of answer documents if: 1) its publishing time fits the question time; 2) all event property keywords from the question are found in the document text; and 3) at least one of the event lemmas was found in the document text. For the subtask S1, we make sure that the numeric answer is one (if it is not, we don't answer the question). We assume that all answer documents of S1 report on the same incident. For the subtask S2, we assume that each document reports on a different incident. """ keywords, q_time=extract_keywords_and_time(qdata) event_type=qdata["event_type"] event_lemmas=get_event_lemmas(event_type) answer_docs=set() for docid, docdata in conll_data.items(): dct=docdata['dct'] lines=docdata['content'] tokens=[] for line in lines: token=line.split()[1] if token=='NEWLINE': token='\n' tokens.append(token) text=' '.join(tokens) if all(k in text for k in keywords) and any(t in tokens for t in event_lemmas) and time_fits(dct, q_time): answer_docs.add(docid) if answer_is_one: # s1 if len(answer_docs): return answer_docs, 1 else: return answer_docs, 0 else: # s2 or s3$ return answer_docs, len(answer_docs) def mention_annotation(conll_data): """ Annotate the data with mentions and produce the resulting CONLL string. We consider the event mentions belonging to the same document to be correferential, and the ones belonging to different documents non-coreferential. """ lemma2et=get_et_for_lemmas() output_conll="" docnum=0 for docid, docdata in conll_data.items(): output_conll+= docdata['start_line'] for line in docdata['content']: line=line.strip() tid, token, part, annotation = line.split('\t') if token in lemma2et: if lemma2et[token]=='killing': annotation='(20%d)' % docnum else: # injuring annotation='(21%d)' % docnum output_conll+='\t'.join([tid, token, part, annotation]) + '\n' output_conll+=docdata['end_line'] docnum+=1 return output_conll def load_conll_data(file_location): """ Load the conll file data into a dictionary. """ conll_data=OrderedDict() with open(file_location, 'r') as f: for line in f: if line.startswith('#begin'): docid="" list_of_lines=[] start_line=line elif line.startswith("#end"): end_line = line conll_data[docid]={ 'dct': dct, 'content': list_of_lines, 'start_line': start_line, 'end_line': end_line } else: line_elements=line.split() if docid=="": docid=line_elements[0].split('.')[0] if line_elements[2]=='DCT': start_line+=line dct=line_elements[1] else: list_of_lines.append(line) return conll_data def main(subtask): """ Main processing function for a subtask. """ print("Subtask %s" % subtask) input_dir = "%s/%s" % (sys.argv[1], subtask) # the directory of the input data output_dir = "%s/%s" % (sys.argv[2], subtask) # the directory of the output data os.makedirs(output_dir, exist_ok=True) with open('%s/questions.json' % input_dir, 'r') as f: questions = json.load(f) conll_data=load_conll_data("%s/docs.conll" % input_dir) conll_output = mention_annotation(conll_data) with open('%s/docs.conll' % output_dir, 'w') as outfile: outfile.write(conll_output) answers_json={} for q in questions: answer_docs, num_answer=process_question(questions[q], conll_data, subtask=="s1") if subtask=="s1" and num_answer!=1: continue # don't answer if the answer for s1 is incorrect answers_json[q]={"numerical_answer": num_answer, "answer_docs": list(answer_docs)} with open('%s/answers.json' % output_dir, 'w') as outjson: json.dump(answers_json, outjson, indent=4) if __name__=="__main__": if len(sys.argv)<3: print("Please supply the input and the output folder as arguments") print("Example: python3 baseline1.py inputdir outputdir") sys.exit() for subtask in ["s1", "s2"]: main(subtask)