import json
import os
import sys


def load_data(gold_answers_path, system_answers_path):
    """Load gold and system answers, pairing them by question id.

    Only ids present in both files are scored.
    """
    with open(gold_answers_path) as f:
        gold_data = json.load(f)
    with open(system_answers_path) as f:
        pred_data = json.load(f)
    gold_answers = {item['id']: str(item['answer']).strip() for item in gold_data}
    predictions = {item['id']: str(item['answer']).strip() for item in pred_data}
    gold_list = []
    pred_list = []
    for qid, answer in gold_answers.items():
        if qid in predictions:
            gold_list.append(answer)
            pred_list.append(predictions[qid])
    return gold_list, pred_list


def calculate_exact_match(reference, hypothesis):
    """Return 1 if the prediction matches the reference string exactly, else 0."""
    return int(reference == hypothesis)


def calculate_f_score(references, hypotheses):
    """Average token-level precision, recall, and F1 over all answer pairs.

    Overlap is computed on the *set* of whitespace tokens, so a token
    repeated in either answer counts only once toward the overlap.
    """
    per_pair_scores = []
    for ref, hyp in zip(references, hypotheses):
        ref_tokens = ref.split()
        hyp_tokens = hyp.split()
        common_tokens = set(ref_tokens) & set(hyp_tokens)
        precision = len(common_tokens) / len(hyp_tokens) if hyp_tokens else 0
        recall = len(common_tokens) / len(ref_tokens) if ref_tokens else 0
        f1_score = (2 * precision * recall / (precision + recall)
                    if (precision + recall) > 0 else 0)
        per_pair_scores.append((precision, recall, f1_score))
    if not per_pair_scores:  # no overlapping ids: avoid division by zero
        return 0.0, 0.0, 0.0
    n = len(per_pair_scores)
    avg_precision = sum(p for p, _, _ in per_pair_scores) / n
    avg_recall = sum(r for _, r, _ in per_pair_scores) / n
    avg_f1_score = sum(f1 for _, _, f1 in per_pair_scores) / n
    return avg_precision, avg_recall, avg_f1_score


def evaluate(gold_answers_path, system_answers_path):
    """Compute exact-match accuracy and averaged precision/recall/F1."""
    gold_answers, predictions = load_data(gold_answers_path, system_answers_path)
    exact_matches = [calculate_exact_match(ref, hyp)
                     for ref, hyp in zip(gold_answers, predictions)]
    precision, recall, f1_score = calculate_f_score(gold_answers, predictions)
    exact_match_score = sum(exact_matches) / len(exact_matches) if exact_matches else 0
    return exact_match_score, precision, recall, f1_score


def main():
    input_dir = sys.argv[1]
    output_dir = sys.argv[2]
    # Reference answers live under 'ref', system outputs under 'res'.
    gt_path = os.path.join(input_dir, 'ref', 'answers2.txt')
    system_path = os.path.join(input_dir, 'res', 'answers2.txt')
    print(f"Config:\n\tGround truth: {gt_path}\n\tSystem path: {system_path}")
    exact_match_accuracy, precision, recall, f1 = evaluate(gt_path, system_path)
    print(
        f"\nQA Results:"
        f"\n\tExact Match Accuracy: {round(exact_match_accuracy * 100, 2)}%"
        f"\n\tPrecision: {round(precision, 5)}"
        f"\n\tRecall: {round(recall, 5)}"
        f"\n\tF1: {round(f1, 5)}"
    )
    with open(os.path.join(output_dir, 'scores.txt'), 'w') as f:
        f.write("ExactMatch: %f\n" % exact_match_accuracy)
        f.write("Precision: %f\n" % precision)
        f.write("Recall: %f\n" % recall)
        f.write("F1: %f\n" % f1)


if __name__ == "__main__":
    main()
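
# The block below is an optional, hedged smoke test, not part of the original
# script: it builds the 'ref'/'res' directory layout implied by main() in a
# temporary directory, writes toy JSON answer lists, and checks the metrics.
# The sample ids and answers are illustrative assumptions. Call _smoke_test()
# manually; it never runs automatically.
def _smoke_test():
    import tempfile
    with tempfile.TemporaryDirectory() as tmp:
        os.makedirs(os.path.join(tmp, 'ref'))
        os.makedirs(os.path.join(tmp, 'res'))
        gold = [{"id": "q1", "answer": "Paris"},
                {"id": "q2", "answer": "blue whale"}]
        pred = [{"id": "q1", "answer": "Paris"},
                {"id": "q2", "answer": "whale"}]
        with open(os.path.join(tmp, 'ref', 'answers2.txt'), 'w') as f:
            json.dump(gold, f)
        with open(os.path.join(tmp, 'res', 'answers2.txt'), 'w') as f:
            json.dump(pred, f)
        em, p, r, f1 = evaluate(os.path.join(tmp, 'ref', 'answers2.txt'),
                                os.path.join(tmp, 'res', 'answers2.txt'))
        # q1 matches exactly; q2 recovers one of the two gold tokens, so
        # EM = 0.5, precision = 1.0, recall = 0.75, F1 = (1 + 2/3) / 2.
        assert em == 0.5 and p == 1.0 and abs(r - 0.75) < 1e-9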