{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Copyright (c) Microsoft Corporation. \n", "# Licensed under the MIT license.\n", "\"\"\"\n", "The code for creating the dataset from T-REx for pre-training factual adapter on relation classification task.\n", "\n", "Input:cleaned T_Rex dataset. \"T_REx_example.json\"\n", "Output:examples: {tokens, obj_label(relation label), entity [subj_strat,subj_end]}\n", "\n", "- Whole dataset \"T_REx_examples_{}_{}.json\".format(max_count,min_predicate_num)\n", "- Training split data/train.json\n", "- Dev split data/dev.json\n", "- labels data/relations.json\n", "\n", "example:\n", "{'docid': 'Q3343462',\n", " 'token': ['Nora','Zaïdi','(','Nora','Mebrak','-','Zaïdi','),','born','on','July','6',',','1965','in','Bethoncourt','(','French','département','of',\n", " 'Doubs','),','daughter','of','an','Algerian','textile','toiler',',','is','a','French','activist','who','seated','in','the','European','Parliament','from',\n", " '1989','to','1994','.'],\n", " 'relation': 'P31',\n", " 'subj_start': 20,\n", " 'subj_end': 20,\n", " 'obj_start': 18,\n", " 'obj_end': 18,\n", " 'subj_label': 'Q3361',\n", " 'obj_label': 'Q6465'}\n", "\"\"\"\n", "\n", "import json\n", "import matplotlib.pyplot as plt\n", "import os" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "652\n" ] } ], "source": [ "def get_property2idx_dict(file=\"../data/cleaned_T_REx/T_REx_pred2ic.json\"):\n", " with open(file, \"r\") as fin:\n", " property2idx_dict = json.load(fin)\n", " return property2idx_dict\n", "# property dictionary.\n", "property2idx_dict = get_property2idx_dict()\n", "idx2property_dict = {v: k for k, v in property2idx_dict.items()}\n", "\n", "print(len(property2idx_dict.keys()))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { 
"name": "#%%\n" } }, "outputs": [], "source": [ "def load_examples(file):\n", " with open(file, \"r\") as fin:\n", " all_examples = json.load(fin)\n", " print(\"total examples:\", len(all_examples))\n", " return all_examples\n", "examples = load_examples(file=\"../data/cleaned_T_REx/T_REx_example.json\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "example = examples[0]\n", "print(example)\n", "subj_start_id = example[\"subj_start\"]\n", "subj_end_id = example[\"subj_end\"]\n", "example[\"token\"][subj_start_id:subj_end_id+1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "sub_examples = examples\n", "examples_fpre = {}\n", "for i, example in enumerate(sub_examples):\n", " if example['relation'] in examples_fpre.keys():\n", " temp = examples_fpre[example['relation']]\n", " temp.append(example)\n", " examples_fpre[example['relation']]= temp\n", " else:\n", " examples_fpre[example['relation']]= [example]\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "from collections import Counter\n", "count_list = []\n", "label_list = []\n", "total_predicates_uri = {}\n", "for key in examples_fpre.keys():\n", " total_predicates_uri[key] = len(examples_fpre[key])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "print(total_predicates_uri['P17'])\n", "print(total_predicates_uri['P131'])\n", "print(total_predicates_uri['P31'])\n", "print(total_predicates_uri['P47'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# counting\n", "counter_predicates_uri = Counter(total_predicates_uri)\n", "sort_counter_predicates_uri = counter_predicates_uri.most_common(200)\n", "labels, values 
= zip(*sort_counter_predicates_uri)\n",
 "\n",
 "# most_common() is sorted by descending count; reverse so the largest bar\n",
 "# is drawn last (topmost in the horizontal bar chart).\n",
 "my_labels = list(labels)[::-1]\n",
 "my_values = list(values)[::-1]\n",
 "\n",
 "fig, ax = plt.subplots(figsize=(10, 30), dpi=100)\n",
 "b = ax.barh(range(len(my_labels)), my_values, height=0.8, color='steelblue', alpha=0.7)\n",
 "# Annotate every bar with its count, starting at the bar's right edge.\n",
 "for rect in b:\n",
 "    w = rect.get_width()\n",
 "    ax.text(w, rect.get_y() + rect.get_height() / 2, '%d' % int(w), ha='left', va='center')\n",
 "\n",
 "ax.set_yticks(range(len(my_labels)))\n",
 "ax.set_yticklabels(my_labels)\n",
 "\n",
 "plt.xticks(())  # no x ticks: the counts are written next to the bars\n",
 "plt.title('Total Subset Predicates distribution', loc='center')\n",
 "plt.savefig('total_subset_predicates_dis.png')\n",
 "plt.show()\n",
 "\n"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [
 "# del P31 (instance of)\n",
 "# Guarded so the cell can be re-run (or run on data without P31) without a KeyError.\n",
 "if 'P31' in examples_fpre:\n",
 "    del examples_fpre['P31']"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [
 "# sub_examples now refers to the relation -> examples dict.\n",
 "sub_examples = examples_fpre"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# del some relations\n", "total_examples = 0\n", "min_predicate_num = 50\n", "count = 0\n", "discard_list = []\n", "for key in sub_examples.keys():\n", " if len(sub_examples[key])