{ "cells": [ { "cell_type": "markdown", "id": "express-journalist", "metadata": {}, "source": [ "In this notebook, we make a collection of all the embeddings which we use to do retrofitting. These embeddings are then evaluated for their similarity based on the evaluation benchmark datasets." ] }, { "cell_type": "code", "execution_count": 1, "id": "incorrect-routine", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.metrics.pairwise import euclidean_distances\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import scipy.stats as stats\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import confusion_matrix\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from tqdm.notebook import tqdm\n", "from itertools import combinations\n", "from math import comb\n", "from sklearn.ensemble import RandomForestClassifier\n", "import os\n", "import h5py\n", "import json\n", "import gzip" ] }, { "cell_type": "code", "execution_count": 2, "id": "moderate-drunk", "metadata": {}, "outputs": [], "source": [ "# DWD V2 files\n", "# https://drive.google.com/drive/u/3/folders/1OIZegxxrs_Hv2ZhDsSO-zLVARCR60P01\n", "# SITELINKS_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/sitelinks.en.tsv.gz\"\n", "CLAIMS_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/claims.tsv.gz\"\n", "LABELS_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/labels.en.tsv.gz\"\n", "DESCRIPTIONS_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/descriptions.en.tsv.gz\"" ] }, { "cell_type": "code", "execution_count": 3, "id": "exceptional-funeral", "metadata": {}, "outputs": [], "source": [ "# wikidata-20210215 files\n", "# https://drive.google.com/drive/u/3/folders/1NGtob1BFQ03sXf4yQyYvP13ly3u1Ul5u\n", "# SITELINKS_FILE_V1 = \"../source_dataset_files/wikidata-20210215/sitelinks.en.tsv.gz\"" ] }, { "cell_type": "code", "execution_count": 4, "id": "compressed-question", "metadata": {}, "outputs": [], "source": [ "# wikidata-20201208 files\n", "# https://drive.google.com/drive/u/3/folders/1qbbgjo7pddMdDvQzOSeSaL6lYwj_f5gi\n", "SITELINKS_FILE_V2 = \"../source_dataset_files/wikidata-20201208/sitelinks.en.tsv.gz\"" ] }, { "cell_type": "code", "execution_count": 5, "id": "fiscal-appointment", "metadata": {}, "outputs": [], "source": [ "# Embedding Related Files\n", "DBPEDIA_SHORT_ABSTRACTS_TTL = \"../data/evaluation/source_files/short-abstracts_lang=en.ttl\"\n", "DBPEDIA_SHORT_ABSTRACTS_CSV = \"../data/evaluation/source_files/short-abstracts_lang=en.csv\"\n", "ABSTRACTS_INTERMEDIATE_FILE = \"../data/embeddings/intermediate_files/abstracts.csv\"\n", "\n", "COMPLEX_EMB_SOURCE_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/wikidatadwd.complEx.graph-embeddings.txt\"\n", "TRANSE_EMB_SOURCE_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/wikidatadwd.transE.graph-embeddings.txt\"\n", "TEXT_EMB_SOURCE_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/text-embeddings-concatenated.tsv.gz\"\n", "\n", "COMPLEX_EMB_FINAL_FILE = \"../data/embeddings/complex_orig_embedding_dict.json\"\n", "TRANSE_EMB_FINAL_FILE = \"../data/embeddings/transe_orig_embedding_dict.json\"\n", "TEXT_EMB_FINAL_FILE = \"../data/embeddings/text_7_props_orig_embedding_dict.json\"\n", "ABS_EMB_FINAL_FILE = \"../data/embeddings/abstract_orig_embedding_dict.json\"\n", "ABS_FIRST_SENT_EMB_FINAL_FILE = \"../data/embeddings/abstract_first_sent_orig_embedding_dict.json\"\n", "\n", "LABELS_EMB_FINAL_FILE = 
\"../data/embeddings/labels_orig_embedding_dict.json\"\n", "LABELS_DESC_EMB_FINAL_FILE = \"../data/embeddings/labels_n_desc_orig_embedding_dict.json\"" ] }, { "cell_type": "code", "execution_count": 6, "id": "departmental-buddy", "metadata": {}, "outputs": [], "source": [ "# HAS Embedding Related Files\n", "A_SOURCE_FILE = \"../source_dataset_files/A_walks_analysis/a_embeddings_10x10,min_count=0.kv\"\n", "A_OP_FILE = \"../data/embeddings/has_a_orig_embedding_dict.json\"\n", "\n", "H_SOURCE_FILE = \"../source_dataset_files/H_walks_analysis/h_embeddings_5x8,min_count=21.kv\"\n", "H_OP_FILE = \"../data/embeddings/has_h_orig_embedding_dict.json\"\n", "\n", "S_SOURCE_FILE = \"../source_dataset_files/S_walks_analysis/s_embeddings_5x10,min_count=0.kv\"\n", "S_OP_FILE = \"../data/embeddings/has_s_orig_embedding_dict.json\"" ] }, { "cell_type": "code", "execution_count": 28, "id": "failing-talent", "metadata": {}, "outputs": [], "source": [ "WORDSIM_CLASS_SIM_FILE = '../data/embeddings/wordsim_class_sim.csv'\n", "WORDSIM_JC_SIM_FILE = '../data/embeddings/wordsim_jc_sim.csv'\n", "WORDSIM_TOP_SIM_FILE = '../data/embeddings/wordsim_top_sim.csv'\n", "\n", "WORDSIM_OLD_CLASS_SIM_FILE = '../data/embeddings/wordsim_old_class_sim.csv'\n", "WORDSIM_OLD_JC_SIM_FILE = '../data/embeddings/wordsim_old_jc_sim.csv'\n", "WORDSIM_OLD_TOP_SIM_FILE = '../data/embeddings/wordsim_old_top_sim.csv'\n", "\n", "DBPEDIA_MC_30_CLASS_SIM_FILE = '../data/embeddings/dbpedia_mc_30_class_sim.csv'\n", "DBPEDIA_MC_30_JC_SIM_FILE = '../data/embeddings/dbpedia_mc_30_jc_sim.csv'\n", "DBPEDIA_MC_30_TOP_SIM_FILE = '../data/embeddings/dbpedia_mc_30_top_sim.csv'\n", "\n", "DBPEDIA_RG_65_CLASS_SIM_FILE = '../data/embeddings/dbpedia_rg_65_class_sim.csv'\n", "DBPEDIA_RG_65_JC_SIM_FILE = '../data/embeddings/dbpedia_rg_65_jc_sim.csv'\n", "DBPEDIA_RG_65_TOP_SIM_FILE = '../data/embeddings/dbpedia_rg_65_top_sim.csv'" ] }, { "cell_type": "code", "execution_count": 29, "id": "elementary-desktop", "metadata": {}, "outputs": [], "source": [ "P279_CHILD_PAR_DISTILBERT_COSSIM_FILE = \"../data/basis/P279_ChildPar.all-distilroberta-v1.csv\"\n", "WORDSIM_FILE = \"../data/evaluation/wordsim353_with_r3.csv\"\n", "WORDSIM_OLD_FILE = \"../data/evaluation/wordsim_old.csv\"\n", "DBPEDIA_MC_30_FINAL_FILE = \"../data/evaluation/mc-30_DBpedia.csv\"\n", "DBPEDIA_RG_65_FINAL_FILE = \"../data/evaluation/rg-65_DBpedia.csv\"\n" ] }, { "cell_type": "markdown", "id": "noble-draft", "metadata": {}, "source": [ "# Common Code" ] }, { "cell_type": "code", "execution_count": 9, "id": "broadband-background", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "241698\n" ] } ], "source": [ "def get_all_nodes():\n", " \"\"\"\n", " This function generates the set of all nodes needed for execution\n", " \"\"\"\n", " p279ChildPar = pd.read_csv(P279_CHILD_PAR_DISTILBERT_COSSIM_FILE)\n", " wordsim_df = pd.read_csv(WORDSIM_FILE)\n", " dbpedia_mc_30_df = pd.read_csv(DBPEDIA_MC_30_FINAL_FILE)\n", " dbpedia_rg_65_df = pd.read_csv(DBPEDIA_RG_65_FINAL_FILE)\n", "# wiki_cs_df = pd.read_csv('../data/wikidata-cs_categorized.csv')\n", "# concept_net_df = pd.read_csv('../data/kgtk_conceptnet_evaluation.csv')\n", " p279QnodesList = set(p279ChildPar.node1.to_list() \n", " + p279ChildPar.node2.to_list()\n", " + wordsim_df['word1_kg_id'].to_list() \n", " + wordsim_df['word2_kg_id'].to_list()\n", " + dbpedia_mc_30_df['word1_kg_id'].to_list()\n", " + dbpedia_mc_30_df['word2_kg_id'].to_list()\n", " + dbpedia_rg_65_df['word1_kg_id'].to_list()\n", " + 
dbpedia_rg_65_df['word2_kg_id'].to_list())\n", "# + wiki_cs_df['word1_kg_id'].to_list() \n", "# + wiki_cs_df['word2_kg_id'].to_list()\n", "# + concept_net_df['word1_kg_id'].to_list()\n", "# + concept_net_df['word2_kg_id'].to_list())\n", " print(len(p279QnodesList))\n", " return p279QnodesList\n", "\n", "allNodes = get_all_nodes()" ] }, { "cell_type": "code", "execution_count": 10, "id": "parliamentary-documentation", "metadata": {}, "outputs": [], "source": [ "def fillCoverage(embedDict):\n", " wordSim353AnnotDF_New = pd.read_csv(WORDSIM_FILE)\n", " wordSim353AnnotDF_set = set(wordSim353AnnotDF_New['word1_kg_id'].to_list() + wordSim353AnnotDF_New['word2_kg_id'].to_list())\n", " embed_size = len(embedDict[next(iter(embedDict))])\n", "# print(embed_size)\n", " count = 0\n", " for word in wordSim353AnnotDF_set:\n", " if word not in embedDict:\n", " embedDict[word] = np.zeros((embed_size))\n", " count += 1\n", " print(f\"Added {count} corrections\")\n", " return embedDict\n", "\n", "def deserializeEmbeddingDict(embedDict):\n", " for key2 in embedDict.keys():\n", " embedDict[key2] = np.array(embedDict[key2])\n", " return embedDict\n", "\n", "def serializeEmbeddingDict(embedDict):\n", " for key2 in embedDict.keys():\n", " embedDict[key2] = embedDict[key2].tolist() if type(embedDict[key2]) != list else embedDict[key2]\n", " return embedDict" ] }, { "cell_type": "code", "execution_count": 11, "id": "established-brush", "metadata": {}, "outputs": [], "source": [ "def get_labels(node_set):\n", " labels_dict = {}\n", " with gzip.open(LABELS_FILE, 'r') as labelsFile:\n", " firstLine = True\n", " for line in tqdm(labelsFile, total=41845781):\n", " if firstLine:\n", " firstLine = False\n", " continue\n", " line = line.decode('utf-8').strip().split('\\t')\n", " line[3] = line[3][1:-5]\n", " qnode, label = line[1], line[3]\n", " # print(qnode, label)\n", " if qnode in node_set:\n", " labels_dict[qnode] = label\n", " return labels_dict" ] }, { "cell_type": "code", "execution_count": 12, "id": "automated-olive", "metadata": {}, "outputs": [], "source": [ "def get_labels_n_desc(node_set):\n", " labels_dict = get_labels(node_set)\n", " with gzip.open(DESCRIPTIONS_FILE, 'r') as descFile:\n", " firstLine = True\n", " for line in tqdm(descFile, total=34700043):\n", " if firstLine:\n", " firstLine = False\n", " continue\n", " line = line.decode('utf-8').strip().split('\\t')\n", " line[3] = line[3][1:-5]\n", " qnode, label = line[1], line[3]\n", " # print(qnode, label)\n", " if qnode in node_set:\n", " if qnode in labels_dict:\n", " labels_dict[qnode] += ' ' + label\n", " else:\n", " raise KeyError(\"Label not present\")\n", " return labels_dict" ] }, { "cell_type": "markdown", "id": "driven-yeast", "metadata": {}, "source": [ "# Complex + Transe Embeddings Generation" ] }, { "cell_type": "code", "execution_count": 20, "id": "sound-spain", "metadata": {}, "outputs": [], "source": [ "complex_emb_dict = json.load(open(COMPLEX_EMB_FINAL_FILE))" ] }, { "cell_type": "code", "execution_count": 21, "id": "correct-gentleman", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "64deda8236084d79bce85a2fd249dec9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "first_line = True\n", "complex_emb_dict = {}\n", "with open(COMPLEX_EMB_SOURCE_FILE) as complex_file:\n", " for line in tqdm(complex_file, total=53002671):\n", " if first_line:\n", 
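" # skip the header row of the graph-embeddings file\n", 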
" first_line = False\n", " continue\n", " line = line.strip().split()\n", " if line[0] in allNodes and line[0] not in complex_emb_dict:\n", " complex_emb_dict[line[0]] = [float(elem) for elem in line[1:]]" ] }, { "cell_type": "code", "execution_count": 22, "id": "proved-buffer", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "241698" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(complex_emb_dict)" ] }, { "cell_type": "code", "execution_count": 23, "id": "hazardous-amazon", "metadata": {}, "outputs": [], "source": [ "json.dump(complex_emb_dict, open(COMPLEX_EMB_FINAL_FILE, 'w'))" ] }, { "cell_type": "code", "execution_count": 24, "id": "industrial-paradise", "metadata": {}, "outputs": [], "source": [ "transe_emb_dict = json.load(open(TRANSE_EMB_FINAL_FILE))" ] }, { "cell_type": "code", "execution_count": 25, "id": "needed-passion", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8636f544aa484f6d9785723a4a96e83b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "241698" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "first_line = True\n", "transe_emb_dict = {}\n", "with open(TRANSE_EMB_SOURCE_FILE) as complex_file:\n", " for line in tqdm(complex_file, total=53002671):\n", " if first_line:\n", " first_line = False\n", " continue\n", " line = line.strip().split()\n", " if line[0] in allNodes and line[0] not in transe_emb_dict:\n", " transe_emb_dict[line[0]] = [float(elem) for elem in line[1:]]\n", "len(transe_emb_dict)" ] }, { "cell_type": "code", "execution_count": 26, "id": "classified-chick", "metadata": {}, "outputs": [], "source": [ "json.dump(transe_emb_dict, open(TRANSE_EMB_FINAL_FILE, 'w'))" ] }, { "cell_type": "code", "execution_count": null, "id": "steady-cliff", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "suffering-zealand", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 5, "id": "brief-timer", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "238889\n" ] } ], "source": [ "# p279ChildPar = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')\n", "# print(len(set(p279ChildPar.node1.to_list() \n", "# + p279ChildPar.node2.to_list())))\n", "\n", "# # Load complex, transe embedding files and entity names file\n", "# compf = h5py.File('../data/complTrans/complEx.h5','r')\n", "# transf = h5py.File('../data/complTrans/transE.h5','r')\n", "# ent_names = json.load(open('../data/complTrans/entity_names_all_0.json'))\n", "# allNodes = get_all_nodes()\n", "# # json.dump(list(p279QnodesList), open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json', 'w'))\n", "\n", "# complexEmb = {qnode: emb for emb, qnode in zip(compf['embeddings'], ent_names) if qnode in allNodes}\n", "# transeEmb = {qnode: emb for emb, qnode in zip(transf['embeddings'], ent_names) if qnode in allNodes}\n", "# print(f\"Out of {len(ent_names)} embeddings, retaining {len(transeEmb)} embeddings\")\n", "\n", "# def serialize_embedding_dict(embed_dict):\n", "# for key2 in embed_dict.keys():\n", "# embed_dict[key2] = embed_dict[key2].tolist() if type(embed_dict[key2]) != list else embed_dict[key2]\n", "# return embed_dict\n", "\n", "# 
json.dump(serialize_embedding_dict(complexEmb),open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json','w'))\n", "# json.dump(serialize_embedding_dict(transeEmb),open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json','w'))\n", "# # complexEmb = json.load(open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json'))\n", "# # transeEmb = json.load(open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json'))" ] }, { "cell_type": "markdown", "id": "continued-locking", "metadata": {}, "source": [ "# Text Embedding" ] }, { "cell_type": "code", "execution_count": 34, "id": "polished-divorce", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6f97b30760e841da87be90aebef9c8cd", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "first_line = True\n", "text_emb_dict = {}\n", "with gzip.open(TEXT_EMB_SOURCE_FILE) as file:\n", " for line in tqdm(file):\n", " if first_line:\n", " first_line = False\n", " continue\n", " line = line.decode('utf-8').strip().split('\\t')\n", " if line[1] == 'text_embedding' and line[0] in allNodes:\n", " text_emb_dict[line[0]] = [float(elem) for elem in line[2].split(',')]" ] }, { "cell_type": "code", "execution_count": 37, "id": "raising-boost", "metadata": {}, "outputs": [], "source": [ "json.dump(text_emb_dict, open(TEXT_EMB_FINAL_FILE, 'w'))" ] }, { "cell_type": "code", "execution_count": 20, "id": "beautiful-drill", "metadata": {}, "outputs": [], "source": [ "# text_emb_dict = json.load(open('../data/embeddings/archived/text_7_props_orig_embedding_dict.json.old'))" ] }, { "cell_type": "code", "execution_count": 40, "id": "impressed-stations", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a75bc167e9b449f88e4df7ebb19bca77", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/241698 [00:00<?, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "q1 = \"kgtk text-embedding ... > ../output/text-embeddings/P279-text-embedding-7-props-missing-qnodes.tsv\"\n", "os.system(q1 + \" \")" ] }, { "cell_type": "code", "execution_count": 15, "id": "based-circuit", "metadata": {}, "outputs": [], "source": [ "text7_missingnodes = pd.read_csv(\"../output/text-embeddings/P279-text-embedding-7-props-missing-qnodes.tsv\", sep='\\t')\n", "text7_missingnodes = text7_missingnodes[text7_missingnodes.property == 'text_embedding']" ] }, { "cell_type": "code", "execution_count": 16, "id": "imposed-series", "metadata": {}, "outputs": [], "source": [ "text7_missingnodes['value'] = text7_missingnodes['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])" ] }, { "cell_type": "code", "execution_count": 17, "id": "compliant-locator", "metadata": {}, "outputs": [], "source": [ "text7EmbDict = {row['node']: row['value'] for _,row in text7_missingnodes.iterrows()}" ] }, { "cell_type": "code", "execution_count": 21, "id": "quick-voluntary", "metadata": {}, "outputs": [], "source": [ "for key in text7EmbDict.keys():\n", " if key not in text_emb_dict:\n", " text_emb_dict[key] = text7EmbDict[key]" ] }, { "cell_type": "code", "execution_count": 22, "id": "special-smile", "metadata": {}, "outputs": [], "source": [ "json.dump(text_emb_dict, open(TEXT_EMB_FINAL_FILE, 'w'))" ] }, { "cell_type": "markdown", "id": "infectious-mauritius", "metadata": {}, "source": [ "## Old technique follows" ] }, { "cell_type": "code", "execution_count": 2, "id": "neural-gibson", "metadata": {}, "outputs": [], "source": 
[ "p279ChildPar = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')\n", "p279QnodesList = list(set(p279ChildPar.node1.to_list() + p279ChildPar.node2.to_list()))" ] }, { "cell_type": "code", "execution_count": 5, "id": "instructional-weather", "metadata": {}, "outputs": [], "source": [ "missingNodes = allNodes - set(p279ChildPar.node1.to_list() + p279ChildPar.node2.to_list())" ] }, { "cell_type": "code", "execution_count": 6, "id": "specified-clear", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "37038" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(missingNodes)" ] }, { "cell_type": "code", "execution_count": 8, "id": "conditional-brooks", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6fb2da21d7cf4241a3e52ac132a7c534", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/38 [00:00 ../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-\" + str(cnt) + \".tsv\"\n", " print(cnt)\n", " runCommCnt += 1\n", " os.system(q1 + \" &\")\n", " if runCommCnt % 15 == 0:\n", " time.sleep(11*60)\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "surprising-burning", "metadata": { "scrolled": true }, "outputs": [], "source": [ "for cnt in tqdm(range(1,290)):\n", " if countFileLines(\"../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-\" + str(cnt) + \".tsv\") != 4097:\n", " print(cnt)" ] }, { "cell_type": "code", "execution_count": null, "id": "bottom-lodge", "metadata": {}, "outputs": [], "source": [ "import time\n", "from os.path import exists\n", "\n", "# roberta-large-nli-mean-tokens\n", "runCommCnt = 0\n", "for cnt in tqdm(range(252,290)):\n", " if exists(\"../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-\" + str(cnt) + \".tsv\") and countFileLines(\"../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-\" + str(cnt) + \".tsv\") == 4097:\n", " continue\n", " q1 = \"\"\n", "# if cnt % 10 == 0:\n", "# q1 += \"sleep 20m; \"\n", " q1 += \"~/miniconda3/envs/kgtkEnv2/bin/kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-\" + str(cnt) + \".tsv \\\n", " --model sentence-transformers/all-distilroberta-v1 \\\n", " --property-labels-file ../data/labels.en.tsv --debug \\\n", " --isa-properties P31 P279 \\\n", " --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-\" + str(cnt) + \".tsv\"\n", " print(cnt)\n", " runCommCnt += 1\n", " os.system(q1 + \" &\")\n", " if runCommCnt % 15 == 0:\n", " time.sleep(13*60)\n", " " ] }, { "cell_type": "code", "execution_count": 71, "id": "damaged-browse", "metadata": {}, "outputs": [], "source": [ "p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))" ] }, { "cell_type": "code", "execution_count": 38, "id": "collective-april", "metadata": {}, "outputs": [], "source": [ "# temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-1.tsv', sep='\\t')" ] }, { "cell_type": "code", "execution_count": 39, "id": "decent-yorkshire", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nodepropertyvalue
0Q99738027text_embedding0.74755263,1.6350263,-0.73952675,1.0463063,-0....
1Q99738027embedding_sentencenight shift, work shift during nighttime hours...
2Q99228502text_embedding0.25261465,0.06285462,0.029052094,0.50796187,0...
3Q99228502embedding_sentenceavenue, thoroughfare named \\\"avenue\\\" is thoro...
4Q98970128text_embedding0.11887096,0.8598291,0.4446009,-0.5038472,-0.9...
\n", "
" ], "text/plain": [ " node property \\\n", "0 Q99738027 text_embedding \n", "1 Q99738027 embedding_sentence \n", "2 Q99228502 text_embedding \n", "3 Q99228502 embedding_sentence \n", "4 Q98970128 text_embedding \n", "\n", " value \n", "0 0.74755263,1.6350263,-0.73952675,1.0463063,-0.... \n", "1 night shift, work shift during nighttime hours... \n", "2 0.25261465,0.06285462,0.029052094,0.50796187,0... \n", "3 avenue, thoroughfare named \\\"avenue\\\" is thoro... \n", "4 0.11887096,0.8598291,0.4446009,-0.5038472,-0.9... " ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# temp.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "satisfactory-speech", "metadata": {}, "outputs": [], "source": [ "text2EmbArr = []\n", "for i in tqdm(range(1, 290)):\n", " if not(checkIfFileContainsLines('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-'+str(i)+'.tsv')):\n", " continue\n", " temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-'+str(i)+'.tsv', sep='\\t')\n", " temp = temp[temp.property == 'text_embedding']\n", " text2EmbArr.append(temp)\n", "text2Emb = pd.concat(text2EmbArr)" ] }, { "cell_type": "code", "execution_count": null, "id": "unavailable-competition", "metadata": {}, "outputs": [], "source": [ "text2Emb.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "concerned-april", "metadata": {}, "outputs": [], "source": [ "text7EmbArr = []\n", "for i in tqdm(range(1, 290)):\n", " if not(checkIfFileContainsLines('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-'+str(i)+'.tsv')):\n", " continue\n", " temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-'+str(i)+'.tsv', sep='\\t')\n", " temp = temp[temp.property == 'text_embedding']\n", " text7EmbArr.append(temp)\n", "text7Emb = pd.concat(text7EmbArr)" ] }, { "cell_type": "code", "execution_count": 21, "id": "australian-enforcement", "metadata": {}, "outputs": [], "source": [ "text2Emb = text2Emb[text2Emb.node.apply(lambda p: p in allNodes)]\n", "text7Emb = text7Emb[text7Emb.node.apply(lambda p: p in allNodes)]" ] }, { "cell_type": "code", "execution_count": 22, "id": "closed-treatment", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "We have 2prop text embeddings for 278467 nodes and 7prop for 277587 nodes\n" ] } ], "source": [ "print(f\"We have 2prop text embeddings for {len(text2Emb)} nodes and 7prop for {len(text7Emb)} nodes\")" ] }, { "cell_type": "code", "execution_count": 23, "id": "duplicate-agency", "metadata": {}, "outputs": [], "source": [ "text2Emb['value'] = text2Emb['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])\n", "text7Emb['value'] = text7Emb['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])" ] }, { "cell_type": "code", "execution_count": 24, "id": "framed-third", "metadata": {}, "outputs": [], "source": [ "text2EmbDict = {row['node']: row['value'] for _,row in text2Emb.iterrows()}\n", "text7EmbDict = {row['node']: row['value'] for _,row in text7Emb.iterrows()}" ] }, { "cell_type": "code", "execution_count": 25, "id": "peaceful-andrew", "metadata": {}, "outputs": [], "source": [ "json.dump(text2EmbDict, open('../data/Master_P279_dataset/embeddings/text_2_props_orig_embedding_dict.json', 'w'))\n", "json.dump(text7EmbDict, open('../data/Master_P279_dataset/embeddings/text_7_props_orig_embedding_dict.json', 'w'))" ] }, { "cell_type": "code", "execution_count": null, "id": "considered-river", 
"metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "sustained-playback", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "requested-state", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "suited-going", "metadata": {}, "source": [ "# Abstract Embeddings Generation\n", "\n", "Downloaded short abstracts file from [DBPedia Short Abstracts - 2020.07.01](https://downloads.dbpedia.org/repo/dbpedia/text/short-abstracts/2020.07.01/short-abstracts_lang=en.ttl.bz2)\n", "\n", "Then, we extract the abstracts file from the bz2 file using: `bzip2 -d short-abstracts_lang=en.ttl.bz2`" ] }, { "cell_type": "code", "execution_count": 5, "id": "former-editor", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d0038713a1604ccb9c2e5499615fbc43", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# cnt = 0\n", "# p1s = []\n", "# p11s = []\n", "# p2s = []\n", "# lines = []\n", "# with open(DBPEDIA_SHORT_ABSTRACTS_TTL, 'r', encoding='utf-8') as f:\n", "# for line in tqdm(f):\n", "# p1 = line[:line.find(\" \")]\n", "# p11 = p1[len(\"\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Word 1Word 2IDH_SimH_DimF_SimF_DimN_SimN_DimD_Sim...P_DimAvgStdevH_origH_reversedword1_kg_idword2_kg_idcategoryembedding_cos_simResp_code
0Arafatpeace83D4NaN3U4...NaN3.60.5477232.12507.8750Q34211Q454U3.982734<Response [200]>
1Arafatterror93D4NaN3U4...NaN3.60.5477233.06256.9375Q34211Q13648784U3.969884<Response [200]>
2FBIfingerprint1093D4NaN4NaN3...NaN3.60.5477234.06255.9375Q8333Q178022U4.000000<Response [200]>
3FBIinvestigation1103U3U3U3...u3.00.0000005.06254.9375Q8333Q21004260M3.951077<Response [200]>
4HarvardYale1372S3S2S2...s2.20.4472144.87505.1250Q13371Q49112M1.264601<Response [200]>
\n", "

5 rows × 22 columns

\n", "" ], "text/plain": [ " Word 1 Word 2 ID H_Sim H_Dim F_Sim F_Dim N_Sim N_Dim D_Sim \\\n", "0 Arafat peace 8 3 D 4 NaN 3 U 4 \n", "1 Arafat terror 9 3 D 4 NaN 3 U 4 \n", "2 FBI fingerprint 109 3 D 4 NaN 4 NaN 3 \n", "3 FBI investigation 110 3 U 3 U 3 U 3 \n", "4 Harvard Yale 137 2 S 3 S 2 S 2 \n", "\n", " ... P_Dim Avg Stdev H_orig H_reversed word1_kg_id word2_kg_id \\\n", "0 ... NaN 3.6 0.547723 2.1250 7.8750 Q34211 Q454 \n", "1 ... NaN 3.6 0.547723 3.0625 6.9375 Q34211 Q13648784 \n", "2 ... NaN 3.6 0.547723 4.0625 5.9375 Q8333 Q178022 \n", "3 ... u 3.0 0.000000 5.0625 4.9375 Q8333 Q21004260 \n", "4 ... s 2.2 0.447214 4.8750 5.1250 Q13371 Q49112 \n", "\n", " category embedding_cos_sim Resp_code \n", "0 U 3.982734 \n", "1 U 3.969884 \n", "2 U 4.000000 \n", "3 M 3.951077 \n", "4 M 1.264601 \n", "\n", "[5 rows x 22 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "word_sim_class_sim_df.head()" ] }, { "cell_type": "code", "execution_count": 41, "id": "operational-survival", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "48386a6eaa0745e4a9eebbed1e61c72c", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/349 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "" ], "text/plain": [ " embedding count Coverage Percentage\n", "0 complex 238448 99.815395\n", "1 transe 238448 99.815395\n", "2 text2 238889 100.000000\n", "3 text7 238889 100.000000\n", "4 abstract 105828 44.300072\n", "5 abstractFirstSent 105828 44.300072" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(summArr, columns=['embedding', 'count', 'Coverage Percentage'])" ] }, { "cell_type": "markdown", "id": "changing-strategy", "metadata": {}, "source": [ "# Embeddings correction" ] }, { "cell_type": "code", "execution_count": 6, "id": "purple-raising", "metadata": {}, "outputs": [], "source": [ "masterEmbedDictMaster = {}\n", "subsetEmbedDictMaster = {}" ] }, { "cell_type": "code", "execution_count": 7, "id": "round-product", "metadata": {}, "outputs": [], "source": [ "masterEmbedKeys = ['text_7_props', 'text_2_props', 'complex', 'transe', 'abstract', 'abstract_first_sent']\n", "for key1 in masterEmbedKeys:\n", " masterEmbedDictMaster[key1] = json.load(open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict.json'))" ] }, { "cell_type": "code", "execution_count": 6, "id": "metallic-insulin", "metadata": {}, "outputs": [], "source": [ "subsetEmbedKeys = ['text_7props', 'text_2props', 'complex', 'transe', 'abstract', 'abstract_first_sent']\n", "for key1 in subsetEmbedKeys:\n", " subsetEmbedDictMaster[key1] = json.load(open('../data/orig_embeddings/'+key1+'_original_embeddings_dict.json'))" ] }, { "cell_type": "code", "execution_count": 8, "id": "assigned-parameter", "metadata": {}, "outputs": [], "source": [ "wordSim353AnnotDF_New = pd.read_csv('../data/wordsim353_with_r3.csv')" ] }, { "cell_type": "code", "execution_count": 9, "id": "olympic-yemen", "metadata": {}, "outputs": [], "source": [ "wordsim_pairs = {(row['word1_kg_id'], row['word2_kg_id']) for _, row in wordSim353AnnotDF_New.iterrows()}" ] }, { "cell_type": "code", "execution_count": 15, "id": "welcome-disorder", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pair Coverage by text_7props embeddings created for 19k retrofitting: 325\n", "Pair Coverage by text_2props embeddings created for 19k retrofitting: 325\n", "Pair Coverage by complex embeddings created for 19k retrofitting: 342\n", "Pair Coverage by transe embeddings created for 19k retrofitting: 342\n", "Pair Coverage by abstract embeddings created for 19k retrofitting: 343\n", "Pair Coverage by abstract_first_sent embeddings created for 19k retrofitting: 343\n" ] } ], "source": [ "for key1 in subsetEmbedKeys:\n", " print(f\"Pair Coverage by {key1} embeddings created for 19k retrofitting: {sum([row[0] in subsetEmbedDictMaster[key1] and row[1] in subsetEmbedDictMaster[key1] for row in wordsim_pairs])}\")" ] }, { "cell_type": "code", "execution_count": 17, "id": "northern-psychiatry", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pair Coverage by old text_7_props embeddings created for 19k retrofitting: 278\n", "Pair Coverage by old text_2_props embeddings created for 19k retrofitting: 278\n", "Pair Coverage by old complex embeddings created for 19k retrofitting: 278\n", "Pair Coverage by old transe embeddings created for 19k retrofitting: 278\n", "Pair Coverage by old abstract embeddings created for 19k retrofitting: 183\n", "Pair Coverage by old abstract_first_sent embeddings created for 19k retrofitting: 183\n" ] } ], "source": [ "for key1 in masterEmbedKeys:\n", " print(f\"Pair Coverage by old {key1} embeddings 
created for 19k retrofitting: {sum([row[0] in masterEmbedDictMaster[key1] and row[1] in masterEmbedDictMaster[key1] for row in wordsim_pairs])}\")" ] }, { "cell_type": "code", "execution_count": 10, "id": "contrary-casino", "metadata": {}, "outputs": [], "source": [ "wordSim353AnnotDF_New_set = set(wordSim353AnnotDF_New.word1_kg_id.to_list() + wordSim353AnnotDF_New.word2_kg_id.to_list())" ] }, { "cell_type": "code", "execution_count": 11, "id": "alleged-polish", "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict\n", "masterEmbCorrections = defaultdict(list)\n", "for node in wordSim353AnnotDF_New_set:\n", " for key1 in masterEmbedKeys:\n", " if node not in masterEmbedDictMaster[key1]:\n", " masterEmbCorrections[key1].append(node)" ] }, { "cell_type": "code", "execution_count": 21, "id": "periodic-buffer", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['abstract', 'abstract_first_sent', 'text_7_props', 'text_2_props', 'complex', 'transe'])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "masterEmbCorrections.keys()" ] }, { "cell_type": "markdown", "id": "awful-signal", "metadata": {}, "source": [ "## Complex, Transe" ] }, { "cell_type": "code", "execution_count": 142, "id": "exceptional-acting", "metadata": {}, "outputs": [], "source": [ "# import requests\n", "# correctedComplexEmb = {}\n", "# correctedTranseEmb = {}\n", "# for wordID in masterEmbCorrections['complex']:\n", "# try:\n", "# resp = requests.get(\"http://ckg07:9200/wikidatadwd-augmented/_doc/\"+wordID).json()['_source']\n", "# correctedComplexEmb[wordID] = [float(p) for p in resp['graph_embedding_complex'].split(',')]\n", "# correctedTranseEmb[wordID] = [float(p) for p in resp['graph_embeddings_transe'].split(',')]\n", "# except:\n", "# print(\"Failure returned for http://ckg07:9200/wikidatadwd-augmented/_doc/\"+wordID)" ] }, { "cell_type": "code", "execution_count": 15, "id": "assigned-journey", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "51850ec9544547f293820bd9e94091f4", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/42575933 [00:00<?, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "q1 = \"kgtk text-embedding ... --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-corrections.tsv\"\n", "# print(q1)\n", "os.system(q1 + \" &\")" ] }, { "cell_type": "code", "execution_count": 30, "id": "cooked-vinyl", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "32512" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q1 = \"kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-corrections.tsv \\\n", " --model roberta-large-nli-mean-tokens \\\n", " --property-labels-file ../data/labels.en.tsv --debug \\\n", " --isa-properties P31 P279 \\\n", " --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-corrections.tsv\"\n", "# print(q1)\n", "os.system(q1 + \" &\")" ] }, { "cell_type": "code", "execution_count": 110, "id": "static-failure", "metadata": {}, "outputs": [], "source": [ "corrected7Emb = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-corrections.tsv', sep='\\t')\n", "corrected2Emb = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-corrections.tsv', sep='\\t')" ] }, { "cell_type": "code", "execution_count": 111, "id": "spare-flexibility", "metadata": {}, 
"outputs": [], "source": [ "corrected7Emb = corrected7Emb[corrected7Emb.property == 'text_embedding']\n", "corrected7Emb['value'] = corrected7Emb.value.apply(lambda p: [float(p1) for p1 in p.split(',')])\n", "\n", "corrected2Emb = corrected2Emb[corrected2Emb.property == 'text_embedding']\n", "corrected2Emb['value'] = corrected2Emb.value.apply(lambda p: [float(p1) for p1 in p.split(',')])" ] }, { "cell_type": "code", "execution_count": 112, "id": "minute-oakland", "metadata": {}, "outputs": [], "source": [ "for _, row in corrected7Emb.iterrows():\n", " masterEmbedDictMaster['text_7_props'][row['node']] = row['value']\n", "for _, row in corrected2Emb.iterrows():\n", " masterEmbedDictMaster['text_2_props'][row['node']] = row['value']" ] }, { "cell_type": "code", "execution_count": null, "id": "documentary-fluid", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "analyzed-naples", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "psychological-brighton", "metadata": {}, "source": [ "## Abstract" ] }, { "cell_type": "code", "execution_count": 34, "id": "meaning-spanking", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DBPedia dataset has 5732949 records with unique 5732947 index values\n", "There are 8472828 sitelinks present in the dataset corresponding to 8472828 unique node1s (Qxxx), 8401883 unique labels (text)\n" ] } ], "source": [ "df1 = pd.read_csv(\"../data/short-abstracts_lang=en.csv\", skiprows=1, skipfooter=1, header=None, engine='python')\n", "df1.columns = ['ignore', 'node1', 'url', 'ignore2', 'abstract']\n", "df1 = df1.set_index('node1')\n", "df1[df1.url.apply(lambda p: 'http://dbpedia.org/resource' not in p)]\n", "print(f\"DBPedia dataset has {len(df1)} records with unique {df1.index.nunique()} index values\")\n", "sitelinksDF = pd.read_csv(\"../data/sitelinks.en.tsv.gz\", sep='\\t')\n", "sitelinksDF['trimmedNode2'] = sitelinksDF.node2.apply(lambda p: p.split(\"/\")[-1] if p.split(\"/\")[-1] != '' else p.split(\"/\")[-2])\n", "sitelinksDF1 = sitelinksDF[sitelinksDF.label == 'wikipedia_sitelink']\n", "sitelinksDF2 = sitelinksDF1.set_index('trimmedNode2')\n", "print(f\"There are {len(sitelinksDF2)} sitelinks present in the dataset corresponding to {sitelinksDF2.node1.nunique()} unique node1s (Qxxx), {sitelinksDF2.index.nunique()} unique labels (text)\")\n", "sitelinksDF2.loc[sitelinksDF2[sitelinksDF2.index.duplicated()].index]\n", "masterEmbCorrections_abs_set = set(masterEmbCorrections['abstract'])\n", "sitelinksDF2 = sitelinksDF2[sitelinksDF2.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]" ] }, { "cell_type": "code", "execution_count": 49, "id": "offensive-enclosure", "metadata": {}, "outputs": [], "source": [ "labelsDF = pd.read_csv('../data/labels.en.tsv', sep='\\t')\n", "labelsDF = labelsDF[labelsDF.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]\n", "labelsDict = {row['node1']: row['node2'] for _, row in labelsDF.iterrows()}\n", "descriptionsDF = pd.read_csv('../../wd-correctness/gdrive-kgtk-dump-2020-12-07/descriptions.en.tsv.gz', compression='gzip', sep='\\t')\n", "descriptionsDF = descriptionsDF[descriptionsDF.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]\n", "descDict = {row['node1']: row['node2'] for _, row in descriptionsDF.iterrows()}" ] }, { "cell_type": "code", "execution_count": 88, "id": "better-tuner", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "From 58 Qnodes, there are 
16 sitelink Qnodes which do not have a short abstract i.e. 42 have a short abstract\n" ] } ], "source": [ "sdf_set = set(sitelinksDF2.index.to_list())\n", "df1 = df1[df1.index.map(lambda p: p in sdf_set)]\n", "abstractsDF2 = sitelinksDF2.join(df1).reset_index()\n", "print(f\"From {len(abstractsDF2)} Qnodes, there are {abstractsDF2.ignore2.isna().sum()} sitelink Qnodes which do not have a short abstract i.e. {len(abstractsDF2) - abstractsDF2.ignore2.isna().sum()} have a short abstract\")\n", "# abstractsDF2 = abstractsDF2[~abstractsDF2.abstract.isna()]" ] }, { "cell_type": "code", "execution_count": 89, "id": "invalid-fiction", "metadata": {}, "outputs": [], "source": [ "abstractsDF2['node1_label'] = abstractsDF2.node1.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else \"\")\n", "abstractsDF2['node1_desc'] = abstractsDF2.node1.apply(lambda p: descDict[p][1:-4] if p in descDict else \"\")\n", "def combineAbsLabDesc(row, parameter):\n", " if not(pd.isna(row[parameter])) and row[parameter] != 'nan' and row[parameter] != \"\":\n", " return row[parameter]\n", " elif row['node1_label'] == \"\" and row['node1_desc'] == \"\":\n", " return None\n", " else:\n", " return row['node1_label'] + ' ' + row['node1_desc']" ] }, { "cell_type": "code", "execution_count": 90, "id": "opened-drink", "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import sent_tokenize\n", "abstractsDF2['abstract_firstSent'] = abstractsDF2.abstract.apply(lambda p: sent_tokenize(str(p))[0] if p else None)\n", "abstractsDF2 = abstractsDF2.reset_index()" ] }, { "cell_type": "code", "execution_count": 92, "id": "affected-reproduction", "metadata": {}, "outputs": [], "source": [ "abstractsDF2['abstract'] = abstractsDF2.apply(combineAbsLabDesc, axis=1, args=('abstract',))\n", "abstractsDF2['abstract_firstSent'] = abstractsDF2.apply(combineAbsLabDesc, axis=1, args=('abstract_firstSent',))" ] }, { "cell_type": "code", "execution_count": 93, "id": "actual-communication", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "58" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(abstractsDF2)" ] }, { "cell_type": "code", "execution_count": 95, "id": "quantitative-tumor", "metadata": {}, "outputs": [], "source": [ "abstractsDF2 = abstractsDF2[~abstractsDF2.abstract.isna()]" ] }, { "cell_type": "code", "execution_count": 96, "id": "turned-retail", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "58" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(abstractsDF2)" ] }, { "cell_type": "code", "execution_count": 97, "id": "heard-freedom", "metadata": {}, "outputs": [], "source": [ "abstractsDF2 = abstractsDF2.drop(columns=['index']).reset_index()" ] }, { "cell_type": "code", "execution_count": 71, "id": "forty-southeast", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
level_0indextrimmedNode2idnode1labelnode2ignoreurlignore2abstractnode1_labelnode1_descabstract_firstSent
000LuxuriesQ10953913-wikipedia_sitelink-538fe3-0Q10953913wikipedia_sitelinkhttp://en.wikipedia.org/wiki/LuxuriesNaNNaNNaNluxuryBehavior, expenses or equipment that far...luxuryBehavior, expenses or equipment that far excee...nan
111PotatoQ10998-wikipedia_sitelink-56b85c-0Q10998wikipedia_sitelinkhttp://en.wikipedia.org/wiki/Potato10709.0<http://dbpedia.org/resource/Potato><http://www.w3.org/2000/01/rdf-schema#comment>The potato is a root vegetable native to the A...potatospecies of plantThe potato is a root vegetable native to the A...
222MarsQ111-wikipedia_sitelink-9ff296-0Q111wikipedia_sitelinkhttp://en.wikipedia.org/wiki/Mars1803088.0<http://dbpedia.org/resource/Mars><http://www.w3.org/2000/01/rdf-schema#comment>Mars is the fourth planet from the Sun and the...Marsfourth planet from the SunMars is the fourth planet from the Sun and the...
333DawnQ11326182-wikipedia_sitelink-ae2918-0Q11326182wikipedia_sitelinkhttp://en.wikipedia.org/wiki/Dawn97544.0<http://dbpedia.org/resource/Dawn><http://www.w3.org/2000/01/rdf-schema#comment>Dawn is the time that marks the beginning of t...dawntime that marks the beginning of the twilight ...Dawn is the time that marks the beginning of t...
444Change_(philosophy)Q1150070-wikipedia_sitelink-81cf5f-0Q1150070wikipedia_sitelinkhttp://en.wikipedia.org/wiki/Change_(philosophy)NaNNaNNaNchangeprocess, event or action that deviates f...changeprocess, event or action that deviates from th...nan
\n", "
" ], "text/plain": [ " level_0 index trimmedNode2 id \\\n", "0 0 0 Luxuries Q10953913-wikipedia_sitelink-538fe3-0 \n", "1 1 1 Potato Q10998-wikipedia_sitelink-56b85c-0 \n", "2 2 2 Mars Q111-wikipedia_sitelink-9ff296-0 \n", "3 3 3 Dawn Q11326182-wikipedia_sitelink-ae2918-0 \n", "4 4 4 Change_(philosophy) Q1150070-wikipedia_sitelink-81cf5f-0 \n", "\n", " node1 label \\\n", "0 Q10953913 wikipedia_sitelink \n", "1 Q10998 wikipedia_sitelink \n", "2 Q111 wikipedia_sitelink \n", "3 Q11326182 wikipedia_sitelink \n", "4 Q1150070 wikipedia_sitelink \n", "\n", " node2 ignore \\\n", "0 http://en.wikipedia.org/wiki/Luxuries NaN \n", "1 http://en.wikipedia.org/wiki/Potato 10709.0 \n", "2 http://en.wikipedia.org/wiki/Mars 1803088.0 \n", "3 http://en.wikipedia.org/wiki/Dawn 97544.0 \n", "4 http://en.wikipedia.org/wiki/Change_(philosophy) NaN \n", "\n", " url \\\n", "0 NaN \n", "1 \n", "2 \n", "3 \n", "4 NaN \n", "\n", " ignore2 \\\n", "0 NaN \n", "1 \n", "2 \n", "3 \n", "4 NaN \n", "\n", " abstract node1_label \\\n", "0 luxuryBehavior, expenses or equipment that far... luxury \n", "1 The potato is a root vegetable native to the A... potato \n", "2 Mars is the fourth planet from the Sun and the... Mars \n", "3 Dawn is the time that marks the beginning of t... dawn \n", "4 changeprocess, event or action that deviates f... change \n", "\n", " node1_desc \\\n", "0 Behavior, expenses or equipment that far excee... \n", "1 species of plant \n", "2 fourth planet from the Sun \n", "3 time that marks the beginning of the twilight ... \n", "4 process, event or action that deviates from th... \n", "\n", " abstract_firstSent \n", "0 nan \n", "1 The potato is a root vegetable native to the A... \n", "2 Mars is the fourth planet from the Sun and the... \n", "3 Dawn is the time that marks the beginning of t... 
\n", "4 nan " ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "abstractsDF2.head()" ] }, { "cell_type": "code", "execution_count": 117, "id": "multiple-offer", "metadata": {}, "outputs": [], "source": [ "from sentence_transformers import SentenceTransformer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "from time import time\n", "import pandas as pd\n", "\n", "def getSentEmbeddings(valSeries, modelName):\n", " model = SentenceTransformer(modelName)\n", " start = time()\n", " encodings = model.encode(valSeries.to_list())\n", " print(time()-start,'s')\n", " return encodings\n", "\n", "def getIndSentEmbeddings(sent, modelName):\n", " model = SentenceTransformer(modelName)\n", " start = time()\n", " encodings = model.encode([sent])\n", " print(time()-start,'s')\n", " return encodings" ] }, { "cell_type": "code", "execution_count": 102, "id": "sustainable-breakdown", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6419482231140137 s\n", "0.5260367393493652 s\n" ] } ], "source": [ "abstractsDF2['abs_emb'] = pd.Series(list(getSentEmbeddings(abstractsDF2.abstract, 'bert-base-nli-mean-tokens')))\n", "abstractsDF2['abs_firstSent_emb'] = pd.Series(list(getSentEmbeddings(abstractsDF2.abstract_firstSent, 'bert-base-nli-mean-tokens')))" ] }, { "cell_type": "code", "execution_count": 104, "id": "usual-selling", "metadata": {}, "outputs": [], "source": [ "for _, row in abstractsDF2.iterrows():\n", " masterEmbedDictMaster['abstract'][row['node1']] = row['abs_emb']\n", " masterEmbedDictMaster['abstract_first_sent'][row['node1']] = row['abs_firstSent_emb']" ] }, { "cell_type": "code", "execution_count": 124, "id": "promising-owner", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.37706875801086426 s\n", "0.3001420497894287 s\n", "0.370746374130249 s\n", "0.6896324157714844 s\n", "0.33779358863830566 s\n", "0.3965473175048828 s\n", "0.3200962543487549 s\n", "0.3489806652069092 s\n", "0.3413431644439697 s\n", "0.32114505767822266 s\n", "0.3811838626861572 s\n", "0.34630370140075684 s\n", "0.37790727615356445 s\n", "0.26860570907592773 s\n", "0.3601953983306885 s\n", "0.3713240623474121 s\n", "0.34137582778930664 s\n", "0.33736181259155273 s\n", "0.37023448944091797 s\n", "0.31382036209106445 s\n", "0.35136938095092773 s\n", "0.37309718132019043 s\n", "0.33543896675109863 s\n", "0.38199710845947266 s\n", "0.3740067481994629 s\n", "0.3278031349182129 s\n", "0.32283997535705566 s\n", "0.34000563621520996 s\n", "0.31502628326416016 s\n", "0.34996771812438965 s\n", "0.3871273994445801 s\n", "0.3487060070037842 s\n", "0.35172486305236816 s\n", "0.3280646800994873 s\n", "0.3519773483276367 s\n", "0.3354451656341553 s\n", "0.3633551597595215 s\n", "0.3226644992828369 s\n", "0.33882975578308105 s\n", "0.36072254180908203 s\n", "0.3833494186401367 s\n", "0.2929043769836426 s\n", "0.32875680923461914 s\n", "0.36334872245788574 s\n", "0.34148168563842773 s\n", "0.3569769859313965 s\n", "0.37468576431274414 s\n", "0.399524450302124 s\n", "0.3516504764556885 s\n", "0.333402156829834 s\n", "0.3851203918457031 s\n", "0.34867238998413086 s\n", "0.3607771396636963 s\n", "0.38669753074645996 s\n", "0.33347272872924805 s\n", "0.36278390884399414 s\n", "0.3602781295776367 s\n", "0.3322322368621826 s\n", "0.36807823181152344 s\n", "0.3407411575317383 s\n", "0.3837134838104248 s\n", "0.38958096504211426 s\n", "0.3332521915435791 s\n", "0.3331124782562256 s\n", 
"0.35001134872436523 s\n", "0.32433485984802246 s\n", "0.36315059661865234 s\n", "0.34323906898498535 s\n", "0.3112339973449707 s\n", "0.30588483810424805 s\n", "0.30704236030578613 s\n", "0.31201720237731934 s\n" ] } ], "source": [ "for node in masterEmbCorrections_abs_set:\n", " if node not in masterEmbedDictMaster['abstract']:\n", " if node in labelsDict and node in descDict:\n", " masterEmbedDictMaster['abstract'][node] = getIndSentEmbeddings(labelsDict[node] + ' ' + descDict[node], 'bert-base-nli-mean-tokens')[0]\n", " elif node in labelsDict:\n", " masterEmbedDictMaster['abstract'][node] = getIndSentEmbeddings(labelsDict[node], 'bert-base-nli-mean-tokens')[0]" ] }, { "cell_type": "code", "execution_count": 125, "id": "acquired-manitoba", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.32213783264160156 s\n", "0.357776403427124 s\n", "0.37949395179748535 s\n", "0.35210466384887695 s\n", "0.28103041648864746 s\n", "0.3626406192779541 s\n", "0.35109710693359375 s\n", "0.34203338623046875 s\n", "0.32386112213134766 s\n", "0.3354361057281494 s\n", "0.3063056468963623 s\n", "0.3441202640533447 s\n", "0.32869935035705566 s\n", "0.42442989349365234 s\n", "0.37239527702331543 s\n", "0.38650059700012207 s\n", "0.3191685676574707 s\n", "0.3609733581542969 s\n", "0.3115823268890381 s\n", "0.36015963554382324 s\n", "0.3338603973388672 s\n", "0.3487727642059326 s\n", "0.3250617980957031 s\n", "0.35145044326782227 s\n", "0.33944034576416016 s\n", "0.31502413749694824 s\n", "0.3611795902252197 s\n", "0.35285043716430664 s\n", "0.3575010299682617 s\n", "0.304781436920166 s\n", "0.4003562927246094 s\n", "0.3315858840942383 s\n", "0.36008763313293457 s\n", "0.36187100410461426 s\n", "0.32981252670288086 s\n", "0.3378865718841553 s\n", "0.31662964820861816 s\n", "0.32143092155456543 s\n", "0.3152732849121094 s\n", "0.38222813606262207 s\n", "0.3846759796142578 s\n", "0.33153700828552246 s\n", "0.37013936042785645 s\n", "0.33272790908813477 s\n", "0.29526567459106445 s\n", "0.3218040466308594 s\n", "0.3795340061187744 s\n", "0.3576061725616455 s\n", "0.35764193534851074 s\n", "0.36867713928222656 s\n", "0.3807237148284912 s\n", "0.33266758918762207 s\n", "0.33878159523010254 s\n", "0.38289546966552734 s\n", "0.38695788383483887 s\n", "0.33074188232421875 s\n", "0.32749414443969727 s\n", "0.33860039710998535 s\n", "0.36585235595703125 s\n", "0.33011841773986816 s\n", "0.3293156623840332 s\n", "0.3491702079772949 s\n", "0.3720529079437256 s\n", "0.3078622817993164 s\n", "0.3844125270843506 s\n", "0.32468104362487793 s\n", "0.3186354637145996 s\n", "0.3438723087310791 s\n", "0.36643028259277344 s\n", "0.34279680252075195 s\n", "0.3625810146331787 s\n", "0.35865354537963867 s\n", "0.3503103256225586 s\n", "0.37160682678222656 s\n", "0.3268110752105713 s\n", "0.2564544677734375 s\n", "0.37343525886535645 s\n", "0.33298277854919434 s\n" ] } ], "source": [ "for node in masterEmbCorrections_abs_set:\n", " if node not in masterEmbedDictMaster['abstract_first_sent']:\n", " if node in labelsDict and node in descDict:\n", " masterEmbedDictMaster['abstract_first_sent'][node] = getIndSentEmbeddings(labelsDict[node] + ' ' + descDict[node], 'bert-base-nli-mean-tokens')[0]\n", " elif node in labelsDict:\n", " masterEmbedDictMaster['abstract_first_sent'][node] = getIndSentEmbeddings(labelsDict[node], 'bert-base-nli-mean-tokens')[0]" ] }, { "cell_type": "markdown", "id": "veterinary-thailand", "metadata": {}, "source": [ "## Updated coverage details" ] }, { 
"cell_type": "code", "execution_count": 145, "id": "intimate-campus", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pair Coverage by new text_7_props embeddings created for 19k retrofitting: 325\n", "Pair Coverage by new text_2_props embeddings created for 19k retrofitting: 325\n", "Pair Coverage by new complex embeddings created for 19k retrofitting: 343\n", "Pair Coverage by new transe embeddings created for 19k retrofitting: 343\n", "Pair Coverage by new abstract embeddings created for 19k retrofitting: 339\n", "Pair Coverage by new abstract_first_sent embeddings created for 19k retrofitting: 339\n" ] } ], "source": [ "for key1 in masterEmbedKeys:\n", " print(f\"Pair Coverage by new {key1} embeddings created for 19k retrofitting: {sum([row[0] in masterEmbedDictMaster[key1] and row[1] in masterEmbedDictMaster[key1] for row in wordsim_pairs])}\")" ] }, { "cell_type": "code", "execution_count": 19, "id": "lovely-token", "metadata": {}, "outputs": [], "source": [ "for key1 in masterEmbedDictMaster.keys():\n", " for key2 in masterEmbedDictMaster[key1].keys():\n", " if type(masterEmbedDictMaster[key1][key2]) != list:\n", " masterEmbedDictMaster[key1][key2] = masterEmbedDictMaster[key1][key2].tolist()" ] }, { "cell_type": "code", "execution_count": 20, "id": "exact-surfing", "metadata": {}, "outputs": [], "source": [ "for key1 in ['complex', 'transe']:\n", " json.dump(masterEmbedDictMaster[key1], open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict_updated.json', 'w'))" ] }, { "cell_type": "code", "execution_count": 134, "id": "behavioral-spain", "metadata": {}, "outputs": [], "source": [ "def countOverlap(source, target):\n", " cnt = 0\n", " for key1 in source:\n", " if key1 in target:\n", " cnt += 1\n", " return cnt\n", "p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))" ] }, { "cell_type": "code", "execution_count": 147, "id": "hawaiian-brain", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
embeddingtotal countoverlap countCoverage Percentage
0text_7_props238930238889100.000000
1text_2_props238930238889100.000000
2complex23850023844899.815395
3transe23850023844899.815395
4abstract10596410591644.336910
5abstract_first_sent10596410591644.336910
\n", "
" ], "text/plain": [ " embedding total count overlap count Coverage Percentage\n", "0 text_7_props 238930 238889 100.000000\n", "1 text_2_props 238930 238889 100.000000\n", "2 complex 238500 238448 99.815395\n", "3 transe 238500 238448 99.815395\n", "4 abstract 105964 105916 44.336910\n", "5 abstract_first_sent 105964 105916 44.336910" ] }, "execution_count": 147, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summArr = []\n", "for key1 in masterEmbedDictMaster:\n", " cnt = countOverlap(masterEmbedDictMaster[key1], p279QnodesList)\n", " summArr.append([key1, len(masterEmbedDictMaster[key1]), cnt, cnt / len(p279QnodesList) * 100])\n", "pd.DataFrame(summArr, columns=['embedding', 'total count', 'overlap count', 'Coverage Percentage'])" ] }, { "cell_type": "markdown", "id": "greater-namibia", "metadata": {}, "source": [ "# Concatenated Embeddings" ] }, { "cell_type": "code", "execution_count": 9, "id": "fifth-associate", "metadata": {}, "outputs": [], "source": [ "import json\n", "embedDictMaster = {}\n", "for key1 in ['text_7_props', 'text_2_props', 'complex', 'transe', 'abstract', 'abstract_first_sent']:\n", " embedDictMaster[key1] = json.load(open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict_updated.json'))\n", " " ] }, { "cell_type": "code", "execution_count": 13, "id": "egyptian-sentence", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "text_7_props : 1024\n", "text_2_props : 1024\n", "complex : 200\n", "transe : 200\n", "abstract : 768\n", "abstract_first_sent : 768\n" ] } ], "source": [ "def determineEmbeddingLengths(embedDictMaster):\n", " for key in embedDictMaster.keys():\n", " embed_size = len(next(iter(embedDictMaster[key].values())))\n", " print(key,\": \",embed_size)\n", "determineEmbeddingLengths(embedDictMaster)\n", " " ] }, { "cell_type": "code", "execution_count": 6, "id": "removable-point", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Added 11 corrections\n", "Added 11 corrections\n", "Added 0 corrections\n", "Added 0 corrections\n", "Added 4 corrections\n", "Added 4 corrections\n" ] } ], "source": [ "for key1 in embedDictMaster.keys():\n", " embedDictMaster[key1] = deserializeEmbeddingDict(embedDictMaster[key1])\n", "# Fill Coverage of embedding dictionaries\n", "for key1 in embedDictMaster.keys():\n", " embedDictMaster[key1] = fillCoverage(embedDictMaster[key1])" ] }, { "cell_type": "code", "execution_count": 7, "id": "productive-indiana", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "text_7_props 238941\n", "text_2_props 238941\n", "complex 238941\n", "transe 238941\n", "abstract 238941\n", "abstract_first_sent 238941\n" ] } ], "source": [ "for key1 in embedDictMaster.keys():\n", " print(key1, len(next(iter(embedDictMaster.values()))))" ] }, { "cell_type": "code", "execution_count": null, "id": "mechanical-retro", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "ready-financing", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "prime-hometown", "metadata": {}, "source": [ "# Retrofitting sample" ] }, { "cell_type": "code", "execution_count": 8, "id": "tight-civilization", "metadata": {}, "outputs": [], "source": [ "def fetchNeighbours(df):\n", " neighboursDict = {}\n", " for _, row in df.iterrows():\n", " if row.node1 not in neighboursDict:\n", " neighboursDict[row.node1] = []\n", " neighboursDict[row.node1].append((row.node2, 
{ "cell_type": "code", "execution_count": 8, "id": "tight-civilization", "metadata": {}, "outputs": [], "source": [
"def fetchNeighbours(df):\n",
"    neighboursDict = {}\n",
"    for _, row in df.iterrows():\n",
"        if row.node1 not in neighboursDict:\n",
"            neighboursDict[row.node1] = []\n",
"        neighboursDict[row.node1].append((row.node2, row.bert2SentSim))\n",
"\n",
"        if row.node2 not in neighboursDict:\n",
"            neighboursDict[row.node2] = []\n",
"        neighboursDict[row.node2].append((row.node1, row.bert2SentSim))\n",
"    print(max([len(neigh) for neigh in neighboursDict.values()]))\n",
"    return neighboursDict" ] },
{ "cell_type": "code", "execution_count": 9, "id": "exciting-circle", "metadata": {}, "outputs": [], "source": [
"def retrofit(embedDict, neighDict, weightCase, weightAssignment=False):\n",
"    newEmbedDict = {}\n",
"    for word in embedDict.keys():\n",
"        if word in neighDict:\n",
"            neighbs = neighDict[word]\n",
"            neighbs = list(filter(lambda p: p[0] in embedDict, neighbs))\n",
"            if len(neighbs) == 0:\n",
"                newEmbedDict[word] = embedDict[word]\n",
"                continue\n",
"            if weightAssignment:\n",
"                sumOfSims = sum([neighb[1] for neighb in neighbs])\n",
"                sumOfEmbs = sum([embedDict[neighb[0]] * float(neighb[1]) for neighb in neighbs])\n",
"            else:\n",
"                sumOfSims = len(neighbs)\n",
"                sumOfEmbs = sum([embedDict[neighb[0]] for neighb in neighbs])\n",
"\n",
"            if weightCase == 1:\n",
"                newEmbedDict[word] = (embedDict[word] * len(neighbs) + sumOfEmbs) / (len(neighbs) + sumOfSims)\n",
"            elif weightCase == 2:\n",
"                newEmbedDict[word] = (embedDict[word] * len(neighbs)**2 + sumOfEmbs) / (len(neighbs)**2 + sumOfSims)\n",
"            elif weightCase == 0.5:\n",
"                newEmbedDict[word] = (embedDict[word] * len(neighbs)**0.5 + sumOfEmbs) / (len(neighbs)**0.5 + sumOfSims)\n",
"            else:\n",
"                raise ValueError(f'Unsupported weightCase: {weightCase}')\n",
"        else:\n",
"            newEmbedDict[word] = embedDict[word]\n",
"    return newEmbedDict" ] },
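{ "cell_type": "markdown", "id": "illustrative-retrofit-note", "metadata": {}, "source": [ "A quick sanity check of `retrofit` on a made-up two-node graph: with a single neighbour of similarity 0.9 and `weightCase == 1`, a node keeps a slight majority of its own signal." ] },
{ "cell_type": "code", "execution_count": null, "id": "illustrative-retrofit-check", "metadata": {}, "outputs": [], "source": [
"# Made-up embeddings and neighbour lists\n",
"toyEmb = {'Q1': np.array([1.0, 0.0]), 'Q2': np.array([0.0, 1.0])}\n",
"toyNeigh = {'Q1': [('Q2', 0.9)], 'Q2': [('Q1', 0.9)]}\n",
"\n",
"# (1 * [1, 0] + 0.9 * [0, 1]) / (1 + 0.9) = [0.526, 0.474]\n",
"retrofit(toyEmb, toyNeigh, weightCase=1, weightAssignment=True)['Q1']" ] },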
{ "cell_type": "code", "execution_count": 11, "id": "hollywood-prisoner", "metadata": {}, "outputs": [], "source": [
"from sklearn.metrics import classification_report\n",
"\n",
"def labelSamples(score):\n",
"    return 'I' if score <= 1.75 else 'U' if score >= 3.5 else 'M'\n",
"\n",
"LABELS = ['I', 'U', 'M']\n",
"\n",
"def fetchCorrelationResults(embedDict, newEmbedDict):\n",
"    wordSim353AnnotDF_New = pd.read_csv('../data/wordsim353_with_r3.csv')\n",
"    assert wordSim353AnnotDF_New.word1_kg_id.isna().sum() == 0\n",
"    assert wordSim353AnnotDF_New.word2_kg_id.isna().sum() == 0\n",
"    wordSim353AnnotDF_New['category'] = wordSim353AnnotDF_New.Avg.apply(labelSamples)\n",
"    wordSim353AnnotDF_New2 = wordSim353AnnotDF_New[wordSim353AnnotDF_New.apply(lambda p: p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict, axis=1)]\n",
"    wordSimMissingSet = set(wordSim353AnnotDF_New[wordSim353AnnotDF_New.word1_kg_id.apply(lambda p: p not in embedDict)].word1_kg_id.to_list() + wordSim353AnnotDF_New[wordSim353AnnotDF_New.word2_kg_id.apply(lambda p: p not in embedDict)].word2_kg_id.to_list())\n",
"    responseDict = {}\n",
"    responseDict['wordSimMissingSet'] = wordSimMissingSet\n",
"    responseDict['coveredPairs'] = len(wordSim353AnnotDF_New2)\n",
"    responseDict['totalPairs'] = len(wordSim353AnnotDF_New)\n",
"\n",
"    wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.apply(lambda p: cosine_similarity(np.array(embedDict[p['word1_kg_id']]).reshape(1, -1), np.array(embedDict[p['word2_kg_id']]).reshape(1, -1))[0][0] if p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict else -1, axis=1)\n",
"    wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.apply(lambda p: cosine_similarity(np.array(newEmbedDict[p['word1_kg_id']]).reshape(1, -1), np.array(newEmbedDict[p['word2_kg_id']]).reshape(1, -1))[0][0] if p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict else -1, axis=1)\n",
"    wordSim353AnnotDF_New.loc[wordSim353AnnotDF_New['textOld'] == -1, 'textOld'] = wordSim353AnnotDF_New[wordSim353AnnotDF_New['textOld'] != -1]['textOld'].mean()\n",
"    wordSim353AnnotDF_New.loc[wordSim353AnnotDF_New['textNew'] == -1, 'textNew'] = wordSim353AnnotDF_New[wordSim353AnnotDF_New['textNew'] != -1]['textNew'].mean()\n",
"\n",
"    # Logic 1: Scale min,max value to 1,4 strictly\n",
"    # min1, max1 = wordSim353AnnotDF_New['textOld'].min(), wordSim353AnnotDF_New['textOld'].max()\n",
"    # min2, max2 = wordSim353AnnotDF_New['textNew'].min(), wordSim353AnnotDF_New['textNew'].max()\n",
"    # wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.textOld.apply(lambda p: 4 - 3 * (p - min1) / (max1 - min1))\n",
"    # wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.textNew.apply(lambda p: 4 - 3 * (p - min2) / (max2 - min2))\n",
"\n",
"    # Logic 2: Scale abs value to 1,4 strictly\n",
"    wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.textOld.apply(lambda p: 4 - 3 * abs(p))\n",
"    wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.textNew.apply(lambda p: 4 - 3 * abs(p))\n",
"\n",
"    responseDict['KT_old_vs_Avg'] = stats.kendalltau(wordSim353AnnotDF_New['textOld'], wordSim353AnnotDF_New['Avg'])\n",
"    responseDict['KT_new_vs_Avg'] = stats.kendalltau(wordSim353AnnotDF_New['textNew'], wordSim353AnnotDF_New['Avg'])\n",
"    responseDict['KT_old_vs_Human'] = stats.kendalltau(wordSim353AnnotDF_New['textOld'], wordSim353AnnotDF_New['H_reversed'])\n",
"    responseDict['KT_new_vs_Human'] = stats.kendalltau(wordSim353AnnotDF_New['textNew'], wordSim353AnnotDF_New['H_reversed'])\n",
"    responseDict['old_acc'] = accuracy_score(wordSim353AnnotDF_New['textOld'].apply(labelSamples), wordSim353AnnotDF_New['category'])\n",
"    responseDict['new_acc'] = accuracy_score(wordSim353AnnotDF_New['textNew'].apply(labelSamples), wordSim353AnnotDF_New['category'])\n",
"\n",
"    responseDict['class_rep_old'] = classification_report(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textOld'].apply(labelSamples), output_dict=True)\n",
"    responseDict['class_rep_new'] = classification_report(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textNew'].apply(labelSamples), output_dict=True)\n",
"\n",
"    cm_old = confusion_matrix(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textOld'].apply(labelSamples), labels=LABELS)\n",
"    cm_new = confusion_matrix(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textNew'].apply(labelSamples), labels=LABELS)\n",
"\n",
"    responseDict['cm_old'] = cm_old\n",
"    responseDict['cm_new'] = cm_new\n",
"\n",
"    return responseDict" ] },
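{ "cell_type": "markdown", "id": "illustrative-scale-note", "metadata": {}, "source": [ "The rescaling inside `fetchCorrelationResults` maps an absolute cosine similarity of 1 to a score of 1 (identical) and of 0 to a score of 4 (unrelated), so that `labelSamples` can bucket the predictions on the same 1-4 scale as the human annotations:" ] },
{ "cell_type": "code", "execution_count": null, "id": "illustrative-scale", "metadata": {}, "outputs": [], "source": [
"# 4 - 3*|cos| maps cosine similarity onto the 1 (identical) .. 4 (unrelated) scale\n",
"for cos in [1.0, 0.8, 0.4, 0.0]:\n",
"    score = 4 - 3 * abs(cos)\n",
"    print(cos, '->', round(score, 2), labelSamples(score))\n",
"# 1.0 -> 1.0 I, 0.8 -> 1.6 I, 0.4 -> 2.8 M, 0.0 -> 4.0 U" ] },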
{ "cell_type": "code", "execution_count": 12, "id": "severe-explosion", "metadata": {}, "outputs": [], "source": [ "neighDictMaster, embedDictMaster = {}, {}" ] },
{ "cell_type": "code", "execution_count": 13, "id": "decreased-syndication", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "39218\n" ] } ], "source": [ "neighDictMaster['19k_childPar'] = fetchNeighbours(p279ChildPar)" ] },
{ "cell_type": "code", "execution_count": 14, "id": "rocky-criterion", "metadata": {}, "outputs": [], "source": [ "embedDictMaster['complex'] = complexEmb\n", "embedDictMaster['transe'] = transeEmb" ] },
{ "cell_type": "code", "execution_count": 17, "id": "apparent-sapphire", "metadata": {}, "outputs": [], "source": [
"for key1 in embedDictMaster.keys():\n",
"    for key2 in embedDictMaster[key1].keys():\n",
"        embedDictMaster[key1][key2] = np.array(embedDictMaster[key1][key2])" ] },
{ "cell_type": "code", "execution_count": 18, "id": "precise-oxygen", "metadata": {}, "outputs": [], "source": [ "embList = list(embedDictMaster.keys())" ] },
{ "cell_type": "code", "execution_count": 19, "id": "identical-keyboard", "metadata": {}, "outputs": [], "source": [ "basisList = list(neighDictMaster.keys())" ] },
{ "cell_type": "code", "execution_count": 20, "id": "aging-flavor", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['19k_childPar'])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "neighDictMaster.keys()" ] },
{ "cell_type": "code", "execution_count": 21, "id": "amended-remove", "metadata": {}, "outputs": [], "source": [ "newEmbedDictMaster, responsesDictMaster = {}, {}" ] },
{ "cell_type": "code", "execution_count": 25, "id": "surgical-insurance", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7817a00dcf3c412b92a7c5ac75517168", "version_major": 2, "version_minor": 0 }, "text/plain": [ "  0%|          | 0/1 [00:00<?, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ],
"source": [
"for basis in tqdm(basisList):\n",
"    for emb in embList:\n",
"        for weightCase in [1, 2]:\n",
"            embedDict = embedDictMaster[emb]\n",
"            for iterNum in range(1, 11):\n",
"                newEmbedDict = retrofit(embedDict, neighDictMaster[basis], weightCase, weightAssignment=True)\n",
"                newEmbedDictMaster[(emb, basis, weightCase, iterNum)] = newEmbedDict\n",
"                responsesDictMaster[(emb, basis, weightCase, iterNum)] = fetchCorrelationResults(embedDict, newEmbedDict)\n",
"                embedDict = newEmbedDict" ] },
{ "cell_type": "code", "execution_count": 27, "id": "stormy-ranking", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EmbeddingBasisWeightWeightednessIteration NumOld AccNew AccIncreasePairs CoveredOld I Precision...Old U PrecisionOld U RecallOld U F1-ScoreNew I PrecisionNew I RecallNew I F1-ScoreNew U PrecisionNew U RecallNew U F1-ScoreRank
0complex19k_childPar1True160.75581464.2441863.4883722911.000000...0.4331210.6601940.5230771.0000000.400.5714290.4634150.5533980.5044252
20transe19k_childPar1True162.50000065.6976743.1976742910.888889...0.3970590.2621360.3157890.8333330.500.6250000.4500000.1747570.2517480
1complex19k_childPar1True264.24418667.1511632.9069772911.000000...0.4634150.5533980.5044251.0000000.500.6666670.4950500.4854370.4901964
11complex19k_childPar2True261.91860563.6627911.7441862911.000000...0.4444440.6601940.5312501.0000000.450.6206900.4589040.6504850.53815313
10complex19k_childPar2True160.75581461.9186051.1627912911.000000...0.4331210.6601940.5230771.0000000.400.5714290.4444440.6601940.53125012
2complex19k_childPar1True367.15116367.7325580.5813952911.000000...0.4950500.4854370.4901960.9090910.500.6451610.5119050.4174760.4598931
4complex19k_childPar1True567.15116367.7325580.5813952910.916667...0.4929580.3398060.4022990.9166670.550.6875000.5079370.3106800.3855425
36transe19k_childPar2True762.50000063.0813950.5813952910.846154...0.3518520.1844660.2420380.8461540.550.6666670.3653850.1844660.2451616
30transe19k_childPar2True162.50000063.0813950.5813952910.888889...0.3970590.2621360.3157890.9000000.450.6000000.4000000.2524270.30952417
22transe19k_childPar1True364.82558165.4069770.5813952910.750000...0.3939390.1262140.1911760.7500000.750.7500000.4000000.1165050.1804513
38transe19k_childPar2True963.08139563.3720930.2906982910.785714...0.3725490.1844660.2467530.7857140.550.6470590.3800000.1844660.2483668
33transe19k_childPar2True462.79069863.0813950.2906982910.900000...0.3833330.2233010.2822090.8461540.550.6666670.3750000.2038830.26415114
18complex19k_childPar2True963.08139563.0813950.0000002910.909091...0.4444440.5436890.4890830.9090910.500.6451610.4444440.5436890.48908319
37transe19k_childPar2True863.08139563.0813950.0000002910.846154...0.3653850.1844660.2451610.7857140.550.6470590.3725490.1844660.2467537
32transe19k_childPar2True362.79069862.7906980.0000002910.900000...0.3870970.2330100.2909090.9000000.450.6000000.3833330.2233010.28220912
23transe19k_childPar1True465.40697765.4069770.0000002910.750000...0.4000000.1165050.1804510.6521740.750.6976740.4444440.1165050.1846151
39transe19k_childPar2True1063.37209363.3720930.0000002910.785714...0.3800000.1844660.2483660.7857140.550.6470590.3800000.1844660.2483669
15complex19k_childPar2True663.66279163.6627910.0000002911.000000...0.4552240.5922330.5147681.0000000.450.6206900.4538460.5728160.50643817
14complex19k_childPar2True563.66279163.6627910.0000002911.000000...0.4571430.6213590.5267491.0000000.450.6206900.4552240.5922330.51476816
13complex19k_childPar2True463.66279163.6627910.0000002911.000000...0.4577460.6310680.5306121.0000000.450.6206900.4571430.6213590.52674915
12complex19k_childPar2True363.66279163.6627910.0000002911.000000...0.4589040.6504850.5381531.0000000.450.6206900.4577460.6310680.53061214
17complex19k_childPar2True863.37209363.081395-0.2906982910.909091...0.4488190.5533980.4956520.9090910.500.6451610.4444440.5436890.48908318
16complex19k_childPar2True763.66279163.372093-0.2906982911.000000...0.4538460.5728160.5064380.9090910.500.6451610.4488190.5533980.4956529
35transe19k_childPar2True662.79069862.500000-0.2906982910.846154...0.3636360.1941750.2531650.8461540.550.6666670.3518520.1844660.2420385
34transe19k_childPar2True563.08139562.790698-0.2906982910.846154...0.3750000.2038830.2641510.8461540.550.6666670.3636360.1941750.25316515
31transe19k_childPar2True263.08139562.790698-0.2906982910.900000...0.4000000.2524270.3095240.9000000.450.6000000.3870970.2330100.29090911
28transe19k_childPar1True960.75581460.465116-0.2906982910.394737...0.3684210.0679610.1147540.3658540.750.4918030.3888890.0679610.11570213
19complex19k_childPar2True1063.08139562.790698-0.2906982910.909091...0.4444440.5436890.4890830.9000000.450.6000000.4444440.5436890.48908310
26transe19k_childPar1True762.20930261.627907-0.5813952910.428571...0.4285710.0873790.1451610.4166670.750.5357140.4000000.0776700.1300814
29transe19k_childPar1True1060.46511659.883721-0.5813952910.365854...0.3888890.0679610.1157020.3333330.750.4615380.4117650.0679610.11666716
8complex19k_childPar1True965.11627964.534884-0.5813952910.785714...0.4423080.2233010.2967740.7857140.550.6470590.4166670.1941750.2649010
3complex19k_childPar1True467.73255867.151163-0.5813952910.909091...0.5119050.4174760.4598930.9166670.550.6875000.4929580.3398060.4022993
6complex19k_childPar1True766.56976765.988372-0.5813952910.916667...0.4754100.2815530.3536590.8461540.550.6666670.4642860.2524270.3270447
7complex19k_childPar1True865.98837265.116279-0.8720932910.846154...0.4642860.2524270.3270440.7857140.550.6470590.4423080.2233010.2967748
21transe19k_childPar1True265.69767464.825581-0.8720932910.833333...0.4500000.1747570.2517480.7500000.600.6666670.3939390.1262140.1911762
27transe19k_childPar1True861.62790760.755814-0.8720932910.416667...0.4000000.0776700.1300810.3947370.750.5172410.3684210.0679610.11475410
25transe19k_childPar1True663.37209362.209302-1.1627912910.468750...0.4545450.0970870.1600000.4285710.750.5454550.4285710.0873790.14516119
5complex19k_childPar1True667.73255866.569767-1.1627912910.916667...0.5079370.3106800.3855420.9166670.550.6875000.4754100.2815530.3536596
9complex19k_childPar1True1064.53488463.081395-1.4534882910.785714...0.4166670.1941750.2649010.6875000.550.6111110.3777780.1650490.22973011
24transe19k_childPar1True565.40697763.372093-2.0348842910.652174...0.4444440.1165050.1846150.4687500.750.5769230.4545450.0970870.16000018
\n", "

40 rows × 22 columns

\n", "" ], "text/plain": [ " Embedding Basis Weight Weightedness Iteration Num Old Acc \\\n", "0 complex 19k_childPar 1 True 1 60.755814 \n", "20 transe 19k_childPar 1 True 1 62.500000 \n", "1 complex 19k_childPar 1 True 2 64.244186 \n", "11 complex 19k_childPar 2 True 2 61.918605 \n", "10 complex 19k_childPar 2 True 1 60.755814 \n", "2 complex 19k_childPar 1 True 3 67.151163 \n", "4 complex 19k_childPar 1 True 5 67.151163 \n", "36 transe 19k_childPar 2 True 7 62.500000 \n", "30 transe 19k_childPar 2 True 1 62.500000 \n", "22 transe 19k_childPar 1 True 3 64.825581 \n", "38 transe 19k_childPar 2 True 9 63.081395 \n", "33 transe 19k_childPar 2 True 4 62.790698 \n", "18 complex 19k_childPar 2 True 9 63.081395 \n", "37 transe 19k_childPar 2 True 8 63.081395 \n", "32 transe 19k_childPar 2 True 3 62.790698 \n", "23 transe 19k_childPar 1 True 4 65.406977 \n", "39 transe 19k_childPar 2 True 10 63.372093 \n", "15 complex 19k_childPar 2 True 6 63.662791 \n", "14 complex 19k_childPar 2 True 5 63.662791 \n", "13 complex 19k_childPar 2 True 4 63.662791 \n", "12 complex 19k_childPar 2 True 3 63.662791 \n", "17 complex 19k_childPar 2 True 8 63.372093 \n", "16 complex 19k_childPar 2 True 7 63.662791 \n", "35 transe 19k_childPar 2 True 6 62.790698 \n", "34 transe 19k_childPar 2 True 5 63.081395 \n", "31 transe 19k_childPar 2 True 2 63.081395 \n", "28 transe 19k_childPar 1 True 9 60.755814 \n", "19 complex 19k_childPar 2 True 10 63.081395 \n", "26 transe 19k_childPar 1 True 7 62.209302 \n", "29 transe 19k_childPar 1 True 10 60.465116 \n", "8 complex 19k_childPar 1 True 9 65.116279 \n", "3 complex 19k_childPar 1 True 4 67.732558 \n", "6 complex 19k_childPar 1 True 7 66.569767 \n", "7 complex 19k_childPar 1 True 8 65.988372 \n", "21 transe 19k_childPar 1 True 2 65.697674 \n", "27 transe 19k_childPar 1 True 8 61.627907 \n", "25 transe 19k_childPar 1 True 6 63.372093 \n", "5 complex 19k_childPar 1 True 6 67.732558 \n", "9 complex 19k_childPar 1 True 10 64.534884 \n", "24 transe 19k_childPar 1 True 5 65.406977 \n", "\n", " New Acc Increase Pairs Covered Old I Precision ... Old U Precision \\\n", "0 64.244186 3.488372 291 1.000000 ... 0.433121 \n", "20 65.697674 3.197674 291 0.888889 ... 0.397059 \n", "1 67.151163 2.906977 291 1.000000 ... 0.463415 \n", "11 63.662791 1.744186 291 1.000000 ... 0.444444 \n", "10 61.918605 1.162791 291 1.000000 ... 0.433121 \n", "2 67.732558 0.581395 291 1.000000 ... 0.495050 \n", "4 67.732558 0.581395 291 0.916667 ... 0.492958 \n", "36 63.081395 0.581395 291 0.846154 ... 0.351852 \n", "30 63.081395 0.581395 291 0.888889 ... 0.397059 \n", "22 65.406977 0.581395 291 0.750000 ... 0.393939 \n", "38 63.372093 0.290698 291 0.785714 ... 0.372549 \n", "33 63.081395 0.290698 291 0.900000 ... 0.383333 \n", "18 63.081395 0.000000 291 0.909091 ... 0.444444 \n", "37 63.081395 0.000000 291 0.846154 ... 0.365385 \n", "32 62.790698 0.000000 291 0.900000 ... 0.387097 \n", "23 65.406977 0.000000 291 0.750000 ... 0.400000 \n", "39 63.372093 0.000000 291 0.785714 ... 0.380000 \n", "15 63.662791 0.000000 291 1.000000 ... 0.455224 \n", "14 63.662791 0.000000 291 1.000000 ... 0.457143 \n", "13 63.662791 0.000000 291 1.000000 ... 0.457746 \n", "12 63.662791 0.000000 291 1.000000 ... 0.458904 \n", "17 63.081395 -0.290698 291 0.909091 ... 0.448819 \n", "16 63.372093 -0.290698 291 1.000000 ... 0.453846 \n", "35 62.500000 -0.290698 291 0.846154 ... 0.363636 \n", "34 62.790698 -0.290698 291 0.846154 ... 0.375000 \n", "31 62.790698 -0.290698 291 0.900000 ... 
0.400000 \n", "28 60.465116 -0.290698 291 0.394737 ... 0.368421 \n", "19 62.790698 -0.290698 291 0.909091 ... 0.444444 \n", "26 61.627907 -0.581395 291 0.428571 ... 0.428571 \n", "29 59.883721 -0.581395 291 0.365854 ... 0.388889 \n", "8 64.534884 -0.581395 291 0.785714 ... 0.442308 \n", "3 67.151163 -0.581395 291 0.909091 ... 0.511905 \n", "6 65.988372 -0.581395 291 0.916667 ... 0.475410 \n", "7 65.116279 -0.872093 291 0.846154 ... 0.464286 \n", "21 64.825581 -0.872093 291 0.833333 ... 0.450000 \n", "27 60.755814 -0.872093 291 0.416667 ... 0.400000 \n", "25 62.209302 -1.162791 291 0.468750 ... 0.454545 \n", "5 66.569767 -1.162791 291 0.916667 ... 0.507937 \n", "9 63.081395 -1.453488 291 0.785714 ... 0.416667 \n", "24 63.372093 -2.034884 291 0.652174 ... 0.444444 \n", "\n", " Old U Recall Old U F1-Score New I Precision New I Recall \\\n", "0 0.660194 0.523077 1.000000 0.40 \n", "20 0.262136 0.315789 0.833333 0.50 \n", "1 0.553398 0.504425 1.000000 0.50 \n", "11 0.660194 0.531250 1.000000 0.45 \n", "10 0.660194 0.523077 1.000000 0.40 \n", "2 0.485437 0.490196 0.909091 0.50 \n", "4 0.339806 0.402299 0.916667 0.55 \n", "36 0.184466 0.242038 0.846154 0.55 \n", "30 0.262136 0.315789 0.900000 0.45 \n", "22 0.126214 0.191176 0.750000 0.75 \n", "38 0.184466 0.246753 0.785714 0.55 \n", "33 0.223301 0.282209 0.846154 0.55 \n", "18 0.543689 0.489083 0.909091 0.50 \n", "37 0.184466 0.245161 0.785714 0.55 \n", "32 0.233010 0.290909 0.900000 0.45 \n", "23 0.116505 0.180451 0.652174 0.75 \n", "39 0.184466 0.248366 0.785714 0.55 \n", "15 0.592233 0.514768 1.000000 0.45 \n", "14 0.621359 0.526749 1.000000 0.45 \n", "13 0.631068 0.530612 1.000000 0.45 \n", "12 0.650485 0.538153 1.000000 0.45 \n", "17 0.553398 0.495652 0.909091 0.50 \n", "16 0.572816 0.506438 0.909091 0.50 \n", "35 0.194175 0.253165 0.846154 0.55 \n", "34 0.203883 0.264151 0.846154 0.55 \n", "31 0.252427 0.309524 0.900000 0.45 \n", "28 0.067961 0.114754 0.365854 0.75 \n", "19 0.543689 0.489083 0.900000 0.45 \n", "26 0.087379 0.145161 0.416667 0.75 \n", "29 0.067961 0.115702 0.333333 0.75 \n", "8 0.223301 0.296774 0.785714 0.55 \n", "3 0.417476 0.459893 0.916667 0.55 \n", "6 0.281553 0.353659 0.846154 0.55 \n", "7 0.252427 0.327044 0.785714 0.55 \n", "21 0.174757 0.251748 0.750000 0.60 \n", "27 0.077670 0.130081 0.394737 0.75 \n", "25 0.097087 0.160000 0.428571 0.75 \n", "5 0.310680 0.385542 0.916667 0.55 \n", "9 0.194175 0.264901 0.687500 0.55 \n", "24 0.116505 0.184615 0.468750 0.75 \n", "\n", " New I F1-Score New U Precision New U Recall New U F1-Score Rank \n", "0 0.571429 0.463415 0.553398 0.504425 2 \n", "20 0.625000 0.450000 0.174757 0.251748 0 \n", "1 0.666667 0.495050 0.485437 0.490196 4 \n", "11 0.620690 0.458904 0.650485 0.538153 13 \n", "10 0.571429 0.444444 0.660194 0.531250 12 \n", "2 0.645161 0.511905 0.417476 0.459893 1 \n", "4 0.687500 0.507937 0.310680 0.385542 5 \n", "36 0.666667 0.365385 0.184466 0.245161 6 \n", "30 0.600000 0.400000 0.252427 0.309524 17 \n", "22 0.750000 0.400000 0.116505 0.180451 3 \n", "38 0.647059 0.380000 0.184466 0.248366 8 \n", "33 0.666667 0.375000 0.203883 0.264151 14 \n", "18 0.645161 0.444444 0.543689 0.489083 19 \n", "37 0.647059 0.372549 0.184466 0.246753 7 \n", "32 0.600000 0.383333 0.223301 0.282209 12 \n", "23 0.697674 0.444444 0.116505 0.184615 1 \n", "39 0.647059 0.380000 0.184466 0.248366 9 \n", "15 0.620690 0.453846 0.572816 0.506438 17 \n", "14 0.620690 0.455224 0.592233 0.514768 16 \n", "13 0.620690 0.457143 0.621359 0.526749 15 \n", "12 0.620690 0.457746 0.631068 0.530612 14 \n", "17 
0.645161 0.444444 0.543689 0.489083 18 \n", "16 0.645161 0.448819 0.553398 0.495652 9 \n", "35 0.666667 0.351852 0.184466 0.242038 5 \n", "34 0.666667 0.363636 0.194175 0.253165 15 \n", "31 0.600000 0.387097 0.233010 0.290909 11 \n", "28 0.491803 0.388889 0.067961 0.115702 13 \n", "19 0.600000 0.444444 0.543689 0.489083 10 \n", "26 0.535714 0.400000 0.077670 0.130081 4 \n", "29 0.461538 0.411765 0.067961 0.116667 16 \n", "8 0.647059 0.416667 0.194175 0.264901 0 \n", "3 0.687500 0.492958 0.339806 0.402299 3 \n", "6 0.666667 0.464286 0.252427 0.327044 7 \n", "7 0.647059 0.442308 0.223301 0.296774 8 \n", "21 0.666667 0.393939 0.126214 0.191176 2 \n", "27 0.517241 0.368421 0.067961 0.114754 10 \n", "25 0.545455 0.428571 0.087379 0.145161 19 \n", "5 0.687500 0.475410 0.281553 0.353659 6 \n", "9 0.611111 0.377778 0.165049 0.229730 11 \n", "24 0.576923 0.454545 0.097087 0.160000 18 \n", "\n", "[40 rows x 22 columns]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "resultsDF.sort_values(by=['Increase'], ascending=False)" ] }, { "cell_type": "code", "execution_count": 28, "id": "utility-globe", "metadata": {}, "outputs": [], "source": [ "resultsDF.to_csv('../data/retrofitting/masterRetro_Aug20_2021.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "crazy-scene", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "kgtkEnv2", "language": "python", "name": "kgtkenv2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "288px" }, "toc_section_display": true, "toc_window_display": true }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 5 }