{
"cells": [
{
"cell_type": "markdown",
"id": "express-journalist",
"metadata": {},
"source": [
"In this notebook, we make a collection of all the embeddings which we use to do retrofitting. These embeddings are then evaluated for their similarity based on the evaluation benchmark datasets."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "incorrect-routine",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.metrics.pairwise import euclidean_distances\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import scipy.stats as stats\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.metrics import confusion_matrix\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from tqdm.notebook import tqdm\n",
"from itertools import combinations\n",
"from math import comb\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"import os\n",
"import h5py\n",
"import json\n",
"import gzip"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "moderate-drunk",
"metadata": {},
"outputs": [],
"source": [
"# DWD V2 files\n",
"# https://drive.google.com/drive/u/3/folders/1OIZegxxrs_Hv2ZhDsSO-zLVARCR60P01\n",
"# SITELINKS_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/sitelinks.en.tsv.gz\"\n",
"CLAIMS_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/claims.tsv.gz\"\n",
"LABELS_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/labels.en.tsv.gz\"\n",
"DESCRIPTIONS_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/descriptions.en.tsv.gz\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "exceptional-funeral",
"metadata": {},
"outputs": [],
"source": [
"# wikidata-20210215 files\n",
"# https://drive.google.com/drive/u/3/folders/1NGtob1BFQ03sXf4yQyYvP13ly3u1Ul5u\n",
"# SITELINKS_FILE_V1 = \"../source_dataset_files/wikidata-20210215/sitelinks.en.tsv.gz\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "compressed-question",
"metadata": {},
"outputs": [],
"source": [
"# wikidata-20201208 files\n",
"# https://drive.google.com/drive/u/3/folders/1qbbgjo7pddMdDvQzOSeSaL6lYwj_f5gi\n",
"SITELINKS_FILE_V2 = \"../source_dataset_files/wikidata-20201208/sitelinks.en.tsv.gz\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "fiscal-appointment",
"metadata": {},
"outputs": [],
"source": [
"# Embedding Related Files\n",
"DBPEDIA_SHORT_ABSTRACTS_TTL = \"../data/evaluation/source_files/short-abstracts_lang=en.ttl\"\n",
"DBPEDIA_SHORT_ABSTRACTS_CSV = \"../data/evaluation/source_files/short-abstracts_lang=en.csv\"\n",
"ABSTRACTS_INTERMEDIATE_FILE = \"../data/embeddings/intermediate_files/abstracts.csv\"\n",
"\n",
"COMPLEX_EMB_SOURCE_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/wikidatadwd.complEx.graph-embeddings.txt\"\n",
"TRANSE_EMB_SOURCE_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/wikidatadwd.transE.graph-embeddings.txt\"\n",
"TEXT_EMB_SOURCE_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/text-embeddings-concatenated.tsv.gz\"\n",
"\n",
"COMPLEX_EMB_FINAL_FILE = \"../data/embeddings/complex_orig_embedding_dict.json\"\n",
"TRANSE_EMB_FINAL_FILE = \"../data/embeddings/transe_orig_embedding_dict.json\"\n",
"TEXT_EMB_FINAL_FILE = \"../data/embeddings/text_7_props_orig_embedding_dict.json\"\n",
"ABS_EMB_FINAL_FILE = \"../data/embeddings/abstract_orig_embedding_dict.json\"\n",
"ABS_FIRST_SENT_EMB_FINAL_FILE = \"../data/embeddings/abstract_first_sent_orig_embedding_dict.json\"\n",
"\n",
"LABELS_EMB_FINAL_FILE = \"../data/embeddings/labels_orig_embedding_dict.json\"\n",
"LABELS_DESC_EMB_FINAL_FILE = \"../data/embeddings/labels_n_desc_orig_embedding_dict.json\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "departmental-buddy",
"metadata": {},
"outputs": [],
"source": [
"# HAS Embedding Related Files\n",
"A_SOURCE_FILE = \"../source_dataset_files/A_walks_analysis/a_embeddings_10x10,min_count=0.kv\"\n",
"A_OP_FILE = \"../data/embeddings/has_a_orig_embedding_dict.json\"\n",
"\n",
"H_SOURCE_FILE = \"../source_dataset_files/H_walks_analysis/h_embeddings_5x8,min_count=21.kv\"\n",
"H_OP_FILE = \"../data/embeddings/has_h_orig_embedding_dict.json\"\n",
"\n",
"S_SOURCE_FILE = \"../source_dataset_files/S_walks_analysis/s_embeddings_5x10,min_count=0.kv\"\n",
"S_OP_FILE = \"../data/embeddings/has_s_orig_embedding_dict.json\""
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "failing-talent",
"metadata": {},
"outputs": [],
"source": [
"WORDSIM_CLASS_SIM_FILE = '../data/embeddings/wordsim_class_sim.csv'\n",
"WORDSIM_JC_SIM_FILE = '../data/embeddings/wordsim_jc_sim.csv'\n",
"WORDSIM_TOP_SIM_FILE = '../data/embeddings/wordsim_top_sim.csv'\n",
"\n",
"WORDSIM_OLD_CLASS_SIM_FILE = '../data/embeddings/wordsim_old_class_sim.csv'\n",
"WORDSIM_OLD_JC_SIM_FILE = '../data/embeddings/wordsim_old_jc_sim.csv'\n",
"WORDSIM_OLD_TOP_SIM_FILE = '../data/embeddings/wordsim_old_top_sim.csv'\n",
"\n",
"DBPEDIA_MC_30_CLASS_SIM_FILE = '../data/embeddings/dbpedia_mc_30_class_sim.csv'\n",
"DBPEDIA_MC_30_JC_SIM_FILE = '../data/embeddings/dbpedia_mc_30_jc_sim.csv'\n",
"DBPEDIA_MC_30_TOP_SIM_FILE = '../data/embeddings/dbpedia_mc_30_top_sim.csv'\n",
"\n",
"DBPEDIA_RG_65_CLASS_SIM_FILE = '../data/embeddings/dbpedia_rg_65_class_sim.csv'\n",
"DBPEDIA_RG_65_JC_SIM_FILE = '../data/embeddings/dbpedia_rg_65_jc_sim.csv'\n",
"DBPEDIA_RG_65_TOP_SIM_FILE = '../data/embeddings/dbpedia_rg_65_top_sim.csv'"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "elementary-desktop",
"metadata": {},
"outputs": [],
"source": [
"P279_CHILD_PAR_DISTILBERT_COSSIM_FILE = \"../data/basis/P279_ChildPar.all-distilroberta-v1.csv\"\n",
"WORDSIM_FILE = \"../data/evaluation/wordsim353_with_r3.csv\"\n",
"WORDSIM_OLD_FILE = \"../data/evaluation/wordsim_old.csv\"\n",
"DBPEDIA_MC_30_FINAL_FILE = \"../data/evaluation/mc-30_DBpedia.csv\"\n",
"DBPEDIA_RG_65_FINAL_FILE = \"../data/evaluation/rg-65_DBpedia.csv\"\n"
]
},
{
"cell_type": "markdown",
"id": "noble-draft",
"metadata": {},
"source": [
"# Common Code"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "broadband-background",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"241698\n"
]
}
],
"source": [
"def get_all_nodes():\n",
" \"\"\"\n",
" This function generates the set of all nodes needed for execution\n",
" \"\"\"\n",
" p279ChildPar = pd.read_csv(P279_CHILD_PAR_DISTILBERT_COSSIM_FILE)\n",
" wordsim_df = pd.read_csv(WORDSIM_FILE)\n",
" dbpedia_mc_30_df = pd.read_csv(DBPEDIA_MC_30_FINAL_FILE)\n",
" dbpedia_rg_65_df = pd.read_csv(DBPEDIA_RG_65_FINAL_FILE)\n",
"# wiki_cs_df = pd.read_csv('../data/wikidata-cs_categorized.csv')\n",
"# concept_net_df = pd.read_csv('../data/kgtk_conceptnet_evaluation.csv')\n",
" p279QnodesList = set(p279ChildPar.node1.to_list() \n",
" + p279ChildPar.node2.to_list()\n",
" + wordsim_df['word1_kg_id'].to_list() \n",
" + wordsim_df['word2_kg_id'].to_list()\n",
" + dbpedia_mc_30_df['word1_kg_id'].to_list()\n",
" + dbpedia_mc_30_df['word2_kg_id'].to_list()\n",
" + dbpedia_rg_65_df['word1_kg_id'].to_list()\n",
" + dbpedia_rg_65_df['word2_kg_id'].to_list())\n",
"# + wiki_cs_df['word1_kg_id'].to_list() \n",
"# + wiki_cs_df['word2_kg_id'].to_list()\n",
"# + concept_net_df['word1_kg_id'].to_list()\n",
"# + concept_net_df['word2_kg_id'].to_list())\n",
" print(len(p279QnodesList))\n",
" return p279QnodesList\n",
"\n",
"allNodes = get_all_nodes()"
]
},
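  {
   "cell_type": "markdown",
   "id": "node-set-check-note",
   "metadata": {},
   "source": [
    "A quick sanity check (a sketch, not part of the original pipeline): the benchmark CSVs can contain unmapped Qnode columns, which would surface as non-string (NaN) entries in `allNodes`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "node-set-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check (assumption: every valid entry is a Qnode string like 'Q42');\n",
    "# unmapped benchmark rows would show up as float NaNs in the set.\n",
    "non_strings = [n for n in allNodes if not isinstance(n, str)]\n",
    "print(f\"{len(non_strings)} non-string entries in allNodes\")"
   ]
  },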
{
"cell_type": "code",
"execution_count": 10,
"id": "parliamentary-documentation",
"metadata": {},
"outputs": [],
"source": [
"def fillCoverage(embedDict):\n",
" wordSim353AnnotDF_New = pd.read_csv(WORDSIM_FILE)\n",
" wordSim353AnnotDF_set = set(wordSim353AnnotDF_New['word1_kg_id'].to_list() + wordSim353AnnotDF_New['word2_kg_id'].to_list())\n",
" embed_size = len(embedDict[next(iter(embedDict))])\n",
"# print(embed_size)\n",
" count = 0\n",
" for word in wordSim353AnnotDF_set:\n",
" if word not in embedDict:\n",
" embedDict[word] = np.zeros((embed_size))\n",
" count += 1\n",
" print(f\"Added {count} corrections\")\n",
" return embedDict\n",
"\n",
"def deserializeEmbeddingDict(embedDict):\n",
" for key2 in embedDict.keys():\n",
" embedDict[key2] = np.array(embedDict[key2])\n",
" return embedDict\n",
"\n",
"def serializeEmbeddingDict(embedDict):\n",
" for key2 in embedDict.keys():\n",
" embedDict[key2] = embedDict[key2].tolist() if type(embedDict[key2]) != list else embedDict[key2]\n",
" return embedDict"
]
},
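  {
   "cell_type": "markdown",
   "id": "roundtrip-note",
   "metadata": {},
   "source": [
    "A minimal round-trip check of the helpers above, using a toy two-entry dict (`Q1`/`Q2` are placeholder Qnodes):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "roundtrip-demo",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy round-trip: serialize -> JSON -> deserialize should preserve the vectors.\n",
    "toy = {'Q1': np.array([0.1, 0.2]), 'Q2': np.array([0.3, 0.4])}\n",
    "dumped = json.dumps(serializeEmbeddingDict(toy))\n",
    "restored = deserializeEmbeddingDict(json.loads(dumped))\n",
    "assert np.allclose(restored['Q1'], [0.1, 0.2])"
   ]
  },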
{
"cell_type": "code",
"execution_count": 11,
"id": "established-brush",
"metadata": {},
"outputs": [],
"source": [
"def get_labels(node_set):\n",
" labels_dict = {}\n",
" first_line = True\n",
" with gzip.open(LABELS_FILE, 'r') as labelsFile:\n",
" firstLine = True\n",
" for line in tqdm(labelsFile, total=41845781):\n",
" if firstLine:\n",
" firstLine = False\n",
" continue\n",
" line = line.decode('utf-8').strip().split('\\t')\n",
" line[3] = line[3][1:-5]\n",
" qnode, label = line[1], line[3]\n",
" # print(qnode, label)\n",
" if qnode in node_set:\n",
" labels_dict[qnode] = label\n",
" return labels_dict"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "automated-olive",
"metadata": {},
"outputs": [],
"source": [
"def get_labels_n_desc(node_set):\n",
" labels_dict = get_labels(node_set)\n",
" first_line = True\n",
" with gzip.open(DESCRIPTIONS_FILE, 'r') as labelsFile:\n",
" firstLine = True\n",
" for line in tqdm(labelsFile, total=34700043):\n",
" if firstLine:\n",
" firstLine = False\n",
" continue\n",
" line = line.decode('utf-8').strip().split('\\t')\n",
" line[3] = line[3][1:-5]\n",
" qnode, label = line[1], line[3]\n",
" # print(qnode, label)\n",
" if qnode in node_set:\n",
" if qnode in labels_dict:\n",
" labels_dict[qnode] += ' ' + label\n",
" else:\n",
" raise \"Label not present\"\n",
" return labels_dict"
]
},
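  {
   "cell_type": "markdown",
   "id": "labels-usage-note",
   "metadata": {},
   "source": [
    "Usage sketch for the two readers above (assumes the gzipped label/description files are in place; `Q42` is just an example Qnode). Each call streams the full multi-million-line TSV once, so one call with a large node set beats many calls with small ones."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "labels-usage-demo",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch 'label description' text for a small node set (one full scan per file).\n",
    "sample_texts = get_labels_n_desc({'Q42'})\n",
    "print(sample_texts)"
   ]
  },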
{
"cell_type": "markdown",
"id": "driven-yeast",
"metadata": {},
"source": [
"# Complex + Transe Embeddings Generation"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "sound-spain",
"metadata": {},
"outputs": [],
"source": [
"complex_emb_dict = json.load(open(COMPLEX_EMB_FINAL_FILE))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "correct-gentleman",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "64deda8236084d79bce85a2fd249dec9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"first_line = True\n",
"complex_emb_dict = {}\n",
"with open(COMPLEX_EMB_SOURCE_FILE) as complex_file:\n",
" for line in tqdm(complex_file, total=53002671):\n",
" if first_line:\n",
" first_line = False\n",
" continue\n",
" line = line.strip().split()\n",
" if line[0] in allNodes and line[0] not in complex_emb_dict:\n",
" complex_emb_dict[line[0]] = [float(elem) for elem in line[1:]]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "proved-buffer",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"241698"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(complex_emb_dict)"
]
},
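  {
   "cell_type": "markdown",
   "id": "complex-coverage-note",
   "metadata": {},
   "source": [
    "A quick coverage check (sketch; `allNodes` and `complex_emb_dict` come from the cells above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "complex-coverage-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluation nodes left without a ComplEx vector after the single-pass filter.\n",
    "missing_complex = allNodes - set(complex_emb_dict)\n",
    "print(f\"{len(missing_complex)} of {len(allNodes)} nodes lack a ComplEx embedding\")"
   ]
  },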
{
"cell_type": "code",
"execution_count": 23,
"id": "hazardous-amazon",
"metadata": {},
"outputs": [],
"source": [
"json.dump(complex_emb_dict, open(COMPLEX_EMB_FINAL_FILE, 'w'))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "industrial-paradise",
"metadata": {},
"outputs": [],
"source": [
"transe_emb_dict = json.load(open(TRANSE_EMB_FINAL_FILE))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "needed-passion",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8636f544aa484f6d9785723a4a96e83b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"241698"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"first_line = True\n",
"transe_emb_dict = {}\n",
"with open(TRANSE_EMB_SOURCE_FILE) as complex_file:\n",
" for line in tqdm(complex_file, total=53002671):\n",
" if first_line:\n",
" first_line = False\n",
" continue\n",
" line = line.strip().split()\n",
" if line[0] in allNodes and line[0] not in transe_emb_dict:\n",
" transe_emb_dict[line[0]] = [float(elem) for elem in line[1:]]\n",
"len(transe_emb_dict)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "classified-chick",
"metadata": {},
"outputs": [],
"source": [
"json.dump(transe_emb_dict, open(TRANSE_EMB_FINAL_FILE, 'w'))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "brief-timer",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"238889\n"
]
}
],
"source": [
"# p279ChildPar = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')\n",
"# print(len(set(p279ChildPar.node1.to_list() \n",
"# + p279ChildPar.node2.to_list())))\n",
"\n",
"# # Load complex, transe embedding files and entity names file\n",
"# compf = h5py.File('../data/complTrans/complEx.h5','r')\n",
"# transf = h5py.File('../data/complTrans/transE.h5','r')\n",
"# ent_names = json.load(open('../data/complTrans/entity_names_all_0.json'))\n",
"# allNodes = get_all_nodes()\n",
"# # json.dump(list(p279QnodesList), open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json', 'w'))\n",
"\n",
"# complexEmb = {qnode: emb for emb, qnode in zip(compf['embeddings'], ent_names) if qnode in allNodes}\n",
"# transeEmb = {qnode: emb for emb, qnode in zip(transf['embeddings'], ent_names) if qnode in allNodes}\n",
"# print(f\"Out of {len(ent_names)} embeddings, retaining {len(transeEmb)} embeddings\")\n",
"\n",
"# def serialize_embedding_dict(embed_dict):\n",
"# for key2 in embed_dict.keys():\n",
"# embed_dict[key2] = embed_dict[key2].tolist() if type(embed_dict[key2]) != list else embed_dict[key2]\n",
"# return embed_dict\n",
"\n",
"# json.dump(serialize_embedding_dict(complexEmb),open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json','w'))\n",
"# json.dump(serialize_embedding_dict(transeEmb),open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json','w'))\n",
"# # complexEmb = json.load(open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json'))\n",
"# # transeEmb = json.load(open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json'))"
]
},
{
"cell_type": "markdown",
"id": "continued-locking",
"metadata": {},
"source": [
"# Text Embedding"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "polished-divorce",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6f97b30760e841da87be90aebef9c8cd",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"first_line = True\n",
"text_emb_dict = {}\n",
"with gzip.open(TEXT_EMB_SOURCE_FILE) as file:\n",
" for line in tqdm(file):\n",
" if first_line:\n",
" first_line = False\n",
" continue\n",
" line = line.decode('utf-8').strip().split('\\t')\n",
" if line[1] == 'text_embedding' and line[0] in allNodes:\n",
" text_emb_dict[line[0]] = [float(elem) for elem in line[2].split(',')]"
]
},
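  {
   "cell_type": "markdown",
   "id": "text-emb-format-note",
   "metadata": {},
   "source": [
    "The loader above keeps only rows whose second column is `text_embedding` (the file also carries `embedding_sentence` rows) and parses the comma-separated vector. A synthetic example of the assumed row format:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "text-emb-format-demo",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Synthetic line in the format the loader above assumes\n",
    "# (real rows come from TEXT_EMB_SOURCE_FILE).\n",
    "row = \"Q42\\ttext_embedding\\t0.1,0.2,0.3\"\n",
    "node, prop, value = row.split('\\t')\n",
    "assert prop == 'text_embedding'\n",
    "vector = [float(x) for x in value.split(',')]\n",
    "print(node, vector)"
   ]
  },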
{
"cell_type": "code",
"execution_count": 37,
"id": "raising-boost",
"metadata": {},
"outputs": [],
"source": [
"json.dump(text_emb_dict, open(TEXT_EMB_FINAL_FILE, 'w'))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "beautiful-drill",
"metadata": {},
"outputs": [],
"source": [
"# text_emb_dict = json.load(open('../data/embeddings/archived/text_7_props_orig_embedding_dict.json.old'))"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "impressed-stations",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a75bc167e9b449f88e4df7ebb19bca77",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/241698 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"missing_nodes = []\n",
"for node in tqdm(allNodes):\n",
" if node not in text_emb_dict:\n",
" missing_nodes.append(node)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "disabled-corporation",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0db4a305d6dc4f1e85dfdc02ea564537",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"missing_nodes_set = set(missing_nodes)\n",
"new_file = []\n",
"with gzip.open(CLAIMS_FILE, 'r') as all_claims_file:\n",
" firstLine = True\n",
" for ogline in tqdm(all_claims_file, total=491297976):\n",
" if firstLine:\n",
" firstLine = False\n",
" continue\n",
" line = ogline.decode('utf-8').strip().split('\\t')\n",
" line[3] = line[3][1:-5]\n",
" qnode, label = line[1], line[3]\n",
"# print(qnode, label)\n",
" if qnode in missing_nodes_set:\n",
" new_file.append(ogline)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "complete-performer",
"metadata": {},
"outputs": [],
"source": [
"allowed_props = set(['P31', 'P279', 'P106', 'P39', 'P1382', 'P373', 'P452'])\n",
"new_file1 = []\n",
"for line in new_file:\n",
" line1 = line.decode('utf-8').strip().split('\\t')\n",
" if line1[2] in allowed_props:\n",
" new_file1.append(line.decode('utf-8'))\n",
"new_file1 = ['id\\tnode1\\tlabel\\tnode2\\trank\\tnode2;wikidatatype\\n'] + new_file1"
]
},
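  {
   "cell_type": "markdown",
   "id": "allowed-props-note",
   "metadata": {},
   "source": [
    "These seven properties mirror the `--isa-properties` list passed to `kgtk text-embedding` below. A quick tally of which properties the recovered claims actually use (a sketch over `new_file1`, skipping its header line):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "allowed-props-tally",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tally the filtered claims by property (columns: id, node1, label, node2, ...).\n",
    "from collections import Counter\n",
    "prop_counts = Counter(l.split('\\t')[2] for l in new_file1[1:])\n",
    "print(prop_counts.most_common())"
   ]
  },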
{
"cell_type": "code",
"execution_count": 53,
"id": "straight-internet",
"metadata": {},
"outputs": [],
"source": [
"with open('../output/text-embeddings/missing_nodes.tsv', 'w') as f:\n",
" f.writelines(new_file1)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "suited-boating",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# --model sentence-transformers/roberta-large-nli-mean-tokens \\\n",
"q1 = \"~/miniconda3/envs/kgtkEnv2/bin/kgtk text-embedding -i ../output/text-embeddings/missing_nodes.tsv \\\n",
" --model roberta-large-nli-mean-tokens \\\n",
" --property-labels-file \" + LABELS_FILE + \" --debug \\\n",
" --isa-properties P31 P279 P106 P39 P1382 P373 P452 \\\n",
" --save-embedding-sentence > ../output/text-embeddings/P279-text-embedding-7-props-missing-qnodes.tsv\"\n",
"os.system(q1 + \" \")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "based-circuit",
"metadata": {},
"outputs": [],
"source": [
"text7_missingnodes = pd.read_csv(\"../output/text-embeddings/P279-text-embedding-7-props-missing-qnodes.tsv\", sep='\\t')\n",
"text7_missingnodes = text7_missingnodes[text7_missingnodes.property == 'text_embedding']"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "imposed-series",
"metadata": {},
"outputs": [],
"source": [
"text7_missingnodes['value'] = text7_missingnodes['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "compliant-locator",
"metadata": {},
"outputs": [],
"source": [
"text7EmbDict = {row['node']: row['value'] for _,row in text7_missingnodes.iterrows()}"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "quick-voluntary",
"metadata": {},
"outputs": [],
"source": [
"for key in text7EmbDict.keys():\n",
" if key not in text_emb_dict:\n",
" text_emb_dict[key] = text7EmbDict[key]"
]
},
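  {
   "cell_type": "markdown",
   "id": "merge-recheck-note",
   "metadata": {},
   "source": [
    "After merging the freshly computed vectors, we can re-check coverage (sketch):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "merge-recheck",
   "metadata": {},
   "outputs": [],
   "source": [
    "# How many evaluation nodes still lack a text embedding after the merge?\n",
    "still_missing = [n for n in allNodes if n not in text_emb_dict]\n",
    "print(f\"{len(still_missing)} nodes still missing\")"
   ]
  },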
{
"cell_type": "code",
"execution_count": 22,
"id": "special-smile",
"metadata": {},
"outputs": [],
"source": [
"json.dump(text_emb_dict, open(TEXT_EMB_FINAL_FILE, 'w'))"
]
},
{
"cell_type": "markdown",
"id": "infectious-mauritius",
"metadata": {},
"source": [
"## Old technique follows"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "neural-gibson",
"metadata": {},
"outputs": [],
"source": [
"p279ChildPar = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')\n",
"p279QnodesList = list(set(p279ChildPar.node1.to_list() + p279ChildPar.node2.to_list()))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "instructional-weather",
"metadata": {},
"outputs": [],
"source": [
"missingNodes = allNodes - set(p279ChildPar.node1.to_list() + p279ChildPar.node2.to_list())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "specified-clear",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"37038"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(missingNodes)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "conditional-brooks",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6fb2da21d7cf4241a3e52ac132a7c534",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/38 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Split main file into sub-files for groups of properties for multi-processing\n",
"\n",
"# bsize = len(p279QnodesList) // 250\n",
"# cnt = 1\n",
"# for i in range(0, len(p279QnodesList), bsize):\n",
"# q1 = \"kgtk filter -i ../data/wikidataos.for.text-embedding.tsv --word-separator '|' -p '\" + '|'.join(p279QnodesList[i:i+bsize]) + \";;' -o ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-\" + str(cnt) + \".tsv -v True\"\n",
"# # print(len(q1))\n",
"# cnt += 1\n",
"# # print(q1)\n",
"# os.system(\"screen -dm \" + q1)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "utility-fleet",
"metadata": {},
"outputs": [],
"source": [
"for cnt in range(290,503):\n",
" os.system(\"rm ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-\" + str(cnt) + \".tsv\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "alleged-strength",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6fb2da21d7cf4241a3e52ac132a7c534",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/38 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# # Split main file into sub-files for groups of properties for multi-processing\n",
"missingNodes = list(missingNodes)\n",
"bsize = 1000\n",
"cnt = 252\n",
"for i in tqdm(range(0, len(missingNodes), bsize)):\n",
" q1 = \"kgtk filter -i ../data/wikidataos.for.text-embedding.tsv --word-separator '|' -p '\" + '|'.join(missingNodes[i:i+bsize]) + \";;' -o ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-\" + str(cnt) + \".tsv -v True\"\n",
"# print(len(q1))\n",
" cnt += 1\n",
"# print(q1)\n",
" os.system(\"screen -dm \" + q1)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "signal-island",
"metadata": {},
"outputs": [],
"source": [
"def checkIfFileContainsLines(file):\n",
" with open(file) as f:\n",
" for line in f:\n",
" return True\n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "fatal-broadway",
"metadata": {},
"outputs": [],
"source": [
"def countFileLines(file):\n",
" count = 0\n",
" with open(file) as f:\n",
" for line in f:\n",
" count += 1\n",
" return count"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "separate-satin",
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"from os.path import exists\n",
"\n",
"runCommCnt = 1\n",
"# 252\n",
"for cnt in tqdm(range(252,290)):\n",
" if exists(\"../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-\" + str(cnt) + \".tsv\") and countFileLines(\"../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-\" + str(cnt) + \".tsv\") == 4097:\n",
" continue\n",
" q1 = \"\"\n",
"# if cnt % 10 == 0:\n",
"# q1 += \"sleep 20m; \"\n",
" q1 = \"~/miniconda3/envs/kgtkEnv2/bin/kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-\" + str(cnt) + \".tsv \\\n",
" --model sentence-transformers/all-distilroberta-v1 \\\n",
" --property-labels-file ../data/labels.en.tsv --debug \\\n",
" --isa-properties P31 P279 P106 P39 P1382 P373 P452 \\\n",
" --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-\" + str(cnt) + \".tsv\"\n",
" print(cnt)\n",
" runCommCnt += 1\n",
" os.system(q1 + \" &\")\n",
" if runCommCnt % 15 == 0:\n",
" time.sleep(11*60)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "surprising-burning",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"for cnt in tqdm(range(1,290)):\n",
" if countFileLines(\"../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-\" + str(cnt) + \".tsv\") != 4097:\n",
" print(cnt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bottom-lodge",
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"from os.path import exists\n",
"\n",
"# roberta-large-nli-mean-tokens\n",
"runCommCnt = 0\n",
"for cnt in tqdm(range(252,290)):\n",
" if exists(\"../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-\" + str(cnt) + \".tsv\") and countFileLines(\"../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-\" + str(cnt) + \".tsv\") == 4097:\n",
" continue\n",
" q1 = \"\"\n",
"# if cnt % 10 == 0:\n",
"# q1 += \"sleep 20m; \"\n",
" q1 += \"~/miniconda3/envs/kgtkEnv2/bin/kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-\" + str(cnt) + \".tsv \\\n",
" --model sentence-transformers/all-distilroberta-v1 \\\n",
" --property-labels-file ../data/labels.en.tsv --debug \\\n",
" --isa-properties P31 P279 \\\n",
" --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-\" + str(cnt) + \".tsv\"\n",
" print(cnt)\n",
" runCommCnt += 1\n",
" os.system(q1 + \" &\")\n",
" if runCommCnt % 15 == 0:\n",
" time.sleep(13*60)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "damaged-browse",
"metadata": {},
"outputs": [],
"source": [
"p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "collective-april",
"metadata": {},
"outputs": [],
"source": [
"# temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-1.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "decent-yorkshire",
"metadata": {},
"outputs": [
    {
     "data": {
"text/plain": [
" node property \\\n",
"0 Q99738027 text_embedding \n",
"1 Q99738027 embedding_sentence \n",
"2 Q99228502 text_embedding \n",
"3 Q99228502 embedding_sentence \n",
"4 Q98970128 text_embedding \n",
"\n",
" value \n",
"0 0.74755263,1.6350263,-0.73952675,1.0463063,-0.... \n",
"1 night shift, work shift during nighttime hours... \n",
"2 0.25261465,0.06285462,0.029052094,0.50796187,0... \n",
"3 avenue, thoroughfare named \\\"avenue\\\" is thoro... \n",
"4 0.11887096,0.8598291,0.4446009,-0.5038472,-0.9... "
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# temp.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "satisfactory-speech",
"metadata": {},
"outputs": [],
"source": [
"text2EmbArr = []\n",
"for i in tqdm(range(1, 290)):\n",
" if not(checkIfFileContainsLines('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-'+str(i)+'.tsv')):\n",
" continue\n",
" temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-'+str(i)+'.tsv', sep='\\t')\n",
" temp = temp[temp.property == 'text_embedding']\n",
" text2EmbArr.append(temp)\n",
"text2Emb = pd.concat(text2EmbArr)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "unavailable-competition",
"metadata": {},
"outputs": [],
"source": [
"text2Emb.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "concerned-april",
"metadata": {},
"outputs": [],
"source": [
"text7EmbArr = []\n",
"for i in tqdm(range(1, 290)):\n",
" if not(checkIfFileContainsLines('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-'+str(i)+'.tsv')):\n",
" continue\n",
" temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-'+str(i)+'.tsv', sep='\\t')\n",
" temp = temp[temp.property == 'text_embedding']\n",
" text7EmbArr.append(temp)\n",
"text7Emb = pd.concat(text7EmbArr)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "australian-enforcement",
"metadata": {},
"outputs": [],
"source": [
"text2Emb = text2Emb[text2Emb.node.apply(lambda p: p in allNodes)]\n",
"text7Emb = text7Emb[text7Emb.node.apply(lambda p: p in allNodes)]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "closed-treatment",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"We have 2prop text embeddings for 278467 nodes and 7prop for 277587 nodes\n"
]
}
],
"source": [
"print(f\"We have 2prop text embeddings for {len(text2Emb)} nodes and 7prop for {len(text7Emb)} nodes\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "duplicate-agency",
"metadata": {},
"outputs": [],
"source": [
"text2Emb['value'] = text2Emb['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])\n",
"text7Emb['value'] = text7Emb['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "framed-third",
"metadata": {},
"outputs": [],
"source": [
"text2EmbDict = {row['node']: row['value'] for _,row in text2Emb.iterrows()}\n",
"text7EmbDict = {row['node']: row['value'] for _,row in text7Emb.iterrows()}"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "peaceful-andrew",
"metadata": {},
"outputs": [],
"source": [
"json.dump(text2EmbDict, open('../data/Master_P279_dataset/embeddings/text_2_props_orig_embedding_dict.json', 'w'))\n",
"json.dump(text7EmbDict, open('../data/Master_P279_dataset/embeddings/text_7_props_orig_embedding_dict.json', 'w'))"
]
},
{
"cell_type": "markdown",
"id": "suited-going",
"metadata": {},
"source": [
"# Abstract Embeddings Generation\n",
"\n",
"Downloaded short abstracts file from [DBPedia Short Abstracts - 2020.07.01](https://downloads.dbpedia.org/repo/dbpedia/text/short-abstracts/2020.07.01/short-abstracts_lang=en.ttl.bz2)\n",
"\n",
"Then, we extract the abstracts file from the bz2 file using: `bzip2 -d short-abstracts_lang=en.ttl.bz2`"
]
},
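  {
   "cell_type": "markdown",
   "id": "ttl-format-note",
   "metadata": {},
   "source": [
    "The (commented-out) cell below parsed the TTL dump line by line. As a minimal sketch of the format it assumes, each line is an N-Triples-style triple: the resource name sits between the first pair of angle brackets and the abstract between the outer quotes (the line below is a made-up example):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ttl-format-demo",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Made-up example of a short-abstracts TTL line (assumption: N-Triples layout).\n",
    "line = '<http://dbpedia.org/resource/Autism> <http://www.w3.org/2000/01/rdf-schema#comment> \"Autism is a developmental disorder.\"@en .'\n",
    "resource = line[1:line.find('>')]                      # between the first angle brackets\n",
    "abstract = line[line.find('\"') + 1:line.rfind('\"@')]  # between the outer quotes\n",
    "print(resource)\n",
    "print(abstract)"
   ]
  },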
{
"cell_type": "code",
"execution_count": 5,
"id": "former-editor",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d0038713a1604ccb9c2e5499615fbc43",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# cnt = 0\n",
"# p1s = []\n",
"# p11s = []\n",
"# p2s = []\n",
"# lines = []\n",
"# with open(DBPEDIA_SHORT_ABSTRACTS_TTL, 'r', encoding='utf-8') as f:\n",
"# for line in tqdm(f):\n",
"# p1 = line[:line.find(\" \")]\n",
"# p11 = p1[len(\"\n",
"\n",
"\n",
" \n",
" \n",
" | \n",
" Word 1 | \n",
" Word 2 | \n",
" ID | \n",
" H_Sim | \n",
" H_Dim | \n",
" F_Sim | \n",
" F_Dim | \n",
" N_Sim | \n",
" N_Dim | \n",
" D_Sim | \n",
" ... | \n",
" P_Dim | \n",
" Avg | \n",
" Stdev | \n",
" H_orig | \n",
" H_reversed | \n",
" word1_kg_id | \n",
" word2_kg_id | \n",
" category | \n",
" embedding_cos_sim | \n",
" Resp_code | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Arafat | \n",
" peace | \n",
" 8 | \n",
" 3 | \n",
" D | \n",
" 4 | \n",
" NaN | \n",
" 3 | \n",
" U | \n",
" 4 | \n",
" ... | \n",
" NaN | \n",
" 3.6 | \n",
" 0.547723 | \n",
" 2.1250 | \n",
" 7.8750 | \n",
" Q34211 | \n",
" Q454 | \n",
" U | \n",
" 3.982734 | \n",
" <Response [200]> | \n",
"
\n",
" \n",
" | 1 | \n",
" Arafat | \n",
" terror | \n",
" 9 | \n",
" 3 | \n",
" D | \n",
" 4 | \n",
" NaN | \n",
" 3 | \n",
" U | \n",
" 4 | \n",
" ... | \n",
" NaN | \n",
" 3.6 | \n",
" 0.547723 | \n",
" 3.0625 | \n",
" 6.9375 | \n",
" Q34211 | \n",
" Q13648784 | \n",
" U | \n",
" 3.969884 | \n",
" <Response [200]> | \n",
"
\n",
" \n",
" | 2 | \n",
" FBI | \n",
" fingerprint | \n",
" 109 | \n",
" 3 | \n",
" D | \n",
" 4 | \n",
" NaN | \n",
" 4 | \n",
" NaN | \n",
" 3 | \n",
" ... | \n",
" NaN | \n",
" 3.6 | \n",
" 0.547723 | \n",
" 4.0625 | \n",
" 5.9375 | \n",
" Q8333 | \n",
" Q178022 | \n",
" U | \n",
" 4.000000 | \n",
" <Response [200]> | \n",
"
\n",
" \n",
" | 3 | \n",
" FBI | \n",
" investigation | \n",
" 110 | \n",
" 3 | \n",
" U | \n",
" 3 | \n",
" U | \n",
" 3 | \n",
" U | \n",
" 3 | \n",
" ... | \n",
" u | \n",
" 3.0 | \n",
" 0.000000 | \n",
" 5.0625 | \n",
" 4.9375 | \n",
" Q8333 | \n",
" Q21004260 | \n",
" M | \n",
" 3.951077 | \n",
" <Response [200]> | \n",
"
\n",
" \n",
" | 4 | \n",
" Harvard | \n",
" Yale | \n",
" 137 | \n",
" 2 | \n",
" S | \n",
" 3 | \n",
" S | \n",
" 2 | \n",
" S | \n",
" 2 | \n",
" ... | \n",
" s | \n",
" 2.2 | \n",
" 0.447214 | \n",
" 4.8750 | \n",
" 5.1250 | \n",
" Q13371 | \n",
" Q49112 | \n",
" M | \n",
" 1.264601 | \n",
" <Response [200]> | \n",
"
\n",
" \n",
"
\n",
"5 rows × 22 columns
\n",
""
],
"text/plain": [
" Word 1 Word 2 ID H_Sim H_Dim F_Sim F_Dim N_Sim N_Dim D_Sim \\\n",
"0 Arafat peace 8 3 D 4 NaN 3 U 4 \n",
"1 Arafat terror 9 3 D 4 NaN 3 U 4 \n",
"2 FBI fingerprint 109 3 D 4 NaN 4 NaN 3 \n",
"3 FBI investigation 110 3 U 3 U 3 U 3 \n",
"4 Harvard Yale 137 2 S 3 S 2 S 2 \n",
"\n",
" ... P_Dim Avg Stdev H_orig H_reversed word1_kg_id word2_kg_id \\\n",
"0 ... NaN 3.6 0.547723 2.1250 7.8750 Q34211 Q454 \n",
"1 ... NaN 3.6 0.547723 3.0625 6.9375 Q34211 Q13648784 \n",
"2 ... NaN 3.6 0.547723 4.0625 5.9375 Q8333 Q178022 \n",
"3 ... u 3.0 0.000000 5.0625 4.9375 Q8333 Q21004260 \n",
"4 ... s 2.2 0.447214 4.8750 5.1250 Q13371 Q49112 \n",
"\n",
" category embedding_cos_sim Resp_code \n",
"0 U 3.982734 \n",
"1 U 3.969884 \n",
"2 U 4.000000 \n",
"3 M 3.951077 \n",
"4 M 1.264601 \n",
"\n",
"[5 rows x 22 columns]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_sim_class_sim_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "operational-survival",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "48386a6eaa0745e4a9eebbed1e61c72c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/349 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "094ee6f3ee7f4566855755f5b33e0515",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/349 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "615d0754edee407cac17b7cb3fda9c73",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/349 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import requests\n",
"from tqdm.notebook import tqdm\n",
"import json\n",
"from joblib import Parallel, delayed\n",
"import sys\n",
"\n",
"word_sim_df = pd.read_csv(WORDSIM_OLD_FILE)\n",
"\n",
"def fetchSim(row, similarity_type):\n",
" resp = requests.get(\"https://kgtk.isi.edu/similarity_api?q1=\"+row['word1_kg_id']+\"&q2=\"+row['word2_kg_id']+\"&embedding_type=\"+similarity_type)\n",
" try:\n",
" row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None\n",
" except Exception as exc:\n",
" print(exc)\n",
" print(f\"Resp not found for {row['node1']}, {row['node2']}\")\n",
" row['embedding_cos_sim'] = None\n",
" row['Resp_code'] = resp\n",
" return row\n",
"\n",
"word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
"word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
"word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
"\n",
"word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
"word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()\n",
"word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
"\n",
"word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
"word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()\n",
"word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
"\n",
"word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
"word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()\n",
"word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
"\n",
"\n",
"word_sim_class_sim_df.to_csv(WORDSIM_OLD_CLASS_SIM_FILE, index=None)\n",
"word_sim_jc_sim_df.to_csv(WORDSIM_OLD_JC_SIM_FILE, index=None)\n",
"word_sim_top_sim_df.to_csv(WORDSIM_OLD_TOP_SIM_FILE, index=None)\n"
]
},
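  {
   "cell_type": "markdown",
   "id": "rescaling-note",
   "metadata": {},
   "source": [
    "A note on the `4 - 3 * abs(p)` rescaling above: the similarity API returns a value in [0, 1], while this benchmark's human ratings appear to run from 1 (most similar) to 4 (unrelated), as the `Avg` column in the preview suggests (Harvard/Yale ~2.2 vs. Arafat/peace ~3.6). The affine map sends similarity 1 to rating 1 and similarity 0 to rating 4. A small check:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "rescaling-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Affine rescaling used above: similarity in [0, 1] -> rating in [1, 4],\n",
    "# where lower ratings mean more similar.\n",
    "def to_rating(sim):\n",
    "    return 4 - 3 * abs(sim)\n",
    "\n",
    "assert to_rating(1.0) == 1.0  # identical pair -> most-similar rating\n",
    "assert to_rating(0.0) == 4.0  # unrelated pair -> least-similar rating"
   ]
  },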
{
"cell_type": "code",
"execution_count": 44,
"id": "useful-effectiveness",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4517a92d40bc4f2fac7cf08a47c047ad",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/16 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6dfb01e8813644e0b56d890be25eb955",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/16 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fe45273ea1f34d9faabd5422160ddc67",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/16 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import requests\n",
"from tqdm.notebook import tqdm\n",
"import json\n",
"from joblib import Parallel, delayed\n",
"import sys\n",
"\n",
"word_sim_df = pd.read_csv(DBPEDIA_MC_30_FINAL_FILE)\n",
"\n",
"def fetchSim(row, similarity_type):\n",
" resp = requests.get(\"https://kgtk.isi.edu/similarity_api?q1=\"+row['word1_kg_id']+\"&q2=\"+row['word2_kg_id']+\"&embedding_type=\"+similarity_type)\n",
" try:\n",
" row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None\n",
" except Exception as exc:\n",
" print(exc)\n",
" print(f\"Resp not found for {row['node1']}, {row['node2']}\")\n",
" row['embedding_cos_sim'] = None\n",
" row['Resp_code'] = resp\n",
" return row\n",
"\n",
"word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
"word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
"word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
"\n",
"word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
"word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()\n",
"word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
"\n",
"word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
"word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()\n",
"word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
"\n",
"word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
"word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()\n",
"word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
"\n",
"\n",
"word_sim_class_sim_df.to_csv(DBPEDIA_MC_30_CLASS_SIM_FILE, index=None)\n",
"word_sim_jc_sim_df.to_csv(DBPEDIA_MC_30_JC_SIM_FILE, index=None)\n",
"word_sim_top_sim_df.to_csv(DBPEDIA_MC_30_TOP_SIM_FILE, index=None)\n"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "dressed-grove",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2b986c2d80c6424f937b336ca871ac5e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/34 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4e945c056cf546ac860a80004b19ae1b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/34 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "624ccbdd27734b44840c3d7fd6878294",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/34 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import requests\n",
"from tqdm.notebook import tqdm\n",
"import json\n",
"from joblib import Parallel, delayed\n",
"import sys\n",
"\n",
"word_sim_df = pd.read_csv(DBPEDIA_RG_65_FINAL_FILE)\n",
"\n",
"def fetchSim(row, similarity_type):\n",
" resp = requests.get(\"https://kgtk.isi.edu/similarity_api?q1=\"+row['word1_kg_id']+\"&q2=\"+row['word2_kg_id']+\"&embedding_type=\"+similarity_type)\n",
" try:\n",
" row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None\n",
" except Exception as exc:\n",
" print(exc)\n",
" print(f\"Resp not found for {row['node1']}, {row['node2']}\")\n",
" row['embedding_cos_sim'] = None\n",
" row['Resp_code'] = resp\n",
" return row\n",
"\n",
"word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
"word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
"word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
"\n",
"word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
"word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()\n",
"word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
"\n",
"word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
"word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()\n",
"word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
"\n",
"word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
"word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()\n",
"word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
"\n",
"\n",
"word_sim_class_sim_df.to_csv(DBPEDIA_RG_65_CLASS_SIM_FILE, index=None)\n",
"word_sim_jc_sim_df.to_csv(DBPEDIA_RG_65_JC_SIM_FILE, index=None)\n",
"word_sim_top_sim_df.to_csv(DBPEDIA_RG_65_TOP_SIM_FILE, index=None)\n"
]
},
{
"cell_type": "markdown",
"id": "spatial-excerpt",
"metadata": {},
"source": [
"# Summary of embeddings"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "wooden-medicare",
"metadata": {},
"outputs": [],
"source": [
"p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))\n",
"\n",
"complexEmb = json.load(open('../data/Master_P279_dataset/masterComplexEmb.json'))\n",
"transeEmb = json.load(open('../data/Master_P279_dataset/masterTranseEmb.json'))\n",
"\n",
"text2Emb = json.load(open('../data/Master_P279_dataset/text2Emb.json'))\n",
"text7Emb = json.load(open('../data/Master_P279_dataset/text7Emb.json'))\n",
"\n",
"abstractEmb = json.load(open('../data/Master_P279_dataset/abstractEmb.json'))\n",
"abstractFirstSentEmb = json.load(open('../data/Master_P279_dataset/abstractFirstSentEmb.json'))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "distributed-magazine",
"metadata": {},
"outputs": [],
"source": [
"json.dump({key:val for key, val in complexEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json', 'w'))\n",
"json.dump({key:val for key, val in transeEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json', 'w'))\n",
"json.dump({key:val for key, val in text2Emb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/text_2_props_orig_embedding_dict.json', 'w'))\n",
"json.dump({key:val for key, val in text7Emb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/text_7_props_orig_embedding_dict.json', 'w'))\n",
"json.dump({key:val for key, val in abstractEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/abstract_orig_embedding_dict.json', 'w'))\n",
"json.dump({key:val for key, val in abstractFirstSentEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/abstract_first_sent_orig_embedding_dict.json', 'w'))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "disturbed-better",
"metadata": {},
"outputs": [],
"source": [
"def countOverlap(source, target):\n",
" cnt = 0\n",
" for key1 in source:\n",
" if key1 in target:\n",
" cnt += 1\n",
" return cnt"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "advised-cyprus",
"metadata": {},
"outputs": [],
"source": [
"summArr = []\n",
"cnt = countOverlap(complexEmb, p279QnodesList)\n",
"summArr.append(['complex', cnt, cnt / len(p279QnodesList) * 100])\n",
"\n",
"cnt = countOverlap(transeEmb, p279QnodesList)\n",
"summArr.append(['transe', cnt, cnt / len(p279QnodesList) * 100])\n",
"\n",
"cnt = countOverlap(text2Emb, p279QnodesList)\n",
"summArr.append(['text2', cnt, cnt / len(p279QnodesList) * 100])\n",
"\n",
"cnt = countOverlap(text7Emb, p279QnodesList)\n",
"summArr.append(['text7', cnt, cnt / len(p279QnodesList) * 100])\n",
"\n",
"cnt = countOverlap(abstractEmb, p279QnodesList)\n",
"summArr.append(['abstract', cnt, cnt / len(p279QnodesList) * 100])\n",
"\n",
"cnt = countOverlap(abstractFirstSentEmb, p279QnodesList)\n",
"summArr.append(['abstractFirstSent', cnt, cnt / len(p279QnodesList) * 100])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "czech-keeping",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"238889"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(p279QnodesList)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "employed-christmas",
"metadata": {},
"outputs": [
    {
     "data": {
"text/plain": [
" embedding count Coverage Percentage\n",
"0 complex 238448 99.815395\n",
"1 transe 238448 99.815395\n",
"2 text2 238889 100.000000\n",
"3 text7 238889 100.000000\n",
"4 abstract 105828 44.300072\n",
"5 abstractFirstSent 105828 44.300072"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(summArr, columns=['embedding', 'count', 'Coverage Percentage'])"
]
},
{
"cell_type": "markdown",
"id": "changing-strategy",
"metadata": {},
"source": [
"# Embeddings correction"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "purple-raising",
"metadata": {},
"outputs": [],
"source": [
"masterEmbedDictMaster = {}\n",
"subsetEmbedDictMaster = {}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "round-product",
"metadata": {},
"outputs": [],
"source": [
"masterEmbedKeys = ['text_7_props', 'text_2_props', 'complex', 'transe', 'abstract', 'abstract_first_sent']\n",
"for key1 in masterEmbedKeys:\n",
" masterEmbedDictMaster[key1] = json.load(open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict.json'))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "metallic-insulin",
"metadata": {},
"outputs": [],
"source": [
"subsetEmbedKeys = ['text_7props', 'text_2props', 'complex', 'transe', 'abstract', 'abstract_first_sent']\n",
"for key1 in subsetEmbedKeys:\n",
" subsetEmbedDictMaster[key1] = json.load(open('../data/orig_embeddings/'+key1+'_original_embeddings_dict.json'))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "assigned-parameter",
"metadata": {},
"outputs": [],
"source": [
"wordSim353AnnotDF_New = pd.read_csv('../data/wordsim353_with_r3.csv')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "olympic-yemen",
"metadata": {},
"outputs": [],
"source": [
"wordsim_pairs = {(row['word1_kg_id'], row['word2_kg_id']) for _, row in wordSim353AnnotDF_New.iterrows()}"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "welcome-disorder",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pair Coverage by text_7props embeddings created for 19k retrofitting: 325\n",
"Pair Coverage by text_2props embeddings created for 19k retrofitting: 325\n",
"Pair Coverage by complex embeddings created for 19k retrofitting: 342\n",
"Pair Coverage by transe embeddings created for 19k retrofitting: 342\n",
"Pair Coverage by abstract embeddings created for 19k retrofitting: 343\n",
"Pair Coverage by abstract_first_sent embeddings created for 19k retrofitting: 343\n"
]
}
],
"source": [
"for key1 in subsetEmbedKeys:\n",
" print(f\"Pair Coverage by {key1} embeddings created for 19k retrofitting: {sum([row[0] in subsetEmbedDictMaster[key1] and row[1] in subsetEmbedDictMaster[key1] for row in wordsim_pairs])}\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "northern-psychiatry",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pair Coverage by old text_7_props embeddings created for 19k retrofitting: 278\n",
"Pair Coverage by old text_2_props embeddings created for 19k retrofitting: 278\n",
"Pair Coverage by old complex embeddings created for 19k retrofitting: 278\n",
"Pair Coverage by old transe embeddings created for 19k retrofitting: 278\n",
"Pair Coverage by old abstract embeddings created for 19k retrofitting: 183\n",
"Pair Coverage by old abstract_first_sent embeddings created for 19k retrofitting: 183\n"
]
}
],
"source": [
"for key1 in masterEmbedKeys:\n",
" print(f\"Pair Coverage by old {key1} embeddings created for 19k retrofitting: {sum([row[0] in masterEmbedDictMaster[key1] and row[1] in masterEmbedDictMaster[key1] for row in wordsim_pairs])}\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "contrary-casino",
"metadata": {},
"outputs": [],
"source": [
"wordSim353AnnotDF_New_set = set(wordSim353AnnotDF_New.word1_kg_id.to_list() + wordSim353AnnotDF_New.word2_kg_id.to_list())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "alleged-polish",
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"masterEmbCorrections = defaultdict(list)\n",
"for node in wordSim353AnnotDF_New_set:\n",
" for i in range(len(masterEmbedKeys)):\n",
" if node not in masterEmbedDictMaster[masterEmbedKeys[i]] and node in wordSim353AnnotDF_New_set:\n",
" masterEmbCorrections[masterEmbedKeys[i]].append(node)"
]
},
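  {
   "cell_type": "markdown",
   "id": "corrections-tally-note",
   "metadata": {},
   "source": [
    "How many corrections does each embedding need? A quick tally over the dict just built:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "corrections-tally",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Nodes needing a backfilled vector, per embedding type.\n",
    "for key1, nodes in masterEmbCorrections.items():\n",
    "    print(f\"{key1}: {len(nodes)} nodes to correct\")"
   ]
  },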
{
"cell_type": "code",
"execution_count": 21,
"id": "periodic-buffer",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['abstract', 'abstract_first_sent', 'text_7_props', 'text_2_props', 'complex', 'transe'])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"masterEmbCorrections.keys()"
]
},
{
"cell_type": "markdown",
"id": "awful-signal",
"metadata": {},
"source": [
"## Complex, Transe"
]
},
{
"cell_type": "code",
"execution_count": 142,
"id": "exceptional-acting",
"metadata": {},
"outputs": [],
"source": [
"# import requests\n",
"# correctedComplexEmb = {}\n",
"# correctedTranseEmb = {}\n",
"# for wordID in masterEmbCorrections['complex']:\n",
"# try:\n",
"# resp = requests.get(\"http://ckg07:9200/wikidatadwd-augmented/_doc/\"+wordID).json()['_source']\n",
"# correctedComplexEmb[wordID] = [float(p) for p in resp['graph_embedding_complex'].split(',')]\n",
"# correctedTranseEmb[wordID] = [float(p) for p in resp['graph_embeddings_transe'].split(',')]\n",
"# except:\n",
"# print(\"Failure returned for http://ckg07:9200/wikidatadwd-augmented/_doc/\"+wordID)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "assigned-journey",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "51850ec9544547f293820bd9e94091f4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/42575933 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4de00c52596f4ce3b5d17ee9ef73b068",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/42575933 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1h 33min 17s, sys: 2min 38s, total: 1h 35min 56s\n",
"Wall time: 1h 35min 28s\n"
]
}
],
"source": [
"%%time\n",
"correctedComplexEmb = {qnode: emb for emb, qnode in tqdm(zip(f['embeddings'], ent_names), total=len(ent_names)) if qnode in masterEmbCorrections['complex']}\n",
"correctedTranseEmb = {qnode: emb for emb, qnode in tqdm(zip(transf['embeddings'], ent_names), total=len(ent_names)) if qnode in masterEmbCorrections['complex']}"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "female-scope",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"200"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(list(correctedComplexEmb.items())[0][1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "valuable-bahrain",
"metadata": {},
"outputs": [],
"source": [
"len(masterEmbedDictMaster['complex'][list(masterEmbedDictMaster['complex'].keys())[0]])"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "religious-celebration",
"metadata": {},
"outputs": [],
"source": [
"for node, emb in correctedComplexEmb.items():\n",
" masterEmbedDictMaster['complex'][node] = emb\n",
"for node, emb in correctedTranseEmb.items():\n",
" masterEmbedDictMaster['transe'][node] = emb"
]
},
{
"cell_type": "markdown",
"id": "imported-advance",
"metadata": {},
"source": [
"## Text Embeddings Correction file generation"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "historical-conservative",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q1 = \"kgtk filter -i ../data/wikidataos.for.text-embedding.tsv --word-separator '|' -p '\" + '|'.join(masterEmbCorrections['text_7_props']) + \";;' -o ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-corrections.tsv -v True\"\n",
"os.system(\"screen -dm \" + q1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "overhead-commission",
"metadata": {},
"outputs": [],
"source": [
"q1 = \"kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-corrections.tsv \\\n",
" --model roberta-large-nli-mean-tokens \\\n",
" --property-labels-file ../data/labels.en.tsv --debug \\\n",
" --isa-properties P31 P279 P106 P39 P1382 P373 P452 \\\n",
" --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-corrections.tsv\"\n",
"# print(q1)\n",
"os.system(q1 + \" &\")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "cooked-vinyl",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"32512"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q1 = \"kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-corrections.tsv \\\n",
" --model roberta-large-nli-mean-tokens \\\n",
" --property-labels-file ../data/labels.en.tsv --debug \\\n",
" --isa-properties P31 P279 \\\n",
" --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-corrections.tsv\"\n",
"# print(q1)\n",
"os.system(q1 + \" &\")"
]
},
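{
"cell_type": "markdown",
"id": "added-kgtk-poll-note",
"metadata": {},
"source": [
"Both `kgtk text-embedding` jobs run in the background, so the next cell should only be executed once they finish. A minimal check (added sketch, using the output paths defined above) that the files exist before reading them:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-kgtk-poll",
"metadata": {},
"outputs": [],
"source": [
"# minimal existence check for the background jobs' output files\n",
"out7 = '../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-corrections.tsv'\n",
"out2 = '../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-corrections.tsv'\n",
"print(os.path.exists(out7), os.path.exists(out2))"
]
},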
{
"cell_type": "code",
"execution_count": 110,
"id": "static-failure",
"metadata": {},
"outputs": [],
"source": [
"corrected7Emb = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-corrections.tsv', sep='\\t')\n",
"corrected2Emb = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-corrections.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 111,
"id": "spare-flexibility",
"metadata": {},
"outputs": [],
"source": [
"corrected7Emb = corrected7Emb[corrected7Emb.property == 'text_embedding']\n",
"corrected7Emb['value'] = corrected7Emb.value.apply(lambda p: [float(p1) for p1 in p.split(',')])\n",
"\n",
"corrected2Emb = corrected2Emb[corrected2Emb.property == 'text_embedding']\n",
"corrected2Emb['value'] = corrected2Emb.value.apply(lambda p: [float(p1) for p1 in p.split(',')])"
]
},
{
"cell_type": "code",
"execution_count": 112,
"id": "minute-oakland",
"metadata": {},
"outputs": [],
"source": [
"for _, row in corrected7Emb.iterrows():\n",
" masterEmbedDictMaster['text_7_props'][row['node']] = row['value']\n",
"for _, row in corrected2Emb.iterrows():\n",
" masterEmbedDictMaster['text_2_props'][row['node']] = row['value']"
]
},
{
"cell_type": "markdown",
"id": "psychological-brighton",
"metadata": {},
"source": [
"## Abstract"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "meaning-spanking",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DBPedia dataset has 5732949 records with unique 5732947 index values\n",
"There are 8472828 sitelinks present in the dataset corresponding to 8472828 unique node1s (Qxxx), 8401883 unique labels (text)\n"
]
}
],
"source": [
"df1 = pd.read_csv(\"../data/short-abstracts_lang=en.csv\", skiprows=1, skipfooter=1, header=None, engine='python')\n",
"df1.columns = ['ignore', 'node1', 'url', 'ignore2', 'abstract']\n",
"df1 = df1.set_index('node1')\n",
"df1[df1.url.apply(lambda p: 'http://dbpedia.org/resource' not in p)]\n",
"print(f\"DBPedia dataset has {len(df1)} records with unique {df1.index.nunique()} index values\")\n",
"sitelinksDF = pd.read_csv(\"../data/sitelinks.en.tsv.gz\", sep='\\t')\n",
"sitelinksDF['trimmedNode2'] = sitelinksDF.node2.apply(lambda p: p.split(\"/\")[-1] if p.split(\"/\")[-1] != '' else p.split(\"/\")[-2])\n",
"sitelinksDF1 = sitelinksDF[sitelinksDF.label == 'wikipedia_sitelink']\n",
"sitelinksDF2 = sitelinksDF1.set_index('trimmedNode2')\n",
"print(f\"There are {len(sitelinksDF2)} sitelinks present in the dataset corresponding to {sitelinksDF2.node1.nunique()} unique node1s (Qxxx), {sitelinksDF2.index.nunique()} unique labels (text)\")\n",
"sitelinksDF2.loc[sitelinksDF2[sitelinksDF2.index.duplicated()].index]\n",
"masterEmbCorrections_abs_set = set(masterEmbCorrections['abstract'])\n",
"sitelinksDF2 = sitelinksDF2[sitelinksDF2.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "offensive-enclosure",
"metadata": {},
"outputs": [],
"source": [
"labelsDF = pd.read_csv('../data/labels.en.tsv', sep='\\t')\n",
"labelsDF = labelsDF[labelsDF.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]\n",
"labelsDict = {row['node1']: row['node2'] for _, row in labelsDF.iterrows()}\n",
"descriptionsDF = pd.read_csv('../../wd-correctness/gdrive-kgtk-dump-2020-12-07/descriptions.en.tsv.gz', compression='gzip', sep='\\t')\n",
"descriptionsDF = descriptionsDF[descriptionsDF.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]\n",
"descDict = {row['node1']: row['node2'] for _, row in descriptionsDF.iterrows()}"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "better-tuner",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"From 58 Qnodes, there are 16 sitelink Qnodes which do not have a short abstract i.e 42 have a short abstract\n"
]
}
],
"source": [
"sdf_set = set(sitelinksDF2.index.to_list())\n",
"df1 = df1[df1.index.map(lambda p: p in sdf_set)]\n",
"abstractsDF2 = sitelinksDF2.join(df1).reset_index()\n",
"print(f\"From {len(abstractsDF2)} Qnodes, there are {abstractsDF2.ignore2.isna().sum()} sitelink Qnodes which do not have a short abstract i.e {len(abstractsDF2) - abstractsDF2.ignore2.isna().sum()} have a short abstract\")\n",
"# abstractsDF2 = abstractsDF2[~abstractsDF2.abstract.isna()]"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "invalid-fiction",
"metadata": {},
"outputs": [],
"source": [
"abstractsDF2['node1_label'] = abstractsDF2.node1.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else \"\")\n",
"abstractsDF2['node1_desc'] = abstractsDF2.node1.apply(lambda p: descDict[p][1:-4] if p in descDict else \"\")\n",
"def combineAbsLabDesc(row, parameter):\n",
" if not(pd.isna(row[parameter])) and row[parameter] != 'nan' and row[parameter] != \"\":\n",
" return row[parameter]\n",
" elif row['node1_label'] == \"\" and row['node1_desc'] == \"\":\n",
" return None\n",
" else:\n",
" return row['node1_label'] + ' ' + row['node1_desc']"
]
},
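{
"cell_type": "markdown",
"id": "added-fallback-demo-note",
"metadata": {},
"source": [
"A small illustration (added sketch) of the fallback logic in `combineAbsLabDesc` on a hypothetical row: when the abstract is missing, the label and description are concatenated instead."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-fallback-demo",
"metadata": {},
"outputs": [],
"source": [
"# hypothetical row with no abstract: the label + description fallback applies\n",
"demoRow = pd.Series({'abstract': None, 'node1_label': 'potato', 'node1_desc': 'species of plant'})\n",
"print(combineAbsLabDesc(demoRow, 'abstract'))  # -> 'potato species of plant'"
]
},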
{
"cell_type": "code",
"execution_count": 90,
"id": "opened-drink",
"metadata": {},
"outputs": [],
"source": [
"from nltk.tokenize import sent_tokenize\n",
"abstractsDF2['abstract_firstSent'] = abstractsDF2.abstract.apply(lambda p: sent_tokenize(str(p))[0] if p else None)\n",
"abstractsDF2 = abstractsDF2.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "affected-reproduction",
"metadata": {},
"outputs": [],
"source": [
"abstractsDF2['abstract'] = abstractsDF2.apply(combineAbsLabDesc, axis=1, args=('abstract',))\n",
"abstractsDF2['abstract_firstSent'] = abstractsDF2.apply(combineAbsLabDesc, axis=1, args=('abstract_firstSent',))"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "actual-communication",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"58"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(abstractsDF2)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "quantitative-tumor",
"metadata": {},
"outputs": [],
"source": [
"abstractsDF2 = abstractsDF2[~abstractsDF2.abstract.isna()]"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "turned-retail",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"58"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(abstractsDF2)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "heard-freedom",
"metadata": {},
"outputs": [],
"source": [
"abstractsDF2 = abstractsDF2.drop(columns=['index']).reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "forty-southeast",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" level_0 | \n",
" index | \n",
" trimmedNode2 | \n",
" id | \n",
" node1 | \n",
" label | \n",
" node2 | \n",
" ignore | \n",
" url | \n",
" ignore2 | \n",
" abstract | \n",
" node1_label | \n",
" node1_desc | \n",
" abstract_firstSent | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0 | \n",
" Luxuries | \n",
" Q10953913-wikipedia_sitelink-538fe3-0 | \n",
" Q10953913 | \n",
" wikipedia_sitelink | \n",
" http://en.wikipedia.org/wiki/Luxuries | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" luxuryBehavior, expenses or equipment that far... | \n",
" luxury | \n",
" Behavior, expenses or equipment that far excee... | \n",
" nan | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 1 | \n",
" Potato | \n",
" Q10998-wikipedia_sitelink-56b85c-0 | \n",
" Q10998 | \n",
" wikipedia_sitelink | \n",
" http://en.wikipedia.org/wiki/Potato | \n",
" 10709.0 | \n",
" <http://dbpedia.org/resource/Potato> | \n",
" <http://www.w3.org/2000/01/rdf-schema#comment> | \n",
" The potato is a root vegetable native to the A... | \n",
" potato | \n",
" species of plant | \n",
" The potato is a root vegetable native to the A... | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" 2 | \n",
" Mars | \n",
" Q111-wikipedia_sitelink-9ff296-0 | \n",
" Q111 | \n",
" wikipedia_sitelink | \n",
" http://en.wikipedia.org/wiki/Mars | \n",
" 1803088.0 | \n",
" <http://dbpedia.org/resource/Mars> | \n",
" <http://www.w3.org/2000/01/rdf-schema#comment> | \n",
" Mars is the fourth planet from the Sun and the... | \n",
" Mars | \n",
" fourth planet from the Sun | \n",
" Mars is the fourth planet from the Sun and the... | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
" 3 | \n",
" Dawn | \n",
" Q11326182-wikipedia_sitelink-ae2918-0 | \n",
" Q11326182 | \n",
" wikipedia_sitelink | \n",
" http://en.wikipedia.org/wiki/Dawn | \n",
" 97544.0 | \n",
" <http://dbpedia.org/resource/Dawn> | \n",
" <http://www.w3.org/2000/01/rdf-schema#comment> | \n",
" Dawn is the time that marks the beginning of t... | \n",
" dawn | \n",
" time that marks the beginning of the twilight ... | \n",
" Dawn is the time that marks the beginning of t... | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" 4 | \n",
" Change_(philosophy) | \n",
" Q1150070-wikipedia_sitelink-81cf5f-0 | \n",
" Q1150070 | \n",
" wikipedia_sitelink | \n",
" http://en.wikipedia.org/wiki/Change_(philosophy) | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" changeprocess, event or action that deviates f... | \n",
" change | \n",
" process, event or action that deviates from th... | \n",
" nan | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" level_0 index trimmedNode2 id \\\n",
"0 0 0 Luxuries Q10953913-wikipedia_sitelink-538fe3-0 \n",
"1 1 1 Potato Q10998-wikipedia_sitelink-56b85c-0 \n",
"2 2 2 Mars Q111-wikipedia_sitelink-9ff296-0 \n",
"3 3 3 Dawn Q11326182-wikipedia_sitelink-ae2918-0 \n",
"4 4 4 Change_(philosophy) Q1150070-wikipedia_sitelink-81cf5f-0 \n",
"\n",
" node1 label \\\n",
"0 Q10953913 wikipedia_sitelink \n",
"1 Q10998 wikipedia_sitelink \n",
"2 Q111 wikipedia_sitelink \n",
"3 Q11326182 wikipedia_sitelink \n",
"4 Q1150070 wikipedia_sitelink \n",
"\n",
" node2 ignore \\\n",
"0 http://en.wikipedia.org/wiki/Luxuries NaN \n",
"1 http://en.wikipedia.org/wiki/Potato 10709.0 \n",
"2 http://en.wikipedia.org/wiki/Mars 1803088.0 \n",
"3 http://en.wikipedia.org/wiki/Dawn 97544.0 \n",
"4 http://en.wikipedia.org/wiki/Change_(philosophy) NaN \n",
"\n",
" url \\\n",
"0 NaN \n",
"1 \n",
"2 \n",
"3 \n",
"4 NaN \n",
"\n",
" ignore2 \\\n",
"0 NaN \n",
"1 \n",
"2 \n",
"3 \n",
"4 NaN \n",
"\n",
" abstract node1_label \\\n",
"0 luxuryBehavior, expenses or equipment that far... luxury \n",
"1 The potato is a root vegetable native to the A... potato \n",
"2 Mars is the fourth planet from the Sun and the... Mars \n",
"3 Dawn is the time that marks the beginning of t... dawn \n",
"4 changeprocess, event or action that deviates f... change \n",
"\n",
" node1_desc \\\n",
"0 Behavior, expenses or equipment that far excee... \n",
"1 species of plant \n",
"2 fourth planet from the Sun \n",
"3 time that marks the beginning of the twilight ... \n",
"4 process, event or action that deviates from th... \n",
"\n",
" abstract_firstSent \n",
"0 nan \n",
"1 The potato is a root vegetable native to the A... \n",
"2 Mars is the fourth planet from the Sun and the... \n",
"3 Dawn is the time that marks the beginning of t... \n",
"4 nan "
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"abstractsDF2.head()"
]
},
{
"cell_type": "code",
"execution_count": 117,
"id": "multiple-offer",
"metadata": {},
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"from time import time\n",
"import pandas as pd\n",
"\n",
"def getSentEmbeddings(valSeries, modelName):\n",
" model = SentenceTransformer(modelName)\n",
" start = time()\n",
" encodings = model.encode(valSeries.to_list())\n",
" print(time()-start,'s')\n",
" return encodings\n",
"\n",
"def getIndSentEmbeddings(sent, modelName):\n",
" model = SentenceTransformer(modelName)\n",
" start = time()\n",
" encodings = model.encode([sent])\n",
" print(time()-start,'s')\n",
" return encodings"
]
},
{
"cell_type": "code",
"execution_count": 102,
"id": "sustainable-breakdown",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6419482231140137 s\n",
"0.5260367393493652 s\n"
]
}
],
"source": [
"abstractsDF2['abs_emb'] = pd.Series(list(getSentEmbeddings(abstractsDF2.abstract, 'bert-base-nli-mean-tokens')))\n",
"abstractsDF2['abs_firstSent_emb'] = pd.Series(list(getSentEmbeddings(abstractsDF2.abstract_firstSent, 'bert-base-nli-mean-tokens')))"
]
},
{
"cell_type": "code",
"execution_count": 104,
"id": "usual-selling",
"metadata": {},
"outputs": [],
"source": [
"for _, row in abstractsDF2.iterrows():\n",
" masterEmbedDictMaster['abstract'][row['node1']] = row['abs_emb']\n",
" masterEmbedDictMaster['abstract_first_sent'][row['node1']] = row['abs_firstSent_emb']"
]
},
{
"cell_type": "code",
"execution_count": 124,
"id": "promising-owner",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.37706875801086426 s\n",
"0.3001420497894287 s\n",
"0.370746374130249 s\n",
"0.6896324157714844 s\n",
"0.33779358863830566 s\n",
"0.3965473175048828 s\n",
"0.3200962543487549 s\n",
"0.3489806652069092 s\n",
"0.3413431644439697 s\n",
"0.32114505767822266 s\n",
"0.3811838626861572 s\n",
"0.34630370140075684 s\n",
"0.37790727615356445 s\n",
"0.26860570907592773 s\n",
"0.3601953983306885 s\n",
"0.3713240623474121 s\n",
"0.34137582778930664 s\n",
"0.33736181259155273 s\n",
"0.37023448944091797 s\n",
"0.31382036209106445 s\n",
"0.35136938095092773 s\n",
"0.37309718132019043 s\n",
"0.33543896675109863 s\n",
"0.38199710845947266 s\n",
"0.3740067481994629 s\n",
"0.3278031349182129 s\n",
"0.32283997535705566 s\n",
"0.34000563621520996 s\n",
"0.31502628326416016 s\n",
"0.34996771812438965 s\n",
"0.3871273994445801 s\n",
"0.3487060070037842 s\n",
"0.35172486305236816 s\n",
"0.3280646800994873 s\n",
"0.3519773483276367 s\n",
"0.3354451656341553 s\n",
"0.3633551597595215 s\n",
"0.3226644992828369 s\n",
"0.33882975578308105 s\n",
"0.36072254180908203 s\n",
"0.3833494186401367 s\n",
"0.2929043769836426 s\n",
"0.32875680923461914 s\n",
"0.36334872245788574 s\n",
"0.34148168563842773 s\n",
"0.3569769859313965 s\n",
"0.37468576431274414 s\n",
"0.399524450302124 s\n",
"0.3516504764556885 s\n",
"0.333402156829834 s\n",
"0.3851203918457031 s\n",
"0.34867238998413086 s\n",
"0.3607771396636963 s\n",
"0.38669753074645996 s\n",
"0.33347272872924805 s\n",
"0.36278390884399414 s\n",
"0.3602781295776367 s\n",
"0.3322322368621826 s\n",
"0.36807823181152344 s\n",
"0.3407411575317383 s\n",
"0.3837134838104248 s\n",
"0.38958096504211426 s\n",
"0.3332521915435791 s\n",
"0.3331124782562256 s\n",
"0.35001134872436523 s\n",
"0.32433485984802246 s\n",
"0.36315059661865234 s\n",
"0.34323906898498535 s\n",
"0.3112339973449707 s\n",
"0.30588483810424805 s\n",
"0.30704236030578613 s\n",
"0.31201720237731934 s\n"
]
}
],
"source": [
"for node in masterEmbCorrections_abs_set:\n",
" if node not in masterEmbedDictMaster['abstract']:\n",
" if node in labelsDict and node in descDict:\n",
" masterEmbedDictMaster['abstract'][node] = getIndSentEmbeddings(labelsDict[node] + ' ' + descDict[node], 'bert-base-nli-mean-tokens')[0]\n",
" elif node in labelsDict:\n",
" masterEmbedDictMaster['abstract'][node] = getIndSentEmbeddings(labelsDict[node], 'bert-base-nli-mean-tokens')[0]"
]
},
{
"cell_type": "code",
"execution_count": 125,
"id": "acquired-manitoba",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.32213783264160156 s\n",
"0.357776403427124 s\n",
"0.37949395179748535 s\n",
"0.35210466384887695 s\n",
"0.28103041648864746 s\n",
"0.3626406192779541 s\n",
"0.35109710693359375 s\n",
"0.34203338623046875 s\n",
"0.32386112213134766 s\n",
"0.3354361057281494 s\n",
"0.3063056468963623 s\n",
"0.3441202640533447 s\n",
"0.32869935035705566 s\n",
"0.42442989349365234 s\n",
"0.37239527702331543 s\n",
"0.38650059700012207 s\n",
"0.3191685676574707 s\n",
"0.3609733581542969 s\n",
"0.3115823268890381 s\n",
"0.36015963554382324 s\n",
"0.3338603973388672 s\n",
"0.3487727642059326 s\n",
"0.3250617980957031 s\n",
"0.35145044326782227 s\n",
"0.33944034576416016 s\n",
"0.31502413749694824 s\n",
"0.3611795902252197 s\n",
"0.35285043716430664 s\n",
"0.3575010299682617 s\n",
"0.304781436920166 s\n",
"0.4003562927246094 s\n",
"0.3315858840942383 s\n",
"0.36008763313293457 s\n",
"0.36187100410461426 s\n",
"0.32981252670288086 s\n",
"0.3378865718841553 s\n",
"0.31662964820861816 s\n",
"0.32143092155456543 s\n",
"0.3152732849121094 s\n",
"0.38222813606262207 s\n",
"0.3846759796142578 s\n",
"0.33153700828552246 s\n",
"0.37013936042785645 s\n",
"0.33272790908813477 s\n",
"0.29526567459106445 s\n",
"0.3218040466308594 s\n",
"0.3795340061187744 s\n",
"0.3576061725616455 s\n",
"0.35764193534851074 s\n",
"0.36867713928222656 s\n",
"0.3807237148284912 s\n",
"0.33266758918762207 s\n",
"0.33878159523010254 s\n",
"0.38289546966552734 s\n",
"0.38695788383483887 s\n",
"0.33074188232421875 s\n",
"0.32749414443969727 s\n",
"0.33860039710998535 s\n",
"0.36585235595703125 s\n",
"0.33011841773986816 s\n",
"0.3293156623840332 s\n",
"0.3491702079772949 s\n",
"0.3720529079437256 s\n",
"0.3078622817993164 s\n",
"0.3844125270843506 s\n",
"0.32468104362487793 s\n",
"0.3186354637145996 s\n",
"0.3438723087310791 s\n",
"0.36643028259277344 s\n",
"0.34279680252075195 s\n",
"0.3625810146331787 s\n",
"0.35865354537963867 s\n",
"0.3503103256225586 s\n",
"0.37160682678222656 s\n",
"0.3268110752105713 s\n",
"0.2564544677734375 s\n",
"0.37343525886535645 s\n",
"0.33298277854919434 s\n"
]
}
],
"source": [
"for node in masterEmbCorrections_abs_set:\n",
" if node not in masterEmbedDictMaster['abstract_first_sent']:\n",
" if node in labelsDict and node in descDict:\n",
" masterEmbedDictMaster['abstract_first_sent'][node] = getIndSentEmbeddings(labelsDict[node] + ' ' + descDict[node], 'bert-base-nli-mean-tokens')[0]\n",
" elif node in labelsDict:\n",
" masterEmbedDictMaster['abstract_first_sent'][node] = getIndSentEmbeddings(labelsDict[node], 'bert-base-nli-mean-tokens')[0]"
]
},
{
"cell_type": "markdown",
"id": "veterinary-thailand",
"metadata": {},
"source": [
"## Updated coverage details"
]
},
{
"cell_type": "code",
"execution_count": 145,
"id": "intimate-campus",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pair Coverage by new text_7_props embeddings created for 19k retrofitting: 325\n",
"Pair Coverage by new text_2_props embeddings created for 19k retrofitting: 325\n",
"Pair Coverage by new complex embeddings created for 19k retrofitting: 343\n",
"Pair Coverage by new transe embeddings created for 19k retrofitting: 343\n",
"Pair Coverage by new abstract embeddings created for 19k retrofitting: 339\n",
"Pair Coverage by new abstract_first_sent embeddings created for 19k retrofitting: 339\n"
]
}
],
"source": [
"for key1 in masterEmbedKeys:\n",
" print(f\"Pair Coverage by new {key1} embeddings created for 19k retrofitting: {sum([row[0] in masterEmbedDictMaster[key1] and row[1] in masterEmbedDictMaster[key1] for row in wordsim_pairs])}\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "lovely-token",
"metadata": {},
"outputs": [],
"source": [
"for key1 in masterEmbedDictMaster.keys():\n",
" for key2 in masterEmbedDictMaster[key1].keys():\n",
" if type(masterEmbedDictMaster[key1][key2]) != list:\n",
" masterEmbedDictMaster[key1][key2] = masterEmbedDictMaster[key1][key2].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "exact-surfing",
"metadata": {},
"outputs": [],
"source": [
"for key1 in ['complex', 'transe']:\n",
" json.dump(masterEmbedDictMaster[key1], open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict_updated.json', 'w'))"
]
},
{
"cell_type": "code",
"execution_count": 134,
"id": "behavioral-spain",
"metadata": {},
"outputs": [],
"source": [
"def countOverlap(source, target):\n",
" cnt = 0\n",
" for key1 in source:\n",
" if key1 in target:\n",
" cnt += 1\n",
" return cnt\n",
"p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))"
]
},
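{
"cell_type": "markdown",
"id": "added-overlap-demo-note",
"metadata": {},
"source": [
"A tiny, hypothetical example of what `countOverlap` measures: the number of keys of `source` that also appear in `target`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-overlap-demo",
"metadata": {},
"outputs": [],
"source": [
"print(countOverlap({'Q1': 1, 'Q2': 2}, {'Q2', 'Q3'}))  # -> 1"
]
},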
{
"cell_type": "code",
"execution_count": 147,
"id": "hawaiian-brain",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" embedding | \n",
" total count | \n",
" overlap count | \n",
" Coverage Percentage | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" text_7_props | \n",
" 238930 | \n",
" 238889 | \n",
" 100.000000 | \n",
"
\n",
" \n",
" | 1 | \n",
" text_2_props | \n",
" 238930 | \n",
" 238889 | \n",
" 100.000000 | \n",
"
\n",
" \n",
" | 2 | \n",
" complex | \n",
" 238500 | \n",
" 238448 | \n",
" 99.815395 | \n",
"
\n",
" \n",
" | 3 | \n",
" transe | \n",
" 238500 | \n",
" 238448 | \n",
" 99.815395 | \n",
"
\n",
" \n",
" | 4 | \n",
" abstract | \n",
" 105964 | \n",
" 105916 | \n",
" 44.336910 | \n",
"
\n",
" \n",
" | 5 | \n",
" abstract_first_sent | \n",
" 105964 | \n",
" 105916 | \n",
" 44.336910 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" embedding total count overlap count Coverage Percentage\n",
"0 text_7_props 238930 238889 100.000000\n",
"1 text_2_props 238930 238889 100.000000\n",
"2 complex 238500 238448 99.815395\n",
"3 transe 238500 238448 99.815395\n",
"4 abstract 105964 105916 44.336910\n",
"5 abstract_first_sent 105964 105916 44.336910"
]
},
"execution_count": 147,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"summArr = []\n",
"for key1 in masterEmbedDictMaster:\n",
" cnt = countOverlap(masterEmbedDictMaster[key1], p279QnodesList)\n",
" summArr.append([key1, len(masterEmbedDictMaster[key1]), cnt, cnt / len(p279QnodesList) * 100])\n",
"pd.DataFrame(summArr, columns=['embedding', 'total count', 'overlap count', 'Coverage Percentage'])"
]
},
{
"cell_type": "markdown",
"id": "greater-namibia",
"metadata": {},
"source": [
"# Concatenated Embeddings"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "fifth-associate",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"embedDictMaster = {}\n",
"for key1 in ['text_7_props', 'text_2_props', 'complex', 'transe', 'abstract', 'abstract_first_sent']:\n",
" embedDictMaster[key1] = json.load(open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict_updated.json'))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "egyptian-sentence",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"text_7_props : 1024\n",
"text_2_props : 1024\n",
"complex : 200\n",
"transe : 200\n",
"abstract : 768\n",
"abstract_first_sent : 768\n"
]
}
],
"source": [
"def determineEmbeddingLengths(embedDictMaster):\n",
" for key in embedDictMaster.keys():\n",
" embed_size = len(next(iter(embedDictMaster[key].values())))\n",
" print(key,\": \",embed_size)\n",
"determineEmbeddingLengths(embedDictMaster)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "removable-point",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Added 11 corrections\n",
"Added 11 corrections\n",
"Added 0 corrections\n",
"Added 0 corrections\n",
"Added 4 corrections\n",
"Added 4 corrections\n"
]
}
],
"source": [
"for key1 in embedDictMaster.keys():\n",
" embedDictMaster[key1] = deserializeEmbeddingDict(embedDictMaster[key1])\n",
"# Fill Coverage of embedding dictionaries\n",
"for key1 in embedDictMaster.keys():\n",
" embedDictMaster[key1] = fillCoverage(embedDictMaster[key1])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "productive-indiana",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"text_7_props 238941\n",
"text_2_props 238941\n",
"complex 238941\n",
"transe 238941\n",
"abstract 238941\n",
"abstract_first_sent 238941\n"
]
}
],
"source": [
"for key1 in embedDictMaster.keys():\n",
" print(key1, len(next(iter(embedDictMaster.values()))))"
]
},
{
"cell_type": "markdown",
"id": "prime-hometown",
"metadata": {},
"source": [
"# Retrofitting sample"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "tight-civilization",
"metadata": {},
"outputs": [],
"source": [
"def fetchNeighbours(df):\n",
" neighboursDict = {}\n",
" for _, row in df.iterrows():\n",
" if row.node1 not in neighboursDict:\n",
" neighboursDict[row.node1] = []\n",
" neighboursDict[row.node1].append((row.node2, row.bert2SentSim))\n",
" \n",
" if row.node2 not in neighboursDict:\n",
" neighboursDict[row.node2] = []\n",
" neighboursDict[row.node2].append((row.node1, row.bert2SentSim))\n",
" print(max([len(neigh) for neigh in neighboursDict.values()]))\n",
" return neighboursDict"
]
},
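{
"cell_type": "markdown",
"id": "added-neighbours-demo-note",
"metadata": {},
"source": [
"A toy check (added sketch) of `fetchNeighbours` on two hypothetical pairs: it builds a symmetric neighbour map keyed by Qnode, with `bert2SentSim` as the edge weight."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-neighbours-demo",
"metadata": {},
"outputs": [],
"source": [
"toyDF = pd.DataFrame({'node1': ['Q1', 'Q1'], 'node2': ['Q2', 'Q3'], 'bert2SentSim': [0.9, 0.4]})\n",
"fetchNeighbours(toyDF)  # prints the max neighbour count (2 here) and returns the adjacency dict"
]
},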
{
"cell_type": "code",
"execution_count": 9,
"id": "exciting-circle",
"metadata": {},
"outputs": [],
"source": [
"def retrofit(embedDict, neighDict, weightCase, weightAssignment=False):\n",
" newEmbedDict = {}\n",
" for word in embedDict.keys():\n",
" if word in neighDict:\n",
" neighbs = neighDict[word]\n",
" neighbs = list(filter(lambda p: p[0] in embedDict, neighbs))\n",
" if len(neighbs) == 0:\n",
" newEmbedDict[word] = embedDict[word]\n",
" continue\n",
"# assert len(neighbs) == 1\n",
" if weightAssignment:\n",
" sumOfSims = sum([neighb[1] for neighb in neighbs])\n",
" sumOfEmbs = sum([embedDict[neighb[0]] * float(neighb[1]) for neighb in neighbs])\n",
" else:\n",
" sumOfSims = sum([1 for neighb in neighbs])\n",
" sumOfEmbs = sum([embedDict[neighb[0]] for neighb in neighbs])\n",
" \n",
" if weightCase == 1:\n",
" newEmbedDict[word] = (embedDict[word] * (len(neighbs)) + sumOfEmbs) / ((len(neighbs)) + sumOfSims)\n",
" elif weightCase == 2:\n",
" newEmbedDict[word] = (embedDict[word] * (len(neighbs))**2 + sumOfEmbs) / ((len(neighbs))**2 + sumOfSims)\n",
" elif weightCase == 0.5:\n",
" newEmbedDict[word] = (embedDict[word] * (len(neighbs))**0.5 + sumOfEmbs) / ((len(neighbs))**0.5 + sumOfSims)\n",
" else:\n",
" raise\n",
" else:\n",
" newEmbedDict[word] = embedDict[word]\n",
" return newEmbedDict"
]
},
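{
"cell_type": "markdown",
"id": "added-retrofit-demo-note",
"metadata": {},
"source": [
"A minimal sketch of one retrofit step on toy vectors (hypothetical Qnodes): `Q1` moves towards the similarity-weighted sum of its neighbours, while nodes without neighbours come back unchanged."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-retrofit-demo",
"metadata": {},
"outputs": [],
"source": [
"toyEmb = {'Q1': np.array([1.0, 0.0]), 'Q2': np.array([0.0, 1.0]), 'Q3': np.array([0.0, -1.0])}\n",
"toyNeigh = {'Q1': [('Q2', 0.9), ('Q3', 0.1)]}\n",
"# weightCase=1 balances the node's own vector (scaled by its neighbour count) against the weighted sum\n",
"print(retrofit(toyEmb, toyNeigh, weightCase=1, weightAssignment=True)['Q1'])"
]
},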
{
"cell_type": "code",
"execution_count": 11,
"id": "hollywood-prisoner",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import classification_report\n",
"def labelSamples(score):\n",
" return 'I' if score <= 1.75 else 'U' if score >= 3.5 else 'M'\n",
"LABELS = ['I','U','M']\n",
"def fetchCorrelationResults(embedDict, newEmbedDict):\n",
" wordSim353AnnotDF_New = pd.read_csv('../data/wordsim353_with_r3.csv')\n",
"# print(f\"Length of wordsim dataset: {len(wordSim353AnnotDF_New)}\")\n",
" assert wordSim353AnnotDF_New.word1_kg_id.isna().sum() == 0\n",
" assert wordSim353AnnotDF_New.word2_kg_id.isna().sum() == 0\n",
" wordSim353AnnotDF_New['category'] = wordSim353AnnotDF_New.Avg.apply(labelSamples)\n",
"# wordSim353AnnotDF_New2 = wordSim353AnnotDF_New\n",
" wordSim353AnnotDF_New2 = wordSim353AnnotDF_New[wordSim353AnnotDF_New.apply(lambda p: p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict, axis=1)]\n",
" wordSimMissingSet = set(wordSim353AnnotDF_New[wordSim353AnnotDF_New.word1_kg_id.apply(lambda p: p not in embedDict)].word1_kg_id.to_list() + wordSim353AnnotDF_New[wordSim353AnnotDF_New.word2_kg_id.apply(lambda p: p not in embedDict)].word2_kg_id.to_list())\n",
" responseDict = {}\n",
" responseDict['wordSimMissingSet'] = wordSimMissingSet\n",
" responseDict['coveredPairs'] = len(wordSim353AnnotDF_New2)\n",
" responseDict['totalPairs'] = len(wordSim353AnnotDF_New)\n",
" \n",
"# wordSimMissingSet\n",
"# print(f\"No. of pairs with some value for embeddings: {len(wordSim353AnnotDF_New2)}\")\n",
" wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.apply(lambda p: cosine_similarity(np.array(embedDict[p['word1_kg_id']]).reshape(1,-1), np.array(embedDict[p['word2_kg_id']]).reshape(1,-1))[0][0] if p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict else -1, axis=1)\n",
" wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.apply(lambda p: cosine_similarity(np.array(newEmbedDict[p['word1_kg_id']]).reshape(1,-1), np.array(newEmbedDict[p['word2_kg_id']]).reshape(1,-1))[0][0] if p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict else -1, axis=1)\n",
" wordSim353AnnotDF_New.loc[wordSim353AnnotDF_New['textOld'] == -1, 'textOld'] = wordSim353AnnotDF_New[wordSim353AnnotDF_New['textOld'] != -1]['textOld'].mean()\n",
" wordSim353AnnotDF_New.loc[wordSim353AnnotDF_New['textNew'] == -1, 'textNew'] = wordSim353AnnotDF_New[wordSim353AnnotDF_New['textNew'] != -1]['textNew'].mean()\n",
" \n",
" # Logic 1: Scale min,max value to 1,4 strictly\n",
"# min1, max1 = wordSim353AnnotDF_New['textOld'].min(), wordSim353AnnotDF_New['textOld'].max()\n",
"# min2, max2 = wordSim353AnnotDF_New['textNew'].min(), wordSim353AnnotDF_New['textNew'].max()\n",
"# wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.textOld.apply(lambda p: 4 - 3 * (p - min1) / (max1 - min1))\n",
"# wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.textNew.apply(lambda p: 4 - 3 * (p - min2) / (max2 - min2))\n",
" \n",
" # Logic 2: Scale abs value to 1,4 strictly\n",
" wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.textOld.apply(lambda p: 4 - 3 * abs(p))\n",
" wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.textNew.apply(lambda p: 4 - 3 * abs(p))\n",
"\n",
" \n",
"# print(f\"KT Corr of old emb with Annotated Avg: {stats.kendalltau(wordSim353AnnotDF_New2['textOld'], wordSim353AnnotDF_New2['Avg'])}\")\n",
"# print(f\"KT Corr of new emb with Annotated Avg: {stats.kendalltau(wordSim353AnnotDF_New2['textNew'], wordSim353AnnotDF_New2['Avg'])}\")\n",
"# print(f\"KT Corr of old emb with Human Avg Reversed: {stats.kendalltau(wordSim353AnnotDF_New2['textOld'], wordSim353AnnotDF_New2['H_reversed'])}\")\n",
"# print(f\"KT Corr of new emb with Human Avg Reversed: {stats.kendalltau(wordSim353AnnotDF_New2['textNew'], wordSim353AnnotDF_New2['H_reversed'])}\")\n",
" \n",
"# print(f\"Classification Accuracy of old embeddings categories vs annotated averages categories: {accuracy_score(wordSim353AnnotDF_New2['textOld'].apply(labelSamples), wordSim353AnnotDF_New2['category'])}\")\n",
"# print(f\"Classification Accuracy of new embeddings categories vs annotated averages categories: {accuracy_score(wordSim353AnnotDF_New2['textNew'].apply(labelSamples), wordSim353AnnotDF_New2['category'])}\")\n",
" responseDict['KT_old_vs_Avg'] = stats.kendalltau(wordSim353AnnotDF_New['textOld'], wordSim353AnnotDF_New['Avg'])\n",
" responseDict['KT_new_vs_Avg'] = stats.kendalltau(wordSim353AnnotDF_New['textNew'], wordSim353AnnotDF_New['Avg'])\n",
" responseDict['KT_old_vs_Human'] = stats.kendalltau(wordSim353AnnotDF_New['textOld'], wordSim353AnnotDF_New['H_reversed'])\n",
" responseDict['KT_new_vs_Human'] = stats.kendalltau(wordSim353AnnotDF_New['textNew'], wordSim353AnnotDF_New['H_reversed'])\n",
" responseDict['old_acc'] = accuracy_score(wordSim353AnnotDF_New['textOld'].apply(labelSamples), wordSim353AnnotDF_New['category'])\n",
" responseDict['new_acc'] = accuracy_score(wordSim353AnnotDF_New['textNew'].apply(labelSamples), wordSim353AnnotDF_New['category'])\n",
" \n",
" responseDict['class_rep_old'] = classification_report(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textOld'].apply(labelSamples), output_dict=True)\n",
" responseDict['class_rep_new'] = classification_report(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textNew'].apply(labelSamples), output_dict=True)\n",
" \n",
" cm_old = confusion_matrix(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textOld'].apply(labelSamples), labels=LABELS)\n",
" cm_new = confusion_matrix(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textNew'].apply(labelSamples), labels=LABELS)\n",
" \n",
" responseDict['cm_old'] = cm_old\n",
" responseDict['cm_new'] = cm_new\n",
" \n",
" return responseDict"
]
},
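{
"cell_type": "markdown",
"id": "added-thresholds-demo-note",
"metadata": {},
"source": [
"A spot check (added sketch) of the score-to-category thresholds used by `labelSamples`: scores <= 1.75 map to 'I', scores >= 3.5 to 'U', everything in between to 'M'."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-thresholds-demo",
"metadata": {},
"outputs": [],
"source": [
"for score in [1.0, 1.75, 2.5, 3.5, 4.0]:\n",
"    print(score, labelSamples(score))"
]
},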
{
"cell_type": "code",
"execution_count": 12,
"id": "severe-explosion",
"metadata": {},
"outputs": [],
"source": [
"neighDictMaster, embedDictMaster = {}, {}"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "decreased-syndication",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"39218\n"
]
}
],
"source": [
"neighDictMaster['19k_childPar'] = fetchNeighbours(p279ChildPar)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "rocky-criterion",
"metadata": {},
"outputs": [],
"source": [
"embedDictMaster['complex'] = complexEmb\n",
"embedDictMaster['transe'] = transeEmb"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "apparent-sapphire",
"metadata": {},
"outputs": [],
"source": [
"for key1 in embedDictMaster.keys():\n",
" for key2 in embedDictMaster[key1].keys():\n",
" embedDictMaster[key1][key2] = np.array(embedDictMaster[key1][key2])"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "precise-oxygen",
"metadata": {},
"outputs": [],
"source": [
"embList = list(embedDictMaster.keys())"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "identical-keyboard",
"metadata": {},
"outputs": [],
"source": [
"basisList = list(neighDictMaster.keys())"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "aging-flavor",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['19k_childPar'])"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"neighDictMaster.keys()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "amended-remove",
"metadata": {},
"outputs": [],
"source": [
"newEmbedDictMaster, responsesDictMaster = {}, {}"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "surgical-insurance",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7817a00dcf3c412b92a7c5ac75517168",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"results = []\n",
"NUM_ITERS = 10\n",
"for basis in tqdm(basisList):\n",
" for emb in embList:\n",
" for weightedNess in [True]:\n",
" groupResults = []\n",
" for weightCase in [1,2]:\n",
" embedDict = embedDictMaster[emb]\n",
" if weightedNess:\n",
" caseName = emb + '_' + basis + '_' + str(weightCase) + '_weighted'\n",
" else:\n",
" caseName = emb + '_' + basis + '_' + str(weightCase) + '_unweighted'\n",
" for iterNum in range(1,NUM_ITERS+1):\n",
" newEmbedDict = retrofit(embedDict, neighDictMaster[basis], weightCase, weightedNess)\n",
" # dists = determineDistances(embedDict, newEmbedDict)\n",
" responsesDict = fetchCorrelationResults(embedDict, newEmbedDict)\n",
" # print(responsesDict.keys())\n",
" groupResults.append([emb, basis, weightCase, weightedNess, iterNum, \\\n",
" responsesDict['old_acc']*100, \\\n",
" responsesDict['new_acc']*100, \\\n",
" (responsesDict['new_acc'] - responsesDict['old_acc'])*100, \\\n",
" responsesDict['coveredPairs'], \\\n",
" responsesDict['class_rep_old']['I']['precision'], \\\n",
" responsesDict['class_rep_old']['I']['recall'], \\\n",
" responsesDict['class_rep_old']['I']['f1-score'], \\\n",
" responsesDict['class_rep_old']['U']['precision'], \\\n",
" responsesDict['class_rep_old']['U']['recall'], \\\n",
" responsesDict['class_rep_old']['U']['f1-score'], \\\n",
" responsesDict['class_rep_new']['I']['precision'], \\\n",
" responsesDict['class_rep_new']['I']['recall'], \\\n",
" responsesDict['class_rep_new']['I']['f1-score'], \\\n",
" responsesDict['class_rep_new']['U']['precision'], \\\n",
" responsesDict['class_rep_new']['U']['recall'], \\\n",
" responsesDict['class_rep_new']['U']['f1-score'], \\\n",
" ])\n",
" embedDict = newEmbedDict\n",
"\n",
" newEmbedDictMaster[caseName] = newEmbedDict\n",
" responsesDictMaster[caseName] = responsesDict\n",
" for gR, rank in zip(groupResults, np.argsort([-p[6] for p in groupResults])):\n",
" results.append(gR+[rank])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "assigned-stations",
"metadata": {},
"outputs": [],
"source": [
"resultsDF = pd.DataFrame(results, columns=['Embedding', 'Basis', 'Weight', 'Weightedness', 'Iteration Num', 'Old Acc', 'New Acc', 'Increase', 'Pairs Covered', \\\n",
" 'Old I Precision', 'Old I Recall', 'Old I F1-Score', \\\n",
" 'Old U Precision', 'Old U Recall', 'Old U F1-Score', \\\n",
" 'New I Precision', 'New I Recall', 'New I F1-Score', \\\n",
" 'New U Precision', 'New U Recall', 'New U F1-Score', \\\n",
" 'Rank'])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "built-mumbai",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Embedding | \n",
" Basis | \n",
" Weight | \n",
" Weightedness | \n",
" Iteration Num | \n",
" Old Acc | \n",
" New Acc | \n",
" Increase | \n",
" Pairs Covered | \n",
" Old I Precision | \n",
" ... | \n",
" Old U Precision | \n",
" Old U Recall | \n",
" Old U F1-Score | \n",
" New I Precision | \n",
" New I Recall | \n",
" New I F1-Score | \n",
" New U Precision | \n",
" New U Recall | \n",
" New U F1-Score | \n",
" Rank | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" complex | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 1 | \n",
" 60.755814 | \n",
" 64.244186 | \n",
" 3.488372 | \n",
" 291 | \n",
" 1.000000 | \n",
" ... | \n",
" 0.433121 | \n",
" 0.660194 | \n",
" 0.523077 | \n",
" 1.000000 | \n",
" 0.40 | \n",
" 0.571429 | \n",
" 0.463415 | \n",
" 0.553398 | \n",
" 0.504425 | \n",
" 2 | \n",
"
\n",
" \n",
" | 20 | \n",
" transe | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 1 | \n",
" 62.500000 | \n",
" 65.697674 | \n",
" 3.197674 | \n",
" 291 | \n",
" 0.888889 | \n",
" ... | \n",
" 0.397059 | \n",
" 0.262136 | \n",
" 0.315789 | \n",
" 0.833333 | \n",
" 0.50 | \n",
" 0.625000 | \n",
" 0.450000 | \n",
" 0.174757 | \n",
" 0.251748 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" complex | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 2 | \n",
" 64.244186 | \n",
" 67.151163 | \n",
" 2.906977 | \n",
" 291 | \n",
" 1.000000 | \n",
" ... | \n",
" 0.463415 | \n",
" 0.553398 | \n",
" 0.504425 | \n",
" 1.000000 | \n",
" 0.50 | \n",
" 0.666667 | \n",
" 0.495050 | \n",
" 0.485437 | \n",
" 0.490196 | \n",
" 4 | \n",
"
\n",
" \n",
" | 11 | \n",
" complex | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 2 | \n",
" 61.918605 | \n",
" 63.662791 | \n",
" 1.744186 | \n",
" 291 | \n",
" 1.000000 | \n",
" ... | \n",
" 0.444444 | \n",
" 0.660194 | \n",
" 0.531250 | \n",
" 1.000000 | \n",
" 0.45 | \n",
" 0.620690 | \n",
" 0.458904 | \n",
" 0.650485 | \n",
" 0.538153 | \n",
" 13 | \n",
"
\n",
" \n",
" | 10 | \n",
" complex | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 1 | \n",
" 60.755814 | \n",
" 61.918605 | \n",
" 1.162791 | \n",
" 291 | \n",
" 1.000000 | \n",
" ... | \n",
" 0.433121 | \n",
" 0.660194 | \n",
" 0.523077 | \n",
" 1.000000 | \n",
" 0.40 | \n",
" 0.571429 | \n",
" 0.444444 | \n",
" 0.660194 | \n",
" 0.531250 | \n",
" 12 | \n",
"
\n",
" \n",
" | 2 | \n",
" complex | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 3 | \n",
" 67.151163 | \n",
" 67.732558 | \n",
" 0.581395 | \n",
" 291 | \n",
" 1.000000 | \n",
" ... | \n",
" 0.495050 | \n",
" 0.485437 | \n",
" 0.490196 | \n",
" 0.909091 | \n",
" 0.50 | \n",
" 0.645161 | \n",
" 0.511905 | \n",
" 0.417476 | \n",
" 0.459893 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" complex | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 5 | \n",
" 67.151163 | \n",
" 67.732558 | \n",
" 0.581395 | \n",
" 291 | \n",
" 0.916667 | \n",
" ... | \n",
" 0.492958 | \n",
" 0.339806 | \n",
" 0.402299 | \n",
" 0.916667 | \n",
" 0.55 | \n",
" 0.687500 | \n",
" 0.507937 | \n",
" 0.310680 | \n",
" 0.385542 | \n",
" 5 | \n",
"
\n",
" \n",
" | 36 | \n",
" transe | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 7 | \n",
" 62.500000 | \n",
" 63.081395 | \n",
" 0.581395 | \n",
" 291 | \n",
" 0.846154 | \n",
" ... | \n",
" 0.351852 | \n",
" 0.184466 | \n",
" 0.242038 | \n",
" 0.846154 | \n",
" 0.55 | \n",
" 0.666667 | \n",
" 0.365385 | \n",
" 0.184466 | \n",
" 0.245161 | \n",
" 6 | \n",
"
\n",
" \n",
" | 30 | \n",
" transe | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 1 | \n",
" 62.500000 | \n",
" 63.081395 | \n",
" 0.581395 | \n",
" 291 | \n",
" 0.888889 | \n",
" ... | \n",
" 0.397059 | \n",
" 0.262136 | \n",
" 0.315789 | \n",
" 0.900000 | \n",
" 0.45 | \n",
" 0.600000 | \n",
" 0.400000 | \n",
" 0.252427 | \n",
" 0.309524 | \n",
" 17 | \n",
"
\n",
" \n",
" | 22 | \n",
" transe | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 3 | \n",
" 64.825581 | \n",
" 65.406977 | \n",
" 0.581395 | \n",
" 291 | \n",
" 0.750000 | \n",
" ... | \n",
" 0.393939 | \n",
" 0.126214 | \n",
" 0.191176 | \n",
" 0.750000 | \n",
" 0.75 | \n",
" 0.750000 | \n",
" 0.400000 | \n",
" 0.116505 | \n",
" 0.180451 | \n",
" 3 | \n",
"
\n",
" \n",
" | 38 | \n",
" transe | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 9 | \n",
" 63.081395 | \n",
" 63.372093 | \n",
" 0.290698 | \n",
" 291 | \n",
" 0.785714 | \n",
" ... | \n",
" 0.372549 | \n",
" 0.184466 | \n",
" 0.246753 | \n",
" 0.785714 | \n",
" 0.55 | \n",
" 0.647059 | \n",
" 0.380000 | \n",
" 0.184466 | \n",
" 0.248366 | \n",
" 8 | \n",
"
\n",
" \n",
" | 33 | \n",
" transe | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 4 | \n",
" 62.790698 | \n",
" 63.081395 | \n",
" 0.290698 | \n",
" 291 | \n",
" 0.900000 | \n",
" ... | \n",
" 0.383333 | \n",
" 0.223301 | \n",
" 0.282209 | \n",
" 0.846154 | \n",
" 0.55 | \n",
" 0.666667 | \n",
" 0.375000 | \n",
" 0.203883 | \n",
" 0.264151 | \n",
" 14 | \n",
"
\n",
" \n",
" | 18 | \n",
" complex | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 9 | \n",
" 63.081395 | \n",
" 63.081395 | \n",
" 0.000000 | \n",
" 291 | \n",
" 0.909091 | \n",
" ... | \n",
" 0.444444 | \n",
" 0.543689 | \n",
" 0.489083 | \n",
" 0.909091 | \n",
" 0.50 | \n",
" 0.645161 | \n",
" 0.444444 | \n",
" 0.543689 | \n",
" 0.489083 | \n",
" 19 | \n",
"
\n",
" \n",
" | 37 | \n",
" transe | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 8 | \n",
" 63.081395 | \n",
" 63.081395 | \n",
" 0.000000 | \n",
" 291 | \n",
" 0.846154 | \n",
" ... | \n",
" 0.365385 | \n",
" 0.184466 | \n",
" 0.245161 | \n",
" 0.785714 | \n",
" 0.55 | \n",
" 0.647059 | \n",
" 0.372549 | \n",
" 0.184466 | \n",
" 0.246753 | \n",
" 7 | \n",
"
\n",
" \n",
" | 32 | \n",
" transe | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 3 | \n",
" 62.790698 | \n",
" 62.790698 | \n",
" 0.000000 | \n",
" 291 | \n",
" 0.900000 | \n",
" ... | \n",
" 0.387097 | \n",
" 0.233010 | \n",
" 0.290909 | \n",
" 0.900000 | \n",
" 0.45 | \n",
" 0.600000 | \n",
" 0.383333 | \n",
" 0.223301 | \n",
" 0.282209 | \n",
" 12 | \n",
"
\n",
" \n",
" | 23 | \n",
" transe | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 4 | \n",
" 65.406977 | \n",
" 65.406977 | \n",
" 0.000000 | \n",
" 291 | \n",
" 0.750000 | \n",
" ... | \n",
" 0.400000 | \n",
" 0.116505 | \n",
" 0.180451 | \n",
" 0.652174 | \n",
" 0.75 | \n",
" 0.697674 | \n",
" 0.444444 | \n",
" 0.116505 | \n",
" 0.184615 | \n",
" 1 | \n",
"
\n",
" \n",
" | 39 | \n",
" transe | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 10 | \n",
" 63.372093 | \n",
" 63.372093 | \n",
" 0.000000 | \n",
" 291 | \n",
" 0.785714 | \n",
" ... | \n",
" 0.380000 | \n",
" 0.184466 | \n",
" 0.248366 | \n",
" 0.785714 | \n",
" 0.55 | \n",
" 0.647059 | \n",
" 0.380000 | \n",
" 0.184466 | \n",
" 0.248366 | \n",
" 9 | \n",
"
\n",
" \n",
" | 15 | \n",
" complex | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 6 | \n",
" 63.662791 | \n",
" 63.662791 | \n",
" 0.000000 | \n",
" 291 | \n",
" 1.000000 | \n",
" ... | \n",
" 0.455224 | \n",
" 0.592233 | \n",
" 0.514768 | \n",
" 1.000000 | \n",
" 0.45 | \n",
" 0.620690 | \n",
" 0.453846 | \n",
" 0.572816 | \n",
" 0.506438 | \n",
" 17 | \n",
"
\n",
" \n",
" | 14 | \n",
" complex | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 5 | \n",
" 63.662791 | \n",
" 63.662791 | \n",
" 0.000000 | \n",
" 291 | \n",
" 1.000000 | \n",
" ... | \n",
" 0.457143 | \n",
" 0.621359 | \n",
" 0.526749 | \n",
" 1.000000 | \n",
" 0.45 | \n",
" 0.620690 | \n",
" 0.455224 | \n",
" 0.592233 | \n",
" 0.514768 | \n",
" 16 | \n",
"
\n",
" \n",
" | 13 | \n",
" complex | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 4 | \n",
" 63.662791 | \n",
" 63.662791 | \n",
" 0.000000 | \n",
" 291 | \n",
" 1.000000 | \n",
" ... | \n",
" 0.457746 | \n",
" 0.631068 | \n",
" 0.530612 | \n",
" 1.000000 | \n",
" 0.45 | \n",
" 0.620690 | \n",
" 0.457143 | \n",
" 0.621359 | \n",
" 0.526749 | \n",
" 15 | \n",
"
\n",
" \n",
" | 12 | \n",
" complex | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 3 | \n",
" 63.662791 | \n",
" 63.662791 | \n",
" 0.000000 | \n",
" 291 | \n",
" 1.000000 | \n",
" ... | \n",
" 0.458904 | \n",
" 0.650485 | \n",
" 0.538153 | \n",
" 1.000000 | \n",
" 0.45 | \n",
" 0.620690 | \n",
" 0.457746 | \n",
" 0.631068 | \n",
" 0.530612 | \n",
" 14 | \n",
"
\n",
" \n",
" | 17 | \n",
" complex | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 8 | \n",
" 63.372093 | \n",
" 63.081395 | \n",
" -0.290698 | \n",
" 291 | \n",
" 0.909091 | \n",
" ... | \n",
" 0.448819 | \n",
" 0.553398 | \n",
" 0.495652 | \n",
" 0.909091 | \n",
" 0.50 | \n",
" 0.645161 | \n",
" 0.444444 | \n",
" 0.543689 | \n",
" 0.489083 | \n",
" 18 | \n",
"
\n",
" \n",
" | 16 | \n",
" complex | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 7 | \n",
" 63.662791 | \n",
" 63.372093 | \n",
" -0.290698 | \n",
" 291 | \n",
" 1.000000 | \n",
" ... | \n",
" 0.453846 | \n",
" 0.572816 | \n",
" 0.506438 | \n",
" 0.909091 | \n",
" 0.50 | \n",
" 0.645161 | \n",
" 0.448819 | \n",
" 0.553398 | \n",
" 0.495652 | \n",
" 9 | \n",
"
\n",
" \n",
" | 35 | \n",
" transe | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 6 | \n",
" 62.790698 | \n",
" 62.500000 | \n",
" -0.290698 | \n",
" 291 | \n",
" 0.846154 | \n",
" ... | \n",
" 0.363636 | \n",
" 0.194175 | \n",
" 0.253165 | \n",
" 0.846154 | \n",
" 0.55 | \n",
" 0.666667 | \n",
" 0.351852 | \n",
" 0.184466 | \n",
" 0.242038 | \n",
" 5 | \n",
"
\n",
" \n",
" | 34 | \n",
" transe | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 5 | \n",
" 63.081395 | \n",
" 62.790698 | \n",
" -0.290698 | \n",
" 291 | \n",
" 0.846154 | \n",
" ... | \n",
" 0.375000 | \n",
" 0.203883 | \n",
" 0.264151 | \n",
" 0.846154 | \n",
" 0.55 | \n",
" 0.666667 | \n",
" 0.363636 | \n",
" 0.194175 | \n",
" 0.253165 | \n",
" 15 | \n",
"
\n",
" \n",
" | 31 | \n",
" transe | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 2 | \n",
" 63.081395 | \n",
" 62.790698 | \n",
" -0.290698 | \n",
" 291 | \n",
" 0.900000 | \n",
" ... | \n",
" 0.400000 | \n",
" 0.252427 | \n",
" 0.309524 | \n",
" 0.900000 | \n",
" 0.45 | \n",
" 0.600000 | \n",
" 0.387097 | \n",
" 0.233010 | \n",
" 0.290909 | \n",
" 11 | \n",
"
\n",
" \n",
" | 28 | \n",
" transe | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 9 | \n",
" 60.755814 | \n",
" 60.465116 | \n",
" -0.290698 | \n",
" 291 | \n",
" 0.394737 | \n",
" ... | \n",
" 0.368421 | \n",
" 0.067961 | \n",
" 0.114754 | \n",
" 0.365854 | \n",
" 0.75 | \n",
" 0.491803 | \n",
" 0.388889 | \n",
" 0.067961 | \n",
" 0.115702 | \n",
" 13 | \n",
"
\n",
" \n",
" | 19 | \n",
" complex | \n",
" 19k_childPar | \n",
" 2 | \n",
" True | \n",
" 10 | \n",
" 63.081395 | \n",
" 62.790698 | \n",
" -0.290698 | \n",
" 291 | \n",
" 0.909091 | \n",
" ... | \n",
" 0.444444 | \n",
" 0.543689 | \n",
" 0.489083 | \n",
" 0.900000 | \n",
" 0.45 | \n",
" 0.600000 | \n",
" 0.444444 | \n",
" 0.543689 | \n",
" 0.489083 | \n",
" 10 | \n",
"
\n",
" \n",
" | 26 | \n",
" transe | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 7 | \n",
" 62.209302 | \n",
" 61.627907 | \n",
" -0.581395 | \n",
" 291 | \n",
" 0.428571 | \n",
" ... | \n",
" 0.428571 | \n",
" 0.087379 | \n",
" 0.145161 | \n",
" 0.416667 | \n",
" 0.75 | \n",
" 0.535714 | \n",
" 0.400000 | \n",
" 0.077670 | \n",
" 0.130081 | \n",
" 4 | \n",
"
\n",
" \n",
" | 29 | \n",
" transe | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 10 | \n",
" 60.465116 | \n",
" 59.883721 | \n",
" -0.581395 | \n",
" 291 | \n",
" 0.365854 | \n",
" ... | \n",
" 0.388889 | \n",
" 0.067961 | \n",
" 0.115702 | \n",
" 0.333333 | \n",
" 0.75 | \n",
" 0.461538 | \n",
" 0.411765 | \n",
" 0.067961 | \n",
" 0.116667 | \n",
" 16 | \n",
"
\n",
" \n",
" | 8 | \n",
" complex | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 9 | \n",
" 65.116279 | \n",
" 64.534884 | \n",
" -0.581395 | \n",
" 291 | \n",
" 0.785714 | \n",
" ... | \n",
" 0.442308 | \n",
" 0.223301 | \n",
" 0.296774 | \n",
" 0.785714 | \n",
" 0.55 | \n",
" 0.647059 | \n",
" 0.416667 | \n",
" 0.194175 | \n",
" 0.264901 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" complex | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 4 | \n",
" 67.732558 | \n",
" 67.151163 | \n",
" -0.581395 | \n",
" 291 | \n",
" 0.909091 | \n",
" ... | \n",
" 0.511905 | \n",
" 0.417476 | \n",
" 0.459893 | \n",
" 0.916667 | \n",
" 0.55 | \n",
" 0.687500 | \n",
" 0.492958 | \n",
" 0.339806 | \n",
" 0.402299 | \n",
" 3 | \n",
"
\n",
" \n",
" | 6 | \n",
" complex | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 7 | \n",
" 66.569767 | \n",
" 65.988372 | \n",
" -0.581395 | \n",
" 291 | \n",
" 0.916667 | \n",
" ... | \n",
" 0.475410 | \n",
" 0.281553 | \n",
" 0.353659 | \n",
" 0.846154 | \n",
" 0.55 | \n",
" 0.666667 | \n",
" 0.464286 | \n",
" 0.252427 | \n",
" 0.327044 | \n",
" 7 | \n",
"
\n",
" \n",
" | 7 | \n",
" complex | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 8 | \n",
" 65.988372 | \n",
" 65.116279 | \n",
" -0.872093 | \n",
" 291 | \n",
" 0.846154 | \n",
" ... | \n",
" 0.464286 | \n",
" 0.252427 | \n",
" 0.327044 | \n",
" 0.785714 | \n",
" 0.55 | \n",
" 0.647059 | \n",
" 0.442308 | \n",
" 0.223301 | \n",
" 0.296774 | \n",
" 8 | \n",
"
\n",
" \n",
" | 21 | \n",
" transe | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 2 | \n",
" 65.697674 | \n",
" 64.825581 | \n",
" -0.872093 | \n",
" 291 | \n",
" 0.833333 | \n",
" ... | \n",
" 0.450000 | \n",
" 0.174757 | \n",
" 0.251748 | \n",
" 0.750000 | \n",
" 0.60 | \n",
" 0.666667 | \n",
" 0.393939 | \n",
" 0.126214 | \n",
" 0.191176 | \n",
" 2 | \n",
"
\n",
" \n",
" | 27 | \n",
" transe | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 8 | \n",
" 61.627907 | \n",
" 60.755814 | \n",
" -0.872093 | \n",
" 291 | \n",
" 0.416667 | \n",
" ... | \n",
" 0.400000 | \n",
" 0.077670 | \n",
" 0.130081 | \n",
" 0.394737 | \n",
" 0.75 | \n",
" 0.517241 | \n",
" 0.368421 | \n",
" 0.067961 | \n",
" 0.114754 | \n",
" 10 | \n",
"
\n",
" \n",
" | 25 | \n",
" transe | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 6 | \n",
" 63.372093 | \n",
" 62.209302 | \n",
" -1.162791 | \n",
" 291 | \n",
" 0.468750 | \n",
" ... | \n",
" 0.454545 | \n",
" 0.097087 | \n",
" 0.160000 | \n",
" 0.428571 | \n",
" 0.75 | \n",
" 0.545455 | \n",
" 0.428571 | \n",
" 0.087379 | \n",
" 0.145161 | \n",
" 19 | \n",
"
\n",
" \n",
" | 5 | \n",
" complex | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 6 | \n",
" 67.732558 | \n",
" 66.569767 | \n",
" -1.162791 | \n",
" 291 | \n",
" 0.916667 | \n",
" ... | \n",
" 0.507937 | \n",
" 0.310680 | \n",
" 0.385542 | \n",
" 0.916667 | \n",
" 0.55 | \n",
" 0.687500 | \n",
" 0.475410 | \n",
" 0.281553 | \n",
" 0.353659 | \n",
" 6 | \n",
"
\n",
" \n",
" | 9 | \n",
" complex | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 10 | \n",
" 64.534884 | \n",
" 63.081395 | \n",
" -1.453488 | \n",
" 291 | \n",
" 0.785714 | \n",
" ... | \n",
" 0.416667 | \n",
" 0.194175 | \n",
" 0.264901 | \n",
" 0.687500 | \n",
" 0.55 | \n",
" 0.611111 | \n",
" 0.377778 | \n",
" 0.165049 | \n",
" 0.229730 | \n",
" 11 | \n",
"
\n",
" \n",
" | 24 | \n",
" transe | \n",
" 19k_childPar | \n",
" 1 | \n",
" True | \n",
" 5 | \n",
" 65.406977 | \n",
" 63.372093 | \n",
" -2.034884 | \n",
" 291 | \n",
" 0.652174 | \n",
" ... | \n",
" 0.444444 | \n",
" 0.116505 | \n",
" 0.184615 | \n",
" 0.468750 | \n",
" 0.75 | \n",
" 0.576923 | \n",
" 0.454545 | \n",
" 0.097087 | \n",
" 0.160000 | \n",
" 18 | \n",
"
\n",
" \n",
"
\n",
"
40 rows × 22 columns
\n",
"
"
],
"text/plain": [
" Embedding Basis Weight Weightedness Iteration Num Old Acc \\\n",
"0 complex 19k_childPar 1 True 1 60.755814 \n",
"20 transe 19k_childPar 1 True 1 62.500000 \n",
"1 complex 19k_childPar 1 True 2 64.244186 \n",
"11 complex 19k_childPar 2 True 2 61.918605 \n",
"10 complex 19k_childPar 2 True 1 60.755814 \n",
"2 complex 19k_childPar 1 True 3 67.151163 \n",
"4 complex 19k_childPar 1 True 5 67.151163 \n",
"36 transe 19k_childPar 2 True 7 62.500000 \n",
"30 transe 19k_childPar 2 True 1 62.500000 \n",
"22 transe 19k_childPar 1 True 3 64.825581 \n",
"38 transe 19k_childPar 2 True 9 63.081395 \n",
"33 transe 19k_childPar 2 True 4 62.790698 \n",
"18 complex 19k_childPar 2 True 9 63.081395 \n",
"37 transe 19k_childPar 2 True 8 63.081395 \n",
"32 transe 19k_childPar 2 True 3 62.790698 \n",
"23 transe 19k_childPar 1 True 4 65.406977 \n",
"39 transe 19k_childPar 2 True 10 63.372093 \n",
"15 complex 19k_childPar 2 True 6 63.662791 \n",
"14 complex 19k_childPar 2 True 5 63.662791 \n",
"13 complex 19k_childPar 2 True 4 63.662791 \n",
"12 complex 19k_childPar 2 True 3 63.662791 \n",
"17 complex 19k_childPar 2 True 8 63.372093 \n",
"16 complex 19k_childPar 2 True 7 63.662791 \n",
"35 transe 19k_childPar 2 True 6 62.790698 \n",
"34 transe 19k_childPar 2 True 5 63.081395 \n",
"31 transe 19k_childPar 2 True 2 63.081395 \n",
"28 transe 19k_childPar 1 True 9 60.755814 \n",
"19 complex 19k_childPar 2 True 10 63.081395 \n",
"26 transe 19k_childPar 1 True 7 62.209302 \n",
"29 transe 19k_childPar 1 True 10 60.465116 \n",
"8 complex 19k_childPar 1 True 9 65.116279 \n",
"3 complex 19k_childPar 1 True 4 67.732558 \n",
"6 complex 19k_childPar 1 True 7 66.569767 \n",
"7 complex 19k_childPar 1 True 8 65.988372 \n",
"21 transe 19k_childPar 1 True 2 65.697674 \n",
"27 transe 19k_childPar 1 True 8 61.627907 \n",
"25 transe 19k_childPar 1 True 6 63.372093 \n",
"5 complex 19k_childPar 1 True 6 67.732558 \n",
"9 complex 19k_childPar 1 True 10 64.534884 \n",
"24 transe 19k_childPar 1 True 5 65.406977 \n",
"\n",
" New Acc Increase Pairs Covered Old I Precision ... Old U Precision \\\n",
"0 64.244186 3.488372 291 1.000000 ... 0.433121 \n",
"20 65.697674 3.197674 291 0.888889 ... 0.397059 \n",
"1 67.151163 2.906977 291 1.000000 ... 0.463415 \n",
"11 63.662791 1.744186 291 1.000000 ... 0.444444 \n",
"10 61.918605 1.162791 291 1.000000 ... 0.433121 \n",
"2 67.732558 0.581395 291 1.000000 ... 0.495050 \n",
"4 67.732558 0.581395 291 0.916667 ... 0.492958 \n",
"36 63.081395 0.581395 291 0.846154 ... 0.351852 \n",
"30 63.081395 0.581395 291 0.888889 ... 0.397059 \n",
"22 65.406977 0.581395 291 0.750000 ... 0.393939 \n",
"38 63.372093 0.290698 291 0.785714 ... 0.372549 \n",
"33 63.081395 0.290698 291 0.900000 ... 0.383333 \n",
"18 63.081395 0.000000 291 0.909091 ... 0.444444 \n",
"37 63.081395 0.000000 291 0.846154 ... 0.365385 \n",
"32 62.790698 0.000000 291 0.900000 ... 0.387097 \n",
"23 65.406977 0.000000 291 0.750000 ... 0.400000 \n",
"39 63.372093 0.000000 291 0.785714 ... 0.380000 \n",
"15 63.662791 0.000000 291 1.000000 ... 0.455224 \n",
"14 63.662791 0.000000 291 1.000000 ... 0.457143 \n",
"13 63.662791 0.000000 291 1.000000 ... 0.457746 \n",
"12 63.662791 0.000000 291 1.000000 ... 0.458904 \n",
"17 63.081395 -0.290698 291 0.909091 ... 0.448819 \n",
"16 63.372093 -0.290698 291 1.000000 ... 0.453846 \n",
"35 62.500000 -0.290698 291 0.846154 ... 0.363636 \n",
"34 62.790698 -0.290698 291 0.846154 ... 0.375000 \n",
"31 62.790698 -0.290698 291 0.900000 ... 0.400000 \n",
"28 60.465116 -0.290698 291 0.394737 ... 0.368421 \n",
"19 62.790698 -0.290698 291 0.909091 ... 0.444444 \n",
"26 61.627907 -0.581395 291 0.428571 ... 0.428571 \n",
"29 59.883721 -0.581395 291 0.365854 ... 0.388889 \n",
"8 64.534884 -0.581395 291 0.785714 ... 0.442308 \n",
"3 67.151163 -0.581395 291 0.909091 ... 0.511905 \n",
"6 65.988372 -0.581395 291 0.916667 ... 0.475410 \n",
"7 65.116279 -0.872093 291 0.846154 ... 0.464286 \n",
"21 64.825581 -0.872093 291 0.833333 ... 0.450000 \n",
"27 60.755814 -0.872093 291 0.416667 ... 0.400000 \n",
"25 62.209302 -1.162791 291 0.468750 ... 0.454545 \n",
"5 66.569767 -1.162791 291 0.916667 ... 0.507937 \n",
"9 63.081395 -1.453488 291 0.785714 ... 0.416667 \n",
"24 63.372093 -2.034884 291 0.652174 ... 0.444444 \n",
"\n",
" Old U Recall Old U F1-Score New I Precision New I Recall \\\n",
"0 0.660194 0.523077 1.000000 0.40 \n",
"20 0.262136 0.315789 0.833333 0.50 \n",
"1 0.553398 0.504425 1.000000 0.50 \n",
"11 0.660194 0.531250 1.000000 0.45 \n",
"10 0.660194 0.523077 1.000000 0.40 \n",
"2 0.485437 0.490196 0.909091 0.50 \n",
"4 0.339806 0.402299 0.916667 0.55 \n",
"36 0.184466 0.242038 0.846154 0.55 \n",
"30 0.262136 0.315789 0.900000 0.45 \n",
"22 0.126214 0.191176 0.750000 0.75 \n",
"38 0.184466 0.246753 0.785714 0.55 \n",
"33 0.223301 0.282209 0.846154 0.55 \n",
"18 0.543689 0.489083 0.909091 0.50 \n",
"37 0.184466 0.245161 0.785714 0.55 \n",
"32 0.233010 0.290909 0.900000 0.45 \n",
"23 0.116505 0.180451 0.652174 0.75 \n",
"39 0.184466 0.248366 0.785714 0.55 \n",
"15 0.592233 0.514768 1.000000 0.45 \n",
"14 0.621359 0.526749 1.000000 0.45 \n",
"13 0.631068 0.530612 1.000000 0.45 \n",
"12 0.650485 0.538153 1.000000 0.45 \n",
"17 0.553398 0.495652 0.909091 0.50 \n",
"16 0.572816 0.506438 0.909091 0.50 \n",
"35 0.194175 0.253165 0.846154 0.55 \n",
"34 0.203883 0.264151 0.846154 0.55 \n",
"31 0.252427 0.309524 0.900000 0.45 \n",
"28 0.067961 0.114754 0.365854 0.75 \n",
"19 0.543689 0.489083 0.900000 0.45 \n",
"26 0.087379 0.145161 0.416667 0.75 \n",
"29 0.067961 0.115702 0.333333 0.75 \n",
"8 0.223301 0.296774 0.785714 0.55 \n",
"3 0.417476 0.459893 0.916667 0.55 \n",
"6 0.281553 0.353659 0.846154 0.55 \n",
"7 0.252427 0.327044 0.785714 0.55 \n",
"21 0.174757 0.251748 0.750000 0.60 \n",
"27 0.077670 0.130081 0.394737 0.75 \n",
"25 0.097087 0.160000 0.428571 0.75 \n",
"5 0.310680 0.385542 0.916667 0.55 \n",
"9 0.194175 0.264901 0.687500 0.55 \n",
"24 0.116505 0.184615 0.468750 0.75 \n",
"\n",
" New I F1-Score New U Precision New U Recall New U F1-Score Rank \n",
"0 0.571429 0.463415 0.553398 0.504425 2 \n",
"20 0.625000 0.450000 0.174757 0.251748 0 \n",
"1 0.666667 0.495050 0.485437 0.490196 4 \n",
"11 0.620690 0.458904 0.650485 0.538153 13 \n",
"10 0.571429 0.444444 0.660194 0.531250 12 \n",
"2 0.645161 0.511905 0.417476 0.459893 1 \n",
"4 0.687500 0.507937 0.310680 0.385542 5 \n",
"36 0.666667 0.365385 0.184466 0.245161 6 \n",
"30 0.600000 0.400000 0.252427 0.309524 17 \n",
"22 0.750000 0.400000 0.116505 0.180451 3 \n",
"38 0.647059 0.380000 0.184466 0.248366 8 \n",
"33 0.666667 0.375000 0.203883 0.264151 14 \n",
"18 0.645161 0.444444 0.543689 0.489083 19 \n",
"37 0.647059 0.372549 0.184466 0.246753 7 \n",
"32 0.600000 0.383333 0.223301 0.282209 12 \n",
"23 0.697674 0.444444 0.116505 0.184615 1 \n",
"39 0.647059 0.380000 0.184466 0.248366 9 \n",
"15 0.620690 0.453846 0.572816 0.506438 17 \n",
"14 0.620690 0.455224 0.592233 0.514768 16 \n",
"13 0.620690 0.457143 0.621359 0.526749 15 \n",
"12 0.620690 0.457746 0.631068 0.530612 14 \n",
"17 0.645161 0.444444 0.543689 0.489083 18 \n",
"16 0.645161 0.448819 0.553398 0.495652 9 \n",
"35 0.666667 0.351852 0.184466 0.242038 5 \n",
"34 0.666667 0.363636 0.194175 0.253165 15 \n",
"31 0.600000 0.387097 0.233010 0.290909 11 \n",
"28 0.491803 0.388889 0.067961 0.115702 13 \n",
"19 0.600000 0.444444 0.543689 0.489083 10 \n",
"26 0.535714 0.400000 0.077670 0.130081 4 \n",
"29 0.461538 0.411765 0.067961 0.116667 16 \n",
"8 0.647059 0.416667 0.194175 0.264901 0 \n",
"3 0.687500 0.492958 0.339806 0.402299 3 \n",
"6 0.666667 0.464286 0.252427 0.327044 7 \n",
"7 0.647059 0.442308 0.223301 0.296774 8 \n",
"21 0.666667 0.393939 0.126214 0.191176 2 \n",
"27 0.517241 0.368421 0.067961 0.114754 10 \n",
"25 0.545455 0.428571 0.087379 0.145161 19 \n",
"5 0.687500 0.475410 0.281553 0.353659 6 \n",
"9 0.611111 0.377778 0.165049 0.229730 11 \n",
"24 0.576923 0.454545 0.097087 0.160000 18 \n",
"\n",
"[40 rows x 22 columns]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"resultsDF.sort_values(by=['Increase'], ascending=False)"
]
},
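{
"cell_type": "markdown",
"id": "added-best-config-note",
"metadata": {},
"source": [
"As an optional follow-up (a sketch, not part of the original pipeline), we can pull out the best-performing retrofitting configuration per embedding type, ranked by the accuracy gain. All column names below come from `resultsDF` as shown above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-best-config-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: best configuration per embedding type, ranked by accuracy gain.\n",
"# Uses only columns already present in resultsDF.\n",
"best_per_embedding = resultsDF.loc[\n",
"    resultsDF.groupby('Embedding')['Increase'].idxmax(),\n",
"    ['Embedding', 'Basis', 'Weight', 'Iteration Num', 'Old Acc', 'New Acc', 'Increase']\n",
"]\n",
"best_per_embedding"
]
},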
{
"cell_type": "code",
"execution_count": 28,
"id": "utility-globe",
"metadata": {},
"outputs": [],
"source": [
"resultsDF.to_csv('../data/retrofitting/masterRetro_Aug20_2021.csv', index=False)"
]
},
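{
"cell_type": "markdown",
"id": "added-roundtrip-note",
"metadata": {},
"source": [
"Optional sanity check (a sketch): reload the exported CSV and confirm it round-trips against the in-memory `resultsDF`. The path repeats the one used in the export cell above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-roundtrip-check",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: verify the export by reading the file back and comparing shapes.\n",
"reloaded = pd.read_csv('../data/retrofitting/masterRetro_Aug20_2021.csv')\n",
"assert reloaded.shape == resultsDF.shape\n",
"reloaded.head()"
]
}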
],
"metadata": {
"kernelspec": {
"display_name": "kgtkEnv2",
"language": "python",
"name": "kgtkenv2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "288px"
},
"toc_section_display": true,
"toc_window_display": true
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}