{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "express-journalist",
   "metadata": {},
   "source": [
    "In this notebook, we make a collection of all the embeddings which we use to do retrofitting. These embeddings are then evaluated for their similarity based on the evaluation benchmark datasets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "incorrect-routine",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.metrics.pairwise import euclidean_distances\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import scipy.stats as stats\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import confusion_matrix\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm.notebook import tqdm\n",
    "from itertools import combinations\n",
    "from math import comb\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "import os\n",
    "import h5py\n",
    "import json\n",
    "import gzip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "moderate-drunk",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DWD V2 files\n",
    "# https://drive.google.com/drive/u/3/folders/1OIZegxxrs_Hv2ZhDsSO-zLVARCR60P01\n",
    "# SITELINKS_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/sitelinks.en.tsv.gz\"\n",
    "CLAIMS_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/claims.tsv.gz\"\n",
    "LABELS_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/labels.en.tsv.gz\"\n",
    "DESCRIPTIONS_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/descriptions.en.tsv.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "exceptional-funeral",
   "metadata": {},
   "outputs": [],
   "source": [
    "# wikidata-20210215 files\n",
    "# https://drive.google.com/drive/u/3/folders/1NGtob1BFQ03sXf4yQyYvP13ly3u1Ul5u\n",
    "# SITELINKS_FILE_V1 = \"../source_dataset_files/wikidata-20210215/sitelinks.en.tsv.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "compressed-question",
   "metadata": {},
   "outputs": [],
   "source": [
    "# wikidata-20201208 files\n",
    "# https://drive.google.com/drive/u/3/folders/1qbbgjo7pddMdDvQzOSeSaL6lYwj_f5gi\n",
    "SITELINKS_FILE_V2 = \"../source_dataset_files/wikidata-20201208/sitelinks.en.tsv.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fiscal-appointment",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embedding Related Files\n",
    "DBPEDIA_SHORT_ABSTRACTS_TTL = \"../data/evaluation/source_files/short-abstracts_lang=en.ttl\"\n",
    "DBPEDIA_SHORT_ABSTRACTS_CSV = \"../data/evaluation/source_files/short-abstracts_lang=en.csv\"\n",
    "ABSTRACTS_INTERMEDIATE_FILE = \"../data/embeddings/intermediate_files/abstracts.csv\"\n",
    "\n",
    "COMPLEX_EMB_SOURCE_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/wikidatadwd.complEx.graph-embeddings.txt\"\n",
    "TRANSE_EMB_SOURCE_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/wikidatadwd.transE.graph-embeddings.txt\"\n",
    "TEXT_EMB_SOURCE_FILE = \"../source_dataset_files/wikidata-20210215-dwd-v2/text-embeddings-concatenated.tsv.gz\"\n",
    "\n",
    "COMPLEX_EMB_FINAL_FILE = \"../data/embeddings/complex_orig_embedding_dict.json\"\n",
    "TRANSE_EMB_FINAL_FILE = \"../data/embeddings/transe_orig_embedding_dict.json\"\n",
    "TEXT_EMB_FINAL_FILE = \"../data/embeddings/text_7_props_orig_embedding_dict.json\"\n",
    "ABS_EMB_FINAL_FILE = \"../data/embeddings/abstract_orig_embedding_dict.json\"\n",
    "ABS_FIRST_SENT_EMB_FINAL_FILE = \"../data/embeddings/abstract_first_sent_orig_embedding_dict.json\"\n",
    "\n",
    "LABELS_EMB_FINAL_FILE = \"../data/embeddings/labels_orig_embedding_dict.json\"\n",
    "LABELS_DESC_EMB_FINAL_FILE = \"../data/embeddings/labels_n_desc_orig_embedding_dict.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "departmental-buddy",
   "metadata": {},
   "outputs": [],
   "source": [
    "# HAS Embedding Related Files\n",
    "A_SOURCE_FILE = \"../source_dataset_files/A_walks_analysis/a_embeddings_10x10,min_count=0.kv\"\n",
    "A_OP_FILE = \"../data/embeddings/has_a_orig_embedding_dict.json\"\n",
    "\n",
    "H_SOURCE_FILE = \"../source_dataset_files/H_walks_analysis/h_embeddings_5x8,min_count=21.kv\"\n",
    "H_OP_FILE = \"../data/embeddings/has_h_orig_embedding_dict.json\"\n",
    "\n",
    "S_SOURCE_FILE = \"../source_dataset_files/S_walks_analysis/s_embeddings_5x10,min_count=0.kv\"\n",
    "S_OP_FILE = \"../data/embeddings/has_s_orig_embedding_dict.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "failing-talent",
   "metadata": {},
   "outputs": [],
   "source": [
    "WORDSIM_CLASS_SIM_FILE = '../data/embeddings/wordsim_class_sim.csv'\n",
    "WORDSIM_JC_SIM_FILE = '../data/embeddings/wordsim_jc_sim.csv'\n",
    "WORDSIM_TOP_SIM_FILE = '../data/embeddings/wordsim_top_sim.csv'\n",
    "\n",
    "WORDSIM_OLD_CLASS_SIM_FILE = '../data/embeddings/wordsim_old_class_sim.csv'\n",
    "WORDSIM_OLD_JC_SIM_FILE = '../data/embeddings/wordsim_old_jc_sim.csv'\n",
    "WORDSIM_OLD_TOP_SIM_FILE = '../data/embeddings/wordsim_old_top_sim.csv'\n",
    "\n",
    "DBPEDIA_MC_30_CLASS_SIM_FILE = '../data/embeddings/dbpedia_mc_30_class_sim.csv'\n",
    "DBPEDIA_MC_30_JC_SIM_FILE = '../data/embeddings/dbpedia_mc_30_jc_sim.csv'\n",
    "DBPEDIA_MC_30_TOP_SIM_FILE = '../data/embeddings/dbpedia_mc_30_top_sim.csv'\n",
    "\n",
    "DBPEDIA_RG_65_CLASS_SIM_FILE = '../data/embeddings/dbpedia_rg_65_class_sim.csv'\n",
    "DBPEDIA_RG_65_JC_SIM_FILE = '../data/embeddings/dbpedia_rg_65_jc_sim.csv'\n",
    "DBPEDIA_RG_65_TOP_SIM_FILE = '../data/embeddings/dbpedia_rg_65_top_sim.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "elementary-desktop",
   "metadata": {},
   "outputs": [],
   "source": [
    "P279_CHILD_PAR_DISTILBERT_COSSIM_FILE = \"../data/basis/P279_ChildPar.all-distilroberta-v1.csv\"\n",
    "WORDSIM_FILE = \"../data/evaluation/wordsim353_with_r3.csv\"\n",
    "WORDSIM_OLD_FILE = \"../data/evaluation/wordsim_old.csv\"\n",
    "DBPEDIA_MC_30_FINAL_FILE = \"../data/evaluation/mc-30_DBpedia.csv\"\n",
    "DBPEDIA_RG_65_FINAL_FILE = \"../data/evaluation/rg-65_DBpedia.csv\"\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "noble-draft",
   "metadata": {},
   "source": [
    "# Common Code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "broadband-background",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "241698\n"
     ]
    }
   ],
   "source": [
    "def get_all_nodes():\n",
    "    \"\"\"\n",
    "    This function generates the set of all nodes needed for execution\n",
    "    \"\"\"\n",
    "    p279ChildPar = pd.read_csv(P279_CHILD_PAR_DISTILBERT_COSSIM_FILE)\n",
    "    wordsim_df = pd.read_csv(WORDSIM_FILE)\n",
    "    dbpedia_mc_30_df = pd.read_csv(DBPEDIA_MC_30_FINAL_FILE)\n",
    "    dbpedia_rg_65_df = pd.read_csv(DBPEDIA_RG_65_FINAL_FILE)\n",
    "#     wiki_cs_df = pd.read_csv('../data/wikidata-cs_categorized.csv')\n",
    "#     concept_net_df = pd.read_csv('../data/kgtk_conceptnet_evaluation.csv')\n",
    "    p279QnodesList = set(p279ChildPar.node1.to_list() \n",
    "                        + p279ChildPar.node2.to_list()\n",
    "                        + wordsim_df['word1_kg_id'].to_list() \n",
    "                        + wordsim_df['word2_kg_id'].to_list()\n",
    "                        + dbpedia_mc_30_df['word1_kg_id'].to_list()\n",
    "                        + dbpedia_mc_30_df['word2_kg_id'].to_list()\n",
    "                        + dbpedia_rg_65_df['word1_kg_id'].to_list()\n",
    "                        + dbpedia_rg_65_df['word2_kg_id'].to_list())\n",
    "#                         + wiki_cs_df['word1_kg_id'].to_list() \n",
    "#                         + wiki_cs_df['word2_kg_id'].to_list()\n",
    "#                         + concept_net_df['word1_kg_id'].to_list()\n",
    "#                         + concept_net_df['word2_kg_id'].to_list())\n",
    "    print(len(p279QnodesList))\n",
    "    return p279QnodesList\n",
    "\n",
    "allNodes = get_all_nodes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "parliamentary-documentation",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fillCoverage(embedDict):\n",
    "    wordSim353AnnotDF_New = pd.read_csv(WORDSIM_FILE)\n",
    "    wordSim353AnnotDF_set = set(wordSim353AnnotDF_New['word1_kg_id'].to_list() + wordSim353AnnotDF_New['word2_kg_id'].to_list())\n",
    "    embed_size = len(embedDict[next(iter(embedDict))])\n",
    "#     print(embed_size)\n",
    "    count = 0\n",
    "    for word in wordSim353AnnotDF_set:\n",
    "        if word not in embedDict:\n",
    "            embedDict[word] = np.zeros((embed_size))\n",
    "            count += 1\n",
    "    print(f\"Added {count} corrections\")\n",
    "    return embedDict\n",
    "\n",
    "def deserializeEmbeddingDict(embedDict):\n",
    "    for key2 in embedDict.keys():\n",
    "        embedDict[key2] = np.array(embedDict[key2])\n",
    "    return embedDict\n",
    "\n",
    "def serializeEmbeddingDict(embedDict):\n",
    "    for key2 in embedDict.keys():\n",
    "        embedDict[key2] = embedDict[key2].tolist() if type(embedDict[key2]) != list else embedDict[key2]\n",
    "    return embedDict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "established-brush",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_labels(node_set):\n",
    "    labels_dict = {}\n",
    "    first_line = True\n",
    "    with gzip.open(LABELS_FILE, 'r') as labelsFile:\n",
    "        firstLine = True\n",
    "        for line in tqdm(labelsFile, total=41845781):\n",
    "            if firstLine:\n",
    "                firstLine = False\n",
    "                continue\n",
    "            line = line.decode('utf-8').strip().split('\\t')\n",
    "            line[3] = line[3][1:-5]\n",
    "            qnode, label = line[1], line[3]\n",
    "    #         print(qnode, label)\n",
    "            if qnode in node_set:\n",
    "                labels_dict[qnode] = label\n",
    "    return labels_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "automated-olive",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_labels_n_desc(node_set):\n",
    "    labels_dict = get_labels(node_set)\n",
    "    first_line = True\n",
    "    with gzip.open(DESCRIPTIONS_FILE, 'r') as labelsFile:\n",
    "        firstLine = True\n",
    "        for line in tqdm(labelsFile, total=34700043):\n",
    "            if firstLine:\n",
    "                firstLine = False\n",
    "                continue\n",
    "            line = line.decode('utf-8').strip().split('\\t')\n",
    "            line[3] = line[3][1:-5]\n",
    "            qnode, label = line[1], line[3]\n",
    "    #         print(qnode, label)\n",
    "            if qnode in node_set:\n",
    "                if qnode in labels_dict:\n",
    "                    labels_dict[qnode] += ' ' + label\n",
    "                else:\n",
    "                    raise \"Label not present\"\n",
    "    return labels_dict"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "driven-yeast",
   "metadata": {},
   "source": [
    "# Complex + Transe Embeddings Generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "sound-spain",
   "metadata": {},
   "outputs": [],
   "source": [
    "complex_emb_dict = json.load(open(COMPLEX_EMB_FINAL_FILE))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "correct-gentleman",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "64deda8236084d79bce85a2fd249dec9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "first_line = True\n",
    "complex_emb_dict = {}\n",
    "with open(COMPLEX_EMB_SOURCE_FILE) as complex_file:\n",
    "    for line in tqdm(complex_file, total=53002671):\n",
    "        if first_line:\n",
    "            first_line = False\n",
    "            continue\n",
    "        line = line.strip().split()\n",
    "        if line[0] in allNodes and line[0] not in complex_emb_dict:\n",
    "            complex_emb_dict[line[0]] = [float(elem) for elem in line[1:]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "proved-buffer",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "241698"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(complex_emb_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "hazardous-amazon",
   "metadata": {},
   "outputs": [],
   "source": [
    "json.dump(complex_emb_dict, open(COMPLEX_EMB_FINAL_FILE, 'w'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "industrial-paradise",
   "metadata": {},
   "outputs": [],
   "source": [
    "transe_emb_dict = json.load(open(TRANSE_EMB_FINAL_FILE))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "needed-passion",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8636f544aa484f6d9785723a4a96e83b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "241698"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "first_line = True\n",
    "transe_emb_dict = {}\n",
    "with open(TRANSE_EMB_SOURCE_FILE) as complex_file:\n",
    "    for line in tqdm(complex_file, total=53002671):\n",
    "        if first_line:\n",
    "            first_line = False\n",
    "            continue\n",
    "        line = line.strip().split()\n",
    "        if line[0] in allNodes and line[0] not in transe_emb_dict:\n",
    "            transe_emb_dict[line[0]] = [float(elem) for elem in line[1:]]\n",
    "len(transe_emb_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "classified-chick",
   "metadata": {},
   "outputs": [],
   "source": [
    "json.dump(transe_emb_dict, open(TRANSE_EMB_FINAL_FILE, 'w'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "steady-cliff",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "suffering-zealand",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "brief-timer",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "238889\n"
     ]
    }
   ],
   "source": [
    "# p279ChildPar = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')\n",
    "# print(len(set(p279ChildPar.node1.to_list() \n",
    "#                         + p279ChildPar.node2.to_list())))\n",
    "\n",
    "# # Load complex, transe embedding files and entity names file\n",
    "# compf = h5py.File('../data/complTrans/complEx.h5','r')\n",
    "# transf = h5py.File('../data/complTrans/transE.h5','r')\n",
    "# ent_names = json.load(open('../data/complTrans/entity_names_all_0.json'))\n",
    "# allNodes = get_all_nodes()\n",
    "# # json.dump(list(p279QnodesList), open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json', 'w'))\n",
    "\n",
    "# complexEmb = {qnode: emb for emb, qnode in zip(compf['embeddings'], ent_names) if qnode in allNodes}\n",
    "# transeEmb = {qnode: emb for emb, qnode in zip(transf['embeddings'], ent_names) if qnode in allNodes}\n",
    "# print(f\"Out of {len(ent_names)} embeddings, retaining {len(transeEmb)} embeddings\")\n",
    "\n",
    "# def serialize_embedding_dict(embed_dict):\n",
    "#     for key2 in embed_dict.keys():\n",
    "#         embed_dict[key2] = embed_dict[key2].tolist() if type(embed_dict[key2]) != list else embed_dict[key2]\n",
    "#     return embed_dict\n",
    "\n",
    "# json.dump(serialize_embedding_dict(complexEmb),open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json','w'))\n",
    "# json.dump(serialize_embedding_dict(transeEmb),open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json','w'))\n",
    "# # complexEmb = json.load(open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json'))\n",
    "# # transeEmb = json.load(open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "continued-locking",
   "metadata": {},
   "source": [
    "# Text Embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "polished-divorce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6f97b30760e841da87be90aebef9c8cd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "first_line = True\n",
    "text_emb_dict = {}\n",
    "with gzip.open(TEXT_EMB_SOURCE_FILE) as file:\n",
    "    for line in tqdm(file):\n",
    "        if first_line:\n",
    "            first_line = False\n",
    "            continue\n",
    "        line = line.decode('utf-8').strip().split('\\t')\n",
    "        if line[1] == 'text_embedding' and line[0] in allNodes:\n",
    "            text_emb_dict[line[0]] = [float(elem) for elem in line[2].split(',')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "raising-boost",
   "metadata": {},
   "outputs": [],
   "source": [
    "json.dump(text_emb_dict, open(TEXT_EMB_FINAL_FILE, 'w'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "beautiful-drill",
   "metadata": {},
   "outputs": [],
   "source": [
    "# text_emb_dict = json.load(open('../data/embeddings/archived/text_7_props_orig_embedding_dict.json.old'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "impressed-stations",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a75bc167e9b449f88e4df7ebb19bca77",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/241698 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "missing_nodes = []\n",
    "for node in tqdm(allNodes):\n",
    "    if node not in text_emb_dict:\n",
    "        missing_nodes.append(node)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "disabled-corporation",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0db4a305d6dc4f1e85dfdc02ea564537",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "missing_nodes_set = set(missing_nodes)\n",
    "new_file = []\n",
    "with gzip.open(CLAIMS_FILE, 'r') as all_claims_file:\n",
    "    firstLine = True\n",
    "    for ogline in tqdm(all_claims_file, total=491297976):\n",
    "        if firstLine:\n",
    "            firstLine = False\n",
    "            continue\n",
    "        line = ogline.decode('utf-8').strip().split('\\t')\n",
    "        line[3] = line[3][1:-5]\n",
    "        qnode, label = line[1], line[3]\n",
    "#         print(qnode, label)\n",
    "        if qnode in missing_nodes_set:\n",
    "            new_file.append(ogline)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "complete-performer",
   "metadata": {},
   "outputs": [],
   "source": [
    "allowed_props = set(['P31', 'P279', 'P106', 'P39', 'P1382', 'P373', 'P452'])\n",
    "new_file1 = []\n",
    "for line in new_file:\n",
    "    line1 = line.decode('utf-8').strip().split('\\t')\n",
    "    if line1[2] in allowed_props:\n",
    "        new_file1.append(line.decode('utf-8'))\n",
    "new_file1 = ['id\\tnode1\\tlabel\\tnode2\\trank\\tnode2;wikidatatype\\n'] + new_file1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "straight-internet",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('../output/text-embeddings/missing_nodes.tsv', 'w') as f:\n",
    "    f.writelines(new_file1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "suited-boating",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#     --model sentence-transformers/roberta-large-nli-mean-tokens   \\\n",
    "q1 = \"~/miniconda3/envs/kgtkEnv2/bin/kgtk text-embedding -i ../output/text-embeddings/missing_nodes.tsv   \\\n",
    "    --model roberta-large-nli-mean-tokens   \\\n",
    "    --property-labels-file \" + LABELS_FILE + \"  --debug   \\\n",
    "    --isa-properties P31 P279 P106 P39 P1382 P373 P452   \\\n",
    "    --save-embedding-sentence > ../output/text-embeddings/P279-text-embedding-7-props-missing-qnodes.tsv\"\n",
    "os.system(q1 + \" \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "based-circuit",
   "metadata": {},
   "outputs": [],
   "source": [
    "text7_missingnodes = pd.read_csv(\"../output/text-embeddings/P279-text-embedding-7-props-missing-qnodes.tsv\", sep='\\t')\n",
    "text7_missingnodes = text7_missingnodes[text7_missingnodes.property == 'text_embedding']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "imposed-series",
   "metadata": {},
   "outputs": [],
   "source": [
    "text7_missingnodes['value'] = text7_missingnodes['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "compliant-locator",
   "metadata": {},
   "outputs": [],
   "source": [
    "text7EmbDict = {row['node']: row['value'] for _,row in text7_missingnodes.iterrows()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "quick-voluntary",
   "metadata": {},
   "outputs": [],
   "source": [
    "for key in text7EmbDict.keys():\n",
    "    if key not in text_emb_dict:\n",
    "        text_emb_dict[key] = text7EmbDict[key]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "special-smile",
   "metadata": {},
   "outputs": [],
   "source": [
    "json.dump(text_emb_dict, open(TEXT_EMB_FINAL_FILE, 'w'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "infectious-mauritius",
   "metadata": {},
   "source": [
    "## Old technique follows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "neural-gibson",
   "metadata": {},
   "outputs": [],
   "source": [
    "p279ChildPar = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')\n",
    "p279QnodesList = list(set(p279ChildPar.node1.to_list() + p279ChildPar.node2.to_list()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "instructional-weather",
   "metadata": {},
   "outputs": [],
   "source": [
    "missingNodes = allNodes - set(p279ChildPar.node1.to_list() + p279ChildPar.node2.to_list())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "specified-clear",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "37038"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(missingNodes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "conditional-brooks",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6fb2da21d7cf4241a3e52ac132a7c534",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/38 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Split main file into sub-files for groups of properties for multi-processing\n",
    "\n",
    "# bsize = len(p279QnodesList) // 250\n",
    "# cnt = 1\n",
    "# for i in range(0, len(p279QnodesList), bsize):\n",
    "#     q1 = \"kgtk filter -i ../data/wikidataos.for.text-embedding.tsv --word-separator '|' -p '\" + '|'.join(p279QnodesList[i:i+bsize]) + \";;' -o ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-\" + str(cnt) + \".tsv -v True\"\n",
    "# #     print(len(q1))\n",
    "#     cnt += 1\n",
    "# #     print(q1)\n",
    "#     os.system(\"screen -dm \" + q1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "utility-fleet",
   "metadata": {},
   "outputs": [],
   "source": [
    "for cnt in range(290,503):\n",
    "    os.system(\"rm ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-\" + str(cnt) + \".tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "alleged-strength",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6fb2da21d7cf4241a3e52ac132a7c534",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/38 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# # Split main file into sub-files for groups of properties for multi-processing\n",
    "missingNodes = list(missingNodes)\n",
    "bsize = 1000\n",
    "cnt = 252\n",
    "for i in tqdm(range(0, len(missingNodes), bsize)):\n",
    "    q1 = \"kgtk filter -i ../data/wikidataos.for.text-embedding.tsv --word-separator '|' -p '\" + '|'.join(missingNodes[i:i+bsize]) + \";;' -o ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-\" + str(cnt) + \".tsv -v True\"\n",
    "#     print(len(q1))\n",
    "    cnt += 1\n",
    "#     print(q1)\n",
    "    os.system(\"screen -dm \" + q1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "signal-island",
   "metadata": {},
   "outputs": [],
   "source": [
    "def checkIfFileContainsLines(file):\n",
    "    with open(file) as f:\n",
    "        for line in f:\n",
    "            return True\n",
    "    return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "fatal-broadway",
   "metadata": {},
   "outputs": [],
   "source": [
    "def countFileLines(file):\n",
    "    count = 0\n",
    "    with open(file) as f:\n",
    "        for line in f:\n",
    "            count += 1\n",
    "    return count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "separate-satin",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from os.path import exists\n",
    "\n",
    "runCommCnt = 1\n",
    "# 252\n",
    "for cnt in tqdm(range(252,290)):\n",
    "    if exists(\"../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-\" + str(cnt) + \".tsv\") and countFileLines(\"../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-\" + str(cnt) + \".tsv\") == 4097:\n",
    "        continue\n",
    "    q1 = \"\"\n",
    "#     if cnt % 10 == 0:\n",
    "#         q1 += \"sleep 20m; \"\n",
    "    q1 = \"~/miniconda3/envs/kgtkEnv2/bin/kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-\" + str(cnt) + \".tsv   \\\n",
    "        --model sentence-transformers/all-distilroberta-v1   \\\n",
    "        --property-labels-file ../data/labels.en.tsv  --debug   \\\n",
    "        --isa-properties P31 P279 P106 P39 P1382 P373 P452   \\\n",
    "        --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-\" + str(cnt) + \".tsv\"\n",
    "    print(cnt)\n",
    "    runCommCnt += 1\n",
    "    os.system(q1 + \" &amp;\")\n",
    "    if runCommCnt % 15 == 0:\n",
    "        time.sleep(11*60)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "surprising-burning",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for cnt in tqdm(range(1,290)):\n",
    "    if countFileLines(\"../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-\" + str(cnt) + \".tsv\") != 4097:\n",
    "        print(cnt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bottom-lodge",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from os.path import exists\n",
    "\n",
    "# roberta-large-nli-mean-tokens\n",
    "runCommCnt = 0\n",
    "for cnt in tqdm(range(252,290)):\n",
    "    if exists(\"../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-\" + str(cnt) + \".tsv\") and countFileLines(\"../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-\" + str(cnt) + \".tsv\") == 4097:\n",
    "        continue\n",
    "    q1 = \"\"\n",
    "#     if cnt % 10 == 0:\n",
    "#         q1 += \"sleep 20m; \"\n",
    "    q1 += \"~/miniconda3/envs/kgtkEnv2/bin/kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-\" + str(cnt) + \".tsv   \\\n",
    "        --model sentence-transformers/all-distilroberta-v1   \\\n",
    "        --property-labels-file ../data/labels.en.tsv  --debug   \\\n",
    "        --isa-properties P31 P279   \\\n",
    "        --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-\" + str(cnt) + \".tsv\"\n",
    "    print(cnt)\n",
    "    runCommCnt += 1\n",
    "    os.system(q1 + \" &amp;\")\n",
    "    if runCommCnt % 15 == 0:\n",
    "        time.sleep(13*60)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "damaged-browse",
   "metadata": {},
   "outputs": [],
   "source": [
    "p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "collective-april",
   "metadata": {},
   "outputs": [],
   "source": [
    "# temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-1.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "decent-yorkshire",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>node</th>\n",
       "      <th>property</th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Q99738027</td>\n",
       "      <td>text_embedding</td>\n",
       "      <td>0.74755263,1.6350263,-0.73952675,1.0463063,-0....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Q99738027</td>\n",
       "      <td>embedding_sentence</td>\n",
       "      <td>night shift, work shift during nighttime hours...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Q99228502</td>\n",
       "      <td>text_embedding</td>\n",
       "      <td>0.25261465,0.06285462,0.029052094,0.50796187,0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Q99228502</td>\n",
       "      <td>embedding_sentence</td>\n",
       "      <td>avenue, thoroughfare named \\\"avenue\\\" is thoro...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Q98970128</td>\n",
       "      <td>text_embedding</td>\n",
       "      <td>0.11887096,0.8598291,0.4446009,-0.5038472,-0.9...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        node            property  \\\n",
       "0  Q99738027      text_embedding   \n",
       "1  Q99738027  embedding_sentence   \n",
       "2  Q99228502      text_embedding   \n",
       "3  Q99228502  embedding_sentence   \n",
       "4  Q98970128      text_embedding   \n",
       "\n",
       "                                               value  \n",
       "0  0.74755263,1.6350263,-0.73952675,1.0463063,-0....  \n",
       "1  night shift, work shift during nighttime hours...  \n",
       "2  0.25261465,0.06285462,0.029052094,0.50796187,0...  \n",
       "3  avenue, thoroughfare named \\\"avenue\\\" is thoro...  \n",
       "4  0.11887096,0.8598291,0.4446009,-0.5038472,-0.9...  "
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# temp.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "satisfactory-speech",
   "metadata": {},
   "outputs": [],
   "source": [
    "text2EmbArr = []\n",
    "for i in tqdm(range(1, 290)):\n",
    "    if not(checkIfFileContainsLines('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-'+str(i)+'.tsv')):\n",
    "        continue\n",
    "    temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-'+str(i)+'.tsv', sep='\\t')\n",
    "    temp = temp[temp.property == 'text_embedding']\n",
    "    text2EmbArr.append(temp)\n",
    "text2Emb = pd.concat(text2EmbArr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "unavailable-competition",
   "metadata": {},
   "outputs": [],
   "source": [
    "text2Emb.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "concerned-april",
   "metadata": {},
   "outputs": [],
   "source": [
    "text7EmbArr = []\n",
    "for i in tqdm(range(1, 290)):\n",
    "    if not(checkIfFileContainsLines('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-'+str(i)+'.tsv')):\n",
    "        continue\n",
    "    temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-'+str(i)+'.tsv', sep='\\t')\n",
    "    temp = temp[temp.property == 'text_embedding']\n",
    "    text7EmbArr.append(temp)\n",
    "text7Emb = pd.concat(text7EmbArr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "australian-enforcement",
   "metadata": {},
   "outputs": [],
   "source": [
    "text2Emb = text2Emb[text2Emb.node.apply(lambda p: p in allNodes)]\n",
    "text7Emb = text7Emb[text7Emb.node.apply(lambda p: p in allNodes)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "closed-treatment",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "We have 2prop text embeddings for 278467 nodes and 7prop for 277587 nodes\n"
     ]
    }
   ],
   "source": [
    "print(f\"We have 2prop text embeddings for {len(text2Emb)} nodes and 7prop for {len(text7Emb)} nodes\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "duplicate-agency",
   "metadata": {},
   "outputs": [],
   "source": [
    "text2Emb['value'] = text2Emb['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])\n",
    "text7Emb['value'] = text7Emb['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "framed-third",
   "metadata": {},
   "outputs": [],
   "source": [
    "text2EmbDict = {row['node']: row['value'] for _,row in text2Emb.iterrows()}\n",
    "text7EmbDict = {row['node']: row['value'] for _,row in text7Emb.iterrows()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "peaceful-andrew",
   "metadata": {},
   "outputs": [],
   "source": [
    "json.dump(text2EmbDict, open('../data/Master_P279_dataset/embeddings/text_2_props_orig_embedding_dict.json', 'w'))\n",
    "json.dump(text7EmbDict, open('../data/Master_P279_dataset/embeddings/text_7_props_orig_embedding_dict.json', 'w'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "considered-river",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "sustained-playback",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "requested-state",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "suited-going",
   "metadata": {},
   "source": [
    "# Abstract Embeddings Generation\n",
    "\n",
    "Downloaded short abstracts file from [DBPedia Short Abstracts - 2020.07.01](https://downloads.dbpedia.org/repo/dbpedia/text/short-abstracts/2020.07.01/short-abstracts_lang=en.ttl.bz2)\n",
    "\n",
    "Then, we extract the abstracts file from the bz2 file using: `bzip2 -d short-abstracts_lang=en.ttl.bz2`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "former-editor",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d0038713a1604ccb9c2e5499615fbc43",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# cnt = 0\n",
    "# p1s = []\n",
    "# p11s = []\n",
    "# p2s = []\n",
    "# lines = []\n",
    "# with open(DBPEDIA_SHORT_ABSTRACTS_TTL, 'r', encoding='utf-8') as f:\n",
    "#     for line in tqdm(f):\n",
    "#         p1 = line[:line.find(\" \")]\n",
    "#         p11 = p1[len(\"<http://dbpedia.org/resource/\"):][:-1]\n",
    "#         line = line[line.find(\" \")+1:]\n",
    "#         p2 = line[:line.find(\" \")]\n",
    "#         line = line[line.find(\" \")+1:line.rfind(\" \")][1:-4]\n",
    "#         p1s.append(p1)\n",
    "#         p11s.append(p11)\n",
    "#         p2s.append(p2)\n",
    "#         lines.append(line)\n",
    "# df1 = pd.DataFrame({'urlComp': p11s, 'url':p1s, 'p2': p2s, 'abstract': lines})\n",
    "# df1.to_csv(DBPEDIA_SHORT_ABSTRACTS_CSV)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "massive-camera",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DBPedia dataset has 5732949 records with unique 5732947 index values\n",
      "There are 8472828 sitelinks present in the dataset corresponding to 8472828 unique node1s (Qxxx), 8401883 unique labels (text)\n",
      "From 116088 Qnodes, there are 5134 sitelink Qnodes which do not have a short abstract i.e 110954 have a short abstract\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "51a5ed2cfc8e45c8815fac2f8855a3be",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/241698 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df1 = pd.read_csv(DBPEDIA_SHORT_ABSTRACTS_CSV, skiprows=1, skipfooter=1, header=None, engine='python')\n",
    "df1.columns = ['ignore', 'node1', 'url', 'ignore2', 'abstract']\n",
    "df1 = df1.set_index('node1')\n",
    "df1[df1.url.apply(lambda p: 'http://dbpedia.org/resource' not in p)]\n",
    "print(f\"DBPedia dataset has {len(df1)} records with unique {df1.index.nunique()} index values\")\n",
    "df1.loc[df1[df1.index.duplicated()].index]\n",
    "sitelinksDF = pd.read_csv(SITELINKS_FILE_V2, sep='\\t')\n",
    "sitelinksDF['trimmedNode2'] = sitelinksDF.node2.apply(lambda p: p.split(\"/\")[-1] if p.split(\"/\")[-1] != '' else p.split(\"/\")[-2])\n",
    "sitelinksDF1 = sitelinksDF[sitelinksDF.label == 'wikipedia_sitelink']\n",
    "sitelinksDF2 = sitelinksDF1.set_index('trimmedNode2')\n",
    "print(f\"There are {len(sitelinksDF2)} sitelinks present in the dataset corresponding to {sitelinksDF2.node1.nunique()} unique node1s (Qxxx), {sitelinksDF2.index.nunique()} unique labels (text)\")\n",
    "\n",
    "sitelinksDF2.loc[sitelinksDF2[sitelinksDF2.index.duplicated()].index]\n",
    "sitelinksDF2 = sitelinksDF2[sitelinksDF2.node1.apply(lambda p: p in allNodes)]\n",
    "labelsDF = pd.read_csv(LABELS_FILE, sep='\\t')\n",
    "labelsDF = labelsDF[labelsDF.node1.apply(lambda p: p in allNodes)]\n",
    "labelsDict = {row['node1']: row['node2'] for _, row in labelsDF.iterrows()}\n",
    "descriptionsDF = pd.read_csv(DESCRIPTIONS_FILE, compression='gzip', sep='\\t')\n",
    "descriptionsDF = descriptionsDF[descriptionsDF.node1.apply(lambda p: p in allNodes)]\n",
    "descDict = {row['node1']: row['node2'] for _, row in descriptionsDF.iterrows()}\n",
    "\n",
    "sdf_set = set(sitelinksDF2.index.to_list())\n",
    "df1 = df1[df1.index.map(lambda p: p in sdf_set)]\n",
    "abstractsDF2 = sitelinksDF2.join(df1).reset_index()\n",
    "print(f\"From {len(abstractsDF2)} Qnodes, there are {abstractsDF2.ignore2.isna().sum()} sitelink Qnodes which do not have a short abstract i.e {len(abstractsDF2) - abstractsDF2.ignore2.isna().sum()} have a short abstract\")\n",
    "\n",
    "abstractsDF2['node1_label'] = abstractsDF2.node1.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else \"\")\n",
    "abstractsDF2['node1_desc'] = abstractsDF2.node1.apply(lambda p: descDict[p][1:-4] if p in descDict else \"\")\n",
    "from nltk.tokenize import sent_tokenize\n",
    "abstractsDF2['abstract_firstSent'] = abstractsDF2.abstract.apply(lambda p: sent_tokenize(str(p))[0] if p else None)\n",
    "\n",
    "currNodes = set(abstractsDF2.node1.tolist())\n",
    "\n",
    "correctedRows = []\n",
    "for key in tqdm(allNodes):\n",
    "    if key not in currNodes:\n",
    "        correctedRows.append([None, None, key, None, None, None, None, None, None, labelsDict[key][1:-4] if key in labelsDict else None, descDict[key][1:-4] if key in descDict else None, None])\n",
    "        \n",
    "abstractsDF3 = pd.concat([abstractsDF2, pd.DataFrame(correctedRows, columns=abstractsDF2.columns)])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "geographic-penguin",
   "metadata": {},
   "source": [
    "DBPedia dataset has 5732949 records with unique 5732947 index values\n",
    "There are 8637003 sitelinks present in the dataset corresponding to 8637003 unique node1s (Qxxx), 8563928 unique labels (text)\n",
    "From 116430 Qnodes, there are 5707 sitelink Qnodes which do not have a short abstract i.e 110723 have a short abstract\n",
    "\n",
    "\n",
    "\n",
    "DBPedia dataset has 5732949 records with unique 5732947 index values\n",
    "There are 8472828 sitelinks present in the dataset corresponding to 8472828 unique node1s (Qxxx), 8401883 unique labels (text)\n",
    "\n",
    "From 122585 Qnodes, there are 5661 sitelink Qnodes which do not have a short abstract i.e 116924 have a short abstract\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "genetic-sympathy",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "241698\n"
     ]
    }
   ],
   "source": [
    "def combineAbsLabDesc(row, parameter):\n",
    "    if not(pd.isna(row[parameter])) and row[parameter] != 'nan' and row[parameter] != \"\":\n",
    "        return row[parameter]\n",
    "    elif row['node1_label'] != '' and row['node1_desc'] != '' and not(pd.isna(row['node1_label'])) and not(pd.isna(row['node1_desc'])):\n",
    "        return row['node1_label'] + ' ' + row['node1_desc']\n",
    "    elif row['node1_label'] != '' and not(pd.isna(row['node1_label'])):\n",
    "        return row['node1_label']\n",
    "    else:\n",
    "        return None\n",
    "    \n",
    "abstractsDF3['abstract'] = abstractsDF3.apply(combineAbsLabDesc, axis=1, args=('abstract',))\n",
    "abstractsDF3['abstract_firstSent'] = abstractsDF3.apply(combineAbsLabDesc, axis=1, args=('abstract_firstSent',))\n",
    "abstractsDF3 = abstractsDF3[~abstractsDF3.abstract.isna()]\n",
    "abstractsDF3 = abstractsDF3.reset_index()\n",
    "abstractsDF3 = abstractsDF3.drop(columns=['level_0']).reset_index()\n",
    "abstractsDF3 = abstractsDF3.drop(columns=['level_0'])\n",
    "print(len(abstractsDF3))\n",
    "abstractsDF3.to_csv(ABSTRACTS_INTERMEDIATE_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "wicked-logic",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "desirable-fifteen",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/nas/home/kshenoy/miniconda3/envs/kgtkEnv/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (1,2,4,5,7,8) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "241698"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "abstractsDF2 = pd.read_csv(ABSTRACTS_INTERMEDIATE_FILE)\n",
    "len(abstractsDF2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "administrative-wesley",
   "metadata": {},
   "outputs": [],
   "source": [
    "# abstractsDF2[abstractsDF2.abstract == (abstractsDF2.node1_label +  ' ' + abstractsDF2.node1_desc)].to_csv('../data/Master_P279_dataset/temppppp.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "crude-result",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from time import time\n",
    "import pandas as pd\n",
    "\n",
    "def getSentEmbeddings(valSeries, modelName):\n",
    "    model = SentenceTransformer(modelName, device='cuda:2')\n",
    "    start = time()\n",
    "    encodings = model.encode(valSeries.to_list(), show_progress_bar=True, batch_size=1000)\n",
    "    print(time()-start,'s')\n",
    "    return encodings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "impressive-comedy",
   "metadata": {},
   "outputs": [],
   "source": [
    "modelName = 'sentence-transformers/all-distilroberta-v1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "coordinate-assurance",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404\n",
      "SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch\n",
      "Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bc9488221a204b5e9558749419087c8c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Batches:   0%|          | 0/242 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "316.05097579956055 s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404\n",
      "SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch\n",
      "Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3eb77d77acb549d6b3b91b3a31b7f4d8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Batches:   0%|          | 0/242 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "260.16796946525574 s\n"
     ]
    }
   ],
   "source": [
    "absEmbSeries = getSentEmbeddings(abstractsDF2.abstract, modelName)\n",
    "absFirstSentEmbSeries = getSentEmbeddings(abstractsDF2.abstract_firstSent, modelName)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "valued-toilet",
   "metadata": {},
   "outputs": [],
   "source": [
    "absEmbDict = {node: emb.tolist() for node, emb in zip(abstractsDF2.node1.to_list(), absEmbSeries)}\n",
    "absFirstSentEmbDict = {node: emb.tolist() for node, emb in zip(abstractsDF2.node1.to_list(), absFirstSentEmbSeries)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "refined-belarus",
   "metadata": {},
   "outputs": [],
   "source": [
    "json.dump(absEmbDict, open(ABS_EMB_FINAL_FILE, 'w'))\n",
    "json.dump(absFirstSentEmbDict, open(ABS_FIRST_SENT_EMB_FINAL_FILE, 'w'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "knowing-relay",
   "metadata": {},
   "source": [
    "# H,A,S embeddings\n",
    "\n",
    "Fetched from `sita` `/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "ideal-accreditation",
   "metadata": {},
   "outputs": [],
   "source": [
    "def store_relevant_embeddings(wvec, fname):\n",
    "    tempEmb = {key: wvec[key] for key in wvec.index_to_key if key in allNodes}\n",
    "    print(f\"Original Length: {len(wvec.index_to_key)}, No. of keys stored: {len(tempEmb)}\")\n",
    "    json.dump(serializeEmbeddingDict(tempEmb),open(fname, 'w'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "wireless-playing",
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.models import KeyedVectors, Word2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "cordless-bachelor",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original Length: 12106870, No. of keys stored: 27876\n",
      "Original Length: 19593942, No. of keys stored: 166201\n",
      "Original Length: 39030788, No. of keys stored: 116993\n",
      "CPU times: user 5min 54s, sys: 2min 1s, total: 7min 56s\n",
      "Wall time: 18min 51s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "a_key_vec = KeyedVectors.load(A_SOURCE_FILE)\n",
    "store_relevant_embeddings(a_key_vec, A_OP_FILE)\n",
    "\n",
    "h_key_vec = KeyedVectors.load(H_SOURCE_FILE)\n",
    "store_relevant_embeddings(h_key_vec, H_OP_FILE)\n",
    "\n",
    "s_key_vec = KeyedVectors.load(S_SOURCE_FILE)\n",
    "store_relevant_embeddings(s_key_vec, S_OP_FILE)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "warming-raise",
   "metadata": {},
   "source": [
    "# Label Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "decimal-timothy",
   "metadata": {},
   "outputs": [],
   "source": [
    "labels_dict = get_labels(allNodes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "tough-wallet",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404\n",
      "SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch\n",
      "Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "99bb9a58eb3f4f97b94b592d389920c8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Batches:   0%|          | 0/242 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "98.77918219566345 s\n"
     ]
    }
   ],
   "source": [
    "modelName = 'sentence-transformers/all-distilroberta-v1'\n",
    "embs = getSentEmbeddings(pd.Series(labels_dict.values()), modelName)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "obvious-tuning",
   "metadata": {},
   "outputs": [],
   "source": [
    "labels_emb_dict = {k:v.tolist() for k, v in (zip(labels_dict.keys(), embs))}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "after-shoulder",
   "metadata": {},
   "outputs": [],
   "source": [
    "json.dump(labels_emb_dict, open(LABELS_EMB_FINAL_FILE, 'w'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "composed-journey",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "satisfied-liberal",
   "metadata": {},
   "source": [
    "# Label + Desc Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "complete-office",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4b13c4c43b704c5d82c81f2442265f62",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/41845781 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dbffd7e813e04d80ac616f1650211f47",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/34700043 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404\n",
      "SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch\n",
      "Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "240aaa8b6c4746f2af9ad9480aa82141",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Batches:   0%|          | 0/242 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "203.93888425827026 s\n"
     ]
    }
   ],
   "source": [
    "labels_desc_dict = get_labels_n_desc(allNodes)\n",
    "modelName = 'sentence-transformers/all-distilroberta-v1'\n",
    "embs = getSentEmbeddings(pd.Series(labels_desc_dict.values()), modelName)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "cordless-daily",
   "metadata": {},
   "outputs": [],
   "source": [
    "labels_desc_emb_dict = {k:v.tolist() for k, v in (zip(labels_desc_dict.keys(), embs))}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "light-yugoslavia",
   "metadata": {},
   "outputs": [],
   "source": [
    "json.dump(labels_desc_emb_dict, open(LABELS_DESC_EMB_FINAL_FILE, 'w'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "entitled-validity",
   "metadata": {},
   "source": [
    "# Direct sim files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "standing-prevention",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a7b4867cf8064396b3c1bf6c23af761a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/334 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d295067e6607498e914fa6768217aba4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/334 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1cf2c49678114c21ac442fb3c4d035e5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/334 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import requests\n",
    "from tqdm.notebook import tqdm\n",
    "import json\n",
    "from joblib import Parallel, delayed\n",
    "import sys\n",
    "\n",
    "word_sim_df = pd.read_csv(WORDSIM_FILE)\n",
    "\n",
    "def fetchSim(row, similarity_type):\n",
    "    resp = requests.get(\"https://kgtk.isi.edu/similarity_api?q1=\"+row['word1_kg_id']+\"&q2=\"+row['word2_kg_id']+\"&embedding_type=\"+similarity_type)\n",
    "    try:\n",
    "        row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None\n",
    "    except Exception as exc:\n",
    "        print(exc)\n",
    "        print(f\"Resp not found for {row['node1']}, {row['node2']}\")\n",
    "        row['embedding_cos_sim'] = None\n",
    "    row['Resp_code'] = resp\n",
    "    return row\n",
    "\n",
    "word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
    "word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
    "word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
    "\n",
    "word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
    "word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()\n",
    "word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
    "\n",
    "word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
    "word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()\n",
    "word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
    "\n",
    "word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
    "word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()\n",
    "word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
    "\n",
    "word_sim_class_sim_df.to_csv(WORDSIM_CLASS_SIM_FILE, index=None)\n",
    "word_sim_jc_sim_df.to_csv(WORDSIM_JC_SIM_FILE, index=None)\n",
    "word_sim_top_sim_df.to_csv(WORDSIM_TOP_SIM_FILE, index=None)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "grateful-ukraine",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Word 1</th>\n",
       "      <th>Word 2</th>\n",
       "      <th>ID</th>\n",
       "      <th>H_Sim</th>\n",
       "      <th>H_Dim</th>\n",
       "      <th>F_Sim</th>\n",
       "      <th>F_Dim</th>\n",
       "      <th>N_Sim</th>\n",
       "      <th>N_Dim</th>\n",
       "      <th>D_Sim</th>\n",
       "      <th>...</th>\n",
       "      <th>P_Dim</th>\n",
       "      <th>Avg</th>\n",
       "      <th>Stdev</th>\n",
       "      <th>H_orig</th>\n",
       "      <th>H_reversed</th>\n",
       "      <th>word1_kg_id</th>\n",
       "      <th>word2_kg_id</th>\n",
       "      <th>category</th>\n",
       "      <th>embedding_cos_sim</th>\n",
       "      <th>Resp_code</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Arafat</td>\n",
       "      <td>peace</td>\n",
       "      <td>8</td>\n",
       "      <td>3</td>\n",
       "      <td>D</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>U</td>\n",
       "      <td>4</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.6</td>\n",
       "      <td>0.547723</td>\n",
       "      <td>2.1250</td>\n",
       "      <td>7.8750</td>\n",
       "      <td>Q34211</td>\n",
       "      <td>Q454</td>\n",
       "      <td>U</td>\n",
       "      <td>3.982734</td>\n",
       "      <td>&lt;Response [200]&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Arafat</td>\n",
       "      <td>terror</td>\n",
       "      <td>9</td>\n",
       "      <td>3</td>\n",
       "      <td>D</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>U</td>\n",
       "      <td>4</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.6</td>\n",
       "      <td>0.547723</td>\n",
       "      <td>3.0625</td>\n",
       "      <td>6.9375</td>\n",
       "      <td>Q34211</td>\n",
       "      <td>Q13648784</td>\n",
       "      <td>U</td>\n",
       "      <td>3.969884</td>\n",
       "      <td>&lt;Response [200]&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>FBI</td>\n",
       "      <td>fingerprint</td>\n",
       "      <td>109</td>\n",
       "      <td>3</td>\n",
       "      <td>D</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.6</td>\n",
       "      <td>0.547723</td>\n",
       "      <td>4.0625</td>\n",
       "      <td>5.9375</td>\n",
       "      <td>Q8333</td>\n",
       "      <td>Q178022</td>\n",
       "      <td>U</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>&lt;Response [200]&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>FBI</td>\n",
       "      <td>investigation</td>\n",
       "      <td>110</td>\n",
       "      <td>3</td>\n",
       "      <td>U</td>\n",
       "      <td>3</td>\n",
       "      <td>U</td>\n",
       "      <td>3</td>\n",
       "      <td>U</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>u</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>5.0625</td>\n",
       "      <td>4.9375</td>\n",
       "      <td>Q8333</td>\n",
       "      <td>Q21004260</td>\n",
       "      <td>M</td>\n",
       "      <td>3.951077</td>\n",
       "      <td>&lt;Response [200]&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Harvard</td>\n",
       "      <td>Yale</td>\n",
       "      <td>137</td>\n",
       "      <td>2</td>\n",
       "      <td>S</td>\n",
       "      <td>3</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>s</td>\n",
       "      <td>2.2</td>\n",
       "      <td>0.447214</td>\n",
       "      <td>4.8750</td>\n",
       "      <td>5.1250</td>\n",
       "      <td>Q13371</td>\n",
       "      <td>Q49112</td>\n",
       "      <td>M</td>\n",
       "      <td>1.264601</td>\n",
       "      <td>&lt;Response [200]&gt;</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    Word 1         Word 2   ID  H_Sim H_Dim  F_Sim F_Dim  N_Sim N_Dim  D_Sim  \\\n",
       "0   Arafat          peace    8      3     D      4   NaN      3     U      4   \n",
       "1   Arafat         terror    9      3     D      4   NaN      3     U      4   \n",
       "2      FBI    fingerprint  109      3     D      4   NaN      4   NaN      3   \n",
       "3      FBI  investigation  110      3     U      3     U      3     U      3   \n",
       "4  Harvard           Yale  137      2     S      3     S      2     S      2   \n",
       "\n",
       "   ... P_Dim  Avg     Stdev  H_orig  H_reversed  word1_kg_id  word2_kg_id  \\\n",
       "0  ...   NaN  3.6  0.547723  2.1250      7.8750       Q34211         Q454   \n",
       "1  ...   NaN  3.6  0.547723  3.0625      6.9375       Q34211    Q13648784   \n",
       "2  ...   NaN  3.6  0.547723  4.0625      5.9375        Q8333      Q178022   \n",
       "3  ...     u  3.0  0.000000  5.0625      4.9375        Q8333    Q21004260   \n",
       "4  ...     s  2.2  0.447214  4.8750      5.1250       Q13371       Q49112   \n",
       "\n",
       "  category embedding_cos_sim         Resp_code  \n",
       "0        U          3.982734  <Response [200]>  \n",
       "1        U          3.969884  <Response [200]>  \n",
       "2        U          4.000000  <Response [200]>  \n",
       "3        M          3.951077  <Response [200]>  \n",
       "4        M          1.264601  <Response [200]>  \n",
       "\n",
       "[5 rows x 22 columns]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word_sim_class_sim_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "operational-survival",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "48386a6eaa0745e4a9eebbed1e61c72c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/349 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "094ee6f3ee7f4566855755f5b33e0515",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/349 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "615d0754edee407cac17b7cb3fda9c73",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/349 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import requests\n",
    "from tqdm.notebook import tqdm\n",
    "import json\n",
    "from joblib import Parallel, delayed\n",
    "import sys\n",
    "\n",
    "word_sim_df = pd.read_csv(WORDSIM_OLD_FILE)\n",
    "\n",
    "def fetchSim(row, similarity_type):\n",
    "    resp = requests.get(\"https://kgtk.isi.edu/similarity_api?q1=\"+row['word1_kg_id']+\"&q2=\"+row['word2_kg_id']+\"&embedding_type=\"+similarity_type)\n",
    "    try:\n",
    "        row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None\n",
    "    except Exception as exc:\n",
    "        print(exc)\n",
    "        print(f\"Resp not found for {row['node1']}, {row['node2']}\")\n",
    "        row['embedding_cos_sim'] = None\n",
    "    row['Resp_code'] = resp\n",
    "    return row\n",
    "\n",
    "word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
    "word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
    "word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
    "\n",
    "word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
    "word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()\n",
    "word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
    "\n",
    "word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
    "word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()\n",
    "word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
    "\n",
    "word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
    "word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()\n",
    "word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
    "\n",
    "\n",
    "word_sim_class_sim_df.to_csv(WORDSIM_OLD_CLASS_SIM_FILE, index=None)\n",
    "word_sim_jc_sim_df.to_csv(WORDSIM_OLD_JC_SIM_FILE, index=None)\n",
    "word_sim_top_sim_df.to_csv(WORDSIM_OLD_TOP_SIM_FILE, index=None)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "useful-effectiveness",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4517a92d40bc4f2fac7cf08a47c047ad",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/16 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6dfb01e8813644e0b56d890be25eb955",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/16 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fe45273ea1f34d9faabd5422160ddc67",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/16 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import requests\n",
    "from tqdm.notebook import tqdm\n",
    "import json\n",
    "from joblib import Parallel, delayed\n",
    "import sys\n",
    "\n",
    "word_sim_df = pd.read_csv(DBPEDIA_MC_30_FINAL_FILE)\n",
    "\n",
    "def fetchSim(row, similarity_type):\n",
    "    resp = requests.get(\"https://kgtk.isi.edu/similarity_api?q1=\"+row['word1_kg_id']+\"&q2=\"+row['word2_kg_id']+\"&embedding_type=\"+similarity_type)\n",
    "    try:\n",
    "        row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None\n",
    "    except Exception as exc:\n",
    "        print(exc)\n",
    "        print(f\"Resp not found for {row['node1']}, {row['node2']}\")\n",
    "        row['embedding_cos_sim'] = None\n",
    "    row['Resp_code'] = resp\n",
    "    return row\n",
    "\n",
    "word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
    "word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
    "word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
    "\n",
    "word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
    "word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()\n",
    "word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
    "\n",
    "word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
    "word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()\n",
    "word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
    "\n",
    "word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
    "word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()\n",
    "word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
    "\n",
    "\n",
    "word_sim_class_sim_df.to_csv(DBPEDIA_MC_30_CLASS_SIM_FILE, index=None)\n",
    "word_sim_jc_sim_df.to_csv(DBPEDIA_MC_30_JC_SIM_FILE, index=None)\n",
    "word_sim_top_sim_df.to_csv(DBPEDIA_MC_30_TOP_SIM_FILE, index=None)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "dressed-grove",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2b986c2d80c6424f937b336ca871ac5e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/34 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4e945c056cf546ac860a80004b19ae1b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/34 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "624ccbdd27734b44840c3d7fd6878294",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/34 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import requests\n",
    "from tqdm.notebook import tqdm\n",
    "import json\n",
    "from joblib import Parallel, delayed\n",
    "import sys\n",
    "\n",
    "word_sim_df = pd.read_csv(DBPEDIA_RG_65_FINAL_FILE)\n",
    "\n",
    "def fetchSim(row, similarity_type):\n",
    "    resp = requests.get(\"https://kgtk.isi.edu/similarity_api?q1=\"+row['word1_kg_id']+\"&q2=\"+row['word2_kg_id']+\"&embedding_type=\"+similarity_type)\n",
    "    try:\n",
    "        row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None\n",
    "    except Exception as exc:\n",
    "        print(exc)\n",
    "        print(f\"Resp not found for {row['node1']}, {row['node2']}\")\n",
    "        row['embedding_cos_sim'] = None\n",
    "    row['Resp_code'] = resp\n",
    "    return row\n",
    "\n",
    "word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
    "word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
    "word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))\n",
    "\n",
    "word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
    "word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()\n",
    "word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
    "\n",
    "word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
    "word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()\n",
    "word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
    "\n",
    "word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)\n",
    "word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()\n",
    "word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)\n",
    "\n",
    "\n",
    "word_sim_class_sim_df.to_csv(DBPEDIA_RG_65_CLASS_SIM_FILE, index=None)\n",
    "word_sim_jc_sim_df.to_csv(DBPEDIA_RG_65_JC_SIM_FILE, index=None)\n",
    "word_sim_top_sim_df.to_csv(DBPEDIA_RG_65_TOP_SIM_FILE, index=None)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "spatial-excerpt",
   "metadata": {},
   "source": [
    "# Summary of embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "wooden-medicare",
   "metadata": {},
   "outputs": [],
   "source": [
    "p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))\n",
    "\n",
    "complexEmb = json.load(open('../data/Master_P279_dataset/masterComplexEmb.json'))\n",
    "transeEmb = json.load(open('../data/Master_P279_dataset/masterTranseEmb.json'))\n",
    "\n",
    "text2Emb = json.load(open('../data/Master_P279_dataset/text2Emb.json'))\n",
    "text7Emb = json.load(open('../data/Master_P279_dataset/text7Emb.json'))\n",
    "\n",
    "abstractEmb = json.load(open('../data/Master_P279_dataset/abstractEmb.json'))\n",
    "abstractFirstSentEmb = json.load(open('../data/Master_P279_dataset/abstractFirstSentEmb.json'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "distributed-magazine",
   "metadata": {},
   "outputs": [],
   "source": [
    "json.dump({key:val for key, val in complexEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json', 'w'))\n",
    "json.dump({key:val for key, val in transeEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json', 'w'))\n",
    "json.dump({key:val for key, val in text2Emb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/text_2_props_orig_embedding_dict.json', 'w'))\n",
    "json.dump({key:val for key, val in text7Emb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/text_7_props_orig_embedding_dict.json', 'w'))\n",
    "json.dump({key:val for key, val in abstractEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/abstract_orig_embedding_dict.json', 'w'))\n",
    "json.dump({key:val for key, val in abstractFirstSentEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/abstract_first_sent_orig_embedding_dict.json', 'w'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "disturbed-better",
   "metadata": {},
   "outputs": [],
   "source": [
    "def countOverlap(source, target):\n",
    "    cnt = 0\n",
    "    for key1 in source:\n",
    "        if key1 in target:\n",
    "            cnt += 1\n",
    "    return cnt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "advised-cyprus",
   "metadata": {},
   "outputs": [],
   "source": [
    "summArr = []\n",
    "cnt = countOverlap(complexEmb, p279QnodesList)\n",
    "summArr.append(['complex', cnt, cnt / len(p279QnodesList) * 100])\n",
    "\n",
    "cnt = countOverlap(transeEmb, p279QnodesList)\n",
    "summArr.append(['transe', cnt, cnt / len(p279QnodesList) * 100])\n",
    "\n",
    "cnt = countOverlap(text2Emb, p279QnodesList)\n",
    "summArr.append(['text2', cnt, cnt / len(p279QnodesList) * 100])\n",
    "\n",
    "cnt = countOverlap(text7Emb, p279QnodesList)\n",
    "summArr.append(['text7', cnt, cnt / len(p279QnodesList) * 100])\n",
    "\n",
    "cnt = countOverlap(abstractEmb, p279QnodesList)\n",
    "summArr.append(['abstract', cnt, cnt / len(p279QnodesList) * 100])\n",
    "\n",
    "cnt = countOverlap(abstractFirstSentEmb, p279QnodesList)\n",
    "summArr.append(['abstractFirstSent', cnt, cnt / len(p279QnodesList) * 100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "czech-keeping",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "238889"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(p279QnodesList)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "employed-christmas",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>embedding</th>\n",
       "      <th>count</th>\n",
       "      <th>Coverage Percentage</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>complex</td>\n",
       "      <td>238448</td>\n",
       "      <td>99.815395</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>transe</td>\n",
       "      <td>238448</td>\n",
       "      <td>99.815395</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>text2</td>\n",
       "      <td>238889</td>\n",
       "      <td>100.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>text7</td>\n",
       "      <td>238889</td>\n",
       "      <td>100.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>abstract</td>\n",
       "      <td>105828</td>\n",
       "      <td>44.300072</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>abstractFirstSent</td>\n",
       "      <td>105828</td>\n",
       "      <td>44.300072</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           embedding   count  Coverage Percentage\n",
       "0            complex  238448            99.815395\n",
       "1             transe  238448            99.815395\n",
       "2              text2  238889           100.000000\n",
       "3              text7  238889           100.000000\n",
       "4           abstract  105828            44.300072\n",
       "5  abstractFirstSent  105828            44.300072"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(summArr, columns=['embedding', 'count', 'Coverage Percentage'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "changing-strategy",
   "metadata": {},
   "source": [
    "# Embeddings correction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "purple-raising",
   "metadata": {},
   "outputs": [],
   "source": [
    "masterEmbedDictMaster = {}\n",
    "subsetEmbedDictMaster = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "round-product",
   "metadata": {},
   "outputs": [],
   "source": [
    "masterEmbedKeys = ['text_7_props', 'text_2_props', 'complex', 'transe', 'abstract', 'abstract_first_sent']\n",
    "for key1 in masterEmbedKeys:\n",
    "    masterEmbedDictMaster[key1] = json.load(open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict.json'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "metallic-insulin",
   "metadata": {},
   "outputs": [],
   "source": [
    "subsetEmbedKeys = ['text_7props', 'text_2props', 'complex', 'transe', 'abstract', 'abstract_first_sent']\n",
    "for key1 in subsetEmbedKeys:\n",
    "    subsetEmbedDictMaster[key1] = json.load(open('../data/orig_embeddings/'+key1+'_original_embeddings_dict.json'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "assigned-parameter",
   "metadata": {},
   "outputs": [],
   "source": [
    "wordSim353AnnotDF_New = pd.read_csv('../data/wordsim353_with_r3.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "olympic-yemen",
   "metadata": {},
   "outputs": [],
   "source": [
    "wordsim_pairs = {(row['word1_kg_id'], row['word2_kg_id']) for _, row in wordSim353AnnotDF_New.iterrows()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "welcome-disorder",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pair Coverage by text_7props embeddings created for 19k retrofitting: 325\n",
      "Pair Coverage by text_2props embeddings created for 19k retrofitting: 325\n",
      "Pair Coverage by complex embeddings created for 19k retrofitting: 342\n",
      "Pair Coverage by transe embeddings created for 19k retrofitting: 342\n",
      "Pair Coverage by abstract embeddings created for 19k retrofitting: 343\n",
      "Pair Coverage by abstract_first_sent embeddings created for 19k retrofitting: 343\n"
     ]
    }
   ],
   "source": [
    "for key1 in subsetEmbedKeys:\n",
    "    print(f\"Pair Coverage by {key1} embeddings created for 19k retrofitting: {sum([row[0] in subsetEmbedDictMaster[key1] and row[1] in subsetEmbedDictMaster[key1] for row in wordsim_pairs])}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "northern-psychiatry",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pair Coverage by old text_7_props embeddings created for 19k retrofitting: 278\n",
      "Pair Coverage by old text_2_props embeddings created for 19k retrofitting: 278\n",
      "Pair Coverage by old complex embeddings created for 19k retrofitting: 278\n",
      "Pair Coverage by old transe embeddings created for 19k retrofitting: 278\n",
      "Pair Coverage by old abstract embeddings created for 19k retrofitting: 183\n",
      "Pair Coverage by old abstract_first_sent embeddings created for 19k retrofitting: 183\n"
     ]
    }
   ],
   "source": [
    "for key1 in masterEmbedKeys:\n",
    "    print(f\"Pair Coverage by old {key1} embeddings created for 19k retrofitting: {sum([row[0] in masterEmbedDictMaster[key1] and row[1] in masterEmbedDictMaster[key1] for row in wordsim_pairs])}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "contrary-casino",
   "metadata": {},
   "outputs": [],
   "source": [
    "wordSim353AnnotDF_New_set = set(wordSim353AnnotDF_New.word1_kg_id.to_list() + wordSim353AnnotDF_New.word2_kg_id.to_list())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "alleged-polish",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "masterEmbCorrections = defaultdict(list)\n",
    "for node in wordSim353AnnotDF_New_set:\n",
    "    for i in range(len(masterEmbedKeys)):\n",
    "        if node not in masterEmbedDictMaster[masterEmbedKeys[i]] and node in wordSim353AnnotDF_New_set:\n",
    "            masterEmbCorrections[masterEmbedKeys[i]].append(node)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "periodic-buffer",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['abstract', 'abstract_first_sent', 'text_7_props', 'text_2_props', 'complex', 'transe'])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "masterEmbCorrections.keys()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "awful-signal",
   "metadata": {},
   "source": [
    "## Complex, Transe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "id": "exceptional-acting",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import requests\n",
    "# correctedComplexEmb = {}\n",
    "# correctedTranseEmb = {}\n",
    "# for wordID in masterEmbCorrections['complex']:\n",
    "#     try:\n",
    "#         resp = requests.get(\"http://ckg07:9200/wikidatadwd-augmented/_doc/\"+wordID).json()['_source']\n",
    "#         correctedComplexEmb[wordID] = [float(p) for p in resp['graph_embedding_complex'].split(',')]\n",
    "#         correctedTranseEmb[wordID] = [float(p) for p in resp['graph_embeddings_transe'].split(',')]\n",
    "#     except:\n",
    "#         print(\"Failure returned for http://ckg07:9200/wikidatadwd-augmented/_doc/\"+wordID)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "assigned-journey",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "51850ec9544547f293820bd9e94091f4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/42575933 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4de00c52596f4ce3b5d17ee9ef73b068",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/42575933 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1h 33min 17s, sys: 2min 38s, total: 1h 35min 56s\n",
      "Wall time: 1h 35min 28s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "correctedComplexEmb = {qnode: emb for emb, qnode in tqdm(zip(f['embeddings'], ent_names), total=len(ent_names)) if qnode in masterEmbCorrections['complex']}\n",
    "correctedTranseEmb = {qnode: emb for emb, qnode in tqdm(zip(transf['embeddings'], ent_names), total=len(ent_names)) if qnode in masterEmbCorrections['complex']}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "female-scope",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "200"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(list(correctedComplexEmb.items())[0][1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "valuable-bahrain",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(masterEmbedDictMaster['complex'][list(masterEmbedDictMaster['complex'].keys())[0]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "religious-celebration",
   "metadata": {},
   "outputs": [],
   "source": [
    "for node, emb in correctedComplexEmb.items():\n",
    "    masterEmbedDictMaster['complex'][node] = emb\n",
    "for node, emb in correctedTranseEmb.items():\n",
    "    masterEmbedDictMaster['transe'][node] = emb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "spatial-writer",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "periodic-laundry",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "motivated-soundtrack",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "imported-advance",
   "metadata": {},
   "source": [
    "## Text Embeddings Correction file generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "historical-conservative",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "q1 = \"kgtk filter -i ../data/wikidataos.for.text-embedding.tsv --word-separator '|' -p '\" + '|'.join(masterEmbCorrections['text_7_props']) + \";;' -o ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-corrections.tsv -v True\"\n",
    "os.system(\"screen -dm \" + q1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "overhead-commission",
   "metadata": {},
   "outputs": [],
   "source": [
    "q1 = \"kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-corrections.tsv   \\\n",
    "    --model roberta-large-nli-mean-tokens   \\\n",
    "    --property-labels-file ../data/labels.en.tsv  --debug   \\\n",
    "    --isa-properties P31 P279 P106 P39 P1382 P373 P452   \\\n",
    "    --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-corrections.tsv\"\n",
    "#     print(q1)\n",
    "os.system(q1 + \" &amp;\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "cooked-vinyl",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32512"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "q1 = \"kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-corrections.tsv   \\\n",
    "    --model roberta-large-nli-mean-tokens   \\\n",
    "    --property-labels-file ../data/labels.en.tsv  --debug   \\\n",
    "    --isa-properties P31 P279   \\\n",
    "    --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-corrections.tsv\"\n",
    "#     print(q1)\n",
    "os.system(q1 + \" &amp;\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "id": "static-failure",
   "metadata": {},
   "outputs": [],
   "source": [
    "corrected7Emb = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-corrections.tsv', sep='\\t')\n",
    "corrected2Emb = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-corrections.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "spare-flexibility",
   "metadata": {},
   "outputs": [],
   "source": [
    "corrected7Emb = corrected7Emb[corrected7Emb.property == 'text_embedding']\n",
    "corrected7Emb['value'] = corrected7Emb.value.apply(lambda p: [float(p1) for p1 in p.split(',')])\n",
    "\n",
    "corrected2Emb = corrected2Emb[corrected2Emb.property == 'text_embedding']\n",
    "corrected2Emb['value'] = corrected2Emb.value.apply(lambda p: [float(p1) for p1 in p.split(',')])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "id": "minute-oakland",
   "metadata": {},
   "outputs": [],
   "source": [
    "for _, row in corrected7Emb.iterrows():\n",
    "    masterEmbedDictMaster['text_7_props'][row['node']] = row['value']\n",
    "for _, row in corrected2Emb.iterrows():\n",
    "    masterEmbedDictMaster['text_2_props'][row['node']] = row['value']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "documentary-fluid",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "analyzed-naples",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "psychological-brighton",
   "metadata": {},
   "source": [
    "## Abstract"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "meaning-spanking",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DBPedia dataset has 5732949 records with unique 5732947 index values\n",
      "There are 8472828 sitelinks present in the dataset corresponding to 8472828 unique node1s (Qxxx), 8401883 unique labels (text)\n"
     ]
    }
   ],
   "source": [
    "df1 = pd.read_csv(\"../data/short-abstracts_lang=en.csv\", skiprows=1, skipfooter=1, header=None, engine='python')\n",
    "df1.columns = ['ignore', 'node1', 'url', 'ignore2', 'abstract']\n",
    "df1 = df1.set_index('node1')\n",
    "df1[df1.url.apply(lambda p: 'http://dbpedia.org/resource' not in p)]\n",
    "print(f\"DBPedia dataset has {len(df1)} records with unique {df1.index.nunique()} index values\")\n",
    "sitelinksDF = pd.read_csv(\"../data/sitelinks.en.tsv.gz\", sep='\\t')\n",
    "sitelinksDF['trimmedNode2'] = sitelinksDF.node2.apply(lambda p: p.split(\"/\")[-1] if p.split(\"/\")[-1] != '' else p.split(\"/\")[-2])\n",
    "sitelinksDF1 = sitelinksDF[sitelinksDF.label == 'wikipedia_sitelink']\n",
    "sitelinksDF2 = sitelinksDF1.set_index('trimmedNode2')\n",
    "print(f\"There are {len(sitelinksDF2)} sitelinks present in the dataset corresponding to {sitelinksDF2.node1.nunique()} unique node1s (Qxxx), {sitelinksDF2.index.nunique()} unique labels (text)\")\n",
    "sitelinksDF2.loc[sitelinksDF2[sitelinksDF2.index.duplicated()].index]\n",
    "masterEmbCorrections_abs_set = set(masterEmbCorrections['abstract'])\n",
    "sitelinksDF2 = sitelinksDF2[sitelinksDF2.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "offensive-enclosure",
   "metadata": {},
   "outputs": [],
   "source": [
    "labelsDF = pd.read_csv('../data/labels.en.tsv', sep='\\t')\n",
    "labelsDF = labelsDF[labelsDF.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]\n",
    "labelsDict = {row['node1']: row['node2'] for _, row in labelsDF.iterrows()}\n",
    "descriptionsDF = pd.read_csv('../../wd-correctness/gdrive-kgtk-dump-2020-12-07/descriptions.en.tsv.gz', compression='gzip', sep='\\t')\n",
    "descriptionsDF = descriptionsDF[descriptionsDF.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]\n",
    "descDict = {row['node1']: row['node2'] for _, row in descriptionsDF.iterrows()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "better-tuner",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "From 58 Qnodes, there are 16 sitelink Qnodes which do not have a short abstract i.e 42 have a short abstract\n"
     ]
    }
   ],
   "source": [
    "sdf_set = set(sitelinksDF2.index.to_list())\n",
    "df1 = df1[df1.index.map(lambda p: p in sdf_set)]\n",
    "abstractsDF2 = sitelinksDF2.join(df1).reset_index()\n",
    "print(f\"From {len(abstractsDF2)} Qnodes, there are {abstractsDF2.ignore2.isna().sum()} sitelink Qnodes which do not have a short abstract i.e {len(abstractsDF2) - abstractsDF2.ignore2.isna().sum()} have a short abstract\")\n",
    "# abstractsDF2 = abstractsDF2[~abstractsDF2.abstract.isna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "invalid-fiction",
   "metadata": {},
   "outputs": [],
   "source": [
    "abstractsDF2['node1_label'] = abstractsDF2.node1.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else \"\")\n",
    "abstractsDF2['node1_desc'] = abstractsDF2.node1.apply(lambda p: descDict[p][1:-4] if p in descDict else \"\")\n",
    "def combineAbsLabDesc(row, parameter):\n",
    "    if not(pd.isna(row[parameter])) and row[parameter] != 'nan' and row[parameter] != \"\":\n",
    "        return row[parameter]\n",
    "    elif row['node1_label'] == \"\" and row['node1_desc'] == \"\":\n",
    "        return None\n",
    "    else:\n",
    "        return row['node1_label'] + ' ' + row['node1_desc']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "opened-drink",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import sent_tokenize\n",
    "abstractsDF2['abstract_firstSent'] = abstractsDF2.abstract.apply(lambda p: sent_tokenize(str(p))[0] if p else None)\n",
    "abstractsDF2 = abstractsDF2.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "affected-reproduction",
   "metadata": {},
   "outputs": [],
   "source": [
    "abstractsDF2['abstract'] = abstractsDF2.apply(combineAbsLabDesc, axis=1, args=('abstract',))\n",
    "abstractsDF2['abstract_firstSent'] = abstractsDF2.apply(combineAbsLabDesc, axis=1, args=('abstract_firstSent',))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "actual-communication",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "58"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(abstractsDF2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "quantitative-tumor",
   "metadata": {},
   "outputs": [],
   "source": [
    "abstractsDF2 = abstractsDF2[~abstractsDF2.abstract.isna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "turned-retail",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "58"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(abstractsDF2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "heard-freedom",
   "metadata": {},
   "outputs": [],
   "source": [
    "abstractsDF2 = abstractsDF2.drop(columns=['index']).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "forty-southeast",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>level_0</th>\n",
       "      <th>index</th>\n",
       "      <th>trimmedNode2</th>\n",
       "      <th>id</th>\n",
       "      <th>node1</th>\n",
       "      <th>label</th>\n",
       "      <th>node2</th>\n",
       "      <th>ignore</th>\n",
       "      <th>url</th>\n",
       "      <th>ignore2</th>\n",
       "      <th>abstract</th>\n",
       "      <th>node1_label</th>\n",
       "      <th>node1_desc</th>\n",
       "      <th>abstract_firstSent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Luxuries</td>\n",
       "      <td>Q10953913-wikipedia_sitelink-538fe3-0</td>\n",
       "      <td>Q10953913</td>\n",
       "      <td>wikipedia_sitelink</td>\n",
       "      <td>http://en.wikipedia.org/wiki/Luxuries</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>luxuryBehavior, expenses or equipment that far...</td>\n",
       "      <td>luxury</td>\n",
       "      <td>Behavior, expenses or equipment that far excee...</td>\n",
       "      <td>nan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Potato</td>\n",
       "      <td>Q10998-wikipedia_sitelink-56b85c-0</td>\n",
       "      <td>Q10998</td>\n",
       "      <td>wikipedia_sitelink</td>\n",
       "      <td>http://en.wikipedia.org/wiki/Potato</td>\n",
       "      <td>10709.0</td>\n",
       "      <td>&lt;http://dbpedia.org/resource/Potato&gt;</td>\n",
       "      <td>&lt;http://www.w3.org/2000/01/rdf-schema#comment&gt;</td>\n",
       "      <td>The potato is a root vegetable native to the A...</td>\n",
       "      <td>potato</td>\n",
       "      <td>species of plant</td>\n",
       "      <td>The potato is a root vegetable native to the A...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>Mars</td>\n",
       "      <td>Q111-wikipedia_sitelink-9ff296-0</td>\n",
       "      <td>Q111</td>\n",
       "      <td>wikipedia_sitelink</td>\n",
       "      <td>http://en.wikipedia.org/wiki/Mars</td>\n",
       "      <td>1803088.0</td>\n",
       "      <td>&lt;http://dbpedia.org/resource/Mars&gt;</td>\n",
       "      <td>&lt;http://www.w3.org/2000/01/rdf-schema#comment&gt;</td>\n",
       "      <td>Mars is the fourth planet from the Sun and the...</td>\n",
       "      <td>Mars</td>\n",
       "      <td>fourth planet from the Sun</td>\n",
       "      <td>Mars is the fourth planet from the Sun and the...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>Dawn</td>\n",
       "      <td>Q11326182-wikipedia_sitelink-ae2918-0</td>\n",
       "      <td>Q11326182</td>\n",
       "      <td>wikipedia_sitelink</td>\n",
       "      <td>http://en.wikipedia.org/wiki/Dawn</td>\n",
       "      <td>97544.0</td>\n",
       "      <td>&lt;http://dbpedia.org/resource/Dawn&gt;</td>\n",
       "      <td>&lt;http://www.w3.org/2000/01/rdf-schema#comment&gt;</td>\n",
       "      <td>Dawn is the time that marks the beginning of t...</td>\n",
       "      <td>dawn</td>\n",
       "      <td>time that marks the beginning of the twilight ...</td>\n",
       "      <td>Dawn is the time that marks the beginning of t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>Change_(philosophy)</td>\n",
       "      <td>Q1150070-wikipedia_sitelink-81cf5f-0</td>\n",
       "      <td>Q1150070</td>\n",
       "      <td>wikipedia_sitelink</td>\n",
       "      <td>http://en.wikipedia.org/wiki/Change_(philosophy)</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>changeprocess, event or action that deviates f...</td>\n",
       "      <td>change</td>\n",
       "      <td>process, event or action that deviates from th...</td>\n",
       "      <td>nan</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   level_0  index         trimmedNode2                                     id  \\\n",
       "0        0      0             Luxuries  Q10953913-wikipedia_sitelink-538fe3-0   \n",
       "1        1      1               Potato     Q10998-wikipedia_sitelink-56b85c-0   \n",
       "2        2      2                 Mars       Q111-wikipedia_sitelink-9ff296-0   \n",
       "3        3      3                 Dawn  Q11326182-wikipedia_sitelink-ae2918-0   \n",
       "4        4      4  Change_(philosophy)   Q1150070-wikipedia_sitelink-81cf5f-0   \n",
       "\n",
       "       node1               label  \\\n",
       "0  Q10953913  wikipedia_sitelink   \n",
       "1     Q10998  wikipedia_sitelink   \n",
       "2       Q111  wikipedia_sitelink   \n",
       "3  Q11326182  wikipedia_sitelink   \n",
       "4   Q1150070  wikipedia_sitelink   \n",
       "\n",
       "                                              node2     ignore  \\\n",
       "0             http://en.wikipedia.org/wiki/Luxuries        NaN   \n",
       "1               http://en.wikipedia.org/wiki/Potato    10709.0   \n",
       "2                 http://en.wikipedia.org/wiki/Mars  1803088.0   \n",
       "3                 http://en.wikipedia.org/wiki/Dawn    97544.0   \n",
       "4  http://en.wikipedia.org/wiki/Change_(philosophy)        NaN   \n",
       "\n",
       "                                    url  \\\n",
       "0                                   NaN   \n",
       "1  <http://dbpedia.org/resource/Potato>   \n",
       "2    <http://dbpedia.org/resource/Mars>   \n",
       "3    <http://dbpedia.org/resource/Dawn>   \n",
       "4                                   NaN   \n",
       "\n",
       "                                          ignore2  \\\n",
       "0                                             NaN   \n",
       "1  <http://www.w3.org/2000/01/rdf-schema#comment>   \n",
       "2  <http://www.w3.org/2000/01/rdf-schema#comment>   \n",
       "3  <http://www.w3.org/2000/01/rdf-schema#comment>   \n",
       "4                                             NaN   \n",
       "\n",
       "                                            abstract node1_label  \\\n",
       "0  luxuryBehavior, expenses or equipment that far...      luxury   \n",
       "1  The potato is a root vegetable native to the A...      potato   \n",
       "2  Mars is the fourth planet from the Sun and the...        Mars   \n",
       "3  Dawn is the time that marks the beginning of t...        dawn   \n",
       "4  changeprocess, event or action that deviates f...      change   \n",
       "\n",
       "                                          node1_desc  \\\n",
       "0  Behavior, expenses or equipment that far excee...   \n",
       "1                                   species of plant   \n",
       "2                         fourth planet from the Sun   \n",
       "3  time that marks the beginning of the twilight ...   \n",
       "4  process, event or action that deviates from th...   \n",
       "\n",
       "                                  abstract_firstSent  \n",
       "0                                                nan  \n",
       "1  The potato is a root vegetable native to the A...  \n",
       "2  Mars is the fourth planet from the Sun and the...  \n",
       "3  Dawn is the time that marks the beginning of t...  \n",
       "4                                                nan  "
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "abstractsDF2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "id": "multiple-offer",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from time import time\n",
    "import pandas as pd\n",
    "\n",
    "def getSentEmbeddings(valSeries, modelName):\n",
    "    model = SentenceTransformer(modelName)\n",
    "    start = time()\n",
    "    encodings = model.encode(valSeries.to_list())\n",
    "    print(time()-start,'s')\n",
    "    return encodings\n",
    "\n",
    "def getIndSentEmbeddings(sent, modelName):\n",
    "    model = SentenceTransformer(modelName)\n",
    "    start = time()\n",
    "    encodings = model.encode([sent])\n",
    "    print(time()-start,'s')\n",
    "    return encodings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "id": "sustainable-breakdown",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.6419482231140137 s\n",
      "0.5260367393493652 s\n"
     ]
    }
   ],
   "source": [
    "abstractsDF2['abs_emb'] = pd.Series(list(getSentEmbeddings(abstractsDF2.abstract, 'bert-base-nli-mean-tokens')))\n",
    "abstractsDF2['abs_firstSent_emb'] = pd.Series(list(getSentEmbeddings(abstractsDF2.abstract_firstSent, 'bert-base-nli-mean-tokens')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "usual-selling",
   "metadata": {},
   "outputs": [],
   "source": [
    "for _, row in abstractsDF2.iterrows():\n",
    "    masterEmbedDictMaster['abstract'][row['node1']] = row['abs_emb']\n",
    "    masterEmbedDictMaster['abstract_first_sent'][row['node1']] = row['abs_firstSent_emb']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "id": "promising-owner",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.37706875801086426 s\n",
      "0.3001420497894287 s\n",
      "0.370746374130249 s\n",
      "0.6896324157714844 s\n",
      "0.33779358863830566 s\n",
      "0.3965473175048828 s\n",
      "0.3200962543487549 s\n",
      "0.3489806652069092 s\n",
      "0.3413431644439697 s\n",
      "0.32114505767822266 s\n",
      "0.3811838626861572 s\n",
      "0.34630370140075684 s\n",
      "0.37790727615356445 s\n",
      "0.26860570907592773 s\n",
      "0.3601953983306885 s\n",
      "0.3713240623474121 s\n",
      "0.34137582778930664 s\n",
      "0.33736181259155273 s\n",
      "0.37023448944091797 s\n",
      "0.31382036209106445 s\n",
      "0.35136938095092773 s\n",
      "0.37309718132019043 s\n",
      "0.33543896675109863 s\n",
      "0.38199710845947266 s\n",
      "0.3740067481994629 s\n",
      "0.3278031349182129 s\n",
      "0.32283997535705566 s\n",
      "0.34000563621520996 s\n",
      "0.31502628326416016 s\n",
      "0.34996771812438965 s\n",
      "0.3871273994445801 s\n",
      "0.3487060070037842 s\n",
      "0.35172486305236816 s\n",
      "0.3280646800994873 s\n",
      "0.3519773483276367 s\n",
      "0.3354451656341553 s\n",
      "0.3633551597595215 s\n",
      "0.3226644992828369 s\n",
      "0.33882975578308105 s\n",
      "0.36072254180908203 s\n",
      "0.3833494186401367 s\n",
      "0.2929043769836426 s\n",
      "0.32875680923461914 s\n",
      "0.36334872245788574 s\n",
      "0.34148168563842773 s\n",
      "0.3569769859313965 s\n",
      "0.37468576431274414 s\n",
      "0.399524450302124 s\n",
      "0.3516504764556885 s\n",
      "0.333402156829834 s\n",
      "0.3851203918457031 s\n",
      "0.34867238998413086 s\n",
      "0.3607771396636963 s\n",
      "0.38669753074645996 s\n",
      "0.33347272872924805 s\n",
      "0.36278390884399414 s\n",
      "0.3602781295776367 s\n",
      "0.3322322368621826 s\n",
      "0.36807823181152344 s\n",
      "0.3407411575317383 s\n",
      "0.3837134838104248 s\n",
      "0.38958096504211426 s\n",
      "0.3332521915435791 s\n",
      "0.3331124782562256 s\n",
      "0.35001134872436523 s\n",
      "0.32433485984802246 s\n",
      "0.36315059661865234 s\n",
      "0.34323906898498535 s\n",
      "0.3112339973449707 s\n",
      "0.30588483810424805 s\n",
      "0.30704236030578613 s\n",
      "0.31201720237731934 s\n"
     ]
    }
   ],
   "source": [
    "for node in masterEmbCorrections_abs_set:\n",
    "    if node not in masterEmbedDictMaster['abstract']:\n",
    "        if node in labelsDict and node in descDict:\n",
    "            masterEmbedDictMaster['abstract'][node] = getIndSentEmbeddings(labelsDict[node] + ' ' + descDict[node], 'bert-base-nli-mean-tokens')[0]\n",
    "        elif node in labelsDict:\n",
    "            masterEmbedDictMaster['abstract'][node] = getIndSentEmbeddings(labelsDict[node], 'bert-base-nli-mean-tokens')[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "id": "acquired-manitoba",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.32213783264160156 s\n",
      "0.357776403427124 s\n",
      "0.37949395179748535 s\n",
      "0.35210466384887695 s\n",
      "0.28103041648864746 s\n",
      "0.3626406192779541 s\n",
      "0.35109710693359375 s\n",
      "0.34203338623046875 s\n",
      "0.32386112213134766 s\n",
      "0.3354361057281494 s\n",
      "0.3063056468963623 s\n",
      "0.3441202640533447 s\n",
      "0.32869935035705566 s\n",
      "0.42442989349365234 s\n",
      "0.37239527702331543 s\n",
      "0.38650059700012207 s\n",
      "0.3191685676574707 s\n",
      "0.3609733581542969 s\n",
      "0.3115823268890381 s\n",
      "0.36015963554382324 s\n",
      "0.3338603973388672 s\n",
      "0.3487727642059326 s\n",
      "0.3250617980957031 s\n",
      "0.35145044326782227 s\n",
      "0.33944034576416016 s\n",
      "0.31502413749694824 s\n",
      "0.3611795902252197 s\n",
      "0.35285043716430664 s\n",
      "0.3575010299682617 s\n",
      "0.304781436920166 s\n",
      "0.4003562927246094 s\n",
      "0.3315858840942383 s\n",
      "0.36008763313293457 s\n",
      "0.36187100410461426 s\n",
      "0.32981252670288086 s\n",
      "0.3378865718841553 s\n",
      "0.31662964820861816 s\n",
      "0.32143092155456543 s\n",
      "0.3152732849121094 s\n",
      "0.38222813606262207 s\n",
      "0.3846759796142578 s\n",
      "0.33153700828552246 s\n",
      "0.37013936042785645 s\n",
      "0.33272790908813477 s\n",
      "0.29526567459106445 s\n",
      "0.3218040466308594 s\n",
      "0.3795340061187744 s\n",
      "0.3576061725616455 s\n",
      "0.35764193534851074 s\n",
      "0.36867713928222656 s\n",
      "0.3807237148284912 s\n",
      "0.33266758918762207 s\n",
      "0.33878159523010254 s\n",
      "0.38289546966552734 s\n",
      "0.38695788383483887 s\n",
      "0.33074188232421875 s\n",
      "0.32749414443969727 s\n",
      "0.33860039710998535 s\n",
      "0.36585235595703125 s\n",
      "0.33011841773986816 s\n",
      "0.3293156623840332 s\n",
      "0.3491702079772949 s\n",
      "0.3720529079437256 s\n",
      "0.3078622817993164 s\n",
      "0.3844125270843506 s\n",
      "0.32468104362487793 s\n",
      "0.3186354637145996 s\n",
      "0.3438723087310791 s\n",
      "0.36643028259277344 s\n",
      "0.34279680252075195 s\n",
      "0.3625810146331787 s\n",
      "0.35865354537963867 s\n",
      "0.3503103256225586 s\n",
      "0.37160682678222656 s\n",
      "0.3268110752105713 s\n",
      "0.2564544677734375 s\n",
      "0.37343525886535645 s\n",
      "0.33298277854919434 s\n"
     ]
    }
   ],
   "source": [
    "for node in masterEmbCorrections_abs_set:\n",
    "    if node not in masterEmbedDictMaster['abstract_first_sent']:\n",
    "        if node in labelsDict and node in descDict:\n",
    "            masterEmbedDictMaster['abstract_first_sent'][node] = getIndSentEmbeddings(labelsDict[node] + ' ' + descDict[node], 'bert-base-nli-mean-tokens')[0]\n",
    "        elif node in labelsDict:\n",
    "            masterEmbedDictMaster['abstract_first_sent'][node] = getIndSentEmbeddings(labelsDict[node], 'bert-base-nli-mean-tokens')[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "veterinary-thailand",
   "metadata": {},
   "source": [
    "## Updated coverage details"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "id": "intimate-campus",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pair Coverage by new text_7_props embeddings created for 19k retrofitting: 325\n",
      "Pair Coverage by new text_2_props embeddings created for 19k retrofitting: 325\n",
      "Pair Coverage by new complex embeddings created for 19k retrofitting: 343\n",
      "Pair Coverage by new transe embeddings created for 19k retrofitting: 343\n",
      "Pair Coverage by new abstract embeddings created for 19k retrofitting: 339\n",
      "Pair Coverage by new abstract_first_sent embeddings created for 19k retrofitting: 339\n"
     ]
    }
   ],
   "source": [
    "for key1 in masterEmbedKeys:\n",
    "    print(f\"Pair Coverage by new {key1} embeddings created for 19k retrofitting: {sum([row[0] in masterEmbedDictMaster[key1] and row[1] in masterEmbedDictMaster[key1] for row in wordsim_pairs])}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "lovely-token",
   "metadata": {},
   "outputs": [],
   "source": [
    "for key1 in masterEmbedDictMaster.keys():\n",
    "    for key2 in masterEmbedDictMaster[key1].keys():\n",
    "        if type(masterEmbedDictMaster[key1][key2]) != list:\n",
    "            masterEmbedDictMaster[key1][key2] = masterEmbedDictMaster[key1][key2].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "exact-surfing",
   "metadata": {},
   "outputs": [],
   "source": [
    "for key1 in ['complex', 'transe']:\n",
    "    json.dump(masterEmbedDictMaster[key1], open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict_updated.json', 'w'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "id": "behavioral-spain",
   "metadata": {},
   "outputs": [],
   "source": [
    "def countOverlap(source, target):\n",
    "    cnt = 0\n",
    "    for key1 in source:\n",
    "        if key1 in target:\n",
    "            cnt += 1\n",
    "    return cnt\n",
    "p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "id": "hawaiian-brain",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>embedding</th>\n",
       "      <th>total count</th>\n",
       "      <th>overlap count</th>\n",
       "      <th>Coverage Percentage</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>text_7_props</td>\n",
       "      <td>238930</td>\n",
       "      <td>238889</td>\n",
       "      <td>100.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>text_2_props</td>\n",
       "      <td>238930</td>\n",
       "      <td>238889</td>\n",
       "      <td>100.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>complex</td>\n",
       "      <td>238500</td>\n",
       "      <td>238448</td>\n",
       "      <td>99.815395</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>transe</td>\n",
       "      <td>238500</td>\n",
       "      <td>238448</td>\n",
       "      <td>99.815395</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>abstract</td>\n",
       "      <td>105964</td>\n",
       "      <td>105916</td>\n",
       "      <td>44.336910</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>abstract_first_sent</td>\n",
       "      <td>105964</td>\n",
       "      <td>105916</td>\n",
       "      <td>44.336910</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             embedding  total count  overlap count  Coverage Percentage\n",
       "0         text_7_props       238930         238889           100.000000\n",
       "1         text_2_props       238930         238889           100.000000\n",
       "2              complex       238500         238448            99.815395\n",
       "3               transe       238500         238448            99.815395\n",
       "4             abstract       105964         105916            44.336910\n",
       "5  abstract_first_sent       105964         105916            44.336910"
      ]
     },
     "execution_count": 147,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summArr = []\n",
    "for key1 in masterEmbedDictMaster:\n",
    "    cnt = countOverlap(masterEmbedDictMaster[key1], p279QnodesList)\n",
    "    summArr.append([key1, len(masterEmbedDictMaster[key1]), cnt, cnt / len(p279QnodesList) * 100])\n",
    "pd.DataFrame(summArr, columns=['embedding', 'total count', 'overlap count', 'Coverage Percentage'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "greater-namibia",
   "metadata": {},
   "source": [
    "# Concatenated Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "fifth-associate",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "embedDictMaster = {}\n",
    "for key1 in ['text_7_props', 'text_2_props', 'complex', 'transe', 'abstract', 'abstract_first_sent']:\n",
    "    embedDictMaster[key1] = json.load(open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict_updated.json'))\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "egyptian-sentence",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "text_7_props :  1024\n",
      "text_2_props :  1024\n",
      "complex :  200\n",
      "transe :  200\n",
      "abstract :  768\n",
      "abstract_first_sent :  768\n"
     ]
    }
   ],
   "source": [
    "def determineEmbeddingLengths(embedDictMaster):\n",
    "    for key in embedDictMaster.keys():\n",
    "        embed_size = len(next(iter(embedDictMaster[key].values())))\n",
    "        print(key,\": \",embed_size)\n",
    "determineEmbeddingLengths(embedDictMaster)\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "removable-point",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Added 11 corrections\n",
      "Added 11 corrections\n",
      "Added 0 corrections\n",
      "Added 0 corrections\n",
      "Added 4 corrections\n",
      "Added 4 corrections\n"
     ]
    }
   ],
   "source": [
    "for key1 in embedDictMaster.keys():\n",
    "    embedDictMaster[key1] = deserializeEmbeddingDict(embedDictMaster[key1])\n",
    "# Fill Coverage of embedding dictionaries\n",
    "for key1 in embedDictMaster.keys():\n",
    "    embedDictMaster[key1] = fillCoverage(embedDictMaster[key1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "productive-indiana",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "text_7_props 238941\n",
      "text_2_props 238941\n",
      "complex 238941\n",
      "transe 238941\n",
      "abstract 238941\n",
      "abstract_first_sent 238941\n"
     ]
    }
   ],
   "source": [
    "for key1 in embedDictMaster.keys():\n",
    "    print(key1, len(next(iter(embedDictMaster.values()))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "mechanical-retro",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ready-financing",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "prime-hometown",
   "metadata": {},
   "source": [
    "# Retrofitting sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "tight-civilization",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetchNeighbours(df):\n",
    "    neighboursDict = {}\n",
    "    for _, row in df.iterrows():\n",
    "        if row.node1 not in neighboursDict:\n",
    "            neighboursDict[row.node1] = []\n",
    "        neighboursDict[row.node1].append((row.node2, row.bert2SentSim))\n",
    "        \n",
    "        if row.node2 not in neighboursDict:\n",
    "            neighboursDict[row.node2] = []\n",
    "        neighboursDict[row.node2].append((row.node1, row.bert2SentSim))\n",
    "    print(max([len(neigh) for neigh in neighboursDict.values()]))\n",
    "    return neighboursDict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "exciting-circle",
   "metadata": {},
   "outputs": [],
   "source": [
    "def retrofit(embedDict, neighDict, weightCase, weightAssignment=False):\n",
    "    newEmbedDict = {}\n",
    "    for word in embedDict.keys():\n",
    "        if word in neighDict:\n",
    "            neighbs = neighDict[word]\n",
    "            neighbs = list(filter(lambda p: p[0] in embedDict, neighbs))\n",
    "            if len(neighbs) == 0:\n",
    "                newEmbedDict[word] = embedDict[word]\n",
    "                continue\n",
    "#             assert len(neighbs) == 1\n",
    "            if weightAssignment:\n",
    "                sumOfSims = sum([neighb[1] for neighb in neighbs])\n",
    "                sumOfEmbs = sum([embedDict[neighb[0]] * float(neighb[1]) for neighb in neighbs])\n",
    "            else:\n",
    "                sumOfSims = sum([1 for neighb in neighbs])\n",
    "                sumOfEmbs = sum([embedDict[neighb[0]] for neighb in neighbs])\n",
    "                \n",
    "            if weightCase == 1:\n",
    "                newEmbedDict[word] = (embedDict[word] * (len(neighbs)) + sumOfEmbs) / ((len(neighbs)) + sumOfSims)\n",
    "            elif weightCase == 2:\n",
    "                newEmbedDict[word] = (embedDict[word] * (len(neighbs))**2 + sumOfEmbs) / ((len(neighbs))**2 + sumOfSims)\n",
    "            elif weightCase == 0.5:\n",
    "                newEmbedDict[word] = (embedDict[word] * (len(neighbs))**0.5 + sumOfEmbs) / ((len(neighbs))**0.5 + sumOfSims)\n",
    "            else:\n",
    "                raise\n",
    "        else:\n",
    "            newEmbedDict[word] = embedDict[word]\n",
    "    return newEmbedDict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "hollywood-prisoner",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "def labelSamples(score):\n",
    "    return 'I' if score <= 1.75 else 'U' if score >= 3.5 else 'M'\n",
    "LABELS = ['I','U','M']\n",
    "def fetchCorrelationResults(embedDict, newEmbedDict):\n",
    "    wordSim353AnnotDF_New = pd.read_csv('../data/wordsim353_with_r3.csv')\n",
    "#     print(f\"Length of wordsim dataset: {len(wordSim353AnnotDF_New)}\")\n",
    "    assert wordSim353AnnotDF_New.word1_kg_id.isna().sum() == 0\n",
    "    assert wordSim353AnnotDF_New.word2_kg_id.isna().sum() == 0\n",
    "    wordSim353AnnotDF_New['category'] = wordSim353AnnotDF_New.Avg.apply(labelSamples)\n",
    "#     wordSim353AnnotDF_New2 = wordSim353AnnotDF_New\n",
    "    wordSim353AnnotDF_New2 = wordSim353AnnotDF_New[wordSim353AnnotDF_New.apply(lambda p: p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict, axis=1)]\n",
    "    wordSimMissingSet = set(wordSim353AnnotDF_New[wordSim353AnnotDF_New.word1_kg_id.apply(lambda p: p not in embedDict)].word1_kg_id.to_list() + wordSim353AnnotDF_New[wordSim353AnnotDF_New.word2_kg_id.apply(lambda p: p not in embedDict)].word2_kg_id.to_list())\n",
    "    responseDict = {}\n",
    "    responseDict['wordSimMissingSet'] = wordSimMissingSet\n",
    "    responseDict['coveredPairs'] = len(wordSim353AnnotDF_New2)\n",
    "    responseDict['totalPairs'] = len(wordSim353AnnotDF_New)\n",
    "    \n",
    "#     wordSimMissingSet\n",
    "#     print(f\"No. of pairs with some value for embeddings: {len(wordSim353AnnotDF_New2)}\")\n",
    "    wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.apply(lambda p: cosine_similarity(np.array(embedDict[p['word1_kg_id']]).reshape(1,-1), np.array(embedDict[p['word2_kg_id']]).reshape(1,-1))[0][0] if p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict else -1, axis=1)\n",
    "    wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.apply(lambda p: cosine_similarity(np.array(newEmbedDict[p['word1_kg_id']]).reshape(1,-1), np.array(newEmbedDict[p['word2_kg_id']]).reshape(1,-1))[0][0] if p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict else -1, axis=1)\n",
    "    wordSim353AnnotDF_New.loc[wordSim353AnnotDF_New['textOld'] == -1, 'textOld'] = wordSim353AnnotDF_New[wordSim353AnnotDF_New['textOld'] != -1]['textOld'].mean()\n",
    "    wordSim353AnnotDF_New.loc[wordSim353AnnotDF_New['textNew'] == -1, 'textNew'] = wordSim353AnnotDF_New[wordSim353AnnotDF_New['textNew'] != -1]['textNew'].mean()\n",
    "    \n",
    "    # Logic 1: Scale min,max value to 1,4 strictly\n",
    "#     min1, max1 = wordSim353AnnotDF_New['textOld'].min(), wordSim353AnnotDF_New['textOld'].max()\n",
    "#     min2, max2 = wordSim353AnnotDF_New['textNew'].min(), wordSim353AnnotDF_New['textNew'].max()\n",
    "#     wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.textOld.apply(lambda p: 4 - 3 * (p - min1) / (max1 - min1))\n",
    "#     wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.textNew.apply(lambda p: 4 - 3 * (p - min2) / (max2 - min2))\n",
    "    \n",
    "    # Logic 2: Scale abs value to 1,4 strictly\n",
    "    wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.textOld.apply(lambda p: 4 - 3 * abs(p))\n",
    "    wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.textNew.apply(lambda p: 4 - 3 * abs(p))\n",
    "\n",
    "    \n",
    "#     print(f\"KT Corr of old emb with Annotated Avg: {stats.kendalltau(wordSim353AnnotDF_New2['textOld'], wordSim353AnnotDF_New2['Avg'])}\")\n",
    "#     print(f\"KT Corr of new emb with Annotated Avg: {stats.kendalltau(wordSim353AnnotDF_New2['textNew'], wordSim353AnnotDF_New2['Avg'])}\")\n",
    "#     print(f\"KT Corr of old emb with Human Avg Reversed: {stats.kendalltau(wordSim353AnnotDF_New2['textOld'], wordSim353AnnotDF_New2['H_reversed'])}\")\n",
    "#     print(f\"KT Corr of new emb with Human Avg Reversed: {stats.kendalltau(wordSim353AnnotDF_New2['textNew'], wordSim353AnnotDF_New2['H_reversed'])}\")\n",
    "    \n",
    "#     print(f\"Classification Accuracy of old embeddings categories vs annotated averages categories: {accuracy_score(wordSim353AnnotDF_New2['textOld'].apply(labelSamples), wordSim353AnnotDF_New2['category'])}\")\n",
    "#     print(f\"Classification Accuracy of new embeddings categories vs annotated averages categories: {accuracy_score(wordSim353AnnotDF_New2['textNew'].apply(labelSamples), wordSim353AnnotDF_New2['category'])}\")\n",
    "    responseDict['KT_old_vs_Avg'] = stats.kendalltau(wordSim353AnnotDF_New['textOld'], wordSim353AnnotDF_New['Avg'])\n",
    "    responseDict['KT_new_vs_Avg'] = stats.kendalltau(wordSim353AnnotDF_New['textNew'], wordSim353AnnotDF_New['Avg'])\n",
    "    responseDict['KT_old_vs_Human'] = stats.kendalltau(wordSim353AnnotDF_New['textOld'], wordSim353AnnotDF_New['H_reversed'])\n",
    "    responseDict['KT_new_vs_Human'] = stats.kendalltau(wordSim353AnnotDF_New['textNew'], wordSim353AnnotDF_New['H_reversed'])\n",
    "    responseDict['old_acc'] = accuracy_score(wordSim353AnnotDF_New['textOld'].apply(labelSamples), wordSim353AnnotDF_New['category'])\n",
    "    responseDict['new_acc'] = accuracy_score(wordSim353AnnotDF_New['textNew'].apply(labelSamples), wordSim353AnnotDF_New['category'])\n",
    "    \n",
    "    responseDict['class_rep_old'] = classification_report(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textOld'].apply(labelSamples), output_dict=True)\n",
    "    responseDict['class_rep_new'] = classification_report(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textNew'].apply(labelSamples), output_dict=True)\n",
    "    \n",
    "    cm_old = confusion_matrix(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textOld'].apply(labelSamples), labels=LABELS)\n",
    "    cm_new = confusion_matrix(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textNew'].apply(labelSamples), labels=LABELS)\n",
    "    \n",
    "    responseDict['cm_old'] = cm_old\n",
    "    responseDict['cm_new'] = cm_new\n",
    "    \n",
    "    return responseDict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "severe-explosion",
   "metadata": {},
   "outputs": [],
   "source": [
    "neighDictMaster, embedDictMaster = {}, {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "decreased-syndication",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "39218\n"
     ]
    }
   ],
   "source": [
    "neighDictMaster['19k_childPar'] = fetchNeighbours(p279ChildPar)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "rocky-criterion",
   "metadata": {},
   "outputs": [],
   "source": [
    "embedDictMaster['complex'] = complexEmb\n",
    "embedDictMaster['transe'] = transeEmb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "apparent-sapphire",
   "metadata": {},
   "outputs": [],
   "source": [
    "for key1 in embedDictMaster.keys():\n",
    "    for key2 in embedDictMaster[key1].keys():\n",
    "        embedDictMaster[key1][key2] = np.array(embedDictMaster[key1][key2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "precise-oxygen",
   "metadata": {},
   "outputs": [],
   "source": [
    "embList = list(embedDictMaster.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "identical-keyboard",
   "metadata": {},
   "outputs": [],
   "source": [
    "basisList = list(neighDictMaster.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "aging-flavor",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['19k_childPar'])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "neighDictMaster.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "amended-remove",
   "metadata": {},
   "outputs": [],
   "source": [
    "newEmbedDictMaster, responsesDictMaster = {}, {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "surgical-insurance",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7817a00dcf3c412b92a7c5ac75517168",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import numpy as np\n",
    "results = []\n",
    "NUM_ITERS = 10\n",
    "for basis in tqdm(basisList):\n",
    "    for emb in embList:\n",
    "        for weightedNess in [True]:\n",
    "            groupResults = []\n",
    "            for weightCase in [1,2]:\n",
    "                embedDict = embedDictMaster[emb]\n",
    "                if weightedNess:\n",
    "                    caseName = emb + '_' + basis + '_' + str(weightCase) + '_weighted'\n",
    "                else:\n",
    "                    caseName = emb + '_' + basis + '_' + str(weightCase) + '_unweighted'\n",
    "                for iterNum in range(1,NUM_ITERS+1):\n",
    "                    newEmbedDict = retrofit(embedDict, neighDictMaster[basis], weightCase, weightedNess)\n",
    "        #             dists = determineDistances(embedDict, newEmbedDict)\n",
    "                    responsesDict = fetchCorrelationResults(embedDict, newEmbedDict)\n",
    "        #                 print(responsesDict.keys())\n",
    "                    groupResults.append([emb, basis, weightCase, weightedNess, iterNum, \\\n",
    "                                    responsesDict['old_acc']*100, \\\n",
    "                                    responsesDict['new_acc']*100, \\\n",
    "                                    (responsesDict['new_acc'] - responsesDict['old_acc'])*100, \\\n",
    "                                    responsesDict['coveredPairs'], \\\n",
    "                                     responsesDict['class_rep_old']['I']['precision'], \\\n",
    "                                     responsesDict['class_rep_old']['I']['recall'], \\\n",
    "                                     responsesDict['class_rep_old']['I']['f1-score'], \\\n",
    "                                     responsesDict['class_rep_old']['U']['precision'], \\\n",
    "                                     responsesDict['class_rep_old']['U']['recall'], \\\n",
    "                                     responsesDict['class_rep_old']['U']['f1-score'], \\\n",
    "                                     responsesDict['class_rep_new']['I']['precision'], \\\n",
    "                                     responsesDict['class_rep_new']['I']['recall'], \\\n",
    "                                     responsesDict['class_rep_new']['I']['f1-score'], \\\n",
    "                                     responsesDict['class_rep_new']['U']['precision'], \\\n",
    "                                     responsesDict['class_rep_new']['U']['recall'], \\\n",
    "                                     responsesDict['class_rep_new']['U']['f1-score'], \\\n",
    "                                    ])\n",
    "                    embedDict = newEmbedDict\n",
    "\n",
    "                newEmbedDictMaster[caseName] = newEmbedDict\n",
    "                responsesDictMaster[caseName] = responsesDict\n",
    "            for gR, rank in zip(groupResults, np.argsort([-p[6] for p in groupResults])):\n",
    "                results.append(gR+[rank])\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "assigned-stations",
   "metadata": {},
   "outputs": [],
   "source": [
    "resultsDF = pd.DataFrame(results, columns=['Embedding', 'Basis', 'Weight', 'Weightedness', 'Iteration Num', 'Old Acc', 'New Acc', 'Increase', 'Pairs Covered', \\\n",
    "                                           'Old I Precision', 'Old I Recall', 'Old I F1-Score', \\\n",
    "                                           'Old U Precision', 'Old U Recall', 'Old U F1-Score', \\\n",
    "                                           'New I Precision', 'New I Recall', 'New I F1-Score', \\\n",
    "                                           'New U Precision', 'New U Recall', 'New U F1-Score', \\\n",
    "                                           'Rank'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "built-mumbai",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Embedding</th>\n",
       "      <th>Basis</th>\n",
       "      <th>Weight</th>\n",
       "      <th>Weightedness</th>\n",
       "      <th>Iteration Num</th>\n",
       "      <th>Old Acc</th>\n",
       "      <th>New Acc</th>\n",
       "      <th>Increase</th>\n",
       "      <th>Pairs Covered</th>\n",
       "      <th>Old I Precision</th>\n",
       "      <th>...</th>\n",
       "      <th>Old U Precision</th>\n",
       "      <th>Old U Recall</th>\n",
       "      <th>Old U F1-Score</th>\n",
       "      <th>New I Precision</th>\n",
       "      <th>New I Recall</th>\n",
       "      <th>New I F1-Score</th>\n",
       "      <th>New U Precision</th>\n",
       "      <th>New U Recall</th>\n",
       "      <th>New U F1-Score</th>\n",
       "      <th>Rank</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>60.755814</td>\n",
       "      <td>64.244186</td>\n",
       "      <td>3.488372</td>\n",
       "      <td>291</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.433121</td>\n",
       "      <td>0.660194</td>\n",
       "      <td>0.523077</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.40</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.463415</td>\n",
       "      <td>0.553398</td>\n",
       "      <td>0.504425</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>62.500000</td>\n",
       "      <td>65.697674</td>\n",
       "      <td>3.197674</td>\n",
       "      <td>291</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>...</td>\n",
       "      <td>0.397059</td>\n",
       "      <td>0.262136</td>\n",
       "      <td>0.315789</td>\n",
       "      <td>0.833333</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.625000</td>\n",
       "      <td>0.450000</td>\n",
       "      <td>0.174757</td>\n",
       "      <td>0.251748</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>2</td>\n",
       "      <td>64.244186</td>\n",
       "      <td>67.151163</td>\n",
       "      <td>2.906977</td>\n",
       "      <td>291</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.463415</td>\n",
       "      <td>0.553398</td>\n",
       "      <td>0.504425</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.495050</td>\n",
       "      <td>0.485437</td>\n",
       "      <td>0.490196</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>2</td>\n",
       "      <td>61.918605</td>\n",
       "      <td>63.662791</td>\n",
       "      <td>1.744186</td>\n",
       "      <td>291</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.660194</td>\n",
       "      <td>0.531250</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.620690</td>\n",
       "      <td>0.458904</td>\n",
       "      <td>0.650485</td>\n",
       "      <td>0.538153</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>60.755814</td>\n",
       "      <td>61.918605</td>\n",
       "      <td>1.162791</td>\n",
       "      <td>291</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.433121</td>\n",
       "      <td>0.660194</td>\n",
       "      <td>0.523077</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.40</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.660194</td>\n",
       "      <td>0.531250</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>3</td>\n",
       "      <td>67.151163</td>\n",
       "      <td>67.732558</td>\n",
       "      <td>0.581395</td>\n",
       "      <td>291</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.495050</td>\n",
       "      <td>0.485437</td>\n",
       "      <td>0.490196</td>\n",
       "      <td>0.909091</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.645161</td>\n",
       "      <td>0.511905</td>\n",
       "      <td>0.417476</td>\n",
       "      <td>0.459893</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>5</td>\n",
       "      <td>67.151163</td>\n",
       "      <td>67.732558</td>\n",
       "      <td>0.581395</td>\n",
       "      <td>291</td>\n",
       "      <td>0.916667</td>\n",
       "      <td>...</td>\n",
       "      <td>0.492958</td>\n",
       "      <td>0.339806</td>\n",
       "      <td>0.402299</td>\n",
       "      <td>0.916667</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.687500</td>\n",
       "      <td>0.507937</td>\n",
       "      <td>0.310680</td>\n",
       "      <td>0.385542</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>7</td>\n",
       "      <td>62.500000</td>\n",
       "      <td>63.081395</td>\n",
       "      <td>0.581395</td>\n",
       "      <td>291</td>\n",
       "      <td>0.846154</td>\n",
       "      <td>...</td>\n",
       "      <td>0.351852</td>\n",
       "      <td>0.184466</td>\n",
       "      <td>0.242038</td>\n",
       "      <td>0.846154</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.365385</td>\n",
       "      <td>0.184466</td>\n",
       "      <td>0.245161</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>62.500000</td>\n",
       "      <td>63.081395</td>\n",
       "      <td>0.581395</td>\n",
       "      <td>291</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>...</td>\n",
       "      <td>0.397059</td>\n",
       "      <td>0.262136</td>\n",
       "      <td>0.315789</td>\n",
       "      <td>0.900000</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.252427</td>\n",
       "      <td>0.309524</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>3</td>\n",
       "      <td>64.825581</td>\n",
       "      <td>65.406977</td>\n",
       "      <td>0.581395</td>\n",
       "      <td>291</td>\n",
       "      <td>0.750000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.393939</td>\n",
       "      <td>0.126214</td>\n",
       "      <td>0.191176</td>\n",
       "      <td>0.750000</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.750000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.116505</td>\n",
       "      <td>0.180451</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>9</td>\n",
       "      <td>63.081395</td>\n",
       "      <td>63.372093</td>\n",
       "      <td>0.290698</td>\n",
       "      <td>291</td>\n",
       "      <td>0.785714</td>\n",
       "      <td>...</td>\n",
       "      <td>0.372549</td>\n",
       "      <td>0.184466</td>\n",
       "      <td>0.246753</td>\n",
       "      <td>0.785714</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.647059</td>\n",
       "      <td>0.380000</td>\n",
       "      <td>0.184466</td>\n",
       "      <td>0.248366</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>4</td>\n",
       "      <td>62.790698</td>\n",
       "      <td>63.081395</td>\n",
       "      <td>0.290698</td>\n",
       "      <td>291</td>\n",
       "      <td>0.900000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.383333</td>\n",
       "      <td>0.223301</td>\n",
       "      <td>0.282209</td>\n",
       "      <td>0.846154</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.203883</td>\n",
       "      <td>0.264151</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>9</td>\n",
       "      <td>63.081395</td>\n",
       "      <td>63.081395</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>291</td>\n",
       "      <td>0.909091</td>\n",
       "      <td>...</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.543689</td>\n",
       "      <td>0.489083</td>\n",
       "      <td>0.909091</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.645161</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.543689</td>\n",
       "      <td>0.489083</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>8</td>\n",
       "      <td>63.081395</td>\n",
       "      <td>63.081395</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>291</td>\n",
       "      <td>0.846154</td>\n",
       "      <td>...</td>\n",
       "      <td>0.365385</td>\n",
       "      <td>0.184466</td>\n",
       "      <td>0.245161</td>\n",
       "      <td>0.785714</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.647059</td>\n",
       "      <td>0.372549</td>\n",
       "      <td>0.184466</td>\n",
       "      <td>0.246753</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>3</td>\n",
       "      <td>62.790698</td>\n",
       "      <td>62.790698</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>291</td>\n",
       "      <td>0.900000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.387097</td>\n",
       "      <td>0.233010</td>\n",
       "      <td>0.290909</td>\n",
       "      <td>0.900000</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.383333</td>\n",
       "      <td>0.223301</td>\n",
       "      <td>0.282209</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>4</td>\n",
       "      <td>65.406977</td>\n",
       "      <td>65.406977</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>291</td>\n",
       "      <td>0.750000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.116505</td>\n",
       "      <td>0.180451</td>\n",
       "      <td>0.652174</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.697674</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.116505</td>\n",
       "      <td>0.184615</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>10</td>\n",
       "      <td>63.372093</td>\n",
       "      <td>63.372093</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>291</td>\n",
       "      <td>0.785714</td>\n",
       "      <td>...</td>\n",
       "      <td>0.380000</td>\n",
       "      <td>0.184466</td>\n",
       "      <td>0.248366</td>\n",
       "      <td>0.785714</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.647059</td>\n",
       "      <td>0.380000</td>\n",
       "      <td>0.184466</td>\n",
       "      <td>0.248366</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>6</td>\n",
       "      <td>63.662791</td>\n",
       "      <td>63.662791</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>291</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.455224</td>\n",
       "      <td>0.592233</td>\n",
       "      <td>0.514768</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.620690</td>\n",
       "      <td>0.453846</td>\n",
       "      <td>0.572816</td>\n",
       "      <td>0.506438</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>5</td>\n",
       "      <td>63.662791</td>\n",
       "      <td>63.662791</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>291</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.457143</td>\n",
       "      <td>0.621359</td>\n",
       "      <td>0.526749</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.620690</td>\n",
       "      <td>0.455224</td>\n",
       "      <td>0.592233</td>\n",
       "      <td>0.514768</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>4</td>\n",
       "      <td>63.662791</td>\n",
       "      <td>63.662791</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>291</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.457746</td>\n",
       "      <td>0.631068</td>\n",
       "      <td>0.530612</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.620690</td>\n",
       "      <td>0.457143</td>\n",
       "      <td>0.621359</td>\n",
       "      <td>0.526749</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>3</td>\n",
       "      <td>63.662791</td>\n",
       "      <td>63.662791</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>291</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.458904</td>\n",
       "      <td>0.650485</td>\n",
       "      <td>0.538153</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.620690</td>\n",
       "      <td>0.457746</td>\n",
       "      <td>0.631068</td>\n",
       "      <td>0.530612</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>8</td>\n",
       "      <td>63.372093</td>\n",
       "      <td>63.081395</td>\n",
       "      <td>-0.290698</td>\n",
       "      <td>291</td>\n",
       "      <td>0.909091</td>\n",
       "      <td>...</td>\n",
       "      <td>0.448819</td>\n",
       "      <td>0.553398</td>\n",
       "      <td>0.495652</td>\n",
       "      <td>0.909091</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.645161</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.543689</td>\n",
       "      <td>0.489083</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>7</td>\n",
       "      <td>63.662791</td>\n",
       "      <td>63.372093</td>\n",
       "      <td>-0.290698</td>\n",
       "      <td>291</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.453846</td>\n",
       "      <td>0.572816</td>\n",
       "      <td>0.506438</td>\n",
       "      <td>0.909091</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.645161</td>\n",
       "      <td>0.448819</td>\n",
       "      <td>0.553398</td>\n",
       "      <td>0.495652</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>6</td>\n",
       "      <td>62.790698</td>\n",
       "      <td>62.500000</td>\n",
       "      <td>-0.290698</td>\n",
       "      <td>291</td>\n",
       "      <td>0.846154</td>\n",
       "      <td>...</td>\n",
       "      <td>0.363636</td>\n",
       "      <td>0.194175</td>\n",
       "      <td>0.253165</td>\n",
       "      <td>0.846154</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.351852</td>\n",
       "      <td>0.184466</td>\n",
       "      <td>0.242038</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>5</td>\n",
       "      <td>63.081395</td>\n",
       "      <td>62.790698</td>\n",
       "      <td>-0.290698</td>\n",
       "      <td>291</td>\n",
       "      <td>0.846154</td>\n",
       "      <td>...</td>\n",
       "      <td>0.375000</td>\n",
       "      <td>0.203883</td>\n",
       "      <td>0.264151</td>\n",
       "      <td>0.846154</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.363636</td>\n",
       "      <td>0.194175</td>\n",
       "      <td>0.253165</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>2</td>\n",
       "      <td>63.081395</td>\n",
       "      <td>62.790698</td>\n",
       "      <td>-0.290698</td>\n",
       "      <td>291</td>\n",
       "      <td>0.900000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.252427</td>\n",
       "      <td>0.309524</td>\n",
       "      <td>0.900000</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.387097</td>\n",
       "      <td>0.233010</td>\n",
       "      <td>0.290909</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>9</td>\n",
       "      <td>60.755814</td>\n",
       "      <td>60.465116</td>\n",
       "      <td>-0.290698</td>\n",
       "      <td>291</td>\n",
       "      <td>0.394737</td>\n",
       "      <td>...</td>\n",
       "      <td>0.368421</td>\n",
       "      <td>0.067961</td>\n",
       "      <td>0.114754</td>\n",
       "      <td>0.365854</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.491803</td>\n",
       "      <td>0.388889</td>\n",
       "      <td>0.067961</td>\n",
       "      <td>0.115702</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>10</td>\n",
       "      <td>63.081395</td>\n",
       "      <td>62.790698</td>\n",
       "      <td>-0.290698</td>\n",
       "      <td>291</td>\n",
       "      <td>0.909091</td>\n",
       "      <td>...</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.543689</td>\n",
       "      <td>0.489083</td>\n",
       "      <td>0.900000</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.543689</td>\n",
       "      <td>0.489083</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>7</td>\n",
       "      <td>62.209302</td>\n",
       "      <td>61.627907</td>\n",
       "      <td>-0.581395</td>\n",
       "      <td>291</td>\n",
       "      <td>0.428571</td>\n",
       "      <td>...</td>\n",
       "      <td>0.428571</td>\n",
       "      <td>0.087379</td>\n",
       "      <td>0.145161</td>\n",
       "      <td>0.416667</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.535714</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.077670</td>\n",
       "      <td>0.130081</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>10</td>\n",
       "      <td>60.465116</td>\n",
       "      <td>59.883721</td>\n",
       "      <td>-0.581395</td>\n",
       "      <td>291</td>\n",
       "      <td>0.365854</td>\n",
       "      <td>...</td>\n",
       "      <td>0.388889</td>\n",
       "      <td>0.067961</td>\n",
       "      <td>0.115702</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.461538</td>\n",
       "      <td>0.411765</td>\n",
       "      <td>0.067961</td>\n",
       "      <td>0.116667</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>9</td>\n",
       "      <td>65.116279</td>\n",
       "      <td>64.534884</td>\n",
       "      <td>-0.581395</td>\n",
       "      <td>291</td>\n",
       "      <td>0.785714</td>\n",
       "      <td>...</td>\n",
       "      <td>0.442308</td>\n",
       "      <td>0.223301</td>\n",
       "      <td>0.296774</td>\n",
       "      <td>0.785714</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.647059</td>\n",
       "      <td>0.416667</td>\n",
       "      <td>0.194175</td>\n",
       "      <td>0.264901</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>4</td>\n",
       "      <td>67.732558</td>\n",
       "      <td>67.151163</td>\n",
       "      <td>-0.581395</td>\n",
       "      <td>291</td>\n",
       "      <td>0.909091</td>\n",
       "      <td>...</td>\n",
       "      <td>0.511905</td>\n",
       "      <td>0.417476</td>\n",
       "      <td>0.459893</td>\n",
       "      <td>0.916667</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.687500</td>\n",
       "      <td>0.492958</td>\n",
       "      <td>0.339806</td>\n",
       "      <td>0.402299</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>7</td>\n",
       "      <td>66.569767</td>\n",
       "      <td>65.988372</td>\n",
       "      <td>-0.581395</td>\n",
       "      <td>291</td>\n",
       "      <td>0.916667</td>\n",
       "      <td>...</td>\n",
       "      <td>0.475410</td>\n",
       "      <td>0.281553</td>\n",
       "      <td>0.353659</td>\n",
       "      <td>0.846154</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.464286</td>\n",
       "      <td>0.252427</td>\n",
       "      <td>0.327044</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>8</td>\n",
       "      <td>65.988372</td>\n",
       "      <td>65.116279</td>\n",
       "      <td>-0.872093</td>\n",
       "      <td>291</td>\n",
       "      <td>0.846154</td>\n",
       "      <td>...</td>\n",
       "      <td>0.464286</td>\n",
       "      <td>0.252427</td>\n",
       "      <td>0.327044</td>\n",
       "      <td>0.785714</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.647059</td>\n",
       "      <td>0.442308</td>\n",
       "      <td>0.223301</td>\n",
       "      <td>0.296774</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>2</td>\n",
       "      <td>65.697674</td>\n",
       "      <td>64.825581</td>\n",
       "      <td>-0.872093</td>\n",
       "      <td>291</td>\n",
       "      <td>0.833333</td>\n",
       "      <td>...</td>\n",
       "      <td>0.450000</td>\n",
       "      <td>0.174757</td>\n",
       "      <td>0.251748</td>\n",
       "      <td>0.750000</td>\n",
       "      <td>0.60</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.393939</td>\n",
       "      <td>0.126214</td>\n",
       "      <td>0.191176</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>8</td>\n",
       "      <td>61.627907</td>\n",
       "      <td>60.755814</td>\n",
       "      <td>-0.872093</td>\n",
       "      <td>291</td>\n",
       "      <td>0.416667</td>\n",
       "      <td>...</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.077670</td>\n",
       "      <td>0.130081</td>\n",
       "      <td>0.394737</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.517241</td>\n",
       "      <td>0.368421</td>\n",
       "      <td>0.067961</td>\n",
       "      <td>0.114754</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>6</td>\n",
       "      <td>63.372093</td>\n",
       "      <td>62.209302</td>\n",
       "      <td>-1.162791</td>\n",
       "      <td>291</td>\n",
       "      <td>0.468750</td>\n",
       "      <td>...</td>\n",
       "      <td>0.454545</td>\n",
       "      <td>0.097087</td>\n",
       "      <td>0.160000</td>\n",
       "      <td>0.428571</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.545455</td>\n",
       "      <td>0.428571</td>\n",
       "      <td>0.087379</td>\n",
       "      <td>0.145161</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>6</td>\n",
       "      <td>67.732558</td>\n",
       "      <td>66.569767</td>\n",
       "      <td>-1.162791</td>\n",
       "      <td>291</td>\n",
       "      <td>0.916667</td>\n",
       "      <td>...</td>\n",
       "      <td>0.507937</td>\n",
       "      <td>0.310680</td>\n",
       "      <td>0.385542</td>\n",
       "      <td>0.916667</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.687500</td>\n",
       "      <td>0.475410</td>\n",
       "      <td>0.281553</td>\n",
       "      <td>0.353659</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>complex</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>10</td>\n",
       "      <td>64.534884</td>\n",
       "      <td>63.081395</td>\n",
       "      <td>-1.453488</td>\n",
       "      <td>291</td>\n",
       "      <td>0.785714</td>\n",
       "      <td>...</td>\n",
       "      <td>0.416667</td>\n",
       "      <td>0.194175</td>\n",
       "      <td>0.264901</td>\n",
       "      <td>0.687500</td>\n",
       "      <td>0.55</td>\n",
       "      <td>0.611111</td>\n",
       "      <td>0.377778</td>\n",
       "      <td>0.165049</td>\n",
       "      <td>0.229730</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>transe</td>\n",
       "      <td>19k_childPar</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>5</td>\n",
       "      <td>65.406977</td>\n",
       "      <td>63.372093</td>\n",
       "      <td>-2.034884</td>\n",
       "      <td>291</td>\n",
       "      <td>0.652174</td>\n",
       "      <td>...</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.116505</td>\n",
       "      <td>0.184615</td>\n",
       "      <td>0.468750</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.576923</td>\n",
       "      <td>0.454545</td>\n",
       "      <td>0.097087</td>\n",
       "      <td>0.160000</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>40 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Embedding         Basis  Weight  Weightedness  Iteration Num    Old Acc  \\\n",
       "0    complex  19k_childPar       1          True              1  60.755814   \n",
       "20    transe  19k_childPar       1          True              1  62.500000   \n",
       "1    complex  19k_childPar       1          True              2  64.244186   \n",
       "11   complex  19k_childPar       2          True              2  61.918605   \n",
       "10   complex  19k_childPar       2          True              1  60.755814   \n",
       "2    complex  19k_childPar       1          True              3  67.151163   \n",
       "4    complex  19k_childPar       1          True              5  67.151163   \n",
       "36    transe  19k_childPar       2          True              7  62.500000   \n",
       "30    transe  19k_childPar       2          True              1  62.500000   \n",
       "22    transe  19k_childPar       1          True              3  64.825581   \n",
       "38    transe  19k_childPar       2          True              9  63.081395   \n",
       "33    transe  19k_childPar       2          True              4  62.790698   \n",
       "18   complex  19k_childPar       2          True              9  63.081395   \n",
       "37    transe  19k_childPar       2          True              8  63.081395   \n",
       "32    transe  19k_childPar       2          True              3  62.790698   \n",
       "23    transe  19k_childPar       1          True              4  65.406977   \n",
       "39    transe  19k_childPar       2          True             10  63.372093   \n",
       "15   complex  19k_childPar       2          True              6  63.662791   \n",
       "14   complex  19k_childPar       2          True              5  63.662791   \n",
       "13   complex  19k_childPar       2          True              4  63.662791   \n",
       "12   complex  19k_childPar       2          True              3  63.662791   \n",
       "17   complex  19k_childPar       2          True              8  63.372093   \n",
       "16   complex  19k_childPar       2          True              7  63.662791   \n",
       "35    transe  19k_childPar       2          True              6  62.790698   \n",
       "34    transe  19k_childPar       2          True              5  63.081395   \n",
       "31    transe  19k_childPar       2          True              2  63.081395   \n",
       "28    transe  19k_childPar       1          True              9  60.755814   \n",
       "19   complex  19k_childPar       2          True             10  63.081395   \n",
       "26    transe  19k_childPar       1          True              7  62.209302   \n",
       "29    transe  19k_childPar       1          True             10  60.465116   \n",
       "8    complex  19k_childPar       1          True              9  65.116279   \n",
       "3    complex  19k_childPar       1          True              4  67.732558   \n",
       "6    complex  19k_childPar       1          True              7  66.569767   \n",
       "7    complex  19k_childPar       1          True              8  65.988372   \n",
       "21    transe  19k_childPar       1          True              2  65.697674   \n",
       "27    transe  19k_childPar       1          True              8  61.627907   \n",
       "25    transe  19k_childPar       1          True              6  63.372093   \n",
       "5    complex  19k_childPar       1          True              6  67.732558   \n",
       "9    complex  19k_childPar       1          True             10  64.534884   \n",
       "24    transe  19k_childPar       1          True              5  65.406977   \n",
       "\n",
       "      New Acc  Increase  Pairs Covered  Old I Precision  ...  Old U Precision  \\\n",
       "0   64.244186  3.488372            291         1.000000  ...         0.433121   \n",
       "20  65.697674  3.197674            291         0.888889  ...         0.397059   \n",
       "1   67.151163  2.906977            291         1.000000  ...         0.463415   \n",
       "11  63.662791  1.744186            291         1.000000  ...         0.444444   \n",
       "10  61.918605  1.162791            291         1.000000  ...         0.433121   \n",
       "2   67.732558  0.581395            291         1.000000  ...         0.495050   \n",
       "4   67.732558  0.581395            291         0.916667  ...         0.492958   \n",
       "36  63.081395  0.581395            291         0.846154  ...         0.351852   \n",
       "30  63.081395  0.581395            291         0.888889  ...         0.397059   \n",
       "22  65.406977  0.581395            291         0.750000  ...         0.393939   \n",
       "38  63.372093  0.290698            291         0.785714  ...         0.372549   \n",
       "33  63.081395  0.290698            291         0.900000  ...         0.383333   \n",
       "18  63.081395  0.000000            291         0.909091  ...         0.444444   \n",
       "37  63.081395  0.000000            291         0.846154  ...         0.365385   \n",
       "32  62.790698  0.000000            291         0.900000  ...         0.387097   \n",
       "23  65.406977  0.000000            291         0.750000  ...         0.400000   \n",
       "39  63.372093  0.000000            291         0.785714  ...         0.380000   \n",
       "15  63.662791  0.000000            291         1.000000  ...         0.455224   \n",
       "14  63.662791  0.000000            291         1.000000  ...         0.457143   \n",
       "13  63.662791  0.000000            291         1.000000  ...         0.457746   \n",
       "12  63.662791  0.000000            291         1.000000  ...         0.458904   \n",
       "17  63.081395 -0.290698            291         0.909091  ...         0.448819   \n",
       "16  63.372093 -0.290698            291         1.000000  ...         0.453846   \n",
       "35  62.500000 -0.290698            291         0.846154  ...         0.363636   \n",
       "34  62.790698 -0.290698            291         0.846154  ...         0.375000   \n",
       "31  62.790698 -0.290698            291         0.900000  ...         0.400000   \n",
       "28  60.465116 -0.290698            291         0.394737  ...         0.368421   \n",
       "19  62.790698 -0.290698            291         0.909091  ...         0.444444   \n",
       "26  61.627907 -0.581395            291         0.428571  ...         0.428571   \n",
       "29  59.883721 -0.581395            291         0.365854  ...         0.388889   \n",
       "8   64.534884 -0.581395            291         0.785714  ...         0.442308   \n",
       "3   67.151163 -0.581395            291         0.909091  ...         0.511905   \n",
       "6   65.988372 -0.581395            291         0.916667  ...         0.475410   \n",
       "7   65.116279 -0.872093            291         0.846154  ...         0.464286   \n",
       "21  64.825581 -0.872093            291         0.833333  ...         0.450000   \n",
       "27  60.755814 -0.872093            291         0.416667  ...         0.400000   \n",
       "25  62.209302 -1.162791            291         0.468750  ...         0.454545   \n",
       "5   66.569767 -1.162791            291         0.916667  ...         0.507937   \n",
       "9   63.081395 -1.453488            291         0.785714  ...         0.416667   \n",
       "24  63.372093 -2.034884            291         0.652174  ...         0.444444   \n",
       "\n",
       "    Old U Recall  Old U F1-Score  New I Precision  New I Recall  \\\n",
       "0       0.660194        0.523077         1.000000          0.40   \n",
       "20      0.262136        0.315789         0.833333          0.50   \n",
       "1       0.553398        0.504425         1.000000          0.50   \n",
       "11      0.660194        0.531250         1.000000          0.45   \n",
       "10      0.660194        0.523077         1.000000          0.40   \n",
       "2       0.485437        0.490196         0.909091          0.50   \n",
       "4       0.339806        0.402299         0.916667          0.55   \n",
       "36      0.184466        0.242038         0.846154          0.55   \n",
       "30      0.262136        0.315789         0.900000          0.45   \n",
       "22      0.126214        0.191176         0.750000          0.75   \n",
       "38      0.184466        0.246753         0.785714          0.55   \n",
       "33      0.223301        0.282209         0.846154          0.55   \n",
       "18      0.543689        0.489083         0.909091          0.50   \n",
       "37      0.184466        0.245161         0.785714          0.55   \n",
       "32      0.233010        0.290909         0.900000          0.45   \n",
       "23      0.116505        0.180451         0.652174          0.75   \n",
       "39      0.184466        0.248366         0.785714          0.55   \n",
       "15      0.592233        0.514768         1.000000          0.45   \n",
       "14      0.621359        0.526749         1.000000          0.45   \n",
       "13      0.631068        0.530612         1.000000          0.45   \n",
       "12      0.650485        0.538153         1.000000          0.45   \n",
       "17      0.553398        0.495652         0.909091          0.50   \n",
       "16      0.572816        0.506438         0.909091          0.50   \n",
       "35      0.194175        0.253165         0.846154          0.55   \n",
       "34      0.203883        0.264151         0.846154          0.55   \n",
       "31      0.252427        0.309524         0.900000          0.45   \n",
       "28      0.067961        0.114754         0.365854          0.75   \n",
       "19      0.543689        0.489083         0.900000          0.45   \n",
       "26      0.087379        0.145161         0.416667          0.75   \n",
       "29      0.067961        0.115702         0.333333          0.75   \n",
       "8       0.223301        0.296774         0.785714          0.55   \n",
       "3       0.417476        0.459893         0.916667          0.55   \n",
       "6       0.281553        0.353659         0.846154          0.55   \n",
       "7       0.252427        0.327044         0.785714          0.55   \n",
       "21      0.174757        0.251748         0.750000          0.60   \n",
       "27      0.077670        0.130081         0.394737          0.75   \n",
       "25      0.097087        0.160000         0.428571          0.75   \n",
       "5       0.310680        0.385542         0.916667          0.55   \n",
       "9       0.194175        0.264901         0.687500          0.55   \n",
       "24      0.116505        0.184615         0.468750          0.75   \n",
       "\n",
       "    New I F1-Score  New U Precision  New U Recall  New U F1-Score  Rank  \n",
       "0         0.571429         0.463415      0.553398        0.504425     2  \n",
       "20        0.625000         0.450000      0.174757        0.251748     0  \n",
       "1         0.666667         0.495050      0.485437        0.490196     4  \n",
       "11        0.620690         0.458904      0.650485        0.538153    13  \n",
       "10        0.571429         0.444444      0.660194        0.531250    12  \n",
       "2         0.645161         0.511905      0.417476        0.459893     1  \n",
       "4         0.687500         0.507937      0.310680        0.385542     5  \n",
       "36        0.666667         0.365385      0.184466        0.245161     6  \n",
       "30        0.600000         0.400000      0.252427        0.309524    17  \n",
       "22        0.750000         0.400000      0.116505        0.180451     3  \n",
       "38        0.647059         0.380000      0.184466        0.248366     8  \n",
       "33        0.666667         0.375000      0.203883        0.264151    14  \n",
       "18        0.645161         0.444444      0.543689        0.489083    19  \n",
       "37        0.647059         0.372549      0.184466        0.246753     7  \n",
       "32        0.600000         0.383333      0.223301        0.282209    12  \n",
       "23        0.697674         0.444444      0.116505        0.184615     1  \n",
       "39        0.647059         0.380000      0.184466        0.248366     9  \n",
       "15        0.620690         0.453846      0.572816        0.506438    17  \n",
       "14        0.620690         0.455224      0.592233        0.514768    16  \n",
       "13        0.620690         0.457143      0.621359        0.526749    15  \n",
       "12        0.620690         0.457746      0.631068        0.530612    14  \n",
       "17        0.645161         0.444444      0.543689        0.489083    18  \n",
       "16        0.645161         0.448819      0.553398        0.495652     9  \n",
       "35        0.666667         0.351852      0.184466        0.242038     5  \n",
       "34        0.666667         0.363636      0.194175        0.253165    15  \n",
       "31        0.600000         0.387097      0.233010        0.290909    11  \n",
       "28        0.491803         0.388889      0.067961        0.115702    13  \n",
       "19        0.600000         0.444444      0.543689        0.489083    10  \n",
       "26        0.535714         0.400000      0.077670        0.130081     4  \n",
       "29        0.461538         0.411765      0.067961        0.116667    16  \n",
       "8         0.647059         0.416667      0.194175        0.264901     0  \n",
       "3         0.687500         0.492958      0.339806        0.402299     3  \n",
       "6         0.666667         0.464286      0.252427        0.327044     7  \n",
       "7         0.647059         0.442308      0.223301        0.296774     8  \n",
       "21        0.666667         0.393939      0.126214        0.191176     2  \n",
       "27        0.517241         0.368421      0.067961        0.114754    10  \n",
       "25        0.545455         0.428571      0.087379        0.145161    19  \n",
       "5         0.687500         0.475410      0.281553        0.353659     6  \n",
       "9         0.611111         0.377778      0.165049        0.229730    11  \n",
       "24        0.576923         0.454545      0.097087        0.160000    18  \n",
       "\n",
       "[40 rows x 22 columns]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "resultsDF.sort_values(by=['Increase'], ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "utility-globe",
   "metadata": {},
   "outputs": [],
   "source": [
    "resultsDF.to_csv('../data/retrofitting/masterRetro_Aug20_2021.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "crazy-scene",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "kgtkEnv2",
   "language": "python",
   "name": "kgtkenv2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "288px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}