{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from transformers import BertTokenizerFast, BertModel, BertConfig, BertTokenizer\n", "import numpy as np\n", "import pandas as pd\n", "from tqdm.notebook import tqdm as tqdm\n", "import glob\n", "import os\n", "from sklearn.decomposition import PCA\n", "import time\n", "import plotly.express as px\n", "from sklearn.manifold import TSNE\n", "from scipy.spatial.distance import cdist, cosine\n", "from gpytorch.kernels.rq_kernel import RQKernel\n", "import torch.nn as nn\n", "from torch.nn import functional as F\n", "import torch.nn.functional as F\n", "from torch.utils.data import Dataset, DataLoader" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### For computing representation on fine-tuned model" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "class BertClassifier(torch.nn.Module):\n", " \n", " def __init__(self, config, model, dim=256, num_classes=2):\n", " super(BertClassifier, self).__init__()\n", " \n", " # create the model config and BERT initialize the pretrained BERT, also layers wise outputs\n", " self.config = config\n", " self.base = model\n", " \n", " # classifier head [not useful]\n", " self.head = torch.nn.Sequential(*[\n", " torch.nn.Dropout(p=self.config.hidden_dropout_prob),\n", " torch.nn.Linear(in_features=self.config.hidden_size, out_features=dim),\n", " torch.nn.ReLU(),\n", " torch.nn.Dropout(p=self.config.hidden_dropout_prob),\n", " torch.nn.Linear(in_features=dim, out_features=num_classes)\n", " ])\n", " \n", " def forward(self, input_ids, attention_mask=None):\n", " \n", " # first output is top layer output, second output is context of input seq and third output will be layerwise tokens \n", " top_layer, pooled, layers = self.base(input_ids, attention_mask)\n", " outputs = self.head(pooled)\n", " return top_layer, outputs, layers" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "class SentimentDataset(Dataset):\n", " def __init__(self, df, tokenizer, max_len=512):\n", " self.tokenizer = tokenizer\n", " self.text = df.review_text.values\n", " self.max_len = max_len\n", " \n", " def __len__(self):\n", " return len(self.text)\n", " \n", " def __getitem__(self, idx):\n", " text = self.text[idx]\n", " \n", " # encode the text and target into tensors return the attention masks as well\n", " encoding = self.tokenizer.encode_plus(\n", " text=text,\n", " add_special_tokens=True,\n", " max_length=self.max_len,\n", " return_token_type_ids=False,\n", " pad_to_max_length=True,\n", " return_attention_mask=True,\n", " return_tensors='pt',\n", " truncation=True\n", " )\n", " \n", " return {\n", " 'text': text,\n", " 'input_ids': encoding['input_ids'].flatten(),\n", " 'attention_mask': encoding['attention_mask'].flatten(),\n", " }\n", " " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def model_predict(trained):\n", " dictionary_list = []\n", "\n", " df = pd.read_csv(\"./amazon-review/dvd-UL.csv\")\n", " df = df.sample(n=1000, random_state=42) #number of samples\n", " df = df.reset_index(drop=True)\n", "\n", " dataset = SentimentDataset(df=df, tokenizer=tokenizer)\n", "\n", " data_loader = torch.utils.data.DataLoader(\n", " dataset=dataset,\n", " batch_size= 4,\n", " shuffle=False,\n", " num_workers=8\n", " )\n", "\n", " for bi, d in enumerate(tqdm(data_loader)):\n", " input_ids = d[\"input_ids\"]\n", " 
attention_mask = d[\"attention_mask\"]\n", "\n", " _, _, output = classifier_trained(input_ids, attention_mask)\n", "\n", " output = output[1:]\n", "\n", " for zeta in range(len(output[0])):\n", " for i in range(0,12):\n", " new_row = {'embeddings':output[i][zeta][0].cpu().detach().numpy(), 'layers': i+1}\n", " dictionary_list.append(new_row)\n", "\n", " dictionary_list = np.save(f\"./data/batch_{bi}\", dictionary_list, allow_pickle=True)\n", " dictionary_list = []" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "if(os.path.exists(\"./data\")):\n", " files = glob.glob('./data/*')\n", " for f in files:\n", " os.remove(f)\n", "else:\n", " os.makedirs(\"./data\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4c803dfea528440cabcc5d47873daaf5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "PATH = \"books\"+\".pt\" #change the model name here\n", "\n", "model_name = \"bert-base-uncased\"\n", "config = BertConfig.from_pretrained(model_name, output_hidden_states=True)\n", "tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)\n", "bert = BertModel.from_pretrained(model_name, config=config)\n", "\n", "classifier_trained = BertClassifier(config=config, model=bert, num_classes=2)\n", "classifier_trained.load_state_dict(torch.load(PATH))\n", "\n", "model_predict(classifier_trained)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "dictionary_list = []\n", "\n", "files = glob.glob(\"./data/*.npy\")\n", "\n", "for j in range(len(files)):\n", " alpha = np.load(f\"./data/batch_{j}.npy\", allow_pickle = True)\n", " for i in range(len(alpha)):\n", " new_row = {'embeddings':alpha[i][\"embeddings\"], 'layers': alpha[i][\"layers\"]}\n", " dictionary_list.append(new_row)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
embeddingslayers
0[0.029436039, 0.06670721, -0.22471415, -0.2367...1
1[-0.1554519, -0.21112284, -0.3408423, -0.20209...2
2[-0.12095504, -0.36359823, -0.17967358, -0.109...3
3[-0.21423775, -0.7461651, -0.6160757, -0.30794...4
4[-0.4974339, -0.85912985, -0.42627215, -0.5099...5
.........
11995[0.51743513, -0.6500383, -0.68353117, -0.22525...8
11996[0.42627597, -0.63389504, -0.19636014, -0.2719...9
11997[0.025212316, -0.5110682, 0.48476753, -0.35641...10
11998[0.04342799, -0.75802934, 0.5390331, -0.213192...11
11999[-0.29624316, -0.9558969, 0.48933977, -0.35488...12
\n", "

12000 rows × 2 columns

\n", "
" ], "text/plain": [ " embeddings layers\n", "0 [0.029436039, 0.06670721, -0.22471415, -0.2367... 1\n", "1 [-0.1554519, -0.21112284, -0.3408423, -0.20209... 2\n", "2 [-0.12095504, -0.36359823, -0.17967358, -0.109... 3\n", "3 [-0.21423775, -0.7461651, -0.6160757, -0.30794... 4\n", "4 [-0.4974339, -0.85912985, -0.42627215, -0.5099... 5\n", "... ... ...\n", "11995 [0.51743513, -0.6500383, -0.68353117, -0.22525... 8\n", "11996 [0.42627597, -0.63389504, -0.19636014, -0.2719... 9\n", "11997 [0.025212316, -0.5110682, 0.48476753, -0.35641... 10\n", "11998 [0.04342799, -0.75802934, 0.5390331, -0.213192... 11\n", "11999 [-0.29624316, -0.9558969, 0.48933977, -0.35488... 12\n", "\n", "[12000 rows x 2 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame.from_dict(dictionary_list)\n", "df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "df.to_pickle(\"./dvd-ft-b.pkl\") #naming convention dvd-ft-b (dvd dataset on books finetuned)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### For computing representation on pre-trained model" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_lower_case=True)\n", "config = BertConfig.from_pretrained(\"bert-base-uncased\", output_hidden_states=True)\n", "model = BertModel.from_pretrained('bert-base-uncased', config=config)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "if(os.path.exists(\"./data\")):\n", " files = glob.glob('./data/*')\n", " for f in files:\n", " os.remove(f)\n", "else:\n", " os.makedirs(\"./data\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "files = \"./amazon-review/dvd-UL.csv\"\n", "dataset = pd.read_csv(files)\n", "dataset = dataset.sample(n=1000, random_state=42)\n", "dataset = dataset.reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ee9b36da10b14cc684d7f80b5a821d70", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "dictionary_list = []\n", "count = 0\n", "\n", "for i in tqdm(range(len(dataset))):\n", " encoding = tokenizer(dataset[\"review_text\"][i], return_tensors='pt', padding='max_length', truncation=True, max_length=180)\n", " input_ids = encoding['input_ids']\n", " attention_mask = encoding['attention_mask']\n", " outputs = model(input_ids, attention_mask)\n", " hidden_states = outputs[2][1:]\n", "\n", " batch_text = []\n", "\n", " for j in range(0,12):\n", " new_row = {'embeddings':hidden_states[j][0][0].cpu().detach().numpy(), 'layers': j+1}\n", " dictionary_list.append(new_row)\n", "\n", " dictionary_list = np.save(f\"./data/batch_{count}\", dictionary_list, allow_pickle=True)\n", " dictionary_list = []\n", " count += 1" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "dictionary_list = []\n", "\n", "files = glob.glob(\"./data/*.npy\")\n", "\n", "for j in range(len(files)):\n", " alpha = np.load(f\"./data/batch_{j}.npy\", allow_pickle = True)\n", " for i in range(len(alpha)):\n", " new_row = 
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
 "dictionary_list = []\n",
 "\n",
 "files = glob.glob(\"./data/*.npy\")\n",
 "\n",
 "# same loading pattern as for the fine-tuned model above\n",
 "for j in range(len(files)):\n",
 "    alpha = np.load(f\"./data/batch_{j}.npy\", allow_pickle=True)\n",
 "    for i in range(len(alpha)):\n",
 "        new_row = {'embeddings': alpha[i][\"embeddings\"], 'layers': alpha[i][\"layers\"]}\n",
 "        dictionary_list.append(new_row)"
] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [
 "df = pd.DataFrame.from_dict(dictionary_list)\n",
 "df.to_pickle(\"./dvd-pt.pkl\")  # naming convention: dvd-pt = DVD dataset on pre-trained model"
] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "df.layers.unique()"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Visualisation for sanity check" ] },
{ "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [
 "# load the fine-tuned representations (run this cell or the next one, not both)\n",
 "df = pd.read_pickle(\"./books-ft-b.pkl\")"
] },
{ "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [
 "# alternatively, load the pre-trained representations\n",
 "df = pd.read_pickle(\"./books-pt.pkl\")"
] },
{ "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.5051431655883789\n" ] } ], "source": [
 "# stack the embedding vectors into a 2-D array (np.matrix is deprecated)\n",
 "mat = np.stack(df.embeddings.values)\n",
 "start = time.time()\n",
 "pca = PCA(n_components=3, random_state=42)\n",
 "components = pca.fit_transform(mat)\n",
 "print(time.time() - start)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "fig = px.scatter_3d(\n",
 "    components, x=0, y=1, z=2, color=df['layers'],\n",
 "    title='Domain representation plotting PCA',\n",
 "    labels={'color': 'layers'}\n",
 ")\n",
 "fig.update_layout(\n",
 "    autosize=False,\n",
 "    width=1080,\n",
 "    height=720,\n",
 ")\n",
 "\n",
 "fig.show()"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# save the figure created above (run after the plotting cell)\n",
 "fig.write_html(\"3D_PCA_FT.html\")"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Load pickles and calculate RSA" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [
 "df = pd.read_pickle(\"./dvd-pt.pkl\")\n",
 "df1 = pd.read_pickle(\"./dvd-ft-b.pkl\")"
] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [
 "def mmd(first_tensor: torch.Tensor, second_tensor: torch.Tensor, kernel) -> float:\n",
 "    \"\"\"Biased estimate of the squared MMD between two samples under a kernel.\n",
 "    Parameters\n",
 "    ----------\n",
 "    first_tensor : torch.Tensor\n",
 "        Tensor of shape (batch_size, m), where m is the dimension of the first sample\n",
 "    second_tensor : torch.Tensor\n",
 "        Tensor of shape (batch_size, n), where n is the dimension of the second sample\n",
 "    Returns\n",
 "    -------\n",
 "    float\n",
 "        MMD between the two samples\n",
 "    \"\"\"\n",
 "    first_tensor_correlation = kernel(first_tensor, first_tensor).evaluate()\n",
 "    second_tensor_correlation = kernel(second_tensor, second_tensor).evaluate()\n",
 "    first_second_tensor_correlation = kernel(first_tensor, second_tensor).evaluate()\n",
 "    m = first_tensor.size(0)\n",
 "    n = second_tensor.size(0)\n",
 "    sum_first_corr = first_tensor_correlation.sum().item()\n",
 "    sum_second_corr = second_tensor_correlation.sum().item()\n",
 "    sum_first_second_corr = first_second_tensor_correlation.sum().item()\n",
 "    first_term = (1 / (m ** 2)) * sum_first_corr\n",
 "    second_term = (1 / (n ** 2)) * sum_second_corr\n",
 "    third_term = (2 / (m * n)) * sum_first_second_corr\n",
 "    divergence = first_term + second_term - third_term\n",
 "    return divergence"
] },
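{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick usage sketch for `mmd`, added for illustration with synthetic tensors (the +1.0 shift is arbitrary): two identical samples give exactly zero, a shifted sample gives a clearly larger value." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Illustrative mmd() usage on synthetic data (values are not from the experiments).\n",
 "torch.manual_seed(0)\n",
 "x = torch.randn(100, 8)\n",
 "y = torch.randn(100, 8) + 1.0\n",
 "k = RQKernel()\n",
 "print(mmd(x, x, k))  # 0: identical samples\n",
 "print(mmd(x, y, k))  # > 0: shifted sample"
] },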
torch.Tensor\n", " Tensor of shape batch_size, n\n", " n is the dimension of the target tensor \n", " Returns\n", " -------\n", " float\n", " MMD between two samples\n", " \"\"\"\n", " first_tensor_correlation = kernel(first_tensor, first_tensor).evaluate()\n", " second_tensor_correlation = kernel(second_tensor, second_tensor).evaluate()\n", " first_second_tensor_correlation = kernel(first_tensor, second_tensor).evaluate()\n", " m = first_tensor.size(0)\n", " n = second_tensor.size(0)\n", " sum_first_corr = first_tensor_correlation.sum().item()\n", " sum_second_corr = second_tensor_correlation.sum().item()\n", " sum_first_second_corr = first_second_tensor_correlation.sum().item()\n", " first_term = (1 / (m ** 2)) * sum_first_corr\n", " second_term = (1 / (n ** 2)) * sum_second_corr\n", " third_term = (2 / (m * n)) * sum_first_second_corr\n", " divergence = first_term + second_term - third_term\n", " return divergence" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "def closest_rows(a):\n", " # Get euclidean distances as 2D array\n", " dists = cdist(a, a, 'cosine')\n", "\n", " # Fill diagonals with something greater than all elements as we intend\n", " # to get argmin indices later on and then index into input array with those\n", " # indices to get the closest rows\n", "# dists.ravel()[::dists.shape[1]+1] = dists.max()+1\n", " return dists" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "pretrained_books_dvd_RSA = {}\n", "pretrained_books_dvd_MMD = {} #for storing MMD between 2 models for each layer" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2bdf3a0541664dafb1e866182b8c35af", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "for i in tqdm(range(1,13)):\n", " temp1 = df[df[\"layers\"]==i]\n", " temp2 = df1[df1[\"layers\"]==i] #selecting a specific layer\n", " \n", " lis1 = temp1[\"embeddings\"].tolist()\n", " lis2 = temp2[\"embeddings\"].tolist()\n", " \n", " a = np.array(lis1)\n", " b = np.array(lis2)\n", " \n", " result1 = closest_rows(a)\n", " result2 = closest_rows(b)\n", " \n", " pretrained_books_dvd_RSA[i] = np.corrcoef(result1.flatten(), result2.flatten())[0,1]\n", "\n", " result1 = torch.from_numpy(result1)\n", " result2 = torch.from_numpy(result2)\n", " kernel = RQKernel()\n", " val = mmd(result1, result2, kernel)\n", " \n", " pretrained_books_dvd_MMD[i] = val" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{1: 0.955398855024296,\n", " 2: 0.9499259151146981,\n", " 3: 0.9346616069127789,\n", " 4: 0.9586915720754712,\n", " 5: 0.9568238275750086,\n", " 6: 0.9182314339773345,\n", " 7: 0.8438659922811926,\n", " 8: 0.8498103209022732,\n", " 9: 0.8472093680343032,\n", " 10: 0.5935006317709525,\n", " 11: 0.37255983521918024,\n", " 12: 0.15755181885614458}" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "RSA between bert representation and fine tuned bert (books) - DVD dataset" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{1: 0.9527077794274325,\n", " 2: 0.9477798799964473,\n", " 3: 0.9226848173618823,\n", " 4: 
{ "cell_type": "markdown", "metadata": {}, "source": [ "RSA between the pre-trained BERT representations and BERT fine-tuned on books, computed on the DVD dataset:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "{1: 0.955398855024296,\n",
 " 2: 0.9499259151146981,\n",
 " 3: 0.9346616069127789,\n",
 " 4: 0.9586915720754712,\n",
 " 5: 0.9568238275750086,\n",
 " 6: 0.9182314339773345,\n",
 " 7: 0.8438659922811926,\n",
 " 8: 0.8498103209022732,\n",
 " 9: 0.8472093680343032,\n",
 " 10: 0.5935006317709525,\n",
 " 11: 0.37255983521918024,\n",
 " 12: 0.15755181885614458}"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "RSA between the pre-trained BERT representations and BERT fine-tuned on books, computed on the books dataset:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "{1: 0.9527077794274325,\n",
 " 2: 0.9477798799964473,\n",
 " 3: 0.9226848173618823,\n",
 " 4: 0.9463699218593898,\n",
 " 5: 0.9197475682544565,\n",
 " 6: 0.8904403027804,\n",
 " 7: 0.8978173700287129,\n",
 " 8: 0.8799731485650455,\n",
 " 9: 0.8353729201603369,\n",
 " 10: 0.5456118638478861,\n",
 " 11: 0.33934223629028293,\n",
 " 12: 0.1056740137191123}"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "MMD between the pre-trained BERT representations and BERT fine-tuned on books, computed on the DVD dataset:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "{1: 0.025523517691246278,\n",
 " 2: 0.017031062632806515,\n",
 " 3: 0.034405121243115344,\n",
 " 4: 0.0743630493421803,\n",
 " 5: 0.0658974319876604,\n",
 " 6: 0.21873141347150904,\n",
 " 7: 0.23004184150353701,\n",
 " 8: 0.16440297895429068,\n",
 " 9: 0.11399606384908612,\n",
 " 10: 0.2680937669944564,\n",
 " 11: 0.31493303470159223,\n",
 " 12: 0.397320439848271}"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "MMD between the pre-trained BERT representations and BERT fine-tuned on books, computed on the books dataset:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "{1: 0.008592661067214902,\n",
 " 2: 0.008466682016583071,\n",
 " 3: 0.01607070255527243,\n",
 " 4: 0.03613209326039435,\n",
 " 5: 0.059290986788250954,\n",
 " 6: 0.0553308317517649,\n",
 " 7: 0.0686747228871778,\n",
 " 8: 0.060072450953288725,\n",
 " 9: 0.09658271662444762,\n",
 " 10: 0.2985128726786279,\n",
 " 11: 0.3239971706890749,\n",
 " 12: 0.4579824415026016}"
] }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.5" } }, "nbformat": 4, "nbformat_minor": 4 }