{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from transformers import BertTokenizerFast, BertModel, BertConfig, BertTokenizer\n", "import numpy as np\n", "import pandas as pd\n", "from tqdm.notebook import tqdm as tqdm\n", "import glob\n", "import os\n", "from sklearn.decomposition import PCA\n", "import time\n", "import plotly.express as px\n", "from sklearn.manifold import TSNE\n", "from scipy.spatial.distance import cdist, cosine\n", "from gpytorch.kernels.rq_kernel import RQKernel\n", "import torch.nn as nn\n", "from torch.nn import functional as F\n", "import torch.nn.functional as F\n", "from torch.utils.data import Dataset, DataLoader" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### For computing representation on fine-tuned model" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "class BertClassifier(torch.nn.Module):\n", " \n", " def __init__(self, config, model, dim=256, num_classes=2):\n", " super(BertClassifier, self).__init__()\n", " \n", " # create the model config and BERT initialize the pretrained BERT, also layers wise outputs\n", " self.config = config\n", " self.base = model\n", " \n", " # classifier head [not useful]\n", " self.head = torch.nn.Sequential(*[\n", " torch.nn.Dropout(p=self.config.hidden_dropout_prob),\n", " torch.nn.Linear(in_features=self.config.hidden_size, out_features=dim),\n", " torch.nn.ReLU(),\n", " torch.nn.Dropout(p=self.config.hidden_dropout_prob),\n", " torch.nn.Linear(in_features=dim, out_features=num_classes)\n", " ])\n", " \n", " def forward(self, input_ids, attention_mask=None):\n", " \n", " # first output is top layer output, second output is context of input seq and third output will be layerwise tokens \n", " top_layer, pooled, layers = self.base(input_ids, attention_mask)\n", " outputs = self.head(pooled)\n", " return top_layer, outputs, layers" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "class SentimentDataset(Dataset):\n", " def __init__(self, df, tokenizer, max_len=512):\n", " self.tokenizer = tokenizer\n", " self.text = df.review_text.values\n", " self.max_len = max_len\n", " \n", " def __len__(self):\n", " return len(self.text)\n", " \n", " def __getitem__(self, idx):\n", " text = self.text[idx]\n", " \n", " # encode the text and target into tensors return the attention masks as well\n", " encoding = self.tokenizer.encode_plus(\n", " text=text,\n", " add_special_tokens=True,\n", " max_length=self.max_len,\n", " return_token_type_ids=False,\n", " pad_to_max_length=True,\n", " return_attention_mask=True,\n", " return_tensors='pt',\n", " truncation=True\n", " )\n", " \n", " return {\n", " 'text': text,\n", " 'input_ids': encoding['input_ids'].flatten(),\n", " 'attention_mask': encoding['attention_mask'].flatten(),\n", " }\n", " " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def model_predict(trained):\n", " dictionary_list = []\n", "\n", " df = pd.read_csv(\"./amazon-review/dvd-UL.csv\")\n", " df = df.sample(n=1000, random_state=42) #number of samples\n", " df = df.reset_index(drop=True)\n", "\n", " dataset = SentimentDataset(df=df, tokenizer=tokenizer)\n", "\n", " data_loader = torch.utils.data.DataLoader(\n", " dataset=dataset,\n", " batch_size= 4,\n", " shuffle=False,\n", " num_workers=8\n", " )\n", "\n", " for bi, d in enumerate(tqdm(data_loader)):\n", " input_ids = d[\"input_ids\"]\n", " 
attention_mask = d[\"attention_mask\"]\n", "\n", " _, _, output = classifier_trained(input_ids, attention_mask)\n", "\n", " output = output[1:]\n", "\n", " for zeta in range(len(output[0])):\n", " for i in range(0,12):\n", " new_row = {'embeddings':output[i][zeta][0].cpu().detach().numpy(), 'layers': i+1}\n", " dictionary_list.append(new_row)\n", "\n", " dictionary_list = np.save(f\"./data/batch_{bi}\", dictionary_list, allow_pickle=True)\n", " dictionary_list = []" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "if(os.path.exists(\"./data\")):\n", " files = glob.glob('./data/*')\n", " for f in files:\n", " os.remove(f)\n", "else:\n", " os.makedirs(\"./data\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4c803dfea528440cabcc5d47873daaf5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "PATH = \"books\"+\".pt\" #change the model name here\n", "\n", "model_name = \"bert-base-uncased\"\n", "config = BertConfig.from_pretrained(model_name, output_hidden_states=True)\n", "tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)\n", "bert = BertModel.from_pretrained(model_name, config=config)\n", "\n", "classifier_trained = BertClassifier(config=config, model=bert, num_classes=2)\n", "classifier_trained.load_state_dict(torch.load(PATH))\n", "\n", "model_predict(classifier_trained)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "dictionary_list = []\n", "\n", "files = glob.glob(\"./data/*.npy\")\n", "\n", "for j in range(len(files)):\n", " alpha = np.load(f\"./data/batch_{j}.npy\", allow_pickle = True)\n", " for i in range(len(alpha)):\n", " new_row = {'embeddings':alpha[i][\"embeddings\"], 'layers': alpha[i][\"layers\"]}\n", " dictionary_list.append(new_row)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
embeddingslayers
0[0.029436039, 0.06670721, -0.22471415, -0.2367...1
1[-0.1554519, -0.21112284, -0.3408423, -0.20209...2
2[-0.12095504, -0.36359823, -0.17967358, -0.109...3
3[-0.21423775, -0.7461651, -0.6160757, -0.30794...4
4[-0.4974339, -0.85912985, -0.42627215, -0.5099...5
.........
11995[0.51743513, -0.6500383, -0.68353117, -0.22525...8
11996[0.42627597, -0.63389504, -0.19636014, -0.2719...9
11997[0.025212316, -0.5110682, 0.48476753, -0.35641...10
11998[0.04342799, -0.75802934, 0.5390331, -0.213192...11
11999[-0.29624316, -0.9558969, 0.48933977, -0.35488...12
\n", "

12000 rows × 2 columns

\n", "
" ], "text/plain": [ " embeddings layers\n", "0 [0.029436039, 0.06670721, -0.22471415, -0.2367... 1\n", "1 [-0.1554519, -0.21112284, -0.3408423, -0.20209... 2\n", "2 [-0.12095504, -0.36359823, -0.17967358, -0.109... 3\n", "3 [-0.21423775, -0.7461651, -0.6160757, -0.30794... 4\n", "4 [-0.4974339, -0.85912985, -0.42627215, -0.5099... 5\n", "... ... ...\n", "11995 [0.51743513, -0.6500383, -0.68353117, -0.22525... 8\n", "11996 [0.42627597, -0.63389504, -0.19636014, -0.2719... 9\n", "11997 [0.025212316, -0.5110682, 0.48476753, -0.35641... 10\n", "11998 [0.04342799, -0.75802934, 0.5390331, -0.213192... 11\n", "11999 [-0.29624316, -0.9558969, 0.48933977, -0.35488... 12\n", "\n", "[12000 rows x 2 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame.from_dict(dictionary_list)\n", "df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "df.to_pickle(\"./dvd-ft-b.pkl\") #naming convention dvd-ft-b (dvd dataset on books finetuned)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### For computing representation on pre-trained model" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_lower_case=True)\n", "config = BertConfig.from_pretrained(\"bert-base-uncased\", output_hidden_states=True)\n", "model = BertModel.from_pretrained('bert-base-uncased', config=config)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "if(os.path.exists(\"./data\")):\n", " files = glob.glob('./data/*')\n", " for f in files:\n", " os.remove(f)\n", "else:\n", " os.makedirs(\"./data\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "files = \"./amazon-review/dvd-UL.csv\"\n", "dataset = pd.read_csv(files)\n", "dataset = dataset.sample(n=1000, random_state=42)\n", "dataset = dataset.reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ee9b36da10b14cc684d7f80b5a821d70", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "dictionary_list = []\n", "count = 0\n", "\n", "for i in tqdm(range(len(dataset))):\n", " encoding = tokenizer(dataset[\"review_text\"][i], return_tensors='pt', padding='max_length', truncation=True, max_length=180)\n", " input_ids = encoding['input_ids']\n", " attention_mask = encoding['attention_mask']\n", " outputs = model(input_ids, attention_mask)\n", " hidden_states = outputs[2][1:]\n", "\n", " batch_text = []\n", "\n", " for j in range(0,12):\n", " new_row = {'embeddings':hidden_states[j][0][0].cpu().detach().numpy(), 'layers': j+1}\n", " dictionary_list.append(new_row)\n", "\n", " dictionary_list = np.save(f\"./data/batch_{count}\", dictionary_list, allow_pickle=True)\n", " dictionary_list = []\n", " count += 1" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "dictionary_list = []\n", "\n", "files = glob.glob(\"./data/*.npy\")\n", "\n", "for j in range(len(files)):\n", " alpha = np.load(f\"./data/batch_{j}.npy\", allow_pickle = True)\n", " for i in range(len(alpha)):\n", " new_row = 
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
 "dictionary_list = []\n",
 "\n",
 "files = glob.glob(\"./data/*.npy\")\n",
 "\n",
 "# same loading pattern as for the fine-tuned model above\n",
 "for j in range(len(files)):\n",
 "    alpha = np.load(f\"./data/batch_{j}.npy\", allow_pickle=True)\n",
 "    for i in range(len(alpha)):\n",
 "        new_row = {'embeddings': alpha[i][\"embeddings\"], 'layers': alpha[i][\"layers\"]}\n",
 "        dictionary_list.append(new_row)"
] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [
 "df = pd.DataFrame.from_dict(dictionary_list)\n",
 "df.to_pickle(\"./dvd-pt.pkl\")  # naming convention: dvd-pt = DVD dataset on pre-trained model"
] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "df.layers.unique()"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Visualisation for sanity check" ] },
{ "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [
 "# load the fine-tuned representations (run this cell or the next one, not both)\n",
 "df = pd.read_pickle(\"./books-ft-b.pkl\")"
] },
{ "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [
 "# alternatively, load the pre-trained representations\n",
 "df = pd.read_pickle(\"./books-pt.pkl\")"
] },
{ "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.5051431655883789\n" ] } ], "source": [
 "# stack the embedding vectors into a 2-D array (np.matrix is deprecated)\n",
 "mat = np.stack(df.embeddings.values)\n",
 "start = time.time()\n",
 "pca = PCA(n_components=3, random_state=42)\n",
 "components = pca.fit_transform(mat)\n",
 "print(time.time() - start)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "fig = px.scatter_3d(\n",
 "    components, x=0, y=1, z=2, color=df['layers'],\n",
 "    title='Domain representation plotting PCA',\n",
 "    labels={'color': 'layers'}\n",
 ")\n",
 "fig.update_layout(\n",
 "    autosize=False,\n",
 "    width=1080,\n",
 "    height=720,\n",
 ")\n",
 "\n",
 "fig.show()"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# save the figure created above (run after the plotting cell)\n",
 "fig.write_html(\"3D_PCA_FT.html\")"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Load pickles and calculate RSA" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [
 "df = pd.read_pickle(\"./dvd-pt.pkl\")\n",
 "df1 = pd.read_pickle(\"./dvd-ft-b.pkl\")"
] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [
 "def mmd(first_tensor: torch.Tensor, second_tensor: torch.Tensor, kernel) -> float:\n",
 "    \"\"\"Biased estimate of the squared MMD between two samples under a kernel.\n",
 "    Parameters\n",
 "    ----------\n",
 "    first_tensor : torch.Tensor\n",
 "        Tensor of shape (batch_size, m), where m is the dimension of the first sample\n",
 "    second_tensor : torch.Tensor\n",
 "        Tensor of shape (batch_size, n), where n is the dimension of the second sample\n",
 "    Returns\n",
 "    -------\n",
 "    float\n",
 "        MMD between the two samples\n",
 "    \"\"\"\n",
 "    first_tensor_correlation = kernel(first_tensor, first_tensor).evaluate()\n",
 "    second_tensor_correlation = kernel(second_tensor, second_tensor).evaluate()\n",
 "    first_second_tensor_correlation = kernel(first_tensor, second_tensor).evaluate()\n",
 "    m = first_tensor.size(0)\n",
 "    n = second_tensor.size(0)\n",
 "    sum_first_corr = first_tensor_correlation.sum().item()\n",
 "    sum_second_corr = second_tensor_correlation.sum().item()\n",
 "    sum_first_second_corr = first_second_tensor_correlation.sum().item()\n",
 "    first_term = (1 / (m ** 2)) * sum_first_corr\n",
 "    second_term = (1 / (n ** 2)) * sum_second_corr\n",
 "    third_term = (2 / (m * n)) * sum_first_second_corr\n",
 "    divergence = first_term + second_term - third_term\n",
 "    return divergence"
] },
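{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick usage sketch for `mmd`, added for illustration with synthetic tensors (the +1.0 shift is arbitrary): two identical samples give exactly zero, a shifted sample gives a clearly larger value." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Illustrative mmd() usage on synthetic data (values are not from the experiments).\n",
 "torch.manual_seed(0)\n",
 "x = torch.randn(100, 8)\n",
 "y = torch.randn(100, 8) + 1.0\n",
 "k = RQKernel()\n",
 "print(mmd(x, x, k))  # 0: identical samples\n",
 "print(mmd(x, y, k))  # > 0: shifted sample"
] },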
torch.Tensor\n", " Tensor of shape batch_size, n\n", " n is the dimension of the target tensor \n", " Returns\n", " -------\n", " float\n", " MMD between two samples\n", " \"\"\"\n", " first_tensor_correlation = kernel(first_tensor, first_tensor).evaluate()\n", " second_tensor_correlation = kernel(second_tensor, second_tensor).evaluate()\n", " first_second_tensor_correlation = kernel(first_tensor, second_tensor).evaluate()\n", " m = first_tensor.size(0)\n", " n = second_tensor.size(0)\n", " sum_first_corr = first_tensor_correlation.sum().item()\n", " sum_second_corr = second_tensor_correlation.sum().item()\n", " sum_first_second_corr = first_second_tensor_correlation.sum().item()\n", " first_term = (1 / (m ** 2)) * sum_first_corr\n", " second_term = (1 / (n ** 2)) * sum_second_corr\n", " third_term = (2 / (m * n)) * sum_first_second_corr\n", " divergence = first_term + second_term - third_term\n", " return divergence" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "def closest_rows(a):\n", " # Get euclidean distances as 2D array\n", " dists = cdist(a, a, 'cosine')\n", "\n", " # Fill diagonals with something greater than all elements as we intend\n", " # to get argmin indices later on and then index into input array with those\n", " # indices to get the closest rows\n", "# dists.ravel()[::dists.shape[1]+1] = dists.max()+1\n", " return dists" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "pretrained_books_dvd_RSA = {}\n", "pretrained_books_dvd_MMD = {} #for storing MMD between 2 models for each layer" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2bdf3a0541664dafb1e866182b8c35af", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "for i in tqdm(range(1,13)):\n", " temp1 = df[df[\"layers\"]==i]\n", " temp2 = df1[df1[\"layers\"]==i] #selecting a specific layer\n", " \n", " lis1 = temp1[\"embeddings\"].tolist()\n", " lis2 = temp2[\"embeddings\"].tolist()\n", " \n", " a = np.array(lis1)\n", " b = np.array(lis2)\n", " \n", " result1 = closest_rows(a)\n", " result2 = closest_rows(b)\n", " \n", " pretrained_books_dvd_RSA[i] = np.corrcoef(result1.flatten(), result2.flatten())[0,1]\n", "\n", " result1 = torch.from_numpy(result1)\n", " result2 = torch.from_numpy(result2)\n", " kernel = RQKernel()\n", " val = mmd(result1, result2, kernel)\n", " \n", " pretrained_books_dvd_MMD[i] = val" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{1: 0.955398855024296,\n", " 2: 0.9499259151146981,\n", " 3: 0.9346616069127789,\n", " 4: 0.9586915720754712,\n", " 5: 0.9568238275750086,\n", " 6: 0.9182314339773345,\n", " 7: 0.8438659922811926,\n", " 8: 0.8498103209022732,\n", " 9: 0.8472093680343032,\n", " 10: 0.5935006317709525,\n", " 11: 0.37255983521918024,\n", " 12: 0.15755181885614458}" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "RSA between bert representation and fine tuned bert (books) - DVD dataset" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{1: 0.9527077794274325,\n", " 2: 0.9477798799964473,\n", " 3: 0.9226848173618823,\n", " 4: 
{ "cell_type": "markdown", "metadata": {}, "source": [ "RSA between the pre-trained BERT representations and BERT fine-tuned on books, computed on the DVD dataset:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "{1: 0.955398855024296,\n",
 " 2: 0.9499259151146981,\n",
 " 3: 0.9346616069127789,\n",
 " 4: 0.9586915720754712,\n",
 " 5: 0.9568238275750086,\n",
 " 6: 0.9182314339773345,\n",
 " 7: 0.8438659922811926,\n",
 " 8: 0.8498103209022732,\n",
 " 9: 0.8472093680343032,\n",
 " 10: 0.5935006317709525,\n",
 " 11: 0.37255983521918024,\n",
 " 12: 0.15755181885614458}"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "RSA between the pre-trained BERT representations and BERT fine-tuned on books, computed on the books dataset:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "{1: 0.9527077794274325,\n",
 " 2: 0.9477798799964473,\n",
 " 3: 0.9226848173618823,\n",
 " 4: 0.9463699218593898,\n",
 " 5: 0.9197475682544565,\n",
 " 6: 0.8904403027804,\n",
 " 7: 0.8978173700287129,\n",
 " 8: 0.8799731485650455,\n",
 " 9: 0.8353729201603369,\n",
 " 10: 0.5456118638478861,\n",
 " 11: 0.33934223629028293,\n",
 " 12: 0.1056740137191123}"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "MMD between the pre-trained BERT representations and BERT fine-tuned on books, computed on the DVD dataset:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "{1: 0.025523517691246278,\n",
 " 2: 0.017031062632806515,\n",
 " 3: 0.034405121243115344,\n",
 " 4: 0.0743630493421803,\n",
 " 5: 0.0658974319876604,\n",
 " 6: 0.21873141347150904,\n",
 " 7: 0.23004184150353701,\n",
 " 8: 0.16440297895429068,\n",
 " 9: 0.11399606384908612,\n",
 " 10: 0.2680937669944564,\n",
 " 11: 0.31493303470159223,\n",
 " 12: 0.397320439848271}"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "MMD between the pre-trained BERT representations and BERT fine-tuned on books, computed on the books dataset:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "{1: 0.008592661067214902,\n",
 " 2: 0.008466682016583071,\n",
 " 3: 0.01607070255527243,\n",
 " 4: 0.03613209326039435,\n",
 " 5: 0.059290986788250954,\n",
 " 6: 0.0553308317517649,\n",
 " 7: 0.0686747228871778,\n",
 " 8: 0.060072450953288725,\n",
 " 9: 0.09658271662444762,\n",
 " 10: 0.2985128726786279,\n",
 " 11: 0.3239971706890749,\n",
 " 12: 0.4579824415026016}"
] }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.5" } }, "nbformat": 4, "nbformat_minor": 4 }