{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Cosine Similarity (Version C)\n", "\n", "- Computes arithmetic mean of pairwise cosine similarity:\n", " - 0.952 american films\n", " - 0.944 british films\n", " - 0.935 indian films" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Configuration\n", "\n", "# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-02-10-Wikipedia-Texts/\n", "source_texts_directory = \"/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/\"\n", "# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-04-07-Wikipedia-Embeddings/\n", "embeddings_directory = \"/home/eml4u/EML4U/data/wikipedia-embeddings/\"\n", "\n", "# points of time\n", "id_a = \"20100408\"\n", "id_b = \"20201101\"\n", "# category ids\n", "id_american = \"american-films\"\n", "id_british = \"british-films\"\n", "id_indian = \"indian-films\"\n", "# file ids\n", "id_american_a = id_a + \"-\" + id_american\n", "id_american_b = id_b + \"-\" + id_american\n", "id_british_a = id_a + \"-\" + id_british\n", "id_british_b = id_b + \"-\" + id_british\n", "id_indian_a = id_a + \"-\" + id_indian\n", "id_indian_b = id_b + \"-\" + id_indian\n", "\n", "# 11020 american-films.txt\n", "# 2147 british-films.txt\n", "# 3596 indian-films.txt\n", "execute_american = True" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "numpy: 1.19.2\n", "sklearn: 0.23.2\n" ] } ], "source": [ "# Imports\n", "\n", "import numpy\n", "print(\"numpy: \" + numpy.version.version)\n", "\n", "import sklearn\n", "import sklearn.metrics\n", "print(\"sklearn: \" + sklearn.__version__)\n", "\n", "# Class instance to access data (wp texts, pre-computed embeddings)\n", "import data_access\n", "data_accessor = data_access.DataAccess(source_texts_directory, embeddings_directory)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-american-films.txt\n", "(11020, 768) <class 'numpy.ndarray'>\n", "/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-american-films.txt\n", "(11020, 768) <class 'numpy.ndarray'>\n", "/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-british-films.txt\n", "(2147, 768) <class 'numpy.ndarray'>\n", "/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-british-films.txt\n", "(2147, 768) <class 'numpy.ndarray'>\n", "/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-indian-films.txt\n", "(3596, 768) <class 'numpy.ndarray'>\n", "/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-indian-films.txt\n", "(3596, 768) <class 'numpy.ndarray'>\n", "\n" ] } ], "source": [ "# Load embeddings\n", "\n", "if execute_american:\n", " embeddings_american_a = data_accessor.load_embeddings(id_american_a)\n", " embeddings_american_b = data_accessor.load_embeddings(id_american_b)\n", "embeddings_british_a = data_accessor.load_embeddings(id_british_a)\n", "embeddings_british_b = data_accessor.load_embeddings(id_british_b)\n", "embeddings_indian_a = data_accessor.load_embeddings(id_indian_a)\n", "embeddings_indian_b = data_accessor.load_embeddings(id_indian_b)\n", "print()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Arithmetic mean of pairwise cosine similarity:\n", "11020 elements \n", "0.9521031637381328 american\n", "2147 elements \n", "0.9445474825043075 
british\n", "3596 elements \n", "0.9354938114061401 indian\n" ] } ], "source": [ "# Cosine similarity\n", "\n", "def get_pairwise_cosine_similarity(a, b, note = \"\", printinfo = True):\n", " if printinfo:\n", " print(str(type(a)) + \" \" + str(a.shape) + \"\\n\" + str(type(b)) + \" \" + str(b.shape))\n", " cosSim = sklearn.metrics.pairwise.cosine_similarity(a, b, dense_output=True)[0][0]\n", " if printinfo:\n", " print(str(cosSim) + \" \" + note)\n", " return cosSim\n", "\n", "# Sums up cosine similarities of texts of 2 points of time and divides sum by number of elements\n", "def get_mean_cosine_similarity(a, b, note = \"\", printinfo = True):\n", " sum_ = 0;\n", " for i in range(len(a)):\n", " sum_ += sklearn.metrics.pairwise.cosine_similarity(a[i].reshape(1, -1), b[i].reshape(1, -1), dense_output=True)[0][0]\n", " if printinfo:\n", " print(str( len(a) )+ \" elements \" + note)\n", " return sum_ / len(a)\n", "\n", "print(\"Arithmetic mean of pairwise cosine similarity:\")\n", "if execute_american:\n", " print(get_mean_cosine_similarity(embeddings_american_a, embeddings_american_b), \"american\")\n", "print(get_mean_cosine_similarity(embeddings_british_a, embeddings_british_b), \"british\")\n", "print(get_mean_cosine_similarity(embeddings_indian_a, embeddings_indian_b), \"indian\")\n", "\n", "# Arithmetic mean of pairwise cosine similarity:\n", "# 0.9521031637381328 american\n", "# 0.9445474825043075 british\n", "# 0.9354938114061401 indian" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Average embeddings\n", "\n", "Compute average embeddings for 2 points in time. The results will be a 768-dimensional vector for each point in time. \n", "→ Get texts compared to the average vectors.\n", "\n", "→ Get typical texts \n", "* One vector of old point in time $\\bar{v_{t1}}$, one vector new point in time $\\bar{v_{t2}}$ \n", "* Between: CosSim " ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average embeddings for 2 points in time:\n", "<class 'numpy.ndarray'> (768,) american_a\n", "<class 'numpy.ndarray'> (768,) american_b\n", "<class 'numpy.ndarray'> (768,) british_a\n", "<class 'numpy.ndarray'> (768,) british_b\n", "<class 'numpy.ndarray'> (768,) indian_a\n", "<class 'numpy.ndarray'> (768,) indian_b\n" ] } ], "source": [ "# Arithmetic mean\n", "\n", "def get_mean(embeddings, note = \"\", printinfo = True):\n", " mean = numpy.mean(embeddings, axis=0)\n", " if printinfo:\n", " print(str(type(mean)) + \" \" + str(mean.shape) + \" \" + note)\n", " return mean\n", "\n", "print(\"Average embeddings for 2 points in time:\")\n", "if execute_american:\n", " mean_american_a = get_mean(embeddings_american_a, \"american_a\")\n", " mean_american_b = get_mean(embeddings_american_b, \"american_b\")\n", "mean_british_a = get_mean(embeddings_british_a, \"british_a\")\n", "mean_british_b = get_mean(embeddings_british_b, \"british_b\")\n", "mean_indian_a = get_mean(embeddings_indian_a, \"indian_a\")\n", "mean_indian_b = get_mean(embeddings_indian_b, \"indian_b\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11020 [(7210, 0.6063458325904018), (448, 0.6434165782476016), (3828, 0.6609785427292287)] .. [(1941, 0.9749839140376109), (8218, 0.9756742946779071)]\n", "11020 [(7210, 0.5968999088697942), (4629, 0.6120370250147551), (1738, 0.6434375594093735)] .. 
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11020 [(7210, 0.6063458325904018), (448, 0.6434165782476016), (3828, 0.6609785427292287)] .. [(1941, 0.9749839140376109), (8218, 0.9756742946779071)]\n", "11020 [(7210, 0.5968999088697942), (4629, 0.6120370250147551), (1738, 0.6434375594093735)] .. [(2017, 0.9780761837998851), (9245, 0.9789322185574947)]\n", "2147 [(961, 0.795723406144361), (1471, 0.7977011006199223), (1047, 0.7990740939165086)] .. [(1107, 0.9754497563201272), (393, 0.9757224095034057)]\n", "2147 [(680, 0.7901596223863818), (980, 0.7970756246003751), (966, 0.7974886203028873)] .. [(1249, 0.9763073406692253), (1993, 0.9763565781092824)]\n", "3596 [(1816, 0.6436218415151918), (1175, 0.6523821880855325), (3116, 0.6672549870487694)] .. [(437, 0.9768417192399903), (2018, 0.9773265088174122)]\n", "3596 [(346, 0.687495848746942), (50, 0.7013913114635741), (2945, 0.7695575497231928)] .. [(2821, 0.9819029091192182), (966, 0.9821326550032112)]\n" ] } ], "source": [ "# Texts compared to the average vectors\n", "\n", "# Returns (index, cosine similarity) tuples sorted in ascending order of similarity,\n", "# i.e. the articles with the largest distance to the mean vector come first\n", "def get_distances(embeddings, mean_embeddings, printinfo = True):\n", "    distances = []\n", "    for i in range(len(embeddings)):\n", "        assert len(mean_embeddings) == len(embeddings[i]), \"lengths of arrays differ\"\n", "        distances.append((i, get_pairwise_cosine_similarity(mean_embeddings.reshape(1, -1), embeddings[i].reshape(1, -1), \"\", False)))\n", "    distances = sorted(distances, key=lambda tup: tup[1], reverse=False)\n", "    if printinfo:\n", "        print(len(distances), distances[0:3], \"..\", distances[len(distances)-2:])\n", "    return distances\n", "\n", "if execute_american:\n", "    distances_american_a = get_distances(embeddings_american_a, mean_american_a)\n", "    distances_american_b = get_distances(embeddings_american_b, mean_american_b)\n", "distances_british_a = get_distances(embeddings_british_a, mean_british_a)\n", "distances_british_b = get_distances(embeddings_british_b, mean_british_b)\n", "distances_indian_a = get_distances(embeddings_indian_a, mean_indian_a)\n", "distances_indian_b = get_distances(embeddings_indian_b, mean_indian_b)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Print source texts\n", "def print_source_text(directory, category_id, index):\n", "    print()\n", "    print(\"Category: \" + category_id)\n", "    print(\"Index: \" + str(index))\n", "    file = data_accessor.get_embeddings_dict_filename(category_id, index)\n", "    print(\"File: \")\n", "    print(data_accessor.read_source_text(directory, file))\n", "    print()\n", "\n", "if False:\n", "    print_source_text(id_british_b, id_british, distances_british_b[0][0])\n", "    print_source_text(id_british_b, id_british, 680)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. Compare each document embedding $v_{t2i}$ (of every WP article) at $t2$ with $\\bar{v_{t2}}$ using CosSim. \n", "\n", "* Get WP articles with the largest distance to the mean vector $\\bar{v_{t2}}$.\n", "* Optional: For the article with the largest distance, check attention and highlight the words with the largest attention, \n", "e.g. 
Integrated Gradients for text https://github.com/SeldonIO/alibi\n", "* Check plotting + word counts (end of file) https://github.com/EML4U/Topic-Modeling/blob/main/Twitter%20test.ipynb " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Get articles with largest distance to v_t2\n", "# Distance: Smallest cosine similarity\n", "# -> See similarities_british_b" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(680, 0.7901596223863818), (980, 0.7970756246003751), (966, 0.7974886203028873), (1605, 0.7984441078760017), (333, 0.8048592466102065), (179, 0.806587230956004), (1202, 0.8096051288887628), (381, 0.8126745126594594), (245, 0.8173266825145751), (1925, 0.821152862297925), (255, 0.8246783158761353), (1514, 0.8287613672087364), (1811, 0.829478593797842), (2087, 0.8348875860421145), (1520, 0.8365918721037443), (886, 0.8366306915455757), (902, 0.8389038535147046), (853, 0.8407825229891188), (1015, 0.84217543288988), (1501, 0.8439344648809313), (406, 0.8461331874787608), (1554, 0.8485776775421392), (663, 0.8509946040235267), (1286, 0.8513454108754721), (213, 0.8514522727454383), (1149, 0.852359455090014), (1637, 0.8539473941773443), (778, 0.8540251801467501), (2009, 0.8576742923186815), (1435, 0.8607933663390277), (483, 0.862620813656519), (526, 0.8629482931038279), (101, 0.8641558058376334), (720, 0.8643496013779346), (1666, 0.8649259321141527), (1674, 0.865616164520292), (789, 0.866252396217221), (1156, 0.8664125570172904), (824, 0.8664865933138797), (332, 0.8671766380114813), (605, 0.8679861756571845), (574, 0.8692133375218205), (385, 0.8703120437107627), (400, 0.8710870542779425), (884, 0.8737298301327405), (1762, 0.8740842715701184), (272, 0.8747334865889536), (591, 0.8748916853566279), (1780, 0.8755334280631477), (295, 0.8764491348254336), (1858, 0.8778708375071165), (1599, 0.877924260244563), (1342, 0.8785883554894383), (2000, 0.8794430006512584), (1757, 0.8795602473035575), (1579, 0.8796144442283066), (1635, 0.8804493627515715), (674, 0.880598769307674), (747, 0.8808114412749424), (2128, 0.8817070054497067), (875, 0.8827556029215672), (1549, 0.8832425625074518), (1258, 0.8833110976547145), (174, 0.8836400509311044), (918, 0.8844534830455353), (1981, 0.8848847113626503), (2053, 0.8852315112685786), (259, 0.8854263498204921), (355, 0.8855939571431497), (1691, 0.8859139831616893), (1247, 0.8865212579727186), (304, 0.8872719351437507), (764, 0.8881581364107067), (216, 0.8881797777193562), (1321, 0.8884719784419831), (1432, 0.8887640378662863), (23, 0.8894833788108552), (839, 0.8899124912192335), (290, 0.890238506109883), (183, 0.8910271068032816), (522, 0.8910528699982174), (1295, 0.8916884399043221), (1462, 0.8917890957542246), (1883, 0.8920079172980555), (819, 0.8925052759266741), (240, 0.8925691265135722), (1835, 0.8927933950167327), (727, 0.8929515843282431), (1191, 0.8931548126744745), (1593, 0.893289718025112), (1323, 0.8934086582551485), (805, 0.8941019599905784), (1231, 0.8943603343110167), (164, 0.8945786418679671), (1615, 0.8950430990051336), (1675, 0.8953028183688045), (17, 0.8953558750795475), (2036, 0.8954911725690698), (1941, 0.8957662081755633), (545, 0.895993783162472)]\n" ] } ], "source": [ "# 100 articles with largest distance to mean vector B\n", "distances_british_b = distances_british_b[0:100]\n", "print(distances_british_b)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": 
{ "kernelspec": { "display_name": "Python (EML4U)", "language": "python", "name": "eml4u" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }