{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Cosine Similarity (Version C)\n", "\n", "- Computes arithmetic mean of pairwise cosine similarity:\n", " - 0.952 american films\n", " - 0.944 british films\n", " - 0.935 indian films" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Configuration\n", "\n", "# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-02-10-Wikipedia-Texts/\n", "source_texts_directory = \"/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/\"\n", "# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-04-07-Wikipedia-Embeddings/\n", "embeddings_directory = \"/home/eml4u/EML4U/data/wikipedia-embeddings/\"\n", "\n", "# points of time\n", "id_a = \"20100408\"\n", "id_b = \"20201101\"\n", "# category ids\n", "id_american = \"american-films\"\n", "id_british = \"british-films\"\n", "id_indian = \"indian-films\"\n", "# file ids\n", "id_american_a = id_a + \"-\" + id_american\n", "id_american_b = id_b + \"-\" + id_american\n", "id_british_a = id_a + \"-\" + id_british\n", "id_british_b = id_b + \"-\" + id_british\n", "id_indian_a = id_a + \"-\" + id_indian\n", "id_indian_b = id_b + \"-\" + id_indian\n", "\n", "# 11020 american-films.txt\n", "# 2147 british-films.txt\n", "# 3596 indian-films.txt\n", "execute_american = True" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "numpy: 1.19.2\n", "sklearn: 0.23.2\n" ] } ], "source": [ "# Imports\n", "\n", "import numpy\n", "print(\"numpy: \" + numpy.version.version)\n", "\n", "import sklearn\n", "import sklearn.metrics\n", "print(\"sklearn: \" + sklearn.__version__)\n", "\n", "# Class instance to access data (wp texts, pre-computed embeddings)\n", "import data_access\n", "data_accessor = data_access.DataAccess(source_texts_directory, embeddings_directory)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-american-films.txt\n", "(11020, 768) <class 'numpy.ndarray'>\n", "/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-american-films.txt\n", "(11020, 768) <class 'numpy.ndarray'>\n", "/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-british-films.txt\n", "(2147, 768) <class 'numpy.ndarray'>\n", "/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-british-films.txt\n", "(2147, 768) <class 'numpy.ndarray'>\n", "/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-indian-films.txt\n", "(3596, 768) <class 'numpy.ndarray'>\n", "/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-indian-films.txt\n", "(3596, 768) <class 'numpy.ndarray'>\n", "\n" ] } ], "source": [ "# Load embeddings\n", "\n", "if execute_american:\n", " embeddings_american_a = data_accessor.load_embeddings(id_american_a)\n", " embeddings_american_b = data_accessor.load_embeddings(id_american_b)\n", "embeddings_british_a = data_accessor.load_embeddings(id_british_a)\n", "embeddings_british_b = data_accessor.load_embeddings(id_british_b)\n", "embeddings_indian_a = data_accessor.load_embeddings(id_indian_a)\n", "embeddings_indian_b = data_accessor.load_embeddings(id_indian_b)\n", "print()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Arithmetic mean of pairwise cosine similarity:\n", "11020 elements \n", "0.9521031637381328 american\n", "2147 elements \n", "0.9445474825043075 
british\n", "3596 elements \n", "0.9354938114061401 indian\n" ] } ], "source": [ "# Cosine similarity\n", "\n", "def get_pairwise_cosine_similarity(a, b, note = \"\", printinfo = True):\n", " if printinfo:\n", " print(str(type(a)) + \" \" + str(a.shape) + \"\\n\" + str(type(b)) + \" \" + str(b.shape))\n", " cosSim = sklearn.metrics.pairwise.cosine_similarity(a, b, dense_output=True)[0][0]\n", " if printinfo:\n", " print(str(cosSim) + \" \" + note)\n", " return cosSim\n", "\n", "# Sums up cosine similarities of texts of 2 points of time and divides sum by number of elements\n", "def get_mean_cosine_similarity(a, b, note = \"\", printinfo = True):\n", " sum_ = 0;\n", " for i in range(len(a)):\n", " sum_ += sklearn.metrics.pairwise.cosine_similarity(a[i].reshape(1, -1), b[i].reshape(1, -1), dense_output=True)[0][0]\n", " if printinfo:\n", " print(str( len(a) )+ \" elements \" + note)\n", " return sum_ / len(a)\n", "\n", "print(\"Arithmetic mean of pairwise cosine similarity:\")\n", "if execute_american:\n", " print(get_mean_cosine_similarity(embeddings_american_a, embeddings_american_b), \"american\")\n", "print(get_mean_cosine_similarity(embeddings_british_a, embeddings_british_b), \"british\")\n", "print(get_mean_cosine_similarity(embeddings_indian_a, embeddings_indian_b), \"indian\")\n", "\n", "# Arithmetic mean of pairwise cosine similarity:\n", "# 0.9521031637381328 american\n", "# 0.9445474825043075 british\n", "# 0.9354938114061401 indian" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Average embeddings\n", "\n", "Compute average embeddings for 2 points in time. The results will be a 768-dimensional vector for each point in time. \n", "→ Get texts compared to the average vectors.\n", "\n", "→ Get typical texts \n", "* One vector of old point in time $\\bar{v_{t1}}$, one vector new point in time $\\bar{v_{t2}}$ \n", "* Between: CosSim " ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average embeddings for 2 points in time:\n", "<class 'numpy.ndarray'> (768,) american_a\n", "<class 'numpy.ndarray'> (768,) american_b\n", "<class 'numpy.ndarray'> (768,) british_a\n", "<class 'numpy.ndarray'> (768,) british_b\n", "<class 'numpy.ndarray'> (768,) indian_a\n", "<class 'numpy.ndarray'> (768,) indian_b\n" ] } ], "source": [ "# Arithmetic mean\n", "\n", "def get_mean(embeddings, note = \"\", printinfo = True):\n", " mean = numpy.mean(embeddings, axis=0)\n", " if printinfo:\n", " print(str(type(mean)) + \" \" + str(mean.shape) + \" \" + note)\n", " return mean\n", "\n", "print(\"Average embeddings for 2 points in time:\")\n", "if execute_american:\n", " mean_american_a = get_mean(embeddings_american_a, \"american_a\")\n", " mean_american_b = get_mean(embeddings_american_b, \"american_b\")\n", "mean_british_a = get_mean(embeddings_british_a, \"british_a\")\n", "mean_british_b = get_mean(embeddings_british_b, \"british_b\")\n", "mean_indian_a = get_mean(embeddings_indian_a, \"indian_a\")\n", "mean_indian_b = get_mean(embeddings_indian_b, \"indian_b\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11020 [(7210, 0.6063458325904018), (448, 0.6434165782476016), (3828, 0.6609785427292287)] .. [(1941, 0.9749839140376109), (8218, 0.9756742946779071)]\n", "11020 [(7210, 0.5968999088697942), (4629, 0.6120370250147551), (1738, 0.6434375594093735)] .. 
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11020 [(7210, 0.6063458325904018), (448, 0.6434165782476016), (3828, 0.6609785427292287)] .. [(1941, 0.9749839140376109), (8218, 0.9756742946779071)]\n", "11020 [(7210, 0.5968999088697942), (4629, 0.6120370250147551), (1738, 0.6434375594093735)] .. [(2017, 0.9780761837998851), (9245, 0.9789322185574947)]\n", "2147 [(961, 0.795723406144361), (1471, 0.7977011006199223), (1047, 0.7990740939165086)] .. [(1107, 0.9754497563201272), (393, 0.9757224095034057)]\n", "2147 [(680, 0.7901596223863818), (980, 0.7970756246003751), (966, 0.7974886203028873)] .. [(1249, 0.9763073406692253), (1993, 0.9763565781092824)]\n", "3596 [(1816, 0.6436218415151918), (1175, 0.6523821880855325), (3116, 0.6672549870487694)] .. [(437, 0.9768417192399903), (2018, 0.9773265088174122)]\n", "3596 [(346, 0.687495848746942), (50, 0.7013913114635741), (2945, 0.7695575497231928)] .. [(2821, 0.9819029091192182), (966, 0.9821326550032112)]\n" ] } ], "source": [ "# Texts compared to the average vectors\n", "\n", "# Returns (index, cosine similarity) tuples sorted in ascending order of similarity,\n", "# i.e. the articles with the largest distance to the mean vector come first\n", "def get_distances(embeddings, mean_embeddings, printinfo = True):\n", "    distances = []\n", "    for i in range(len(embeddings)):\n", "        assert len(mean_embeddings) == len(embeddings[i]), \"lengths of arrays differ\"\n", "        distances.append((i, get_pairwise_cosine_similarity(mean_embeddings.reshape(1, -1), embeddings[i].reshape(1, -1), \"\", False)))\n", "    distances = sorted(distances, key=lambda tup: tup[1], reverse=False)\n", "    if printinfo:\n", "        print(len(distances), distances[0:3], \"..\", distances[len(distances)-2:])\n", "    return distances\n", "\n", "if execute_american:\n", "    distances_american_a = get_distances(embeddings_american_a, mean_american_a)\n", "    distances_american_b = get_distances(embeddings_american_b, mean_american_b)\n", "distances_british_a = get_distances(embeddings_british_a, mean_british_a)\n", "distances_british_b = get_distances(embeddings_british_b, mean_british_b)\n", "distances_indian_a = get_distances(embeddings_indian_a, mean_indian_a)\n", "distances_indian_b = get_distances(embeddings_indian_b, mean_indian_b)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Print source texts\n", "def print_source_text(directory, category_id, index):\n", "    print()\n", "    print(\"Category: \" + category_id)\n", "    print(\"Index: \" + str(index))\n", "    file = data_accessor.get_embeddings_dict_filename(category_id, index)\n", "    print(\"File: \")\n", "    print(data_accessor.read_source_text(directory, file))\n", "    print()\n", "\n", "if False:\n", "    print_source_text(id_british_b, id_british, distances_british_b[0][0])\n", "    print_source_text(id_british_b, id_british, 680)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. Compare each document embedding $v_{t2i}$ (of every WP article) at $t2$ with $\\bar{v_{t2}}$ using CosSim. \n", "\n", "* Get WP articles with the largest distance to the mean vector $\\bar{v_{t2}}$.\n", "* Optional: For the article with the largest distance, check attention and highlight the words with the largest attention, \n", "e.g. 
Integrated Gradients for text https://github.com/SeldonIO/alibi\n", "* Check plotting + word counts (end of file) https://github.com/EML4U/Topic-Modeling/blob/main/Twitter%20test.ipynb " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Get articles with largest distance to v_t2\n", "# Distance: Smallest cosine similarity\n", "# -> See similarities_british_b" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(680, 0.7901596223863818), (980, 0.7970756246003751), (966, 0.7974886203028873), (1605, 0.7984441078760017), (333, 0.8048592466102065), (179, 0.806587230956004), (1202, 0.8096051288887628), (381, 0.8126745126594594), (245, 0.8173266825145751), (1925, 0.821152862297925), (255, 0.8246783158761353), (1514, 0.8287613672087364), (1811, 0.829478593797842), (2087, 0.8348875860421145), (1520, 0.8365918721037443), (886, 0.8366306915455757), (902, 0.8389038535147046), (853, 0.8407825229891188), (1015, 0.84217543288988), (1501, 0.8439344648809313), (406, 0.8461331874787608), (1554, 0.8485776775421392), (663, 0.8509946040235267), (1286, 0.8513454108754721), (213, 0.8514522727454383), (1149, 0.852359455090014), (1637, 0.8539473941773443), (778, 0.8540251801467501), (2009, 0.8576742923186815), (1435, 0.8607933663390277), (483, 0.862620813656519), (526, 0.8629482931038279), (101, 0.8641558058376334), (720, 0.8643496013779346), (1666, 0.8649259321141527), (1674, 0.865616164520292), (789, 0.866252396217221), (1156, 0.8664125570172904), (824, 0.8664865933138797), (332, 0.8671766380114813), (605, 0.8679861756571845), (574, 0.8692133375218205), (385, 0.8703120437107627), (400, 0.8710870542779425), (884, 0.8737298301327405), (1762, 0.8740842715701184), (272, 0.8747334865889536), (591, 0.8748916853566279), (1780, 0.8755334280631477), (295, 0.8764491348254336), (1858, 0.8778708375071165), (1599, 0.877924260244563), (1342, 0.8785883554894383), (2000, 0.8794430006512584), (1757, 0.8795602473035575), (1579, 0.8796144442283066), (1635, 0.8804493627515715), (674, 0.880598769307674), (747, 0.8808114412749424), (2128, 0.8817070054497067), (875, 0.8827556029215672), (1549, 0.8832425625074518), (1258, 0.8833110976547145), (174, 0.8836400509311044), (918, 0.8844534830455353), (1981, 0.8848847113626503), (2053, 0.8852315112685786), (259, 0.8854263498204921), (355, 0.8855939571431497), (1691, 0.8859139831616893), (1247, 0.8865212579727186), (304, 0.8872719351437507), (764, 0.8881581364107067), (216, 0.8881797777193562), (1321, 0.8884719784419831), (1432, 0.8887640378662863), (23, 0.8894833788108552), (839, 0.8899124912192335), (290, 0.890238506109883), (183, 0.8910271068032816), (522, 0.8910528699982174), (1295, 0.8916884399043221), (1462, 0.8917890957542246), (1883, 0.8920079172980555), (819, 0.8925052759266741), (240, 0.8925691265135722), (1835, 0.8927933950167327), (727, 0.8929515843282431), (1191, 0.8931548126744745), (1593, 0.893289718025112), (1323, 0.8934086582551485), (805, 0.8941019599905784), (1231, 0.8943603343110167), (164, 0.8945786418679671), (1615, 0.8950430990051336), (1675, 0.8953028183688045), (17, 0.8953558750795475), (2036, 0.8954911725690698), (1941, 0.8957662081755633), (545, 0.895993783162472)]\n" ] } ], "source": [ "# 100 articles with largest distance to mean vector B\n", "distances_british_b = distances_british_b[0:100]\n", "print(distances_british_b)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": 
{ "kernelspec": { "display_name": "Python (EML4U)", "language": "python", "name": "eml4u" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }