{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Computation of embeddings of Wikipedia texts\n", "\n", "Note: Embeddings computation for 11,020x2 texts takes 10,822 seconds on the EML4U experiment server.\n", "\n", "In **1 hour** you can process around 60x60 = **3600 text-pairs**." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create embeddings for wikipedia texts\n", "# Note: Embeddings computation for 11,020x2 texts takes 10,822 seconds on the EML4U experiment server.\n", "# In 1 hour you can process around 3,600 = 60*60 text-pairs.\n", "\n", "# Current script\n", "baseDir = \"/home/eml4u/EML4U/notebooks/wikipedia-embeddings\"\n", "\n", "# File IDs (for input and output)\n", "#title = \"american-films\"\n", "#title = \"british-films\"\n", "#title = \"indian-films\"\n", "title = \"living-people\"\n", "dateA = \"20100408\"\n", "dateB = \"20201101\"\n", "idA = dateA + \"-\" + title\n", "idB = dateB + \"-\" + title\n", "\n", "# Input directories\n", "dataDirA = \"/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/\" + idA + \"/\"\n", "dataDirB = \"/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/\" + idB + \"/\"\n", "\n", "# Output files\n", "outDir = \"/home/eml4u/EML4U/data/wikipedia-embeddings/\"\n", "fileEmbeddingsA = outDir + idA + \".txt\"\n", "fileEmbeddingsB = outDir + idB + \".txt\"\n", "fileIds = outDir + title + \".txt\"\n", "\n", "print(dataDirA)\n", "print(dataDirB)\n", "print(fileEmbeddingsA)\n", "print(fileEmbeddingsB)\n", "print(fileIds)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get file paths\n", "import glob\n", "filesA = glob.glob(dataDirA + '*.txt')\n", "filesB = glob.glob(dataDirB + '*.txt')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Development\n", "# Limit number of file paths\n", "if False:\n", " filesA = filesA[:20]\n", " filesB = filesB[:20]\n", "# Print file paths\n", "if False:\n", " print('\\n'.join(map(str, filesA)))\n", " print()\n", " print('\\n'.join(map(str, filesB)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Read files\n", "textsA = []\n", "for filename in filesA:\n", " fileobject = open(filename, \"r\") \n", " text = fileobject.read()\n", " textsA.append(text)\n", " fileobject.close\n", "\n", "textsB = []\n", "for filename in filesB:\n", " fileobject = open(filename, \"r\") \n", " text = fileobject.read()\n", " textsB.append(text)\n", " fileobject.close" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Print text sizes / texts\n", "print(\"len(textsA):\", len(textsA))\n", "print(\"len(textsB):\", len(textsB))\n", "\n", "if False:\n", " print(textsA[0])\n", " print(textsB[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Ensure similar filenames in both points of time\n", "import ntpath\n", "filenames = []\n", "for x in range(len(filesA)):\n", " filenames.append(ntpath.basename(filesA[x]))\n", " if(ntpath.basename(filesA[x]) != ntpath.basename(filesB[x])):\n", " print (x , ntpath.basename(filesA[x]), ntpath.basename(filesB[x]))\n", "print(\"len(filenames):\", len(filenames))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Prepare embeddings\n", "import sys\n", "import os\n", "sys.path.append(os.path.abspath(baseDir))\n", "from embedding import BertHuggingface\n", "\n", "NUM_CLASSES = 8 # irrelevant if you dont want to retrain\n", "bert = BertHuggingface(NUM_CLASSES)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create embeddings\n", "import time\n", "print(time.asctime())\n", "startTime = time.time()\n", "embeddingsA = bert.embed(textsA)\n", "embeddingsB = bert.embed(textsB)\n", "\n", "print(\"Runtime: %s seconds\" % (time.time() - startTime))\n", "print(\"embeddingsA.shape:\", embeddingsA.shape)\n", "print(\"embeddingsB.shape:\", embeddingsB.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Write embeddings/arrays to files\n", "print(fileEmbeddingsA)\n", "print(fileEmbeddingsB)\n", "print(fileIds)\n", "\n", "import numpy\n", "numpy.savetxt(fileEmbeddingsA, embeddingsA)\n", "numpy.savetxt(fileEmbeddingsB, embeddingsB)\n", "with open(fileIds, \"w\") as outfile:\n", " outfile.write(\"\\n\".join(filenames))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check: Load arrays\n", "if True:\n", " loadedA = numpy.loadtxt(fileEmbeddingsA)\n", " loadedB = numpy.loadtxt(fileEmbeddingsB)\n", " with open(fileIds) as f:\n", " loadedFilenames = f.read().splitlines()\n", " print(numpy.array_equal(embeddingsA, loadedA))\n", " print(numpy.array_equal(embeddingsB, loadedB))\n", " print(numpy.array_equal(filenames, loadedFilenames))\n", " print(type(embeddingsA))\n", " print(type(loadedA))\n", " print(type(loadedFilenames))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python (EML4U)", "language": "python", "name": "eml4u" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }