{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Development notebook for data_access.py\n",
    "\n",
    "Defines the `DataAccess` class of `data_access.py` and runs usage examples against the EML4U Wikipedia source texts and embeddings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import numpy\n",
    "\n",
    "\n",
    "class DataAccess:\n",
    "    \"\"\"Access to the EML4U Wikipedia source texts, title indexes and embeddings.\n",
    "\n",
    "    Data:\n",
    "    https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-02-10-Wikipedia-Texts/\n",
    "    https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-04-07-Wikipedia-Embeddings/\n",
    "\n",
    "    Directories in source_texts_directory:\n",
    "        20100408-american-films\n",
    "        20100408-british-films\n",
    "        20100408-Indexes\n",
    "        20100408-indian-films\n",
    "        20100408-living-people\n",
    "        20201101-american-films\n",
    "        20201101-british-films\n",
    "        20201101-Indexes\n",
    "        20201101-indian-films\n",
    "        20201101-living-people\n",
    "\n",
    "    Files in embeddings_directory:\n",
    "        20100408-american-films.txt\n",
    "        20100408-british-films.txt\n",
    "        20100408-indian-films.txt\n",
    "        20201101-american-films.txt\n",
    "        20201101-british-films.txt\n",
    "        20201101-indian-films.txt\n",
    "        american-films.txt\n",
    "        british-films.txt\n",
    "        indian-films.txt\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, source_texts_directory, embeddings_directory):\n",
    "        \"\"\"Remember the two data directories and start with empty index caches.\n",
    "\n",
    "        A trailing path separator on the directories is optional.\n",
    "\n",
    "        e.g. \"/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/\",\n",
    "             \"/home/eml4u/EML4U/data/wikipedia-embeddings/\"\n",
    "        \"\"\"\n",
    "        self.source_texts_directory = source_texts_directory\n",
    "        self.embeddings_directory = embeddings_directory\n",
    "        # \"<time_id>-<category_id>\" -> {filename: Wikipedia title}\n",
    "        self.wikipedia_dict = {}\n",
    "        # category_id -> list of filenames, in the order of <category_id>.txt\n",
    "        self.embeddings_dict = {}\n",
    "\n",
    "    # Source texts from Wikipedia\n",
    "\n",
    "    def read_source_text(self, directory, filename, print_filepath=True):\n",
    "        \"\"\"Return the full text of one source file.\n",
    "\n",
    "        e.g. \"20100408-british-films\", \"Monty_Python_s_The_Meaning_of_Life.txt\"\n",
    "        \"\"\"\n",
    "        filepath = os.path.join(self.source_texts_directory, directory, filename)\n",
    "        if print_filepath:\n",
    "            print(filepath)\n",
    "        # The context manager closes the file even if read() raises.\n",
    "        with open(filepath, \"r\") as file:\n",
    "            return file.read()\n",
    "\n",
    "    # Index filenames <-> Wikipedia titles\n",
    "\n",
    "    def get_wikipedia_dict(self, time_id, category_id):\n",
    "        \"\"\"Read an index file and return it as a {filename: title} dictionary.\n",
    "\n",
    "        The index file alternates filename lines and title lines. Reading\n",
    "        stops at the first empty title line.\n",
    "\n",
    "        e.g. \"20100408\", \"british-films\"\n",
    "        \"\"\"\n",
    "        index_filename = \"Index_Category_\" + category_id.capitalize().replace(\"-\", \"_\") + \".txt\"\n",
    "        filepath = os.path.join(self.source_texts_directory, time_id + \"-Indexes\", index_filename)\n",
    "        dictionary = {}\n",
    "        with open(filepath) as fileobject:\n",
    "            # Strip only the line terminator, so a last line without a\n",
    "            # trailing newline keeps its final character.\n",
    "            lines = (line.rstrip(\"\\n\") for line in fileobject)\n",
    "            # Zipping the iterator with itself yields consecutive line pairs.\n",
    "            for filename, title in zip(lines, lines):\n",
    "                if not title:\n",
    "                    break\n",
    "                dictionary[filename] = title\n",
    "        return dictionary\n",
    "\n",
    "    def _get_cached_wikipedia_dict(self, time_id, category_id):\n",
    "        \"\"\"Return the index for a time/category pair, reading it on first use.\"\"\"\n",
    "        key = time_id + \"-\" + category_id\n",
    "        if key not in self.wikipedia_dict:\n",
    "            self.wikipedia_dict[key] = self.get_wikipedia_dict(time_id, category_id)\n",
    "        return self.wikipedia_dict[key]\n",
    "\n",
    "    def get_wikipedia_dict_title(self, time_id, category_id, filename):\n",
    "        \"\"\"Return the Wikipedia title for a filename (KeyError if unknown).\n",
    "\n",
    "        e.g. \"20100408\", \"british-films\", \"Monty_Python_s_The_Meaning_of_Life.txt\"\n",
    "        \"\"\"\n",
    "        return self._get_cached_wikipedia_dict(time_id, category_id)[filename]\n",
    "\n",
    "    def get_wikipedia_dict_filename(self, time_id, category_id, title):\n",
    "        \"\"\"Return the first filename whose title matches (ValueError if none).\n",
    "\n",
    "        e.g. \"20100408\", \"british-films\", \"Monty Python's The Meaning of Life\"\n",
    "        \"\"\"\n",
    "        index = self._get_cached_wikipedia_dict(time_id, category_id)\n",
    "        for filename, known_title in index.items():\n",
    "            if known_title == title:\n",
    "                return filename\n",
    "        raise ValueError(repr(title) + \" is not a title in \" + time_id + \"-\" + category_id)\n",
    "\n",
    "    # Embeddings\n",
    "\n",
    "    def load_embeddings(self, file_id, print_filename=True, print_info=True):\n",
    "        \"\"\"Load an embeddings file with numpy.loadtxt and return the array.\n",
    "\n",
    "        e.g. \"20100408-british-films\"\n",
    "        \"\"\"\n",
    "        filename = os.path.join(self.embeddings_directory, file_id + \".txt\")\n",
    "        if print_filename:\n",
    "            print(filename)\n",
    "        embeddings = numpy.loadtxt(filename)\n",
    "        if print_info:\n",
    "            print(embeddings.shape, type(embeddings))\n",
    "        return embeddings\n",
    "\n",
    "    # Index embeddings filenames\n",
    "\n",
    "    def _get_cached_embeddings_index(self, category_id):\n",
    "        \"\"\"Return the lines of <category_id>.txt, reading the file on first use.\"\"\"\n",
    "        if category_id not in self.embeddings_dict:\n",
    "            with open(os.path.join(self.embeddings_directory, category_id + \".txt\")) as f:\n",
    "                self.embeddings_dict[category_id] = f.read().splitlines()\n",
    "        return self.embeddings_dict[category_id]\n",
    "\n",
    "    def get_embeddings_dict_filename(self, category_id, index):\n",
    "        \"\"\"Return the filename stored at position index (IndexError if out of range).\n",
    "\n",
    "        e.g. \"british-films\", 189\n",
    "        \"\"\"\n",
    "        return self._get_cached_embeddings_index(category_id)[index]\n",
    "\n",
    "    def get_embeddings_dict_index(self, category_id, filename):\n",
    "        \"\"\"Return the position of filename in the index (ValueError if absent).\n",
    "\n",
    "        e.g. \"british-films\", \"Monty_Python_s_The_Meaning_of_Life.txt\"\n",
    "        \"\"\"\n",
    "        return self._get_cached_embeddings_index(category_id).index(filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the DataAccess class defined in this notebook instead of the python file\n",
    "\n",
    "if True:\n",
    "    source_texts_directory = \"/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/\"\n",
    "    embeddings_directory = \"/home/eml4u/EML4U/data/wikipedia-embeddings/\"\n",
    "    data_accessor = DataAccess(source_texts_directory, embeddings_directory)\n",
    "\n",
    "    # points of time\n",
    "    id_a = \"20100408\"\n",
    "    id_b = \"20201101\"\n",
    "    # category ids\n",
    "    id_american = \"american-films\"\n",
    "    id_british = \"british-films\"\n",
    "    id_indian = \"indian-films\"\n",
    "    # file ids\n",
    "    id_american_a = id_a + \"-\" + id_american\n",
    "    id_american_b = id_b + \"-\" + id_american\n",
    "    id_british_a = id_a + \"-\" + id_british\n",
    "    id_british_b = id_b + \"-\" + id_british\n",
    "    id_indian_a = id_a + \"-\" + id_indian\n",
    "    id_indian_b = id_b + \"-\" + id_indian\n",
    "\n",
    "    filename = \"Monty_Python_s_The_Meaning_of_Life.txt\"\n",
    "    title = \"Monty Python's The Meaning of Life\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/20100408-british-films/Monty_Python_s_The_Meaning_of_Life.txt\n",
      "19383\n",
      "\n",
      "2704\n",
      "\n",
      "Monty Python's The Meaning of Life\n",
      "\n",
      "Monty_Python_s_The_Meaning_of_Life.txt\n",
      "\n",
      "/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-british-films.txt\n",
      "(2147, 768) <class 'numpy.ndarray'>\n",
      "(2147, 768)\n",
      "\n",
      "Monty_Python_s_The_Meaning_of_Life.txt\n",
      "\n",
      "189\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Examples\n",
    "\n",
    "# Example: Imports and definitions (when using data_access.py instead of the cells above)\n",
    "if False:\n",
    "    import data_access\n",
    "    source_texts_directory = \"/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/\"\n",
    "    embeddings_directory = \"/home/eml4u/EML4U/data/wikipedia-embeddings/\"\n",
    "    data_accessor = data_access.DataAccess(source_texts_directory, embeddings_directory)\n",
    "\n",
    "    id_a = \"20100408\"\n",
    "    id_british = \"british-films\"\n",
    "    id_british_a = id_a + \"-\" + id_british\n",
    "    filename = \"Monty_Python_s_The_Meaning_of_Life.txt\"\n",
    "    title = \"Monty Python's The Meaning of Life\"\n",
    "\n",
    "# Source texts from Wikipedia\n",
    "\n",
    "# Example: Read source text\n",
    "if True:\n",
    "    print(len(data_accessor.read_source_text(id_british_a, filename)))\n",
    "    print()\n",
    "\n",
    "# Index filenames <-> Wikipedia titles\n",
    "\n",
    "# Example: Load the filename -> title index\n",
    "if True:\n",
    "    print(len(data_accessor.get_wikipedia_dict(id_a, id_british)))\n",
    "    print()\n",
    "\n",
    "# Example: Look up title by filename\n",
    "if True:\n",
    "    print(data_accessor.get_wikipedia_dict_title(id_a, id_british, filename))\n",
    "    print()\n",
    "\n",
    "# Example: Look up filename by title\n",
    "if True:\n",
    "    print(data_accessor.get_wikipedia_dict_filename(id_a, id_british, title))\n",
    "    print()\n",
    "\n",
    "# Embeddings\n",
    "\n",
    "# Example: Load embeddings\n",
    "if True:\n",
    "    print(data_accessor.load_embeddings(id_british_a).shape)\n",
    "    print()\n",
    "\n",
    "# Index embeddings filenames\n",
    "\n",
    "# Example: Look up filename by index\n",
    "if True:\n",
    "    print(data_accessor.get_embeddings_dict_filename(id_british, 189))\n",
    "    print()\n",
    "\n",
    "# Example: Look up index by filename\n",
    "if True:\n",
    "    print(data_accessor.get_embeddings_dict_index(id_british, filename))\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (EML4U)",
   "language": "python",
   "name": "eml4u"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}