{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Lexical Dispersion Plot Jupyter Notebook\n", "## Dependency Installation and Import Required Modules" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%pip install matplotlib nltk\n", "import glob\n", "import matplotlib.pyplot as plt\n", "from nltk.tokenize import TreebankWordTokenizer\n", "from string import punctuation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Modify the list of words to be searched for" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "words = ['war', 'love', 'death', 'life', 'marry', 'fight', 'king', 'queen']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Function Definitions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def lexical_dispersion_plot_directory(directory):\n", " for file_path in glob.glob(directory, recursive=True):\n", " with open(file_path, \"r\") as file:\n", " text = file.read()\n", "\n", " # Custom Tokenizer\n", " # Start of Go-To Tokenizer for English - Modified TreebankWordTokenize\n", " tokens = TreebankWordTokenizer().tokenize(text)\n", " for word in tokens:\n", " word = word.lower().strip(punctuation)\n", " tokens = list(filter(None, tokens)) \n", " ## EOF Function\n", "\n", " points = [(x, y) for x in range(len(tokens))\n", " for y in range(len(words)) if tokens[x] == words[y]]\n", "\n", " if points:\n", " x, y = zip(*points)\n", " else:\n", " x = y = ()\n", " \n", " print(f\"Lexical Dispersion Plot for {file_path}\")\n", "\n", " plt.figure(figsize=(8, 6))\n", " plt.plot(x, y, \"rx\")\n", " plt.yticks(range(len(words)), words)\n", " plt.ylim(-1, len(words))\n", " plt.title(f\"Lexical Dispersion Plot for {file_path}\")\n", " plt.xlabel(\"Word Offset\")\n", " plt.show()\n", "\n", "def lexical_dispersion_plot_file(file_path):\n", " with open(file_path, \"r\") as file:\n", " text = file.read()\n", "\n", " # Custom Tokenizer\n", " # Start of Go-To Tokenizer for English - Modified TreebankWordTokenize\n", " tokens = TreebankWordTokenizer().tokenize(text)\n", " for word in tokens:\n", " word = word.lower().strip(punctuation)\n", " tokens = list(filter(None, tokens)) \n", " ## EOF Function\n", "\n", " points = [(x, y) for x in range(len(tokens))\n", " for y in range(len(words)) if tokens[x] == words[y]]\n", "\n", " if points:\n", " x, y = zip(*points)\n", " else:\n", " x = y = ()\n", " \n", " print(f\"Lexical Dispersion Plot for {file_path}\")\n", "\n", " plt.figure(figsize=(8, 6))\n", " plt.plot(x, y, \"rx\")\n", " plt.yticks(range(len(words)), words)\n", " plt.ylim(-1, len(words))\n", " plt.title(f\"Lexical Dispersion Plot for {file_path}\")\n", " plt.xlabel(\"Word Offset\")\n", " plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Generating Lexical Dispersion Plot for Shakespeare & Marlowe Corpora\n", "### One file example" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lexical_dispersion_plot_file(\"books/Shakespeare-corpus/Ado Much Ado About Nothing.txt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The whole corpus\n", "I commented out the lines in order to keep the page more tidy as the corpora contain a lot of text files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# lexical_dispersion_plot_directory(\"books/Shakespeare-corpus/*.txt\")\n", "# 
lexical_dispersion_plot_directory(\"books/Marlowe-corpus/*.txt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Custom Corpus Analysis\n", "### Lexical Dispersion Plot for a single file" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lexical_dispersion_plot_file(\"books/custom-corpus/1HVI-MIT (CL).txt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Alternatively you may analyze a whole directory" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lexical_dispersion_plot_directory(\"books/custom-corpus/*.txt\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 4 }