{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Lexical Dispersion Plot Jupyter Notebook\n", "## Dependency Installation and Import Required Modules" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%pip install matplotlib nltk\n", "import glob\n", "import matplotlib.pyplot as plt\n", "from nltk.tokenize import TreebankWordTokenizer\n", "from string import punctuation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Modify the list of words to be searched for" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "words = ['war', 'love', 'death', 'life', 'marry', 'fight', 'king', 'queen']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Function Definitions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def lexical_dispersion_plot_directory(directory):\n", " for file_path in glob.glob(directory, recursive=True):\n", " with open(file_path, \"r\") as file:\n", " text = file.read()\n", "\n", " # Custom Tokenizer\n", " # Start of Go-To Tokenizer for English - Modified TreebankWordTokenize\n", " tokens = TreebankWordTokenizer().tokenize(text)\n", " for word in tokens:\n", " word = word.lower().strip(punctuation)\n", " tokens = list(filter(None, tokens)) \n", " ## EOF Function\n", "\n", " points = [(x, y) for x in range(len(tokens))\n", " for y in range(len(words)) if tokens[x] == words[y]]\n", "\n", " if points:\n", " x, y = zip(*points)\n", " else:\n", " x = y = ()\n", " \n", " print(f\"Lexical Dispersion Plot for {file_path}\")\n", "\n", " plt.figure(figsize=(8, 6))\n", " plt.plot(x, y, \"rx\")\n", " plt.yticks(range(len(words)), words)\n", " plt.ylim(-1, len(words))\n", " plt.title(f\"Lexical Dispersion Plot for {file_path}\")\n", " plt.xlabel(\"Word Offset\")\n", " plt.show()\n", "\n", "def lexical_dispersion_plot_file(file_path):\n", " with open(file_path, \"r\") as file:\n", " text = file.read()\n", "\n", " # Custom Tokenizer\n", " # Start of Go-To Tokenizer for English - Modified TreebankWordTokenize\n", " tokens = TreebankWordTokenizer().tokenize(text)\n", " for word in tokens:\n", " word = word.lower().strip(punctuation)\n", " tokens = list(filter(None, tokens)) \n", " ## EOF Function\n", "\n", " points = [(x, y) for x in range(len(tokens))\n", " for y in range(len(words)) if tokens[x] == words[y]]\n", "\n", " if points:\n", " x, y = zip(*points)\n", " else:\n", " x = y = ()\n", " \n", " print(f\"Lexical Dispersion Plot for {file_path}\")\n", "\n", " plt.figure(figsize=(8, 6))\n", " plt.plot(x, y, \"rx\")\n", " plt.yticks(range(len(words)), words)\n", " plt.ylim(-1, len(words))\n", " plt.title(f\"Lexical Dispersion Plot for {file_path}\")\n", " plt.xlabel(\"Word Offset\")\n", " plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Generating Lexical Dispersion Plot for Shakespeare & Marlowe Corpora\n", "### One file example" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lexical_dispersion_plot_file(\"books/Shakespeare-corpus/Ado Much Ado About Nothing.txt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The whole corpus\n", "I commented out the lines in order to keep the page more tidy as the corpora contain a lot of text files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# lexical_dispersion_plot_directory(\"books/Shakespeare-corpus/*.txt\")\n", "# 
lexical_dispersion_plot_directory(\"books/Marlowe-corpus/*.txt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Custom Corpus Analysis\n", "### Lexical Dispersion Plot for a single file" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lexical_dispersion_plot_file(\"books/custom-corpus/1HVI-MIT (CL).txt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Alternatively you may analyze a whole directory" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lexical_dispersion_plot_directory(\"books/custom-corpus/*.txt\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 4 }