{ "cells": [ { "cell_type": "markdown", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "# Explore harvested text files" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /home/tim/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package punkt to /home/tim/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] } ], "source": [ "import os\n", "import zipfile\n", "from pathlib import Path\n", "\n", "import altair as alt\n", "import nltk\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "nltk.download(\"stopwords\")\n", "nltk.download(\"punkt\")\n", "\n", "stopwords = nltk.corpus.stopwords.words(\"english\")\n", "stopwords += [\"tho\", \"tbe\"]\n", "\n", "# Are you using Jupyter Lab?\n", "# If so either don't run this cell or comment out the line below\n", "\n", "# alt.renderers.enable('notebook')\n", "\n", "# If you forget, run this cell, and then get strange warnings when you make a chart,\n", "# uncomment the following line and run this cell to reset the chart renderer\n", "\n", "# alt.renderers.enable('default')\n", "\n", "# alt.data_transformers.enable('json')\n", "# nltk.download('stopwords')\n", "# nltk.download('punkt')\n", "# stopwords = nltk.corpus.stopwords.words('english')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "%%capture\n", "# Load variables from the .env file if it exists\n", "# Use %%capture to suppress messages\n", "%load_ext dotenv\n", "%dotenv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "# Import a harvest zip file you've created previously\n", "# First upload the zip file to the data directory, then run this cell\n", "\n", "for zipped in sorted(Path(\"data\").glob(\"*.zip\")):\n", " print(f\"Unzipping {zipped}...\")\n", " with zipfile.ZipFile(zipped, \"r\") as zip_file:\n", " zip_file.extractall(Path(f\"data/{zipped.stem}\"))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "def get_latest_harvest():\n", " \"\"\"\n", " Get the timestamp of the most recent harvest.\n", " \"\"\"\n", " harvests = sorted(\n", " [d for d in Path(\"data\").iterdir() if d.is_dir() and not d.name.startswith(\".\")]\n", " )\n", " try:\n", " harvest = harvests[-1]\n", " except IndexError:\n", " print(\"No harvests!\")\n", " harvest = None\n", " return harvest" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "def get_docs(harvest):\n", " docs_path = get_docs_path(harvest)\n", " for p in docs_path:\n", " yield p.read_text(encoding=\"utf-8\").strip()\n", "\n", "\n", "def get_docs_path(harvest):\n", " path = Path(harvest, \"text\")\n", " docs_path = [p for p in sorted(path.glob(\"*.txt\"))]\n", " return docs_path\n", "\n", "\n", "def get_file_names(harvest):\n", " return [p.stem for p in get_docs_path(harvest)]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "# In testing environment, open a test harvest\n", "if os.getenv(\"GW_STATUS\") == \"dev\":\n", " harvest = Path(\"data\", \"1655952487\")\n", "# Otherwise open most recent harvest\n", "# Supply a harvest directory name to open a specific harvest\n", "else:\n", " harvest = get_latest_harvest()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "vectorizer = CountVectorizer(\n", " stop_words=stopwords, max_features=10000, ngram_range=(1, 1)\n", ")\n", "# preprocessor = lambda x: re.sub(r'(\\d[\\d\\.])+', 'NUM', x.lower())\n", "X_freq = np.asarray(vectorizer.fit_transform(get_docs(harvest)).todense())\n", "df_freq = pd.DataFrame(\n", " X_freq, columns=vectorizer.get_feature_names_out(), index=get_file_names(harvest)\n", ")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "st 68466\n", "street 62029\n", "good 41017\n", "rooms 39883\n", "new 32997\n", "apply 30852\n", "mr 30242\n", "co 28497\n", "wanted 25910\n", "10 25748\n", "room 25505\n", "house 25309\n", "sale 24637\n", "office 22547\n", "per 21527\n", "two 19313\n", "terms 18843\n", "one 18480\n", "land 18250\n", "brisbane 18066\n", "dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_freq.sum().nlargest(20)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
level_0level_10
00018541005-13-47985402
10018550403-13-48061940
20018561031-13-71392350
30018571126-13-71425430
40018580710-13-72973593
............
30659995zu19541112-969-2047594810
30659996zu19541116-12-506192010
30659997zu19541119-470-1352561550
30659998zu19870909-11-1221209460
30659999zu19880228-11-1019792920
\n", "

30660000 rows × 3 columns

\n", "
" ], "text/plain": [ " level_0 level_1 0\n", "0 00 18541005-13-4798540 2\n", "1 00 18550403-13-4806194 0\n", "2 00 18561031-13-7139235 0\n", "3 00 18571126-13-7142543 0\n", "4 00 18580710-13-7297359 3\n", "... ... ... ..\n", "30659995 zu 19541112-969-204759481 0\n", "30659996 zu 19541116-12-50619201 0\n", "30659997 zu 19541119-470-135256155 0\n", "30659998 zu 19870909-11-122120946 0\n", "30659999 zu 19880228-11-101979292 0\n", "\n", "[30660000 rows x 3 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_freq.unstack().to_frame().reset_index().dropna(axis=0, subset=[0])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 6.34 s, sys: 98 µs, total: 6.34 s\n", "Wall time: 6.34 s\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789
18541005-13-4798540mrstreetbocomelbourneperhoteldaynearmrs
18550403-13-4806194johnwmjamesmrsgeothosthomashenrymissjno
18561031-13-7139235streetnovmrsaleapplylandconearleteast
18571126-13-7142543machinemadelargemessrsoneyeartwoironprizesthree
18580710-13-7297359july12streetsaleclocksellcoauctiontermsmonday
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", "18541005-13-4798540 mr street bo co melbourne per hotel \n", "18550403-13-4806194 john wm james mrs geo thos thomas \n", "18561031-13-7139235 street nov mr sale apply land co \n", "18571126-13-7142543 machine made large messrs one year two \n", "18580710-13-7297359 july 12 street sale clock sell co \n", "\n", " 7 8 9 \n", "18541005-13-4798540 day near mrs \n", "18550403-13-4806194 henry miss jno \n", "18561031-13-7139235 near let east \n", "18571126-13-7142543 iron prizes three \n", "18580710-13-7297359 auction terms monday " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "# The number of words you want to show\n", "num_words = 10\n", "top_words = pd.DataFrame(\n", " {\n", " n: df_freq.T[col].nlargest(num_words).index.tolist()\n", " for n, col in enumerate(df_freq.T)\n", " }\n", ").T\n", "top_words.index = get_file_names(harvest)\n", "top_words.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
18541005-13-479854018550403-13-480619418561031-13-713923518571126-13-714254318580710-13-729735918590407-13-567908218590520-13-568143118590524-809-15483940318590812-67-6040558318640227-13-5744865...19530604-97-6249270419530822-35-1838179219531009-687-14566758819531015-379-10066547719540424-77-5731683019541112-969-20475948119541116-12-5061920119541119-470-13525615519870909-11-12212094619880228-11-101979292
002000334105...01300000000
000311532512358018...0262601600100
0010000010000...0000000000
0090000000000...0000000000
011100001101...0601000000
..................................................................
yy0000100100...0000000000
zealand0000411106...0100000000
zeehan0000000000...0000000000
zinc0000330000...0400000000
zu0000000000...0000000000
\n", "

10000 rows × 3066 columns

\n", "
" ], "text/plain": [ " 18541005-13-4798540 18550403-13-4806194 18561031-13-7139235 \\\n", "00 2 0 0 \n", "000 3 11 5 \n", "001 0 0 0 \n", "009 0 0 0 \n", "01 1 1 0 \n", "... ... ... ... \n", "yy 0 0 0 \n", "zealand 0 0 0 \n", "zeehan 0 0 0 \n", "zinc 0 0 0 \n", "zu 0 0 0 \n", "\n", " 18571126-13-7142543 18580710-13-7297359 18590407-13-5679082 \\\n", "00 0 3 3 \n", "000 3 25 12 \n", "001 0 0 1 \n", "009 0 0 0 \n", "01 0 0 0 \n", "... ... ... ... \n", "yy 0 1 0 \n", "zealand 0 4 1 \n", "zeehan 0 0 0 \n", "zinc 0 3 3 \n", "zu 0 0 0 \n", "\n", " 18590520-13-5681431 18590524-809-154839403 18590812-67-60405583 \\\n", "00 4 1 0 \n", "000 35 8 0 \n", "001 0 0 0 \n", "009 0 0 0 \n", "01 1 1 0 \n", "... ... ... ... \n", "yy 0 1 0 \n", "zealand 1 1 0 \n", "zeehan 0 0 0 \n", "zinc 0 0 0 \n", "zu 0 0 0 \n", "\n", " 18640227-13-5744865 ... 19530604-97-62492704 19530822-35-18381792 \\\n", "00 5 ... 0 13 \n", "000 18 ... 0 262 \n", "001 0 ... 0 0 \n", "009 0 ... 0 0 \n", "01 1 ... 0 6 \n", "... ... ... ... ... \n", "yy 0 ... 0 0 \n", "zealand 6 ... 0 1 \n", "zeehan 0 ... 0 0 \n", "zinc 0 ... 0 4 \n", "zu 0 ... 0 0 \n", "\n", " 19531009-687-145667588 19531015-379-100665477 19540424-77-57316830 \\\n", "00 0 0 0 \n", "000 6 0 16 \n", "001 0 0 0 \n", "009 0 0 0 \n", "01 0 1 0 \n", "... ... ... ... \n", "yy 0 0 0 \n", "zealand 0 0 0 \n", "zeehan 0 0 0 \n", "zinc 0 0 0 \n", "zu 0 0 0 \n", "\n", " 19541112-969-204759481 19541116-12-50619201 19541119-470-135256155 \\\n", "00 0 0 0 \n", "000 0 0 1 \n", "001 0 0 0 \n", "009 0 0 0 \n", "01 0 0 0 \n", "... ... ... ... \n", "yy 0 0 0 \n", "zealand 0 0 0 \n", "zeehan 0 0 0 \n", "zinc 0 0 0 \n", "zu 0 0 0 \n", "\n", " 19870909-11-122120946 19880228-11-101979292 \n", "00 0 0 \n", "000 0 0 \n", "001 0 0 \n", "009 0 0 \n", "01 0 0 \n", "... ... ... \n", "yy 0 0 \n", "zealand 0 0 \n", "zeehan 0 0 \n", "zinc 0 0 \n", "zu 0 0 \n", "\n", "[10000 rows x 3066 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_freq.T" ] }, { "cell_type": "markdown", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## Add a 'year' column to the dataframe\n", "\n", "Each file name includes the date on which the article was published. For example, `18601224-13-5696044` was published on 24 December 1860. We can easily extract the year by just slicing the first four characters off the index." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "df_freq[\"article_year\"] = df_freq.index.str.slice(0, 4)" ] }, { "cell_type": "markdown", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## Most frequent words each year" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "# Group by year and sum the word counts\n", "year_groups = df_freq.groupby(by=\"article_year\")\n", "year_group_totals = year_groups.sum()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "# Reshape so that we have columns for year, word, and count\n", "words_by_year = year_group_totals.unstack().to_frame().reset_index()\n", "words_by_year.columns = [\"word\", \"year\", \"count\"]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "top_words_by_year = (\n", " words_by_year.sort_values(\"count\", ascending=False)\n", " .groupby(by=[\"year\"])\n", " .head(10)\n", " .reset_index(drop=True)\n", ")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "word\n", "street 55\n", "mr 41\n", "st 38\n", "good 35\n", "new 31\n", "co 28\n", "rooms 24\n", "10 22\n", "mrs 21\n", "one 17\n", "apply 17\n", "sale 14\n", "years 13\n", "office 13\n", "wanted 12\n", "per 10\n", "room 10\n", "11 10\n", "house 9\n", "loving 8\n", "would 8\n", "brisbane 8\n", "may 8\n", "day 8\n", "melbourne 7\n", "Name: count, dtype: int64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top_words_by_year[\"word\"].value_counts()[:25]" ] }, { "cell_type": "markdown", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## Visualise top ten words per year" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "alt.Chart(top_words_by_year).mark_bar().encode(\n", " y=alt.Y(\"word:N\", sort=\"-x\"), x=\"count:Q\", facet=alt.Facet(\"year\", columns=4)\n", ").properties(width=120, height=120).resolve_scale(x=\"independent\", y=\"independent\")" ] }, { "cell_type": "markdown", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## Visualise word frequencies over time" ] }, { "cell_type": "markdown", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "### Create a faceted chart" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "alt.Chart(\n", " words_by_year.loc[words_by_year[\"word\"].isin([\"storm\", \"cyclone\", \"snow\"])]\n", ").mark_line().encode(\n", " x=alt.X(\"year:Q\", axis=alt.Axis(format=\"c\", title=\"Year\")),\n", " y=\"count:Q\",\n", " color=\"word:N\",\n", " facet=alt.Facet(\"word:N\", columns=1),\n", ").properties(\n", " width=700, height=100\n", ").resolve_scale(\n", " y=\"independent\"\n", ")" ] }, { "cell_type": "markdown", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "----\n", "\n", "Created by [Tim Sherratt](https://timsherratt.org) ([@wragge](https://twitter.com/wragge)) for the [GLAM Workbench](https://github.com/glam-workbench/). \n", "Support this project by [becoming a GitHub sponsor](https://github.com/sponsors/wragge?o=esb).\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "rocrate": { "author": [ { "name": "Sherratt, Tim", "orcid": "https://orcid.org/0000-0001-7956-4498" } ], "name": "Explore harvested text files" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 4 }