{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Word frequencies in digitised journals\n", "\n", "This notebook uses word frequency to explore the [OCRd texts harvested](Download-text-for-all-digitised-journals.ipynb) from Trove's digitised journals. More documentation coming..." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", " This notebook is in draft form and may not be complete or up-to-date.\n", "
" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import re\n", "\n", "# import tarfile\n", "import zipfile\n", "from io import BytesIO\n", "from pathlib import Path\n", "\n", "import altair as alt\n", "import ipywidgets as widgets\n", "import numpy as np\n", "import pandas as pd\n", "import requests\n", "from IPython.display import display\n", "from sklearn.feature_extraction.text import CountVectorizer" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Select a journal\n", "\n", "Create a dropdown widget to select a digitised journal. The cells below will use this widget to get the value of the currently selected journal." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load details of digitised journals from CSV\n", "df_journals = pd.read_csv(\"digital-journals-with-text-20220831.csv\").sort_values(\n", " by=\"title\"\n", ")\n", "journal_list = [\n", " (f\"{j['title']} ({j['issues_with_text']} issues)\", j[\"directory\"])\n", " for j in df_journals[[\"title\", \"directory\", \"issues_with_text\"]].to_dict(\"records\")\n", "]\n", "journals = widgets.Dropdown(options=journal_list, disabled=False)\n", "display(journals)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Download all the issues of the journal" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_docs_path(journal):\n", " path = os.path.join(\"downloads\", journal, \"texts\")\n", " docs_path = [p for p in sorted(Path(path).glob(\"*.txt\"))]\n", " return docs_path\n", "\n", "\n", "def download_journal(journal):\n", " path = os.path.join(\"downloads\", journal)\n", " os.makedirs(path, exist_ok=True)\n", " params = {\"path\": f\"/{journal}/texts\"}\n", " response = requests.get(\n", " \"https://cloudstor.aarnet.edu.au/plus/s/QOmnqpGQCNCSC2h/download\", params=params\n", " )\n", " zipped = zipfile.ZipFile(BytesIO(response.content))\n", " zipped.extractall(path)\n", " print(f\"{len(get_docs_path(journal))} issues downloaded\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "download_journal(journals.value)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Calculate word frequencies" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_docs(journal):\n", " docs_path = get_docs_path(journal)\n", " for p in docs_path:\n", " yield p.read_text(encoding=\"utf-8\").strip()\n", "\n", "\n", "def get_file_names(journal):\n", " return [p.stem for p in get_docs_path(journal)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "# Remove numbers\n", "# preprocessor = lambda x: re.sub(r'\\b(\\w*\\d\\w*)\\b', 'NUM', x.lower())\n", "vectorizer = CountVectorizer(\n", " stop_words=\"english\",\n", " ngram_range=(1, 1),\n", " token_pattern=\"[a-z]{3,}\",\n", " lowercase=True,\n", " max_features=100000,\n", ")\n", "\n", "X_freq = np.asarray(vectorizer.fit_transform(get_docs(journals.value)).todense())\n", "df_freq = pd.DataFrame(\n", " X_freq,\n", " columns=vectorizer.get_feature_names_out(),\n", " index=get_file_names(journals.value),\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "# Save to CSV\n", "df_freq.to_csv(f\"{journals.value}-word-frequencies.csv\")" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "np.save(f\"{journals.value}-word-frequencies.npy\", X_freq)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "df_freq.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "totals = df_freq.sum().to_frame().reset_index()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "totals.columns = [\"word\", \"count\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "totals.loc[totals[\"count\"] > 180].shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "totals" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Most frequent words in the journal\n", "\n", "Change the number as you wish." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "df_freq.sum().nlargest(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Frequency of a specific word" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "word = \"knoll\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "# If the word's not in the index you'll get a KeyError -- don't worry about it, just try another word!!\n", "df_freq[word].sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Find the issue that this word occurs in most frequently." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "df_freq[word].idxmax()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Most frequent words per issue\n", "\n", "Get the most frequent words for each issue of the journal. Set `num_words` to the number of words you want to show." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "# The number of words you want to show\n", "num_words = 20\n", "top_words = pd.DataFrame(\n", " {\n", " n: df_freq.T[col].nlargest(num_words).index.tolist()\n", " for n, col in enumerate(df_freq.T)\n", " }\n", ").T\n", "top_words.index = get_file_names(journals.value)\n", "top_words.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the top words for a specific issue." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "top_words.loc[top_words.index.str.contains(\"nla.obj-9139951\")]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Track word frequencies over time" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "def extract_year(name):\n", " \"\"\"\n", " Try to extract the year from the filename.\n", " \"\"\"\n", " try:\n", " years = re.findall(r\"-((?:18|19|20)\\d{2})-\", name)\n", " year = int(years[-1])\n", " except IndexError:\n", " year = 0\n", " print(f\"YEAR NOT FOUND: {name}\")\n", " return year\n", "\n", "\n", "df_freq[\"year\"] = df_freq.apply(lambda x: extract_year(x.name), axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "# Top words per year\n", "year_groups = df_freq.groupby(by=\"year\")\n", "year_group_totals = year_groups.sum()\n", "df_years = pd.DataFrame(\n", " {\n", " n: year_group_totals.T[col].nlargest(10).index.tolist()\n", " for n, col in enumerate(year_group_totals.T)\n", " }\n", ").T\n", "df_years.index = [name for name, _ in year_groups]\n", "df_years.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "year_group_totals.to_csv(\"words_by_year.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "year_group_totals[\"total_words\"] = year_group_totals.sum(axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "def words_by_year(df, words):\n", " df_words = pd.DataFrame()\n", " for word in words:\n", " try:\n", " df_word = (\n", " df.groupby(by=\"year\")\n", " .sum()[word]\n", " .to_frame()\n", " .reset_index()\n", " .rename({word: \"count\"}, axis=1)\n", " )\n", " except KeyError:\n", " print(f\"'{word}' not found\")\n", " else:\n", " df_word[\"word\"] = word\n", " df_words = df_words.append(df_word, ignore_index=True)\n", " return df_words" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make a list of words that we want to compare." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "words = [\"nation\", \"chinese\", \"republic\", \"worker\", \"unions\", \"union\", \"labor\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the data for those words." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "df_words = words_by_year(df_freq, words)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a faceted line chart." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "alt.Chart(df_words.loc[df_words[\"year\"] > 0]).mark_line().encode(\n", " x=alt.X(\"year:Q\", axis=alt.Axis(format=\"c\", title=\"Year\")),\n", " y=\"count:Q\",\n", " color=\"word:N\",\n", " facet=\"word:N\",\n", ").properties(width=700, height=100, columns=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Or perhaps you prefer bubblelines." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "# Create a chart\n", "alt.Chart(df_words.loc[df_words[\"year\"] > 0]).mark_circle(\n", " # Style the circles\n", " opacity=0.8,\n", " stroke=\"black\",\n", " strokeWidth=1,\n", ").encode(\n", " # Year on the X axis\n", " x=alt.X(\"year:O\", axis=alt.Axis(format=\"c\", title=\"Year\", labelAngle=0)),\n", " # Object type on the Y axis\n", " y=alt.Y(\"word:N\", title=\"Word\"),\n", " # Size of the circles represents the number of objects\n", " size=alt.Size(\n", " \"count:Q\",\n", " scale=alt.Scale(range=[0, 1000]),\n", " legend=alt.Legend(title=\"Frequency\"),\n", " ),\n", " # Color the circles by object type\n", " color=alt.Color(\"word:N\", legend=None),\n", " # More details on hover\n", " tooltip=[\n", " alt.Tooltip(\"word:N\", title=\"Word\"),\n", " alt.Tooltip(\"year:O\", title=\"Year\"),\n", " alt.Tooltip(\"count:Q\", title=\"Frequency\", format=\",\"),\n", " ],\n", ").properties(\n", " width=700, height=300\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "rocrate": { "author": [ { "mainEntityOfPage": "https://timsherratt.au", "name": "Sherratt, Tim", "orcid": "https://orcid.org/0000-0001-7956-4498" } ], "name": "Word frequencies in digitised journals" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 4 }