{ "cells": [ { "cell_type": "markdown", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "# Explore harvested text files" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /home/tim/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package punkt to /home/tim/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] } ], "source": [ "import os\n", "import zipfile\n", "from pathlib import Path\n", "\n", "import altair as alt\n", "import nltk\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "nltk.download(\"stopwords\")\n", "nltk.download(\"punkt\")\n", "\n", "stopwords = nltk.corpus.stopwords.words(\"english\")\n", "stopwords += [\"tho\", \"tbe\"]\n", "\n", "# Are you using Jupyter Lab?\n", "# If so either don't run this cell or comment out the line below\n", "\n", "# alt.renderers.enable('notebook')\n", "\n", "# If you forget, run this cell, and then get strange warnings when you make a chart,\n", "# uncomment the following line and run this cell to reset the chart renderer\n", "\n", "# alt.renderers.enable('default')\n", "\n", "# alt.data_transformers.enable('json')\n", "# nltk.download('stopwords')\n", "# nltk.download('punkt')\n", "# stopwords = nltk.corpus.stopwords.words('english')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "%%capture\n", "# Load variables from the .env file if it exists\n", "# Use %%capture to suppress messages\n", "%load_ext dotenv\n", "%dotenv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "# Import a harvest zip file you've created previously\n", "# First upload the zip file to the data directory, then run this cell\n", "\n", "for zipped in sorted(Path(\"data\").glob(\"*.zip\")):\n", " print(f\"Unzipping {zipped}...\")\n", " with zipfile.ZipFile(zipped, \"r\") as zip_file:\n", " zip_file.extractall(Path(f\"data/{zipped.stem}\"))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "def get_latest_harvest():\n", " \"\"\"\n", " Get the timestamp of the most recent harvest.\n", " \"\"\"\n", " harvests = sorted(\n", " [d for d in Path(\"data\").iterdir() if d.is_dir() and not d.name.startswith(\".\")]\n", " )\n", " try:\n", " harvest = harvests[-1]\n", " except IndexError:\n", " print(\"No harvests!\")\n", " harvest = None\n", " return harvest" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "def get_docs(harvest):\n", " docs_path = get_docs_path(harvest)\n", " for p in docs_path:\n", " yield p.read_text(encoding=\"utf-8\").strip()\n", "\n", "\n", "def get_docs_path(harvest):\n", " path = Path(harvest, \"text\")\n", " docs_path = [p for p in sorted(path.glob(\"*.txt\"))]\n", " return docs_path\n", "\n", "\n", "def get_file_names(harvest):\n", " return [p.stem for p in get_docs_path(harvest)]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "# In testing environment, open a test harvest\n", "if os.getenv(\"GW_STATUS\") == \"dev\":\n", " harvest = Path(\"data\", \"1655952487\")\n", "# Otherwise open most recent harvest\n", "# Supply a harvest directory name to open a specific harvest\n", "else:\n", " harvest = get_latest_harvest()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "vectorizer = CountVectorizer(\n", " stop_words=stopwords, max_features=10000, ngram_range=(1, 1)\n", ")\n", "# preprocessor = lambda x: re.sub(r'(\\d[\\d\\.])+', 'NUM', x.lower())\n", "X_freq = np.asarray(vectorizer.fit_transform(get_docs(harvest)).todense())\n", "df_freq = pd.DataFrame(\n", " X_freq, columns=vectorizer.get_feature_names_out(), index=get_file_names(harvest)\n", ")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "st 68466\n", "street 62029\n", "good 41017\n", "rooms 39883\n", "new 32997\n", "apply 30852\n", "mr 30242\n", "co 28497\n", "wanted 25910\n", "10 25748\n", "room 25505\n", "house 25309\n", "sale 24637\n", "office 22547\n", "per 21527\n", "two 19313\n", "terms 18843\n", "one 18480\n", "land 18250\n", "brisbane 18066\n", "dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_freq.sum().nlargest(20)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " | level_0 | \n", "level_1 | \n", "0 | \n", "
---|---|---|---|
0 | \n", "00 | \n", "18541005-13-4798540 | \n", "2 | \n", "
1 | \n", "00 | \n", "18550403-13-4806194 | \n", "0 | \n", "
2 | \n", "00 | \n", "18561031-13-7139235 | \n", "0 | \n", "
3 | \n", "00 | \n", "18571126-13-7142543 | \n", "0 | \n", "
4 | \n", "00 | \n", "18580710-13-7297359 | \n", "3 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
30659995 | \n", "zu | \n", "19541112-969-204759481 | \n", "0 | \n", "
30659996 | \n", "zu | \n", "19541116-12-50619201 | \n", "0 | \n", "
30659997 | \n", "zu | \n", "19541119-470-135256155 | \n", "0 | \n", "
30659998 | \n", "zu | \n", "19870909-11-122120946 | \n", "0 | \n", "
30659999 | \n", "zu | \n", "19880228-11-101979292 | \n", "0 | \n", "
30660000 rows × 3 columns
\n", "\n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "5 | \n", "6 | \n", "7 | \n", "8 | \n", "9 | \n", "
---|---|---|---|---|---|---|---|---|---|---|
18541005-13-4798540 | \n", "mr | \n", "street | \n", "bo | \n", "co | \n", "melbourne | \n", "per | \n", "hotel | \n", "day | \n", "near | \n", "mrs | \n", "
18550403-13-4806194 | \n", "john | \n", "wm | \n", "james | \n", "mrs | \n", "geo | \n", "thos | \n", "thomas | \n", "henry | \n", "miss | \n", "jno | \n", "
18561031-13-7139235 | \n", "street | \n", "nov | \n", "mr | \n", "sale | \n", "apply | \n", "land | \n", "co | \n", "near | \n", "let | \n", "east | \n", "
18571126-13-7142543 | \n", "machine | \n", "made | \n", "large | \n", "messrs | \n", "one | \n", "year | \n", "two | \n", "iron | \n", "prizes | \n", "three | \n", "
18580710-13-7297359 | \n", "july | \n", "12 | \n", "street | \n", "sale | \n", "clock | \n", "sell | \n", "co | \n", "auction | \n", "terms | \n", "monday | \n", "
\n", " | 18541005-13-4798540 | \n", "18550403-13-4806194 | \n", "18561031-13-7139235 | \n", "18571126-13-7142543 | \n", "18580710-13-7297359 | \n", "18590407-13-5679082 | \n", "18590520-13-5681431 | \n", "18590524-809-154839403 | \n", "18590812-67-60405583 | \n", "18640227-13-5744865 | \n", "... | \n", "19530604-97-62492704 | \n", "19530822-35-18381792 | \n", "19531009-687-145667588 | \n", "19531015-379-100665477 | \n", "19540424-77-57316830 | \n", "19541112-969-204759481 | \n", "19541116-12-50619201 | \n", "19541119-470-135256155 | \n", "19870909-11-122120946 | \n", "19880228-11-101979292 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
00 | \n", "2 | \n", "0 | \n", "0 | \n", "0 | \n", "3 | \n", "3 | \n", "4 | \n", "1 | \n", "0 | \n", "5 | \n", "... | \n", "0 | \n", "13 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
000 | \n", "3 | \n", "11 | \n", "5 | \n", "3 | \n", "25 | \n", "12 | \n", "35 | \n", "8 | \n", "0 | \n", "18 | \n", "... | \n", "0 | \n", "262 | \n", "6 | \n", "0 | \n", "16 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "
001 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
009 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
01 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "1 | \n", "... | \n", "0 | \n", "6 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
yy | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
zealand | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "4 | \n", "1 | \n", "1 | \n", "1 | \n", "0 | \n", "6 | \n", "... | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
zeehan | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
zinc | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "3 | \n", "3 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "4 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
zu | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
10000 rows × 3066 columns
\n", "