{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2d494c21-8813-4994-a157-0a2db545639a",
   "metadata": {},
   "source": [
    "# Files digitised in the last week\n",
    "\n",
    "Each Sunday I'm automatically harvesting details of files digitised by the NAA in the previous week. You can view the results [in this repository](https://github.com/wragge/naa-recently-digitised). This notebook analyses the most recent harvest to provide a summary of the results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "9cf6b2e0-ff28-4555-8a8f-42bae11abbd8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "from urllib.error import HTTPError\n",
    "\n",
    "import arrow\n",
    "import pandas as pd\n",
    "from IPython.display import display\n",
    "from recordsearch_data_scraper.scrapers import RSSeries\n",
    "from tqdm.auto import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "2415ae47-5ce6-4bbf-8bcc-3a726904e4e4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Harvested on Sunday, 15 January 2023.\n"
     ]
    }
   ],
   "source": [
    "# Find the date of the most recent Sunday\n",
    "today = arrow.now().to(\"Australia/Sydney\")\n",
    "# Today is Sunday and it's past 2pm so the harvest should have run\n",
    "if today.weekday() == 6 and today.time() >= datetime.time(14, 0, 0, 0):\n",
    "    harvest_day = today\n",
    "# Otherwise get last Sunday\n",
    "else:\n",
    "    harvest_day = arrow.now().to(\"Australia/Sydney\").shift(weekday=6).shift(weeks=-1)\n",
    "\n",
    "print(f'Harvested on {harvest_day.format(\"dddd, D MMMM YYYY\")}.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "4062207b-5ddf-4e69-ac96-439902624c41",
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    df = pd.read_csv(\n",
    "        f'https://raw.githubusercontent.com/wragge/naa-recently-digitised/master/data/digitised-week-ending-{harvest_day.format(\"YYYYMMDD\")}.csv'\n",
    "    )\n",
    "except HTTPError:\n",
    "    df = pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "1959e168-58c4-4e54-8c33-58b04afde2df",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0, 0)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "eba145be-32d6-4bb0-a49d-83b9264052a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not df.empty:\n",
    "    df[\"series\"].value_counts()[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "1383b682-c3da-435c-b07c-8b6674ffabca",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not df.empty:\n",
    "    series_list = list(df[\"series\"].unique())\n",
    "\n",
    "    cited_series = []\n",
    "    for series in tqdm(series_list):\n",
    "        data = RSSeries(\n",
    "            series, include_number_digitised=False, include_access_status=False\n",
    "        ).data\n",
    "        cited_series.append({\"series\": series, \"series_title\": data[\"title\"]})\n",
    "\n",
    "    df_titles = pd.merge(df, pd.DataFrame(cited_series), how=\"left\", on=\"series\")\n",
    "\n",
    "    with pd.option_context(\"display.max_colwidth\", 100):\n",
    "        df_titles = (\n",
    "            df_titles.value_counts([\"series\", \"series_title\"]).to_frame().reset_index()\n",
    "        )\n",
    "        df_titles.columns = [\"series\", \"series_title\", \"total\"]\n",
    "        display(df_titles[:20])\n",
    "        totals = \"\"\n",
    "        for title in df_titles[:20].itertuples():\n",
    "            totals += (\n",
    "                f\"{title.series}, {title.series_title}, {title.total} files digitised; \"\n",
    "            )\n",
    "        print(totals)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}