{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploring your TroveHarvester data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "<div class=\"alert alert-block alert-warning\">\n", " Under construction\n", "</div>" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd # makes manipulating the data easier\n", "# import plotly.offline as py # for charts\n", "# import plotly.graph_objs as go\n", "import altair as alt\n", "import wordcloud\n", "\n", "# py.init_notebook_mode() # initialise plotly\n", "alt.renderers.enable('notebook')\n", "\n", "# Make sure data directory exists\n", "# os.makedirs('../../data/TroveHarvester', exist_ok=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_latest_harvest():\n", " '''\n", " Get the timestamp of the most recent harvest.\n", " '''\n", " harvests = sorted([d for d in os.listdir('data') if os.path.isdir(os.path.join('data', d))])\n", " return harvests[-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def open_harvest_data(timestamp=None):\n", " '''\n", " Open the results of the specified harvest (most recent by default).\n", " \n", " Returns a DataFrame.\n", " '''\n", " if not timestamp:\n", " timestamp = get_latest_harvest()\n", " print(timestamp)\n", " df = pd.read_csv(os.path.join('data', timestamp, 'results.csv'), parse_dates=['date'])\n", " return df " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = open_harvest_data()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Show the most common newspapers" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "alt.Chart(df).mark_bar().encode(\n", " x=alt.X('count:Q', title='Number of articles'),\n", " y=alt.Y('newspaper_title:N', title='Newspaper', sort=alt.EncodingSortField(field='count', order='descending', op='sum')),\n", " tooltip=[alt.Tooltip('newspaper_title:N', title='Newspaper'), alt.Tooltip('count:Q', title='Articles')]\n", ").transform_aggregate(\n", " count='count()',\n", " 
groupby=['newspaper_title']\n", ").transform_window(\n", " window=[{'op': 'rank', 'as': 'rank'}],\n", " sort=[{'field': 'count', 'order': 'descending'}]\n", ").transform_filter('datum.rank <= 25')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Show when the articles were published" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "alt.Chart(df).mark_line().encode(\n", " x='year(date):T',\n", " y='count()',\n", " tooltip=[alt.Tooltip('year(date):T', title='Year'), alt.Tooltip('count()', title='Articles')]\n", ").properties(width=600)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Find the longest article" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Which is the longest article(s)?\n", "df[df['words'] == df['words'].max()]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.loc[df['title'].str.contains('protest', case=False, na=False)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Make a simple word cloud" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_titles = df[(df['title'] != 'No Title') & (df['title'] != 'Advertising')]\n", "# Get all the articles titles and turn them into a single string\n", "title_text = df_titles['title'].str.lower().str.cat(sep=' ').replace('advertising', '').replace('no title', '')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "from wordcloud import WordCloud\n", "\n", "# Generate a word cloud image\n", "wordcloud = WordCloud(width=1200, height=800).generate(title_text)\n", "\n", "# Display the generated image:\n", "# the matplotlib way:\n", "import matplotlib.pyplot as plt\n", "plt.figure(figsize=(12,8))\n", "plt.imshow(wordcloud, interpolation='bilinear')\n", "plt.axis(\"off\")" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "## Using TextBlob" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from textblob import TextBlob\n", "from operator import itemgetter\n", "import nltk\n", "nltk.download('stopwords')\n", "nltk.download('punkt')\n", "blob = TextBlob(title_text)\n", "stopwords = nltk.corpus.stopwords.words('english')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "word_counts = [[word, count] for word, count in blob.lower().word_counts.items() if word not in stopwords]\n", "word_counts = sorted(word_counts, key=itemgetter(1), reverse=True)[:25]\n", "pd.DataFrame(word_counts).style.format({1: '{:,}'}).bar(subset=[1], color='#d65f5f').set_properties(subset=[1], **{'width': '300px'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Analyse text files\n", "\n", "So far we've only looked at the metadata, but we can also [explore the content of the individual text files](Explore-harvested-text-files.ipynb)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "Created by [Tim Sherratt](https://timsherratt.org) ([@wragge](https://twitter.com/wragge)) as part of the [OzGLAM workbench](https://github.com/wragge/ozglam-workbench).\n", "\n", "If you think this project is worthwhile you can [support it on Patreon](https://www.patreon.com/timsherratt)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }