{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploring your TroveHarvester data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "<div class=\"alert alert-block alert-warning\">\n", " Under construction\n", "</div>" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd # makes manipulating the data easier\n", "# import plotly.offline as py # for charts\n", "# import plotly.graph_objs as go\n", "import altair as alt\n", "import wordcloud\n", "\n", "# py.init_notebook_mode() # initialise plotly\n", "alt.renderers.enable('notebook')\n", "\n", "# Make sure data directory exists\n", "# os.makedirs('../../data/TroveHarvester', exist_ok=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_latest_harvest():\n", " '''\n", " Get the timestamp of the most recent harvest.\n", " '''\n", " harvests = sorted([d for d in os.listdir('data') if os.path.isdir(os.path.join('data', d))])\n", " return harvests[-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def open_harvest_data(timestamp=None):\n", " '''\n", " Open the results of the specified harvest (most recent by default).\n", " \n", " Returns a DataFrame.\n", " '''\n", " if not timestamp:\n", " timestamp = get_latest_harvest()\n", " print(timestamp)\n", " df = pd.read_csv(os.path.join('data', timestamp, 'results.csv'), parse_dates=['date'])\n", " return df " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = open_harvest_data()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Show the most common newspapers" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "alt.Chart(df).mark_bar().encode(\n", " x=alt.X('count:Q', title='Number of articles'),\n", " y=alt.Y('newspaper_title:N', title='Newspaper', sort=alt.EncodingSortField(field='count', order='descending', op='sum')),\n", " tooltip=[alt.Tooltip('newspaper_title:N', title='Newspaper'), alt.Tooltip('count:Q', title='Articles')]\n", ").transform_aggregate(\n", " count='count()',\n", " 
groupby=['newspaper_title']\n", ").transform_window(\n", " window=[{'op': 'rank', 'as': 'rank'}],\n", " sort=[{'field': 'count', 'order': 'descending'}]\n", ").transform_filter('datum.rank <= 25')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Show when the articles were published" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "alt.Chart(df).mark_line().encode(\n", " x='year(date):T',\n", " y='count()',\n", " tooltip=[alt.Tooltip('year(date):T', title='Year'), alt.Tooltip('count()', title='Articles')]\n", ").properties(width=600)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Find the longest article" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Which is the longest article(s)?\n", "df[df['words'] == df['words'].max()]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.loc[df['title'].str.contains('protest', case=False, na=False)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Make a simple word cloud" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_titles = df[(df['title'] != 'No Title') & (df['title'] != 'Advertising')]\n", "# Get all the articles titles and turn them into a single string\n", "title_text = df_titles['title'].str.lower().str.cat(sep=' ').replace('advertising', '').replace('no title', '')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "from wordcloud import WordCloud\n", "\n", "# Generate a word cloud image\n", "wordcloud = WordCloud(width=1200, height=800).generate(title_text)\n", "\n", "# Display the generated image:\n", "# the matplotlib way:\n", "import matplotlib.pyplot as plt\n", "plt.figure(figsize=(12,8))\n", "plt.imshow(wordcloud, interpolation='bilinear')\n", "plt.axis(\"off\")" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "## Using TextBlob" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from textblob import TextBlob\n", "from operator import itemgetter\n", "import nltk\n", "nltk.download('stopwords')\n", "nltk.download('punkt')\n", "blob = TextBlob(title_text)\n", "stopwords = nltk.corpus.stopwords.words('english')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "word_counts = [[word, count] for word, count in blob.lower().word_counts.items() if word not in stopwords]\n", "word_counts = sorted(word_counts, key=itemgetter(1), reverse=True)[:25]\n", "pd.DataFrame(word_counts).style.format({1: '{:,}'}).bar(subset=[1], color='#d65f5f').set_properties(subset=[1], **{'width': '300px'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Analyse text files\n", "\n", "So far we've only looked at the metadata, but we can also [explore the content of the individual text files](Explore-harvested-text-files.ipynb)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "Created by [Tim Sherratt](https://timsherratt.org) ([@wragge](https://twitter.com/wragge)) as part of the [OzGLAM workbench](https://github.com/wragge/ozglam-workbench).\n", "\n", "If you think this project is worthwhile you can [support it on Patreon](https://www.patreon.com/timsherratt)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }