# Exploring your TroveHarvester data

<div class="alert alert-block alert-warning">
    <b>Under construction</b>
</div>

In [None]:
import os
import pandas as pd # makes manipulating the data easier
# import plotly.offline as py # for charts
# import plotly.graph_objs as go
import altair as alt
import wordcloud

# py.init_notebook_mode() # initialise plotly
# Select Altair's 'notebook' renderer for displaying the charts below
alt.renderers.enable('notebook')

# Make sure data directory exists
# os.makedirs('../../data/TroveHarvester', exist_ok=True)

In [None]:
def get_latest_harvest(data_dir='data'):
    '''
    Get the timestamp of the most recent harvest.

    Each sub-directory of `data_dir` is treated as one harvest, and the one
    whose name sorts last is returned. This assumes harvest directories are
    named with sortable timestamps -- TODO confirm against the harvester.

    Parameters:
        data_dir: directory holding one sub-directory per harvest. Defaults
            to 'data', relative to the current working directory.

    Returns:
        The name of the harvest directory that sorts last.

    Raises:
        IndexError: if `data_dir` contains no sub-directories.
        OSError: if `data_dir` cannot be listed (raised by os.listdir).
    '''
    harvests = [
        name for name in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, name))
    ]
    if not harvests:
        # Same exception type that indexing an empty list would raise, but
        # with a message that says what is actually missing.
        raise IndexError('No harvest directories found in {!r}'.format(data_dir))
    return max(harvests)

In [None]:
def open_harvest_data(timestamp=None):
    '''
    Load the results.csv of a harvest into a DataFrame.

    When no timestamp is supplied, the one returned by get_latest_harvest()
    is used. The timestamp being opened is printed, and the 'date' column
    is parsed as dates.

    Returns a DataFrame.
    '''
    harvest = timestamp if timestamp else get_latest_harvest()
    print(harvest)
    results_path = os.path.join('data', harvest, 'results.csv')
    return pd.read_csv(results_path, parse_dates=['date'])

In [None]:
# timestamp=None makes open_harvest_data() fall back to the most recent harvest
df = open_harvest_data(timestamp=None)

## Show the most common newspapers

In [None]:
# Count articles per newspaper, rank the newspapers by that count, keep
# those ranked 25 or better, and draw the counts as horizontal bars.
(
    alt.Chart(df)
    .transform_aggregate(
        count='count()',
        groupby=['newspaper_title']
    )
    .transform_window(
        window=[{'op': 'rank', 'as': 'rank'}],
        sort=[{'field': 'count', 'order': 'descending'}]
    )
    .transform_filter('datum.rank <= 25')
    .mark_bar()
    .encode(
        x=alt.X('count:Q', title='Number of articles'),
        y=alt.Y(
            'newspaper_title:N',
            title='Newspaper',
            sort=alt.EncodingSortField(field='count', order='descending', op='sum')
        ),
        tooltip=[
            alt.Tooltip('newspaper_title:N', title='Newspaper'),
            alt.Tooltip('count:Q', title='Articles')
        ]
    )
)

## Show when the articles were published

In [None]:
# Line chart of the number of articles for each year of the 'date' column
(
    alt.Chart(df)
    .properties(width=600)
    .mark_line()
    .encode(
        x=alt.X('year(date):T'),
        y=alt.Y('count()'),
        tooltip=[
            alt.Tooltip('year(date):T', title='Year'),
            alt.Tooltip('count()', title='Articles')
        ]
    )
)

## Find the longest article

In [None]:
# Select the row(s) whose word count equals the largest word count
df.loc[df['words'].eq(df['words'].max())]

In [None]:
# Rows whose title contains 'protest', ignoring case; rows with a missing
# title are treated as non-matching (na=False)
df[df['title'].str.contains('protest', case=False, na=False)]

## Make a simple word cloud

In [None]:
# Leave out rows whose title is exactly one of the placeholder values
df_titles = df[~df['title'].isin(['No Title', 'Advertising'])]
# Lower-case the remaining titles, join them into a single space-separated
# string, then remove any leftover occurrences of the placeholder phrases
title_text = (
    df_titles['title']
    .str.lower()
    .str.cat(sep=' ')
    .replace('advertising', '')
    .replace('no title', '')
)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Generate a word cloud image from the combined title text.
# The result is called `title_cloud` rather than `wordcloud` so that it does
# not rebind the `wordcloud` module imported at the top of the notebook.
title_cloud = WordCloud(width=1200, height=800).generate(title_text)

# Display the generated image the matplotlib way, with the axes hidden
plt.figure(figsize=(12, 8))
plt.imshow(title_cloud, interpolation='bilinear')
plt.axis("off")

## Using TextBlob

In [None]:
from operator import itemgetter

import nltk
from textblob import TextBlob

# Fetch the NLTK 'stopwords' and 'punkt' resources
nltk.download('stopwords')
nltk.download('punkt')

# English stopword list from the NLTK corpus
stopwords = nltk.corpus.stopwords.words('english')
# Wrap the combined title text in a TextBlob
blob = TextBlob(title_text)

In [None]:
# Count each word in the titles, leaving out stopwords. The stopword list is
# turned into a set once, so each membership test is a hash lookup instead
# of a scan through the whole list.
stopword_set = set(stopwords)
word_counts = [
    [word, count]
    for word, count in blob.lower().word_counts.items()
    if word not in stopword_set
]
# Keep the 25 words with the highest counts
word_counts = sorted(word_counts, key=itemgetter(1), reverse=True)[:25]
# Show the result as a styled table: thousands separators on the counts and
# an inline bar in the count column (column 1)
(
    pd.DataFrame(word_counts)
    .style.format({1: '{:,}'})
    .bar(subset=[1], color='#d65f5f')
    .set_properties(subset=[1], **{'width': '300px'})
)

## Analyse text files

So far we've only looked at the metadata, but we can also [explore the content of the individual text files](Explore-harvested-text-files.ipynb).

----

Created by [Tim Sherratt](https://timsherratt.org) ([@wragge](https://twitter.com/wragge)) as part of the [OzGLAM workbench](https://github.com/wragge/ozglam-workbench).

If you think this project is worthwhile you can [support it on Patreon](https://www.patreon.com/timsherratt).