{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Created by: [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [GitHub](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [ri.gov](http://www.governor.ri.gov/newsroom/speeches/index.php)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# State of the State Word Scrape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook contains code used to get a word count of every word used in a state of the state address. All words are converted into lowercase with punctuation and special characters removed. The dataset also marks specific categories of words using a list that can be found [here](https://github.com/SmirkyGraphs/Python-Notebooks/tree/master/ri-state-of-the-state/files/lists)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports and configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from collections import Counter\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "from nltk.corpus import stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# all paths are relative to the folder this notebook is run from\n",
    "INPUT_DIR = Path('../input/')\n",
    "LISTS_DIR = Path('../files/lists/')\n",
    "RAW_DIR = Path('../output/word_count/raw/')\n",
    "CLEAN_DIR = Path('../output/word_count/clean/')\n",
    "\n",
    "# each year after the first is compared with the year before it\n",
    "YEARS = ['2017', '2018', '2019', '2020']\n",
    "\n",
    "# added on top of nltk's english stopwords; punctuation is stripped during\n",
    "# cleaning, which is why the contraction of 'let us' appears as 'lets'\n",
    "EXTRA_STOPWORDS = ['rhode', 'island', 'islander', 'islanders', 'lets']\n",
    "\n",
    "# priority order: a word found in several lists gets the first match\n",
    "CATEGORY_PRIORITY = ['economy', 'education', 'health', 'jobs', 'politics']\n",
    "\n",
    "# create the output folders so a fresh checkout can run top to bottom\n",
    "RAW_DIR.mkdir(parents=True, exist_ok=True)\n",
    "CLEAN_DIR.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Count words in each speech"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get count of words from every file\n",
    "for file in sorted(INPUT_DIR.rglob('*.txt')):\n",
    "\n",
    "    try:\n",
    "        # errors='replace' keeps stray bytes from aborting the read; the\n",
    "        # replacement characters are removed by the regex below anyway\n",
    "        text = file.read_text(encoding='utf-8', errors='replace')\n",
    "    except OSError as err:\n",
    "        # skip the file rather than carrying on with the previous file's text\n",
    "        print(f'skipping {file}: {err}')\n",
    "        continue\n",
    "\n",
    "    # lowercase, remove leading/trailing whitespace and drop blank lines\n",
    "    lines = [line.strip().lower() for line in text.splitlines()]\n",
    "    text = ' '.join(line for line in lines if line != '')\n",
    "\n",
    "    # remove all non alphabet characters\n",
    "    # note: characters are deleted, not replaced by a space, so hyphenated\n",
    "    # words are merged (e.g. 'well-being' becomes 'wellbeing')\n",
    "    text = re.sub(r'[^A-Za-z ]+', '', text)\n",
    "\n",
    "    # the first four characters of the file name are the year\n",
    "    year = file.name[:4]\n",
    "\n",
    "    # get count of each word (words stay in order of first appearance)\n",
    "    word_counts = Counter(text.split())\n",
    "\n",
    "    # passing columns= keeps this working for a speech with no words\n",
    "    counts = pd.DataFrame(list(word_counts.items()), columns=['words', 'count'])\n",
    "\n",
    "    # save files\n",
    "    counts.to_csv(RAW_DIR / f'{year}_word_count.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Combine the yearly counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# merge counts from each year\n",
    "combined = []\n",
    "for file in sorted(RAW_DIR.rglob('*.csv')):\n",
    "    # keep_default_na=False: otherwise words such as 'null' or 'nan' are\n",
    "    # read back as missing values instead of text\n",
    "    yearly = pd.read_csv(file, keep_default_na=False)\n",
    "    yearly['year'] = file.name[:4]\n",
    "    combined.append(yearly)\n",
    "\n",
    "# one row per word, one column per year; a word missing in a year counts as 0\n",
    "counts_by_year = (\n",
    "    pd.concat(combined)\n",
    "    .pivot(index='words', columns='year', values='count')\n",
    "    .fillna(0)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# year differences (start from a copy so this cell can be re-run safely)\n",
    "word_table = counts_by_year.copy()\n",
    "for prev_year, year in zip(YEARS, YEARS[1:]):\n",
    "    word_table[f'{prev_year}-{year}_dif'] = word_table[year] - word_table[prev_year]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Flag stopwords and categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate stopwords list (needs the nltk 'stopwords' corpus to be downloaded)\n",
    "stop = set(stopwords.words('english')) | set(EXTRA_STOPWORDS)\n",
    "\n",
    "# adding stopwords\n",
    "word_table['stopword'] = [\n",
    "    'Stopword' if word in stop else 'Not Stopword'\n",
    "    for word in word_table.index\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# adding categories: one list file per category, one word per line,\n",
    "# the file name without '.txt' is the category name\n",
    "categories = {}\n",
    "for list_file in sorted(LISTS_DIR.rglob('*.txt')):\n",
    "    with open(list_file) as f:\n",
    "        categories[list_file.name[:-4]] = [line.rstrip() for line in f]\n",
    "\n",
    "# word -> category; setdefault keeps the first (highest priority) match\n",
    "word_category = {}\n",
    "for category in CATEGORY_PRIORITY:\n",
    "    for word in categories[category]:\n",
    "        word_category.setdefault(word, category)\n",
    "\n",
    "word_table['category'] = [\n",
    "    word_category.get(word, 'no category') for word in word_table.index\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the combined file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reset index and remove 'year'\n",
    "output = word_table.reset_index()\n",
    "output.columns.name = ''\n",
    "\n",
    "# saving combined cleaned file\n",
    "output.to_csv(CLEAN_DIR / 'word_counts_combined.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}