{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Created by: [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [GitHub](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [ri.gov](http://www.governor.ri.gov/newsroom/speeches/index.php).\n",
    "<hr>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# State of the State Word Scrape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook contains the code used to count how often each word is used in every State of the State address. All words are converted to lowercase, and punctuation and special characters are removed. The dataset also marks specific categories of words using lists that can be found [here](https://github.com/SmirkyGraphs/Python-Notebooks/tree/master/ri-state-of-the-state/files/lists)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from nltk.corpus import stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count every word in each speech and write one raw CSV per year.\n",
    "#\n",
    "# Input : ../input/**/*.txt - the first four characters of each file\n",
    "#         name are used as the year of the address.\n",
    "# Output: ../output/word_count/raw/{year}_word_count.csv with the\n",
    "#         columns 'words' and 'count'.\n",
    "raw_dir = Path('../output/word_count/raw/')\n",
    "raw_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# sorted so the files are processed in the same order on every run\n",
    "files = sorted(Path('../input/').rglob('*.txt'))\n",
    "for file in files:\n",
    "\n",
    "    # Skip a file that cannot be read. Previously the error was printed and\n",
    "    # the loop carried on with `text` undefined (first file) or still holding\n",
    "    # the previous speech, which was then saved under the wrong year.\n",
    "    # NOTE(review): open() uses the platform default encoding; pass an\n",
    "    # explicit encoding= once the encoding of the input files is confirmed.\n",
    "    try:\n",
    "        with open(file) as f:\n",
    "            text = f.read()\n",
    "    except (OSError, UnicodeDecodeError) as err:\n",
    "        print(f'skipping {file}: {err}')\n",
    "        continue\n",
    "\n",
    "    # lowercase, remove leading/trailing whitespace and drop empty lines\n",
    "    lines = [x.strip().lower() for x in text.splitlines()]\n",
    "    text = ' '.join(x for x in lines if x != '')\n",
    "\n",
    "    # remove all non alphabet characters; this also merges contractions\n",
    "    # and hyphenated words into one token (let's -> lets)\n",
    "    text = re.sub(r'[^A-Za-z ]+', '', text)\n",
    "\n",
    "    # add year\n",
    "    year = file.name[:4]\n",
    "\n",
    "    # get list of all words\n",
    "    words = text.split()\n",
    "\n",
    "    # get count of each word (dict keeps first-seen order)\n",
    "    wds = {}\n",
    "    for w in words:\n",
    "        wds[w] = wds.get(w, 0) + 1\n",
    "\n",
    "    # Naming the columns in the constructor also works for a file with no\n",
    "    # words; assigning df.columns afterwards raised a length mismatch there.\n",
    "    df = pd.DataFrame(list(wds.items()), columns=['words', 'count'])\n",
    "\n",
    "    # save files\n",
    "    df.to_csv(raw_dir / f'{year}_word_count.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the per-year raw counts into one table: one row per word, one\n",
    "# column per year, and 0 where a word was not used in that year.\n",
    "files = sorted(Path('../output/word_count/raw/').rglob('*.csv'))\n",
    "\n",
    "combined = []\n",
    "for file in files:\n",
    "    # keep_default_na=False: with the defaults read_csv turns the words\n",
    "    # 'nan' and 'null' into missing values instead of keeping them as text\n",
    "    df = pd.read_csv(file, keep_default_na=False)\n",
    "    # the year is the first four characters of the file name\n",
    "    df['year'] = file.name[:4]\n",
    "    combined.append(df)\n",
    "\n",
    "df = pd.concat(combined)\n",
    "df = df.pivot(index='words', columns='year', values='count')\n",
    "# pivot leaves NaN for words missing from a year, which makes every column\n",
    "# float; counts are whole numbers, so restore integers after filling\n",
    "df = df.fillna(0).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# year differences\n",
    "# One '{prev}-{curr}_dif' column per pair of neighbouring year columns,\n",
    "# e.g. df['2017-2018_dif'] = df['2018'] - df['2017'].\n",
    "# Year columns are the all-digit column names, so the cell does not need\n",
    "# editing when a speech is added and ignores the *_dif columns on a re-run.\n",
    "years = sorted(c for c in df.columns if str(c).isdigit())\n",
    "for prev, curr in zip(years, years[1:]):\n",
    "    df[f'{prev}-{curr}_dif'] = df[curr] - df[prev]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate stopwords list\n",
    "stop = stopwords.words('english')\n",
    "stop.append('rhode')\n",
    "stop.append('island')\n",
    "stop.append('islander')\n",
    "stop.append('islanders')\n",
    "\n",
    "stop.append('lets')\n",
    "\n",
    "# The speeches had every non-letter character removed, so contractions show\n",
    "# up without their apostrophe (dont, youre, ...). Run the stopwords through\n",
    "# the same regex so those forms are flagged as well; the original spellings\n",
    "# are kept too. A set makes each membership test constant time.\n",
    "stop_set = set(stop) | {re.sub(r'[^A-Za-z ]+', '', w) for w in stop}\n",
    "\n",
    "# adding stopwords: label every word (the index of df) in one pass\n",
    "df['stopword'] = ['Stopword' if w in stop_set else 'Not Stopword' for w in df.index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# adding categories\n",
    "# Every ../files/lists/<name>.txt is read as one word per line; the file\n",
    "# name without its '.txt' suffix is the category name.\n",
    "categories = {}\n",
    "files = list(Path('../files/lists/').rglob('*.txt'))\n",
    "\n",
    "for f in files:\n",
    "    cat = f.name[:-4]\n",
    "    # `with` closes the file; open() inside the comprehension never did\n",
    "    with open(f) as fh:\n",
    "        categories[cat] = [line.rstrip() for line in fh]\n",
    "\n",
    "# A word that appears in several lists gets the first category in this order.\n",
    "priority = ['economy', 'education', 'health', 'jobs', 'politics']\n",
    "\n",
    "# Fill the lookup from the lowest to the highest priority so that earlier\n",
    "# categories overwrite later ones. A missing list file raises KeyError here.\n",
    "lookup = {}\n",
    "for cat in reversed(priority):\n",
    "    for word in categories[cat]:\n",
    "        lookup[word] = cat\n",
    "\n",
    "# label every word (the index of df); unlisted words get 'no category'\n",
    "df['category'] = [lookup.get(w, 'no category') for w in df.index]\n",
    "\n",
    "# reset index and remove 'year'\n",
    "# NOTE: this rebinds df with 'words' as a normal column, so the cell is not\n",
    "# idempotent - re-run from the merge cell instead of re-running it alone.\n",
    "df = df.reset_index()\n",
    "df.columns.name = ''\n",
    "\n",
    "# saving combined cleaned file\n",
    "clean_dir = Path('../output/word_count/clean/')\n",
    "clean_dir.mkdir(parents=True, exist_ok=True)\n",
    "df.to_csv(clean_dir / 'word_counts_combined.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}