{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Created by [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [Github](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [RIDE](https://www.eride.ri.gov/eride2K5/AggregateAttendance/AttendanceReports.aspx).\n",
    "<hr>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Collecting and Cleaning School Attendance Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using Selenium to automatically collect data on attendance of schools from RIDE's online tool. The data is then cleaned with pandas to compare attendance pre vs. post COVID-19 remote learning. Previously I used the data to create an [online tool](https://ivizri.com/posts/2018/12/ri-schools-attendance/) allowing you to explore attendance and compare it easily with all other schools in Rhode Island.\n",
    "<hr>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import time\n",
    "from datetime import date\n",
    "from selenium import webdriver\n",
    "\n",
    "def chrome_options(download_dir):\n",
    "    options = webdriver.ChromeOptions() \n",
    "    download_dir = download_dir.replace('/', '\\\\')\n",
    "\n",
    "    prefs = {\n",
    "        'download.prompt_for_download' : False,\n",
    "        'download.default_directory' : f'{download_dir}',\n",
    "    }\n",
    "\n",
    "    #options.add_argument('--headless')\n",
    "    options.add_argument('--disable-gpu')\n",
    "    options.add_argument('--start-maximized')\n",
    "    options.add_experimental_option('prefs', prefs)\n",
    "\n",
    "    return options\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    \n",
    "    with open('config.json', 'r') as f:\n",
    "        config = json.load(f)\n",
    "        save_path = config['save_path']\n",
    "        chromedriver = config['chromedriver']\n",
    "        \n",
    "    url = 'https://www.eride.ri.gov/eride2K5/AggregateAttendance/AttendanceReports.aspx'\n",
    "    chrome_options = chrome_options(save_path)\n",
    "    browser = webdriver.Chrome(chromedriver, options=chrome_options)\n",
    "    browser.get(url)\n",
    "\n",
    "    # get current month & month xpath\n",
    "    month_xpath = '//*[@id=\"calDateSelection\"]/tbody/tr[1]/td/table/tbody/tr/td[2]'\n",
    "    current_month = browser.find_element_by_xpath(month_xpath)\n",
    "\n",
    "    # creating the loop for each month getting all schooldays\n",
    "    while current_month.text != 'August 2008':\n",
    "        # button to click to go to previous month\n",
    "        prev_btn = '#calDateSelection > tbody > tr:nth-child(1) > td > table > tbody > tr > td:nth-child(1) > a'\n",
    "        prev_month = browser.find_element_by_css_selector(prev_btn)\n",
    "        \n",
    "        # loop for each file to download\n",
    "        for ele in browser.find_elements_by_xpath(\"//input[contains(@id,'_imgBtnXls')]\"):\n",
    "            # click button to download file\n",
    "            ele.click()   \n",
    "            time.sleep(1)\n",
    "\n",
    "        # after loop move to next month and wait\n",
    "        prev_month.click()\n",
    "        time.sleep(3)\n",
    "        \n",
    "        # check newly selected month\n",
    "        current_month = browser.find_element_by_xpath(month_xpath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import csv\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# creating region dict\n",
    "region_map = dict(csv.reader(open(\"./files/region_map.csv\")))\n",
    "\n",
    "def school_year(date):\n",
    "    if date.month > 7:\n",
    "        return date.year\n",
    "    elif date.month < 7:\n",
    "        return date.year + 1\n",
    "    else:\n",
    "        return 'summer'\n",
    "\n",
    "def clean_data(files):\n",
    "    frames = []\n",
    "    for f in files:\n",
    "        df = pd.read_csv(f)\n",
    "        frames.append(df)\n",
    "    df = pd.concat(frames)\n",
    "\n",
    "    # replacing \"NR\" (not reported) with nulls\n",
    "    df = df.replace('NR', np.nan)\n",
    "\n",
    "    # convert types\n",
    "    df['Enrollment'] = df['Enrollment'].astype(float)\n",
    "    df['Absent'] = df['Absent'].astype(float)\n",
    "    df['PercentageAbsent'] = (df['PercentageAbsent'].astype(float))/100\n",
    "    df['AttendanceDate'] = pd.to_datetime(df['AttendanceDate'])\n",
    "\n",
    "    # setting school year dates\n",
    "    df['school_year'] = df['AttendanceDate'].apply(school_year)\n",
    "    df['school_year'] = df['school_year'].astype(str)\n",
    "\n",
    "    # setting region codes\n",
    "    df['region'] = df['LEA_NAME'].map(region_map)\n",
    "\n",
    "    return df\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    \n",
    "    # filepath of all individual date csvs\n",
    "    files = glob.glob('./data/raw/*.csv')\n",
    "    data = clean_data(files)\n",
    "    data.to_csv('./data/clean/ri_attendance_clean.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}