{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Created by [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [Github](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [RIDE](https://www.eride.ri.gov/eride2K5/AggregateAttendance/AttendanceReports.aspx).\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Collecting and Cleaning School Attendance Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using Selenium to automatically collect data on attendance of schools from RIDE's online tool. The data is then cleaned with pandas to compare attendance pre vs. post COVID-19 remote learning. Previously I used the data to create an [online tool](https://ivizri.com/posts/2018/12/ri-schools-attendance/) allowing you to explore attendance and compare it easily with all other schools in Rhode Island.\n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "scrolled": false }, "outputs": [], "source": [ "import json\n", "import time\n", "from datetime import date\n", "from selenium import webdriver\n", "\n", "def chrome_options(download_dir):\n", " options = webdriver.ChromeOptions() \n", " download_dir = download_dir.replace('/', '\\\\')\n", "\n", " prefs = {\n", " 'download.prompt_for_download' : False,\n", " 'download.default_directory' : f'{download_dir}',\n", " }\n", "\n", " #options.add_argument('--headless')\n", " options.add_argument('--disable-gpu')\n", " options.add_argument('--start-maximized')\n", " options.add_experimental_option('prefs', prefs)\n", "\n", " return options\n", "\n", "if __name__ == \"__main__\":\n", " \n", " with open('config.json', 'r') as f:\n", " config = json.load(f)\n", " save_path = config['save_path']\n", " chromedriver = config['chromedriver']\n", " \n", " url = 'https://www.eride.ri.gov/eride2K5/AggregateAttendance/AttendanceReports.aspx'\n", " chrome_options = chrome_options(save_path)\n", " browser = webdriver.Chrome(chromedriver, options=chrome_options)\n", " browser.get(url)\n", "\n", " # get current month & month xpath\n", " month_xpath = '//*[@id=\"calDateSelection\"]/tbody/tr[1]/td/table/tbody/tr/td[2]'\n", " current_month = browser.find_element_by_xpath(month_xpath)\n", "\n", " # creating the loop for each month getting all schooldays\n", " while current_month.text != 'August 2008':\n", " # button to click to go to previous month\n", " prev_btn = '#calDateSelection > tbody > tr:nth-child(1) > td > table > tbody > tr > td:nth-child(1) > a'\n", " prev_month = browser.find_element_by_css_selector(prev_btn)\n", " \n", " # loop for each file to download\n", " for ele in browser.find_elements_by_xpath(\"//input[contains(@id,'_imgBtnXls')]\"):\n", " # click button to download file\n", " ele.click() \n", " time.sleep(1)\n", "\n", " # after loop move to next month and wait\n", " prev_month.click()\n", " time.sleep(3)\n", " \n", " # check newly selected month\n", " current_month = browser.find_element_by_xpath(month_xpath)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import glob\n", "import csv\n", "import pandas as pd\n", "import numpy as np\n", "\n", "# creating region dict\n", "region_map = dict(csv.reader(open(\"./files/region_map.csv\")))\n", "\n", "def school_year(date):\n", " if date.month > 7:\n", " return date.year\n", " elif date.month < 7:\n", " return date.year + 1\n", " else:\n", " return 'summer'\n", "\n", "def clean_data(files):\n", " frames = []\n", " for f in files:\n", " df = pd.read_csv(f)\n", " frames.append(df)\n", " df = pd.concat(frames)\n", "\n", " # replacing \"NR\" (not reported) with nulls\n", " df = df.replace('NR', np.nan)\n", "\n", " # convert types\n", " df['Enrollment'] = df['Enrollment'].astype(float)\n", " df['Absent'] = df['Absent'].astype(float)\n", " df['PercentageAbsent'] = (df['PercentageAbsent'].astype(float))/100\n", " df['AttendanceDate'] = pd.to_datetime(df['AttendanceDate'])\n", "\n", " # setting school year dates\n", " df['school_year'] = df['AttendanceDate'].apply(school_year)\n", " df['school_year'] = df['school_year'].astype(str)\n", "\n", " # setting region codes\n", " df['region'] = df['LEA_NAME'].map(region_map)\n", "\n", " return df\n", "\n", "if __name__ == \"__main__\":\n", " \n", " # filepath of all individual date csvs\n", " files = glob.glob('./data/raw/*.csv')\n", " data = clean_data(files)\n", " data.to_csv('./data/clean/ri_attendance_clean.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }