{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Created by [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [Github](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [RIDE](http://ride.ri.gov/InformationAccountability/RIEducationData/UniformChartofAccounts.aspx#18211075-annual-per-pupil-expenditure-reports).\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cleaning UCOA Expenditure Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using Xlwings to pull info from locked excel sheets" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import xlwings as xw\n", "import numpy as np\n", "import os\n", "import glob" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "path = './data/raw/excluding-capital-and-debt-service'" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": true }, "outputs": [], "source": [ "year = '2016'\n", "\n", "wb = xw.Book(path + '/' + year + '.xlsx')\n", "\n", "sht = wb.sheets['FY ' + str(year[2:]) + ' Data']" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": true }, "outputs": [], "source": [ "location_summary = sht.range('B14:BN40').options(pd.DataFrame).value\n", "function_summary = sht.range('B49:BN54').options(pd.DataFrame).value\n", "function_intermediate = sht.range('B64:BN80').options(pd.DataFrame).value\n", "function_detailed = sht.range('B89:BN124').options(pd.DataFrame).value\n", "program_summary = sht.range('B132:BN142').options(pd.DataFrame).value\n", "subject_summary = sht.range('B151:BN182').options(pd.DataFrame).value\n", "object_summary = sht.range('B192:BN201').options(pd.DataFrame).value\n", "object_intermediate = sht.range('B211:BN260').options(pd.DataFrame).value\n", "object_detailed = sht.range('B270:BN566').options(pd.DataFrame).value\n", "job_summary = sht.range('B575:BN603').options(pd.DataFrame).value\n", "\n", "frames = [location_summary, function_summary, function_intermediate, function_detailed, program_summary,\n", " subject_summary, object_summary, object_intermediate, object_detailed, job_summary]\n", "\n", "str_frames = ['location_summary', 'function_summary', 'function_intermediate', 'function_detailed', 'program_summary',\n", " 'subject_summary', 'object_summary', 'object_intermediate', 'object_detailed', 'job_summary']" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": true }, "outputs": [], "source": [ "i = 0\n", "\n", "for df in frames:\n", " df['class'] = str_frames[i]\n", " df['year'] = str(year)\n", " i += 1\n", " \n", "df = pd.concat(frames)\n", "\n", "df.shape\n", "\n", "df = df.drop(['Row Match?', 'Row Labels'], axis=1)\n", "df = df.reset_index()\n", "\n", "x = ['Row Labels', 'class', 'year']\n", "\n", "df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n", "\n", "df = df.fillna(0)\n", "\n", "df['type'] = df['Row Labels'].str.replace('\\d+', '')\n", "\n", "df['type_id'] = df['Row Labels'].str.replace('[^0-9]','')\n", "\n", "df = df.drop('Row Labels', axis=1)\n", "\n", "df = df.rename(columns={'variable': 'District', 'value': 'Expenditures'})\n", "\n", "df.to_csv('./data/clean/UCOA/UCOA_' + str(year) + '_cleaned.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# 2016 School Attributes\n", "\n", "sht = wb.sheets['Master Table']\n", "\n", "df = sht.range('C2:X64').options(pd.DataFrame).value" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# All 0 values are listed as 0.00001\n", "\n", "df[df == 0.00001] = np.nan\n", "\n", "df = df.fillna(0)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": true }, "outputs": [], "source": [ "x = ['Ref Key', 'Charter', 'State', 'Suburban', 'Urban', 'Urban Ring', 'All Table Loc']" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = df.drop(x, axis=1)\n", "df = df.drop('Other', axis=1)\n", "df = df.reset_index()\n", "\n", "df['year'] = year" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df.to_csv('./data/clean/Attributes/Attributes_' + str(year) + '_cleaned.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2015 Data" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": true }, "outputs": [], "source": [ "year = '2015'\n", "\n", "wb = xw.Book(path + '/' + year + '.xlsx')\n", "\n", "sht = wb.sheets['FY ' + str(year[2:]) + ' Data']" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": true }, "outputs": [], "source": [ "location_summary = sht.range('B14:BM39').options(pd.DataFrame).value\n", "function_summary = sht.range('B48:BM53').options(pd.DataFrame).value\n", "function_intermediate = sht.range('B63:BM79').options(pd.DataFrame).value\n", "function_detailed = sht.range('B88:BM123').options(pd.DataFrame).value\n", "program_summary = sht.range('B131:BM141').options(pd.DataFrame).value\n", "subject_summary = sht.range('B150:BM181').options(pd.DataFrame).value\n", "object_summary = sht.range('B191:BM200').options(pd.DataFrame).value\n", "object_intermediate = sht.range('B210:BM258').options(pd.DataFrame).value\n", "object_detailed = sht.range('B268:BM561').options(pd.DataFrame).value\n", "job_summary = sht.range('B570:BM598').options(pd.DataFrame).value\n", "\n", "frames = [location_summary, function_summary, function_intermediate, function_detailed, program_summary,\n", " subject_summary, object_summary, object_intermediate, object_detailed, job_summary]\n", "\n", "str_frames = ['location_summary', 'function_summary', 'function_intermediate', 'function_detailed', 'program_summary',\n", " 'subject_summary', 'object_summary', 'object_intermediate', 'object_detailed', 'job_summary']" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": true }, "outputs": [], "source": [ "i = 0\n", "\n", "for df in frames:\n", " df['class'] = str_frames[i]\n", " df['year'] = str(year)\n", " i += 1\n", " \n", "df = pd.concat(frames)\n", "\n", "df.shape\n", "\n", "df = df.drop(['Row Match?', 'Row Labels'], axis=1)\n", "df = df.reset_index()\n", "\n", "x = ['Row Labels', 'class', 'year']\n", "\n", "df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n", "\n", "df = df.fillna(0)\n", "\n", "df['type'] = df['Row Labels'].str.replace('\\d+', '')\n", "\n", "df['type_id'] = df['Row Labels'].str.replace('[^0-9]','')\n", "\n", "df = df.drop('Row Labels', axis=1)\n", "\n", "df = df.rename(columns={'variable': 'District', 'value': 'Expenditures'})\n", "\n", "df.to_csv('./data/clean/UCOA/UCOA_' + str(year) + '_cleaned.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# 2015 School Attributes\n", "\n", "sht = wb.sheets['Master Table']\n", "\n", "df = sht.range('C2:X63').options(pd.DataFrame).value" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# All 0 values are listed as 0.00001\n", "\n", "df[df == 0.0001] = np.nan\n", "\n", "df = df.fillna(0)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": true }, "outputs": [], "source": [ "x = ['Ref Key', 'Charter', 'State', 'Suburban', 'Urban', 'Urban Ring', 'All Table Loc']" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = df.drop(x, axis=1)\n", "df = df.drop('Other', axis=1)\n", "df = df.reset_index()\n", "\n", "df['year'] = year" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df.to_csv('./data/clean/Attributes/Attributes_' + str(year) + '_cleaned.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2014 Data" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": true }, "outputs": [], "source": [ "year = '2014'\n", "\n", "wb = xw.Book(path + '/' + year + '.xlsx')\n", "\n", "sht = wb.sheets['FY ' + str(year[2:]) + ' Data']" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": true }, "outputs": [], "source": [ "location_summary = sht.range('B14:BK38').options(pd.DataFrame).value\n", "function_summary = sht.range('B47:BK52').options(pd.DataFrame).value\n", "function_intermediate = sht.range('B62:BK78').options(pd.DataFrame).value\n", "function_detailed = sht.range('B87:BK122').options(pd.DataFrame).value\n", "program_summary = sht.range('B130:BK140').options(pd.DataFrame).value\n", "subject_summary = sht.range('B149:BK180').options(pd.DataFrame).value\n", "object_summary = sht.range('B190:BK199').options(pd.DataFrame).value\n", "object_intermediate = sht.range('B209:BK253').options(pd.DataFrame).value\n", "object_detailed = sht.range('B263:BK555').options(pd.DataFrame).value\n", "job_summary = sht.range('B564:BK592').options(pd.DataFrame).value\n", "\n", "frames = [location_summary, function_summary, function_intermediate, function_detailed, program_summary,\n", " subject_summary, object_summary, object_intermediate, object_detailed, job_summary]\n", "\n", "str_frames = ['location_summary', 'function_summary', 'function_intermediate', 'function_detailed', 'program_summary',\n", " 'subject_summary', 'object_summary', 'object_intermediate', 'object_detailed', 'job_summary']" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": true }, "outputs": [], "source": [ "i = 0\n", "\n", "for df in frames:\n", " df['class'] = str_frames[i]\n", " df['year'] = str(year)\n", " i += 1\n", " \n", "df = pd.concat(frames)\n", "\n", "df.shape\n", "\n", "df = df.drop(['Row Match?', 'Row Labels'], axis=1)\n", "df = df.reset_index()\n", "\n", "x = ['Row Labels', 'class', 'year']\n", "\n", "df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n", "\n", "df = df.fillna(0)\n", "\n", "df['type'] = df['Row Labels'].str.replace('\\d+', '')\n", "\n", "df['type_id'] = df['Row Labels'].str.replace('[^0-9]','')\n", "\n", "df = df.drop('Row Labels', axis=1)\n", "\n", "df = df.rename(columns={'variable': 'District', 'value': 'Expenditures'})\n", "\n", "df.to_csv('./data/clean/UCOA/UCOA_' + str(year) + '_cleaned.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# 2014 School Attributes\n", "\n", "sht = wb.sheets['Master Table']\n", "\n", "df = sht.range('C2:X61').options(pd.DataFrame).value" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# All 0 values are listed as 0.00001\n", "\n", "df[df == 0.0001] = np.nan\n", "\n", "df = df.fillna(0)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": true }, "outputs": [], "source": [ "x = ['Ref Key', 'Charter', 'State', 'Suburban', 'Urban', 'Urban Ring', 'All Table Loc']" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = df.drop(x, axis=1)\n", "df = df.drop('Other', axis=1)\n", "df = df.reset_index()\n", "\n", "df['year'] = year" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df.to_csv('./data/clean/Attributes/Attributes_' + str(year) + '_cleaned.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2013 Data" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": true }, "outputs": [], "source": [ "year = '2013'\n", "\n", "wb = xw.Book(path + '/' + year + '.xlsx')\n", "\n", "sht = wb.sheets['FY ' + str(year[2:]) + ' Data']" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": true }, "outputs": [], "source": [ "location_summary = sht.range('B14:BG38').options(pd.DataFrame).value\n", "function_summary = sht.range('B47:BG52').options(pd.DataFrame).value\n", "function_intermediate = sht.range('B62:BG78').options(pd.DataFrame).value\n", "function_detailed = sht.range('B87:BG122').options(pd.DataFrame).value\n", "program_summary = sht.range('B130:BG140').options(pd.DataFrame).value\n", "subject_summary = sht.range('B149:BG180').options(pd.DataFrame).value\n", "object_summary = sht.range('B190:BG199').options(pd.DataFrame).value\n", "object_intermediate = sht.range('B209:BG253').options(pd.DataFrame).value\n", "object_detailed = sht.range('B263:BG555').options(pd.DataFrame).value\n", "job_summary = sht.range('B564:BG592').options(pd.DataFrame).value\n", "\n", "frames = [location_summary, function_summary, function_intermediate, function_detailed, program_summary,\n", " subject_summary, object_summary, object_intermediate, object_detailed, job_summary]\n", "\n", "str_frames = ['location_summary', 'function_summary', 'function_intermediate', 'function_detailed', 'program_summary',\n", " 'subject_summary', 'object_summary', 'object_intermediate', 'object_detailed', 'job_summary']" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": true }, "outputs": [], "source": [ "i = 0\n", "\n", "for df in frames:\n", " df['class'] = str_frames[i]\n", " df['year'] = str(year)\n", " i += 1\n", " \n", "df = pd.concat(frames)\n", "\n", "df.shape\n", "\n", "df = df.drop(['Row Match?', 'Row Labels'], axis=1)\n", "df = df.reset_index()\n", "\n", "x = ['Row Labels', 'class', 'year']\n", "\n", "df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n", "\n", "df = df.fillna(0)\n", "\n", "df['type'] = df['Row Labels'].str.replace('\\d+', '')\n", "\n", "df['type_id'] = df['Row Labels'].str.replace('[^0-9]','')\n", "\n", "df = df.drop('Row Labels', axis=1)\n", "\n", "df = df.rename(columns={'variable': 'District', 'value': 'Expenditures'})\n", "\n", "df.to_csv('./data/clean/UCOA/UCOA_' + str(year) + '_cleaned.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# 2013 School Attributes\n", "\n", "sht = wb.sheets['Master Table']\n", "\n", "df = sht.range('C2:X57').options(pd.DataFrame).value" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# All 0 values are listed as 0.00001\n", "\n", "df[df == 0.00001] = np.nan\n", "\n", "df = df.fillna(0)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": true }, "outputs": [], "source": [ "x = ['Ref Key', 'Charter', 'State', 'Suburban', 'Urban', 'Urban Ring', 'All Table Loc']" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = df.drop(x, axis=1)\n", "df = df.drop('Diff', axis=1)\n", "df = df.reset_index()\n", "\n", "df['year'] = year" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df.to_csv('./data/clean/Attributes/Attributes_' + str(year) + '_cleaned.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Combining all data" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Set File Path\n", "path = './data/clean/UCOA/'\n", "allFiles = glob.glob(path + \"/*.csv\")\n", "\n", "# Load Data into Dataframe\n", "frame = pd.DataFrame()\n", "list_ = []\n", "for file_ in allFiles:\n", " df = pd.read_csv(file_,index_col=None, header=0)\n", " list_.append(df)\n", "frame = pd.concat(list_)\n", "\n", "df = frame\n", "\n", "df.to_csv('./data/clean/UCOA/UCOA_All.csv')" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Set File Path\n", "path = './data/clean/Attributes/'\n", "allFiles = glob.glob(path + \"/*.csv\")\n", "\n", "# Load Data into Dataframe\n", "frame = pd.DataFrame()\n", "list_ = []\n", "for file_ in allFiles:\n", " df = pd.read_csv(file_,index_col=None, header=0)\n", " list_.append(df)\n", "frame = pd.concat(list_)\n", "\n", "df = frame\n", "\n", "df.to_csv('./data/clean/Attributes/Attributes_All.csv')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }