{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Created by: [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [GitHub](https://github.com/SmirkyGraphs). Source: [RIDE](http://infoworks.ride.ri.gov/).\n",
"
"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning RI Public Schools Testing Scores"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"using pandas to clean and combine multiple different datasets covering PRACC, SAT, NECAP, Graduation, Attendance"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import glob\n",
"import csv\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"necap_dict = csv.reader(open(\"./dicts/necap.csv\"))\n",
"map_necap = dict(necap_dict)\n",
"\n",
"pracc_dict = csv.reader(open(\"./dicts/pracc.csv\"))\n",
"map_pracc = dict(pracc_dict)\n",
"\n",
"middle_dict = csv.reader(open(\"./dicts/middle_schools.csv\"))\n",
"middle_map = dict(middle_dict)\n",
"\n",
"region_dict = csv.reader(open(\"./dicts/region_map.csv\"))\n",
"region_map = dict(region_dict)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#NECAP Cleaning\n",
"\n",
"fp = './raw/NECAP'\n",
"\n",
"allFiles = glob.glob(fp + \"/*.csv\")\n",
"frame = pd.DataFrame()\n",
"data = []\n",
"for csv in allFiles:\n",
" \n",
" df = pd.read_csv(csv, skiprows=3)\n",
" df['filename'] = os.path.basename(csv)\n",
" df['category'] = 'NECAP'\n",
" df = df.iloc[1:]\n",
" \n",
" # Unpivot the columns\n",
" x = ['District', 'School', 'category', 'filename']\n",
" df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n",
" \n",
" df = df.rename(index=str, columns={'variable': 'year'})\n",
" df['year'] = df.year.str[13:-3]\n",
" \n",
" df = df.rename(index=str, columns={'filename': 'subject'})\n",
" df['subject'] = df.subject.str[7:-12]\n",
" \n",
" df['level'] = df.subject.map(map_necap)\n",
" \n",
" df = df.replace('no data', np.nan)\n",
" df = df.replace('too few data', np.nan)\n",
" df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0\n",
" \n",
" df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)\n",
" df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)\n",
" \n",
" data.append(df)\n",
"frame = pd.concat(data)\n",
"\n",
"cols = ['District', 'School', 'level', 'category', 'subject', 'year', 'value', 'rank', 'percentile']\n",
"frame = frame[cols]\n",
"\n",
"necap = frame\n",
"necap.to_csv('./clean/necap_clean.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" District | \n",
" School | \n",
" level | \n",
" category | \n",
" subject | \n",
" year | \n",
" value | \n",
" rank | \n",
" percentile | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Barrington | \n",
" Barrington High School | \n",
" High School | \n",
" NECAP | \n",
" 11th Grade Math | \n",
" 2008 | \n",
" 0.71 | \n",
" 1.0 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 1 | \n",
" Bristol Warren | \n",
" Mt. Hope High School | \n",
" High School | \n",
" NECAP | \n",
" 11th Grade Math | \n",
" 2008 | \n",
" 0.40 | \n",
" 9.0 | \n",
" 0.827586 | \n",
"
\n",
" \n",
" 2 | \n",
" Burrillville | \n",
" Burrillville High School | \n",
" High School | \n",
" NECAP | \n",
" 11th Grade Math | \n",
" 2008 | \n",
" 0.30 | \n",
" 16.0 | \n",
" 0.672414 | \n",
"
\n",
" \n",
" 3 | \n",
" Central Falls | \n",
" Central Falls High School | \n",
" High School | \n",
" NECAP | \n",
" 11th Grade Math | \n",
" 2008 | \n",
" 0.04 | \n",
" 32.0 | \n",
" 0.206897 | \n",
"
\n",
" \n",
" 4 | \n",
" Chariho | \n",
" Chariho Regional High School | \n",
" High School | \n",
" NECAP | \n",
" 11th Grade Math | \n",
" 2008 | \n",
" 0.31 | \n",
" 15.0 | \n",
" 0.724138 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" District School level category \\\n",
"0 Barrington Barrington High School High School NECAP \n",
"1 Bristol Warren Mt. Hope High School High School NECAP \n",
"2 Burrillville Burrillville High School High School NECAP \n",
"3 Central Falls Central Falls High School High School NECAP \n",
"4 Chariho Chariho Regional High School High School NECAP \n",
"\n",
" subject year value rank percentile \n",
"0 11th Grade Math 2008 0.71 1.0 1.000000 \n",
"1 11th Grade Math 2008 0.40 9.0 0.827586 \n",
"2 11th Grade Math 2008 0.30 16.0 0.672414 \n",
"3 11th Grade Math 2008 0.04 32.0 0.206897 \n",
"4 11th Grade Math 2008 0.31 15.0 0.724138 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"necap.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"#PRACC Cleaning\n",
"\n",
"fp = './raw/PRACC'\n",
"\n",
"allFiles = glob.glob(fp + \"/*.csv\")\n",
"frame = pd.DataFrame()\n",
"data = []\n",
"for csv in allFiles:\n",
" \n",
" df = pd.read_csv(csv, skiprows=3)\n",
" df['filename'] = os.path.basename(csv)\n",
" df['category'] = 'PRACC'\n",
" df = df.iloc[1:]\n",
" \n",
" # Unpivot the columns\n",
" x = ['District', 'School', 'category', 'filename']\n",
" df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n",
" \n",
" df = df.rename(index=str, columns={'variable': 'year'})\n",
" df['year'] = df.year.str[29:-3]\n",
" \n",
" df = df.rename(index=str, columns={'filename': 'subject'})\n",
" df['subject'] = df.subject.str[7:-12]\n",
" \n",
" df['level'] = df.subject.map(map_pracc)\n",
" df['level2'] = df.School.map(middle_map) \n",
" df['level3'] = np.where((df['subject'] == 'Geometry Mathematics')\n",
" & (df['level2'] == 'Middle School')\n",
" | (df['subject'] == 'Algebra I Mathematics') \n",
" & (df['level2'] == 'Middle School'),'Middle School', None)\n",
" \n",
" df.level3.fillna(df.level, inplace=True)\n",
" \n",
" df = df.drop(['level'], axis=1)\n",
" df = df.drop(['level2'], axis=1)\n",
" df = df.rename(index=str, columns={'level3': 'level'})\n",
"\n",
" df = df.replace('no data', np.nan)\n",
" df = df.replace('too few data', np.nan)\n",
" df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0\n",
" \n",
" df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)\n",
" df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)\n",
" \n",
" data.append(df)\n",
"frame = pd.concat(data)\n",
"\n",
"cols = ['District', 'School', 'level', 'category', 'subject', 'year', 'value', 'rank', 'percentile']\n",
"frame = frame[cols]\n",
"\n",
"pracc = frame\n",
"pracc.to_csv('./clean/pracc_clean.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" District | \n",
" School | \n",
" level | \n",
" category | \n",
" subject | \n",
" year | \n",
" value | \n",
" rank | \n",
" percentile | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Barrington | \n",
" Barrington High School | \n",
" High School | \n",
" PRACC | \n",
" 10th Grade ELA_Literacy | \n",
" 2014 | \n",
" 0.7510 | \n",
" 2.0 | \n",
" 0.983871 | \n",
"
\n",
" \n",
" 1 | \n",
" Bristol Warren | \n",
" Mt. Hope High School | \n",
" High School | \n",
" PRACC | \n",
" 10th Grade ELA_Literacy | \n",
" 2014 | \n",
" 0.2418 | \n",
" 31.0 | \n",
" 0.516129 | \n",
"
\n",
" \n",
" 2 | \n",
" Burrillville | \n",
" Burrillville High School | \n",
" High School | \n",
" PRACC | \n",
" 10th Grade ELA_Literacy | \n",
" 2014 | \n",
" 0.2636 | \n",
" 29.0 | \n",
" 0.548387 | \n",
"
\n",
" \n",
" 3 | \n",
" Central Falls | \n",
" Central Falls High School | \n",
" High School | \n",
" PRACC | \n",
" 10th Grade ELA_Literacy | \n",
" 2014 | \n",
" NaN | \n",
" 52.0 | \n",
" 0.096774 | \n",
"
\n",
" \n",
" 4 | \n",
" Chariho | \n",
" Chariho Regional High School | \n",
" High School | \n",
" PRACC | \n",
" 10th Grade ELA_Literacy | \n",
" 2014 | \n",
" 0.4220 | \n",
" 15.0 | \n",
" 0.774194 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" District School level category \\\n",
"0 Barrington Barrington High School High School PRACC \n",
"1 Bristol Warren Mt. Hope High School High School PRACC \n",
"2 Burrillville Burrillville High School High School PRACC \n",
"3 Central Falls Central Falls High School High School PRACC \n",
"4 Chariho Chariho Regional High School High School PRACC \n",
"\n",
" subject year value rank percentile \n",
"0 10th Grade ELA_Literacy 2014 0.7510 2.0 0.983871 \n",
"1 10th Grade ELA_Literacy 2014 0.2418 31.0 0.516129 \n",
"2 10th Grade ELA_Literacy 2014 0.2636 29.0 0.548387 \n",
"3 10th Grade ELA_Literacy 2014 NaN 52.0 0.096774 \n",
"4 10th Grade ELA_Literacy 2014 0.4220 15.0 0.774194 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pracc.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"#SAT Cleaning\n",
"\n",
"fp = './raw/SAT'\n",
"\n",
"allFiles = glob.glob(fp + \"/*.csv\")\n",
"frame = pd.DataFrame()\n",
"data = []\n",
"for csv in allFiles:\n",
" \n",
" df = pd.read_csv(csv, skiprows=3)\n",
" df['filename'] = os.path.basename(csv)\n",
" df['category'] = 'SAT'\n",
" df = df.iloc[1:]\n",
" \n",
" df = df.replace('no data',np.nan)\n",
" df = df.replace('too few data',np.nan)\n",
" \n",
" col_list = ['Mathematics Average', 'Reading Average', 'Writing Average']\n",
" df['Total Average'] = df[col_list].astype(float).sum(axis=1)\n",
" \n",
" # Unpivot the columns\n",
" x = ['District', 'School', 'category', 'filename']\n",
" df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n",
" \n",
" df = df.rename(index=str, columns={'variable': 'subject'})\n",
" df['subject'] = df.subject.str[:-7]\n",
" df['subject'] = 'SAT ' + df['subject'].astype(str) + 'Avg'\n",
" \n",
" df['level'] = 'High School'\n",
" \n",
" df = df.rename(index=str, columns={'filename': 'year'})\n",
" df['year'] = df.year.str[:-9]\n",
" \n",
" df['value'] = df['value'].astype('float')\n",
" df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)\n",
" df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)\n",
" \n",
" \n",
" data.append(df)\n",
"frame = pd.concat(data)\n",
"\n",
"cols = ['District', 'School', 'level', 'category', 'subject', 'year', 'value', 'rank', 'percentile']\n",
"frame = frame[cols]\n",
"\n",
"sat = frame\n",
"sat.to_csv('./clean/sat_clean.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" District | \n",
" School | \n",
" level | \n",
" category | \n",
" subject | \n",
" year | \n",
" value | \n",
" rank | \n",
" percentile | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Barrington | \n",
" Barrington High School | \n",
" High School | \n",
" SAT | \n",
" SAT Mathematics Avg | \n",
" 2010 | \n",
" 576.0 | \n",
" 2.0 | \n",
" 0.982456 | \n",
"
\n",
" \n",
" 1 | \n",
" Bristol Warren | \n",
" Mt. Hope High School | \n",
" High School | \n",
" SAT | \n",
" SAT Mathematics Avg | \n",
" 2010 | \n",
" 484.0 | \n",
" 19.0 | \n",
" 0.622807 | \n",
"
\n",
" \n",
" 2 | \n",
" Burrillville | \n",
" Burrillville High School | \n",
" High School | \n",
" SAT | \n",
" SAT Mathematics Avg | \n",
" 2010 | \n",
" 488.0 | \n",
" 18.0 | \n",
" 0.649123 | \n",
"
\n",
" \n",
" 3 | \n",
" Central Falls | \n",
" Central Falls High School | \n",
" High School | \n",
" SAT | \n",
" SAT Mathematics Avg | \n",
" 2010 | \n",
" 374.0 | \n",
" 39.0 | \n",
" 0.263158 | \n",
"
\n",
" \n",
" 4 | \n",
" Chariho | \n",
" Chariho Regional High School | \n",
" High School | \n",
" SAT | \n",
" SAT Mathematics Avg | \n",
" 2010 | \n",
" 520.0 | \n",
" 9.0 | \n",
" 0.842105 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" District School level category \\\n",
"0 Barrington Barrington High School High School SAT \n",
"1 Bristol Warren Mt. Hope High School High School SAT \n",
"2 Burrillville Burrillville High School High School SAT \n",
"3 Central Falls Central Falls High School High School SAT \n",
"4 Chariho Chariho Regional High School High School SAT \n",
"\n",
" subject year value rank percentile \n",
"0 SAT Mathematics Avg 2010 576.0 2.0 0.982456 \n",
"1 SAT Mathematics Avg 2010 484.0 19.0 0.622807 \n",
"2 SAT Mathematics Avg 2010 488.0 18.0 0.649123 \n",
"3 SAT Mathematics Avg 2010 374.0 39.0 0.263158 \n",
"4 SAT Mathematics Avg 2010 520.0 9.0 0.842105 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sat.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Graduation Rate\n",
"\n",
"fp = './raw/Graduation Rate'\n",
"\n",
"allFiles = glob.glob(fp + \"/*.csv\")\n",
"frame = pd.DataFrame()\n",
"data = []\n",
"for csv in allFiles:\n",
" \n",
" df = pd.read_csv(csv, skiprows=4)\n",
" df['filename'] = os.path.basename(csv)\n",
" df['category'] = 'GRAD'\n",
" df = df.iloc[1:, [0,1,2,6,7]]\n",
" \n",
" # Unpivot the columns\n",
" x = ['District', 'School', 'category', 'filename']\n",
" df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n",
" \n",
" df = df.rename(index=str, columns={'variable': 'subject'})\n",
" df['subject'] = 'Pct. Graduated 4 Yrs'\n",
" \n",
" df['level'] = 'High School'\n",
" \n",
" df = df.rename(index=str, columns={'filename': 'year'})\n",
" df['year'] = df.year.str[0:4]\n",
" \n",
" df = df.replace('no data', np.nan)\n",
" df = df.replace('too few data', np.nan)\n",
" df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0\n",
" \n",
" df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)\n",
" df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)\n",
" \n",
" \n",
" data.append(df)\n",
"frame = pd.concat(data)\n",
"\n",
"cols = ['District', 'School', 'category', 'level', 'subject', 'year', 'value', 'rank', 'percentile']\n",
"frame = frame[cols]\n",
"\n",
"grad = frame\n",
"grad.to_csv('./clean/grad_clean.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" District | \n",
" School | \n",
" category | \n",
" level | \n",
" subject | \n",
" year | \n",
" value | \n",
" rank | \n",
" percentile | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Barrington | \n",
" Barrington High School | \n",
" GRAD | \n",
" High School | \n",
" Pct. Graduated 4 Yrs | \n",
" 2010 | \n",
" 0.966 | \n",
" 2.0 | \n",
" 0.982456 | \n",
"
\n",
" \n",
" 1 | \n",
" Bristol Warren | \n",
" Mt. Hope High School | \n",
" GRAD | \n",
" High School | \n",
" Pct. Graduated 4 Yrs | \n",
" 2010 | \n",
" 0.881 | \n",
" 14.0 | \n",
" 0.754386 | \n",
"
\n",
" \n",
" 2 | \n",
" Burrillville | \n",
" Burrillville High School | \n",
" GRAD | \n",
" High School | \n",
" Pct. Graduated 4 Yrs | \n",
" 2010 | \n",
" 0.868 | \n",
" 16.0 | \n",
" 0.719298 | \n",
"
\n",
" \n",
" 3 | \n",
" Central Falls | \n",
" Central Falls High School | \n",
" GRAD | \n",
" High School | \n",
" Pct. Graduated 4 Yrs | \n",
" 2010 | \n",
" 0.708 | \n",
" 37.0 | \n",
" 0.315789 | \n",
"
\n",
" \n",
" 4 | \n",
" Chariho | \n",
" Chariho Regional High School | \n",
" GRAD | \n",
" High School | \n",
" Pct. Graduated 4 Yrs | \n",
" 2010 | \n",
" 0.893 | \n",
" 10.0 | \n",
" 0.824561 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" District School category level \\\n",
"0 Barrington Barrington High School GRAD High School \n",
"1 Bristol Warren Mt. Hope High School GRAD High School \n",
"2 Burrillville Burrillville High School GRAD High School \n",
"3 Central Falls Central Falls High School GRAD High School \n",
"4 Chariho Chariho Regional High School GRAD High School \n",
"\n",
" subject year value rank percentile \n",
"0 Pct. Graduated 4 Yrs 2010 0.966 2.0 0.982456 \n",
"1 Pct. Graduated 4 Yrs 2010 0.881 14.0 0.754386 \n",
"2 Pct. Graduated 4 Yrs 2010 0.868 16.0 0.719298 \n",
"3 Pct. Graduated 4 Yrs 2010 0.708 37.0 0.315789 \n",
"4 Pct. Graduated 4 Yrs 2010 0.893 10.0 0.824561 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grad.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Attendance\n",
"\n",
"fp = './raw/Attendance'\n",
"\n",
"allFiles = glob.glob(fp + \"/*.csv\")\n",
"frame = pd.DataFrame()\n",
"data = []\n",
"for csv in allFiles:\n",
" \n",
" df = pd.read_csv(csv, skiprows=4)\n",
" df['filename'] = os.path.basename(csv)\n",
" df['category'] = 'Attend'\n",
" df = df.iloc[1:]\n",
" \n",
" # Unpivot the columns\n",
" x = ['District', 'School', 'category', 'filename']\n",
" df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n",
" \n",
" df = df.rename(index=str, columns={'variable': 'subject'})\n",
" df['subject'] = 'Attendance Rate'\n",
" \n",
" df['level'] = df.filename.str[-7:-5]\n",
" df = df.replace('ES', 'Elementary School')\n",
" df = df.replace('MS', 'Middle School')\n",
" df = df.replace('HS', 'High School')\n",
" \n",
" df = df.rename(index=str, columns={'filename': 'year'})\n",
" df['year'] = df.year.str[0:4]\n",
" \n",
" df = df.replace('no data', np.nan)\n",
" df = df.replace('too few data', np.nan)\n",
" df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0\n",
" \n",
" df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)\n",
" df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)\n",
" \n",
" \n",
" data.append(df)\n",
"frame = pd.concat(data)\n",
"\n",
"cols = ['District', 'School', 'level', 'category', 'subject', 'year', 'value', 'rank', 'percentile']\n",
"frame = frame[cols]\n",
"\n",
"attend = frame\n",
"attend.to_csv('./clean/attend_clean.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" District | \n",
" School | \n",
" level | \n",
" category | \n",
" subject | \n",
" year | \n",
" value | \n",
" rank | \n",
" percentile | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Barrington | \n",
" Hampden Meadows School | \n",
" Elementary School | \n",
" Attend | \n",
" Attendance Rate | \n",
" 2010 | \n",
" 0.9659 | \n",
" 21.0 | \n",
" 0.887179 | \n",
"
\n",
" \n",
" 1 | \n",
" Barrington | \n",
" Nayatt School | \n",
" Elementary School | \n",
" Attend | \n",
" Attendance Rate | \n",
" 2010 | \n",
" 0.9585 | \n",
" 50.0 | \n",
" 0.712821 | \n",
"
\n",
" \n",
" 2 | \n",
" Barrington | \n",
" Primrose Hill School | \n",
" Elementary School | \n",
" Attend | \n",
" Attendance Rate | \n",
" 2010 | \n",
" 0.9620 | \n",
" 36.0 | \n",
" 0.810256 | \n",
"
\n",
" \n",
" 3 | \n",
" Barrington | \n",
" Sowams Elementary School | \n",
" Elementary School | \n",
" Attend | \n",
" Attendance Rate | \n",
" 2010 | \n",
" 0.9638 | \n",
" 26.0 | \n",
" 0.861538 | \n",
"
\n",
" \n",
" 4 | \n",
" Bristol Warren | \n",
" Colt Andrews School | \n",
" Elementary School | \n",
" Attend | \n",
" Attendance Rate | \n",
" 2010 | \n",
" 0.9494 | \n",
" 92.0 | \n",
" 0.425641 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" District School level category \\\n",
"0 Barrington Hampden Meadows School Elementary School Attend \n",
"1 Barrington Nayatt School Elementary School Attend \n",
"2 Barrington Primrose Hill School Elementary School Attend \n",
"3 Barrington Sowams Elementary School Elementary School Attend \n",
"4 Bristol Warren Colt Andrews School Elementary School Attend \n",
"\n",
" subject year value rank percentile \n",
"0 Attendance Rate 2010 0.9659 21.0 0.887179 \n",
"1 Attendance Rate 2010 0.9585 50.0 0.712821 \n",
"2 Attendance Rate 2010 0.9620 36.0 0.810256 \n",
"3 Attendance Rate 2010 0.9638 26.0 0.861538 \n",
"4 Attendance Rate 2010 0.9494 92.0 0.425641 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"attend.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Chronic Absence\n",
"\n",
"fp = './raw/Chronic Absenteeism'\n",
"\n",
"allFiles = glob.glob(fp + \"/*.csv\")\n",
"frame = pd.DataFrame()\n",
"data = []\n",
"for csv in allFiles:\n",
" \n",
" df = pd.read_csv(csv, skiprows=4)\n",
" df['filename'] = os.path.basename(csv)\n",
" df['category'] = 'Chronic'\n",
" df = df.iloc[1:]\n",
" \n",
" # Unpivot the columns\n",
" x = ['District', 'School', 'category', 'filename']\n",
" df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n",
" \n",
" df = df.rename(index=str, columns={'variable': 'subject'})\n",
" df['subject'] = 'Chronic Absence'\n",
" \n",
" df['level'] = df.filename.str[-7:-5]\n",
" df = df.replace('ES', 'Elementary School')\n",
" df = df.replace('MS', 'Middle School')\n",
" df = df.replace('HS', 'High School')\n",
" \n",
" df = df.rename(index=str, columns={'filename': 'year'})\n",
" df['year'] = df.year.str[0:4]\n",
" \n",
" df = df.replace('no data', np.nan)\n",
" df = df.replace('too few data', np.nan)\n",
" df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0\n",
" \n",
" df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)\n",
" df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)\n",
" \n",
" \n",
" data.append(df)\n",
"frame = pd.concat(data)\n",
"\n",
"cols = ['District', 'School', 'category', 'level', 'subject', 'year', 'value', 'rank', 'percentile']\n",
"frame = frame[cols]\n",
"\n",
"chronic = frame\n",
"chronic.to_csv('./clean/chronic_clean.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" District | \n",
" School | \n",
" category | \n",
" level | \n",
" subject | \n",
" year | \n",
" value | \n",
" rank | \n",
" percentile | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Barrington | \n",
" Hampden Meadows School | \n",
" Chronic | \n",
" Elementary School | \n",
" Chronic Absence | \n",
" 2012 | \n",
" 0.0175 | \n",
" 177.0 | \n",
" 0.056995 | \n",
"
\n",
" \n",
" 1 | \n",
" Barrington | \n",
" Nayatt School | \n",
" Chronic | \n",
" Elementary School | \n",
" Chronic Absence | \n",
" 2012 | \n",
" 0.0746 | \n",
" 126.0 | \n",
" 0.341969 | \n",
"
\n",
" \n",
" 2 | \n",
" Barrington | \n",
" Primrose Hill School | \n",
" Chronic | \n",
" Elementary School | \n",
" Chronic Absence | \n",
" 2012 | \n",
" 0.0785 | \n",
" 121.0 | \n",
" 0.367876 | \n",
"
\n",
" \n",
" 3 | \n",
" Barrington | \n",
" Sowams Elementary School | \n",
" Chronic | \n",
" Elementary School | \n",
" Chronic Absence | \n",
" 2012 | \n",
" 0.0560 | \n",
" 144.0 | \n",
" 0.243523 | \n",
"
\n",
" \n",
" 4 | \n",
" Bristol Warren | \n",
" Colt Andrews School | \n",
" Chronic | \n",
" Elementary School | \n",
" Chronic Absence | \n",
" 2012 | \n",
" 0.1162 | \n",
" 73.0 | \n",
" 0.616580 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" District School category level \\\n",
"0 Barrington Hampden Meadows School Chronic Elementary School \n",
"1 Barrington Nayatt School Chronic Elementary School \n",
"2 Barrington Primrose Hill School Chronic Elementary School \n",
"3 Barrington Sowams Elementary School Chronic Elementary School \n",
"4 Bristol Warren Colt Andrews School Chronic Elementary School \n",
"\n",
" subject year value rank percentile \n",
"0 Chronic Absence 2012 0.0175 177.0 0.056995 \n",
"1 Chronic Absence 2012 0.0746 126.0 0.341969 \n",
"2 Chronic Absence 2012 0.0785 121.0 0.367876 \n",
"3 Chronic Absence 2012 0.0560 144.0 0.243523 \n",
"4 Chronic Absence 2012 0.1162 73.0 0.616580 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chronic.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"school_region_map = {\n",
" 'Academy for Career Exploration' : 'Charter', \n",
" \"NE Laborers'/Cranston Public Schools Construction Career Academy\" : 'Charter',\n",
" 'Times2 Academy' : 'Charter'}"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"x = region_map"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Combine ALl\n",
"\n",
"fp = './clean/'\n",
"\n",
"allFiles = glob.glob(fp + \"/*.csv\")\n",
"frame = pd.DataFrame()\n",
"data = []\n",
"for csv in allFiles:\n",
" \n",
" df = pd.read_csv(csv) \n",
" \n",
" data.append(df)\n",
"frame = pd.concat(data, sort=True)\n",
"\n",
"cols = ['District', 'School', 'category', 'level', 'subject', 'year', 'value', 'rank', 'percentile']\n",
"frame = frame[cols]\n",
"\n",
"final = frame\n",
"\n",
"final['region'] = final.District.map(region_map)\n",
"final['region2'] = final.School.map(school_region_map)\n",
"\n",
"final.region2.fillna(final.region, inplace=True)\n",
"\n",
"final = final.drop(['region'], axis=1)\n",
"\n",
"final['subject'] = final['subject'].str.replace('Mathematics', 'Math')\n",
"final['subject'] = final['subject'].str.replace('ELA_Literacy', 'English')\n",
"\n",
"final = final.rename(columns={'region2': 'region'})\n",
"\n",
"final = final[final.District != 'Regional Collaborative']\n",
"\n",
"cols = ['District', 'region', 'School', 'category', 'level', 'subject', 'year', 'value', 'rank', 'percentile']\n",
"final = final[cols]\n",
"\n",
"final.to_csv('./clean/final_clean.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" District | \n",
" region | \n",
" School | \n",
" category | \n",
" level | \n",
" subject | \n",
" year | \n",
" value | \n",
" rank | \n",
" percentile | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Barrington | \n",
" Suburban | \n",
" Hampden Meadows School | \n",
" Attend | \n",
" Elementary School | \n",
" Attendance Rate | \n",
" 2010 | \n",
" 0.9659 | \n",
" 21.0 | \n",
" 0.887179 | \n",
"
\n",
" \n",
" 1 | \n",
" Barrington | \n",
" Suburban | \n",
" Nayatt School | \n",
" Attend | \n",
" Elementary School | \n",
" Attendance Rate | \n",
" 2010 | \n",
" 0.9585 | \n",
" 50.0 | \n",
" 0.712821 | \n",
"
\n",
" \n",
" 2 | \n",
" Barrington | \n",
" Suburban | \n",
" Primrose Hill School | \n",
" Attend | \n",
" Elementary School | \n",
" Attendance Rate | \n",
" 2010 | \n",
" 0.9620 | \n",
" 36.0 | \n",
" 0.810256 | \n",
"
\n",
" \n",
" 3 | \n",
" Barrington | \n",
" Suburban | \n",
" Sowams Elementary School | \n",
" Attend | \n",
" Elementary School | \n",
" Attendance Rate | \n",
" 2010 | \n",
" 0.9638 | \n",
" 26.0 | \n",
" 0.861538 | \n",
"
\n",
" \n",
" 4 | \n",
" Bristol Warren | \n",
" Regional | \n",
" Colt Andrews School | \n",
" Attend | \n",
" Elementary School | \n",
" Attendance Rate | \n",
" 2010 | \n",
" 0.9494 | \n",
" 92.0 | \n",
" 0.425641 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" District region School category \\\n",
"0 Barrington Suburban Hampden Meadows School Attend \n",
"1 Barrington Suburban Nayatt School Attend \n",
"2 Barrington Suburban Primrose Hill School Attend \n",
"3 Barrington Suburban Sowams Elementary School Attend \n",
"4 Bristol Warren Regional Colt Andrews School Attend \n",
"\n",
" level subject year value rank percentile \n",
"0 Elementary School Attendance Rate 2010 0.9659 21.0 0.887179 \n",
"1 Elementary School Attendance Rate 2010 0.9585 50.0 0.712821 \n",
"2 Elementary School Attendance Rate 2010 0.9620 36.0 0.810256 \n",
"3 Elementary School Attendance Rate 2010 0.9638 26.0 0.861538 \n",
"4 Elementary School Attendance Rate 2010 0.9494 92.0 0.425641 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}