{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Created by: [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [GitHub](https://github.com/SmirkyGraphs). Source: [RIDE](http://infoworks.ride.ri.gov/).\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cleaning RI Public Schools Testing Scores" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "using pandas to clean and combine multiple different datasets covering PRACC, SAT, NECAP, Graduation, Attendance" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import glob\n", "import csv\n", "import os" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "necap_dict = csv.reader(open(\"./dicts/necap.csv\"))\n", "map_necap = dict(necap_dict)\n", "\n", "pracc_dict = csv.reader(open(\"./dicts/pracc.csv\"))\n", "map_pracc = dict(pracc_dict)\n", "\n", "middle_dict = csv.reader(open(\"./dicts/middle_schools.csv\"))\n", "middle_map = dict(middle_dict)\n", "\n", "region_dict = csv.reader(open(\"./dicts/region_map.csv\"))\n", "region_map = dict(region_dict)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "#NECAP Cleaning\n", "\n", "fp = './raw/NECAP'\n", "\n", "allFiles = glob.glob(fp + \"/*.csv\")\n", "frame = pd.DataFrame()\n", "data = []\n", "for csv in allFiles:\n", " \n", " df = pd.read_csv(csv, skiprows=3)\n", " df['filename'] = os.path.basename(csv)\n", " df['category'] = 'NECAP'\n", " df = df.iloc[1:]\n", " \n", " # Unpivot the columns\n", " x = ['District', 'School', 'category', 'filename']\n", " df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n", " \n", " df = df.rename(index=str, columns={'variable': 'year'})\n", " df['year'] = df.year.str[13:-3]\n", " \n", " df = df.rename(index=str, columns={'filename': 'subject'})\n", " df['subject'] = df.subject.str[7:-12]\n", " \n", " df['level'] = df.subject.map(map_necap)\n", " \n", " df = df.replace('no data', np.nan)\n", " df = df.replace('too few data', np.nan)\n", " df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0\n", " \n", " df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)\n", " df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)\n", " \n", " data.append(df)\n", "frame = pd.concat(data)\n", "\n", "cols = ['District', 'School', 'level', 'category', 'subject', 'year', 'value', 'rank', 'percentile']\n", "frame = frame[cols]\n", "\n", "necap = frame\n", "necap.to_csv('./clean/necap_clean.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DistrictSchoollevelcategorysubjectyearvaluerankpercentile
0BarringtonBarrington High SchoolHigh SchoolNECAP11th Grade Math20080.711.01.000000
1Bristol WarrenMt. Hope High SchoolHigh SchoolNECAP11th Grade Math20080.409.00.827586
2BurrillvilleBurrillville High SchoolHigh SchoolNECAP11th Grade Math20080.3016.00.672414
3Central FallsCentral Falls High SchoolHigh SchoolNECAP11th Grade Math20080.0432.00.206897
4CharihoChariho Regional High SchoolHigh SchoolNECAP11th Grade Math20080.3115.00.724138
\n", "
" ], "text/plain": [ " District School level category \\\n", "0 Barrington Barrington High School High School NECAP \n", "1 Bristol Warren Mt. Hope High School High School NECAP \n", "2 Burrillville Burrillville High School High School NECAP \n", "3 Central Falls Central Falls High School High School NECAP \n", "4 Chariho Chariho Regional High School High School NECAP \n", "\n", " subject year value rank percentile \n", "0 11th Grade Math 2008 0.71 1.0 1.000000 \n", "1 11th Grade Math 2008 0.40 9.0 0.827586 \n", "2 11th Grade Math 2008 0.30 16.0 0.672414 \n", "3 11th Grade Math 2008 0.04 32.0 0.206897 \n", "4 11th Grade Math 2008 0.31 15.0 0.724138 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "necap.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#PRACC Cleaning\n", "\n", "fp = './raw/PRACC'\n", "\n", "allFiles = glob.glob(fp + \"/*.csv\")\n", "frame = pd.DataFrame()\n", "data = []\n", "for csv in allFiles:\n", " \n", " df = pd.read_csv(csv, skiprows=3)\n", " df['filename'] = os.path.basename(csv)\n", " df['category'] = 'PRACC'\n", " df = df.iloc[1:]\n", " \n", " # Unpivot the columns\n", " x = ['District', 'School', 'category', 'filename']\n", " df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n", " \n", " df = df.rename(index=str, columns={'variable': 'year'})\n", " df['year'] = df.year.str[29:-3]\n", " \n", " df = df.rename(index=str, columns={'filename': 'subject'})\n", " df['subject'] = df.subject.str[7:-12]\n", " \n", " df['level'] = df.subject.map(map_pracc)\n", " df['level2'] = df.School.map(middle_map) \n", " df['level3'] = np.where((df['subject'] == 'Geometry Mathematics')\n", " & (df['level2'] == 'Middle School')\n", " | (df['subject'] == 'Algebra I Mathematics') \n", " & (df['level2'] == 'Middle School'),'Middle School', None)\n", " \n", " df.level3.fillna(df.level, inplace=True)\n", " \n", " df = df.drop(['level'], axis=1)\n", " df = df.drop(['level2'], axis=1)\n", " df = df.rename(index=str, columns={'level3': 'level'})\n", "\n", " df = df.replace('no data', np.nan)\n", " df = df.replace('too few data', np.nan)\n", " df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0\n", " \n", " df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)\n", " df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)\n", " \n", " data.append(df)\n", "frame = pd.concat(data)\n", "\n", "cols = ['District', 'School', 'level', 'category', 'subject', 'year', 'value', 'rank', 'percentile']\n", "frame = frame[cols]\n", "\n", "pracc = frame\n", "pracc.to_csv('./clean/pracc_clean.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DistrictSchoollevelcategorysubjectyearvaluerankpercentile
0BarringtonBarrington High SchoolHigh SchoolPRACC10th Grade ELA_Literacy20140.75102.00.983871
1Bristol WarrenMt. Hope High SchoolHigh SchoolPRACC10th Grade ELA_Literacy20140.241831.00.516129
2BurrillvilleBurrillville High SchoolHigh SchoolPRACC10th Grade ELA_Literacy20140.263629.00.548387
3Central FallsCentral Falls High SchoolHigh SchoolPRACC10th Grade ELA_Literacy2014NaN52.00.096774
4CharihoChariho Regional High SchoolHigh SchoolPRACC10th Grade ELA_Literacy20140.422015.00.774194
\n", "
" ], "text/plain": [ " District School level category \\\n", "0 Barrington Barrington High School High School PRACC \n", "1 Bristol Warren Mt. Hope High School High School PRACC \n", "2 Burrillville Burrillville High School High School PRACC \n", "3 Central Falls Central Falls High School High School PRACC \n", "4 Chariho Chariho Regional High School High School PRACC \n", "\n", " subject year value rank percentile \n", "0 10th Grade ELA_Literacy 2014 0.7510 2.0 0.983871 \n", "1 10th Grade ELA_Literacy 2014 0.2418 31.0 0.516129 \n", "2 10th Grade ELA_Literacy 2014 0.2636 29.0 0.548387 \n", "3 10th Grade ELA_Literacy 2014 NaN 52.0 0.096774 \n", "4 10th Grade ELA_Literacy 2014 0.4220 15.0 0.774194 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pracc.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "#SAT Cleaning\n", "\n", "fp = './raw/SAT'\n", "\n", "allFiles = glob.glob(fp + \"/*.csv\")\n", "frame = pd.DataFrame()\n", "data = []\n", "for csv in allFiles:\n", " \n", " df = pd.read_csv(csv, skiprows=3)\n", " df['filename'] = os.path.basename(csv)\n", " df['category'] = 'SAT'\n", " df = df.iloc[1:]\n", " \n", " df = df.replace('no data',np.nan)\n", " df = df.replace('too few data',np.nan)\n", " \n", " col_list = ['Mathematics Average', 'Reading Average', 'Writing Average']\n", " df['Total Average'] = df[col_list].astype(float).sum(axis=1)\n", " \n", " # Unpivot the columns\n", " x = ['District', 'School', 'category', 'filename']\n", " df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n", " \n", " df = df.rename(index=str, columns={'variable': 'subject'})\n", " df['subject'] = df.subject.str[:-7]\n", " df['subject'] = 'SAT ' + df['subject'].astype(str) + 'Avg'\n", " \n", " df['level'] = 'High School'\n", " \n", " df = df.rename(index=str, columns={'filename': 'year'})\n", " df['year'] = df.year.str[:-9]\n", " \n", " df['value'] = df['value'].astype('float')\n", " df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)\n", " df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)\n", " \n", " \n", " data.append(df)\n", "frame = pd.concat(data)\n", "\n", "cols = ['District', 'School', 'level', 'category', 'subject', 'year', 'value', 'rank', 'percentile']\n", "frame = frame[cols]\n", "\n", "sat = frame\n", "sat.to_csv('./clean/sat_clean.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DistrictSchoollevelcategorysubjectyearvaluerankpercentile
0BarringtonBarrington High SchoolHigh SchoolSATSAT Mathematics Avg2010576.02.00.982456
1Bristol WarrenMt. Hope High SchoolHigh SchoolSATSAT Mathematics Avg2010484.019.00.622807
2BurrillvilleBurrillville High SchoolHigh SchoolSATSAT Mathematics Avg2010488.018.00.649123
3Central FallsCentral Falls High SchoolHigh SchoolSATSAT Mathematics Avg2010374.039.00.263158
4CharihoChariho Regional High SchoolHigh SchoolSATSAT Mathematics Avg2010520.09.00.842105
\n", "
" ], "text/plain": [ " District School level category \\\n", "0 Barrington Barrington High School High School SAT \n", "1 Bristol Warren Mt. Hope High School High School SAT \n", "2 Burrillville Burrillville High School High School SAT \n", "3 Central Falls Central Falls High School High School SAT \n", "4 Chariho Chariho Regional High School High School SAT \n", "\n", " subject year value rank percentile \n", "0 SAT Mathematics Avg 2010 576.0 2.0 0.982456 \n", "1 SAT Mathematics Avg 2010 484.0 19.0 0.622807 \n", "2 SAT Mathematics Avg 2010 488.0 18.0 0.649123 \n", "3 SAT Mathematics Avg 2010 374.0 39.0 0.263158 \n", "4 SAT Mathematics Avg 2010 520.0 9.0 0.842105 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sat.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Graduation Rate\n", "\n", "fp = './raw/Graduation Rate'\n", "\n", "allFiles = glob.glob(fp + \"/*.csv\")\n", "frame = pd.DataFrame()\n", "data = []\n", "for csv in allFiles:\n", " \n", " df = pd.read_csv(csv, skiprows=4)\n", " df['filename'] = os.path.basename(csv)\n", " df['category'] = 'GRAD'\n", " df = df.iloc[1:, [0,1,2,6,7]]\n", " \n", " # Unpivot the columns\n", " x = ['District', 'School', 'category', 'filename']\n", " df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n", " \n", " df = df.rename(index=str, columns={'variable': 'subject'})\n", " df['subject'] = 'Pct. Graduated 4 Yrs'\n", " \n", " df['level'] = 'High School'\n", " \n", " df = df.rename(index=str, columns={'filename': 'year'})\n", " df['year'] = df.year.str[0:4]\n", " \n", " df = df.replace('no data', np.nan)\n", " df = df.replace('too few data', np.nan)\n", " df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0\n", " \n", " df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)\n", " df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)\n", " \n", " \n", " data.append(df)\n", "frame = pd.concat(data)\n", "\n", "cols = ['District', 'School', 'category', 'level', 'subject', 'year', 'value', 'rank', 'percentile']\n", "frame = frame[cols]\n", "\n", "grad = frame\n", "grad.to_csv('./clean/grad_clean.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DistrictSchoolcategorylevelsubjectyearvaluerankpercentile
0BarringtonBarrington High SchoolGRADHigh SchoolPct. Graduated 4 Yrs20100.9662.00.982456
1Bristol WarrenMt. Hope High SchoolGRADHigh SchoolPct. Graduated 4 Yrs20100.88114.00.754386
2BurrillvilleBurrillville High SchoolGRADHigh SchoolPct. Graduated 4 Yrs20100.86816.00.719298
3Central FallsCentral Falls High SchoolGRADHigh SchoolPct. Graduated 4 Yrs20100.70837.00.315789
4CharihoChariho Regional High SchoolGRADHigh SchoolPct. Graduated 4 Yrs20100.89310.00.824561
\n", "
" ], "text/plain": [ " District School category level \\\n", "0 Barrington Barrington High School GRAD High School \n", "1 Bristol Warren Mt. Hope High School GRAD High School \n", "2 Burrillville Burrillville High School GRAD High School \n", "3 Central Falls Central Falls High School GRAD High School \n", "4 Chariho Chariho Regional High School GRAD High School \n", "\n", " subject year value rank percentile \n", "0 Pct. Graduated 4 Yrs 2010 0.966 2.0 0.982456 \n", "1 Pct. Graduated 4 Yrs 2010 0.881 14.0 0.754386 \n", "2 Pct. Graduated 4 Yrs 2010 0.868 16.0 0.719298 \n", "3 Pct. Graduated 4 Yrs 2010 0.708 37.0 0.315789 \n", "4 Pct. Graduated 4 Yrs 2010 0.893 10.0 0.824561 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grad.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Attendance\n", "\n", "fp = './raw/Attendance'\n", "\n", "allFiles = glob.glob(fp + \"/*.csv\")\n", "frame = pd.DataFrame()\n", "data = []\n", "for csv in allFiles:\n", " \n", " df = pd.read_csv(csv, skiprows=4)\n", " df['filename'] = os.path.basename(csv)\n", " df['category'] = 'Attend'\n", " df = df.iloc[1:]\n", " \n", " # Unpivot the columns\n", " x = ['District', 'School', 'category', 'filename']\n", " df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n", " \n", " df = df.rename(index=str, columns={'variable': 'subject'})\n", " df['subject'] = 'Attendance Rate'\n", " \n", " df['level'] = df.filename.str[-7:-5]\n", " df = df.replace('ES', 'Elementary School')\n", " df = df.replace('MS', 'Middle School')\n", " df = df.replace('HS', 'High School')\n", " \n", " df = df.rename(index=str, columns={'filename': 'year'})\n", " df['year'] = df.year.str[0:4]\n", " \n", " df = df.replace('no data', np.nan)\n", " df = df.replace('too few data', np.nan)\n", " df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0\n", " \n", " df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)\n", " df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)\n", " \n", " \n", " data.append(df)\n", "frame = pd.concat(data)\n", "\n", "cols = ['District', 'School', 'level', 'category', 'subject', 'year', 'value', 'rank', 'percentile']\n", "frame = frame[cols]\n", "\n", "attend = frame\n", "attend.to_csv('./clean/attend_clean.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DistrictSchoollevelcategorysubjectyearvaluerankpercentile
0BarringtonHampden Meadows SchoolElementary SchoolAttendAttendance Rate20100.965921.00.887179
1BarringtonNayatt SchoolElementary SchoolAttendAttendance Rate20100.958550.00.712821
2BarringtonPrimrose Hill SchoolElementary SchoolAttendAttendance Rate20100.962036.00.810256
3BarringtonSowams Elementary SchoolElementary SchoolAttendAttendance Rate20100.963826.00.861538
4Bristol WarrenColt Andrews SchoolElementary SchoolAttendAttendance Rate20100.949492.00.425641
\n", "
" ], "text/plain": [ " District School level category \\\n", "0 Barrington Hampden Meadows School Elementary School Attend \n", "1 Barrington Nayatt School Elementary School Attend \n", "2 Barrington Primrose Hill School Elementary School Attend \n", "3 Barrington Sowams Elementary School Elementary School Attend \n", "4 Bristol Warren Colt Andrews School Elementary School Attend \n", "\n", " subject year value rank percentile \n", "0 Attendance Rate 2010 0.9659 21.0 0.887179 \n", "1 Attendance Rate 2010 0.9585 50.0 0.712821 \n", "2 Attendance Rate 2010 0.9620 36.0 0.810256 \n", "3 Attendance Rate 2010 0.9638 26.0 0.861538 \n", "4 Attendance Rate 2010 0.9494 92.0 0.425641 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "attend.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Chronic Absence\n", "\n", "fp = './raw/Chronic Absenteeism'\n", "\n", "allFiles = glob.glob(fp + \"/*.csv\")\n", "frame = pd.DataFrame()\n", "data = []\n", "for csv in allFiles:\n", " \n", " df = pd.read_csv(csv, skiprows=4)\n", " df['filename'] = os.path.basename(csv)\n", " df['category'] = 'Chronic'\n", " df = df.iloc[1:]\n", " \n", " # Unpivot the columns\n", " x = ['District', 'School', 'category', 'filename']\n", " df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())\n", " \n", " df = df.rename(index=str, columns={'variable': 'subject'})\n", " df['subject'] = 'Chronic Absence'\n", " \n", " df['level'] = df.filename.str[-7:-5]\n", " df = df.replace('ES', 'Elementary School')\n", " df = df.replace('MS', 'Middle School')\n", " df = df.replace('HS', 'High School')\n", " \n", " df = df.rename(index=str, columns={'filename': 'year'})\n", " df['year'] = df.year.str[0:4]\n", " \n", " df = df.replace('no data', np.nan)\n", " df = df.replace('too few data', np.nan)\n", " df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0\n", " \n", " df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)\n", " df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)\n", " \n", " \n", " data.append(df)\n", "frame = pd.concat(data)\n", "\n", "cols = ['District', 'School', 'category', 'level', 'subject', 'year', 'value', 'rank', 'percentile']\n", "frame = frame[cols]\n", "\n", "chronic = frame\n", "chronic.to_csv('./clean/chronic_clean.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DistrictSchoolcategorylevelsubjectyearvaluerankpercentile
0BarringtonHampden Meadows SchoolChronicElementary SchoolChronic Absence20120.0175177.00.056995
1BarringtonNayatt SchoolChronicElementary SchoolChronic Absence20120.0746126.00.341969
2BarringtonPrimrose Hill SchoolChronicElementary SchoolChronic Absence20120.0785121.00.367876
3BarringtonSowams Elementary SchoolChronicElementary SchoolChronic Absence20120.0560144.00.243523
4Bristol WarrenColt Andrews SchoolChronicElementary SchoolChronic Absence20120.116273.00.616580
\n", "
" ], "text/plain": [ " District School category level \\\n", "0 Barrington Hampden Meadows School Chronic Elementary School \n", "1 Barrington Nayatt School Chronic Elementary School \n", "2 Barrington Primrose Hill School Chronic Elementary School \n", "3 Barrington Sowams Elementary School Chronic Elementary School \n", "4 Bristol Warren Colt Andrews School Chronic Elementary School \n", "\n", " subject year value rank percentile \n", "0 Chronic Absence 2012 0.0175 177.0 0.056995 \n", "1 Chronic Absence 2012 0.0746 126.0 0.341969 \n", "2 Chronic Absence 2012 0.0785 121.0 0.367876 \n", "3 Chronic Absence 2012 0.0560 144.0 0.243523 \n", "4 Chronic Absence 2012 0.1162 73.0 0.616580 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chronic.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "school_region_map = {\n", " 'Academy for Career Exploration' : 'Charter', \n", " \"NE Laborers'/Cranston Public Schools Construction Career Academy\" : 'Charter',\n", " 'Times2 Academy' : 'Charter'}" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "x = region_map" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Combine ALl\n", "\n", "fp = './clean/'\n", "\n", "allFiles = glob.glob(fp + \"/*.csv\")\n", "frame = pd.DataFrame()\n", "data = []\n", "for csv in allFiles:\n", " \n", " df = pd.read_csv(csv) \n", " \n", " data.append(df)\n", "frame = pd.concat(data, sort=True)\n", "\n", "cols = ['District', 'School', 'category', 'level', 'subject', 'year', 'value', 'rank', 'percentile']\n", "frame = frame[cols]\n", "\n", "final = frame\n", "\n", "final['region'] = final.District.map(region_map)\n", "final['region2'] = final.School.map(school_region_map)\n", "\n", "final.region2.fillna(final.region, inplace=True)\n", "\n", "final = final.drop(['region'], axis=1)\n", "\n", "final['subject'] = final['subject'].str.replace('Mathematics', 'Math')\n", "final['subject'] = final['subject'].str.replace('ELA_Literacy', 'English')\n", "\n", "final = final.rename(columns={'region2': 'region'})\n", "\n", "final = final[final.District != 'Regional Collaborative']\n", "\n", "cols = ['District', 'region', 'School', 'category', 'level', 'subject', 'year', 'value', 'rank', 'percentile']\n", "final = final[cols]\n", "\n", "final.to_csv('./clean/final_clean.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DistrictregionSchoolcategorylevelsubjectyearvaluerankpercentile
0BarringtonSuburbanHampden Meadows SchoolAttendElementary SchoolAttendance Rate20100.965921.00.887179
1BarringtonSuburbanNayatt SchoolAttendElementary SchoolAttendance Rate20100.958550.00.712821
2BarringtonSuburbanPrimrose Hill SchoolAttendElementary SchoolAttendance Rate20100.962036.00.810256
3BarringtonSuburbanSowams Elementary SchoolAttendElementary SchoolAttendance Rate20100.963826.00.861538
4Bristol WarrenRegionalColt Andrews SchoolAttendElementary SchoolAttendance Rate20100.949492.00.425641
\n", "
" ], "text/plain": [ " District region School category \\\n", "0 Barrington Suburban Hampden Meadows School Attend \n", "1 Barrington Suburban Nayatt School Attend \n", "2 Barrington Suburban Primrose Hill School Attend \n", "3 Barrington Suburban Sowams Elementary School Attend \n", "4 Bristol Warren Regional Colt Andrews School Attend \n", "\n", " level subject year value rank percentile \n", "0 Elementary School Attendance Rate 2010 0.9659 21.0 0.887179 \n", "1 Elementary School Attendance Rate 2010 0.9585 50.0 0.712821 \n", "2 Elementary School Attendance Rate 2010 0.9620 36.0 0.810256 \n", "3 Elementary School Attendance Rate 2010 0.9638 26.0 0.861538 \n", "4 Elementary School Attendance Rate 2010 0.9494 92.0 0.425641 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final.head()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }