{ "cells": [ { "cell_type": "code", "execution_count": 203, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "pd.options.mode.chained_assignment = None" ] }, { "cell_type": "code", "execution_count": 204, "metadata": { "collapsed": false }, "outputs": [], "source": [ "octo = pd.read_csv('bad-scrape.raw.csv', names=[i for i in range(0, 4)])" ] }, { "cell_type": "code", "execution_count": 205, "metadata": { "collapsed": false }, "outputs": [], "source": [ "octo_test = octo.copy()" ] }, { "cell_type": "code", "execution_count": 206, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# blank col, name col, value col, blank col\n", "rows = octo_test.loc[octo_test[0].isnull() & octo_test[3].isnull()].index.values\n", "octo_test.iloc[rows, 0] = octo_test.iloc[rows][1]\n", "octo_test.iloc[rows, 1] = octo_test.iloc[rows][2]" ] }, { "cell_type": "code", "execution_count": 207, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# blank col, name col, blank col, value col\n", "rows = octo_test.loc[octo_test[0].isnull() & octo_test[2].isnull()].index.values\n", "octo_test.iloc[rows, 0] = octo_test.iloc[rows][1]\n", "octo_test.iloc[rows, 1] = octo_test.iloc[rows][3]" ] }, { "cell_type": "code", "execution_count": 208, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# blank col, blank col, name col, value col\n", "rows = octo_test.loc[octo_test[0].isnull() & octo_test[1].isnull()].index.values\n", "octo_test.iloc[rows, 0] = octo_test.iloc[rows][2]\n", "octo_test.iloc[rows, 1] = octo_test.iloc[rows][3]" ] }, { "cell_type": "code", "execution_count": 209, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# name col, blank col, blank col, value col\n", "octo_test.loc[octo_test[1].isnull() & octo_test[2].isnull(), 1] = octo_test.loc[octo_test[1].isnull() & octo_test[2].isnull()][3]" ] }, { "cell_type": "code", "execution_count": 210, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# name col, blank col, value col, blank col\n", "octo_test.loc[octo_test[0].notnull() & octo_test[1].isnull() & octo_test[2].notnull(), 1] = octo_test.loc[octo_test[0].notnull() & octo_test[1].isnull() & octo_test[2].notnull()][2]" ] }, { "cell_type": "code", "execution_count": 211, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# remove indoor/outdoor rows\n", "octo_test[0] = octo_test[0].str.rstrip()\n", "octo_test.drop(list(octo_test.loc[octo_test[0].isin(['Indoor', 'Outdoor', 'Oudoor'])].index.values), inplace=True)\n", "\n", "# remove dummy cols\n", "octo_test.drop([2, 3], axis=1, inplace=True)\n", "\n", "# reset the index\n", "octo_test.reset_index(inplace=True, drop=True)" ] }, { "cell_type": "code", "execution_count": 212, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def process(df):\n", " # cleanup\n", " df.rename(columns={0: 'fixture_location', 1: 'result_value'}, inplace=True)\n", " df['park_name'] = ['' for i in range(0, len(df))]\n", " \n", " # get park names\n", " indices = df.loc[df.result_value.isnull()].index\n", " names = df.loc[df.result_value.isnull()].fixture_location\n", " parks = list(zip(indices, names))\n", "\n", " # restructure\n", " for ind, park in enumerate(parks):\n", " start, park = park\n", " try:\n", " end = parks[ind+1][0] - 1\n", " except IndexError:\n", " end = len(df)\n", " df.loc[start:end, 'park_name'] = park\n", " \n", " df.drop(list(indices), inplace=True)\n", " df['result_flag'] = df['result_value'].apply(lambda x: x.split(' ')[-1] if len(x.split(' ')) > 1 else '')\n", " df['result_value'] = df['result_value'].apply(lambda x: x.split(' ')[0]).str.replace('˂', '<')" ] }, { "cell_type": "code", "execution_count": 213, "metadata": { "collapsed": false }, "outputs": [], "source": [ "process(octo_test)\n", "octo_test = octo_test.ix[:, [2, 0, 1, 3]]" ] }, { "cell_type": "code", "execution_count": 214, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# octo_test.to_csv('chi-parks-lead-results-10-2016.processed.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 215, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# octo_test.loc[octo_test['fixture_location'].str.contains('\\*'), 'offline'] = 'Y'\n", "# octo_test.loc[octo_test['result_value'].str.contains('<'), 'threshold'] = 'Y'" ] }, { "cell_type": "code", "execution_count": 216, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# clean up two rows w/o space between result value and result flag\n", "octo_test.iloc[1002]['result_value'] = '3.09'\n", "octo_test.iloc[1002]['result_flag'] = 'J'\n", "octo_test.iloc[1002]['processed_result'] = 3.09\n", "\n", "octo_test.iloc[2020]['result_value'] = '0.243'\n", "octo_test.iloc[2020]['result_flag'] = 'J'\n", "octo_test.iloc[2020]['processed_result'] = 0.243" ] }, { "cell_type": "code", "execution_count": 217, "metadata": { "collapsed": false }, "outputs": [], "source": [ "octo_test['processed_result'] = octo_test['result_value'].apply(lambda x: 0 if '<' in x else pd.to_numeric(x))" ] }, { "cell_type": "code", "execution_count": 218, "metadata": { "collapsed": false }, "outputs": [], "source": [ "octo_test['exceeds_epa'] = octo_test['processed_result'].apply(lambda x: 1 if x > 15 else 0)" ] }, { "cell_type": "code", "execution_count": 219, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | park_name | \n", "fixture_location | \n", "result_value | \n", "result_flag | \n", "processed_result | \n", "exceeds_epa | \n", "
---|---|---|---|---|---|---|
1 | \n", "Abbott | \n", "Fitness Center | \n", "<0.120 | \n", "U | \n", "0.000 | \n", "0 | \n", "
2 | \n", "Abbott | \n", "Main Lobby | \n", "0.579 | \n", "J | \n", "0.579 | \n", "0 | \n", "
3 | \n", "Abbott | \n", "Fountain 1 High | \n", "<0.120 | \n", "U | \n", "0.000 | \n", "0 | \n", "
4 | \n", "Abbott | \n", "Fountain 1 Low | \n", "<0.120 | \n", "U | \n", "0.000 | \n", "0 | \n", "
5 | \n", "Abbott | \n", "Fountain 2 | \n", "<0.120 | \n", "U | \n", "0.000 | \n", "0 | \n", "
6 | \n", "Abbott | \n", "Fountain 3 | \n", "0.245 | \n", "J | \n", "0.245 | \n", "0 | \n", "
7 | \n", "Abbott | \n", "Fountain 4 | \n", "4.48 | \n", "J | \n", "4.480 | \n", "0 | \n", "
9 | \n", "Ada | \n", "Main Lobby | \n", "<2.0 | \n", "U | \n", "0.000 | \n", "0 | \n", "
10 | \n", "Ada | \n", "Kitchen Sink | \n", "<2.0 | \n", "U | \n", "0.000 | \n", "0 | \n", "
11 | \n", "Ada | \n", "Fountain 1 | \n", "11.7 | \n", "\n", " | 11.700 | \n", "0 | \n", "
12 | \n", "Ada | \n", "Fountain 2 | \n", "<0.120 | \n", "U | \n", "0.000 | \n", "0 | \n", "
13 | \n", "Ada | \n", "Fountain 3 | \n", "9.63 | \n", "\n", " | 9.630 | \n", "0 | \n", "
14 | \n", "Ada | \n", "Fountain 4 | \n", "7.57 | \n", "\n", " | 7.570 | \n", "0 | \n", "
16 | \n", "Adams (George & Adelle) | \n", "Hallway | \n", "4.34 | \n", "J | \n", "4.340 | \n", "0 | \n", "
17 | \n", "Adams (George & Adelle) | \n", "Kitchen Sink | \n", "5.3 | \n", "\n", " | 5.300 | \n", "0 | \n", "
18 | \n", "Adams (George & Adelle) | \n", "Fountain 1 | \n", "<2.0 | \n", "U | \n", "0.000 | \n", "0 | \n", "
19 | \n", "Adams (George & Adelle) | \n", "Fountain 2 High* | \n", "75 | \n", "\n", " | 75.000 | \n", "1 | \n", "
20 | \n", "Adams (George & Adelle) | \n", "Fountain 2 Low* | \n", "75 | \n", "\n", " | 75.000 | \n", "1 | \n", "
22 | \n", "Adams (John) | \n", "Hallway Outside Washroom | \n", "4.02 | \n", "J | \n", "4.020 | \n", "0 | \n", "
24 | \n", "Addam/Medil | \n", "Fountain 1 | \n", "7.5 | \n", "\n", " | 7.500 | \n", "0 | \n", "
26 | \n", "Addams Memorial (Jane) | \n", "Fountain 1 | \n", "<2.0 | \n", "U | \n", "0.000 | \n", "0 | \n", "
28 | \n", "Aiello | \n", "Fountain 1 High | \n", "2.9 | \n", "J | \n", "2.900 | \n", "0 | \n", "
29 | \n", "Aiello | \n", "Fountain 1 Middle | \n", "2.88 | \n", "J | \n", "2.880 | \n", "0 | \n", "
30 | \n", "Aiello | \n", "Fountain 1 Low | \n", "3.3 | \n", "J | \n", "3.300 | \n", "0 | \n", "
32 | \n", "Algonquin | \n", "Fountain 1 High | \n", "6.64 | \n", "\n", " | 6.640 | \n", "0 | \n", "
33 | \n", "Algonquin | \n", "Fountain 1 Middle | \n", "6.65 | \n", "\n", " | 6.650 | \n", "0 | \n", "
34 | \n", "Algonquin | \n", "Fountain 1 Low | \n", "6.67 | \n", "\n", " | 6.670 | \n", "0 | \n", "
36 | \n", "Altgeld | \n", "Main Lobby - High | \n", "4.91 | \n", "J | \n", "4.910 | \n", "0 | \n", "
37 | \n", "Altgeld | \n", "Main Lobby - Low | \n", "2.31 | \n", "J | \n", "2.310 | \n", "0 | \n", "
38 | \n", "Altgeld | \n", "Basement Fitness Center | \n", "0.35 | \n", "J | \n", "0.350 | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
2842 | \n", "Williams-Davis | \n", "Fountain 1 Low | \n", "<2.0 | \n", "U | \n", "0.000 | \n", "0 | \n", "
2844 | \n", "Willye White | \n", "Lobby | \n", "<0.120 | \n", "U | \n", "0.000 | \n", "0 | \n", "
2845 | \n", "Willye White | \n", "Gym Hallway West | \n", "<0.120 | \n", "U | \n", "0.000 | \n", "0 | \n", "
2846 | \n", "Willye White | \n", "Fountain 1 High | \n", "<2.0 | \n", "\n", " | 0.000 | \n", "0 | \n", "
2847 | \n", "Willye White | \n", "Fountain 1 Low | \n", "<2.0 | \n", "U | \n", "0.000 | \n", "0 | \n", "
2849 | \n", "Wilson (Frank) | \n", "Club Room | \n", "0.201 | \n", "J | \n", "0.201 | \n", "0 | \n", "
2850 | \n", "Wilson (Frank) | \n", "Fountain 1* | \n", "51.2 | \n", "\n", " | 51.200 | \n", "1 | \n", "
2851 | \n", "Wilson (Frank) | \n", "Fountain 2 High* | \n", "1.21 | \n", "J | \n", "1.210 | \n", "0 | \n", "
2852 | \n", "Wilson (Frank) | \n", "Fountain 2 Middle* | \n", "49.6 | \n", "\n", " | 49.600 | \n", "1 | \n", "
2853 | \n", "Wilson (Frank) | \n", "Fountain 2 Low* | \n", "<0.120 | \n", "U | \n", "0.000 | \n", "0 | \n", "
2854 | \n", "Wilson (Frank) | \n", "Fountain 3* | \n", "23.9 | \n", "\n", " | 23.900 | \n", "1 | \n", "
2856 | \n", "Wilson (John) Community Center | \n", "Hallway Near Washrooms | \n", "0.789 | \n", "J | \n", "0.789 | \n", "0 | \n", "
2857 | \n", "Wilson (John) Community Center | \n", "Fountain 1 High | \n", "4.11 | \n", "J | \n", "4.110 | \n", "0 | \n", "
2858 | \n", "Wilson (John) Community Center | \n", "Fountain 1 Low | \n", "5.26 | \n", "\n", " | 5.260 | \n", "0 | \n", "
2860 | \n", "Winnemac | \n", "Fountain 1 High | \n", "0.121 | \n", "J | \n", "0.121 | \n", "0 | \n", "
2861 | \n", "Winnemac | \n", "Fountain 1 Middle | \n", "0.121 | \n", "J | \n", "0.121 | \n", "0 | \n", "
2862 | \n", "Winnemac | \n", "Fountain 1 Low | \n", "<0.120 | \n", "U | \n", "0.000 | \n", "0 | \n", "
2864 | \n", "Wolcott | \n", "Fountain 1 High | \n", "14.8 | \n", "\n", " | 14.800 | \n", "0 | \n", "
2865 | \n", "Wolcott | \n", "Fountain 1 Low | \n", "14.4 | \n", "\n", " | 14.400 | \n", "0 | \n", "
2867 | \n", "Wolfe | \n", "Main Lobby | \n", "1.18 | \n", "J | \n", "1.180 | \n", "0 | \n", "
2868 | \n", "Wolfe | \n", "Fountain 1 | \n", "4 | \n", "\n", " | 4.000 | \n", "0 | \n", "
2870 | \n", "Woodhull | \n", "Main Lobby | \n", "0.231 | \n", "J | \n", "0.231 | \n", "0 | \n", "
2871 | \n", "Woodhull | \n", "Fountain 1 High | \n", "0.381 | \n", "J | \n", "0.381 | \n", "0 | \n", "
2872 | \n", "Woodhull | \n", "Fountain 1 Middle | \n", "0.717 | \n", "J | \n", "0.717 | \n", "0 | \n", "
2873 | \n", "Woodhull | \n", "Fountain 1 Low | \n", "0.284 | \n", "\n", " | 0.284 | \n", "0 | \n", "
2875 | \n", "Wrightwood | \n", "Fountain 1 Low* | \n", "26 | \n", "\n", " | 26.000 | \n", "1 | \n", "
2876 | \n", "Wrightwood | \n", "Fountain 1 Middle* | \n", "31 | \n", "\n", " | 31.000 | \n", "1 | \n", "
2877 | \n", "Wrightwood | \n", "Fountain 1 High* | \n", "26 | \n", "\n", " | 26.000 | \n", "1 | \n", "
2878 | \n", "Wrightwood | \n", "Fountain 2* | \n", "24 | \n", "\n", " | 24.000 | \n", "1 | \n", "
2880 | \n", "Zatterberg | \n", "Fountain 1* | \n", "86 | \n", "\n", " | 86.000 | \n", "1 | \n", "
2409 rows × 6 columns
\n", "