{ "cells": [ { "cell_type": "code", "execution_count": 203, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "pd.options.mode.chained_assignment = None" ] }, { "cell_type": "code", "execution_count": 204, "metadata": { "collapsed": false }, "outputs": [], "source": [ "octo = pd.read_csv('bad-scrape.raw.csv', names=[i for i in range(0, 4)])" ] }, { "cell_type": "code", "execution_count": 205, "metadata": { "collapsed": false }, "outputs": [], "source": [ "octo_test = octo.copy()" ] }, { "cell_type": "code", "execution_count": 206, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# blank col, name col, value col, blank col\n", "rows = octo_test.loc[octo_test[0].isnull() & octo_test[3].isnull()].index.values\n", "octo_test.iloc[rows, 0] = octo_test.iloc[rows][1]\n", "octo_test.iloc[rows, 1] = octo_test.iloc[rows][2]" ] }, { "cell_type": "code", "execution_count": 207, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# blank col, name col, blank col, value col\n", "rows = octo_test.loc[octo_test[0].isnull() & octo_test[2].isnull()].index.values\n", "octo_test.iloc[rows, 0] = octo_test.iloc[rows][1]\n", "octo_test.iloc[rows, 1] = octo_test.iloc[rows][3]" ] }, { "cell_type": "code", "execution_count": 208, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# blank col, blank col, name col, value col\n", "rows = octo_test.loc[octo_test[0].isnull() & octo_test[1].isnull()].index.values\n", "octo_test.iloc[rows, 0] = octo_test.iloc[rows][2]\n", "octo_test.iloc[rows, 1] = octo_test.iloc[rows][3]" ] }, { "cell_type": "code", "execution_count": 209, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# name col, blank col, blank col, value col\n", "octo_test.loc[octo_test[1].isnull() & octo_test[2].isnull(), 1] = octo_test.loc[octo_test[1].isnull() & octo_test[2].isnull()][3]" ] }, { "cell_type": "code", "execution_count": 210, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# name col, blank col, value col, blank col\n", "octo_test.loc[octo_test[0].notnull() & octo_test[1].isnull() & octo_test[2].notnull(), 1] = octo_test.loc[octo_test[0].notnull() & octo_test[1].isnull() & octo_test[2].notnull()][2]" ] }, { "cell_type": "code", "execution_count": 211, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# remove indoor/outdoor rows\n", "octo_test[0] = octo_test[0].str.rstrip()\n", "octo_test.drop(list(octo_test.loc[octo_test[0].isin(['Indoor', 'Outdoor', 'Oudoor'])].index.values), inplace=True)\n", "\n", "# remove dummy cols\n", "octo_test.drop([2, 3], axis=1, inplace=True)\n", "\n", "# reset the index\n", "octo_test.reset_index(inplace=True, drop=True)" ] }, { "cell_type": "code", "execution_count": 212, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def process(df):\n", " # cleanup\n", " df.rename(columns={0: 'fixture_location', 1: 'result_value'}, inplace=True)\n", " df['park_name'] = ['' for i in range(0, len(df))]\n", " \n", " # get park names\n", " indices = df.loc[df.result_value.isnull()].index\n", " names = df.loc[df.result_value.isnull()].fixture_location\n", " parks = list(zip(indices, names))\n", "\n", " # restructure\n", " for ind, park in enumerate(parks):\n", " start, park = park\n", " try:\n", " end = parks[ind+1][0] - 1\n", " except IndexError:\n", " end = len(df)\n", " df.loc[start:end, 'park_name'] = park\n", " \n", " df.drop(list(indices), inplace=True)\n", " df['result_flag'] = df['result_value'].apply(lambda x: x.split(' ')[-1] if len(x.split(' ')) > 1 else '')\n", " df['result_value'] = df['result_value'].apply(lambda x: x.split(' ')[0]).str.replace('˂', '<')" ] }, { "cell_type": "code", "execution_count": 213, "metadata": { "collapsed": false }, "outputs": [], "source": [ "process(octo_test)\n", "octo_test = octo_test.ix[:, [2, 0, 1, 3]]" ] }, { "cell_type": "code", "execution_count": 214, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# octo_test.to_csv('chi-parks-lead-results-10-2016.processed.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 215, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# octo_test.loc[octo_test['fixture_location'].str.contains('\\*'), 'offline'] = 'Y'\n", "# octo_test.loc[octo_test['result_value'].str.contains('<'), 'threshold'] = 'Y'" ] }, { "cell_type": "code", "execution_count": 216, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# clean up two rows w/o space between result value and result flag\n", "octo_test.iloc[1002]['result_value'] = '3.09'\n", "octo_test.iloc[1002]['result_flag'] = 'J'\n", "octo_test.iloc[1002]['processed_result'] = 3.09\n", "\n", "octo_test.iloc[2020]['result_value'] = '0.243'\n", "octo_test.iloc[2020]['result_flag'] = 'J'\n", "octo_test.iloc[2020]['processed_result'] = 0.243" ] }, { "cell_type": "code", "execution_count": 217, "metadata": { "collapsed": false }, "outputs": [], "source": [ "octo_test['processed_result'] = octo_test['result_value'].apply(lambda x: 0 if '<' in x else pd.to_numeric(x))" ] }, { "cell_type": "code", "execution_count": 218, "metadata": { "collapsed": false }, "outputs": [], "source": [ "octo_test['exceeds_epa'] = octo_test['processed_result'].apply(lambda x: 1 if x > 15 else 0)" ] }, { "cell_type": "code", "execution_count": 219, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
park_namefixture_locationresult_valueresult_flagprocessed_resultexceeds_epa
1AbbottFitness Center<0.120U0.0000
2AbbottMain Lobby0.579J0.5790
3AbbottFountain 1 High<0.120U0.0000
4AbbottFountain 1 Low<0.120U0.0000
5AbbottFountain 2<0.120U0.0000
6AbbottFountain 30.245J0.2450
7AbbottFountain 44.48J4.4800
9AdaMain Lobby<2.0U0.0000
10AdaKitchen Sink<2.0U0.0000
11AdaFountain 111.711.7000
12AdaFountain 2<0.120U0.0000
13AdaFountain 39.639.6300
14AdaFountain 47.577.5700
16Adams (George & Adelle)Hallway4.34J4.3400
17Adams (George & Adelle)Kitchen Sink5.35.3000
18Adams (George & Adelle)Fountain 1<2.0U0.0000
19Adams (George & Adelle)Fountain 2 High*7575.0001
20Adams (George & Adelle)Fountain 2 Low*7575.0001
22Adams (John)Hallway Outside Washroom4.02J4.0200
24Addam/MedilFountain 17.57.5000
26Addams Memorial (Jane)Fountain 1<2.0U0.0000
28AielloFountain 1 High2.9J2.9000
29AielloFountain 1 Middle2.88J2.8800
30AielloFountain 1 Low3.3J3.3000
32AlgonquinFountain 1 High6.646.6400
33AlgonquinFountain 1 Middle6.656.6500
34AlgonquinFountain 1 Low6.676.6700
36AltgeldMain Lobby - High4.91J4.9100
37AltgeldMain Lobby - Low2.31J2.3100
38AltgeldBasement Fitness Center0.35J0.3500
.....................
2842Williams-DavisFountain 1 Low<2.0U0.0000
2844Willye WhiteLobby<0.120U0.0000
2845Willye WhiteGym Hallway West<0.120U0.0000
2846Willye WhiteFountain 1 High<2.00.0000
2847Willye WhiteFountain 1 Low<2.0U0.0000
2849Wilson (Frank)Club Room0.201J0.2010
2850Wilson (Frank)Fountain 1*51.251.2001
2851Wilson (Frank)Fountain 2 High*1.21J1.2100
2852Wilson (Frank)Fountain 2 Middle*49.649.6001
2853Wilson (Frank)Fountain 2 Low*<0.120U0.0000
2854Wilson (Frank)Fountain 3*23.923.9001
2856Wilson (John) Community CenterHallway Near Washrooms0.789J0.7890
2857Wilson (John) Community CenterFountain 1 High4.11J4.1100
2858Wilson (John) Community CenterFountain 1 Low5.265.2600
2860WinnemacFountain 1 High0.121J0.1210
2861WinnemacFountain 1 Middle0.121J0.1210
2862WinnemacFountain 1 Low<0.120U0.0000
2864WolcottFountain 1 High14.814.8000
2865WolcottFountain 1 Low14.414.4000
2867WolfeMain Lobby1.18J1.1800
2868WolfeFountain 144.0000
2870WoodhullMain Lobby0.231J0.2310
2871WoodhullFountain 1 High0.381J0.3810
2872WoodhullFountain 1 Middle0.717J0.7170
2873WoodhullFountain 1 Low0.2840.2840
2875WrightwoodFountain 1 Low*2626.0001
2876WrightwoodFountain 1 Middle*3131.0001
2877WrightwoodFountain 1 High*2626.0001
2878WrightwoodFountain 2*2424.0001
2880ZatterbergFountain 1*8686.0001
\n", "

2409 rows × 6 columns

\n", "
" ], "text/plain": [ " park_name fixture_location result_value \\\n", "1 Abbott Fitness Center <0.120 \n", "2 Abbott Main Lobby 0.579 \n", "3 Abbott Fountain 1 High <0.120 \n", "4 Abbott Fountain 1 Low <0.120 \n", "5 Abbott Fountain 2 <0.120 \n", "6 Abbott Fountain 3 0.245 \n", "7 Abbott Fountain 4 4.48 \n", "9 Ada Main Lobby <2.0 \n", "10 Ada Kitchen Sink <2.0 \n", "11 Ada Fountain 1 11.7 \n", "12 Ada Fountain 2 <0.120 \n", "13 Ada Fountain 3 9.63 \n", "14 Ada Fountain 4 7.57 \n", "16 Adams (George & Adelle) Hallway 4.34 \n", "17 Adams (George & Adelle) Kitchen Sink 5.3 \n", "18 Adams (George & Adelle) Fountain 1 <2.0 \n", "19 Adams (George & Adelle) Fountain 2 High* 75 \n", "20 Adams (George & Adelle) Fountain 2 Low* 75 \n", "22 Adams (John) Hallway Outside Washroom 4.02 \n", "24 Addam/Medil Fountain 1 7.5 \n", "26 Addams Memorial (Jane) Fountain 1 <2.0 \n", "28 Aiello Fountain 1 High 2.9 \n", "29 Aiello Fountain 1 Middle 2.88 \n", "30 Aiello Fountain 1 Low 3.3 \n", "32 Algonquin Fountain 1 High 6.64 \n", "33 Algonquin Fountain 1 Middle 6.65 \n", "34 Algonquin Fountain 1 Low 6.67 \n", "36 Altgeld Main Lobby - High 4.91 \n", "37 Altgeld Main Lobby - Low 2.31 \n", "38 Altgeld Basement Fitness Center 0.35 \n", "... ... ... ... \n", "2842 Williams-Davis Fountain 1 Low <2.0 \n", "2844 Willye White Lobby <0.120 \n", "2845 Willye White Gym Hallway West <0.120 \n", "2846 Willye White Fountain 1 High <2.0 \n", "2847 Willye White Fountain 1 Low <2.0 \n", "2849 Wilson (Frank) Club Room 0.201 \n", "2850 Wilson (Frank) Fountain 1* 51.2 \n", "2851 Wilson (Frank) Fountain 2 High* 1.21 \n", "2852 Wilson (Frank) Fountain 2 Middle* 49.6 \n", "2853 Wilson (Frank) Fountain 2 Low* <0.120 \n", "2854 Wilson (Frank) Fountain 3* 23.9 \n", "2856 Wilson (John) Community Center Hallway Near Washrooms 0.789 \n", "2857 Wilson (John) Community Center Fountain 1 High 4.11 \n", "2858 Wilson (John) Community Center Fountain 1 Low 5.26 \n", "2860 Winnemac Fountain 1 High 0.121 \n", "2861 Winnemac Fountain 1 Middle 0.121 \n", "2862 Winnemac Fountain 1 Low <0.120 \n", "2864 Wolcott Fountain 1 High 14.8 \n", "2865 Wolcott Fountain 1 Low 14.4 \n", "2867 Wolfe Main Lobby 1.18 \n", "2868 Wolfe Fountain 1 4 \n", "2870 Woodhull Main Lobby 0.231 \n", "2871 Woodhull Fountain 1 High 0.381 \n", "2872 Woodhull Fountain 1 Middle 0.717 \n", "2873 Woodhull Fountain 1 Low 0.284 \n", "2875 Wrightwood Fountain 1 Low* 26 \n", "2876 Wrightwood Fountain 1 Middle* 31 \n", "2877 Wrightwood Fountain 1 High* 26 \n", "2878 Wrightwood Fountain 2* 24 \n", "2880 Zatterberg Fountain 1* 86 \n", "\n", " result_flag processed_result exceeds_epa \n", "1 U 0.000 0 \n", "2 J 0.579 0 \n", "3 U 0.000 0 \n", "4 U 0.000 0 \n", "5 U 0.000 0 \n", "6 J 0.245 0 \n", "7 J 4.480 0 \n", "9 U 0.000 0 \n", "10 U 0.000 0 \n", "11 11.700 0 \n", "12 U 0.000 0 \n", "13 9.630 0 \n", "14 7.570 0 \n", "16 J 4.340 0 \n", "17 5.300 0 \n", "18 U 0.000 0 \n", "19 75.000 1 \n", "20 75.000 1 \n", "22 J 4.020 0 \n", "24 7.500 0 \n", "26 U 0.000 0 \n", "28 J 2.900 0 \n", "29 J 2.880 0 \n", "30 J 3.300 0 \n", "32 6.640 0 \n", "33 6.650 0 \n", "34 6.670 0 \n", "36 J 4.910 0 \n", "37 J 2.310 0 \n", "38 J 0.350 0 \n", "... ... ... ... \n", "2842 U 0.000 0 \n", "2844 U 0.000 0 \n", "2845 U 0.000 0 \n", "2846 0.000 0 \n", "2847 U 0.000 0 \n", "2849 J 0.201 0 \n", "2850 51.200 1 \n", "2851 J 1.210 0 \n", "2852 49.600 1 \n", "2853 U 0.000 0 \n", "2854 23.900 1 \n", "2856 J 0.789 0 \n", "2857 J 4.110 0 \n", "2858 5.260 0 \n", "2860 J 0.121 0 \n", "2861 J 0.121 0 \n", "2862 U 0.000 0 \n", "2864 14.800 0 \n", "2865 14.400 0 \n", "2867 J 1.180 0 \n", "2868 4.000 0 \n", "2870 J 0.231 0 \n", "2871 J 0.381 0 \n", "2872 J 0.717 0 \n", "2873 0.284 0 \n", "2875 26.000 1 \n", "2876 31.000 1 \n", "2877 26.000 1 \n", "2878 24.000 1 \n", "2880 86.000 1 \n", "\n", "[2409 rows x 6 columns]" ] }, "execution_count": 219, "metadata": {}, "output_type": "execute_result" } ], "source": [ "octo_test" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "handle single case of [follow up sample](https://github.com/datamade/chicago-lead/blob/parks-processing/assets/chi-parks-lead-results/chi-parks-lead-results-10-2016.processed.csv#L1118)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }