{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Cleaning in Python Pandas\n", "\n", "
\n", "\n", "#### Tips & Tricks to overcome the mundane grind of data cleaning.\n", "\n", "Made by HP-Nunes [@geocuriosity](https://twitter.com/geocuriosity)\n", "
\n", "Updated March 10th, 2020" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import Depedencies" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import Test Data (using pd.read_csv)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "url = \"https://raw.githubusercontent.com/HP-Nunes/cheatSheets/master/input/hepData.csv\" # [1]\n", "df = pd.read_csv(url, error_bad_lines=False) # Caution: 'Offending lines' will be skipped and not imported" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pandas Essentials: Examining your dataframe" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LocationFIPSMortality Rate, 1980*Mortality Rate, 1985*Mortality Rate, 1990*Mortality Rate, 1995*Mortality Rate, 2000*Mortality Rate, 2005*Mortality Rate, 2010*Mortality Rate, 2014*% Change in Mortality Rate, 1980-2014
0United StatesNaN0.49 (0.46, 0.53)0.53 (0.50, 0.56)0.78 (0.74, 0.82)1.42 (1.35, 1.49)1.84 (1.75, 1.94)0.86 (0.82, 0.91)0.30 (0.29, 0.32)0.29 (0.27, 0.32)-40.74 (-48.49, -32.30)
1Alabama1.00.50 (0.45, 0.55)0.53 (0.49, 0.58)0.81 (0.76, 0.87)1.45 (1.36, 1.55)1.88 (1.75, 2.00)0.81 (0.75, 0.87)0.25 (0.23, 0.28)0.24 (0.21, 0.27)-51.75 (-60.02, -43.16)
2Autauga County, Alabama1001.00.37 (0.29, 0.47)0.42 (0.33, 0.52)0.68 (0.54, 0.83)1.18 (0.94, 1.44)1.46 (1.16, 1.79)0.68 (0.52, 0.86)0.20 (0.14, 0.27)0.20 (0.13, 0.28)-46.72 (-63.99, -24.18)
3Baldwin County, Alabama1003.00.43 (0.35, 0.52)0.46 (0.39, 0.54)0.69 (0.59, 0.81)1.19 (1.03, 1.40)1.50 (1.28, 1.76)0.66 (0.55, 0.78)0.22 (0.17, 0.27)0.21 (0.16, 0.28)-51.28 (-64.03, -34.52)
4Barbour County, Alabama1005.00.43 (0.34, 0.53)0.45 (0.36, 0.54)0.66 (0.53, 0.81)1.22 (0.99, 1.47)1.69 (1.37, 2.06)0.70 (0.55, 0.89)0.21 (0.16, 0.29)0.20 (0.14, 0.28)-52.71 (-66.28, -34.42)
\n", "
" ], "text/plain": [ " Location FIPS Mortality Rate, 1980* \\\n", "0 United States NaN 0.49 (0.46, 0.53) \n", "1 Alabama 1.0 0.50 (0.45, 0.55) \n", "2 Autauga County, Alabama 1001.0 0.37 (0.29, 0.47) \n", "3 Baldwin County, Alabama 1003.0 0.43 (0.35, 0.52) \n", "4 Barbour County, Alabama 1005.0 0.43 (0.34, 0.53) \n", "\n", " Mortality Rate, 1985* Mortality Rate, 1990* Mortality Rate, 1995* \\\n", "0 0.53 (0.50, 0.56) 0.78 (0.74, 0.82) 1.42 (1.35, 1.49) \n", "1 0.53 (0.49, 0.58) 0.81 (0.76, 0.87) 1.45 (1.36, 1.55) \n", "2 0.42 (0.33, 0.52) 0.68 (0.54, 0.83) 1.18 (0.94, 1.44) \n", "3 0.46 (0.39, 0.54) 0.69 (0.59, 0.81) 1.19 (1.03, 1.40) \n", "4 0.45 (0.36, 0.54) 0.66 (0.53, 0.81) 1.22 (0.99, 1.47) \n", "\n", " Mortality Rate, 2000* Mortality Rate, 2005* Mortality Rate, 2010* \\\n", "0 1.84 (1.75, 1.94) 0.86 (0.82, 0.91) 0.30 (0.29, 0.32) \n", "1 1.88 (1.75, 2.00) 0.81 (0.75, 0.87) 0.25 (0.23, 0.28) \n", "2 1.46 (1.16, 1.79) 0.68 (0.52, 0.86) 0.20 (0.14, 0.27) \n", "3 1.50 (1.28, 1.76) 0.66 (0.55, 0.78) 0.22 (0.17, 0.27) \n", "4 1.69 (1.37, 2.06) 0.70 (0.55, 0.89) 0.21 (0.16, 0.29) \n", "\n", " Mortality Rate, 2014* % Change in Mortality Rate, 1980-2014 \n", "0 0.29 (0.27, 0.32) -40.74 (-48.49, -32.30) \n", "1 0.24 (0.21, 0.27) -51.75 (-60.02, -43.16) \n", "2 0.20 (0.13, 0.28) -46.72 (-63.99, -24.18) \n", "3 0.21 (0.16, 0.28) -51.28 (-64.03, -34.52) \n", "4 0.20 (0.14, 0.28) -52.71 (-66.28, -34.42) " ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head() # Returns the first 5 records of the dataframe" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LocationFIPSMortality Rate, 1980*Mortality Rate, 1985*Mortality Rate, 1990*Mortality Rate, 1995*Mortality Rate, 2000*Mortality Rate, 2005*Mortality Rate, 2010*Mortality Rate, 2014*% Change in Mortality Rate, 1980-2014
3189Sweetwater County, Wyoming56037.00.21 (0.17, 0.26)0.24 (0.20, 0.29)0.44 (0.36, 0.53)0.96 (0.80, 1.13)1.20 (0.98, 1.45)0.54 (0.42, 0.68)0.16 (0.12, 0.21)0.15 (0.10, 0.21)-27.23 (-49.28, 2.16)
3190Teton County, Wyoming56039.00.19 (0.15, 0.23)0.21 (0.17, 0.26)0.37 (0.31, 0.44)0.78 (0.66, 0.92)0.92 (0.76, 1.10)0.40 (0.32, 0.49)0.12 (0.09, 0.15)0.11 (0.08, 0.15)-41.52 (-58.04, -19.65)
3191Uinta County, Wyoming56041.00.21 (0.17, 0.27)0.25 (0.20, 0.31)0.45 (0.35, 0.56)0.96 (0.78, 1.16)1.20 (0.94, 1.49)0.53 (0.39, 0.68)0.15 (0.10, 0.21)0.15 (0.10, 0.22)-31.09 (-54.60, 2.87)
3192Washakie County, Wyoming56043.00.21 (0.17, 0.26)0.25 (0.19, 0.31)0.46 (0.37, 0.55)1.03 (0.84, 1.24)1.23 (0.97, 1.56)0.52 (0.38, 0.68)0.14 (0.10, 0.20)0.14 (0.09, 0.20)-34.69 (-56.71, -3.32)
3193Weston County, Wyoming56045.00.18 (0.14, 0.23)0.21 (0.17, 0.26)0.40 (0.32, 0.49)0.90 (0.74, 1.08)1.12 (0.89, 1.40)0.47 (0.36, 0.60)0.12 (0.08, 0.17)0.11 (0.07, 0.16)-39.63 (-57.96, -13.14)
\n", "
" ], "text/plain": [ " Location FIPS Mortality Rate, 1980* \\\n", "3189 Sweetwater County, Wyoming 56037.0 0.21 (0.17, 0.26) \n", "3190 Teton County, Wyoming 56039.0 0.19 (0.15, 0.23) \n", "3191 Uinta County, Wyoming 56041.0 0.21 (0.17, 0.27) \n", "3192 Washakie County, Wyoming 56043.0 0.21 (0.17, 0.26) \n", "3193 Weston County, Wyoming 56045.0 0.18 (0.14, 0.23) \n", "\n", " Mortality Rate, 1985* Mortality Rate, 1990* Mortality Rate, 1995* \\\n", "3189 0.24 (0.20, 0.29) 0.44 (0.36, 0.53) 0.96 (0.80, 1.13) \n", "3190 0.21 (0.17, 0.26) 0.37 (0.31, 0.44) 0.78 (0.66, 0.92) \n", "3191 0.25 (0.20, 0.31) 0.45 (0.35, 0.56) 0.96 (0.78, 1.16) \n", "3192 0.25 (0.19, 0.31) 0.46 (0.37, 0.55) 1.03 (0.84, 1.24) \n", "3193 0.21 (0.17, 0.26) 0.40 (0.32, 0.49) 0.90 (0.74, 1.08) \n", "\n", " Mortality Rate, 2000* Mortality Rate, 2005* Mortality Rate, 2010* \\\n", "3189 1.20 (0.98, 1.45) 0.54 (0.42, 0.68) 0.16 (0.12, 0.21) \n", "3190 0.92 (0.76, 1.10) 0.40 (0.32, 0.49) 0.12 (0.09, 0.15) \n", "3191 1.20 (0.94, 1.49) 0.53 (0.39, 0.68) 0.15 (0.10, 0.21) \n", "3192 1.23 (0.97, 1.56) 0.52 (0.38, 0.68) 0.14 (0.10, 0.20) \n", "3193 1.12 (0.89, 1.40) 0.47 (0.36, 0.60) 0.12 (0.08, 0.17) \n", "\n", " Mortality Rate, 2014* % Change in Mortality Rate, 1980-2014 \n", "3189 0.15 (0.10, 0.21) -27.23 (-49.28, 2.16) \n", "3190 0.11 (0.08, 0.15) -41.52 (-58.04, -19.65) \n", "3191 0.15 (0.10, 0.22) -31.09 (-54.60, 2.87) \n", "3192 0.14 (0.09, 0.20) -34.69 (-56.71, -3.32) \n", "3193 0.11 (0.07, 0.16) -39.63 (-57.96, -13.14) " ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.tail() # Returns the last 5 records of the dataframe" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "______________________________________________________________________________________\n", "Location object\n", "FIPS float64\n", "Mortality Rate, 1980* object\n", "Mortality Rate, 1985* object\n", "Mortality Rate, 1990* object\n", "Mortality Rate, 1995* object\n", "Mortality Rate, 2000* object\n", "Mortality Rate, 2005* object\n", "Mortality Rate, 2010* object\n", "Mortality Rate, 2014* object\n", "% Change in Mortality Rate, 1980-2014 object\n", "dtype: object\n", "______________________________________________________________________________________\n", "\n", "RangeIndex: 3194 entries, 0 to 3193\n", "Data columns (total 11 columns):\n", "Location 3194 non-null object\n", "FIPS 3193 non-null float64\n", "Mortality Rate, 1980* 3194 non-null object\n", "Mortality Rate, 1985* 3194 non-null object\n", "Mortality Rate, 1990* 3194 non-null object\n", "Mortality Rate, 1995* 3194 non-null object\n", "Mortality Rate, 2000* 3194 non-null object\n", "Mortality Rate, 2005* 3194 non-null object\n", "Mortality Rate, 2010* 3194 non-null object\n", "Mortality Rate, 2014* 3194 non-null object\n", "% Change in Mortality Rate, 1980-2014 3194 non-null object\n", "dtypes: float64(1), object(10)\n", "memory usage: 274.6+ KB\n", "None\n", " \n", "CPU times: user 8.15 ms, sys: 0 ns, total: 8.15 ms\n", "Wall time: 6.28 ms\n" ] } ], "source": [ "%%time \n", " # To observe the processing time of a query\n", "\n", "print('______________________________________________________________________________________')\n", "print(df.dtypes) # Observe the attribute type of each field\n", "print('______________________________________________________________________________________')\n", "print(df.info()) # Returns display information such as the number of rows and columns, the total memory usage,\n", " # the data type of each column, and the number of non-NaN elements.\n", "print(' ')" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The dataframe has 3194 rows\n", "The dataframe has 11 columns\n", "The dataframe has (3194, 11) rows and columns\n", "The dataframe has 35134 total records\n" ] } ], "source": [ "print('The dataframe has ' + str(len(df)) + ' rows') # Prints the number of rows/records.\n", "print('The dataframe has ' + str(len(df.columns)) + ' columns') # Returns the number of columns/fields\n", "print('The dataframe has ' + str(df.shape) + ' rows and columns') # My personal favorite\n", "print('The dataframe has ' + str(df.size) + ' total records') # I.e. rows * columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Example 1: Splitting the numerical values of a field into new fields\n", "
\n", "For the field '% Change in Mortality Rate, 1980-2014' I'd like to have both Confidence Interval, and the change in mortality rate, in separate fields: 'CI Lower Boundary','CI Upper Boundary', and 'Mortality Rate' respectively." ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "These are the values we want to split into three separate new fields:\n" ] }, { "data": { "text/plain": [ "0 -40.74 (-48.49, -32.30)\n", "1 -51.75 (-60.02, -43.16)\n", "2 -46.72 (-63.99, -24.18)\n", "3 -51.28 (-64.03, -34.52)\n", "4 -52.71 (-66.28, -34.42)\n", "Name: % Change in Mortality Rate, 1980-2014, dtype: object" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print('These are the values we want to split into three separate new fields:')\n", "df['% Change in Mortality Rate, 1980-2014'].head()" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "df[['CI Lower Boundary','CI Upper Boundary']] = df['% Change in Mortality Rate, 1980-2014'].str.split(\",\",expand=True,) \n", " # This splits the values in seperate fields by the comma.\n", " \n", "df['CI Upper Boundary'] = df['CI Upper Boundary'].str.strip(')')\n", " # Removing the parenthesis.\n", "df['CI Lower Boundary'] = df['CI Lower Boundary'].str.split('(').str[1]\n", " # Removing the parenthesis and everything left of it (indicated by the index of [1])\n", "df['Mortality Rate'] = df['% Change in Mortality Rate, 1980-2014'].str.split('(').str[0]\n", " # Removing the parenthesis and everything right of it (indicated by the index of [0])" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Outcome successful:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
% Change in Mortality Rate, 1980-2014CI Lower BoundaryCI Upper BoundaryMortality Rate
0-40.74 (-48.49, -32.30)-48.49-32.30-40.74
1-51.75 (-60.02, -43.16)-60.02-43.16-51.75
2-46.72 (-63.99, -24.18)-63.99-24.18-46.72
3-51.28 (-64.03, -34.52)-64.03-34.52-51.28
4-52.71 (-66.28, -34.42)-66.28-34.42-52.71
\n", "
" ], "text/plain": [ " % Change in Mortality Rate, 1980-2014 CI Lower Boundary CI Upper Boundary \\\n", "0 -40.74 (-48.49, -32.30) -48.49 -32.30 \n", "1 -51.75 (-60.02, -43.16) -60.02 -43.16 \n", "2 -46.72 (-63.99, -24.18) -63.99 -24.18 \n", "3 -51.28 (-64.03, -34.52) -64.03 -34.52 \n", "4 -52.71 (-66.28, -34.42) -66.28 -34.42 \n", "\n", " Mortality Rate \n", "0 -40.74 \n", "1 -51.75 \n", "2 -46.72 \n", "3 -51.28 \n", "4 -52.71 " ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print('Outcome successful:')\n", "df[['% Change in Mortality Rate, 1980-2014','CI Lower Boundary','CI Upper Boundary','Mortality Rate']].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Example 2: Converting field's types and arithmatic between fields\n", "
\n", "I'm interested to find the Margin of Error of the Confidence Intervals, and have it written in a new field named 'Margin of Error'." ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Observe the field type; we can not do math if the values are recognized as strings (i.e. text):\n" ] }, { "data": { "text/plain": [ "CI Lower Boundary object\n", "CI Upper Boundary object\n", "dtype: object" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print('Observe the field type; we can not do math if the values are recognized as strings (i.e. text):')\n", "df[['CI Lower Boundary','CI Upper Boundary']].dtypes" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Both fields are being identified as strings, so we need to convert to floats in order to find the margin of error\n" ] }, { "data": { "text/plain": [ "0 8.095\n", "1 8.430\n", "2 19.905\n", "3 14.755\n", "4 15.930\n", "Name: Margin of Error, dtype: float64" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print('Both fields are being identified as strings, so we need to convert to floats in order to find the margin of error')\n", "\n", "df['CI Upper Boundary'] = df['CI Upper Boundary'].astype(float)\n", "df['CI Lower Boundary'] = df['CI Lower Boundary'].astype(float)\n", "df['Margin of Error'] = (df['CI Upper Boundary'] - df['CI Lower Boundary']) / 2\n", "\n", "df['Margin of Error'].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Example 3: Combining days and hours into a new column, and converting to DateTime.\n", "
\n", "I want to combine two columns, dates and hours, into a new field. Then I want to convert the field into DateTime so that I can sort chronologically my field and plot timeseries" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df1 = pd.read_csv(\"input/pool.csv\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Aquatic Programs Count float64\n", "day object\n", "Day of Week object\n", "Family Swim Count float64\n", "Lap Swim Count float64\n", "Outage Count float64\n", "tally_Time object\n", "Water Excercise Count float64\n", "dtype: object\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Aquatic Programs CountdayDay of WeekFamily Swim CountLap Swim CountOutage Counttally_TimeWater Excercise Count
0NaN03/03/2020TuesdayNaNNaNNaN05:45:00NaN
1NaN03/03/2020TuesdayNaNNaNNaN06:15:00NaN
2NaN03/03/2020TuesdayNaNNaNNaN06:45:00NaN
3NaN03/03/2020TuesdayNaNNaNNaN07:15:00NaN
4NaN03/03/2020TuesdayNaNNaNNaN07:45:00NaN
\n", "
" ], "text/plain": [ " Aquatic Programs Count day Day of Week Family Swim Count \\\n", "0 NaN 03/03/2020 Tuesday NaN \n", "1 NaN 03/03/2020 Tuesday NaN \n", "2 NaN 03/03/2020 Tuesday NaN \n", "3 NaN 03/03/2020 Tuesday NaN \n", "4 NaN 03/03/2020 Tuesday NaN \n", "\n", " Lap Swim Count Outage Count tally_Time Water Excercise Count \n", "0 NaN NaN 05:45:00 NaN \n", "1 NaN NaN 06:15:00 NaN \n", "2 NaN NaN 06:45:00 NaN \n", "3 NaN NaN 07:15:00 NaN \n", "4 NaN NaN 07:45:00 NaN " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(df1.dtypes)\n", "df1.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have identified our fields of interest: \"day\" and \"tally_Time\"" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Aquatic Programs Count float64\n", "Day of Week object\n", "Family Swim Count float64\n", "Lap Swim Count float64\n", "Outage Count float64\n", "Water Excercise Count float64\n", "dateTime datetime64[ns]\n", "dtype: object\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Aquatic Programs CountDay of WeekFamily Swim CountLap Swim CountOutage CountWater Excercise CountdateTime
64NaNTuesdayNaN1.0NaNNaN2019-01-08 05:45:00
65NaNTuesdayNaN5.0NaNNaN2019-01-08 06:15:00
66NaNTuesdayNaN7.0NaNNaN2019-01-08 06:45:00
67NaNTuesdayNaN9.0NaNNaN2019-01-08 07:15:00
68NaNTuesdayNaN7.0NaNNaN2019-01-08 07:45:00
\n", "
" ], "text/plain": [ " Aquatic Programs Count Day of Week Family Swim Count Lap Swim Count \\\n", "64 NaN Tuesday NaN 1.0 \n", "65 NaN Tuesday NaN 5.0 \n", "66 NaN Tuesday NaN 7.0 \n", "67 NaN Tuesday NaN 9.0 \n", "68 NaN Tuesday NaN 7.0 \n", "\n", " Outage Count Water Excercise Count dateTime \n", "64 NaN NaN 2019-01-08 05:45:00 \n", "65 NaN NaN 2019-01-08 06:15:00 \n", "66 NaN NaN 2019-01-08 06:45:00 \n", "67 NaN NaN 2019-01-08 07:15:00 \n", "68 NaN NaN 2019-01-08 07:45:00 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1['dateTime'] = pd.to_datetime(df1['day'] + ' ' + df1['tally_Time']) # Combine the fields and convert to dateTime\n", "df1 = df1.sort_values(by='dateTime') #Sort chronologically by the date and the hour\n", "df1 = df1.drop(['day','tally_Time'],axis=1) #Remove unwanted/redundant columns\n", "print(df1.dtypes)\n", "\n", "df1.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Export the edited dataset to CSV" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "df.to_csv('output/hepData_edit.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### About the Data\n", "\n", "[1] United States Infectious Disease Mortality Rates by County 1980-2014 | GHDx. [online] Available at: http://ghdx.healthdata.org/record/ihme-data/united-states-infectious-disease-mortality-rates-county-1980-2014 \n", "[Accessed 5 Mar. 2020]." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 }