{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Read, clean, and validate\n", "> A Summary of lecture \"Exploratory Data Analysis in Python\", via datacamp\n", "\n", "- toc: true \n", "- badges: true\n", "- comments: true\n", "- author: Chanseok Kang\n", "- categories: [Python, Datacamp]\n", "- image: images/conception.png" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DataFrames and Series\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Exploring the NSFG data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "nsfg = pd.read_hdf('./dataset/nsfg.hdf5', 'nsfg')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(9358, 10)\n", "Index(['caseid', 'outcome', 'birthwgt_lb1', 'birthwgt_oz1', 'prglngth',\n", " 'nbrnaliv', 'agecon', 'agepreg', 'hpagelb', 'wgt2013_2015'],\n", " dtype='object')\n", "0 4.0\n", "1 12.0\n", "2 4.0\n", "3 NaN\n", "4 13.0\n", "Name: birthwgt_oz1, dtype: float64\n" ] } ], "source": [ "# Display the number of rows and columns\n", "print(nsfg.shape)\n", "\n", "# Display the names of the columns\n", "print(nsfg.columns)\n", "\n", "# Select columns birthwgt_oz1: ounces\n", "ounces = nsfg['birthwgt_oz1']\n", "\n", "# Print the first 5 elements of ounces\n", "print(ounces.head(5))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Clean and Validate" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Clean a variable" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.0 6379\n", "2.0 100\n", "3.0 5\n", "8.0 1\n", "Name: nbrnaliv, dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "nsfg['nbrnaliv'].value_counts()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.0 6379\n", "2.0 100\n", "3.0 5\n", "Name: nbrnaliv, dtype: int64\n" ] } ], "source": [ "# Replace the value 8 with NaN.\n", "# The result is assigned back to the column instead of calling\n", "# replace(..., inplace=True) on nsfg['nbrnaliv']: an in-place method on a column\n", "# selection is chained assignment, which pandas warns about and which leaves\n", "# nsfg unchanged when Copy-on-Write is enabled.\n", "nsfg['nbrnaliv'] = nsfg['nbrnaliv'].replace([8], np.nan)\n", "\n", "# Print the values and their frequencies\n", "print(nsfg['nbrnaliv'].value_counts())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Compute a variable" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 9358.000000\n", "mean 2446.330199\n", "std 579.392363\n", "min 750.000000\n", "25% 1983.000000\n", "50% 2366.000000\n", "75% 2850.000000\n", "max 4350.000000\n", "Name: agecon, dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nsfg['agecon'].describe()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 9109.000000\n", "mean 2494.934570\n", "std 578.099231\n", "min 825.000000\n", "25% 2041.000000\n", "50% 2416.000000\n", "75% 2900.000000\n", "max 4350.000000\n", "Name: agepreg, dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nsfg['agepreg'].describe()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "count 9109.000000\n", "mean 0.552069\n", "std 0.271479\n", "min 0.000000\n", "25% 0.250000\n", "50% 0.670000\n", "75% 0.750000\n", "max 0.920000\n", "dtype: float64\n" ] } ], "source": [ "# Select the columns and divide by 100\n", "agecon = nsfg['agecon'] / 100\n", "agepreg = nsfg['agepreg'] / 100\n", "\n", "# Compute the difference\n", "preg_length = agepreg - agecon\n", "\n", "# Compute summary statistics\n", 
"print(preg_length.describe())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Filter and visualize" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Make a histogram" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Plot the histogram\n", "plt.hist(agecon, bins=20)\n", "\n", "# Label the axes\n", "plt.xlabel(\"Age at conception\")\n", "plt.ylabel('Number of pregnancies')\n", "plt.savefig('../images/conception.png')" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0, 0.5, 'Number of pregnancies')" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Plot the histogram\n", "plt.hist(agecon, bins=20, histtype='step')\n", "\n", "# Label the axes\n", "plt.xlabel(\"Age at conception\")\n", "plt.ylabel('Number of pregnancies')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Compute birth weight" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def resample_rows_weighted(df, column='wgt2013_2015', seed=None):\n", "    \"\"\"Resample the rows of a DataFrame with replacement, weighted by a column.\n", "\n", "    Each row is drawn with probability proportional to its value in `column`;\n", "    the result has as many rows as `df`.\n", "\n", "    Args:\n", "        df: DataFrame to resample.\n", "        column: name of the column to use as sampling weights.\n", "        seed: optional seed for a private RandomState, for reproducible draws.\n", "            With the default (None) the global NumPy random state is used.\n", "\n", "    Returns:\n", "        DataFrame with len(df) rows drawn from df. Rows keep their original\n", "        index labels, so labels can repeat.\n", "    \"\"\"\n", "    weights = df[column] / df[column].sum()\n", "    rng = np.random if seed is None else np.random.RandomState(seed)\n", "    # Draw row positions, not index labels: df.loc with a repeated label returns\n", "    # every matching row, so a frame whose index already has duplicates (such as\n", "    # the output of an earlier call) would come back with more than len(df) rows.\n", "    positions = rng.choice(len(df), size=len(df), replace=True, p=weights.to_numpy())\n", "    return df.iloc[positions]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Resample the data\n", "# NOTE: this rebinds nsfg to the resampled frame, so re-running this cell draws\n", "# from the previous sample rather than from the data loaded at the top.\n", "nsfg = resample_rows_weighted(nsfg, 'wgt2013_2015')\n", "\n", "# Clean the weight variables\n", "pounds = nsfg['birthwgt_lb1'].replace([98, 99], np.nan)\n", "ounces = nsfg['birthwgt_oz1'].replace([98, 99], np.nan)\n", "\n", "# Compute total birth weight\n", "birth_weight = pounds + ounces/16" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "7.409452970741339\n" ] } ], "source": [ "# Create a Boolean Series for full-term babies\n", "full_term = nsfg['prglngth'] >= 37\n", "\n", "# Select the weights of full-term babies\n", "full_term_weight = birth_weight[full_term]\n", "\n", "# Compute the mean weight of full-term babies\n", "print(full_term_weight.mean())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Filter" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"Single full-term mean: 7.421643309222423\n", "Multiple full-term mean: 5.765243902439025\n" ] } ], "source": [ "# Filter full-term babies\n", "full_term = nsfg['prglngth'] >= 37\n", "\n", "# Filter single birth\n", "single = nsfg['nbrnaliv'] == 1\n", "\n", "# Filter multiple births explicitly. ~single would also be True where nbrnaliv\n", "# is NaN (NaN == 1 is False), counting rows with an unknown number of babies\n", "# as multiple births.\n", "multiple = nsfg['nbrnaliv'] > 1\n", "\n", "# Compute birth weight for single full-term babies\n", "single_full_term_weight = birth_weight[single & full_term]\n", "print('Single full-term mean:', single_full_term_weight.mean())\n", "\n", "# Compute birth weight for multiple full-term babies\n", "mult_full_term_weight = birth_weight[multiple & full_term]\n", "print('Multiple full-term mean:', mult_full_term_weight.mean())" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }