{ "cells": [ { "cell_type": "markdown", "id": "9d9c766f", "metadata": {}, "source": [ "

## Dataset 1: `Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv`

" ] }, { "cell_type": "code", "execution_count": null, "id": "032d5c8f", "metadata": {}, "outputs": [], "source": [
  "import pandas as pd\n",
  "from IPython.display import display\n",
  "\n",
  "# Dataset 1 CSV; the bare filename is resolved against the notebook's working directory.\n",
  "file_path_1 = 'Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv'\n",
  "data_1 = pd.read_csv(file_path_1)\n",
  "\n",
  "# End on the bare expression so the preview renders as a rich table rather than\n",
  "# the line-wrapped plain text that print() produces.\n",
  "data_1.head()"
 ] }, { "cell_type": "code", "execution_count": null, "id": "a90d5564", "metadata": {}, "outputs": [], "source": [
  "# Structural overview of the freshly loaded frame.\n",
  "data_1_shape = data_1.shape\n",
  "\n",
  "# include='all' summarises the object (string) columns as well as the numeric ones\n",
  "data_1_describe = data_1.describe(include='all')\n",
  "\n",
  "# Last few rows of the DataFrame\n",
  "data_1_tail = data_1.tail()\n",
  "\n",
  "# Data type of each column\n",
  "data_1_dtypes = data_1.dtypes\n",
  "\n",
  "# Show each object on its own: returning them as one tuple collapses everything\n",
  "# into a single plain-text repr that is hard to read.\n",
  "print(f'(rows, columns) = {data_1_shape}')\n",
  "display(data_1_describe, data_1_tail, data_1_dtypes)"
 ] },
{ "cell_type": "code", "execution_count": null, "id": "6016716c", "metadata": {}, "outputs": [], "source": [
  "# 'Data As Of', 'Start Date' and 'End Date' are loaded as MM/DD/YYYY strings\n",
  "# (dtype object). Passing the format explicitly removes any reliance on format\n",
  "# inference; with the default errors='raise', a value that does not match the\n",
  "# format raises instead of being parsed some other way.\n",
  "date_columns = ['Data As Of', 'Start Date', 'End Date']\n",
  "data_1 = data_1.assign(\n",
  "    **{column: pd.to_datetime(data_1[column], format='%m/%d/%Y') for column in date_columns}\n",
  ")\n",
  "\n",
  "# Confirm the conversion by dtype instead of printing the entire frame.\n",
  "data_1[date_columns].dtypes"
 ] }, { "cell_type": "code", "execution_count": 4, "id": "08b0192f", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
  "# Exploratory plots of the 'Age Group' column of data_1.\n",
  "\n",
  "import matplotlib.pyplot as plt\n",
  "import seaborn as sns\n",
  "\n",
  "# Number of rows per Age Group value\n",
  "age_group_counts_data_1 = data_1['Age Group'].value_counts()\n",
  "\n",
  "# Bar chart of Age Group counts in 'data_1'\n",
  "plt.figure(figsize=(10, 6))\n",
  "plt.bar(age_group_counts_data_1.index, age_group_counts_data_1.values, color='skyblue')\n",
  "plt.title('Frequency Distribution of Age Groups in Data 1')\n",
  "plt.xlabel('Age Group')\n",
  "plt.ylabel('Frequency')\n",
  "plt.xticks(rotation=45)  # Rotate x-axis labels to show clearly\n",
  "plt.show()\n",
  "\n",
  "# Boxplot of COVID-19 Deaths by Age Group in 'data_1'\n",
  "plt.figure(figsize=(10, 6))\n",
  "sns.boxplot(x='Age Group', y='COVID-19 Deaths', data=data_1)\n",
  "plt.title('COVID-19 Deaths by Age Group in Data 1')\n",
  "plt.xlabel('Age Group')\n",
  "plt.ylabel('COVID-19 Deaths')\n",
  "plt.show()\n"
 ] }, { "cell_type": "code", "execution_count": null, "id": "f1a3d2b8", "metadata": {}, "outputs": [], "source": [
  "def plot_deaths_by_age_group(frame, category, title):\n",
  "    \"\"\"Grouped bar chart of 'COVID-19 Deaths' per `category`, split by 'Age Group'.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    frame : pandas.DataFrame\n",
  "        Must contain `category`, 'COVID-19 Deaths' and 'Age Group' columns.\n",
  "    category : str\n",
  "        Column placed on the x-axis; also used as the x-axis label.\n",
  "    title : str\n",
  "        Figure title.\n",
  "\n",
  "    Bar heights come from seaborn's default barplot estimator (the mean of the\n",
  "    matching rows).\n",
  "    NOTE(review): data_1 mixes several 'Group' aggregation levels (e.g. 'By Total'\n",
  "    and 'By Month') and contains an 'All Ages' age group, so these means average\n",
  "    over overlapping aggregates -- confirm that this is the intended statistic.\n",
  "    \"\"\"\n",
  "    fig, ax = plt.subplots(figsize=(12, 6))\n",
  "    # hue has to name an existing column. The frame has 'Age Group'; asking for the\n",
  "    # non-existent 'Age Group Numeric' is what raised\n",
  "    # ValueError: Could not interpret input 'Age Group Numeric'.\n",
  "    sns.barplot(x=category, y='COVID-19 Deaths', hue='Age Group', data=frame, ax=ax)\n",
  "    ax.set_title(title)\n",
  "    ax.set_xlabel(category)\n",
  "    ax.set_ylabel('COVID-19 Deaths')\n",
  "    ax.legend(title='Age Group')\n",
  "    # Rotate through tick_params rather than re-setting the tick label objects.\n",
  "    ax.tick_params(axis='x', rotation=90)\n",
  "    fig.tight_layout()  # keep the rotated labels inside the figure\n",
  "    plt.show()\n",
  "\n",
  "\n",
  "plot_deaths_by_age_group(\n",
  "    data_1, 'Condition Group', 'COVID-19 Deaths by Condition Group and Age Group'\n",
  ")\n",
  "plot_deaths_by_age_group(data_1, 'State', 'COVID-19 Deaths by State and Age Group')\n"
 ] }, { "cell_type": "code", "execution_count": null, "id": "67710766", "metadata": {}, "outputs": [], "source": [
  "# Drop the 'United States' rows so that only the remaining 'State' values are plotted.\n",
  "data_1_no_us = data_1[data_1['State'] != 'United States']\n",
  "\n",
  "# The plot is only meaningful if other states remain after filtering; check it\n",
  "# instead of assuming it.\n",
  "assert not data_1_no_us.empty, 'no rows left after excluding United States'\n",
  "\n",
  "plot_deaths_by_age_group(\n",
  "    data_1_no_us,\n",
  "    'State',\n",
  "    'COVID-19 Deaths by State and Age Group (excluding United States)',\n",
  ")\n"
 ] }, { "cell_type": "code", "execution_count": null, "id": "0c3a1783", "metadata": {}, "outputs": [], "source": [
  "# Use the %pip magic explicitly so the package is installed into the environment\n",
  "# of the running kernel; -q keeps the install log out of the notebook.\n",
  "%pip install -q ydata-profiling"
 ] }, { "cell_type": "code", "execution_count": null, "id": "b3d0f506", "metadata": {}, "outputs": [], "source": [
  "from ydata_profiling import ProfileReport\n",
  "\n",
  "# Profiling report for data_1; left as the bare last expression so it becomes the cell output.\n",
  "ProfileReport(data_1)"
 ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 5 }