{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import important libraries\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# this line shows the plots\n",
    "%matplotlib inline  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Let's load in our data files!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# us confirmed - https://raw.githubusercontent.com/lidderupk/covid-19-cali/master/data/johns_hopkins_csse/2019-novel-coronavirus-covid-19-2019-ncov-data-repository-confirmed-cases-in-the-us.csv\n",
    "df_us_confirmed = pd.read_csv('https://raw.githubusercontent.com/lidderupk/covid-19-cali/master/data/johns_hopkins_csse/2019-novel-coronavirus-covid-19-2019-ncov-data-repository-confirmed-cases-in-the-us.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#woh! that was smooth! Let's explore the dataset \n",
    "#how big is this dataset? shape returns a tuple. The first number is the number of rows and the second is the number of columns.\n",
    "print(df_us_confirmed.shape)\n",
    "\n",
    "print(f'\\n\\n{df_us_confirmed.shape[0]:,} rows!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#what are the different columns in this dataframe\n",
    "df_us_confirmed.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#cool! What are some of the datatypes? Read object is mostly string\n",
    "df_us_confirmed.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#are there any null values? this takes a little time to go over 3.7M rows! Also only works for numerical data\n",
    "df_us_confirmed.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# hey! what about non numeric numbers? Can you figure this out?\n",
    "pd.DataFrame.describe?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# answer\n",
    "df_us_confirmed.describe(include='all')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Query - does slicing work here? well YES!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#does slicing work here? well YES!\n",
    "print('first 10 rows:')\n",
    "df_us_confirmed[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('last 10 rows:')\n",
    "df_us_confirmed[-10:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#cool deal, but there are other methods that are more popular to take a quick peek at the data\n",
    "df_us_confirmed.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# can anybody guess how to get the last 5?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# what about randomly selected sample data and why would you want to do that?\n",
    "df_us_confirmed.sample()\n",
    "df_us_confirmed.iso3.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# what if want a random sample of 10, but be able to reproduce? use seed! Look up the syntax\n",
    "df_us_confirmed.sample(10, random_state=19)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### This all is great, but I want to get specific rows and columns. There are two very important methods to look up something in a dataframe"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## loc and iloc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#loc is used to look up by label\n",
    "#iloc is used to look up by index or location"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#let's use iloc first. Get the first row with iloc. The general syntax for iloc is dataframe.iloc[rows,columns]\n",
    "df_us_confirmed.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#notice that returned a series. Another way to check is\n",
    "type(df_us_confirmed.iloc[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#now let's get the 5th and 6th rows. Remember numerical index is 0 based and also the second limit in the iloc index is excluded\n",
    "df_us_confirmed.iloc[5:7]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Exercise 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# great, can you get the 100th and 101th row using the iloc method?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# woh what kind of math is this!\n",
    "print('-'*50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# that's wonderful. How do we get only the columns we need using iloc? Let's say we only want title column for the 5th and 6th row\n",
    "df_us_confirmed.iloc[5:7,0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#notice we again got a series. To get a dataframe, we can speficy a range as the second parameter for the columns.\n",
    "df_us_confirmed.iloc[5:7, 0:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# What if we want admin2, province_state?\n",
    "df_us_confirmed.iloc[5:7,5:7]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#What if we want admin2, province_state and confirmed? Look at the docs again\n",
    "df_us_confirmed.iloc[5:7,[5,6,7,12]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Exercise 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#can you get admin2, province_state and confirmed cases for the 200th and 300th rows?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('-'*50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#let's try loc now. The general syntax remains the same dataframe.loc[rows,columns]. We now use labels instead of location or numerical index\n",
    "#let's get the first row again\n",
    "df_us_confirmed.loc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#woh! why did that work? It just so happens that in this case, the label or the index is the same as the numerical location\n",
    "df_us_confirmed.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#try it for the columns? It will give you an error. \n",
    "# df_us_confirmed.loc[0,0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#now, let's get the state for the first two rows. notice in this case, the second argument is inclusive! Ya! Don't ask me why!\n",
    "df_us_confirmed.loc[0:1,'province_state']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Exercise 3\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# grab the 200th and 201st rows with admin2, province_state, country_region, and confirmed values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#also to note is that you can use dataframe[''] to select columns, but it is recommeneded to use the more explicit loc or iloc function\n",
    "df_us_confirmed['province_state'].sample(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#that gave you a series. To get a dataframe, put another []\n",
    "df_us_confirmed[['province_state']].sample(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#you can get multiple columns as follows\n",
    "df_us_confirmed[['province_state', 'admin2']].sample(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sorting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#use the sort_index method to sort the index\n",
    "print('current index:', df_us_confirmed.index)\n",
    "\n",
    "#let's change the index to year\n",
    "df_us_confirmed_new = df_us_confirmed.set_index(df_us_confirmed.date)\n",
    "print('new index:',df_us_confirmed_new.index)\n",
    "#notice that it is not sorted\n",
    "\n",
    "#sort_index\n",
    "df_us_confirmed_new.sort_index().head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#to sort by value, you can use sort_values for dataframe\n",
    "df_us_confirmed.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_us_confirmed.sort_values('combined_key').head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# what if you want to sort by combined key followed by date?\n",
    "# use df_us_confirmed.sort_values?\n",
    "\n",
    "# df_us_confirmed.sort_values(['combined_key', 'confirmed']).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# what about sorting as before but show highest confirmed first?\n",
    "df_us_confirmed.sort_values(['combined_key', 'confirmed'], ascending=[True, False]).head(20)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Can you print out all the rows that have admin2 as null?\n",
    "df_us_confirmed[df_us_confirmed.admin2.isnull()]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Can you print out how many of each province_state have admin2 as null?\n",
    "df_us_confirmed[df_us_confirmed.admin2.isnull()].province_state.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#what is the syntax to sort a series? Look up the syntax here and complete the code below to print out all states in descending order...\n",
    "#https://pandas.pydata.org/pandas-docs/stable/reference/series.html\n",
    "states = df_us_confirmed.loc[:,'province_state']\n",
    "print(type(states))\n",
    "\n",
    "states.sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conditional Lookups!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Okay! so we know how to select based on position and index, but how do you do conditional lookups? Actually, this is fairly simple!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#let's get all California cases from April only\n",
    "df_cali = df_us_confirmed.loc[df_us_confirmed.province_state == 'California']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# what is the dtype of the date column?\n",
    "df_cali.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#first let's convert date type to date\n",
    "df_cali.loc[:,'date'] = pd.to_datetime(df_cali.date)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cali.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now let's get those cases for april only\n",
    "after_april_filter = df_cali.date > '04.01.2020'\n",
    "df_cali_april = df_cali[after_april_filter]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(df_cali.shape)\n",
    "print(df_cali_april.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# how much data do we have? What's the last date?\n",
    "df_us_confirmed.date.value_counts().sort_index()\n",
    "df_us_confirmed.date.max()\n",
    "df_us_confirmed.date.min()\n",
    "df_us_confirmed.date.sort_values()[-1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#we can still use the second argument for loc to only get selected columns\n",
    "df_cali.loc[after_april_filter ,['province_state', 'admin2', 'date', 'confirmed']].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#we can use the shorter syntax as well and just do ...\n",
    "df_cali[after_april_filter][['province_state', 'admin2', 'date', 'confirmed']].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### The loc version seems easier to decipher! Note that you cannot do conditional selecting with iloc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#select all confirmed california cases for Feb and march. Remember the first parameter to loc can be a conditional (boolean)\n",
    "march_april_filter = df_cali.date > '02.01.20' & df_cali.date < '03.31.20'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# oops, how do we fix that?\n",
    "march_april_filter = (df_cali.date > '02.01.20') & (df_cali.date < '03.31.20')\n",
    "df_cali[march_april_filter]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#create a new dataframe with only california cases and another with only Oregon cases "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## What if you have a more complicated query? Get cases for all states that have the word 'south' in them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#you can use str on columns that are of object type and then perform any string operation on that column.\n",
    "#for example to convert all titles to lowercase, we can do\n",
    "print('original states in the dataframe:',df_us_confirmed.province_state.sample(5, random_state=19).values)\n",
    "\n",
    "#use str.lower to lower case the series. Just showing the top 5.\n",
    "df_us_confirmed.province_state.str.lower().sample(5, random_state=19).values\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### See what else you can do with str here: https://pandas.pydata.org/pandas-docs/stable/reference/series.html#string-handling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#you can combine multiple str functions, but have to convert each sub step to str\n",
    "#here, we first convert everything to lower case and then swapcase, that converts it back to upper case.\n",
    "df_us_confirmed.province_state.str.lower().str.swapcase().sample(5, random_state=19)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# your turn with the original question.\n",
    "# Get cases for all states that have the word 'south' in them\n",
    "df_us_confirmed[df_us_confirmed.province_state.str.lower().str.contains('south')].province_state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Another very important method used all the time is value_counts()\n",
    "df_confirmed_south = df_us_confirmed[df_us_confirmed.province_state.str.lower().str.contains('south')]\n",
    "df_confirmed_south.province_state.value_counts()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Can you get all the states with north in them?\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Charts and Plots!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cali.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot(df_cali.date, df_cali.confirmed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# San Francisco only\n",
    "df_cali_sf = df_cali.loc[df_cali.admin2.str.lower().str.contains('francisco')] \n",
    "plt.plot(df_cali_sf.index, df_cali_sf.confirmed)\n",
    "\n",
    "#why did that work? we did not use date!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot(df_cali_sf.date, df_cali_sf.confirmed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.xticks( df_cali_sf['confirmed'], df_cali_sf.date.values )\n",
    "plt.plot( df_cali_sf['confirmed'] )\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# can we compare different counties? How about Alamed and San Francisco?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cali_alameda = df_cali.loc[df_cali.admin2.str.lower().str.contains('alameda')] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cali_alameda.admin2.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cali_sf.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.dates as mdates\n",
    "import matplotlib\n",
    "from matplotlib.ticker import NullFormatter\n",
    "from matplotlib.dates import MonthLocator, DateFormatter\n",
    "\n",
    "ax = plt.gca()\n",
    "ax.tick_params(pad=20)\n",
    "\n",
    "plt.plot(df_cali_alameda.date, df_cali_alameda.confirmed, color='r', label='Alameda', linewidth=0.5)\n",
    "plt.plot(df_cali_sf.date, df_cali_sf.confirmed, color='b', label='San Francisco', linewidth=1)\n",
    "\n",
    "ax.set_xlabel('Date')\n",
    "ax.set_ylabel('Confirmed Cases')\n",
    "ax.legend(loc='best')\n",
    "\n",
    "plt.setp(plt.gca().get_xticklabels(), rotation=45, ha=\"right\")\n",
    "\n",
    "\n",
    "plt.show() "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# can you plot San Francisco and newyork from jan1 to apr1?\n",
    "# df_nyc = df_cali.loc[df_cali.admin2.str.lower().str.contains('alameda')] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_us_confirmed_not_null = df_us_confirmed[~df_us_confirmed.admin2.isnull()]\n",
    "# note that the string function will fail on null values\n",
    "\n",
    "df_nyc = df_us_confirmed_not_null[df_us_confirmed_not_null.admin2.str.lower().str.contains('new york')]\n",
    "\n",
    "df_nyc.loc[:,'date'] = pd.to_datetime(df_nyc.date)\n",
    "\n",
    "# df_nyc.set_index('date', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.dates as mdates\n",
    "import matplotlib\n",
    "from matplotlib.ticker import NullFormatter\n",
    "from matplotlib.dates import MonthLocator, DateFormatter\n",
    "\n",
    "ax = plt.gca()\n",
    "ax.tick_params(pad=20)\n",
    "\n",
    "plt.plot(df_nyc.date, df_nyc.confirmed, color='r', label='NYC', linewidth=0.5)\n",
    "plt.plot(df_cali_sf.date, df_cali_sf.confirmed, color='b', label='San Francisco', linewidth=1)\n",
    "\n",
    "ax.set_xlabel('Date')\n",
    "ax.set_ylabel('Confirmed Cases')\n",
    "ax.legend(loc='best')\n",
    "\n",
    "plt.ylim(0, 1000)\n",
    "plt.setp(plt.gca().get_xticklabels(), rotation=45, ha=\"right\")\n",
    "\n",
    "\n",
    "plt.show() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# groupby - a very powerful idea!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cali.admin2.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# great, let's look at total cases per county\n",
    "df_cali.groupby(['admin2', 'date'])[['confirmed']].sum().sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_us_confirmed.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_us_confirmed.groupby(['province_state', 'admin2'])[['confirmed']].mean().sort_values('confirmed', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# combine everything. Average number of cases in sf by county\n",
    "df_cali.groupby(['admin2'])[['confirmed']].mean().sort_values('confirmed', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Let's look at time series now"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_time = pd.read_csv('..//data/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv')\n",
    "df_time = pd.read_csv('https://raw.githubusercontent.com/lidderupk/covid-19-cali/master/data/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's print all columns\n",
    "df_time.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print([col for col in df_time.columns])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_time_cali = df_time[df_time.Province_State == 'California']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_time_cali.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_time.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_time_clean = df_time.drop(columns=['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Province_State', 'Country_Region', 'Lat', 'Long_', 'Combined_Key'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print([col for col in df_time_clean.columns])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_time_clean = df_time_clean.dropna(subset=['Admin2'])\n",
    "df_time_clean.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_multi = pd.DataFrame(df_time_clean.set_index('Admin2').stack())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_multi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# example from pandas documentation\n",
    ">>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),\n",
    "...                                        ('height', 'm')])\n",
    ">>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],\n",
    "...                                     index=['cat', 'dog'],\n",
    "...                                     columns=multicol2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_multi_level_cols2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_multi_level_cols2.stack(level=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_multi_level_cols2.stack(level=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# going back to our dataset\n",
    "df_multi.sort_index(ascending=True, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_multi.loc[('San Francisco')].plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_multi.loc['San Francisco']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_multi.loc[('Alameda')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_multi.loc[slice('Alameda', 'San Francisco')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.dates as mdates\n",
    "import matplotlib\n",
    "from matplotlib.ticker import NullFormatter\n",
    "from matplotlib.dates import MonthLocator, DateFormatter\n",
    "\n",
    "ax = plt.gca()\n",
    "ax.xaxis.set_major_locator(matplotlib.dates.MonthLocator())\n",
    "ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%M'))\n",
    "\n",
    "ax.tick_params(pad=20)\n",
    "\n",
    "plt.plot(df_multi.loc['Alameda'], color='r', label='Alameda', linewidth=0.5)\n",
    "plt.plot(df_multi.loc['San Francisco'], color='b', label='San Francisco', linewidth=0.5)\n",
    "plt.plot(df_multi.loc['New York'], color='g', label='New York', linewidth=0.5)\n",
    "\n",
    "ax.set_xlabel('Date')\n",
    "ax.set_ylabel('Confirmed Cases')\n",
    "ax.legend(loc='best')\n",
    "\n",
    "plt.setp(plt.gca().get_xticklabels(), rotation=45, ha=\"right\")\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_multi.loc['New York'].plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_multi.loc['San Francisco'].plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_time.loc[df_time.Admin2 == 'San Francisco', '3/15/20':'5/2/20']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_time.loc[df_time.Admin2 == 'New York', '3/15/20':'5/2/20']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_time.loc[df_time.Admin2 == 'Dallas', ['Admin2', 'Province_State', '3/15/20', '5/2/20']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}