In [1]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

### Let's apply some of our new skills to the Covid-19 data

We'll first process the data just as we did in the last class. Then, we'll sample from all of the counties, and display that subset. We'll also use a loop to create a more informative label for each bubble in the map.


In [2]:
# Source CSVs: county-level Covid-19 counts from the New York Times repository,
# and a county lookup table with coordinates (joined on 'fips' further below).
nyt_counties_url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
county_geo_url = "https://raw.githubusercontent.com/jdlafferty/covid-19/master/data/geo-counties.csv"

covid_table = Table.read_table(nyt_counties_url)
county_geo = Table.read_table(county_geo_url)


In [3]:
# Rows dated strictly after this day are kept (are.above is a strict comparison).
first_date = '2021-02-21'

# The fifty states; assign a shorter list to `states` below to visualize a subset.
all_states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

states = all_states

# First narrow by date, then by state.
# NOTE(review): the date filter compares strings, so it assumes the 'date'
# column holds ISO YYYY-MM-DD text like first_date -- confirm against the CSV.
after_first_date = are.above(first_date)
in_selected_states = are.contained_in(states)

recent_data = covid_table.where('date', after_first_date)
recent_state_data = recent_data.where('state', in_selected_states)


In [4]:
# Put the rows in chronological order, then remove the extra columns.
# np.diff below only yields daily new cases if each county's cumulative counts
# are in date order; sorting here makes that explicit instead of relying on
# the row order of the downloaded CSV.
data = recent_state_data.sort('date').drop('date', 'county', 'state', 'deaths')

# exclude cases where fips is not known
data = data.where('fips', are.above(0))

# now, group by fips and form a list of the cumulative cases
# (grouping with `list` produces a column labeled 'cases list')
data = data.group('fips', list)

# apply the difference function np.diff to get the new cases
data = data.with_column('new cases', data.apply(np.diff, 'cases list'))
data = data.drop('cases list')

# Now average to get the average daily new cases in each county. The mean is
# taken over every day after first_date, so it is a "past week" average only
# while the data ends about a week after first_date.
# We add a small amount .001 to avoid zeros, which the graphics handles badly
average_new_cases = data.apply(np.mean, 'new cases') + .001
new_cases = Table().with_columns('fips', data['fips'],
                                 'new cases', average_new_cases)


In [5]:
# Geographic rows for the selected states, ordered by the 'fips' column
in_selected_states = are.contained_in(states)
state_geo = county_geo.where('state', in_selected_states).sort('fips')

# Attach each county's average new-case figure by matching on 'fips'; the key
# column is not needed once the two tables are merged.
new_cases_geo = state_geo.join('fips', new_cases).drop('fips')
new_cases_geo

county,state,lat,lon,new cases
Autauga,Alabama,32.5077,-86.651,25.6484
Baldwin,Alabama,30.7698,-87.7827,97.2517
Blount,Alabama,34.0128,-86.5337,23.7723
Bullock,Alabama,32.0927,-85.7129,3.0864
Butler,Alabama,32.0894,-88.2213,8.47207
Calhoun,Alabama,33.7623,-85.8421,50.7338
Chambers,Alabama,32.9188,-85.3938,13.8881
Cherokee,Alabama,34.7555,-87.9734,9.08364
Chilton,Alabama,32.866,-86.6652,18.4087
Choctaw,Alabama,32.004,-88.2858,3.99274


In [6]:
n = new_cases_geo.num_rows

# A random sample of (up to) 100 counties across the US.
# Sampling without replacement raises a ValueError when more rows are requested
# than exist, which happens if `states` is narrowed to a few small states, so
# cap the request at the number of counties available.
sample_size = min(100, n)
rows = np.random.choice(np.arange(n), sample_size, replace=False)
sample = new_cases_geo.take(rows)

# Loop over the sampled counties to build one label per bubble,
# e.g. "Autauga County, Alabama: 25.6"
labels = []
for county, state, cases in zip(sample['county'], sample['state'], sample['new cases']):
    s = county + " County, " + state + ": " + str(np.round(cases, 1))
    labels.append(s)

# Table in the layout Circle.map_table reads: position, label, bubble area
# (scaled up by 10 from the average new cases) and a single color for all rows.
dat = Table().with_columns('lat', sample['lat'],
                           'long', sample['lon'],
                           'labels', labels,
                           'areas', 10*sample['new cases'],
                           'colors', 'red')
Circle.map_table(dat, weight=1)

We can check our results by comparing to the *Times* numbers [reported here](https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html#states).