import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
penguins = sns.load_dataset('penguins')
penguins.head()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
sns.pairplot(penguins)
<seaborn.axisgrid.PairGrid at 0x19a7c576d40>
sns.scatterplot(x='body_mass_g', y='bill_length_mm', data=penguins);
sns.regplot(x='body_mass_g', y='bill_length_mm', data=penguins, fit_reg=True)
<AxesSubplot:xlabel='body_mass_g', ylabel='bill_length_mm'>
sns.jointplot(x='body_mass_g', y='bill_length_mm', kind='reg', data=penguins)
<seaborn.axisgrid.JointGrid at 0x19a046ccb50>
import pingouin as pg
pg.corr(penguins.body_mass_g, penguins.bill_length_mm).round(2)
n | r | CI95% | p-val | BF10 | power | |
---|---|---|---|---|---|---|
pearson | 342 | 0.6 | [0.52, 0.66] | 0.0 | 8.34e+30 | 1.0 |
# pairwise correlation
pg.pairwise_corr(penguins)
X | Y | method | alternative | n | r | CI95% | p-unc | BF10 | power | |
---|---|---|---|---|---|---|---|---|---|---|
0 | bill_length_mm | bill_depth_mm | pearson | two-sided | 342 | -0.235053 | [-0.33, -0.13] | 1.119662e-05 | 1005.717 | 0.99298 |
1 | bill_length_mm | flipper_length_mm | pearson | two-sided | 342 | 0.656181 | [0.59, 0.71] | 1.743974e-43 | 1.46e+40 | 1.00000 |
2 | bill_length_mm | body_mass_g | pearson | two-sided | 342 | 0.595110 | [0.52, 0.66] | 3.808283e-34 | 8.34e+30 | 1.00000 |
3 | bill_depth_mm | flipper_length_mm | pearson | two-sided | 342 | -0.583851 | [-0.65, -0.51] | 1.232734e-32 | 2.679e+29 | 1.00000 |
4 | bill_depth_mm | body_mass_g | pearson | two-sided | 342 | -0.471916 | [-0.55, -0.39] | 2.276941e-20 | 2.103e+17 | 1.00000 |
5 | flipper_length_mm | body_mass_g | pearson | two-sided | 342 | 0.871202 | [0.84, 0.89] | 4.370681e-107 | 1.87e+103 | 1.00000 |
import numpy as np
corr = penguins.corr() # calculate pairwise correlations
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(5, 5))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
<AxesSubplot:>
x = penguins.body_mass_g.dropna() # predictor
y = penguins.bill_length_mm.dropna() # predicted
reg1 = pg.linear_regression(x,y).round(4)
print(reg1.iloc[:,0:7])
print('R-squared is .35, meaning that body mass explains 35% of variability in bill length of a penguin')
names coef se T pval r2 adj_r2 0 Intercept 26.8989 1.2691 21.1944 0.0 0.3542 0.3523 1 body_mass_g 0.0041 0.0003 13.6544 0.0 0.3542 0.3523 R-squared is .35, meaning that body mass explains 35% of variability in bill length of a penguin
r = x.corr(y)
print(f'correlation coef: {r**2}')
print(f'regression coef: {reg1.r2[0]}')
correlation coef: 0.354155703142187 regression coef: 0.3542
Correlation with a lagged copy of self
print(sm.datasets.sunspots.NOTE)
:: Number of Observations - 309 (Annual 1700 - 2008) Number of Variables - 1 Variable name definitions:: SUNACTIVITY - Number of sunspots for each year The data file contains a 'YEAR' variable that is not returned by load.
dta = sm.datasets.sunspots.load_pandas().data #sunspots per year
dta.index = pd.Index(sm.tsa.datetools.dates_from_range("1700", "2008"))
dta.index.freq = dta.index.inferred_freq
del dta["YEAR"]
dta.plot(figsize=(12, 8));
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(111)
fig = sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40, ax=ax1)
from libpysal.examples import load_example
import geopandas as gpd
guerry_loc = load_example('Guerry')
guerry = gpd.read_file(guerry_loc.get_path('guerry.geojson'))
print(guerry.shape)
guerry.head()
Example not available: Guerry Example not downloaded: Chicago parcels Example not downloaded: Chile Migration Example not downloaded: Spirals (85, 24)
dept | Region | Dprtmnt | Crm_prs | Crm_prp | Litercy | Donatns | Infants | Suicids | MainCty | ... | Infntcd | Dntn_cl | Lottery | Desertn | Instrct | Prsttts | Distanc | Area | Pop1831 | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | E | Ain | 28870 | 15890 | 37 | 5098 | 33120 | 35039 | 2 | ... | 60 | 69 | 41 | 55 | 46 | 13 | 218.372 | 5762 | 346.03 | MULTIPOLYGON (((801150.00000 2092615.00000, 80... |
1 | 2 | N | Aisne | 26226 | 5521 | 51 | 8901 | 14572 | 12831 | 2 | ... | 82 | 36 | 38 | 82 | 24 | 327 | 65.945 | 7369 | 513.00 | MULTIPOLYGON (((729326.00000 2521619.00000, 72... |
2 | 3 | C | Allier | 26747 | 7925 | 13 | 10973 | 17044 | 114121 | 2 | ... | 42 | 76 | 66 | 16 | 85 | 34 | 161.927 | 7340 | 298.26 | MULTIPOLYGON (((710830.00000 2137350.00000, 71... |
3 | 4 | E | Basses-Alpes | 12935 | 7289 | 46 | 2733 | 23018 | 14238 | 1 | ... | 12 | 37 | 80 | 32 | 29 | 2 | 351.399 | 6925 | 155.90 | MULTIPOLYGON (((882701.00000 1920024.00000, 88... |
4 | 5 | E | Hautes-Alpes | 17488 | 8174 | 69 | 6962 | 23076 | 16171 | 1 | ... | 23 | 64 | 79 | 35 | 7 | 1 | 320.280 | 5549 | 129.10 | MULTIPOLYGON (((886504.00000 1922890.00000, 88... |
5 rows × 24 columns
French lawyer and amateur statistician, who, together with Adolphe Quetelet, is considered a founder of moral statistics, which led to the development criminology and sociologyRead more [here](https://www.datavis.ca/papers/guerry-STS241.pdf) |
# re-creating the three maps from original publication
fig, (ax1,ax2,ax3) = plt.subplots(1,3, figsize=(12,5))
guerry.plot(ax=ax1, aspect=1, fc='None', ec='k')
guerry.plot(ax=ax2, aspect=1, fc='None', ec='k')
guerry.plot(ax=ax3, aspect=1, fc='None', ec='k')
guerry.plot(column='Crm_prs', cmap='Greys', scheme='quantiles', k=5, ax=ax1, aspect=1,
legend=True, legend_kwds={'loc': 'center', 'bbox_to_anchor':(0.5, -0.3)})
guerry.plot(column='Crm_prp', cmap='Greys', scheme='quantiles', k=5, ax=ax2, aspect=1,
legend=True, legend_kwds={'loc': 'center', 'bbox_to_anchor':(0.5, -0.3)})
guerry.plot(column='Instrct', cmap='Greys', scheme='quantiles', k=5, ax=ax3, aspect=1,
legend=True, legend_kwds={'loc': 'center', 'bbox_to_anchor':(0.5, -0.3)})
ax1.set_title('Crimes Against Persons', fontsize=16);
ax2.set_title('Crimes Against Property', fontsize=16);
ax3.set_title('Instruction', fontsize=16);
ax1.axis('off');
ax2.axis('off');
ax3.axis('off');
fig.savefig('guerry.png')
plt.close()
from IPython.display import Image
Image(filename='guerry.png')
where $n$ is the number of observations, $z_i$ is the standardized value of the variable of interest at location $i$, and $w_{ij}$ is the cell corresponding to the $i$-th row and $j$-th column of a $W$ spatial weights matrix.
from libpysal.weights.contiguity import Queen
import splot
from esda.moran import Moran
y = guerry['Crm_prp'].values
w = Queen.from_dataframe(guerry)
w.transform = 'r'
moran = Moran(y, w)
moran.I
0.26355334031776523
from splot.esda import moran_scatterplot
fig, ax = moran_scatterplot(moran, aspect_equal=True)
plt.show()
A result that has a p-value of 0.01 with 99 permutations is not necessarily more significant than a result with a p-value of 0.001 with 999 permutations. (Source: https://geodacenter.github.io/workbook/5a_global_auto/lab5a.html)
from splot.esda import plot_moran
plot_moran(moran, zstandard=True, figsize=(10,4))
plt.show()
C:\Users\noibar\anaconda3\envs\geo_env\lib\site-packages\splot\_viz_esda_mpl.py:354: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sbn.kdeplot(moran.sim, shade=shade, color=color, ax=ax, **kwargs)
p = moran.p_sim
print(f'Our simulated p-value for Moran`s I: {p}')
Our simulated p-value for Moran`s I: 0.001