{
"cells": [
{
"cell_type": "markdown",
"id": "f53ef1d6-ec34-4b1e-9b34-c625edc86331",
"metadata": {},
"source": [
"# MPG Correlogram\n",
"\n",
"A correlogram provides a quick overview of the entire dataset and allows analysing the relationship between each pair of numerical variables."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "d48d91c8-4a42-434c-85c6-f84a96e10d5b",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:43:22.671957Z",
"iopub.status.busy": "2024-08-23T10:43:22.671872Z",
"iopub.status.idle": "2024-08-23T10:43:22.994980Z",
"shell.execute_reply": "2024-08-23T10:43:22.994718Z"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from lets_plot import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "184c6176-bd7a-4881-bb9d-1642ffd3bc40",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:43:22.997253Z",
"iopub.status.busy": "2024-08-23T10:43:22.997081Z",
"iopub.status.idle": "2024-08-23T10:43:22.999857Z",
"shell.execute_reply": "2024-08-23T10:43:22.999660Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"LetsPlot.setup_html()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "191b90ef-a38e-4988-b98c-774e0a2aa8c0",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:43:23.000773Z",
"iopub.status.busy": "2024-08-23T10:43:23.000699Z",
"iopub.status.idle": "2024-08-23T10:43:23.371020Z",
"shell.execute_reply": "2024-08-23T10:43:23.370723Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(234, 11)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" manufacturer | \n",
" model | \n",
" displ | \n",
" year | \n",
" cyl | \n",
" trans | \n",
" drv | \n",
" cty | \n",
" hwy | \n",
" fl | \n",
" class | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" audi | \n",
" a4 | \n",
" 1.8 | \n",
" 1999 | \n",
" 4 | \n",
" auto(l5) | \n",
" f | \n",
" 18 | \n",
" 29 | \n",
" p | \n",
" compact | \n",
"
\n",
" \n",
" 1 | \n",
" audi | \n",
" a4 | \n",
" 1.8 | \n",
" 1999 | \n",
" 4 | \n",
" manual(m5) | \n",
" f | \n",
" 21 | \n",
" 29 | \n",
" p | \n",
" compact | \n",
"
\n",
" \n",
" 2 | \n",
" audi | \n",
" a4 | \n",
" 2.0 | \n",
" 2008 | \n",
" 4 | \n",
" manual(m6) | \n",
" f | \n",
" 20 | \n",
" 31 | \n",
" p | \n",
" compact | \n",
"
\n",
" \n",
" 3 | \n",
" audi | \n",
" a4 | \n",
" 2.0 | \n",
" 2008 | \n",
" 4 | \n",
" auto(av) | \n",
" f | \n",
" 21 | \n",
" 30 | \n",
" p | \n",
" compact | \n",
"
\n",
" \n",
" 4 | \n",
" audi | \n",
" a4 | \n",
" 2.8 | \n",
" 1999 | \n",
" 6 | \n",
" auto(l5) | \n",
" f | \n",
" 16 | \n",
" 26 | \n",
" p | \n",
" compact | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" manufacturer model displ year cyl trans drv cty hwy fl class\n",
"0 audi a4 1.8 1999 4 auto(l5) f 18 29 p compact\n",
"1 audi a4 1.8 1999 4 manual(m5) f 21 29 p compact\n",
"2 audi a4 2.0 2008 4 manual(m6) f 20 31 p compact\n",
"3 audi a4 2.0 2008 4 auto(av) f 21 30 p compact\n",
"4 audi a4 2.8 1999 6 auto(l5) f 16 26 p compact"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv\")\n",
"df.drop(columns=[\"Unnamed: 0\"], inplace=True)\n",
"print(df.shape)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "52e52e19-90fa-4edf-b626-bad6ffb2994d",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:43:23.372641Z",
"iopub.status.busy": "2024-08-23T10:43:23.372521Z",
"iopub.status.idle": "2024-08-23T10:43:23.376827Z",
"shell.execute_reply": "2024-08-23T10:43:23.376654Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" x | \n",
" y | \n",
" corr | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" displ | \n",
" displ | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 1 | \n",
" displ | \n",
" year | \n",
" 0.147843 | \n",
"
\n",
" \n",
" 2 | \n",
" displ | \n",
" cyl | \n",
" 0.930227 | \n",
"
\n",
" \n",
" 3 | \n",
" displ | \n",
" cty | \n",
" -0.798524 | \n",
"
\n",
" \n",
" 4 | \n",
" displ | \n",
" hwy | \n",
" -0.766020 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" x y corr\n",
"0 displ displ 1.000000\n",
"1 displ year 0.147843\n",
"2 displ cyl 0.930227\n",
"3 displ cty -0.798524\n",
"4 displ hwy -0.766020"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"corr_df = df.corr(numeric_only=True).stack().to_frame().reset_index()\n",
"corr_df.columns = [\"x\", \"y\", \"corr\"]\n",
"corr_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "98012f57-2c7d-4099-a7d6-cc6b23224256",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:43:23.377938Z",
"iopub.status.busy": "2024-08-23T10:43:23.377868Z",
"iopub.status.idle": "2024-08-23T10:43:23.381514Z",
"shell.execute_reply": "2024-08-23T10:43:23.381333Z"
}
},
"outputs": [],
"source": [
"corr_df0 = corr_df[corr_df[\"x\"] == corr_df[\"y\"]]\n",
"\n",
"corr_df1 = corr_df[corr_df[\"x\"] < corr_df[\"y\"]]\n",
"corr_df1 = pd.concat([\n",
" corr_df1.assign(half=\"corr\"),\n",
" corr_df1.assign(corr=np.where(corr_df1[\"corr\"] > 0, 1 - corr_df1[\"corr\"], -1 - corr_df1[\"corr\"]), half=\"remainder\")\n",
"]).reset_index(drop=True)\n",
"\n",
"corr_df2 = corr_df[corr_df[\"x\"] > corr_df[\"y\"]]\n",
"corr_df2 = corr_df2.assign(angle=np.where(corr_df2[\"corr\"] > 0, np.pi / 4, 3 * np.pi / 4))\n",
"\n",
"vars = sorted(corr_df[\"x\"].unique())"
]
},
{
"cell_type": "markdown",
"id": "3284b8f2-2203-4e50-b7e6-5cf05938071e",
"metadata": {},
"source": [
"### The Correlogram\n",
"\n",
"Let's plot a correlogram of the mpg dataset variables. Here's how it should be interpreted:\n",
"\n",
"- The filled portion of the pie shows the correlation magnitude.\n",
"- The diagonal of the squares shows the sign of the correlation.\n",
"- The depth of the figures shading shows the correlation magnitude.\n",
"- The names of the variables are on the diagonal."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "616a7c13-7e9f-4219-a502-e4e0f77b0ffb",
"metadata": {
"execution": {
"iopub.execute_input": "2024-08-23T10:43:23.382461Z",
"iopub.status.busy": "2024-08-23T10:43:23.382386Z",
"iopub.status.idle": "2024-08-23T10:43:23.414481Z",
"shell.execute_reply": "2024-08-23T10:43:23.413952Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ggplot() + \\\n",
" geom_pie(aes(\"x\", \"y\", slice=\"corr\", paint_a=\"corr\"), \\\n",
" data=corr_df1, stat='identity', \\\n",
" size=1, size_unit='x', spacer_width=1, \\\n",
" fill_by='paint_a', tooltips='none') + \\\n",
" geom_pie(aes(\"x\", \"y\", slice=\"corr\", paint_b=\"half\"), \\\n",
" data=corr_df1, stat='identity', \\\n",
" size=1, size_unit='x', show_legend=False, \\\n",
" fill_by='paint_b', tooltips='none') + \\\n",
" geom_point(aes(\"x\", \"y\", paint_a=\"corr\"), \\\n",
" data=corr_df2, shape=22, size=1, stroke=1, size_unit='x', \\\n",
" color=\"white\", fill_by='paint_a', tooltips='none') + \\\n",
" geom_spoke(aes(\"x\", \"y\", angle=\"angle\"), \\\n",
" radius=np.sqrt(2), data=corr_df2, \\\n",
" pivot='middle', size=1, color=\"white\") + \\\n",
" geom_text(aes(\"x\", \"y\", label=\"x\"), \\\n",
" data=corr_df0, size=1, size_unit='x') + \\\n",
" scale_x_discrete(breaks=vars, expand=[.1, 0]) + scale_y_discrete(breaks=vars, expand=[.1, 0]) + \\\n",
" scale_gradient2('paint_a', low=\"#a50026\", mid=\"white\", high=\"#313695\") + \\\n",
" scale_manual('paint_b', values=[\"rgba(0, 0, 0, 0)\", \"lightgrey\"]) + \\\n",
" coord_fixed() + \\\n",
" ggsize(660, 600) + \\\n",
" theme_void()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}