{ "cells": [ { "cell_type": "markdown", "id": "f53ef1d6-ec34-4b1e-9b34-c625edc86331", "metadata": {}, "source": [ "# MPG Correlogram\n", "\n", "A correlogram provides a quick overview of the entire dataset and allows analysing the relationship between each pair of numerical variables." ] }, { "cell_type": "code", "execution_count": 1, "id": "d48d91c8-4a42-434c-85c6-f84a96e10d5b", "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:43:22.671957Z", "iopub.status.busy": "2024-08-23T10:43:22.671872Z", "iopub.status.idle": "2024-08-23T10:43:22.994980Z", "shell.execute_reply": "2024-08-23T10:43:22.994718Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "from lets_plot import *" ] }, { "cell_type": "code", "execution_count": 2, "id": "184c6176-bd7a-4881-bb9d-1642ffd3bc40", "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:43:22.997253Z", "iopub.status.busy": "2024-08-23T10:43:22.997081Z", "iopub.status.idle": "2024-08-23T10:43:22.999857Z", "shell.execute_reply": "2024-08-23T10:43:22.999660Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 3, "id": "191b90ef-a38e-4988-b98c-774e0a2aa8c0", "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:43:23.000773Z", "iopub.status.busy": "2024-08-23T10:43:23.000699Z", "iopub.status.idle": "2024-08-23T10:43:23.371020Z", "shell.execute_reply": "2024-08-23T10:43:23.370723Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(234, 11)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
manufacturermodeldisplyearcyltransdrvctyhwyflclass
0audia41.819994auto(l5)f1829pcompact
1audia41.819994manual(m5)f2129pcompact
2audia42.020084manual(m6)f2031pcompact
3audia42.020084auto(av)f2130pcompact
4audia42.819996auto(l5)f1626pcompact
\n", "
" ], "text/plain": [ " manufacturer model displ year cyl trans drv cty hwy fl class\n", "0 audi a4 1.8 1999 4 auto(l5) f 18 29 p compact\n", "1 audi a4 1.8 1999 4 manual(m5) f 21 29 p compact\n", "2 audi a4 2.0 2008 4 manual(m6) f 20 31 p compact\n", "3 audi a4 2.0 2008 4 auto(av) f 21 30 p compact\n", "4 audi a4 2.8 1999 6 auto(l5) f 16 26 p compact" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the mpg dataset and discard the CSV's unnamed first column\n", "# (pandas labels it \"Unnamed: 0\"; presumably a saved row index).\n", "# Chaining .drop() avoids mutating the frame in place after it is created.\n", "df = (\n", "    pd.read_csv(\"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv\")\n", "    .drop(columns=[\"Unnamed: 0\"])\n", ")\n", "print(df.shape)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "52e52e19-90fa-4edf-b626-bad6ffb2994d", "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:43:23.372641Z", "iopub.status.busy": "2024-08-23T10:43:23.372521Z", "iopub.status.idle": "2024-08-23T10:43:23.376827Z", "shell.execute_reply": "2024-08-23T10:43:23.376654Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
xycorr
0displdispl1.000000
1displyear0.147843
2displcyl0.930227
3displcty-0.798524
4displhwy-0.766020
\n", "
" ], "text/plain": [ " x y corr\n", "0 displ displ 1.000000\n", "1 displ year 0.147843\n", "2 displ cyl 0.930227\n", "3 displ cty -0.798524\n", "4 displ hwy -0.766020" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Long-format table of pairwise correlations: one row per (x, y) variable pair.\n", "corr_df = (\n", "    df.corr(numeric_only=True)\n", "    .stack()\n", "    .to_frame()\n", "    .reset_index()\n", "    .set_axis([\"x\", \"y\", \"corr\"], axis=1)\n", ")\n", "corr_df.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "98012f57-2c7d-4099-a7d6-cc6b23224256", "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:43:23.377938Z", "iopub.status.busy": "2024-08-23T10:43:23.377868Z", "iopub.status.idle": "2024-08-23T10:43:23.381514Z", "shell.execute_reply": "2024-08-23T10:43:23.381333Z" } }, "outputs": [], "source": [ "# Partition the pairs by comparing the variable names (string comparison).\n", "same_var = corr_df[\"x\"] == corr_df[\"y\"]\n", "x_before_y = corr_df[\"x\"] < corr_df[\"y\"]\n", "x_after_y = corr_df[\"x\"] > corr_df[\"y\"]\n", "\n", "# Pairs of a variable with itself; the plot uses them for the text labels.\n", "corr_df0 = corr_df[same_var]\n", "\n", "# Pie data: every pair appears twice, once with its correlation (half=\"corr\")\n", "# and once with the amount that completes it to +1 or -1 (half=\"remainder\").\n", "pie_pairs = corr_df[x_before_y]\n", "remainder = np.where(pie_pairs[\"corr\"] > 0, 1 - pie_pairs[\"corr\"], -1 - pie_pairs[\"corr\"])\n", "corr_df1 = pd.concat([\n", "    pie_pairs.assign(half=\"corr\"),\n", "    pie_pairs.assign(corr=remainder, half=\"remainder\"),\n", "]).reset_index(drop=True)\n", "\n", "# Square data: spoke angle is pi/4 where corr > 0, otherwise 3*pi/4.\n", "square_pairs = corr_df[x_after_y]\n", "spoke_angle = np.where(square_pairs[\"corr\"] > 0, np.pi / 4, 3 * np.pi / 4)\n", "corr_df2 = square_pairs.assign(angle=spoke_angle)\n", "\n", "# NOTE: `vars` shadows the built-in of the same name; the name is kept\n", "# because the plotting cell passes it as the axis breaks.\n", "vars = sorted(corr_df[\"x\"].unique())" ] }, { "cell_type": "markdown", "id": "3284b8f2-2203-4e50-b7e6-5cf05938071e", "metadata": {}, "source": [ "### The Correlogram\n", "\n", "Let's plot a correlogram of the mpg dataset variables. Here's how it should be interpreted:\n", "\n", "- The filled portion of the pie shows the correlation magnitude.\n", "- The diagonal of the squares shows the sign of the correlation.\n", "- The depth of the figures shading shows the correlation magnitude.\n", "- The names of the variables are on the diagonal." 
] }, { "cell_type": "code", "execution_count": 6, "id": "616a7c13-7e9f-4219-a502-e4e0f77b0ffb", "metadata": { "execution": { "iopub.execute_input": "2024-08-23T10:43:23.382461Z", "iopub.status.busy": "2024-08-23T10:43:23.382386Z", "iopub.status.idle": "2024-08-23T10:43:23.414481Z", "shell.execute_reply": "2024-08-23T10:43:23.413952Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(\n", "    ggplot()\n", "    # Pie per (x, y) pair from corr_df1, filled by the correlation value.\n", "    + geom_pie(\n", "        aes(\"x\", \"y\", slice=\"corr\", paint_a=\"corr\"),\n", "        data=corr_df1, stat='identity',\n", "        size=1, size_unit='x', spacer_width=1,\n", "        fill_by='paint_a', tooltips='none',\n", "    )\n", "    # Second pie layer on the same data, filled by the \"half\" category.\n", "    + geom_pie(\n", "        aes(\"x\", \"y\", slice=\"corr\", paint_b=\"half\"),\n", "        data=corr_df1, stat='identity',\n", "        size=1, size_unit='x', show_legend=False,\n", "        fill_by='paint_b', tooltips='none',\n", "    )\n", "    # Squares (point shape 22) from corr_df2, filled by the correlation value.\n", "    + geom_point(\n", "        aes(\"x\", \"y\", paint_a=\"corr\"),\n", "        data=corr_df2, shape=22, size=1, stroke=1, size_unit='x',\n", "        color=\"white\", fill_by='paint_a', tooltips='none',\n", "    )\n", "    # White spoke through each square, drawn at the precomputed angle.\n", "    + geom_spoke(\n", "        aes(\"x\", \"y\", angle=\"angle\"),\n", "        radius=np.sqrt(2), data=corr_df2,\n", "        pivot='middle', size=1, color=\"white\",\n", "    )\n", "    # Variable names, taken from corr_df0.\n", "    + geom_text(\n", "        aes(\"x\", \"y\", label=\"x\"),\n", "        data=corr_df0, size=1, size_unit='x',\n", "    )\n", "    + scale_x_discrete(breaks=vars, expand=[.1, 0])\n", "    + scale_y_discrete(breaks=vars, expand=[.1, 0])\n", "    + scale_gradient2('paint_a', low=\"#a50026\", mid=\"white\", high=\"#313695\")\n", "    + scale_manual('paint_b', values=[\"rgba(0, 0, 0, 0)\", \"lightgrey\"])\n", "    + coord_fixed()\n", "    + ggsize(660, 600)\n", "    + theme_void()\n", ")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }