{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lets-Plot in 2020"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Preparation"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"execution": {
"iopub.execute_input": "2024-03-26T14:42:26.699196Z",
"iopub.status.busy": "2024-03-26T14:42:26.698988Z",
"iopub.status.idle": "2024-03-26T14:42:27.391202Z",
"shell.execute_reply": "2024-03-26T14:42:27.390730Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: colorcet in /home/asmirnov/Applications/miniconda3/envs/lets-plot-docs/lib/python3.10/site-packages (3.1.0)\r\n"
]
}
],
"source": [
"from sys import executable\n",
"!{executable} -m pip install colorcet"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"execution": {
"iopub.execute_input": "2024-03-26T14:42:27.392848Z",
"iopub.status.busy": "2024-03-26T14:42:27.392755Z",
"iopub.status.idle": "2024-03-26T14:42:27.754675Z",
"shell.execute_reply": "2024-03-26T14:42:27.754350Z"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import colorcet as cc\n",
"from PIL import Image\n",
"\n",
"from lets_plot import *\n",
"from lets_plot.bistro.corr import *"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"execution": {
"iopub.execute_input": "2024-03-26T14:42:27.756182Z",
"iopub.status.busy": "2024-03-26T14:42:27.756058Z",
"iopub.status.idle": "2024-03-26T14:42:27.758509Z",
"shell.execute_reply": "2024-03-26T14:42:27.758332Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"LetsPlot.setup_html()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"execution": {
"iopub.execute_input": "2024-03-26T14:42:27.759514Z",
"iopub.status.busy": "2024-03-26T14:42:27.759440Z",
"iopub.status.idle": "2024-03-26T14:42:28.269145Z",
"shell.execute_reply": "2024-03-26T14:42:28.268947Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" author_date | \n",
" author_name | \n",
" files_changed | \n",
" insertions | \n",
" deletions | \n",
" diff | \n",
" month | \n",
" day | \n",
" weekday | \n",
" hour | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2020-01-06 14:43:17+00:00 | \n",
" Ivan Kupriyanov | \n",
" 1 | \n",
" 3 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 6 | \n",
" 0 | \n",
" 14 | \n",
"
\n",
" \n",
" 1 | \n",
" 2020-01-06 17:42:31+00:00 | \n",
" Ivan Kupriyanov | \n",
" 132 | \n",
" 665 | \n",
" 612 | \n",
" 53 | \n",
" 1 | \n",
" 6 | \n",
" 0 | \n",
" 17 | \n",
"
\n",
" \n",
" 2 | \n",
" 2020-01-09 11:47:11+00:00 | \n",
" Ivan Seleznev | \n",
" 1 | \n",
" 11 | \n",
" 1 | \n",
" 10 | \n",
" 1 | \n",
" 9 | \n",
" 3 | \n",
" 11 | \n",
"
\n",
" \n",
" 3 | \n",
" 2020-01-09 15:32:34+00:00 | \n",
" Ivan Kupriyanov | \n",
" 6 | \n",
" 35 | \n",
" 40 | \n",
" -5 | \n",
" 1 | \n",
" 9 | \n",
" 3 | \n",
" 15 | \n",
"
\n",
" \n",
" 4 | \n",
" 2020-01-10 12:35:09+00:00 | \n",
" Ivan Seleznev | \n",
" 4 | \n",
" 299 | \n",
" 22 | \n",
" 277 | \n",
" 1 | \n",
" 10 | \n",
" 4 | \n",
" 12 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" author_date author_name files_changed insertions \\\n",
"0 2020-01-06 14:43:17+00:00 Ivan Kupriyanov 1 3 \n",
"1 2020-01-06 17:42:31+00:00 Ivan Kupriyanov 132 665 \n",
"2 2020-01-09 11:47:11+00:00 Ivan Seleznev 1 11 \n",
"3 2020-01-09 15:32:34+00:00 Ivan Kupriyanov 6 35 \n",
"4 2020-01-10 12:35:09+00:00 Ivan Seleznev 4 299 \n",
"\n",
" deletions diff month day weekday hour \n",
"0 1 2 1 6 0 14 \n",
"1 612 53 1 6 0 17 \n",
"2 1 10 1 9 3 11 \n",
"3 40 -5 1 9 3 15 \n",
"4 22 277 1 10 4 12 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/lets_plot_git_history.csv\", sep=';')\n",
"\n",
"df = df[['author_date', 'author_name', 'files_changed', 'insertions', 'deletions']]\n",
"df.author_date = pd.to_datetime(df.author_date, utc=True)\n",
"df.files_changed = df.files_changed.str.split(' ').str[0].astype(int)\n",
"df.insertions = df.insertions.str.split(' ').str[0].astype(int)\n",
"df.deletions = df.deletions.fillna('0').str.split(' ').str[0].astype(int)\n",
"\n",
"df['diff'] = df.insertions - df.deletions\n",
"df['month'] = df.author_date.dt.month\n",
"df['day'] = df.author_date.dt.day\n",
"df['weekday'] = df.author_date.dt.weekday\n",
"df['hour'] = df.author_date.dt.hour\n",
"\n",
"df = df[df.author_date.dt.year == 2020].sort_values(by='author_date').reset_index(drop=True)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### General Analytics"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2024-03-26T14:42:28.270335Z",
"iopub.status.busy": "2024-03-26T14:42:28.270220Z",
"iopub.status.idle": "2024-03-26T14:42:28.274212Z",
"shell.execute_reply": "2024-03-26T14:42:28.274042Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" author_name | \n",
" commits_number | \n",
"
\n",
" \n",
" | \n",
" | \n",
" sum | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Igor Alshannikov | \n",
" 337 | \n",
"
\n",
" \n",
" 1 | \n",
" Ivan Kupriyanov | \n",
" 303 | \n",
"
\n",
" \n",
" 2 | \n",
" Valentin Dovidaytis | \n",
" 92 | \n",
"
\n",
" \n",
" 3 | \n",
" Olga Larionova | \n",
" 53 | \n",
"
\n",
" \n",
" 4 | \n",
" Ivan Ryabov | \n",
" 28 | \n",
"
\n",
" \n",
" 5 | \n",
" Ivan Seleznev | \n",
" 27 | \n",
"
\n",
" \n",
" 6 | \n",
" Ilya Krukov | \n",
" 23 | \n",
"
\n",
" \n",
" 7 | \n",
" Alla Redko | \n",
" 7 | \n",
"
\n",
" \n",
" 8 | \n",
" Mikhail Koroteev | \n",
" 3 | \n",
"
\n",
" \n",
" 9 | \n",
" Henrikh Kantuni | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" author_name commits_number\n",
" sum\n",
"0 Igor Alshannikov 337\n",
"1 Ivan Kupriyanov 303\n",
"2 Valentin Dovidaytis 92\n",
"3 Olga Larionova 53\n",
"4 Ivan Ryabov 28\n",
"5 Ivan Seleznev 27\n",
"6 Ilya Krukov 23\n",
"7 Alla Redko 7\n",
"8 Mikhail Koroteev 3\n",
"9 Henrikh Kantuni 1"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.author_name.value_counts().to_frame(('commits_number', 'sum')).reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"execution": {
"iopub.execute_input": "2024-03-26T14:42:28.275370Z",
"iopub.status.busy": "2024-03-26T14:42:28.275237Z",
"iopub.status.idle": "2024-03-26T14:42:28.324480Z",
"shell.execute_reply": "2024-03-26T14:42:28.324289Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_208538/2962890979.py:7: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.\n",
" agg_df = pd.merge(agg_df.sort_index(axis=1), \\\n"
]
},
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"agg_features = {'files_changed': ['sum', 'mean'], \\\n",
" 'insertions': ['sum', 'mean'], \\\n",
" 'deletions': ['sum', 'mean'], \\\n",
" 'diff': ['sum']}\n",
"agg_df = df.groupby('author_name').agg(agg_features).reset_index()\n",
"agg_features['commits_number'] = ['sum']\n",
"agg_df = pd.merge(agg_df.sort_index(axis=1), \\\n",
" df.author_name.value_counts().to_frame(('commits_number', 'sum')).reset_index().sort_index(axis=1), \\\n",
" left_on='author_name', right_on='author_name')\n",
"agg_df['color'] = cc.palette['glasbey_bw'][:agg_df.shape[0]]\n",
"\n",
"plots = []\n",
"for feature, agg in [(key, val) for key, vals in agg_features.items() for val in vals]:\n",
" agg_df = agg_df.sort_values(by=(feature, agg), ascending=False)\n",
" aes_name = ('total {0}' if agg == 'sum' else 'mean {0} per commit').format(feature.replace('_', ' '))\n",
" plotted_df = agg_df[[('author_name', ''), (feature, agg), ('color', '')]]\n",
" plotted_df.columns = plotted_df.columns.get_level_values(0)\n",
" plots.append(ggplot(plotted_df) + \\\n",
" geom_bar(aes(x='author_name', y=feature, color='color', fill='color'), \\\n",
" stat='identity', alpha=.25, size=1, \\\n",
" tooltips=layer_tooltips().line('^x')\n",
" .line('{0}|^y'.format(aes_name))) + \\\n",
" scale_color_identity() + scale_fill_identity() + \\\n",
" xlab('') + ylab('') + \\\n",
" ggtitle(aes_name.title()))\n",
"\n",
"w, h = 400, 300\n",
"bunch = GGBunch()\n",
"bunch.add_plot(plots[7], 0, 0, w, h)\n",
"bunch.add_plot(plots[6], w, 0, w, h)\n",
"bunch.add_plot(plots[0], 0, h, w, h)\n",
"bunch.add_plot(plots[1], w, h, w, h)\n",
"bunch.add_plot(plots[2], 0, 2 * h, w, h)\n",
"bunch.add_plot(plots[3], w, 2 * h, w, h)\n",
"bunch.add_plot(plots[4], 0, 3 * h, w, h)\n",
"bunch.add_plot(plots[5], w, 3 * h, w, h)\n",
"bunch.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Looking at the total values, we clearly see that Igor Alshannikov and Ivan Kupriyanov are far ahead of the rest. Who takes third place, however, is much less clear-cut.\n",
"\n",
"Meanwhile, we see more diversity in mean values of different contribution types."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2024-03-26T14:42:28.325664Z",
"iopub.status.busy": "2024-03-26T14:42:28.325588Z",
"iopub.status.idle": "2024-03-26T14:42:28.329628Z",
"shell.execute_reply": "2024-03-26T14:42:28.329446Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ggplot(df.hour.value_counts().to_frame('count').reset_index().sort_values(by='hour')) + \\\n",
" geom_histogram(aes(x='hour', y='count', color='hour', fill='hour'), \\\n",
" stat='identity', show_legend=False, \\\n",
" tooltips=layer_tooltips().line('^y')) + \\\n",
" scale_x_discrete(breaks=list(range(24))) + \\\n",
" scale_color_gradient(low='#e0ecf4', high='#8856a7') + \\\n",
" scale_fill_gradient(low='#e0ecf4', high='#8856a7') + \\\n",
" ylab('commits number') + \\\n",
" ggtitle('Total Hourly Committing') + ggsize(600, 450)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Commit activity peaks at around 18:00 (hours are in UTC). The evening seems to be a good time to save daily results."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Higher Resolution"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"execution": {
"iopub.execute_input": "2024-03-26T14:42:28.330967Z",
"iopub.status.busy": "2024-03-26T14:42:28.330814Z",
"iopub.status.idle": "2024-03-26T14:42:28.344294Z",
"shell.execute_reply": "2024-03-26T14:42:28.344119Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ggplot(df[df.insertions > 0]) + \\\n",
" geom_lollipop(aes(x='author_date', y='insertions', fill='month'), shape=21, fatten=1, color='black', \\\n",
" tooltips=layer_tooltips().line('@author_name').line('@|@insertions').line('@|@month')) + \\\n",
" scale_x_datetime(name='date') + \\\n",
" scale_y_log10(name='insertions (log)') + \\\n",
" scale_fill_brewer(name='', type='qual', palette='Accent') + \\\n",
" facet_grid(y='author_name')+ \\\n",
" ggtitle('Lollipop Plot of Commits by Authors') + ggsize(800, 1000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some of the team members started their work only a few months ago, so they still have time to catch up next year."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"execution": {
"iopub.execute_input": "2024-03-26T14:42:28.345440Z",
"iopub.status.busy": "2024-03-26T14:42:28.345313Z",
"iopub.status.idle": "2024-03-26T14:42:28.357471Z",
"shell.execute_reply": "2024-03-26T14:42:28.357295Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ggplot(df) + \\\n",
" geom_point(aes(x='weekday', y='insertions', color='author_name', size='files_changed'), \\\n",
" shape=8, alpha=.75, position='jitter', show_legend=False, \\\n",
" tooltips=layer_tooltips().line('author|@author_name')\n",
" .line('@|@insertions')\n",
" .line('@|@deletions')\n",
" .line('files changed|@files_changed')) + \\\n",
" scale_x_discrete(labels=['Monday', 'Tuesday', 'Wednesday', 'Thursday', \\\n",
" 'Friday', 'Saturday', 'Sunday']) + \\\n",
" scale_y_log10(breaks=[2 ** n for n in range(16)]) + \\\n",
" scale_color_brewer(type='qual', palette='Pastel1') + \\\n",
" scale_size(range=[3, 7], trans='sqrt') + \\\n",
" ggtitle('All Commits') + ggsize(800, 600) + \\\n",
" theme(axis_tooltip='blank')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Usually no one works at the weekend. But if something needs to be done, it gets done."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### And Finally..."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"execution": {
"iopub.execute_input": "2024-03-26T14:42:28.358770Z",
"iopub.status.busy": "2024-03-26T14:42:28.358625Z",
"iopub.status.idle": "2024-03-26T14:42:28.360747Z",
"shell.execute_reply": "2024-03-26T14:42:28.360559Z"
}
},
"outputs": [],
"source": [
"r = df.groupby('day').insertions.median().values\n",
"x = r * np.cos(np.linspace(0, 2 * np.pi, r.size))\n",
"y = r * np.sin(np.linspace(0, 2 * np.pi, r.size))\n",
"daily_insertions_df = pd.DataFrame({'x': x, 'y': y})"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"execution": {
"iopub.execute_input": "2024-03-26T14:42:28.361806Z",
"iopub.status.busy": "2024-03-26T14:42:28.361666Z",
"iopub.status.idle": "2024-03-26T14:42:28.377963Z",
"shell.execute_reply": "2024-03-26T14:42:28.377768Z"
}
},
"outputs": [],
"source": [
"MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']\n",
"mask_width, mask_height = 60, 80\n",
"\n",
"mask = np.array(Image.open(\"images/snowman_mask.bmp\").resize((mask_width, mask_height), Image.Resampling.BILINEAR))\n",
"grid = [[(0 if color.mean() > 255 / 2 else 1) for color in row] for row in mask]\n",
"\n",
"grid_df = pd.DataFrame(grid).stack().to_frame('month_id')\n",
"grid_df.index.set_names(['y', 'x'], inplace=True)\n",
"grid_df = grid_df.reset_index()\n",
"grid_df.y = grid_df.y.max() - grid_df.y\n",
"grid_df = grid_df[grid_df['month_id'] > 0].reset_index(drop=True)\n",
"\n",
"agg_df = np.round(df.month.value_counts() * grid_df.shape[0] / df.shape[0]).to_frame('commits_number')\n",
"agg_df.iloc[0].commits_number += grid_df.shape[0] - agg_df.commits_number.sum()\n",
"agg_df.commits_number = agg_df.commits_number.astype(int)\n",
"agg_df.index.name = 'month_id'\n",
"agg_df = agg_df.reset_index()\n",
"\n",
"grid_df['commits_number'] = 0\n",
"start_idx = 0\n",
"for idx, (month, commits_number) in agg_df.iterrows():\n",
" grid_df.loc[start_idx:(start_idx + commits_number), 'month'] = MONTHS[month - 1]\n",
" grid_df.loc[start_idx:(start_idx + commits_number), 'commits_number'] = commits_number\n",
" start_idx += commits_number"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"execution": {
"iopub.execute_input": "2024-03-26T14:42:28.379228Z",
"iopub.status.busy": "2024-03-26T14:42:28.379155Z",
"iopub.status.idle": "2024-03-26T14:42:28.400851Z",
"shell.execute_reply": "2024-03-26T14:42:28.400663Z"
}
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"blank_theme = theme_void() + theme(axis_ticks_length=0, legend_position='none')\n",
"\n",
"ps = ggplot(daily_insertions_df, aes(x='x', y='y')) + \\\n",
" geom_polygon(color='#f03b20', fill='#fd8d3c', size=1) + coord_fixed() + blank_theme\n",
"p1l = corr_plot(data=df[['insertions', 'deletions']], flip=False).tiles(type='lower', diag=True)\\\n",
" .palette_gradient(low='blue', mid='green', high='darkgreen').build() + blank_theme\n",
"p1r = corr_plot(data=df[['deletions', 'insertions']], flip=True).tiles(type='lower', diag=True)\\\n",
" .palette_gradient(low='blue', mid='green', high='darkgreen').build() + blank_theme\n",
"p2l = corr_plot(data=df[['insertions', 'deletions', 'diff']], flip=False).tiles(type='lower', diag=True)\\\n",
" .palette_gradient(low='blue', mid='green', high='darkgreen').build() + blank_theme\n",
"p2r = corr_plot(data=df[['diff', 'deletions', 'insertions']], flip=True).tiles(type='lower', diag=True)\\\n",
" .palette_gradient(low='blue', mid='green', high='darkgreen').build() + blank_theme\n",
"p3l = corr_plot(data=df[['insertions', 'deletions', 'diff', 'files_changed']], flip=False)\\\n",
" .tiles(type='lower', diag=True).palette_gradient(low='blue', mid='green', high='darkgreen').build() + blank_theme\n",
"p3r = corr_plot(data=df[['files_changed', 'diff', 'deletions', 'insertions']], flip=True)\\\n",
" .tiles(type='lower', diag=True).palette_gradient(low='blue', mid='green', high='darkgreen').build() + blank_theme\n",
"pt = ggplot({'x': [0], 'y': [0], 'greetings': ['Happy New Year!']}, aes(x='x', y='y')) + \\\n",
" geom_text(aes(label='greetings'), color='blue', size=20, family='Times New Roman', fontface='bold') + blank_theme\n",
"pm = ggplot(grid_df, aes(x='x', y='y')) + \\\n",
" geom_tile(aes(fill='month'), width=.8, height=.8, \\\n",
" tooltips=layer_tooltips().line('@|@month')\n",
" .line('@|@commits_number')) + \\\n",
" scale_fill_brewer(type='qual', palette='Set2') + \\\n",
" blank_theme\n",
"\n",
"w, h = 50, 50\n",
"bunch = GGBunch()\n",
"bunch.add_plot(ps, 3 * w, 0, 2 * w, 2 * h)\n",
"bunch.add_plot(p1l, 2 * w, 2 * h, 2 * w, 2 * h)\n",
"bunch.add_plot(p1r, 4 * w, 2 * h, 2 * w, 2 * h)\n",
"bunch.add_plot(p2l, w, 4 * h, 3 * w, 3 * h)\n",
"bunch.add_plot(p2r, 4 * w, 4 * h, 3 * w, 3 * h)\n",
"bunch.add_plot(p3l, 0, 7 * h, 4 * w, 4 * h)\n",
"bunch.add_plot(p3r, 4 * w, 7 * h, 4 * w, 4 * h)\n",
"bunch.add_plot(pt, 0, 11 * h, 16 * w, 2 * h)\n",
"bunch.add_plot(pm, 8 * w, 3 * h, 8 * w, 8 * h)\n",
"bunch.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}