{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Lets-Plot in 2020" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparation" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:15:52.749851Z", "iopub.status.busy": "2024-04-26T12:15:52.749851Z", "iopub.status.idle": "2024-04-26T12:15:56.088960Z", "shell.execute_reply": "2024-04-26T12:15:56.088960Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: colorcet in d:\\anaconda3\\envs\\lets-plot-docs\\lib\\site-packages (3.1.0)\n" ] } ], "source": [ "from sys import executable\n", "!{executable} -m pip install colorcet" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:15:56.104579Z", "iopub.status.busy": "2024-04-26T12:15:56.104579Z", "iopub.status.idle": "2024-04-26T12:15:57.315355Z", "shell.execute_reply": "2024-04-26T12:15:57.315355Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import colorcet as cc\n", "from PIL import Image\n", "\n", "from lets_plot import *\n", "from lets_plot.bistro.corr import *" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:15:57.315355Z", "iopub.status.busy": "2024-04-26T12:15:57.315355Z", "iopub.status.idle": "2024-04-26T12:15:57.331225Z", "shell.execute_reply": "2024-04-26T12:15:57.331225Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:15:57.331225Z", "iopub.status.busy": "2024-04-26T12:15:57.331225Z", "iopub.status.idle": "2024-04-26T12:15:57.756037Z", "shell.execute_reply": "2024-04-26T12:15:57.756037Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
author_dateauthor_namefiles_changedinsertionsdeletionsdiffmonthdayweekdayhour
02020-01-06 14:43:17+00:00Ivan Kupriyanov131216014
12020-01-06 17:42:31+00:00Ivan Kupriyanov1326656125316017
22020-01-09 11:47:11+00:00Ivan Seleznev11111019311
32020-01-09 15:32:34+00:00Ivan Kupriyanov63540-519315
42020-01-10 12:35:09+00:00Ivan Seleznev429922277110412
\n", "
" ], "text/plain": [ " author_date author_name files_changed insertions \\\n", "0 2020-01-06 14:43:17+00:00 Ivan Kupriyanov 1 3 \n", "1 2020-01-06 17:42:31+00:00 Ivan Kupriyanov 132 665 \n", "2 2020-01-09 11:47:11+00:00 Ivan Seleznev 1 11 \n", "3 2020-01-09 15:32:34+00:00 Ivan Kupriyanov 6 35 \n", "4 2020-01-10 12:35:09+00:00 Ivan Seleznev 4 299 \n", "\n", " deletions diff month day weekday hour \n", "0 1 2 1 6 0 14 \n", "1 612 53 1 6 0 17 \n", "2 1 10 1 9 3 11 \n", "3 40 -5 1 9 3 15 \n", "4 22 277 1 10 4 12 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/lets_plot_git_history.csv\", sep=';')\n", "\n", "df = df[['author_date', 'author_name', 'files_changed', 'insertions', 'deletions']]\n", "df.author_date = pd.to_datetime(df.author_date, utc=True)\n", "df.files_changed = df.files_changed.str.split(' ').str[0].astype(int)\n", "df.insertions = df.insertions.str.split(' ').str[0].astype(int)\n", "df.deletions = df.deletions.fillna('0').str.split(' ').str[0].astype(int)\n", "\n", "df['diff'] = df.insertions - df.deletions\n", "df['month'] = df.author_date.dt.month\n", "df['day'] = df.author_date.dt.day\n", "df['weekday'] = df.author_date.dt.weekday\n", "df['hour'] = df.author_date.dt.hour\n", "\n", "df = df[df.author_date.dt.year == 2020].sort_values(by='author_date').reset_index(drop=True)\n", "\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### General Analytics" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:15:57.756037Z", "iopub.status.busy": "2024-04-26T12:15:57.756037Z", "iopub.status.idle": "2024-04-26T12:15:57.771631Z", "shell.execute_reply": "2024-04-26T12:15:57.771631Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
author_namecommits_number
sum
0Igor Alshannikov337
1Ivan Kupriyanov303
2Valentin Dovidaytis92
3Olga Larionova53
4Ivan Ryabov28
5Ivan Seleznev27
6Ilya Krukov23
7Alla Redko7
8Mikhail Koroteev3
9Henrikh Kantuni1
\n", "
" ], "text/plain": [ " author_name commits_number\n", " sum\n", "0 Igor Alshannikov 337\n", "1 Ivan Kupriyanov 303\n", "2 Valentin Dovidaytis 92\n", "3 Olga Larionova 53\n", "4 Ivan Ryabov 28\n", "5 Ivan Seleznev 27\n", "6 Ilya Krukov 23\n", "7 Alla Redko 7\n", "8 Mikhail Koroteev 3\n", "9 Henrikh Kantuni 1" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.author_name.value_counts().to_frame(('commits_number', 'sum')).reset_index()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:15:57.771631Z", "iopub.status.busy": "2024-04-26T12:15:57.771631Z", "iopub.status.idle": "2024-04-26T12:15:57.912916Z", "shell.execute_reply": "2024-04-26T12:15:57.912916Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "agg_features = {'files_changed': ['sum', 'mean'], \\\n", " 'insertions': ['sum', 'mean'], \\\n", " 'deletions': ['sum', 'mean'], \\\n", " 'diff': ['sum']}\n", "agg_df = df.groupby('author_name').agg(agg_features).reset_index()\n", "agg_features['commits_number'] = ['sum']\n", "agg_df = pd.merge(agg_df.sort_index(axis=1, level=0), \\\n", " df.author_name.value_counts().to_frame(('commits_number', 'sum')).reset_index().sort_index(axis=1, level=0), \\\n", " left_on='author_name', right_on='author_name')\n", "agg_df['color'] = cc.palette['glasbey_bw'][:agg_df.shape[0]]\n", "\n", "plots = []\n", "for feature, agg in [(key, val) for key, vals in agg_features.items() for val in vals]:\n", " agg_df = agg_df.sort_values(by=(feature, agg), ascending=False)\n", " aes_name = ('total {0}' if agg == 'sum' else 'mean {0} per commit').format(feature.replace('_', ' '))\n", " plotted_df = agg_df[[('author_name', ''), (feature, agg), ('color', '')]]\n", " plotted_df.columns = plotted_df.columns.get_level_values(0)\n", " plots.append(ggplot(plotted_df) + \\\n", " geom_bar(aes(x='author_name', y=feature, color='color', fill='color'), \\\n", " stat='identity', alpha=.25, size=1, \\\n", " tooltips=layer_tooltips().line('^x')\n", " .line('{0}|^y'.format(aes_name))) + \\\n", " scale_color_identity() + scale_fill_identity() + \\\n", " xlab('') + ylab('') + \\\n", " ggtitle(aes_name.title()))\n", "\n", "w, h = 400, 300\n", "bunch = GGBunch()\n", "bunch.add_plot(plots[7], 0, 0, w, h)\n", "bunch.add_plot(plots[6], w, 0, w, h)\n", "bunch.add_plot(plots[0], 0, h, w, h)\n", "bunch.add_plot(plots[1], w, h, w, h)\n", "bunch.add_plot(plots[2], 0, 2 * h, w, h)\n", "bunch.add_plot(plots[3], w, 2 * h, w, h)\n", "bunch.add_plot(plots[4], 0, 3 * h, w, h)\n", "bunch.add_plot(plots[5], w, 3 * h, w, h)\n", "bunch.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Looking at the total values, we clearly see that 
Igor Alshannikov and Ivan Kupriyanov outcompete the rest. But there is a real intrigue as to who takes the third place.\n", "\n", "Meanwhile, we see more diversity in mean values of different contribution types." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:15:57.912916Z", "iopub.status.busy": "2024-04-26T12:15:57.912916Z", "iopub.status.idle": "2024-04-26T12:15:57.929058Z", "shell.execute_reply": "2024-04-26T12:15:57.929058Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df.hour.value_counts().to_frame('count').reset_index().sort_values(by='hour')) + \\\n", " geom_histogram(aes(x='hour', y='count', color='hour', fill='hour'), \\\n", " stat='identity', show_legend=False, \\\n", " tooltips=layer_tooltips().line('^y')) + \\\n", " scale_x_discrete(breaks=list(range(24))) + \\\n", " scale_color_gradient(low='#e0ecf4', high='#8856a7') + \\\n", " scale_fill_gradient(low='#e0ecf4', high='#8856a7') + \\\n", " ylab('commits number') + \\\n", " ggtitle('Total Hourly Committing') + ggsize(600, 450)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The peak of commit activity is around 6 p.m. (18:00). The evening seems to be a good time to save daily results." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Higher Resolution" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:15:57.929058Z", "iopub.status.busy": "2024-04-26T12:15:57.929058Z", "iopub.status.idle": "2024-04-26T12:15:57.976156Z", "shell.execute_reply": "2024-04-26T12:15:57.976156Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df[df.insertions > 0]) + \\\n", " geom_lollipop(aes(x='author_date', y='insertions', fill='month'), shape=21, fatten=1, color='black', \\\n", " tooltips=layer_tooltips().line('@author_name').line('@|@insertions').line('@|@month')) + \\\n", " scale_x_datetime(name='date') + \\\n", " scale_y_log10(name='insertions (log)') + \\\n", " scale_fill_brewer(name='', type='qual', palette='Accent') + \\\n", " facet_grid(y='author_name')+ \\\n", " ggtitle('Lollipop Plot of Commits by Authors') + ggsize(800, 1000)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Some of the team members started their work only a few months ago, so they still have time to catch up next year." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:15:57.976156Z", "iopub.status.busy": "2024-04-26T12:15:57.976156Z", "iopub.status.idle": "2024-04-26T12:15:58.023753Z", "shell.execute_reply": "2024-04-26T12:15:58.023017Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df) + \\\n", " geom_point(aes(x='weekday', y='insertions', color='author_name', size='files_changed'), \\\n", " shape=8, alpha=.75, position='jitter', show_legend=False, \\\n", " tooltips=layer_tooltips().line('author|@author_name')\n", " .line('@|@insertions')\n", " .line('@|@deletions')\n", " .line('files changed|@files_changed')) + \\\n", " scale_x_discrete(labels=['Monday', 'Tuesday', 'Wednesday', 'Thursday', \\\n", " 'Friday', 'Saturday', 'Sunday']) + \\\n", " scale_y_log10(breaks=[2 ** n for n in range(16)]) + \\\n", " scale_color_brewer(type='qual', palette='Pastel1') + \\\n", " scale_size(range=[3, 7], trans='sqrt') + \\\n", " ggtitle('All Commits') + ggsize(800, 600) + \\\n", " theme(axis_tooltip='blank')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Usually no one works at the weekend. But if something needs to be done, it gets done." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### And Finally..."
] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:15:58.023753Z", "iopub.status.busy": "2024-04-26T12:15:58.023753Z", "iopub.status.idle": "2024-04-26T12:15:58.039038Z", "shell.execute_reply": "2024-04-26T12:15:58.039038Z" } }, "outputs": [], "source": [ "r = df.groupby('day').insertions.median().values\n", "x = r * np.cos(np.linspace(0, 2 * np.pi, r.size))\n", "y = r * np.sin(np.linspace(0, 2 * np.pi, r.size))\n", "daily_insertions_df = pd.DataFrame({'x': x, 'y': y})" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:15:58.039038Z", "iopub.status.busy": "2024-04-26T12:15:58.039038Z", "iopub.status.idle": "2024-04-26T12:15:58.101728Z", "shell.execute_reply": "2024-04-26T12:15:58.101728Z" } }, "outputs": [], "source": [
"MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']\n",
"mask_width, mask_height = 60, 80\n",
"\n",
"# Resize the bitmap to the grid size and binarize it: pixels whose mean is above 255 / 2 become 0, the rest 1.\n",
"mask = np.array(Image.open(\"images/snowman_mask.bmp\").resize((mask_width, mask_height), Image.Resampling.BILINEAR))\n",
"grid = [[(0 if color.mean() > 255 / 2 else 1) for color in row] for row in mask]\n",
"\n",
"# Long format: one row per grid cell with its (y, x) position; y is mirrored so image row 0 gets the largest y.\n",
"grid_df = pd.DataFrame(grid).stack().to_frame('month_id')\n",
"grid_df.index.set_names(['y', 'x'], inplace=True)\n",
"grid_df = grid_df.reset_index()\n",
"grid_df.y = grid_df.y.max() - grid_df.y\n",
"# Keep only the cells marked 1.\n",
"grid_df = grid_df[grid_df['month_id'] > 0].reset_index(drop=True)\n",
"\n",
"# Split the kept cells between months in proportion to each month's share of commits.\n",
"agg_df = np.round(df.month.value_counts() * grid_df.shape[0] / df.shape[0]).to_frame('commits_number')\n",
"# Rounding can leave the total off by a few cells; the first row (value_counts() sorts descending) absorbs\n",
"# the remainder. This must be a single .loc assignment: the chained form `agg_df.iloc[0].commits_number += ...`\n",
"# updates a temporary row and is silently lost when pandas Copy-on-Write is enabled.\n",
"agg_df.loc[agg_df.index[0], 'commits_number'] += grid_df.shape[0] - agg_df.commits_number.sum()\n",
"agg_df.commits_number = agg_df.commits_number.astype(int)\n",
"agg_df.index.name = 'month_id'\n",
"agg_df = agg_df.reset_index()\n",
"\n",
"# Label consecutive runs of cells with a month name and that month's cell count.\n",
"grid_df['commits_number'] = 0\n",
"start_idx = 0\n",
"for _, (month, commits_number) in agg_df.iterrows():\n",
"    if commits_number <= 0:\n",
"        continue\n",
"    # .loc label slices include the end label, hence the - 1 so that neighbouring months do not overlap.\n",
"    rows = slice(start_idx, start_idx + commits_number - 1)\n",
"    grid_df.loc[rows, 'month'] = MONTHS[month - 1]\n",
"    grid_df.loc[rows, 'commits_number'] = commits_number\n",
"    start_idx += commits_number" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2024-04-26T12:15:58.103038Z", "iopub.status.busy": "2024-04-26T12:15:58.103038Z", "iopub.status.idle": "2024-04-26T12:15:58.164885Z", "shell.execute_reply": "2024-04-26T12:15:58.164885Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"blank_theme = theme_void() + theme(axis_ticks_length=0, legend_position='none')\n",
"\n",
"\n",
"def corr_tiles(columns, flip):\n",
"    \"\"\"Build a blank-themed lower-triangle correlation tile plot for the given `df` columns.\n",
"\n",
"    columns -- list of `df` column names, in the order they are handed to corr_plot\n",
"    flip    -- forwarded unchanged to corr_plot(flip=...)\n",
"    \"\"\"\n",
"    return (\n",
"        corr_plot(data=df[columns], flip=flip)\n",
"        .tiles(type='lower', diag=True)\n",
"        .palette_gradient(low='blue', mid='green', high='darkgreen')\n",
"        .build()\n",
"        + blank_theme\n",
"    )\n",
"\n",
"\n",
"# Polygon traced from the per-day median insertions.\n",
"ps = (\n",
"    ggplot(daily_insertions_df, aes(x='x', y='y'))\n",
"    + geom_polygon(color='#f03b20', fill='#fd8d3c', size=1)\n",
"    + coord_fixed()\n",
"    + blank_theme\n",
")\n",
"\n",
"# Three left/right pairs of correlation triangles; each right-hand plot takes the\n",
"# left-hand columns in reverse order with flip=True.\n",
"p1l = corr_tiles(['insertions', 'deletions'], flip=False)\n",
"p1r = corr_tiles(['deletions', 'insertions'], flip=True)\n",
"p2l = corr_tiles(['insertions', 'deletions', 'diff'], flip=False)\n",
"p2r = corr_tiles(['diff', 'deletions', 'insertions'], flip=True)\n",
"p3l = corr_tiles(['insertions', 'deletions', 'diff', 'files_changed'], flip=False)\n",
"p3r = corr_tiles(['files_changed', 'diff', 'deletions', 'insertions'], flip=True)\n",
"\n",
"# Greeting text.\n",
"pt = (\n",
"    ggplot({'x': [0], 'y': [0], 'greetings': ['Happy New Year!']}, aes(x='x', y='y'))\n",
"    + geom_text(aes(label='greetings'), color='blue', size=20, family='Times New Roman', fontface='bold')\n",
"    + blank_theme\n",
")\n",
"\n",
"# Mask grid: one tile per kept cell, filled by the month assigned to it.\n",
"pm = (\n",
"    ggplot(grid_df, aes(x='x', y='y'))\n",
"    + geom_tile(aes(fill='month'), width=.8, height=.8,\n",
"                tooltips=layer_tooltips().line('@|@month').line('@|@commits_number'))\n",
"    + scale_fill_brewer(type='qual', palette='Set2')\n",
"    + blank_theme\n",
")\n",
"\n",
"# Place everything on a grid of w x h units; add_plot arguments are (plot, x, y, width, height).\n",
"w, h = 50, 50\n",
"bunch = GGBunch()\n",
"bunch.add_plot(ps, 3 * w, 0, 2 * w, 2 * h)\n",
"bunch.add_plot(p1l, 2 * w, 2 * h, 2 * w, 2 * h)\n",
"bunch.add_plot(p1r, 4 * w, 2 * h, 2 * w, 2 * h)\n",
"bunch.add_plot(p2l, w, 4 * h, 3 * w, 3 * h)\n",
"bunch.add_plot(p2r, 4 * w, 4 * h, 3 * w, 3 * h)\n",
"bunch.add_plot(p3l, 0, 7 * h, 4 * w, 4 * h)\n",
"bunch.add_plot(p3r, 4 * w, 7 * h, 4 * w, 4 * h)\n",
"bunch.add_plot(pt, 0, 11 * h, 16 * w, 2 * h)\n",
"bunch.add_plot(pm, 8 * w, 3 * h, 8 * w, 8 * h)\n",
"bunch.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 4 }