{ "cells": [ { "cell_type": "markdown", "id": "8364c581", "metadata": {}, "source": [ "# Lets-Plot in 2021" ] }, { "cell_type": "markdown", "id": "18b0b501", "metadata": {}, "source": [ "## Preparation" ] }, { "cell_type": "code", "execution_count": 1, "id": "15092fb8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "\n", "from lets_plot import *\n", "from lets_plot.mapping import as_discrete\n", "LetsPlot.setup_html()" ] }, { "cell_type": "code", "execution_count": 2, "id": "bfe2a766", "metadata": {}, "outputs": [], "source": [ "WINTER_PALETTE = {\n", " 'cold': ['#ffffff', '#97b7d8', '#4177a1', '#172540'],\n", " 'warm': ['#fdd0a2', '#fdae6b', '#f16913', '#d94801'],\n", "}" ] }, { "cell_type": "code", "execution_count": 3, "id": "7a7093c2", "metadata": {}, "outputs": [], "source": [ "def winter_theme(tiles=False):\n", " winter_line_light = element_line(color=WINTER_PALETTE['cold'][1])\n", " winter_line_dark = element_line(color=WINTER_PALETTE['cold'][3])\n", " winter_text = element_text(color=WINTER_PALETTE['cold'][3])\n", " return theme_none() + theme(\n", " axis_line_x=(winter_line_dark if not tiles else 'blank'),\n", " axis_line_y='blank',\n", " axis_ticks=winter_line_light,\n", " axis_text=winter_text,\n", " axis_title=winter_text,\n", " line=winter_line_light,\n", " rect=element_rect(color=WINTER_PALETTE['cold'][1], fill=WINTER_PALETTE['cold'][0]),\n", " panel_background=(None if not tiles else 'blank'),\n", " text=winter_text,\n", " panel_grid=(winter_line_light if not tiles else 'blank'),\n", " panel_grid_minor='blank',\n", " axis_tooltip=element_rect(color=WINTER_PALETTE['cold'][0], fill=WINTER_PALETTE['cold'][3])\n", " )" ] }, { "cell_type": "code", "execution_count": 4, "id": "0d0290e0", "metadata": {}, "outputs": [], "source": [ "def style_getter(d):\n", " def style(keys=None):\n", " if not keys:\n", " return d\n", " if type(keys) is list:\n", " return {k: v for k, v in d.items() if k in keys}\n", " else:\n", " return d[keys] if keys in d.keys() else d\n", " return style\n", "\n", "def geom_style(keys=None, is_point=False):\n", " style = {\n", " 'color': WINTER_PALETTE['warm'][2],\n", " 'fill': WINTER_PALETTE['warm'][1],\n", " 'outlier_color': WINTER_PALETTE['warm'][2],\n", " 'outlier_fill': WINTER_PALETTE['warm'][1],\n", " 'shape': 21,\n", " 'size': 3 if is_point else 1,\n", " 'outlier_shape': 21,\n", " 'outlier_size': 3,\n", " }\n", " return style_getter(style)(keys)\n", "\n", "def scale_continuous_style(keys=None):\n", " return style_getter({\n", " 'low': WINTER_PALETTE['warm'][0],\n", " 'high': WINTER_PALETTE['warm'][3],\n", " })(keys)" ] }, { "cell_type": "code", "execution_count": 5, "id": "1356f913", "metadata": {}, "outputs": [], "source": [ "def plot_size(size_type=None):\n", " if size_type == 'big':\n", " return ggsize(720, 540)\n", " return ggsize(600, 450)" ] }, { "cell_type": "code", "execution_count": 6, "id": "6d67d35e", "metadata": {}, "outputs": [], "source": [ "def get_data(path_to_csv):\n", " from datetime import datetime\n", "\n", " EXTENSIONS_TOP_SIZE = 3\n", "\n", " df = pd.read_csv(path_to_csv)\n", " df = df[df.extension.isin(df.extension.value_counts().iloc[:EXTENSIONS_TOP_SIZE].index)]\n", " df.creation_date = pd.to_datetime(df.creation_date)\n", " df.modification_date = pd.to_datetime(df.modification_date)\n", " df = df[df.creation_date >= datetime(2021, 1, 1)]\n", "\n", " return df.reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 7, "id": "7bdbb5ac", "metadata": {}, "outputs": [], "source": [ "def dt_breaks(df, dt_column, breaks_count=5):\n", " import numpy as np\n", "\n", " return pd.to_datetime(np.linspace(\n", " df[dt_column].min().value,\n", " df[dt_column].max().value,\n", " breaks_count\n", " )).round('1s').to_list()" ] }, { "cell_type": "code", "execution_count": 8, "id": "fdd4102f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(389, 12)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
filenamedirectoryextensionlines_countwords_countsymbols_countcommits_countauthors_countcreation_datecreation_authormodification_datemodification_author
0DomEventMapperbase/src/jsMain/kotlin/jetbrains/datalore/base...kt1103304011112021-09-28 22:47:31I. Kupriyanov2021-09-28 22:47:31I. Kupriyanov
1DateTimeFormatbase-portable/src/commonMain/kotlin/jetbrains/...kt1304435221312021-08-18 00:34:33O. Larionova2021-11-25 13:06:51O. Larionova
2EventHandlerExtbase-portable/src/commonMain/kotlin/jetbrains/...kt1551431112021-09-27 22:29:05I. Kupriyanov2021-09-27 22:29:05I. Kupriyanov
3AdaptiveResamplingbase-portable/src/commonMain/kotlin/jetbrains/...kt832762381112021-08-23 14:01:25I. Kupriyanov2021-08-23 14:01:25I. Kupriyanov
4Transformsbase-portable/src/commonMain/kotlin/jetbrains/...kt1073203611112021-08-23 14:01:25I. Kupriyanov2021-08-23 14:01:25I. Kupriyanov
\n", "
" ], "text/plain": [ " filename directory \\\n", "0 DomEventMapper base/src/jsMain/kotlin/jetbrains/datalore/base... \n", "1 DateTimeFormat base-portable/src/commonMain/kotlin/jetbrains/... \n", "2 EventHandlerExt base-portable/src/commonMain/kotlin/jetbrains/... \n", "3 AdaptiveResampling base-portable/src/commonMain/kotlin/jetbrains/... \n", "4 Transforms base-portable/src/commonMain/kotlin/jetbrains/... \n", "\n", " extension lines_count words_count symbols_count commits_count \\\n", "0 kt 110 330 4011 1 \n", "1 kt 130 443 5221 3 \n", "2 kt 15 51 431 1 \n", "3 kt 83 276 2381 1 \n", "4 kt 107 320 3611 1 \n", "\n", " authors_count creation_date creation_author modification_date \\\n", "0 1 2021-09-28 22:47:31 I. Kupriyanov 2021-09-28 22:47:31 \n", "1 1 2021-08-18 00:34:33 O. Larionova 2021-11-25 13:06:51 \n", "2 1 2021-09-27 22:29:05 I. Kupriyanov 2021-09-27 22:29:05 \n", "3 1 2021-08-23 14:01:25 I. Kupriyanov 2021-08-23 14:01:25 \n", "4 1 2021-08-23 14:01:25 I. Kupriyanov 2021-08-23 14:01:25 \n", "\n", " modification_author \n", "0 I. Kupriyanov \n", "1 O. Larionova \n", "2 I. Kupriyanov \n", "3 I. Kupriyanov \n", "4 I. Kupriyanov " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = get_data(\"../data/lets-plot_2.2.1.csv\")\n", "print(df.shape)\n", "df.head()" ] }, { "cell_type": "markdown", "id": "e0bdaed7", "metadata": {}, "source": [ "The data comes from the [lets-plot repo](https://github.com/JetBrains/lets-plot), each row provides information about a project file.\n", "\n", "Some extra information was extracted using Git:\n", "\n", "- `commits_count` - the number of commits corresponding to the file (according to the `git log` command);\n", "- `authors_count` - the total number of different authors;\n", "- `creation_date` - the date and time of the commit in which the file appeared;\n", "- `creation_author` - the author of the commit in which the file appeared;\n", "- `modification_date` - the date and time of the last commit corresponding to the file;\n", "- `modification_author` - the author of the last commit corresponding to the file." ] }, { "cell_type": "markdown", "id": "93f8e362", "metadata": {}, "source": [ "## Lets-Plot files exploration" ] }, { "cell_type": "code", "execution_count": 9, "id": "1e9dd6aa", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df, aes(as_discrete(\"extension\", order_by='..count..'))) + \\\n", " geom_bar(**geom_style()) + \\\n", " scale_y_log10() + \\\n", " ylab(\"number of files (log)\") + \\\n", " ggtitle(\"Number of project files by extension\") + \\\n", " winter_theme() + \\\n", " plot_size()" ] }, { "cell_type": "markdown", "id": "729794e5", "metadata": {}, "source": [ "Obviously, most Lets-Plot files are written in Kotlin." ] }, { "cell_type": "code", "execution_count": 10, "id": "fd070c73", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df, aes(as_discrete(\"extension\", order_by=\"..middle..\"), \"words_count\")) + \\\n", " geom_boxplot(**geom_style()) + \\\n", " scale_y_log10() + \\\n", " ylab(\"number of words (log)\") + \\\n", " ggtitle(\"Number of words per file\") + \\\n", " winter_theme() + \\\n", " plot_size()" ] }, { "cell_type": "code", "execution_count": 11, "id": "601a901a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df, aes(as_discrete(\"extension\", order_by=\"..middle..\"), \"commits_count\")) + \\\n", " geom_boxplot(**geom_style()) + \\\n", " scale_y_log10() + \\\n", " ylab(\"number of commits (log)\") + \\\n", " ggtitle(\"Number of commits per file\") + \\\n", " winter_theme() + \\\n", " plot_size()" ] }, { "cell_type": "code", "execution_count": 12, "id": "da5e83d8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df, aes(x=as_discrete(\"creation_author\", label=\"author\", order_by='..count..'))) + \\\n", " geom_bar(**geom_style()) + \\\n", " ylab(\"number of files created\") + \\\n", " ggtitle(\"Number of project files by creators\") + \\\n", " winter_theme() + \\\n", " plot_size('big')" ] }, { "cell_type": "markdown", "id": "716577e2", "metadata": {}, "source": [ "The undisputed leader in creating new files for the project in 2021 is Ivan Kupriyanov." ] }, { "cell_type": "code", "execution_count": 13, "id": "fa7cc7f0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df, aes(\"creation_author\", \"extension\")) + \\\n", " geom_bin2d(color=scale_continuous_style('high'), size=geom_style('size')) + \\\n", " scale_fill_gradient(name=\"number of files (log)\", trans='log10', **scale_continuous_style()) + \\\n", " xlab(\"file creator\") + \\\n", " ggtitle(\"Popularity of extensions among authors\") + \\\n", " winter_theme(tiles=True) + \\\n", " plot_size('big')" ] }, { "cell_type": "code", "execution_count": 14, "id": "54cfb6bb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df, aes(\"authors_count\", \"commits_count\")) + \\\n", " geom_bin2d(binwidth=(1, 5), color=scale_continuous_style('high'), size=geom_style('size')) + \\\n", " coord_fixed(ratio=.2) + \\\n", " scale_x_continuous(breaks=list(range(5))) + \\\n", " scale_fill_gradient(name=\"number of files (log)\", trans='log10', **scale_continuous_style()) + \\\n", " xlab(\"number of authors\") + ylab(\"number of commits\") + \\\n", " ggtitle(\"Correlation between numbers of authors and commits\") + \\\n", " winter_theme(tiles=True) + \\\n", " plot_size()" ] }, { "cell_type": "markdown", "id": "676d02c0", "metadata": {}, "source": [ "Most of the files have been created or edited by a small number of authors, not too many times." ] }, { "cell_type": "code", "execution_count": 15, "id": "d3fa38e9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df[(df.authors_count > 3)|(df.commits_count >= 10)], \\\n", " aes(\"lines_count\", \"words_count\")) + \\\n", " geom_point(aes(fill=\"extension\", size=\"symbols_count\"), \\\n", " color='black', shape=21, alpha=.7, \\\n", " tooltips=layer_tooltips().line(\"@{directory}@filename.@extension\")\\\n", " .line(\"lines number|@lines_count\")\\\n", " .line(\"words number|@words_count\")\\\n", " .line(\"symbols number|@symbols_count\")\\\n", " .line(\"commits number|@commits_count\")\\\n", " .line(\"authors number|@authors_count\")\\\n", " .line(\"creator|@creation_author\")\\\n", " .line(\"most recent commit author|@modification_author\")\\\n", " .anchor('bottom_center')) + \\\n", " scale_x_log10() + scale_y_log10() + \\\n", " scale_fill_brewer(type='qual', palette='Pastel1') + \\\n", " scale_size(trans='log10', guide='none') + \\\n", " xlim(32, 1024) + ylim(64, 4096) + \\\n", " xlab(\"number of lines (log)\") + ylab(\"number of words (log)\") + \\\n", " ggtitle(\"Most popular files (with the largest number of authors or commits)\") + \\\n", " winter_theme() + \\\n", " plot_size('big')" ] }, { "cell_type": "markdown", "id": "b05e0a06", "metadata": {}, "source": [ "Among the files that have often been edited, the biggest one is the `python-package/lets_plot/geo_data/geocoder.py`.\n", "\n", "Obviously, there is a correlation between the number of lines and the number of words in the file." ] }, { "cell_type": "code", "execution_count": 16, "id": "79cc5760", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df[df.creation_date == df.modification_date], \\\n", " aes(\"creation_author\", \"words_count\", fill=\"creation_author\")) + \\\n", " geom_violin(size=geom_style('size')) + \\\n", " geom_boxplot(size=geom_style('size'), fill='white', width=.2) + \\\n", " facet_grid(x=\"extension\") + \\\n", " coord_flip() + \\\n", " scale_y_log10(breaks=[2**n for n in range(5, 12)]) + \\\n", " scale_fill_brewer(type='qual', palette='Pastel1', guide='none') + \\\n", " xlab(\"creator\") + ylab(\"number of words (log)\") + \\\n", " ggtitle(\"Files that never changed after they were created\") + \\\n", " winter_theme() + \\\n", " plot_size('big')" ] }, { "cell_type": "markdown", "id": "5ae77d6f", "metadata": {}, "source": [ "If we look at the files that have never been changed after they were created, we will see that those are mostly Kotlin files and are usually not too big." ] }, { "cell_type": "code", "execution_count": 17, "id": "5810e753", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ggplot(df.sort_values(by=\"creation_date\").groupby(\"creation_date\").count().iloc[:, 0].cumsum()\\\n", " .to_frame(\"count\").reset_index(), aes(\"creation_date\", \"count\")) + \\\n", " geom_area(stat=\"identity\",\n", " tooltips=layer_tooltips().line(\"files count|^y\")\\\n", " .format(\"@creation_date\", \"%m/%d/%Y\")\\\n", " .line(\"creation date|^x\"),\n", " **geom_style()) + \\\n", " scale_x_datetime(breaks=dt_breaks(df, 'creation_date'), format='%b %Y') + \\\n", " xlab(\"file creation date\") + ylab(\"number of files\") + \\\n", " ggtitle(\"Increasing number of new files in 2021\") + \\\n", " winter_theme() + \\\n", " plot_size('big')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 5 }