{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from siuba import *\n", "from siuba import meta_hook\n", "\n", "import pandas as pd\n", "\n", "from pandas import DataFrame, Series" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df = DataFrame({\n", " \"repo\": [\"pandas\", \"dplyr\", \"ggplot2\", \"plotnine\"],\n", " \"owner\": [\"pandas-dev\", \"tidyverse\", \"tidyverse\", \"has2k1\"],\n", " \"language\": [\"python\", \"R\", \"R\", \"python\"],\n", " \"stars\": [17800, 2800, 3500, 1450],\n", " \"x\": [1,2,3,None]\n", " })" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## mutate" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from pandas.core.groupby import DataFrameGroupBy" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>repo</th>\n", " <th>owner</th>\n", " <th>language</th>\n", " <th>stars</th>\n", " <th>x</th>\n", " <th>rel_stars1</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>pandas</td>\n", " <td>pandas-dev</td>\n", " <td>python</td>\n", " <td>17800</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>dplyr</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>2800</td>\n", " <td>2.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>ggplot2</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>3500</td>\n", " <td>3.0</td>\n", " <td>700</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>plotnine</td>\n", " <td>has2k1</td>\n", " <td>python</td>\n", " <td>1450</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " repo owner language stars x rel_stars1\n", "0 pandas pandas-dev python 17800 1.0 0\n", "1 dplyr tidyverse R 2800 2.0 0\n", "2 ggplot2 tidyverse R 3500 3.0 700\n", "3 plotnine has2k1 python 1450 NaN 0" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gdf = group_by(df, 'language', \"owner\")\n", "\n", "out = mutate(gdf, rel_stars1 = _.stars - _.stars.min())\n", "\n", "ungroup(out)\n", "\n", "df.siu_group_by(\"language\", \"owner\").siu_mutate(rel_stars1 = _.stars - _.stars.min()).siu_ungroup()\n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>repo</th>\n", " <th>owner</th>\n", " <th>language</th>\n", " <th>stars</th>\n", " <th>x</th>\n", " <th>rel_stars1</th>\n", " <th>rel_stars2</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>pandas</td>\n", " <td>pandas-dev</td>\n", " <td>python</td>\n", " <td>17800</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " <td>35600</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>dplyr</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>2800</td>\n", " <td>2.0</td>\n", " <td>0</td>\n", " <td>5600</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>ggplot2</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>3500</td>\n", " <td>3.0</td>\n", " <td>700</td>\n", " <td>7000</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>plotnine</td>\n", " <td>has2k1</td>\n", " <td>python</td>\n", " <td>1450</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>2900</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " repo owner language stars x rel_stars1 rel_stars2\n", "0 pandas pandas-dev python 17800 1.0 0 35600\n", "1 dplyr tidyverse R 2800 2.0 0 5600\n", "2 ggplot2 tidyverse R 3500 3.0 700 7000\n", "3 plotnine has2k1 python 1450 NaN 0 2900" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ungroup(mutate(out, rel_stars2 = _.stars + _.stars))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>repo</th>\n", " <th>owner</th>\n", " <th>language</th>\n", " <th>stars</th>\n", " <th>x</th>\n", " <th>rel_stars1</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>pandas</td>\n", " <td>pandas-dev</td>\n", " <td>python</td>\n", " <td>17800</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>dplyr</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>2800</td>\n", " <td>2.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>ggplot2</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>3500</td>\n", " <td>3.0</td>\n", " <td>700</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>plotnine</td>\n", " <td>has2k1</td>\n", " <td>python</td>\n", " <td>1450</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " repo owner language stars x rel_stars1\n", "0 pandas pandas-dev python 17800 1.0 0\n", "1 dplyr tidyverse R 2800 2.0 0\n", "2 ggplot2 tidyverse R 3500 3.0 700\n", "3 plotnine has2k1 python 1450 NaN 0" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df \\\n", " .siu_group_by(\"language\", \"owner\") \\\n", " .siu_mutate(rel_stars1 = _.stars - _.stars.min()) \\\n", " .siu_ungroup()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## filter" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>repo</th>\n", " <th>owner</th>\n", " <th>language</th>\n", " <th>stars</th>\n", " <th>x</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>pandas</td>\n", " <td>pandas-dev</td>\n", " <td>python</td>\n", " <td>17800</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>ggplot2</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>3500</td>\n", " <td>3.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " repo owner language stars x\n", "0 pandas pandas-dev python 17800 1.0\n", "1 ggplot2 tidyverse R 3500 3.0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# TODO: change name filter to query?\n", "\n", "# regular filter\n", "filter(df, _.stars > 3000, _.stars < 15000)\n", "\n", "# grouped filter\n", "gdf = group_by(df, \"language\")\n", "\n", "ungroup(filter(gdf, _.stars != _.stars.min()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## summarize" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>language</th>\n", " <th>ttl_stars</th>\n", " <th>wat</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>R</td>\n", " <td>6300</td>\n", " <td>2800</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>python</td>\n", " <td>19250</td>\n", " <td>1450</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " language ttl_stars wat\n", "0 R 6300 2800\n", "1 python 19250 1450" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# summarize DataFrame\n", "summarize(df, min_stars = _.stars.min())\n", "\n", "# summarize grouped DataFrame\n", "gdf = group_by(df, \"language\")\n", "\n", "summarize(gdf, ttl_stars = _.stars.sum(), wat = _.stars.min())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## transmute" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>language</th>\n", " <th>repo</th>\n", " <th>rel_stars1</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>python</td>\n", " <td>pandas</td>\n", " <td>16350</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>R</td>\n", " <td>dplyr</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>R</td>\n", " <td>ggplot2</td>\n", " <td>700</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>python</td>\n", " <td>plotnine</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " language repo rel_stars1\n", "0 python pandas 16350\n", "1 R dplyr 0\n", "2 R ggplot2 700\n", "3 python plotnine 0" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transmute(df, \"repo\", rel_stars1 = _.stars - _.stars.min())\n", "\n", "ungroup(transmute(gdf, \"repo\", rel_stars1 = _.stars - _.stars.min()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## select" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>y</th>\n", " <th>repo</th>\n", " <th>owner</th>\n", " <th>stars</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1.0</td>\n", " <td>pandas</td>\n", " <td>pandas-dev</td>\n", " <td>17800</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2.0</td>\n", " <td>dplyr</td>\n", " <td>tidyverse</td>\n", " <td>2800</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3.0</td>\n", " <td>ggplot2</td>\n", " <td>tidyverse</td>\n", " <td>3500</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>NaN</td>\n", " <td>plotnine</td>\n", " <td>has2k1</td>\n", " <td>1450</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " y repo owner stars\n", "0 1.0 pandas pandas-dev 17800\n", "1 2.0 dplyr tidyverse 2800\n", "2 3.0 ggplot2 tidyverse 3500\n", "3 NaN plotnine has2k1 1450" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# thoughts:\n", "# + can use dynamic values, e.g. colname == .x\n", "# + if select implements some name class, then nothing magic happening\n", "# e.g. _.y == _.x is equivalent to lambda cols: cols.y == cols.x\n", "# - long winded (==, _.y seems harder to read than \"y\")\n", "# select(df, _.y == _.x, -_.language)\n", "\n", "select(df, _.y == _.x, -_.language)\n", "# considered alternative with strings. E.g...\n", "# select(df, \"y = x\", \"language\")\n", "# select(df, dict(y = \"x\"), \"language\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## arrange" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>repo</th>\n", " <th>owner</th>\n", " <th>language</th>\n", " <th>stars</th>\n", " <th>x</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>3</th>\n", " <td>plotnine</td>\n", " <td>has2k1</td>\n", " <td>python</td>\n", " <td>1450</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>dplyr</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>2800</td>\n", " <td>2.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>ggplot2</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>3500</td>\n", " <td>3.0</td>\n", " </tr>\n", " <tr>\n", " <th>0</th>\n", " <td>pandas</td>\n", " <td>pandas-dev</td>\n", " <td>python</td>\n", " <td>17800</td>\n", " <td>1.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " repo owner language stars x\n", "3 plotnine has2k1 python 1450 NaN\n", "1 dplyr tidyverse R 2800 2.0\n", "2 ggplot2 tidyverse R 3500 3.0\n", "0 pandas pandas-dev python 17800 1.0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arrange(df, -_.owner, _.repo)\n", "\n", "arrange(df, _.owner.str.len())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## helpers (if_else, case_when)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['no', 'yeah', 'no', 'no'], dtype='<U4')" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "if_else(df.repo == \"dplyr\", \"yeah\", \"no\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['pandas', 'dplyr', 'wat', 'wat'], dtype=object)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f = if_else(_.repo.str.contains(\"d\"), _.repo, \"wat\")\n", "\n", "f(df)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['incredible!', 'pretty good!', 'pretty good!', 'pretty good!'],\n", " dtype='<U12')" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "case_when(df, {\n", " _.stars > 10000: \"incredible!\",\n", " _.stars > 1000: \"pretty good!\",\n", " _.stars > 100 : \"keep going!\",\n", " True: \"I don't know\"\n", "})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## nest and unnest " ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>language</th>\n", " <th>data</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>R</td>\n", " <td>repo owner stars x\n", "1 dplyr ...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>python</td>\n", " <td>repo owner stars x\n", "0 panda...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " language data\n", "0 R repo owner stars x\n", "1 dplyr ...\n", "1 python repo owner stars x\n", "0 panda..." ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# data column is an array of DataFrames\n", "nest(df, -_.language, key = \"data\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>language</th>\n", " <th>repo</th>\n", " <th>owner</th>\n", " <th>stars</th>\n", " <th>x</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>R</td>\n", " <td>dplyr</td>\n", " <td>tidyverse</td>\n", " <td>2800</td>\n", " <td>2.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>R</td>\n", " <td>ggplot2</td>\n", " <td>tidyverse</td>\n", " <td>3500</td>\n", " <td>3.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>python</td>\n", " <td>pandas</td>\n", " <td>pandas-dev</td>\n", " <td>17800</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>python</td>\n", " <td>plotnine</td>\n", " <td>has2k1</td>\n", " <td>1450</td>\n", " <td>NaN</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " language repo owner stars x\n", "0 R dplyr tidyverse 2800 2.0\n", "1 R ggplot2 tidyverse 3500 3.0\n", "2 python pandas pandas-dev 17800 1.0\n", "3 python plotnine has2k1 1450 NaN" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unnest(nest(df, -_.language, key = \"data\"), \"data\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## count " ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>language</th>\n", " <th>owner</th>\n", " <th>n</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>R</td>\n", " <td>tidyverse</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>python</td>\n", " <td>has2k1</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>python</td>\n", " <td>pandas-dev</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " language owner n\n", "0 R tidyverse 2\n", "1 python has2k1 1\n", "2 python pandas-dev 1" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "count(df, \"language\", \"owner\")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>repo</th>\n", " <th>owner</th>\n", " <th>language</th>\n", " <th>stars</th>\n", " <th>x</th>\n", " <th>n</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>pandas</td>\n", " <td>pandas-dev</td>\n", " <td>python</td>\n", " <td>17800</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>dplyr</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>2800</td>\n", " <td>2.0</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>ggplot2</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>3500</td>\n", " <td>3.0</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>plotnine</td>\n", " <td>has2k1</td>\n", " <td>python</td>\n", " <td>1450</td>\n", " <td>NaN</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " repo owner language stars x n\n", "0 pandas pandas-dev python 17800 1.0 1\n", "1 dplyr tidyverse R 2800 2.0 2\n", "2 ggplot2 tidyverse R 3500 3.0 2\n", "3 plotnine has2k1 python 1450 NaN 1" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "add_count(df, \"language\", \"owner\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Distinct" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>repo</th>\n", " <th>owner</th>\n", " <th>language</th>\n", " <th>stars</th>\n", " <th>x</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>pandas</td>\n", " <td>pandas-dev</td>\n", " <td>python</td>\n", " <td>17800</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>dplyr</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>2800</td>\n", " <td>2.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " repo owner language stars x\n", "0 pandas pandas-dev python 17800 1.0\n", "1 dplyr tidyverse R 2800 2.0" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct(df, _.language, _keep_all = True)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>language</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>python</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>R</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " language\n", "0 python\n", "1 R" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct(df, \"language\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>lang2</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>python</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>r</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " lang2\n", "0 python\n", "1 r" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distinct(df, lang2 = _.language.str.lower())" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>lang2</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>R</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>PYTHON</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " lang2\n", "0 R\n", "1 PYTHON" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gdf = group_by(df, \"language\")\n", "\n", "ungroup(distinct(gdf, lang2 = _.language.str.upper()))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Joins" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>repo</th>\n", " <th>owner</th>\n", " <th>language</th>\n", " <th>stars</th>\n", " <th>x</th>\n", " <th>repo2</th>\n", " <th>follow</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>pandas</td>\n", " <td>pandas-dev</td>\n", " <td>python</td>\n", " <td>17800</td>\n", " <td>1.0</td>\n", " <td>pandas</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>dplyr</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>2800</td>\n", " <td>2.0</td>\n", " <td>dplyr</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>ggplot2</td>\n", " <td>tidyverse</td>\n", " <td>R</td>\n", " <td>3500</td>\n", " <td>3.0</td>\n", " <td>None</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>plotnine</td>\n", " <td>has2k1</td>\n", " <td>python</td>\n", " <td>1450</td>\n", " <td>NaN</td>\n", " <td>None</td>\n", " <td>False</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " repo owner language stars x repo2 follow\n", "0 pandas pandas-dev python 17800 1.0 pandas True\n", "1 dplyr tidyverse R 2800 2.0 dplyr False\n", "2 ggplot2 tidyverse R 3500 3.0 None True\n", "3 plotnine has2k1 python 1450 NaN None False" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "follow = pd.DataFrame({\n", " 'repo': ['pandas', 'dplyr', 'ggplot2', 'plotnine'],\n", " 'repo2': ['pandas', 'dplyr', None, None],\n", " 'follow': [True, False, True, False]\n", "})\n", "\n", "join(df, follow, how = \"inner\", on = \"repo\")\n", "# join(df, df)\n", "\n", "join(df, follow, how = \"inner\", on = {\"repo\": \"repo2\"})\n", "\n", "left_join(df, follow, \"repo\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Spread and Gather" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>repo</th>\n", " <th>owner</th>\n", " <th>x</th>\n", " <th>R</th>\n", " <th>python</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>dplyr</td>\n", " <td>tidyverse</td>\n", " <td>2.0</td>\n", " <td>2800.0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>ggplot2</td>\n", " <td>tidyverse</td>\n", " <td>3.0</td>\n", " <td>3500.0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>pandas</td>\n", " <td>pandas-dev</td>\n", " <td>1.0</td>\n", " <td>NaN</td>\n", " <td>17800.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>plotnine</td>\n", " <td>has2k1</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>1450.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " repo owner x R python\n", "0 dplyr tidyverse 2.0 2800.0 NaN\n", "1 ggplot2 tidyverse 3.0 3500.0 NaN\n", "2 pandas pandas-dev 1.0 NaN 17800.0\n", "3 plotnine has2k1 NaN NaN 1450.0" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "df2 = spread(df, 'language', 'stars')\n", "df2\n", "# this should raise an error, because duplicate id col x key combos\n", "#spread(pd.concat([df, df]), 'language', 'stars')" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>repo</th>\n", " <th>owner</th>\n", " <th>x</th>\n", " <th>key</th>\n", " <th>value</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>dplyr</td>\n", " <td>tidyverse</td>\n", " <td>2.0</td>\n", " <td>R</td>\n", " <td>2800.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>ggplot2</td>\n", " <td>tidyverse</td>\n", " <td>3.0</td>\n", " <td>R</td>\n", " <td>3500.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>pandas</td>\n", " <td>pandas-dev</td>\n", " <td>1.0</td>\n", " <td>python</td>\n", " <td>17800.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>plotnine</td>\n", " <td>has2k1</td>\n", " <td>NaN</td>\n", " <td>python</td>\n", " <td>1450.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " repo owner x key value\n", "0 dplyr tidyverse 2.0 R 2800.0\n", "1 ggplot2 tidyverse 3.0 R 3500.0\n", "2 pandas pandas-dev 1.0 python 17800.0\n", "3 plotnine has2k1 NaN python 1450.0" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gather(df2, \"key\", \"value\", \"R\", \"python\", drop_na = True)\n", "\n", "# TODO\n", "# gather(df2, _.key, _.value, _[\"R\":\"python\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Piping" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'x is: 3'" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f = Pipeable(f = lambda x: x + 1) >> Pipeable(f = lambda x: \"x is: {}\".format(x))\n", "\n", "f(2)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>repo</th>\n", " <th>owner</th>\n", " <th>language</th>\n", " <th>stars</th>\n", " <th>x</th>\n", " <th>new_repo</th>\n", " <th>case</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>pandas</td>\n", " <td>pandas-dev</td>\n", " <td>python</td>\n", " <td>17800</td>\n", " <td>1.0</td>\n", " <td>pandas waattt</td>\n", " <td>aw yeah</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " repo owner language stars x new_repo case\n", "0 pandas pandas-dev python 17800 1.0 pandas waattt aw yeah" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(df\n", " >> mutate(\n", " new_repo = _.repo + \" waattt\",\n", " case = case_when(_, {_.language == \"python\": \"aw yeah\", True: 'wat'})\n", " )\n", " >> filter(_.stars > 5000)\n", " )" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>language</th>\n", " <th>wat</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>R</td>\n", " <td>3150.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>python</td>\n", " <td>9625.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " language wat\n", "0 R 3150.0\n", "1 python 9625.0" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df >> group_by(_.language) >> summarize(wat = _.stars.mean())" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "222px" }, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 4 }