{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from siuba import *\n",
"from siuba import meta_hook\n",
"\n",
"import pandas as pd\n",
"\n",
"from pandas import DataFrame, Series"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df = DataFrame({\n",
" \"repo\": [\"pandas\", \"dplyr\", \"ggplot2\", \"plotnine\"],\n",
" \"owner\": [\"pandas-dev\", \"tidyverse\", \"tidyverse\", \"has2k1\"],\n",
" \"language\": [\"python\", \"R\", \"R\", \"python\"],\n",
" \"stars\": [17800, 2800, 3500, 1450],\n",
" \"x\": [1,2,3,None]\n",
" })"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## mutate"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from pandas.core.groupby import DataFrameGroupBy"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" repo | \n",
" owner | \n",
" language | \n",
" stars | \n",
" x | \n",
" rel_stars1 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" pandas | \n",
" pandas-dev | \n",
" python | \n",
" 17800 | \n",
" 1.0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" dplyr | \n",
" tidyverse | \n",
" R | \n",
" 2800 | \n",
" 2.0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" ggplot2 | \n",
" tidyverse | \n",
" R | \n",
" 3500 | \n",
" 3.0 | \n",
" 700 | \n",
"
\n",
" \n",
" | 3 | \n",
" plotnine | \n",
" has2k1 | \n",
" python | \n",
" 1450 | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" repo owner language stars x rel_stars1\n",
"0 pandas pandas-dev python 17800 1.0 0\n",
"1 dplyr tidyverse R 2800 2.0 0\n",
"2 ggplot2 tidyverse R 3500 3.0 700\n",
"3 plotnine has2k1 python 1450 NaN 0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gdf = group_by(df, 'language', \"owner\")\n",
"\n",
"out = mutate(gdf, rel_stars1 = _.stars - _.stars.min())\n",
"\n",
"ungroup(out)\n",
"\n",
"df.siu_group_by(\"language\", \"owner\").siu_mutate(rel_stars1 = _.stars - _.stars.min()).siu_ungroup()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" repo | \n",
" owner | \n",
" language | \n",
" stars | \n",
" x | \n",
" rel_stars1 | \n",
" rel_stars2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" pandas | \n",
" pandas-dev | \n",
" python | \n",
" 17800 | \n",
" 1.0 | \n",
" 0 | \n",
" 35600 | \n",
"
\n",
" \n",
" | 1 | \n",
" dplyr | \n",
" tidyverse | \n",
" R | \n",
" 2800 | \n",
" 2.0 | \n",
" 0 | \n",
" 5600 | \n",
"
\n",
" \n",
" | 2 | \n",
" ggplot2 | \n",
" tidyverse | \n",
" R | \n",
" 3500 | \n",
" 3.0 | \n",
" 700 | \n",
" 7000 | \n",
"
\n",
" \n",
" | 3 | \n",
" plotnine | \n",
" has2k1 | \n",
" python | \n",
" 1450 | \n",
" NaN | \n",
" 0 | \n",
" 2900 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" repo owner language stars x rel_stars1 rel_stars2\n",
"0 pandas pandas-dev python 17800 1.0 0 35600\n",
"1 dplyr tidyverse R 2800 2.0 0 5600\n",
"2 ggplot2 tidyverse R 3500 3.0 700 7000\n",
"3 plotnine has2k1 python 1450 NaN 0 2900"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ungroup(mutate(out, rel_stars2 = _.stars + _.stars))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" repo | \n",
" owner | \n",
" language | \n",
" stars | \n",
" x | \n",
" rel_stars1 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" pandas | \n",
" pandas-dev | \n",
" python | \n",
" 17800 | \n",
" 1.0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" dplyr | \n",
" tidyverse | \n",
" R | \n",
" 2800 | \n",
" 2.0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" ggplot2 | \n",
" tidyverse | \n",
" R | \n",
" 3500 | \n",
" 3.0 | \n",
" 700 | \n",
"
\n",
" \n",
" | 3 | \n",
" plotnine | \n",
" has2k1 | \n",
" python | \n",
" 1450 | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" repo owner language stars x rel_stars1\n",
"0 pandas pandas-dev python 17800 1.0 0\n",
"1 dplyr tidyverse R 2800 2.0 0\n",
"2 ggplot2 tidyverse R 3500 3.0 700\n",
"3 plotnine has2k1 python 1450 NaN 0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df \\\n",
" .siu_group_by(\"language\", \"owner\") \\\n",
" .siu_mutate(rel_stars1 = _.stars - _.stars.min()) \\\n",
" .siu_ungroup()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## filter"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" repo | \n",
" owner | \n",
" language | \n",
" stars | \n",
" x | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" pandas | \n",
" pandas-dev | \n",
" python | \n",
" 17800 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" ggplot2 | \n",
" tidyverse | \n",
" R | \n",
" 3500 | \n",
" 3.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" repo owner language stars x\n",
"0 pandas pandas-dev python 17800 1.0\n",
"1 ggplot2 tidyverse R 3500 3.0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# TODO: change name filter to query?\n",
"\n",
"# regular filter\n",
"filter(df, _.stars > 3000, _.stars < 15000)\n",
"\n",
"# grouped filter\n",
"gdf = group_by(df, \"language\")\n",
"\n",
"ungroup(filter(gdf, _.stars != _.stars.min()))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## summarize"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" language | \n",
" ttl_stars | \n",
" wat | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" R | \n",
" 6300 | \n",
" 2800 | \n",
"
\n",
" \n",
" | 1 | \n",
" python | \n",
" 19250 | \n",
" 1450 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" language ttl_stars wat\n",
"0 R 6300 2800\n",
"1 python 19250 1450"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# summarize DataFrame\n",
"summarize(df, min_stars = _.stars.min())\n",
"\n",
"# summarize grouped DataFrame\n",
"gdf = group_by(df, \"language\")\n",
"\n",
"summarize(gdf, ttl_stars = _.stars.sum(), wat = _.stars.min())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## transmute"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" language | \n",
" repo | \n",
" rel_stars1 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" python | \n",
" pandas | \n",
" 16350 | \n",
"
\n",
" \n",
" | 1 | \n",
" R | \n",
" dplyr | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" R | \n",
" ggplot2 | \n",
" 700 | \n",
"
\n",
" \n",
" | 3 | \n",
" python | \n",
" plotnine | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" language repo rel_stars1\n",
"0 python pandas 16350\n",
"1 R dplyr 0\n",
"2 R ggplot2 700\n",
"3 python plotnine 0"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transmute(df, \"repo\", rel_stars1 = _.stars - _.stars.min())\n",
"\n",
"ungroup(transmute(gdf, \"repo\", rel_stars1 = _.stars - _.stars.min()))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## select"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" y | \n",
" repo | \n",
" owner | \n",
" stars | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1.0 | \n",
" pandas | \n",
" pandas-dev | \n",
" 17800 | \n",
"
\n",
" \n",
" | 1 | \n",
" 2.0 | \n",
" dplyr | \n",
" tidyverse | \n",
" 2800 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3.0 | \n",
" ggplot2 | \n",
" tidyverse | \n",
" 3500 | \n",
"
\n",
" \n",
" | 3 | \n",
" NaN | \n",
" plotnine | \n",
" has2k1 | \n",
" 1450 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" y repo owner stars\n",
"0 1.0 pandas pandas-dev 17800\n",
"1 2.0 dplyr tidyverse 2800\n",
"2 3.0 ggplot2 tidyverse 3500\n",
"3 NaN plotnine has2k1 1450"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# thoughts:\n",
"# + can use dynamic values, e.g. colname == .x\n",
"# + if select implements some name class, then nothing magic happening\n",
"# e.g. _.y == _.x is equivalent to lambda cols: cols.y == cols.x\n",
"# - long winded (==, _.y seems harder to read than \"y\")\n",
"# select(df, _.y == _.x, -_.language)\n",
"\n",
"select(df, _.y == _.x, -_.language)\n",
"# considered alternative with strings. E.g...\n",
"# select(df, \"y = x\", \"language\")\n",
"# select(df, dict(y = \"x\"), \"language\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## arrange"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" repo | \n",
" owner | \n",
" language | \n",
" stars | \n",
" x | \n",
"
\n",
" \n",
" \n",
" \n",
" | 3 | \n",
" plotnine | \n",
" has2k1 | \n",
" python | \n",
" 1450 | \n",
" NaN | \n",
"
\n",
" \n",
" | 1 | \n",
" dplyr | \n",
" tidyverse | \n",
" R | \n",
" 2800 | \n",
" 2.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" ggplot2 | \n",
" tidyverse | \n",
" R | \n",
" 3500 | \n",
" 3.0 | \n",
"
\n",
" \n",
" | 0 | \n",
" pandas | \n",
" pandas-dev | \n",
" python | \n",
" 17800 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" repo owner language stars x\n",
"3 plotnine has2k1 python 1450 NaN\n",
"1 dplyr tidyverse R 2800 2.0\n",
"2 ggplot2 tidyverse R 3500 3.0\n",
"0 pandas pandas-dev python 17800 1.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"arrange(df, -_.owner, _.repo)\n",
"\n",
"arrange(df, _.owner.str.len())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## helpers (if_else, case_when)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['no', 'yeah', 'no', 'no'], dtype=' 10000: \"incredible!\",\n",
" _.stars > 1000: \"pretty good!\",\n",
" _.stars > 100 : \"keep going!\",\n",
" True: \"I don't know\"\n",
"})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## nest and unnest "
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" language | \n",
" data | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" R | \n",
" repo owner stars x\n",
"1 dplyr ... | \n",
"
\n",
" \n",
" | 1 | \n",
" python | \n",
" repo owner stars x\n",
"0 panda... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" language data\n",
"0 R repo owner stars x\n",
"1 dplyr ...\n",
"1 python repo owner stars x\n",
"0 panda..."
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# data column is an array of DataFrames\n",
"nest(df, -_.language, key = \"data\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" language | \n",
" repo | \n",
" owner | \n",
" stars | \n",
" x | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" R | \n",
" dplyr | \n",
" tidyverse | \n",
" 2800 | \n",
" 2.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" R | \n",
" ggplot2 | \n",
" tidyverse | \n",
" 3500 | \n",
" 3.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" python | \n",
" pandas | \n",
" pandas-dev | \n",
" 17800 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" python | \n",
" plotnine | \n",
" has2k1 | \n",
" 1450 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" language repo owner stars x\n",
"0 R dplyr tidyverse 2800 2.0\n",
"1 R ggplot2 tidyverse 3500 3.0\n",
"2 python pandas pandas-dev 17800 1.0\n",
"3 python plotnine has2k1 1450 NaN"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"unnest(nest(df, -_.language, key = \"data\"), \"data\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## count "
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" language | \n",
" owner | \n",
" n | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" R | \n",
" tidyverse | \n",
" 2 | \n",
"
\n",
" \n",
" | 1 | \n",
" python | \n",
" has2k1 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" python | \n",
" pandas-dev | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" language owner n\n",
"0 R tidyverse 2\n",
"1 python has2k1 1\n",
"2 python pandas-dev 1"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count(df, \"language\", \"owner\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" repo | \n",
" owner | \n",
" language | \n",
" stars | \n",
" x | \n",
" n | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" pandas | \n",
" pandas-dev | \n",
" python | \n",
" 17800 | \n",
" 1.0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" dplyr | \n",
" tidyverse | \n",
" R | \n",
" 2800 | \n",
" 2.0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 2 | \n",
" ggplot2 | \n",
" tidyverse | \n",
" R | \n",
" 3500 | \n",
" 3.0 | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" plotnine | \n",
" has2k1 | \n",
" python | \n",
" 1450 | \n",
" NaN | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" repo owner language stars x n\n",
"0 pandas pandas-dev python 17800 1.0 1\n",
"1 dplyr tidyverse R 2800 2.0 2\n",
"2 ggplot2 tidyverse R 3500 3.0 2\n",
"3 plotnine has2k1 python 1450 NaN 1"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"add_count(df, \"language\", \"owner\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Distinct"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" repo | \n",
" owner | \n",
" language | \n",
" stars | \n",
" x | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" pandas | \n",
" pandas-dev | \n",
" python | \n",
" 17800 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" dplyr | \n",
" tidyverse | \n",
" R | \n",
" 2800 | \n",
" 2.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" repo owner language stars x\n",
"0 pandas pandas-dev python 17800 1.0\n",
"1 dplyr tidyverse R 2800 2.0"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct(df, _.language, _keep_all = True)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" language | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" python | \n",
"
\n",
" \n",
" | 1 | \n",
" R | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" language\n",
"0 python\n",
"1 R"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct(df, \"language\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" lang2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" python | \n",
"
\n",
" \n",
" | 1 | \n",
" r | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" lang2\n",
"0 python\n",
"1 r"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct(df, lang2 = _.language.str.lower())"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" lang2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" R | \n",
"
\n",
" \n",
" | 1 | \n",
" PYTHON | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" lang2\n",
"0 R\n",
"1 PYTHON"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gdf = group_by(df, \"language\")\n",
"\n",
"ungroup(distinct(gdf, lang2 = _.language.str.upper()))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Joins"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" repo | \n",
" owner | \n",
" language | \n",
" stars | \n",
" x | \n",
" repo2 | \n",
" follow | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" pandas | \n",
" pandas-dev | \n",
" python | \n",
" 17800 | \n",
" 1.0 | \n",
" pandas | \n",
" True | \n",
"
\n",
" \n",
" | 1 | \n",
" dplyr | \n",
" tidyverse | \n",
" R | \n",
" 2800 | \n",
" 2.0 | \n",
" dplyr | \n",
" False | \n",
"
\n",
" \n",
" | 2 | \n",
" ggplot2 | \n",
" tidyverse | \n",
" R | \n",
" 3500 | \n",
" 3.0 | \n",
" None | \n",
" True | \n",
"
\n",
" \n",
" | 3 | \n",
" plotnine | \n",
" has2k1 | \n",
" python | \n",
" 1450 | \n",
" NaN | \n",
" None | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" repo owner language stars x repo2 follow\n",
"0 pandas pandas-dev python 17800 1.0 pandas True\n",
"1 dplyr tidyverse R 2800 2.0 dplyr False\n",
"2 ggplot2 tidyverse R 3500 3.0 None True\n",
"3 plotnine has2k1 python 1450 NaN None False"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"follow = pd.DataFrame({\n",
" 'repo': ['pandas', 'dplyr', 'ggplot2', 'plotnine'],\n",
" 'repo2': ['pandas', 'dplyr', None, None],\n",
" 'follow': [True, False, True, False]\n",
"})\n",
"\n",
"join(df, follow, how = \"inner\", on = \"repo\")\n",
"# join(df, df)\n",
"\n",
"join(df, follow, how = \"inner\", on = {\"repo\": \"repo2\"})\n",
"\n",
"left_join(df, follow, \"repo\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Spread and Gather"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" repo | \n",
" owner | \n",
" x | \n",
" R | \n",
" python | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" dplyr | \n",
" tidyverse | \n",
" 2.0 | \n",
" 2800.0 | \n",
" NaN | \n",
"
\n",
" \n",
" | 1 | \n",
" ggplot2 | \n",
" tidyverse | \n",
" 3.0 | \n",
" 3500.0 | \n",
" NaN | \n",
"
\n",
" \n",
" | 2 | \n",
" pandas | \n",
" pandas-dev | \n",
" 1.0 | \n",
" NaN | \n",
" 17800.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" plotnine | \n",
" has2k1 | \n",
" NaN | \n",
" NaN | \n",
" 1450.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" repo owner x R python\n",
"0 dplyr tidyverse 2.0 2800.0 NaN\n",
"1 ggplot2 tidyverse 3.0 3500.0 NaN\n",
"2 pandas pandas-dev 1.0 NaN 17800.0\n",
"3 plotnine has2k1 NaN NaN 1450.0"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"df2 = spread(df, 'language', 'stars')\n",
"df2\n",
"# this should raise an error, because duplicate id col x key combos\n",
"#spread(pd.concat([df, df]), 'language', 'stars')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" repo | \n",
" owner | \n",
" x | \n",
" key | \n",
" value | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" dplyr | \n",
" tidyverse | \n",
" 2.0 | \n",
" R | \n",
" 2800.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" ggplot2 | \n",
" tidyverse | \n",
" 3.0 | \n",
" R | \n",
" 3500.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" pandas | \n",
" pandas-dev | \n",
" 1.0 | \n",
" python | \n",
" 17800.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" plotnine | \n",
" has2k1 | \n",
" NaN | \n",
" python | \n",
" 1450.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" repo owner x key value\n",
"0 dplyr tidyverse 2.0 R 2800.0\n",
"1 ggplot2 tidyverse 3.0 R 3500.0\n",
"2 pandas pandas-dev 1.0 python 17800.0\n",
"3 plotnine has2k1 NaN python 1450.0"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gather(df2, \"key\", \"value\", \"R\", \"python\", drop_na = True)\n",
"\n",
"# TODO\n",
"# gather(df2, _.key, _.value, _[\"R\":\"python\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Piping"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'x is: 3'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f = Pipeable(f = lambda x: x + 1) >> Pipeable(f = lambda x: \"x is: {}\".format(x))\n",
"\n",
"f(2)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" repo | \n",
" owner | \n",
" language | \n",
" stars | \n",
" x | \n",
" new_repo | \n",
" case | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" pandas | \n",
" pandas-dev | \n",
" python | \n",
" 17800 | \n",
" 1.0 | \n",
" pandas waattt | \n",
" aw yeah | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" repo owner language stars x new_repo case\n",
"0 pandas pandas-dev python 17800 1.0 pandas waattt aw yeah"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(df\n",
" >> mutate(\n",
" new_repo = _.repo + \" waattt\",\n",
" case = case_when(_, {_.language == \"python\": \"aw yeah\", True: 'wat'})\n",
" )\n",
" >> filter(_.stars > 5000)\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" language | \n",
" wat | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" R | \n",
" 3150.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" python | \n",
" 9625.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" language wat\n",
"0 R 3150.0\n",
"1 python 9625.0"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df >> group_by(_.language) >> summarize(wat = _.stars.mean())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "222px"
},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 4
}