{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n", "pd.options.display.max_columns = 20\n", "pd.options.display.max_rows = 20\n", "pd.options.display.max_colwidth = 80\n", "np.random.seed(12345)\n", "import matplotlib.pyplot as plt\n", "plt.rc(\"figure\", figsize=(10, 6))\n", "np.set_printoptions(precision=4, suppress=True)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame({\"key1\" : [\"a\", \"a\", None, \"b\", \"b\", \"a\", None],\n", " \"key2\" : pd.Series([1, 2, 1, 2, 1, None, 1],\n", " dtype=\"Int64\"),\n", " \"data1\" : np.random.standard_normal(7),\n", " \"data2\" : np.random.standard_normal(7)})\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "grouped = df[\"data1\"].groupby(df[\"key1\"])\n", "grouped" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "grouped.mean()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "means = df[\"data1\"].groupby([df[\"key1\"], df[\"key2\"]]).mean()\n", "means" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "means.unstack()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "states = np.array([\"OH\", \"CA\", \"CA\", \"OH\", \"OH\", \"CA\", \"OH\"])\n", "years = [2005, 2005, 2006, 2005, 2006, 2005, 2006]\n", "df[\"data1\"].groupby([states, years]).mean()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "df.groupby(\"key1\").mean()\n", "df.groupby(\"key2\").mean(numeric_only=True)\n", "df.groupby([\"key1\", \"key2\"]).mean()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "df.groupby([\"key1\", \"key2\"]).size()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "df.groupby(\"key1\", dropna=False).size()\n", "df.groupby([\"key1\", \"key2\"], dropna=False).size()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "df.groupby(\"key1\").count()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "for name, group in df.groupby(\"key1\"):\n", " print(name)\n", " print(group)\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "for (k1, k2), group in df.groupby([\"key1\", \"key2\"]):\n", " print((k1, k2))\n", " print(group)\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "pieces = {name: group for name, group in df.groupby(\"key1\")}\n", "pieces[\"b\"]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "grouped = df.groupby({\"key1\": \"key\", \"key2\": \"key\",\n", " \"data1\": \"data\", \"data2\": \"data\"}, axis=\"columns\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "for group_key, group_values in grouped:\n", " print(group_key)\n", " print(group_values)\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "df.groupby([\"key1\", \"key2\"])[[\"data2\"]].mean()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "s_grouped = df.groupby([\"key1\", \"key2\"])[\"data2\"]\n", "s_grouped\n", "s_grouped.mean()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "people = pd.DataFrame(np.random.standard_normal((5, 5)),\n", " columns=[\"a\", \"b\", \"c\", \"d\", \"e\"],\n", " index=[\"Joe\", \"Steve\", \"Wanda\", \"Jill\", \"Trey\"])\n", "people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values\n", "people" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "mapping = {\"a\": \"red\", \"b\": \"red\", \"c\": \"blue\",\n", " \"d\": \"blue\", \"e\": \"red\", \"f\" : \"orange\"}" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "by_column = people.groupby(mapping, axis=\"columns\")\n", "by_column.sum()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "map_series = pd.Series(mapping)\n", "map_series\n", "people.groupby(map_series, axis=\"columns\").count()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "people.groupby(len).sum()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "key_list = [\"one\", \"one\", \"one\", \"two\", \"two\"]\n", "people.groupby([len, key_list]).min()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "columns = pd.MultiIndex.from_arrays([[\"US\", \"US\", \"US\", \"JP\", \"JP\"],\n", " [1, 3, 5, 1, 3]],\n", " names=[\"cty\", \"tenor\"])\n", "hier_df = pd.DataFrame(np.random.standard_normal((4, 5)), columns=columns)\n", "hier_df" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "hier_df.groupby(level=\"cty\", axis=\"columns\").count()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "df\n", "grouped = df.groupby(\"key1\")\n", "grouped[\"data1\"].nsmallest(2)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "def peak_to_peak(arr):\n", " return arr.max() - arr.min()\n", "grouped.agg(peak_to_peak)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "grouped.describe()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "tips = pd.read_csv(\"examples/tips.csv\")\n", "tips.head()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "tips[\"tip_pct\"] = tips[\"tip\"] / tips[\"total_bill\"]\n", "tips.head()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "grouped = tips.groupby([\"day\", \"smoker\"])" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "grouped_pct = grouped[\"tip_pct\"]\n", "grouped_pct.agg(\"mean\")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "grouped_pct.agg([\"mean\", \"std\", peak_to_peak])" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "grouped_pct.agg([(\"average\", \"mean\"), (\"stdev\", np.std)])" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "functions = [\"count\", \"mean\", \"max\"]\n", "result = grouped[[\"tip_pct\", \"total_bill\"]].agg(functions)\n", "result" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "result[\"tip_pct\"]" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "ftuples = [(\"Average\", \"mean\"), (\"Variance\", np.var)]\n", "grouped[[\"tip_pct\", \"total_bill\"]].agg(ftuples)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "grouped.agg({\"tip\" : np.max, \"size\" : \"sum\"})\n", "grouped.agg({\"tip_pct\" : [\"min\", \"max\", \"mean\", \"std\"],\n", " \"size\" : \"sum\"})" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "grouped = tips.groupby([\"day\", \"smoker\"], as_index=False)\n", "grouped.mean(numeric_only=True)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "def top(df, n=5, column=\"tip_pct\"):\n", " return df.sort_values(column, ascending=False)[:n]\n", "top(tips, n=6)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "tips.groupby(\"smoker\").apply(top)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "tips.groupby([\"smoker\", \"day\"]).apply(top, n=1, column=\"total_bill\")" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "result = tips.groupby(\"smoker\")[\"tip_pct\"].describe()\n", "result\n", "result.unstack(\"smoker\")" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "tips.groupby(\"smoker\", group_keys=False).apply(top)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "frame = pd.DataFrame({\"data1\": np.random.standard_normal(1000),\n", " \"data2\": np.random.standard_normal(1000)})\n", "frame.head()\n", "quartiles = pd.cut(frame[\"data1\"], 4)\n", "quartiles.head(10)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "def get_stats(group):\n", " return pd.DataFrame(\n", " {\"min\": group.min(), \"max\": group.max(),\n", " \"count\": group.count(), \"mean\": group.mean()}\n", " )\n", "\n", "grouped = frame.groupby(quartiles)\n", "grouped.apply(get_stats)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "grouped.agg([\"min\", \"max\", \"count\", \"mean\"])" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "quartiles_samp = pd.qcut(frame[\"data1\"], 4, labels=False)\n", "quartiles_samp.head()\n", "grouped = frame.groupby(quartiles_samp)\n", "grouped.apply(get_stats)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "s = pd.Series(np.random.standard_normal(6))\n", "s[::2] = np.nan\n", "s\n", "s.fillna(s.mean())" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "states = [\"Ohio\", \"New York\", \"Vermont\", \"Florida\",\n", " \"Oregon\", \"Nevada\", \"California\", \"Idaho\"]\n", "group_key = [\"East\", \"East\", \"East\", \"East\",\n", " \"West\", \"West\", \"West\", \"West\"]\n", "data = pd.Series(np.random.standard_normal(8), index=states)\n", "data" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "data[[\"Vermont\", \"Nevada\", \"Idaho\"]] = np.nan\n", "data\n", "data.groupby(group_key).size()\n", "data.groupby(group_key).count()\n", "data.groupby(group_key).mean()" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "def fill_mean(group):\n", " return group.fillna(group.mean())\n", "\n", "data.groupby(group_key).apply(fill_mean)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "fill_values = {\"East\": 0.5, \"West\": -1}\n", "def fill_func(group):\n", " return group.fillna(fill_values[group.name])\n", "\n", "data.groupby(group_key).apply(fill_func)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "suits = [\"H\", \"S\", \"C\", \"D\"] # Hearts, Spades, Clubs, Diamonds\n", "card_val = (list(range(1, 11)) + [10] * 3) * 4\n", "base_names = [\"A\"] + list(range(2, 11)) + [\"J\", \"K\", \"Q\"]\n", "cards = []\n", "for suit in suits:\n", " cards.extend(str(num) + suit for num in base_names)\n", "\n", "deck = pd.Series(card_val, index=cards)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "deck.head(13)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "def draw(deck, n=5):\n", " return deck.sample(n)\n", "draw(deck)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "def get_suit(card):\n", " # last letter is suit\n", " return card[-1]\n", "\n", "deck.groupby(get_suit).apply(draw, n=2)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "deck.groupby(get_suit, group_keys=False).apply(draw, n=2)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame({\"category\": [\"a\", \"a\", \"a\", \"a\",\n", " \"b\", \"b\", \"b\", \"b\"],\n", " \"data\": np.random.standard_normal(8),\n", " \"weights\": np.random.uniform(size=8)})\n", "df" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "grouped = df.groupby(\"category\")\n", "def get_wavg(group):\n", " return np.average(group[\"data\"], weights=group[\"weights\"])\n", "\n", "grouped.apply(get_wavg)" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "close_px = pd.read_csv(\"examples/stock_px.csv\", parse_dates=True,\n", " index_col=0)\n", "close_px.info()\n", "close_px.tail(4)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "def spx_corr(group):\n", " return group.corrwith(group[\"SPX\"])" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "rets = close_px.pct_change().dropna()" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "def get_year(x):\n", " return x.year\n", "\n", "by_year = rets.groupby(get_year)\n", "by_year.apply(spx_corr)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "def corr_aapl_msft(group):\n", " return group[\"AAPL\"].corr(group[\"MSFT\"])\n", "by_year.apply(corr_aapl_msft)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "import statsmodels.api as sm\n", "def regress(data, yvar=None, xvars=None):\n", " Y = data[yvar]\n", " X = data[xvars]\n", " X[\"intercept\"] = 1.\n", " result = sm.OLS(Y, X).fit()\n", " return result.params" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "by_year.apply(regress, yvar=\"AAPL\", xvars=[\"SPX\"])" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame({'key': ['a', 'b', 'c'] * 4,\n", " 'value': np.arange(12.)})\n", "df" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "g = df.groupby('key')['value']\n", "g.mean()" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "def get_mean(group):\n", " return group.mean()\n", "g.transform(get_mean)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "g.transform('mean')" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "def times_two(group):\n", " return group * 2\n", "g.transform(times_two)" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "def get_ranks(group):\n", " return group.rank(ascending=False)\n", "g.transform(get_ranks)" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "def normalize(x):\n", " return (x - x.mean()) / x.std()" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "g.transform(normalize)\n", "g.apply(normalize)" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "g.transform('mean')\n", "normalized = (df['value'] - g.transform('mean')) / g.transform('std')\n", "normalized" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "tips.head()\n", "tips.pivot_table(index=[\"day\", \"smoker\"],\n", " values=[\"size\", \"tip\", \"tip_pct\", \"total_bill\"])" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "tips.pivot_table(index=[\"time\", \"day\"], columns=\"smoker\",\n", " values=[\"tip_pct\", \"size\"])" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "tips.pivot_table(index=[\"time\", \"day\"], columns=\"smoker\",\n", " values=[\"tip_pct\", \"size\"], margins=True)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "tips.pivot_table(index=[\"time\", \"smoker\"], columns=\"day\",\n", " values=\"tip_pct\", aggfunc=len, margins=True)" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "tips.pivot_table(index=[\"time\", \"size\", \"smoker\"], columns=\"day\",\n", " values=\"tip_pct\", fill_value=0)" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "from io import StringIO\n", "data = \"\"\"Sample Nationality Handedness\n", "1 USA Right-handed\n", "2 Japan Left-handed\n", "3 USA Right-handed\n", "4 Japan Right-handed\n", "5 Japan Left-handed\n", "6 Japan Right-handed\n", "7 USA Right-handed\n", "8 USA Left-handed\n", "9 Japan Right-handed\n", "10 USA Right-handed\"\"\"\n", "data = pd.read_table(StringIO(data), sep=\"\\s+\")" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "pd.crosstab(data[\"Nationality\"], data[\"Handedness\"], margins=True)" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "pd.crosstab([tips[\"time\"], tips[\"day\"]], tips[\"smoker\"], margins=True)" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }