{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "pd.options.display.max_rows = 20\n", "pd.options.display.max_colwidth = 80\n", "pd.options.display.max_columns = 20\n", "np.random.seed(12345)\n", "import matplotlib.pyplot as plt\n", "plt.rc(\"figure\", figsize=(10, 6))\n", "np.set_printoptions(precision=4, suppress=True)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "data = pd.Series(np.random.uniform(size=9),\n", " index=[[\"a\", \"a\", \"a\", \"b\", \"b\", \"c\", \"c\", \"d\", \"d\"],\n", " [1, 2, 3, 1, 3, 1, 2, 2, 3]])\n", "data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "data.index" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "data[\"b\"]\n", "data[\"b\":\"c\"]\n", "data.loc[[\"b\", \"d\"]]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "data.loc[:, 2]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "data.unstack()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "data.unstack().stack()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "frame = pd.DataFrame(np.arange(12).reshape((4, 3)),\n", " index=[[\"a\", \"a\", \"b\", \"b\"], [1, 2, 1, 2]],\n", " columns=[[\"Ohio\", \"Ohio\", \"Colorado\"],\n", " [\"Green\", \"Red\", \"Green\"]])\n", "frame" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "frame.index.names = [\"key1\", \"key2\"]\n", "frame.columns.names = [\"state\", \"color\"]\n", "frame" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "frame.index.nlevels" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "frame[\"Ohio\"]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "frame.swaplevel(\"key1\", \"key2\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "frame.sort_index(level=1)\n", "frame.swaplevel(0, 1).sort_index(level=0)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "frame.groupby(level=\"key2\").sum()\n", "frame.groupby(level=\"color\", axis=\"columns\").sum()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "frame = pd.DataFrame({\"a\": range(7), \"b\": range(7, 0, -1),\n", " \"c\": [\"one\", \"one\", \"one\", \"two\", \"two\",\n", " \"two\", \"two\"],\n", " \"d\": [0, 1, 2, 0, 1, 2, 3]})\n", "frame" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "frame2 = frame.set_index([\"c\", \"d\"])\n", "frame2" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "frame.set_index([\"c\", \"d\"], drop=False)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "frame2.reset_index()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "df1 = pd.DataFrame({\"key\": [\"b\", \"b\", \"a\", \"c\", \"a\", \"a\", \"b\"],\n", " \"data1\": pd.Series(range(7), dtype=\"Int64\")})\n", "df2 = pd.DataFrame({\"key\": [\"a\", \"b\", \"d\"],\n", " \"data2\": pd.Series(range(3), dtype=\"Int64\")})\n", "df1\n", "df2" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "pd.merge(df1, df2)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "pd.merge(df1, df2, on=\"key\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "df3 = pd.DataFrame({\"lkey\": [\"b\", \"b\", \"a\", \"c\", \"a\", \"a\", \"b\"],\n", " \"data1\": pd.Series(range(7), dtype=\"Int64\")})\n", "df4 = pd.DataFrame({\"rkey\": [\"a\", \"b\", \"d\"],\n", " \"data2\": pd.Series(range(3), dtype=\"Int64\")})\n", "pd.merge(df3, df4, left_on=\"lkey\", right_on=\"rkey\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "pd.merge(df1, df2, how=\"outer\")\n", "pd.merge(df3, df4, left_on=\"lkey\", right_on=\"rkey\", how=\"outer\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "df1 = pd.DataFrame({\"key\": [\"b\", \"b\", \"a\", \"c\", \"a\", \"b\"],\n", " \"data1\": pd.Series(range(6), dtype=\"Int64\")})\n", "df2 = pd.DataFrame({\"key\": [\"a\", \"b\", \"a\", \"b\", \"d\"],\n", " \"data2\": pd.Series(range(5), dtype=\"Int64\")})\n", "df1\n", "df2\n", "pd.merge(df1, df2, on=\"key\", how=\"left\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "pd.merge(df1, df2, how=\"inner\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "left = pd.DataFrame({\"key1\": [\"foo\", \"foo\", \"bar\"],\n", " \"key2\": [\"one\", \"two\", \"one\"],\n", " \"lval\": pd.Series([1, 2, 3], dtype='Int64')})\n", "right = pd.DataFrame({\"key1\": [\"foo\", \"foo\", \"bar\", \"bar\"],\n", " \"key2\": [\"one\", \"one\", \"one\", \"two\"],\n", " \"rval\": pd.Series([4, 5, 6, 7], dtype='Int64')})\n", "pd.merge(left, right, on=[\"key1\", \"key2\"], how=\"outer\")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "pd.merge(left, right, on=\"key1\")" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "pd.merge(left, right, on=\"key1\", suffixes=(\"_left\", \"_right\"))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "left1 = pd.DataFrame({\"key\": [\"a\", \"b\", \"a\", \"a\", \"b\", \"c\"],\n", " \"value\": pd.Series(range(6), dtype=\"Int64\")})\n", "right1 = pd.DataFrame({\"group_val\": [3.5, 7]}, index=[\"a\", \"b\"])\n", "left1\n", "right1\n", "pd.merge(left1, right1, left_on=\"key\", right_index=True)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "pd.merge(left1, right1, left_on=\"key\", right_index=True, how=\"outer\")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "lefth = pd.DataFrame({\"key1\": [\"Ohio\", \"Ohio\", \"Ohio\",\n", " \"Nevada\", \"Nevada\"],\n", " \"key2\": [2000, 2001, 2002, 2001, 2002],\n", " \"data\": pd.Series(range(5), dtype=\"Int64\")})\n", "righth_index = pd.MultiIndex.from_arrays(\n", " [\n", " [\"Nevada\", \"Nevada\", \"Ohio\", \"Ohio\", \"Ohio\", \"Ohio\"],\n", " [2001, 2000, 2000, 2000, 2001, 2002]\n", " ]\n", ")\n", "righth = pd.DataFrame({\"event1\": pd.Series([0, 2, 4, 6, 8, 10], dtype=\"Int64\",\n", " index=righth_index),\n", " \"event2\": pd.Series([1, 3, 5, 7, 9, 11], dtype=\"Int64\",\n", " index=righth_index)})\n", "lefth\n", "righth" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "pd.merge(lefth, righth, left_on=[\"key1\", \"key2\"], right_index=True)\n", "pd.merge(lefth, righth, left_on=[\"key1\", \"key2\"],\n", " right_index=True, how=\"outer\")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],\n", " index=[\"a\", \"c\", \"e\"],\n", " columns=[\"Ohio\", \"Nevada\"]).astype(\"Int64\")\n", "right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],\n", " index=[\"b\", \"c\", \"d\", \"e\"],\n", " columns=[\"Missouri\", \"Alabama\"]).astype(\"Int64\")\n", "left2\n", "right2\n", "pd.merge(left2, right2, how=\"outer\", left_index=True, right_index=True)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "left2.join(right2, how=\"outer\")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "left1.join(right1, on=\"key\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],\n", " index=[\"a\", \"c\", \"e\", \"f\"],\n", " columns=[\"New York\", \"Oregon\"])\n", "another\n", "left2.join([right2, another])\n", "left2.join([right2, another], how=\"outer\")" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "arr = np.arange(12).reshape((3, 4))\n", "arr\n", "np.concatenate([arr, arr], axis=1)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "s1 = pd.Series([0, 1], index=[\"a\", \"b\"], dtype=\"Int64\")\n", "s2 = pd.Series([2, 3, 4], index=[\"c\", \"d\", \"e\"], dtype=\"Int64\")\n", "s3 = pd.Series([5, 6], index=[\"f\", \"g\"], dtype=\"Int64\")" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "s1\n", "s2\n", "s3\n", "pd.concat([s1, s2, s3])" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "pd.concat([s1, s2, s3], axis=\"columns\")" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "s4 = pd.concat([s1, s3])\n", "s4\n", "pd.concat([s1, s4], axis=\"columns\")\n", "pd.concat([s1, s4], axis=\"columns\", join=\"inner\")" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "result = pd.concat([s1, s1, s3], keys=[\"one\", \"two\", \"three\"])\n", "result\n", "result.unstack()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "pd.concat([s1, s2, s3], axis=\"columns\", keys=[\"one\", \"two\", \"three\"])" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=[\"a\", \"b\", \"c\"],\n", " columns=[\"one\", \"two\"])\n", "df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=[\"a\", \"c\"],\n", " columns=[\"three\", \"four\"])\n", "df1\n", "df2\n", "pd.concat([df1, df2], axis=\"columns\", keys=[\"level1\", \"level2\"])" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "pd.concat({\"level1\": df1, \"level2\": df2}, axis=\"columns\")" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "pd.concat([df1, df2], axis=\"columns\", keys=[\"level1\", \"level2\"],\n", " names=[\"upper\", \"lower\"])" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "df1 = pd.DataFrame(np.random.standard_normal((3, 4)),\n", " columns=[\"a\", \"b\", \"c\", \"d\"])\n", "df2 = pd.DataFrame(np.random.standard_normal((2, 3)),\n", " columns=[\"b\", \"d\", \"a\"])\n", "df1\n", "df2" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "pd.concat([df1, df2], ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "a = pd.Series([np.nan, 2.5, 0.0, 3.5, 4.5, np.nan],\n", " index=[\"f\", \"e\", \"d\", \"c\", \"b\", \"a\"])\n", "b = pd.Series([0., np.nan, 2., np.nan, np.nan, 5.],\n", " index=[\"a\", \"b\", \"c\", \"d\", \"e\", \"f\"])\n", "a\n", "b\n", "np.where(pd.isna(a), b, a)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "a.combine_first(b)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "df1 = pd.DataFrame({\"a\": [1., np.nan, 5., np.nan],\n", " \"b\": [np.nan, 2., np.nan, 6.],\n", " \"c\": range(2, 18, 4)})\n", "df2 = pd.DataFrame({\"a\": [5., 4., np.nan, 3., 7.],\n", " \"b\": [np.nan, 3., 4., 6., 8.]})\n", "df1\n", "df2\n", "df1.combine_first(df2)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "data = pd.DataFrame(np.arange(6).reshape((2, 3)),\n", " index=pd.Index([\"Ohio\", \"Colorado\"], name=\"state\"),\n", " columns=pd.Index([\"one\", \"two\", \"three\"],\n", " name=\"number\"))\n", "data" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "result = data.stack()\n", "result" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "result.unstack()" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "result.unstack(level=0)\n", "result.unstack(level=\"state\")" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "s1 = pd.Series([0, 1, 2, 3], index=[\"a\", \"b\", \"c\", \"d\"], dtype=\"Int64\")\n", "s2 = pd.Series([4, 5, 6], index=[\"c\", \"d\", \"e\"], dtype=\"Int64\")\n", "data2 = pd.concat([s1, s2], keys=[\"one\", \"two\"])\n", "data2" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "data2.unstack()\n", "data2.unstack().stack()\n", "data2.unstack().stack(dropna=False)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame({\"left\": result, \"right\": result + 5},\n", " columns=pd.Index([\"left\", \"right\"], name=\"side\"))\n", "df\n", "df.unstack(level=\"state\")" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "df.unstack(level=\"state\").stack(level=\"side\")" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "data = pd.read_csv(\"examples/macrodata.csv\")\n", "data = data.loc[:, [\"year\", \"quarter\", \"realgdp\", \"infl\", \"unemp\"]]\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "periods = pd.PeriodIndex(year=data.pop(\"year\"),\n", " quarter=data.pop(\"quarter\"),\n", " name=\"date\")\n", "periods\n", "data.index = periods.to_timestamp(\"D\")\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "data = data.reindex(columns=[\"realgdp\", \"infl\", \"unemp\"])\n", "data.columns.name = \"item\"\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "long_data = (data.stack()\n", " .reset_index()\n", " .rename(columns={0: \"value\"}))" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "long_data[:10]" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "pivoted = long_data.pivot(index=\"date\", columns=\"item\",\n", " values=\"value\")\n", "pivoted.head()" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "long_data.index.name = None" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "long_data[\"value2\"] = np.random.standard_normal(len(long_data))\n", "long_data[:10]" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "pivoted = long_data.pivot(index=\"date\", columns=\"item\")\n", "pivoted.head()\n", "pivoted[\"value\"].head()" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "unstacked = long_data.set_index([\"date\", \"item\"]).unstack(level=\"item\")\n", "unstacked.head()" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame({\"key\": [\"foo\", \"bar\", \"baz\"],\n", " \"A\": [1, 2, 3],\n", " \"B\": [4, 5, 6],\n", " \"C\": [7, 8, 9]})\n", "df" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "melted = pd.melt(df, id_vars=\"key\")\n", "melted" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "reshaped = melted.pivot(index=\"key\", columns=\"variable\",\n", " values=\"value\")\n", "reshaped" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "reshaped.reset_index()" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "pd.melt(df, id_vars=\"key\", value_vars=[\"A\", \"B\"])" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "pd.melt(df, value_vars=[\"A\", \"B\", \"C\"])\n", "pd.melt(df, value_vars=[\"key\", \"A\", \"B\"])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }