{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
    "pd.options.display.max_rows = 25\n",
    "pd.options.display.max_columns = 20\n",
    "pd.options.display.max_colwidth = 82\n",
    "np.random.seed(12345)\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rc(\"figure\", figsize=(10, 6))\n",
    "np.set_printoptions(precision=4, suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "float_data = pd.Series([1.2, -3.5, np.nan, 0])\n",
    "float_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "float_data.isna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "string_data = pd.Series([\"aardvark\", np.nan, None, \"avocado\"])\n",
    "string_data\n",
    "string_data.isna()\n",
    "float_data = pd.Series([1, 2, None], dtype='float64')\n",
    "float_data\n",
    "float_data.isna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.Series([1, np.nan, 3.5, np.nan, 7])\n",
    "data.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[data.notna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],\n",
    "                     [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])\n",
    "data\n",
    "data.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.dropna(how=\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[4] = np.nan\n",
    "data\n",
    "data.dropna(axis=\"columns\", how=\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(np.random.standard_normal((7, 3)))\n",
    "df.iloc[:4, 1] = np.nan\n",
    "df.iloc[:2, 2] = np.nan\n",
    "df\n",
    "df.dropna()\n",
    "df.dropna(thresh=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.fillna({1: 0.5, 2: 0})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(np.random.standard_normal((6, 3)))\n",
    "df.iloc[2:, 1] = np.nan\n",
    "df.iloc[4:, 2] = np.nan\n",
    "df\n",
    "df.fillna(method=\"ffill\")\n",
    "df.fillna(method=\"ffill\", limit=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.Series([1., np.nan, 3.5, np.nan, 7])\n",
    "data.fillna(data.mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame({\"k1\": [\"one\", \"two\"] * 3 + [\"two\"],\n",
    "                     \"k2\": [1, 1, 2, 3, 3, 4, 4]})\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.duplicated()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"v1\"] = range(7)\n",
    "data\n",
    "data.drop_duplicates(subset=[\"k1\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.drop_duplicates([\"k1\", \"k2\"], keep=\"last\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame({\"food\": [\"bacon\", \"pulled pork\", \"bacon\",\n",
    "                              \"pastrami\", \"corned beef\", \"bacon\",\n",
    "                              \"pastrami\", \"honey ham\", \"nova lox\"],\n",
    "                     \"ounces\": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "meat_to_animal = {\n",
    "  \"bacon\": \"pig\",\n",
    "  \"pulled pork\": \"pig\",\n",
    "  \"pastrami\": \"cow\",\n",
    "  \"corned beef\": \"cow\",\n",
    "  \"honey ham\": \"pig\",\n",
    "  \"nova lox\": \"salmon\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"animal\"] = data[\"food\"].map(meat_to_animal)\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_animal(x):\n",
    "    return meat_to_animal[x]\n",
    "data[\"food\"].map(get_animal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.Series([1., -999., 2., -999., -1000., 3.])\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.replace(-999, np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.replace([-999, -1000], np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.replace([-999, -1000], [np.nan, 0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.replace({-999: np.nan, -1000: 0})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame(np.arange(12).reshape((3, 4)),\n",
    "                    index=[\"Ohio\", \"Colorado\", \"New York\"],\n",
    "                    columns=[\"one\", \"two\", \"three\", \"four\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform(x):\n",
    "    return x[:4].upper()\n",
    "\n",
    "data.index.map(transform)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.index = data.index.map(transform)\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.rename(index=str.title, columns=str.upper)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.rename(index={\"OHIO\": \"INDIANA\"},\n",
    "            columns={\"three\": \"peekaboo\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "bins = [18, 25, 35, 60, 100]\n",
    "age_categories = pd.cut(ages, bins)\n",
    "age_categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "age_categories.codes\n",
    "age_categories.categories\n",
    "age_categories.categories[0]\n",
    "pd.value_counts(age_categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.cut(ages, bins, right=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_names = [\"Youth\", \"YoungAdult\", \"MiddleAged\", \"Senior\"]\n",
    "pd.cut(ages, bins, labels=group_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = np.random.uniform(size=20)\n",
    "pd.cut(data, 4, precision=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = np.random.standard_normal(1000)\n",
    "quartiles = pd.qcut(data, 4, precision=2)\n",
    "quartiles\n",
    "pd.value_counts(quartiles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame(np.random.standard_normal((1000, 4)))\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "col = data[2]\n",
    "col[col.abs() > 3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[(data.abs() > 3).any(axis=\"columns\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[data.abs() > 3] = np.sign(data) * 3\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.sign(data).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))\n",
    "df\n",
    "sampler = np.random.permutation(5)\n",
    "sampler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.take(sampler)\n",
    "df.iloc[sampler]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "column_sampler = np.random.permutation(7)\n",
    "column_sampler\n",
    "df.take(column_sampler, axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.sample(n=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "choices = pd.Series([5, 7, -1, 6, 4])\n",
    "choices.sample(n=10, replace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame({\"key\": [\"b\", \"b\", \"a\", \"c\", \"a\", \"b\"],\n",
    "                   \"data1\": range(6)})\n",
    "df\n",
    "pd.get_dummies(df[\"key\"], dtype=float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "dummies = pd.get_dummies(df[\"key\"], prefix=\"key\", dtype=float)\n",
    "df_with_dummy = df[[\"data1\"]].join(dummies)\n",
    "df_with_dummy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "mnames = [\"movie_id\", \"title\", \"genres\"]\n",
    "movies = pd.read_table(\"datasets/movielens/movies.dat\", sep=\"::\",\n",
    "                       header=None, names=mnames, engine=\"python\")\n",
    "movies[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "dummies = movies[\"genres\"].str.get_dummies(\"|\")\n",
    "dummies.iloc[:10, :6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "movies_windic = movies.join(dummies.add_prefix(\"Genre_\"))\n",
    "movies_windic.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(12345) # to make the example repeatable\n",
    "values = np.random.uniform(size=10)\n",
    "values\n",
    "bins = [0, 0.2, 0.4, 0.6, 0.8, 1]\n",
    "pd.get_dummies(pd.cut(values, bins))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = pd.Series([1, 2, 3, None])\n",
    "s\n",
    "s.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())\n",
    "s\n",
    "s.isna()\n",
    "s.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "s[3]\n",
    "s[3] is pd.NA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = pd.Series([1, 2, 3, None], dtype=\"Int64\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = pd.Series(['one', 'two', None, 'three'], dtype=pd.StringDtype())\n",
    "s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame({\"A\": [1, 2, None, 4],\n",
    "                   \"B\": [\"one\", \"two\", \"three\", None],\n",
    "                   \"C\": [False, None, False, True]})\n",
    "df\n",
    "df[\"A\"] = df[\"A\"].astype(\"Int64\")\n",
    "df[\"B\"] = df[\"B\"].astype(\"string\")\n",
    "df[\"C\"] = df[\"C\"].astype(\"boolean\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "val = \"a,b,  guido\"\n",
    "val.split(\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "pieces = [x.strip() for x in val.split(\",\")]\n",
    "pieces"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "first, second, third = pieces\n",
    "first + \"::\" + second + \"::\" + third"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"::\".join(pieces)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"guido\" in val\n",
    "val.index(\",\")\n",
    "val.find(\":\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "val.index(\":\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "val.count(\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "val.replace(\",\", \"::\")\n",
    "val.replace(\",\", \"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "text = \"foo    bar\\t baz  \\tqux\"\n",
    "re.split(r\"\\s+\", text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "regex = re.compile(r\"\\s+\")\n",
    "regex.split(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "regex.findall(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"\"\"Dave dave@google.com\n",
    "Steve steve@gmail.com\n",
    "Rob rob@gmail.com\n",
    "Ryan ryan@yahoo.com\"\"\"\n",
    "pattern = r\"[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}\"\n",
    "\n",
    "# re.IGNORECASE makes the regex case insensitive\n",
    "regex = re.compile(pattern, flags=re.IGNORECASE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "regex.findall(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "m = regex.search(text)\n",
    "m\n",
    "text[m.start():m.end()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(regex.match(text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(regex.sub(\"REDACTED\", text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "pattern = r\"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})\"\n",
    "regex = re.compile(pattern, flags=re.IGNORECASE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "m = regex.match(\"wesm@bright.net\")\n",
    "m.groups()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "regex.findall(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(regex.sub(r\"Username: \\1, Domain: \\2, Suffix: \\3\", text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {\"Dave\": \"dave@google.com\", \"Steve\": \"steve@gmail.com\",\n",
    "        \"Rob\": \"rob@gmail.com\", \"Wes\": np.nan}\n",
    "data = pd.Series(data)\n",
    "data\n",
    "data.isna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.str.contains(\"gmail\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_as_string_ext = data.astype('string')\n",
    "data_as_string_ext\n",
    "data_as_string_ext.str.contains(\"gmail\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "pattern = r\"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})\"\n",
    "data.str.findall(pattern, flags=re.IGNORECASE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]\n",
    "matches\n",
    "matches.str.get(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.str[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.str.extract(pattern, flags=re.IGNORECASE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "values = pd.Series(['apple', 'orange', 'apple',\n",
    "                    'apple'] * 2)\n",
    "values\n",
    "pd.unique(values)\n",
    "pd.value_counts(values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "values = pd.Series([0, 1, 0, 0] * 2)\n",
    "dim = pd.Series(['apple', 'orange'])\n",
    "values\n",
    "dim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "dim.take(values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "fruits = ['apple', 'orange', 'apple', 'apple'] * 2\n",
    "N = len(fruits)\n",
    "rng = np.random.default_rng(seed=12345)\n",
    "df = pd.DataFrame({'fruit': fruits,\n",
    "                   'basket_id': np.arange(N),\n",
    "                   'count': rng.integers(3, 15, size=N),\n",
    "                   'weight': rng.uniform(0, 4, size=N)},\n",
    "                  columns=['basket_id', 'fruit', 'count', 'weight'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "fruit_cat = df['fruit'].astype('category')\n",
    "fruit_cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = fruit_cat.array\n",
    "type(c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "c.categories\n",
    "c.codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict(enumerate(c.categories))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['fruit'] = df['fruit'].astype('category')\n",
    "df[\"fruit\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])\n",
    "my_categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "categories = ['foo', 'bar', 'baz']\n",
    "codes = [0, 1, 2, 0, 0, 1]\n",
    "my_cats_2 = pd.Categorical.from_codes(codes, categories)\n",
    "my_cats_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "ordered_cat = pd.Categorical.from_codes(codes, categories,\n",
    "                                        ordered=True)\n",
    "ordered_cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_cats_2.as_ordered()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "rng = np.random.default_rng(seed=12345)\n",
    "draws = rng.standard_normal(1000)\n",
    "draws[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "bins = pd.qcut(draws, 4)\n",
    "bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])\n",
    "bins\n",
    "bins.codes[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "bins = pd.Series(bins, name='quartile')\n",
    "results = (pd.Series(draws)\n",
    "           .groupby(bins)\n",
    "           .agg(['count', 'min', 'max'])\n",
    "           .reset_index())\n",
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "results['quartile']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = 10_000_000\n",
    "labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "categories = labels.astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels.memory_usage(deep=True)\n",
    "categories.memory_usage(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time _ = labels.astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "%timeit labels.value_counts()\n",
    "%timeit categories.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = pd.Series(['a', 'b', 'c', 'd'] * 2)\n",
    "cat_s = s.astype('category')\n",
    "cat_s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_s.cat.codes\n",
    "cat_s.cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "actual_categories = ['a', 'b', 'c', 'd', 'e']\n",
    "cat_s2 = cat_s.cat.set_categories(actual_categories)\n",
    "cat_s2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_s.value_counts()\n",
    "cat_s2.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_s3 = cat_s[cat_s.isin(['a', 'b'])]\n",
    "cat_s3\n",
    "cat_s3.cat.remove_unused_categories()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.get_dummies(cat_s, dtype=float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}