{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n", "pd.options.display.max_rows = 25\n", "pd.options.display.max_columns = 20\n", "pd.options.display.max_colwidth = 82\n", "np.random.seed(12345)\n", "import matplotlib.pyplot as plt\n", "plt.rc(\"figure\", figsize=(10, 6))\n", "np.set_printoptions(precision=4, suppress=True)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "float_data = pd.Series([1.2, -3.5, np.nan, 0])\n", "float_data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "float_data.isna()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "string_data = pd.Series([\"aardvark\", np.nan, None, \"avocado\"])\n", "string_data\n", "string_data.isna()\n", "float_data = pd.Series([1, 2, None], dtype='float64')\n", "float_data\n", "float_data.isna()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "data = pd.Series([1, np.nan, 3.5, np.nan, 7])\n", "data.dropna()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "data[data.notna()]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],\n", " [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])\n", "data\n", "data.dropna()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "data.dropna(how=\"all\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "data[4] = np.nan\n", "data\n", "data.dropna(axis=\"columns\", how=\"all\")" ] }, { 
"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(np.random.standard_normal((7, 3)))\n", "df.iloc[:4, 1] = np.nan\n", "df.iloc[:2, 2] = np.nan\n", "df\n", "df.dropna()\n", "df.dropna(thresh=2)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "df.fillna(0)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "df.fillna({1: 0.5, 2: 0})" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(np.random.standard_normal((6, 3)))\n", "df.iloc[2:, 1] = np.nan\n", "df.iloc[4:, 2] = np.nan\n", "df\n", "# Forward fill with ffill(); fillna(method=\"ffill\") is deprecated in recent pandas\n", "df.ffill()\n", "df.ffill(limit=2)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "data = pd.Series([1., np.nan, 3.5, np.nan, 7])\n", "data.fillna(data.mean())" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "data = pd.DataFrame({\"k1\": [\"one\", \"two\"] * 3 + [\"two\"],\n", " \"k2\": [1, 1, 2, 3, 3, 4, 4]})\n", "data" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "data.duplicated()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "data.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "data[\"v1\"] = range(7)\n", "data\n", "data.drop_duplicates(subset=[\"k1\"])" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "data.drop_duplicates([\"k1\", \"k2\"], keep=\"last\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "data = pd.DataFrame({\"food\": [\"bacon\", \"pulled pork\", \"bacon\",\n", " \"pastrami\", \"corned beef\", \"bacon\",\n", " \"pastrami\", \"honey ham\", \"nova lox\"],\n", " \"ounces\": [4, 3, 
12, 6, 7.5, 8, 3, 5, 6]})\n", "data" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "meat_to_animal = {\n", " \"bacon\": \"pig\",\n", " \"pulled pork\": \"pig\",\n", " \"pastrami\": \"cow\",\n", " \"corned beef\": \"cow\",\n", " \"honey ham\": \"pig\",\n", " \"nova lox\": \"salmon\"\n", "}" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "data[\"animal\"] = data[\"food\"].map(meat_to_animal)\n", "data" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "def get_animal(x):\n", " return meat_to_animal[x]\n", "data[\"food\"].map(get_animal)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "data = pd.Series([1., -999., 2., -999., -1000., 3.])\n", "data" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "data.replace(-999, np.nan)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "data.replace([-999, -1000], np.nan)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "data.replace([-999, -1000], [np.nan, 0])" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "data.replace({-999: np.nan, -1000: 0})" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "data = pd.DataFrame(np.arange(12).reshape((3, 4)),\n", " index=[\"Ohio\", \"Colorado\", \"New York\"],\n", " columns=[\"one\", \"two\", \"three\", \"four\"])" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "def transform(x):\n", " return x[:4].upper()\n", "\n", "data.index.map(transform)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "data.index = data.index.map(transform)\n", "data" ] }, { "cell_type": "code", "execution_count": 33, 
"metadata": {}, "outputs": [], "source": [ "data.rename(index=str.title, columns=str.upper)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "data.rename(index={\"OHIO\": \"INDIANA\"},\n", " columns={\"three\": \"peekaboo\"})" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "bins = [18, 25, 35, 60, 100]\n", "age_categories = pd.cut(ages, bins)\n", "age_categories" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "age_categories.codes\n", "age_categories.categories\n", "age_categories.categories[0]\n", "# Top-level pd.value_counts is deprecated; wrap in a Series to keep the count-sorted result\n", "pd.Series(age_categories).value_counts()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "pd.cut(ages, bins, right=False)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "group_names = [\"Youth\", \"YoungAdult\", \"MiddleAged\", \"Senior\"]\n", "pd.cut(ages, bins, labels=group_names)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "data = np.random.uniform(size=20)\n", "pd.cut(data, 4, precision=2)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "data = np.random.standard_normal(1000)\n", "quartiles = pd.qcut(data, 4, precision=2)\n", "quartiles\n", "# Top-level pd.value_counts is deprecated; wrap in a Series to keep the count-sorted result\n", "pd.Series(quartiles).value_counts()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "data = pd.DataFrame(np.random.standard_normal((1000, 4)))\n", "data.describe()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "col = data[2]\n", 
"col[col.abs() > 3]" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "data[(data.abs() > 3).any(axis=\"columns\")]" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "data[data.abs() > 3] = np.sign(data) * 3\n", "data.describe()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "np.sign(data).head()" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))\n", "df\n", "sampler = np.random.permutation(5)\n", "sampler" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "df.take(sampler)\n", "df.iloc[sampler]" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "column_sampler = np.random.permutation(7)\n", "column_sampler\n", "df.take(column_sampler, axis=\"columns\")" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "df.sample(n=3)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "choices = pd.Series([5, 7, -1, 6, 4])\n", "choices.sample(n=10, replace=True)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame({\"key\": [\"b\", \"b\", \"a\", \"c\", \"a\", \"b\"],\n", " \"data1\": range(6)})\n", "df\n", "pd.get_dummies(df[\"key\"], dtype=float)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "dummies = pd.get_dummies(df[\"key\"], prefix=\"key\", dtype=float)\n", "df_with_dummy = df[[\"data1\"]].join(dummies)\n", "df_with_dummy" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "mnames = [\"movie_id\", \"title\", \"genres\"]\n", "movies = pd.read_table(\"datasets/movielens/movies.dat\", sep=\"::\",\n", " header=None, 
names=mnames, engine=\"python\")\n", "movies[:10]" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "dummies = movies[\"genres\"].str.get_dummies(\"|\")\n", "dummies.iloc[:10, :6]" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "movies_windic = movies.join(dummies.add_prefix(\"Genre_\"))\n", "movies_windic.iloc[0]" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "np.random.seed(12345) # to make the example repeatable\n", "values = np.random.uniform(size=10)\n", "values\n", "bins = [0, 0.2, 0.4, 0.6, 0.8, 1]\n", "# dtype=float keeps the indicator columns numeric, matching the other get_dummies calls here\n", "pd.get_dummies(pd.cut(values, bins), dtype=float)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "s = pd.Series([1, 2, 3, None])\n", "s\n", "s.dtype" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "s = pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())\n", "s\n", "s.isna()\n", "s.dtype" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "s[3]\n", "s[3] is pd.NA" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "s = pd.Series([1, 2, 3, None], dtype=\"Int64\")" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "s = pd.Series(['one', 'two', None, 'three'], dtype=pd.StringDtype())\n", "s" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame({\"A\": [1, 2, None, 4],\n", " \"B\": [\"one\", \"two\", \"three\", None],\n", " \"C\": [False, None, False, True]})\n", "df\n", "df[\"A\"] = df[\"A\"].astype(\"Int64\")\n", "df[\"B\"] = df[\"B\"].astype(\"string\")\n", "df[\"C\"] = df[\"C\"].astype(\"boolean\")\n", "df" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "val = \"a,b, guido\"\n", "val.split(\",\")" ] }, { 
"cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "pieces = [x.strip() for x in val.split(\",\")]\n", "pieces" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "first, second, third = pieces\n", "first + \"::\" + second + \"::\" + third" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "\"::\".join(pieces)" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "\"guido\" in val\n", "val.index(\",\")\n", "val.find(\":\")" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "# Unlike str.find (which returns -1), str.index raises ValueError when the\n", "# substring is absent; catch it so the notebook still runs top to bottom.\n", "try:\n", "    val.index(\":\")\n", "except ValueError as exc:\n", "    print(f\"ValueError: {exc}\")" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "val.count(\",\")" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "val.replace(\",\", \"::\")\n", "val.replace(\",\", \"\")" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "import re\n", "text = \"foo bar\\t baz \\tqux\"\n", "re.split(r\"\\s+\", text)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "regex = re.compile(r\"\\s+\")\n", "regex.split(text)" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "regex.findall(text)" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "text = \"\"\"Dave dave@google.com\n", "Steve steve@gmail.com\n", "Rob rob@gmail.com\n", "Ryan ryan@yahoo.com\"\"\"\n", "pattern = r\"[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}\"\n", "\n", "# re.IGNORECASE makes the regex case insensitive\n", "regex = re.compile(pattern, flags=re.IGNORECASE)" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "regex.findall(text)" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], 
"source": [ "m = regex.search(text)\n", "m\n", "text[m.start():m.end()]" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "print(regex.match(text))" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "print(regex.sub(\"REDACTED\", text))" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "pattern = r\"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})\"\n", "regex = re.compile(pattern, flags=re.IGNORECASE)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "m = regex.match(\"wesm@bright.net\")\n", "m.groups()" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "regex.findall(text)" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "print(regex.sub(r\"Username: \\1, Domain: \\2, Suffix: \\3\", text))" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "data = {\"Dave\": \"dave@google.com\", \"Steve\": \"steve@gmail.com\",\n", " \"Rob\": \"rob@gmail.com\", \"Wes\": np.nan}\n", "data = pd.Series(data)\n", "data\n", "data.isna()" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "data.str.contains(\"gmail\")" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "data_as_string_ext = data.astype('string')\n", "data_as_string_ext\n", "data_as_string_ext.str.contains(\"gmail\")" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "pattern = r\"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})\"\n", "data.str.findall(pattern, flags=re.IGNORECASE)" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]\n", "matches\n", "matches.str.get(1)" ] }, { 
"cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "data.str[:5]" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "data.str.extract(pattern, flags=re.IGNORECASE)" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [], "source": [ "values = pd.Series(['apple', 'orange', 'apple',\n", " 'apple'] * 2)\n", "values\n", "pd.unique(values)\n", "# Series.value_counts replaces the deprecated top-level pd.value_counts\n", "values.value_counts()" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "values = pd.Series([0, 1, 0, 0] * 2)\n", "dim = pd.Series(['apple', 'orange'])\n", "values\n", "dim" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "dim.take(values)" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "fruits = ['apple', 'orange', 'apple', 'apple'] * 2\n", "N = len(fruits)\n", "rng = np.random.default_rng(seed=12345)\n", "df = pd.DataFrame({'fruit': fruits,\n", " 'basket_id': np.arange(N),\n", " 'count': rng.integers(3, 15, size=N),\n", " 'weight': rng.uniform(0, 4, size=N)},\n", " columns=['basket_id', 'fruit', 'count', 'weight'])\n", "df" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "fruit_cat = df['fruit'].astype('category')\n", "fruit_cat" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [], "source": [ "c = fruit_cat.array\n", "type(c)" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "c.categories\n", "c.codes" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "dict(enumerate(c.categories))" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [ "df['fruit'] = df['fruit'].astype('category')\n", "df[\"fruit\"]" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [], 
"source": [ "my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])\n", "my_categories" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "categories = ['foo', 'bar', 'baz']\n", "codes = [0, 1, 2, 0, 0, 1]\n", "my_cats_2 = pd.Categorical.from_codes(codes, categories)\n", "my_cats_2" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [], "source": [ "ordered_cat = pd.Categorical.from_codes(codes, categories,\n", " ordered=True)\n", "ordered_cat" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "my_cats_2.as_ordered()" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [], "source": [ "rng = np.random.default_rng(seed=12345)\n", "draws = rng.standard_normal(1000)\n", "draws[:5]" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [], "source": [ "bins = pd.qcut(draws, 4)\n", "bins" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [], "source": [ "bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])\n", "bins\n", "bins.codes[:10]" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [], "source": [ "bins = pd.Series(bins, name='quartile')\n", "# Pass observed explicitly: grouping by a categorical key without it warns in\n", "# recent pandas because the default is changing; False keeps the current result.\n", "results = (pd.Series(draws)\n", " .groupby(bins, observed=False)\n", " .agg(['count', 'min', 'max'])\n", " .reset_index())\n", "results" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [], "source": [ "results['quartile']" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [], "source": [ "N = 10_000_000\n", "labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [], "source": [ "categories = labels.astype('category')" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [], "source": [ "labels.memory_usage(deep=True)\n", 
"categories.memory_usage(deep=True)" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [], "source": [ "%time _ = labels.astype('category')" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [], "source": [ "%timeit labels.value_counts()\n", "%timeit categories.value_counts()" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [], "source": [ "s = pd.Series(['a', 'b', 'c', 'd'] * 2)\n", "cat_s = s.astype('category')\n", "cat_s" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [], "source": [ "cat_s.cat.codes\n", "cat_s.cat.categories" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [], "source": [ "actual_categories = ['a', 'b', 'c', 'd', 'e']\n", "cat_s2 = cat_s.cat.set_categories(actual_categories)\n", "cat_s2" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [], "source": [ "cat_s.value_counts()\n", "cat_s2.value_counts()" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "cat_s3 = cat_s[cat_s.isin(['a', 'b'])]\n", "cat_s3\n", "cat_s3.cat.remove_unused_categories()" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [], "source": [ "cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [], "source": [ "pd.get_dummies(cat_s, dtype=float)" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [], "source": [ "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }