{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Advanced pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Categorical data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 apple\n", "1 orange\n", "2 banana\n", "3 apple\n", "4 orange\n", "5 banana\n", "6 apple\n", "7 orange\n", "8 banana\n", "dtype: object" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Repeat three fruit names three times: nine rows, only three distinct values\n", "fruit_names = [\"apple\", \"orange\", \"banana\"]\n", "values = pd.Series(fruit_names * 3)\n", "values" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['apple', 'orange', 'banana'], dtype=object)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The distinct values held in the Series\n", "values.unique()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "apple 3\n", "orange 3\n", "banana 3\n", "dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# How many times each distinct value occurs\n", "values.value_counts()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fruits
0apple
1orange
2banana
3apple
4orange
5banana
6apple
7orange
8banana
\n", "
" ], "text/plain": [ " fruits\n", "0 apple\n", "1 orange\n", "2 banana\n", "3 apple\n", "4 orange\n", "5 banana\n", "6 apple\n", "7 orange\n", "8 banana" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Wrap the Series in a one-column DataFrame named \"fruits\"\n", "df = pd.DataFrame(values, columns=[\"fruits\"])\n", "df" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# Keep the category-dtype Series and its backing Categorical under separate\n", "# names so re-running any later cell never sees the wrong kind of object.\n", "fruits_as_category = df[\"fruits\"].astype(\"category\")\n", "fruit_categorical = fruits_as_category.values" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[apple, orange, banana, apple, orange, banana, apple, orange, banana]\n", "Categories (3, object): [apple, banana, orange]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fruit_categorical" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pandas.core.categorical.Categorical" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# .values of a category-dtype Series is a pandas Categorical\n", "type(fruit_categorical)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['apple', 'banana', 'orange'], dtype='object')" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The distinct labels, stored once\n", "fruit_categorical.categories" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 2, 1, 0, 2, 1, 0, 2, 1], dtype=int8)" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One small integer per row, indexing into .categories\n", "fruit_categorical.codes" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": 
"3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }