{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Iris Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "> 📢: **This document was used during early development of siuba. See the [select docs](https://siuba.readthedocs.io/en/latest/api_table_core/03_select.html).**\n", "\n", "Many different ways of selecting columns from the iris dataset. " ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from siuba import *\n", "import pandas as pd\n", "\n", "pd.set_option('max_rows', 5)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "## Rather than import the iris data from sklearn, I am just including the\n", "## first 5 rows.\n", "\n", "# from sklearn import datasets\n", "# iris = datasets.load_iris()\n", "# df_iris = pd.DataFrame(iris.data, columns = iris.feature_names)\n", "# df_iris['species'] = iris.target_names[iris.target]\n", "\n", "df_iris = pd.DataFrame({\n", " 'sepal length (cm)': [5.1, 4.9, 4.7, 4.6, 5.0],\n", " 'sepal width (cm)': [3.5, 3.0, 3.2, 3.1, 3.6],\n", " 'petal length (cm)': [1.4, 1.4, 1.3, 1.5, 1.4],\n", " 'petal width (cm)': [0.2, 0.2, 0.2, 0.2, 0.2],\n", " 'species': ['setosa', 'setosa', 'setosa', 'setosa', 'setosa']\n", "})" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal width (cm)petal length (cm)petal width (cm)speciessepal length (cm)
03.51.40.2setosa5.1
13.01.40.2setosa4.9
23.21.30.2setosa4.7
33.11.50.2setosa4.6
43.61.40.2setosa5.0
\n", "
" ], "text/plain": [ " sepal width (cm) petal length (cm) petal width (cm) species \\\n", "0 3.5 1.4 0.2 setosa \n", "1 3.0 1.4 0.2 setosa \n", "2 3.2 1.3 0.2 setosa \n", "3 3.1 1.5 0.2 setosa \n", "4 3.6 1.4 0.2 setosa \n", "\n", " sepal length (cm) \n", "0 5.1 \n", "1 4.9 \n", "2 4.7 \n", "3 4.6 \n", "4 5.0 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get sepal columns\n", "select(df_iris, _.startswith(\"sepal\"))\n", "\n", "# get width measure columns\n", "# note method calls sent to df_iris.columns.str\n", "# so _.endswith(\"...\") is equivalent to df_iris.columns.str.endswith(\"...\")\n", "select(df_iris, _.endswith(\"width (cm)\"))\n", "\n", "# movie species to front\n", "# _.endswith(\"\") is a hack to get everything\n", "select(df_iris, _.species, _.endswith(\"\"))\n", "\n", "# move sepal length to the back\n", "# first select all variables except Sepal.Length, then re select Sepal.Length\n", "select(df_iris, -_[\"sepal length (cm)\"], _[\"sepal length (cm)\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Wide table" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
V2V3V6V0V8V7V4V1V5V9
00.2268510.5513150.9807640.6964690.4809320.6848300.7194690.2861390.4231060.392118
10.4385720.0596780.1824920.3431780.5315510.1754520.3980440.7290500.7379950.531828
.................................
80.5543830.3889510.3573980.3187660.3047680.0435910.9251320.6919700.8416700.398186
90.3559150.7625480.1511270.7049590.2408560.3988760.5931770.9953580.6917020.343456
\n", "

10 rows × 10 columns

\n", "
" ], "text/plain": [ " V2 V3 V6 V0 V8 V7 V4 \\\n", "0 0.226851 0.551315 0.980764 0.696469 0.480932 0.684830 0.719469 \n", "1 0.438572 0.059678 0.182492 0.343178 0.531551 0.175452 0.398044 \n", ".. ... ... ... ... ... ... ... \n", "8 0.554383 0.388951 0.357398 0.318766 0.304768 0.043591 0.925132 \n", "9 0.355915 0.762548 0.151127 0.704959 0.240856 0.398876 0.593177 \n", "\n", " V1 V5 V9 \n", "0 0.286139 0.423106 0.392118 \n", "1 0.729050 0.737995 0.531828 \n", ".. ... ... ... \n", "8 0.691970 0.841670 0.398186 \n", "9 0.995358 0.691702 0.343456 \n", "\n", "[10 rows x 10 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "from numpy.random import uniform, seed\n", "\n", "seed(123)\n", "\n", "df = pd.DataFrame(uniform(size = [10, 10]))\n", "df = df[np.array([3, 4, 7, 1, 9, 8, 5, 2, 6, 10]) - 1]\n", "df.columns = \"V\" + df.columns.astype(str)\n", "df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
V3V6V0V8V7V4V1V5
00.5513150.9807640.6964690.4809320.6848300.7194690.2861390.423106
10.0596780.1824920.3431780.5315510.1754520.3980440.7290500.737995
...........................
80.3889510.3573980.3187660.3047680.0435910.9251320.6919700.841670
90.7625480.1511270.7049590.2408560.3988760.5931770.9953580.691702
\n", "

10 rows × 8 columns

\n", "
" ], "text/plain": [ " V3 V6 V0 V8 V7 V4 V1 \\\n", "0 0.551315 0.980764 0.696469 0.480932 0.684830 0.719469 0.286139 \n", "1 0.059678 0.182492 0.343178 0.531551 0.175452 0.398044 0.729050 \n", ".. ... ... ... ... ... ... ... \n", "8 0.388951 0.357398 0.318766 0.304768 0.043591 0.925132 0.691970 \n", "9 0.762548 0.151127 0.704959 0.240856 0.398876 0.593177 0.995358 \n", "\n", " V5 \n", "0 0.423106 \n", "1 0.737995 \n", ".. ... \n", "8 0.841670 \n", "9 0.691702 \n", "\n", "[10 rows x 8 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "select(df, _[\"V3\":\"V5\"])\n", "#select(df, _[\"V5\":\"V3\"])\n", "\n", "# no num_range capability" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal length (cm)sepal width (cm)species
05.13.5setosa
14.93.0setosa
24.73.2setosa
34.63.1setosa
45.03.6setosa
\n", "
" ], "text/plain": [ " sepal length (cm) sepal width (cm) species\n", "0 5.1 3.5 setosa\n", "1 4.9 3.0 setosa\n", "2 4.7 3.2 setosa\n", "3 4.6 3.1 setosa\n", "4 5.0 3.6 setosa" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# can exclude matches\n", "select(df_iris, -_.startswith(\"petal\"))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal length (cm)sepal width (cm)petal_lengthpetal width (cm)species
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", "
" ], "text/plain": [ " sepal length (cm) sepal width (cm) petal_length petal width (cm) species\n", "0 5.1 3.5 1.4 0.2 setosa\n", "1 4.9 3.0 1.4 0.2 setosa\n", "2 4.7 3.2 1.3 0.2 setosa\n", "3 4.6 3.1 1.5 0.2 setosa\n", "4 5.0 3.6 1.4 0.2 setosa" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# select can grab specific and rename columns\n", "select(df_iris, _.petal_length == _[\"petal length (cm)\"])\n", "\n", "# rename leaves all columns\n", "rename(df_iris, petal_length = \"petal length (cm)\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Uh--- TODO? should return error? In tidyverse does group rename\n", "# pretty rare to see!\n", "#select(df_iris, _.obs == _.startswith('s'))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }