{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Iris Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> 📢: **This document was used during early development of siuba. See the [select docs](https://siuba.readthedocs.io/en/latest/api_table_core/03_select.html).**\n",
"\n",
"Many different ways of selecting columns from the iris dataset. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from siuba import *\n",
"import pandas as pd\n",
"\n",
"pd.set_option('max_rows', 5)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"## Rather than import the iris data from sklearn, I am just including the\n",
"## first 5 rows.\n",
"\n",
"# from sklearn import datasets\n",
"# iris = datasets.load_iris()\n",
"# df_iris = pd.DataFrame(iris.data, columns = iris.feature_names)\n",
"# df_iris['species'] = iris.target_names[iris.target]\n",
"\n",
"df_iris = pd.DataFrame({\n",
" 'sepal length (cm)': [5.1, 4.9, 4.7, 4.6, 5.0],\n",
" 'sepal width (cm)': [3.5, 3.0, 3.2, 3.1, 3.6],\n",
" 'petal length (cm)': [1.4, 1.4, 1.3, 1.5, 1.4],\n",
" 'petal width (cm)': [0.2, 0.2, 0.2, 0.2, 0.2],\n",
" 'species': ['setosa', 'setosa', 'setosa', 'setosa', 'setosa']\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal width (cm) | \n",
" petal length (cm) | \n",
" petal width (cm) | \n",
" species | \n",
" sepal length (cm) | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 3.5 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
" 5.1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
" 4.9 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
" setosa | \n",
" 4.7 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3.1 | \n",
" 1.5 | \n",
" 0.2 | \n",
" setosa | \n",
" 4.6 | \n",
"
\n",
" \n",
" | 4 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
" 5.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sepal width (cm) petal length (cm) petal width (cm) species \\\n",
"0 3.5 1.4 0.2 setosa \n",
"1 3.0 1.4 0.2 setosa \n",
"2 3.2 1.3 0.2 setosa \n",
"3 3.1 1.5 0.2 setosa \n",
"4 3.6 1.4 0.2 setosa \n",
"\n",
" sepal length (cm) \n",
"0 5.1 \n",
"1 4.9 \n",
"2 4.7 \n",
"3 4.6 \n",
"4 5.0 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get sepal columns\n",
"select(df_iris, _.startswith(\"sepal\"))\n",
"\n",
"# get width measure columns\n",
"# note method calls sent to df_iris.columns.str\n",
"# so _.endswith(\"...\") is equivalent to df_iris.columns.str.endswith(\"...\")\n",
"select(df_iris, _.endswith(\"width (cm)\"))\n",
"\n",
"# movie species to front\n",
"# _.endswith(\"\") is a hack to get everything\n",
"select(df_iris, _.species, _.endswith(\"\"))\n",
"\n",
"# move sepal length to the back\n",
"# first select all variables except Sepal.Length, then re select Sepal.Length\n",
"select(df_iris, -_[\"sepal length (cm)\"], _[\"sepal length (cm)\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wide table"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" V2 | \n",
" V3 | \n",
" V6 | \n",
" V0 | \n",
" V8 | \n",
" V7 | \n",
" V4 | \n",
" V1 | \n",
" V5 | \n",
" V9 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.226851 | \n",
" 0.551315 | \n",
" 0.980764 | \n",
" 0.696469 | \n",
" 0.480932 | \n",
" 0.684830 | \n",
" 0.719469 | \n",
" 0.286139 | \n",
" 0.423106 | \n",
" 0.392118 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0.438572 | \n",
" 0.059678 | \n",
" 0.182492 | \n",
" 0.343178 | \n",
" 0.531551 | \n",
" 0.175452 | \n",
" 0.398044 | \n",
" 0.729050 | \n",
" 0.737995 | \n",
" 0.531828 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 8 | \n",
" 0.554383 | \n",
" 0.388951 | \n",
" 0.357398 | \n",
" 0.318766 | \n",
" 0.304768 | \n",
" 0.043591 | \n",
" 0.925132 | \n",
" 0.691970 | \n",
" 0.841670 | \n",
" 0.398186 | \n",
"
\n",
" \n",
" | 9 | \n",
" 0.355915 | \n",
" 0.762548 | \n",
" 0.151127 | \n",
" 0.704959 | \n",
" 0.240856 | \n",
" 0.398876 | \n",
" 0.593177 | \n",
" 0.995358 | \n",
" 0.691702 | \n",
" 0.343456 | \n",
"
\n",
" \n",
"
\n",
"
10 rows × 10 columns
\n",
"
"
],
"text/plain": [
" V2 V3 V6 V0 V8 V7 V4 \\\n",
"0 0.226851 0.551315 0.980764 0.696469 0.480932 0.684830 0.719469 \n",
"1 0.438572 0.059678 0.182492 0.343178 0.531551 0.175452 0.398044 \n",
".. ... ... ... ... ... ... ... \n",
"8 0.554383 0.388951 0.357398 0.318766 0.304768 0.043591 0.925132 \n",
"9 0.355915 0.762548 0.151127 0.704959 0.240856 0.398876 0.593177 \n",
"\n",
" V1 V5 V9 \n",
"0 0.286139 0.423106 0.392118 \n",
"1 0.729050 0.737995 0.531828 \n",
".. ... ... ... \n",
"8 0.691970 0.841670 0.398186 \n",
"9 0.995358 0.691702 0.343456 \n",
"\n",
"[10 rows x 10 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"from numpy.random import uniform, seed\n",
"\n",
"seed(123)\n",
"\n",
"df = pd.DataFrame(uniform(size = [10, 10]))\n",
"df = df[np.array([3, 4, 7, 1, 9, 8, 5, 2, 6, 10]) - 1]\n",
"df.columns = \"V\" + df.columns.astype(str)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" V3 | \n",
" V6 | \n",
" V0 | \n",
" V8 | \n",
" V7 | \n",
" V4 | \n",
" V1 | \n",
" V5 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.551315 | \n",
" 0.980764 | \n",
" 0.696469 | \n",
" 0.480932 | \n",
" 0.684830 | \n",
" 0.719469 | \n",
" 0.286139 | \n",
" 0.423106 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0.059678 | \n",
" 0.182492 | \n",
" 0.343178 | \n",
" 0.531551 | \n",
" 0.175452 | \n",
" 0.398044 | \n",
" 0.729050 | \n",
" 0.737995 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 8 | \n",
" 0.388951 | \n",
" 0.357398 | \n",
" 0.318766 | \n",
" 0.304768 | \n",
" 0.043591 | \n",
" 0.925132 | \n",
" 0.691970 | \n",
" 0.841670 | \n",
"
\n",
" \n",
" | 9 | \n",
" 0.762548 | \n",
" 0.151127 | \n",
" 0.704959 | \n",
" 0.240856 | \n",
" 0.398876 | \n",
" 0.593177 | \n",
" 0.995358 | \n",
" 0.691702 | \n",
"
\n",
" \n",
"
\n",
"
10 rows × 8 columns
\n",
"
"
],
"text/plain": [
" V3 V6 V0 V8 V7 V4 V1 \\\n",
"0 0.551315 0.980764 0.696469 0.480932 0.684830 0.719469 0.286139 \n",
"1 0.059678 0.182492 0.343178 0.531551 0.175452 0.398044 0.729050 \n",
".. ... ... ... ... ... ... ... \n",
"8 0.388951 0.357398 0.318766 0.304768 0.043591 0.925132 0.691970 \n",
"9 0.762548 0.151127 0.704959 0.240856 0.398876 0.593177 0.995358 \n",
"\n",
" V5 \n",
"0 0.423106 \n",
"1 0.737995 \n",
".. ... \n",
"8 0.841670 \n",
"9 0.691702 \n",
"\n",
"[10 rows x 8 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"select(df, _[\"V3\":\"V5\"])\n",
"#select(df, _[\"V5\":\"V3\"])\n",
"\n",
"# no num_range capability"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal length (cm) | \n",
" sepal width (cm) | \n",
" species | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 5.1 | \n",
" 3.5 | \n",
" setosa | \n",
"
\n",
" \n",
" | 1 | \n",
" 4.9 | \n",
" 3.0 | \n",
" setosa | \n",
"
\n",
" \n",
" | 2 | \n",
" 4.7 | \n",
" 3.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 3 | \n",
" 4.6 | \n",
" 3.1 | \n",
" setosa | \n",
"
\n",
" \n",
" | 4 | \n",
" 5.0 | \n",
" 3.6 | \n",
" setosa | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sepal length (cm) sepal width (cm) species\n",
"0 5.1 3.5 setosa\n",
"1 4.9 3.0 setosa\n",
"2 4.7 3.2 setosa\n",
"3 4.6 3.1 setosa\n",
"4 5.0 3.6 setosa"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# can exclude matches\n",
"select(df_iris, -_.startswith(\"petal\"))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal length (cm) | \n",
" sepal width (cm) | \n",
" petal_length | \n",
" petal width (cm) | \n",
" species | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 5.1 | \n",
" 3.5 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 1 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 2 | \n",
" 4.7 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 3 | \n",
" 4.6 | \n",
" 3.1 | \n",
" 1.5 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 4 | \n",
" 5.0 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sepal length (cm) sepal width (cm) petal_length petal width (cm) species\n",
"0 5.1 3.5 1.4 0.2 setosa\n",
"1 4.9 3.0 1.4 0.2 setosa\n",
"2 4.7 3.2 1.3 0.2 setosa\n",
"3 4.6 3.1 1.5 0.2 setosa\n",
"4 5.0 3.6 1.4 0.2 setosa"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# select can grab specific and rename columns\n",
"select(df_iris, _.petal_length == _[\"petal length (cm)\"])\n",
"\n",
"# rename leaves all columns\n",
"rename(df_iris, petal_length = \"petal length (cm)\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Uh--- TODO? should return error? In tidyverse does group rename\n",
"# pretty rare to see!\n",
"#select(df_iris, _.obs == _.startswith('s'))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}