{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## These are the follow-up code samples for the following blog post: [Pandas DataFrame by Example](http://queirozf.com/entries/pandas-dataframe-by-example)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'0.23.1'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.__version__" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
0john23iowa20
1mary78dc24
2peter22california00
3jeff19texas15
4bill45washington20
5lisa33dc10
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "0 john 23 iowa 2 0\n", "1 mary 78 dc 2 4\n", "2 peter 22 california 0 0\n", "3 jeff 19 texas 1 5\n", "4 bill 45 washington 2 0\n", "5 lisa 33 dc 1 0" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " 'name':['john','mary','peter','jeff','bill','lisa'],\n", " 'age':[23,78,22,19,45,33],\n", " 'state':['iowa','dc','california','texas','washington','dc'],\n", " 'num_children':[2,2,0,1,2,1],\n", " 'num_pets':[0,4,0,5,0,0]\n", "})\n", "# sorting columns\n", "df=df[['name','age','state','num_children','num_pets']]\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Select rows by position" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
0john23iowa20
1mary78dc24
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "0 john 23 iowa 2 0\n", "1 mary 78 dc 2 4" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# select the first 2 rows\n", "df.iloc[:2]" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
4bill45washington20
5lisa33dc10
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "4 bill 45 washington 2 0\n", "5 lisa 33 dc 1 0" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# select the last 2 rows\n", "df.iloc[-2:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Select rows by index value\n", "\n", "> compare this with `iloc` above" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
0john23iowa20
1mary78dc24
2peter22california00
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "0 john 23 iowa 2 0\n", "1 mary 78 dc 2 4\n", "2 peter 22 california 0 0" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# select rows up to and including the one\n", "# with index=2\n", "df.loc[:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Select rows based upon the value of columns" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
1mary78dc24
4bill45washington20
5lisa33dc10
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "1 mary 78 dc 2 4\n", "4 bill 45 washington 2 0\n", "5 lisa 33 dc 1 0" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# by a simple numeric condition\n", "df[df[\"age\"] > 30]" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
1mary78dc24
3jeff19texas15
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "1 mary 78 dc 2 4\n", "3 jeff 19 texas 1 5" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# comparing the value of two columns\n", "df[ df[\"num_pets\"] > df[ \"num_children\"] ]" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
1mary78dc24
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "1 mary 78 dc 2 4" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# using boolean AND\n", "df[ (df[\"age\"] > 40) & (df[\"num_pets\"] > 0) ]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## select columns starting with" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namenum_childrennum_pets
0john20
1mary24
2peter00
3jeff15
4bill20
5lisa10
\n", "
" ], "text/plain": [ " name num_children num_pets\n", "0 john 2 0\n", "1 mary 2 4\n", "2 peter 0 0\n", "3 jeff 1 5\n", "4 bill 2 0\n", "5 lisa 1 0" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[[colname for colname in df.columns if colname.startswith('n')]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## select all columns but one" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namestatenum_childrennum_pets
0johniowa20
1marydc24
2petercalifornia00
3jefftexas15
4billwashington20
5lisadc10
\n", "
" ], "text/plain": [ " name state num_children num_pets\n", "0 john iowa 2 0\n", "1 mary dc 2 4\n", "2 peter california 0 0\n", "3 jeff texas 1 5\n", "4 bill washington 2 0\n", "5 lisa dc 1 0" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop(['age'],axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Drop a column" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namestatenum_pets
0johniowa0
1marydc4
2petercalifornia0
3jefftexas5
4billwashington0
5lisadc0
\n", "
" ], "text/plain": [ " name state num_pets\n", "0 john iowa 0\n", "1 mary dc 4\n", "2 peter california 0\n", "3 jeff texas 5\n", "4 bill washington 0\n", "5 lisa dc 0" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop([\"age\",\"num_children\"],axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Apply a function to every column (as aggregates)\n", "\n", "> Using **numpy** vectorized functions for numerical values" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
age36.666667
num_pets1.500000
num_children1.333333
\n", "
" ], "text/plain": [ " 0\n", "age 36.666667\n", "num_pets 1.500000\n", "num_children 1.333333" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get the mean for each of the selected columns\n", "df[[\"age\",\"num_pets\",\"num_children\"]].apply(lambda row: np.mean(row),axis=0).to_frame()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Apply a function to every row (as aggregates)\n", "> Using **numpy** vectorized functions for numerical values" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
025
184
222
325
447
534
\n", "
" ], "text/plain": [ " 0\n", "0 25\n", "1 84\n", "2 22\n", "3 25\n", "4 47\n", "5 34" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sum columns age, num_pets and num_children for each row\n", "df[[\"age\",\"num_pets\",\"num_children\"]].apply(lambda row: np.sum(row),axis=1).to_frame()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Apply a function elementwise using apply" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
age
046
1156
244
338
490
566
\n", "
" ], "text/plain": [ " age\n", "0 46\n", "1 156\n", "2 44\n", "3 38\n", "4 90\n", "5 66" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[[\"age\"]].apply(lambda value: value*2)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
age
046
1156
244
338
490
566
\n", "
" ], "text/plain": [ " age\n", "0 46\n", "1 156\n", "2 44\n", "3 38\n", "4 90\n", "5 66" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# certain numerical functions can also be used:\n", "df[[\"age\"]] * 2" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
name
0JOHN
1MARY
2PETER
3JEFF
4BILL
5LISA
\n", "
" ], "text/plain": [ " name\n", "0 JOHN\n", "1 MARY\n", "2 PETER\n", "3 JEFF\n", "4 BILL\n", "5 LISA" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# also works for string values\n", "df[[\"name\"]].apply(lambda value: value.str.upper())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Apply a function elementwise using map\n", "\n", "> use `apply` for DataFrame objects and `map` for Series objects\n" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
name
0JOHN
1MARY
2PETER
3JEFF
4BILL
5LISA
\n", "
" ], "text/plain": [ " name\n", "0 JOHN\n", "1 MARY\n", "2 PETER\n", "3 JEFF\n", "4 BILL\n", "5 LISA" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['name'].map(lambda name: name.upper()).to_frame()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Add new columns based on old ones" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_petspets_and_children
0john23iowa202
1mary78dc246
2peter22california000
3jeff19texas156
4bill45washington202
5lisa33dc101
\n", "
" ], "text/plain": [ " name age state num_children num_pets pets_and_children\n", "0 john 23 iowa 2 0 2\n", "1 mary 78 dc 2 4 6\n", "2 peter 22 california 0 0 0\n", "3 jeff 19 texas 1 5 6\n", "4 bill 45 washington 2 0 2\n", "5 lisa 33 dc 1 0 1" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# simple sum of two columns\n", "df[\"pets_and_children\"] = df[\"num_pets\"] + df[\"num_children\"]\n", "df" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_petsname_uppercase
0john23iowa20JOHN
1mary78dc24MARY
2peter22california00PETER
3jeff19texas15JEFF
4bill45washington20BILL
5lisa33dc10LISA
\n", "
" ], "text/plain": [ " name age state num_children num_pets name_uppercase\n", "0 john 23 iowa 2 0 JOHN\n", "1 mary 78 dc 2 4 MARY\n", "2 peter 22 california 0 0 PETER\n", "3 jeff 19 texas 1 5 JEFF\n", "4 bill 45 washington 2 0 BILL\n", "5 lisa 33 dc 1 0 LISA" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " 'name':['john','mary','peter','jeff','bill','lisa'],\n", " 'age':[23,78,22,19,45,33],\n", " 'state':['iowa','dc','california','texas','washington','dc'],\n", " 'num_children':[2,2,0,1,2,1],\n", " 'num_pets':[0,4,0,5,0,0]\n", "})\n", "# sorting columns\n", "df=df[['name','age','state','num_children','num_pets']]\n", "df\n", "\n", "# you can also use custom functions we used on \"elementwise application\"\n", "df[\"name_uppercase\"] = df[[\"name\"]].apply(lambda name: name.str.upper())\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Shuffle rows" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
4bill45washington20
3jeff19texas15
2peter22california00
1mary78dc24
5lisa33dc10
0john23iowa20
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "4 bill 45 washington 2 0\n", "3 jeff 19 texas 1 5\n", "2 peter 22 california 0 0\n", "1 mary 78 dc 2 4\n", "5 lisa 33 dc 1 0\n", "0 john 23 iowa 2 0" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " 'name':['john','mary','peter','jeff','bill','lisa'],\n", " 'age':[23,78,22,19,45,33],\n", " 'state':['iowa','dc','california','texas','washington','dc'],\n", " 'num_children':[2,2,0,1,2,1],\n", " 'num_pets':[0,4,0,5,0,0]\n", "})\n", "# sorting columns\n", "df=df[['name','age','state','num_children','num_pets']]\n", "df\n", "\n", "df.reindex(np.random.permutation(df.index))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Iterate over all rows" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 has name: john\n", "1 has name: mary\n", "2 has name: peter\n", "3 has name: jeff\n", "4 has name: bill\n", "5 has name: lisa\n" ] } ], "source": [ "for index,row in df.iterrows():\n", " print(\"{0} has name: {1}\".format(index,row[\"name\"]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Randomly sample rows" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
2peter22california00
3jeff19texas15
1mary78dc24
5lisa33dc10
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "2 peter 22 california 0 0\n", "3 jeff 19 texas 1 5\n", "1 mary 78 dc 2 4\n", "5 lisa 33 dc 1 0" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sample 10 rows from df\n", "random_indices = np.random.choice(df.index.values, 4, replace=False)\n", "\n", "# iloc allows you to retrieve rows by their numeric indices\n", "sampled_df = df.iloc[random_indices]\n", "sampled_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sort a dataframe" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
1mary78dc24
4bill45washington20
5lisa33dc10
0john23iowa20
2peter22california00
3jeff19texas15
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "1 mary 78 dc 2 4\n", "4 bill 45 washington 2 0\n", "5 lisa 33 dc 1 0\n", "0 john 23 iowa 2 0\n", "2 peter 22 california 0 0\n", "3 jeff 19 texas 1 5" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sort by age, largest first\n", "df.sort_values(\"age\",ascending=False )" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
3jeff19texas15
1mary78dc24
2peter22california00
0john23iowa20
5lisa33dc10
4bill45washington20
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "3 jeff 19 texas 1 5\n", "1 mary 78 dc 2 4\n", "2 peter 22 california 0 0\n", "0 john 23 iowa 2 0\n", "5 lisa 33 dc 1 0\n", "4 bill 45 washington 2 0" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sort by num_pets descending then sort by age ascending\n", "df.sort_values( [\"num_pets\",\"age\"], ascending=[False,True] )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## custom sort" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestate
0john23N/A
1mary78dc
2peter22california
3jeff19texas
4bill12N/A
5lisa33dc
\n", "
" ], "text/plain": [ " name age state\n", "0 john 23 N/A\n", "1 mary 78 dc\n", "2 peter 22 california\n", "3 jeff 19 texas\n", "4 bill 12 N/A\n", "5 lisa 33 dc" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " 'name':['john','mary','peter','jeff','bill','lisa'],\n", " 'age':[23,78,22,19,12,33],\n", " 'state':['N/A','dc','california','texas','N/A','dc']\n", "})\n", "\n", "# sorting columns\n", "df=df[['name','age','state']]\n", "df" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestate
0jeff19texas
1peter22california
2lisa33dc
3mary78dc
4bill12N/A
5john23N/A
\n", "
" ], "text/plain": [ " name age state\n", "0 jeff 19 texas\n", "1 peter 22 california\n", "2 lisa 33 dc\n", "3 mary 78 dc\n", "4 bill 12 N/A\n", "5 john 23 N/A" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def state_to_rank(state):\n", " if state==\"N/A\":\n", " return 1\n", " else:\n", " return 0\n", " \n", "df['rank'] = df['state'].map(lambda x: state_to_rank(x))\n", "\n", "df.sort_values(by=['rank','age']).drop(['rank'],axis=1).reset_index(drop=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Perform complex selections using lambdas" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
0john23iowa20
3jeff19texas15
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "0 john 23 iowa 2 0\n", "3 jeff 19 texas 1 5" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.apply(lambda row: row['name'].startswith('j'),axis=1)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Change column names" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameage_yearsstatenum_childrennum_pets
0john23iowa20
1mary78dc24
2peter22california00
3jeff19texas15
4bill45washington20
5lisa33dc10
\n", "
" ], "text/plain": [ " name age_years state num_children num_pets\n", "0 john 23 iowa 2 0\n", "1 mary 78 dc 2 4\n", "2 peter 22 california 0 0\n", "3 jeff 19 texas 1 5\n", "4 bill 45 washington 2 0\n", "5 lisa 33 dc 1 0" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use inplace=True if you want to mutate the current dataframe\n", "df.rename(columns={\"age\":\"age_years\"} )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Change column dtype" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dtype('int64')" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['num_children'].dtype" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
0john23iowa20
1mary78dc24
2peter22california00
3jeff19texas15
4bill45washington20
5lisa33dc10
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "0 john 23 iowa 2 0\n", "1 mary 78 dc 2 4\n", "2 peter 22 california 0 0\n", "3 jeff 19 texas 1 5\n", "4 bill 45 washington 2 0\n", "5 lisa 33 dc 1 0" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we don't need 64 bits for num_children\n", "df['num_children'] = df['num_children'].astype('int32')\n", "df['num_children'].dtype\n", "\n", "# it looks the same but takes less space\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Veryfing that the dataframe includes specific values\n", "\n", "This is done using the `.isin()` method, which returns a **boolean** dataframe to indicate where the passed values are.\n" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
0FalseFalseFalseTrueFalse
1FalseFalseFalseTrueTrue
2FalseFalseFalseFalseFalse
3FalseFalseFalseFalseFalse
4FalseFalseFalseTrueFalse
5FalseFalseFalseFalseFalse
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "0 False False False True False\n", "1 False False False True True\n", "2 False False False False False\n", "3 False False False False False\n", "4 False False False True False\n", "5 False False False False False" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " 'name':['john','mary','peter','jeff','bill','lisa'],\n", " 'age':[23,78,22,19,45,33],\n", " 'state':['iowa','dc','california','texas','washington','dc'],\n", " 'num_children':[2,2,0,1,2,1],\n", " 'num_pets':[0,4,0,5,0,0]\n", "})\n", "# sorting columns\n", "df=df[['name','age','state','num_children','num_pets']]\n", "\n", "# if the method is passed a simple list, it matches\n", "# those values anywhere in the dataframe \n", "df.isin([2,4])" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagestatenum_childrennum_pets
0FalseFalseFalseFalseFalse
1FalseFalseFalseFalseTrue
2FalseFalseFalseFalseFalse
3FalseFalseFalseFalseTrue
4FalseFalseFalseFalseFalse
5FalseFalseFalseFalseFalse
\n", "
" ], "text/plain": [ " name age state num_children num_pets\n", "0 False False False False False\n", "1 False False False False True\n", "2 False False False False False\n", "3 False False False False True\n", "4 False False False False False\n", "5 False False False False False" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# you can also pass a dict or another dataframe\n", "# as argument\n", "df.isin({'num_pets':[4,5]})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create an empty Dataframe and append rows one by one" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
col_acol_b
05.010
11.0100
\n", "
" ], "text/plain": [ " col_a col_b\n", "0 5.0 10\n", "1 1.0 100" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set column names and dtypes\n", "new_df = pd.DataFrame(columns=['col_a','col_b']).astype({'col_a':'float32', 'col_b':'int8'})\n", "\n", "\n", "# must reassign since the append method does not work in place\n", "new_df = new_df.append({'col_a':5,'col_b':10}, ignore_index=True)\n", "new_df = new_df.append({'col_a':1,'col_b':100}, ignore_index=True)\n", "\n", "new_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create from list of dicts" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idname
01john
12mary
23peter
\n", "
" ], "text/plain": [ " id name\n", "0 1 john\n", "1 2 mary\n", "2 3 peter" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_df = pd.DataFrame(columns=['id','name'])\n", "\n", "data_dict = [\n", " {'id':1,'name':\"john\"},\n", " {'id':2,'name':\"mary\"},\n", " {'id':3,'name':\"peter\"}\n", "]\n", "\n", "# must reassign since the append method does not work in place\n", "\n", "new_df = new_df.from_records(data_dict)\n", "new_df" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## converting types" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
date_of_birthname
027/05/2002john
110/10/1999mary
201/04/1985peter
\n", "
" ], "text/plain": [ " date_of_birth name\n", "0 27/05/2002 john\n", "1 10/10/1999 mary\n", "2 01/04/1985 peter" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " 'name':['john','mary','peter'],\n", " \"date_of_birth\": ['27/05/2002','10/10/1999','01/04/1985']\n", "})\n", "df" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
date_of_birthname
02002-05-27john
11999-10-10mary
21985-04-01peter
\n", "
" ], "text/plain": [ " date_of_birth name\n", "0 2002-05-27 john\n", "1 1999-10-10 mary\n", "2 1985-04-01 peter" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['date_of_birth']=pd.to_datetime(df['date_of_birth'],format='%d/%m/%Y')\n", "df" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "date_of_birth datetime64[ns]\n", "name object\n", "dtype: object" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Global TF Kernel (Python 3)", "language": "python", "name": "global-tf-python-3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }