{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## These are the follow-up code samples for the following blog post: [Pandas DataFrame by Example](http://queirozf.com/entries/pandas-dataframe-by-example)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'0.23.1'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.__version__"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" john | \n",
" 23 | \n",
" iowa | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 2 | \n",
" peter | \n",
" 22 | \n",
" california | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" 19 | \n",
" texas | \n",
" 1 | \n",
" 5 | \n",
"
\n",
" \n",
" 4 | \n",
" bill | \n",
" 45 | \n",
" washington | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" 33 | \n",
" dc | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"0 john 23 iowa 2 0\n",
"1 mary 78 dc 2 4\n",
"2 peter 22 california 0 0\n",
"3 jeff 19 texas 1 5\n",
"4 bill 45 washington 2 0\n",
"5 lisa 33 dc 1 0"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" 'name':['john','mary','peter','jeff','bill','lisa'],\n",
" 'age':[23,78,22,19,45,33],\n",
" 'state':['iowa','dc','california','texas','washington','dc'],\n",
" 'num_children':[2,2,0,1,2,1],\n",
" 'num_pets':[0,4,0,5,0,0]\n",
"})\n",
"# sorting columns\n",
"df=df[['name','age','state','num_children','num_pets']]\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Select rows by position"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" john | \n",
" 23 | \n",
" iowa | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"0 john 23 iowa 2 0\n",
"1 mary 78 dc 2 4"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# select the first 2 rows\n",
"df.iloc[:2]"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 4 | \n",
" bill | \n",
" 45 | \n",
" washington | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" 33 | \n",
" dc | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"4 bill 45 washington 2 0\n",
"5 lisa 33 dc 1 0"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# select the last 2 rows\n",
"df.iloc[-2:]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Select rows by index value\n",
"\n",
"> compare this with `iloc` above"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" john | \n",
" 23 | \n",
" iowa | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 2 | \n",
" peter | \n",
" 22 | \n",
" california | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"0 john 23 iowa 2 0\n",
"1 mary 78 dc 2 4\n",
"2 peter 22 california 0 0"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# select rows up to and including the one\n",
"# with index=2\n",
"df.loc[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Select rows based upon the value of columns"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" bill | \n",
" 45 | \n",
" washington | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" 33 | \n",
" dc | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"1 mary 78 dc 2 4\n",
"4 bill 45 washington 2 0\n",
"5 lisa 33 dc 1 0"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# by a simple numeric condition\n",
"df[df[\"age\"] > 30]"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" 19 | \n",
" texas | \n",
" 1 | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"1 mary 78 dc 2 4\n",
"3 jeff 19 texas 1 5"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# comparing the value of two columns\n",
"df[ df[\"num_pets\"] > df[ \"num_children\"] ]"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"1 mary 78 dc 2 4"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# using boolean AND\n",
"df[ (df[\"age\"] > 40) & (df[\"num_pets\"] > 0) ]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## select columns starting with"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" john | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 2 | \n",
" peter | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" 1 | \n",
" 5 | \n",
"
\n",
" \n",
" 4 | \n",
" bill | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name num_children num_pets\n",
"0 john 2 0\n",
"1 mary 2 4\n",
"2 peter 0 0\n",
"3 jeff 1 5\n",
"4 bill 2 0\n",
"5 lisa 1 0"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[[colname for colname in df.columns if colname.startswith('n')]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## select all columns but one"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" john | \n",
" iowa | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 2 | \n",
" peter | \n",
" california | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" texas | \n",
" 1 | \n",
" 5 | \n",
"
\n",
" \n",
" 4 | \n",
" bill | \n",
" washington | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" dc | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name state num_children num_pets\n",
"0 john iowa 2 0\n",
"1 mary dc 2 4\n",
"2 peter california 0 0\n",
"3 jeff texas 1 5\n",
"4 bill washington 2 0\n",
"5 lisa dc 1 0"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.drop(['age'],axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Drop a column"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" state | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" john | \n",
" iowa | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" dc | \n",
" 4 | \n",
"
\n",
" \n",
" 2 | \n",
" peter | \n",
" california | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" texas | \n",
" 5 | \n",
"
\n",
" \n",
" 4 | \n",
" bill | \n",
" washington | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" dc | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name state num_pets\n",
"0 john iowa 0\n",
"1 mary dc 4\n",
"2 peter california 0\n",
"3 jeff texas 5\n",
"4 bill washington 0\n",
"5 lisa dc 0"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.drop([\"age\",\"num_children\"],axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Apply a function to every column (as aggregates)\n",
"\n",
"> Using **numpy** vectorized functions for numerical values"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
"
\n",
" \n",
" \n",
" \n",
" age | \n",
" 36.666667 | \n",
"
\n",
" \n",
" num_pets | \n",
" 1.500000 | \n",
"
\n",
" \n",
" num_children | \n",
" 1.333333 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0\n",
"age 36.666667\n",
"num_pets 1.500000\n",
"num_children 1.333333"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get the mean for each of the selected columns\n",
"df[[\"age\",\"num_pets\",\"num_children\"]].apply(lambda row: np.mean(row),axis=0).to_frame()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Apply a function to every row (as aggregates)\n",
"> Using **numpy** vectorized functions for numerical values"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 25 | \n",
"
\n",
" \n",
" 1 | \n",
" 84 | \n",
"
\n",
" \n",
" 2 | \n",
" 22 | \n",
"
\n",
" \n",
" 3 | \n",
" 25 | \n",
"
\n",
" \n",
" 4 | \n",
" 47 | \n",
"
\n",
" \n",
" 5 | \n",
" 34 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0\n",
"0 25\n",
"1 84\n",
"2 22\n",
"3 25\n",
"4 47\n",
"5 34"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# sum columns age, num_pets and num_children for each row\n",
"df[[\"age\",\"num_pets\",\"num_children\"]].apply(lambda row: np.sum(row),axis=1).to_frame()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Apply a function elementwise using apply"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 46 | \n",
"
\n",
" \n",
" 1 | \n",
" 156 | \n",
"
\n",
" \n",
" 2 | \n",
" 44 | \n",
"
\n",
" \n",
" 3 | \n",
" 38 | \n",
"
\n",
" \n",
" 4 | \n",
" 90 | \n",
"
\n",
" \n",
" 5 | \n",
" 66 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age\n",
"0 46\n",
"1 156\n",
"2 44\n",
"3 38\n",
"4 90\n",
"5 66"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[[\"age\"]].apply(lambda value: value*2)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 46 | \n",
"
\n",
" \n",
" 1 | \n",
" 156 | \n",
"
\n",
" \n",
" 2 | \n",
" 44 | \n",
"
\n",
" \n",
" 3 | \n",
" 38 | \n",
"
\n",
" \n",
" 4 | \n",
" 90 | \n",
"
\n",
" \n",
" 5 | \n",
" 66 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age\n",
"0 46\n",
"1 156\n",
"2 44\n",
"3 38\n",
"4 90\n",
"5 66"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# certain numerical functions can also be used:\n",
"df[[\"age\"]] * 2"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" JOHN | \n",
"
\n",
" \n",
" 1 | \n",
" MARY | \n",
"
\n",
" \n",
" 2 | \n",
" PETER | \n",
"
\n",
" \n",
" 3 | \n",
" JEFF | \n",
"
\n",
" \n",
" 4 | \n",
" BILL | \n",
"
\n",
" \n",
" 5 | \n",
" LISA | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name\n",
"0 JOHN\n",
"1 MARY\n",
"2 PETER\n",
"3 JEFF\n",
"4 BILL\n",
"5 LISA"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# also works for string values\n",
"df[[\"name\"]].apply(lambda value: value.str.upper())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Apply a function elementwise using map\n",
"\n",
"> use `apply` for DataFrame objects and `map` for Series objects\n"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" JOHN | \n",
"
\n",
" \n",
" 1 | \n",
" MARY | \n",
"
\n",
" \n",
" 2 | \n",
" PETER | \n",
"
\n",
" \n",
" 3 | \n",
" JEFF | \n",
"
\n",
" \n",
" 4 | \n",
" BILL | \n",
"
\n",
" \n",
" 5 | \n",
" LISA | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name\n",
"0 JOHN\n",
"1 MARY\n",
"2 PETER\n",
"3 JEFF\n",
"4 BILL\n",
"5 LISA"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['name'].map(lambda name: name.upper()).to_frame()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Add new columns based on old ones"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
" pets_and_children | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" john | \n",
" 23 | \n",
" iowa | \n",
" 2 | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
" 6 | \n",
"
\n",
" \n",
" 2 | \n",
" peter | \n",
" 22 | \n",
" california | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" 19 | \n",
" texas | \n",
" 1 | \n",
" 5 | \n",
" 6 | \n",
"
\n",
" \n",
" 4 | \n",
" bill | \n",
" 45 | \n",
" washington | \n",
" 2 | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" 33 | \n",
" dc | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets pets_and_children\n",
"0 john 23 iowa 2 0 2\n",
"1 mary 78 dc 2 4 6\n",
"2 peter 22 california 0 0 0\n",
"3 jeff 19 texas 1 5 6\n",
"4 bill 45 washington 2 0 2\n",
"5 lisa 33 dc 1 0 1"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# simple sum of two columns\n",
"df[\"pets_and_children\"] = df[\"num_pets\"] + df[\"num_children\"]\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
" name_uppercase | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" john | \n",
" 23 | \n",
" iowa | \n",
" 2 | \n",
" 0 | \n",
" JOHN | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
" MARY | \n",
"
\n",
" \n",
" 2 | \n",
" peter | \n",
" 22 | \n",
" california | \n",
" 0 | \n",
" 0 | \n",
" PETER | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" 19 | \n",
" texas | \n",
" 1 | \n",
" 5 | \n",
" JEFF | \n",
"
\n",
" \n",
" 4 | \n",
" bill | \n",
" 45 | \n",
" washington | \n",
" 2 | \n",
" 0 | \n",
" BILL | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" 33 | \n",
" dc | \n",
" 1 | \n",
" 0 | \n",
" LISA | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets name_uppercase\n",
"0 john 23 iowa 2 0 JOHN\n",
"1 mary 78 dc 2 4 MARY\n",
"2 peter 22 california 0 0 PETER\n",
"3 jeff 19 texas 1 5 JEFF\n",
"4 bill 45 washington 2 0 BILL\n",
"5 lisa 33 dc 1 0 LISA"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" 'name':['john','mary','peter','jeff','bill','lisa'],\n",
" 'age':[23,78,22,19,45,33],\n",
" 'state':['iowa','dc','california','texas','washington','dc'],\n",
" 'num_children':[2,2,0,1,2,1],\n",
" 'num_pets':[0,4,0,5,0,0]\n",
"})\n",
"# sorting columns\n",
"df=df[['name','age','state','num_children','num_pets']]\n",
"df\n",
"\n",
"# you can also use custom functions we used on \"elementwise application\"\n",
"df[\"name_uppercase\"] = df[[\"name\"]].apply(lambda name: name.str.upper())\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Shuffle rows"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 4 | \n",
" bill | \n",
" 45 | \n",
" washington | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" 19 | \n",
" texas | \n",
" 1 | \n",
" 5 | \n",
"
\n",
" \n",
" 2 | \n",
" peter | \n",
" 22 | \n",
" california | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" 33 | \n",
" dc | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 0 | \n",
" john | \n",
" 23 | \n",
" iowa | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"4 bill 45 washington 2 0\n",
"3 jeff 19 texas 1 5\n",
"2 peter 22 california 0 0\n",
"1 mary 78 dc 2 4\n",
"5 lisa 33 dc 1 0\n",
"0 john 23 iowa 2 0"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" 'name':['john','mary','peter','jeff','bill','lisa'],\n",
" 'age':[23,78,22,19,45,33],\n",
" 'state':['iowa','dc','california','texas','washington','dc'],\n",
" 'num_children':[2,2,0,1,2,1],\n",
" 'num_pets':[0,4,0,5,0,0]\n",
"})\n",
"# sorting columns\n",
"df=df[['name','age','state','num_children','num_pets']]\n",
"df\n",
"\n",
"df.reindex(np.random.permutation(df.index))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Iterate over all rows"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 has name: john\n",
"1 has name: mary\n",
"2 has name: peter\n",
"3 has name: jeff\n",
"4 has name: bill\n",
"5 has name: lisa\n"
]
}
],
"source": [
"for index,row in df.iterrows():\n",
" print(\"{0} has name: {1}\".format(index,row[\"name\"]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Randomly sample rows"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 2 | \n",
" peter | \n",
" 22 | \n",
" california | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" 19 | \n",
" texas | \n",
" 1 | \n",
" 5 | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" 33 | \n",
" dc | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"2 peter 22 california 0 0\n",
"3 jeff 19 texas 1 5\n",
"1 mary 78 dc 2 4\n",
"5 lisa 33 dc 1 0"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# sample 10 rows from df\n",
"random_indices = np.random.choice(df.index.values, 4, replace=False)\n",
"\n",
"# iloc allows you to retrieve rows by their numeric indices\n",
"sampled_df = df.iloc[random_indices]\n",
"sampled_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sort a dataframe"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" bill | \n",
" 45 | \n",
" washington | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" 33 | \n",
" dc | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 0 | \n",
" john | \n",
" 23 | \n",
" iowa | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" peter | \n",
" 22 | \n",
" california | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" 19 | \n",
" texas | \n",
" 1 | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"1 mary 78 dc 2 4\n",
"4 bill 45 washington 2 0\n",
"5 lisa 33 dc 1 0\n",
"0 john 23 iowa 2 0\n",
"2 peter 22 california 0 0\n",
"3 jeff 19 texas 1 5"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# sort by age, largest first\n",
"df.sort_values(\"age\",ascending=False )"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" jeff | \n",
" 19 | \n",
" texas | \n",
" 1 | \n",
" 5 | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 2 | \n",
" peter | \n",
" 22 | \n",
" california | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 0 | \n",
" john | \n",
" 23 | \n",
" iowa | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" 33 | \n",
" dc | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" bill | \n",
" 45 | \n",
" washington | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"3 jeff 19 texas 1 5\n",
"1 mary 78 dc 2 4\n",
"2 peter 22 california 0 0\n",
"0 john 23 iowa 2 0\n",
"5 lisa 33 dc 1 0\n",
"4 bill 45 washington 2 0"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# sort by num_pets descending then sort by age ascending\n",
"df.sort_values( [\"num_pets\",\"age\"], ascending=[False,True] )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## custom sort"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" john | \n",
" 23 | \n",
" N/A | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
"
\n",
" \n",
" 2 | \n",
" peter | \n",
" 22 | \n",
" california | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" 19 | \n",
" texas | \n",
"
\n",
" \n",
" 4 | \n",
" bill | \n",
" 12 | \n",
" N/A | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" 33 | \n",
" dc | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state\n",
"0 john 23 N/A\n",
"1 mary 78 dc\n",
"2 peter 22 california\n",
"3 jeff 19 texas\n",
"4 bill 12 N/A\n",
"5 lisa 33 dc"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" 'name':['john','mary','peter','jeff','bill','lisa'],\n",
" 'age':[23,78,22,19,12,33],\n",
" 'state':['N/A','dc','california','texas','N/A','dc']\n",
"})\n",
"\n",
"# sorting columns\n",
"df=df[['name','age','state']]\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" jeff | \n",
" 19 | \n",
" texas | \n",
"
\n",
" \n",
" 1 | \n",
" peter | \n",
" 22 | \n",
" california | \n",
"
\n",
" \n",
" 2 | \n",
" lisa | \n",
" 33 | \n",
" dc | \n",
"
\n",
" \n",
" 3 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
"
\n",
" \n",
" 4 | \n",
" bill | \n",
" 12 | \n",
" N/A | \n",
"
\n",
" \n",
" 5 | \n",
" john | \n",
" 23 | \n",
" N/A | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state\n",
"0 jeff 19 texas\n",
"1 peter 22 california\n",
"2 lisa 33 dc\n",
"3 mary 78 dc\n",
"4 bill 12 N/A\n",
"5 john 23 N/A"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def state_to_rank(state):\n",
" if state==\"N/A\":\n",
" return 1\n",
" else:\n",
" return 0\n",
" \n",
"df['rank'] = df['state'].map(lambda x: state_to_rank(x))\n",
"\n",
"df.sort_values(by=['rank','age']).drop(['rank'],axis=1).reset_index(drop=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Perform complex selections using lambdas"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" john | \n",
" 23 | \n",
" iowa | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" 19 | \n",
" texas | \n",
" 1 | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"0 john 23 iowa 2 0\n",
"3 jeff 19 texas 1 5"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.apply(lambda row: row['name'].startswith('j'),axis=1)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Change column names"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age_years | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" john | \n",
" 23 | \n",
" iowa | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 2 | \n",
" peter | \n",
" 22 | \n",
" california | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" 19 | \n",
" texas | \n",
" 1 | \n",
" 5 | \n",
"
\n",
" \n",
" 4 | \n",
" bill | \n",
" 45 | \n",
" washington | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" 33 | \n",
" dc | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age_years state num_children num_pets\n",
"0 john 23 iowa 2 0\n",
"1 mary 78 dc 2 4\n",
"2 peter 22 california 0 0\n",
"3 jeff 19 texas 1 5\n",
"4 bill 45 washington 2 0\n",
"5 lisa 33 dc 1 0"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# use inplace=True if you want to mutate the current dataframe\n",
"df.rename(columns={\"age\":\"age_years\"} )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Change column dtype"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dtype('int64')"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['num_children'].dtype"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" john | \n",
" 23 | \n",
" iowa | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" mary | \n",
" 78 | \n",
" dc | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 2 | \n",
" peter | \n",
" 22 | \n",
" california | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" jeff | \n",
" 19 | \n",
" texas | \n",
" 1 | \n",
" 5 | \n",
"
\n",
" \n",
" 4 | \n",
" bill | \n",
" 45 | \n",
" washington | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" lisa | \n",
" 33 | \n",
" dc | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"0 john 23 iowa 2 0\n",
"1 mary 78 dc 2 4\n",
"2 peter 22 california 0 0\n",
"3 jeff 19 texas 1 5\n",
"4 bill 45 washington 2 0\n",
"5 lisa 33 dc 1 0"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# we don't need 64 bits for num_children\n",
"df['num_children'] = df['num_children'].astype('int32')\n",
"df['num_children'].dtype\n",
"\n",
"# it looks the same but takes less space\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Veryfing that the dataframe includes specific values\n",
"\n",
"This is done using the `.isin()` method, which returns a **boolean** dataframe to indicate where the passed values are.\n"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
"
\n",
" \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" True | \n",
"
\n",
" \n",
" 2 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 3 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 4 | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
"
\n",
" \n",
" 5 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"0 False False False True False\n",
"1 False False False True True\n",
"2 False False False False False\n",
"3 False False False False False\n",
"4 False False False True False\n",
"5 False False False False False"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" 'name':['john','mary','peter','jeff','bill','lisa'],\n",
" 'age':[23,78,22,19,45,33],\n",
" 'state':['iowa','dc','california','texas','washington','dc'],\n",
" 'num_children':[2,2,0,1,2,1],\n",
" 'num_pets':[0,4,0,5,0,0]\n",
"})\n",
"# sorting columns\n",
"df=df[['name','age','state','num_children','num_pets']]\n",
"\n",
"# if the method is passed a simple list, it matches\n",
"# those values anywhere in the dataframe \n",
"df.isin([2,4])"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" state | \n",
" num_children | \n",
" num_pets | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
" 2 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 3 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
" 4 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 5 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age state num_children num_pets\n",
"0 False False False False False\n",
"1 False False False False True\n",
"2 False False False False False\n",
"3 False False False False True\n",
"4 False False False False False\n",
"5 False False False False False"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# you can also pass a dict or another dataframe\n",
"# as argument\n",
"df.isin({'num_pets':[4,5]})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create an empty Dataframe and append rows one by one"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" col_a | \n",
" col_b | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5.0 | \n",
" 10 | \n",
"
\n",
" \n",
" 1 | \n",
" 1.0 | \n",
" 100 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" col_a col_b\n",
"0 5.0 10\n",
"1 1.0 100"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# set column names and dtypes\n",
"new_df = pd.DataFrame(columns=['col_a','col_b']).astype({'col_a':'float32', 'col_b':'int8'})\n",
"\n",
"\n",
"# must reassign since the append method does not work in place\n",
"new_df = new_df.append({'col_a':5,'col_b':10}, ignore_index=True)\n",
"new_df = new_df.append({'col_a':1,'col_b':100}, ignore_index=True)\n",
"\n",
"new_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create from list of dicts"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" john | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" mary | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" peter | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id name\n",
"0 1 john\n",
"1 2 mary\n",
"2 3 peter"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_df = pd.DataFrame(columns=['id','name'])\n",
"\n",
"data_dict = [\n",
" {'id':1,'name':\"john\"},\n",
" {'id':2,'name':\"mary\"},\n",
" {'id':3,'name':\"peter\"}\n",
"]\n",
"\n",
"# must reassign since the append method does not work in place\n",
"\n",
"new_df = new_df.from_records(data_dict)\n",
"new_df"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## converting types"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date_of_birth | \n",
" name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 27/05/2002 | \n",
" john | \n",
"
\n",
" \n",
" 1 | \n",
" 10/10/1999 | \n",
" mary | \n",
"
\n",
" \n",
" 2 | \n",
" 01/04/1985 | \n",
" peter | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date_of_birth name\n",
"0 27/05/2002 john\n",
"1 10/10/1999 mary\n",
"2 01/04/1985 peter"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" 'name':['john','mary','peter'],\n",
" \"date_of_birth\": ['27/05/2002','10/10/1999','01/04/1985']\n",
"})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date_of_birth | \n",
" name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2002-05-27 | \n",
" john | \n",
"
\n",
" \n",
" 1 | \n",
" 1999-10-10 | \n",
" mary | \n",
"
\n",
" \n",
" 2 | \n",
" 1985-04-01 | \n",
" peter | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date_of_birth name\n",
"0 2002-05-27 john\n",
"1 1999-10-10 mary\n",
"2 1985-04-01 peter"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['date_of_birth']=pd.to_datetime(df['date_of_birth'],format='%d/%m/%Y')\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"date_of_birth datetime64[ns]\n",
"name object\n",
"dtype: object"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Global TF Kernel (Python 3)",
"language": "python",
"name": "global-tf-python-3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}