{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Python For Data Science Cheat Sheet\n", "\n", "## Pandas Basics\n", "\n", "https://www.datacamp.com/community/blog/python-pandas-cheat-sheet" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pandas\n", "\n", "The Pandas library is built on NumPy and provides easy-to-use\n", "data structures and data analysis tools for the Python\n", "programming language." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use the following import convention:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ " import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pd.__version__" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pandas Data Structures" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Series\n", "\n", "A one-dimensional labeled array A\n", "capable of holding any data type " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "s = pd.Series([3, -5, 7, 4], index=['a', 'b', 'c', 'd'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "s" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame\n", "\n", "A two-dimensional labeled\n", "data structure with columns\n", "of potentially different types" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data = {'Country': ['Belgium', 'India', 'Brazil'],\n", " 'Capital': ['Brussels', 'New Delhi', 'Brasilia'],\n", " 'Population': [11190846, 1303171035, 207847528]}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df = pd.DataFrame(data,\n", " columns=['Country', 'Capital', 'Population'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Asking For Help" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "help(pd.Series.loc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Selection (Also see NumPy Arrays)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Getting" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Get one element\n", "s['b']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Get subset of a DataFrame\n", "df[1:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Selecting, Boolean Indexing & Setting" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### By Position" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "## Select single value by row & column\n", "df.iloc[0, 0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Select single value by row & column\n", "df.iat[0, 0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### By Label" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Select single value by row & column labels\n", "df.loc[0, 'Country']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Select single value by row & column labels\n", "df.at[0, 'Country']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### By Label/Position" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Select single row of subset of rows\n", "df.ix[2]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Select a single column of subset of columns \n", "df.ix[:,'Capital']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Select rows and columns\n", "df.ix[1,'Capital']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Boolean Indexing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Series s where value is not >1\n", "s[~(s > 1)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# s where value is <-1 or >2\n", "s[(s < -1) | (s > 2)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Use filter to adjust DataFrame\n", "df[df['Population'] > 1200000000] " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Setting" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Set index a of Series s to 6\n", "s['a'] = 6" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dropping" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Drop values from rows (axis=0)\n", "s.drop(['a', 'c'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Drop values from columns(axis=1)\n", "df.drop('Country', axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sort & Rank" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Sort by labels along an axis\n", "df.sort_index()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Sort by the values along an axis\n", "df.sort_values(by='Country')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Assign ranks to entries\n", "df.rank()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Retrieving Series/DataFrame Information" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Basic Information" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# (rows,columns)\n", "df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Describe index\n", "df.index" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Describe DataFrame columns\n", "df.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Info on DataFrame\n", "df.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Number of non-NA values\n", "df.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Summary" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Sum of values\n", "df.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Cummulative sum of values\n", "df.cumsum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Minimum values\n", "df.min()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Maximum values\n", "df.max()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "# Minimum index value\n", "df.idxmin()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Maximum index value\n", "df.idxmax()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Summary statistics\n", "df.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Mean of values\n", "df.mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Median of values\n", "df.median()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Applying Functions" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "f = lambda x: x*2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Apply function\n", "df.apply(f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Apply function element-wise\n", "df.applymap(f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data Alignment" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Internal Data Alignment\n", "\n", "NA values are introduced in the indices that don’t overlap:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "s3 = pd.Series([7, -2, 3], index=['a', 'c', 'd'])\n", "s + s3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Arithmetic Operations with Fill Methods\n", "\n", "You can also do the internal data alignment yourself with the help of the fill methods:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "s.add(s3, fill_value=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "s.sub(s3, fill_value=2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "s.div(s3, fill_value=4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "s.mul(s3, fill_value=3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### I/O" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Read and Write to CSV" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.to_csv('myDataFrame.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pd.read_csv('myDataFrame.csv', header=None, nrows=5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Read and Write to Excel" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.to_excel('myDataFrame.xlsx', sheet_name='Sheet1')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pd.read_excel('myDataFrame.xlsx')" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "#### Read and Write to SQL Query or Database Table" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sqlalchemy import create_engine\n", "engine = create_engine('sqlite:///:memory:')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "write records stored in a DataFrame to a SQL database." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.to_sql(\"my_table\", engine)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "read_sql() is a convenience wrapper around read_sql_table() and read_sql_query()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pd.read_sql(\"SELECT * FROM my_table;\", engine)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pd.read_sql_table('my_table', engine)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pd.read_sql_query(\"SELECT * FROM my_table;\", engine)" ] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }