{ "metadata": { "name": "", "signature": "sha256:7297ae4752c51fdb9aef4ca4e62304eb3613732639295f8d51637c7f7d707213" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Loading A CSV Into Pandas\n", "\n", "- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n", "- **Date:** -\n", "- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n", "- **Note:**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### import modules" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import numpy as np" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create dataframe (that we will be importing)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], \n", " 'last_name': ['Miller', 'Jacobson', \".\", 'Milner', 'Cooze'], \n", " 'age': [42, 52, 36, 24, 73], \n", " 'preTestScore': [4, 24, 31, \".\", \".\"],\n", " 'postTestScore': [\"25,000\", \"94,000\", 57, 62, 70]}\n", "df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
first_namelast_nameagepreTestScorepostTestScore
0 Jason Miller 42 4 25,000
1 Molly Jacobson 52 24 94,000
2 Tina . 36 31 57
3 Jake Milner 24 . 62
4 Amy Cooze 73 . 70
\n", "

5 rows \u00d7 5 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 45, "text": [ " first_name last_name age preTestScore postTestScore\n", "0 Jason Miller 42 4 25,000\n", "1 Molly Jacobson 52 24 94,000\n", "2 Tina . 36 31 57\n", "3 Jake Milner 24 . 62\n", "4 Amy Cooze 73 . 70\n", "\n", "[5 rows x 5 columns]" ] } ], "prompt_number": 45 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Save dataframe as csv in the working director" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df.to_csv('example.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 46 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load a csv" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('example.csv')\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0first_namelast_nameagepreTestScorepostTestScore
0 0 Jason Miller 42 4 25,000
1 1 Molly Jacobson 52 24 94,000
2 2 Tina . 36 31 57
3 3 Jake Milner 24 . 62
4 4 Amy Cooze 73 . 70
\n", "

5 rows \u00d7 6 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 47, "text": [ " Unnamed: 0 first_name last_name age preTestScore postTestScore\n", "0 0 Jason Miller 42 4 25,000\n", "1 1 Molly Jacobson 52 24 94,000\n", "2 2 Tina . 36 31 57\n", "3 3 Jake Milner 24 . 62\n", "4 4 Amy Cooze 73 . 70\n", "\n", "[5 rows x 6 columns]" ] } ], "prompt_number": 47 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load a csv with no headers" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('example.csv', header=None)\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345
0NaN first_name last_name age preTestScore postTestScore
1 0 Jason Miller 42 4 25,000
2 1 Molly Jacobson 52 24 94,000
3 2 Tina . 36 31 57
4 3 Jake Milner 24 . 62
5 4 Amy Cooze 73 . 70
\n", "

6 rows \u00d7 6 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 48, "text": [ " 0 1 2 3 4 5\n", "0 NaN first_name last_name age preTestScore postTestScore\n", "1 0 Jason Miller 42 4 25,000\n", "2 1 Molly Jacobson 52 24 94,000\n", "3 2 Tina . 36 31 57\n", "4 3 Jake Milner 24 . 62\n", "5 4 Amy Cooze 73 . 70\n", "\n", "[6 rows x 6 columns]" ] } ], "prompt_number": 48 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load a csv while specifying column names" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('example.csv', header=True, names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'])\n", "df\n", "\n" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
UIDFirst NameLast NameAgePre-Test ScorePost-Test Score
0 1 Molly Jacobson 52 24 94,000
1 2 Tina . 36 31 57
2 3 Jake Milner 24 . 62
3 4 Amy Cooze 73 . 70
\n", "

4 rows \u00d7 6 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 49, "text": [ " UID First Name Last Name Age Pre-Test Score Post-Test Score\n", "0 1 Molly Jacobson 52 24 94,000\n", "1 2 Tina . 36 31 57\n", "2 3 Jake Milner 24 . 62\n", "3 4 Amy Cooze 73 . 70\n", "\n", "[4 rows x 6 columns]" ] } ], "prompt_number": 49 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load a csv with setting the index column to UID" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('example.csv', index_col='UID', header=True, names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'])\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
First NameLast NameAgePre-Test ScorePost-Test Score
UID
1 Molly Jacobson 52 24 94,000
2 Tina . 36 31 57
3 Jake Milner 24 . 62
4 Amy Cooze 73 . 70
\n", "

4 rows \u00d7 5 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 50, "text": [ " First Name Last Name Age Pre-Test Score Post-Test Score\n", "UID \n", "1 Molly Jacobson 52 24 94,000\n", "2 Tina . 36 31 57\n", "3 Jake Milner 24 . 62\n", "4 Amy Cooze 73 . 70\n", "\n", "[4 rows x 5 columns]" ] } ], "prompt_number": 50 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load a csv while setting the index columns to First Name and Last Name" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('example.csv', index_col=['First Name', 'Last Name'], header=True, names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'])\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
UIDAgePre-Test ScorePost-Test Score
First NameLast Name
MollyJacobson 1 52 24 94,000
Tina. 2 36 31 57
JakeMilner 3 24 . 62
AmyCooze 4 73 . 70
\n", "

4 rows \u00d7 4 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 51, "text": [ " UID Age Pre-Test Score Post-Test Score\n", "First Name Last Name \n", "Molly Jacobson 1 52 24 94,000\n", "Tina . 2 36 31 57\n", "Jake Milner 3 24 . 62\n", "Amy Cooze 4 73 . 70\n", "\n", "[4 rows x 4 columns]" ] } ], "prompt_number": 51 }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load a csv while specifying \".\" as missing values" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('example.csv', na_values=['.'])\n", "pd.isnull(df)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0first_namelast_nameagepreTestScorepostTestScore
0 False False False False False False
1 False False False False False False
2 False False True False False False
3 False False False False True False
4 False False False False True False
\n", "

5 rows \u00d7 6 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 52, "text": [ " Unnamed: 0 first_name last_name age preTestScore postTestScore\n", "0 False False False False False False\n", "1 False False False False False False\n", "2 False False True False False False\n", "3 False False False False True False\n", "4 False False False False True False\n", "\n", "[5 rows x 6 columns]" ] } ], "prompt_number": 52 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load a csv while specifying \".\" and \"NA\" as missing values in the Last Name column and \".\" as missing values in Pre-Test Score column" ] }, { "cell_type": "code", "collapsed": false, "input": [ "sentinels = {'Last Name': ['.', 'NA'], 'Pre-Test Score': ['.']}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 53 }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('example.csv', na_values=sentinels)\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0first_namelast_nameagepreTestScorepostTestScore
0 0 Jason Miller 42 4 25,000
1 1 Molly Jacobson 52 24 94,000
2 2 Tina . 36 31 57
3 3 Jake Milner 24 . 62
4 4 Amy Cooze 73 . 70
\n", "

5 rows \u00d7 6 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 54, "text": [ " Unnamed: 0 first_name last_name age preTestScore postTestScore\n", "0 0 Jason Miller 42 4 25,000\n", "1 1 Molly Jacobson 52 24 94,000\n", "2 2 Tina . 36 31 57\n", "3 3 Jake Milner 24 . 62\n", "4 4 Amy Cooze 73 . 70\n", "\n", "[5 rows x 6 columns]" ] } ], "prompt_number": 54 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load a csv while skipping the top 3 rows" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('example.csv', na_values=sentinels, skiprows=3)\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
2Tina.363157
0 3 Jake Milner 24 . 62
1 4 Amy Cooze 73 . 70
\n", "

2 rows \u00d7 6 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 55, "text": [ " 2 Tina . 36 31 57\n", "0 3 Jake Milner 24 . 62\n", "1 4 Amy Cooze 73 . 70\n", "\n", "[2 rows x 6 columns]" ] } ], "prompt_number": 55 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load a csv while skipping the bottom three rows" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('example.csv', na_values=sentinels, skip_footer=3)\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0first_namelast_nameagepreTestScorepostTestScore
0 0 Jason Miller 42 4 25,000
1 1 Molly Jacobson 52 24 94,000
\n", "

2 rows \u00d7 6 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 56, "text": [ " Unnamed: 0 first_name last_name age preTestScore postTestScore\n", "0 0 Jason Miller 42 4 25,000\n", "1 1 Molly Jacobson 52 24 94,000\n", "\n", "[2 rows x 6 columns]" ] } ], "prompt_number": 56 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load a csv while interpreting \",\" in strings around numbers as thousands seperators" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('example.csv', thousands=',')\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0first_namelast_nameagepreTestScorepostTestScore
0 0 Jason Miller 42 4 25000
1 1 Molly Jacobson 52 24 94000
2 2 Tina . 36 31 57
3 3 Jake Milner 24 . 62
4 4 Amy Cooze 73 . 70
\n", "

5 rows \u00d7 6 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 57, "text": [ " Unnamed: 0 first_name last_name age preTestScore postTestScore\n", "0 0 Jason Miller 42 4 25000\n", "1 1 Molly Jacobson 52 24 94000\n", "2 2 Tina . 36 31 57\n", "3 3 Jake Milner 24 . 62\n", "4 4 Amy Cooze 73 . 70\n", "\n", "[5 rows x 6 columns]" ] } ], "prompt_number": 57 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 56 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }