{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Loading, Storage, File Formats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reading and Writing Data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"examples/ex1.csv\")\n", "df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " 0 1 2 3 4\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_table(\"examples/ex2.csv\", sep = \",\", header = None)\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1,2,3,4,hello\r\n", "5,6,7,8,world\r\n", "9,10,11,12,foo" ] } ], "source": [ "!cat examples/ex2.csv" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothereefourfive
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " one two theree four five\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(\"examples/ex2.csv\", names = [\"one\", \"two\", \"theree\", \"four\", \"five\"])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothereefour
five
hello1234
world5678
foo9101112
\n", "
" ], "text/plain": [ " one two theree four\n", "five \n", "hello 1 2 3 4\n", "world 5 6 7 8\n", "foo 9 10 11 12" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(\"examples/ex2.csv\", names = [\"one\", \"two\", \"theree\", \"four\", \"five\"], index_col = [\"five\"])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
value1value2
key1key2
onea12
b34
c56
d78
twoa910
b1112
c1314
d1516
\n", "
" ], "text/plain": [ " value1 value2\n", "key1 key2 \n", "one a 1 2\n", " b 3 4\n", " c 5 6\n", " d 7 8\n", "two a 9 10\n", " b 11 12\n", " c 13 14\n", " d 15 16" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(\"examples/csv_mindex.csv\", index_col = [\"key1\", \"key2\"])" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
aaa-0.264438-1.026059-0.619500
bbb0.9272720.302904-0.032399
ccc-0.264273-0.386314-0.217601
ddd-0.871858-0.3483821.100491
\n", "
" ], "text/plain": [ " A B C\n", "aaa -0.264438 -1.026059 -0.619500\n", "bbb 0.927272 0.302904 -0.032399\n", "ccc -0.264273 -0.386314 -0.217601\n", "ddd -0.871858 -0.348382 1.100491" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# using regular expressions for tables without fixed delimiter\n", "pd.read_table(\"examples/ex3.txt\", sep = \"\\s+\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# skip rows\n", "pd.read_csv(\"examples/ex4.csv\", skiprows = [0, 2, 3])" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo
\n", "
" ], "text/plain": [ " something a b c d message\n", "0 one 1 2 3.0 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11.0 12 foo" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# NaN values\n", "df = pd.read_csv(\"examples/ex5.csv\")\n", "df" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0FalseFalseFalseFalseFalseTrue
1FalseFalseFalseTrueFalseFalse
2FalseFalseFalseFalseFalseFalse
\n", "
" ], "text/plain": [ " something a b c d message\n", "0 False False False False False True\n", "1 False False False True False False\n", "2 False False False False False False" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reading Text Files in Pieces" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefourkey
00.467976-0.038649-0.295344-1.824726L
1-0.3588931.4044530.704965-0.200638B
2-0.5018400.659254-0.421691-0.057688G
30.2048861.0741341.388361-0.982404R
40.354628-0.1331160.283763-0.837063Q
\n", "
" ], "text/plain": [ " one two three four key\n", "0 0.467976 -0.038649 -0.295344 -1.824726 L\n", "1 -0.358893 1.404453 0.704965 -0.200638 B\n", "2 -0.501840 0.659254 -0.421691 -0.057688 G\n", "3 0.204886 1.074134 1.388361 -0.982404 R\n", "4 0.354628 -0.133116 0.283763 -0.837063 Q" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"examples/ex6.csv\")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefourkey
00.467976-0.038649-0.295344-1.824726L
1-0.3588931.4044530.704965-0.200638B
2-0.5018400.659254-0.421691-0.057688G
\n", "
" ], "text/plain": [ " one two three four key\n", "0 0.467976 -0.038649 -0.295344 -1.824726 L\n", "1 -0.358893 1.404453 0.704965 -0.200638 B\n", "2 -0.501840 0.659254 -0.421691 -0.057688 G" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# read a few rows\n", "pd.read_csv(\"examples/ex6.csv\", nrows = 3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Writing Data Out to Text Format" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "df.to_csv(\"examples/output.csv\")" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df.to_csv(\"examples/output.txt\", sep = \"|\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## JSON Data" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import json" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": true }, "outputs": [], "source": [ "obj = \"\"\"\n", "{\"name\": \"Wes\",\n", "\"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n", "\"pet\": null,\n", "\"siblings\": [{\"name\": \"Scott\", \"age\": 25, \"pet\": \"Zuko\"},\n", "{\"name\": \"Katie\", \"age\": 33, \"pet\": \"Cisco\"}]\n", "}\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "str" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(obj)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'name': 'Wes',\n", " 'pet': None,\n", " 'places_lived': ['United States', 'Spain', 'Germany'],\n", " 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},\n", " {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = json.loads(obj)\n", "result" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Binary Data Formats\n", "### pickle serialization\n", "* To store data efficiently in binary format, use Python’s builtin **pickle** serialization\n", "* pickle is only recommended as a short-term storage format" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = pd.read_csv(\"examples/ex1.csv\")" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "df.to_pickle(\"examples/output_pickle\")" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_pickle(\"examples/output_pickle\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Excel Files" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pandas.io.excel.ExcelFile" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xls_file = pd.ExcelFile(\"examples/ex1.xlsx\")\n", "type(xls_file)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = xls_file.parse(\"Sheet1\")\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }