{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Loading, Storage, File Formats"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reading and Writing Data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" hello | \n",
"
\n",
" \n",
" 1 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" 2 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d message\n",
"0 1 2 3 4 hello\n",
"1 5 6 7 8 world\n",
"2 9 10 11 12 foo"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"examples/ex1.csv\")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" hello | \n",
"
\n",
" \n",
" 1 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" 2 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4\n",
"0 1 2 3 4 hello\n",
"1 5 6 7 8 world\n",
"2 9 10 11 12 foo"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_table(\"examples/ex2.csv\", sep = \",\", header = None)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1,2,3,4,hello\r\n",
"5,6,7,8,world\r\n",
"9,10,11,12,foo"
]
}
],
"source": [
"!cat examples/ex2.csv"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" theree | \n",
" four | \n",
" five | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" hello | \n",
"
\n",
" \n",
" 1 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" 2 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two theree four five\n",
"0 1 2 3 4 hello\n",
"1 5 6 7 8 world\n",
"2 9 10 11 12 foo"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_csv(\"examples/ex2.csv\", names = [\"one\", \"two\", \"theree\", \"four\", \"five\"])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" theree | \n",
" four | \n",
"
\n",
" \n",
" five | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" hello | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" world | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" foo | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two theree four\n",
"five \n",
"hello 1 2 3 4\n",
"world 5 6 7 8\n",
"foo 9 10 11 12"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_csv(\"examples/ex2.csv\", names = [\"one\", \"two\", \"theree\", \"four\", \"five\"], index_col = [\"five\"])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" value1 | \n",
" value2 | \n",
"
\n",
" \n",
" key1 | \n",
" key2 | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" one | \n",
" a | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" b | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" c | \n",
" 5 | \n",
" 6 | \n",
"
\n",
" \n",
" d | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" two | \n",
" a | \n",
" 9 | \n",
" 10 | \n",
"
\n",
" \n",
" b | \n",
" 11 | \n",
" 12 | \n",
"
\n",
" \n",
" c | \n",
" 13 | \n",
" 14 | \n",
"
\n",
" \n",
" d | \n",
" 15 | \n",
" 16 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" value1 value2\n",
"key1 key2 \n",
"one a 1 2\n",
" b 3 4\n",
" c 5 6\n",
" d 7 8\n",
"two a 9 10\n",
" b 11 12\n",
" c 13 14\n",
" d 15 16"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_csv(\"examples/csv_mindex.csv\", index_col = [\"key1\", \"key2\"])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
"
\n",
" \n",
" \n",
" \n",
" aaa | \n",
" -0.264438 | \n",
" -1.026059 | \n",
" -0.619500 | \n",
"
\n",
" \n",
" bbb | \n",
" 0.927272 | \n",
" 0.302904 | \n",
" -0.032399 | \n",
"
\n",
" \n",
" ccc | \n",
" -0.264273 | \n",
" -0.386314 | \n",
" -0.217601 | \n",
"
\n",
" \n",
" ddd | \n",
" -0.871858 | \n",
" -0.348382 | \n",
" 1.100491 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C\n",
"aaa -0.264438 -1.026059 -0.619500\n",
"bbb 0.927272 0.302904 -0.032399\n",
"ccc -0.264273 -0.386314 -0.217601\n",
"ddd -0.871858 -0.348382 1.100491"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# using regular expressions for tables without fixed delimiter\n",
"pd.read_table(\"examples/ex3.txt\", sep = \"\\s+\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" hello | \n",
"
\n",
" \n",
" 1 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" 2 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d message\n",
"0 1 2 3 4 hello\n",
"1 5 6 7 8 world\n",
"2 9 10 11 12 foo"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# skip rows\n",
"pd.read_csv(\"examples/ex4.csv\", skiprows = [0, 2, 3])"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" something | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" one | \n",
" 1 | \n",
" 2 | \n",
" 3.0 | \n",
" 4 | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" two | \n",
" 5 | \n",
" 6 | \n",
" NaN | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" 2 | \n",
" three | \n",
" 9 | \n",
" 10 | \n",
" 11.0 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" something a b c d message\n",
"0 one 1 2 3.0 4 NaN\n",
"1 two 5 6 NaN 8 world\n",
"2 three 9 10 11.0 12 foo"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NaN values\n",
"df = pd.read_csv(\"examples/ex5.csv\")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" something | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 2 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" something a b c d message\n",
"0 False False False False False True\n",
"1 False False False True False False\n",
"2 False False False False False False"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reading Text Files in Pieces"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
" key | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.467976 | \n",
" -0.038649 | \n",
" -0.295344 | \n",
" -1.824726 | \n",
" L | \n",
"
\n",
" \n",
" 1 | \n",
" -0.358893 | \n",
" 1.404453 | \n",
" 0.704965 | \n",
" -0.200638 | \n",
" B | \n",
"
\n",
" \n",
" 2 | \n",
" -0.501840 | \n",
" 0.659254 | \n",
" -0.421691 | \n",
" -0.057688 | \n",
" G | \n",
"
\n",
" \n",
" 3 | \n",
" 0.204886 | \n",
" 1.074134 | \n",
" 1.388361 | \n",
" -0.982404 | \n",
" R | \n",
"
\n",
" \n",
" 4 | \n",
" 0.354628 | \n",
" -0.133116 | \n",
" 0.283763 | \n",
" -0.837063 | \n",
" Q | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four key\n",
"0 0.467976 -0.038649 -0.295344 -1.824726 L\n",
"1 -0.358893 1.404453 0.704965 -0.200638 B\n",
"2 -0.501840 0.659254 -0.421691 -0.057688 G\n",
"3 0.204886 1.074134 1.388361 -0.982404 R\n",
"4 0.354628 -0.133116 0.283763 -0.837063 Q"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"examples/ex6.csv\")\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
" key | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.467976 | \n",
" -0.038649 | \n",
" -0.295344 | \n",
" -1.824726 | \n",
" L | \n",
"
\n",
" \n",
" 1 | \n",
" -0.358893 | \n",
" 1.404453 | \n",
" 0.704965 | \n",
" -0.200638 | \n",
" B | \n",
"
\n",
" \n",
" 2 | \n",
" -0.501840 | \n",
" 0.659254 | \n",
" -0.421691 | \n",
" -0.057688 | \n",
" G | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four key\n",
"0 0.467976 -0.038649 -0.295344 -1.824726 L\n",
"1 -0.358893 1.404453 0.704965 -0.200638 B\n",
"2 -0.501840 0.659254 -0.421691 -0.057688 G"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# read a few rows\n",
"pd.read_csv(\"examples/ex6.csv\", nrows = 3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Writing Data Out to Text Format"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"examples/output.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.to_csv(\"examples/output.txt\", sep = \"|\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## JSON Data"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"obj = \"\"\"\n",
"{\"name\": \"Wes\",\n",
"\"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n",
"\"pet\": null,\n",
"\"siblings\": [{\"name\": \"Scott\", \"age\": 25, \"pet\": \"Zuko\"},\n",
"{\"name\": \"Katie\", \"age\": 33, \"pet\": \"Cisco\"}]\n",
"}\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"str"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(obj)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'name': 'Wes',\n",
" 'pet': None,\n",
" 'places_lived': ['United States', 'Spain', 'Germany'],\n",
" 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},\n",
" {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result = json.loads(obj)\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(result)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Binary Data Formats\n",
"### pickle serialization\n",
"* To store data efficiently in binary format, use Python’s builtin **pickle** serialization\n",
"* pickle is only recommended as a short-term storage format"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.read_csv(\"examples/ex1.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"df.to_pickle(\"examples/output_pickle\")"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" hello | \n",
"
\n",
" \n",
" 1 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" 2 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d message\n",
"0 1 2 3 4 hello\n",
"1 5 6 7 8 world\n",
"2 9 10 11 12 foo"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_pickle(\"examples/output_pickle\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Excel Files"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pandas.io.excel.ExcelFile"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"xls_file = pd.ExcelFile(\"examples/ex1.xlsx\")\n",
"type(xls_file)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" hello | \n",
"
\n",
" \n",
" 1 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" 2 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d message\n",
"0 1 2 3 4 hello\n",
"1 5 6 7 8 world\n",
"2 9 10 11 12 foo"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = xls_file.parse(\"Sheet1\")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}