{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CHAPTER 6"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Date Loading, Storage, and File Formats"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Reading and writing data in text format"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pandas import Series\n",
"from pandas import DataFrame\n",
"import sys\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a,b,c,d,message\r\n",
"1,2,3,4,hello\r\n",
"5,6,7,8,world\r\n",
"9,10,11,12,foo"
]
}
],
"source": [
"!cat ch06/ex1.csv"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.read_csv('ch06/ex1.csv') "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" hello | \n",
"
\n",
" \n",
" | 1 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" | 2 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d message\n",
"0 1 2 3 4 hello\n",
"1 5 6 7 8 world\n",
"2 9 10 11 12 foo"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" hello | \n",
"
\n",
" \n",
" | 1 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" | 2 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d message\n",
"0 1 2 3 4 hello\n",
"1 5 6 7 8 world\n",
"2 9 10 11 12 foo"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_table('ch06/ex1.csv', sep = ',')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1,2,3,4,hello\r\n",
"5,6,7,8,world\r\n",
"9,10,11,12,foo"
]
}
],
"source": [
"!cat ch06/ex2.csv"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" hello | \n",
"
\n",
" \n",
" | 1 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" | 2 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4\n",
"0 1 2 3 4 hello\n",
"1 5 6 7 8 world\n",
"2 9 10 11 12 foo"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_csv('ch06/ex2.csv', header = None)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" hello | \n",
"
\n",
" \n",
" | 1 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" | 2 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d message\n",
"0 1 2 3 4 hello\n",
"1 5 6 7 8 world\n",
"2 9 10 11 12 foo"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_csv('ch06/ex2.csv', names = ['a', 'b', 'c', 'd', 'message'])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
"
\n",
" \n",
" | message | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | hello | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" | world | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" | foo | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d\n",
"message \n",
"hello 1 2 3 4\n",
"world 5 6 7 8\n",
"foo 9 10 11 12"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"names = ['a', 'b', 'c', 'd', 'message']\n",
"pd.read_csv('ch06/ex2.csv', names = names, index_col = 'message')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"key1,key2,value1,value2\r\n",
"one,a,1,2\r\n",
"one,b,3,4\r\n",
"one,c,5,6\r\n",
"one,d,7,8\r\n",
"two,a,9,10\r\n",
"two,b,11,12\r\n",
"two,c,13,14\r\n",
"two,d,15,16\r\n"
]
}
],
"source": [
"!cat ch06/csv_mindex.csv"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" value1 | \n",
" value2 | \n",
"
\n",
" \n",
" | key1 | \n",
" key2 | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | one | \n",
" a | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" | b | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" | c | \n",
" 5 | \n",
" 6 | \n",
"
\n",
" \n",
" | d | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" | two | \n",
" a | \n",
" 9 | \n",
" 10 | \n",
"
\n",
" \n",
" | b | \n",
" 11 | \n",
" 12 | \n",
"
\n",
" \n",
" | c | \n",
" 13 | \n",
" 14 | \n",
"
\n",
" \n",
" | d | \n",
" 15 | \n",
" 16 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" value1 value2\n",
"key1 key2 \n",
"one a 1 2\n",
" b 3 4\n",
" c 5 6\n",
" d 7 8\n",
"two a 9 10\n",
" b 11 12\n",
" c 13 14\n",
" d 15 16"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"parsed = pd.read_csv('ch06/csv_mindex.csv', index_col = ['key1', 'key2'])\n",
"parsed"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[' A B C\\n',\n",
" 'aaa -0.264438 -1.026059 -0.619500\\n',\n",
" 'bbb 0.927272 0.302904 -0.032399\\n',\n",
" 'ccc -0.264273 -0.386314 -0.217601\\n',\n",
" 'ddd -0.871858 -0.348382 1.100491\\n']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(open('ch06/ex3.txt'))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
"
\n",
" \n",
" \n",
" \n",
" | aaa | \n",
" -0.264438 | \n",
" -1.026059 | \n",
" -0.619500 | \n",
"
\n",
" \n",
" | bbb | \n",
" 0.927272 | \n",
" 0.302904 | \n",
" -0.032399 | \n",
"
\n",
" \n",
" | ccc | \n",
" -0.264273 | \n",
" -0.386314 | \n",
" -0.217601 | \n",
"
\n",
" \n",
" | ddd | \n",
" -0.871858 | \n",
" -0.348382 | \n",
" 1.100491 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C\n",
"aaa -0.264438 -1.026059 -0.619500\n",
"bbb 0.927272 0.302904 -0.032399\n",
"ccc -0.264273 -0.386314 -0.217601\n",
"ddd -0.871858 -0.348382 1.100491"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result = pd.read_csv('ch06/ex3.txt', sep = '\\s+')\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# hey!\r\n",
"a,b,c,d,message\r\n",
"# just wanted to make things more difficult for you\r\n",
"# who reads CSV files with computers, anyway?\r\n",
"1,2,3,4,hello\r\n",
"5,6,7,8,world\r\n",
"9,10,11,12,foo"
]
}
],
"source": [
"!cat ch06/ex4.csv"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" hello | \n",
"
\n",
" \n",
" | 1 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" | 2 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d message\n",
"0 1 2 3 4 hello\n",
"1 5 6 7 8 world\n",
"2 9 10 11 12 foo"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_csv('ch06/ex4.csv', skiprows = [0, 2, 3])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"something,a,b,c,d,message\r\n",
"one,1,2,3,4,NA\r\n",
"two,5,6,,8,world\r\n",
"three,9,10,11,12,foo"
]
}
],
"source": [
"!cat ch06/ex5.csv"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" something | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" one | \n",
" 1 | \n",
" 2 | \n",
" 3.0 | \n",
" 4 | \n",
" NaN | \n",
"
\n",
" \n",
" | 1 | \n",
" two | \n",
" 5 | \n",
" 6 | \n",
" NaN | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" | 2 | \n",
" three | \n",
" 9 | \n",
" 10 | \n",
" 11.0 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" something a b c d message\n",
"0 one 1 2 3.0 4 NaN\n",
"1 two 5 6 NaN 8 world\n",
"2 three 9 10 11.0 12 foo"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result = pd.read_csv('ch06/ex5.csv')\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" something | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
" | 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" | 2 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" something a b c d message\n",
"0 False False False False False True\n",
"1 False False False True False False\n",
"2 False False False False False False"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.isnull(result)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" something | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" one | \n",
" 1 | \n",
" 2 | \n",
" 3.0 | \n",
" 4 | \n",
" NaN | \n",
"
\n",
" \n",
" | 1 | \n",
" two | \n",
" 5 | \n",
" 6 | \n",
" NaN | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" | 2 | \n",
" three | \n",
" 9 | \n",
" 10 | \n",
" 11.0 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" something a b c d message\n",
"0 one 1 2 3.0 4 NaN\n",
"1 two 5 6 NaN 8 world\n",
"2 three 9 10 11.0 12 foo"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" something | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" one | \n",
" 1 | \n",
" 2 | \n",
" 3.0 | \n",
" 4 | \n",
" NaN | \n",
"
\n",
" \n",
" | 1 | \n",
" NaN | \n",
" 5 | \n",
" 6 | \n",
" NaN | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" | 2 | \n",
" three | \n",
" 9 | \n",
" 10 | \n",
" 11.0 | \n",
" 12 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" something a b c d message\n",
"0 one 1 2 3.0 4 NaN\n",
"1 NaN 5 6 NaN 8 world\n",
"2 three 9 10 11.0 12 NaN"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sentinels = {'message': ['foo', 'NA'], 'something': ['two']}\n",
"result = pd.read_csv('ch06/ex5.csv', na_values=sentinels)\n",
"result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reading text file in pieces"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
" key | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.467976 | \n",
" -0.038649 | \n",
" -0.295344 | \n",
" -1.824726 | \n",
" L | \n",
"
\n",
" \n",
" | 1 | \n",
" -0.358893 | \n",
" 1.404453 | \n",
" 0.704965 | \n",
" -0.200638 | \n",
" B | \n",
"
\n",
" \n",
" | 2 | \n",
" -0.501840 | \n",
" 0.659254 | \n",
" -0.421691 | \n",
" -0.057688 | \n",
" G | \n",
"
\n",
" \n",
" | 3 | \n",
" 0.204886 | \n",
" 1.074134 | \n",
" 1.388361 | \n",
" -0.982404 | \n",
" R | \n",
"
\n",
" \n",
" | 4 | \n",
" 0.354628 | \n",
" -0.133116 | \n",
" 0.283763 | \n",
" -0.837063 | \n",
" Q | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four key\n",
"0 0.467976 -0.038649 -0.295344 -1.824726 L\n",
"1 -0.358893 1.404453 0.704965 -0.200638 B\n",
"2 -0.501840 0.659254 -0.421691 -0.057688 G\n",
"3 0.204886 1.074134 1.388361 -0.982404 R\n",
"4 0.354628 -0.133116 0.283763 -0.837063 Q"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result = pd.read_csv('ch06/ex6.csv')\n",
"result.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
" key | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.467976 | \n",
" -0.038649 | \n",
" -0.295344 | \n",
" -1.824726 | \n",
" L | \n",
"
\n",
" \n",
" | 1 | \n",
" -0.358893 | \n",
" 1.404453 | \n",
" 0.704965 | \n",
" -0.200638 | \n",
" B | \n",
"
\n",
" \n",
" | 2 | \n",
" -0.501840 | \n",
" 0.659254 | \n",
" -0.421691 | \n",
" -0.057688 | \n",
" G | \n",
"
\n",
" \n",
" | 3 | \n",
" 0.204886 | \n",
" 1.074134 | \n",
" 1.388361 | \n",
" -0.982404 | \n",
" R | \n",
"
\n",
" \n",
" | 4 | \n",
" 0.354628 | \n",
" -0.133116 | \n",
" 0.283763 | \n",
" -0.837063 | \n",
" Q | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four key\n",
"0 0.467976 -0.038649 -0.295344 -1.824726 L\n",
"1 -0.358893 1.404453 0.704965 -0.200638 B\n",
"2 -0.501840 0.659254 -0.421691 -0.057688 G\n",
"3 0.204886 1.074134 1.388361 -0.982404 R\n",
"4 0.354628 -0.133116 0.283763 -0.837063 Q"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_csv('ch06/ex6.csv', nrows = 5)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"chunker = pd.read_csv('ch06/ex6.csv', chunksize = 1000)\n",
"\n",
"tot = Series([])\n",
"for piece in chunker:\n",
" tot = tot.add(piece['key'].value_counts(), fill_value = 0)\n",
"tot = tot.sort_values(ascending = False)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"E 368.0\n",
"X 364.0\n",
"L 346.0\n",
"O 343.0\n",
"Q 340.0\n",
"M 338.0\n",
"J 337.0\n",
"F 335.0\n",
"K 334.0\n",
"H 330.0\n",
"dtype: float64"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tot[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Writing data out to data format"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" something | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" one | \n",
" 1 | \n",
" 2 | \n",
" 3.0 | \n",
" 4 | \n",
" NaN | \n",
"
\n",
" \n",
" | 1 | \n",
" two | \n",
" 5 | \n",
" 6 | \n",
" NaN | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" | 2 | \n",
" three | \n",
" 9 | \n",
" 10 | \n",
" 11.0 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" something a b c d message\n",
"0 one 1 2 3.0 4 NaN\n",
"1 two 5 6 NaN 8 world\n",
"2 three 9 10 11.0 12 foo"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_csv('ch06/ex5.csv')\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data.to_csv('ch06/out.csv')"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
",something,a,b,c,d,message\r\n",
"0,one,1,2,3.0,4,\r\n",
"1,two,5,6,,8,world\r\n",
"2,three,9,10,11.0,12,foo\r\n"
]
}
],
"source": [
"!cat ch06/out.csv"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"|something|a|b|c|d|message\n",
"0|one|1|2|3.0|4|\n",
"1|two|5|6||8|world\n",
"2|three|9|10|11.0|12|foo\n"
]
}
],
"source": [
"data.to_csv(sys.stdout, sep = '|')"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
",something,a,b,c,d,message\n",
"0,one,1,2,3.0,4,NULL\n",
"1,two,5,6,NULL,8,world\n",
"2,three,9,10,11.0,12,foo\n"
]
}
],
"source": [
"data.to_csv(sys.stdout, na_rep='NULL')"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"one,1,2,3.0,4,\n",
"two,5,6,,8,world\n",
"three,9,10,11.0,12,foo\n"
]
}
],
"source": [
"data.to_csv(sys.stdout, header = False, index = False)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"something,a,b,c,d,message\n",
"one,1,2,3.0,4,\n",
"two,5,6,,8,world\n",
"three,9,10,11.0,12,foo\n"
]
}
],
"source": [
"data.to_csv(sys.stdout, index = False, cols = ['a', 'b', 'c'])"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',\n",
" '2000-01-05', '2000-01-06', '2000-01-07'],\n",
" dtype='datetime64[ns]', freq='D')"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dates = pd.date_range('1/1/2000', periods=7)\n",
"dates"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2000-01-01 0\n",
"2000-01-02 1\n",
"2000-01-03 2\n",
"2000-01-04 3\n",
"2000-01-05 4\n",
"2000-01-06 5\n",
"2000-01-07 6\n",
"Freq: D, dtype: int64"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ts = Series(np.arange(7), index = dates)\n",
"ts"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2000-01-01,0\r\n",
"2000-01-02,1\r\n",
"2000-01-03,2\r\n",
"2000-01-04,3\r\n",
"2000-01-05,4\r\n",
"2000-01-06,5\r\n",
"2000-01-07,6\r\n"
]
}
],
"source": [
"ts.to_csv('ch06/tseries.csv')\n",
"!cat ch06/tseries.csv"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2000-01-01 0\n",
"2000-01-02 1\n",
"2000-01-03 2\n",
"2000-01-04 3\n",
"2000-01-05 4\n",
"2000-01-06 5\n",
"2000-01-07 6\n",
"dtype: int64"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Series.from_csv('ch06/tseries.csv', parse_dates=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Manually Working with Dilimiter Formats"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\"a\",\"b\",\"c\"\r\n",
"\"1\",\"2\",\"3\"\r\n",
"\"1\",\"2\",\"3\",\"4\"\r\n"
]
}
],
"source": [
"!cat ch06/ex7.csv"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import csv\n",
"f = open('ch06/ex7.csv')\n",
"reader = csv.reader(f)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['a', 'b', 'c']\n",
"['1', '2', '3']\n",
"['1', '2', '3', '4']\n"
]
}
],
"source": [
"for line in reader:\n",
" print (line)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3', '4']]"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines = list(csv.reader(open('ch06/ex7.csv')))\n",
"lines"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['a', 'b', 'c']"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines[0]"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[['1', '2', '3'], ['1', '2', '3', '4']]"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines[1:]"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"header, values = lines[0], lines[1:]"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_dict = {h: v for h, v in zip(header, zip(*values))}\n",
"data_dict"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<_csv.reader at 0x1083a42e8>"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class my_dialect(csv.Dialect):\n",
" lineterminator = '\\n'\n",
" delimiter = ';'\n",
" quotechar = '\"'\n",
" quoting = csv.QUOTE_MINIMAL\n",
"reader = csv.reader(f, dialect = my_dialect)\n",
"reader"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<_csv.reader at 0x1083a4198>"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"reader = csv.reader(f, delimiter = '|')\n",
"reader"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## JSON Data"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"obj = \"\"\" \n",
"{\"name\": \"Wes\",\n",
"\"places_lived\": [\"United States\", \"Spain\", \"Germany\"], \n",
"\"pet\": null,\n",
"\"siblings\": [{\"name\": \"Scott\", \"age\": 25, \"pet\": \"Zuko\"},\n",
"{\"name\": \"Katie\", \"age\": 33, \"pet\": \"Cisco\"}]\n",
"} \"\"\""
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import json\n",
"result = json.loads(obj)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'name': 'Wes',\n",
" 'pet': None,\n",
" 'places_lived': ['United States', 'Spain', 'Germany'],\n",
" 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},\n",
" {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"asjson = json.dumps(result)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"siblings = DataFrame(result['siblings'], columns = ['name', 'age'])"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Scott | \n",
" 25 | \n",
"
\n",
" \n",
" | 1 | \n",
" Katie | \n",
" 33 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age\n",
"0 Scott 25\n",
"1 Katie 33"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"siblings"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XML and HTML: Web Scraping"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from lxml.html import parse\n",
"from urllib.request import urlopen"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"doc = parsed.getroot()"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"links = doc.findall('.//a')"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[,\n",
" ,\n",
" ,\n",
" ,\n",
" ]"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"links[15: 20]"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"link = links[18]"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'https://www.yahoo.com/beauty'"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"link.get('href')"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'Beauty'"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"link.text_content()"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"urls = [link.get('href') for link in doc.findall('.//a')]"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['https://www.yahoo.com/',\n",
" 'https://mail.yahoo.com/?.intl=us&.lang=en-US&.src=ym',\n",
" 'https://search.yahoo.com/search',\n",
" 'http://news.yahoo.com/',\n",
" 'http://sports.yahoo.com/',\n",
" 'http://finance.yahoo.com/',\n",
" 'https://celebrity.yahoo.com/',\n",
" 'https://weather.yahoo.com/',\n",
" 'https://answers.yahoo.com/',\n",
" 'https://www.flickr.com/']"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"urls[0: 10]"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tables = doc.findall('.//table')"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"calls = tables[1]"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"puts = tables[2]"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calls"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"rows = calls.findall('.//tr')"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def _unpack(row, kind = 'td'):\n",
" elts = row.findall('.//%s' %kind)\n",
" return [val.text_content() for val in elts]"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['\\n \\n Strike\\n \\n \\ue004\\n \\ue002\\n \\n \\n ∵ Filter\\n ',\n",
" 'Contract Name',\n",
" '\\n \\n Last\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n",
" '\\n \\n Bid\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n",
" '\\n \\n Ask\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n",
" '\\n \\n Change\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n",
" '\\n \\n %Change\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n",
" '\\n \\n Volume\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n",
" '\\n \\n Open Interest\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n",
" '\\n \\n Implied Volatility\\n \\n \\ue004\\n \\ue002\\n \\n \\n ']"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_unpack(rows[0], kind='th')"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['\\n \\n \\n ✕\\n [modify]\\n \\n ']"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_unpack(rows[1], kind = 'td')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Binary Data Frame"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" hello | \n",
"
\n",
" \n",
" | 1 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" | 2 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d message\n",
"0 1 2 3 4 hello\n",
"1 5 6 7 8 world\n",
"2 9 10 11 12 foo"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame = pd.read_csv('ch06/ex1.csv')\n",
"frame"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"frame.to_pickle('ch06/frame_pickle')"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" message | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" hello | \n",
"
\n",
" \n",
" | 1 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" world | \n",
"
\n",
" \n",
" | 2 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d message\n",
"0 1 2 3 4 hello\n",
"1 5 6 7 8 world\n",
"2 9 10 11 12 foo"
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_pickle('ch06/frame_pickle')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Using HDF5 Format"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reading Microsoft Excel Files"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Interacting with HTML and Web APIs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Interacting with Databases"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Storing and Loading Data in MongoDB"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}