{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# CHAPTER 6" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Date Loading, Storage, and File Formats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading and writing data in text format" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "from pandas import Series\n", "from pandas import DataFrame\n", "import sys\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a,b,c,d,message\r\n", "1,2,3,4,hello\r\n", "5,6,7,8,world\r\n", "9,10,11,12,foo" ] } ], "source": [ "!cat ch06/ex1.csv" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df = pd.read_csv('ch06/ex1.csv') " ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_table('ch06/ex1.csv', sep = ',')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1,2,3,4,hello\r\n", "5,6,7,8,world\r\n", "9,10,11,12,foo" ] } ], "source": [ "!cat ch06/ex2.csv" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " 0 1 2 3 4\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv('ch06/ex2.csv', header = None)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv('ch06/ex2.csv', names = ['a', 'b', 'c', 'd', 'message'])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
message
hello1234
world5678
foo9101112
\n", "
" ], "text/plain": [ " a b c d\n", "message \n", "hello 1 2 3 4\n", "world 5 6 7 8\n", "foo 9 10 11 12" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "names = ['a', 'b', 'c', 'd', 'message']\n", "pd.read_csv('ch06/ex2.csv', names = names, index_col = 'message')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "key1,key2,value1,value2\r\n", "one,a,1,2\r\n", "one,b,3,4\r\n", "one,c,5,6\r\n", "one,d,7,8\r\n", "two,a,9,10\r\n", "two,b,11,12\r\n", "two,c,13,14\r\n", "two,d,15,16\r\n" ] } ], "source": [ "!cat ch06/csv_mindex.csv" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
value1value2
key1key2
onea12
b34
c56
d78
twoa910
b1112
c1314
d1516
\n", "
" ], "text/plain": [ " value1 value2\n", "key1 key2 \n", "one a 1 2\n", " b 3 4\n", " c 5 6\n", " d 7 8\n", "two a 9 10\n", " b 11 12\n", " c 13 14\n", " d 15 16" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed = pd.read_csv('ch06/csv_mindex.csv', index_col = ['key1', 'key2'])\n", "parsed" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[' A B C\\n',\n", " 'aaa -0.264438 -1.026059 -0.619500\\n',\n", " 'bbb 0.927272 0.302904 -0.032399\\n',\n", " 'ccc -0.264273 -0.386314 -0.217601\\n',\n", " 'ddd -0.871858 -0.348382 1.100491\\n']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(open('ch06/ex3.txt'))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
aaa-0.264438-1.026059-0.619500
bbb0.9272720.302904-0.032399
ccc-0.264273-0.386314-0.217601
ddd-0.871858-0.3483821.100491
\n", "
" ], "text/plain": [ " A B C\n", "aaa -0.264438 -1.026059 -0.619500\n", "bbb 0.927272 0.302904 -0.032399\n", "ccc -0.264273 -0.386314 -0.217601\n", "ddd -0.871858 -0.348382 1.100491" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = pd.read_csv('ch06/ex3.txt', sep = '\\s+')\n", "result" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "# hey!\r\n", "a,b,c,d,message\r\n", "# just wanted to make things more difficult for you\r\n", "# who reads CSV files with computers, anyway?\r\n", "1,2,3,4,hello\r\n", "5,6,7,8,world\r\n", "9,10,11,12,foo" ] } ], "source": [ "!cat ch06/ex4.csv" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv('ch06/ex4.csv', skiprows = [0, 2, 3])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "something,a,b,c,d,message\r\n", "one,1,2,3,4,NA\r\n", "two,5,6,,8,world\r\n", "three,9,10,11,12,foo" ] } ], "source": [ "!cat ch06/ex5.csv" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo
\n", "
" ], "text/plain": [ " something a b c d message\n", "0 one 1 2 3.0 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11.0 12 foo" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = pd.read_csv('ch06/ex5.csv')\n", "result" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0FalseFalseFalseFalseFalseTrue
1FalseFalseFalseTrueFalseFalse
2FalseFalseFalseFalseFalseFalse
\n", "
" ], "text/plain": [ " something a b c d message\n", "0 False False False False False True\n", "1 False False False True False False\n", "2 False False False False False False" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.isnull(result)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo
\n", "
" ], "text/plain": [ " something a b c d message\n", "0 one 1 2 3.0 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11.0 12 foo" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])\n", "result" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0one123.04NaN
1NaN56NaN8world
2three91011.012NaN
\n", "
" ], "text/plain": [ " something a b c d message\n", "0 one 1 2 3.0 4 NaN\n", "1 NaN 5 6 NaN 8 world\n", "2 three 9 10 11.0 12 NaN" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sentinels = {'message': ['foo', 'NA'], 'something': ['two']}\n", "result = pd.read_csv('ch06/ex5.csv', na_values=sentinels)\n", "result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reading text file in pieces" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefourkey
00.467976-0.038649-0.295344-1.824726L
1-0.3588931.4044530.704965-0.200638B
2-0.5018400.659254-0.421691-0.057688G
30.2048861.0741341.388361-0.982404R
40.354628-0.1331160.283763-0.837063Q
\n", "
" ], "text/plain": [ " one two three four key\n", "0 0.467976 -0.038649 -0.295344 -1.824726 L\n", "1 -0.358893 1.404453 0.704965 -0.200638 B\n", "2 -0.501840 0.659254 -0.421691 -0.057688 G\n", "3 0.204886 1.074134 1.388361 -0.982404 R\n", "4 0.354628 -0.133116 0.283763 -0.837063 Q" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = pd.read_csv('ch06/ex6.csv')\n", "result.head()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefourkey
00.467976-0.038649-0.295344-1.824726L
1-0.3588931.4044530.704965-0.200638B
2-0.5018400.659254-0.421691-0.057688G
30.2048861.0741341.388361-0.982404R
40.354628-0.1331160.283763-0.837063Q
\n", "
" ], "text/plain": [ " one two three four key\n", "0 0.467976 -0.038649 -0.295344 -1.824726 L\n", "1 -0.358893 1.404453 0.704965 -0.200638 B\n", "2 -0.501840 0.659254 -0.421691 -0.057688 G\n", "3 0.204886 1.074134 1.388361 -0.982404 R\n", "4 0.354628 -0.133116 0.283763 -0.837063 Q" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv('ch06/ex6.csv', nrows = 5)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [], "source": [ "chunker = pd.read_csv('ch06/ex6.csv', chunksize = 1000)\n", "\n", "tot = Series([])\n", "for piece in chunker:\n", " tot = tot.add(piece['key'].value_counts(), fill_value = 0)\n", "tot = tot.sort_values(ascending = False)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "E 368.0\n", "X 364.0\n", "L 346.0\n", "O 343.0\n", "Q 340.0\n", "M 338.0\n", "J 337.0\n", "F 335.0\n", "K 334.0\n", "H 330.0\n", "dtype: float64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tot[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Writing data out to data format" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo
\n", "
" ], "text/plain": [ " something a b c d message\n", "0 one 1 2 3.0 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11.0 12 foo" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv('ch06/ex5.csv')\n", "data" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data.to_csv('ch06/out.csv')" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ",something,a,b,c,d,message\r\n", "0,one,1,2,3.0,4,\r\n", "1,two,5,6,,8,world\r\n", "2,three,9,10,11.0,12,foo\r\n" ] } ], "source": [ "!cat ch06/out.csv" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "|something|a|b|c|d|message\n", "0|one|1|2|3.0|4|\n", "1|two|5|6||8|world\n", "2|three|9|10|11.0|12|foo\n" ] } ], "source": [ "data.to_csv(sys.stdout, sep = '|')" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ",something,a,b,c,d,message\n", "0,one,1,2,3.0,4,NULL\n", "1,two,5,6,NULL,8,world\n", "2,three,9,10,11.0,12,foo\n" ] } ], "source": [ "data.to_csv(sys.stdout, na_rep='NULL')" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "one,1,2,3.0,4,\n", "two,5,6,,8,world\n", "three,9,10,11.0,12,foo\n" ] } ], "source": [ "data.to_csv(sys.stdout, header = False, index = False)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "something,a,b,c,d,message\n", "one,1,2,3.0,4,\n", "two,5,6,,8,world\n", "three,9,10,11.0,12,foo\n" ] } ], "source": [ "data.to_csv(sys.stdout, index = False, cols = ['a', 'b', 'c'])" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',\n", " '2000-01-05', '2000-01-06', '2000-01-07'],\n", " dtype='datetime64[ns]', freq='D')" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dates = pd.date_range('1/1/2000', periods=7)\n", "dates" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2000-01-01 0\n", "2000-01-02 1\n", "2000-01-03 2\n", "2000-01-04 3\n", "2000-01-05 4\n", "2000-01-06 5\n", "2000-01-07 6\n", "Freq: D, dtype: int64" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ts = Series(np.arange(7), index = dates)\n", "ts" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2000-01-01,0\r\n", "2000-01-02,1\r\n", "2000-01-03,2\r\n", "2000-01-04,3\r\n", "2000-01-05,4\r\n", "2000-01-06,5\r\n", "2000-01-07,6\r\n" ] } ], "source": [ "ts.to_csv('ch06/tseries.csv')\n", "!cat ch06/tseries.csv" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2000-01-01 0\n", "2000-01-02 1\n", "2000-01-03 2\n", "2000-01-04 3\n", "2000-01-05 4\n", "2000-01-06 5\n", "2000-01-07 6\n", "dtype: int64" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Series.from_csv('ch06/tseries.csv', parse_dates=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Manually Working with Dilimiter Formats" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\"a\",\"b\",\"c\"\r\n", "\"1\",\"2\",\"3\"\r\n", "\"1\",\"2\",\"3\",\"4\"\r\n" ] } ], "source": [ "!cat ch06/ex7.csv" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import csv\n", "f = open('ch06/ex7.csv')\n", "reader = csv.reader(f)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['a', 'b', 'c']\n", "['1', '2', '3']\n", "['1', '2', '3', '4']\n" ] } ], "source": [ "for line in reader:\n", " print (line)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3', '4']]" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = list(csv.reader(open('ch06/ex7.csv')))\n", "lines" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['a', 'b', 'c']" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines[0]" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['1', '2', '3'], ['1', '2', '3', '4']]" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines[1:]" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": false }, "outputs": [], "source": [ "header, values = lines[0], lines[1:]" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_dict = {h: v for h, v in zip(header, zip(*values))}\n", "data_dict" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "<_csv.reader at 0x1083a42e8>" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "class my_dialect(csv.Dialect):\n", " lineterminator = '\\n'\n", " delimiter = ';'\n", " quotechar = '\"'\n", " quoting = csv.QUOTE_MINIMAL\n", "reader = csv.reader(f, dialect = my_dialect)\n", "reader" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "<_csv.reader at 0x1083a4198>" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reader = csv.reader(f, delimiter = '|')\n", "reader" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## JSON Data" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": true }, "outputs": [], "source": [ "obj = \"\"\" \n", "{\"name\": \"Wes\",\n", "\"places_lived\": [\"United States\", \"Spain\", \"Germany\"], \n", "\"pet\": null,\n", "\"siblings\": [{\"name\": \"Scott\", \"age\": 25, \"pet\": \"Zuko\"},\n", "{\"name\": \"Katie\", \"age\": 33, \"pet\": \"Cisco\"}]\n", "} \"\"\"" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import json\n", "result = json.loads(obj)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'name': 'Wes',\n", " 'pet': None,\n", " 'places_lived': ['United States', 'Spain', 'Germany'],\n", " 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},\n", " {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "collapsed": true }, "outputs": [], "source": [ "asjson = json.dumps(result)" ] }, { "cell_type": "code", "execution_count": 75, "metadata": { "collapsed": true }, "outputs": [], "source": [ "siblings = DataFrame(result['siblings'], columns = ['name', 'age'])" ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameage
0Scott25
1Katie33
\n", "
" ], "text/plain": [ " name age\n", "0 Scott 25\n", "1 Katie 33" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "siblings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## XML and HTML: Web Scraping" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from lxml.html import parse\n", "from urllib.request import urlopen" ] }, { "cell_type": "code", "execution_count": 81, "metadata": { "collapsed": true }, "outputs": [], "source": [ "parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "collapsed": true }, "outputs": [], "source": [ "doc = parsed.getroot()" ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "collapsed": true }, "outputs": [], "source": [ "links = doc.findall('.//a')" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "links[15: 20]" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "collapsed": true }, "outputs": [], "source": [ "link = links[18]" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'https://www.yahoo.com/beauty'" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "link.get('href')" ] }, { "cell_type": "code", "execution_count": 87, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'Beauty'" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "link.text_content()" ] }, { "cell_type": "code", "execution_count": 88, "metadata": { "collapsed": true }, "outputs": [], "source": [ "urls = [link.get('href') for link in doc.findall('.//a')]" ] }, { "cell_type": "code", "execution_count": 91, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['https://www.yahoo.com/',\n", " 'https://mail.yahoo.com/?.intl=us&.lang=en-US&.src=ym',\n", " 'https://search.yahoo.com/search',\n", " 'http://news.yahoo.com/',\n", " 'http://sports.yahoo.com/',\n", " 'http://finance.yahoo.com/',\n", " 'https://celebrity.yahoo.com/',\n", " 'https://weather.yahoo.com/',\n", " 'https://answers.yahoo.com/',\n", " 'https://www.flickr.com/']" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "urls[0: 10]" ] }, { "cell_type": "code", "execution_count": 92, "metadata": { "collapsed": true }, "outputs": [], "source": [ "tables = doc.findall('.//table')" ] }, { "cell_type": "code", "execution_count": 99, "metadata": { "collapsed": false }, "outputs": [], "source": [ "calls = tables[1]" ] }, { "cell_type": "code", "execution_count": 100, "metadata": { "collapsed": true }, "outputs": [], "source": [ "puts = tables[2]" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "calls" ] }, { "cell_type": "code", "execution_count": 102, "metadata": { "collapsed": true }, "outputs": [], "source": [ "rows = calls.findall('.//tr')" ] }, { "cell_type": "code", "execution_count": 106, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def _unpack(row, kind = 'td'):\n", " elts = row.findall('.//%s' %kind)\n", " return [val.text_content() for val in elts]" ] }, { "cell_type": "code", "execution_count": 107, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['\\n \\n Strike\\n \\n \\ue004\\n \\ue002\\n \\n \\n ∵ Filter\\n ',\n", " 'Contract Name',\n", " '\\n \\n Last\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n", " '\\n \\n Bid\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n", " '\\n \\n Ask\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n", " '\\n \\n Change\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n", " '\\n \\n %Change\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n", " '\\n \\n Volume\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n", " '\\n \\n Open Interest\\n \\n \\ue004\\n \\ue002\\n \\n \\n ',\n", " '\\n \\n Implied Volatility\\n \\n \\ue004\\n \\ue002\\n \\n \\n ']" ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "_unpack(rows[0], kind='th')" ] }, { "cell_type": "code", "execution_count": 108, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['\\n \\n \\n ✕\\n [modify]\\n \\n ']" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "_unpack(rows[1], kind = 'td')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Binary Data Frame" ] }, { "cell_type": "code", "execution_count": 109, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.read_csv('ch06/ex1.csv')\n", "frame" ] }, { "cell_type": "code", "execution_count": 110, "metadata": { "collapsed": false }, "outputs": [], "source": [ "frame.to_pickle('ch06/frame_pickle')" ] }, { "cell_type": "code", "execution_count": 111, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_pickle('ch06/frame_pickle')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using HDF5 Format" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reading Microsoft Excel Files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Interacting with HTML and Web APIs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Interacting with Databases" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Storing and Loading Data in MongoDB" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1" } }, "nbformat": 4, "nbformat_minor": 0 }