{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "6. \ub370\uc774\ud130 \ub85c\ub529, \uc800\uc7a5, \ud30c\uc77c \ud615\uc2dd" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "6.1 \ud14d\uc2a4\ud2b8 \ud30c\uc77c \uc774\uc6a9\ud558\ub294 \ubc29\ubc95" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \ud30c\uc774\uc36c \uc88b\uc740 \uc774\uc720\n", "\n", "- \ub2e8\uc21c\ud55c \ubb38\ubc95\n", "- \uc9c1\uad00\uc801\uc778 \uc790\ub8cc \uad6c\uc870\n", "- \ud29c\ud50c\uc5d0 \ub370\uc774\ud130\ub97c \uc800\uc7a5\ud558\uace0 \uc77d\uc5b4\ub0b4\ub294 \ud3b8\ub9ac\ud55c \uae30\ub2a5\n", "\n", "#### pandas \ud30c\uc77c \ud30c\uc2f1 \ud568\uc218\n", "\n", "- read_csv: \ud30c\uc77c, URL \ub610\ub294 \ud30c\uc77c\uacfc \uc720\uc0ac\ud55c \uac1d\uccb4\ub85c\ubd80\ud130 \uad6c\ubd84\ub41c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc628\ub2e4. \ub370\uc774\ud130 \uad6c\ubd84\uc790\ub294 \uc27c\ud45c(,)\ub97c \uae30\ubcf8\uc73c\ub85c \ud55c\ub2e4.\n", "- read_table: \ud30c\uc77c, URL \ub610\ub294 \ud30c\uc77c\uacfc \uc720\uc0ac\ud55c \uac1d\uccb4\ub85c\ubd80\ud130 \uad6c\ubd84\ub41c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc628\ub2e4. \ub370\uc774\ud130 \uad6c\ubd84\uc790\ub294 \ud0ed('\\t')\uc744 \uae30\ubcf8\uc73c\ub85c \ud55c\ub2e4.\n", "- read_fwf: \uace0\uc815\ud3ed \uce7c\ub7fc \ud615\uc2dd\uc5d0\uc11c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc628\ub2e4(\uad6c\ubd84\uc790\uac00 \uc5c6\ub294 \ub370\uc774\ud130)\n", "- read_clipboard: \ud074\ub9bd\ubcf4\ub4dc\uc5d0 \uc788\ub294 \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc624\ub294 read_table \ud568\uc218. \uc6f9\ud398\uc774\uc9c0\uc5d0\uc11c \ud45c\ub97c \uae01\uc5b4\uc62c \ub54c \uc720\uc6a9\ud558\ub2e4.\n", "\n", "#### pandas \ud30c\uc77c \ud30c\uc2f1 \ud568\uc218 \uc635\uc158\n", "\n", "- **\uc0c9\uc778**: \ubc18\ud658\ud558\ub294 DataFrame\uc5d0\uc11c \ud558\ub098 \uc774\uc0c1\uc758 \uce7c\ub7fc\uc744 \uc0c9\uc778\uc73c\ub85c \uc9c0\uc815\ud560 \uc218 \uc788\ub2e4. \ud30c\uc77c\uc774\ub098 \uc0ac\uc6a9\uc790\ub85c\ubd80\ud130 \uce7c\ub7fc\uc758 \uc774\ub984\uc744 \ubc1b\uac70\ub098 \uc544\ubb34\uac83\ub3c4 \ubc1b\uc9c0 \uc54a\uc744 \uc218 \uc788\ub2e4.\n", "- **\uc790\ub8cc\ud615 \ucd94\ub860\uacfc \ub370\uc774\ud130 \ubcc0\ud658**: \uc0ac\uc6a9\uc790 \uc815\uc758 \uac12 \ubcc0\ud658\uacfc \ube44\uc5b4\uc788\ub294 \uac12\uc744 \uc704\ud55c \uc0ac\uc6a9\uc790 \ub9ac\uc2a4\ud2b8\ub97c \ud3ec\ud568\ud55c\ub2e4.\n", "- **\ub0a0\uc9dc \ubd84\uc11d**: \uc5ec\ub7ec \uce7c\ub7fc\uc5d0 \uac78\uccd0 \uc788\ub294 \ub0a0\uc9dc\uc640 \uc2dc\uac04 \uc815\ubcf4\ub97c \ud558\ub098\uc758 \uce7c\ub7fc\uc5d0 \uc870\ud569\ud574\uc11c \uacb0\uacfc\uc5d0 \ubc18\uc601\ud55c\ub2e4.\n", "- **\ubc18\ubcf5**: \uc5ec\ub7ec \ud30c\uc77c\uc5d0 \uac78\uccd0 \uc788\ub294 \uc790\ub8cc\ub97c \ubc18\ubcf5\uc801\uc73c\ub85c \uc77d\uc5b4\uc62c \uc218 \uc788\ub2e4.\n", "- **\uc815\uc81c\ub418\uc9c0 \uc54a\ub294 \ub370\uc774\ud130 \ucc98\ub9ac**: \ub85c\uc6b0\ub098 \uaf2c\ub9ac\ub9d0, \uc8fc\uc11d \uac74\ub108\ub6f0\uae30 \ub610\ub294 \ucc9c \ub2e8\uc704\ub9c8\ub2e4 \uc27c\ud45c\ub85c \uad6c\ubd84\ub41c \uc22b\uc790 \uac19\uc740 \uc0ac\uc18c\ud55c \uc77c\uc744 \ucc98\ub9ac\ud574\uc900\ub2e4.\n", "\n", "#### \uc790\ub8cc\ud615 \ucd94\ub860\uc740 \ub9e4\uc6b0 \uc911\uc694\n", "\n", "- \uc5b4\ub5a4 \uce7c\ub7fc\uc774 \uc22b\uc790\uc778\uc9c0 \ubd88\ub9ac\uc5b8\uc778\uc9c0 \uc9c0\uc815\ud574\uc904 \ud544\uc694\uac00 \uc5c6\ub2e4" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pandas import DataFrame, Series\n", "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/ex1.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "a,b,c,d,message\r\n", "1,2,3,4,hello\r\n", "5,6,7,8,world\r\n", "9,10,11,12,foo" ] } ], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('ch06/ex1.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 18, "text": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] } ], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "type(df)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 19, "text": [ "pandas.core.frame.DataFrame" ] } ], "prompt_number": 19 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_table('ch06/ex1.csv', sep=',')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 20, "text": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] } ], "prompt_number": 20 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/ex2.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1,2,3,4,hello\r\n", "5,6,7,8,world\r\n", "9,10,11,12,foo" ] } ], "prompt_number": 21 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex2.csv', header=None)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 22, "text": [ " 0 1 2 3 4\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] } ], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex2.csv', names=['a', 'b', 'c', 'message'])" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcmessage
1 2 3 4 hello
5 6 7 8 world
9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 23, "text": [ " a b c message\n", "1 2 3 4 hello\n", "5 6 7 8 world\n", "9 10 11 12 foo" ] } ], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "names = ['a', 'b', 'c', 'd', 'message']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "# message -> index\n", "pd.read_csv('ch06/ex2.csv', names=names, index_col='message')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 25, "text": [ " a b c d\n", "message \n", "hello 1 2 3 4\n", "world 5 6 7 8\n", "foo 9 10 11 12" ] } ], "prompt_number": 25 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/csv_mindex.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "key1,key2,value1,value2\r\n", "one,a,1,2\r\n", "one,b,3,4\r\n", "one,c,5,6\r\n", "one,d,7,8\r\n", "two,a,9,10\r\n", "two,b,11,12\r\n", "two,c,13,14\r\n", "two,d,15,16\r\n" ] } ], "prompt_number": 26 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uacc4\uce35\uc801 \uc0c9\uc778\uc744 \uc9c0\uc815\ud558\uace0 \uc2f6\ub2e4\uba74 \uce7c\ub7fc \ubc88\ud638\ub098 \uc774\ub984\uc758 \ub9ac\uc2a4\ud2b8\ub97c \ub118\uae34\ub2e4" ] }, { "cell_type": "code", "collapsed": false, "input": [ "parsed = pd.read_csv('ch06/csv_mindex.csv', index_col=['key1', 'key2'])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [ "parsed" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
value1value2
key1key2
onea 1 2
b 3 4
c 5 6
d 7 8
twoa 9 10
b 11 12
c 13 14
d 15 16
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 28, "text": [ " value1 value2\n", "key1 key2 \n", "one a 1 2\n", " b 3 4\n", " c 5 6\n", " d 7 8\n", "two a 9 10\n", " b 11 12\n", " c 13 14\n", " d 15 16" ] } ], "prompt_number": 28 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uace0\uc815\ub41c \uad6c\ubd84\uc790\uac00 \uc5c6\ub2e4\uba74 read_table\uc758 \uad6c\ubd84\uc790\ub85c \uc815\uaddc\ud45c\ud604\uc2dd([\ud30c\uc774\uc36c \u2013 \uc815\uaddc\uc2dd\ud45c\ud604\uc2dd(Regular Expression) \ubaa8\ub4c8](http://devanix.tistory.com/296), [\ubc88\uc5ed \ud30c\uc774\uc36c \uc815\uaddc\ud45c\ud604\uc2dd](http://codeflow.co.kr/question/1061/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EC%A0%95%EA%B7%9C-%ED%91%9C%ED%98%84%EC%8B%9D/))\uc744 \uc0ac\uc6a9\ud558\uba74 \ub41c\ub2e4." ] }, { "cell_type": "code", "collapsed": false, "input": [ "list(open('ch06/ex3.txt'))" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 29, "text": [ "[' A B C\\n',\n", " 'aaa -0.264438 -1.026059 -0.619500\\n',\n", " 'bbb 0.927272 0.302904 -0.032399\\n',\n", " 'ccc -0.264273 -0.386314 -0.217601\\n',\n", " 'ddd -0.871858 -0.348382 1.100491\\n']" ] } ], "prompt_number": 29 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uc9c1\uc811 \ud30c\uc77c\uc744 \uace0\uccd0\ub3c4 \ub418\uc9c0\ub9cc \uc774 \ud30c\uc77c\uc740 \uc5ec\ub7ec \uac1c\uc758 \uacf5\ubc31\ubb38\uc790\ub85c \ud544\ub4dc\uac00 \uad6c\ubd84\ub418\uc5b4 \uc788\uc73c\ubbc0\ub85c \uc774\ub97c \ud45c\ud604\ud560 \uc218 \uc788\ub294 \uc815\uaddc\ud45c\ud604\uc2dd \\s+\ub97c \uc0ac\uc6a9\ud574\uc11c \ucc98\ub9ac" ] }, { "cell_type": "code", "collapsed": false, "input": [ "result = pd.read_table('ch06/ex3.txt', sep='\\s+')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 30 }, { "cell_type": "code", "collapsed": false, "input": [ "result" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
aaa-0.264438-1.026059-0.619500
bbb 0.927272 0.302904-0.032399
ccc-0.264273-0.386314-0.217601
ddd-0.871858-0.348382 1.100491
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 31, "text": [ " A B C\n", "aaa -0.264438 -1.026059 -0.619500\n", "bbb 0.927272 0.302904 -0.032399\n", "ccc -0.264273 -0.386314 -0.217601\n", "ddd -0.871858 -0.348382 1.100491" ] } ], "prompt_number": 31 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uc774 \uacbd\uc6b0, \uccab\ubc88\uc9f8 \ub85c\uc6b0\ub294 \ub2e4\ub978 \ub85c\uc6b0\ubcf4\ub2e4 \uce7c\ub7fc\uc774 \ud558\ub098 \uc801\uae30 \ub54c\ubb38\uc5d0 read_table\uc740 \uccab \ubc88\uc9f8 \uce7c\ub7fc\uc774 DataFrame\uc758 \uc0c9\uc778\uc774 \ub418\uc5b4\uc57c \ud55c\ub2e4\uace0 \ucd94\ub860" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### read_table\uacfc read_csv\uc758 \ucc28\uc774\uc810\uc740??\n", "\n", "- read_csv: \ud30c\uc77c, URL \ub610\ub294 \ud30c\uc77c\uacfc \uc720\uc0ac\ud55c \uac1d\uccb4\ub85c\ubd80\ud130 \uad6c\ubd84\ub41c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc628\ub2e4. \ub370\uc774\ud130 \uad6c\ubd84\uc790\ub294 \uc27c\ud45c(,)\ub97c \uae30\ubcf8\uc73c\ub85c \ud55c\ub2e4.\n", "- read_table: \ud30c\uc77c, URL \ub610\ub294 \ud30c\uc77c\uacfc \uc720\uc0ac\ud55c \uac1d\uccb4\ub85c\ubd80\ud130 \uad6c\ubd84\ub41c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc628\ub2e4. \ub370\uc774\ud130 \uad6c\ubd84\uc790\ub294 \ud0ed('\\t')\ub97c \uae30\ubcf8\uc73c\ub85c \ud55c\ub2e4.\n", "\n", "#### \uadf8\ub7ec\ub2c8 \ub458 \ub2e4 \uc0ac\uc6a9\uc744 \ud574\ub3c4 \ub418\uc9c0\ub9cc \uc660\ub9cc\ud558\uba74 read_csv \uac19\uc740 \uacbd\uc6b0\ub294 csv \ud30c\uc77c\ub9cc \uc0ac\uc6a9\uc744 \ud558\uace0 \ub098\uba38\uc9c0 \ud2b9\ubcc4\ud55c \uacbd\uc6b0\ub97c read_table\ub85c \ud65c\uc6a9" ] }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex3.txt', delimiter='\\s+')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
aaa-0.264438-1.026059-0.619500
bbb 0.927272 0.302904-0.032399
ccc-0.264273-0.386314-0.217601
ddd-0.871858-0.348382 1.100491
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 32, "text": [ " A B C\n", "aaa -0.264438 -1.026059 -0.619500\n", "bbb 0.927272 0.302904 -0.032399\n", "ccc -0.264273 -0.386314 -0.217601\n", "ddd -0.871858 -0.348382 1.100491" ] } ], "prompt_number": 32 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### [IO Tools(Text, CSV, HDF5, \u22ef) example](http://pandas.pydata.org/pandas-docs/stable/io.html)\n", "\n", "- \ud30c\uc11c \ud568\uc218\ub294 \ud30c\uc77c \ud615\uc2dd\uc5d0\uc11c \ubc1c\uc0dd\ud560 \uc218 \uc788\ub294 \ub9e4\uc6b0 \ub2e4\uc591\ud55c \uc608\uc678\ub97c \uc798 \ucc98\ub9ac\ud560 \uc218 \uc788\ub3c4\ub85d \ub9ce\uc740 \ucd94\uac00 \uc778\uc790\ub97c \uac00\uc9c0\uace0 \uc788\ub2e4.\n", "- skiprows\ub97c \uc774\uc6a9\ud574\uc11c \uccab\ubc88\uc9f8, \uc138\ubc88\uc9f8, \ub124\ubc88\uc9f8 \ub85c\uc6b0\ub97c \uac74\ub108\ub6f8 \uc218 \uc788\uc74c" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Read CSV(comma-separated) file into DataFrame\n", "pd.read_csv?" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 33 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/ex4.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "# hey!\r\n", "a,b,c,d,message\r\n", "# just wanted to make things more difficult for you\r\n", "# who reads CSV files with computers, anyway?\r\n", "1,2,3,4,hello\r\n", "5,6,7,8,world\r\n", "9,10,11,12,foo" ] } ], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex4.csv', skiprows=[0, 2, 3])" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 35, "text": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] } ], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/ex5.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "something,a,b,c,d,message\r\n", "one,1,2,3,4,NA\r\n", "two,5,6,,8,world\r\n", "three,9,10,11,12,foo" ] } ], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "result = pd.read_csv('ch06/ex5.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 37 }, { "cell_type": "code", "collapsed": false, "input": [ "result" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0 one 1 2 3 4 NaN
1 two 5 6NaN 8 world
2 three 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 38, "text": [ " something a b c d message\n", "0 one 1 2 3 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11 12 foo" ] } ], "prompt_number": 38 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.isnull(result)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0 False False False False False True
1 False False False True False False
2 False False False False False False
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 39, "text": [ " something a b c d message\n", "0 False False False False False True\n", "1 False False False True False False\n", "2 False False False False False False" ] } ], "prompt_number": 39 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### na_values \uc635\uc158\uc740 \ub9ac\uc2a4\ud2b8\ub098 \ubb38\uc790\uc5f4 \uc9d1\ud569\uc744 \ubc1b\uc544\uc11c \ub204\ub77d\ub41c \uac12\uc744 \ucc98\ub9ac\n", "\n", "### Why? na_values\ub97c \uc0ac\uc6a9\ud558\uc9c0?\n", "\n", "- \ud2b9\uc815\ud55c \uac12\ub4e4\uc740 \uacc4\uc0b0\uc744 \ud558\uc9c0 \uc54a\uc73c\ub824\uace0??? \uc74c.." ] }, { "cell_type": "code", "collapsed": false, "input": [ "result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 40 }, { "cell_type": "code", "collapsed": false, "input": [ "result" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0 one 1 2 3 4 NaN
1 two 5 6NaN 8 world
2 three 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 41, "text": [ " something a b c d message\n", "0 one 1 2 3 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11 12 foo" ] } ], "prompt_number": 41 }, { "cell_type": "code", "collapsed": false, "input": [ "# world\ub97c NA\uac12\uc73c\ub85c \ucc98\ub9ac\ud558\ub2c8 NaN\uc73c\ub85c \ub098\uc628\ub2e4.\n", "# \ud2b9\uc815\ud55c \uac12\uc744 NA \ucc98\ub9ac\ud560 \uc218 \uc788\uc744\uac83 \uac19\ub2e4.\n", "pd.read_csv('ch06/ex5.csv', na_values=['world'])" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0 one 1 2 3 4 NaN
1 two 5 6NaN 8 NaN
2 three 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 42, "text": [ " something a b c d message\n", "0 one 1 2 3 4 NaN\n", "1 two 5 6 NaN 8 NaN\n", "2 three 9 10 11 12 foo" ] } ], "prompt_number": 42 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uc5f4\ub9c8\ub2e4 \ub2e4\ub978 NA \ubb38\uc790\ub97c \uc0ac\uc804 \uac12\uc73c\ub85c \ub118\uaca8 \ucc98\ub9ac \uac00\ub2a5" ] }, { "cell_type": "code", "collapsed": false, "input": [ "sentinels = {'message': ['foo', 'NA'], 'something': ['two']}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 43 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex5.csv', na_values=sentinels)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0 one 1 2 3 4 NaN
1 NaN 5 6NaN 8 world
2 three 9 10 11 12 NaN
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 44, "text": [ " something a b c d message\n", "0 one 1 2 3 4 NaN\n", "1 NaN 5 6 NaN 8 world\n", "2 three 9 10 11 12 NaN" ] } ], "prompt_number": 44 }, { "cell_type": "code", "collapsed": false, "input": [ "result = pd.read_csv('ch06/ex6.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 45 }, { "cell_type": "code", "collapsed": false, "input": [ "result" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n",
        "<class 'pandas.core.frame.DataFrame'>\n",
        "Int64Index: 10000 entries, 0 to 9999\n",
        "Data columns (total 5 columns):\n",
        "one      10000  non-null values\n",
        "two      10000  non-null values\n",
        "three    10000  non-null values\n",
        "four     10000  non-null values\n",
        "key      10000  non-null values\n",
        "dtypes: float64(4), object(1)\n",
        "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 46, "text": [ "\n", "Int64Index: 10000 entries, 0 to 9999\n", "Data columns (total 5 columns):\n", "one 10000 non-null values\n", "two 10000 non-null values\n", "three 10000 non-null values\n", "four 10000 non-null values\n", "key 10000 non-null values\n", "dtypes: float64(4), object(1)" ] } ], "prompt_number": 46 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### nrows\ub85c \ucc98\uc74c \uba87 \uc904\ub9cc \uc77d\uc5b4\ubcfc \uc218 \uc788\ub2e4.\n", "\n", "- \ub9ac\ub205\uc2a4\uc758 head \ud30c\uc77c\uc774\ub984 \uacfc \uac19\ub2e4\uace0 \uc0dd\uac01\ud558\uba74 \ub41c\ub2e4." ] }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex6.csv', nrows=5)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefourkey
0 0.467976-0.038649-0.295344-1.824726 L
1-0.358893 1.404453 0.704965-0.200638 B
2-0.501840 0.659254-0.421691-0.057688 G
3 0.204886 1.074134 1.388361-0.982404 R
4 0.354628-0.133116 0.283763-0.837063 Q
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 47, "text": [ " one two three four key\n", "0 0.467976 -0.038649 -0.295344 -1.824726 L\n", "1 -0.358893 1.404453 0.704965 -0.200638 B\n", "2 -0.501840 0.659254 -0.421691 -0.057688 G\n", "3 0.204886 1.074134 1.388361 -0.982404 R\n", "4 0.354628 -0.133116 0.283763 -0.837063 Q" ] } ], "prompt_number": 47 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TextParser \uac1d\uccb4\ub97c \uc774\uc6a9\ud574\uc11c chunksize\uc5d0 \ub530\ub77c \ubd84\ub9ac\ub41c \ud30c\uc77c\uc744 \uc21c\ud68c \uac00\ub2a5" ] }, { "cell_type": "code", "collapsed": false, "input": [ "chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 48 }, { "cell_type": "code", "collapsed": false, "input": [ "chunker" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 49, "text": [ "" ] } ], "prompt_number": 49 }, { "cell_type": "code", "collapsed": false, "input": [ "chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)\n", "\n", "tot = Series([])\n", "for piece in chunker:\n", " tot = tot.add( piece['key'].value_counts(), fill_value=0)\n", "\n", "tot = tot.order(ascending=False)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 50 }, { "cell_type": "code", "collapsed": false, "input": [ "tot[:10]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 51, "text": [ "E 368\n", "X 364\n", "L 346\n", "O 343\n", "Q 340\n", "M 338\n", "J 337\n", "F 335\n", "K 334\n", "H 330\n", "dtype: float64" ] } ], "prompt_number": 51 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "6.1.2 \ub370\uc774\ud130\ub97c \ud14d\uc2a4\ud2b8 \ud615\uc2dd\uc73c\ub85c \uae30\ub85d\ud558\uae30" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data = pd.read_csv('ch06/ex5.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 52 }, { "cell_type": "code", "collapsed": false, "input": [ "data" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0 one 1 2 3 4 NaN
1 two 5 6NaN 8 world
2 three 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 53, "text": [ " something a b c d message\n", "0 one 1 2 3 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11 12 foo" ] } ], "prompt_number": 53 }, { "cell_type": "code", "collapsed": false, "input": [ "data.to_csv('ch06/out.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 54 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/out.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ",something,a,b,c,d,message\r\n", "0,one,1,2,3.0,4,\r\n", "1,two,5,6,,8,world\r\n", "2,three,9,10,11.0,12,foo\r\n" ] } ], "prompt_number": 55 }, { "cell_type": "code", "collapsed": false, "input": [ "data.to_csv(sys.stdout, sep='|')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "|something|a|b|c|d|message\n", "0|one|1|2|3.0|4|\n", "1|two|5|6||8|world\n", "2|three|9|10|11.0|12|foo\n" ] } ], "prompt_number": 56 }, { "cell_type": "code", "collapsed": false, "input": [ "# Write DataFrame to a comma-separated value (csv) file\n", "# na_rep -> Missing data representation. NA REPresentation\n", "data.to_csv?" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 57 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### na_rep\ub85c \ub204\ub77d\ub41c\uac12\uc744 \uc6d0\ud558\ub294 \uac12\uc73c\ub85c \ubcc0\uacbd \uac00\ub2a5" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data.to_csv(sys.stdout, na_rep='NULL')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ",something,a,b,c,d,message\n", "0,one,1,2,3.0,4,NULL\n", "1,two,5,6,NULL,8,world\n", "2,three,9,10,11.0,12,foo\n" ] } ], "prompt_number": 58 }, { "cell_type": "code", "collapsed": false, "input": [ "data.to_csv(sys.stdout, na_rep='NaN')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ",something,a,b,c,d,message\n", "0,one,1,2,3.0,4,NaN\n", "1,two,5,6,NaN,8,world\n", "2,three,9,10,11.0,12,foo\n" ] } ], "prompt_number": 59 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### row, column \uac12\uc744 \uc800\uc7a5\ud560 \uac83\uc778\uc9c0 \uc120\ud0dd \uac00\ub2a5" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data.to_csv(sys.stdout, index=False, header=False)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "one,1,2,3.0,4,\n", "two,5,6,,8,world\n", "three,9,10,11.0,12,foo\n" ] } ], "prompt_number": 60 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \ucee4\ub7fc\uc758 \uc77c\ubd80\ubd84\ub9cc \uae30\ub85d \uac00\ub2a5, \uc21c\uc11c\ub97c \uc9c1\uc811 \uc9c0\uc815 \uac00\ub2a5" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data.to_csv(sys.stdout, index=False, cols=['a', 'b', 'c'])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "a,b,c\n", "1,2,3.0\n", "5,6,\n", "9,10,11.0\n" ] } ], "prompt_number": 61 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Series\uc5d0\ub3c4 to_csv method \uc874\uc7ac" ] }, { "cell_type": "code", "collapsed": false, "input": [ "dates = pd.date_range('1/1/2000', periods=7)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 62 }, { "cell_type": "code", "collapsed": false, "input": [ "ts = Series(np.arange(7), index=dates)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 63 }, { "cell_type": "code", "collapsed": false, "input": [ "ts.to_csv('ch06/tseries.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 64 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/tseries.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "2000-01-01,0\r\n", "2000-01-02,1\r\n", "2000-01-03,2\r\n", "2000-01-04,3\r\n", "2000-01-05,4\r\n", "2000-01-06,5\r\n", "2000-01-07,6\r\n" ] } ], "prompt_number": 65 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uc57d\uac04 \ubcf5\uc7a1\ud558\uac8c \ud5e4\ub354\ub97c \uc5c6\uc560\uace0 \uccab \ubc88\uc9f8 \uce7c\ub7fc\uc744 \uc0c9\uc778\uc73c\ub85c \ud558\uba74 read_csv \uba54\uc11c\ub4dc\ub85c Series \uac1d\uccb4\ub97c \uc5bb\uc744 \uc218 \uc788\uc9c0\ub9cc from_csv \uba54\uc11c\ub4dc\uac00 \uc880 \ub354 \ud3b8\ub9ac\ud558\uace0 \uac04\ub2e8\ud558\uac8c \ubb38\uc81c \ud574\uacb0" ] }, { "cell_type": "code", "collapsed": false, "input": [ "pd.DataFrame.to_csv?" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 66 }, { "cell_type": "code", "collapsed": false, "input": [ "Series.from_csv('ch06/tseries.csv', parse_dates=True)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 67, "text": [ "2000-01-01 0\n", "2000-01-02 1\n", "2000-01-03 2\n", "2000-01-04 3\n", "2000-01-05 4\n", "2000-01-06 5\n", "2000-01-07 6\n", "dtype: int64" ] } ], "prompt_number": 67 }, { "cell_type": "code", "collapsed": false, "input": [ "type( Series.from_csv('ch06/tseries.csv', parse_dates=True) )" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 68, "text": [ "pandas.core.series.Series" ] } ], "prompt_number": 68 }, { "cell_type": "code", "collapsed": false, "input": [ "# parse dates: boolean, default True.\n", "# Parse dates. Different default from read_table\n", "Series.from_csv?" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 69 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### read_csv\ub97c Series\ub85c \uc77d\uc744 \uc218 \uc788\ub2e4\uace0 \uc2e4\ud5d8\ud558\ub294 \uc911\uc778\ub370 \uc798 \uc548\ub418\ub124.. \n", "\n", "- DataFrame\uc73c\ub85c \uc77d\uc5b4\uc9d0" ] }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/tseries.csv', header=None)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
0 2000-01-01 0
1 2000-01-02 1
2 2000-01-03 2
3 2000-01-04 3
4 2000-01-05 4
5 2000-01-06 5
6 2000-01-07 6
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 70, "text": [ " 0 1\n", "0 2000-01-01 0\n", "1 2000-01-02 1\n", "2 2000-01-03 2\n", "3 2000-01-04 3\n", "4 2000-01-05 4\n", "5 2000-01-06 5\n", "6 2000-01-07 6" ] } ], "prompt_number": 70 }, { "cell_type": "code", "collapsed": false, "input": [ "type(pd.read_csv('ch06/tseries.csv', header=None))" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 71, "text": [ "pandas.core.frame.DataFrame" ] } ], "prompt_number": 71 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv?" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 72 }, { "cell_type": "markdown", "metadata": {}, "source": [ "###6.1.3 \uc218\ub3d9\uc73c\ub85c \uad6c\ubd84 \ud615\uc2dd \ucc98\ub9ac\ud558\uae30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### read_table\uc5d0\uc11c \uc77d\uc744 \uc218 \uc5c6\ub294 \uc798\ubabb\ub41c \ud615\uc2dd\uc758 \uc904\uc774 \ud3ec\ud568\ub41c \ub370\uc774\ud130\uac00 \ub4dc\ubb3c\uac8c \ubc1c\uacac \ub428 -> \uc218\ub3d9 \ucc98\ub9ac" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/ex7.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\"a\",\"b\",\"c\"\r\n", "\"1\",\"2\",\"3\"\r\n", "\"1\",\"2\",\"3\",\"4\"\r\n" ] } ], "prompt_number": 73 }, { "cell_type": "code", "collapsed": false, "input": [ "import csv\n", "f = open('ch06/ex7.csv')\n", "\n", "reader = csv.reader(f)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 74 }, { "cell_type": "code", "collapsed": false, "input": [ "for line in reader:\n", " print line" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "['a', 'b', 'c']\n", "['1', '2', '3']\n", "['1', '2', '3', '4']\n" ] } ], "prompt_number": 75 }, { "cell_type": "code", "collapsed": false, "input": [ "lines = list(csv.reader(open('ch06/ex7.csv')))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 76 }, { "cell_type": "code", "collapsed": false, "input": [ "header, values = lines[0], lines[1:]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 77 }, { "cell_type": "code", "collapsed": false, "input": [ "header" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 78, "text": [ "['a', 'b', 'c']" ] } ], "prompt_number": 78 }, { "cell_type": "code", "collapsed": false, "input": [ "values" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 79, "text": [ "[['1', '2', '3'], ['1', '2', '3', '4']]" ] } ], "prompt_number": 79 }, { "cell_type": "code", "collapsed": false, "input": [ "# header = a,b,c\n", "# values\ub97c 1,1\uc744 \uac19\uc774 \ubb36\ub294\ub2e4. 2,2 \ubb36\uace0. 3,3 \ubb36\uace0. 4\ub294 header\uac00 a,b,c 3\uac1c \ubc16\uc5d0 \uc5c6\uae30 \ub54c\ubb38\uc5d0 \ud3ec\ud568\ub418\uc9c0 \uc54a\ub294\ub2e4.\n", "data_dict = {h: v for h, v in zip(header, zip(*values))}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 80 }, { "cell_type": "code", "collapsed": false, "input": [ "data_dict" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 81, "text": [ "{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}" ] } ], "prompt_number": 81 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### CSV \ud30c\uc77c\uc740 \ub2e4\uc591\ud55c \ud30c\uc77c \uc874\uc7ac\ud558\uae30 \ub54c\ubb38\uc5d0 \ub2e4\uc591\ud55c \uc635\uc158\ub4e4\uc740 csv.Dialect \uc0c1\uc18d\ubc1b\uc544 \ud574\uacb0\n", "\n", "- \ub2e4\uc591\ud55c \uad6c\ubd84\uc790\n", "- \ubb38\uc790\uc5f4\uc744 \ub458\ub7ec\uc2f8\ub294 \ubc29\ubc95\n", "- \uac1c\ud589\ubb38\uc790" ] }, { "cell_type": "code", "collapsed": false, "input": [ "class my_dialect(csv.Dialect):\n", " lineterminator = '\\n'\n", " delimiter = ';'\n", " quotechar = '\"'\n", "\n", "reader = csv.reader" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 82 }, { "cell_type": "code", "collapsed": false, "input": [ "reader = csv.reader?" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 83 }, { "cell_type": "code", "collapsed": false, "input": [ "reader = csv.reader" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "reader = csv.reader" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 84 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TypeError: \"quoting\" must be an integer\n", "\n", "- [_csv.Error: field larger than field limit (131072) \ucc38\uace0](http://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# quoting\uc774 \uaf2d integer\uc5ec\uc57c \ud55c\ub2e4\ub294 \uc624\ub958\uac00 \ubc1c\uc0dd\ud574\uc11c \uc0bd\uc9c8\ud558\ub2e4\uac00 \ub4a4\uc5d0 quoting keyword\ub97c \ubd99\uc5ec\uc90c..\n", "reader = csv.reader(f, dialect=my_dialect)" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "\"quoting\" must be an integer", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# quoting\uc774 \uaf2d integer\uc5ec\uc57c \ud55c\ub2e4\ub294 \uc624\ub958\uac00 \ubc1c\uc0dd\ud574\uc11c \uc0bd\uc9c8\ud558\ub2e4\uac00 \ub4a4\uc5d0 quoting keyword\ub97c \ubd99\uc5ec\uc90c..\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mreader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcsv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdialect\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmy_dialect\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mTypeError\u001b[0m: \"quoting\" must be an integer" ] } ], "prompt_number": 85 }, { "cell_type": "code", "collapsed": false, "input": [ "reader = csv.reader(f, dialect=my_dialect, quoting=csv.QUOTE_NONE)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 86 }, { "cell_type": "code", "collapsed": false, "input": [ "csv.QUOTE_NONE" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 87, "text": [ "3" ] } ], "prompt_number": 87 }, { "cell_type": "code", "collapsed": false, "input": [ "reader = csv.reader(f, delimiter='|')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 88 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### [13.1. csv \u2014 CSV File Reading and Writing](http://docs.python.org/3.1/library/csv.html)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# \uc5b4\ub5a4 \uc635\uc158\ub4e4 \uc788\ub294\uc9c0 \ubcf4\ub824\uace0 \ud588\ub354\ub2c8 \uc548 \ubcf4\uc5ec\uc8fc\ub124...\n", "csv.reader??" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 89 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### CSV Note\n", "\n", "- \uc880 \ub354 \ubcf5\uc7a1\ud558\uac70\ub098 \uad6c\ubd84\uc790\uac00 \ud55c \uae00\uc790\ub97c \ucd08\uacfc\ud558\ub294 \uace0\uc815 \uae38\uc774\ub97c \uac00\uc9c4\ub2e4\uba74 csv \ubaa8\ub4c8\uc744 \uc0ac\uc6a9\ud560 \uc218 \uc5c6\ub2e4.\n", "- \uc774\ub7f0 \uacbd\uc6b0\uc5d0\ub294 \uc904\uc744 \ub098\ub204\uace0 \ubb38\uc790\uc5f4\uc758 split \uba54\uc11c\ub4dc\ub098 \uc815\uaddc\ud45c\ud604\uc2dd \uba54\uc11c\ub4dc\uc778 re.split \ub4f1\uc744 \uc774\uc6a9\ud574\uc11c \uac00\uacf5\ud558\ub294 \uc791\uc5c5\uc744 \ud574\uc57c \ud55c\ub2e4." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### CSV \ud30c\uc77c \uae30\ub85d" ] }, { "cell_type": "code", "collapsed": false, "input": [ "with open('mydata.csv', 'w') as f:\n", " writer = csv.writer(f, dialect=my_dialect, quoting=csv.QUOTE_NONE)\n", " writer.writerow(('one', 'two', 'three'))\n", " writer.writerow(('1', '2', '3'))\n", " writer.writerow(('4', '5', '6'))\n", " writer.writerow(('7', '8', '9'))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 90 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat mydata.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "one;two;three\r\n", "1;2;3\r\n", "4;5;6\r\n", "7;8;9\r\n" ] } ], "prompt_number": 91 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### JSON \ub370\uc774\ud130\n", "\n", "- JSON(JavaScript Object Notation)\uc740 \uc6f9\ube0c\ub77c\uc6b0\uc800\uc640 \ub2e4\ub978 \uc560\ud50c\ub9ac\ucf00\uc774\uc158\uc774 HTTP \uc694\uccad\uc73c\ub85c \ub370\uc774\ud130\ub97c \ubcf4\ub0bc \ub54c \ub110\ub9ac \uc0ac\uc6a9\ud558\ub294 \ud45c\uc900 \ud30c\uc77c \ud615\uc2dd \uc911 \ud558\ub098\ub2e4.\n", "- JSON\uc740 CSV \uac19\uc740 \ud45c \ud615\uc2dd\uc758 \ud14d\uc2a4\ud2b8\ubcf4\ub2e4 \uc880 \ub354 \uc720\uc5f0\ud55c \ub370\uc774\ud130 \ud615\uc2dd\uc774\uba70, JSON \ub370\uc774\ud130\uc758 \uc608\ub294 \ub2e4\uc74c\uacfc \uac19\ub2e4." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# json\uc740 python\uc5d0\uc11c\ucc98\ub7fc '\uc73c\ub85c \ud558\uba74 \uc548\ub41c\ub2e4. \ud604\uc7ac \"\"\"\ub85c \uac10\uc2f8 \ubb38\uc790\uc5f4\ub85c \uc800\uc7a5\ub418\uc5b4 \uc788\uae30 \ub54c\ubb38\uc5d0 javascript\uc5d0\uc11c\ub294 '\ub97c string \uac12\uc73c\ub85c \uc778\uc2dd\ud558\uc9c0 \uc54a\uc544\uc11c \uc5d0\ub7ec \ubc1c\uc0dd\n", "obj = \"\"\"\n", "{\n", " 'name': 'Wes',\n", " 'places_lived': ['United States', 'Spain', 'Germany'],\n", " 'pet': null, 'siblings': [{'name': 'Scott', 'age':25, 'pet':'Zuko'},\n", " {'name': 'Katie', 'age':33, 'pet': 'Cisco'}]\n", "}\n", "\"\"\"" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 92 }, { "cell_type": "code", "collapsed": false, "input": [ "# ValueError: Expecting property name: line 3 column 5 (char 7)\n", "result = json.loads(obj)" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'json' is not defined", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# ValueError: Expecting property name: line 3 column 5 (char 7)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 'json' is not defined" ] } ], "prompt_number": 93 }, { "cell_type": "code", "collapsed": false, "input": [ "obj = \"\"\"\n", "{\n", " \"name\": \"Wes\",\n", " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n", " \"pet\": null, \"siblings\": [{\"name\": \"Scott\", \"age\":25, \"pet\":\"Zuko\"},\n", " {\"name\": \"Katie\", \"age\":33, \"pet\": \"Cisco\"}]\n", "}\n", "\"\"\"" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 94 }, { "cell_type": "code", "collapsed": false, "input": [ "obj" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 95, "text": [ "'\\n{\\n \"name\": \"Wes\",\\n \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\\n \"pet\": null, \"siblings\": [{\"name\": \"Scott\", \"age\":25, \"pet\":\"Zuko\"},\\n {\"name\": \"Katie\", \"age\":33, \"pet\": \"Cisco\"}]\\n}\\n'" ] } ], "prompt_number": 95 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### JSON\uc740 \ub110 \uac12\uc778 null\uacfc \ub2e4\ub978 \uba87 \uac00\uc9c0 \uc0ac\uc18c\ud55c \uc8fc\uc758\uc0ac\ud56d(\ub9ac\uc2a4\ud2b8\uc758 \ub9c8\uc9c0\ub9c9\uc5d0 \uc27c\ud45c\uac00 \uc788\uc73c\uba74 \uc548\ub41c\ub2e4\ub358\uac00 \ud558\ub294)\uc744 \uc81c\uc678\ud558\uba74 \ud30c\uc774\uc36c \ucf54\ub4dc\uc640 \uac70\uc758 \uc720\uc0ac\n", "\n", "- \uae30\ubcf8 \uc790\ub8cc\ud615\uc740 \uac1d\uccb4(\uc0ac\uc804), \ubc30\uc5f4(\ub9ac\uc2a4\ud2b8), \ubb38\uc790\uc5f4, \uc22b\uc790, \ubd88\ub9ac\uc5b8 \uadf8\ub9ac\uace0 \ub110\n", "- \uac1d\uccb4\uc758 \ud0a4\ub294 \ubc18\ub4dc\uc2dc \ubb38\uc790\uc5f4\n", "- JSON \uc77d\uace0 \uc4f8 \uc218 \uc788\ub294 \ub77c\uc774\ube0c\ub7ec\ub9ac\uac00 \uba87 \uac1c \uc788\uc9c0\ub9cc \ud45c\uc900 \ub77c\uc774\ube0c\ub7ec\ub9ac\uc778 json \uc0ac\uc6a9" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import json" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 96 }, { "cell_type": "code", "collapsed": false, "input": [ "# ValueError: Expecting property name: line 3 column 5 (char 7)\n", "result = json.loads(obj)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 97 }, { "cell_type": "code", "collapsed": false, "input": [ "result" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 98, "text": [ "{u'name': u'Wes',\n", " u'pet': None,\n", " u'places_lived': [u'United States', u'Spain', u'Germany'],\n", " u'siblings': [{u'age': 25, u'name': u'Scott', u'pet': u'Zuko'},\n", " {u'age': 33, u'name': u'Katie', u'pet': u'Cisco'}]}" ] } ], "prompt_number": 98 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### json.dumps\ub294 \ud30c\uc774\uc36c \uac1d\uccb4\ub97c JSON \ud615\ud0dc\ub85c \ubcc0\ud658" ] }, { "cell_type": "code", "collapsed": false, "input": [ "asjson = json.dumps(result)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 99 }, { "cell_type": "code", "collapsed": false, "input": [ "# '\uac00 \uc544\ub2c8\ub77c \"\uc778 \uac83\uc744 \ud655\uc778\ud558\uc790\n", "asjson" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 100, "text": [ "'{\"pet\": null, \"siblings\": [{\"pet\": \"Zuko\", \"age\": 25, \"name\": \"Scott\"}, {\"pet\": \"Cisco\", \"age\": 33, \"name\": \"Katie\"}], \"name\": \"Wes\", \"places_lived\": [\"United States\", \"Spain\", \"Germany\"]}'" ] } ], "prompt_number": 100 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### JSON \uac1d\uccb4\ub098 \uac1d\uccb4\uc758 \ub9ac\uc2a4\ud2b8\ub97c DataFrame\uc774\ub098 \ub2e4\ub978 \uc790\ub8cc \uad6c\uc870\ub85c \uc5b4\ub5bb\uac8c \ubcc0\ud658\ud574\uc11c \ubd84\uc11d\uc744 \ud560 \uac83\uc778\uc9c0\ub294 \ub3c5\uc790\uc758 \ubaab\n", "\n", "- JSON \uac1d\uccb4\uc758 \ub9ac\uc2a4\ud2b8\ub97c DataFrame \uc0dd\uc131\uc790\ub85c \ub118\uae30\uace0 \ub370\uc774\ud130 \ud544\ub4dc \uc120\ud0dd \uac00\ub2a5" ] }, { "cell_type": "code", "collapsed": false, "input": [ "siblings = DataFrame(result['siblings'], columns=['name', 'age'])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 101 }, { "cell_type": "code", "collapsed": false, "input": [ "siblings" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameage
0 Scott 25
1 Katie 33
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 102, "text": [ " name age\n", "0 Scott 25\n", "1 Katie 33" ] } ], "prompt_number": 102 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### pandas\uc5d0\uc11c JSON\uc744 \ube60\ub974\uac8c \uc77d\uace0(from_json) \uc4f0\ub294(to_json) \ub124\uc774\ud2f0\ube0c \uad6c\ud604\uc911" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "6.1.5 XML\uacfc HTML: \uc6f9 \ub0b4\uc6a9 \uae01\uc5b4\uc624\uae30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### lxml\n", "\n", "- \uc544\uc8fc \ud070 \ud30c\uc77c\uc744 \ube60\ub974\uac8c \ucc98\ub9ac \uac00\ub2a5\n", "- \uc5ec\ub7ec \uc885\ub958\uc758 \uc778\ud130\ud398\uc774\uc2a4 \uc81c\uacf5\n", "- lxml.html, lxml.objectify\ub97c \uc774\uc6a9\ud574\uc11c XML \ucc98\ub9ac\n", "\n", "#### \ub300\ubd80\ubd84\uc758 \uc6f9\uc0ac\uc774\ud2b8\ub294 \ub531 \ud544\uc694\ud55c \ub0b4\uc6a9\ub9cc \ub4e4\uc5b4\uc788\ub294 JSON\uc774\ub098 XML\uc744 \ub9ce\uc774 \uc0ac\uc6a9\ud558\uc9c0 \uc54a\uace0 HTML\uc744 \uc0ac\uc6a9" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from lxml.html import parse\n", "from urllib2 import urlopen\n", "\n", "# \ub370\uc774\ud130\ub97c \uac00\uc838 \uc62c url\uc744 \ub118\uae34 \ud6c4\n", "# \ub370\uc774\ud130\ub97c \ubc1b\uc544 \uc628 \ud6c4 parse\n", "parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))\n", "\n", "doc = parsed.getroot()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 103 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### doc \uac1d\uccb4\uc5d0\ub294 \ubaa8\ub4e0 HTML \ud0dc\uadf8 \ucd94\ucd9c\n", "\n", "- \uc6b0\ub9ac\uac00 \uad00\uc2ec \uac00\uc838\uc57c \ud560 table \ud0dc\uadf8\ub3c4 \ud3ec\ud568\n", "- \uc5b4\ub5bb\uac8c \ub3d9\uc791\ud558\ub294\uc9c0 \ud655\uc778\ud558\uae30 \uc704\ud574 \uae01\uc5b4\uc628 HTML \ubb38\uc11c\uc5d0\uc11c \uc678\ubd80 \uc5f0\uacb0 URL\uc744 \ubaa8\ub450 \ucc3e\uc544\ubcf4\uc790.\n", "- \uc678\ubd80 \uc5f0\uacb0\uc740 a \ud0dc\uadf8\ub85c \uc9c0\uc815\n", "- findall \uba54\uc11c\ub4dc\uc5d0 XPath(\ubb38\uc11c \uc9c8\uc758 \uc5b8\uc5b4)\ub97c \uc0ac\uc6a9\ud574\uc11c \ud574\ub2f9 \uc5d8\ub9ac\uba3c\ud2b8\ub97c \uac00\uc838\uc62c \uc218 \uc788\ub2e4.\n", "\n", "#### XPath tutorial site\n", "\n", "- [W3schools](http://www.w3schools.com/XPath/)\n", "- [XPath and XSLT with lxml](http://lxml.de/xpathxslt.html)\n", "- [Using Chrome Developer Tools](http://stackoverflow.com/questions/3030487/is-there-a-way-to-get-the-xpath-in-google-chrome)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "links = doc.findall('.//a')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 104 }, { "cell_type": "code", "collapsed": false, "input": [ "links[15:20]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 105, "text": [ "[,\n", " ,\n", " ,\n", " ,\n", " ]" ] } ], "prompt_number": 105 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uc774 \uac1d\uccb4\ub294 HTML \uc5d8\ub9ac\uba3c\ud2b8\ub97c \ud45c\ud604\ud558\ub294 \uac1d\uccb4\uc77c \ubfd0\n", "\n", "- \uc5d8\ub9ac\uba3c\ud2b8\ub97c \ud45c\ud604\ud558\ub294 \uac1d\uccb4\ub77c\uace0 \uc0dd\uac01\ud558\uc790. \uc548 \uadf8\ub7ec\uba74 \uc0bd\uc9c8\ud558\uac8c \ub41c\ub2e4!\n", "- URL\uacfc \ub9c1\ud06c \uc774\ub984\uc744 \uac00\uc838\uc624\ub824\uba74 \uac01 \uc5d8\ub9ac\uba3c\ud2b8\uc5d0 \ub300\ud574 get \uba54\uc11c\ub4dc\ub97c \ud638\ucd9c\ud558\uc5ec URL\uc744 \uc5bb\uace0, text_content \uba54\uc11c\ub4dc\ub97c \uc774\uc6a9\ud574\uc11c \ub9c1\ud06c \uc774\ub984\uc744 \uac00\uc838\uc640\uc57c \ud55c\ub2e4." ] }, { "cell_type": "code", "collapsed": false, "input": [ "lnk = links[28]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 106 }, { "cell_type": "code", "collapsed": false, "input": [ "lnk" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 107, "text": [ "" ] } ], "prompt_number": 107 }, { "cell_type": "code", "collapsed": false, "input": [ "lnk.get('href')" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 108, "text": [ "'https://edit.yahoo.com/mc2.0/eval_profile?.intl=us&.lang=en-US&.done=http://finance.yahoo.com/q/op%3fs=AAPL%2bOptions&.src=quote&.intl=us&.lang=en-US'" ] } ], "prompt_number": 108 }, { "cell_type": "code", "collapsed": false, "input": [ "lnk.text_content()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 109, "text": [ "'Account Info'" ] } ], "prompt_number": 109 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### [list comprehensions in Python](http://www.pythonforbeginners.com/lists/list-comprehensions-in-python/)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "urls = [lnk.get('href') for lnk in doc.findall('.//a')]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 110 }, { "cell_type": "code", "collapsed": false, "input": [ "urls[-10:]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 111, "text": [ "['/q/op?s=AAPL&k=580.000000',\n", " '/q?s=AAPL140328P00580000',\n", " '/q/op?s=AAPL&k=600.000000',\n", " '/q?s=AAPL140328P00600000',\n", " '/q/os?s=AAPL&m=2014-03-28',\n", " 'http://help.yahoo.com/l/us/yahoo/finance/quotes/fitadelay.html',\n", " 'http://billing.finance.yahoo.com/realtime_quotes/signup?.src=quote&.refer=quote',\n", " 'http://www.capitaliq.com',\n", " 'http://www.csidata.com',\n", " 'http://www.morningstar.com/']" ] } ], "prompt_number": 111 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \ucc3e\uace0\uc790 \ud558\ub294 table \uc77c\uc77c\uc774 \ud655\uc778\n", "\n", "- \uba87\uba87 \uc6f9\uc0ac\uc774\ud2b8\ub294 table\ub9c8\ub2e4 id \uc18d\uc131\uc744 \uc918\uc11c \uc27d\uac8c \ud560 \uc218 \uc788\uc9c0\ub9cc \uc5b4\ub514 \uc138\uc0c1 \uc77c\uc774 \uc27d\uac8c \ub418\ub294\uac8c \uc788\ub098? \ub178\uac00\ub2e4 \ud574\uc57c\uc9c0.." ] }, { "cell_type": "code", "collapsed": false, "input": [ "tables = doc.findall('.//table')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 112 }, { "cell_type": "code", "collapsed": false, "input": [ "calls = tables[9]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 113 }, { "cell_type": "code", "collapsed": false, "input": [ "puts = tables[13]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 114 }, { "cell_type": "code", "collapsed": false, "input": [ "rows = calls.findall('.//tr')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 115 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### \uc6f9\ud398\uc774\uc9c0 \uad6c\uc870\uac00 \uc548 \ubc14\ub00c\uc5c8\ub124.\n", "\n", "- \ucc45\uc744 \uc4f4\uac8c 2012\ub144 10\uc6d4 29\uc77c\uc778\ub370 \uc544\uc9c1\uae4c\uc9c0 \uc548 \ubc14\ub00c\ub2e4\ub2c8..." ] }, { "cell_type": "code", "collapsed": false, "input": [ "def _unpack(row, kind='td'):\n", " elts = row.findall('.//%s' % kind)\n", " return [val.text_content() for val in elts]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 116 }, { "cell_type": "code", "collapsed": false, "input": [ "_unpack(rows[0], kind='th')" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 117, "text": [ "['Strike', 'Symbol', 'Last', 'Chg', 'Bid', 'Ask', 'Vol', 'Open Int']" ] } ], "prompt_number": 117 }, { "cell_type": "code", "collapsed": false, "input": [ "_unpack(rows[1], kind='td')" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 118, "text": [ "['440.00',\n", " 'AAPL7140328C00440000',\n", " '94.04',\n", " ' 0.00',\n", " '98.30',\n", " '102.15',\n", " '1',\n", " '1']" ] } ], "prompt_number": 118 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \ub2e8\uacc4\ub4e4\uc744 \ud1b5\ud569\ud558\uc5ec \uc6f9\uc5d0\uc11c \uae01\uc5b4\uc628 \ub370\uc774\ud130\ub97c DataFrame\uc73c\ub85c \ubcc0\ud658\n", "\n", "- \uc22b\uc790 \ub370\uc774\ud130\uc9c0\ub9cc \uc5ec\uc804\ud788 \ubb38\uc790\uc5f4 \ud615\uc2dd\uc73c\ub85c \uc800\uc7a5\ub418\uc5b4 \uc788\uc73c\ubbc0\ub85c \uc801\uc808\ud558\uac8c \ubcc0\ud658\uc744 \ud574\uc918\uc57c \ud558\ub294\ub370 \ubaa8\ub4e0 \ub370\uc774\ud130\uac00 \uc2e4\uc218\ud615\uc740 \uc544\ub2d0 \uac83\uc774\ubbc0\ub85c \uc774 \uc791\uc5c5\uc740 \uc218\ub3d9\uc73c\ub85c \ucc98\ub9ac\n", "- \ud558\uc9c0\ub9cc \uc6b4 \uc88b\uaca0\ub3c4 pandas\uc5d0\ub294 TextParser \ud074\ub798\uc2a4\uac00 \uc788\uc5b4 \uc790\ub3d9 \ud615 \ubcc0\ud658\uc744 \uc801\uc808\ud558\uac8c \uc218\ud589\ud574\uc900\ub2e4.\n", "- TextParser \ud074\ub798\uc2a4\ub294 read_csv \ud568\uc218\uc640 \ub2e4\ub978 \ud30c\uc2f1 \ud568\uc218\uc5d0\uc11c\ub3c4 \uc0ac\uc6a9" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pandas.io.parsers import TextParser\n", "\n", "def parse_options_data(table):\n", " rows = table.findall('.//tr')\n", " header = _unpack(rows[0], kind='th')\n", " data = [_unpack(r) for r in rows[1:]]\n", " return TextParser(data, names=header).get_chunk()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 119 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \ub9c8\uc9c0\ub9c9\uc73c\ub85c lxml \ud14c\uc774\ube14 \uac1d\uccb4\ub97c \uc704\uc5d0\uc11c \uc791\uc131\ud55c \ud30c\uc2f1 \ud568\uc218\ub97c \uc774\uc6a9\ud574\uc11c \ucc98\ub9ac\ud558\uba74 DataFrame \uacb0\uacfc\uac12 \uc5bb\uc744 \uc218 \uc788\ub2e4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### [\uc635\uc158(\uae08\uc735) - wiki kr](http://ko.wikipedia.org/wiki/%EC%98%B5%EC%85%98_(%EA%B8%88%EC%9C%B5)\n", "\n", "- \uae08\uc735 \ub370\uc774\ud130\ub97c \ubd84\uc11d\ud558\ub294 \uac83\uc774\uae30 \ub54c\ubb38\uc5d0 \uae08\uc735 \ud30c\ud2b8\uc5d0 \ub300\ud55c \ub3c4\uba54\uc778 \uc9c0\uc2dd\uc774 \uc788\uc5b4\uc57c \ud55c\ub2e4. \ub0b4\uac00 \ubd84\uc11d\ud558\ub824\ub294 \ub370\uc774\ud130\uac00 \uc5b4\ub5a0\ud55c \uc5ed\ud560\uc744 \ud558\ub294\uc9c0 \ubaa8\ub974\uba74 \ub9d0\uc9f1 \ud669!\n", "- \uc635\uc158(option)\uc740 \ud30c\uc0dd \uc0c1\ud488\uc758 \uc77c\uc885\uc774\uba70, \ubbf8\ub9ac \uacb0\uc815\ub41c \uae30\uac04 \uc548\uc5d0 \ud2b9\uc815 \uc0c1\ud488\uc744 \uc815\ud574\uc9c4 \uac00\uaca9\uc73c\ub85c \uc0ac\uace0 \ud314 \uc218 \uc788\ub294 \uad8c\ub9ac\ub97c \ub9d0\ud55c\ub2e4. \n", "- call option: \ud2b9\uc815 \uae08\uc735 \uc0c1\ud488\uc744 \uc815\ud574\uc9c4 \uac00\uaca9\uc5d0 \ub9e4\uc785\ud560 \uc218 \uc788\ub294 \uad8c\ub9ac\ub97c \uac00\uc9c4 \ub9e4\uc785 \uc635\uc158(call option)\n", "- put option: \ub9e4\ub3c4\ud560 \uc218 \uc788\ub294 \uad8c\ub9ac\ub97c \uac00\uc9c4 \ub9e4\ub3c4 \uc635\uc158(put option)\uc73c\ub85c \ub098\ub25c\ub2e4." ] }, { "cell_type": "code", "collapsed": false, "input": [ "call_data = parse_options_data(calls)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 120 }, { "cell_type": "code", "collapsed": false, "input": [ "put_data = parse_options_data(puts)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 121 }, { "cell_type": "code", "collapsed": false, "input": [ "call_data[:10]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StrikeSymbolLastChgBidAskVolOpen Int
0 440.0 AAPL7140328C00440000 94.04 0.00 98.30 102.15 1 1
1 450.0 AAPL140328C00450000 86.25 0.00 89.40 91.75 5 11
2 450.0 AAPL7140328C00450000 79.35 0.00 88.20 92.15 2 2
3 460.0 AAPL140328C00460000 69.52 0.00 78.50 81.80 3 3
4 470.0 AAPL140328C00470000 71.75 2.78 69.55 70.90 16 78
5 480.0 AAPL140328C00480000 68.05 17.45 59.65 61.75 11 32
6 485.0 AAPL140328C00485000 62.35 4.41 54.50 56.80 5 6
7 485.0 AAPL7140328C00485000 64.00 0.00 53.20 57.20 0 1
8 490.0 AAPL140328C00490000 58.50 7.09 49.30 50.90 1 60
9 492.5 AAPL140328C00492500 37.15 0.00 46.30 49.10 5 5
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 122, "text": [ " Strike Symbol Last Chg Bid Ask Vol Open Int\n", "0 440.0 AAPL7140328C00440000 94.04 0.00 98.30 102.15 1 1\n", "1 450.0 AAPL140328C00450000 86.25 0.00 89.40 91.75 5 11\n", "2 450.0 AAPL7140328C00450000 79.35 0.00 88.20 92.15 2 2\n", "3 460.0 AAPL140328C00460000 69.52 0.00 78.50 81.80 3 3\n", "4 470.0 AAPL140328C00470000 71.75 2.78 69.55 70.90 16 78\n", "5 480.0 AAPL140328C00480000 68.05 17.45 59.65 61.75 11 32\n", "6 485.0 AAPL140328C00485000 62.35 4.41 54.50 56.80 5 6\n", "7 485.0 AAPL7140328C00485000 64.00 0.00 53.20 57.20 0 1\n", "8 490.0 AAPL140328C00490000 58.50 7.09 49.30 50.90 1 60\n", "9 492.5 AAPL140328C00492500 37.15 0.00 46.30 49.10 5 5" ] } ], "prompt_number": 122 }, { "cell_type": "code", "collapsed": false, "input": [ "put_data[:10]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StrikeSymbolLastChgBidAskVolOpen Int
0 430 AAPL140328P00430000 0.01 0.00 N/A 0.01 6 164
1 440 AAPL140328P00440000 0.01 0.00 N/A 0.01 105 236
2 440 AAPL7140328P00440000 0.18 0.00 N/A 0.58 1 1
3 450 AAPL140328P00450000 0.01 0.00 N/A 0.01 27 771
4 460 AAPL140328P00460000 0.01 0.01 N/A 0.01 10 281
5 460 AAPL7140328P00460000 0.15 0.00 N/A 0.07 20 20
6 470 AAPL140328P00470000 0.01 0.01 N/A 0.02 2 493
7 480 AAPL140328P00480000 0.02 0.00 N/A 0.02 1 1,065
8 485 AAPL140328P00485000 0.01 0.02 0.01 0.02 63 781
9 485 AAPL7140328P00485000 0.21 0.00 N/A 0.26 10 10
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 123, "text": [ " Strike Symbol Last Chg Bid Ask Vol Open Int\n", "0 430 AAPL140328P00430000 0.01 0.00 N/A 0.01 6 164\n", "1 440 AAPL140328P00440000 0.01 0.00 N/A 0.01 105 236\n", "2 440 AAPL7140328P00440000 0.18 0.00 N/A 0.58 1 1\n", "3 450 AAPL140328P00450000 0.01 0.00 N/A 0.01 27 771\n", "4 460 AAPL140328P00460000 0.01 0.01 N/A 0.01 10 281\n", "5 460 AAPL7140328P00460000 0.15 0.00 N/A 0.07 20 20\n", "6 470 AAPL140328P00470000 0.01 0.01 N/A 0.02 2 493\n", "7 480 AAPL140328P00480000 0.02 0.00 N/A 0.02 1 1,065\n", "8 485 AAPL140328P00485000 0.01 0.02 0.01 0.02 63 781\n", "9 485 AAPL7140328P00485000 0.21 0.00 N/A 0.26 10 10" ] } ], "prompt_number": 123 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### lxml.objectify \uc774\uc6a9\ud574 XML \ud30c\uc2f1\ud558\uae30\n", "\n", "- [XML(eXtensible Markup Language)](http://en.wikipedia.org/wiki/Xml)\uc740 \uacc4\uce35\uc801 \uad6c\uc870\uc640 \uba54\ud0c0\ub370\uc774\ud130\ub97c \ud3ec\ud568\ud558\ub294 \uc911\ucca9\ub41c \ub370\uc774\ud130 \uad6c\uc870\ub97c \uc9c0\uc6d0\ud558\ub294 \ub610 \ub2e4\ub978 \uc720\uba85\ud55c \ub370\uc774\ud130 \ud615\uc2dd\uc774\ub2e4. \uc9c0\uae08 \uc774 \ucc45\ub3c4 \uc2e4\uc81c\ub85c\ub294 XML \ubb38\uc11c\ub85c \uc791\uc131\n", "- \ub274\uc695 MTA(Metropolitan Transportation Authority)\ub294 \ubc84\uc2a4\uc640 \uc804\ucca0 \uc6b4\uc601\uc5d0 \uad00\ud55c \uc5ec\ub7ec \uac00\uc9c0 [\ub370\uc774\ud130 \uacf5\uac1c](http://www.mta.info/developers/download.html)\n", "- \uc0b4\ud3b4\ubcfc \uac83\uc740 \uc5ec\ub7ec XML \ud30c\uc77c\ub85c \uc81c\uacf5\ub418\ub294 \uc2e4\uc801 \uc790\ub8cc\n", "- \uc804\ucca0\uacfc \ubc84\uc2a4 \uc6b4\uc601\uc740 \ub9e4\uc6d4 \uc544\ub798\uc640 \ube44\uc2b7\ud55c \ub0b4\uc6a9\uc758 \uac01\uac01 \ub2e4\ub978 \ud30c\uc77c(Metro-North Railroad\uc758 \uacbd\uc6b0 Preformance_MNR.xml \uac19\uc740)\ub85c \uc81c\uacf5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uc9dc\uc99d\ub098\uac8c Performance_MNR.xml\uc744 \uc5b4\ub5bb\uac8c \ubc1b\ub294\uc9c0 \ubaa8\ub974\uaca0\ub2e4.\n", "\n", "- \uc18c\uc2a4\ud30c\uc77c\uc744 6\uc7a5\uc744 \ub4a4\uc838\ubd10\ub3c4 \uc5c6\uace0\n", "- \ud648\ud398\uc774\uc9c0\uc5d0\ub294 \uc544\ub9c8 XML \uad6c\uc870\uac00 \ubc14\ub010\ub4ef \uc2f6\ub2e4.\n", "- \uadf8\ub798\uc11c \ucd5c\ud6c4\uc758 \uc218\ub2e8\uc73c\ub85c \uc5bc\ub9c8 \uc548\ub418\uc11c \uadf8\ub0e5 \ub0b4\uac00 \uc77c\uc77c\uc774 \ucce4\ub2e4.\n", "- XML\uc740 \uc5c4\uaca9\ud558\uae30 \ub54c\ubb38\uc5d0 \ud558\ub098\ub77c\ub3c4 \uc624\ud0c0\uac00 \uc788\uc73c\uba74 \uc624\ub958 \ubc1c\uc0dd\ud558\ubbc0\ub85c \uc8fc\uc758!" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%%writefile Performance_MNR.xml\n", "\n", " 373889\n", " \n", " MEtro-North Railroad\n", " Escalator Availability\n", " Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.\n", " 2011\n", " 12\n", " Service Indicators\n", " M\n", " U\n", " %\n", " 1\n", " 97.00\n", " \n", " 97.00\n", " \n", "" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Overwriting Performance_MNR.xml\n" ] } ], "prompt_number": 124 }, { "cell_type": "code", "collapsed": false, "input": [ "from lxml import objectify\n", "import urllib2\n", "\n", "path = 'Performance_MNR.xml'\n", "# online_path = 'http://www.mta.info/developers/data/lirr/lirr_gtfs.xml'\n", "\n", "# data = urllib2.urlopen(online_path).read()\n", "# f = open(path, 'w')\n", "# f.write(data)\n", "# f.close()\n", "\n", "parsed = objectify.parse(open(path))\n", "root = parsed.getroot()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 125 }, { "cell_type": "code", "collapsed": false, "input": [ "data = []" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 126 }, { "cell_type": "code", "collapsed": false, "input": [ "skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',\n", " 'DESIRED_CHANGE', 'DECIMAL_PLACES']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 127 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### root.INDICATOR\ub97c \ud1b5\ud574 \ubaa8\ub4e0 \uc5d8\ub9ac\uba3c\ud2b8\ub97c \ub044\uc9d1\uc5b4 \ub0bc \uc218 \uc788\ub2e4\n", "\n", "- \uac01\uac01\uc758 \ud56d\ubaa9\uc5d0 \ub300\ud574 \uba87\uba87 \ud0dc\uadf8\ub294 \uc81c\uc678\ud558\uace0 \ud0dc\uadf8 \uc774\ub984(YTD_ACTUAL \uac19\uc740)\uc744 \ud0a4 \uac12\uc73c\ub85c \ud558\ub294 \uc0ac\uc804\uc744 \ub9cc\ub4e4\uc5b4 \ub0c4" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# root.INDICATOR -> root\n", "\n", "for elt in root:\n", " el_data = {}\n", " for child in elt.getchildren():\n", " if child.tag in skip_fields:\n", " continue\n", " el_data[child.tag] = child.pyval\n", " data.append(el_data)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 128 }, { "cell_type": "code", "collapsed": false, "input": [ "data" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 129, "text": [ "[{'AGENCY_NAME': 'MEtro-North Railroad',\n", " 'CATEGORY': 'Service Indicators',\n", " 'DESCRIPTION': 'Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.',\n", " 'FREQUENCY': 'M',\n", " 'INDICATOR_NAME': 'Escalator Availability',\n", " 'INDICATOR_UNIT': '%',\n", " 'MONTHLY_ACTUAL': u'',\n", " 'MONTHLY_TARGET': 97.0,\n", " 'PERIOD_MONTH': 12,\n", " 'PERIOD_YEAR': 2011,\n", " 'YTD_ACTUAL': u'',\n", " 'YTD_TARGET': 97.0}]" ] } ], "prompt_number": 129 }, { "cell_type": "code", "collapsed": false, "input": [ "# \uc704\uc758 \uac12\uacfc \ube44\uad50\ud558\uae30 \uc704\ud574 \ud14c\uc2a4\ud2b8 \ud574\ubcf8 \uac83\n", "for elt in root:\n", " for child in elt.getchildren():\n", " print child.tag, child.pyval" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "INDICATOR_SEQ 373889\n", "PARENT_SEQ \n", "AGENCY_NAME MEtro-North Railroad\n", "INDICATOR_NAME Escalator Availability\n", "DESCRIPTION Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.\n", "PERIOD_YEAR 2011\n", "PERIOD_MONTH 12\n", "CATEGORY Service Indicators\n", "FREQUENCY M\n", "DESIRED_CHANGE U\n", "INDICATOR_UNIT %\n", "DECIMAL_PLACES 1\n", "YTD_TARGET 97.0\n", "YTD_ACTUAL \n", "MONTHLY_TARGET 97.0\n", "MONTHLY_ACTUAL \n" ] } ], "prompt_number": 130 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 5\uc7a5\uc5d0\uc11c \uc0ac\uc804 \ud615\uc2dd\uc740 DataFrame\uc73c\ub85c \ubcc0\ud658\ud560 \uc218 \uc788\ub2e4\ub294 \uac83 \ucc38\uace0" ] }, { "cell_type": "code", "collapsed": false, "input": [ "perf = DataFrame(data)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 131 }, { "cell_type": "code", "collapsed": false, "input": [ "perf" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AGENCY_NAMECATEGORYDESCRIPTIONFREQUENCYINDICATOR_NAMEINDICATOR_UNITMONTHLY_ACTUALMONTHLY_TARGETPERIOD_MONTHPERIOD_YEARYTD_ACTUALYTD_TARGET
0 MEtro-North Railroad Service Indicators Percent of the time that escalators are operat... M Escalator Availability % 97 12 2011 97
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 132, "text": [ " AGENCY_NAME CATEGORY \\\n", "0 MEtro-North Railroad Service Indicators \n", "\n", " DESCRIPTION FREQUENCY \\\n", "0 Percent of the time that escalators are operat... M \n", "\n", " INDICATOR_NAME INDICATOR_UNIT MONTHLY_ACTUAL MONTHLY_TARGET \\\n", "0 Escalator Availability % 97 \n", "\n", " PERIOD_MONTH PERIOD_YEAR YTD_ACTUAL YTD_TARGET \n", "0 12 2011 97 " ] } ], "prompt_number": 132 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "6.2 \uc774\uc9c4 \ub370\uc774\ud130 \ud615\uc2dd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### \ub370\uc774\ud130\ub97c \ud6a8\uc728\uc801\uc73c\ub85c \uc800\uc7a5\ud558\ub294 \uac00\uc7a5 \uc190\uc26c\uc6b4 \ubc29\ubc95\n", "\n", "- \ud30c\uc774\uc36c\uc5d0 \uae30\ubcf8\uc73c\ub85c \ub0b4\uc7a5\ub418\uc5b4 \uc788\ub294 pickle \uc9c1\ub82c\ud654\ub97c \ud1b5\ud574 \ub370\uc774\ud130\ub97c \uc774\uc9c4 \ud615\uc2dd\uc73c\ub85c \uc800\uc7a5\ud558\ub294 \uac83\n", "- \ud3b8\ub9ac\ud558\uac8c\ub3c4 pandas\uc758 \uac1d\uccb4\ub294 \ubaa8\ub450 pickle\uc744 \uc774\uc6a9\ud574\uc11c \ub370\uc774\ud130\ub97c \uc800\uc7a5\ud558\ub294 save \uba54\uc11c\ub4dc \uc788\uc74c" ] }, { "cell_type": "code", "collapsed": false, "input": [ "frame = pd.read_csv('ch06/ex1.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 133 }, { "cell_type": "code", "collapsed": false, "input": [ "frame" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 134, "text": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] } ], "prompt_number": 134 }, { "cell_type": "code", "collapsed": false, "input": [ "frame.save('ch06/frame_pickle')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stderr", "text": [ "/Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/generic.py:720: FutureWarning: save is deprecated, use to_pickle\n", " warnings.warn(\"save is deprecated, use to_pickle\", FutureWarning)\n" ] } ], "prompt_number": 135 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.load('ch06/frame_pickle')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stderr", "text": [ "/Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/common.py:2306: FutureWarning: load is deprecated, use read_pickle\n", " warnings.warn(\"load is deprecated, use read_pickle\", FutureWarning)\n" ] }, { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 136, "text": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] } ], "prompt_number": 136 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### pickle \uc0ac\uc6a9\uc2dc \uc8fc\uc758\uc0ac\ud56d\n", "\n", "- pickle\uc740 \uc624\ub798 \ubcf4\uad00\ud560 \ud544\uc694\uac00 \uc5c6\ub294 \ub370\uc774\ud130\uc5d0\ub9cc \ucd94\ucc9c\n", "- \uc624\ub79c \uc2dc\uac04\uc774 \uc9c0\ub098\ub3c4 \uc548\uc815\uc801\uc73c\ub85c \ub370\uc774\ud130\ub97c \uc800\uc7a5\ud560 \uac70\ub77c\uace0 \ubcf4\uc7a5 \ubabb\ud568" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "6.2.1 HDF5 \ud615\uc2dd \uc0ac\uc6a9\ud558\uae30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- \ub514\uc2a4\ud06c\uc5d0 \uc774\uc9c4 \ud615\uc2dd\uc73c\ub85c \uc800\uc7a5\ub41c \ub300\uc6a9\ub7c9\uc758 \uacfc\ud559 \uc790\ub8cc\ub97c \ud6a8\uc728\uc801\uc73c\ub85c \uc77d\uace0 \uc4f8 \uc218 \uc788\ub294 \ub2e4\uc591\ud55c \ub3c4\uad6c \uc874\uc7ac\n", "- \uc0b0\uc5c5 \uae30\uc900\uc5d0 \ub9de\ub294 \uc778\uae30 \ub77c\uc774\ube0c\ub7ec\ub9ac\uc911 \ud558\ub098\uac00 HDF5(Hierarchical Data Format), \uacc4\uce35\uc801 \ub370\uc774\ud130 \ud615\uc2dd\n", "- \ub0b4\ubd80\uc801\uc73c\ub85c \ud30c\uc77c \uc2dc\uc2a4\ud15c \uac19\uc740 \ub178\ub4dc \uad6c\uc870\n", "- \uc5ec\ub7ec \uac1c\uc758 \ub370\uc774\ud130\uc14b\uc744 \uc800\uc7a5\ud558\uace0 \ubd80\uac00 \uc815\ubcf4 \uae30\ub85d \uac00\ub2a5\n", "- \ub2e4\uc591\ud55c \uc555\ucd95 \uae30\uc220\uc744 \uc0ac\uc6a9\ud574\uc11c on-the-fly(\uc2e4\uc2dc\uac04) \uc555\ucd95 \uc9c0\uc6d0\n", "- \ubc18\ubcf5\ub418\ub294 \ud328\ud134\uc744 \uac00\uc9c4 \ub370\uc774\ud130 \uc880 \ub354 \ud6a8\uacfc\uc801 \uc800\uc7a5\n", "- \uba54\ubaa8\ub9ac\uc5d0 \ubaa8\ub450 \uc801\uc7ac\ud560 \uc218 \uc5c6\ub294 \uc5c4\u314a\uc5b4\ub098\uac8c \ud070 \ub370\uc774\ud130\ub97c \uc544\uc8fc \ud070 \ubc30\uc5f4\uc5d0\uc11c \ud544\uc694\ud55c \ub9cc\ud07c\uc758 \uc791\uc740 \ubd80\ubd84\ub4e4\ub9cc \ud6a8\uacfc\uc801\uc73c\ub85c \uc77d\uace0 \uc4f8 \uc218 \uc788\ub294 \ud6cc\ub96d\ud55c \uc120\ud0dd\n", "\n", "- PyTables: HDF5\ub97c \ucd94\uc0c1\ud654\ud558\uc5ec \uc5ec\ub7ec\uac00\uc9c0 \uc720\uc5f0\ud55c \ub370\uc774\ud130 \ucee8\ud14c\uc774\ub108\uc640 \ud14c\uc774\ube14 \uc0c9\uc778, \uc9c8\uc758 \uae30\ub2a5 \uadf8\ub9ac\uace0 \uc678\ubd80 \uba54\ubaa8\ub9ac \uc5f0\uc0b0(out-of-core, external memory algorithm) \uc9c0\uc6d0\n", "- h5py: \uc9c1\uc811\uc801\uc774\uc9c0\ub9cc \uace0\uc218\uc900\uc758 HDF5 API\uc5d0 \ub300\ud55c \uc778\ud130\ud398\uc774\uc2a4 \uc81c\uacf5\n", "- pandas\ub294 PyTable\ub97c \uc774\uc6a9\ud55c HDFStore\ub77c\ub294 \uac00\ubcbc\uc6b4 \uc0ac\uc804 \ud074\ub798\uc2a4\ub97c \ud1b5\ud574 pandas \uac1d\uccb4\ub97c \uc800\uc7a5" ] }, { "cell_type": "code", "collapsed": false, "input": [ "store = pd.HDFStore('mydata.h5')" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "Exception", "evalue": "HDFStore requires PyTables", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mException\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstore\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mHDFStore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'mydata.h5'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/io/pytables.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, path, mode, complevel, complib, fletcher32, **kwargs)\u001b[0m\n\u001b[1;32m 343\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtables\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mImportError\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# pragma: no cover\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 345\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'HDFStore requires PyTables'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 346\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 347\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mException\u001b[0m: HDFStore requires PyTables" ] } ], "prompt_number": 137 }, { "cell_type": "raw", "metadata": {}, "source": [ "\n", "---------------------------------------------------------------------------\n", "Exception Traceback (most recent call last)\n", " in ()\n", "----> 1 store = pd.HDFStore('mydata.h5')\n", "\n", "/Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/io/pytables.pyc in __init__(self, path, mode, complevel, complib, fletcher32, **kwargs)\n", " 343 import tables as _\n", " 344 except ImportError: # pragma: no cover\n", "--> 345 raise Exception('HDFStore requires PyTables')\n", " 346 \n", " 347 self._path = path\n", "\n", "Exception: HDFStore requires PyTables" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### HDFStore\ub97c \uc0ac\uc6a9\ud558\uae30 \uc704\ud574\uc11c\ub294 PyTables \ub77c\uc774\ube0c\ub7ec\ub9ac\ub97c \uc124\uce58\ud574\uc57c \ud55c\ub2e4.\n", "\n", "- \uadf8\ub0e5 \ud14c\uc2a4\ud2b8\uae30 \ub54c\ubb38\uc5d0 pass \ud55c\ub2e4.\n", "- \uc774\ub7f0\uac8c \uc788\ub2e4\ub294 \uac83\ub9cc indexing \ud574\ub450\uace0 \ub098\uc911\uc5d0 \ud544\uc694\ud558\uba74 \ucc3e\uc544\ubcf4\uc790!" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "# \ub77c\uc774\ube0c\ub7ec\ub9ac \uc124\uce58\ud574\ubcf4\uace0 \ud14c\uc2a4\ud2b8 \ud574\ubcf4\ub77c.\n", "\n", "store = pd.HDFStore('mydata.h5')\n", "store['obj1'] = frame\n", "store['obj1_col'] = frame['a']\n", "store\n", "store[obj1']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \ub370\uc774\ud130 \ubd84\uc11d \ubb38\uc81c\n", "\n", "- \ub300\ubd80\ubd84 CPU\ubcf4\ub2e4\ub294 IO \uc131\ub2a5\uc5d0 \uc758\uc874\uc801\n", "- HDF5\ub294 \ub370\uc774\ud130\ubca0\uc774\uc2a4\uac00 \uc544\ub2c8\ub2e4. HDF5\ub294 \ud55c \ubc88\ub9cc \uae30\ub85d\ud558\uace0 \uc5ec\ub7ec \ubc88 \uc790\uc8fc \uc77d\uc5b4\uc57c \ud558\ub294 \ub370\uc774\ud130\uc5d0 \ucd5c\uc801\ud654\ub418\uc5b4 \uc788\ub2e4. \ub370\uc774\ud130\ub294 \uc544\ubb34\ub54c\ub098 \ud30c\uc77c\uc5d0 \ucd94\uac00\ud560 \uc218 \uc788\uc9c0\ub9cc \ub9cc\uc57d \uc5ec\ub7ec \uacf3\uc5d0\uc11c \ub3d9\uc2dc\uc5d0 \ud30c\uc77c\uc744 \uc4f4\ub2e4\uba74 \ud30c\uc77c\uc774 \uae68\uc9c0\ub294 \ubb38\uc81c\uac00 \ubc1c\uc0dd\ud560 \uc218 \uc788\ub2e4." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.2.2 \ub9c8\uc774\ud06c\ub85c\uc18c\ud504\ud2b8 \uc5d1\uc140 \ud30c\uc77c\uc5d0\uc11c \ub370\uc774\ud130 \uc77d\uc5b4\uc624\uae30\n", "\n", "- pandas\ub294 ExcelFile \ud074\ub798\uc2a4\ub97c \ud1b5\ud574 \ub9c8\uc774\ud06c\ub85c\uc18c\ud504\ud2b8 \uc5d1\uc140 2003 \uc774\ud6c4 \ubc84\uc804\uc758 \ub370\uc774\ud130\ub97c \uc77d\uae30 \uac00\ub2a5\n", "- \ub0b4\ubd80\uc801\uc73c\ub85c ExcelFile \ud074\ub798\uc2a4\ub294 xlrd, openpyxl \ud328\ud0a4\uc9c0 \ud65c\uc6a9. \uc0ac\uc6a9\ud558\uae30 \uc804\uc5d0 \uba3c\uc800 \uc124\uce58" ] }, { "cell_type": "code", "collapsed": false, "input": [ "xls_file = pd.ExcelFile('data.xls')" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "ImportError", "evalue": "No module named xlrd", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mxls_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mExcelFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data.xls'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/io/excel.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, path_or_buf, **kwds)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath_or_buf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mxlrd\u001b[0m \u001b[0;31m# throw an ImportError if we need to\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0mver\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxlrd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__VERSION__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\".\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mImportError\u001b[0m: No module named xlrd" ] } ], "prompt_number": 138 }, { "cell_type": "code", "collapsed": false, "input": [ "table = xls_file.parse('Sheet1')" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'xls_file' is not defined", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtable\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxls_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Sheet1'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 'xls_file' is not defined" ] } ], "prompt_number": 139 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "6.3 HTML, \uc6f9 API\uc640 \ud568\uaf10 \uc0ac\uc6a9\ud558\uae30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### [Requests: HTTP for Humans](http://docs.python-requests.org/en/latest/#)\n", "\n", "- urllib2\ubcf4\ub2e4 \ub354 \uac04\ud3b8\n", "- [similar code, without Requests.](https://gist.github.com/973705)\n", "\n", "### \ud2b8\uc704\ud130 \ubd84\uc11d \ubb38\uc81c \ubc1c\uc0dd\n", "\n", "- \ud2b8\uc704\ud130\ub294 \ucc98\uc74c\uc5d0 \uc544\ubb34\ub7f0 \uc778\uc99d\uc5c6\uc774 API\ub97c \uc81c\uacf5\ud558\ub2e4 \ub9dd \uacfc\ubd80\ud558\uac00 \ubc1c\uc0dd\ud558\uc790 OAuth \uc778\uc99d \ubc29\uc2dd\uc73c\ub85c \ubcc0\uacbd\n", "- [OAuth2 - API \uc778\uc99d\uc744 \uc704\ud55c \ub9cc\ub2a5\ub3c4\uad6c\uc0c1\uc790](http://www.slideshare.net/tebica/oauth2-api)\n", "- [Twitter API](https://dev.twitter.com/docs/auth/sign-twitter)\n", "- \uc9c0\uae08\uc740 \uc778\uc99d \ubb38\uc81c \ub54c\ubb38\uc5d0 pass \ud558\uaca0\uc74c" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import requests" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 140 }, { "cell_type": "code", "collapsed": false, "input": [ "url = 'http://search.twitter.com/search.json?q=python%20pandas'" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 141 }, { "cell_type": "code", "collapsed": false, "input": [ "resp = requests.get(url)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 142 }, { "cell_type": "code", "collapsed": false, "input": [ "resp" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 143, "text": [ "" ] } ], "prompt_number": 143 }, { "cell_type": "code", "collapsed": false, "input": [ "resp.text" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 144, "text": [ "u'{\"errors\":[{\"message\":\"The Twitter REST API v1 is no longer active. Please migrate to API v1.1. https://dev.twitter.com/docs/api/1.1/overview.\",\"code\":64}]}'" ] } ], "prompt_number": 144 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uc870\uae08\ub9cc \uc218\uace0\ub97c \ud558\uba74 \ud3c9\ubc94\ud55c \uc6f9 API\ub97c \uc704\ud55c \uace0\uc218\uc900\uc758 \uc778\ud130\ud398\uc774\uc2a4\ub97c \ub9cc\ub4e4\uc5b4\uc11c DataFrame\uc5d0 \uc800\uc7a5\ud558\uace0 \uc27d\uac8c \ubd84\uc11d \uc791\uc5c5 \uc218\ud589 \uac00\ub2a5" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "6.4 \ub370\uc774\ud130\ubca0\uc774\uc2a4\uc640 \ud568\uaed8 \uc0ac\uc6a9\ud558\uae30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- \ub300\ubd80\ubd84\uc758 \uc560\ud50c\ub9ac\ucf00\uc774\uc158\uc740 \ud14d\uc2a4\ud2b8 \ud30c\uc77c\uc5d0\uc11c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc624\uc9c0 \uc54a\uc74c\n", "- \uc65c\ub0d0\ud558\uba74 \ub300\uc6a9\ub7c9\uc758 \ub370\uc774\ud130\ub97c \uc800\uc7a5\ud558\uae30\uc5d0 \ud14d\uc2a4\ud2b8 \ud30c\uc77c\uc740 \uc0c1\ub2f9\ud788 \ube44\ud6a8\uc728\uc801\n", "- SQL \uae30\ubc18\uc758 \uad00\uacc4\ud615 \ub370\uc774\ud130 \ubca0\uc774\uc2a4\uac00 \ub9ce\uc774 \uc0ac\uc6a9\ub428. MySql \uac19\uc740\n", "- \ucd5c\uadfc \uc720\uba85\ud574\uc9c4 NoSQL\uc774\ub77c \ubd88\ub9ac\ub294 \ube44 SQL \uae30\ubc18\uc758 \ub370\uc774\ud130\ubca0\uc774\uc2a4\ub3c4 \ub9ce\uc774 \uc0ac\uc6a9\ub428\n", "- SQL vs NoSQL\uc740 \uc11c\ub85c \uac01\uac01\uc758 \uc7a5\uc810\uc744 \ud30c\uc545\ud558\uace0 \uc790\uc2e0\uc758 \uc5c5\ubb34\uc5d0 \ub9de\ub294 DB\ub97c \uc120\ud0dd\n", "- SQL\uc5d0\uc11c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc640\uc11c DataFrame\uc5d0 \uc800\uc7a5\ud558\ub294 \ubc29\ubc95\uc740 \uaf64 \uc9c1\uad00\uc801" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import sqlite3\n", "\n", "query = \"\"\"\n", "CREATE TABLE test\n", "(a VARCHAR(20), b VARCHAR(20),\n", "c REAL, d INTEGER\n", ");\"\"\"\n", "\n", "con = sqlite3.connect(':memory:')\n", "con.execute(query)\n", "con.commit()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 145 }, { "cell_type": "code", "collapsed": false, "input": [ "data = [('Atlanta', 'Georgia', 1.25, 6),\n", " ('Tallahassee', 'Florida', 2.6, 3),\n", " ('Sacramento', 'California', 1.7, 5)]\n", "\n", "stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n", "\n", "con.executemany(stmt, data)\n", "con.commit()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 146 }, { "cell_type": "markdown", "metadata": {}, "source": [ "\ub300\ubd80\ubd84\uc758 \ud30c\uc774\uc36c SQL \ub4dc\ub77c\uc774\ubc84(PyODBC, psycopg2, MySQLdb, pymssql \ub4f1)\ub294 \ud14c\uc774\ube14\uc5d0 \ub300\ud574 select \ucffc\ub9ac\ub97c \uc218\ud589\ud558\uba74 \ud29c\ud50c \ub9ac\uc2a4\ud2b8\ub97c \ubc18\ud658\ud55c\ub2e4" ] }, { "cell_type": "code", "collapsed": false, "input": [ "cursor = con.execute('select * from test')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 147 }, { "cell_type": "code", "collapsed": false, "input": [ "rows = cursor.fetchall()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 148 }, { "cell_type": "code", "collapsed": false, "input": [ "rows" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 149, "text": [ "[(u'Atlanta', u'Georgia', 1.25, 6),\n", " (u'Tallahassee', u'Florida', 2.6, 3),\n", " (u'Sacramento', u'California', 1.7, 5)]" ] } ], "prompt_number": 149 }, { "cell_type": "markdown", "metadata": {}, "source": [ "\ubc18\ud658\ub41c \ud29c\ud50c \ub9ac\uc2a4\ud2b8\ub97c DataFrame \uc0dd\uc131\uc790\uc5d0 \ubc14\ub85c \uc804\ub2ec\ud574\ub3c4 \ub418\uc9c0\ub9cc \uce7c\ub7fc\uc758 \uc774\ub984\uc744 \uc9c0\uc815\ud574\uc8fc\uba74 \ub354 \ud3b8\ud558\ub2e4. cursor\uc758 description \uc18d\uc131\uc744 \ud65c\uc6a9\ud558\uc790." ] }, { "cell_type": "code", "collapsed": false, "input": [ "cursor.description" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 150, "text": [ "(('a', None, None, None, None, None, None),\n", " ('b', None, None, None, None, None, None),\n", " ('c', None, None, None, None, None, None),\n", " ('d', None, None, None, None, None, None))" ] } ], "prompt_number": 150 }, { "cell_type": "code", "collapsed": false, "input": [ "DataFrame(rows, columns=zip(*cursor.description)[0])" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 151, "text": [ " a b c d\n", "0 Atlanta Georgia 1.25 6\n", "1 Tallahassee Florida 2.60 3\n", "2 Sacramento California 1.70 5" ] } ], "prompt_number": 151 }, { "cell_type": "markdown", "metadata": {}, "source": [ "- \ub370\uc774\ud130\ubca0\uc774\uc2a4\uc5d0 \ucffc\ub9ac\ub97c \ubcf4\ub0b4\ub824\uace0 \ub9e4\ubc88 \uc774\ub807\uac8c \ud558\ub294\uac74 \ub108\ubb34 \uadc0\ucc2e\uc74c\n", "- pandas.io.sql \ubaa8\ub4c8\uc758 read_frame \ud568\uc218\ub97c \uc774\uc6a9\ud558\uba74 \uac04\ud3b8\ud558\uac8c \ud574\uacb0\n", "- \uadf8\ub0e5 select \ucffc\ub9ac\ubb38\uacfc \ub370\uc774\ud130 \ubca0\uc774\uc2a4 \uc5f0\uacb0 \uac1d\uccb4(con)\ub9cc \ub118\uae30\uba74 \ub41c\ub2e4" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas.io.sql as sql" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 152 }, { "cell_type": "code", "collapsed": false, "input": [ "sql.read_frame('select * from test', con)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 153, "text": [ " a b c d\n", "0 Atlanta Georgia 1.25 6\n", "1 Tallahassee Florida 2.60 3\n", "2 Sacramento California 1.70 5" ] } ], "prompt_number": 153 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "6.4.1 MongoDB\uc5d0 \ub370\uc774\ud130 \uc800\uc7a5\ud558\uace0 \ubd88\ub7ec\uc624\uae30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- NoSQL \ub370\uc774\ud130\ubca0\uc774\uc2a4\ub294 \ub9e4\uc6b0 \ub2e4\uc591\ud55c \ud615\ud0dc\n", "- \ubc84\ud074\ub9acDB\ub098 \ub3c4\ucfc4\uce90\ube44\ub2db \uac19\uc740 \uac83\uc740 \uc0ac\uc804\ucc98\ub7fc \ud0a4-\uac12\uc744 \uc800\uc7a5\ud558\uae30\ub3c4 \ud558\uace0\n", "- \ub610 \ub2e4\ub978 \uac83\uc740 \uae30\ubcf8 \uc800\uc7a5\uc18c\ub294 \uc0ac\uc804 \uac19\uc740 \uac1d\uccb4\ub97c \uc0ac\uc6a9\ud558\uba70 \ubb38\uc11c \uae30\ubc18\uc73c\ub85c \ub370\uc774\ud130\ub97c \uc800\uc7a5\ud558\uae30\ub3c4 \ud55c\ub2e4.\n", "- \uc774 \ucc45\uc5d0\uc11c\ub294 [MongoDB](http://mongodb.org)\ub97c \uc608\uc81c\ub85c \uc120\ud0dd\n", "- MongoDB \uc11c\ubc84\ub97c \ub85c\uceec\uc5d0 \uc124\uce58\ud558\uace0 \uacf5\uc2dd \ub4dc\ub77c\uc774\ubc84\uc778 pymongo\ub97c \uc0ac\uc6a9\ud574\uc11c \uae30\ubcf8 \ud3ec\ud2b8\ub85c \ubc88\ud638\ub85c \uc5f0\uacb0\n", "- \ud604\uc7ac \ud544\uc790\uc758 \ucef4\ud4e8\ud130\uc5d0\ub294 \uc544\uc9c1 \uc124\uce58\ud558\uc9c0 \uc54a\uc74c. \uc774\ub7f0 \ud615\uc2dd\uc73c\ub85c \ud55c\ub2e4\ub294 \ub290\ub08c\ub9cc \uac00\uc9c0\uc790" ] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }