{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 6. \ub370\uc774\ud130 \ub85c\ub529, \uc800\uc7a5, \ud30c\uc77c \ud615\uc2dd\n", "\n", "### \uc785\u2219\ucd9c\ub825 \ubc29\ubc95\n", "\n", "- \ud14d\uc2a4\ud2b8 \ud30c\uc77c \uc774\uc6a9\ud558\ub294 \ubc29\ubc95\n", "- \ub370\uc774\ud130\ubca0\uc774\uc2a4 \uc774\uc6a9\ud558\ub294 \ubc29\ubc95\n", "- \uc6f9 API \uc774\uc6a9\ud574\uc11c \ub124\ud2b8\uc6cc\ud06c\ub97c \ud1b5\ud574 \ubd88\ub7ec\uc624\ub294 \ubc29\ubc95" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "6.1 \ud14d\uc2a4\ud2b8 \ud30c\uc77c \uc774\uc6a9\ud558\ub294 \ubc29\ubc95" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \ud30c\uc774\uc36c \uc88b\uc740 \uc774\uc720\n", "\n", "- \ub2e8\uc21c\ud55c \ubb38\ubc95\n", "- \uc9c1\uad00\uc801\uc778 \uc790\ub8cc \uad6c\uc870\n", "- \ud29c\ud50c\uc5d0 \ub370\uc774\ud130\ub97c \uc800\uc7a5\ud558\uace0 \uc77d\uc5b4\ub0b4\ub294 \ud3b8\ub9ac\ud55c \uae30\ub2a5\n", "\n", "#### pandas \ud30c\uc77c \ud30c\uc2f1 \ud568\uc218\n", "\n", "\ud568\uc218 | \uc124\uba85\n", "--- | ---\n", "read_csv | \ud30c\uc77c, URL \ub610\ub294 \ud30c\uc77c\uacfc \uc720\uc0ac\ud55c \uac1d\uccb4\ub85c\ubd80\ud130 \uad6c\ubd84\ub41c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc628\ub2e4. \ub370\uc774\ud130 \uad6c\ubd84\uc790\ub294 \uc27c\ud45c(,)\ub97c \uae30\ubcf8\uc73c\ub85c \ud55c\ub2e4.\n", "read_table | \ud30c\uc77c, URL \ub610\ub294 \ud30c\uc77c\uacfc \uc720\uc0ac\ud55c \uac1d\uccb4\ub85c\ubd80\ud130 \uad6c\ubd84\ub41c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc628\ub2e4. 
\ub370\uc774\ud130 \uad6c\ubd84\uc790\ub294 \ud0ed('\\t')\uc744 \uae30\ubcf8\uc73c\ub85c \ud55c\ub2e4.\n", "read_fwf | \uace0\uc815\ud3ed \uce7c\ub7fc \ud615\uc2dd\uc5d0\uc11c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc628\ub2e4(\uad6c\ubd84\uc790\uac00 \uc5c6\ub294 \ub370\uc774\ud130)\n", "read_clipboard | \ud074\ub9bd\ubcf4\ub4dc\uc5d0 \uc788\ub294 \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc624\ub294 read_table \ud568\uc218. \uc6f9\ud398\uc774\uc9c0\uc5d0\uc11c \ud45c\ub97c \uae01\uc5b4\uc62c \ub54c \uc720\uc6a9\ud558\ub2e4.\n", "\n", "#### pandas \ud30c\uc77c \ud30c\uc2f1 \ud568\uc218 \uc635\uc158\n", "\n", "- **\uc0c9\uc778**: \ubc18\ud658\ud558\ub294 DataFrame\uc5d0\uc11c \ud558\ub098 \uc774\uc0c1\uc758 \uce7c\ub7fc\uc744 \uc0c9\uc778\uc73c\ub85c \uc9c0\uc815\ud560 \uc218 \uc788\ub2e4. \ud30c\uc77c\uc774\ub098 \uc0ac\uc6a9\uc790\ub85c\ubd80\ud130 \uce7c\ub7fc\uc758 \uc774\ub984\uc744 \ubc1b\uac70\ub098 \uc544\ubb34\uac83\ub3c4 \ubc1b\uc9c0 \uc54a\uc744 \uc218 \uc788\ub2e4.\n", "- **\uc790\ub8cc\ud615 \ucd94\ub860\uacfc \ub370\uc774\ud130 \ubcc0\ud658**: \uc0ac\uc6a9\uc790 \uc815\uc758 \uac12 \ubcc0\ud658\uacfc \ube44\uc5b4\uc788\ub294 \uac12\uc744 \uc704\ud55c \uc0ac\uc6a9\uc790 \ub9ac\uc2a4\ud2b8\ub97c \ud3ec\ud568\ud55c\ub2e4.\n", "- **\ub0a0\uc9dc \ubd84\uc11d**: \uc5ec\ub7ec \uce7c\ub7fc\uc5d0 \uac78\uccd0 \uc788\ub294 \ub0a0\uc9dc\uc640 \uc2dc\uac04 \uc815\ubcf4\ub97c \ud558\ub098\uc758 \uce7c\ub7fc\uc5d0 \uc870\ud569\ud574\uc11c \uacb0\uacfc\uc5d0 \ubc18\uc601\ud55c\ub2e4.\n", "- **\ubc18\ubcf5**: \uc5ec\ub7ec \ud30c\uc77c\uc5d0 \uac78\uccd0 \uc788\ub294 \uc790\ub8cc\ub97c \ubc18\ubcf5\uc801\uc73c\ub85c \uc77d\uc5b4\uc62c \uc218 \uc788\ub2e4.\n", "- **\uc815\uc81c\ub418\uc9c0 \uc54a\ub294 \ub370\uc774\ud130 \ucc98\ub9ac**: \ub85c\uc6b0\ub098 \uaf2c\ub9ac\ub9d0, \uc8fc\uc11d \uac74\ub108\ub6f0\uae30 \ub610\ub294 \ucc9c \ub2e8\uc704\ub9c8\ub2e4 \uc27c\ud45c\ub85c \uad6c\ubd84\ub41c \uc22b\uc790 \uac19\uc740 \uc0ac\uc18c\ud55c \uc77c\uc744 
\ucc98\ub9ac\ud574\uc900\ub2e4.\n", "\n", "#### \uc790\ub8cc\ud615 \ucd94\ub860\uc740 \ub9e4\uc6b0 \uc911\uc694\n", "\n", "- \uc5b4\ub5a4 \uce7c\ub7fc\uc774 \uc22b\uc790\uc778\uc9c0 \ubd88\ub9ac\uc5b8\uc778\uc9c0 \uc9c0\uc815\ud574\uc904 \ud544\uc694\uac00 \uc5c6\ub2e4" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pandas import DataFrame, Series\n", "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 179 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/ex1.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "a,b,c,d,message\r\n", "1,2,3,4,hello\r\n", "5,6,7,8,world\r\n", "9,10,11,12,foo" ] } ], "prompt_number": 180 }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('ch06/ex1.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 181 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex1.csv', header=None)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
0 a b c d message
1 1 2 3 4 hello
2 5 6 7 8 world
3 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 182, "text": [ " 0 1 2 3 4\n", "0 a b c d message\n", "1 1 2 3 4 hello\n", "2 5 6 7 8 world\n", "3 9 10 11 12 foo" ] } ], "prompt_number": 182 }, { "cell_type": "code", "collapsed": false, "input": [ "# \uc6d0\ub798 \uc788\ub358 Column\uba85 \ubb34\uc2dc\ud558\uace0 \ub0b4\uac00 \uc6d0\ud558\ub294 Column\uba85 \uc124\uc815\n", "pd.read_csv('ch06/ex1.csv', names=[5,6,7,8,9])" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
56789
0 a b c d message
1 1 2 3 4 hello
2 5 6 7 8 world
3 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 183, "text": [ " 5 6 7 8 9\n", "0 a b c d message\n", "1 1 2 3 4 hello\n", "2 5 6 7 8 world\n", "3 9 10 11 12 foo" ] } ], "prompt_number": 183 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex1.csv', names=['a1', 'b1', 'c1', 'd1', 'message1'])" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
a1b1c1d1message1
0 a b c d message
1 1 2 3 4 hello
2 5 6 7 8 world
3 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 184, "text": [ " a1 b1 c1 d1 message1\n", "0 a b c d message\n", "1 1 2 3 4 hello\n", "2 5 6 7 8 world\n", "3 9 10 11 12 foo" ] } ], "prompt_number": 184 }, { "cell_type": "code", "collapsed": false, "input": [ "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 185, "text": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] } ], "prompt_number": 185 }, { "cell_type": "code", "collapsed": false, "input": [ "# csv\ub294 DataFrame\uc73c\ub85c \uc77d\uc5b4\uc628\ub2e4.\n", "type(df)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 186, "text": [ "pandas.core.frame.DataFrame" ] } ], "prompt_number": 186 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_table('ch06/ex1.csv', sep=',')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 187, "text": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] } ], "prompt_number": 187 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_table('ch06/ex1.csv', sep=',', header=None)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
0 a b c d message
1 1 2 3 4 hello
2 5 6 7 8 world
3 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 188, "text": [ " 0 1 2 3 4\n", "0 a b c d message\n", "1 1 2 3 4 hello\n", "2 5 6 7 8 world\n", "3 9 10 11 12 foo" ] } ], "prompt_number": 188 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/ex2.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1,2,3,4,hello\r\n", "5,6,7,8,world\r\n", "9,10,11,12,foo" ] } ], "prompt_number": 189 }, { "cell_type": "code", "collapsed": false, "input": [ "# header \uc790\ub3d9 \uc0dd\uc131\n", "pd.read_csv('ch06/ex2.csv', header=None)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 190, "text": [ " 0 1 2 3 4\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] } ], "prompt_number": 190 }, { "cell_type": "code", "collapsed": false, "input": [ "# header \uc635\uc158\uc774 \uc5c6\uc744\uc2dc header\ub97c \uccab\ubc88\uc9f8 \uc904\ub85c \uc774\uc6a9\n", "pd.read_csv('ch06/ex2.csv')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1234hello
0 5 6 7 8 world
1 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 191, "text": [ " 1 2 3 4 hello\n", "0 5 6 7 8 world\n", "1 9 10 11 12 foo" ] } ], "prompt_number": 191 }, { "cell_type": "code", "collapsed": false, "input": [ "# Column\uba85 \ucd94\uac00\n", "pd.read_csv('ch06/ex2.csv', names=['a', 'b', 'c', 'message'])" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcmessage
1 2 3 4 hello
5 6 7 8 world
9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 192, "text": [ " a b c message\n", "1 2 3 4 hello\n", "5 6 7 8 world\n", "9 10 11 12 foo" ] } ], "prompt_number": 192 }, { "cell_type": "code", "collapsed": false, "input": [ "names = ['a', 'b', 'c', 'd', 'message']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 193 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex2.csv', names=names)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 194, "text": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] } ], "prompt_number": 194 }, { "cell_type": "code", "collapsed": false, "input": [ "# message -> index\n", "pd.read_csv('ch06/ex2.csv', names=names, index_col='message')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 195, "text": [ " a b c d\n", "message \n", "hello 1 2 3 4\n", "world 5 6 7 8\n", "foo 9 10 11 12" ] } ], "prompt_number": 195 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex2.csv', names=names, index_col='a')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bcdmessage
a
1 2 3 4 hello
5 6 7 8 world
9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 196, "text": [ " b c d message\n", "a \n", "1 2 3 4 hello\n", "5 6 7 8 world\n", "9 10 11 12 foo" ] } ], "prompt_number": 196 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/csv_mindex.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "key1,key2,value1,value2\r\n", "one,a,1,2\r\n", "one,b,3,4\r\n", "one,c,5,6\r\n", "one,d,7,8\r\n", "two,a,9,10\r\n", "two,b,11,12\r\n", "two,c,13,14\r\n", "two,d,15,16\r\n" ] } ], "prompt_number": 197 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uacc4\uce35\uc801 \uc0c9\uc778\uc744 \uc9c0\uc815\ud558\uace0 \uc2f6\ub2e4\uba74 \uce7c\ub7fc \ubc88\ud638\ub098 \uc774\ub984\uc758 \ub9ac\uc2a4\ud2b8\ub97c \ub118\uae34\ub2e4\n", "\n", "- 2\ubc88\uc9f8 \uacf5\ubd80\ud558\uba74\uc11c \uc815\ub9ac\ud558\ub2c8 \uacc4\uce35\uc801 \uc0c9\uc778\uc744 \uc5b4\ub5bb\uac8c \uc0ac\uc6a9\ud558\ub294\uc9c0 \uc870\uae08\uc740 \uc774\ud574\uac00 \uac04\ub2e4." ] }, { "cell_type": "code", "collapsed": false, "input": [ "parsed = pd.read_csv('ch06/csv_mindex.csv', index_col=['key1', 'key2'])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 198 }, { "cell_type": "code", "collapsed": false, "input": [ "parsed" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
value1value2
key1key2
onea 1 2
b 3 4
c 5 6
d 7 8
twoa 9 10
b 11 12
c 13 14
d 15 16
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 199, "text": [ " value1 value2\n", "key1 key2 \n", "one a 1 2\n", " b 3 4\n", " c 5 6\n", " d 7 8\n", "two a 9 10\n", " b 11 12\n", " c 13 14\n", " d 15 16" ] } ], "prompt_number": 199 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uace0\uc815\ub41c \uad6c\ubd84\uc790\uac00 \uc5c6\ub2e4\uba74 read_table\uc758 \uad6c\ubd84\uc790\ub85c \uc815\uaddc\ud45c\ud604\uc2dd\uc744 \uc0ac\uc6a9\ud558\uba74 \ub41c\ub2e4.\n", "\n", "- [\ud30c\uc774\uc36c \u2013 \uc815\uaddc\uc2dd\ud45c\ud604\uc2dd(Regular Expression) \ubaa8\ub4c8](http://devanix.tistory.com/296)\n", "- [\ubc88\uc5ed \ud30c\uc774\uc36c \uc815\uaddc\ud45c\ud604\uc2dd](http://codeflow.co.kr/question/1061/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EC%A0%95%EA%B7%9C-%ED%91%9C%ED%98%84%EC%8B%9D/)\n", "- [tutorial point](http://www.tutorialspoint.com/python/python_reg_expressions.htm)\n", "- [\ud30c\uc774\uc36c - \uc815\uaddc\ud45c\ud604\uc2dd \ubaa8\ub4c8](http://devanix.tistory.com/296)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "list(open('ch06/ex3.txt'))" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 200, "text": [ "[' A B C\\n',\n", " 'aaa -0.264438 -1.026059 -0.619500\\n',\n", " 'bbb 0.927272 0.302904 -0.032399\\n',\n", " 'ccc -0.264273 -0.386314 -0.217601\\n',\n", " 'ddd -0.871858 -0.348382 1.100491\\n']" ] } ], "prompt_number": 200 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uc9c1\uc811 \ud30c\uc77c\uc744 \uace0\uccd0\ub3c4 \ub418\uc9c0\ub9cc \uc774 \ud30c\uc77c\uc740 \uc5ec\ub7ec \uac1c\uc758 \uacf5\ubc31\ubb38\uc790\ub85c \ud544\ub4dc\uac00 \uad6c\ubd84\ub418\uc5b4 \uc788\uc73c\ubbc0\ub85c \uc774\ub97c \ud45c\ud604\ud560 \uc218 \uc788\ub294 \uc815\uaddc\ud45c\ud604\uc2dd \\s+\ub97c \uc0ac\uc6a9\ud574\uc11c \ucc98\ub9ac" ] }, { "cell_type": "code", "collapsed": false, "input": [ "result = pd.read_table('ch06/ex3.txt', sep='\\s+')" ], 
"language": "python", "metadata": {}, "outputs": [], "prompt_number": 201 }, { "cell_type": "code", "collapsed": false, "input": [ "result" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
aaa-0.264438-1.026059-0.619500
bbb 0.927272 0.302904-0.032399
ccc-0.264273-0.386314-0.217601
ddd-0.871858-0.348382 1.100491
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 202, "text": [ " A B C\n", "aaa -0.264438 -1.026059 -0.619500\n", "bbb 0.927272 0.302904 -0.032399\n", "ccc -0.264273 -0.386314 -0.217601\n", "ddd -0.871858 -0.348382 1.100491" ] } ], "prompt_number": 202 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uc774 \uacbd\uc6b0, \uccab\ubc88\uc9f8 \ub85c\uc6b0\ub294 \ub2e4\ub978 \ub85c\uc6b0\ubcf4\ub2e4 \uce7c\ub7fc\uc774 \ud558\ub098 \uc801\uae30 \ub54c\ubb38\uc5d0 read_table\uc740 \uccab \ubc88\uc9f8 \uce7c\ub7fc\uc774 DataFrame\uc758 \uc0c9\uc778\uc774 \ub418\uc5b4\uc57c \ud55c\ub2e4\uace0 \ucd94\ub860" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-------\n", "\n", "### read_table\uacfc read_csv\uc758 \ucc28\uc774\uc810\uc740??\n", "\n", "- read_csv: \ud30c\uc77c, URL \ub610\ub294 \ud30c\uc77c\uacfc \uc720\uc0ac\ud55c \uac1d\uccb4\ub85c\ubd80\ud130 \uad6c\ubd84\ub41c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc628\ub2e4. \ub370\uc774\ud130 \uad6c\ubd84\uc790\ub294 \uc27c\ud45c(,)\ub97c \uae30\ubcf8\uc73c\ub85c \ud55c\ub2e4.\n", "- read_table: \ud30c\uc77c, URL \ub610\ub294 \ud30c\uc77c\uacfc \uc720\uc0ac\ud55c \uac1d\uccb4\ub85c\ubd80\ud130 \uad6c\ubd84\ub41c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc628\ub2e4. \ub370\uc774\ud130 \uad6c\ubd84\uc790\ub294 \ud0ed('\\t')\ub97c \uae30\ubcf8\uc73c\ub85c \ud55c\ub2e4.\n", "\n", "#### \uadf8\ub7ec\ub2c8 \ub458 \ub2e4 \uc0ac\uc6a9\uc744 \ud574\ub3c4 \ub418\uc9c0\ub9cc \uc660\ub9cc\ud558\uba74 read_csv \uac19\uc740 \uacbd\uc6b0\ub294 csv \ud30c\uc77c\ub9cc \uc0ac\uc6a9\uc744 \ud558\uace0 \ub098\uba38\uc9c0 \ud2b9\ubcc4\ud55c \uacbd\uc6b0\ub97c read_table\ub85c \ud65c\uc6a9\n", "\n", "--------" ] }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex3.txt', delimiter='\\s+')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
aaa-0.264438-1.026059-0.619500
bbb 0.927272 0.302904-0.032399
ccc-0.264273-0.386314-0.217601
ddd-0.871858-0.348382 1.100491
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 203, "text": [ " A B C\n", "aaa -0.264438 -1.026059 -0.619500\n", "bbb 0.927272 0.302904 -0.032399\n", "ccc -0.264273 -0.386314 -0.217601\n", "ddd -0.871858 -0.348382 1.100491" ] } ], "prompt_number": 203 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### [IO Tools(Text, CSV, HDF5, \u22ef) example](http://pandas.pydata.org/pandas-docs/stable/io.html)\n", "\n", "- \ud30c\uc11c \ud568\uc218\ub294 \ud30c\uc77c \ud615\uc2dd\uc5d0\uc11c \ubc1c\uc0dd\ud560 \uc218 \uc788\ub294 \ub9e4\uc6b0 \ub2e4\uc591\ud55c \uc608\uc678\ub97c \uc798 \ucc98\ub9ac\ud560 \uc218 \uc788\ub3c4\ub85d \ub9ce\uc740 \ucd94\uac00 \uc778\uc790\ub97c \uac00\uc9c0\uace0 \uc788\ub2e4.\n", "- skiprows\ub97c \uc774\uc6a9\ud574\uc11c \uccab\ubc88\uc9f8, \uc138\ubc88\uc9f8, \ub124\ubc88\uc9f8 \ub85c\uc6b0\ub97c \uac74\ub108\ub6f8 \uc218 \uc788\uc74c" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Read CSV(comma-separated) file into DataFrame\n", "pd.read_csv?" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 204 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/ex4.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "# hey!\r\n", "a,b,c,d,message\r\n", "# just wanted to make things more difficult for you\r\n", "# who reads CSV files with computers, anyway?\r\n", "1,2,3,4,hello\r\n", "5,6,7,8,world\r\n", "9,10,11,12,foo" ] } ], "prompt_number": 205 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex4.csv', skiprows=[0, 2, 3])" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 206, "text": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] } ], "prompt_number": 206 }, { "cell_type": "markdown", "metadata": {}, "source": [ "- **\ub204\ub77d\ub41c \uac12\uc744 \uc798 \ucc98\ub9ac\ud558\ub294 \uc77c**\uc740 \ud30c\uc77c\uc744 \uc77d\ub294 \uacfc\uc815\uc5d0\uc11c **\uc790\uc8fc \ubc1c\uc0dd**\ud558\ub294 \uc77c\uc774\uace0 **\uc911\uc694\ud55c \ubb38\uc81c**\n", "- \ub204\ub77d\ub41c \uac12\uc740 \ud45c\uae30\ud558\uc9c0 \uc54a\uac70\ub098(\ube44\uc5b4\uc788\ub294 \ubb38\uc790\uc5f4) \uad6c\ubd84\ud558\uae30 \uc26c\uc6b4 \ud2b9\uc218\ud55c \ubb38\uc790\ub85c \ud45c\uae30\n", "- NA, -1, #IND, NULL\ucc98\ub7fc \ube44\uc5b4\uc788\ub294 \uac12\uc73c\ub85c \uc778\uc2dd" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/ex5.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "something,a,b,c,d,message\r\n", "one,1,2,3,4,NA\r\n", "two,5,6,,8,world\r\n", "three,9,10,11,12,foo" ] } ], "prompt_number": 207 }, { "cell_type": "code", "collapsed": false, "input": [ "result = pd.read_csv('ch06/ex5.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 208 }, { "cell_type": "code", "collapsed": false, "input": [ "result" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0 one 1 2 3 4 NaN
1 two 5 6NaN 8 world
2 three 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 209, "text": [ " something a b c d message\n", "0 one 1 2 3 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11 12 foo" ] } ], "prompt_number": 209 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.isnull(result)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0 False False False False False True
1 False False False True False False
2 False False False False False False
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 210, "text": [ " something a b c d message\n", "0 False False False False False True\n", "1 False False False True False False\n", "2 False False False False False False" ] } ], "prompt_number": 210 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### na_values \uc635\uc158\uc740 \ub9ac\uc2a4\ud2b8\ub098 \ubb38\uc790\uc5f4 \uc9d1\ud569\uc744 \ubc1b\uc544\uc11c \ub204\ub77d\ub41c \uac12\uc744 \ucc98\ub9ac\n", "\n", "----------\n", "\n", "### Why? na_values\ub97c \uc0ac\uc6a9\ud558\uc9c0?\n", "\n", "- \ud2b9\uc815\ud55c \uac12\ub4e4\uc740 \uacc4\uc0b0\uc744 \ud558\uc9c0 \uc54a\uc73c\ub824\uace0??? \uc74c..\n", "\n", "-----------" ] }, { "cell_type": "code", "collapsed": false, "input": [ "result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 211 }, { "cell_type": "code", "collapsed": false, "input": [ "result" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0 one 1 2 3 4 NaN
1 two 5 6NaN 8 world
2 three 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 212, "text": [ " something a b c d message\n", "0 one 1 2 3 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11 12 foo" ] } ], "prompt_number": 212 }, { "cell_type": "code", "collapsed": false, "input": [ "# world\ub97c NA\uac12\uc73c\ub85c \ucc98\ub9ac\ud558\ub2c8 NaN\uc73c\ub85c \ub098\uc628\ub2e4.\n", "# \ud2b9\uc815\ud55c \uac12\uc744 NA \ucc98\ub9ac\ud560 \uc218 \uc788\uc744\uac83 \uac19\ub2e4.\n", "pd.read_csv('ch06/ex5.csv', na_values=['world'])" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0 one 1 2 3 4 NaN
1 two 5 6NaN 8 NaN
2 three 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 213, "text": [ " something a b c d message\n", "0 one 1 2 3 4 NaN\n", "1 two 5 6 NaN 8 NaN\n", "2 three 9 10 11 12 foo" ] } ], "prompt_number": 213 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uc5f4\ub9c8\ub2e4 \ub2e4\ub978 NA \ubb38\uc790\ub97c \uc0ac\uc804 \uac12\uc73c\ub85c \ub118\uaca8 \ucc98\ub9ac \uac00\ub2a5" ] }, { "cell_type": "code", "collapsed": false, "input": [ "sentinels = {'message': ['foo', 'NA'], 'something': ['two']}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 214 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex5.csv', na_values=sentinels)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0 one 1 2 3 4 NaN
1 NaN 5 6NaN 8 world
2 three 9 10 11 12 NaN
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 215, "text": [ " something a b c d message\n", "0 one 1 2 3 4 NaN\n", "1 NaN 5 6 NaN 8 world\n", "2 three 9 10 11 12 NaN" ] } ], "prompt_number": 215 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### read_csv / read_table \ud568\uc218 \uc778\uc790\n", "\n", "\uc778\uc790 | \uc124\uba85\n", "--- | ---\n", "path | \ud30c\uc77c \uc2dc\uc2a4\ud15c\uc5d0\uc11c\uc758 \uc704\uce58, URL, \ud30c\uc77c \uac1d\uccb4\ub97c \ub098\ud0c0\ub0b4\ub294 \ubb38\uc790\uc5f4\n", "sep or delimiter | \ud544\ub4dc\ub97c \uad6c\ubd84\ud558\uae30 \uc704\ud574 \uc0ac\uc6a9\ud560 \uc5f0\uc18d\ub41c \ubb38\uc790\ub098 \uc815\uaddc\ud45c\ud604\uc2dd\n", "header | \uce7c\ub7fc\uc758 \uc774\ub984\uc73c\ub85c \uc0ac\uc6a9\ud560 \ub85c\uc6b0\uc758 \ubc88\ud638, \uae30\ubcf8 \uac12\uc740 0(\uccab \ub85c\uc6b0)\uc774\uba70 \ud5e4\ub354\uac00 \uc5c6\uc73c\uba74 None\uc73c\ub85c \uc9c0\uc815\ud560 \uc218 \uc788\ub2e4.\n", "index_col | \uc0c9\uc778\uc73c\ub85c \uc0ac\uc6a9\ud560 \uce7c\ub7fc \ubc88\ud638\ub098 \uc774\ub984, \uacc4\uce35\uc801 \uc0c9\uc778\uc744 \uc9c0\uc815\ud560 \uacbd\uc6b0 \ub9ac\uc2a4\ud2b8\ub97c \ub118\uae38 \uc218 \uc788\ub2e4.\n", "names | \uceec\ub7fc \uc774\ub984\uc73c\ub85c \uc0ac\uc6a9\ud560 \ub9ac\uc2a4\ud2b8. header = None\uacfc \ud568\uaed8 \uc0ac\uc6a9\ud55c\ub2e4.\n", "skiprows | \ud30c\uc77c\uc758 \uc2dc\uc791\ubd80\ud130 \ubb34\uc2dc\ud560 \ub85c\uc6b0\uc758 \uac1c\uc218 \ub610\ub294 \ubb34\uc2dc\ud560 \ub85c\uc6b0 \ubc88\ud638\uac00 \ub2f4\uae34 \ub9ac\uc2a4\ud2b8\n", "na_values | NA \uac12\uc73c\ub85c \ucc98\ub9ac\ud560 \uac12\ub4e4\uc758 \ub098\uc5f4\n", "comment | \uc8fc\uc11d\uc73c\ub85c \ubd84\ub958\ub418\uc5b4 \ud30c\uc2f1\ud558\uc9c0 \uc54a\uc744 \ubb38\uc790 \ud639\uc740 \ubb38\uc790\uc5f4\n", "parse_dates | \ub0a0\uc9dc\ub97c datetime\uc73c\ub85c \ubcc0\ud658\ud560\uc9c0\uc758 \uc5ec\ubd80. 
\uae30\ubcf8\uac12\uc740 False\uc774\uba70, True\uc77c \uacbd\uc6b0 \ubaa8\ub4e0 \uce7c\ub7fc\uc5d0 \ub2e4 \uc801\uc6a9\ub41c\ub2e4. \ub9ac\uc2a4\ud2b8\ub97c \ub118\uae30\uba74 \ubcc0\ud658\ud560 \uce7c\ub7fc\uc744 \uc9c0\uc815\ud560 \uc218 \uc788\ub294\ub370, [1, 2, 3]\uc744 \ub118\uae30\uba74 \uac01\uac01\uc758 \uce7c\ub7fc\uc744 datetime\uc73c\ub85c \ubcc0\ud658\ud558\uace0, [[1, 3]]\uc744 \ub118\uae30\uba74 1, 3\ubc88 \uce7c\ub7fc\uc744 \uc870\ud569\ud574\uc11c \ud558\ub098\uc758 datetime\uc73c\ub85c \ubcc0\ud658\ud55c\ub2e4.\n", "keep_date_col | \uc5ec\ub7ec \uce7c\ub7fc\uc744 datetime\uc73c\ub85c \ubcc0\ud658\ud588\uc744 \uacbd\uc6b0 \uc6d0\ub798 \uce7c\ub7fc\uc744 \ub0a8\uaca8\ub458\uc9c0\uc758 \uc5ec\ubd80. \uae30\ubcf8\uac12\uc740 False\n", "converters | \ubcc0\ud658 \uc2dc \uce7c\ub7fc\uc5d0 \uc801\uc6a9\ud560 \ud568\uc218\ub97c \uc9c0\uc815\ud55c\ub2e4. \uc608\ub97c \ub4e4\uc5b4 {'foo': f}\ub294 'foo'\uce7c\ub7fc\uc5d0 f \ud568\uc218\ub97c \uc801\uc6a9\ud55c\ub2e4. \uc804\ub2ec\ud558\ub294 \uc0ac\uc804\uc758 \ud0a4 \uac12\uc740 \uce7c\ub7fc \uc774\ub984\uc774\ub098 \ubc88\ud638\uac00 \ub420 \uc218 \uc788\ub2e4.\n", "dayfirst | \ubaa8\ud638\ud55c \ub0a0\uc9dc \ud615\uc2dd\uc77c \uacbd\uc6b0 \uad6d\uc81c \ud615\uc2dd\uc73c\ub85c \uac04\uc8fc\ud55c\ub2e4(7/6/2012\ub294 2012\ub144 6\uc6d4 7\uc77c\ub85c \uac04\uc8fc\ud55c\ub2e4). \uae30\ubcf8\uac12\uc740 False\n", "date_parser | \ub0a0\uc9dc \ubcc0\ud658 \uc2dc \uc0ac\uc6a9\ud560 \ud568\uc218\n", "nrows | \ud30c\uc77c\uc758 \uccab \uc77c\ubd80\ub9cc \uc77d\uc5b4\uc62c \ub54c \ucc98\uc74c \uba87 \uc904\uc744 \uc77d\uc744 \uac83\uc778\uc9c0 \uc9c0\uc815\ud55c\ub2e4.\n", "iterator | \ud30c\uc77c\uc744 \uc870\uae08\uc529 \uc77d\uc744 \ub54c \uc0ac\uc6a9\ud558\ub3c4\ub85d TextParser \uac1d\uccb4\ub97c \ubc18\ud658\ud558\ub3c4\ub85d \ud55c\ub2e4. 
\uae30\ubcf8\uac12\uc740 False\n", "chunksize | TextParser \uac1d\uccb4\uc5d0\uc11c \uc0ac\uc6a9\ud560, \ud55c \ubc88\uc5d0 \uc77d\uc744 \ud30c\uc77c\uc758 \ud06c\uae30\n", "skip_footer | \ubb34\uc2dc\ud560 \ud30c\uc77c\uc758 \ub9c8\uc9c0\ub9c9 \uc904 \uc218\n", "verbose | \ud30c\uc2f1 \uacb0\uacfc\uc5d0 \ub300\ud55c \uc815\ubcf4\ub97c \ucd9c\ub825\ud55c\ub2e4. \uc22b\uc790\uac00 \uc544\ub2cc \uac12\ub4e4\uc774 \ub4e4\uc5b4\uc788\ub294 \uce7c\ub7fc\uc774\uba74\uc11c \ub204\ub77d\ub41c \uac12\uc774 \uc788\ub2e4\uba74 \uc904 \ubc88\ud638\ub97c \ucd9c\ub825\ud55c\ub2e4. \uae30\ubcf8\uac12\uc740 False\n", "encoding | \uc720\ub2c8\ucf54\ub4dc \uc778\ucf54\ub529 \uc885\ub958\ub97c \uc9c0\uc815\ud55c\ub2e4. UTF-8\ub85c \uc778\ucf54\ub529\ub41c \ud14d\uc2a4\ud2b8\uc77c \uacbd\uc6b0 'utf-8'\ub85c \uc9c0\uc815\ud55c\ub2e4.\n", "squeeze | \uce7c\ub7fc\uc774 \ud558\ub098\ubfd0\uc774\ub77c\uba74 Series \uac1d\uccb4\ub97c \ubc18\ud658\ud55c\ub2e4. \uae30\ubcf8\uac12\uc740 False\n", "thousands | \uc22b\uc790\ub97c \ucc9c \ub2e8\uc704\ub85c \ub04a\uc744 \ub54c \uc0ac\uc6a9\ud560 ','\ub098 '.' \uac19\uc740 \uad6c\ubd84\uc790" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# \uc774 \uba85\ub839\uc5b4\ub85c \uc5b4\ub5a4 \ud568\uc218\uc778\uc9c0, \uc5b4\ub5a4 \ud30c\ub77c\ubbf8\ud130\ub97c \ub118\uaca8\uc57c \ud558\ub294\uc9c0 \uc815\ud655\ud788 \uc54c \uc218 \uc788\ub2e4.\n", "# \uad73\uc774 \uba85\ub839\uc5b4\ub4e4\uc744 \ub530\ub77c\uce60 \ud544\uc694\ub294 \uc5c6\ub294\ub370 \uc5b4\ub5a4 \ud30c\ub77c\ubbf8\ud130\ub4e4\uc744 \ub118\uae30\ub294\uc9c0 \ud55c \ubc88 \uacf5\ubd80\ud558\ub294 \uacb8\uacb8\ud574\uc11c \uccd0\ubd24\ub2e4.\n", "pd.read_csv?"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 216 }, { "cell_type": "raw", "metadata": {}, "source": [ "Type: function\n", "String form: \n", "File: /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/io/parsers.py\n", "Definition: pd.read_csv(filepath_or_buffer, sep=',', dialect=None, compression=None, doublequote=True, escapechar=None, quotechar='\"', quoting=0, skipinitialspace=False, lineterminator=None, header='infer', index_col=None, names=None, prefix=None, skiprows=None, skipfooter=None, skip_footer=0, na_values=None, na_fvalues=None, true_values=None, false_values=None, delimiter=None, converters=None, dtype=None, usecols=None, engine='c', delim_whitespace=False, as_recarray=False, na_filter=True, compact_ints=False, use_unsigned=False, low_memory=True, buffer_lines=None, warn_bad_lines=True, error_bad_lines=True, keep_default_na=True, thousands=None, comment=None, decimal='.', parse_dates=False, keep_date_col=False, dayfirst=False, date_parser=None, memory_map=False, nrows=None, iterator=False, chunksize=None, verbose=False, encoding=None, squeeze=False, mangle_dupe_cols=True, tupleize_cols=True)\n", "Docstring:\n", "Read CSV (comma-separated) file into DataFrame\n", "\n", "Also supports optionally iterating or breaking of the file\n", "into chunks.\n", "\n", "Parameters\n", "----------\n", "filepath_or_buffer : string or file handle / StringIO. The string could be\n", " a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host\n", " is expected. For instance, a local file could be\n", " file ://localhost/path/to/table.csv\n", "sep : string, default ','\n", " Delimiter to use. If sep is None, will try to automatically determine\n", " this. Regular expressions are accepted.\n", "\n", "lineterminator : string (length 1), default None\n", " Character to break file into lines. 
Only valid with C parser\n", "quotechar : string\n", " The character to used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored.\n", "quoting : int\n", " Controls whether quotes should be recognized. Values are taken from\n", " `csv.QUOTE_*` values. Acceptable values are 0, 1, 2, and 3 for\n", " QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONE, and QUOTE_NONNUMERIC, respectively.\n", "skipinitialspace : boolean, default False\n", " Skip spaces after delimiter\n", "escapechar : string\n", "dtype : Type name or dict of column -> type\n", " Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}\n", "compression : {'gzip', 'bz2', None}, default None\n", " For on-the-fly decompression of on-disk data\n", "dialect : string or csv.Dialect instance, default None\n", " If None defaults to Excel dialect. Ignored if sep longer than 1 char\n", " See csv.Dialect documentation for more details\n", "header : int, default 0 if names parameter not specified,\n", " Row to use for the column labels of the parsed DataFrame. Specify None if\n", " there is no header row. Can be a list of integers that specify row\n", " locations for a multi-index on the columns E.g. [0,1,3]. Interveaning\n", " rows that are not specified (E.g. 2 in this example are skipped)\n", "skiprows : list-like or integer\n", " Row numbers to skip (0-indexed) or number of rows to skip (int)\n", " at the start of the file\n", "index_col : int or sequence or False, default None\n", " Column to use as the row labels of the DataFrame. If a sequence is given, a\n", " MultiIndex is used. If you have a malformed file with delimiters at the end\n", " of each line, you might consider index_col=False to force pandas to _not_\n", " use the first column as the index (row names)\n", "names : array-like\n", " List of column names to use. 
If file contains no header row, then you\n", " should explicitly pass header=None\n", "prefix : string or None (default)\n", " Prefix to add to column numbers when no header, e.g 'X' for X0, X1, ...\n", "na_values : list-like or dict, default None\n", " Additional strings to recognize as NA/NaN. If dict passed, specific\n", " per-column NA values\n", "true_values : list\n", " Values to consider as True\n", "false_values : list\n", " Values to consider as False\n", "keep_default_na : bool, default True\n", " If na_values are specified and keep_default_na is False the default NaN\n", " values are overridden, otherwise they're appended to\n", "parse_dates : boolean, list of ints or names, list of lists, or dict\n", " If True -> try parsing the index.\n", " If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column.\n", " If [[1, 3]] -> combine columns 1 and 3 and parse as a single date column.\n", " {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result 'foo'\n", "keep_date_col : boolean, default False\n", " If True and parse_dates specifies combining multiple columns then\n", " keep the original columns.\n", "date_parser : function\n", " Function to use for converting a sequence of string columns to an\n", " array of datetime instances. The default uses dateutil.parser.parser\n", " to do the conversion.\n", "dayfirst : boolean, default False\n", " DD/MM format dates, international and European format\n", "thousands : str, default None\n", " Thousands separator\n", "comment : str, default None\n", " Indicates remainder of line should not be parsed\n", " Does not support line commenting (will return empty line)\n", "decimal : str, default '.'\n", " Character to recognize as decimal point. E.g. use ',' for European data\n", "nrows : int, default None\n", " Number of rows of file to read. 
Useful for reading pieces of large files\n", "iterator : boolean, default False\n", " Return TextFileReader object\n", "chunksize : int, default None\n", " Return TextFileReader object for iteration\n", "skipfooter : int, default 0\n", " Number of line at bottom of file to skip\n", "converters : dict. optional\n", " Dict of functions for converting values in certain columns. Keys can either\n", " be integers or column labels\n", "verbose : boolean, default False\n", " Indicate number of NA values placed in non-numeric columns\n", "delimiter : string, default None\n", " Alternative argument name for sep. Regular expressions are accepted.\n", "encoding : string, default None\n", " Encoding to use for UTF when reading/writing (ex. 'utf-8')\n", "squeeze : boolean, default False\n", " If the parsed data only contains one column then return a Series\n", "na_filter: boolean, default True\n", " Detect missing value markers (empty strings and the value of na_values). In\n", " data without any NAs, passing na_filter=False can improve the performance\n", " of reading a large file\n", "usecols : array-like\n", " Return a subset of the columns.\n", " Results in much faster parsing time and lower memory usage.\n", "mangle_dupe_cols: boolean, default True\n", " Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'\n", "tupleize_cols: boolean, default False\n", " Leave a list of tuples on columns as is (default is to convert to\n", " a Multi Index on the columns)\n", "\n", "Returns\n", "-------\n", "result : DataFrame or TextParser" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.1.1 \ud14d\uc2a4\ud2b8 \ud30c\uc77c \uc870\uae08\uc529 \uc77d\uc5b4\uc624\uae30" ] }, { "cell_type": "code", "collapsed": false, "input": [ "result = pd.read_csv('ch06/ex6.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 217 }, { "cell_type": "code", "collapsed": false, "input": [ "result" ], "language": "python", "metadata": {}, 
"outputs": [ { "html": [ "
\n",
        "<class 'pandas.core.frame.DataFrame'>\n",
        "Int64Index: 10000 entries, 0 to 9999\n",
        "Data columns (total 5 columns):\n",
        "one      10000  non-null values\n",
        "two      10000  non-null values\n",
        "three    10000  non-null values\n",
        "four     10000  non-null values\n",
        "key      10000  non-null values\n",
        "dtypes: float64(4), object(1)\n",
        "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 218, "text": [ "\n", "Int64Index: 10000 entries, 0 to 9999\n", "Data columns (total 5 columns):\n", "one 10000 non-null values\n", "two 10000 non-null values\n", "three 10000 non-null values\n", "four 10000 non-null values\n", "key 10000 non-null values\n", "dtypes: float64(4), object(1)" ] } ], "prompt_number": 218 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### nrows\ub85c \ucc98\uc74c \uba87 \uc904\ub9cc \uc77d\uc5b4\ubcfc \uc218 \uc788\ub2e4.\n", "\n", "- \ub9ac\ub205\uc2a4\uc758 head \uc758 \uae30\ub2a5\uacfc \uac19\ub2e4\uace0 \uc0dd\uac01\ud558\uba74 \ub41c\ub2e4." ] }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/ex6.csv', nrows=5)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefourkey
0 0.467976-0.038649-0.295344-1.824726 L
1-0.358893 1.404453 0.704965-0.200638 B
2-0.501840 0.659254-0.421691-0.057688 G
3 0.204886 1.074134 1.388361-0.982404 R
4 0.354628-0.133116 0.283763-0.837063 Q
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 219, "text": [ " one two three four key\n", "0 0.467976 -0.038649 -0.295344 -1.824726 L\n", "1 -0.358893 1.404453 0.704965 -0.200638 B\n", "2 -0.501840 0.659254 -0.421691 -0.057688 G\n", "3 0.204886 1.074134 1.388361 -0.982404 R\n", "4 0.354628 -0.133116 0.283763 -0.837063 Q" ] } ], "prompt_number": 219 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TextParser \uac1d\uccb4\ub97c \uc774\uc6a9\ud574\uc11c chunksize\uc5d0 \ub530\ub77c \ubd84\ub9ac\ub41c \ud30c\uc77c\uc744 \uc21c\ud68c \uac00\ub2a5" ] }, { "cell_type": "code", "collapsed": false, "input": [ "chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 220 }, { "cell_type": "code", "collapsed": false, "input": [ "chunker" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 221, "text": [ "" ] } ], "prompt_number": 221 }, { "cell_type": "code", "collapsed": false, "input": [ "chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)\n", "\n", "tot = Series([])\n", "for piece in chunker:\n", " # piece['key']\uc5d0 \uc788\ub294 E, X, L \ub4f1\uc758 \uc22b\uc790\ub97c \uc13c\ub2e4. 
\uac12\uc774 \uc5c6\ub294 \uac83\ub4e4\uc740 0\uc73c\ub85c \ucc44\uc6b4\ub2e4.\n", " tot = tot.add( piece['key'].value_counts(), fill_value=0)\n", "\n", "# Key\uac00 \uc544\ub2cc \uac12\uc744(order) \uae30\uc900\uc73c\ub85c \ub0b4\ub9bc\ucc28\uc21c \uc815\ub9ac\n", "tot = tot.order(ascending=False)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 222 }, { "cell_type": "code", "collapsed": false, "input": [ "tot[:10]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 223, "text": [ "E 368\n", "X 364\n", "L 346\n", "O 343\n", "Q 340\n", "M 338\n", "J 337\n", "F 335\n", "K 334\n", "H 330\n", "dtype: float64" ] } ], "prompt_number": 223 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "6.1.2 \ub370\uc774\ud130\ub97c \ud14d\uc2a4\ud2b8 \ud615\uc2dd\uc73c\ub85c \uae30\ub85d\ud558\uae30" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data = pd.read_csv('ch06/ex5.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 224 }, { "cell_type": "code", "collapsed": false, "input": [ "data" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0 one 1 2 3 4 NaN
1 two 5 6NaN 8 world
2 three 9 10 11 12 foo
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 225, "text": [ " something a b c d message\n", "0 one 1 2 3 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11 12 foo" ] } ], "prompt_number": 225 }, { "cell_type": "code", "collapsed": false, "input": [ "data.to_csv('ch06/out.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 226 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/out.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ",something,a,b,c,d,message\r\n", "0,one,1,2,3.0,4,\r\n", "1,two,5,6,,8,world\r\n", "2,three,9,10,11.0,12,foo\r\n" ] } ], "prompt_number": 227 }, { "cell_type": "code", "collapsed": false, "input": [ "# csv\ub85c \uc9c0\uc815\ud558\ub294\ub370 output\uc740 \ud45c\uc900\uc544\uc6c3\ud48b(\ubaa8\ub2c8\ud130), separator\ub294 '|'\n", "data.to_csv(sys.stdout, sep='|')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "|something|a|b|c|d|message\n", "0|one|1|2|3.0|4|\n", "1|two|5|6||8|world\n", "2|three|9|10|11.0|12|foo\n" ] } ], "prompt_number": 228 }, { "cell_type": "code", "collapsed": false, "input": [ "# Write DataFrame to a comma-separated value (csv) file\n", "# na_rep -> Missing data representation. NA REPresentation\n", "data.to_csv?" 
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 229 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### na_rep\ub85c \ub204\ub77d\ub41c\uac12\uc744 \uc6d0\ud558\ub294 \uac12\uc73c\ub85c \ubcc0\uacbd \uac00\ub2a5" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data.to_csv(sys.stdout, na_rep='NULL')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ",something,a,b,c,d,message\n", "0,one,1,2,3.0,4,NULL\n", "1,two,5,6,NULL,8,world\n", "2,three,9,10,11.0,12,foo\n" ] } ], "prompt_number": 230 }, { "cell_type": "code", "collapsed": false, "input": [ "data.to_csv(sys.stdout, na_rep='NaN')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ",something,a,b,c,d,message\n", "0,one,1,2,3.0,4,NaN\n", "1,two,5,6,NaN,8,world\n", "2,three,9,10,11.0,12,foo\n" ] } ], "prompt_number": 231 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### row, column \uac12\uc744 \uc800\uc7a5\ud560 \uac83\uc778\uc9c0 \uc120\ud0dd \uac00\ub2a5" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data.to_csv(sys.stdout, index=False, header=False)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "one,1,2,3.0,4,\n", "two,5,6,,8,world\n", "three,9,10,11.0,12,foo\n" ] } ], "prompt_number": 232 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uceec\ub7fc\uc758 \uc77c\ubd80\ubd84\ub9cc \uae30\ub85d \uac00\ub2a5, \uc21c\uc11c\ub97c \uc9c1\uc811 \uc9c0\uc815 \uac00\ub2a5" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data.to_csv(sys.stdout, index=False, cols=['a', 'b', 'c'])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "a,b,c\n", "1,2,3.0\n", "5,6,\n", "9,10,11.0\n" ] } ], "prompt_number": 233 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 
Series\uc5d0\ub3c4 to_csv method \uc874\uc7ac" ] }, { "cell_type": "code", "collapsed": false, "input": [ "dates = pd.date_range('1/1/2000', periods=7)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 234 }, { "cell_type": "code", "collapsed": false, "input": [ "ts = Series(np.arange(7), index=dates)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 235 }, { "cell_type": "code", "collapsed": false, "input": [ "ts.to_csv('ch06/tseries.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 236 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/tseries.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "2000-01-01,0\r\n", "2000-01-02,1\r\n", "2000-01-03,2\r\n", "2000-01-04,3\r\n", "2000-01-05,4\r\n", "2000-01-06,5\r\n", "2000-01-07,6\r\n" ] } ], "prompt_number": 237 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uc57d\uac04 \ubcf5\uc7a1\ud558\uac8c \ud5e4\ub354\ub97c \uc5c6\uc560\uace0 \uccab \ubc88\uc9f8 \uce7c\ub7fc\uc744 \uc0c9\uc778\uc73c\ub85c \ud558\uba74 read_csv \uba54\uc11c\ub4dc\ub85c Series \uac1d\uccb4\ub97c \uc5bb\uc744 \uc218 \uc788\uc9c0\ub9cc from_csv \uba54\uc11c\ub4dc\uac00 \uc880 \ub354 \ud3b8\ub9ac\ud558\uace0 \uac04\ub2e8\ud558\uac8c \ubb38\uc81c \ud574\uacb0" ] }, { "cell_type": "code", "collapsed": false, "input": [ "pd.DataFrame.to_csv?" 
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 238 }, { "cell_type": "code", "collapsed": false, "input": [ "Series.from_csv('ch06/tseries.csv', parse_dates=True)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 239, "text": [ "2000-01-01 0\n", "2000-01-02 1\n", "2000-01-03 2\n", "2000-01-04 3\n", "2000-01-05 4\n", "2000-01-06 5\n", "2000-01-07 6\n", "dtype: int64" ] } ], "prompt_number": 239 }, { "cell_type": "code", "collapsed": false, "input": [ "type( Series.from_csv('ch06/tseries.csv', parse_dates=True) )" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 240, "text": [ "pandas.core.series.Series" ] } ], "prompt_number": 240 }, { "cell_type": "code", "collapsed": false, "input": [ "# parse dates: boolean, default True.\n", "# Parse dates. Different default from read_table\n", "Series.from_csv?" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 241 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### read_csv\ub97c Series\ub85c \uc77d\uc744 \uc218 \uc788\ub2e4\uace0 \uc2e4\ud5d8\ud558\ub294 \uc911\uc778\ub370 \uc798 \uc548\ub418\ub124.. \n", "\n", "- DataFrame\uc73c\ub85c \uc77d\uc5b4\uc9d0" ] }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv('ch06/tseries.csv', header=None)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
0 2000-01-01 0
1 2000-01-02 1
2 2000-01-03 2
3 2000-01-04 3
4 2000-01-05 4
5 2000-01-06 5
6 2000-01-07 6
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 242, "text": [ " 0 1\n", "0 2000-01-01 0\n", "1 2000-01-02 1\n", "2 2000-01-03 2\n", "3 2000-01-04 3\n", "4 2000-01-05 4\n", "5 2000-01-06 5\n", "6 2000-01-07 6" ] } ], "prompt_number": 242 }, { "cell_type": "code", "collapsed": false, "input": [ "type(pd.read_csv('ch06/tseries.csv', header=None))" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 243, "text": [ "pandas.core.frame.DataFrame" ] } ], "prompt_number": 243 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.read_csv?" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 244 }, { "cell_type": "markdown", "metadata": {}, "source": [ "###6.1.3 \uc218\ub3d9\uc73c\ub85c \uad6c\ubd84 \ud615\uc2dd \ucc98\ub9ac\ud558\uae30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### read_table\uc5d0\uc11c \uc77d\uc744 \uc218 \uc5c6\ub294 \uc798\ubabb\ub41c \ud615\uc2dd\uc758 \uc904\uc774 \ud3ec\ud568\ub41c \ub370\uc774\ud130\uac00 \ub4dc\ubb3c\uac8c \ubc1c\uacac \ub428 -> \uc218\ub3d9 \ucc98\ub9ac" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/ex7.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\"a\",\"b\",\"c\"\r\n", "\"1\",\"2\",\"3\"\r\n", "\"1\",\"2\",\"3\",\"4\"\r\n" ] } ], "prompt_number": 245 }, { "cell_type": "code", "collapsed": false, "input": [ "import csv\n", "f = open('ch06/ex7.csv')\n", "\n", "reader = csv.reader(f)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 246 }, { "cell_type": "code", "collapsed": false, "input": [ "for line in reader:\n", " print line" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "['a', 'b', 'c']\n", "['1', '2', '3']\n", "['1', '2', '3', '4']\n" ] } ], "prompt_number": 247 }, { "cell_type": "code", "collapsed": false, "input": [ 
"lines = list(csv.reader(open('ch06/ex7.csv')))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 248 }, { "cell_type": "code", "collapsed": false, "input": [ "header, values = lines[0], lines[1:]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 249 }, { "cell_type": "code", "collapsed": false, "input": [ "header" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 250, "text": [ "['a', 'b', 'c']" ] } ], "prompt_number": 250 }, { "cell_type": "code", "collapsed": false, "input": [ "values" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 251, "text": [ "[['1', '2', '3'], ['1', '2', '3', '4']]" ] } ], "prompt_number": 251 }, { "cell_type": "code", "collapsed": false, "input": [ "# header = a,b,c\n", "# values\ub97c 1,1\uc744 \uac19\uc774 \ubb36\ub294\ub2e4. 2,2 \ubb36\uace0. 3,3 \ubb36\uace0. 4\ub294 header\uac00 a,b,c 3\uac1c \ubc16\uc5d0 \uc5c6\uae30 \ub54c\ubb38\uc5d0 \ud3ec\ud568\ub418\uc9c0 \uc54a\ub294\ub2e4.\n", "data_dict = {h: v for h, v in zip(header, zip(*values))}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 252 }, { "cell_type": "code", "collapsed": false, "input": [ "data_dict" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 253, "text": [ "{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}" ] } ], "prompt_number": 253 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### CSV \ud30c\uc77c\uc740 \ub2e4\uc591\ud55c \ud30c\uc77c \uc874\uc7ac\ud558\uae30 \ub54c\ubb38\uc5d0 \ub2e4\uc591\ud55c \uc635\uc158\ub4e4\uc740 csv.Dialect \uc0c1\uc18d\ubc1b\uc544 \ud574\uacb0\n", "\n", "- \ub2e4\uc591\ud55c \uad6c\ubd84\uc790\n", "- \ubb38\uc790\uc5f4\uc744 \ub458\ub7ec\uc2f8\ub294 \ubc29\ubc95\n", "- \uac1c\ud589\ubb38\uc790" ] }, { "cell_type": "code", "collapsed": false, "input": [ "class 
my_dialect(csv.Dialect):\n", " lineterminator = '\\n'\n", " delimiter = ';'\n", " quotechar = '\"'\n", "\n", "reader = csv.reader" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 254 }, { "cell_type": "code", "collapsed": false, "input": [ "reader = csv.reader?" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 255 }, { "cell_type": "code", "collapsed": false, "input": [ "reader = csv.reader" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "reader = csv.reader" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 256 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TypeError: \"quoting\" must be an integer\n", "\n", "- [_csv.Error: field larger than field limit (131072) \ucc38\uace0](http://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# quoting\uc774 \uaf2d integer\uc5ec\uc57c \ud55c\ub2e4\ub294 \uc624\ub958\uac00 \ubc1c\uc0dd\ud574\uc11c \uc0bd\uc9c8\ud558\ub2e4\uac00 \ub4a4\uc5d0 quoting keyword\ub97c \ubd99\uc5ec\uc90c..\n", "reader = csv.reader(f, dialect=my_dialect)" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "\"quoting\" must be an integer", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# quoting\uc774 \uaf2d integer\uc5ec\uc57c \ud55c\ub2e4\ub294 \uc624\ub958\uac00 \ubc1c\uc0dd\ud574\uc11c \uc0bd\uc9c8\ud558\ub2e4\uac00 \ub4a4\uc5d0 quoting keyword\ub97c \ubd99\uc5ec\uc90c..\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mreader\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mcsv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdialect\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmy_dialect\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mTypeError\u001b[0m: \"quoting\" must be an integer" ] } ], "prompt_number": 257 }, { "cell_type": "code", "collapsed": false, "input": [ "reader = csv.reader(f, dialect=my_dialect, quoting=csv.QUOTE_NONE)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 258 }, { "cell_type": "code", "collapsed": false, "input": [ "csv.QUOTE_NONE" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 259, "text": [ "3" ] } ], "prompt_number": 259 }, { "cell_type": "code", "collapsed": false, "input": [ "reader = csv.reader(f, delimiter='|')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 260 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### [13.1. csv \u2014 CSV File Reading and Writing](https://docs.python.org/3.1/library/csv.html#dialects-and-formatting-parameters)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# \uc5b4\ub5a4 \uc635\uc158\ub4e4 \uc788\ub294\uc9c0 \ubcf4\ub824\uace0 \ud588\ub354\ub2c8 \uc548 \ubcf4\uc5ec\uc8fc\ub124...\n", "csv.reader??" 
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 261 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### CSV Note\n", "\n", "- \uc880 \ub354 \ubcf5\uc7a1\ud558\uac70\ub098 \uad6c\ubd84\uc790\uac00 \ud55c \uae00\uc790\ub97c \ucd08\uacfc\ud558\ub294 \uace0\uc815 \uae38\uc774\ub97c \uac00\uc9c4\ub2e4\uba74 csv \ubaa8\ub4c8\uc744 \uc0ac\uc6a9\ud560 \uc218 \uc5c6\ub2e4.\n", "- \uc774\ub7f0 \uacbd\uc6b0\uc5d0\ub294 \uc904\uc744 \ub098\ub204\uace0 \ubb38\uc790\uc5f4\uc758 split \uba54\uc11c\ub4dc\ub098 \uc815\uaddc\ud45c\ud604\uc2dd \uba54\uc11c\ub4dc\uc778 re.split \ub4f1\uc744 \uc774\uc6a9\ud574\uc11c \uac00\uacf5\ud558\ub294 \uc791\uc5c5\uc744 \ud574\uc57c \ud55c\ub2e4." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### CSV \ud30c\uc77c \uae30\ub85d" ] }, { "cell_type": "code", "collapsed": false, "input": [ "with open('ch06/mydata.csv', 'w') as f:\n", " writer = csv.writer(f, dialect=my_dialect, quoting=csv.QUOTE_NONE)\n", " writer.writerow(('one', 'two', 'three'))\n", " writer.writerow(('1', '2', '3'))\n", " writer.writerow(('4', '5', '6'))\n", " writer.writerow(('7', '8', '9'))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 262 }, { "cell_type": "code", "collapsed": false, "input": [ "!cat ch06/mydata.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "one;two;three\r\n", "1;2;3\r\n", "4;5;6\r\n", "7;8;9\r\n" ] } ], "prompt_number": 263 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### JSON \ub370\uc774\ud130\n", "\n", "- JSON(JavaScript Object Notation)\uc740 \uc6f9\ube0c\ub77c\uc6b0\uc800\uc640 \ub2e4\ub978 \uc560\ud50c\ub9ac\ucf00\uc774\uc158\uc774 HTTP \uc694\uccad\uc73c\ub85c \ub370\uc774\ud130\ub97c \ubcf4\ub0bc \ub54c \ub110\ub9ac \uc0ac\uc6a9\ud558\ub294 \ud45c\uc900 \ud30c\uc77c \ud615\uc2dd \uc911 \ud558\ub098\ub2e4.\n", "- JSON\uc740 CSV \uac19\uc740 \ud45c \ud615\uc2dd\uc758 
\ud14d\uc2a4\ud2b8\ubcf4\ub2e4 \uc880 \ub354 \uc720\uc5f0\ud55c \ub370\uc774\ud130 \ud615\uc2dd\uc774\uba70, JSON \ub370\uc774\ud130\uc758 \uc608\ub294 \ub2e4\uc74c\uacfc \uac19\ub2e4." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# json\uc740 python\uc5d0\uc11c\ucc98\ub7fc '\uc73c\ub85c \ud558\uba74 \uc548\ub41c\ub2e4. \ud604\uc7ac \"\"\"\ub85c \uac10\uc2f8 \ubb38\uc790\uc5f4\ub85c \uc800\uc7a5\ub418\uc5b4 \uc788\uae30 \ub54c\ubb38\uc5d0 \n", "# JSON \ud45c\uc900\uc5d0\uc11c\ub294 '\ub97c string \uac12\uc73c\ub85c \uc778\uc2dd\ud558\uc9c0 \uc54a\uc544\uc11c \uc5d0\ub7ec \ubc1c\uc0dd\n", "obj = \"\"\"\n", "{\n", " 'name': 'Wes',\n", " 'places_lived': ['United States', 'Spain', 'Germany'],\n", " 'pet': null, 'siblings': [{'name': 'Scott', 'age':25, 'pet':'Zuko'},\n", " {'name': 'Katie', 'age':33, 'pet': 'Cisco'}]\n", "}\n", "\"\"\"" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 264 }, { "cell_type": "code", "collapsed": false, "input": [ "import json" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 265 }, { "cell_type": "code", "collapsed": false, "input": [ "# ValueError: Expecting property name: line 3 column 5 (char 7)\n", "result = json.loads(obj)" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "Expecting property name: line 3 column 5 (char 7)", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# ValueError: Expecting property name: line 3 column 5 (char 7)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m
\u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/__init__.pyc\u001b[0m in \u001b[0;36mloads\u001b[0;34m(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 336\u001b[0m \u001b[0mparse_int\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mparse_float\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m \u001b[0;32mand\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 337\u001b[0m parse_constant is None and object_pairs_hook is None and not kw):\n\u001b[0;32m--> 338\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_default_decoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 339\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mJSONDecoder\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/decoder.pyc\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \"\"\"\n\u001b[0;32m--> 365\u001b[0;31m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 366\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/decoder.pyc\u001b[0m in \u001b[0;36mraw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 379\u001b[0m \"\"\"\n\u001b[1;32m 380\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 381\u001b[0;31m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscan_once\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 382\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"No JSON object could be decoded\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: Expecting property name: line 3 column 5 (char 7)" ] } ], "prompt_number": 266 }, { "cell_type": "code", "collapsed": false, "input": [ "obj = \"\"\"\n", "{\n", " \"name\": \"Wes\",\n", " \"places_lived\": [\"United 
States\", \"Spain\", \"Germany\"],\n", " \"pet\": null, \"siblings\": [{\"name\": \"Scott\", \"age\":25, \"pet\":\"Zuko\"},\n", " {\"name\": \"Katie\", \"age\":33, \"pet\": \"Cisco\"}]\n", "}\n", "\"\"\"" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 267 }, { "cell_type": "code", "collapsed": false, "input": [ "obj" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 268, "text": [ "'\\n{\\n \"name\": \"Wes\",\\n \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\\n \"pet\": null, \"siblings\": [{\"name\": \"Scott\", \"age\":25, \"pet\":\"Zuko\"},\\n {\"name\": \"Katie\", \"age\":33, \"pet\": \"Cisco\"}]\\n}\\n'" ] } ], "prompt_number": 268 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### JSON\uc740 \ub110 \uac12\uc778 null\uacfc \ub2e4\ub978 \uba87 \uac00\uc9c0 \uc0ac\uc18c\ud55c \uc8fc\uc758\uc0ac\ud56d(\ub9ac\uc2a4\ud2b8\uc758 \ub9c8\uc9c0\ub9c9\uc5d0 \uc27c\ud45c\uac00 \uc788\uc73c\uba74 \uc548\ub41c\ub2e4\ub358\uac00 \ud558\ub294)\uc744 \uc81c\uc678\ud558\uba74 \ud30c\uc774\uc36c \ucf54\ub4dc\uc640 \uac70\uc758 \uc720\uc0ac\n", "\n", "- \uae30\ubcf8 \uc790\ub8cc\ud615\uc740 \uac1d\uccb4(\uc0ac\uc804), \ubc30\uc5f4(\ub9ac\uc2a4\ud2b8), \ubb38\uc790\uc5f4, \uc22b\uc790, \ubd88\ub9ac\uc5b8 \uadf8\ub9ac\uace0 \ub110\n", "- \uac1d\uccb4\uc758 \ud0a4\ub294 \ubc18\ub4dc\uc2dc \ubb38\uc790\uc5f4\n", "- JSON \uc77d\uace0 \uc4f8 \uc218 \uc788\ub294 \ub77c\uc774\ube0c\ub7ec\ub9ac\uac00 \uba87 \uac1c \uc788\uc9c0\ub9cc \ud45c\uc900 \ub77c\uc774\ube0c\ub7ec\ub9ac\uc778 json \uc0ac\uc6a9" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# obj now uses double-quoted keys and strings, so json.loads parses it without a ValueError\n", "result = json.loads(obj)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 269 }, { "cell_type": "code", "collapsed": false, "input": [ "result" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, 
"output_type": "pyout", "prompt_number": 270, "text": [ "{u'name': u'Wes',\n", " u'pet': None,\n", " u'places_lived': [u'United States', u'Spain', u'Germany'],\n", " u'siblings': [{u'age': 25, u'name': u'Scott', u'pet': u'Zuko'},\n", " {u'age': 33, u'name': u'Katie', u'pet': u'Cisco'}]}" ] } ], "prompt_number": 270 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### json.dumps\ub294 \ud30c\uc774\uc36c \uac1d\uccb4\ub97c JSON \ud615\ud0dc\ub85c \ubcc0\ud658" ] }, { "cell_type": "code", "collapsed": false, "input": [ "asjson = json.dumps(result)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 271 }, { "cell_type": "code", "collapsed": false, "input": [ "# '\uac00 \uc544\ub2c8\ub77c \"\uc778 \uac83\uc744 \ud655\uc778\ud558\uc790\n", "asjson" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 272, "text": [ "'{\"pet\": null, \"siblings\": [{\"pet\": \"Zuko\", \"age\": 25, \"name\": \"Scott\"}, {\"pet\": \"Cisco\", \"age\": 33, \"name\": \"Katie\"}], \"name\": \"Wes\", \"places_lived\": [\"United States\", \"Spain\", \"Germany\"]}'" ] } ], "prompt_number": 272 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### JSON \uac1d\uccb4\ub098 \uac1d\uccb4\uc758 \ub9ac\uc2a4\ud2b8\ub97c DataFrame\uc774\ub098 \ub2e4\ub978 \uc790\ub8cc \uad6c\uc870\ub85c \uc5b4\ub5bb\uac8c \ubcc0\ud658\ud574\uc11c \ubd84\uc11d\uc744 \ud560 \uac83\uc778\uc9c0\ub294 \ub3c5\uc790\uc758 \ubaab\n", "\n", "- JSON \uac1d\uccb4\uc758 \ub9ac\uc2a4\ud2b8\ub97c DataFrame \uc0dd\uc131\uc790\ub85c \ub118\uae30\uace0 \ub370\uc774\ud130 \ud544\ub4dc \uc120\ud0dd \uac00\ub2a5" ] }, { "cell_type": "code", "collapsed": false, "input": [ "siblings = DataFrame(result['siblings'], columns=['name', 'age'])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 273 }, { "cell_type": "code", "collapsed": false, "input": [ "siblings" ], "language": "python", "metadata": {}, "outputs": [ { 
"html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameage
0 Scott 25
1 Katie 33
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 274, "text": [ " name age\n", "0 Scott 25\n", "1 Katie 33" ] } ], "prompt_number": 274 }, { "cell_type": "code", "collapsed": false, "input": [ "# \ucc45\uc5d0 \ub098\uc640\uc788\uc9c0 \uc54a\uc740 \ub0b4\uc6a9\uc744 \ud55c \ubc88 \ub354 \ud574\ubd10\uc57c \uc27d\uac8c \uc774\ud574\uac00 \ub418\ub294\ub4ef\n", "siblings2 = DataFrame(result['siblings'], columns=['name', 'age', 'pet'])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 275 }, { "cell_type": "code", "collapsed": false, "input": [ "siblings2" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagepet
0 Scott 25 Zuko
1 Katie 33 Cisco
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 276, "text": [ " name age pet\n", "0 Scott 25 Zuko\n", "1 Katie 33 Cisco" ] } ], "prompt_number": 276 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### pandas\uc5d0\uc11c JSON\uc744 \ube60\ub974\uac8c \uc77d\uace0(from_json) \uc4f0\ub294(to_json) \ub124\uc774\ud2f0\ube0c \uad6c\ud604\uc911" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "6.1.5 XML\uacfc HTML: \uc6f9 \ub0b4\uc6a9 \uae01\uc5b4\uc624\uae30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### [lxml](http://lxml.de)\n", "\n", "- \uc544\uc8fc \ud070 \ud30c\uc77c\uc744 \ube60\ub974\uac8c \ucc98\ub9ac \uac00\ub2a5\n", "- \uc5ec\ub7ec \uc885\ub958\uc758 \uc778\ud130\ud398\uc774\uc2a4 \uc81c\uacf5\n", "- lxml.html: HTML \ucc98\ub9ac\n", "- lxml.objectify: XML \ucc98\ub9ac\n", "\n", "#### \ub300\ubd80\ubd84\uc758 \uc6f9\uc0ac\uc774\ud2b8\ub294 \ub531 \ud544\uc694\ud55c \ub0b4\uc6a9\ub9cc \ub4e4\uc5b4\uc788\ub294 JSON\uc774\ub098 XML\uc744 \ub9ce\uc774 \uc0ac\uc6a9\ud558\uc9c0 \uc54a\uace0 HTML\uc744 \uc0ac\uc6a9" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from lxml.html import parse\n", "from urllib2 import urlopen\n", "\n", "# \ub370\uc774\ud130\ub97c \uac00\uc838 \uc62c url\uc744 \ub118\uae34 \ud6c4\n", "# \ub370\uc774\ud130\ub97c \ubc1b\uc544 \uc628 \ud6c4 parse\n", "parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))\n", "\n", "doc = parsed.getroot()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 277 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### doc \uac1d\uccb4\uc5d0\ub294 \ubaa8\ub4e0 HTML \ud0dc\uadf8 \ucd94\ucd9c\n", "\n", "- \uc6b0\ub9ac\uac00 \uad00\uc2ec \uac00\uc838\uc57c \ud560 table \ud0dc\uadf8\ub3c4 \ud3ec\ud568\n", "- \uc5b4\ub5bb\uac8c \ub3d9\uc791\ud558\ub294\uc9c0 \ud655\uc778\ud558\uae30 \uc704\ud574 \uae01\uc5b4\uc628 HTML \ubb38\uc11c\uc5d0\uc11c \uc678\ubd80 \uc5f0\uacb0 URL\uc744 \ubaa8\ub450 
\ucc3e\uc544\ubcf4\uc790.\n", "- \uc678\ubd80 \uc5f0\uacb0\uc740 a \ud0dc\uadf8\ub85c \uc9c0\uc815\n", "- findall \uba54\uc11c\ub4dc\uc5d0 XPath(\ubb38\uc11c \uc9c8\uc758 \uc5b8\uc5b4)\ub97c \uc0ac\uc6a9\ud574\uc11c \ud574\ub2f9 \uc5d8\ub9ac\uba3c\ud2b8\ub97c \uac00\uc838\uc62c \uc218 \uc788\ub2e4.\n", "\n", "#### XPath tutorial site\n", "\n", "- [W3schools](http://www.w3schools.com/XPath/)\n", "- [XPath and XSLT with lxml](http://lxml.de/xpathxslt.html)\n", "- [Using Chrome Developer Tools](http://stackoverflow.com/questions/3030487/is-there-a-way-to-get-the-xpath-in-google-chrome)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "links = doc.findall('.//a')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 278 }, { "cell_type": "code", "collapsed": false, "input": [ "# \uc774 \uac1d\uccb4\ub294 HTML \uc5d8\ub9ac\uba58\ud2b8\ub97c \ud45c\ud604\ud558\ub294 \uac1d\uccb4\uc77c\ubfd0\n", "# URL\uacfc \ub9c1\ud06c \uc774\ub984\uc744 \uac00\uc838\uc624\ub824\uba74 \uac01 \uc5d8\ub9ac\uba3c\ud2b8\uc5d0 \ub300\ud574 get \uba54\uc11c\ub4dc\ub97c \ud638\ucd9c\ud558\uc5ec URL\uc744 \uc5bb\uace0\n", "# text_content \uba54\uc11c\ub4dc\ub97c \uc0ac\uc6a9\ud574\uc11c \ub9c1\ud06c \uc774\ub984\uc744 \uac00\uc838\uc640\uc57c \ud55c\ub2e4.\n", "links[15:20]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 279, "text": [ "[,\n", " ,\n", " ,\n", " ,\n", " ]" ] } ], "prompt_number": 279 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uc774 \uac1d\uccb4\ub294 HTML \uc5d8\ub9ac\uba3c\ud2b8\ub97c \ud45c\ud604\ud558\ub294 \uac1d\uccb4\uc77c \ubfd0\n", "\n", "- \uc5d8\ub9ac\uba3c\ud2b8\ub97c \ud45c\ud604\ud558\ub294 \uac1d\uccb4\ub77c\uace0 \uc0dd\uac01\ud558\uc790. 
\uc548 \uadf8\ub7ec\uba74 \uc0bd\uc9c8\ud558\uac8c \ub41c\ub2e4!\n", "- URL\uacfc \ub9c1\ud06c \uc774\ub984\uc744 \uac00\uc838\uc624\ub824\uba74 \uac01 \uc5d8\ub9ac\uba3c\ud2b8\uc5d0 \ub300\ud574 get \uba54\uc11c\ub4dc\ub97c \ud638\ucd9c\ud558\uc5ec URL\uc744 \uc5bb\uace0, text_content \uba54\uc11c\ub4dc\ub97c \uc774\uc6a9\ud574\uc11c \ub9c1\ud06c \uc774\ub984\uc744 \uac00\uc838\uc640\uc57c \ud55c\ub2e4." ] }, { "cell_type": "code", "collapsed": false, "input": [ "lnk = links[28]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 280 }, { "cell_type": "code", "collapsed": false, "input": [ "lnk" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 281, "text": [ "" ] } ], "prompt_number": 281 }, { "cell_type": "code", "collapsed": false, "input": [ "lnk.get('href')" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 282, "text": [ "'https://edit.yahoo.com/mc2.0/eval_profile?.intl=us&.lang=en-US&.done=http://finance.yahoo.com/q/op%3fs=AAPL%2bOptions&.src=quote&.intl=us&.lang=en-US'" ] } ], "prompt_number": 282 }, { "cell_type": "code", "collapsed": false, "input": [ "lnk.text_content()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 283, "text": [ "'Account Info'" ] } ], "prompt_number": 283 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### [list comprehensions in Python](http://www.pythonforbeginners.com/lists/list-comprehensions-in-python/)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "urls = [lnk.get('href') for lnk in doc.findall('.//a')]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 284 }, { "cell_type": "code", "collapsed": false, "input": [ "len(urls)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 285, "text": [ "1239" ] } ], 
"prompt_number": 285 }, { "cell_type": "code", "collapsed": false, "input": [ "urls[-3:-1]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 286, "text": [ "['http://www.capitaliq.com', 'http://www.csidata.com']" ] } ], "prompt_number": 286 }, { "cell_type": "code", "collapsed": false, "input": [ "urls[-10:]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 287, "text": [ "['/q?s=AAPL140517P00780000',\n", " '/q/op?s=AAPL&k=800.000000',\n", " '/q?s=AAPL140517P00800000',\n", " '/q/op?s=AAPL&k=805.000000',\n", " '/q?s=AAPL140517P00805000',\n", " '/q/os?s=AAPL&m=2014-05-30',\n", " 'http://help.yahoo.com/l/us/yahoo/finance/quotes/fitadelay.html',\n", " 'http://www.capitaliq.com',\n", " 'http://www.csidata.com',\n", " 'http://www.morningstar.com/']" ] } ], "prompt_number": 287 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \ucc3e\uace0\uc790 \ud558\ub294 table \uc77c\uc77c\uc774 \ud655\uc778\n", "\n", "- \uba87\uba87 \uc6f9\uc0ac\uc774\ud2b8\ub294 table\ub9c8\ub2e4 id \uc18d\uc131\uc744 \uc918\uc11c \uc27d\uac8c \ud560 \uc218 \uc788\uc9c0\ub9cc \uc5b4\ub514 \uc138\uc0c1 \uc77c\uc774 \uc27d\uac8c \ub418\ub294\uac8c \uc788\ub098? \ub178\uac00\ub2e4 \ud574\uc57c\uc9c0.." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "tables = doc.findall('.//table')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 288 }, { "cell_type": "code", "collapsed": false, "input": [ "tables" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 289, "text": [ "[,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ]" ] } ], "prompt_number": 289 }, { "cell_type": "code", "collapsed": false, "input": [ "calls = tables[9]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 290 }, { "cell_type": "code", "collapsed": false, "input": [ "calls" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 291, "text": [ "" ] } ], "prompt_number": 291 }, { "cell_type": "code", "collapsed": false, "input": [ "puts = tables[13]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 292 }, { "cell_type": "code", "collapsed": false, "input": [ "rows = calls.findall('.//tr')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 293 }, { "cell_type": "code", "collapsed": false, "input": [ "rows" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 294, "text": [ "[,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", 
" ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ]" ] } ], "prompt_number": 294 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### \uc6f9\ud398\uc774\uc9c0 \uad6c\uc870\uac00 \uc548 \ubc14\ub00c\uc5c8\ub124.\n", "\n", "- \ucc45\uc744 \uc4f4\uac8c 2012\ub144 10\uc6d4 29\uc77c\uc778\ub370 \uc544\uc9c1\uae4c\uc9c0 \uc548 \ubc14\ub00c\ub2e4\ub2c8..." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "def _unpack(row, kind='td'):\n", " elts = row.findall('.//%s' % kind)\n", " return [val.text_content() for val in elts]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 295 }, { "cell_type": "code", "collapsed": false, "input": [ "_unpack(rows[0], kind='th')" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 296, "text": [ "['Strike', 'Symbol', 'Last', 'Chg', 'Bid', 'Ask', 'Vol', 'Open Int']" ] } ], "prompt_number": 296 }, { "cell_type": "code", "collapsed": false, "input": [ "_unpack(rows[1], kind='td')" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 297, "text": [ "['330.00',\n", " 'AAPL140517C00330000',\n", " '263.00',\n", " ' 0.00',\n", " '255.70',\n", " '258.25',\n", " '6',\n", " '2']" ] } ], "prompt_number": 297 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Yahoo finance HTML structure in Chrome Developer Tools\n", "\n", "- \ucc98\uc74c \ubcf4\ub294 \uc0ac\ub78c\uc740 \uc798 \uc774\ud574\uac00 \uc548 \ub420 \uac83\uc774\ub2e4.\n", "- Yahoo \uc0ac\uc774\ud2b8\uc758 HTML \uad6c\uc870 \uba3c\uc800 \ud30c\uc545\uc744 \ud558\uace0 \uc2dc\uc791\ud558\uc790.\n", "- \uc774 \uad6c\uc870\uac00 \uc774\ud574 \uc548\ub418\uba74 2\ubc88 \ubd10\ub77c.\n", "\n", "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \ub2e8\uacc4\ub4e4\uc744 \ud1b5\ud569\ud558\uc5ec \uc6f9\uc5d0\uc11c \uae01\uc5b4\uc628 \ub370\uc774\ud130\ub97c DataFrame\uc73c\ub85c \ubcc0\ud658\n", "\n", "- \uc22b\uc790 \ub370\uc774\ud130\uc9c0\ub9cc \uc5ec\uc804\ud788 \ubb38\uc790\uc5f4 \ud615\uc2dd\uc73c\ub85c \uc800\uc7a5\ub418\uc5b4 \uc788\uc73c\ubbc0\ub85c \uc801\uc808\ud558\uac8c \ubcc0\ud658\uc744 \ud574\uc918\uc57c \ud558\ub294\ub370 \ubaa8\ub4e0 \ub370\uc774\ud130\uac00 \uc2e4\uc218\ud615\uc740 \uc544\ub2d0 \uac83\uc774\ubbc0\ub85c \uc774 \uc791\uc5c5\uc740 
\uc218\ub3d9\uc73c\ub85c \ucc98\ub9ac\n", "- \ud558\uc9c0\ub9cc \uc6b4 \uc88b\uac8c\ub3c4 pandas\uc5d0\ub294 TextParser \ud074\ub798\uc2a4\uac00 \uc788\uc5b4 \uc790\ub3d9 \ud615 \ubcc0\ud658\uc744 \uc801\uc808\ud558\uac8c \uc218\ud589\ud574\uc900\ub2e4.\n", "- TextParser \ud074\ub798\uc2a4\ub294 read_csv \ud568\uc218\uc640 \ub2e4\ub978 \ud30c\uc2f1 \ud568\uc218\uc5d0\uc11c\ub3c4 \uc0ac\uc6a9" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pandas.io.parsers import TextParser\n", "\n", "def parse_options_data(table):\n", " rows = table.findall('.//tr')\n", " # rows[0]\uc740 header\n", " header = _unpack(rows[0], kind='th')\n", " # rows[1:] \ubd80\ud130 \uc2e4\uc81c\uc801\uc778 data\n", " data = [_unpack(r) for r in rows[1:]]\n", " # TextParser\uc5d0 data\ub97c \ub118\uae30\uace0 column\uba85\uc73c\ub85c header\ub97c \uc0ac\uc6a9\n", " return TextParser(data, names=header).get_chunk()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 298 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \ub9c8\uc9c0\ub9c9\uc73c\ub85c lxml \ud14c\uc774\ube14 \uac1d\uccb4\ub97c \uc704\uc5d0\uc11c \uc791\uc131\ud55c \ud30c\uc2f1 \ud568\uc218\ub97c \uc774\uc6a9\ud574\uc11c \ucc98\ub9ac\ud558\uba74 DataFrame \uacb0\uacfc\uac12 \uc5bb\uc744 \uc218 \uc788\ub2e4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### [\uc635\uc158(\uae08\uc735) - wiki kr](http://ko.wikipedia.org/wiki/%EC%98%B5%EC%85%98_%28%EA%B8%88%EC%9C%B5%29)\n", "\n", "- \uae08\uc735 \ub370\uc774\ud130\ub97c \ubd84\uc11d\ud558\ub294 \uac83\uc774\uae30 \ub54c\ubb38\uc5d0 \uae08\uc735 \ud30c\ud2b8\uc5d0 \ub300\ud55c \ub3c4\uba54\uc778 \uc9c0\uc2dd\uc774 \uc788\uc5b4\uc57c \ud55c\ub2e4. 
\ub0b4\uac00 \ubd84\uc11d\ud558\ub824\ub294 \ub370\uc774\ud130\uac00 \uc5b4\ub5a0\ud55c \uc5ed\ud560\uc744 \ud558\ub294\uc9c0 \ubaa8\ub974\uba74 \ub9d0\uc9f1 \ud669!\n", "- \uc635\uc158(option)\uc740 \ud30c\uc0dd \uc0c1\ud488\uc758 \uc77c\uc885\uc774\uba70, \ubbf8\ub9ac \uacb0\uc815\ub41c \uae30\uac04 \uc548\uc5d0 \ud2b9\uc815 \uc0c1\ud488\uc744 \uc815\ud574\uc9c4 \uac00\uaca9\uc73c\ub85c \uc0ac\uace0 \ud314 \uc218 \uc788\ub294 \uad8c\ub9ac\ub97c \ub9d0\ud55c\ub2e4. \n", "- call option: \ud2b9\uc815 \uae08\uc735 \uc0c1\ud488\uc744 \uc815\ud574\uc9c4 \uac00\uaca9\uc5d0 \ub9e4\uc785\ud560 \uc218 \uc788\ub294 \uad8c\ub9ac\ub97c \uac00\uc9c4 \ub9e4\uc785 \uc635\uc158(call option). \uc2dc\uc7a5\uc5d0\uc11c \ub0b4\uac00 \uc5bc\ub9c8 \uc904\ud14c\ub2c8 \ud314\uc544\ub77c \ud558\ub294 \ud615\uc2dd\uc774\ub77c call \uc774\ub77c \ubd80\ub974\ub294\ub4ef\n", "- put option: \ub9e4\ub3c4\ud560 \uc218 \uc788\ub294 \uad8c\ub9ac\ub97c \uac00\uc9c4 \ub9e4\ub3c4 \uc635\uc158(put option)\uc73c\ub85c \ub098\ub25c\ub2e4. put. \ubc00\ub2e4. \uc2dc\uc7a5\uc5d0 \ub0b4\uac00 \uc5bc\ub9c8\uc5d0 \ud314\uaca0\ub2e4\uace0 \ubbfc\ub2e4. \ub77c\uace0 \uc0dd\uac01\ud558\uba74 \ud3b8\ud560\ub4ef" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# call option data\n", "call_data = parse_options_data(calls)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 299 }, { "cell_type": "code", "collapsed": false, "input": [ "# put option data\n", "put_data = parse_options_data(puts)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 300 }, { "cell_type": "code", "collapsed": false, "input": [ "call_data[:10]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StrikeSymbolLastChgBidAskVolOpen Int
0 330 AAPL140517C00330000 263.00 0.00 255.70 258.25 6 2
1 400 AAPL140517C00400000 193.00 0.00 186.40 187.95 1 9
2 410 AAPL140517C00410000 181.30 0.00 175.70 178.10 68 10
3 420 AAPL140517C00420000 170.80 0.00 165.85 168.65 124 1
4 430 AAPL140517C00430000 160.75 0.00 155.85 158.05 376 2
5 440 AAPL140517C00440000 152.87 0.00 146.35 147.90 1 1
6 445 AAPL140517C00445000 145.55 0.00 140.95 143.25 106 1
7 450 AAPL140517C00450000 137.00 5.00 136.35 137.90 1 48
8 450 AAPL7140517C00450000 137.86 2.39 135.05 138.80 1 1
9 455 AAPL140517C00455000 138.00 0.00 131.35 132.90 2 2
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 301, "text": [ " Strike Symbol Last Chg Bid Ask Vol Open Int\n", "0 330 AAPL140517C00330000 263.00 0.00 255.70 258.25 6 2\n", "1 400 AAPL140517C00400000 193.00 0.00 186.40 187.95 1 9\n", "2 410 AAPL140517C00410000 181.30 0.00 175.70 178.10 68 10\n", "3 420 AAPL140517C00420000 170.80 0.00 165.85 168.65 124 1\n", "4 430 AAPL140517C00430000 160.75 0.00 155.85 158.05 376 2\n", "5 440 AAPL140517C00440000 152.87 0.00 146.35 147.90 1 1\n", "6 445 AAPL140517C00445000 145.55 0.00 140.95 143.25 106 1\n", "7 450 AAPL140517C00450000 137.00 5.00 136.35 137.90 1 48\n", "8 450 AAPL7140517C00450000 137.86 2.39 135.05 138.80 1 1\n", "9 455 AAPL140517C00455000 138.00 0.00 131.35 132.90 2 2" ] } ], "prompt_number": 301 }, { "cell_type": "code", "collapsed": false, "input": [ "put_data[:10]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StrikeSymbolLastChgBidAskVolOpen Int
0 280 AAPL140517P00280000 0.05 0 N/A 0.04 2 6
1 290 AAPL140517P00290000 0.02 0 N/A 0.01 11 11
2 295 AAPL140517P00295000 0.01 0 N/A 0.04 3 8
3 300 AAPL140517P00300000 0.05 0 N/A 0.01 1 23
4 305 AAPL140517P00305000 0.05 0 N/A 0.01 10 20
5 310 AAPL140517P00310000 0.10 0 N/A 0.04 0 1
6 315 AAPL140517P00315000 0.12 0 N/A 0.04 0 1
7 320 AAPL140517P00320000 0.02 0 N/A 0.04 1 7
8 325 AAPL140517P00325000 0.05 0 N/A 0.01 185 342
9 330 AAPL140517P00330000 0.02 0 N/A 0.04 5 5
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 302, "text": [ " Strike Symbol Last Chg Bid Ask Vol Open Int\n", "0 280 AAPL140517P00280000 0.05 0 N/A 0.04 2 6\n", "1 290 AAPL140517P00290000 0.02 0 N/A 0.01 11 11\n", "2 295 AAPL140517P00295000 0.01 0 N/A 0.04 3 8\n", "3 300 AAPL140517P00300000 0.05 0 N/A 0.01 1 23\n", "4 305 AAPL140517P00305000 0.05 0 N/A 0.01 10 20\n", "5 310 AAPL140517P00310000 0.10 0 N/A 0.04 0 1\n", "6 315 AAPL140517P00315000 0.12 0 N/A 0.04 0 1\n", "7 320 AAPL140517P00320000 0.02 0 N/A 0.04 1 7\n", "8 325 AAPL140517P00325000 0.05 0 N/A 0.01 185 342\n", "9 330 AAPL140517P00330000 0.02 0 N/A 0.04 5 5" ] } ], "prompt_number": 302 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### lxml.objectify \uc774\uc6a9\ud574 XML \ud30c\uc2f1\ud558\uae30\n", "\n", "- [XML(eXtensible Markup Language)](http://en.wikipedia.org/wiki/Xml)\uc740 \uacc4\uce35\uc801 \uad6c\uc870\uc640 \uba54\ud0c0\ub370\uc774\ud130\ub97c \ud3ec\ud568\ud558\ub294 \uc911\ucca9\ub41c \ub370\uc774\ud130 \uad6c\uc870\ub97c \uc9c0\uc6d0\ud558\ub294 \ub610 \ub2e4\ub978 \uc720\uba85\ud55c \ub370\uc774\ud130 \ud615\uc2dd\uc774\ub2e4. 
\uc9c0\uae08 \uc774 \ucc45\ub3c4 \uc2e4\uc81c\ub85c\ub294 XML \ubb38\uc11c\ub85c \uc791\uc131\n", "- \ub274\uc695 MTA(Metropolitan Transportation Authority)\ub294 \ubc84\uc2a4\uc640 \uc804\ucca0 \uc6b4\uc601\uc5d0 \uad00\ud55c \uc5ec\ub7ec \uac00\uc9c0 [\ub370\uc774\ud130 \uacf5\uac1c](http://www.mta.info/developers/download.html)\n", "- \uc0b4\ud3b4\ubcfc \uac83\uc740 \uc5ec\ub7ec XML \ud30c\uc77c\ub85c \uc81c\uacf5\ub418\ub294 \uc2e4\uc801 \uc790\ub8cc\n", "- \uc804\ucca0\uacfc \ubc84\uc2a4 \uc6b4\uc601\uc740 \ub9e4\uc6d4 \uc544\ub798\uc640 \ube44\uc2b7\ud55c \ub0b4\uc6a9\uc758 \uac01\uac01 \ub2e4\ub978 \ud30c\uc77c(Metro-North Railroad\uc758 \uacbd\uc6b0 Performance_MNR.xml \uac19\uc740)\ub85c \uc81c\uacf5\n", "- Performance_MNR.xml \uc774 \ud30c\uc77c\uc744 \ubabb \ucc3e\uc558\ub294\ub370 \uc784\uc8fc\uc601\ub2d8\uaed8\uc11c \ucc3e\uc544\uc11c \uc62c\ub824\uc8fc\uc168\uc2b5\ub2c8\ub2e4. github ch06 \ub514\ub809\ud1a0\ub9ac\uc5d0 \uc800\uc7a5\ud574 \ub450\uc5c8\uc2b5\ub2c8\ub2e4. \uadf8\ub7f0\ub370 \uacc4\uc18d \uc624\ub958\uac00 \ub098\ub294\uad70\uc694. < PERFORMANCE > \uc774 \ud0dc\uadf8 \ub54c\ubb38\uc778\ub4ef\ud55c\ub370. \ud574\uacb0\ucc45 \uc54c\uace0 \uacc4\uc2e0\ubd84\uc740 \uc880 \uc54c\ub824\uc8fc\uc138\uc694. \uacc4\uc18d \uc624\ub958\uac00 \ubc1c\uc0dd\ud558\uc5ec \ucc45\uc5d0 \ub098\uc640\uc788\ub294 \uac83\uc73c\ub85c \ud14c\uc2a4\ud2b8 \ud558\uc600\uc2b5\ub2c8\ub2e4." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Performance_MNR.xml\uc744 \uc5b4\ub5bb\uac8c \ubc1b\ub294\uc9c0 \ubaa8\ub974\uaca0\ub2e4.\n", "\n", "- \uc18c\uc2a4\ud30c\uc77c\uc744 6\uc7a5\uc744 \ub4a4\uc838\ubd10\ub3c4 \uc5c6\uace0\n", "- \ud648\ud398\uc774\uc9c0\uc5d0\ub294 \uc544\ub9c8 XML \uad6c\uc870\uac00 \ubc14\ub010\ub4ef \uc2f6\ub2e4.\n", "- \uadf8\ub798\uc11c \ucd5c\ud6c4\uc758 \uc218\ub2e8\uc73c\ub85c \uc5bc\ub9c8 \uc548\ub418\uc11c \uadf8\ub0e5 \ub0b4\uac00 \uc77c\uc77c\uc774 \ucce4\ub2e4.\n", "- XML\uc740 \uc5c4\uaca9\ud558\uae30 \ub54c\ubb38\uc5d0 \ud558\ub098\ub77c\ub3c4 \uc624\ud0c0\uac00 \uc788\uc73c\uba74 \uc624\ub958 \ubc1c\uc0dd\ud558\ubbc0\ub85c \uc8fc\uc758!" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%%writefile ch06/Performance_MNR.xml\n", "\n", " 373889\n", " \n", " MEtro-North Railroad\n", " Escalator Availability\n", " Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. 
This is a new indicator the agency began reporting in 2009.\n", " 2011\n", " 12\n", " Service Indicators\n", " M\n", " U\n", " %\n", " 1\n", " 97.00\n", " \n", " 97.00\n", " \n", "" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Overwriting ch06/Performance_MNR.xml\n" ] } ], "prompt_number": 303 }, { "cell_type": "code", "collapsed": false, "input": [ "from lxml import objectify\n", "import urllib2\n", "\n", "path = 'Performance_MNR.xml'\n", "# online_path = 'http://www.mta.info/developers/data/lirr/lirr_gtfs.xml'\n", "\n", "# data = urllib2.urlopen(online_path).read()\n", "# f = open(path, 'w')\n", "# f.write(data)\n", "# f.close()\n", "\n", "# objectify\ub97c \uc774\uc6a9\ud574\uc11c \ud30c\uc77c \ud30c\uc2f1\n", "parsed = objectify.parse(open(path))\n", "root = parsed.getroot()" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "IOError", "evalue": "[Errno 2] No such file or directory: 'Performance_MNR.xml'", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mIOError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;31m# objectify\ub97c \uc774\uc6a9\ud574\uc11c \ud30c\uc77c \ud30c\uc2f1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0mparsed\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mobjectify\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0mroot\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mparsed\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetroot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mIOError\u001b[0m: [Errno 2] No such file or directory: 'Performance_MNR.xml'" ] } ], "prompt_number": 304 }, { "cell_type": "code", "collapsed": false, "input": [ "data = []" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',\n", " 'DESIRED_CHANGE', 'DECIMAL_PLACES']" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### root.INDICATOR\ub97c \ud1b5\ud574 \ubaa8\ub4e0 \uc5d8\ub9ac\uba3c\ud2b8\ub97c \ub044\uc9d1\uc5b4 \ub0bc \uc218 \uc788\ub2e4\n", "\n", "- \uac01\uac01\uc758 \ud56d\ubaa9\uc5d0 \ub300\ud574 \uba87\uba87 \ud0dc\uadf8\ub294 \uc81c\uc678\ud558\uace0 \ud0dc\uadf8 \uc774\ub984(YTD_ACTUAL \uac19\uc740)\uc744 \ud0a4 \uac12\uc73c\ub85c \ud558\ub294 \uc0ac\uc804\uc744 \ub9cc\ub4e4\uc5b4 \ub0c4" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# root.INDICATOR -> root\n", "\n", "for elt in root:\n", " el_data = {}\n", " for child in elt.getchildren():\n", " if child.tag in skip_fields:\n", " continue\n", " el_data[child.tag] = child.pyval\n", " data.append(el_data)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "data" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "# \uc704\uc758 \uac12\uacfc \ube44\uad50\ud558\uae30 \uc704\ud574 \ud14c\uc2a4\ud2b8 \ud574\ubcf8 \uac83\n", "for elt in root:\n", " for child in elt.getchildren():\n", " print child.tag, child.pyval" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 5\uc7a5\uc5d0\uc11c \uc0ac\uc804 \ud615\uc2dd\uc740 DataFrame\uc73c\ub85c \ubcc0\ud658\ud560 \uc218 
\uc788\ub2e4\ub294 \uac83 \ucc38\uace0" ] }, { "cell_type": "code", "collapsed": false, "input": [ "perf = DataFrame(data)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "perf" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "6.2 \uc774\uc9c4 \ub370\uc774\ud130 \ud615\uc2dd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### \ub370\uc774\ud130\ub97c \ud6a8\uc728\uc801\uc73c\ub85c \uc800\uc7a5\ud558\ub294 \uac00\uc7a5 \uc190\uc26c\uc6b4 \ubc29\ubc95\n", "\n", "- \ud30c\uc774\uc36c\uc5d0 \uae30\ubcf8\uc73c\ub85c \ub0b4\uc7a5\ub418\uc5b4 \uc788\ub294 pickle \uc9c1\ub82c\ud654\ub97c \ud1b5\ud574 \ub370\uc774\ud130\ub97c \uc774\uc9c4 \ud615\uc2dd\uc73c\ub85c \uc800\uc7a5\ud558\ub294 \uac83\n", "- \ud3b8\ub9ac\ud558\uac8c\ub3c4 pandas\uc758 \uac1d\uccb4\ub294 \ubaa8\ub450 pickle\uc744 \uc774\uc6a9\ud574\uc11c \ub370\uc774\ud130\ub97c \uc800\uc7a5\ud558\ub294 save \uba54\uc11c\ub4dc \uc788\uc74c" ] }, { "cell_type": "code", "collapsed": false, "input": [ "frame = pd.read_csv('ch06/ex1.csv')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "frame" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "frame.save('ch06/frame_pickle')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pd.load('ch06/frame_pickle')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### pickle \uc0ac\uc6a9\uc2dc \uc8fc\uc758\uc0ac\ud56d\n", "\n", "- pickle\uc740 \uc624\ub798 \ubcf4\uad00\ud560 \ud544\uc694\uac00 \uc5c6\ub294 \ub370\uc774\ud130\uc5d0\ub9cc \ucd94\ucc9c\n", "- \uc624\ub79c \uc2dc\uac04\uc774 \uc9c0\ub098\ub3c4 \uc548\uc815\uc801\uc73c\ub85c \ub370\uc774\ud130\ub97c 
\uc800\uc7a5\ud560 \uac70\ub77c\uace0 \ubcf4\uc7a5 \ubabb\ud568" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "6.2.1 HDF5 \ud615\uc2dd \uc0ac\uc6a9\ud558\uae30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- \ub514\uc2a4\ud06c\uc5d0 \uc774\uc9c4 \ud615\uc2dd\uc73c\ub85c \uc800\uc7a5\ub41c \ub300\uc6a9\ub7c9\uc758 \uacfc\ud559 \uc790\ub8cc\ub97c \ud6a8\uc728\uc801\uc73c\ub85c \uc77d\uace0 \uc4f8 \uc218 \uc788\ub294 \ub2e4\uc591\ud55c \ub3c4\uad6c \uc874\uc7ac\n", "- \uc0b0\uc5c5 \uae30\uc900\uc5d0 \ub9de\ub294 \uc778\uae30 \ub77c\uc774\ube0c\ub7ec\ub9ac\uc911 \ud558\ub098\uac00 HDF5(Hierarchical Data Format), \uacc4\uce35\uc801 \ub370\uc774\ud130 \ud615\uc2dd\n", "- \ub0b4\ubd80\uc801\uc73c\ub85c \ud30c\uc77c \uc2dc\uc2a4\ud15c \uac19\uc740 \ub178\ub4dc \uad6c\uc870\n", "- \uc5ec\ub7ec \uac1c\uc758 \ub370\uc774\ud130\uc14b\uc744 \uc800\uc7a5\ud558\uace0 \ubd80\uac00 \uc815\ubcf4 \uae30\ub85d \uac00\ub2a5\n", "- \ub2e4\uc591\ud55c \uc555\ucd95 \uae30\uc220\uc744 \uc0ac\uc6a9\ud574\uc11c on-the-fly(\uc2e4\uc2dc\uac04) \uc555\ucd95 \uc9c0\uc6d0\n", "- \ubc18\ubcf5\ub418\ub294 \ud328\ud134\uc744 \uac00\uc9c4 \ub370\uc774\ud130 \uc880 \ub354 \ud6a8\uacfc\uc801 \uc800\uc7a5\n", "- \uba54\ubaa8\ub9ac\uc5d0 \ubaa8\ub450 \uc801\uc7ac\ud560 \uc218 \uc5c6\ub294 \uc5c4\uccad\ub098\uac8c \ud070 \ub370\uc774\ud130\ub97c \uc544\uc8fc \ud070 \ubc30\uc5f4\uc5d0\uc11c \ud544\uc694\ud55c \ub9cc\ud07c\uc758 \uc791\uc740 \ubd80\ubd84\ub4e4\ub9cc \ud6a8\uacfc\uc801\uc73c\ub85c \uc77d\uace0 \uc4f8 \uc218 \uc788\ub294 \ud6cc\ub96d\ud55c \uc120\ud0dd\n", "\n", "- PyTables: HDF5\ub97c \ucd94\uc0c1\ud654\ud558\uc5ec \uc5ec\ub7ec\uac00\uc9c0 \uc720\uc5f0\ud55c \ub370\uc774\ud130 \ucee8\ud14c\uc774\ub108\uc640 \ud14c\uc774\ube14 \uc0c9\uc778, \uc9c8\uc758 \uae30\ub2a5 \uadf8\ub9ac\uace0 \uc678\ubd80 \uba54\ubaa8\ub9ac \uc5f0\uc0b0(out-of-core, external memory algorithm) \uc9c0\uc6d0\n", "- h5py: \uc9c1\uc811\uc801\uc774\uc9c0\ub9cc 
\uace0\uc218\uc900\uc758 HDF5 API\uc5d0 \ub300\ud55c \uc778\ud130\ud398\uc774\uc2a4 \uc81c\uacf5\n", "- pandas\ub294 PyTables\ub97c \uc774\uc6a9\ud55c HDFStore\ub77c\ub294 \uac00\ubcbc\uc6b4 \uc0ac\uc804 \ud074\ub798\uc2a4\ub97c \ud1b5\ud574 pandas \uac1d\uccb4\ub97c \uc800\uc7a5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### PyTables\ub97c \uc124\uce58 \uc548\ud588\uc744 \ub54c \ub098\uc624\ub294 \uc0c1\ud669\n", "\n", "- \uadf8\ub7fc \uc77c\ubc18\uc801\uc778 \uc0ac\uc6a9\uc790\ub4e4\uc740 numexpr \uc778\uac00? \uc774 \ub77c\uc774\ube0c\ub7ec\ub9ac\uac00 \ub610 \uc5c6\ub2e4\uace0 \ud55c\ub2e4. \uadf8\ub7fc \uc774 \ub77c\uc774\ube0c\ub7ec\ub9ac\ub97c \uba3c\uc800 \uc124\uce58\ud558\uace0 tables\ub97c \uc124\uce58\ud55c\ub2e4.\n", "\n", "- \uc124\uce58 \ubc29\ubc951: sudo easy_install tables(\uc624\ub958 \ud655\uc778)\n", "- \uc124\uce58 \ubc29\ubc952: sudo easy_install numexpr(\ud655\uc2e4\ud558\uc9c0 \uc54a\uc74c. \ud655\uc778 \uc694\ud568)\n", "- \uc124\uce58 \ubc29\ubc953: sudo easy_install tables" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", " ---------------------------------------------------------------------------\n", " Exception Traceback (most recent call last)\n", " in ()\n", " ----> 1 store = pd.HDFStore('ch06/mydata.h5')\n", "\n", " /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/io/pytables.pyc in __init__(self, path, mode, complevel, complib, fletcher32, **kwargs)\n", " 343 import tables as _\n", " 344 except ImportError: # pragma: no cover\n", " --> 345 raise Exception('HDFStore requires PyTables')\n", " 346 \n", " 347 self._path = path\n", "\n", " Exception: HDFStore requires PyTables" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### HDFStore\ub97c \uc0ac\uc6a9\ud558\uae30 \uc704\ud574\uc11c\ub294 PyTables \ub77c\uc774\ube0c\ub7ec\ub9ac\ub97c \uc124\uce58\ud574\uc57c \ud55c\ub2e4.\n", "\n", "- \uc774\ub7f0\uac8c \uc788\ub2e4\ub294 \uac83\uc744 
indexing \ud574\ub450\uace0 \ub098\uc911\uc5d0 \ud544\uc694\ud558\uba74 \ucc3e\uc544\ubcf4\uc790!" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# \ub77c\uc774\ube0c\ub7ec\ub9ac \uc124\uce58\ud574\ubcf4\uace0 \ud14c\uc2a4\ud2b8 \ud574\ubcf4\ub77c.\n", "\n", "store = pd.HDFStore('ch06/mydata.h5')\n", "store['obj1'] = frame\n", "store['obj1_col'] = frame['a']\n", "store\n", "store['obj1']" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \ub370\uc774\ud130 \ubd84\uc11d \ubb38\uc81c\n", "\n", "- \ub300\ubd80\ubd84 CPU\ubcf4\ub2e4\ub294 IO \uc131\ub2a5\uc5d0 \uc758\uc874\uc801\n", "- **HDF5\ub294 \ub370\uc774\ud130\ubca0\uc774\uc2a4\uac00 \uc544\ub2c8\ub2e4.** HDF5\ub294 **\ud55c \ubc88\ub9cc \uae30\ub85d**\ud558\uace0 **\uc5ec\ub7ec \ubc88 \uc790\uc8fc \uc77d\uc5b4\uc57c** \ud558\ub294 \ub370\uc774\ud130\uc5d0 \ucd5c\uc801\ud654\ub418\uc5b4 \uc788\ub2e4. \ub370\uc774\ud130\ub294 \uc544\ubb34\ub54c\ub098 \ud30c\uc77c\uc5d0 \ucd94\uac00\ud560 \uc218 \uc788\uc9c0\ub9cc \ub9cc\uc57d \uc5ec\ub7ec \uacf3\uc5d0\uc11c \ub3d9\uc2dc\uc5d0 \ud30c\uc77c\uc744 \uc4f4\ub2e4\uba74 \ud30c\uc77c\uc774 \uae68\uc9c0\ub294 \ubb38\uc81c\uac00 \ubc1c\uc0dd\ud560 \uc218 \uc788\ub2e4." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.2.2 \ub9c8\uc774\ud06c\ub85c\uc18c\ud504\ud2b8 \uc5d1\uc140 \ud30c\uc77c\uc5d0\uc11c \ub370\uc774\ud130 \uc77d\uc5b4\uc624\uae30\n", "\n", "- pandas\ub294 ExcelFile \ud074\ub798\uc2a4\ub97c \ud1b5\ud574 \ub9c8\uc774\ud06c\ub85c\uc18c\ud504\ud2b8 \uc5d1\uc140 2003 \uc774\ud6c4 \ubc84\uc804\uc758 \ub370\uc774\ud130\ub97c \uc77d\uae30 \uac00\ub2a5\n", "- \ub0b4\ubd80\uc801\uc73c\ub85c ExcelFile \ud074\ub798\uc2a4\ub294 xlrd, openpyxl \ud328\ud0a4\uc9c0 \ud65c\uc6a9. 
\uc0ac\uc6a9\ud558\uae30 \uc804\uc5d0 \uba3c\uc800 \uc124\uce58\n", "\n", "#### Excel \uc791\uc5c5\uc2dc \uc8fc\uc758\uc0ac\ud56d\n", "\n", "- **\ud604\uc5c5**\uc5d0\uc11c\ub294 Excel \uc5d0\uc11c \uc624\ub958\uac00 \ub9ce\uc774 \ubc1c\uc0dd\ud558\uae30 \ub54c\ubb38\uc5d0 **csv\ub85c \ubcc0\uacbd \ud6c4\uc5d0 \uc791\uc5c5**\ud55c\ub2e4\uace0 \ud55c\ub2e4. \uadf8\ub7ec\ub2c8 \uad73\uc774 excel \ud30c\uc77c\ub85c \ud558\uc5ec Error\ub97c \ub9cc\ub4e4\uc9c0 \ub9d0\uace0 \uc548\uc804\ud558\uac8c csv \ud30c\uc77c\ub85c \ubcc0\uacbd \ud6c4\uc5d0 \uc0ac\uc6a9\ud558\uc790.\n", "- \uc774\ub7f0\uac83\uc774 \uc788\ub2e4 \uc815\ub3c4\ub9cc \uc54c\uc544\ub193\uc790.\n", "- \uc5ec\uae30\ub294 \uadf8\ub0e5 skip \ud558\uaca0\ub2e4." ] }, { "cell_type": "code", "collapsed": false, "input": [ "xls_file = pd.ExcelFile('data.xls')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "table = xls_file.parse('Sheet1')" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "6.3 HTML, \uc6f9 API\uc640 \ud568\uaed8 \uc0ac\uc6a9\ud558\uae30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### [Requests: HTTP for Humans](http://docs.python-requests.org/en/latest/#)\n", "\n", "- urllib2\ubcf4\ub2e4 \ub354 \uac04\ud3b8\n", "- [similar code, without Requests.](https://gist.github.com/973705)\n", "\n", "### \ud2b8\uc704\ud130 \ubd84\uc11d \ubb38\uc81c \ubc1c\uc0dd\n", "\n", "- \ud2b8\uc704\ud130\ub294 \ucc98\uc74c\uc5d0 \uc544\ubb34\ub7f0 \uc778\uc99d\uc5c6\uc774 API\ub97c \uc81c\uacf5\ud558\ub2e4 \ub9dd \uacfc\ubd80\ud558\uac00 \ubc1c\uc0dd\ud558\uc790 OAuth \uc778\uc99d \ubc29\uc2dd\uc73c\ub85c \ubcc0\uacbd\n", "- [OAuth2 - API \uc778\uc99d\uc744 \uc704\ud55c \ub9cc\ub2a5\ub3c4\uad6c\uc0c1\uc790](http://www.slideshare.net/tebica/oauth2-api)\n", "- [Twitter API](https://dev.twitter.com/docs/auth/sign-twitter)\n", "- \uc9c0\uae08\uc740 \uc778\uc99d 
\ubb38\uc81c \ub54c\ubb38\uc5d0 pass \ud558\uaca0\uc74c" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import requests" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "url = 'http://search.twitter.com/search.json?q=python%20pandas'" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "resp = requests.get(url)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "resp" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 305, "text": [ "" ] } ], "prompt_number": 305 }, { "cell_type": "code", "collapsed": false, "input": [ "resp.text" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 306, "text": [ "u'{\"errors\":[{\"message\":\"The Twitter REST API v1 is no longer active. Please migrate to API v1.1. 
https://dev.twitter.com/docs/api/1.1/overview.\",\"code\":64}]}'" ] } ], "prompt_number": 306 }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### \uc870\uae08\ub9cc \uc218\uace0\ub97c \ud558\uba74 \ud3c9\ubc94\ud55c \uc6f9 API\ub97c \uc704\ud55c \uace0\uc218\uc900\uc758 \uc778\ud130\ud398\uc774\uc2a4\ub97c \ub9cc\ub4e4\uc5b4\uc11c DataFrame\uc5d0 \uc800\uc7a5\ud558\uace0 \uc27d\uac8c \ubd84\uc11d \uc791\uc5c5 \uc218\ud589 \uac00\ub2a5" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "6.4 \ub370\uc774\ud130\ubca0\uc774\uc2a4\uc640 \ud568\uaed8 \uc0ac\uc6a9\ud558\uae30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- \ub300\ubd80\ubd84\uc758 \uc560\ud50c\ub9ac\ucf00\uc774\uc158\uc740 \ud14d\uc2a4\ud2b8 \ud30c\uc77c\uc5d0\uc11c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc624\uc9c0 \uc54a\uc74c\n", "- \uc65c\ub0d0\ud558\uba74 \ub300\uc6a9\ub7c9\uc758 \ub370\uc774\ud130\ub97c \uc800\uc7a5\ud558\uae30\uc5d0 \ud14d\uc2a4\ud2b8 \ud30c\uc77c\uc740 \uc0c1\ub2f9\ud788 \ube44\ud6a8\uc728\uc801\n", "- SQL \uae30\ubc18\uc758 \uad00\uacc4\ud615 \ub370\uc774\ud130 \ubca0\uc774\uc2a4\uac00 \ub9ce\uc774 \uc0ac\uc6a9\ub428. 
MySql \uac19\uc740\n", "- \ucd5c\uadfc \uc720\uba85\ud574\uc9c4 NoSQL\uc774\ub77c \ubd88\ub9ac\ub294 \ube44 SQL \uae30\ubc18\uc758 \ub370\uc774\ud130\ubca0\uc774\uc2a4\ub3c4 \ub9ce\uc774 \uc0ac\uc6a9\ub428\n", "- SQL vs NoSQL\uc740 \uc11c\ub85c \uac01\uac01\uc758 \uc7a5\uc810\uc744 \ud30c\uc545\ud558\uace0 \uc790\uc2e0\uc758 \uc5c5\ubb34\uc5d0 \ub9de\ub294 DB\ub97c \uc120\ud0dd\n", "- SQL\uc5d0\uc11c \ub370\uc774\ud130\ub97c \uc77d\uc5b4\uc640\uc11c DataFrame\uc5d0 \uc800\uc7a5\ud558\ub294 \ubc29\ubc95\uc740 \uaf64 \uc9c1\uad00\uc801" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import sqlite3\n", "\n", "query = \"\"\"\n", "CREATE TABLE test\n", "(a VARCHAR(20), b VARCHAR(20),\n", "c REAL, d INTEGER\n", ");\"\"\"\n", "\n", "con = sqlite3.connect(':memory:')\n", "con.execute(query)\n", "con.commit()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 307 }, { "cell_type": "code", "collapsed": false, "input": [ "data = [('Atlanta', 'Georgia', 1.25, 6),\n", " ('Tallahassee', 'Florida', 2.6, 3),\n", " ('Sacramento', 'California', 1.7, 5)]\n", "\n", "stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n", "\n", "con.executemany(stmt, data)\n", "con.commit()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 308 }, { "cell_type": "markdown", "metadata": {}, "source": [ "\ub300\ubd80\ubd84\uc758 \ud30c\uc774\uc36c SQL \ub4dc\ub77c\uc774\ubc84(PyODBC, psycopg2, MySQLdb, pymssql \ub4f1)\ub294 \ud14c\uc774\ube14\uc5d0 \ub300\ud574 select \ucffc\ub9ac\ub97c \uc218\ud589\ud558\uba74 \ud29c\ud50c \ub9ac\uc2a4\ud2b8\ub97c \ubc18\ud658\ud55c\ub2e4" ] }, { "cell_type": "code", "collapsed": false, "input": [ "cursor = con.execute('select * from test')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 309 }, { "cell_type": "code", "collapsed": false, "input": [ "rows = cursor.fetchall()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 310 }, { "cell_type": "code", "collapsed": 
false, "input": [ "rows" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 311, "text": [ "[(u'Atlanta', u'Georgia', 1.25, 6),\n", " (u'Tallahassee', u'Florida', 2.6, 3),\n", " (u'Sacramento', u'California', 1.7, 5)]" ] } ], "prompt_number": 311 }, { "cell_type": "markdown", "metadata": {}, "source": [ "\ubc18\ud658\ub41c \ud29c\ud50c \ub9ac\uc2a4\ud2b8\ub97c DataFrame \uc0dd\uc131\uc790\uc5d0 \ubc14\ub85c \uc804\ub2ec\ud574\ub3c4 \ub418\uc9c0\ub9cc \uce7c\ub7fc\uc758 \uc774\ub984\uc744 \uc9c0\uc815\ud574\uc8fc\uba74 \ub354 \ud3b8\ud558\ub2e4. [cursor\uc758 description](https://code.google.com/p/pyodbc/wiki/Cursor) \uc18d\uc131\uc744 \ud65c\uc6a9\ud558\uc790.\n", "\n", "This read-only attribute is a list of 7-item tuples, each containing (name, type_code, display_size, internal_size, precision, scale, null_ok). pyodbc only provides values for name, type_code, internal_size, and null_ok. The other values are set to None.\n", "\n", "This attribute will be None for operations that do not return rows or if one of the execute methods has not been called.\n", "\n", "The type_code member is the class type used to create the Python objects when reading rows. For example, a varchar column's type will be str.\n", "\n", "#### cursor.description \uc774 \uc544\uc9c1\ub3c4 \ubb54\uc9c0 \uc798 \ubaa8\ub974\uaca0\ub2e4. \uc65c a, b, c, d\ub85c \uc815\ud574\uc9c4 \uac83\ub3c4 \uc798 \ubaa8\ub974\uaca0\uace0..\n", "\n", "- a, b, c, d\ub294 CREATE TABLE test\uc5d0\uc11c \uc9c0\uc815\ud55c \uce7c\ub7fc\uc758 \uc774\ub984\uc774\ub2e4." ] }, { "cell_type": "code", "collapsed": false, "input": [ "cursor.description?" 
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 312 }, { "cell_type": "markdown", "metadata": {}, "source": [ " Type: tuple\n", " String form: (('a', None, None, None, None, None, None), ('b', None, None, None, None, None, None), ('c', None, None, None, None, None, None), ('d', None, None, None, None, None, None))\n", " Length: 4\n", " Docstring:\n", " tuple() -> empty tuple\n", " tuple(iterable) -> tuple initialized from iterable's items\n", "\n", " If the argument is a tuple, the return value is the same object." ] }, { "cell_type": "code", "collapsed": false, "input": [ "cursor.description" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 313, "text": [ "(('a', None, None, None, None, None, None),\n", " ('b', None, None, None, None, None, None),\n", " ('c', None, None, None, None, None, None),\n", " ('d', None, None, None, None, None, None))" ] } ], "prompt_number": 313 }, { "cell_type": "code", "collapsed": false, "input": [ "# cursor.description\uc744 \uc5ec\ub7ec\uac1c \ubc1b\uc544\uc11c 0\ubc88\uc9f8 \uac12\ub4e4\uc744 zip\uc73c\ub85c \ubb36\ub294\ub2e4.\n", "zip(*cursor.description)[0]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 314, "text": [ "('a', 'b', 'c', 'd')" ] } ], "prompt_number": 314 }, { "cell_type": "code", "collapsed": false, "input": [ "DataFrame(rows, columns=zip(*cursor.description)[0])" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 315, "text": [ " a b c d\n", "0 Atlanta Georgia 1.25 6\n", "1 Tallahassee Florida 2.60 3\n", "2 Sacramento California 1.70 5" ] } ], "prompt_number": 315 }, { "cell_type": "code", "collapsed": false, "input": [ "DataFrame(rows, columns=zip(*cursor.description)[1])" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NoneNoneNoneNone
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 316, "text": [ " NaN NaN NaN NaN\n", "0 Atlanta Georgia 1.25 6\n", "1 Tallahassee Florida 2.60 3\n", "2 Sacramento California 1.70 5" ] } ], "prompt_number": 316 }, { "cell_type": "code", "collapsed": false, "input": [ "# readonly attribute \ub780\ub2e4.\n", "# \ub09c cursor.description\uc744 \uc218\uc815\ud574\uc11c \ub0b4\uac00 \uc6d0\ud558\ub294 \uceec\ub7fc\uac12\uc73c\ub85c \ubcc0\uacbd\ud558\ub824\uace0 \ud588\ub294\ub370..\n", "# \uadf8\ub7fc \uc5b4\ub5bb\uac8c \ubcc0\uacbd\uc744 \ud574\uc57c\ud558\uc9c0?\n", "cursor.description = '1'" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "readonly attribute", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;31m# \ub09c cursor.description\uc744 \uc218\uc815\ud574\uc11c \ub0b4\uac00 \uc6d0\ud558\ub294 \uceec\ub7fc\uac12\uc73c\ub85c \ubcc0\uacbd\ud558\ub824\uace0 \ud588\ub294\ub370..\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# \uadf8\ub7fc \uc5b4\ub5bb\uac8c \ubcc0\uacbd\uc744 \ud574\uc57c\ud558\uc9c0?\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mcursor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdescription\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'1'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mTypeError\u001b[0m: readonly attribute" ] } ], "prompt_number": 317 }, { "cell_type": "code", "collapsed": false, "input": [ "# \uadf8\ub0e5 column\uc5d0 \ub0b4\uac00 \uc4f0\uace0 \uc2f6\uc740\uac83 \uc815\ud558\uba74 \ub418\ub124..\n", "DataFrame(rows, columns=['country', 'state', 'grade1', 'grade2'])" ], "language": "python", "metadata": {}, 
"outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrystategrade1grade2
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 318, "text": [ " country state grade1 grade2\n", "0 Atlanta Georgia 1.25 6\n", "1 Tallahassee Florida 2.60 3\n", "2 Sacramento California 1.70 5" ] } ], "prompt_number": 318 }, { "cell_type": "markdown", "metadata": {}, "source": [ "- \ub370\uc774\ud130\ubca0\uc774\uc2a4\uc5d0 \ucffc\ub9ac\ub97c \ubcf4\ub0b4\ub824\uace0 \ub9e4\ubc88 \uc774\ub807\uac8c \ud558\ub294\uac74 \ub108\ubb34 \uadc0\ucc2e\uc74c\n", "- pandas.io.sql \ubaa8\ub4c8\uc758 read_frame \ud568\uc218\ub97c \uc774\uc6a9\ud558\uba74 \uac04\ud3b8\ud558\uac8c \ud574\uacb0\n", "- \uadf8\ub0e5 select \ucffc\ub9ac\ubb38\uacfc \ub370\uc774\ud130 \ubca0\uc774\uc2a4 \uc5f0\uacb0 \uac1d\uccb4(con)\ub9cc \ub118\uae30\uba74 \ub41c\ub2e4" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas.io.sql as sql" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 319 }, { "cell_type": "code", "collapsed": false, "input": [ "sql.read_frame('select * from test', con)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 320, "text": [ " a b c d\n", "0 Atlanta Georgia 1.25 6\n", "1 Tallahassee Florida 2.60 3\n", "2 Sacramento California 1.70 5" ] } ], "prompt_number": 320 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "6.4.1 MongoDB\uc5d0 \ub370\uc774\ud130 \uc800\uc7a5\ud558\uace0 \ubd88\ub7ec\uc624\uae30" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- NoSQL \ub370\uc774\ud130\ubca0\uc774\uc2a4\ub294 \ub9e4\uc6b0 \ub2e4\uc591\ud55c \ud615\ud0dc\n", "- \ubc84\ud074\ub9acDB\ub098 \ub3c4\ucfc4\uce90\ube44\ub2db \uac19\uc740 \uac83\uc740 \uc0ac\uc804\ucc98\ub7fc \ud0a4-\uac12\uc744 \uc800\uc7a5\ud558\uae30\ub3c4 \ud558\uace0\n", "- \ub610 \ub2e4\ub978 \uac83\uc740 \uae30\ubcf8 \uc800\uc7a5\uc18c\ub294 \uc0ac\uc804 \uac19\uc740 \uac1d\uccb4\ub97c \uc0ac\uc6a9\ud558\uba70 \ubb38\uc11c \uae30\ubc18\uc73c\ub85c \ub370\uc774\ud130\ub97c \uc800\uc7a5\ud558\uae30\ub3c4 \ud55c\ub2e4.\n", "- \uc774 \ucc45\uc5d0\uc11c\ub294 [MongoDB](http://mongodb.org)\ub97c \uc608\uc81c\ub85c \uc120\ud0dd\n", "- MongoDB \uc11c\ubc84\ub97c \ub85c\uceec\uc5d0 \uc124\uce58\ud558\uace0 \uacf5\uc2dd \ub4dc\ub77c\uc774\ubc84\uc778 pymongo\ub97c \uc0ac\uc6a9\ud574\uc11c \uae30\ubcf8 \ud3ec\ud2b8\ub85c \ubc88\ud638\ub85c \uc5f0\uacb0\n", "- \ud604\uc7ac \ud544\uc790\uc758 \ucef4\ud4e8\ud130\uc5d0\ub294 \uc544\uc9c1 \uc124\uce58\ud558\uc9c0 \uc54a\uc74c. 
\uc774\ub7f0 \ud615\uc2dd\uc73c\ub85c \ud55c\ub2e4\ub294 \ub290\ub08c\ub9cc \uac00\uc9c0\uc790" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-------\n", "\n", "### \uc2a4\ud130\ub514\ud560 \ub54c \ub098\uc654\ub358 \uc8fc\uc758\uc0ac\ud56d\ub4e4\n", "\n", "- \ud604\uc5c5\uc5d0\uc11c\ub294 read_csv\ub098 read_table\uc744 \ub9ce\uc774 \uc4f4\ub2e4.\n", "- \uc5d1\uc140 -> csv\ub85c \ubcc0\ud658 \ud6c4 \uc0ac\uc6a9\ud55c\ub2e4.\n", "- \uc65c\ub0d0\ud558\uba74 csv\uc5d0 \ud6e8\uc52c \uac15\ub825\ud55c \ub77c\uc774\ube0c\ub7ec\ub9ac\ub4e4\uc774 \uc788\ub294\ub370 \uad73\uc774 \uc5d1\uc140\uc744 \ud65c\uc6a9\ud560 \ud544\uc694\uac00 \uc5c6\ub2e4. \ub610\ud55c \uc5d1\uc140\uc740 \ubcf4\uc774\uc9c0 \uc54a\ub294 \uac12\ub4e4\uc774 \ub9ce\uc544 \uc624\ub958\uc758 \uc8fc\ubc94\n", "- sep\uc5d0 Regular Expression\uc744 \uc0ac\uc6a9\ud560 \uc218 \uc788\ub2e4. \n", "- [\uc5d1\uc140\uc5d0\uc11c XML \ub370\uc774\ud130 \uac00\uc838\uc624\uae30](http://office.microsoft.com/ko-kr/excel-help/HP010206405.aspx#BMimport_an_xml_file_as_an_xml_list_wit)\n", "\n", "------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## [About my IPython in github](https://github.com/re4lfl0w/ipython)" ] } ], "metadata": {} } ] }