{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": true }, "source": [ "

Table of Contents

\n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:12.942927Z", "start_time": "2019-12-24T17:44:11.943916Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Reading and Writing Data in Text Format" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:12.966381Z", "start_time": "2019-12-24T17:44:12.945392Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a,b,c,d,message\n", "1,2,3,4,hello\n", "5,6,7,8,world\n", "9,10,11,12,foo\n" ] } ], "source": [ "!type examples\\ex1.csv" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:13.113898Z", "start_time": "2019-12-24T17:44:12.967654Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(r'examples\\ex1.csv')\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:13.284776Z", "start_time": "2019-12-24T17:44:13.116362Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_table('examples/ex1.csv', sep=',')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A file will not always have a header row. Consider this file" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:13.454943Z", "start_time": "2019-12-24T17:44:13.290845Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1,2,3,4,hello\n", "5,6,7,8,world\n", "9,10,11,12,foo\n" ] } ], "source": [ "!type examples\\ex2.csv" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:13.597498Z", "start_time": "2019-12-24T17:44:13.458877Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " 0 1 2 3 4\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(r'examples/ex2.csv', header=None)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:13.738188Z", "start_time": "2019-12-24T17:44:13.609101Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(r'examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:13.885401Z", "start_time": "2019-12-24T17:44:13.741950Z" } }, "outputs": [], "source": [ "names=['a', 'b', 'c', 'd', 'message']" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.053188Z", "start_time": "2019-12-24T17:44:13.885401Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
message
hello1234
world5678
foo9101112
\n", "
" ], "text/plain": [ " a b c d\n", "message \n", "hello 1 2 3 4\n", "world 5 6 7 8\n", "foo 9 10 11 12" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# make message column to be the index of the returned DataFrame\n", "\n", "pd.read_csv(r'examples/ex2.csv', names=names, index_col='message')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the event that you want to form a hierarchical index from multiple columns, pass a list of column numbers or names" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.240514Z", "start_time": "2019-12-24T17:44:14.054189Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "key1,key2,value1,value2\n", "one,a,1,2\n", "one,b,3,4\n", "one,c,5,6\n", "one,d,7,8\n", "two,a,9,10\n", "two,b,11,12\n", "two,c,13,14\n", "two,d,15,16\n" ] } ], "source": [ "!type examples\\csv_mindex.csv" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.396063Z", "start_time": "2019-12-24T17:44:14.242881Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
value1value2
key1key2
onea12
b34
c56
d78
twoa910
b1112
c1314
d1516
\n", "
" ], "text/plain": [ " value1 value2\n", "key1 key2 \n", "one a 1 2\n", " b 3 4\n", " c 5 6\n", " d 7 8\n", "two a 9 10\n", " b 11 12\n", " c 13 14\n", " d 15 16" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed = pd.read_csv(r'examples/csv_mindex.csv', index_col=['key1', 'key2'])\n", "parsed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In some cases, a table might not have a fixed delimiter, using whitespace or some other pattern to separate fields." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.540609Z", "start_time": "2019-12-24T17:44:14.397062Z" } }, "outputs": [ { "data": { "text/plain": [ "[' A B C\\n',\n", " 'aaa -0.264438 -1.026059 -0.619500\\n',\n", " 'bbb 0.927272 0.302904 -0.032399\\n',\n", " 'ccc -0.264273 -0.386314 -0.217601\\n',\n", " 'ddd -0.871858 -0.348382 1.100491\\n']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(open(r'examples/ex3.txt'))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.697406Z", "start_time": "2019-12-24T17:44:14.546683Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
aaa-0.264438-1.026059-0.619500
bbb0.9272720.302904-0.032399
ccc-0.264273-0.386314-0.217601
ddd-0.871858-0.3483821.100491
\n", "
" ], "text/plain": [ " A B C\n", "aaa -0.264438 -1.026059 -0.619500\n", "bbb 0.927272 0.302904 -0.032399\n", "ccc -0.264273 -0.386314 -0.217601\n", "ddd -0.871858 -0.348382 1.100491" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = pd.read_table(r'examples/ex3.txt', sep='\\s+')\n", "result" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.864981Z", "start_time": "2019-12-24T17:44:14.700337Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "# hey!\n", "a,b,c,d,message\n", "# just wanted to make things more difficult for you\n", "# who reads CSV files with computers, anyway?\n", "1,2,3,4,hello\n", "5,6,7,8,world\n", "9,10,11,12,foo\n" ] } ], "source": [ "!type examples\\ex4.csv" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.990299Z", "start_time": "2019-12-24T17:44:14.867993Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdmessage
01234hello
15678world
29101112foo
\n", "
" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(r'examples/ex4.csv', skiprows=[0, 2,3])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Handling missing values is an important and frequently nuanced part of the file parsing process. Missing data is usually either not present (empty string) or marked by some sentinel value. By default, pandas uses a set of commonly occurring sentinels, such as **NA** and **NUL**" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:15.168025Z", "start_time": "2019-12-24T17:44:14.994393Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "something,a,b,c,d,message\n", "one,1,2,3,4,NA\n", "two,5,6,,8,world\n", "three,9,10,11,12,foo\n" ] } ], "source": [ "!type examples\\ex5.csv" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:15.304991Z", "start_time": "2019-12-24T17:44:15.169022Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo
\n", "
" ], "text/plain": [ " something a b c d message\n", "0 one 1 2 3.0 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11.0 12 foo" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = pd.read_csv(r'examples/ex5.csv')\n", "result" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:15.550424Z", "start_time": "2019-12-24T17:44:15.307795Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0FalseFalseFalseFalseFalseTrue
1FalseFalseFalseTrueFalseFalse
2FalseFalseFalseFalseFalseFalse
\n", "
" ], "text/plain": [ " something a b c d message\n", "0 False False False False False True\n", "1 False False False True False False\n", "2 False False False False False False" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.isnull(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The na_values option can take either a list or set of strings to consider missing values" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:15.711268Z", "start_time": "2019-12-24T17:44:15.554376Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo
\n", "
" ], "text/plain": [ " something a b c d message\n", "0 one 1 2 3.0 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11.0 12 foo" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = pd.read_csv(r'examples/ex5.csv', na_values=['NULL'])\n", "result" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:15.847179Z", "start_time": "2019-12-24T17:44:15.711268Z" } }, "outputs": [], "source": [ "# Different NA sentinels can be specified for each column in a dict\n", "\n", "sentinels = {'message': ['foo', 'NA'], 'something':['two']}" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.005092Z", "start_time": "2019-12-24T17:44:15.851139Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0one123.04NaN
1NaN56NaN8world
2three91011.012NaN
\n", "
" ], "text/plain": [ " something a b c d message\n", "0 one 1 2 3.0 4 NaN\n", "1 NaN 5 6 NaN 8 world\n", "2 three 9 10 11.0 12 NaN" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(r'examples/ex5.csv', na_values=sentinels)" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-12-23T12:59:01.714350Z", "start_time": "2019-12-23T12:59:01.697710Z" } }, "source": [ "## Reading Text Files in Pieces" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When processing very large files or figuring out the right set of arguments to correctly process a large file, you may only want to read in a small piece of a file or iterate\n", "through smaller chunks of the file.Before we look at a large file, we make the pandas display settings more compact" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.156659Z", "start_time": "2019-12-24T17:44:16.007763Z" } }, "outputs": [], "source": [ "pd.options.display.max_rows = 10" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.353691Z", "start_time": "2019-12-24T17:44:16.161827Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefourkey
00.467976-0.038649-0.295344-1.824726L
1-0.3588931.4044530.704965-0.200638B
2-0.5018400.659254-0.421691-0.057688G
30.2048861.0741341.388361-0.982404R
40.354628-0.1331160.283763-0.837063Q
..................
99952.311896-0.417070-1.409599-0.515821L
9996-0.479893-0.6504190.745152-0.646038E
99970.5233310.7871120.4860661.093156K
9998-0.3625590.598894-1.8432010.887292G
9999-0.096376-1.012999-0.657431-0.5733150
\n", "

10000 rows × 5 columns

\n", "
" ], "text/plain": [ " one two three four key\n", "0 0.467976 -0.038649 -0.295344 -1.824726 L\n", "1 -0.358893 1.404453 0.704965 -0.200638 B\n", "2 -0.501840 0.659254 -0.421691 -0.057688 G\n", "3 0.204886 1.074134 1.388361 -0.982404 R\n", "4 0.354628 -0.133116 0.283763 -0.837063 Q\n", "... ... ... ... ... ..\n", "9995 2.311896 -0.417070 -1.409599 -0.515821 L\n", "9996 -0.479893 -0.650419 0.745152 -0.646038 E\n", "9997 0.523331 0.787112 0.486066 1.093156 K\n", "9998 -0.362559 0.598894 -1.843201 0.887292 G\n", "9999 -0.096376 -1.012999 -0.657431 -0.573315 0\n", "\n", "[10000 rows x 5 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = pd.read_csv(r'examples/ex6.csv')\n", "result" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.491655Z", "start_time": "2019-12-24T17:44:16.356466Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefourkey
00.467976-0.038649-0.295344-1.824726L
1-0.3588931.4044530.704965-0.200638B
2-0.5018400.659254-0.421691-0.057688G
30.2048861.0741341.388361-0.982404R
40.354628-0.1331160.283763-0.837063Q
\n", "
" ], "text/plain": [ " one two three four key\n", "0 0.467976 -0.038649 -0.295344 -1.824726 L\n", "1 -0.358893 1.404453 0.704965 -0.200638 B\n", "2 -0.501840 0.659254 -0.421691 -0.057688 G\n", "3 0.204886 1.074134 1.388361 -0.982404 R\n", "4 0.354628 -0.133116 0.283763 -0.837063 Q" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# nrows - read a small number of rows \n", "\n", "pd.read_csv(r'examples/ex6.csv', nrows=5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To read a file in pieces, specify a **chunksize** as a number of rows\n", "\n", "The TextParser object returned by read_csv allows you to iterate over the parts of the file according to the chunksize." ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.639722Z", "start_time": "2019-12-24T17:44:16.497681Z" } }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunker = pd.read_csv(r'examples/ex6.csv', chunksize=1000)\n", "chunker" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.931145Z", "start_time": "2019-12-24T17:44:16.643350Z" } }, "outputs": [], "source": [ "chunker = pd.read_csv(r'examples/ex6.csv', chunksize=1000)\n", "tot = pd.Series([])\n", "for piece in chunker:\n", " tot = tot.add(piece['key'].value_counts(), fill_value=0)\n", "tot = tot.sort_values(ascending=False)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.948713Z", "start_time": "2019-12-24T17:44:16.933227Z" } }, "outputs": [ { "data": { "text/plain": [ "E 368.0\n", "X 364.0\n", "L 346.0\n", "O 343.0\n", "Q 340.0\n", "M 338.0\n", "J 337.0\n", "F 335.0\n", "K 334.0\n", "H 330.0\n", "dtype: float64" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tot[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Writing Data to Text Format" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.102861Z", "start_time": "2019-12-24T17:44:16.948713Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo
\n", "
" ], "text/plain": [ " something a b c d message\n", "0 one 1 2 3.0 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11.0 12 foo" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv(r'examples/ex5.csv')\n", "data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using DataFrame’s **to_csv** method, we can write the data out to a comma separated file" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.249514Z", "start_time": "2019-12-24T17:44:17.105944Z" } }, "outputs": [], "source": [ "data.to_csv(r'examples/out.csv')" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.428423Z", "start_time": "2019-12-24T17:44:17.257017Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ",something,a,b,c,d,message\n", "0,one,1,2,3.0,4,\n", "1,two,5,6,,8,world\n", "2,three,9,10,11.0,12,foo\n" ] } ], "source": [ "!type examples\\out.csv" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.544879Z", "start_time": "2019-12-24T17:44:17.431586Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "|something|a|b|c|d|message\r\n", "0|one|1|2|3.0|4|\r\n", "1|two|5|6||8|world\r\n", "2|three|9|10|11.0|12|foo\r\n" ] } ], "source": [ "import sys\n", "\n", "data.to_csv(sys.stdout, sep='|')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Missing values appear as empty strings in the output. You might want to denote them by some other sentinel value" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.693974Z", "start_time": "2019-12-24T17:44:17.552304Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ",something,a,b,c,d,message\r\n", "0,one,1,2,3.0,4,NULL\r\n", "1,two,5,6,NULL,8,world\r\n", "2,three,9,10,11.0,12,foo\r\n" ] } ], "source": [ "data.to_csv(sys.stdout, na_rep='NULL')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.831547Z", "start_time": "2019-12-24T17:44:17.699052Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "one,1,2,3.0,4,\r\n", "two,5,6,,8,world\r\n", "three,9,10,11.0,12,foo\r\n" ] } ], "source": [ "# With no other options specified, both the row and column labels \n", "# are written. 
Both of these can be disabled\n", "\n", "data.to_csv(sys.stdout, index=False, header=False)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.980296Z", "start_time": "2019-12-24T17:44:17.836819Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a,b,c\r\n", "1,2,3.0\r\n", "5,6,\r\n", "9,10,11.0\r\n" ] } ], "source": [ "# write only a subset of the columns, and in an order of your choosing\n", "\n", "data.to_csv(sys.stdout ,index=False, columns=['a', 'b', 'c'])" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:18.437609Z", "start_time": "2019-12-24T17:44:17.984237Z" } }, "outputs": [], "source": [ "#Series\n", "\n", "dates = pd.date_range('1/1/2020', periods=10)\n", "ts = pd.Series(np.arange(10), index=dates)\n", "ts.to_csv(r'examples/tseries.csv', header=False)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:18.595013Z", "start_time": "2019-12-24T17:44:18.443009Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-01-01,0\n", "2020-01-02,1\n", "2020-01-03,2\n", "2020-01-04,3\n", "2020-01-05,4\n", "2020-01-06,5\n", "2020-01-07,6\n", "2020-01-08,7\n", "2020-01-09,8\n", "2020-01-10,9\n" ] } ], "source": [ "!type examples\\tseries.csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Working with Delimited Formats\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It’s possible to load most forms of tabular data from disk using functions like pandas.read_table. In some cases, however, some manual processing may be necessary.\n", "It’s not uncommon to receive a file with one or more malformed lines that trip up read_table" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:18.746774Z", "start_time": "2019-12-24T17:44:18.596249Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\"a\",\"b\",\"c\"\n", "\"1\",\"2\",\"3\"\n", "\"1\",\"2\",\"3\"\n" ] } ], "source": [ "!type examples\\ex7.csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For any file with a single-character delimiter, you can use Python’s built-in csv module. 
To use it, pass any open file or file-like object to csv.reader" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:18.872970Z", "start_time": "2019-12-24T17:44:18.749959Z" } }, "outputs": [], "source": [ "import csv\n", "\n", "f = open(r'examples/ex7.csv')\n", "reader = csv.reader(f)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.022732Z", "start_time": "2019-12-24T17:44:18.878009Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['a', 'b', 'c']\n", "['1', '2', '3']\n", "['1', '2', '3']\n" ] } ], "source": [ "for line in reader:\n", " print(line)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.162044Z", "start_time": "2019-12-24T17:44:19.026915Z" } }, "outputs": [], "source": [ "with open(r'examples/ex7.csv') as f:\n", " lines = list(csv.reader(f))" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.313160Z", "start_time": "2019-12-24T17:44:19.163107Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['a', 'b', 'c']\n", "[['1', '2', '3'], ['1', '2', '3']]\n" ] } ], "source": [ "header, values = lines[0], lines[1:]\n", "print(header)\n", "print(values)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.469890Z", "start_time": "2019-12-24T17:44:19.318193Z" }, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 1\n", "2 2\n", "3 3\n" ] } ], "source": [ "for i, j in zip(*values):\n", " print(i, j)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.610273Z", "start_time": "2019-12-24T17:44:19.473506Z" } }, "outputs": [], "source": [ "data_dict = {h: v for h, v in zip(header, zip(*values))}" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.754072Z", "start_time": "2019-12-24T17:44:19.618224Z" } }, "outputs": [ { "data": { "text/plain": [ "{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_dict" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-12-23T16:17:13.529427Z", "start_time": "2019-12-23T16:17:13.524002Z" } }, "source": [ "## JSON Data" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.901997Z", "start_time": "2019-12-24T17:44:19.761474Z" } }, "outputs": [], "source": [ "obj = '''\n", "{\"name\": \"Wes\",\n", " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n", " \"pet\": null,\n", " \"siblings\": [{\"name\": \"Scott\", \"age\": 30, \"pets\": [\"Zeus\", \"Zuko\"]},\n", " {\"name\": \"Katie\", \"age\": 38,\n", " \"pets\": [\"Sixes\", \"Stache\", \"Cisco\"]}]\n", "}\n", "'''" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:20.068177Z", "start_time": "2019-12-24T17:44:19.907459Z" } }, "outputs": [], "source": [ "import json" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:20.222470Z", "start_time": "2019-12-24T17:44:20.072556Z" } }, "outputs": [ { "data": { "text/plain": [ "{'name': 'Wes',\n", " 
'places_lived': ['United States', 'Spain', 'Germany'],\n", " 'pet': None,\n", " 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},\n", " {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = json.loads(obj)\n", "result" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:20.371155Z", "start_time": "2019-12-24T17:44:20.228491Z" } }, "outputs": [], "source": [ "# json.dumps, on the other hand, converts a Python object back to JSON:\n", "\n", "asjson = json.dumps(result)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:20.540849Z", "start_time": "2019-12-24T17:44:20.377740Z" } }, "outputs": [ { "data": { "text/html": [ "
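Rounding out the csv-module discussion from the previous section before returning to DataFrames: writing delimited files by hand goes through `csv.writer`, which accepts an open file or file-like object plus the same kinds of formatting options (delimiter, quoting) as `csv.reader`. A minimal sketch writing to an in-memory buffer; the semicolon delimiter is only an illustration:

```python
import csv
import io

buf = io.StringIO()

writer = csv.writer(buf, delimiter=';', quoting=csv.QUOTE_MINIMAL)
writer.writerow(['a', 'b', 'c'])
writer.writerow(['1', '2', '3'])
writer.writerow(['1', '2', '3'])

# Rows are terminated with '\r\n' by default
print(buf.getvalue())
```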
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagepets
0Scott30[Zeus, Zuko]
1Katie38[Sixes, Stache, Cisco]
\n", "
" ], "text/plain": [ " name age pets\n", "0 Scott 30 [Zeus, Zuko]\n", "1 Katie 38 [Sixes, Stache, Cisco]" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "siblings = pd.DataFrame(result['siblings'], columns=['name', 'age', 'pets'])\n", "siblings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The **pandas.read_json** can automatically convert JSON datasets in specific arrangements into a Series or DataFrame." ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:20.729151Z", "start_time": "2019-12-24T17:44:20.544383Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[{\"a\": 1, \"b\": 2, \"c\": 3},\n", " {\"a\": 4, \"b\": 5, \"c\": 6},\n", " {\"a\": 7, \"b\": 8, \"c\": 9}]\n" ] } ], "source": [ "!type examples\\example.json" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:20.868493Z", "start_time": "2019-12-24T17:44:20.731146Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abc
0123
1456
2789
\n", "
" ], "text/plain": [ " a b c\n", "0 1 2 3\n", "1 4 5 6\n", "2 7 8 9" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_json(r'examples/example.json')\n", "data" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:21.008429Z", "start_time": "2019-12-24T17:44:20.873164Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"a\":{\"0\":1,\"1\":4,\"2\":7},\"b\":{\"0\":2,\"1\":5,\"2\":8},\"c\":{\"0\":3,\"1\":6,\"2\":9}}\n", "[{\"a\":1,\"b\":2,\"c\":3},{\"a\":4,\"b\":5,\"c\":6},{\"a\":7,\"b\":8,\"c\":9}]\n" ] } ], "source": [ "# pandas -> json\n", "\n", "print(data.to_json())\n", "print(data.to_json(orient='records'))" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:21.143500Z", "start_time": "2019-12-24T17:44:21.013899Z" } }, "outputs": [], "source": [ "# **ToDo:**\n", "\n", "# 1.5 - XML and HTML: Web Scraping\n", "# 2 - Binary Data Formats\n", "# 2.1 - Using HDF5 Format\n", "# 2.2 - Reading Microsoft Excel Files\n", "# 3 - Interacting with Web APIs\n", "# 4 - Interacting with Databases" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }