{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": true }, "source": [ "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n", "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Reading-and-Writing-Data-in-Text-Format\" data-toc-modified-id=\"Reading-and-Writing-Data-in-Text-Format-1\"><span class=\"toc-item-num\">1 </span>Reading and Writing Data in Text Format</a></span><ul class=\"toc-item\"><li><span><a href=\"#Reading-Text-Files-in-Pieces\" data-toc-modified-id=\"Reading-Text-Files-in-Pieces-1.1\"><span class=\"toc-item-num\">1.1 </span>Reading Text Files in Pieces</a></span></li><li><span><a href=\"#Writing-Data-to-Text-Format\" data-toc-modified-id=\"Writing-Data-to-Text-Format-1.2\"><span class=\"toc-item-num\">1.2 </span>Writing Data to Text Format</a></span></li><li><span><a href=\"#Working-with-Delimited-Formats\" data-toc-modified-id=\"Working-with-Delimited-Formats-1.3\"><span class=\"toc-item-num\">1.3 </span>Working with Delimited Formats</a></span></li><li><span><a href=\"#JSON-Data\" data-toc-modified-id=\"JSON-Data-1.4\"><span class=\"toc-item-num\">1.4 </span>JSON Data</a></span></li></ul></li></ul></div>" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:12.942927Z", "start_time": "2019-12-24T17:44:11.943916Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Reading and Writing Data in Text Format" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:12.966381Z", "start_time": "2019-12-24T17:44:12.945392Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a,b,c,d,message\n", "1,2,3,4,hello\n", "5,6,7,8,world\n", "9,10,11,12,foo\n" ] } ], "source": [ "!type examples\\ex1.csv" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:13.113898Z", "start_time": "2019-12-24T17:44:12.967654Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " <th>message</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>hello</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " <td>8</td>\n", " <td>world</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11</td>\n", " <td>12</td>\n", " <td>foo</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(r'examples\\ex1.csv')\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:13.284776Z", "start_time": "2019-12-24T17:44:13.116362Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " <th>message</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>hello</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " <td>8</td>\n", " <td>world</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11</td>\n", " <td>12</td>\n", " <td>foo</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_table('examples/ex1.csv', sep=',')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A file will not always have a header row. Consider this file" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:13.454943Z", "start_time": "2019-12-24T17:44:13.290845Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1,2,3,4,hello\n", "5,6,7,8,world\n", "9,10,11,12,foo\n" ] } ], "source": [ "!type examples\\ex2.csv" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:13.597498Z", "start_time": "2019-12-24T17:44:13.458877Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " <th>1</th>\n", " <th>2</th>\n", " <th>3</th>\n", " <th>4</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>hello</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " <td>8</td>\n", " <td>world</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11</td>\n", " <td>12</td>\n", " <td>foo</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 0 1 2 3 4\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(r'examples/ex2.csv', header=None)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:13.738188Z", "start_time": "2019-12-24T17:44:13.609101Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " <th>message</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>hello</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " <td>8</td>\n", " <td>world</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11</td>\n", " <td>12</td>\n", " <td>foo</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(r'examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:13.885401Z", "start_time": "2019-12-24T17:44:13.741950Z" } }, "outputs": [], "source": [ "names=['a', 'b', 'c', 'd', 'message']" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.053188Z", "start_time": "2019-12-24T17:44:13.885401Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " </tr>\n", " <tr>\n", " <th>message</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>hello</th>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>4</td>\n", " </tr>\n", " <tr>\n", " <th>world</th>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " <td>8</td>\n", " </tr>\n", " <tr>\n", " <th>foo</th>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11</td>\n", " <td>12</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " a b c d\n", "message \n", "hello 1 2 3 4\n", "world 5 6 7 8\n", "foo 9 10 11 12" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# make message column to be the index of the returned DataFrame\n", "\n", "pd.read_csv(r'examples/ex2.csv', names=names, index_col='message')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the event that you want to form a hierarchical index from multiple columns, pass a list of column numbers or names" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.240514Z", "start_time": "2019-12-24T17:44:14.054189Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "key1,key2,value1,value2\n", "one,a,1,2\n", "one,b,3,4\n", "one,c,5,6\n", "one,d,7,8\n", "two,a,9,10\n", "two,b,11,12\n", "two,c,13,14\n", "two,d,15,16\n" ] } ], "source": [ "!type examples\\csv_mindex.csv" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.396063Z", "start_time": "2019-12-24T17:44:14.242881Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th>value1</th>\n", " <th>value2</th>\n", " </tr>\n", " <tr>\n", " <th>key1</th>\n", " <th>key2</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th rowspan=\"4\" valign=\"top\">one</th>\n", " <th>a</th>\n", " <td>1</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>b</th>\n", " <td>3</td>\n", " <td>4</td>\n", " </tr>\n", " <tr>\n", " <th>c</th>\n", " <td>5</td>\n", " <td>6</td>\n", " </tr>\n", " <tr>\n", " <th>d</th>\n", " <td>7</td>\n", " <td>8</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"4\" valign=\"top\">two</th>\n", " <th>a</th>\n", " <td>9</td>\n", " <td>10</td>\n", " </tr>\n", " <tr>\n", " <th>b</th>\n", " <td>11</td>\n", " <td>12</td>\n", " </tr>\n", " <tr>\n", " <th>c</th>\n", " <td>13</td>\n", " <td>14</td>\n", " </tr>\n", " <tr>\n", " <th>d</th>\n", " <td>15</td>\n", " <td>16</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " value1 value2\n", "key1 key2 \n", "one a 1 2\n", " b 3 4\n", " c 5 6\n", " d 7 8\n", "two a 9 10\n", " b 11 12\n", " c 13 14\n", " d 15 16" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed = pd.read_csv(r'examples/csv_mindex.csv', index_col=['key1', 'key2'])\n", "parsed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In some cases, a table might not have a fixed delimiter, using whitespace or some other pattern to separate fields." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.540609Z", "start_time": "2019-12-24T17:44:14.397062Z" } }, "outputs": [ { "data": { "text/plain": [ "[' A B C\\n',\n", " 'aaa -0.264438 -1.026059 -0.619500\\n',\n", " 'bbb 0.927272 0.302904 -0.032399\\n',\n", " 'ccc -0.264273 -0.386314 -0.217601\\n',\n", " 'ddd -0.871858 -0.348382 1.100491\\n']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(open(r'examples/ex3.txt'))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.697406Z", "start_time": "2019-12-24T17:44:14.546683Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>A</th>\n", " <th>B</th>\n", " <th>C</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>aaa</th>\n", " <td>-0.264438</td>\n", " <td>-1.026059</td>\n", " <td>-0.619500</td>\n", " </tr>\n", " <tr>\n", " <th>bbb</th>\n", " <td>0.927272</td>\n", " <td>0.302904</td>\n", " <td>-0.032399</td>\n", " </tr>\n", " <tr>\n", " <th>ccc</th>\n", " <td>-0.264273</td>\n", " <td>-0.386314</td>\n", " <td>-0.217601</td>\n", " </tr>\n", " <tr>\n", " <th>ddd</th>\n", " <td>-0.871858</td>\n", " <td>-0.348382</td>\n", " <td>1.100491</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " A B C\n", "aaa -0.264438 -1.026059 -0.619500\n", "bbb 0.927272 0.302904 -0.032399\n", "ccc -0.264273 -0.386314 -0.217601\n", "ddd -0.871858 -0.348382 1.100491" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = pd.read_table(r'examples/ex3.txt', sep='\\s+')\n", "result" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.864981Z", "start_time": "2019-12-24T17:44:14.700337Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "# hey!\n", "a,b,c,d,message\n", "# just wanted to make things more difficult for you\n", "# who reads CSV files with computers, anyway?\n", "1,2,3,4,hello\n", "5,6,7,8,world\n", "9,10,11,12,foo\n" ] } ], "source": [ "!type examples\\ex4.csv" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:14.990299Z", "start_time": "2019-12-24T17:44:14.867993Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " <th>message</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>hello</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " <td>8</td>\n", " <td>world</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11</td>\n", " <td>12</td>\n", " <td>foo</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " a b c d message\n", "0 1 2 3 4 hello\n", "1 5 6 7 8 world\n", "2 9 10 11 12 foo" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(r'examples/ex4.csv', skiprows=[0, 2,3])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Handling missing values is an important and frequently nuanced part of the file parsing process. Missing data is usually either not present (empty string) or marked by some sentinel value. By default, pandas uses a set of commonly occurring sentinels, such as **NA** and **NUL**" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:15.168025Z", "start_time": "2019-12-24T17:44:14.994393Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "something,a,b,c,d,message\n", "one,1,2,3,4,NA\n", "two,5,6,,8,world\n", "three,9,10,11,12,foo\n" ] } ], "source": [ "!type examples\\ex5.csv" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:15.304991Z", "start_time": "2019-12-24T17:44:15.169022Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>something</th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " <th>message</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>one</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3.0</td>\n", " <td>4</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>two</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>NaN</td>\n", " <td>8</td>\n", " <td>world</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>three</td>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11.0</td>\n", " <td>12</td>\n", " <td>foo</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " something a b c d message\n", "0 one 1 2 3.0 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11.0 12 foo" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = pd.read_csv(r'examples/ex5.csv')\n", "result" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:15.550424Z", "start_time": "2019-12-24T17:44:15.307795Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>something</th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " <th>message</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " something a b c d message\n", "0 False False False False False True\n", "1 False False False True False False\n", "2 False False False False False False" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.isnull(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The na_values option can take either a list or set of strings to consider missing values" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:15.711268Z", "start_time": "2019-12-24T17:44:15.554376Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>something</th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " <th>message</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>one</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3.0</td>\n", " <td>4</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>two</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>NaN</td>\n", " <td>8</td>\n", " <td>world</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>three</td>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11.0</td>\n", " <td>12</td>\n", " <td>foo</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " something a b c d message\n", "0 one 1 2 3.0 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11.0 12 foo" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = pd.read_csv(r'examples/ex5.csv', na_values=['NULL'])\n", "result" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:15.847179Z", "start_time": "2019-12-24T17:44:15.711268Z" } }, "outputs": [], "source": [ "# Different NA sentinels can be specified for each column in a dict\n", "\n", "sentinels = {'message': ['foo', 'NA'], 'something':['two']}" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.005092Z", "start_time": "2019-12-24T17:44:15.851139Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>something</th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " <th>message</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>one</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3.0</td>\n", " <td>4</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>NaN</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>NaN</td>\n", " <td>8</td>\n", " <td>world</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>three</td>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11.0</td>\n", " <td>12</td>\n", " <td>NaN</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " something a b c d message\n", "0 one 1 2 3.0 4 NaN\n", "1 NaN 5 6 NaN 8 world\n", "2 three 9 10 11.0 12 NaN" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_csv(r'examples/ex5.csv', na_values=sentinels)" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-12-23T12:59:01.714350Z", "start_time": "2019-12-23T12:59:01.697710Z" } }, "source": [ "## Reading Text Files in Pieces" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When processing very large files or figuring out the right set of arguments to correctly process a large file, you may only want to read in a small piece of a file or iterate\n", "through smaller chunks of the file.Before we look at a large file, we make the pandas display settings more compact" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.156659Z", "start_time": "2019-12-24T17:44:16.007763Z" } }, "outputs": [], "source": [ "pd.options.display.max_rows = 10" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.353691Z", "start_time": "2019-12-24T17:44:16.161827Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " <th>three</th>\n", " <th>four</th>\n", " <th>key</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.467976</td>\n", " <td>-0.038649</td>\n", " <td>-0.295344</td>\n", " <td>-1.824726</td>\n", " <td>L</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>-0.358893</td>\n", " <td>1.404453</td>\n", " <td>0.704965</td>\n", " <td>-0.200638</td>\n", " <td>B</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>-0.501840</td>\n", " <td>0.659254</td>\n", " <td>-0.421691</td>\n", " <td>-0.057688</td>\n", " <td>G</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0.204886</td>\n", " <td>1.074134</td>\n", " <td>1.388361</td>\n", " <td>-0.982404</td>\n", " <td>R</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0.354628</td>\n", " <td>-0.133116</td>\n", " <td>0.283763</td>\n", " <td>-0.837063</td>\n", " <td>Q</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>9995</th>\n", " <td>2.311896</td>\n", " <td>-0.417070</td>\n", " <td>-1.409599</td>\n", " <td>-0.515821</td>\n", " <td>L</td>\n", " </tr>\n", " <tr>\n", " <th>9996</th>\n", " <td>-0.479893</td>\n", " <td>-0.650419</td>\n", " <td>0.745152</td>\n", " <td>-0.646038</td>\n", " <td>E</td>\n", " </tr>\n", " <tr>\n", " <th>9997</th>\n", " <td>0.523331</td>\n", " <td>0.787112</td>\n", " <td>0.486066</td>\n", " <td>1.093156</td>\n", " <td>K</td>\n", " </tr>\n", " <tr>\n", " <th>9998</th>\n", " <td>-0.362559</td>\n", " <td>0.598894</td>\n", " <td>-1.843201</td>\n", " <td>0.887292</td>\n", " <td>G</td>\n", " </tr>\n", " <tr>\n", " <th>9999</th>\n", " <td>-0.096376</td>\n", " <td>-1.012999</td>\n", " <td>-0.657431</td>\n", " <td>-0.573315</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>10000 rows × 5 columns</p>\n", "</div>" ], "text/plain": [ " one two three four key\n", "0 0.467976 -0.038649 -0.295344 -1.824726 L\n", "1 -0.358893 1.404453 0.704965 -0.200638 B\n", "2 -0.501840 0.659254 -0.421691 -0.057688 G\n", "3 0.204886 1.074134 1.388361 -0.982404 R\n", "4 0.354628 -0.133116 0.283763 -0.837063 Q\n", "... ... ... ... ... ..\n", "9995 2.311896 -0.417070 -1.409599 -0.515821 L\n", "9996 -0.479893 -0.650419 0.745152 -0.646038 E\n", "9997 0.523331 0.787112 0.486066 1.093156 K\n", "9998 -0.362559 0.598894 -1.843201 0.887292 G\n", "9999 -0.096376 -1.012999 -0.657431 -0.573315 0\n", "\n", "[10000 rows x 5 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = pd.read_csv(r'examples/ex6.csv')\n", "result" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.491655Z", "start_time": "2019-12-24T17:44:16.356466Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " <th>three</th>\n", " <th>four</th>\n", " <th>key</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.467976</td>\n", " <td>-0.038649</td>\n", " <td>-0.295344</td>\n", " <td>-1.824726</td>\n", " <td>L</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>-0.358893</td>\n", " <td>1.404453</td>\n", " <td>0.704965</td>\n", " <td>-0.200638</td>\n", " <td>B</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>-0.501840</td>\n", " <td>0.659254</td>\n", " <td>-0.421691</td>\n", " <td>-0.057688</td>\n", " <td>G</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0.204886</td>\n", " <td>1.074134</td>\n", " <td>1.388361</td>\n", " <td>-0.982404</td>\n", " <td>R</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0.354628</td>\n", " <td>-0.133116</td>\n", " <td>0.283763</td>\n", " <td>-0.837063</td>\n", " <td>Q</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two three four key\n", "0 0.467976 -0.038649 -0.295344 -1.824726 L\n", "1 -0.358893 1.404453 0.704965 -0.200638 B\n", "2 -0.501840 0.659254 -0.421691 -0.057688 G\n", "3 0.204886 1.074134 1.388361 -0.982404 R\n", "4 0.354628 -0.133116 0.283763 -0.837063 Q" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# nrows - read a small number of rows \n", "\n", "pd.read_csv(r'examples/ex6.csv', nrows=5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To read a file in pieces, specify a **chunksize** as a number of rows\n", "\n", "The TextParser object returned by read_csv allows you to iterate over the parts of the file according to the chunksize." ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.639722Z", "start_time": "2019-12-24T17:44:16.497681Z" } }, "outputs": [ { "data": { "text/plain": [ "<pandas.io.parsers.TextFileReader at 0x23380d2b708>" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunker = pd.read_csv(r'examples/ex6.csv', chunksize=1000)\n", "chunker" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.931145Z", "start_time": "2019-12-24T17:44:16.643350Z" } }, "outputs": [], "source": [ "chunker = pd.read_csv(r'examples/ex6.csv', chunksize=1000)\n", "tot = pd.Series([])\n", "for piece in chunker:\n", " tot = tot.add(piece['key'].value_counts(), fill_value=0)\n", "tot = tot.sort_values(ascending=False)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:16.948713Z", "start_time": "2019-12-24T17:44:16.933227Z" } }, "outputs": [ { "data": { "text/plain": [ "E 368.0\n", "X 364.0\n", "L 346.0\n", "O 343.0\n", "Q 340.0\n", "M 338.0\n", "J 337.0\n", "F 335.0\n", "K 334.0\n", "H 330.0\n", "dtype: float64" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tot[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Writing Data to Text Format" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.102861Z", "start_time": "2019-12-24T17:44:16.948713Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>something</th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " <th>message</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>one</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3.0</td>\n", " <td>4</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>two</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>NaN</td>\n", " <td>8</td>\n", " <td>world</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>three</td>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11.0</td>\n", " <td>12</td>\n", " <td>foo</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " something a b c d message\n", "0 one 1 2 3.0 4 NaN\n", "1 two 5 6 NaN 8 world\n", "2 three 9 10 11.0 12 foo" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv(r'examples/ex5.csv')\n", "data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using DataFrame’s **to_csv** method, we can write the data out to a comma separated file" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.249514Z", "start_time": "2019-12-24T17:44:17.105944Z" } }, "outputs": [], "source": [ "data.to_csv(r'examples/out.csv')" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.428423Z", "start_time": "2019-12-24T17:44:17.257017Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ",something,a,b,c,d,message\n", "0,one,1,2,3.0,4,\n", "1,two,5,6,,8,world\n", "2,three,9,10,11.0,12,foo\n" ] } ], "source": [ "!type examples\\out.csv" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.544879Z", "start_time": "2019-12-24T17:44:17.431586Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "|something|a|b|c|d|message\r\n", "0|one|1|2|3.0|4|\r\n", "1|two|5|6||8|world\r\n", "2|three|9|10|11.0|12|foo\r\n" ] } ], "source": [ "import sys\n", "\n", "data.to_csv(sys.stdout, sep='|')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Missing values appear as empty strings in the output. You might want to denote them by some other sentinel value" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.693974Z", "start_time": "2019-12-24T17:44:17.552304Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ",something,a,b,c,d,message\r\n", "0,one,1,2,3.0,4,NULL\r\n", "1,two,5,6,NULL,8,world\r\n", "2,three,9,10,11.0,12,foo\r\n" ] } ], "source": [ "data.to_csv(sys.stdout, na_rep='NULL')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.831547Z", "start_time": "2019-12-24T17:44:17.699052Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "one,1,2,3.0,4,\r\n", "two,5,6,,8,world\r\n", "three,9,10,11.0,12,foo\r\n" ] } ], "source": [ "# With no other options specified, both the row and column labels \n", "# are written. Both of these can be disabled\n", "\n", "data.to_csv(sys.stdout, index=False, header=False)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:17.980296Z", "start_time": "2019-12-24T17:44:17.836819Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a,b,c\r\n", "1,2,3.0\r\n", "5,6,\r\n", "9,10,11.0\r\n" ] } ], "source": [ "# write only a subset of the columns, and in an order of your choosing\n", "\n", "data.to_csv(sys.stdout ,index=False, columns=['a', 'b', 'c'])" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:18.437609Z", "start_time": "2019-12-24T17:44:17.984237Z" } }, "outputs": [], "source": [ "#Series\n", "\n", "dates = pd.date_range('1/1/2020', periods=10)\n", "ts = pd.Series(np.arange(10), index=dates)\n", "ts.to_csv(r'examples/tseries.csv', header=False)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:18.595013Z", "start_time": "2019-12-24T17:44:18.443009Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-01-01,0\n", "2020-01-02,1\n", "2020-01-03,2\n", "2020-01-04,3\n", "2020-01-05,4\n", "2020-01-06,5\n", "2020-01-07,6\n", "2020-01-08,7\n", "2020-01-09,8\n", "2020-01-10,9\n" ] } ], "source": [ "!type examples\\tseries.csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Working with Delimited Formats\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It’s possible to load most forms of tabular data from disk using functions like pandas.read_table. In some cases, however, some manual processing may be necessary.\n", "It’s not uncommon to receive a file with one or more malformed lines that trip up read_table" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:18.746774Z", "start_time": "2019-12-24T17:44:18.596249Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\"a\",\"b\",\"c\"\n", "\"1\",\"2\",\"3\"\n", "\"1\",\"2\",\"3\"\n" ] } ], "source": [ "!type examples\\ex7.csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For any file with a single-character delimiter, you can use Python’s built-in csv module. To use it, pass any open file or file-like object to csv.reader" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:18.872970Z", "start_time": "2019-12-24T17:44:18.749959Z" } }, "outputs": [], "source": [ "import csv\n", "\n", "f = open(r'examples/ex7.csv')\n", "reader = csv.reader(f)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.022732Z", "start_time": "2019-12-24T17:44:18.878009Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['a', 'b', 'c']\n", "['1', '2', '3']\n", "['1', '2', '3']\n" ] } ], "source": [ "for line in reader:\n", " print(line)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.162044Z", "start_time": "2019-12-24T17:44:19.026915Z" } }, "outputs": [], "source": [ "with open(r'examples/ex7.csv') as f:\n", " lines = list(csv.reader(f))" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.313160Z", "start_time": "2019-12-24T17:44:19.163107Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['a', 'b', 'c']\n", "[['1', '2', '3'], ['1', '2', '3']]\n" ] } ], "source": [ "header, values = lines[0], lines[1:]\n", "print(header)\n", "print(values)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.469890Z", "start_time": "2019-12-24T17:44:19.318193Z" }, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 1\n", "2 2\n", "3 3\n" ] } ], "source": [ "for i, j in zip(*values):\n", " print(i, j)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.610273Z", "start_time": "2019-12-24T17:44:19.473506Z" } }, "outputs": [], "source": [ "data_dict = {h: v for h, v in zip(header, zip(*values))}" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.754072Z", "start_time": "2019-12-24T17:44:19.618224Z" } }, "outputs": [ { "data": { "text/plain": [ "{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_dict" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-12-23T16:17:13.529427Z", "start_time": "2019-12-23T16:17:13.524002Z" } }, "source": [ "## JSON Data" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:19.901997Z", "start_time": "2019-12-24T17:44:19.761474Z" } }, "outputs": [], "source": [ "obj = '''\n", "{\"name\": \"Wes\",\n", " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n", " \"pet\": null,\n", " \"siblings\": [{\"name\": \"Scott\", \"age\": 30, \"pets\": [\"Zeus\", \"Zuko\"]},\n", " {\"name\": \"Katie\", \"age\": 38,\n", " \"pets\": [\"Sixes\", \"Stache\", \"Cisco\"]}]\n", "}\n", "'''" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:20.068177Z", "start_time": "2019-12-24T17:44:19.907459Z" } }, "outputs": [], "source": [ "import json" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:20.222470Z", "start_time": "2019-12-24T17:44:20.072556Z" } }, "outputs": [ { "data": { "text/plain": [ "{'name': 'Wes',\n", " 'places_lived': ['United States', 'Spain', 'Germany'],\n", " 'pet': None,\n", " 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},\n", " {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = json.loads(obj)\n", "result" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:20.371155Z", "start_time": "2019-12-24T17:44:20.228491Z" } }, "outputs": [], "source": [ "# json.dumps, on the other hand, converts a Python object back to JSON:\n", "\n", "asjson = json.dumps(result)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:20.540849Z", "start_time": "2019-12-24T17:44:20.377740Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>name</th>\n", " <th>age</th>\n", " <th>pets</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Scott</td>\n", " <td>30</td>\n", " <td>[Zeus, Zuko]</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Katie</td>\n", " <td>38</td>\n", " <td>[Sixes, Stache, Cisco]</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " name age pets\n", "0 Scott 30 [Zeus, Zuko]\n", "1 Katie 38 [Sixes, Stache, Cisco]" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "siblings = pd.DataFrame(result['siblings'], columns=['name', 'age', 'pets'])\n", "siblings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The **pandas.read_json** can automatically convert JSON datasets in specific arrangements into a Series or DataFrame." ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:20.729151Z", "start_time": "2019-12-24T17:44:20.544383Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[{\"a\": 1, \"b\": 2, \"c\": 3},\n", " {\"a\": 4, \"b\": 5, \"c\": 6},\n", " {\"a\": 7, \"b\": 8, \"c\": 9}]\n" ] } ], "source": [ "!type examples\\example.json" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:20.868493Z", "start_time": "2019-12-24T17:44:20.731146Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>4</td>\n", " <td>5</td>\n", " <td>6</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>7</td>\n", " <td>8</td>\n", " <td>9</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " a b c\n", "0 1 2 3\n", "1 4 5 6\n", "2 7 8 9" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_json(r'examples/example.json')\n", "data" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:21.008429Z", "start_time": "2019-12-24T17:44:20.873164Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"a\":{\"0\":1,\"1\":4,\"2\":7},\"b\":{\"0\":2,\"1\":5,\"2\":8},\"c\":{\"0\":3,\"1\":6,\"2\":9}}\n", "[{\"a\":1,\"b\":2,\"c\":3},{\"a\":4,\"b\":5,\"c\":6},{\"a\":7,\"b\":8,\"c\":9}]\n" ] } ], "source": [ "# pandas -> json\n", "\n", "print(data.to_json())\n", "print(data.to_json(orient='records'))" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T17:44:21.143500Z", "start_time": "2019-12-24T17:44:21.013899Z" } }, "outputs": [], "source": [ "# **ToDo:**\n", "\n", "# 1.5 - XML and HTML: Web Scraping\n", "# 2 - Binary Data Formats\n", "# 2.1 - Using HDF5 Format\n", "# 2.2 - Reading Microsoft Excel Files\n", "# 3 - Interacting with Web APIs\n", "# 4 - Interacting with Databases" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }