{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# pandas I/O tools and examples"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"
"
],
"text/plain": [
""
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import addutils.toc ; addutils.toc.js(ipy_notebook=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n"
],
"text/plain": [
""
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from addutils import css_notebook\n",
"css_notebook()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1 Matlab Variables"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.1 Import a Matlab variable from file"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n"
],
"text/plain": [
""
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"import os \n",
"import scipy.io\n",
"import numpy as np\n",
"import pandas as pd\n",
"import sqlite3\n",
"#from pandas.io.data import DataFrame, read_csv\n",
"import pandas.io.sql as psql\n",
"from time import time\n",
"from IPython.display import display\n",
"from addutils import css_notebook\n",
"css_notebook()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sys"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import from '.mat' files"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"x = scipy.io.loadmat('example_data/matlab_variable.mat')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 92, 99, 1, 8, 15, 67, 74, 51, 58, 40],\n",
" [ 98, 80, 7, 14, 16, 73, 55, 57, 64, 41],\n",
" [ 4, 81, 88, 20, 22, 54, 56, 63, 70, 47],\n",
" [ 85, 87, 19, 21, 3, 60, 62, 69, 71, 28],\n",
" [ 86, 93, 25, 2, 9, 61, 68, 75, 52, 34],\n",
" [ 17, 24, 76, 83, 90, 42, 49, 26, 33, 65],\n",
" [ 23, 5, 82, 89, 91, 48, 30, 32, 39, 66],\n",
" [ 79, 6, 13, 95, 97, 29, 31, 38, 45, 72],\n",
" [ 10, 12, 94, 96, 78, 35, 37, 44, 46, 53],\n",
" [ 11, 18, 100, 77, 84, 36, 43, 50, 27, 59]], dtype=uint8)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pyA = x['a']\n",
"pyA"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Matlab variable is passed to a pandas DataFrame:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 92 | \n",
" 99 | \n",
" 1 | \n",
" 8 | \n",
" 15 | \n",
" 67 | \n",
" 74 | \n",
" 51 | \n",
" 58 | \n",
" 40 | \n",
"
\n",
" \n",
" 1 | \n",
" 98 | \n",
" 80 | \n",
" 7 | \n",
" 14 | \n",
" 16 | \n",
" 73 | \n",
" 55 | \n",
" 57 | \n",
" 64 | \n",
" 41 | \n",
"
\n",
" \n",
" 2 | \n",
" 4 | \n",
" 81 | \n",
" 88 | \n",
" 20 | \n",
" 22 | \n",
" 54 | \n",
" 56 | \n",
" 63 | \n",
" 70 | \n",
" 47 | \n",
"
\n",
" \n",
" 3 | \n",
" 85 | \n",
" 87 | \n",
" 19 | \n",
" 21 | \n",
" 3 | \n",
" 60 | \n",
" 62 | \n",
" 69 | \n",
" 71 | \n",
" 28 | \n",
"
\n",
" \n",
" 4 | \n",
" 86 | \n",
" 93 | \n",
" 25 | \n",
" 2 | \n",
" 9 | \n",
" 61 | \n",
" 68 | \n",
" 75 | \n",
" 52 | \n",
" 34 | \n",
"
\n",
" \n",
" 5 | \n",
" 17 | \n",
" 24 | \n",
" 76 | \n",
" 83 | \n",
" 90 | \n",
" 42 | \n",
" 49 | \n",
" 26 | \n",
" 33 | \n",
" 65 | \n",
"
\n",
" \n",
" 6 | \n",
" 23 | \n",
" 5 | \n",
" 82 | \n",
" 89 | \n",
" 91 | \n",
" 48 | \n",
" 30 | \n",
" 32 | \n",
" 39 | \n",
" 66 | \n",
"
\n",
" \n",
" 7 | \n",
" 79 | \n",
" 6 | \n",
" 13 | \n",
" 95 | \n",
" 97 | \n",
" 29 | \n",
" 31 | \n",
" 38 | \n",
" 45 | \n",
" 72 | \n",
"
\n",
" \n",
" 8 | \n",
" 10 | \n",
" 12 | \n",
" 94 | \n",
" 96 | \n",
" 78 | \n",
" 35 | \n",
" 37 | \n",
" 44 | \n",
" 46 | \n",
" 53 | \n",
"
\n",
" \n",
" 9 | \n",
" 11 | \n",
" 18 | \n",
" 100 | \n",
" 77 | \n",
" 84 | \n",
" 36 | \n",
" 43 | \n",
" 50 | \n",
" 27 | \n",
" 59 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 7 8 9\n",
"0 92 99 1 8 15 67 74 51 58 40\n",
"1 98 80 7 14 16 73 55 57 64 41\n",
"2 4 81 88 20 22 54 56 63 70 47\n",
"3 85 87 19 21 3 60 62 69 71 28\n",
"4 86 93 25 2 9 61 68 75 52 34\n",
"5 17 24 76 83 90 42 49 26 33 65\n",
"6 23 5 82 89 91 48 30 32 39 66\n",
"7 79 6 13 95 97 29 31 38 45 72\n",
"8 10 12 94 96 78 35 37 44 46 53\n",
"9 11 18 100 77 84 36 43 50 27 59"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(pyA)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2 Importing a compressed CSV"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following example shows how to import directly a compressed csv file, in this case with multiple separators:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ID | \n",
" Names | \n",
" Phone | \n",
" Town | \n",
" Description | \n",
" Income | \n",
" Coordinates | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Lydia Z. Flores | \n",
" (02) 0548 5995 | \n",
" Haddington | \n",
" sed, sapien. Nunc pulvinar arcu | \n",
" â¬6,223 | \n",
" 23.28054, -24.48755 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Bell X. Guerra | \n",
" (07) 8599 9926 | \n",
" Montes Claros | \n",
" consectetuer, cursus et, magna. Praesent | \n",
" â¬14,967 | \n",
" -77.76329, 69.22339 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Kirk Q. Bowman | \n",
" (06) 4153 7501 | \n",
" Cagli | \n",
" tortor. Nunc commodo auctor velit. | \n",
" â¬17,399 | \n",
" -58.80037, 22.50537 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Karen Pickett | \n",
" (02) 3216 9708 | \n",
" Cobourg | \n",
" at auctor ullamcorper, nisl arcu | \n",
" â¬17,373 | \n",
" 75.73982, -78.01872 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Vance J. Johnson | \n",
" (01) 7568 6371 | \n",
" Carlton | \n",
" ultricies adipiscing, enim mi tempor | \n",
" â¬9,025 | \n",
" -86.51337, 109.46298 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ID Names Phone Town \\\n",
"0 1 Lydia Z. Flores (02) 0548 5995 Haddington \n",
"1 2 Bell X. Guerra (07) 8599 9926 Montes Claros \n",
"2 3 Kirk Q. Bowman (06) 4153 7501 Cagli \n",
"3 4 Karen Pickett (02) 3216 9708 Cobourg \n",
"4 5 Vance J. Johnson (01) 7568 6371 Carlton \n",
"\n",
" Description Income \\\n",
"0 sed, sapien. Nunc pulvinar arcu â¬6,223 \n",
"1 consectetuer, cursus et, magna. Praesent â¬14,967 \n",
"2 tortor. Nunc commodo auctor velit. â¬17,399 \n",
"3 at auctor ullamcorper, nisl arcu â¬17,373 \n",
"4 ultricies adipiscing, enim mi tempor â¬9,025 \n",
"\n",
" Coordinates \n",
"0 23.28054, -24.48755 \n",
"1 -77.76329, 69.22339 \n",
"2 -58.80037, 22.50537 \n",
"3 75.73982, -78.01872 \n",
"4 -86.51337, 109.46298 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_csv = pd.read_csv('example_data/pd12_peopl.csv.gz', sep=r'\\,\\;\\.', \n",
" skipinitialspace=True, compression='gzip', engine='python',\n",
" encoding='iso8859_15')\n",
"df_csv.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**TODO**: utf-8 has problems decoding euro sign. \"\\u00e2\\u0082\\u00ac\" are the three chars you get when the UTF-8 encoded \\u20ac (EURO SIGN) is mistakenly interpreted as ISO-8859-1."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ID | \n",
" Names | \n",
" Phone | \n",
" Town | \n",
" Description | \n",
" Income | \n",
" Coordinates | \n",
" lat | \n",
" lon | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Lydia Z. Flores | \n",
" (02) 0548 5995 | \n",
" Haddington | \n",
" sed, sapien. Nunc pulvinar arcu | \n",
" â¬6,223 | \n",
" 23.28054, -24.48755 | \n",
" 23.28054 | \n",
" -24.48755 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Bell X. Guerra | \n",
" (07) 8599 9926 | \n",
" Montes Claros | \n",
" consectetuer, cursus et, magna. Praesent | \n",
" â¬14,967 | \n",
" -77.76329, 69.22339 | \n",
" -77.76329 | \n",
" 69.22339 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Kirk Q. Bowman | \n",
" (06) 4153 7501 | \n",
" Cagli | \n",
" tortor. Nunc commodo auctor velit. | \n",
" â¬17,399 | \n",
" -58.80037, 22.50537 | \n",
" -58.80037 | \n",
" 22.50537 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Karen Pickett | \n",
" (02) 3216 9708 | \n",
" Cobourg | \n",
" at auctor ullamcorper, nisl arcu | \n",
" â¬17,373 | \n",
" 75.73982, -78.01872 | \n",
" 75.73982 | \n",
" -78.01872 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Vance J. Johnson | \n",
" (01) 7568 6371 | \n",
" Carlton | \n",
" ultricies adipiscing, enim mi tempor | \n",
" â¬9,025 | \n",
" -86.51337, 109.46298 | \n",
" -86.51337 | \n",
" 109.46298 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ID Names Phone Town \\\n",
"0 1 Lydia Z. Flores (02) 0548 5995 Haddington \n",
"1 2 Bell X. Guerra (07) 8599 9926 Montes Claros \n",
"2 3 Kirk Q. Bowman (06) 4153 7501 Cagli \n",
"3 4 Karen Pickett (02) 3216 9708 Cobourg \n",
"4 5 Vance J. Johnson (01) 7568 6371 Carlton \n",
"\n",
" Description Income \\\n",
"0 sed, sapien. Nunc pulvinar arcu â¬6,223 \n",
"1 consectetuer, cursus et, magna. Praesent â¬14,967 \n",
"2 tortor. Nunc commodo auctor velit. â¬17,399 \n",
"3 at auctor ullamcorper, nisl arcu â¬17,373 \n",
"4 ultricies adipiscing, enim mi tempor â¬9,025 \n",
"\n",
" Coordinates lat lon \n",
"0 23.28054, -24.48755 23.28054 -24.48755 \n",
"1 -77.76329, 69.22339 -77.76329 69.22339 \n",
"2 -58.80037, 22.50537 -58.80037 22.50537 \n",
"3 75.73982, -78.01872 75.73982 -78.01872 \n",
"4 -86.51337, 109.46298 -86.51337 109.46298 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Split 'Coordinates' into lat/lon with a vectorized string split --\n",
"# far faster than the per-row loop with .loc writes it replaces.\n",
"# astype(str) mirrors the old str(coo) coercion; .str[1] yields NaN\n",
"# (instead of raising) if a value has no comma.\n",
"split_coords = df_csv['Coordinates'].astype(str).str.split(',')\n",
"df_csv['lat'] = split_coords.str[0]\n",
"df_csv['lon'] = split_coords.str[1]\n",
"df_csv.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_csv.to_json('temp/converted_json.json')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3 Importing JSON files"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Coordinates | \n",
" Description | \n",
" ID | \n",
" Income | \n",
" Names | \n",
" Phone | \n",
" Town | \n",
" lat | \n",
" lon | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 23.28054, -24.48755 | \n",
" sed, sapien. Nunc pulvinar arcu | \n",
" 1 | \n",
" â¬6,223 | \n",
" Lydia Z. Flores | \n",
" (02) 0548 5995 | \n",
" Haddington | \n",
" 23.28054 | \n",
" -24.48755 | \n",
"
\n",
" \n",
" 1 | \n",
" -77.76329, 69.22339 | \n",
" consectetuer, cursus et, magna. Praesent | \n",
" 2 | \n",
" â¬14,967 | \n",
" Bell X. Guerra | \n",
" (07) 8599 9926 | \n",
" Montes Claros | \n",
" -77.76329 | \n",
" 69.22339 | \n",
"
\n",
" \n",
" 10 | \n",
" -62.95199, 116.29751 | \n",
" Duis sit amet diam eu | \n",
" 11 | \n",
" â¬15,764 | \n",
" Jameson H. Craig | \n",
" (07) 8640 5274 | \n",
" Fairbanks | \n",
" -62.95199 | \n",
" 116.29751 | \n",
"
\n",
" \n",
" 11 | \n",
" -42.26793, -92.23605 | \n",
" et netus et malesuada fames | \n",
" 12 | \n",
" â¬18,460 | \n",
" Michael Randall | \n",
" (04) 3451 8606 | \n",
" Mezzana | \n",
" -42.26793 | \n",
" -92.23605 | \n",
"
\n",
" \n",
" 12 | \n",
" -20.39897, 152.4122 | \n",
" id risus quis diam luctus | \n",
" 13 | \n",
" â¬6,175 | \n",
" Sierra Rivers | \n",
" (01) 2020 4511 | \n",
" Lugnano in Teverina | \n",
" -20.39897 | \n",
" 152.41220 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Coordinates Description ID \\\n",
"0 23.28054, -24.48755 sed, sapien. Nunc pulvinar arcu 1 \n",
"1 -77.76329, 69.22339 consectetuer, cursus et, magna. Praesent 2 \n",
"10 -62.95199, 116.29751 Duis sit amet diam eu 11 \n",
"11 -42.26793, -92.23605 et netus et malesuada fames 12 \n",
"12 -20.39897, 152.4122 id risus quis diam luctus 13 \n",
"\n",
" Income Names Phone \\\n",
"0 â¬6,223 Lydia Z. Flores (02) 0548 5995 \n",
"1 â¬14,967 Bell X. Guerra (07) 8599 9926 \n",
"10 â¬15,764 Jameson H. Craig (07) 8640 5274 \n",
"11 â¬18,460 Michael Randall (04) 3451 8606 \n",
"12 â¬6,175 Sierra Rivers (01) 2020 4511 \n",
"\n",
" Town lat lon \n",
"0 Haddington 23.28054 -24.48755 \n",
"1 Montes Claros -77.76329 69.22339 \n",
"10 Fairbanks -62.95199 116.29751 \n",
"11 Mezzana -42.26793 -92.23605 \n",
"12 Lugnano in Teverina -20.39897 152.41220 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pd.read_json accepts a file path directly; no need to read the file\n",
"# by hand and re-encode it to bytes first.\n",
"dfjson = pd.read_json('temp/converted_json.json')\n",
"dfjson.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the import reordered columns in alphabetical order, we can choose a preferred column order:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ID | \n",
" Names | \n",
" Phone | \n",
" Income | \n",
" Town | \n",
" lat | \n",
" lon | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Lydia Z. Flores | \n",
" (02) 0548 5995 | \n",
" â¬6,223 | \n",
" Haddington | \n",
" 23.28054 | \n",
" -24.48755 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Bell X. Guerra | \n",
" (07) 8599 9926 | \n",
" â¬14,967 | \n",
" Montes Claros | \n",
" -77.76329 | \n",
" 69.22339 | \n",
"
\n",
" \n",
" 10 | \n",
" 11 | \n",
" Jameson H. Craig | \n",
" (07) 8640 5274 | \n",
" â¬15,764 | \n",
" Fairbanks | \n",
" -62.95199 | \n",
" 116.29751 | \n",
"
\n",
" \n",
" 11 | \n",
" 12 | \n",
" Michael Randall | \n",
" (04) 3451 8606 | \n",
" â¬18,460 | \n",
" Mezzana | \n",
" -42.26793 | \n",
" -92.23605 | \n",
"
\n",
" \n",
" 12 | \n",
" 13 | \n",
" Sierra Rivers | \n",
" (01) 2020 4511 | \n",
" â¬6,175 | \n",
" Lugnano in Teverina | \n",
" -20.39897 | \n",
" 152.41220 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ID Names Phone Income \\\n",
"0 1 Lydia Z. Flores (02) 0548 5995 â¬6,223 \n",
"1 2 Bell X. Guerra (07) 8599 9926 â¬14,967 \n",
"10 11 Jameson H. Craig (07) 8640 5274 â¬15,764 \n",
"11 12 Michael Randall (04) 3451 8606 â¬18,460 \n",
"12 13 Sierra Rivers (01) 2020 4511 â¬6,175 \n",
"\n",
" Town lat lon \n",
"0 Haddington 23.28054 -24.48755 \n",
"1 Montes Claros -77.76329 69.22339 \n",
"10 Fairbanks -62.95199 116.29751 \n",
"11 Mezzana -42.26793 -92.23605 \n",
"12 Lugnano in Teverina -20.39897 152.41220 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# .ix was deprecated in pandas 0.20 and removed in 1.0; .loc performs the\n",
"# same label-based column selection here.\n",
"dfjson = dfjson.loc[:, ['ID', 'Names', 'Phone', 'Income', 'Town', 'lat', 'lon']]\n",
"dfjson.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4 Importing HTML"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: `read_html` returns a **list** of DataFrame objects, even if there is only a single table contained in the `HTML` content. (An older revision passed `infer_types=False` here to stop automatic numeric/date detection of the coordinates column; that keyword has since been removed from pandas.)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Nomi maschili | \n",
" Strade | \n",
" Città | \n",
" Coordinate | \n",
" Ente | \n",
" mail | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Drew | \n",
" 5672 Accumsan Road | \n",
" Whitburn | \n",
" -54.68429, -67.21709 | \n",
" Ac Consulting | \n",
" a.aliquet.vel@vitae.co.uk | \n",
"
\n",
" \n",
" 1 | \n",
" Finn | \n",
" Ap #170-4074 Interdum Rd. | \n",
" Pelotas | \n",
" 81.93087, 168.14556 | \n",
" Varius Incorporated | \n",
" nec.malesuada.ut@primisin.com | \n",
"
\n",
" \n",
" 2 | \n",
" Martin | \n",
" Ap #471-6260 Etiam Ave | \n",
" Aliano | \n",
" 33.99272, -148.70584 | \n",
" Arcu Industries | \n",
" lacus@risus.net | \n",
"
\n",
" \n",
" 3 | \n",
" Zephania | \n",
" Ap #720-7951 Aliquam Ave | \n",
" Senneville | \n",
" -56.82569, -72.80435 | \n",
" Tristique PC | \n",
" arcu@sempereratin.co.uk | \n",
"
\n",
" \n",
" 4 | \n",
" Devin | \n",
" 3538 Dui St. | \n",
" Zwettl-Niederösterreich | \n",
" -11.90874, -149.61123 | \n",
" Imperdiet Dictum PC | \n",
" lectus.rutrum.urna@massaQuisqueporttitor.ca | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Nomi maschili Strade Città \\\n",
"0 Drew 5672 Accumsan Road Whitburn \n",
"1 Finn Ap #170-4074 Interdum Rd. Pelotas \n",
"2 Martin Ap #471-6260 Etiam Ave Aliano \n",
"3 Zephania Ap #720-7951 Aliquam Ave Senneville \n",
"4 Devin 3538 Dui St. Zwettl-Niederösterreich \n",
"\n",
" Coordinate Ente \\\n",
"0 -54.68429, -67.21709 Ac Consulting \n",
"1 81.93087, 168.14556 Varius Incorporated \n",
"2 33.99272, -148.70584 Arcu Industries \n",
"3 -56.82569, -72.80435 Tristique PC \n",
"4 -11.90874, -149.61123 Imperdiet Dictum PC \n",
"\n",
" mail \n",
"0 a.aliquet.vel@vitae.co.uk \n",
"1 nec.malesuada.ut@primisin.com \n",
"2 lacus@risus.net \n",
"3 arcu@sempereratin.co.uk \n",
"4 lectus.rutrum.urna@massaQuisqueporttitor.ca "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfhtml = pd.read_html('example_data/generated.html', header=0)\n",
"dfhtml[0].head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5 Importing Excel"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Names | \n",
" Streets | \n",
" Town | \n",
" Coordinates | \n",
" Corporation | \n",
" mail | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Stephen | \n",
" Ap #389-3365 Risus, St. | \n",
" Lelystad | \n",
" 51.91783, -47.01037 | \n",
" Consectetuer Rhoncus Nullam Corporation | \n",
" Suspendisse.sed@pedePraesenteu.org | \n",
"
\n",
" \n",
" 1 | \n",
" Kenneth | \n",
" 120-6483 Ligula. Ave | \n",
" Labrecque | \n",
" -1.92625, 10.02451 | \n",
" A Corporation | \n",
" orci@egetmollislectus.com | \n",
"
\n",
" \n",
" 2 | \n",
" Leo | \n",
" Ap #727-2085 Eget Av. | \n",
" Strathcona County | \n",
" -17.31839, 137.99307 | \n",
" Sed Limited | \n",
" Etiam@vel.org | \n",
"
\n",
" \n",
" 3 | \n",
" Joshua | \n",
" P.O. Box 425, 6462 Arcu Rd. | \n",
" Municipal District | \n",
" -51.34642, 80.32145 | \n",
" Mauris Sapien Cursus Corp. | \n",
" lorem@ornarelectusante.net | \n",
"
\n",
" \n",
" 4 | \n",
" Zephania | \n",
" 655 Et, St. | \n",
" Couillet | \n",
" 29.96525, 124.18391 | \n",
" Odio Semper Cursus Corp. | \n",
" metus@a.ca | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Names Streets Town \\\n",
"0 Stephen Ap #389-3365 Risus, St. Lelystad \n",
"1 Kenneth 120-6483 Ligula. Ave Labrecque \n",
"2 Leo Ap #727-2085 Eget Av. Strathcona County \n",
"3 Joshua P.O. Box 425, 6462 Arcu Rd. Municipal District \n",
"4 Zephania 655 Et, St. Couillet \n",
"\n",
" Coordinates Corporation \\\n",
"0 51.91783, -47.01037 Consectetuer Rhoncus Nullam Corporation \n",
"1 -1.92625, 10.02451 A Corporation \n",
"2 -17.31839, 137.99307 Sed Limited \n",
"3 -51.34642, 80.32145 Mauris Sapien Cursus Corp. \n",
"4 29.96525, 124.18391 Odio Semper Cursus Corp. \n",
"\n",
" mail \n",
"0 Suspendisse.sed@pedePraesenteu.org \n",
"1 orci@egetmollislectus.com \n",
"2 Etiam@vel.org \n",
"3 lorem@ornarelectusante.net \n",
"4 metus@a.ca "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfxl = pd.read_excel('example_data/generated2.xls', 'foglio')\n",
"dfxl.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6 Working with SQL and databases"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 6.1 Write SQL"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's store the DataFrame opened from excel in a database. We use SQLite, a database engine library suitable for storing data in a single-file database. 'Names' is the name we chose for the database table we are creating:"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"con = sqlite3.connect(\"temp.sql\")\n",
"# The 'flavor' keyword was removed in pandas 0.23 (sqlite is the only\n",
"# built-in flavor anyway); to_sql returns None, so binding the result\n",
"# to a variable was dead code.\n",
"dfxl.to_sql('Names', con)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 6.2 Import SQL"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(100, 7)\n"
]
}
],
"source": [
"con = sqlite3.connect('temp.sql')\n",
"with con:\n",
" sql = \"SELECT * FROM Names;\"\n",
" df = psql.read_sql(sql, con)\n",
" print(df.shape)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" Names | \n",
" Streets | \n",
" Town | \n",
" Coordinates | \n",
" Corporation | \n",
" mail | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" Stephen | \n",
" Ap #389-3365 Risus, St. | \n",
" Lelystad | \n",
" 51.91783, -47.01037 | \n",
" Consectetuer Rhoncus Nullam Corporation | \n",
" Suspendisse.sed@pedePraesenteu.org | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" Kenneth | \n",
" 120-6483 Ligula. Ave | \n",
" Labrecque | \n",
" -1.92625, 10.02451 | \n",
" A Corporation | \n",
" orci@egetmollislectus.com | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" Leo | \n",
" Ap #727-2085 Eget Av. | \n",
" Strathcona County | \n",
" -17.31839, 137.99307 | \n",
" Sed Limited | \n",
" Etiam@vel.org | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" Joshua | \n",
" P.O. Box 425, 6462 Arcu Rd. | \n",
" Municipal District | \n",
" -51.34642, 80.32145 | \n",
" Mauris Sapien Cursus Corp. | \n",
" lorem@ornarelectusante.net | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" Zephania | \n",
" 655 Et, St. | \n",
" Couillet | \n",
" 29.96525, 124.18391 | \n",
" Odio Semper Cursus Corp. | \n",
" metus@a.ca | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index Names Streets Town \\\n",
"0 0 Stephen Ap #389-3365 Risus, St. Lelystad \n",
"1 1 Kenneth 120-6483 Ligula. Ave Labrecque \n",
"2 2 Leo Ap #727-2085 Eget Av. Strathcona County \n",
"3 3 Joshua P.O. Box 425, 6462 Arcu Rd. Municipal District \n",
"4 4 Zephania 655 Et, St. Couillet \n",
"\n",
" Coordinates Corporation \\\n",
"0 51.91783, -47.01037 Consectetuer Rhoncus Nullam Corporation \n",
"1 -1.92625, 10.02451 A Corporation \n",
"2 -17.31839, 137.99307 Sed Limited \n",
"3 -51.34642, 80.32145 Mauris Sapien Cursus Corp. \n",
"4 29.96525, 124.18391 Odio Semper Cursus Corp. \n",
"\n",
" mail \n",
"0 Suspendisse.sed@pedePraesenteu.org \n",
"1 orci@egetmollislectus.com \n",
"2 Etiam@vel.org \n",
"3 lorem@ornarelectusante.net \n",
"4 metus@a.ca "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"con.close()\n",
"os.remove(\"temp.sql\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7 Working with HDF5"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7.1 Storer format"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**HDFStore** is a dict-like object used by pandas to store datasets as **HDF5** files using the **PyTables** library. **HDF5** is a scientific hierarchical data format suitable for storing in a file very large and multi-dimensional data arrays. The **Storer** (fixed) format stores fixed arrays, which are not queryable and must be retrieved in their entirety."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Add DataFrames to the HDFStore object:"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"samples_01 = int(3e5)\n",
"samples_02 = int(1e5)\n",
"idx1 = pd.date_range('1/1/2000 12:00:00', periods=samples_01, freq='50ms', tz='Europe/Rome')\n",
"idx2 = pd.date_range('1/1/2000 12:05:00', periods=samples_02, freq='100ms', tz='Europe/Rome')\n",
"randn = np.random.randn"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df1 = pd.DataFrame(randn(samples_01, 3), index=idx1, columns=['A', 'B', 'C'])\n",
"df2 = pd.DataFrame(randn(samples_02, 4), index=idx2, columns=['A', 'B', 'C', 'D'])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Size of the Dataset: 13.73291015625 MB\n"
]
}
],
"source": [
"print ('Size of the Dataset: ', (df1.values.nbytes+df1.values.nbytes)/2**20, ' MB')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 loops, best of 3: 32 ms per loop\n"
]
}
],
"source": [
"%%timeit\n",
"# pd.get_store was deprecated in 0.21 and removed; pd.HDFStore is the\n",
"# supported context manager with identical semantics.\n",
"with pd.HDFStore('temp/store53.h5') as store:\n",
"    store.put('storer/df1', df1)\n",
"    store.put('storer/df2', df2)\n",
"    store.put('to_remove', df2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Retrieve stored objects:"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['/to_remove', '/storer/df1', '/storer/df2']\n",
" A B C\n",
"2000-01-01 12:00:00.050000+01:00 -0.022728 0.015469 0.962709\n",
"2000-01-01 12:00:00.100000+01:00 -0.453089 0.513491 -1.270842\n",
" A B C\n",
"2000-01-01 12:00:00.050000+01:00 -0.022728 0.015469 0.962709\n",
"2000-01-01 12:00:00.100000+01:00 -0.453089 0.513491 -1.270842\n",
"Check retrieved data equal to original data: \n",
"True\n"
]
}
],
"source": [
"# pd.get_store was removed; pd.HDFStore is the supported replacement.\n",
"with pd.HDFStore('temp/store53.h5') as store:\n",
"    print(store.keys())\n",
"    df1_retrieved = store.get('storer/df1')\n",
"    print(df1_retrieved[1:3])\n",
"    print(df1[1:3])\n",
"    print('Check retrieved data equal to original data: ')\n",
"    print(df1_retrieved[1:3].equals(df1[1:3]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Delete objects:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"File path: temp/store53.h5\n",
"/storer/df1 frame (shape->[300000,3])\n",
"/storer/df2 frame (shape->[100000,4])\n"
]
}
],
"source": [
"with pd.HDFStore('temp/store53.h5') as store:\n",
"    # store.remove raises KeyError when the key is absent; catch only\n",
"    # that instead of a bare except that would hide unrelated errors.\n",
"    try:\n",
"        store.remove('to_remove')\n",
"    except KeyError:\n",
"        pass\n",
"    print(store)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7.2 Table format"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The table format conceptually is shaped very much like a DataFrame and may be appended to in the same or other sessions. In addition, delete & query type operations are supported."
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with pd.HDFStore('temp/store53.h5') as store:\n",
"    # store.append creates a table automatically.\n",
"    # .iloc replaces the removed .ix: on this DatetimeIndex an integer\n",
"    # slice was positional, which is exactly what .iloc does.\n",
"    store.append('table/df1_appended', df1.iloc[:10000])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# pd.HDFStore replaces the removed pd.get_store; .iloc replaces the\n",
"# removed .ix (integer slices on a DatetimeIndex were positional).\n",
"with pd.HDFStore('temp/store53.h5') as store:\n",
"    store.append('table/df1_appended', df1.iloc[10001:20000])\n",
"    store.append('table/df1_appended', df1.iloc[20001:50000])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7.3 Querying a Table"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Query the table using boolean expression with in-line function evaluation:"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" \n",
" \n",
" 2000-01-01 12:00:00.200000+01:00 | \n",
" -0.232597 | \n",
" -0.705741 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.250000+01:00 | \n",
" 0.183234 | \n",
" 0.024464 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.300000+01:00 | \n",
" -0.801220 | \n",
" -0.732630 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.350000+01:00 | \n",
" 0.762667 | \n",
" -0.585462 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"2000-01-01 12:00:00.200000+01:00 -0.232597 -0.705741\n",
"2000-01-01 12:00:00.250000+01:00 0.183234 0.024464\n",
"2000-01-01 12:00:00.300000+01:00 -0.801220 -0.732630\n",
"2000-01-01 12:00:00.350000+01:00 0.762667 -0.585462"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with pd.get_store('temp/store53.h5') as store:\n",
" query01 = store.select('table/df1_appended',\n",
" \"index>=Timestamp('2000-01-01 12:00:00.20+01:00') \\\n",
" & index) for more tutorials and updates.\n",
"\n",
"This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}