{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# pandas I/O tools and examples"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"
"
],
"text/plain": [
""
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import addutils.toc ; addutils.toc.js(ipy_notebook=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n"
],
"text/plain": [
""
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from addutils import css_notebook\n",
"css_notebook()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1 Matlab Variables"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.1 Import a Matlab variable from file"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n"
],
"text/plain": [
""
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"import os \n",
"import scipy.io\n",
"import numpy as np\n",
"import pandas as pd\n",
"import sqlite3\n",
"import pandas.io.sql as psql\n",
"from time import time\n",
"from IPython.display import display\n",
"from addutils import css_notebook\n",
"css_notebook()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sys"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import from '.mat' files"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"x = scipy.io.loadmat('example_data/matlab_variable.mat')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 92, 99, 1, 8, 15, 67, 74, 51, 58, 40],\n",
" [ 98, 80, 7, 14, 16, 73, 55, 57, 64, 41],\n",
" [ 4, 81, 88, 20, 22, 54, 56, 63, 70, 47],\n",
" [ 85, 87, 19, 21, 3, 60, 62, 69, 71, 28],\n",
" [ 86, 93, 25, 2, 9, 61, 68, 75, 52, 34],\n",
" [ 17, 24, 76, 83, 90, 42, 49, 26, 33, 65],\n",
" [ 23, 5, 82, 89, 91, 48, 30, 32, 39, 66],\n",
" [ 79, 6, 13, 95, 97, 29, 31, 38, 45, 72],\n",
" [ 10, 12, 94, 96, 78, 35, 37, 44, 46, 53],\n",
" [ 11, 18, 100, 77, 84, 36, 43, 50, 27, 59]], dtype=uint8)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pyA = x['a']\n",
"pyA"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Matlab variable is passed to a pandas DataFrame:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 92 | \n",
" 99 | \n",
" 1 | \n",
" 8 | \n",
" 15 | \n",
" 67 | \n",
" 74 | \n",
" 51 | \n",
" 58 | \n",
" 40 | \n",
"
\n",
" \n",
" 1 | \n",
" 98 | \n",
" 80 | \n",
" 7 | \n",
" 14 | \n",
" 16 | \n",
" 73 | \n",
" 55 | \n",
" 57 | \n",
" 64 | \n",
" 41 | \n",
"
\n",
" \n",
" 2 | \n",
" 4 | \n",
" 81 | \n",
" 88 | \n",
" 20 | \n",
" 22 | \n",
" 54 | \n",
" 56 | \n",
" 63 | \n",
" 70 | \n",
" 47 | \n",
"
\n",
" \n",
" 3 | \n",
" 85 | \n",
" 87 | \n",
" 19 | \n",
" 21 | \n",
" 3 | \n",
" 60 | \n",
" 62 | \n",
" 69 | \n",
" 71 | \n",
" 28 | \n",
"
\n",
" \n",
" 4 | \n",
" 86 | \n",
" 93 | \n",
" 25 | \n",
" 2 | \n",
" 9 | \n",
" 61 | \n",
" 68 | \n",
" 75 | \n",
" 52 | \n",
" 34 | \n",
"
\n",
" \n",
" 5 | \n",
" 17 | \n",
" 24 | \n",
" 76 | \n",
" 83 | \n",
" 90 | \n",
" 42 | \n",
" 49 | \n",
" 26 | \n",
" 33 | \n",
" 65 | \n",
"
\n",
" \n",
" 6 | \n",
" 23 | \n",
" 5 | \n",
" 82 | \n",
" 89 | \n",
" 91 | \n",
" 48 | \n",
" 30 | \n",
" 32 | \n",
" 39 | \n",
" 66 | \n",
"
\n",
" \n",
" 7 | \n",
" 79 | \n",
" 6 | \n",
" 13 | \n",
" 95 | \n",
" 97 | \n",
" 29 | \n",
" 31 | \n",
" 38 | \n",
" 45 | \n",
" 72 | \n",
"
\n",
" \n",
" 8 | \n",
" 10 | \n",
" 12 | \n",
" 94 | \n",
" 96 | \n",
" 78 | \n",
" 35 | \n",
" 37 | \n",
" 44 | \n",
" 46 | \n",
" 53 | \n",
"
\n",
" \n",
" 9 | \n",
" 11 | \n",
" 18 | \n",
" 100 | \n",
" 77 | \n",
" 84 | \n",
" 36 | \n",
" 43 | \n",
" 50 | \n",
" 27 | \n",
" 59 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 7 8 9\n",
"0 92 99 1 8 15 67 74 51 58 40\n",
"1 98 80 7 14 16 73 55 57 64 41\n",
"2 4 81 88 20 22 54 56 63 70 47\n",
"3 85 87 19 21 3 60 62 69 71 28\n",
"4 86 93 25 2 9 61 68 75 52 34\n",
"5 17 24 76 83 90 42 49 26 33 65\n",
"6 23 5 82 89 91 48 30 32 39 66\n",
"7 79 6 13 95 97 29 31 38 45 72\n",
"8 10 12 94 96 78 35 37 44 46 53\n",
"9 11 18 100 77 84 36 43 50 27 59"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(pyA)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2 Importing a compressed CSV"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following example shows how to import directly a compressed csv file, in this case with multiple separators:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ID | \n",
" Names | \n",
" Phone | \n",
" Town | \n",
" Description | \n",
" Income | \n",
" Coordinates | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Lydia Z. Flores | \n",
" (02) 0548 5995 | \n",
" Haddington | \n",
" sed, sapien. Nunc pulvinar arcu | \n",
" â¬6,223 | \n",
" 23.28054, -24.48755 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Bell X. Guerra | \n",
" (07) 8599 9926 | \n",
" Montes Claros | \n",
" consectetuer, cursus et, magna. Praesent | \n",
" â¬14,967 | \n",
" -77.76329, 69.22339 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Kirk Q. Bowman | \n",
" (06) 4153 7501 | \n",
" Cagli | \n",
" tortor. Nunc commodo auctor velit. | \n",
" â¬17,399 | \n",
" -58.80037, 22.50537 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Karen Pickett | \n",
" (02) 3216 9708 | \n",
" Cobourg | \n",
" at auctor ullamcorper, nisl arcu | \n",
" â¬17,373 | \n",
" 75.73982, -78.01872 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Vance J. Johnson | \n",
" (01) 7568 6371 | \n",
" Carlton | \n",
" ultricies adipiscing, enim mi tempor | \n",
" â¬9,025 | \n",
" -86.51337, 109.46298 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ID Names Phone Town \\\n",
"0 1 Lydia Z. Flores (02) 0548 5995 Haddington \n",
"1 2 Bell X. Guerra (07) 8599 9926 Montes Claros \n",
"2 3 Kirk Q. Bowman (06) 4153 7501 Cagli \n",
"3 4 Karen Pickett (02) 3216 9708 Cobourg \n",
"4 5 Vance J. Johnson (01) 7568 6371 Carlton \n",
"\n",
" Description Income \\\n",
"0 sed, sapien. Nunc pulvinar arcu â¬6,223 \n",
"1 consectetuer, cursus et, magna. Praesent â¬14,967 \n",
"2 tortor. Nunc commodo auctor velit. â¬17,399 \n",
"3 at auctor ullamcorper, nisl arcu â¬17,373 \n",
"4 ultricies adipiscing, enim mi tempor â¬9,025 \n",
"\n",
" Coordinates \n",
"0 23.28054, -24.48755 \n",
"1 -77.76329, 69.22339 \n",
"2 -58.80037, 22.50537 \n",
"3 75.73982, -78.01872 \n",
"4 -86.51337, 109.46298 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_csv = pd.read_csv('example_data/pd12_peopl.csv.gz', sep=r'\\,\\;\\.', \n",
" skipinitialspace=True, compression='gzip', engine='python',\n",
" encoding='iso8859_15')\n",
"df_csv.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**TODO**: utf-8 has problems decoding euro sign. \"\\u00e2\\u0082\\u00ac\" are the three chars you get when the UTF-8 encoded \\u20ac (EURO SIGN) is mistakenly interpreted as ISO-8859-1."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ID | \n",
" Names | \n",
" Phone | \n",
" Town | \n",
" Description | \n",
" Income | \n",
" Coordinates | \n",
" lat | \n",
" lon | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Lydia Z. Flores | \n",
" (02) 0548 5995 | \n",
" Haddington | \n",
" sed, sapien. Nunc pulvinar arcu | \n",
" â¬6,223 | \n",
" 23.28054, -24.48755 | \n",
" 23.28054 | \n",
" -24.48755 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Bell X. Guerra | \n",
" (07) 8599 9926 | \n",
" Montes Claros | \n",
" consectetuer, cursus et, magna. Praesent | \n",
" â¬14,967 | \n",
" -77.76329, 69.22339 | \n",
" -77.76329 | \n",
" 69.22339 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Kirk Q. Bowman | \n",
" (06) 4153 7501 | \n",
" Cagli | \n",
" tortor. Nunc commodo auctor velit. | \n",
" â¬17,399 | \n",
" -58.80037, 22.50537 | \n",
" -58.80037 | \n",
" 22.50537 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Karen Pickett | \n",
" (02) 3216 9708 | \n",
" Cobourg | \n",
" at auctor ullamcorper, nisl arcu | \n",
" â¬17,373 | \n",
" 75.73982, -78.01872 | \n",
" 75.73982 | \n",
" -78.01872 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Vance J. Johnson | \n",
" (01) 7568 6371 | \n",
" Carlton | \n",
" ultricies adipiscing, enim mi tempor | \n",
" â¬9,025 | \n",
" -86.51337, 109.46298 | \n",
" -86.51337 | \n",
" 109.46298 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ID Names Phone Town \\\n",
"0 1 Lydia Z. Flores (02) 0548 5995 Haddington \n",
"1 2 Bell X. Guerra (07) 8599 9926 Montes Claros \n",
"2 3 Kirk Q. Bowman (06) 4153 7501 Cagli \n",
"3 4 Karen Pickett (02) 3216 9708 Cobourg \n",
"4 5 Vance J. Johnson (01) 7568 6371 Carlton \n",
"\n",
" Description Income \\\n",
"0 sed, sapien. Nunc pulvinar arcu â¬6,223 \n",
"1 consectetuer, cursus et, magna. Praesent â¬14,967 \n",
"2 tortor. Nunc commodo auctor velit. â¬17,399 \n",
"3 at auctor ullamcorper, nisl arcu â¬17,373 \n",
"4 ultricies adipiscing, enim mi tempor â¬9,025 \n",
"\n",
" Coordinates lat lon \n",
"0 23.28054, -24.48755 23.28054 -24.48755 \n",
"1 -77.76329, 69.22339 -77.76329 69.22339 \n",
"2 -58.80037, 22.50537 -58.80037 22.50537 \n",
"3 75.73982, -78.01872 75.73982 -78.01872 \n",
"4 -86.51337, 109.46298 -86.51337 109.46298 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Split 'Coordinates' into 'lat'/'lon' with one vectorized string operation\n",
"# instead of a Python loop of row-wise .loc assignments.\n",
"# n=1 splits on the first comma only, matching the original spl[0]/spl[1]\n",
"# behavior (lon keeps its leading space, values stay strings).\n",
"df_csv[['lat', 'lon']] = df_csv['Coordinates'].astype(str).str.split(',', n=1, expand=True)\n",
"df_csv.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df_csv.to_json('temp/converted_json.json')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3 Importing JSON files"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Coordinates | \n",
" Description | \n",
" ID | \n",
" Income | \n",
" Names | \n",
" Phone | \n",
" Town | \n",
" lat | \n",
" lon | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 23.28054, -24.48755 | \n",
" sed, sapien. Nunc pulvinar arcu | \n",
" 1 | \n",
" â¬6,223 | \n",
" Lydia Z. Flores | \n",
" (02) 0548 5995 | \n",
" Haddington | \n",
" 23.28054 | \n",
" -24.48755 | \n",
"
\n",
" \n",
" 1 | \n",
" -77.76329, 69.22339 | \n",
" consectetuer, cursus et, magna. Praesent | \n",
" 2 | \n",
" â¬14,967 | \n",
" Bell X. Guerra | \n",
" (07) 8599 9926 | \n",
" Montes Claros | \n",
" -77.76329 | \n",
" 69.22339 | \n",
"
\n",
" \n",
" 10 | \n",
" -62.95199, 116.29751 | \n",
" Duis sit amet diam eu | \n",
" 11 | \n",
" â¬15,764 | \n",
" Jameson H. Craig | \n",
" (07) 8640 5274 | \n",
" Fairbanks | \n",
" -62.95199 | \n",
" 116.29751 | \n",
"
\n",
" \n",
" 11 | \n",
" -42.26793, -92.23605 | \n",
" et netus et malesuada fames | \n",
" 12 | \n",
" â¬18,460 | \n",
" Michael Randall | \n",
" (04) 3451 8606 | \n",
" Mezzana | \n",
" -42.26793 | \n",
" -92.23605 | \n",
"
\n",
" \n",
" 12 | \n",
" -20.39897, 152.4122 | \n",
" id risus quis diam luctus | \n",
" 13 | \n",
" â¬6,175 | \n",
" Sierra Rivers | \n",
" (01) 2020 4511 | \n",
" Lugnano in Teverina | \n",
" -20.39897 | \n",
" 152.41220 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Coordinates Description ID \\\n",
"0 23.28054, -24.48755 sed, sapien. Nunc pulvinar arcu 1 \n",
"1 -77.76329, 69.22339 consectetuer, cursus et, magna. Praesent 2 \n",
"10 -62.95199, 116.29751 Duis sit amet diam eu 11 \n",
"11 -42.26793, -92.23605 et netus et malesuada fames 12 \n",
"12 -20.39897, 152.4122 id risus quis diam luctus 13 \n",
"\n",
" Income Names Phone \\\n",
"0 â¬6,223 Lydia Z. Flores (02) 0548 5995 \n",
"1 â¬14,967 Bell X. Guerra (07) 8599 9926 \n",
"10 â¬15,764 Jameson H. Craig (07) 8640 5274 \n",
"11 â¬18,460 Michael Randall (04) 3451 8606 \n",
"12 â¬6,175 Sierra Rivers (01) 2020 4511 \n",
"\n",
" Town lat lon \n",
"0 Haddington 23.28054 -24.48755 \n",
"1 Montes Claros -77.76329 69.22339 \n",
"10 Fairbanks -62.95199 116.29751 \n",
"11 Mezzana -42.26793 -92.23605 \n",
"12 Lugnano in Teverina -20.39897 152.41220 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pd.read_json accepts a file path directly; manually reading the file and\n",
"# encoding the text back to bytes is unnecessary and risks double-encoding\n",
"# non-ASCII characters.\n",
"dfjson = pd.read_json('temp/converted_json.json')\n",
"dfjson.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the import reordered columns in alphabetical order, we can choose a preferred column order:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ID | \n",
" Names | \n",
" Phone | \n",
" Income | \n",
" Town | \n",
" lat | \n",
" lon | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Lydia Z. Flores | \n",
" (02) 0548 5995 | \n",
" â¬6,223 | \n",
" Haddington | \n",
" 23.28054 | \n",
" -24.48755 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Bell X. Guerra | \n",
" (07) 8599 9926 | \n",
" â¬14,967 | \n",
" Montes Claros | \n",
" -77.76329 | \n",
" 69.22339 | \n",
"
\n",
" \n",
" 10 | \n",
" 11 | \n",
" Jameson H. Craig | \n",
" (07) 8640 5274 | \n",
" â¬15,764 | \n",
" Fairbanks | \n",
" -62.95199 | \n",
" 116.29751 | \n",
"
\n",
" \n",
" 11 | \n",
" 12 | \n",
" Michael Randall | \n",
" (04) 3451 8606 | \n",
" â¬18,460 | \n",
" Mezzana | \n",
" -42.26793 | \n",
" -92.23605 | \n",
"
\n",
" \n",
" 12 | \n",
" 13 | \n",
" Sierra Rivers | \n",
" (01) 2020 4511 | \n",
" â¬6,175 | \n",
" Lugnano in Teverina | \n",
" -20.39897 | \n",
" 152.41220 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ID Names Phone Income \\\n",
"0 1 Lydia Z. Flores (02) 0548 5995 â¬6,223 \n",
"1 2 Bell X. Guerra (07) 8599 9926 â¬14,967 \n",
"10 11 Jameson H. Craig (07) 8640 5274 â¬15,764 \n",
"11 12 Michael Randall (04) 3451 8606 â¬18,460 \n",
"12 13 Sierra Rivers (01) 2020 4511 â¬6,175 \n",
"\n",
" Town lat lon \n",
"0 Haddington 23.28054 -24.48755 \n",
"1 Montes Claros -77.76329 69.22339 \n",
"10 Fairbanks -62.95199 116.29751 \n",
"11 Mezzana -42.26793 -92.23605 \n",
"12 Lugnano in Teverina -20.39897 152.41220 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfjson = dfjson.loc[:, ['ID', 'Names', 'Phone', 'Income', 'Town', 'lat', 'lon']]\n",
"dfjson.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4 Importing HTML"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: `read_html` returns a **list** of DataFrame objects, even if there is only a single table contained in the `HTML` content. The `infer_types` argument (since removed from pandas) avoided automatic detection of numeric and date types, which used to generate an error with the coordinate columns."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Nomi maschili | \n",
" Strade | \n",
" Città | \n",
" Coordinate | \n",
" Ente | \n",
" mail | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Drew | \n",
" 5672 Accumsan Road | \n",
" Whitburn | \n",
" -54.68429, -67.21709 | \n",
" Ac Consulting | \n",
" a.aliquet.vel@vitae.co.uk | \n",
"
\n",
" \n",
" 1 | \n",
" Finn | \n",
" Ap #170-4074 Interdum Rd. | \n",
" Pelotas | \n",
" 81.93087, 168.14556 | \n",
" Varius Incorporated | \n",
" nec.malesuada.ut@primisin.com | \n",
"
\n",
" \n",
" 2 | \n",
" Martin | \n",
" Ap #471-6260 Etiam Ave | \n",
" Aliano | \n",
" 33.99272, -148.70584 | \n",
" Arcu Industries | \n",
" lacus@risus.net | \n",
"
\n",
" \n",
" 3 | \n",
" Zephania | \n",
" Ap #720-7951 Aliquam Ave | \n",
" Senneville | \n",
" -56.82569, -72.80435 | \n",
" Tristique PC | \n",
" arcu@sempereratin.co.uk | \n",
"
\n",
" \n",
" 4 | \n",
" Devin | \n",
" 3538 Dui St. | \n",
" Zwettl-Niederösterreich | \n",
" -11.90874, -149.61123 | \n",
" Imperdiet Dictum PC | \n",
" lectus.rutrum.urna@massaQuisqueporttitor.ca | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Nomi maschili Strade Città \\\n",
"0 Drew 5672 Accumsan Road Whitburn \n",
"1 Finn Ap #170-4074 Interdum Rd. Pelotas \n",
"2 Martin Ap #471-6260 Etiam Ave Aliano \n",
"3 Zephania Ap #720-7951 Aliquam Ave Senneville \n",
"4 Devin 3538 Dui St. Zwettl-Niederösterreich \n",
"\n",
" Coordinate Ente \\\n",
"0 -54.68429, -67.21709 Ac Consulting \n",
"1 81.93087, 168.14556 Varius Incorporated \n",
"2 33.99272, -148.70584 Arcu Industries \n",
"3 -56.82569, -72.80435 Tristique PC \n",
"4 -11.90874, -149.61123 Imperdiet Dictum PC \n",
"\n",
" mail \n",
"0 a.aliquet.vel@vitae.co.uk \n",
"1 nec.malesuada.ut@primisin.com \n",
"2 lacus@risus.net \n",
"3 arcu@sempereratin.co.uk \n",
"4 lectus.rutrum.urna@massaQuisqueporttitor.ca "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfhtml = pd.read_html('example_data/generated.html', header=0)\n",
"dfhtml[0].head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5 Importing Excel"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Names | \n",
" Streets | \n",
" Town | \n",
" Coordinates | \n",
" Corporation | \n",
" mail | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Stephen | \n",
" Ap #389-3365 Risus, St. | \n",
" Lelystad | \n",
" 51.91783, -47.01037 | \n",
" Consectetuer Rhoncus Nullam Corporation | \n",
" Suspendisse.sed@pedePraesenteu.org | \n",
"
\n",
" \n",
" 1 | \n",
" Kenneth | \n",
" 120-6483 Ligula. Ave | \n",
" Labrecque | \n",
" -1.92625, 10.02451 | \n",
" A Corporation | \n",
" orci@egetmollislectus.com | \n",
"
\n",
" \n",
" 2 | \n",
" Leo | \n",
" Ap #727-2085 Eget Av. | \n",
" Strathcona County | \n",
" -17.31839, 137.99307 | \n",
" Sed Limited | \n",
" Etiam@vel.org | \n",
"
\n",
" \n",
" 3 | \n",
" Joshua | \n",
" P.O. Box 425, 6462 Arcu Rd. | \n",
" Municipal District | \n",
" -51.34642, 80.32145 | \n",
" Mauris Sapien Cursus Corp. | \n",
" lorem@ornarelectusante.net | \n",
"
\n",
" \n",
" 4 | \n",
" Zephania | \n",
" 655 Et, St. | \n",
" Couillet | \n",
" 29.96525, 124.18391 | \n",
" Odio Semper Cursus Corp. | \n",
" metus@a.ca | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Names Streets Town \\\n",
"0 Stephen Ap #389-3365 Risus, St. Lelystad \n",
"1 Kenneth 120-6483 Ligula. Ave Labrecque \n",
"2 Leo Ap #727-2085 Eget Av. Strathcona County \n",
"3 Joshua P.O. Box 425, 6462 Arcu Rd. Municipal District \n",
"4 Zephania 655 Et, St. Couillet \n",
"\n",
" Coordinates Corporation \\\n",
"0 51.91783, -47.01037 Consectetuer Rhoncus Nullam Corporation \n",
"1 -1.92625, 10.02451 A Corporation \n",
"2 -17.31839, 137.99307 Sed Limited \n",
"3 -51.34642, 80.32145 Mauris Sapien Cursus Corp. \n",
"4 29.96525, 124.18391 Odio Semper Cursus Corp. \n",
"\n",
" mail \n",
"0 Suspendisse.sed@pedePraesenteu.org \n",
"1 orci@egetmollislectus.com \n",
"2 Etiam@vel.org \n",
"3 lorem@ornarelectusante.net \n",
"4 metus@a.ca "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfxl = pd.read_excel('example_data/generated2.xls', 'foglio')\n",
"dfxl.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6 Working with SQL and databases"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 6.1 Write SQL"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's store the DataFrame opened from excel in a database. We use SQLite, a database engine library suitable for storing data in a single-file database. 'Names' is the name we chose for the database table we are creating:"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"con = sqlite3.connect(\"temp.sql\")\n",
"sqlfile = dfxl.to_sql('Names', con)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 6.2 Import SQL"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(100, 7)\n"
]
}
],
"source": [
"con = sqlite3.connect('temp.sql')\n",
"# NOTE: sqlite3's connection context manager scopes a transaction; it does\n",
"# NOT close the connection (that is done explicitly in a later cell).\n",
"with con:\n",
"    sql = \"SELECT * FROM Names;\"\n",
"    # Use the public pd.read_sql API rather than the internal pandas.io.sql\n",
"    # module imported as psql.\n",
"    df = pd.read_sql(sql, con)\n",
"    print(df.shape)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" Names | \n",
" Streets | \n",
" Town | \n",
" Coordinates | \n",
" Corporation | \n",
" mail | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" Stephen | \n",
" Ap #389-3365 Risus, St. | \n",
" Lelystad | \n",
" 51.91783, -47.01037 | \n",
" Consectetuer Rhoncus Nullam Corporation | \n",
" Suspendisse.sed@pedePraesenteu.org | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" Kenneth | \n",
" 120-6483 Ligula. Ave | \n",
" Labrecque | \n",
" -1.92625, 10.02451 | \n",
" A Corporation | \n",
" orci@egetmollislectus.com | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" Leo | \n",
" Ap #727-2085 Eget Av. | \n",
" Strathcona County | \n",
" -17.31839, 137.99307 | \n",
" Sed Limited | \n",
" Etiam@vel.org | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" Joshua | \n",
" P.O. Box 425, 6462 Arcu Rd. | \n",
" Municipal District | \n",
" -51.34642, 80.32145 | \n",
" Mauris Sapien Cursus Corp. | \n",
" lorem@ornarelectusante.net | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" Zephania | \n",
" 655 Et, St. | \n",
" Couillet | \n",
" 29.96525, 124.18391 | \n",
" Odio Semper Cursus Corp. | \n",
" metus@a.ca | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index Names Streets Town \\\n",
"0 0 Stephen Ap #389-3365 Risus, St. Lelystad \n",
"1 1 Kenneth 120-6483 Ligula. Ave Labrecque \n",
"2 2 Leo Ap #727-2085 Eget Av. Strathcona County \n",
"3 3 Joshua P.O. Box 425, 6462 Arcu Rd. Municipal District \n",
"4 4 Zephania 655 Et, St. Couillet \n",
"\n",
" Coordinates Corporation \\\n",
"0 51.91783, -47.01037 Consectetuer Rhoncus Nullam Corporation \n",
"1 -1.92625, 10.02451 A Corporation \n",
"2 -17.31839, 137.99307 Sed Limited \n",
"3 -51.34642, 80.32145 Mauris Sapien Cursus Corp. \n",
"4 29.96525, 124.18391 Odio Semper Cursus Corp. \n",
"\n",
" mail \n",
"0 Suspendisse.sed@pedePraesenteu.org \n",
"1 orci@egetmollislectus.com \n",
"2 Etiam@vel.org \n",
"3 lorem@ornarelectusante.net \n",
"4 metus@a.ca "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"con.close()\n",
"os.remove(\"temp.sql\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7 Working with HDF5"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7.1 Storer format"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**HDFStore** is a dict-like object used by pandas to store datasets as **HDF5** files using the **PyTables** library. **HDF5** is a scientific hierarchical data format suitable for storing very large, multi-dimensional data arrays in a single file. The **Storer** (fixed) format stores fixed arrays, which are not queryable and must be retrieved in their entirety."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Add DataFrames to the HDFStore object:"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"samples_01 = int(3e5)\n",
"samples_02 = int(1e5)\n",
"idx1 = pd.date_range('1/1/2000 12:00:00', periods=samples_01, freq='50ms', tz='Europe/Rome')\n",
"idx2 = pd.date_range('1/1/2000 12:05:00', periods=samples_02, freq='100ms', tz='Europe/Rome')\n",
"randn = np.random.randn"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df1 = pd.DataFrame(randn(samples_01, 3), index=idx1, columns=['A', 'B', 'C'])\n",
"df2 = pd.DataFrame(randn(samples_02, 4), index=idx2, columns=['A', 'B', 'C', 'D'])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Size of the Dataset: 13.73291015625 MB\n"
]
}
],
"source": [
"# Report the combined in-memory size of both DataFrames in MiB\n",
"# (df1 + df2 — not df1 counted twice).\n",
"print('Size of the Dataset: ', (df1.values.nbytes + df2.values.nbytes)/2**20, ' MB')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20.4 ms ± 612 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"with pd.HDFStore('temp/store53.h5') as store:\n",
" store.put('storer/df1', df1)\n",
" store.put('storer/df2', df2)\n",
" store.put('to_remove', df2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Retrieve stored objects:"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['/', '/to_remove', '/storer/df1', '/storer/df2', '/table/df1_appended']\n",
" A B C\n",
"2000-01-01 12:00:00.050000+01:00 -1.224140 0.058376 2.181400\n",
"2000-01-01 12:00:00.100000+01:00 0.461911 0.018024 -0.165891\n",
" A B C\n",
"2000-01-01 12:00:00.050000+01:00 -1.224140 0.058376 2.181400\n",
"2000-01-01 12:00:00.100000+01:00 0.461911 0.018024 -0.165891\n",
"Check retrieved data equal to original data: \n",
"True\n"
]
}
],
"source": [
"with pd.HDFStore('temp/store53.h5') as store:\n",
" print (store.keys())\n",
" df1_retrieved = store.get('storer/df1')\n",
" print (df1_retrieved[1:3])\n",
" print (df1[1:3])\n",
" print ('Check retrieved data equal to original data: ')\n",
" print (df1_retrieved[1:3].equals(df1[1:3]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Delete objects:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"File path: temp/store53.h5\n",
"\n"
]
}
],
"source": [
"with pd.HDFStore('temp/store53.h5') as store:\n",
"    try:\n",
"        store.remove('to_remove')\n",
"    except KeyError:\n",
"        # The key may already have been removed in a previous run; any other\n",
"        # error (I/O failure, corrupt store) should propagate, not be hidden\n",
"        # by a bare except.\n",
"        pass\n",
"    print(store)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7.2 Table format"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The table format conceptually is shaped very much like a DataFrame and may be appended to in the same or other sessions. In addition, delete & query type operations are supported."
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with pd.HDFStore('temp/store53.h5') as store:\n",
" # store.append creates a table automatically:\n",
" store.append('table/df1_appended', df1.iloc[:10000])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with pd.HDFStore('temp/store53.h5') as store:\n",
"    # iloc slices are half-open, so each chunk must start exactly where the\n",
"    # previous one ended; the original 10001/20001 starts silently dropped\n",
"    # rows 10000 and 20000 from the appended table.\n",
"    store.append('table/df1_appended', df1.iloc[10000:20000])\n",
"    store.append('table/df1_appended', df1.iloc[20000:50000])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7.3 Querying a Table"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Query the table using boolean expression with in-line function evaluation:"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" \n",
" \n",
" 2000-01-01 12:00:00.200000+01:00 | \n",
" -0.516980 | \n",
" 0.231047 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.250000+01:00 | \n",
" 0.151773 | \n",
" 1.263926 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.300000+01:00 | \n",
" -0.221863 | \n",
" 2.628103 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.350000+01:00 | \n",
" 0.454621 | \n",
" -0.378720 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.200000+01:00 | \n",
" -3.104964 | \n",
" -1.301010 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.250000+01:00 | \n",
" 0.883893 | \n",
" 0.319630 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.300000+01:00 | \n",
" 0.780367 | \n",
" -0.921197 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.350000+01:00 | \n",
" 1.117064 | \n",
" -0.678611 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.200000+01:00 | \n",
" 1.091494 | \n",
" -1.163870 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.250000+01:00 | \n",
" -0.131051 | \n",
" -1.156213 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.300000+01:00 | \n",
" 0.773293 | \n",
" -1.646943 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.350000+01:00 | \n",
" -0.293174 | \n",
" 0.268033 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.200000+01:00 | \n",
" 0.209013 | \n",
" 0.178377 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.250000+01:00 | \n",
" 0.714508 | \n",
" -0.081073 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.300000+01:00 | \n",
" -0.115000 | \n",
" -1.940541 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.350000+01:00 | \n",
" -0.317933 | \n",
" 0.245165 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.200000+01:00 | \n",
" 1.088952 | \n",
" -0.470424 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.250000+01:00 | \n",
" -1.921369 | \n",
" -1.620379 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.300000+01:00 | \n",
" 0.799032 | \n",
" 0.974941 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.350000+01:00 | \n",
" 0.839663 | \n",
" 0.301037 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"2000-01-01 12:00:00.200000+01:00 -0.516980 0.231047\n",
"2000-01-01 12:00:00.250000+01:00 0.151773 1.263926\n",
"2000-01-01 12:00:00.300000+01:00 -0.221863 2.628103\n",
"2000-01-01 12:00:00.350000+01:00 0.454621 -0.378720\n",
"2000-01-01 12:00:00.200000+01:00 -3.104964 -1.301010\n",
"2000-01-01 12:00:00.250000+01:00 0.883893 0.319630\n",
"2000-01-01 12:00:00.300000+01:00 0.780367 -0.921197\n",
"2000-01-01 12:00:00.350000+01:00 1.117064 -0.678611\n",
"2000-01-01 12:00:00.200000+01:00 1.091494 -1.163870\n",
"2000-01-01 12:00:00.250000+01:00 -0.131051 -1.156213\n",
"2000-01-01 12:00:00.300000+01:00 0.773293 -1.646943\n",
"2000-01-01 12:00:00.350000+01:00 -0.293174 0.268033\n",
"2000-01-01 12:00:00.200000+01:00 0.209013 0.178377\n",
"2000-01-01 12:00:00.250000+01:00 0.714508 -0.081073\n",
"2000-01-01 12:00:00.300000+01:00 -0.115000 -1.940541\n",
"2000-01-01 12:00:00.350000+01:00 -0.317933 0.245165\n",
"2000-01-01 12:00:00.200000+01:00 1.088952 -0.470424\n",
"2000-01-01 12:00:00.250000+01:00 -1.921369 -1.620379\n",
"2000-01-01 12:00:00.300000+01:00 0.799032 0.974941\n",
"2000-01-01 12:00:00.350000+01:00 0.839663 0.301037"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with pd.HDFStore('temp/store53.h5') as store:\n",
" query01 = store.select('table/df1_appended',\n",
" \"index>=Timestamp('2000-01-01 12:00:00.20+01:00') \\\n",
" & index) for more tutorials and updates.\n",
"\n",
"This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:addfor_tutorials]",
"language": "python",
"name": "conda-env-addfor_tutorials-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}