{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# pandas I/O tools and examples" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import addutils.toc ; addutils.toc.js(ipy_notebook=True)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from addutils import css_notebook\n", "css_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1 Matlab Variables" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.1 Import a Matlab variable from file" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "import os \n", "import scipy.io\n", "import numpy as np\n", "import pandas as pd\n", "import sqlite3\n", "#from pandas.io.data import DataFrame, read_csv\n", "import pandas.io.sql as psql\n", "from time import time\n", "from IPython.display import display\n", "from addutils import css_notebook\n", "css_notebook()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import sys" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Import from '.mat' files" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "x = scipy.io.loadmat('example_data/matlab_variable.mat')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 92, 99, 1, 8, 15, 67, 74, 51, 58, 40],\n", " [ 98, 80, 7, 14, 16, 73, 55, 57, 64, 41],\n", " [ 4, 81, 88, 20, 22, 54, 56, 63, 70, 47],\n", " [ 85, 87, 19, 21, 3, 60, 62, 69, 71, 28],\n", " [ 86, 93, 25, 2, 9, 61, 68, 75, 52, 34],\n", " [ 17, 24, 76, 83, 90, 42, 49, 26, 33, 65],\n", " [ 23, 5, 82, 89, 91, 48, 30, 32, 39, 66],\n", " [ 79, 6, 13, 95, 97, 29, 31, 38, 45, 72],\n", " [ 10, 12, 94, 96, 78, 35, 37, 44, 46, 53],\n", " [ 11, 18, 100, 77, 84, 36, 43, 50, 27, 59]], dtype=uint8)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pyA = x['a']\n", "pyA" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The Matlab variable is passed to a pandas DataFrame:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789
0929918156774515840
19880714167355576441
24818820225456637047
38587192136062697128
4869325296168755234
517247683904249263365
62358289914830323966
77961395972931384572
810129496783537444653
9111810077843643502759
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9\n", "0 92 99 1 8 15 67 74 51 58 40\n", "1 98 80 7 14 16 73 55 57 64 41\n", "2 4 81 88 20 22 54 56 63 70 47\n", "3 85 87 19 21 3 60 62 69 71 28\n", "4 86 93 25 2 9 61 68 75 52 34\n", "5 17 24 76 83 90 42 49 26 33 65\n", "6 23 5 82 89 91 48 30 32 39 66\n", "7 79 6 13 95 97 29 31 38 45 72\n", "8 10 12 94 96 78 35 37 44 46 53\n", "9 11 18 100 77 84 36 43 50 27 59" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(pyA)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2 Importing a compressed CSV" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following example shows how to import directly a compressed csv file, in this case with multiple separators:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDNamesPhoneTownDescriptionIncomeCoordinates
01Lydia Z. Flores(02) 0548 5995Haddingtonsed, sapien. Nunc pulvinar arcu€6,22323.28054, -24.48755
12Bell X. Guerra(07) 8599 9926Montes Clarosconsectetuer, cursus et, magna. Praesent€14,967-77.76329, 69.22339
23Kirk Q. Bowman(06) 4153 7501Caglitortor. Nunc commodo auctor velit.€17,399-58.80037, 22.50537
34Karen Pickett(02) 3216 9708Cobourgat auctor ullamcorper, nisl arcu€17,37375.73982, -78.01872
45Vance J. Johnson(01) 7568 6371Carltonultricies adipiscing, enim mi tempor€9,025-86.51337, 109.46298
\n", "
" ], "text/plain": [ " ID Names Phone Town \\\n", "0 1 Lydia Z. Flores (02) 0548 5995 Haddington \n", "1 2 Bell X. Guerra (07) 8599 9926 Montes Claros \n", "2 3 Kirk Q. Bowman (06) 4153 7501 Cagli \n", "3 4 Karen Pickett (02) 3216 9708 Cobourg \n", "4 5 Vance J. Johnson (01) 7568 6371 Carlton \n", "\n", " Description Income \\\n", "0 sed, sapien. Nunc pulvinar arcu €6,223 \n", "1 consectetuer, cursus et, magna. Praesent €14,967 \n", "2 tortor. Nunc commodo auctor velit. €17,399 \n", "3 at auctor ullamcorper, nisl arcu €17,373 \n", "4 ultricies adipiscing, enim mi tempor €9,025 \n", "\n", " Coordinates \n", "0 23.28054, -24.48755 \n", "1 -77.76329, 69.22339 \n", "2 -58.80037, 22.50537 \n", "3 75.73982, -78.01872 \n", "4 -86.51337, 109.46298 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_csv = pd.read_csv('example_data/pd12_peopl.csv.gz', sep=r'\\,\\;\\.', \n", " skipinitialspace=True, compression='gzip', engine='python',\n", " encoding='iso8859_15')\n", "df_csv.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**TODO**: utf-8 has problems decoding euro sign. \"\\u00e2\\u0082\\u00ac\" are the three chars you get when the UTF-8 encoded \\u20ac (EURO SIGN) is mistakenly interpreted as ISO-8859-1." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDNamesPhoneTownDescriptionIncomeCoordinateslatlon
01Lydia Z. Flores(02) 0548 5995Haddingtonsed, sapien. Nunc pulvinar arcu€6,22323.28054, -24.4875523.28054-24.48755
12Bell X. Guerra(07) 8599 9926Montes Clarosconsectetuer, cursus et, magna. Praesent€14,967-77.76329, 69.22339-77.7632969.22339
23Kirk Q. Bowman(06) 4153 7501Caglitortor. Nunc commodo auctor velit.€17,399-58.80037, 22.50537-58.8003722.50537
34Karen Pickett(02) 3216 9708Cobourgat auctor ullamcorper, nisl arcu€17,37375.73982, -78.0187275.73982-78.01872
45Vance J. Johnson(01) 7568 6371Carltonultricies adipiscing, enim mi tempor€9,025-86.51337, 109.46298-86.51337109.46298
\n", "
" ], "text/plain": [ " ID Names Phone Town \\\n", "0 1 Lydia Z. Flores (02) 0548 5995 Haddington \n", "1 2 Bell X. Guerra (07) 8599 9926 Montes Claros \n", "2 3 Kirk Q. Bowman (06) 4153 7501 Cagli \n", "3 4 Karen Pickett (02) 3216 9708 Cobourg \n", "4 5 Vance J. Johnson (01) 7568 6371 Carlton \n", "\n", " Description Income \\\n", "0 sed, sapien. Nunc pulvinar arcu €6,223 \n", "1 consectetuer, cursus et, magna. Praesent €14,967 \n", "2 tortor. Nunc commodo auctor velit. €17,399 \n", "3 at auctor ullamcorper, nisl arcu €17,373 \n", "4 ultricies adipiscing, enim mi tempor €9,025 \n", "\n", " Coordinates lat lon \n", "0 23.28054, -24.48755 23.28054 -24.48755 \n", "1 -77.76329, 69.22339 -77.76329 69.22339 \n", "2 -58.80037, 22.50537 -58.80037 22.50537 \n", "3 75.73982, -78.01872 75.73982 -78.01872 \n", "4 -86.51337, 109.46298 -86.51337 109.46298 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coor = df_csv['Coordinates']\n", "df_csv['lat'] = ''\n", "df_csv['lon'] = ''\n", "for j, coo in enumerate(coor):\n", " spl = re.split(',', str(coo))\n", " df_csv.loc[j, 'lat'] = spl[0] \n", " df_csv.loc[j, 'lon'] = spl[1]\n", "df_csv.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df_csv.to_json('temp/converted_json.json')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3 Importing JSON files" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CoordinatesDescriptionIDIncomeNamesPhoneTownlatlon
023.28054, -24.48755sed, sapien. Nunc pulvinar arcu1€6,223Lydia Z. Flores(02) 0548 5995Haddington23.28054-24.48755
1-77.76329, 69.22339consectetuer, cursus et, magna. Praesent2€14,967Bell X. Guerra(07) 8599 9926Montes Claros-77.7632969.22339
10-62.95199, 116.29751Duis sit amet diam eu11€15,764Jameson H. Craig(07) 8640 5274Fairbanks-62.95199116.29751
11-42.26793, -92.23605et netus et malesuada fames12€18,460Michael Randall(04) 3451 8606Mezzana-42.26793-92.23605
12-20.39897, 152.4122id risus quis diam luctus13€6,175Sierra Rivers(01) 2020 4511Lugnano in Teverina-20.39897152.41220
\n", "
" ], "text/plain": [ " Coordinates Description ID \\\n", "0 23.28054, -24.48755 sed, sapien. Nunc pulvinar arcu 1 \n", "1 -77.76329, 69.22339 consectetuer, cursus et, magna. Praesent 2 \n", "10 -62.95199, 116.29751 Duis sit amet diam eu 11 \n", "11 -42.26793, -92.23605 et netus et malesuada fames 12 \n", "12 -20.39897, 152.4122 id risus quis diam luctus 13 \n", "\n", " Income Names Phone \\\n", "0 €6,223 Lydia Z. Flores (02) 0548 5995 \n", "1 €14,967 Bell X. Guerra (07) 8599 9926 \n", "10 €15,764 Jameson H. Craig (07) 8640 5274 \n", "11 €18,460 Michael Randall (04) 3451 8606 \n", "12 €6,175 Sierra Rivers (01) 2020 4511 \n", "\n", " Town lat lon \n", "0 Haddington 23.28054 -24.48755 \n", "1 Montes Claros -77.76329 69.22339 \n", "10 Fairbanks -62.95199 116.29751 \n", "11 Mezzana -42.26793 -92.23605 \n", "12 Lugnano in Teverina -20.39897 152.41220 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with open('temp/converted_json.json') as f:\n", " data = f.read().encode('utf-8')\n", "dfjson = pd.read_json(data)\n", "dfjson.head()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Since the import reordered columns in alphabetical order, we can choose a preferred column order:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDNamesPhoneIncomeTownlatlon
01Lydia Z. Flores(02) 0548 5995€6,223Haddington23.28054-24.48755
12Bell X. Guerra(07) 8599 9926€14,967Montes Claros-77.7632969.22339
1011Jameson H. Craig(07) 8640 5274€15,764Fairbanks-62.95199116.29751
1112Michael Randall(04) 3451 8606€18,460Mezzana-42.26793-92.23605
1213Sierra Rivers(01) 2020 4511€6,175Lugnano in Teverina-20.39897152.41220
\n", "
" ], "text/plain": [ " ID Names Phone Income \\\n", "0 1 Lydia Z. Flores (02) 0548 5995 €6,223 \n", "1 2 Bell X. Guerra (07) 8599 9926 €14,967 \n", "10 11 Jameson H. Craig (07) 8640 5274 €15,764 \n", "11 12 Michael Randall (04) 3451 8606 €18,460 \n", "12 13 Sierra Rivers (01) 2020 4511 €6,175 \n", "\n", " Town lat lon \n", "0 Haddington 23.28054 -24.48755 \n", "1 Montes Claros -77.76329 69.22339 \n", "10 Fairbanks -62.95199 116.29751 \n", "11 Mezzana -42.26793 -92.23605 \n", "12 Lugnano in Teverina -20.39897 152.41220 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfjson = dfjson.ix[:, ['ID', 'Names', 'Phone', 'Income', 'Town', 'lat', 'lon']]\n", "dfjson.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4 Importing HTML" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: `read_html` returns a **list** of DataFrame objects, even if there is only a single table contained in the `HTML` content. Infer_types avoids the function trying to automatically detect numeric and date types (this generated an error with coordinates)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Nomi maschiliStradeCittàCoordinateEntemail
0Drew5672 Accumsan RoadWhitburn-54.68429, -67.21709Ac Consultinga.aliquet.vel@vitae.co.uk
1FinnAp #170-4074 Interdum Rd.Pelotas81.93087, 168.14556Varius Incorporatednec.malesuada.ut@primisin.com
2MartinAp #471-6260 Etiam AveAliano33.99272, -148.70584Arcu Industrieslacus@risus.net
3ZephaniaAp #720-7951 Aliquam AveSenneville-56.82569, -72.80435Tristique PCarcu@sempereratin.co.uk
4Devin3538 Dui St.Zwettl-Niederösterreich-11.90874, -149.61123Imperdiet Dictum PClectus.rutrum.urna@massaQuisqueporttitor.ca
\n", "
" ], "text/plain": [ " Nomi maschili Strade Città \\\n", "0 Drew 5672 Accumsan Road Whitburn \n", "1 Finn Ap #170-4074 Interdum Rd. Pelotas \n", "2 Martin Ap #471-6260 Etiam Ave Aliano \n", "3 Zephania Ap #720-7951 Aliquam Ave Senneville \n", "4 Devin 3538 Dui St. Zwettl-Niederösterreich \n", "\n", " Coordinate Ente \\\n", "0 -54.68429, -67.21709 Ac Consulting \n", "1 81.93087, 168.14556 Varius Incorporated \n", "2 33.99272, -148.70584 Arcu Industries \n", "3 -56.82569, -72.80435 Tristique PC \n", "4 -11.90874, -149.61123 Imperdiet Dictum PC \n", "\n", " mail \n", "0 a.aliquet.vel@vitae.co.uk \n", "1 nec.malesuada.ut@primisin.com \n", "2 lacus@risus.net \n", "3 arcu@sempereratin.co.uk \n", "4 lectus.rutrum.urna@massaQuisqueporttitor.ca " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfhtml = pd.read_html('example_data/generated.html', header=0)\n", "dfhtml[0].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5 Importing Excel" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NamesStreetsTownCoordinatesCorporationmail
0StephenAp #389-3365 Risus, St.Lelystad51.91783, -47.01037Consectetuer Rhoncus Nullam CorporationSuspendisse.sed@pedePraesenteu.org
1Kenneth120-6483 Ligula. AveLabrecque-1.92625, 10.02451A Corporationorci@egetmollislectus.com
2LeoAp #727-2085 Eget Av.Strathcona County-17.31839, 137.99307Sed LimitedEtiam@vel.org
3JoshuaP.O. Box 425, 6462 Arcu Rd.Municipal District-51.34642, 80.32145Mauris Sapien Cursus Corp.lorem@ornarelectusante.net
4Zephania655 Et, St.Couillet29.96525, 124.18391Odio Semper Cursus Corp.metus@a.ca
\n", "
" ], "text/plain": [ " Names Streets Town \\\n", "0 Stephen Ap #389-3365 Risus, St. Lelystad \n", "1 Kenneth 120-6483 Ligula. Ave Labrecque \n", "2 Leo Ap #727-2085 Eget Av. Strathcona County \n", "3 Joshua P.O. Box 425, 6462 Arcu Rd. Municipal District \n", "4 Zephania 655 Et, St. Couillet \n", "\n", " Coordinates Corporation \\\n", "0 51.91783, -47.01037 Consectetuer Rhoncus Nullam Corporation \n", "1 -1.92625, 10.02451 A Corporation \n", "2 -17.31839, 137.99307 Sed Limited \n", "3 -51.34642, 80.32145 Mauris Sapien Cursus Corp. \n", "4 29.96525, 124.18391 Odio Semper Cursus Corp. \n", "\n", " mail \n", "0 Suspendisse.sed@pedePraesenteu.org \n", "1 orci@egetmollislectus.com \n", "2 Etiam@vel.org \n", "3 lorem@ornarelectusante.net \n", "4 metus@a.ca " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfxl = pd.read_excel('example_data/generated2.xls', 'foglio')\n", "dfxl.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6 Working with SQL and databases" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.1 Write SQL" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's store the DataFrame opened from excel in a database. We use SQLite, a database engine library suitable for storing data in a single-file database. 'Names' is the name we chose for the database table we are creating:" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "con = sqlite3.connect(\"temp.sql\")\n", "sqlfile = dfxl.to_sql('Names', con, flavor='sqlite')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.2 Import SQL" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(100, 7)\n" ] } ], "source": [ "con = sqlite3.connect('temp.sql')\n", "with con:\n", " sql = \"SELECT * FROM Names;\"\n", " df = psql.read_sql(sql, con)\n", " print(df.shape)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexNamesStreetsTownCoordinatesCorporationmail
00StephenAp #389-3365 Risus, St.Lelystad51.91783, -47.01037Consectetuer Rhoncus Nullam CorporationSuspendisse.sed@pedePraesenteu.org
11Kenneth120-6483 Ligula. AveLabrecque-1.92625, 10.02451A Corporationorci@egetmollislectus.com
22LeoAp #727-2085 Eget Av.Strathcona County-17.31839, 137.99307Sed LimitedEtiam@vel.org
33JoshuaP.O. Box 425, 6462 Arcu Rd.Municipal District-51.34642, 80.32145Mauris Sapien Cursus Corp.lorem@ornarelectusante.net
44Zephania655 Et, St.Couillet29.96525, 124.18391Odio Semper Cursus Corp.metus@a.ca
\n", "
" ], "text/plain": [ " index Names Streets Town \\\n", "0 0 Stephen Ap #389-3365 Risus, St. Lelystad \n", "1 1 Kenneth 120-6483 Ligula. Ave Labrecque \n", "2 2 Leo Ap #727-2085 Eget Av. Strathcona County \n", "3 3 Joshua P.O. Box 425, 6462 Arcu Rd. Municipal District \n", "4 4 Zephania 655 Et, St. Couillet \n", "\n", " Coordinates Corporation \\\n", "0 51.91783, -47.01037 Consectetuer Rhoncus Nullam Corporation \n", "1 -1.92625, 10.02451 A Corporation \n", "2 -17.31839, 137.99307 Sed Limited \n", "3 -51.34642, 80.32145 Mauris Sapien Cursus Corp. \n", "4 29.96525, 124.18391 Odio Semper Cursus Corp. \n", "\n", " mail \n", "0 Suspendisse.sed@pedePraesenteu.org \n", "1 orci@egetmollislectus.com \n", "2 Etiam@vel.org \n", "3 lorem@ornarelectusante.net \n", "4 metus@a.ca " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [], "source": [ "con.close()\n", "os.remove(\"temp.sql\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7 Working with HDF5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 7.1 Storer format" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**HDFStore** is a dict-like object used by pandas to store datasets as **HDF5** files using the **PyTables** library. **HDF5** is a scientific hierarchical data format suitable for storing in a file very large and multi-dimensional data arrays. The **Storer** format stores fixed arrays, which are queryiable and must be retrieved in their entirety." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Add DataFrames to the HDFStore object:" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [], "source": [ "samples_01 = int(3e5)\n", "samples_02 = int(1e5)\n", "idx1 = pd.date_range('1/1/2000 12:00:00', periods=samples_01, freq='50ms', tz='Europe/Rome')\n", "idx2 = pd.date_range('1/1/2000 12:05:00', periods=samples_02, freq='100ms', tz='Europe/Rome')\n", "randn = np.random.randn" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df1 = pd.DataFrame(randn(samples_01, 3), index=idx1, columns=['A', 'B', 'C'])\n", "df2 = pd.DataFrame(randn(samples_02, 4), index=idx2, columns=['A', 'B', 'C', 'D'])" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Size of the Dataset: 13.73291015625 MB\n" ] } ], "source": [ "print ('Size of the Dataset: ', (df1.values.nbytes+df1.values.nbytes)/2**20, ' MB')" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10 loops, best of 3: 32 ms per loop\n" ] } ], "source": [ "%%timeit\n", "with pd.get_store('temp/store53.h5') as store:\n", " store.put('storer/df1', df1)\n", " store.put('storer/df2', df2)\n", " store.put('to_remove', df2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Retrieve stored objects:" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['/to_remove', '/storer/df1', '/storer/df2']\n", " A B C\n", "2000-01-01 12:00:00.050000+01:00 -0.022728 0.015469 0.962709\n", "2000-01-01 12:00:00.100000+01:00 -0.453089 0.513491 -1.270842\n", " A B C\n", "2000-01-01 12:00:00.050000+01:00 -0.022728 0.015469 0.962709\n", "2000-01-01 12:00:00.100000+01:00 -0.453089 0.513491 -1.270842\n", "Check retrieved data equal to original data: \n", "True\n" ] } ], "source": [ "with pd.get_store('temp/store53.h5') as store:\n", " print (store.keys())\n", " df1_retrieved = store.get('storer/df1')\n", " print (df1_retrieved[1:3])\n", " print (df1[1:3])\n", " print ('Check retrieved data equal to original data: ')\n", " print (df1_retrieved[1:3].equals(df1[1:3]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Delete objects:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "File path: temp/store53.h5\n", "/storer/df1 frame (shape->[300000,3])\n", "/storer/df2 frame (shape->[100000,4])\n" ] } ], "source": [ "with pd.get_store('temp/store53.h5') as store:\n", " try:\n", " store.remove('to_remove')\n", " except:\n", " pass\n", " print (store)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 7.2 Table format" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The table format conceptually is shaped very much like a DataFrame and may be appended to in the same or other sessions. In addition, delete & query type operations are supported." ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [], "source": [ "with pd.get_store('temp/store53.h5') as store:\n", " # store.append creates a table automatically:\n", " store.append('table/df1_appended', df1.ix[:10000])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [], "source": [ "with pd.get_store('temp/store53.h5') as store:\n", " store.append('table/df1_appended', df1.ix[10001:20000])\n", " store.append('table/df1_appended', df1.ix[20001:50000])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 7.3 Querying a Table" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Query the table using boolean expression with in-line function evaluation:" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
2000-01-01 12:00:00.200000+01:00-0.232597-0.705741
2000-01-01 12:00:00.250000+01:000.1832340.024464
2000-01-01 12:00:00.300000+01:00-0.801220-0.732630
2000-01-01 12:00:00.350000+01:000.762667-0.585462
\n", "
" ], "text/plain": [ " A B\n", "2000-01-01 12:00:00.200000+01:00 -0.232597 -0.705741\n", "2000-01-01 12:00:00.250000+01:00 0.183234 0.024464\n", "2000-01-01 12:00:00.300000+01:00 -0.801220 -0.732630\n", "2000-01-01 12:00:00.350000+01:00 0.762667 -0.585462" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with pd.get_store('temp/store53.h5') as store:\n", " query01 = store.select('table/df1_appended',\n", " \"index>=Timestamp('2000-01-01 12:00:00.20+01:00') \\\n", " & index) for more tutorials and updates.\n", "\n", "This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" } }, "nbformat": 4, "nbformat_minor": 0 }