{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# pandas I/O tools and examples" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import addutils.toc ; addutils.toc.js(ipy_notebook=True)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from addutils import css_notebook\n", "css_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1 Matlab Variables" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.1 Import a Matlab variable from file" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "import os \n", "import scipy.io\n", "import numpy as np\n", "import pandas as pd\n", "import sqlite3\n", "import pandas.io.sql as psql\n", "from time import time\n", "from IPython.display import display\n", "from addutils import css_notebook\n", "css_notebook()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import sys" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Import from '.mat' files" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "x = scipy.io.loadmat('example_data/matlab_variable.mat')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 92, 99, 1, 8, 15, 67, 74, 51, 58, 40],\n", " [ 98, 80, 7, 14, 16, 73, 55, 57, 64, 41],\n", " [ 4, 81, 88, 20, 22, 54, 56, 63, 70, 47],\n", " [ 85, 87, 19, 21, 3, 60, 62, 69, 71, 28],\n", " [ 86, 93, 25, 2, 9, 61, 68, 75, 52, 34],\n", " [ 17, 24, 76, 83, 90, 42, 49, 26, 33, 65],\n", " [ 23, 5, 82, 89, 91, 48, 30, 32, 39, 66],\n", " [ 79, 6, 13, 95, 97, 29, 31, 38, 45, 72],\n", " [ 10, 12, 94, 96, 78, 35, 37, 44, 46, 53],\n", " [ 11, 18, 100, 77, 84, 36, 43, 50, 27, 59]], dtype=uint8)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pyA = x['a']\n", "pyA" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The Matlab variable is passed to a pandas DataFrame:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789
0929918156774515840
19880714167355576441
24818820225456637047
38587192136062697128
4869325296168755234
517247683904249263365
62358289914830323966
77961395972931384572
810129496783537444653
9111810077843643502759
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9\n", "0 92 99 1 8 15 67 74 51 58 40\n", "1 98 80 7 14 16 73 55 57 64 41\n", "2 4 81 88 20 22 54 56 63 70 47\n", "3 85 87 19 21 3 60 62 69 71 28\n", "4 86 93 25 2 9 61 68 75 52 34\n", "5 17 24 76 83 90 42 49 26 33 65\n", "6 23 5 82 89 91 48 30 32 39 66\n", "7 79 6 13 95 97 29 31 38 45 72\n", "8 10 12 94 96 78 35 37 44 46 53\n", "9 11 18 100 77 84 36 43 50 27 59" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(pyA)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2 Importing a compressed CSV" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following example shows how to import directly a compressed csv file, in this case with multiple separators:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDNamesPhoneTownDescriptionIncomeCoordinates
01Lydia Z. Flores(02) 0548 5995Haddingtonsed, sapien. Nunc pulvinar arcu€6,22323.28054, -24.48755
12Bell X. Guerra(07) 8599 9926Montes Clarosconsectetuer, cursus et, magna. Praesent€14,967-77.76329, 69.22339
23Kirk Q. Bowman(06) 4153 7501Caglitortor. Nunc commodo auctor velit.€17,399-58.80037, 22.50537
34Karen Pickett(02) 3216 9708Cobourgat auctor ullamcorper, nisl arcu€17,37375.73982, -78.01872
45Vance J. Johnson(01) 7568 6371Carltonultricies adipiscing, enim mi tempor€9,025-86.51337, 109.46298
\n", "
" ], "text/plain": [ " ID Names Phone Town \\\n", "0 1 Lydia Z. Flores (02) 0548 5995 Haddington \n", "1 2 Bell X. Guerra (07) 8599 9926 Montes Claros \n", "2 3 Kirk Q. Bowman (06) 4153 7501 Cagli \n", "3 4 Karen Pickett (02) 3216 9708 Cobourg \n", "4 5 Vance J. Johnson (01) 7568 6371 Carlton \n", "\n", " Description Income \\\n", "0 sed, sapien. Nunc pulvinar arcu €6,223 \n", "1 consectetuer, cursus et, magna. Praesent €14,967 \n", "2 tortor. Nunc commodo auctor velit. €17,399 \n", "3 at auctor ullamcorper, nisl arcu €17,373 \n", "4 ultricies adipiscing, enim mi tempor €9,025 \n", "\n", " Coordinates \n", "0 23.28054, -24.48755 \n", "1 -77.76329, 69.22339 \n", "2 -58.80037, 22.50537 \n", "3 75.73982, -78.01872 \n", "4 -86.51337, 109.46298 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_csv = pd.read_csv('example_data/pd12_peopl.csv.gz', sep=r'\\,\\;\\.', \n", " skipinitialspace=True, compression='gzip', engine='python',\n", " encoding='iso8859_15')\n", "df_csv.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**TODO**: utf-8 has problems decoding euro sign. \"\\u00e2\\u0082\\u00ac\" are the three chars you get when the UTF-8 encoded \\u20ac (EURO SIGN) is mistakenly interpreted as ISO-8859-1." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDNamesPhoneTownDescriptionIncomeCoordinateslatlon
01Lydia Z. Flores(02) 0548 5995Haddingtonsed, sapien. Nunc pulvinar arcu€6,22323.28054, -24.4875523.28054-24.48755
12Bell X. Guerra(07) 8599 9926Montes Clarosconsectetuer, cursus et, magna. Praesent€14,967-77.76329, 69.22339-77.7632969.22339
23Kirk Q. Bowman(06) 4153 7501Caglitortor. Nunc commodo auctor velit.€17,399-58.80037, 22.50537-58.8003722.50537
34Karen Pickett(02) 3216 9708Cobourgat auctor ullamcorper, nisl arcu€17,37375.73982, -78.0187275.73982-78.01872
45Vance J. Johnson(01) 7568 6371Carltonultricies adipiscing, enim mi tempor€9,025-86.51337, 109.46298-86.51337109.46298
\n", "
" ], "text/plain": [ " ID Names Phone Town \\\n", "0 1 Lydia Z. Flores (02) 0548 5995 Haddington \n", "1 2 Bell X. Guerra (07) 8599 9926 Montes Claros \n", "2 3 Kirk Q. Bowman (06) 4153 7501 Cagli \n", "3 4 Karen Pickett (02) 3216 9708 Cobourg \n", "4 5 Vance J. Johnson (01) 7568 6371 Carlton \n", "\n", " Description Income \\\n", "0 sed, sapien. Nunc pulvinar arcu €6,223 \n", "1 consectetuer, cursus et, magna. Praesent €14,967 \n", "2 tortor. Nunc commodo auctor velit. €17,399 \n", "3 at auctor ullamcorper, nisl arcu €17,373 \n", "4 ultricies adipiscing, enim mi tempor €9,025 \n", "\n", " Coordinates lat lon \n", "0 23.28054, -24.48755 23.28054 -24.48755 \n", "1 -77.76329, 69.22339 -77.76329 69.22339 \n", "2 -58.80037, 22.50537 -58.80037 22.50537 \n", "3 75.73982, -78.01872 75.73982 -78.01872 \n", "4 -86.51337, 109.46298 -86.51337 109.46298 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coor = df_csv['Coordinates']\n", "df_csv['lat'] = ''\n", "df_csv['lon'] = ''\n", "for j, coo in enumerate(coor):\n", " spl = re.split(',', str(coo))\n", " df_csv.loc[j, 'lat'] = spl[0] \n", " df_csv.loc[j, 'lon'] = spl[1]\n", "df_csv.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df_csv.to_json('temp/converted_json.json')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3 Importing JSON files" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CoordinatesDescriptionIDIncomeNamesPhoneTownlatlon
023.28054, -24.48755sed, sapien. Nunc pulvinar arcu1€6,223Lydia Z. Flores(02) 0548 5995Haddington23.28054-24.48755
1-77.76329, 69.22339consectetuer, cursus et, magna. Praesent2€14,967Bell X. Guerra(07) 8599 9926Montes Claros-77.7632969.22339
10-62.95199, 116.29751Duis sit amet diam eu11€15,764Jameson H. Craig(07) 8640 5274Fairbanks-62.95199116.29751
11-42.26793, -92.23605et netus et malesuada fames12€18,460Michael Randall(04) 3451 8606Mezzana-42.26793-92.23605
12-20.39897, 152.4122id risus quis diam luctus13€6,175Sierra Rivers(01) 2020 4511Lugnano in Teverina-20.39897152.41220
\n", "
" ], "text/plain": [ " Coordinates Description ID \\\n", "0 23.28054, -24.48755 sed, sapien. Nunc pulvinar arcu 1 \n", "1 -77.76329, 69.22339 consectetuer, cursus et, magna. Praesent 2 \n", "10 -62.95199, 116.29751 Duis sit amet diam eu 11 \n", "11 -42.26793, -92.23605 et netus et malesuada fames 12 \n", "12 -20.39897, 152.4122 id risus quis diam luctus 13 \n", "\n", " Income Names Phone \\\n", "0 €6,223 Lydia Z. Flores (02) 0548 5995 \n", "1 €14,967 Bell X. Guerra (07) 8599 9926 \n", "10 €15,764 Jameson H. Craig (07) 8640 5274 \n", "11 €18,460 Michael Randall (04) 3451 8606 \n", "12 €6,175 Sierra Rivers (01) 2020 4511 \n", "\n", " Town lat lon \n", "0 Haddington 23.28054 -24.48755 \n", "1 Montes Claros -77.76329 69.22339 \n", "10 Fairbanks -62.95199 116.29751 \n", "11 Mezzana -42.26793 -92.23605 \n", "12 Lugnano in Teverina -20.39897 152.41220 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with open('temp/converted_json.json') as f:\n", " data = f.read().encode('utf-8')\n", "dfjson = pd.read_json(data)\n", "dfjson.head()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Since the import reordered columns in alphabetical order, we can choose a preferred column order:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDNamesPhoneIncomeTownlatlon
01Lydia Z. Flores(02) 0548 5995€6,223Haddington23.28054-24.48755
12Bell X. Guerra(07) 8599 9926€14,967Montes Claros-77.7632969.22339
1011Jameson H. Craig(07) 8640 5274€15,764Fairbanks-62.95199116.29751
1112Michael Randall(04) 3451 8606€18,460Mezzana-42.26793-92.23605
1213Sierra Rivers(01) 2020 4511€6,175Lugnano in Teverina-20.39897152.41220
\n", "
" ], "text/plain": [ " ID Names Phone Income \\\n", "0 1 Lydia Z. Flores (02) 0548 5995 €6,223 \n", "1 2 Bell X. Guerra (07) 8599 9926 €14,967 \n", "10 11 Jameson H. Craig (07) 8640 5274 €15,764 \n", "11 12 Michael Randall (04) 3451 8606 €18,460 \n", "12 13 Sierra Rivers (01) 2020 4511 €6,175 \n", "\n", " Town lat lon \n", "0 Haddington 23.28054 -24.48755 \n", "1 Montes Claros -77.76329 69.22339 \n", "10 Fairbanks -62.95199 116.29751 \n", "11 Mezzana -42.26793 -92.23605 \n", "12 Lugnano in Teverina -20.39897 152.41220 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfjson = dfjson.loc[:, ['ID', 'Names', 'Phone', 'Income', 'Town', 'lat', 'lon']]\n", "dfjson.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4 Importing HTML" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: `read_html` returns a **list** of DataFrame objects, even if there is only a single table contained in the `HTML` content. Infer_types avoids the function trying to automatically detect numeric and date types (this generated an error with coordinates)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Nomi maschiliStradeCittàCoordinateEntemail
0Drew5672 Accumsan RoadWhitburn-54.68429, -67.21709Ac Consultinga.aliquet.vel@vitae.co.uk
1FinnAp #170-4074 Interdum Rd.Pelotas81.93087, 168.14556Varius Incorporatednec.malesuada.ut@primisin.com
2MartinAp #471-6260 Etiam AveAliano33.99272, -148.70584Arcu Industrieslacus@risus.net
3ZephaniaAp #720-7951 Aliquam AveSenneville-56.82569, -72.80435Tristique PCarcu@sempereratin.co.uk
4Devin3538 Dui St.Zwettl-Niederösterreich-11.90874, -149.61123Imperdiet Dictum PClectus.rutrum.urna@massaQuisqueporttitor.ca
\n", "
" ], "text/plain": [ " Nomi maschili Strade Città \\\n", "0 Drew 5672 Accumsan Road Whitburn \n", "1 Finn Ap #170-4074 Interdum Rd. Pelotas \n", "2 Martin Ap #471-6260 Etiam Ave Aliano \n", "3 Zephania Ap #720-7951 Aliquam Ave Senneville \n", "4 Devin 3538 Dui St. Zwettl-Niederösterreich \n", "\n", " Coordinate Ente \\\n", "0 -54.68429, -67.21709 Ac Consulting \n", "1 81.93087, 168.14556 Varius Incorporated \n", "2 33.99272, -148.70584 Arcu Industries \n", "3 -56.82569, -72.80435 Tristique PC \n", "4 -11.90874, -149.61123 Imperdiet Dictum PC \n", "\n", " mail \n", "0 a.aliquet.vel@vitae.co.uk \n", "1 nec.malesuada.ut@primisin.com \n", "2 lacus@risus.net \n", "3 arcu@sempereratin.co.uk \n", "4 lectus.rutrum.urna@massaQuisqueporttitor.ca " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfhtml = pd.read_html('example_data/generated.html', header=0)\n", "dfhtml[0].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5 Importing Excel" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NamesStreetsTownCoordinatesCorporationmail
0StephenAp #389-3365 Risus, St.Lelystad51.91783, -47.01037Consectetuer Rhoncus Nullam CorporationSuspendisse.sed@pedePraesenteu.org
1Kenneth120-6483 Ligula. AveLabrecque-1.92625, 10.02451A Corporationorci@egetmollislectus.com
2LeoAp #727-2085 Eget Av.Strathcona County-17.31839, 137.99307Sed LimitedEtiam@vel.org
3JoshuaP.O. Box 425, 6462 Arcu Rd.Municipal District-51.34642, 80.32145Mauris Sapien Cursus Corp.lorem@ornarelectusante.net
4Zephania655 Et, St.Couillet29.96525, 124.18391Odio Semper Cursus Corp.metus@a.ca
\n", "
" ], "text/plain": [ " Names Streets Town \\\n", "0 Stephen Ap #389-3365 Risus, St. Lelystad \n", "1 Kenneth 120-6483 Ligula. Ave Labrecque \n", "2 Leo Ap #727-2085 Eget Av. Strathcona County \n", "3 Joshua P.O. Box 425, 6462 Arcu Rd. Municipal District \n", "4 Zephania 655 Et, St. Couillet \n", "\n", " Coordinates Corporation \\\n", "0 51.91783, -47.01037 Consectetuer Rhoncus Nullam Corporation \n", "1 -1.92625, 10.02451 A Corporation \n", "2 -17.31839, 137.99307 Sed Limited \n", "3 -51.34642, 80.32145 Mauris Sapien Cursus Corp. \n", "4 29.96525, 124.18391 Odio Semper Cursus Corp. \n", "\n", " mail \n", "0 Suspendisse.sed@pedePraesenteu.org \n", "1 orci@egetmollislectus.com \n", "2 Etiam@vel.org \n", "3 lorem@ornarelectusante.net \n", "4 metus@a.ca " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfxl = pd.read_excel('example_data/generated2.xls', 'foglio')\n", "dfxl.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6 Working with SQL and databases" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.1 Write SQL" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's store the DataFrame opened from excel in a database. We use SQLite, a database engine library suitable for storing data in a single-file database. 'Names' is the name we chose for the database table we are creating:" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "con = sqlite3.connect(\"temp.sql\")\n", "sqlfile = dfxl.to_sql('Names', con)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.2 Import SQL" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(100, 7)\n" ] } ], "source": [ "con = sqlite3.connect('temp.sql')\n", "with con:\n", " sql = \"SELECT * FROM Names;\"\n", " df = psql.read_sql(sql, con)\n", " print(df.shape)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexNamesStreetsTownCoordinatesCorporationmail
00StephenAp #389-3365 Risus, St.Lelystad51.91783, -47.01037Consectetuer Rhoncus Nullam CorporationSuspendisse.sed@pedePraesenteu.org
11Kenneth120-6483 Ligula. AveLabrecque-1.92625, 10.02451A Corporationorci@egetmollislectus.com
22LeoAp #727-2085 Eget Av.Strathcona County-17.31839, 137.99307Sed LimitedEtiam@vel.org
33JoshuaP.O. Box 425, 6462 Arcu Rd.Municipal District-51.34642, 80.32145Mauris Sapien Cursus Corp.lorem@ornarelectusante.net
44Zephania655 Et, St.Couillet29.96525, 124.18391Odio Semper Cursus Corp.metus@a.ca
\n", "
" ], "text/plain": [ " index Names Streets Town \\\n", "0 0 Stephen Ap #389-3365 Risus, St. Lelystad \n", "1 1 Kenneth 120-6483 Ligula. Ave Labrecque \n", "2 2 Leo Ap #727-2085 Eget Av. Strathcona County \n", "3 3 Joshua P.O. Box 425, 6462 Arcu Rd. Municipal District \n", "4 4 Zephania 655 Et, St. Couillet \n", "\n", " Coordinates Corporation \\\n", "0 51.91783, -47.01037 Consectetuer Rhoncus Nullam Corporation \n", "1 -1.92625, 10.02451 A Corporation \n", "2 -17.31839, 137.99307 Sed Limited \n", "3 -51.34642, 80.32145 Mauris Sapien Cursus Corp. \n", "4 29.96525, 124.18391 Odio Semper Cursus Corp. \n", "\n", " mail \n", "0 Suspendisse.sed@pedePraesenteu.org \n", "1 orci@egetmollislectus.com \n", "2 Etiam@vel.org \n", "3 lorem@ornarelectusante.net \n", "4 metus@a.ca " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "con.close()\n", "os.remove(\"temp.sql\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7 Working with HDF5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 7.1 Storer format" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**HDFStore** is a dict-like object used by pandas to store datasets as **HDF5** files using the **PyTables** library. **HDF5** is a scientific hierarchical data format suitable for storing in a file very large and multi-dimensional data arrays. The **Storer** format stores fixed arrays, which are queryiable and must be retrieved in their entirety." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Add DataFrames to the HDFStore object:" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "samples_01 = int(3e5)\n", "samples_02 = int(1e5)\n", "idx1 = pd.date_range('1/1/2000 12:00:00', periods=samples_01, freq='50ms', tz='Europe/Rome')\n", "idx2 = pd.date_range('1/1/2000 12:05:00', periods=samples_02, freq='100ms', tz='Europe/Rome')\n", "randn = np.random.randn" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df1 = pd.DataFrame(randn(samples_01, 3), index=idx1, columns=['A', 'B', 'C'])\n", "df2 = pd.DataFrame(randn(samples_02, 4), index=idx2, columns=['A', 'B', 'C', 'D'])" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Size of the Dataset: 13.73291015625 MB\n" ] } ], "source": [ "print ('Size of the Dataset: ', (df1.values.nbytes+df1.values.nbytes)/2**20, ' MB')" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20.4 ms ± 612 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%%timeit\n", "with pd.HDFStore('temp/store53.h5') as store:\n", " store.put('storer/df1', df1)\n", " store.put('storer/df2', df2)\n", " store.put('to_remove', df2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Retrieve stored objects:" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['/', '/to_remove', '/storer/df1', '/storer/df2', '/table/df1_appended']\n", " A B C\n", "2000-01-01 12:00:00.050000+01:00 -1.224140 0.058376 2.181400\n", "2000-01-01 12:00:00.100000+01:00 0.461911 0.018024 -0.165891\n", " A B C\n", "2000-01-01 12:00:00.050000+01:00 -1.224140 0.058376 2.181400\n", "2000-01-01 12:00:00.100000+01:00 0.461911 0.018024 -0.165891\n", "Check retrieved data equal to original data: \n", "True\n" ] } ], "source": [ "with pd.HDFStore('temp/store53.h5') as store:\n", " print (store.keys())\n", " df1_retrieved = store.get('storer/df1')\n", " print (df1_retrieved[1:3])\n", " print (df1[1:3])\n", " print ('Check retrieved data equal to original data: ')\n", " print (df1_retrieved[1:3].equals(df1[1:3]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Delete objects:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "File path: temp/store53.h5\n", "\n" ] } ], "source": [ "with pd.HDFStore('temp/store53.h5') as store:\n", " try:\n", " store.remove('to_remove')\n", " except:\n", " pass\n", " print (store)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 7.2 Table format" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The table format conceptually is shaped very much like a DataFrame and may be appended to in the same or other sessions. In addition, delete & query type operations are supported." ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with pd.HDFStore('temp/store53.h5') as store:\n", " # store.append creates a table automatically:\n", " store.append('table/df1_appended', df1.iloc[:10000])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with pd.HDFStore('temp/store53.h5') as store:\n", " store.append('table/df1_appended', df1.iloc[10001:20000])\n", " store.append('table/df1_appended', df1.iloc[20001:50000])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 7.3 Querying a Table" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Query the table using boolean expression with in-line function evaluation:" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
2000-01-01 12:00:00.200000+01:00-0.5169800.231047
2000-01-01 12:00:00.250000+01:000.1517731.263926
2000-01-01 12:00:00.300000+01:00-0.2218632.628103
2000-01-01 12:00:00.350000+01:000.454621-0.378720
2000-01-01 12:00:00.200000+01:00-3.104964-1.301010
2000-01-01 12:00:00.250000+01:000.8838930.319630
2000-01-01 12:00:00.300000+01:000.780367-0.921197
2000-01-01 12:00:00.350000+01:001.117064-0.678611
2000-01-01 12:00:00.200000+01:001.091494-1.163870
2000-01-01 12:00:00.250000+01:00-0.131051-1.156213
2000-01-01 12:00:00.300000+01:000.773293-1.646943
2000-01-01 12:00:00.350000+01:00-0.2931740.268033
2000-01-01 12:00:00.200000+01:000.2090130.178377
2000-01-01 12:00:00.250000+01:000.714508-0.081073
2000-01-01 12:00:00.300000+01:00-0.115000-1.940541
2000-01-01 12:00:00.350000+01:00-0.3179330.245165
2000-01-01 12:00:00.200000+01:001.088952-0.470424
2000-01-01 12:00:00.250000+01:00-1.921369-1.620379
2000-01-01 12:00:00.300000+01:000.7990320.974941
2000-01-01 12:00:00.350000+01:000.8396630.301037
\n", "
" ], "text/plain": [ " A B\n", "2000-01-01 12:00:00.200000+01:00 -0.516980 0.231047\n", "2000-01-01 12:00:00.250000+01:00 0.151773 1.263926\n", "2000-01-01 12:00:00.300000+01:00 -0.221863 2.628103\n", "2000-01-01 12:00:00.350000+01:00 0.454621 -0.378720\n", "2000-01-01 12:00:00.200000+01:00 -3.104964 -1.301010\n", "2000-01-01 12:00:00.250000+01:00 0.883893 0.319630\n", "2000-01-01 12:00:00.300000+01:00 0.780367 -0.921197\n", "2000-01-01 12:00:00.350000+01:00 1.117064 -0.678611\n", "2000-01-01 12:00:00.200000+01:00 1.091494 -1.163870\n", "2000-01-01 12:00:00.250000+01:00 -0.131051 -1.156213\n", "2000-01-01 12:00:00.300000+01:00 0.773293 -1.646943\n", "2000-01-01 12:00:00.350000+01:00 -0.293174 0.268033\n", "2000-01-01 12:00:00.200000+01:00 0.209013 0.178377\n", "2000-01-01 12:00:00.250000+01:00 0.714508 -0.081073\n", "2000-01-01 12:00:00.300000+01:00 -0.115000 -1.940541\n", "2000-01-01 12:00:00.350000+01:00 -0.317933 0.245165\n", "2000-01-01 12:00:00.200000+01:00 1.088952 -0.470424\n", "2000-01-01 12:00:00.250000+01:00 -1.921369 -1.620379\n", "2000-01-01 12:00:00.300000+01:00 0.799032 0.974941\n", "2000-01-01 12:00:00.350000+01:00 0.839663 0.301037" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with pd.HDFStore('temp/store53.h5') as store:\n", " query01 = store.select('table/df1_appended',\n", " \"index>=Timestamp('2000-01-01 12:00:00.20+01:00') \\\n", " & index) for more tutorials and updates.\n", "\n", "This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License." ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:addfor_tutorials]", "language": "python", "name": "conda-env-addfor_tutorials-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 1 }