{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Using HDF5 with Python" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import the required libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from pandas import (\n", " DataFrame, HDFStore\n", ")\n", "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a dataframe" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = DataFrame(np.random.randn(5,3), columns=['A', 'B', 'C',])" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
0-0.0928940.480401-0.967241
1-1.0038290.0126450.527720
2-0.060884-0.088839-0.269744
30.729817-0.0422340.229410
4-1.117705-0.778368-1.280790
\n", "
" ], "text/plain": [ " A B C\n", "0 -0.092894 0.480401 -0.967241\n", "1 -1.003829 0.012645 0.527720\n", "2 -0.060884 -0.088839 -0.269744\n", "3 0.729817 -0.042234 0.229410\n", "4 -1.117705 -0.778368 -1.280790" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a HDF5 format file for saving th dataframe" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "store = HDFStore('dataset.h5')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "File path: dataset.h5\n", "Empty" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "store" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add the dataframe to the HDF5 file" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "store.put('d1', df, format='table', data_columns=True)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "File path: dataset.h5\n", "/d1 frame_table (typ->appendable,nrows->5,ncols->3,indexers->[index],dc->[A,B,C])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "store" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Accessing the dataframe from HDF5 file" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
0-0.0928940.480401-0.967241
1-1.0038290.0126450.527720
2-0.060884-0.088839-0.269744
30.729817-0.0422340.229410
4-1.117705-0.778368-1.280790
\n", "
" ], "text/plain": [ " A B C\n", "0 -0.092894 0.480401 -0.967241\n", "1 -1.003829 0.012645 0.527720\n", "2 -0.060884 -0.088839 -0.269744\n", "3 0.729817 -0.042234 0.229410\n", "4 -1.117705 -0.778368 -1.280790" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "store['d1']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Appending another dataframe to already exisiting dataframe in HDF5 file" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "store.append('d1', DataFrame(np.random.randn(5,3), columns=['A', 'B', 'C']))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "File path: dataset.h5\n", "/d1 frame_table (typ->appendable,nrows->10,ncols->3,indexers->[index],dc->[A,B,C])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "store" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
0-0.0928940.480401-0.967241
1-1.0038290.0126450.527720
2-0.060884-0.088839-0.269744
30.729817-0.0422340.229410
4-1.117705-0.778368-1.280790
0-0.3962920.2053550.995982
10.450495-0.744076-1.320831
21.003412-0.8761431.677286
3-0.395701-0.4650950.287003
41.4426140.818815-0.378552
\n", "
" ], "text/plain": [ " A B C\n", "0 -0.092894 0.480401 -0.967241\n", "1 -1.003829 0.012645 0.527720\n", "2 -0.060884 -0.088839 -0.269744\n", "3 0.729817 -0.042234 0.229410\n", "4 -1.117705 -0.778368 -1.280790\n", "0 -0.396292 0.205355 0.995982\n", "1 0.450495 -0.744076 -1.320831\n", "2 1.003412 -0.876143 1.677286\n", "3 -0.395701 -0.465095 0.287003\n", "4 1.442614 0.818815 -0.378552" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "store['d1']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Closing the HDF5 file" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "store.close()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "File path: dataset.h5\n", "File is CLOSED" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "store" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Opening HDF5 file - Method 1 (not advised)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df = pd.read_hdf('dataset.h5')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
0-0.0928940.480401-0.967241
1-1.0038290.0126450.527720
2-0.060884-0.088839-0.269744
30.729817-0.0422340.229410
4-1.117705-0.778368-1.280790
0-0.3962920.2053550.995982
10.450495-0.744076-1.320831
21.003412-0.8761431.677286
3-0.395701-0.4650950.287003
41.4426140.818815-0.378552
\n", "
" ], "text/plain": [ " A B C\n", "0 -0.092894 0.480401 -0.967241\n", "1 -1.003829 0.012645 0.527720\n", "2 -0.060884 -0.088839 -0.269744\n", "3 0.729817 -0.042234 0.229410\n", "4 -1.117705 -0.778368 -1.280790\n", "0 -0.396292 0.205355 0.995982\n", "1 0.450495 -0.744076 -1.320831\n", "2 1.003412 -0.876143 1.677286\n", "3 -0.395701 -0.465095 0.287003\n", "4 1.442614 0.818815 -0.378552" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Opening HDF5 file - Method 2 (recommended way)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "store = HDFStore('dataset.h5')" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "File path: dataset.h5\n", "/d1 frame_table (typ->appendable,nrows->10,ncols->3,indexers->[index],dc->[A,B,C])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "store" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Adding dataframe to the opened HDF5, using the default format" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "store.put('d2', DataFrame(np.random.randn(7,4)))\n", "store.put('d3', DataFrame(np.random.randn(14,3)))" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "File path: dataset.h5\n", "/d1 frame_table (typ->appendable,nrows->10,ncols->3,indexers->[index],dc->[A,B,C])\n", "/d2 frame (shape->[7,4]) \n", "/d3 frame (shape->[14,3]) " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "store" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Difference between frame_table and frame formats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### frame_table format" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "store.append('d1', pd.DataFrame(np.random.randn(3,3), columns=['A', 'B', 'C']))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
0-0.0928940.480401-0.967241
1-1.0038290.0126450.527720
2-0.060884-0.088839-0.269744
30.729817-0.0422340.229410
4-1.117705-0.778368-1.280790
0-0.3962920.2053550.995982
10.450495-0.744076-1.320831
21.003412-0.8761431.677286
3-0.395701-0.4650950.287003
41.4426140.818815-0.378552
0-0.1966170.5851341.052465
10.386223-0.9329650.749832
20.7210931.453562-0.426798
\n", "
" ], "text/plain": [ " A B C\n", "0 -0.092894 0.480401 -0.967241\n", "1 -1.003829 0.012645 0.527720\n", "2 -0.060884 -0.088839 -0.269744\n", "3 0.729817 -0.042234 0.229410\n", "4 -1.117705 -0.778368 -1.280790\n", "0 -0.396292 0.205355 0.995982\n", "1 0.450495 -0.744076 -1.320831\n", "2 1.003412 -0.876143 1.677286\n", "3 -0.395701 -0.465095 0.287003\n", "4 1.442614 0.818815 -0.378552\n", "0 -0.196617 0.585134 1.052465\n", "1 0.386223 -0.932965 0.749832\n", "2 0.721093 1.453562 -0.426798" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "store['d1']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### frame format" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "ename": "ValueError", "evalue": "Can only append to Tables", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'd2'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrandn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m4\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m4\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32mC:\\Users\\Jeri_Dabba\\pandas\\io\\pytables.pyc\u001b[0m in \u001b[0;36mappend\u001b[0;34m(self, key, value, format, append, columns, dropna, **kwargs)\u001b[0m\n\u001b[1;32m 917\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_format\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 918\u001b[0m self._write_to_group(key, value, append=append, dropna=dropna,\n\u001b[0;32m--> 919\u001b[0;31m **kwargs)\n\u001b[0m\u001b[1;32m 920\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 921\u001b[0m def append_to_multiple(self, d, value, selector, data_columns=None,\n", "\u001b[0;32mC:\\Users\\Jeri_Dabba\\pandas\\io\\pytables.pyc\u001b[0m in \u001b[0;36m_write_to_group\u001b[0;34m(self, key, value, format, index, append, complib, encoding, **kwargs)\u001b[0m\n\u001b[1;32m 1250\u001b[0m if (not s.is_table or\n\u001b[1;32m 1251\u001b[0m (s.is_table and format == 'fixed' and s.is_exists)):\n\u001b[0;32m-> 1252\u001b[0;31m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Can only append to Tables'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1253\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_exists\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 1254\u001b[0m \u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_object_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: Can only append to Tables" ] } ], "source": [ "store.append('d2', pd.DataFrame(np.random.randn(4,4)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### The frame format (default) is faster than frame_table format" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### To view the dataframe with ordered index" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexABC
00-0.0928940.480401-0.967241
11-1.0038290.0126450.527720
22-0.060884-0.088839-0.269744
330.729817-0.0422340.229410
44-1.117705-0.778368-1.280790
50-0.3962920.2053550.995982
610.450495-0.744076-1.320831
721.003412-0.8761431.677286
83-0.395701-0.4650950.287003
941.4426140.818815-0.378552
100-0.1966170.5851341.052465
1110.386223-0.9329650.749832
1220.7210931.453562-0.426798
\n", "
" ], "text/plain": [ " index A B C\n", "0 0 -0.092894 0.480401 -0.967241\n", "1 1 -1.003829 0.012645 0.527720\n", "2 2 -0.060884 -0.088839 -0.269744\n", "3 3 0.729817 -0.042234 0.229410\n", "4 4 -1.117705 -0.778368 -1.280790\n", "5 0 -0.396292 0.205355 0.995982\n", "6 1 0.450495 -0.744076 -1.320831\n", "7 2 1.003412 -0.876143 1.677286\n", "8 3 -0.395701 -0.465095 0.287003\n", "9 4 1.442614 0.818815 -0.378552\n", "10 0 -0.196617 0.585134 1.052465\n", "11 1 0.386223 -0.932965 0.749832\n", "12 2 0.721093 1.453562 -0.426798" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "store['d1/table']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### To get the dataframe from the HDF5 file" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = store['d1']" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
0-0.0928940.480401-0.967241
1-1.0038290.0126450.527720
2-0.060884-0.088839-0.269744
30.729817-0.0422340.229410
4-1.117705-0.778368-1.280790
0-0.3962920.2053550.995982
10.450495-0.744076-1.320831
21.003412-0.8761431.677286
3-0.395701-0.4650950.287003
41.4426140.818815-0.378552
0-0.1966170.5851341.052465
10.386223-0.9329650.749832
20.7210931.453562-0.426798
\n", "
" ], "text/plain": [ " A B C\n", "0 -0.092894 0.480401 -0.967241\n", "1 -1.003829 0.012645 0.527720\n", "2 -0.060884 -0.088839 -0.269744\n", "3 0.729817 -0.042234 0.229410\n", "4 -1.117705 -0.778368 -1.280790\n", "0 -0.396292 0.205355 0.995982\n", "1 0.450495 -0.744076 -1.320831\n", "2 1.003412 -0.876143 1.677286\n", "3 -0.395701 -0.465095 0.287003\n", "4 1.442614 0.818815 -0.378552\n", "0 -0.196617 0.585134 1.052465\n", "1 0.386223 -0.932965 0.749832\n", "2 0.721093 1.453562 -0.426798" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "store.close()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 2 }