# Using HDF5 with Python

### Import the required libraries

In [1]:
from pandas import (
    DataFrame, HDFStore
)
import pandas as pd
import numpy as np

### Create a dataframe

In [2]:
# Build a 5x3 frame of standard-normal samples with columns A, B and C.
df = DataFrame(data=np.random.randn(5, 3), columns=list('ABC'))

In [3]:
# Bare expression: the notebook renders the frame created in the previous cell.
df

Unnamed: 0,A,B,C
0,-0.092894,0.480401,-0.967241
1,-1.003829,0.012645,0.52772
2,-0.060884,-0.088839,-0.269744
3,0.729817,-0.042234,0.22941
4,-1.117705,-0.778368,-1.28079


### Create an HDF5 file for saving the dataframe

In [4]:
# Open with mode='w' so the file is (re)created empty on every run. The default
# append mode would reopen a dataset.h5 left over from an earlier run together
# with whatever keys it already holds, which breaks "Restart & Run All".
store = HDFStore('dataset.h5', mode='w')

In [5]:
# Bare expression: shows the store summary (file path and stored keys).
store

<class 'pandas.io.pytables.HDFStore'>
File path: dataset.h5
Empty

### Add the dataframe to the HDF5 file

In [6]:
# Save df under the key 'd1' in table format, with all of its columns
# registered as data columns.
store.put(
    key='d1',
    value=df,
    format='table',
    data_columns=True,
)

In [7]:
# Bare expression: the summary now lists the key that was just written.
store

<class 'pandas.io.pytables.HDFStore'>
File path: dataset.h5
/d1            frame_table  (typ->appendable,nrows->5,ncols->3,indexers->[index],dc->[A,B,C])

### Accessing the dataframe from HDF5 file

In [8]:
# Read the frame stored under 'd1' back from the file and display it.
store.get('d1')

Unnamed: 0,A,B,C
0,-0.092894,0.480401,-0.967241
1,-1.003829,0.012645,0.52772
2,-0.060884,-0.088839,-0.269744
3,0.729817,-0.042234,0.22941
4,-1.117705,-0.778368,-1.28079


### Appending another dataframe to an already existing dataframe in the HDF5 file

In [9]:
# Build a second 5x3 frame with the same columns, then append its rows to 'd1'.
extra_rows = DataFrame(np.random.randn(5, 3), columns=list('ABC'))
store.append('d1', extra_rows)

In [10]:
# Bare expression: the summary shows the row count of 'd1' after the append.
store

<class 'pandas.io.pytables.HDFStore'>
File path: dataset.h5
/d1            frame_table  (typ->appendable,nrows->10,ncols->3,indexers->[index],dc->[A,B,C])

In [11]:
# Display 'd1' again; the appended rows keep their own 0..4 index labels.
store.get('d1')

Unnamed: 0,A,B,C
0,-0.092894,0.480401,-0.967241
1,-1.003829,0.012645,0.52772
2,-0.060884,-0.088839,-0.269744
3,0.729817,-0.042234,0.22941
4,-1.117705,-0.778368,-1.28079
0,-0.396292,0.205355,0.995982
1,0.450495,-0.744076,-1.320831
2,1.003412,-0.876143,1.677286
3,-0.395701,-0.465095,0.287003
4,1.442614,0.818815,-0.378552


### Closing the HDF5 file

In [12]:
# Close the store; the summary in the next cell reports the file as closed.
store.close()

In [13]:
# Bare expression: shows the store summary after closing.
store

<class 'pandas.io.pytables.HDFStore'>
File path: dataset.h5
File is CLOSED

### Opening HDF5 file - Method 1 (not advised)

In [14]:
# Name the key explicitly: without it, read_hdf only works while the file holds
# exactly one pandas object, and dataset.h5 later also receives 'd2' and 'd3',
# so a key-less read fails when the notebook is run against the existing file.
df = pd.read_hdf('dataset.h5', key='d1')

In [15]:
# Bare expression: renders the frame that was just read from dataset.h5.
df

Unnamed: 0,A,B,C
0,-0.092894,0.480401,-0.967241
1,-1.003829,0.012645,0.52772
2,-0.060884,-0.088839,-0.269744
3,0.729817,-0.042234,0.22941
4,-1.117705,-0.778368,-1.28079
0,-0.396292,0.205355,0.995982
1,0.450495,-0.744076,-1.320831
2,1.003412,-0.876143,1.677286
3,-0.395701,-0.465095,0.287003
4,1.442614,0.818815,-0.378552


### Opening HDF5 file - Method 2 (recommended way)

In [16]:
# Reopen the existing file. mode='a' (append) is the default and is spelled out
# here to make clear that the stored keys are kept.
store = HDFStore('dataset.h5', mode='a')

In [17]:
# Bare expression: the summary lists the keys found in the reopened file.
store

<class 'pandas.io.pytables.HDFStore'>
File path: dataset.h5
/d1            frame_table  (typ->appendable,nrows->10,ncols->3,indexers->[index],dc->[A,B,C])

### Adding dataframe to the opened HDF5, using the default format

In [18]:
# Store two more random frames without a format argument, i.e. in the
# default format.
for key, shape in (('d2', (7, 4)), ('d3', (14, 3))):
    store.put(key, DataFrame(np.random.randn(*shape)))

In [19]:
# Bare expression: the summary shows the storage type next to each key.
store

<class 'pandas.io.pytables.HDFStore'>
File path: dataset.h5
/d1            frame_table  (typ->appendable,nrows->10,ncols->3,indexers->[index],dc->[A,B,C])
/d2            frame        (shape->[7,4])                                                    
/d3            frame        (shape->[14,3])                                                   

### Difference between frame_table and frame formats

#### frame_table format

In [20]:
# 'd1' was written in table format, so further rows can be appended to it.
more_rows = pd.DataFrame(np.random.randn(3, 3), columns=list('ABC'))
store.append('d1', more_rows)

In [21]:
# Read 'd1' back to inspect it after the append.
store.get('d1')

Unnamed: 0,A,B,C
0,-0.092894,0.480401,-0.967241
1,-1.003829,0.012645,0.52772
2,-0.060884,-0.088839,-0.269744
3,0.729817,-0.042234,0.22941
4,-1.117705,-0.778368,-1.28079
0,-0.396292,0.205355,0.995982
1,0.450495,-0.744076,-1.320831
2,1.003412,-0.876143,1.677286
3,-0.395701,-0.465095,0.287003
4,1.442614,0.818815,-0.378552


#### frame format

In [22]:
# 'd2' was stored with the default format, which does not support appending, so
# this call is expected to fail. Catch the error and print it instead of letting
# it propagate: an uncaught exception would stop "Run All" here and the cells
# below (including the final store.close()) would never execute.
try:
    store.append('d2', pd.DataFrame(np.random.randn(4, 4)))
except ValueError as err:
    print('{}: {}'.format(type(err).__name__, err))

ValueError: Can only append to Tables

#### The frame format (default) is faster than the frame_table format, but it cannot be appended to

### To view the dataframe with ordered index

In [23]:
# Read the 'table' node under 'd1'. In the displayed result the stored index
# labels appear as a regular `index` column and the rows are numbered 0..n-1.
store.get('d1/table')

Unnamed: 0,index,A,B,C
0,0,-0.092894,0.480401,-0.967241
1,1,-1.003829,0.012645,0.52772
2,2,-0.060884,-0.088839,-0.269744
3,3,0.729817,-0.042234,0.22941
4,4,-1.117705,-0.778368,-1.28079
5,0,-0.396292,0.205355,0.995982
6,1,0.450495,-0.744076,-1.320831
7,2,1.003412,-0.876143,1.677286
8,3,-0.395701,-0.465095,0.287003
9,4,1.442614,0.818815,-0.378552


### To get the dataframe from the HDF5 file

In [24]:
# Load the frame stored under 'd1' into memory as df.
df = store.get('d1')

In [25]:
# Bare expression: renders the frame loaded from the store in the previous cell.
df

Unnamed: 0,A,B,C
0,-0.092894,0.480401,-0.967241
1,-1.003829,0.012645,0.52772
2,-0.060884,-0.088839,-0.269744
3,0.729817,-0.042234,0.22941
4,-1.117705,-0.778368,-1.28079
0,-0.396292,0.205355,0.995982
1,0.450495,-0.744076,-1.320831
2,1.003412,-0.876143,1.677286
3,-0.395701,-0.465095,0.287003
4,1.442614,0.818815,-0.378552


In [26]:
# Close the store now that the frame has been loaded into df.
store.close()