# Storage

In [1]:
import numpy as np
import pandas as pd
# Seed NumPy's global generator so the random data below is reproducible.
np.random.seed(1234)
# Cap how many rows a frame/series repr prints, keeping cell outputs short.
pd.set_option('display.max_rows', 10)
# Show the pandas version these outputs were recorded with.
pd.__version__

'0.18.1'

In [2]:
# Small frame mixing int, float, object (str) and datetime columns; the
# scalar values are repeated for each of the four entries supplied for 'A'.
df = pd.DataFrame(
    dict(
        A=range(4),
        B=1.0,
        C='foo',
        D=pd.Timestamp('20130101'),
        E=2.0,
    )
)

In [3]:
# Show the mixed-dtype frame built above.
df

Unnamed: 0,A,B,C,D,E
0,0,1.0,foo,2013-01-01,2.0
1,1,1.0,foo,2013-01-01,2.0
2,2,1.0,foo,2013-01-01,2.0
3,3,1.0,foo,2013-01-01,2.0


In [4]:
# Each column reports its own dtype.
df.dtypes

A             int64
B           float64
C            object
D    datetime64[ns]
E           float64
dtype: object

In [5]:
# Peek at the frame's internal BlockManager (private attribute, shown for
# illustration only): the two float columns share a single 2 x 4 block.
df._data

BlockManager
Items: Index(['A', 'B', 'C', 'D', 'E'], dtype='object')
Axis 1: RangeIndex(start=0, stop=4, step=1)
FloatBlock: slice(1, 7, 3), 2 x 4, dtype: float64
IntBlock: slice(0, 1, 1), 1 x 4, dtype: int64
DatetimeBlock: slice(3, 4, 1), 1 x 4, dtype: datetime64[ns]
ObjectBlock: slice(2, 3, 1), 1 x 4, dtype: object

## Why do we have this arrangement?

In [6]:
# One million rows by ten columns of standard-normal floats, drawn from the
# generator seeded in the first cell.
arr = np.random.randn(1000000, 10)

In [7]:
# Wrap the 2-D float array in a frame and summarise it.
# Only `pd` is imported in this notebook, so the constructor is reached as
# pd.DataFrame (a bare `DataFrame` raises NameError on a fresh kernel).
df = pd.DataFrame(arr)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 10 columns):
0    1000000 non-null float64
1    1000000 non-null float64
2    1000000 non-null float64
3    1000000 non-null float64
4    1000000 non-null float64
5    1000000 non-null float64
6    1000000 non-null float64
7    1000000 non-null float64
8    1000000 non-null float64
9    1000000 non-null float64
dtypes: float64(10)
memory usage: 76.3 MB


Constructing directly from a NumPy array

In [8]:
%timeit DataFrame(arr)

10000 loops, best of 3: 84.2 µs per loop


Ufunc operation

In [9]:
# Sum each column of the frame; the result is a Series indexed by column label.
result_blocked = df.sum()
result_blocked

0    1405.248349
1     717.455769
2     -41.601276
3    -735.066948
4      24.217678
5     879.240977
6   -1041.996276
7     967.221230
8    -561.771728
9    -901.415467
dtype: float64

In [10]:
# Time the column sums computed directly on the frame.
%timeit df.sum()

10 loops, best of 3: 42.6 ms per loop


Construct a columnar layout

In [11]:
# Columnar layout: one independently copied Series per column of the frame.
# pd.Series is used because only `pd` is imported; items() replaces
# iteritems(), which newer pandas releases no longer provide.
d = [pd.Series(col.copy()) for _, col in df.items()]

In [12]:
def f(d):
    """Sum each element of ``d`` and collect the totals in a Series.

    Parameters
    ----------
    d : iterable
        Objects exposing a ``.sum()`` method (here, one Series per column).

    Returns
    -------
    pandas.Series
        One total per element of ``d``, in iteration order.
    """
    totals = []
    for column in d:
        totals.append(column.sum())
    return pd.Series(totals)
# Apply f to the list of per-column Series and show the totals.
result_columnar = f(d)
result_columnar

0    1405.248349
1     717.455769
2     -41.601276
3    -735.066948
4      24.217678
5     879.240977
6   -1041.996276
7     967.221230
8    -561.771728
9    -901.415467
dtype: float64

In [13]:
# Confirm the per-column totals agree with df.sum() within floating-point tolerance.
np.allclose(result_columnar, result_blocked)

True

## We are *cache* friendly!

In [14]:
# Time the per-column sums over the copied columns.
%timeit f(d)

100 loops, best of 3: 11 ms per loop


Can't we just use *views*?

In [15]:
# Same per-column list as before, but without copying each column.
# items() replaces iteritems(), which newer pandas releases no longer provide.
d = [pd.Series(col) for _, col in df.items()]

But this is NOT cache friendly

In [16]:
# Time the same per-column sums over the uncopied columns from the previous cell.
%timeit f(d)

10 loops, best of 3: 42.6 ms per loop


Construct a column-like DataFrame

In [17]:
%timeit [ pd.Series(v.copy()) for c, v in df.iteritems() ]

10 loops, best of 3: 65.9 ms per loop


Construct a frame from columnar, with blocking

In [18]:
%timeit DataFrame({c:pd.Series(v) for c, v in df.iteritems()})

10 loops, best of 3: 64.8 ms per loop


## Further Reading

http://eli.thegreenplace.net/2015/memory-layout-of-multi-dimensional-arrays/