PERF: unnecessary materialization of a MultiIndex.values when introspecting memory_usage by jreback · Pull Request #14308 · pandas-dev/pandas (original) (raw)

In [2]: import string
   ...: import pandas as pd
   ...: import numpy as np
   ...: 
   ...: def memory_usage(f):
   ...:     return f.memory_usage(deep=True).sum()
   ...: 
   ...: N = 100
   ...: M = len(string.uppercase)
   ...: df = pd.DataFrame({'value' : np.random.randn(N*M)},
   ...:                   index=pd.MultiIndex.from_product([list(string.uppercase),
   ...:                                                     pd.date_range('20160101',periods=N)],
   ...:                                                    names=['id','date'])
   ...:                   )
   ...: 
   ...: 
   ...: stacked = df.unstack('id')
   ...: 
   ...: assert df.values.nbytes == stacked.values.nbytes
   ...: 

In [3]: memory_usage(df)
Out[3]: 145600

In [4]: memory_usage(stacked)
Out[4]: 21600

In [7]: df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2600 entries, (A, 2016-01-01 00:00:00) to (Z, 2016-04-09 00:00:00)
Data columns (total 1 columns):
value    2600 non-null float64
dtypes: float64(1)
memory usage: 142.2 KB

In [8]: stacked.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 100 entries, 2016-01-01 to 2016-04-09
Freq: D
Data columns (total 26 columns):
(value, A)    100 non-null float64
(value, B)    100 non-null float64
(value, C)    100 non-null float64
(value, D)    100 non-null float64
(value, E)    100 non-null float64
(value, F)    100 non-null float64
(value, G)    100 non-null float64
(value, H)    100 non-null float64
(value, I)    100 non-null float64
(value, J)    100 non-null float64
(value, K)    100 non-null float64
(value, L)    100 non-null float64
(value, M)    100 non-null float64
(value, N)    100 non-null float64
(value, O)    100 non-null float64
(value, P)    100 non-null float64
(value, Q)    100 non-null float64
(value, R)    100 non-null float64
(value, S)    100 non-null float64
(value, T)    100 non-null float64
(value, U)    100 non-null float64
(value, V)    100 non-null float64
(value, W)    100 non-null float64
(value, X)    100 non-null float64
(value, Y)    100 non-null float64
(value, Z)    100 non-null float64
dtypes: float64(26)
memory usage: 21.1 KB

In [2]: memory_usage(df)
Out[2]: 27088

In [3]: memory_usage(stacked)
Out[3]: 21600

In [4]: df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2600 entries, (A, 2016-01-01 00:00:00) to (Z, 2016-04-09 00:00:00)
Data columns (total 1 columns):
value    2600 non-null float64
dtypes: float64(1)
memory usage: 26.5 KB

In [5]: stacked.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 100 entries, 2016-01-01 to 2016-04-09
Freq: D
Data columns (total 26 columns):
(value, A)    100 non-null float64
(value, B)    100 non-null float64
(value, C)    100 non-null float64
(value, D)    100 non-null float64
(value, E)    100 non-null float64
(value, F)    100 non-null float64
(value, G)    100 non-null float64
(value, H)    100 non-null float64
(value, I)    100 non-null float64
(value, J)    100 non-null float64
(value, K)    100 non-null float64
(value, L)    100 non-null float64
(value, M)    100 non-null float64
(value, N)    100 non-null float64
(value, O)    100 non-null float64
(value, P)    100 non-null float64
(value, Q)    100 non-null float64
(value, R)    100 non-null float64
(value, S)    100 non-null float64
(value, T)    100 non-null float64
(value, U)    100 non-null float64
(value, V)    100 non-null float64
(value, W)    100 non-null float64
(value, X)    100 non-null float64
(value, Y)    100 non-null float64
(value, Z)    100 non-null float64
dtypes: float64(26)
memory usage: 21.1 KB