PERF: unnecessary materialization of a MultiIndex.values when introspecting memory_usage by jreback · Pull Request #14308 · pandas-dev/pandas (original) (raw)

In [2]: import string
...: import pandas as pd
...: import numpy as np
...:
...: def memory_usage(f):
...:     return f.memory_usage(deep=True).sum()
...:
...: N = 100
...: M = len(string.uppercase)
...: df = pd.DataFrame({'value' : np.random.randn(N*M)},
...: index=pd.MultiIndex.from_product([list(string.uppercase),
...: pd.date_range('20160101',periods=N)],
...: names=['id','date'])
...: )
...:
...:
...: stacked = df.unstack('id')
...:
...: assert df.values.nbytes == stacked.values.nbytes
...:
In [3]: memory_usage(df)
Out[3]: 145600
In [4]: memory_usage(stacked)
Out[4]: 21600
In [7]: df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2600 entries, (A, 2016-01-01 00:00:00) to (Z, 2016-04-09 00:00:00)
Data columns (total 1 columns):
value 2600 non-null float64
dtypes: float64(1)
memory usage: 142.2 KB
In [8]: stacked.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 100 entries, 2016-01-01 to 2016-04-09
Freq: D
Data columns (total 26 columns):
(value, A) 100 non-null float64
(value, B) 100 non-null float64
(value, C) 100 non-null float64
(value, D) 100 non-null float64
(value, E) 100 non-null float64
(value, F) 100 non-null float64
(value, G) 100 non-null float64
(value, H) 100 non-null float64
(value, I) 100 non-null float64
(value, J) 100 non-null float64
(value, K) 100 non-null float64
(value, L) 100 non-null float64
(value, M) 100 non-null float64
(value, N) 100 non-null float64
(value, O) 100 non-null float64
(value, P) 100 non-null float64
(value, Q) 100 non-null float64
(value, R) 100 non-null float64
(value, S) 100 non-null float64
(value, T) 100 non-null float64
(value, U) 100 non-null float64
(value, V) 100 non-null float64
(value, W) 100 non-null float64
(value, X) 100 non-null float64
(value, Y) 100 non-null float64
(value, Z) 100 non-null float64
dtypes: float64(26)
memory usage: 21.1 KB
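The same session re-run in a fresh interpreter (apparently with the change from this pull request applied; the MultiIndex-indexed frame now reports far less memory):
In [2]: memory_usage(df)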
Out[2]: 27088
In [3]: memory_usage(stacked)
Out[3]: 21600
In [4]: df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2600 entries, (A, 2016-01-01 00:00:00) to (Z, 2016-04-09 00:00:00)
Data columns (total 1 columns):
value 2600 non-null float64
dtypes: float64(1)
memory usage: 26.5 KB
In [5]: stacked.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 100 entries, 2016-01-01 to 2016-04-09
Freq: D
Data columns (total 26 columns):
(value, A) 100 non-null float64
(value, B) 100 non-null float64
(value, C) 100 non-null float64
(value, D) 100 non-null float64
(value, E) 100 non-null float64
(value, F) 100 non-null float64
(value, G) 100 non-null float64
(value, H) 100 non-null float64
(value, I) 100 non-null float64
(value, J) 100 non-null float64
(value, K) 100 non-null float64
(value, L) 100 non-null float64
(value, M) 100 non-null float64
(value, N) 100 non-null float64
(value, O) 100 non-null float64
(value, P) 100 non-null float64
(value, Q) 100 non-null float64
(value, R) 100 non-null float64
(value, S) 100 non-null float64
(value, T) 100 non-null float64
(value, U) 100 non-null float64
(value, V) 100 non-null float64
(value, W) 100 non-null float64
(value, X) 100 non-null float64
(value, Y) 100 non-null float64
(value, Z) 100 non-null float64
dtypes: float64(26)
memory usage: 21.1 KB