Operators between DataFrame and Series fail on large dataframes · Issue #27636 · pandas-dev/pandas (original) (raw)
Code Sample
import pandas as pd
ind = list(range(0, 100))
cols = list(range(0, 300))
df = pd.DataFrame(index=ind, columns=cols, data=1.0)
series = pd.Series(index=cols, data=cols)
print(df.multiply(series, axis=1).head())  # Works fine

ind = list(range(0, 100000))
cols = list(range(0, 300))
df = pd.DataFrame(index=ind, columns=cols, data=1.0)
series = pd.Series(index=cols, data=cols)
print(df.add(series, axis=1).head())
Code Output:
0 1 2 3 4 5 ... 294 295 296 297 298 299
0 0.0 1.0 2.0 3.0 4.0 5.0 ... 294.0 295.0 296.0 297.0 298.0 299.0
1 0.0 1.0 2.0 3.0 4.0 5.0 ... 294.0 295.0 296.0 297.0 298.0 299.0
2 0.0 1.0 2.0 3.0 4.0 5.0 ... 294.0 295.0 296.0 297.0 298.0 299.0
3 0.0 1.0 2.0 3.0 4.0 5.0 ... 294.0 295.0 296.0 297.0 298.0 299.0
4 0.0 1.0 2.0 3.0 4.0 5.0 ... 294.0 295.0 296.0 297.0 298.0 299.0
[5 rows x 300 columns]
Traceback (most recent call last):
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\IPython\core\interactiveshell.py", line 2963, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-25-4d9165e5df4a>", line 15, in <module>
print(df.add(series,axis=1).head())
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\pandas\core\ops\__init__.py", line 1499, in f
self, other, pass_op, fill_value=fill_value, axis=axis, level=level
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\pandas\core\ops\__init__.py", line 1388, in _combine_series_frame
return self._combine_match_columns(other, func, level=level)
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\pandas\core\frame.py", line 5392, in _combine_match_columns
return ops.dispatch_to_series(left, right, func, axis="columns")
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\pandas\core\ops\__init__.py", line 596, in dispatch_to_series
new_data = expressions.evaluate(column_op, str_rep, left, right)
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\pandas\core\computation\expressions.py", line 220, in evaluate
return _evaluate(op, op_str, a, b, **eval_kwargs)
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\pandas\core\computation\expressions.py", line 126, in _evaluate_numexpr
result = _evaluate_standard(op, op_str, a, b)
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\pandas\core\computation\expressions.py", line 70, in _evaluate_standard
return op(a, b)
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\pandas\core\ops\__init__.py", line 584, in column_op
return {i: func(a.iloc[:, i], b.iloc[i]) for i in range(len(a.columns))}
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\pandas\core\ops\__init__.py", line 584, in <dictcomp>
return {i: func(a.iloc[:, i], b.iloc[i]) for i in range(len(a.columns))}
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\pandas\core\ops\__init__.py", line 1473, in na_op
result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\pandas\core\computation\expressions.py", line 220, in evaluate
return _evaluate(op, op_str, a, b, **eval_kwargs)
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\pandas\core\computation\expressions.py", line 101, in _evaluate_numexpr
if _can_use_numexpr(op, op_str, a, b, "evaluate"):
File "C:\dev\bin\anaconda\envs\py36\lib\site-packages\pandas\core\computation\expressions.py", line 84, in _can_use_numexpr
s = o.dtypes.value_counts()
AttributeError: 'numpy.dtype' object has no attribute 'value_counts'
Problem description
I think this is a regression introduced somewhere between pandas 0.19.2 and 0.25. If you multiply — or use any other operator function such as add/divide — a DataFrame by a Series with axis=1, pandas will crash in the _can_use_numexpr
function when the DataFrame/Series becomes very large. This is presumably because the size check on the objects being operated on does not pass for small datasets, but for larger ones execution reaches the failing line.
# pandas/core/computation/expressions.py : 73
def _can_use_numexpr(op, op_str, a, b, dtype_check):
    """ return a boolean if we WILL be using numexpr """
    if op_str is not None:

        # required min elements (otherwise we are adding overhead)
        if np.prod(a.shape) > _MIN_ELEMENTS:

            # check for dtype compatibility
            dtypes = set()
            for o in [a, b]:
                if hasattr(o, "dtypes"):
                    s = o.dtypes.value_counts()  # Fails here
In pandas 0.19.2 the function instead uses the get_dtype_counts() method to inspect whether the dtype is uniform in the object:
def _can_use_numexpr(op, op_str, a, b, dtype_check):
    """ return a boolean if we WILL be using numexpr """
    if op_str is not None:

        # required min elements (otherwise we are adding overhead)
        if np.prod(a.shape) > _MIN_ELEMENTS:

            # check for dtype compatiblity
            dtypes = set()
            for o in [a, b]:
                if hasattr(o, 'get_dtype_counts'):
                    s = o.get_dtype_counts()
I have a workaround which is to transpose the dataframe and use axis=0:
df.T.add(series,axis=0).T.head()
I noticed that get_dtype_counts() was deprecated in #27145, which appears to be the PR that caused this regression: for a Series, .dtypes returns a single numpy dtype, which does not have a value_counts() method.
Output of pd.show_versions()
INSTALLED VERSIONS
commit : None
python : 3.6.5.final.0
python-bits : 64
OS : Windows
OS-release : 7
machine : AMD64
processor : Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : None.None
pandas : 0.25.0
numpy : 1.16.4
pytz : 2018.4
dateutil : 2.7.3
pip : 10.0.1
setuptools : 39.1.0
Cython : None
pytest : 3.5.1
hypothesis : None
sphinx : 1.8.2
blosc : None
feather : None
xlsxwriter : 1.0.4
lxml.etree : 4.1.1
html5lib : 1.0.1
pymysql : None
psycopg2 : None
jinja2 : 2.10
IPython : 6.4.0
pandas_datareader: None
bs4 : 4.7.1
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : 4.1.1
matplotlib : 2.2.2
numexpr : 2.6.5
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pytables : None
s3fs : None
scipy : 1.1.0
sqlalchemy : 1.2.8
tables : 3.5.2
xarray : None
xlrd : 1.1.0
xlwt : None
xlsxwriter : 1.0.4