REGR: cumsum regression with groupby call to agg · Issue #31802 · pandas-dev/pandas
I want to define a custom function that I can pass to the `agg` method. The function uses the `cumsum` method, which appears to have become problematic recently.
Your code here
import pandas as pd
def max_test(s):
    """Return the largest value reached by the running total of ``s``.

    ``s`` is whatever object the caller hands over (here, one group's
    values supplied by ``agg``); it only needs a ``cumsum()`` method whose
    result has a ``max()`` method.
    """
    running_total = s.cumsum()
    # For comparison, the plain maximum would be: s.max()
    return running_total.max()
# Small reproducible frame. Rows 0 and 4 share the ('WN', 'LAX') key, so that
# group holds two values; every other group holds exactly one.
dummy_data = pd.DataFrame(
    {
        'AIRLINE': {0: 'WN', 1: 'UA', 2: 'MQ', 3: 'AA', 4: 'WN'},
        'ORG_AIR': {0: 'LAX', 1: 'DEN', 2: 'DFW', 3: 'DFW', 4: 'LAX'},
        'DIST': {0: 590, 1: 1452, 2: 641, 3: 1192, 4: 1363},
    }
)
gb = dummy_data.groupby(['AIRLINE', 'ORG_AIR'])
# Aggregate each group with the custom callable. The commented-out 'max'
# is the alternative aggregation kept for comparison. The argument must sit
# on its own line so the comment does not swallow it and the closing paren.
result = gb.agg(
    # 'max'
    max_test
)
print(result)
Prior to Pandas 1.0rc this worked. It now raises an exception:
$ python /tmp/regpandas.py
Traceback (most recent call last):
File "/tmp/regpandas.py", line 16, in <module>
max_test
File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/groupby/generic.py", line 948, in aggregate
return self._python_agg_general(func, *args, **kwargs)
File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/groupby/groupby.py", line 936, in _python_agg_general
result, counts = self.grouper.agg_series(obj, f)
File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/groupby/ops.py", line 641, in agg_series
return self._aggregate_series_fast(obj, func)
File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/groupby/ops.py", line 666, in _aggregate_series_fast
result, counts = grouper.get_result()
File "pandas/_libs/reduction.pyx", line 376, in pandas._libs.reduction.SeriesGrouper.get_result
File "pandas/_libs/reduction.pyx", line 193, in pandas._libs.reduction._BaseGrouper._apply_to_group
File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/groupby/groupby.py", line 913, in <lambda>
f = lambda x: func(x, *args, **kwargs)
File "/tmp/regpandas.py", line 4, in max_test
return s.cumsum().max()
File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/generic.py", line 11331, in cum_func
result = self._data.apply(na_accum_func)
File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 440, in apply
applied = b.apply(f, **kwargs)
File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/internals/blocks.py", line 403, in apply
result = self.make_block(values=_block_shape(result, ndim=self.ndim))
File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/internals/blocks.py", line 273, in make_block
return make_block(values, placement=placement, ndim=self.ndim)
File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/internals/blocks.py", line 3041, in make_block
return klass(values, ndim=ndim, placement=placement)
File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/internals/blocks.py", line 125, in __init__
f"Wrong number of items passed {len(self.values)}, "
ValueError: Wrong number of items passed 2, placement implies 1
Expected output (what the same script printed before the regression):
$ python /tmp/regpandas.py
                 DIST
AIRLINE ORG_AIR
AA      DFW      1192
MQ      DFW       641
UA      DEN      1452
WN      LAX      1953
>>> pd.show_versions()
INSTALLED VERSIONS
------------------
commit : None
python : 3.7.3.final.0
python-bits : 64
OS : Darwin
OS-release : 18.6.0
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.0.1
numpy : 1.18.1
pytz : 2019.3
dateutil : 2.8.1
pip : 19.0.3
setuptools : 40.8.0
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.10.3
IPython : 7.11.1
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : 3.1.2
numexpr : 2.7.1
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pytables : None
pytest : None
pyxlsb : None
s3fs : None
scipy : 1.4.1
sqlalchemy : 1.3.13
tables : 3.6.1
tabulate : None
xarray : None
xlrd : None
xlwt : None
xlsxwriter : None
numba : None