REGR: cumsum regression with groupby call to agg · Issue #31802 · pandas-dev/pandas

I want to define a custom function that I can pass to the agg method. The function uses the cumsum method, which appears to have stopped working inside agg in recent releases.

Code sample:

import pandas as pd

def max_test(s):
    """Return the maximum of the cumulative sum of ``s``.

    ``s`` is whatever ``agg`` hands to the function for one group; it only
    needs to support ``.cumsum()`` followed by ``.max()``.
    """
    # Comparison variant that skips cumsum entirely: return s.max()
    running_total = s.cumsum()
    return running_total.max()

# Minimal frame: rows 0 and 4 share the ('WN', 'LAX') key, so that group is
# aggregated over more than one value.
dummy_data = pd.DataFrame(
    {
        'AIRLINE': {0: 'WN', 1: 'UA', 2: 'MQ', 3: 'AA', 4: 'WN'},
        'ORG_AIR': {0: 'LAX', 1: 'DEN', 2: 'DFW', 3: 'DFW', 4: 'LAX'},
        'DIST': {0: 590, 1: 1452, 2: 641, 3: 1192, 4: 1363},
    }
)

gb = dummy_data.groupby(['AIRLINE', 'ORG_AIR'])

# The disabled 'max' alternative lives on its own line so the comment no
# longer swallows the real argument and the closing parenthesis.
result = gb.agg(
    # 'max'
    max_test
)

print(result)

Prior to Pandas 1.0rc this worked. It now raises an exception:

$ python /tmp/regpandas.py
Traceback (most recent call last):
  File "/tmp/regpandas.py", line 16, in <module>
    max_test
  File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/groupby/generic.py", line 948, in aggregate
    return self._python_agg_general(func, *args, **kwargs)
  File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/groupby/groupby.py", line 936, in _python_agg_general
    result, counts = self.grouper.agg_series(obj, f)
  File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/groupby/ops.py", line 641, in agg_series
    return self._aggregate_series_fast(obj, func)
  File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/groupby/ops.py", line 666, in _aggregate_series_fast
    result, counts = grouper.get_result()
  File "pandas/_libs/reduction.pyx", line 376, in pandas._libs.reduction.SeriesGrouper.get_result
  File "pandas/_libs/reduction.pyx", line 193, in pandas._libs.reduction._BaseGrouper._apply_to_group
  File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/groupby/groupby.py", line 913, in <lambda>
    f = lambda x: func(x, *args, **kwargs)
  File "/tmp/regpandas.py", line 4, in max_test
    return s.cumsum().max()
  File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/generic.py", line 11331, in cum_func
    result = self._data.apply(na_accum_func)
  File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 440, in apply
    applied = b.apply(f, **kwargs)
  File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/internals/blocks.py", line 403, in apply
    result = self.make_block(values=_block_shape(result, ndim=self.ndim))
  File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/internals/blocks.py", line 273, in make_block
    return make_block(values, placement=placement, ndim=self.ndim)
  File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/internals/blocks.py", line 3041, in make_block
    return klass(values, ndim=ndim, placement=placement)
  File "/Users/matt/.env/pandas1/lib/python3.7/site-packages/pandas/core/internals/blocks.py", line 125, in __init__
    f"Wrong number of items passed {len(self.values)}, "
ValueError: Wrong number of items passed 2, placement implies 1
Expected output (what the same script printed before the regression):

$ python /tmp/regpandas.py
                 DIST
AIRLINE ORG_AIR
AA      DFW      1192
MQ      DFW       641
UA      DEN      1452
WN      LAX      1953
>>> pd.show_versions()

INSTALLED VERSIONS
------------------
commit           : None
python           : 3.7.3.final.0
python-bits      : 64
OS               : Darwin
OS-release       : 18.6.0
machine          : x86_64
processor        : i386
byteorder        : little
LC_ALL           : None
LANG             : en_US.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 1.0.1
numpy            : 1.18.1
pytz             : 2019.3
dateutil         : 2.8.1
pip              : 19.0.3
setuptools       : 40.8.0
Cython           : None
pytest           : None
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : None
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 2.10.3
IPython          : 7.11.1
pandas_datareader: None
bs4              : None
bottleneck       : None
fastparquet      : None
gcsfs            : None
lxml.etree       : None
matplotlib       : 3.1.2
numexpr          : 2.7.1
odfpy            : None
openpyxl         : None
pandas_gbq       : None
pyarrow          : None
pytables         : None
pytest           : None
pyxlsb           : None
s3fs             : None
scipy            : 1.4.1
sqlalchemy       : 1.3.13
tables           : 3.6.1
tabulate         : None
xarray           : None
xlrd             : None
xlwt             : None
xlsxwriter       : None
numba            : None