Multiple lambdas for the same column return KeyError in DataFrameGroupBy.agg with named aggregation · Issue #27519 · pandas-dev/pandas (original) (raw)
Multiple lambdas for the same column return KeyError in `DataFrameGroupBy.agg`
In [1]: import pandas as pd
In [2]: df = pd.DataFrame({"A": [1, 2]})
In [3]: df.groupby([1, 1]).agg(foo=('A', lambda x: x.max()), bar=('A', lambda x: x.min()))
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-58-5b7e2c8bacf8> in <module>
3 df = pd.DataFrame({"A": [1, 2]})
4
----> 5 df.groupby([1, 1]).agg(foo=('A', lambda x: x.max()), bar=("A", lambda x: x.min()))
~\AppData\Local\Continuum\anaconda3\envs\insight\lib\site-packages\pandas\core\groupby\generic.py in aggregate(self, arg, *args, **kwargs)
1453 @Appender(_shared_docs["aggregate"])
1454 def aggregate(self, arg=None, *args, **kwargs):
-> 1455 return super().aggregate(arg, *args, **kwargs)
1456
1457 agg = aggregate
~\AppData\Local\Continuum\anaconda3\envs\insight\lib\site-packages\pandas\core\groupby\generic.py in aggregate(self, func, *args, **kwargs)
262
263 if relabeling:
--> 264 result = result[order]
265 result.columns = columns
266
~\AppData\Local\Continuum\anaconda3\envs\insight\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2979 if is_iterator(key):
2980 key = list(key)
-> 2981 indexer = self.loc._convert_to_indexer(key, axis=1, raise_missing=True)
2982
2983 # take() does not accept boolean indexers
~\AppData\Local\Continuum\anaconda3\envs\insight\lib\site-packages\pandas\core\indexing.py in _convert_to_indexer(self, obj, axis, is_setter, raise_missing)
1269 # When setting, missing keys are not allowed, even with .loc:
1270 kwargs = {"raise_missing": True if is_setter else raise_missing}
-> 1271 return self._get_listlike_indexer(obj, axis, **kwargs)[1]
1272 else:
1273 try:
~\AppData\Local\Continuum\anaconda3\envs\insight\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1076
1077 self._validate_read_indexer(
-> 1078 keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
1079 )
1080 return keyarr, indexer
~\AppData\Local\Continuum\anaconda3\envs\insight\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1161 raise KeyError(
1162 "None of [{key}] are in the [{axis}]".format(
-> 1163 key=key, axis=self.obj._get_axis_name(axis)
1164 )
1165 )
KeyError: "None of [MultiIndex([('A', '<lambda>'),\n ('A', '<lambda>')],\n )] are in the [columns]"
Problem description
When using the new groupby aggregation with relabeling API in pandas 0.25.0, a `KeyError` is raised when the same source column is used with multiple lambdas, as in the example above. This issue isn't present when using multiple lambdas with `SeriesGroupBy`, as shown in the release notes.
@TomAugspurger also notes that in `DataFrameGroupBy.aggregate`, `order` needs to be mangled too.
Expected Output
A one-row DataFrame (group key 1) with columns `foo` = 2 and `bar` = 1, instead of a `KeyError`.
Bonus related issue
If the applied functions have the same name, a `SpecificationError` is raised with the message "Function names must be unique, found multiple named mean", even though the keyword names are different:
df.groupby([1, 1]).agg(mean=('A', 'mean'), another_mean=('A', 'mean'))
(Obviously this is a silly example, but I encountered it after defining a closure for `np.percentile` to get around the lambda issue!)
Output of pd.show_versions()
INSTALLED VERSIONS
commit : None
python : 3.7.3.final.0
python-bits : 64
OS : Windows
OS-release : 10
machine : AMD64
processor : Intel64 Family 6 Model 142 Stepping 9, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : None.None
pandas : 0.25.0
numpy : 1.16.4
pytz : 2019.1
dateutil : 2.8.0
pip : 19.1.1
setuptools : 41.0.1
Cython : None
pytest : None
hypothesis : None
sphinx : 2.0.1
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : 2.8.2 (dt dec pq3 ext lo64)
jinja2 : 2.10.1
IPython : 7.5.0
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : 3.1.0
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pytables : None
s3fs : None
scipy : 1.3.0
sqlalchemy : 1.3.3
tables : None
xarray : None
xlrd : 1.2.0
xlwt : None
xlsxwriter : None