BUG: Series.groupby fails when grouping on MultiIndex with nulls in first level
Pandas version checks
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import pandas as pd
arr = [list(range(4)) * 4, list(range(2)) * 8]
# insert nulls into first level of MultiIndex
arr[0][2] = None
ser = pd.Series( list(range(16)), index=pd.MultiIndex.from_arrays(arr, names=("a", "b")) )
ser.groupby(level=[0, 1])
Issue Description
When attempting a groupby on all levels of a series with a multi-index with nulls in the first level, I get the following traceback:
TypeError Traceback (most recent call last) Input In [1], in <cell line: 13>() 6 arr[1][4] = None 8 ser = pd.Series( 9 list(range(16)), 10 index=pd.MultiIndex.from_arrays(arr, names=("a", "b")) 11 ) ---> 13 ser.groupby(level=[0, 1])
File /datasets/charlesb/dev/pandas/pandas/pandas/core/series.py:1956, in Series.groupby(self, by, axis, level, as_index, sort, group_keys, squeeze, observed, dropna) 1953 raise TypeError("You have to supply one of 'by' and 'level'") 1954 axis = self._get_axis_number(axis) -> 1956 return SeriesGroupBy( 1957 obj=self, 1958 keys=by, 1959 axis=axis, 1960 level=level, 1961 as_index=as_index, 1962 sort=sort, 1963 group_keys=group_keys, 1964 squeeze=squeeze, 1965 observed=observed, 1966 dropna=dropna, 1967 )
File /datasets/charlesb/dev/pandas/pandas/pandas/core/groupby/groupby.py:937, in GroupBy.__init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, observed, mutated, dropna) 934 if grouper is None: 935 from pandas.core.groupby.grouper import get_grouper --> 937 grouper, exclusions, obj = get_grouper( 938 obj, 939 keys, 940 axis=axis, 941 level=level, 942 sort=sort, 943 observed=observed, 944 mutated=self.mutated, 945 dropna=self.dropna, 946 ) 948 self.obj = obj 949 self.axis = obj._get_axis_number(axis)
File /datasets/charlesb/dev/pandas/pandas/pandas/core/groupby/grouper.py:892, in get_grouper(obj, key, axis, level, sort, observed, mutated, validate, dropna) 887 in_axis = False 889 # create the Grouping 890 # allow us to passing the actual Grouping as the gpr 891 ping = ( --> 892 Grouping( 893 group_axis, 894 gpr, 895 obj=obj, 896 level=level, 897 sort=sort, 898 observed=observed, 899 in_axis=in_axis, 900 dropna=dropna, 901 ) 902 if not isinstance(gpr, Grouping) 903 else gpr 904 ) 906 groupings.append(ping) 908 if len(groupings) == 0 and len(obj):
File /datasets/charlesb/dev/pandas/pandas/pandas/core/groupby/grouper.py:504, in Grouping.__init__(self, index, grouper, obj, level, sort, observed, in_axis, dropna)
496 mapper = self.grouping_vector
497 # In extant tests, the new self.grouping_vector matches
498 # index.get_level_values(ilevel) whenever
499 # mapper is None and isinstance(index, MultiIndex)
500 (
501 self.grouping_vector, # Index
502 self._codes,
503 self._group_index,
--> 504 ) = index._get_grouper_for_level(mapper, level=ilevel, dropna=dropna)
506 # a passed Grouper like, directly get the grouper in the same way
507 # as single grouper groupby, use the group_info to get codes
508 elif isinstance(self.grouping_vector, Grouper):
509 # get the new grouper; we already have disambiguated
510 # what key/level refer to exactly, don't need to
511 # check again as we have by this point converted these
512 # to an actual value (rather than a pd.Grouper)
File /datasets/charlesb/dev/pandas/pandas/pandas/core/indexes/multi.py:1516, in MultiIndex._get_grouper_for_level(self, mapper, level, dropna) 1514 # Handle group mapping function and return 1515 level_values = self.levels[level].take(indexer) -> 1516 grouper = level_values.map(mapper) 1517 return grouper, None, None 1519 values = self.get_level_values(level)
File /datasets/charlesb/dev/pandas/pandas/pandas/core/indexes/base.py:6369, in Index.map(self, mapper, na_action) 6349 """ 6350 Map values using an input mapping or function. 6351 (...) 6365 a MultiIndex will be returned. 6366 """ 6367 from pandas.core.indexes.multi import MultiIndex -> 6369 new_values = self._map_values(mapper, na_action=na_action) 6371 # we can return a MultiIndex 6372 if new_values.size and isinstance(new_values[0], tuple):
File /datasets/charlesb/dev/pandas/pandas/pandas/core/base.py:882, in IndexOpsMixin._map_values(self, mapper, na_action) 879 raise ValueError(msg) 881 # mapper is a function --> 882 new_values = map_f(values, mapper) 884 return new_values
File /datasets/charlesb/dev/pandas/pandas/pandas/_libs/lib.pyx:2898, in pandas._libs.lib.map_infer() 2896 result[i] = arr[i] 2897 continue -> 2898 val = f(arr[i]) 2899 2900 if cnp.PyArray_IsZeroDim(val):
TypeError: 'numpy.ndarray' object is not callable
Expected Behavior
I would expect to successfully get back a groupby object; this behavior occurs if we instead have nulls in the second level of the multi-index:
import pandas as pd
arr = [list(range(4)) * 4, list(range(2)) * 8]
# insert nulls into second level of MultiIndex
arr[1][2] = None
ser = pd.Series( list(range(16)), index=pd.MultiIndex.from_arrays(arr, names=("a", "b")) )
ser.groupby(level=[0, 1])
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f88c4192730>
Installed Versions
INSTALLED VERSIONS
commit : 696e9bd
python : 3.8.13.final.0
python-bits : 64
OS : Linux
OS-release : 4.15.0-1083-oracle
Version : #91-Ubuntu SMP Mon Oct 25 06:45:22 UTC 2021
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.5.0.dev0+953.g696e9bd04f
numpy : 1.22.4
pytz : 2022.1
dateutil : 2.8.2
setuptools : 62.3.4
pip : 22.1.2
Cython : 0.29.30
pytest : 7.1.2
hypothesis : 6.47.1
sphinx : 4.5.0
blosc : None
feather : None
xlsxwriter : 3.0.3
lxml.etree : 4.9.0
html5lib : 1.1
pymysql : None
psycopg2 : None
jinja2 : 3.0.3
IPython : 8.4.0
pandas_datareader: None
bs4 : 4.11.1
bottleneck : 1.3.4
brotli :
fastparquet : 0.8.1
fsspec : 2021.11.0
gcsfs : 2021.11.0
matplotlib : 3.5.2
numba : 0.53.1
numexpr : 2.8.0
odfpy : None
openpyxl : 3.0.9
pandas_gbq : None
pyarrow : 8.0.0
pyreadstat : 1.1.7
pyxlsb : None
s3fs : 2021.11.0
scipy : 1.8.1
snappy :
sqlalchemy : 1.4.37
tables : 3.7.0
tabulate : 0.8.9
xarray : 2022.3.0
xlrd : 2.0.1
xlwt : 1.3.0
zstandard : None