BUG: Fix Series.get() for ExtensionArray and Categorical by Dr-Irv · Pull Request #20885 · pandas-dev/pandas (original) (raw)
@jreback The tests that fail are:
`pandas\tests\test_base.py` in `test_value_counts_unique_nunique`
`tests/groupby/aggregate/test_other.py` in `test_agg_timezone_round_trip`
I've investigated the first one, and here is what I have found out. (I think the second one is the same problem.) If we do it the way you propose, where we use `get_loc(key)` on the `Index`, the example below fails on the expression `s2[0]`.
In [1]: import pandas as pd
In [2]: def makeDateIndex(k=10, freq='B', name=None):
...: dt = pd.datetime(2000, 1, 1)
...: dr = pd.bdate_range(dt, periods=k, freq=freq, name=name)
...: return pd.DatetimeIndex(dr, name=name)
...:
...: dt_tz_index = makeDateIndex(10, name='a').tz_localize(tz='US/Eastern')
...: s1 = pd.Series([i for i in range(len(dt_tz_index))], index=dt_tz_index)
...:
...: s2 = pd.Series(dt_tz_index, index=dt_tz_index)
...:
In [3]: s1
Out[3]:
a
2000-01-03 00:00:00-05:00 0
2000-01-04 00:00:00-05:00 1
2000-01-05 00:00:00-05:00 2
2000-01-06 00:00:00-05:00 3
2000-01-07 00:00:00-05:00 4
2000-01-10 00:00:00-05:00 5
2000-01-11 00:00:00-05:00 6
2000-01-12 00:00:00-05:00 7
2000-01-13 00:00:00-05:00 8
2000-01-14 00:00:00-05:00 9
Freq: B, dtype: int64
In [4]: s2
Out[4]:
a
2000-01-03 00:00:00-05:00 2000-01-03 00:00:00-05:00
2000-01-04 00:00:00-05:00 2000-01-04 00:00:00-05:00
2000-01-05 00:00:00-05:00 2000-01-05 00:00:00-05:00
2000-01-06 00:00:00-05:00 2000-01-06 00:00:00-05:00
2000-01-07 00:00:00-05:00 2000-01-07 00:00:00-05:00
2000-01-10 00:00:00-05:00 2000-01-10 00:00:00-05:00
2000-01-11 00:00:00-05:00 2000-01-11 00:00:00-05:00
2000-01-12 00:00:00-05:00 2000-01-12 00:00:00-05:00
2000-01-13 00:00:00-05:00 2000-01-13 00:00:00-05:00
2000-01-14 00:00:00-05:00 2000-01-14 00:00:00-05:00
Freq: B, Name: a, dtype: datetime64[ns, US/Eastern]
In [5]: s1[0]
Out[5]: 0
In [6]: s2[0]
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3062 try:
-> 3063 return self._engine.get_loc(key)
3064 except KeyError:
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10784)()
429
--> 430 cpdef get_loc(self, object val):
431 if is_definitely_invalid_key(val):
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10616)()
465 val = maybe_datetimelike_to_i8(val)
--> 466 return self.mapping.get_item(val)
467 except (TypeError, ValueError):
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15389)()
957
--> 958 cpdef get_item(self, int64_t val):
959 cdef khiter_t k
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15333)()
963 else:
--> 964 raise KeyError(val)
965
KeyError: 0
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_loc(self, key, method, tolerance)
1609 try:
-> 1610 return Index.get_loc(self, key, method, tolerance)
1611 except (KeyError, ValueError, TypeError):
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3064 except KeyError:
-> 3065 return self._engine.get_loc(self._maybe_cast_indexer(key))
3066
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10784)()
429
--> 430 cpdef get_loc(self, object val):
431 if is_definitely_invalid_key(val):
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10616)()
465 val = maybe_datetimelike_to_i8(val)
--> 466 return self.mapping.get_item(val)
467 except (TypeError, ValueError):
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15389)()
957
--> 958 cpdef get_item(self, int64_t val):
959 cdef khiter_t k
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15333)()
963 else:
--> 964 raise KeyError(val)
965
KeyError: 0
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10434)()
457 try:
--> 458 return self.mapping.get_item(val.value)
459 except KeyError:
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15389)()
957
--> 958 cpdef get_item(self, int64_t val):
959 cdef khiter_t k
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15333)()
963 else:
--> 964 raise KeyError(val)
965
KeyError: 0
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3062 try:
-> 3063 return self._engine.get_loc(key)
3064 except KeyError:
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10784)()
429
--> 430 cpdef get_loc(self, object val):
431 if is_definitely_invalid_key(val):
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10521)()
459 except KeyError:
--> 460 raise KeyError(val)
461 except AttributeError:
KeyError: Timestamp('1969-12-31 19:00:00-0500', tz='US/Eastern')
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10434)()
457 try:
--> 458 return self.mapping.get_item(val.value)
459 except KeyError:
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15389)()
957
--> 958 cpdef get_item(self, int64_t val):
959 cdef khiter_t k
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15333)()
963 else:
--> 964 raise KeyError(val)
965
KeyError: 0
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_loc(self, key, method, tolerance)
1618 stamp = Timestamp(key, tz=self.tz)
-> 1619 return Index.get_loc(self, stamp, method, tolerance)
1620 except KeyError:
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3064 except KeyError:
-> 3065 return self._engine.get_loc(self._maybe_cast_indexer(key))
3066
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10784)()
429
--> 430 cpdef get_loc(self, object val):
431 if is_definitely_invalid_key(val):
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10521)()
459 except KeyError:
--> 460 raise KeyError(val)
461 except AttributeError:
KeyError: Timestamp('1969-12-31 19:00:00-0500', tz='US/Eastern')
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_value(self, series, key)
1559 try:
-> 1560 return com._maybe_box(self, Index.get_value(self, series, key),
1561 series, key)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\base.py in get_value(self, series, key)
3087 # return s[key]
-> 3088 return s[self.get_loc(key)]
3089 except (IndexError, ValueError):
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_loc(self, key, method, tolerance)
1620 except KeyError:
-> 1621 raise KeyError(key)
1622 except ValueError as e:
KeyError: 0
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10434)()
457 try:
--> 458 return self.mapping.get_item(val.value)
459 except KeyError:
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15389)()
957
--> 958 cpdef get_item(self, int64_t val):
959 cdef khiter_t k
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15333)()
963 else:
--> 964 raise KeyError(val)
965
KeyError: 0
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_value(self, series, key)
1569 try:
-> 1570 return self.get_value_maybe_box(series, key)
1571 except (TypeError, ValueError, KeyError):
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_value_maybe_box(self, series, key)
1580 values = self._engine.get_value(com._values_from_object(series),
-> 1581 key, tz=self.tz)
1582 return com._maybe_box(self, values, series, key)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value (pandas\_libs\index.c:4847)()
104
--> 105 cpdef get_value(self, ndarray arr, object key, object tz=None):
106 """
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value (pandas\_libs\index.c:4530)()
112
--> 113 loc = self.get_loc(key)
114 if PySlice_Check(loc) or cnp.PyArray_Check(loc):
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10521)()
459 except KeyError:
--> 460 raise KeyError(val)
461 except AttributeError:
KeyError: Timestamp('1969-12-31 19:00:00-0500', tz='US/Eastern')
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-6-ab7c8e26b0d3> in <module>()
----> 1 s2[0]
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\series.py in __getitem__(self, key)
764 key = com._apply_if_callable(key, self)
765 try:
--> 766 result = self.index.get_value(self, key)
767
768 if not is_scalar(result):
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_value(self, series, key)
1570 return self.get_value_maybe_box(series, key)
1571 except (TypeError, ValueError, KeyError):
-> 1572 raise KeyError(key)
1573
1574 def get_value_maybe_box(self, series, key):
KeyError: 0
Here's an easier-to-read stack trace of the calls for `s2[0]`, in terms of where it bombs out:
pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
--> 3063 return self._engine.get_loc(key)
pandas\core\indexes\datetimes.py in get_loc(self, key, method, tolerance)
-->1610 return Index.get_loc(self, key, method, tolerance)
pandas\core\indexes\base.py in get_value(self, series, key)
--> 3088 return s[self.get_loc(key)]
pandas\core\indexes\datetimes.py in get_value(self, series, key)
--> 1560 return com._maybe_box(self, Index.get_value(self, series, key),
1561 series, key)
pandas\core\series.py in __getitem__(self, key)
--> 766 result = self.index.get_value(self, key)
So the error here may be that the values of a `Series` containing TZ-aware values end up being a `DatetimeIndex`, and `get_loc(0)` isn't defined for an `Index`. Note that in the code above, if you remove the `.tz_localize()` part, it works fine.
Maybe when `DatetimeIndex` is based on `ExtensionArray`, this code as I wrote it could change so that `ExtensionArray` and `Index` are handled the same way.
Or maybe you know how to fix things for `DatetimeIndex`, but it's not clear to me what I should be testing to be sure that this issue is fixed.