BUG: Fix Series.get() for ExtensionArray and Categorical by Dr-Irv · Pull Request #20885 · pandas-dev/pandas (original) (raw)

@jreback The tests that fail are

I've investigated the first one, and here is what I have found out. (I think the second one is the same problem.) If we use the approach you propose, calling get_loc(key) on the Index, the example below fails on the expression s2[0].

In [1]: import pandas as pd

In [2]: def makeDateIndex(k=10, freq='B', name=None):
   ...:     dt = pd.datetime(2000, 1, 1)
   ...:     dr = pd.bdate_range(dt, periods=k, freq=freq, name=name)
   ...:     return pd.DatetimeIndex(dr, name=name)
   ...:
   ...: dt_tz_index = makeDateIndex(10, name='a').tz_localize(tz='US/Eastern')
   ...: s1 = pd.Series([i for i in range(len(dt_tz_index))], index=dt_tz_index)
   ...:
   ...: s2 = pd.Series(dt_tz_index, index=dt_tz_index)
   ...:

In [3]: s1
Out[3]:
a
2000-01-03 00:00:00-05:00    0
2000-01-04 00:00:00-05:00    1
2000-01-05 00:00:00-05:00    2
2000-01-06 00:00:00-05:00    3
2000-01-07 00:00:00-05:00    4
2000-01-10 00:00:00-05:00    5
2000-01-11 00:00:00-05:00    6
2000-01-12 00:00:00-05:00    7
2000-01-13 00:00:00-05:00    8
2000-01-14 00:00:00-05:00    9
Freq: B, dtype: int64

In [4]: s2
Out[4]:
a
2000-01-03 00:00:00-05:00   2000-01-03 00:00:00-05:00
2000-01-04 00:00:00-05:00   2000-01-04 00:00:00-05:00
2000-01-05 00:00:00-05:00   2000-01-05 00:00:00-05:00
2000-01-06 00:00:00-05:00   2000-01-06 00:00:00-05:00
2000-01-07 00:00:00-05:00   2000-01-07 00:00:00-05:00
2000-01-10 00:00:00-05:00   2000-01-10 00:00:00-05:00
2000-01-11 00:00:00-05:00   2000-01-11 00:00:00-05:00
2000-01-12 00:00:00-05:00   2000-01-12 00:00:00-05:00
2000-01-13 00:00:00-05:00   2000-01-13 00:00:00-05:00
2000-01-14 00:00:00-05:00   2000-01-14 00:00:00-05:00
Freq: B, Name: a, dtype: datetime64[ns, US/Eastern]

In [5]: s1[0]
Out[5]: 0

In [6]: s2[0]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   3062             try:
-> 3063                 return self._engine.get_loc(key)
   3064             except KeyError:

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10784)()
    429
--> 430     cpdef get_loc(self, object val):
    431         if is_definitely_invalid_key(val):

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10616)()
    465             val = maybe_datetimelike_to_i8(val)
--> 466             return self.mapping.get_item(val)
    467         except (TypeError, ValueError):

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15389)()
    957
--> 958     cpdef get_item(self, int64_t val):
    959         cdef khiter_t k

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15333)()
    963         else:
--> 964             raise KeyError(val)
    965

KeyError: 0

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_loc(self, key, method, tolerance)
   1609         try:
-> 1610             return Index.get_loc(self, key, method, tolerance)
   1611         except (KeyError, ValueError, TypeError):

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   3064             except KeyError:
-> 3065                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   3066

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10784)()
    429
--> 430     cpdef get_loc(self, object val):
    431         if is_definitely_invalid_key(val):

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10616)()
    465             val = maybe_datetimelike_to_i8(val)
--> 466             return self.mapping.get_item(val)
    467         except (TypeError, ValueError):

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15389)()
    957
--> 958     cpdef get_item(self, int64_t val):
    959         cdef khiter_t k

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15333)()
    963         else:
--> 964             raise KeyError(val)
    965

KeyError: 0

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10434)()
    457         try:
--> 458             return self.mapping.get_item(val.value)
    459         except KeyError:

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15389)()
    957
--> 958     cpdef get_item(self, int64_t val):
    959         cdef khiter_t k

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15333)()
    963         else:
--> 964             raise KeyError(val)
    965

KeyError: 0

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   3062             try:
-> 3063                 return self._engine.get_loc(key)
   3064             except KeyError:

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10784)()
    429
--> 430     cpdef get_loc(self, object val):
    431         if is_definitely_invalid_key(val):

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10521)()
    459         except KeyError:
--> 460             raise KeyError(val)
    461         except AttributeError:

KeyError: Timestamp('1969-12-31 19:00:00-0500', tz='US/Eastern')

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10434)()
    457         try:
--> 458             return self.mapping.get_item(val.value)
    459         except KeyError:

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15389)()
    957
--> 958     cpdef get_item(self, int64_t val):
    959         cdef khiter_t k

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15333)()
    963         else:
--> 964             raise KeyError(val)
    965

KeyError: 0

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_loc(self, key, method, tolerance)
   1618                 stamp = Timestamp(key, tz=self.tz)
-> 1619                 return Index.get_loc(self, stamp, method, tolerance)
   1620             except KeyError:

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   3064             except KeyError:
-> 3065                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   3066

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10784)()
    429
--> 430     cpdef get_loc(self, object val):
    431         if is_definitely_invalid_key(val):

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10521)()
    459         except KeyError:
--> 460             raise KeyError(val)
    461         except AttributeError:

KeyError: Timestamp('1969-12-31 19:00:00-0500', tz='US/Eastern')

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_value(self, series, key)
   1559         try:
-> 1560             return com._maybe_box(self, Index.get_value(self, series, key),
   1561                                   series, key)

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\base.py in get_value(self, series, key)
   3087 #                    return s[key]
-> 3088                     return s[self.get_loc(key)]
   3089                 except (IndexError, ValueError):

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_loc(self, key, method, tolerance)
   1620             except KeyError:
-> 1621                 raise KeyError(key)
   1622             except ValueError as e:

KeyError: 0

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10434)()
    457         try:
--> 458             return self.mapping.get_item(val.value)
    459         except KeyError:

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15389)()
    957
--> 958     cpdef get_item(self, int64_t val):
    959         cdef khiter_t k

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas\_libs\hashtable.c:15333)()
    963         else:
--> 964             raise KeyError(val)
    965

KeyError: 0

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_value(self, series, key)
   1569             try:
-> 1570                 return self.get_value_maybe_box(series, key)
   1571             except (TypeError, ValueError, KeyError):

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_value_maybe_box(self, series, key)
   1580         values = self._engine.get_value(com._values_from_object(series),

-> 1581                                         key, tz=self.tz)
   1582         return com._maybe_box(self, values, series, key)

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value (pandas\_libs\index.c:4847)()
    104
--> 105     cpdef get_value(self, ndarray arr, object key, object tz=None):
    106         """

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value (pandas\_libs\index.c:4530)()
    112
--> 113         loc = self.get_loc(key)
    114         if PySlice_Check(loc) or cnp.PyArray_Check(loc):

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc (pandas\_libs\index.c:10521)()
    459         except KeyError:
--> 460             raise KeyError(val)
    461         except AttributeError:

KeyError: Timestamp('1969-12-31 19:00:00-0500', tz='US/Eastern')

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-6-ab7c8e26b0d3> in <module>()
----> 1 s2[0]

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\series.py in __getitem__(self, key)
    764         key = com._apply_if_callable(key, self)
    765         try:
--> 766             result = self.index.get_value(self, key)
    767
    768             if not is_scalar(result):

C:\EclipseWorkspaces\LiClipseWorkspace\pandas-dev\pandas36\pandas\core\indexes\datetimes.py in get_value(self, series, key)
   1570                 return self.get_value_maybe_box(series, key)
   1571             except (TypeError, ValueError, KeyError):
-> 1572                 raise KeyError(key)
   1573
   1574     def get_value_maybe_box(self, series, key):

KeyError: 0

Here's an easier-to-read stack trace of calls for s2[0] in terms of where it bombs out:

pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
--> 3063                 return self._engine.get_loc(key)
pandas\core\indexes\datetimes.py in get_loc(self, key, method, tolerance)
-->1610             return Index.get_loc(self, key, method, tolerance)
pandas\core\indexes\base.py in get_value(self, series, key)
--> 3088                     return s[self.get_loc(key)]
pandas\core\indexes\datetimes.py in get_value(self, series, key)
--> 1560             return com._maybe_box(self, Index.get_value(self, series, key),
    1561                                   series, key)
pandas\core\series.py in __getitem__(self, key)
--> 766             result = self.index.get_value(self, key)

So the error here may be that the values of a Series containing TZ-aware values end up being a DatetimeIndex, and get_loc(0) isn't defined for an Index. Note that in the code above, if you remove the .tz_localize() part, it works fine.

Maybe once DatetimeIndex is based on ExtensionArray, this code as I wrote it could change so that ExtensionArray and Index are handled the same way.

Or maybe you know how to fix things for DatetimeIndex, but it's not clear to me what I should be testing to be sure that this issue is fixed.