KeyError when comparing DataFrame with tz-aware DatetimeIndex on columns with DST change · Issue #19970 · pandas-dev/pandas (original) (raw)

dr = pd.date_range('20160101', '20161130', freq='4H', tz='America/New_York')
df = pd.DataFrame({'a':np.arange(len(dr)), 'b':np.arange(len(dr))}, index=dr)
dfnov_view = df.loc['2016-11']
drnov = pd.date_range('20161101', '20161130', freq='4H', tz='America/New_York')
dfnov = pd.DataFrame({'a':np.arange(len(drnov)), 'b':np.arange(len(drnov))}, index=drnov)

df == df # works
dfnov_view == dfnov_view # works
dfnov == dfnov # works

dfnov.T == dfnov.T # works
df.T == df.T # raises KeyError on master; works on 0.22
dfnov_view.T == dfnov_view.T # raises KeyError on master; works on 0.22

KeyError stacktrace:

In [7]: dfnov_view.T == dfnov_view.T # raises KeyError

KeyError Traceback (most recent call last) C:\projects\pandas-dk\pandas_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc() 457 try: --> 458 return self.mapping.get_item(val.value) 459 except KeyError:

C:\projects\pandas-dk\pandas_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item() 933 --> 934 cpdef get_item(self, int64_t val): 935 cdef khiter_t k

C:\projects\pandas-dk\pandas_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item() 939 else: --> 940 raise KeyError(val) 941

KeyError: 1478412000000000000

During handling of the above exception, another exception occurred:

KeyError Traceback (most recent call last) C:\projects\pandas-dk\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 2635 try: -> 2636 return self._engine.get_loc(key) 2637 except KeyError:

C:\projects\pandas-dk\pandas_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc() 429 --> 430 cpdef get_loc(self, object val): 431 if is_definitely_invalid_key(val):

C:\projects\pandas-dk\pandas_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc() 459 except KeyError: --> 460 raise KeyError(val) 461 except AttributeError:

KeyError: Timestamp('2016-11-06 01:00:00-0500', tz='America/New_York')

During handling of the above exception, another exception occurred:

KeyError Traceback (most recent call last) C:\projects\pandas-dk\pandas_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc() 457 try: --> 458 return self.mapping.get_item(val.value) 459 except KeyError:

C:\projects\pandas-dk\pandas_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item() 933 --> 934 cpdef get_item(self, int64_t val): 935 cdef khiter_t k

C:\projects\pandas-dk\pandas_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item() 939 else: --> 940 raise KeyError(val) 941

KeyError: 1478412000000000000

During handling of the above exception, another exception occurred:

KeyError Traceback (most recent call last) in () ----> 1 dfnov_view.T == dfnov_view.T # raises KeyError

C:\projects\pandas-dk\pandas\core\ops.py in f(self, other) 1558 raise ValueError('Can only compare identically-labeled ' 1559 'DataFrame objects') -> 1560 return self._compare_frame(other, func, str_rep) 1561 1562 elif isinstance(other, ABCSeries):

C:\projects\pandas-dk\pandas\core\frame.py in _compare_frame(self, other, func, str_rep) 4032 return {col: func(a[col], b[col]) for col in a.columns} 4033 -> 4034 new_data = expressions.evaluate(_compare, str_rep, self, other) 4035 return self._constructor(data=new_data, index=self.index, 4036 columns=self.columns, copy=False)

C:\projects\pandas-dk\pandas\core\computation\expressions.py in evaluate(op, op_str, a, b, use_numexpr, **eval_kwargs) 203 use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) 204 if use_numexpr: --> 205 return _evaluate(op, op_str, a, b, **eval_kwargs) 206 return _evaluate_standard(op, op_str, a, b) 207

C:\projects\pandas-dk\pandas\core\computation\expressions.py in _evaluate_numexpr(op, op_str, a, b, truediv, reversed, **eval_kwargs) 118 119 if result is None: --> 120 result = _evaluate_standard(op, op_str, a, b) 121 122 return result

C:\projects\pandas-dk\pandas\core\computation\expressions.py in _evaluate_standard(op, op_str, a, b, **eval_kwargs) 63 _store_test_result(False) 64 with np.errstate(all='ignore'): ---> 65 return op(a, b) 66 67

C:\projects\pandas-dk\pandas\core\frame.py in _compare(a, b) 4030 4031 def _compare(a, b): -> 4032 return {col: func(a[col], b[col]) for col in a.columns} 4033 4034 new_data = expressions.evaluate(_compare, str_rep, self, other)

C:\projects\pandas-dk\pandas\core\frame.py in (.0) 4030 4031 def _compare(a, b): -> 4032 return {col: func(a[col], b[col]) for col in a.columns} 4033 4034 new_data = expressions.evaluate(_compare, str_rep, self, other)

C:\projects\pandas-dk\pandas\core\frame.py in getitem(self, key) 2202 return self._getitem_multilevel(key) 2203 else: -> 2204 return self._getitem_column(key) 2205 2206 def _getitem_column(self, key):

C:\projects\pandas-dk\pandas\core\frame.py in _getitem_column(self, key) 2209 # get column 2210 if self.columns.is_unique: -> 2211 return self._get_item_cache(key) 2212 2213 # duplicate columns & possible reduce dimensionality

C:\projects\pandas-dk\pandas\core\generic.py in _get_item_cache(self, item) 2193 res = cache.get(item) 2194 if res is None: -> 2195 values = self._data.get(item) 2196 res = self._box_item_values(item, values) 2197 cache[item] = res

C:\projects\pandas-dk\pandas\core\internals.py in get(self, item, fastpath) 4070 4071 if not isna(item): -> 4072 loc = self.items.get_loc(item) 4073 else: 4074 indexer = np.arange(len(self.items))[isna(self.items)]

C:\projects\pandas-dk\pandas\core\indexes\datetimes.py in get_loc(self, key, method, tolerance) 1555 # needed to localize naive datetimes 1556 key = Timestamp(key, tz=self.tz) -> 1557 return Index.get_loc(self, key, method, tolerance) 1558 1559 if isinstance(key, time):

C:\projects\pandas-dk\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 2636 return self._engine.get_loc(key) 2637 except KeyError: -> 2638 return self._engine.get_loc(self._maybe_cast_indexer(key)) 2639 2640 indexer = self.get_indexer([key], method=method, tolerance=tolerance)

C:\projects\pandas-dk\pandas_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc() 428 return algos.is_monotonic_int64(values, timelike=True) 429 --> 430 cpdef get_loc(self, object val): 431 if is_definitely_invalid_key(val): 432 raise TypeError

C:\projects\pandas-dk\pandas_libs\index.pyx in pandas._libs.index.DatetimeEngine.get_loc() 458 return self.mapping.get_item(val.value) 459 except KeyError: --> 460 raise KeyError(val) 461 except AttributeError: 462 pass

KeyError: Timestamp('2016-11-06 01:00:00-0500', tz='America/New_York')

On current master, if you have a DataFrame with a tz-aware DatetimeIndex as its columns, comparison can fail with a KeyError. Oddly, this appears to occur only when the DatetimeIndex is on the columns, not when it is on the index.

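Based on the KeyError, it appears that for some reason the column lookup (`get_loc`, per the traceback) is looking for the pre-DST-change value when it should be looking for the post-DST-change value; the key that fails is `Timestamp('2016-11-06 01:00:00-0500', tz='America/New_York')`, which falls in the ambiguous 01:00 hour of the autumn DST transition. Reproducing the bug requires at least two DST switches in the original DataFrame. DataFrames with just a single DST switch do not seem to exhibit the behavior unless they are views on a larger DataFrame with two switches.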

This is a regression on master; the behavior does not occur in 0.22.