ENH: DTI/DTA.astype support non-nano (#47579) · pandas-dev/pandas@67e8c4c (original) (raw)

12 files changed

lines changed

Original file line number Diff line number Diff line change
@@ -275,7 +275,9 @@ Other enhancements
275 275 - :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, and :class:`.IndexingError` are now exposed in ``pandas.errors`` (:issue:`27656`)
276 276 - Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`)
277 277 - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files)
278 +- :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`)
278 279 - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`)
280 +-
279 281
280 282 .. ---------------------------------------------------------------------------
281 283 .. _whatsnew_150.notable_bug_fixes:
Original file line number Diff line number Diff line change
@@ -30,12 +30,14 @@
30 30 "get_unit_from_dtype",
31 31 "periods_per_day",
32 32 "periods_per_second",
33 +"is_supported_unit",
33 34 ]
34 35
35 36 from pandas._libs.tslibs import dtypes
36 37 from pandas._libs.tslibs.conversion import localize_pydatetime
37 38 from pandas._libs.tslibs.dtypes import (
38 39 Resolution,
40 +is_supported_unit,
39 41 periods_per_day,
40 42 periods_per_second,
41 43 )
Original file line number Diff line number Diff line change
@@ -7,6 +7,7 @@ _period_code_map: dict[str, int]
7 7
8 8 def periods_per_day(reso: int) -> int: ...
9 9 def periods_per_second(reso: int) -> int: ...
10 +def is_supported_unit(reso: int) -> bool: ...
10 11
11 12 class PeriodDtypeBase:
12 13 _dtype_code: int # PeriodDtypeCode
Original file line number Diff line number Diff line change
@@ -277,6 +277,15 @@ class NpyDatetimeUnit(Enum):
277 277 NPY_FR_GENERIC = NPY_DATETIMEUNIT.NPY_FR_GENERIC
278 278
279 279
def is_supported_unit(NPY_DATETIMEUNIT reso):
    """
    Report whether ``reso`` is one of the four resolutions accepted here.

    Returns True when ``reso`` is NPY_FR_ns, NPY_FR_us, NPY_FR_ms or
    NPY_FR_s, and False for every other NPY_DATETIMEUNIT value.
    """
    if reso == NPY_DATETIMEUNIT.NPY_FR_ns:
        return True
    if reso == NPY_DATETIMEUNIT.NPY_FR_us:
        return True
    if reso == NPY_DATETIMEUNIT.NPY_FR_ms:
        return True
    return reso == NPY_DATETIMEUNIT.NPY_FR_s
287 +
288 +
280 289 cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
281 290 if unit == NPY_DATETIMEUNIT.NPY_FR_ns or unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
282 291 # generic -> default to nanoseconds
Original file line number Diff line number Diff line change
@@ -31,6 +31,7 @@
31 31 get_unit_from_dtype,
32 32 ints_to_pydatetime,
33 33 is_date_array_normalized,
34 +is_supported_unit,
34 35 is_unitless,
35 36 normalize_i8_timestamps,
36 37 timezones,
@@ -603,12 +604,26 @@ def astype(self, dtype, copy: bool = True):
603 604 return self.copy()
604 605 return self
605 606
607 +elif (
608 +self.tz is None
609 +and is_datetime64_dtype(dtype)
610 +and not is_unitless(dtype)
611 +and is_supported_unit(get_unit_from_dtype(dtype))
612 + ):
613 +# unit conversion e.g. datetime64[s]
614 +res_values = astype_overflowsafe(self._ndarray, dtype, copy=True)
615 +return type(self)._simple_new(res_values, dtype=res_values.dtype)
616 +# TODO: preserve freq?
617 +
606 618 elif is_datetime64_ns_dtype(dtype):
607 619 return astype_dt64_to_dt64tz(self, dtype, copy, via_utc=False)
608 620
609 -elif self.tz is None and is_datetime64_dtype(dtype) and dtype != self.dtype:
610 -# unit conversion e.g. datetime64[s]
611 -return self._ndarray.astype(dtype)
621 +elif self.tz is not None and isinstance(dtype, DatetimeTZDtype):
622 +# tzaware unit conversion e.g. datetime64[s, UTC]
623 +np_dtype = np.dtype(dtype.str)
624 +res_values = astype_overflowsafe(self._ndarray, np_dtype, copy=copy)
625 +return type(self)._simple_new(res_values, dtype=dtype)
626 +# TODO: preserve freq?
612 627
613 628 elif is_period_dtype(dtype):
614 629 return self.to_period(freq=dtype.freq)
Original file line number Diff line number Diff line change
@@ -15,6 +15,7 @@
15 15 import numpy as np
16 16
17 17 from pandas._libs import lib
18 +from pandas._libs.tslibs import is_unitless
18 19 from pandas._libs.tslibs.timedeltas import array_to_timedelta64
19 20 from pandas._typing import (
20 21 ArrayLike,
@@ -280,6 +281,20 @@ def astype_array_safe(
280 281 # Ensure we don't end up with a PandasArray
281 282 dtype = dtype.numpy_dtype
282 283
284 +if (
285 +is_datetime64_dtype(values.dtype)
286 +# need to do np.dtype check instead of is_datetime64_dtype
287 +# otherwise pyright complains
288 +and isinstance(dtype, np.dtype)
289 +and dtype.kind == "M"
290 +and not is_unitless(dtype)
291 +and not is_dtype_equal(dtype, values.dtype)
292 + ):
293 +# unit conversion, we would re-cast to nanosecond, so this is
294 +# effectively just a copy (regardless of copy kwd)
295 +# TODO(2.0): remove special-case
296 +return values.copy()
297 +
283 298 try:
284 299 new_values = astype_array(values, dtype, copy=copy)
285 300 except (ValueError, TypeError):
Original file line number Diff line number Diff line change
@@ -966,7 +966,9 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool:
966 966 tipo = get_dtype(arr_or_dtype.dtype)
967 967 else:
968 968 return False
969 -return tipo == DT64NS_DTYPE or getattr(tipo, "base", None) == DT64NS_DTYPE
969 +return tipo == DT64NS_DTYPE or (
970 +isinstance(tipo, DatetimeTZDtype) and tipo._unit == "ns"
971 + )
970 972
971 973
972 974 def is_timedelta64_ns_dtype(arr_or_dtype) -> bool:
Original file line number Diff line number Diff line change
@@ -1064,16 +1064,6 @@ def astype(self, dtype, copy: bool = True):
1064 1064 # Ensure that self.astype(self.dtype) is self
1065 1065 return self.copy() if copy else self
1066 1066
1067 -if (
1068 -self.dtype == np.dtype("M8[ns]")
1069 -and isinstance(dtype, np.dtype)
1070 -and dtype.kind == "M"
1071 -and dtype != np.dtype("M8[ns]")
1072 - ):
1073 -# For now DatetimeArray supports this by unwrapping ndarray,
1074 -# but DatetimeIndex doesn't
1075 -raise TypeError(f"Cannot cast {type(self).__name__} to dtype")
1076 -
1077 1067 values = self._data
1078 1068 if isinstance(values, ExtensionArray):
1079 1069 with rewrite_exception(type(values).__name__, type(self).__name__):
Original file line number Diff line number Diff line change
@@ -48,6 +48,7 @@
48 48 from pandas.core.dtypes.common import (
49 49 is_datetime64_dtype,
50 50 is_datetime64tz_dtype,
51 +is_dtype_equal,
51 52 is_scalar,
52 53 )
53 54 from pandas.core.dtypes.missing import is_valid_na_for_dtype
@@ -338,6 +339,18 @@ def __new__(
338 339 if copy:
339 340 data = data.copy()
340 341 return cls._simple_new(data, name=name)
342 +elif (
343 +isinstance(data, DatetimeArray)
344 +and freq is lib.no_default
345 +and tz is None
346 +and is_dtype_equal(data.dtype, dtype)
347 + ):
348 +# Reached via Index.__new__ when we call .astype
349 +# TODO(2.0): special casing can be removed once _from_sequence_not_strict
350 +# no longer chokes on non-nano
351 +if copy:
352 +data = data.copy()
353 +return cls._simple_new(data, name=name)
341 354
342 355 dtarr = DatetimeArray._from_sequence_not_strict(
343 356 data,
Original file line number Diff line number Diff line change
@@ -207,6 +207,36 @@ def test_cmp_dt64_arraylike_tznaive(self, comparison_op):
207 207
208 208
209 209 class TestDatetimeArray:
210 +def test_astype_non_nano_tznaive(self):
211 +dti = pd.date_range("2016-01-01", periods=3)
212 +
213 +res = dti.astype("M8[s]")
214 +assert res.dtype == "M8[s]"
215 +
216 +dta = dti._data
217 +res = dta.astype("M8[s]")
218 +assert res.dtype == "M8[s]"
219 +assert isinstance(res, pd.core.arrays.DatetimeArray) # used to be ndarray
220 +
221 +def test_astype_non_nano_tzaware(self):
222 +dti = pd.date_range("2016-01-01", periods=3, tz="UTC")
223 +
224 +res = dti.astype("M8[s, US/Pacific]")
225 +assert res.dtype == "M8[s, US/Pacific]"
226 +
227 +dta = dti._data
228 +res = dta.astype("M8[s, US/Pacific]")
229 +assert res.dtype == "M8[s, US/Pacific]"
230 +
231 +# from non-nano to non-nano, preserving reso
232 +res2 = res.astype("M8[s, UTC]")
233 +assert res2.dtype == "M8[s, UTC]"
234 +assert not tm.shares_memory(res2, res)
235 +
236 +res3 = res.astype("M8[s, UTC]", copy=False)
237 +assert res2.dtype == "M8[s, UTC]"
238 +assert tm.shares_memory(res3, res)
239 +
210 240 def test_astype_to_same(self):
211 241 arr = DatetimeArray._from_sequence(
212 242 ["2000"], dtype=DatetimeTZDtype(tz="US/Central")
Original file line number Diff line number Diff line change
@@ -474,6 +474,9 @@ def test_is_datetime64_ns_dtype():
474 474 pd.DatetimeIndex([1, 2, 3], dtype=np.dtype("datetime64[ns]"))
475 475 )
476 476
477 +# non-nano dt64tz
478 +assert not com.is_datetime64_ns_dtype(DatetimeTZDtype("us", "US/Eastern"))
479 +
477 480
478 481 def test_is_timedelta64_ns_dtype():
479 482 assert not com.is_timedelta64_ns_dtype(np.dtype("m8[ps]"))
Original file line number Diff line number Diff line change
@@ -55,6 +55,7 @@ def test_namespace():
55 55 "get_unit_from_dtype",
56 56 "periods_per_day",
57 57 "periods_per_second",
58 +"is_supported_unit",
58 59 ]
59 60
60 61 expected = set(submodules + api)