PERF: ffill/bfill with non-numpy dtypes (#53950) · pandas-dev/pandas@4da9cb6 (original) (raw)
7 files changed
lines changed
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -342,6 +342,7 @@ Performance improvements | ||
342 | 342 | - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) |
343 | 343 | - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) |
344 | 344 | - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) |
345 | +- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`53950`) | |
345 | 346 | - Performance improvement in :meth:`Series.str.get_dummies` for pyarrow-backed strings (:issue:`53655`) |
346 | 347 | - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) |
347 | 348 | - Performance improvement in :meth:`Series.str.split` with ``expand=True`` for pyarrow-backed strings (:issue:`53585`) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -60,6 +60,10 @@ def nancorr_spearman( | ||
60 | 60 | # ---------------------------------------------------------------------- |
61 | 61 | |
62 | 62 | def validate_limit(nobs: int | None, limit=...) -> int: ... |
63 | +def get_fill_indexer( | |
64 | +mask: npt.NDArray[np.bool_], | |
65 | +limit: int | None = None, | |
66 | +) -> npt.NDArray[np.intp]: ... | |
63 | 67 | def pad( |
64 | 68 | old: np.ndarray, # ndarray[numeric_object_t] |
65 | 69 | new: np.ndarray, # ndarray[numeric_object_t] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -525,6 +525,42 @@ def validate_limit(nobs: int | None, limit=None) -> int: | ||
525 | 525 | return lim |
526 | 526 | |
527 | 527 | |
528 | +# TODO: overlap with libgroupby.group_fillna_indexer? | |
529 | +@cython.boundscheck(False) | |
530 | +@cython.wraparound(False) | |
531 | +def get_fill_indexer(const uint8_t[:] mask, limit=None): | |
532 | +""" | |
533 | + Build the ffill indexer: passing it to `take` on the array being filled yields the forward-filled result; -1 marks positions that are left unfilled. | |
534 | + """ | |
535 | + cdef: | |
536 | + ndarray[intp_t, ndim=1] indexer | |
537 | + Py_ssize_t i, N = len(mask), last_valid | |
538 | +int lim | |
539 | + | |
540 | +# fill_count is the number of consecutive NAs we have seen. | |
541 | +# Once it reaches the given limit, we stop padding. | |
542 | +int fill_count = 0 | |
543 | + | |
544 | + lim = validate_limit(N, limit) | |
545 | + indexer = np.empty(N, dtype=np.intp) | |
546 | + | |
547 | + last_valid = -1 # haven't yet seen anything non-NA | |
548 | + | |
549 | +for i in range(N): | |
550 | +if not mask[i]: | |
551 | + indexer[i] = i | |
552 | + last_valid = i | |
553 | + fill_count = 0 | |
554 | +else: | |
555 | +if fill_count < lim: | |
556 | + indexer[i] = last_valid | |
557 | +else: | |
558 | + indexer[i] = -1 | |
559 | + fill_count += 1 | |
560 | + | |
561 | +return indexer | |
562 | + | |
563 | + | |
528 | 564 | @cython.boundscheck(False) |
529 | 565 | @cython.wraparound(False) |
530 | 566 | def pad( |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -67,8 +67,6 @@ | ||
67 | 67 | |
68 | 68 | from pandas.core.dtypes.dtypes import ArrowDtype |
69 | 69 | |
70 | -from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning | |
71 | - | |
72 | 70 | ARROW_CMP_FUNCS = { |
73 | 71 | "eq": pc.equal, |
74 | 72 | "ne": pc.not_equal, |
@@ -918,7 +916,6 @@ def fillna( | ||
918 | 916 | return super().fillna(value=value, method=method, limit=limit) |
919 | 917 | |
920 | 918 | if method is not None: |
921 | -fallback_performancewarning() | |
922 | 919 | return super().fillna(value=value, method=method, limit=limit) |
923 | 920 | |
924 | 921 | if isinstance(value, (np.ndarray, ExtensionArray)): |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -23,7 +23,10 @@ | ||
23 | 23 | |
24 | 24 | import numpy as np |
25 | 25 | |
26 | -from pandas._libs import lib | |
26 | +from pandas._libs import ( | |
27 | +algos as libalgos, | |
28 | +lib, | |
29 | +) | |
27 | 30 | from pandas.compat import set_function_name |
28 | 31 | from pandas.compat.numpy import function as nv |
29 | 32 | from pandas.errors import AbstractMethodError |
@@ -824,10 +827,16 @@ def fillna( | ||
824 | 827 | |
825 | 828 | if mask.any(): |
826 | 829 | if method is not None: |
827 | -func = missing.get_fill_func(method) | |
828 | -npvalues = self.astype(object) | |
829 | -func(npvalues, limit=limit, mask=mask) | |
830 | -new_values = self._from_sequence(npvalues, dtype=self.dtype) | |
830 | +meth = missing.clean_fill_method(method) | |
831 | + | |
832 | +npmask = np.asarray(mask) | |
833 | +if meth == "pad": | |
834 | +indexer = libalgos.get_fill_indexer(npmask, limit=limit) | |
835 | +return self.take(indexer, allow_fill=True) | |
836 | +else: | |
837 | +# i.e. meth == "backfill" | |
838 | +indexer = libalgos.get_fill_indexer(npmask[::-1], limit=limit)[::-1] | |
839 | +return self[::-1].take(indexer, allow_fill=True) | |
831 | 840 | else: |
832 | 841 | # fill with value |
833 | 842 | new_values = self.copy() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -38,7 +38,6 @@ | ||
38 | 38 | pa_version_under9p0, |
39 | 39 | pa_version_under11p0, |
40 | 40 | ) |
41 | -from pandas.errors import PerformanceWarning | |
42 | 41 | |
43 | 42 | from pandas.core.dtypes.dtypes import ( |
44 | 43 | ArrowDtype, |
@@ -698,12 +697,6 @@ def test_fillna_no_op_returns_copy(self, data): | ||
698 | 697 | assert result is not data |
699 | 698 | self.assert_extension_array_equal(result, data) |
700 | 699 | |
701 | -def test_fillna_series_method(self, data_missing, fillna_method): | |
702 | -with tm.maybe_produces_warning( | |
703 | -PerformanceWarning, fillna_method is not None, check_stacklevel=False | |
704 | - ): | |
705 | -super().test_fillna_series_method(data_missing, fillna_method) | |
706 | - | |
707 | 700 | |
708 | 701 | class TestBasePrinting(base.BasePrintingTests): |
709 | 702 | pass |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -18,10 +18,7 @@ | ||
18 | 18 | import numpy as np |
19 | 19 | import pytest |
20 | 20 | |
21 | -from pandas.errors import PerformanceWarning | |
22 | - | |
23 | 21 | import pandas as pd |
24 | -import pandas._testing as tm | |
25 | 22 | from pandas.api.types import is_string_dtype |
26 | 23 | from pandas.core.arrays import ArrowStringArray |
27 | 24 | from pandas.core.arrays.string_ import StringDtype |
@@ -169,14 +166,6 @@ def test_fillna_no_op_returns_copy(self, data): | ||
169 | 166 | assert result is not data |
170 | 167 | self.assert_extension_array_equal(result, data) |
171 | 168 | |
172 | -def test_fillna_series_method(self, data_missing, fillna_method): | |
173 | -with tm.maybe_produces_warning( | |
174 | -PerformanceWarning, | |
175 | -fillna_method is not None and data_missing.dtype.storage == "pyarrow", | |
176 | -check_stacklevel=False, | |
177 | - ): | |
178 | -super().test_fillna_series_method(data_missing, fillna_method) | |
179 | - | |
180 | 169 | |
181 | 170 | class TestNoReduce(base.BaseNoReduceTests): |
182 | 171 | @pytest.mark.parametrize("skipna", [True, False]) |