PERF: ffill/bfill with non-numpy dtypes (#53950) · pandas-dev/pandas@4da9cb6

7 files changed

lines changed

Original file line number Diff line number Diff line change
@@ -342,6 +342,7 @@ Performance improvements
342 342 - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`)
343 343 - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`)
344 344 - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`)
345 +- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`53950`)
345 346 - Performance improvement in :meth:`Series.str.get_dummies` for pyarrow-backed strings (:issue:`53655`)
346 347 - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`)
347 348 - Performance improvement in :meth:`Series.str.split` with ``expand=True`` for pyarrow-backed strings (:issue:`53585`)
Original file line number Diff line number Diff line change
@@ -60,6 +60,10 @@ def nancorr_spearman(
60 60 # ----------------------------------------------------------------------
61 61
62 62 def validate_limit(nobs: int | None, limit=...) -> int: ...
# Type stub for the compiled ``get_fill_indexer``: takes a boolean mask and an
# optional fill limit and returns an intp indexer array.
def get_fill_indexer(
    mask: npt.NDArray[np.bool_],
    limit: int | None = None,
) -> npt.NDArray[np.intp]: ...
63 67 def pad(
64 68 old: np.ndarray, # ndarray[numeric_object_t]
65 69 new: np.ndarray, # ndarray[numeric_object_t]
Original file line number Diff line number Diff line change
@@ -525,6 +525,42 @@ def validate_limit(nobs: int | None, limit=None) -> int:
525 525 return lim
526 526
527 527
# TODO: overlap with libgroupby.group_fillna_indexer?
@cython.boundscheck(False)
@cython.wraparound(False)
def get_fill_indexer(const uint8_t[:] mask, limit=None):
    """
    Find an indexer to use for ffill to `take` on the array being filled.

    Parameters
    ----------
    mask : const uint8_t[:]
        One entry per element of the array being filled; a nonzero entry
        marks that position as NA.
    limit : int or None, default None
        Passed to ``validate_limit`` together with ``len(mask)``. The
        validated value caps how many consecutive NA positions are pointed
        at the preceding non-NA position.

    Returns
    -------
    ndarray[intp_t, ndim=1]
        Same length as ``mask``. A non-NA position maps to itself. An NA
        position maps to the most recent non-NA position, or to -1 when no
        non-NA position has been seen yet or when the run of consecutive
        NAs has already reached the limit.
    """
    cdef:
        ndarray[intp_t, ndim=1] indexer
        Py_ssize_t i, N = len(mask), last_valid

        # Py_ssize_t rather than C int: a validated limit wider than C int
        # would raise OverflowError when converted to ``int``.
        # NOTE(review): presumably validate_limit returns nobs when limit is
        # None, so this matters for N > INT_MAX -- confirm.
        Py_ssize_t lim

        # fill_count is the number of consecutive NAs we have seen.
        # If it exceeds the given limit, we stop padding.
        # It can grow up to N, so it uses the same width as N.
        Py_ssize_t fill_count = 0

    lim = validate_limit(N, limit)
    indexer = np.empty(N, dtype=np.intp)

    last_valid = -1  # haven't yet seen anything non-NA

    for i in range(N):
        if not mask[i]:
            indexer[i] = i
            last_valid = i
            fill_count = 0
        else:
            if fill_count < lim:
                # last_valid is still -1 for leading NAs, which leaves
                # those positions unfilled.
                indexer[i] = last_valid
            else:
                indexer[i] = -1
            fill_count += 1

    return indexer
562 +
563 +
528 564 @cython.boundscheck(False)
529 565 @cython.wraparound(False)
530 566 def pad(
Original file line number Diff line number Diff line change
@@ -67,8 +67,6 @@
67 67
68 68 from pandas.core.dtypes.dtypes import ArrowDtype
69 69
70 -from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
71 -
72 70 ARROW_CMP_FUNCS = {
73 71 "eq": pc.equal,
74 72 "ne": pc.not_equal,
@@ -918,7 +916,6 @@ def fillna(
918 916 return super().fillna(value=value, method=method, limit=limit)
919 917
920 918 if method is not None:
921 -fallback_performancewarning()
922 919 return super().fillna(value=value, method=method, limit=limit)
923 920
924 921 if isinstance(value, (np.ndarray, ExtensionArray)):
Original file line number Diff line number Diff line change
@@ -23,7 +23,10 @@
23 23
24 24 import numpy as np
25 25
26 -from pandas._libs import lib
26 +from pandas._libs import (
27 +algos as libalgos,
28 +lib,
29 +)
27 30 from pandas.compat import set_function_name
28 31 from pandas.compat.numpy import function as nv
29 32 from pandas.errors import AbstractMethodError
@@ -824,10 +827,16 @@ def fillna(
824 827
825 828 if mask.any():
826 829 if method is not None:
827 -func = missing.get_fill_func(method)
828 -npvalues = self.astype(object)
829 -func(npvalues, limit=limit, mask=mask)
830 -new_values = self._from_sequence(npvalues, dtype=self.dtype)
830 +meth = missing.clean_fill_method(method)
831 +
832 +npmask = np.asarray(mask)
833 +if meth == "pad":
834 +indexer = libalgos.get_fill_indexer(npmask, limit=limit)
835 +return self.take(indexer, allow_fill=True)
836 +else:
837 +# i.e. meth == "backfill"
838 +indexer = libalgos.get_fill_indexer(npmask[::-1], limit=limit)[::-1]
839 +return self[::-1].take(indexer, allow_fill=True)
831 840 else:
832 841 # fill with value
833 842 new_values = self.copy()
Original file line number Diff line number Diff line change
@@ -38,7 +38,6 @@
38 38 pa_version_under9p0,
39 39 pa_version_under11p0,
40 40 )
41 -from pandas.errors import PerformanceWarning
42 41
43 42 from pandas.core.dtypes.dtypes import (
44 43 ArrowDtype,
@@ -698,12 +697,6 @@ def test_fillna_no_op_returns_copy(self, data):
698 697 assert result is not data
699 698 self.assert_extension_array_equal(result, data)
700 699
701 -def test_fillna_series_method(self, data_missing, fillna_method):
702 -with tm.maybe_produces_warning(
703 -PerformanceWarning, fillna_method is not None, check_stacklevel=False
704 - ):
705 -super().test_fillna_series_method(data_missing, fillna_method)
706 -
707 700
708 701 class TestBasePrinting(base.BasePrintingTests):
709 702 pass
Original file line number Diff line number Diff line change
@@ -18,10 +18,7 @@
18 18 import numpy as np
19 19 import pytest
20 20
21 -from pandas.errors import PerformanceWarning
22 -
23 21 import pandas as pd
24 -import pandas._testing as tm
25 22 from pandas.api.types import is_string_dtype
26 23 from pandas.core.arrays import ArrowStringArray
27 24 from pandas.core.arrays.string_ import StringDtype
@@ -169,14 +166,6 @@ def test_fillna_no_op_returns_copy(self, data):
169 166 assert result is not data
170 167 self.assert_extension_array_equal(result, data)
171 168
172 -def test_fillna_series_method(self, data_missing, fillna_method):
173 -with tm.maybe_produces_warning(
174 -PerformanceWarning,
175 -fillna_method is not None and data_missing.dtype.storage == "pyarrow",
176 -check_stacklevel=False,
177 - ):
178 -super().test_fillna_series_method(data_missing, fillna_method)
179 -
180 169
181 170 class TestNoReduce(base.BaseNoReduceTests):
182 171 @pytest.mark.parametrize("skipna", [True, False])