FIX: fix interpolate with kwarg limit area and limit direction using pad or bfill by cchwala · Pull Request #31048 · pandas-dev/pandas (original) (raw)

Expand Up

@@ -5,6 +5,7 @@

import numpy as np

from pandas._libs import algos, lib

from pandas._typing import ArrayLike, Dtype, Hashable, List, Optional

from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.cast import infer_dtype_from_array

Expand Down Expand Up

@@ -222,40 +223,12 @@ def interpolate_1d(

# default limit is unlimited GH #16282

limit = algos._validate_limit(nobs=None, limit=limit)

# These are sets of index pointers to invalid values... i.e. {0, 1, etc...

all_nans = set(np.flatnonzero(invalid))

start_nans = set(range(find_valid_index(yvalues, "first")))

end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid)))

mid_nans = all_nans - start_nans - end_nans

# Like the sets above, preserve_nans contains indices of invalid values,

# but in this case, it is the final set of indices that need to be

# preserved as NaN after the interpolation.

# For example if limit_direction='forward' then preserve_nans will

# contain indices of NaNs at the beginning of the series, and NaNs that

# are more than'limit' away from the prior non-NaN.

# set preserve_nans based on direction using _interp_limit

if limit_direction == "forward":

preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))

elif limit_direction == "backward":

preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))

else:

# both directions... just use _interp_limit

preserve_nans = set(_interp_limit(invalid, limit, limit))

# if limit_area is set, add either mid or outside indices

# to preserve_nans GH #16284

if limit_area == "inside":

# preserve NaNs on the outside

preserve_nans |= start_nans | end_nans

elif limit_area == "outside":

# preserve NaNs on the inside

preserve_nans |= mid_nans

# sort preserve_nans and covert to list

preserve_nans = sorted(preserve_nans)

preserve_nans = _derive_indices_of_nans_to_preserve(

yvalues=yvalues,

limit=limit,

limit_area=limit_area,

limit_direction=limit_direction,

)

xvalues = getattr(xvalues, "values", xvalues)

yvalues = getattr(yvalues, "values", yvalues)

Expand Down Expand Up

@@ -314,6 +287,73 @@ def interpolate_1d(

return result

def _derive_indices_of_nans_to_preserve(

yvalues: ArrayLike,

limit: Optional[int] = None,

limit_area: Optional[str] = None,

limit_direction: Optional[str] = None,

) -> List[int]:

"""

Derive the indices of NaNs that shall be preserved after interpolation

This function is called by `interpolate_1d` and takes the arguments with

the same name from there. In `interpolate_1d`, after performing the

interpolation, the list of indices of NaNs to preserve is used to put

NaNs in the desired locations.

Parameters

----------

yvalues: ArrayLike

1-d array of values of the initial Series or DataFrame

limit: int

limit_area: str

limit_direction: str

Returns

-------

preserve_nans: list of int

Set of index pointers to where NaNs should be preserved in `yvalues`

"""

invalid = isna(yvalues)

valid = ~invalid

# These are sets of index pointers to invalid values... i.e. {0, 1, etc...

all_nans = set(np.flatnonzero(invalid))

start_nans = set(range(find_valid_index(yvalues, "first")))

end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid)))

mid_nans = all_nans - start_nans - end_nans

# Like the sets above, preserve_nans contains indices of invalid values,

# but in this case, it is the final set of indices that need to be

# preserved as NaN after the interpolation.

# For example if limit_direction='forward' then preserve_nans will

# contain indices of NaNs at the beginning of the series, and NaNs that

# are more than'limit' away from the prior non-NaN.

# set preserve_nans based on direction using _interp_limit

if limit_direction == "forward":

preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))

elif limit_direction == "backward":

preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))

else:

# both directions... just use _interp_limit

preserve_nans = set(_interp_limit(invalid, limit, limit))

# if limit_area is set, add either mid or outside indices

# to preserve_nans GH #16284

if limit_area == "inside":

# preserve NaNs on the outside

preserve_nans |= start_nans | end_nans

elif limit_area == "outside":

# preserve NaNs on the inside

preserve_nans |= mid_nans

# sort preserve_nans and covert to list

preserve_nans_sorted = sorted(preserve_nans)

return preserve_nans_sorted

def _interpolate_scipy_wrapper(

x, y, new_x, method, fill_value=None, bounds_error=False, order=None, **kwargs

Expand Down Expand Up

@@ -478,45 +518,127 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0):

return [P(x, nu) for nu in der]

def interpolate_2d(

values, method="pad", axis=0, limit=None, fill_value=None, dtype=None

def interpolate_1d_fill(

values,

method: str = "pad",

limit: Optional[int] = None,

limit_area: Optional[str] = None,

fill_value: Optional[Hashable] = None,

dtype: Optional[Dtype] = None,

"""

Perform an actual interpolation of values, values will be make 2-d if

needed fills inplace, returns the result.

This is a 1D-versoin of `interpolate_2d`, which is used for methods `pad`

and `backfill` when interpolating. This 1D-version is necessary to be

able to handle kwarg `limit_area` via the function

` _derive_indices_of_nans_to_preserve`. It is used the same way as the

1D-interpolation functions which are based on scipy-interpolation, i.e.

via np.apply_along_axis.

"""

if method == "pad":

limit_direction = "forward"

elif method == "backfill":

limit_direction = "backward"

else:

raise ValueError("`method` must be either 'pad' or 'backfill'.")

orig_values = values

transf = (lambda x: x) if axis == 0 else (lambda x: x.T)

yvalues = values

# reshape a 1 dim if needed

ndim = values.ndim

if values.ndim == 1:

if axis != 0: # pragma: no cover

raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0")

values = values.reshape(tuple((1,) + values.shape))

if values.ndim > 1:

raise AssertionError("This only works with 1D data.")

if fill_value is None:

mask = None

else: # todo create faster fill func without masking

mask = mask_missing(transf(values), fill_value)

mask = mask_missing(values, fill_value)

preserve_nans = _derive_indices_of_nans_to_preserve(

yvalues=yvalues,

limit=limit,

limit_area=limit_area,

limit_direction=limit_direction,

)

method = clean_fill_method(method)

if method == "pad":

values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype))

values = pad_1d(values, limit=limit, mask=mask, dtype=dtype)

else:

values = transf(

backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype)

)

# reshape back

if ndim == 1:

values = values[0]

values = backfill_1d(values, limit=limit, mask=mask, dtype=dtype)

if orig_values.dtype.kind == "M":

# convert float back to datetime64

values = values.astype(orig_values.dtype)

values[preserve_nans] = fill_value

return values

def interpolate_2d(

values,

method="pad",

axis=0,

limit=None,

fill_value=None,

limit_area=None,

dtype=None,

"""

Perform an actual interpolation of values, values will be make 2-d if

needed fills inplace, returns the result.

"""

orig_values = values

# We have to distinguish two cases:

# 1. When kwarg `limit_area` is used: It is not

# supported by `pad_2d` and `backfill_2d`. Using this kwarg only

# works by applying the fill along a certain axis.

# 2. All other cases.

if limit_area is not None:

def func(x):

return interpolate_1d_fill(

method=method,

limit=limit,

limit_area=limit_area,

fill_value=fill_value,

dtype=dtype,

)

# Beware that this also changes the input array `values`!

values = np.apply_along_axis(func, axis, values)

else:

transf = (lambda x: x) if axis == 0 else (lambda x: x.T)

# reshape a 1 dim if needed

ndim = values.ndim

if values.ndim == 1:

if axis != 0: # pragma: no cover

raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0")

values = values.reshape(tuple((1,) + values.shape))

if fill_value is None:

mask = None

else: # todo create faster fill func without masking

mask = mask_missing(transf(values), fill_value)

method = clean_fill_method(method)

if method == "pad":

values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype))

else:

values = transf(

backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype)

)

# reshape back

if ndim == 1:

values = values[0]

if orig_values.dtype.kind == "M":

# convert float back to datetime64

values = values.astype(orig_values.dtype)

return values

Expand Down