FIX: fix interpolate with kwarg limit area and limit direction using pad or bfill by cchwala · Pull Request #31048 · pandas-dev/pandas (original) (raw)
@@ -5,6 +5,7 @@
import numpy as np
from pandas._libs import algos, lib
from pandas._typing import ArrayLike, Dtype, Hashable, List, Optional
from pandas.compat._optional import import_optional_dependency
from pandas.core.dtypes.cast import infer_dtype_from_array
@@ -222,40 +223,12 @@ def interpolate_1d(
# default limit is unlimited GH #16282
limit = algos._validate_limit(nobs=None, limit=limit)
# These are sets of index pointers to invalid values... i.e. {0, 1, etc...
all_nans = set(np.flatnonzero(invalid))
start_nans = set(range(find_valid_index(yvalues, "first")))
end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid)))
mid_nans = all_nans - start_nans - end_nans
# Like the sets above, preserve_nans contains indices of invalid values,
# but in this case, it is the final set of indices that need to be
# preserved as NaN after the interpolation.
# For example if limit_direction='forward' then preserve_nans will
# contain indices of NaNs at the beginning of the series, and NaNs that
# are more than 'limit' away from the prior non-NaN.
# set preserve_nans based on direction using _interp_limit
if limit_direction == "forward":
preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
elif limit_direction == "backward":
preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
else:
# both directions... just use _interp_limit
preserve_nans = set(_interp_limit(invalid, limit, limit))
# if limit_area is set, add either mid or outside indices
# to preserve_nans GH #16284
if limit_area == "inside":
# preserve NaNs on the outside
preserve_nans |= start_nans | end_nans
elif limit_area == "outside":
# preserve NaNs on the inside
preserve_nans |= mid_nans
# sort preserve_nans and convert to list
preserve_nans = sorted(preserve_nans)
preserve_nans = _derive_indices_of_nans_to_preserve(
yvalues=yvalues,
limit=limit,
limit_area=limit_area,
limit_direction=limit_direction,
)
xvalues = getattr(xvalues, "values", xvalues)
yvalues = getattr(yvalues, "values", yvalues)
@@ -314,6 +287,73 @@ def interpolate_1d(
return result
def _derive_indices_of_nans_to_preserve(
    yvalues: ArrayLike,
    limit: Optional[int] = None,
    limit_area: Optional[str] = None,
    limit_direction: Optional[str] = None,
) -> List[int]:
    """
    Derive the indices of NaNs that shall be preserved after interpolation.

    This function is called by `interpolate_1d` and takes the arguments with
    the same name from there. In `interpolate_1d`, after performing the
    interpolation, the list of indices of NaNs to preserve is used to put
    NaNs in the desired locations.

    Parameters
    ----------
    yvalues : ArrayLike
        1-d array of values of the initial Series or DataFrame.
    limit : int, optional
        Forwarded to `_interp_limit` as the forward and/or backward limit,
        depending on `limit_direction`.
    limit_area : str, optional
        "inside" additionally preserves the leading and trailing NaNs,
        "outside" additionally preserves the NaNs between the first and the
        last valid value. Any other value adds nothing.
    limit_direction : str, optional
        "forward" always preserves leading NaNs, "backward" always preserves
        trailing NaNs. Any other value is treated as both directions.

    Returns
    -------
    preserve_nans : list of int
        Sorted positions in `yvalues` where NaNs should be preserved.
    """
    invalid = isna(yvalues)
    n_values = len(invalid)

    # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
    all_nans = set(np.flatnonzero(invalid))

    first_valid = find_valid_index(yvalues, "first")
    last_valid = find_valid_index(yvalues, "last")
    if first_valid is None or last_valid is None:
        # NOTE(review): find_valid_index is defined elsewhere in this module;
        # presumably it returns None when there is no valid value (empty or
        # all-NaN input) -- confirm. Without this guard `range(None)` and
        # `1 + None` raise TypeError. With no valid anchor nothing can be
        # filled, so every NaN position is preserved.
        return sorted(all_nans)

    start_nans = set(range(first_valid))
    end_nans = set(range(1 + last_valid, n_values))
    mid_nans = all_nans - start_nans - end_nans

    # Like the sets above, preserve_nans contains indices of invalid values,
    # but in this case, it is the final set of indices that need to be
    # preserved as NaN after the interpolation.

    # For example if limit_direction='forward' then preserve_nans will
    # contain indices of NaNs at the beginning of the series, and NaNs that
    # are more than 'limit' away from the prior non-NaN.

    # set preserve_nans based on direction using _interp_limit
    if limit_direction == "forward":
        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
    elif limit_direction == "backward":
        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
    else:
        # both directions... just use _interp_limit
        preserve_nans = set(_interp_limit(invalid, limit, limit))

    # if limit_area is set, add either mid or outside indices
    # to preserve_nans GH #16284
    if limit_area == "inside":
        # preserve NaNs on the outside
        preserve_nans |= start_nans | end_nans
    elif limit_area == "outside":
        # preserve NaNs on the inside
        preserve_nans |= mid_nans

    # sort preserve_nans and convert to list
    return sorted(preserve_nans)
def _interpolate_scipy_wrapper(
x, y, new_x, method, fill_value=None, bounds_error=False, order=None, **kwargs
):
@@ -478,45 +518,127 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0):
return [P(x, nu) for nu in der]
def interpolate_2d(
values, method="pad", axis=0, limit=None, fill_value=None, dtype=None
def interpolate_1d_fill(
    values,
    method: str = "pad",
    limit: Optional[int] = None,
    limit_area: Optional[str] = None,
    fill_value: Optional[Hashable] = None,
    dtype: Optional[Dtype] = None,
):
    """
    Pad or backfill a 1-d array, honouring `limit_area`.

    This is a 1D-version of `interpolate_2d`, which is used for methods `pad`
    and `backfill` when interpolating. This 1D-version is necessary to be
    able to handle kwarg `limit_area` via the function
    `_derive_indices_of_nans_to_preserve`. It is used the same way as the
    1D-interpolation functions which are based on scipy-interpolation, i.e.
    via np.apply_along_axis.

    Parameters
    ----------
    values : array-like
        1-d data to fill. NOTE(review): the fill helpers presumably operate
        in place, so the input may be modified -- confirm.
    method : {"pad", "backfill"}, default "pad"
        Fill method; it also fixes the direction used when deriving the
        NaNs to preserve ("forward" for pad, "backward" for backfill).
    limit : int, optional
        Passed on to the fill helper and to
        `_derive_indices_of_nans_to_preserve`.
    limit_area : str, optional
        Passed on to `_derive_indices_of_nans_to_preserve`.
    fill_value : Hashable, optional
        If given, `mask_missing(values, fill_value)` is used as the mask for
        the fill helper. It is also the value written back at the preserved
        positions.
    dtype : Dtype, optional
        Passed on to the fill helper.

    Returns
    -------
    The filled values, cast back to the input dtype when that dtype is
    datetime64.

    Raises
    ------
    ValueError
        If `method` is neither "pad" nor "backfill".
    AssertionError
        If `values` has more than one dimension.
    """
    if method == "pad":
        limit_direction = "forward"
    elif method == "backfill":
        limit_direction = "backward"
    else:
        raise ValueError("`method` must be either 'pad' or 'backfill'.")

    orig_values = values

    if values.ndim > 1:
        raise AssertionError("This only works with 1D data.")

    if fill_value is None:
        mask = None
    else:  # todo create faster fill func without masking
        mask = mask_missing(values, fill_value)

    # Derive the positions to restore *before* filling, while the original
    # missing values are still present in `values`.
    preserve_nans = _derive_indices_of_nans_to_preserve(
        yvalues=values,
        limit=limit,
        limit_area=limit_area,
        limit_direction=limit_direction,
    )

    method = clean_fill_method(method)
    if method == "pad":
        values = pad_1d(values, limit=limit, mask=mask, dtype=dtype)
    else:
        values = backfill_1d(values, limit=limit, mask=mask, dtype=dtype)

    if orig_values.dtype.kind == "M":
        # convert float back to datetime64
        values = values.astype(orig_values.dtype)

    # Undo the fill at every position that must stay missing.
    values[preserve_nans] = fill_value

    return values
def interpolate_2d(
    values,
    method="pad",
    axis=0,
    limit=None,
    fill_value=None,
    limit_area=None,
    dtype=None,
):
    """
    Pad or backfill missing entries of `values` along `axis`.

    Two code paths exist:

    * `limit_area` is None: a 1-d input is temporarily reshaped to a single
      row, an optional mask is built from `fill_value`, and `pad_2d` or
      `backfill_2d` is applied (on the transpose when `axis` is not 0).
    * `limit_area` is given: `pad_2d` and `backfill_2d` do not support it,
      so `interpolate_1d_fill` is applied to every 1-d slice along `axis`
      with `np.apply_along_axis`.

    In both cases the result is cast back to the input dtype when that dtype
    is datetime64. The fill is done in place, so the input array may be
    modified as well.

    Raises
    ------
    AssertionError
        If `limit_area` is None, `values` is 1-d and `axis` is not 0.
    """
    source = values

    if limit_area is None:
        # General case: 2-d fill routines, transposing for a non-zero axis.
        def orient(arr):
            return arr if axis == 0 else arr.T

        # reshape a 1 dim if needed
        was_1d = values.ndim == 1
        if was_1d:
            if axis != 0:  # pragma: no cover
                raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0")
            values = values.reshape((1,) + values.shape)

        # todo create faster fill func without masking
        mask = None if fill_value is None else mask_missing(orient(values), fill_value)

        method = clean_fill_method(method)
        if method == "pad":
            filled = pad_2d(orient(values), limit=limit, mask=mask, dtype=dtype)
        else:
            filled = backfill_2d(orient(values), limit=limit, mask=mask, dtype=dtype)
        values = orient(filled)

        # reshape back
        if was_1d:
            values = values[0]
    else:
        # `limit_area` only works by filling one 1-d slice at a time.
        # Beware that this also changes the input array `values`!
        values = np.apply_along_axis(
            lambda slice_1d: interpolate_1d_fill(
                slice_1d,
                method=method,
                limit=limit,
                limit_area=limit_area,
                fill_value=fill_value,
                dtype=dtype,
            ),
            axis,
            values,
        )

    if source.dtype.kind == "M":
        # convert float back to datetime64
        values = values.astype(source.dtype)

    return values