REF: Move methods that can be shared with new string dtype (#54534) · pandas-dev/pandas@fc30823 (original) (raw)
```diff
@@ -42,6 +42,7 @@
 
 from pandas.core import roperator
 from pandas.core.arraylike import OpsMixin
+from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
 from pandas.core.arrays.base import (
     ExtensionArray,
     ExtensionArraySupportsAnyAll,
```
```diff
@@ -184,7 +185,10 @@ def to_pyarrow_type(
 
 
 class ArrowExtensionArray(
-    OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods
+    OpsMixin,
+    ExtensionArraySupportsAnyAll,
+    ArrowStringArrayMixin,
+    BaseStringArrayMethods,
 ):
     """
     Pandas ExtensionArray backed by a PyArrow ChunkedArray.
```
```diff
@@ -1986,24 +1990,6 @@ def _str_count(self, pat: str, flags: int = 0):
             raise NotImplementedError(f"count not implemented with {flags=}")
         return type(self)(pc.count_substring_regex(self._pa_array, pat))
 
-    def _str_pad(
-        self,
-        width: int,
-        side: Literal["left", "right", "both"] = "left",
-        fillchar: str = " ",
-    ):
-        if side == "left":
-            pa_pad = pc.utf8_lpad
-        elif side == "right":
-            pa_pad = pc.utf8_rpad
-        elif side == "both":
-            pa_pad = pc.utf8_center
-        else:
-            raise ValueError(
-                f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
-            )
-        return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))
-
     def _str_contains(
         self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
     ):
```
```diff
@@ -2088,26 +2074,6 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
             )
         return type(self)(result)
 
-    def _str_get(self, i: int):
-        lengths = pc.utf8_length(self._pa_array)
-        if i >= 0:
-            out_of_bounds = pc.greater_equal(i, lengths)
-            start = i
-            stop = i + 1
-            step = 1
-        else:
-            out_of_bounds = pc.greater(-i, lengths)
-            start = i
-            stop = i - 1
-            step = -1
-        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
-        selected = pc.utf8_slice_codeunits(
-            self._pa_array, start=start, stop=stop, step=step
-        )
-        null_value = pa.scalar(None, type=self._pa_array.type)
-        result = pc.if_else(not_out_of_bounds, selected, null_value)
-        return type(self)(result)
-
     def _str_join(self, sep: str):
         if pa.types.is_string(self._pa_array.type):
             result = self._apply_elementwise(list)
```
```diff
@@ -2137,15 +2103,6 @@ def _str_slice(
             pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
         )
 
-    def _str_slice_replace(
-        self, start: int | None = None, stop: int | None = None, repl: str | None = None
-    ):
-        if repl is None:
-            repl = ""
-        if start is None:
-            start = 0
-        return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
-
     def _str_isalnum(self):
         return type(self)(pc.utf8_is_alnum(self._pa_array))
 
```
```diff
@@ -2170,18 +2127,9 @@ def _str_isspace(self):
     def _str_istitle(self):
         return type(self)(pc.utf8_is_title(self._pa_array))
 
-    def _str_capitalize(self):
-        return type(self)(pc.utf8_capitalize(self._pa_array))
-
-    def _str_title(self):
-        return type(self)(pc.utf8_title(self._pa_array))
-
     def _str_isupper(self):
         return type(self)(pc.utf8_is_upper(self._pa_array))
 
-    def _str_swapcase(self):
-        return type(self)(pc.utf8_swapcase(self._pa_array))
-
     def _str_len(self):
         return type(self)(pc.utf8_length(self._pa_array))
 
```
```diff
@@ -2222,12 +2170,6 @@ def _str_removeprefix(self, prefix: str):
         result = self._apply_elementwise(predicate)
         return type(self)(pa.chunked_array(result))
 
-    def _str_removesuffix(self, suffix: str):
-        ends_with = pc.ends_with(self._pa_array, pattern=suffix)
-        removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
-        result = pc.if_else(ends_with, removed, self._pa_array)
-        return type(self)(result)
-
     def _str_casefold(self):
         predicate = lambda val: val.casefold()
         result = self._apply_elementwise(predicate)
```