REF: Move methods that can be shared with new string dtype (#54534) · pandas-dev/pandas@fc30823 (original) (raw)

`@@ -42,6 +42,7 @@

42

43

`from pandas.core import roperator

44

`from pandas.core.arraylike import OpsMixin

45

from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin

45

46

`from pandas.core.arrays.base import (

46

47

`ExtensionArray,

47

48

`ExtensionArraySupportsAnyAll,

`@@ -184,7 +185,10 @@ def to_pyarrow_type(

184

185

186

187

`class ArrowExtensionArray(

187

OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods

188

OpsMixin,

189

ExtensionArraySupportsAnyAll,

190

ArrowStringArrayMixin,

191

BaseStringArrayMethods,

188

192

`):

189

193

`"""

190

194

` Pandas ExtensionArray backed by a PyArrow ChunkedArray.

`@@ -1986,24 +1990,6 @@ def _str_count(self, pat: str, flags: int = 0):

1986

1990

`raise NotImplementedError(f"count not implemented with {flags=}")

1987

1991

`return type(self)(pc.count_substring_regex(self._pa_array, pat))

1988

1992

1989

def _str_pad(

1990

self,

1991

width: int,

1992

side: Literal["left", "right", "both"] = "left",

1993

fillchar: str = " ",

1994

1995

if side == "left":

1996

pa_pad = pc.utf8_lpad

1997

elif side == "right":

1998

pa_pad = pc.utf8_rpad

1999

elif side == "both":

2000

pa_pad = pc.utf8_center

2001

else:

2002

raise ValueError(

2003

f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"

2004

)

2005

return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))

2006

-

2007

1993

`def _str_contains(

2008

1994

`self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True

2009

1995

` ):

`@@ -2088,26 +2074,6 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):

2088

2074

` )

2089

2075

`return type(self)(result)

2090

2076

2091

def _str_get(self, i: int):
    """
    Extract the single code unit at position ``i`` from every string.

    Negative ``i`` counts from the end of each string.  Entries where
    ``i`` is out of range, or whose bounds check is null, become null
    in the result.  The result is wrapped in a new array of the same
    class.
    """
    lengths = pc.utf8_length(self._pa_array)
    if i < 0:
        # Index -k is valid only when the string has at least k units.
        out_of_bounds = pc.greater(-i, lengths)
        step = -1
    else:
        out_of_bounds = pc.greater_equal(i, lengths)
        step = 1
    # A null bounds check is counted as out of bounds.
    in_bounds = pc.invert(out_of_bounds.fill_null(True))
    # One-unit slice starting at ``i``, walking in the direction of ``step``.
    picked = pc.utf8_slice_codeunits(
        self._pa_array, start=i, stop=i + step, step=step
    )
    missing = pa.scalar(None, type=self._pa_array.type)
    return type(self)(pc.if_else(in_bounds, picked, missing))

2110

-

2111

2077

`def _str_join(self, sep: str):

2112

2078

`if pa.types.is_string(self._pa_array.type):

2113

2079

`result = self._apply_elementwise(list)

`@@ -2137,15 +2103,6 @@ def _str_slice(

2137

2103

`pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)

2138

2104

` )

2139

2105

2140

def _str_slice_replace(
    self, start: int | None = None, stop: int | None = None, repl: str | None = None
):
    """
    Replace the ``[start:stop]`` slice of every string with ``repl``.

    Parameters
    ----------
    start : int or None
        Left edge of the slice; ``None`` means the start of the string (0).
    stop : int or None
        Right edge of the slice; ``None`` means "through the end of the
        string".
    repl : str or None
        Replacement text; ``None`` is treated as the empty string.

    Returns
    -------
    A new array of the same class holding the result of
    ``pc.utf8_replace_slice``.
    """
    if repl is None:
        repl = ""
    if start is None:
        start = 0
    if stop is None:
        # ``utf8_replace_slice`` needs an integer stop, so passing the
        # default ``None`` through would fail.  The largest int64 value
        # stands in for "to the end of the string".
        stop = (1 << 63) - 1
    return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))

2148

-

2149

2106

def _str_isalnum(self):
    """Apply PyArrow's ``utf8_is_alnum`` kernel and wrap the result in a new array of this class."""
    flags = pc.utf8_is_alnum(self._pa_array)
    return type(self)(flags)

2151

2108

`@@ -2170,18 +2127,9 @@ def _str_isspace(self):

2170

2127

def _str_istitle(self):
    """Apply PyArrow's ``utf8_is_title`` kernel and wrap the result in a new array of this class."""
    flags = pc.utf8_is_title(self._pa_array)
    return type(self)(flags)

2172

2129

2173

def _str_capitalize(self):
    """Apply PyArrow's ``utf8_capitalize`` kernel and wrap the result in a new array of this class."""
    capitalized = pc.utf8_capitalize(self._pa_array)
    return type(self)(capitalized)

2175

-

2176

def _str_title(self):
    """Apply PyArrow's ``utf8_title`` kernel and wrap the result in a new array of this class."""
    titled = pc.utf8_title(self._pa_array)
    return type(self)(titled)

2178

-

2179

2130

def _str_isupper(self):
    """Apply PyArrow's ``utf8_is_upper`` kernel and wrap the result in a new array of this class."""
    flags = pc.utf8_is_upper(self._pa_array)
    return type(self)(flags)

2181

2132

2182

def _str_swapcase(self):
    """Apply PyArrow's ``utf8_swapcase`` kernel and wrap the result in a new array of this class."""
    swapped = pc.utf8_swapcase(self._pa_array)
    return type(self)(swapped)

2184

-

2185

2133

def _str_len(self):
    """Apply PyArrow's ``utf8_length`` kernel and wrap the result in a new array of this class."""
    lengths = pc.utf8_length(self._pa_array)
    return type(self)(lengths)

2187

2135

`@@ -2222,12 +2170,6 @@ def _str_removeprefix(self, prefix: str):

2222

2170

`result = self._apply_elementwise(predicate)

2223

2171

`return type(self)(pa.chunked_array(result))

2224

2172

2225

def _str_removesuffix(self, suffix: str):

2226

ends_with = pc.ends_with(self._pa_array, pattern=suffix)

2227

removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))

2228

result = pc.if_else(ends_with, removed, self._pa_array)

2229

return type(self)(result)

2230

-

2231

2173

`def _str_casefold(self):

2232

2174

`predicate = lambda val: val.casefold()

2233

2175

`result = self._apply_elementwise(predicate)