REF: Move methods that can be shared with new string dtype (#54534) · pandas-dev/pandas@fc30823 (original) (raw)

```diff
@@ -42,6 +42,7 @@
 
 from pandas.core import roperator
 from pandas.core.arraylike import OpsMixin
+from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
 from pandas.core.arrays.base import (
     ExtensionArray,
     ExtensionArraySupportsAnyAll,
```

```diff
@@ -184,7 +185,10 @@ def to_pyarrow_type(
 
 
 class ArrowExtensionArray(
-    OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods
+    OpsMixin,
+    ExtensionArraySupportsAnyAll,
+    ArrowStringArrayMixin,
+    BaseStringArrayMethods,
 ):
     """
     Pandas ExtensionArray backed by a PyArrow ChunkedArray.
```

```diff
@@ -1986,24 +1990,6 @@ def _str_count(self, pat: str, flags: int = 0):
             raise NotImplementedError(f"count not implemented with {flags=}")
         return type(self)(pc.count_substring_regex(self._pa_array, pat))
 
-    def _str_pad(
-        self,
-        width: int,
-        side: Literal["left", "right", "both"] = "left",
-        fillchar: str = " ",
-    ):
-        if side == "left":
-            pa_pad = pc.utf8_lpad
-        elif side == "right":
-            pa_pad = pc.utf8_rpad
-        elif side == "both":
-            pa_pad = pc.utf8_center
-        else:
-            raise ValueError(
-                f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
-            )
-        return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))
-
     def _str_contains(
         self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
     ):
```

```diff
@@ -2088,26 +2074,6 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
             )
         return type(self)(result)
 
-    def _str_get(self, i: int):
-        lengths = pc.utf8_length(self._pa_array)
-        if i >= 0:
-            out_of_bounds = pc.greater_equal(i, lengths)
-            start = i
-            stop = i + 1
-            step = 1
-        else:
-            out_of_bounds = pc.greater(-i, lengths)
-            start = i
-            stop = i - 1
-            step = -1
-        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
-        selected = pc.utf8_slice_codeunits(
-            self._pa_array, start=start, stop=stop, step=step
-        )
-        null_value = pa.scalar(None, type=self._pa_array.type)
-        result = pc.if_else(not_out_of_bounds, selected, null_value)
-        return type(self)(result)
-
     def _str_join(self, sep: str):
         if pa.types.is_string(self._pa_array.type):
             result = self._apply_elementwise(list)
```

```diff
@@ -2137,15 +2103,6 @@ def _str_slice(
             pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
         )
 
-    def _str_slice_replace(
-        self, start: int | None = None, stop: int | None = None, repl: str | None = None
-    ):
-        if repl is None:
-            repl = ""
-        if start is None:
-            start = 0
-        return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
-
     def _str_isalnum(self):
         return type(self)(pc.utf8_is_alnum(self._pa_array))
 
```

```diff
@@ -2170,18 +2127,9 @@ def _str_isspace(self):
     def _str_istitle(self):
         return type(self)(pc.utf8_is_title(self._pa_array))
 
-    def _str_capitalize(self):
-        return type(self)(pc.utf8_capitalize(self._pa_array))
-
-    def _str_title(self):
-        return type(self)(pc.utf8_title(self._pa_array))
-
     def _str_isupper(self):
         return type(self)(pc.utf8_is_upper(self._pa_array))
 
-    def _str_swapcase(self):
-        return type(self)(pc.utf8_swapcase(self._pa_array))
-
     def _str_len(self):
         return type(self)(pc.utf8_length(self._pa_array))
 
```

```diff
@@ -2222,12 +2170,6 @@ def _str_removeprefix(self, prefix: str):
         result = self._apply_elementwise(predicate)
         return type(self)(pa.chunked_array(result))
 
-    def _str_removesuffix(self, suffix: str):
-        ends_with = pc.ends_with(self._pa_array, pattern=suffix)
-        removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
-        result = pc.if_else(ends_with, removed, self._pa_array)
-        return type(self)(result)
-
     def _str_casefold(self):
         predicate = lambda val: val.casefold()
         result = self._apply_elementwise(predicate)
```