BUG: astype('category') on dataframe backed by non-writeable arrays raises ValueError · Issue #53658 · pandas-dev/pandas (original) (raw)

Pandas version checks

I have checked that this issue has not already been reported.
I have confirmed this bug exists on the latest version of pandas.
I have confirmed this bug exists on the main branch of pandas.

Reproducible Example

import pandas as pd

df = pd.DataFrame([0, 1, 2], dtype="Int64")
df._mgr.arrays[0]._mask.flags["WRITEABLE"] = False
df.astype('category')

Issue Description

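`df.astype('category')` works when the DataFrame is backed by the default writeable arrays, but when the backing array (here, the mask of the masked `Int64` array) is non-writeable, I get this error: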

Details

ValueError Traceback (most recent call last) Cell In[1], line 5 3 df = pd.DataFrame([0, 1, 2], dtype="Int64") 4 df._mgr.arrays[0]._mask.flags["WRITEABLE"] = False ----> 5 df.astype('category')

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/generic.py:6427, in NDFrame.astype(self, dtype, copy, errors) 6421 results.append(res_col) 6423 elif is_extension_array_dtype(dtype) and self.ndim > 1: 6424 # GH 18099/22869: columnwise conversion to extension dtype 6425 # GH 24704: use iloc to handle duplicate column names 6426 # TODO(EA2D): special case not needed with 2D EAs -> 6427 results = [ 6428 self.iloc[:, i].astype(dtype, copy=copy) 6429 for i in range(len(self.columns)) 6430 ] 6432 else: 6433 # else, only a single dtype is given 6434 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/generic.py:6428, in (.0) 6421 results.append(res_col) 6423 elif is_extension_array_dtype(dtype) and self.ndim > 1: 6424 # GH 18099/22869: columnwise conversion to extension dtype 6425 # GH 24704: use iloc to handle duplicate column names 6426 # TODO(EA2D): special case not needed with 2D EAs 6427 results = [ -> 6428 self.iloc[:, i].astype(dtype, copy=copy) 6429 for i in range(len(self.columns)) 6430 ] 6432 else: 6433 # else, only a single dtype is given 6434 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/generic.py:6434, in NDFrame.astype(self, dtype, copy, errors) 6427 results = [ 6428 self.iloc[:, i].astype(dtype, copy=copy) 6429 for i in range(len(self.columns)) 6430 ] 6432 else: 6433 # else, only a single dtype is given -> 6434 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) 6435 return self._constructor(new_data).finalize(self, method="astype") 6437 # GH 33113: handle empty frame or series

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/internals/managers.py:454, in BaseBlockManager.astype(self, dtype, copy, errors) 451 elif using_copy_on_write(): 452 copy = False --> 454 return self.apply( 455 "astype", 456 dtype=dtype, 457 copy=copy, 458 errors=errors, 459 using_cow=using_copy_on_write(), 460 )

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/internals/managers.py:355, in BaseBlockManager.apply(self, f, align_keys, **kwargs) 353 applied = b.apply(f, **kwargs) 354 else: --> 355 applied = getattr(b, f)(**kwargs) 356 result_blocks = extend_blocks(applied, result_blocks) 358 out = type(self).from_blocks(result_blocks, self.axes)

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/internals/blocks.py:538, in Block.astype(self, dtype, copy, errors, using_cow) 518 """ 519 Coerce to the new dtype. 520 (...) 534 Block 535 """ 536 values = self.values --> 538 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) 540 new_values = maybe_coerce_values(new_values) 542 refs = None

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/dtypes/astype.py:238, in astype_array_safe(values, dtype, copy, errors) 235 dtype = dtype.numpy_dtype 237 try: --> 238 new_values = astype_array(values, dtype, copy=copy) 239 except (ValueError, TypeError): 240 # e.g. _astype_nansafe can fail on object-dtype of strings 241 # trying to convert to float 242 if errors == "ignore":

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/dtypes/astype.py:180, in astype_array(values, dtype, copy) 176 return values 178 if not isinstance(values, np.ndarray): 179 # i.e. ExtensionArray --> 180 values = values.astype(dtype, copy=copy) 182 else: 183 values = _astype_nansafe(values, dtype, copy=copy)

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/arrays/masked.py:497, in BaseMaskedArray.astype(self, dtype, copy) 495 if isinstance(dtype, ExtensionDtype): 496 eacls = dtype.construct_array_type() --> 497 return eacls._from_sequence(self, dtype=dtype, copy=copy) 499 na_value: float | np.datetime64 | lib.NoDefault 501 # coerce

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/arrays/categorical.py:498, in Categorical._from_sequence(cls, scalars, dtype, copy) 494 @classmethod 495 def _from_sequence( 496 cls, scalars, *, dtype: Dtype | None = None, copy: bool = False 497 ) -> Self: --> 498 return cls(scalars, dtype=dtype, copy=copy)

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/arrays/categorical.py:446, in Categorical.__init__(self, values, categories, ordered, dtype, fastpath, copy) 444 values = sanitize_array(values, None) 445 try: --> 446 codes, categories = factorize(values, sort=True) 447 except TypeError as err: 448 codes, categories = factorize(values, sort=False)

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/algorithms.py:775, in factorize(values, sort, use_na_sentinel, size_hint) 771 return codes, uniques 773 elif not isinstance(values, np.ndarray): 774 # i.e. ExtensionArray --> 775 codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel) 777 else: 778 values = np.asarray(values) # convert DTA/TDA/MultiIndex

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/arrays/masked.py:944, in BaseMaskedArray.factorize(self, use_na_sentinel) 941 mask = self._mask 943 # Use a sentinel for na; recode and add NA to uniques if necessary below --> 944 codes, uniques = factorize_array(arr, use_na_sentinel=True, mask=mask) 946 # check that factorize_array correctly preserves dtype. 947 assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)

File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/algorithms.py:591, in factorize_array(values, use_na_sentinel, size_hint, na_value, mask) 588 hash_klass, values = _get_hashtable_algo(values) 590 table = hash_klass(size_hint or len(values)) --> 591 uniques, codes = table.factorize( 592 values, 593 na_sentinel=-1, 594 na_value=na_value, 595 mask=mask, 596 ignore_na=use_na_sentinel, 597 ) 599 # re-cast e.g. i8->dt64/td64, uint8->bool 600 uniques = _reconstruct_data(uniques, original.dtype, original)

File pandas/_libs/hashtable_class_helper.pxi:2976, in pandas._libs.hashtable.Int64HashTable.factorize()

File pandas/_libs/hashtable_class_helper.pxi:2825, in pandas._libs.hashtable.Int64HashTable._unique()

File stringsource:660, in View.MemoryView.memoryview_cwrapper()

File stringsource:350, in View.MemoryView.memoryview.__cinit__()

ValueError: buffer source array is read-only

This was one cause of modin-project/modin#6259

Expected Behavior

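`df.astype('category')` should convert the DataFrame to category dtype without raising an error, regardless of whether the backing arrays are writeable.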

Installed Versions

INSTALLED VERSIONS

commit : c65c8dd
python : 3.9.16.final.0
python-bits : 64
OS : Darwin
OS-release : 22.5.0
Version : Darwin Kernel Version 22.5.0: Mon Apr 24 20:51:50 PDT 2023; root:xnu-8796.121.2~5/RELEASE_X86_64
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8

pandas : 2.1.0.dev0+942.gc65c8ddd73
numpy : 2.0.0.dev0+84.g828fba29e
pytz : 2023.3
dateutil : 2.8.2
setuptools : 67.8.0
pip : 23.1.2
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : None
IPython : 8.14.0
pandas_datareader: None
bs4 : None
bottleneck : None
brotli : None
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : None
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : None
snappy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
zstandard : None
tzdata : 2023.3
qtpy : None
pyqt5 : None