BUG: extraneous copy of extension arrays in v1.3.0 · Issue #42501 · pandas-dev/pandas (original) (raw)
Code Sample, a copy-pastable example
import pandas as pd
from pandas.core.arrays.integer import coerce_to_array
class IntegerArrayNoCopy(pd.core.arrays.IntegerArray):
    """IntegerArray subclass whose ``copy`` always raises.

    Any code path that calls ``copy()`` on an instance fails with
    ``NotImplementedError``, so the traceback shows where the copy was
    requested.
    """

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """Build an ``IntegerArrayNoCopy`` from ``scalars``.

        ``dtype`` and ``copy`` are forwarded unchanged to ``coerce_to_array``;
        the ``(values, mask)`` pair it returns is passed to the constructor.
        """
        values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy)
        return IntegerArrayNoCopy(values, mask)

    def copy(self):
        """Refuse to copy: always raises ``NotImplementedError``."""
        raise NotImplementedError
class Int16DtypeNoCopy(pd.Int16Dtype):
    """``Int16Dtype`` subclass that reports ``IntegerArrayNoCopy`` as its array type."""

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        return IntegerArrayNoCopy
if __name__ == "__main__":
    # Object-dtype input column containing a missing value.
    df = pd.DataFrame({"col": [1, 4, None, 5]}, dtype=object)
    print(df.dtypes)
    # copy=False is passed explicitly; the report is that copy() is still
    # invoked on the extension array during this call.
    df = df.astype({"col": Int16DtypeNoCopy()}, copy=False)
    print(df.dtypes)
    print(df)
Problem description
In 1.3.0, `astype` attempts to create an extension array copy even when explicitly passed `copy=False`:
Traceback (most recent call last):
File "test_astype.py", line 24, in <module>
df = df.astype({"col": Int16DtypeNoCopy()}, copy=False)
File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/generic.py", line 5814, in astype
result = concat(results, axis=1, copy=False)
File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/util/_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/reshape/concat.py", line 307, in concat
return op.get_result()
File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/reshape/concat.py", line 508, in get_result
df = cons(data, index=index)
File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/frame.py", line 614, in __init__
mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/internals/construction.py", line 458, in dict_to_mgr
for x in arrays
File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/internals/construction.py", line 458, in <listcomp>
for x in arrays
File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/generic.py", line 5924, in copy
data = self._mgr.copy(deep=deep)
File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 595, in copy
res = self.apply("copy", deep=deep)
File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 327, in apply
applied = getattr(b, f)(**kwargs)
File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/internals/blocks.py", line 651, in copy
values = values.copy()
File "test_astype.py", line 13, in copy
raise NotImplementedError
NotImplementedError
Expected Output
1.2.5 works as expected (at least for this example):
col object
dtype: object
col Int16
dtype: object
col
0 1
1 4
2 <NA>
3 5
Output of pd.show_versions()
INSTALLED VERSIONS
commit : f00ed8f
python : 3.7.10.final.0
python-bits : 64
OS : Linux
OS-release : 5.10.0-1029-oem
Version : #30-Ubuntu SMP Fri May 28 23:53:50 UTC 2021
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.3.0
numpy : 1.20.2
pytz : 2021.1
dateutil : 2.8.1
pip : 21.0.1
setuptools : 49.6.0.post20210108
Cython : 0.29.22
pytest : 6.2.2
hypothesis : 6.7.0
jinja2 : 2.11.3
IPython : 7.20.0
fsspec : 2021.04.0
fastparquet : 0.5.0
matplotlib : 3.4.1
pyarrow : 4.0.1
s3fs : 2021.04.0
scipy : 1.6.0
xarray : 0.17.0
numba : 0.53.1