BUG: extraneous copy of extension arrays in v1.3.0 · Issue #42501 · pandas-dev/pandas (original) (raw)

Code Sample, a copy-pastable example

import pandas as pd
from pandas.core.arrays.integer import coerce_to_array

class IntegerArrayNoCopy(pd.core.arrays.IntegerArray):
    """IntegerArray subclass whose ``copy`` always raises.

    Because ``copy`` raises ``NotImplementedError``, any code path that
    copies the array fails loudly instead of silently duplicating data.
    """

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """Build an ``IntegerArrayNoCopy`` from ``scalars``.

        ``scalars``, ``dtype`` and ``copy`` are forwarded unchanged to
        ``coerce_to_array``; the resulting values and mask are wrapped in a
        new ``IntegerArrayNoCopy``.
        """
        values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy)
        return IntegerArrayNoCopy(values, mask)

    def copy(self):
        """Refuse to copy so that every copy attempt is visible.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError

class Int16DtypeNoCopy(pd.Int16Dtype):
    """``Int16Dtype`` subclass whose array type is ``IntegerArrayNoCopy``."""

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        return IntegerArrayNoCopy

if __name__ == '__main__':
    # Start from an object-dtype column that contains a missing value.
    df = pd.DataFrame({"col": [1, 4, None, 5]}, dtype=object)
    print(df.dtypes)
    # copy=False is requested explicitly, so the conversion is expected not
    # to call copy() on the resulting extension array.
    df = df.astype({"col": Int16DtypeNoCopy()}, copy=False)
    print(df.dtypes)
    print(df)

Problem description

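In pandas 1.3.0, `DataFrame.astype` still attempts to copy the extension array even when `copy=False` is passed explicitly; in the example above this surfaces as a call to the overridden `copy()` method: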

Traceback (most recent call last):
  File "test_astype.py", line 24, in <module>
    df = df.astype({"col": Int16DtypeNoCopy()}, copy=False)
  File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/generic.py", line 5814, in astype
    result = concat(results, axis=1, copy=False)
  File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/util/_decorators.py", line 311, in wrapper
    return func(*args, **kwargs)
  File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/reshape/concat.py", line 307, in concat
    return op.get_result()
  File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/reshape/concat.py", line 508, in get_result
    df = cons(data, index=index)
  File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/frame.py", line 614, in __init__
    mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
  File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/internals/construction.py", line 458, in dict_to_mgr
    for x in arrays
  File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/internals/construction.py", line 458, in <listcomp>
    for x in arrays
  File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/generic.py", line 5924, in copy
    data = self._mgr.copy(deep=deep)
  File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 595, in copy
    res = self.apply("copy", deep=deep)
  File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 327, in apply
    applied = getattr(b, f)(**kwargs)
  File "/home/gsk/miniconda/envs/pdenv/lib/python3.7/site-packages/pandas/core/internals/blocks.py", line 651, in copy
    values = values.copy()
  File "test_astype.py", line 13, in copy
    raise NotImplementedError
NotImplementedError

Expected Output

1.2.5 works as expected (at least for this example):

col    object
dtype: object
col    Int16
dtype: object
    col
0     1
1     4
2  <NA>
3     5

Output of pd.show_versions()

INSTALLED VERSIONS

commit : f00ed8f
python : 3.7.10.final.0
python-bits : 64
OS : Linux
OS-release : 5.10.0-1029-oem
Version : #30-Ubuntu SMP Fri May 28 23:53:50 UTC 2021
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8

pandas : 1.3.0
numpy : 1.20.2
pytz : 2021.1
dateutil : 2.8.1
pip : 21.0.1
setuptools : 49.6.0.post20210108
Cython : 0.29.22
pytest : 6.2.2
hypothesis : 6.7.0
jinja2 : 2.11.3
IPython : 7.20.0
fsspec : 2021.04.0
fastparquet : 0.5.0
matplotlib : 3.4.1
pyarrow : 4.0.1
s3fs : 2021.04.0
scipy : 1.6.0
xarray : 0.17.0
numba : 0.53.1