BUG: Fix pd.NA na_rep truncated in to_csv by jbman223 · Pull Request #30146 · pandas-dev/pandas (original) (raw)
This one was puzzling to me.
list/test_list.py::test_to_csv FAILED [100%]
pandas/tests/extension/list/test_list.py:23 (test_to_csv)
self = ExtensionBlock: slice(0, 1, 1), 1 x 100, dtype: list
slicer = slice(0, 100, None), na_rep = '', quoting = 0
kwargs = {'date_format': None, 'decimal': '.', 'float_format': None}
values = <ListArray>
[ ['Z', 'r', 'y', 'P', 'D', 'p'],
['a', 'Q', 'd', 'J', 'X', 'l', 'd', 's', 'q', 'b'],
... ['Q', 'p', 'D', 'L', 'z', 'Z', 'i'],
['l']]
Length: 100, dtype: list
mask = array([False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False,...False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False])
def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
"""override to use ExtensionArray astype for the conversion"""
values = self.values
if slicer is not None:
values = values[slicer]
mask = isna(values)
try:
> values[mask] = na_rep
../../core/internals/blocks.py:1782:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <ListArray>
[ ['Z', 'r', 'y', 'P', 'D', 'p'],
['a', 'Q', 'd', 'J', 'X', 'l', 'd', 's', 'q', 'b'],
... ['Q', 'p', 'D', 'L', 'z', 'Z', 'i'],
['l']]
Length: 100, dtype: list
key = array([False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False,...False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False])
value = ''
def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
"""
Set one or more values inplace.
This method is not required to satisfy the pandas extension array
interface.
Parameters
----------
key : int, ndarray, or slice
When called from, e.g. ``Series.__setitem__``, ``key`` will be
one of
* scalar int
* ndarray of integers.
* boolean ndarray
* slice object
value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
value or values to be set of ``key``.
Returns
-------
None
"""
# Some notes to the ExtensionArray implementor who may have ended up
# here. While this method is not required for the interface, if you
# *do* choose to implement __setitem__, then some semantics should be
# observed:
#
# * Setting multiple values : ExtensionArrays should support setting
# multiple values at once, 'key' will be a sequence of integers and
# 'value' will be a same-length sequence.
#
# * Broadcasting : For a sequence 'key' and a scalar 'value',
# each position in 'key' should be set to 'value'.
#
# * Coercion : Most users will expect basic coercion to work. For
# example, a string like '2018-01-01' is coerced to a datetime
# when setting on a datetime64ns array. In general, if the
# __init__ method coerces that value, then so should __setitem__
# Note, also, that Series/DataFrame.where internally use __setitem__
# on a copy of the data.
raise NotImplementedError(
> _not_implemented_message.format(type(self), "__setitem__")
)
E NotImplementedError: <class 'pandas.tests.extension.list.array.ListArray'> does not implement __setitem__.
../../core/arrays/base.py:334: NotImplementedError
During handling of the above exception, another exception occurred:
data = <ListArray>
[ ['Z', 'r', 'y', 'P', 'D', 'p'],
['a', 'Q', 'd', 'J', 'X', 'l', 'd', 's', 'q', 'b'],
... ['Q', 'p', 'D', 'L', 'z', 'Z', 'i'],
['l']]
Length: 100, dtype: list
def test_to_csv(data):
# https://github.com/pandas-dev/pandas/issues/28840
# array with list-likes fail when doing astype(str) on the numpy array
# which was done in to_native_types
df = pd.DataFrame({"a": data})
> res = df.to_csv()
list/test_list.py:29:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../../core/generic.py:3242: in to_csv
formatter.save()
../../io/formats/csvs.py:204: in save
self._save()
../../io/formats/csvs.py:328: in _save
self._save_chunk(start_i, end_i)
../../io/formats/csvs.py:344: in _save_chunk
quoting=self.quoting,
../../core/internals/blocks.py:1785: in to_native_types
return super().to_native_types(slicer, na_rep, quoting, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = ExtensionBlock: slice(0, 1, 1), 1 x 100, dtype: list
slicer = slice(0, 100, None), na_rep = '', quoting = 0
kwargs = {'date_format': None, 'decimal': '.', 'float_format': None}
values = array([[list(['Z', 'r', 'y', 'P', 'D', 'p']),
list(['a', 'Q', 'd', 'J', 'X', 'l', 'd', 's', 'q', 'b']),
...c']), list(['L', 'l', 'M', 'c']),
list(['Q', 'p', 'D', 'L', 'z', 'Z', 'i']), list(['l'])]],
dtype=object)
mask = array([[False, False, False, False, False, False, False, False, False,
False, False, False, False, False, Fals...se, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False]])
itemsize = 0
def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.get_values()
if slicer is not None:
values = values[:, slicer]
mask = isna(values)
if not self.is_object and not quoting:
# try:
# itemsize = writers.word_len(na_rep)
# values = values.astype("<U{size}".format(size=itemsize))
# except Exception:
# values = np.array(values, dtype="object")
itemsize = writers.word_len(na_rep)
> values = values.astype("<U{size}".format(size=itemsize))
E ValueError: setting an array element with a sequence
../../core/internals/blocks.py:666: ValueError
Assertion failed
Assertion failed
Assertion failed
Assertion failed
It seems that when the array isn't converted to strings first, the masking step (`values[mask] = na_rep`) raises an exception, and the fallback to the parent `to_native_types` then fails when casting the array to `"<U{size}"`, but honestly I'm not totally sure.