BUG: Fix pd.NA na_rep truncated in to_csv by jbman223 · Pull Request #30146 · pandas-dev/pandas (original) (raw)

This test failure was puzzling to me:

list/test_list.py::test_to_csv FAILED                                    [100%]
pandas/tests/extension/list/test_list.py:23 (test_to_csv)
self = ExtensionBlock: slice(0, 1, 1), 1 x 100, dtype: list
slicer = slice(0, 100, None), na_rep = '', quoting = 0
kwargs = {'date_format': None, 'decimal': '.', 'float_format': None}
values = <ListArray>
[                    ['Z', 'r', 'y', 'P', 'D', 'p'],
 ['a', 'Q', 'd', 'J', 'X', 'l', 'd', 's', 'q', 'b'],
...     ['Q', 'p', 'D', 'L', 'z', 'Z', 'i'],
                                              ['l']]
Length: 100, dtype: list
mask = array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,...False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

    def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
        """override to use ExtensionArray astype for the conversion"""
        values = self.values
        if slicer is not None:
            values = values[slicer]
        mask = isna(values)
    
        try:
>           values[mask] = na_rep

../../core/internals/blocks.py:1782: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <ListArray>
[                    ['Z', 'r', 'y', 'P', 'D', 'p'],
 ['a', 'Q', 'd', 'J', 'X', 'l', 'd', 's', 'q', 'b'],
...     ['Q', 'p', 'D', 'L', 'z', 'Z', 'i'],
                                              ['l']]
Length: 100, dtype: list
key = array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,...False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])
value = ''

    def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
        """
        Set one or more values inplace.
    
        This method is not required to satisfy the pandas extension array
        interface.
    
        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of
    
            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object
    
        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set of ``key``.
    
        Returns
        -------
        None
        """
        # Some notes to the ExtensionArray implementor who may have ended up
        # here. While this method is not required for the interface, if you
        # *do* choose to implement __setitem__, then some semantics should be
        # observed:
        #
        # * Setting multiple values : ExtensionArrays should support setting
        #   multiple values at once, 'key' will be a sequence of integers and
        #  'value' will be a same-length sequence.
        #
        # * Broadcasting : For a sequence 'key' and a scalar 'value',
        #   each position in 'key' should be set to 'value'.
        #
        # * Coercion : Most users will expect basic coercion to work. For
        #   example, a string like '2018-01-01' is coerced to a datetime
        #   when setting on a datetime64ns array. In general, if the
        #   __init__ method coerces that value, then so should __setitem__
        # Note, also, that Series/DataFrame.where internally use __setitem__
        # on a copy of the data.
        raise NotImplementedError(
>           _not_implemented_message.format(type(self), "__setitem__")
        )
E       NotImplementedError: <class 'pandas.tests.extension.list.array.ListArray'> does not implement __setitem__.

../../core/arrays/base.py:334: NotImplementedError

During handling of the above exception, another exception occurred:

data = <ListArray>
[                    ['Z', 'r', 'y', 'P', 'D', 'p'],
 ['a', 'Q', 'd', 'J', 'X', 'l', 'd', 's', 'q', 'b'],
...     ['Q', 'p', 'D', 'L', 'z', 'Z', 'i'],
                                              ['l']]
Length: 100, dtype: list

    def test_to_csv(data):
        # https://github.com/pandas-dev/pandas/issues/28840
        # array with list-likes fail when doing astype(str) on the numpy array
        # which was done in to_native_types
        df = pd.DataFrame({"a": data})
>       res = df.to_csv()

list/test_list.py:29: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
../../core/generic.py:3242: in to_csv
    formatter.save()
../../io/formats/csvs.py:204: in save
    self._save()
../../io/formats/csvs.py:328: in _save
    self._save_chunk(start_i, end_i)
../../io/formats/csvs.py:344: in _save_chunk
    quoting=self.quoting,
../../core/internals/blocks.py:1785: in to_native_types
    return super().to_native_types(slicer, na_rep, quoting, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = ExtensionBlock: slice(0, 1, 1), 1 x 100, dtype: list
slicer = slice(0, 100, None), na_rep = '', quoting = 0
kwargs = {'date_format': None, 'decimal': '.', 'float_format': None}
values = array([[list(['Z', 'r', 'y', 'P', 'D', 'p']),
        list(['a', 'Q', 'd', 'J', 'X', 'l', 'd', 's', 'q', 'b']),
      ...c']), list(['L', 'l', 'M', 'c']),
        list(['Q', 'p', 'D', 'L', 'z', 'Z', 'i']), list(['l'])]],
      dtype=object)
mask = array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, Fals...se, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False]])
itemsize = 0

    def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
        """ convert to our native types format, slicing if desired """
        values = self.get_values()
    
        if slicer is not None:
            values = values[:, slicer]
        mask = isna(values)
    
        if not self.is_object and not quoting:
            # try:
            #     itemsize = writers.word_len(na_rep)
            #     values = values.astype("<U{size}".format(size=itemsize))
            # except Exception:
            #     values = np.array(values, dtype="object")
            itemsize = writers.word_len(na_rep)
>           values = values.astype("<U{size}".format(size=itemsize))
E           ValueError: setting an array element with a sequence

../../core/internals/blocks.py:666: ValueError

Assertion failed

Assertion failed

Assertion failed

Assertion failed

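It seems that when the array isn't converted to strings first, the masking step (`values[mask] = na_rep`) raises an exception — here a `NotImplementedError`, because `ListArray` does not implement `__setitem__` — so the code falls back to the base-class `to_native_types`, which then fails with a `ValueError` when casting the object array to `"<U{size}"`. Honestly, though, I'm not totally sure.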