ENH: add to/from_parquet with pyarrow & fastparquet by jreback · Pull Request #15838 · pandas-dev/pandas (original) (raw)

Yeah, I was trying to produce helpful errors. OK, then I think both pyarrow and fastparquet should fail gracefully here, i.e. on a DataFrame with duplicate column names, as in the example below.

In [1]: df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list('aaa'))
In [3]: df.to_parquet('foo', 'pyarrow')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/Users/jreback/pandas/pandas/core/common.py in _asarray_tuplesafe(values, dtype)
    398                 result = np.empty(len(values), dtype=object)
--> 399                 result[:] = values
    400             except ValueError:

ValueError: could not broadcast input array from shape (4,3) into shape (4)

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-3-185ceaef9fe4> in <module>()
----> 1 df.to_parquet('foo', 'pyarrow')

/Users/jreback/pandas/pandas/core/frame.py in to_parquet(self, fname, engine, compression)
   1538         """
   1539         from pandas.io.parquet import to_parquet
-> 1540         to_parquet(self, fname, engine, compression=compression)
   1541 
   1542     @Substitution(header='Write out column names. If a list of string is given, \

/Users/jreback/pandas/pandas/io/parquet.py in to_parquet(df, path, engine, compression)
     97         from pyarrow import parquet as pq
     98 
---> 99         table = pyarrow.Table.from_pandas(df)
    100         pq.write_table(table, path, compression=compression)
 ValueError: cannot copy sequence with size 3 to array axis with dimension 4
In [4]: df.to_parquet('foo', 'fastparquet')
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-4-6b46c1abdc2f> in <module>()
----> 1 df.to_parquet('foo', 'fastparquet')

/Users/jreback/pandas/pandas/core/frame.py in to_parquet(self, fname, engine, compression)
   1538         """
   1539         from pandas.io.parquet import to_parquet
-> 1540         to_parquet(self, fname, engine, compression=compression)
   1541 
   1542     @Substitution(header='Write out column names. If a list of string is given, \

/Users/jreback/pandas/pandas/io/parquet.py in to_parquet(df, path, engine, compression)
    107         # Use tobytes() instead.
    108         with catch_warnings(record=True):
--> 109             fastparquet.write(path, df, compression=compression)
    110 
    111 

/Users/jreback/miniconda3/envs/pandas/lib/python3.6/site-packages/fastparquet/writer.py in write(filename, data, row_group_offsets, compression, file_scheme, open_with, mkdirs, has_nulls, write_index, partition_on, fixed_text, append, object_encoding, times)
    747     fmd = make_metadata(data, has_nulls=has_nulls, ignore_columns=ignore,
    748                         fixed_text=fixed_text, object_encoding=object_encoding,
--> 749                         times=times)
    750 
    751     if file_scheme == 'simple':

/Users/jreback/miniconda3/envs/pandas/lib/python3.6/site-packages/fastparquet/writer.py in make_metadata(data, has_nulls, ignore_columns, fixed_text, object_encoding, times)
    608                      object_encoding.get(column, None))
    609         fixed = None if fixed_text is None else fixed_text.get(column, None)
--> 610         if str(data[column].dtype) == 'category':
    611             se, type = find_type(data[column].cat.categories,
    612                                  fixed_text=fixed, object_encoding=oencoding)

/Users/jreback/pandas/pandas/core/generic.py in __getattr__(self, name)
   2888             if name in self._info_axis:
   2889                 return self[name]
-> 2890             return object.__getattribute__(self, name)
   2891 
   2892     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'dtype'