BUG: cannot read back columns of dtype interval[datetime64[ns]] from parquet file or pyarrow table · Issue #45881 · pandas-dev/pandas

import pandas as pd

vals = [
    pd.Timestamp(date)
    for date in [
        "2020-12-07", "2020-12-08", "2020-12-09",
        "2020-12-10", "2020-12-11", "2020-12-12",
    ]
]

# build start/stop timestamp columns and derive an interval column from them
example = pd.DataFrame({'start': vals[::2], 'stop': vals[1::2]})
example['interval'] = example.apply(lambda x: pd.Interval(x.start, x.stop), axis=1)

# writing succeeds, reading back raises
example.to_parquet('example.prqt', engine='pyarrow')
df = pd.read_parquet('example.prqt')

pandas successfully writes a DataFrame with a column of dtype interval[datetime64[ns]] either to a parquet file or to a pyarrow table, but cannot read it back.
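The same failure reproduces without going through a file, by round-tripping through a pyarrow Table directly (a minimal sketch based on the report's claim; assumes pyarrow is importable as pa):

import pyarrow as pa

tbl = pa.Table.from_pandas(example)   # converting to a pyarrow table succeeds
df2 = tbl.to_pandas()                 # converting back raises the same AttributeError

The traceback below is from the pd.read_parquet call above.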

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/tmp/ipykernel_71061/1505077241.py in <module>
----> 1 df = pd.read_parquet('example.prqt')

~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pandas/io/parquet.py in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, **kwargs)
    491     impl = get_engine(engine)
    492 
--> 493     return impl.read(
    494         path,
    495         columns=columns,

~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pandas/io/parquet.py in read(self, path, columns, use_nullable_dtypes, storage_options, **kwargs)
    238         )
    239         try:
--> 240             result = self.api.parquet.read_table(
    241                 path_or_handle, columns=columns, **kwargs
    242             ).to_pandas(**to_pandas_kwargs)

~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._PandasConvertible.to_pandas()

~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib.Table._to_pandas()

~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pyarrow/pandas_compat.py in table_to_blockmanager(options, table, categories, ignore_metadata, types_mapper)
    787     _check_data_column_metadata_consistency(all_columns)
    788     columns = _deserialize_column_index(table, all_columns, column_indexes)
--> 789     blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
    790 
    791     axes = [columns, index]

~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pyarrow/pandas_compat.py in _table_to_blocks(options, block_table, categories, extension_columns)
   1135     result = pa.lib.table_to_blocks(options, block_table, categories,
   1136                                     list(extension_columns.keys()))
-> 1137     return [_reconstruct_block(item, columns, extension_columns)
   1138             for item in result]
   1139 

~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pyarrow/pandas_compat.py in <listcomp>(.0)
   1135     result = pa.lib.table_to_blocks(options, block_table, categories,
   1136                                     list(extension_columns.keys()))
-> 1137     return [_reconstruct_block(item, columns, extension_columns)
   1138             for item in result]
   1139 

~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pyarrow/pandas_compat.py in _reconstruct_block(item, columns, extension_columns)
    747             raise ValueError("This column does not support to be converted "
    748                              "to a pandas ExtensionArray")
--> 749         pd_ext_arr = pandas_dtype.__from_arrow__(arr)
    750         block = _int.make_block(pd_ext_arr, placement=placement)
    751     else:

~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py in __from_arrow__(self, array)
   1240         results = []
   1241         for arr in chunks:
-> 1242             left = np.asarray(arr.storage.field("left"), dtype=self.subtype)
   1243             right = np.asarray(arr.storage.field("right"), dtype=self.subtype)
   1244             iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed)

AttributeError: 'pyarrow.lib.StructArray' object has no attribute 'storage'
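Judging from the traceback, IntervalDtype.__from_arrow__ receives a plain pyarrow StructArray instead of an extension array wrapping one, hence the missing .storage attribute. As a debugging sketch (not part of the original report), the Arrow type of the interval column after the roundtrip can be inspected directly:

import pyarrow.parquet as pq

tbl = pq.read_table('example.prqt')
# if the pandas.interval extension type was dropped on read, this shows a plain
# struct<left: timestamp[ns], right: timestamp[ns]> rather than the extension type
print(tbl.schema.field('interval').type)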

It should be possible to read columns of dtype interval[datetime64[ns]] back from a parquet file or a pyarrow table that was created with pandas in the first place.
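Until the roundtrip works end to end, one possible workaround (not from the original report; the file name example_flat.prqt is only illustrative) is to store the interval endpoints as ordinary timestamp columns and rebuild the interval column after reading:

# write: keep only the endpoint columns
flat = example[['start', 'stop']].copy()
flat.to_parquet('example_flat.prqt', engine='pyarrow')

# read: reconstruct the interval column from the endpoints
restored = pd.read_parquet('example_flat.prqt')
restored['interval'] = pd.arrays.IntervalArray.from_arrays(
    restored['start'], restored['stop'], closed='right'  # matches pd.Interval's default
)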