BUG: cannot read back columns of dtype interval[datetime64[ns]] from parquet file or pyarrow table · Issue #45881 · pandas-dev/pandas
import pandas as pd

vals = [
    pd.Timestamp(date)
    for date in ["2020-12-07", "2020-12-08", "2020-12-09",
                 "2020-12-10", "2020-12-11", "2020-12-12"]
]
example = pd.DataFrame({'start': vals[::2], 'stop': vals[1::2]})
example['interval'] = example.apply(lambda x: pd.Interval(x.start, x.stop), axis=1)
example.to_parquet('example.prqt', engine='pyarrow')
df = pd.read_parquet('example.prqt')  # raises AttributeError, see traceback below
pandas successfully writes a DataFrame with a column of dtype interval[datetime64[ns]] to either a parquet file or a pyarrow table, but cannot read it back.
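As the title notes, the parquet file is not strictly required to hit this: per the report, round-tripping through a pyarrow table directly fails the same way on the return trip. A minimal sketch of that variant, reusing the example frame from above:

import pyarrow as pa

table = pa.Table.from_pandas(example)  # writing the interval column succeeds
df2 = table.to_pandas()                # reported to fail with the same AttributeError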
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/tmp/ipykernel_71061/1505077241.py in <module>
----> 1 df = pd.read_parquet('example.prqt')
~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pandas/io/parquet.py in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, **kwargs)
491 impl = get_engine(engine)
492
--> 493 return impl.read(
494 path,
495 columns=columns,
~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pandas/io/parquet.py in read(self, path, columns, use_nullable_dtypes, storage_options, **kwargs)
238 )
239 try:
--> 240 result = self.api.parquet.read_table(
241 path_or_handle, columns=columns, **kwargs
242 ).to_pandas(**to_pandas_kwargs)
~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._PandasConvertible.to_pandas()
~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib.Table._to_pandas()
~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pyarrow/pandas_compat.py in table_to_blockmanager(options, table, categories, ignore_metadata, types_mapper)
787 _check_data_column_metadata_consistency(all_columns)
788 columns = _deserialize_column_index(table, all_columns, column_indexes)
--> 789 blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
790
791 axes = [columns, index]
~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pyarrow/pandas_compat.py in _table_to_blocks(options, block_table, categories, extension_columns)
1135 result = pa.lib.table_to_blocks(options, block_table, categories,
1136 list(extension_columns.keys()))
-> 1137 return [_reconstruct_block(item, columns, extension_columns)
1138 for item in result]
1139
~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pyarrow/pandas_compat.py in <listcomp>(.0)
1135 result = pa.lib.table_to_blocks(options, block_table, categories,
1136 list(extension_columns.keys()))
-> 1137 return [_reconstruct_block(item, columns, extension_columns)
1138 for item in result]
1139
~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pyarrow/pandas_compat.py in _reconstruct_block(item, columns, extension_columns)
747 raise ValueError("This column does not support to be converted "
748 "to a pandas ExtensionArray")
--> 749 pd_ext_arr = pandas_dtype.__from_arrow__(arr)
750 block = _int.make_block(pd_ext_arr, placement=placement)
751 else:
~/anaconda3/envs/pandas_issue/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py in __from_arrow__(self, array)
1240 results = []
1241 for arr in chunks:
-> 1242 left = np.asarray(arr.storage.field("left"), dtype=self.subtype)
1243 right = np.asarray(arr.storage.field("right"), dtype=self.subtype)
1244 iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed)
AttributeError: 'pyarrow.lib.StructArray' object has no attribute 'storage'
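The traceback shows the mismatch directly: IntervalDtype.__from_arrow__ assumes each chunk is a pyarrow ExtensionArray, which carries a .storage attribute, but after the round trip the column arrives as a plain StructArray. A minimal illustration of that distinction (the array contents here are made up for demonstration):

import pyarrow as pa

# a plain StructArray -- the shape the interval column comes back in
plain = pa.StructArray.from_arrays(
    [pa.array([0]), pa.array([1])], names=['left', 'right']
)
hasattr(plain, 'storage')  # False, hence the AttributeError above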
It should be possible to read back columns of dtype interval[datetime64[ns]], whether from a parquet file or from a pyarrow table, given that they were created with pandas in the first place.
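Until this is fixed, a possible workaround is to strip the pandas metadata so that to_pandas() never calls __from_arrow__, then rebuild the IntervalArray by hand. A sketch, assuming the struct column comes back as plain {'left': ..., 'right': ...} values (as the traceback suggests) and closed='right' intervals, the pd.Interval default:

import pandas as pd
import pyarrow.parquet as pq

table = pq.read_table('example.prqt')
# without the pandas metadata, to_pandas() returns the struct column as dicts
table = table.replace_schema_metadata(None)
df = table.to_pandas()

left = pd.to_datetime([v['left'] for v in df['interval']])
right = pd.to_datetime([v['right'] for v in df['interval']])
df['interval'] = pd.arrays.IntervalArray.from_arrays(left, right, closed='right')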