BUG: the `__from_arrow__` conversion for numeric arrays is broken if the buffer size doesn't match · Issue #40896 · pandas-dev/pandas (original) (raw)

The original reproducer is shown below (though it would be good to find a simpler case that can be used as a test):

# Reproducer: write a frame with an "Int64" extension column to Parquet, read
# it back through a filtered dataset that keeps no rows, and convert the
# resulting table to pandas.
# NOTE(review): assumes `pd` is pandas and `pa` is pyarrow with the `parquet`,
# `dataset` and `fs` submodules already imported -- the imports are not shown.
# Build a three-row frame and cast the integer column to the "Int64" dtype.
df = pd.DataFrame({"Int_col": [1, 2, 10], "str_col": ["A", "B", "Z"]})
df = df.astype({"Int_col": "Int64"})
# Convert to a pyarrow Table and write it to a Parquet file on disk.
table = pa.table(df)
path_1 = "./test_1.parquet"
pa.parquet.write_table(table, path_1)

# Re-open the same file as a dataset, reusing the schema stored in the file.
schema = pa.parquet.read_schema(path_1)
ds = pa.dataset.FileSystemDataset.from_paths(
    paths=[path_1],
    filesystem=pa.fs.LocalFileSystem(),
    schema=schema, 
    format=pa.dataset.ParquetFileFormat(),
)
# "C" never occurs in str_col ("A", "B", "Z"), so this filter keeps no rows.
table = ds.to_table(filter=(pa.dataset.field("str_col") == "C"))

# Show the reported byte size of column 0 of the filtered table.
print("Size of array: " + str(table.column(0).nbytes))
# Converting back to pandas is the call that raises the ValueError shown in
# the traceback that follows.
df = table.to_pandas()

Running this script fails with:

Traceback (most recent call last):
  File "/Users/xxx/empty_array_buffer_size.py", line 47, in <module>
    df = table.to_pandas()
  File "pyarrow/array.pxi", line 756, in pyarrow.lib._PandasConvertible.to_pandas
  File "pyarrow/table.pxi", line 1740, in pyarrow.lib.Table._to_pandas
  File "/usr/local/mambaforge/envs/pa_nightly/lib/python3.9/site-packages/pyarrow/pandas_compat.py", line 794, in table_to_blockmanager
    blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
  File "/usr/local/mambaforge/envs/pa_nightly/lib/python3.9/site-packages/pyarrow/pandas_compat.py", line 1135, in _table_to_blocks
    return [_reconstruct_block(item, columns, extension_columns)
  File "/usr/local/mambaforge/envs/pa_nightly/lib/python3.9/site-packages/pyarrow/pandas_compat.py", line 1135, in <listcomp>
    return [_reconstruct_block(item, columns, extension_columns)
  File "/usr/local/mambaforge/envs/pa_nightly/lib/python3.9/site-packages/pyarrow/pandas_compat.py", line 753, in _reconstruct_block
    pd_ext_arr = pandas_dtype.__from_arrow__(arr)
  File "/usr/local/mambaforge/envs/pa_nightly/lib/python3.9/site-packages/pandas/core/arrays/integer.py", line 117, in __from_arrow__
    data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type)
  File "/usr/local/mambaforge/envs/pa_nightly/lib/python3.9/site-packages/pandas/core/arrays/_arrow_utils.py", line 32, in pyarrow_array_to_numpy_and_mask
    data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)]
ValueError: buffer size must be a multiple of element size