IO: Fix parquet read from s3 directory (#33632) · pandas-dev/pandas@22cf0f5

```diff
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1,6 +1,5 @@
 """ test parquet compat """
 import datetime
 from distutils.version import LooseVersion
-import locale
 import os
 from warnings import catch_warnings
```

```diff
@@ -131,6 +130,7 @@ def check_round_trip(
     read_kwargs=None,
     expected=None,
     check_names=True,
+    check_like=False,
     repeat=2,
 ):
     """Verify parquet serializer and deserializer produce the same results.
```

```diff
@@ -150,6 +150,8 @@ def check_round_trip(
         Expected deserialization result, otherwise will be equal to df
     check_names: list of str, optional
         Closed set of column names to be compared
+    check_like: bool, optional
+        If True, ignore the order of index & columns.
     repeat: int, optional
         How many times to repeat the test
     """
```

```diff
@@ -169,7 +171,9 @@ def compare(repeat):
             with catch_warnings(record=True):
                 actual = read_parquet(path, **read_kwargs)
 
-                tm.assert_frame_equal(expected, actual, check_names=check_names)
+                tm.assert_frame_equal(
+                    expected, actual, check_names=check_names, check_like=check_like
+                )
 
     if path is None:
         with tm.ensure_clean() as path:
```
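For context, not part of the diff itself: `check_like` is an existing option of `tm.assert_frame_equal` (pandas' internal `pandas._testing` module, imported as `tm` in this file) that reindexes the frames before comparing, so differing row and column order no longer fails the assertion. A minimal sketch of the behavior the new keyword forwards:

```python
# Minimal sketch (illustrative, not from the PR) of assert_frame_equal's
# check_like flag: same data, different column order.
import pandas as pd
import pandas._testing as tm

left = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.0]})
right = left[["B", "A"]]  # identical values, columns reordered

tm.assert_frame_equal(left, right, check_like=True)  # passes
# tm.assert_frame_equal(left, right)  # would raise: columns differ in order
```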

```diff
@@ -532,15 +536,37 @@ def test_categorical(self, pa):
         expected = df.astype(object)
         check_round_trip(df, pa, expected=expected)
 
-    # GH#33077 2020-03-27
-    @pytest.mark.xfail(
-        locale.getlocale()[0] == "zh_CN",
-        reason="dateutil cannot parse e.g. '五, 27 3月 2020 21:45:38 GMT'",
-    )
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):
         # GH #19134
         check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet")
 
+    @td.skip_if_no("s3fs")
+    @pytest.mark.parametrize("partition_col", [["A"], []])
+    def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col):
+        from pandas.io.s3 import get_fs as get_s3_fs
+
+        # GH #26388
+        # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716
+        # As per pyarrow partitioned columns become 'categorical' dtypes
+        # and are added to back of dataframe on read
+
+        expected_df = df_compat.copy()
+        if partition_col:
+            expected_df[partition_col] = expected_df[partition_col].astype("category")
+        check_round_trip(
+            df_compat,
+            pa,
+            expected=expected_df,
+            path="s3://pandas-test/parquet_dir",
+            write_kwargs={
+                "partition_cols": partition_col,
+                "compression": None,
+                "filesystem": get_s3_fs(),
+            },
+            check_like=True,
+            repeat=1,
+        )
+
     def test_partition_cols_supported(self, pa, df_full):
         # GH #23283
         partition_cols = ["bool", "int"]
```
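The new test encodes the pyarrow behavior its comments describe: columns used as `partition_cols` round-trip as `category` dtype and are appended after the data columns, which is why the expected frame is cast with `astype("category")` and compared with `check_like=True`. A local sketch of that behavior, with a temp directory standing in for `s3://pandas-test/parquet_dir`:

```python
# Local (non-S3) illustration of pyarrow's partitioned round-trip; the
# on-disk layout is one subfolder per partition value (A=a/, A=b/).
import tempfile

import pandas as pd

df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})

with tempfile.TemporaryDirectory() as path:
    df.to_parquet(path, engine="pyarrow", partition_cols=["A"], compression=None)
    result = pd.read_parquet(path, engine="pyarrow")

print(result.dtypes)
# B       int64
# A    category   <- partition column comes back categorical, moved to the back
```

The test's `repeat=1` is presumably chosen because a second write into the same S3 prefix would add part files alongside the first rather than replace them.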