IO: Fix parquet read from s3 directory (#33632) · pandas-dev/pandas@22cf0f5

```diff
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1,6 +1,5 @@
 """ test parquet compat """
 import datetime
 from distutils.version import LooseVersion
-import locale
 import os
 from warnings import catch_warnings
```

```diff
@@ -131,6 +130,7 @@ def check_round_trip(
     read_kwargs=None,
     expected=None,
     check_names=True,
+    check_like=False,
     repeat=2,
 ):
     """Verify parquet serializer and deserializer produce the same results.
```

```diff
@@ -150,6 +150,8 @@ def check_round_trip(
         Expected deserialization result, otherwise will be equal to df
     check_names: list of str, optional
         Closed set of column names to be compared
+    check_like: bool, optional
+        If True, ignore the order of index & columns.
     repeat: int, optional
         How many times to repeat the test
     """
```

```diff
@@ -169,7 +171,9 @@ def compare(repeat):
             with catch_warnings(record=True):
                 actual = read_parquet(path, **read_kwargs)
 
-                tm.assert_frame_equal(expected, actual, check_names=check_names)
+                tm.assert_frame_equal(
+                    expected, actual, check_names=check_names, check_like=check_like
+                )
 
     if path is None:
         with tm.ensure_clean() as path:
```
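For context, not part of the diff itself: `check_like` is an existing option of `tm.assert_frame_equal` (pandas' internal `pandas._testing` module, imported as `tm` in this file) that reindexes the frames before comparing, so differing row and column order no longer fails the assertion. A minimal sketch of the behavior the new keyword forwards:

```python
# Minimal sketch (illustrative, not from the PR) of assert_frame_equal's
# check_like flag: same data, different column order.
import pandas as pd
import pandas._testing as tm

left = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.0]})
right = left[["B", "A"]]  # identical values, columns reordered

tm.assert_frame_equal(left, right, check_like=True)  # passes
# tm.assert_frame_equal(left, right)  # would raise: columns differ in order
```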

```diff
@@ -532,15 +536,37 @@ def test_categorical(self, pa):
         expected = df.astype(object)
         check_round_trip(df, pa, expected=expected)
 
-    # GH#33077 2020-03-27
-    @pytest.mark.xfail(
-        locale.getlocale()[0] == "zh_CN",
-        reason="dateutil cannot parse e.g. '五, 27 3月 2020 21:45:38 GMT'",
-    )
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):
         # GH #19134
         check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet")
 
+    @td.skip_if_no("s3fs")
+    @pytest.mark.parametrize("partition_col", [["A"], []])
+    def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col):
+        from pandas.io.s3 import get_fs as get_s3_fs
+
+        # GH #26388
+        # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716
+        # As per pyarrow partitioned columns become 'categorical' dtypes
+        # and are added to back of dataframe on read
+
+        expected_df = df_compat.copy()
+        if partition_col:
+            expected_df[partition_col] = expected_df[partition_col].astype("category")
+        check_round_trip(
+            df_compat,
+            pa,
+            expected=expected_df,
+            path="s3://pandas-test/parquet_dir",
+            write_kwargs={
+                "partition_cols": partition_col,
+                "compression": None,
+                "filesystem": get_s3_fs(),
+            },
+            check_like=True,
+            repeat=1,
+        )
+
     def test_partition_cols_supported(self, pa, df_full):
         # GH #23283
         partition_cols = ["bool", "int"]
```
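The new test encodes the pyarrow behavior its comments describe: columns used as `partition_cols` round-trip as `category` dtype and are appended after the data columns, which is why the expected frame is cast with `astype("category")` and compared with `check_like=True`. A local sketch of that behavior, with a temp directory standing in for `s3://pandas-test/parquet_dir`:

```python
# Local (non-S3) illustration of pyarrow's partitioned round-trip; the
# on-disk layout is one subfolder per partition value (A=a/, A=b/).
import tempfile

import pandas as pd

df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})

with tempfile.TemporaryDirectory() as path:
    df.to_parquet(path, engine="pyarrow", partition_cols=["A"], compression=None)
    result = pd.read_parquet(path, engine="pyarrow")

print(result.dtypes)
# B       int64
# A    category   <- partition column comes back categorical, moved to the back
```

The test's `repeat=1` is presumably chosen because a second write into the same S3 prefix would add part files alongside the first rather than replace them.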