IO: Fix parquet read from s3 directory (#33632) · pandas-dev/pandas@22cf0f5
pandas/tests/io/test_parquet.py

@@ -1,7 +1,6 @@
 """ test parquet compat """
 import datetime
 from distutils.version import LooseVersion
-import locale
 import os
 from warnings import catch_warnings
 
@@ -131,6 +130,7 @@ def check_round_trip(
     read_kwargs=None,
     expected=None,
     check_names=True,
+    check_like=False,
     repeat=2,
 ):
     """Verify parquet serializer and deserializer produce the same results.
@@ -150,6 +150,8 @@ def check_round_trip(
         Expected deserialization result, otherwise will be equal to df
     check_names: list of str, optional
         Closed set of column names to be compared
+    check_like: bool, optional
+        If True, ignore the order of index & columns.
     repeat: int, optional
         How many times to repeat the test
     """
@@ -169,7 +171,9 @@ def compare(repeat):
             with catch_warnings(record=True):
                 actual = read_parquet(path, **read_kwargs)
 
-                tm.assert_frame_equal(expected, actual, check_names=check_names)
+                tm.assert_frame_equal(
+                    expected, actual, check_names=check_names, check_like=check_like
+                )
 
     if path is None:
         with tm.ensure_clean() as path:
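For context: with check_like=True, tm.assert_frame_equal reindexes one frame to the other's index/column order before comparing, so frames that differ only in ordering compare equal. A minimal standalone sketch (not part of the diff; it borrows the tests' tm alias for pandas._testing):

    import pandas as pd
    import pandas._testing as tm

    left = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    right = left[["B", "A"]]  # same data, different column order

    # Raises AssertionError: column order differs.
    # tm.assert_frame_equal(left, right)

    # Passes: check_like=True ignores index/column ordering.
    tm.assert_frame_equal(left, right, check_like=True)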
@@ -532,15 +536,37 @@ def test_categorical(self, pa):
         expected = df.astype(object)
         check_round_trip(df, pa, expected=expected)
 
-    # GH#33077 2020-03-27
-    @pytest.mark.xfail(
-        locale.getlocale()[0] == "zh_CN",
-        reason="dateutil cannot parse e.g. '五, 27 3月 2020 21:45:38 GMT'",
-    )
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):
         # GH #19134
         check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet")
 
+    @td.skip_if_no("s3fs")
+    @pytest.mark.parametrize("partition_col", [["A"], []])
+    def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col):
+        from pandas.io.s3 import get_fs as get_s3_fs
+
+        # GH #26388
+        # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716
+        # As per pyarrow, partitioned columns become 'categorical' dtypes
+        # and are added to the back of the dataframe on read
+
+        expected_df = df_compat.copy()
+        if partition_col:
+            expected_df[partition_col] = expected_df[partition_col].astype("category")
+        check_round_trip(
+            df_compat,
+            pa,
+            expected=expected_df,
+            path="s3://pandas-test/parquet_dir",
+            write_kwargs={
+                "partition_cols": partition_col,
+                "compression": None,
+                "filesystem": get_s3_fs(),
+            },
+            check_like=True,
+            repeat=1,
+        )
+
     def test_partition_cols_supported(self, pa, df_full):
         # GH #23283
         partition_cols = ["bool", "int"]