BUG: inconsistent concat casting EA vs non-EA (#38843) · pandas-dev/pandas@2362df9 (original) (raw)

5 files changed

lines changed

Original file line number | Diff line number | Diff line change
@@ -293,10 +293,11 @@ Groupby/resample/rolling
293 293
294 294 Reshaping
295 295 ^^^^^^^^^
296 -
297 296 - Bug in :meth:`DataFrame.unstack` with missing levels led to incorrect index names (:issue:`37510`)
297 +- Bug in :func:`concat` incorrectly casting to ``object`` dtype in some cases when one or more of the operands is empty (:issue:`38843`)
298 298 -
299 299
300 +
300 301 Sparse
301 302 ^^^^^^
302 303
Original file line number | Diff line number | Diff line change
@@ -128,7 +128,7 @@ def is_nonempty(x) -> bool:
128 128 # marginal given that it would still require shape & dtype calculation and
129 129 # np.concatenate which has them both implemented is compiled.
130 130 non_empties = [x for x in to_concat if is_nonempty(x)]
131 -if non_empties and axis == 0:
131 +if non_empties:
132 132 to_concat = non_empties
133 133
134 134 typs = _get_dtype_kinds(to_concat)
Original file line number | Diff line number | Diff line change
@@ -170,11 +170,21 @@ def test_partial_setting_mixed_dtype(self):
170 170 with pytest.raises(ValueError, match=msg):
171 171 df.loc[0] = [1, 2, 3]
172 172
173 -# TODO: #15657, these are left as object and not coerced
173 +@pytest.mark.parametrize("dtype", [None, "int64", "Int64"])
174 +def test_loc_setitem_expanding_empty(self, dtype):
174 175 df = DataFrame(columns=["A", "B"])
175 -df.loc[3] = [6, 7]
176 176
177 -exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype="object")
177 +value = [6, 7]
178 +if dtype == "int64":
179 +value = np.array(value, dtype=dtype)
180 +elif dtype == "Int64":
181 +value = pd.array(value, dtype=dtype)
182 +
183 +df.loc[3] = value
184 +
185 +exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype=dtype)
186 +if dtype is not None:
187 +exp = exp.astype(dtype)
178 188 tm.assert_frame_equal(df, exp)
179 189
180 190 def test_series_partial_set(self):
Original file line number | Diff line number | Diff line change
@@ -474,11 +474,12 @@ def test_concat_will_upcast(dt, pdt):
474 474 assert x.values.dtype == "float64"
475 475
476 476
477 -def test_concat_empty_and_non_empty_frame_regression():
477 +@pytest.mark.parametrize("dtype", ["int64", "Int64"])
478 +def test_concat_empty_and_non_empty_frame_regression(dtype):
478 479 # GH 18178 regression test
479 -df1 = DataFrame({"foo": [1]})
480 +df1 = DataFrame({"foo": [1]}).astype(dtype)
480 481 df2 = DataFrame({"foo": []})
481 -expected = DataFrame({"foo": [1.0]})
482 +expected = df1
482 483 result = pd.concat([df1, df2])
483 484 tm.assert_frame_equal(result, expected)
484 485
Original file line number | Diff line number | Diff line change
@@ -202,12 +202,15 @@ def test_concat_empty_series_dtypes_sparse(self):
202 202 expected = pd.SparseDtype("object")
203 203 assert result.dtype == expected
204 204
205 -def test_concat_empty_df_object_dtype(self):
205 +@pytest.mark.parametrize("dtype", ["int64", "Int64"])
206 +def test_concat_empty_df_object_dtype(self, dtype):
206 207 # GH 9149
207 208 df_1 = DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]})
209 +df_1["Row"] = df_1["Row"].astype(dtype)
208 210 df_2 = DataFrame(columns=df_1.columns)
209 211 result = pd.concat([df_1, df_2], axis=0)
210 -expected = df_1.astype(object)
212 +expected = df_1.copy()
213 +expected["EmptyCol"] = expected["EmptyCol"].astype(object) # TODO: why?
211 214 tm.assert_frame_equal(result, expected)
212 215
213 216 def test_concat_empty_dataframe_dtypes(self):