BUG: inconsistent concat casting EA vs non-EA (#38843) · pandas-dev/pandas@2362df9

5 files changed

lines changed

Original file line number	Diff line number	Diff line change
@@ -293,10 +293,11 @@ Groupby/resample/rolling
293	293
294	294	Reshaping
295	295	^^^^^^^^^
296		-
297	296	- Bug in :meth:`DataFrame.unstack` with missing levels led to incorrect index names (:issue:`37510`)
	297	+- Bug in :func:`concat` incorrectly casting to ``object`` dtype in some cases when one or more of the operands is empty (:issue:`38843`)
298	298	-
299	299
	300	+
300	301	Sparse
301	302	^^^^^^
302	303

Original file line number	Diff line number	Diff line change
@@ -128,7 +128,7 @@ def is_nonempty(x) -> bool:
128	128	# marginal given that it would still require shape & dtype calculation and
129	129	# np.concatenate which has them both implemented is compiled.
130	130	non_empties = [x for x in to_concat if is_nonempty(x)]
131		-if non_empties and axis == 0:
	131	+if non_empties:
132	132	to_concat = non_empties
133	133
134	134	typs = _get_dtype_kinds(to_concat)

Original file line number	Diff line number	Diff line change
@@ -170,11 +170,21 @@ def test_partial_setting_mixed_dtype(self):
170	170	with pytest.raises(ValueError, match=msg):
171	171	df.loc[0] = [1, 2, 3]
172	172
173		-# TODO: #15657, these are left as object and not coerced
	173	+@pytest.mark.parametrize("dtype", [None, "int64", "Int64"])
	174	+def test_loc_setitem_expanding_empty(self, dtype):
174	175	df = DataFrame(columns=["A", "B"])
175		-df.loc[3] = [6, 7]
176	176
177		-exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype="object")
	177	+value = [6, 7]
	178	+if dtype == "int64":
	179	+value = np.array(value, dtype=dtype)
	180	+elif dtype == "Int64":
	181	+value = pd.array(value, dtype=dtype)
	182	+
	183	+df.loc[3] = value
	184	+
	185	+exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype=dtype)
	186	+if dtype is not None:
	187	+exp = exp.astype(dtype)
178	188	tm.assert_frame_equal(df, exp)
179	189
180	190	def test_series_partial_set(self):

Original file line number	Diff line number	Diff line change
@@ -474,11 +474,12 @@ def test_concat_will_upcast(dt, pdt):
474	474	assert x.values.dtype == "float64"
475	475
476	476
477		-def test_concat_empty_and_non_empty_frame_regression():
	477	+@pytest.mark.parametrize("dtype", ["int64", "Int64"])
	478	+def test_concat_empty_and_non_empty_frame_regression(dtype):
478	479	# GH 18178 regression test
479		-df1 = DataFrame({"foo": [1]})
	480	+df1 = DataFrame({"foo": [1]}).astype(dtype)
480	481	df2 = DataFrame({"foo": []})
481		-expected = DataFrame({"foo": [1.0]})
	482	+expected = df1
482	483	result = pd.concat([df1, df2])
483	484	tm.assert_frame_equal(result, expected)
484	485

Original file line number	Diff line number	Diff line change
@@ -202,12 +202,15 @@ def test_concat_empty_series_dtypes_sparse(self):
202	202	expected = pd.SparseDtype("object")
203	203	assert result.dtype == expected
204	204
205		-def test_concat_empty_df_object_dtype(self):
	205	+@pytest.mark.parametrize("dtype", ["int64", "Int64"])
	206	+def test_concat_empty_df_object_dtype(self, dtype):
206	207	# GH 9149
207	208	df_1 = DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]})
	209	+df_1["Row"] = df_1["Row"].astype(dtype)
208	210	df_2 = DataFrame(columns=df_1.columns)
209	211	result = pd.concat([df_1, df_2], axis=0)
210		-expected = df_1.astype(object)
	212	+expected = df_1.copy()
	213	+expected["EmptyCol"] = expected["EmptyCol"].astype(object) # TODO: why?
211	214	tm.assert_frame_equal(result, expected)
212	215
213	216	def test_concat_empty_dataframe_dtypes(self):