TST: Split and simplify test_value_counts_unique_nunique — pandas-dev/pandas pull request #32281, by SaturnFromTitan
@@ -1,3 +1,4 @@
import collections
from datetime import datetime, timedelta
from io import StringIO
import sys
@@ -15,7 +16,6 @@
is_datetime64_dtype,
is_datetime64tz_dtype,
is_object_dtype,
is_period_dtype,
needs_i8_conversion,
)
@@ -26,11 +26,9 @@
Index,
Interval,
IntervalIndex,
PeriodIndex,
Series,
Timedelta,
TimedeltaIndex,
Timestamp,
)
import pandas._testing as tm
@@ -207,180 +205,152 @@ def test_ndarray_compat_properties(self, index_or_series_obj):
assert Index([1]).item() == 1
assert Series([1]).item() == 1
def test_value_counts_unique_nunique(self, index_or_series_obj):
orig = index_or_series_obj
obj = orig.copy()
klass = type(obj)
values = obj._values
if orig.duplicated().any():
pytest.xfail(
"The test implementation isn't flexible enough to deal "
"with duplicated values. This isn't a bug in the "
"application code, but in the test code."
)
def test_unique(self, index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
result = obj.unique()
# create repeated values, 'n'th element is repeated by n+1 times
if isinstance(obj, Index):
expected_index = Index(obj[::-1])
expected_index.name = None
obj = obj.repeat(range(1, len(obj) + 1))
# dict.fromkeys preserves the order
unique_values = list(dict.fromkeys(obj.values))
if isinstance(obj, pd.MultiIndex):
expected = pd.MultiIndex.from_tuples(unique_values)
expected.names = obj.names
tm.assert_index_equal(result, expected)
elif isinstance(obj, pd.Index):
expected = pd.Index(unique_values, dtype=obj.dtype)
if is_datetime64tz_dtype(obj):
expected = expected.normalize()
tm.assert_index_equal(result, expected)
else:
expected_index = Index(values[::-1])
idx = obj.index.repeat(range(1, len(obj) + 1))
# take-based repeat
indices = np.repeat(np.arange(len(obj)), range(1, len(obj) + 1))
rep = values.take(indices)
obj = klass(rep, index=idx)
# check values has the same dtype as the original
assert obj.dtype == orig.dtype
expected_s = Series(
range(len(orig), 0, -1), index=expected_index, dtype="int64"
)
expected = np.array(unique_values)
tm.assert_numpy_array_equal(result, expected)
result = obj.value_counts()
tm.assert_series_equal(result, expected_s)
assert result.index.name is None
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_unique_null(self, null_obj, index_or_series_obj):
    """unique() reports a single NA entry regardless of the NA flavor used.

    The first two elements are overwritten with ``null_obj`` (or ``iNaT``
    for datetime-like data), values are repeated, and the result must
    contain the NA exactly once followed by the non-null distinct values.
    """
    obj = index_or_series_obj

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    elif isinstance(obj, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj.values
    if needs_i8_conversion(obj):
        # datetime-like arrays represent NA as iNaT in their i8 backing
        values[0:2] = iNaT
    else:
        values[0:2] = null_obj

    klass = type(obj)
    repeated_values = np.repeat(values, range(1, len(values) + 1))
    obj = klass(repeated_values, dtype=obj.dtype)
    result = obj.unique()

    unique_values_raw = dict.fromkeys(obj.values)
    # because np.nan == np.nan is False, but None == None is True
    # np.nan would be duplicated, whereas None wouldn't
    unique_values_not_null = [val for val in unique_values_raw if not pd.isnull(val)]
    unique_values = [null_obj] + unique_values_not_null

    if isinstance(obj, pd.Index):
        expected = pd.Index(unique_values, dtype=obj.dtype)
        if is_datetime64tz_dtype(obj):
            # normalize both sides so tz-aware values compare cleanly
            result = result.normalize()
            expected = expected.normalize()
        elif isinstance(obj, pd.CategoricalIndex):
            # NA is never a category; restrict to the non-null values
            expected = expected.set_categories(unique_values_not_null)
        tm.assert_index_equal(result, expected)
    else:
        expected = np.array(unique_values, dtype=obj.dtype)
        tm.assert_numpy_array_equal(result, expected)
def test_nunique(self, index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
expected = len(obj.unique())
assert obj.nunique(dropna=False) == expected
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_nunique_null(self, null_obj, index_or_series_obj):
    """nunique() excludes NA by default; dropna=False counts NA once.

    The first two elements are replaced with NA (``iNaT`` for
    datetime-like data), values are repeated, and the dropna/no-dropna
    counts are compared against the length of ``unique()``.
    """
    obj = index_or_series_obj

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif isinstance(obj, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj.values
    if needs_i8_conversion(obj):
        # datetime-like arrays represent NA as iNaT in their i8 backing
        values[0:2] = iNaT
    else:
        values[0:2] = null_obj

    klass = type(obj)
    repeated_values = np.repeat(values, range(1, len(values) + 1))
    obj = klass(repeated_values, dtype=obj.dtype)

    if isinstance(obj, pd.CategoricalIndex):
        # for categoricals the test expects nunique to track the
        # category set: NA is not a category, hence the +1 with dropna=False
        assert obj.nunique() == len(obj.categories)
        assert obj.nunique(dropna=False) == len(obj.categories) + 1
    else:
        # unique() includes the single NA entry, so nunique() is one less
        num_unique_values = len(obj.unique())
        assert obj.nunique() == max(0, num_unique_values - 1)
        assert obj.nunique(dropna=False) == max(0, num_unique_values)
def test_value_counts(self, index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
result = obj.value_counts()
result = obj.unique()
if isinstance(obj, Index):
tm.assert_index_equal(result, Index(values[1:], name="a"))
elif is_datetime64tz_dtype(obj):
# unable to compare NaT / nan
tm.assert_extension_array_equal(result[1:], values[2:])
assert result[0] is pd.NaT
elif len(obj) > 0:
tm.assert_numpy_array_equal(result[1:], values[2:])
assert pd.isna(result[0])
assert result.dtype == orig.dtype
assert obj.nunique() == max(0, num_values - 2)
assert obj.nunique(dropna=False) == max(0, num_values - 1)
counter = collections.Counter(obj)
expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name)
expected.index = expected.index.astype(obj.dtype)
if isinstance(obj, pd.MultiIndex):
expected.index = pd.Index(expected.index)
# sort_index to avoid switched order when values share the same count
result = result.sort_index()
expected = expected.sort_index()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_value_counts_null(self, null_obj, index_or_series_obj):
    """value_counts() drops NA by default and counts it with dropna=False.

    The first two elements are replaced with NA (``iNaT`` for
    datetime-like data) and repeated (1x and 2x), so the NA appears
    exactly 3 times in the inflated object.
    """
    orig = index_or_series_obj
    obj = orig.copy()

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    elif isinstance(orig, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj.values
    if needs_i8_conversion(obj):
        values[0:2] = iNaT
    else:
        values[0:2] = null_obj

    klass = type(obj)
    repeated_values = np.repeat(values, range(1, len(values) + 1))
    obj = klass(repeated_values, dtype=obj.dtype)

    # because np.nan == np.nan is False, but None == None is True
    # np.nan would be duplicated, whereas None wouldn't
    counter = collections.Counter(obj.dropna())
    expected = pd.Series(dict(counter.most_common()), dtype=np.int64)
    expected.index = expected.index.astype(obj.dtype)

    # check_names=False: value_counts() result naming changed in pandas 2.0
    tm.assert_series_equal(obj.value_counts(), expected, check_names=False)

    # can't use expected[null_obj] = 3 as
    # IntervalIndex doesn't allow assignment
    new_entry = pd.Series({np.nan: 3}, dtype=np.int64)
    # pd.concat instead of Series.append: append was deprecated in
    # pandas 1.4 and removed in pandas 2.0
    expected = pd.concat([expected, new_entry])
    tm.assert_series_equal(obj.value_counts(dropna=False), expected, check_names=False)
def test_value_counts_inferred(self, index_or_series):
klass = index_or_series