TST: Split and simplify test_value_counts_unique_nunique — pandas-dev/pandas pull request #32281, by SaturnFromTitan
@@ -1,3 +1,4 @@
import collections
from datetime import datetime, timedelta
from io import StringIO
import sys
@@ -15,7 +16,6 @@
is_datetime64_dtype,
is_datetime64tz_dtype,
is_object_dtype,
is_period_dtype,
needs_i8_conversion,
)
@@ -26,11 +26,9 @@
Index,
Interval,
IntervalIndex,
PeriodIndex,
Series,
Timedelta,
TimedeltaIndex,
Timestamp,
)
import pandas._testing as tm
@@ -207,180 +205,152 @@ def test_ndarray_compat_properties(self, index_or_series_obj):
assert Index([1]).item() == 1
assert Series([1]).item() == 1
def test_value_counts_unique_nunique(self, index_or_series_obj):
orig = index_or_series_obj
obj = orig.copy()
klass = type(obj)
values = obj._values
if orig.duplicated().any():
pytest.xfail(
"The test implementation isn't flexible enough to deal "
"with duplicated values. This isn't a bug in the "
"application code, but in the test code."
)
def test_unique(self, index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
result = obj.unique()
# create repeated values, 'n'th element is repeated by n+1 times
if isinstance(obj, Index):
expected_index = Index(obj[::-1])
expected_index.name = None
obj = obj.repeat(range(1, len(obj) + 1))
# dict.fromkeys preserves the order
unique_values = list(dict.fromkeys(obj.values))
if isinstance(obj, pd.MultiIndex):
expected = pd.MultiIndex.from_tuples(unique_values)
expected.names = obj.names
tm.assert_index_equal(result, expected)
elif isinstance(obj, pd.Index):
expected = pd.Index(unique_values, dtype=obj.dtype)
if is_datetime64tz_dtype(obj):
expected = expected.normalize()
tm.assert_index_equal(result, expected)
else:
expected_index = Index(values[::-1])
idx = obj.index.repeat(range(1, len(obj) + 1))
# take-based repeat
indices = np.repeat(np.arange(len(obj)), range(1, len(obj) + 1))
rep = values.take(indices)
obj = klass(rep, index=idx)
# check values has the same dtype as the original
assert obj.dtype == orig.dtype
expected_s = Series(
range(len(orig), 0, -1), index=expected_index, dtype="int64"
)
expected = np.array(unique_values)
tm.assert_numpy_array_equal(result, expected)
result = obj.value_counts()
tm.assert_series_equal(result, expected_s)
assert result.index.name is None
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_unique_null(self, null_obj, index_or_series_obj):
    """unique() reports a single NA entry regardless of the NA flavor used.

    The first two elements are overwritten with ``null_obj`` (or ``iNaT``
    for datetime-like data), values are repeated, and the result must
    contain the NA exactly once followed by the non-null distinct values.
    """
    obj = index_or_series_obj

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    elif isinstance(obj, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj.values
    if needs_i8_conversion(obj):
        # datetime-like arrays represent NA as iNaT in their i8 backing
        values[0:2] = iNaT
    else:
        values[0:2] = null_obj

    klass = type(obj)
    repeated_values = np.repeat(values, range(1, len(values) + 1))
    obj = klass(repeated_values, dtype=obj.dtype)
    result = obj.unique()

    unique_values_raw = dict.fromkeys(obj.values)
    # because np.nan == np.nan is False, but None == None is True
    # np.nan would be duplicated, whereas None wouldn't
    unique_values_not_null = [val for val in unique_values_raw if not pd.isnull(val)]
    unique_values = [null_obj] + unique_values_not_null

    if isinstance(obj, pd.Index):
        expected = pd.Index(unique_values, dtype=obj.dtype)
        if is_datetime64tz_dtype(obj):
            # normalize both sides so tz-aware values compare cleanly
            result = result.normalize()
            expected = expected.normalize()
        elif isinstance(obj, pd.CategoricalIndex):
            # NA is never a category; restrict to the non-null values
            expected = expected.set_categories(unique_values_not_null)
        tm.assert_index_equal(result, expected)
    else:
        expected = np.array(unique_values, dtype=obj.dtype)
        tm.assert_numpy_array_equal(result, expected)
def test_nunique(self, index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
expected = len(obj.unique())
assert obj.nunique(dropna=False) == expected
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_nunique_null(self, null_obj, index_or_series_obj):
    """nunique() excludes NA by default; dropna=False counts NA once.

    The first two elements are replaced with NA (``iNaT`` for
    datetime-like data), values are repeated, and the dropna/no-dropna
    counts are compared against the length of ``unique()``.
    """
    obj = index_or_series_obj

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif isinstance(obj, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj.values
    if needs_i8_conversion(obj):
        # datetime-like arrays represent NA as iNaT in their i8 backing
        values[0:2] = iNaT
    else:
        values[0:2] = null_obj

    klass = type(obj)
    repeated_values = np.repeat(values, range(1, len(values) + 1))
    obj = klass(repeated_values, dtype=obj.dtype)

    if isinstance(obj, pd.CategoricalIndex):
        # for categoricals the test expects nunique to track the
        # category set: NA is not a category, hence the +1 with dropna=False
        assert obj.nunique() == len(obj.categories)
        assert obj.nunique(dropna=False) == len(obj.categories) + 1
    else:
        # unique() includes the single NA entry, so nunique() is one less
        num_unique_values = len(obj.unique())
        assert obj.nunique() == max(0, num_unique_values - 1)
        assert obj.nunique(dropna=False) == max(0, num_unique_values)
def test_value_counts(self, index_or_series_obj):
obj = index_or_series_obj
obj = np.repeat(obj, range(1, len(obj) + 1))
result = obj.value_counts()
result = obj.unique()
if isinstance(obj, Index):
tm.assert_index_equal(result, Index(values[1:], name="a"))
elif is_datetime64tz_dtype(obj):
# unable to compare NaT / nan
tm.assert_extension_array_equal(result[1:], values[2:])
assert result[0] is pd.NaT
elif len(obj) > 0:
tm.assert_numpy_array_equal(result[1:], values[2:])
assert pd.isna(result[0])
assert result.dtype == orig.dtype
assert obj.nunique() == max(0, num_values - 2)
assert obj.nunique(dropna=False) == max(0, num_values - 1)
counter = collections.Counter(obj)
expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name)
expected.index = expected.index.astype(obj.dtype)
if isinstance(obj, pd.MultiIndex):
expected.index = pd.Index(expected.index)
# sort_index to avoid switched order when values share the same count
result = result.sort_index()
expected = expected.sort_index()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_value_counts_null(self, null_obj, index_or_series_obj):
    """value_counts() drops NA by default and counts it with dropna=False.

    The first two elements are replaced with NA (``iNaT`` for
    datetime-like data) and repeated (1x and 2x), so the NA appears
    exactly 3 times in the inflated object.
    """
    orig = index_or_series_obj
    obj = orig.copy()

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    elif isinstance(orig, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj.values
    if needs_i8_conversion(obj):
        values[0:2] = iNaT
    else:
        values[0:2] = null_obj

    klass = type(obj)
    repeated_values = np.repeat(values, range(1, len(values) + 1))
    obj = klass(repeated_values, dtype=obj.dtype)

    # because np.nan == np.nan is False, but None == None is True
    # np.nan would be duplicated, whereas None wouldn't
    counter = collections.Counter(obj.dropna())
    expected = pd.Series(dict(counter.most_common()), dtype=np.int64)
    expected.index = expected.index.astype(obj.dtype)

    # check_names=False: value_counts() result naming changed in pandas 2.0
    tm.assert_series_equal(obj.value_counts(), expected, check_names=False)

    # can't use expected[null_obj] = 3 as
    # IntervalIndex doesn't allow assignment
    new_entry = pd.Series({np.nan: 3}, dtype=np.int64)
    # pd.concat instead of Series.append: append was deprecated in
    # pandas 1.4 and removed in pandas 2.0
    expected = pd.concat([expected, new_entry])
    tm.assert_series_equal(obj.value_counts(dropna=False), expected, check_names=False)
def test_value_counts_inferred(self, index_or_series):
klass = index_or_series