PERF: Performance improvement in value_counts for masked arrays by phofl · Pull Request #48338 · pandas-dev/pandas (original) (raw)

ser = pd.Series([1, 2, pd.NA] + list(range(1_000_000)), dtype="Int64")
ser.value_counts(dropna=False)
ser.value_counts(dropna=True)

# old dropna=False
# 99.6 ms ± 599 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

# new dropna=False
# 46.1 ms ± 349 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

# old dropna=True
# 138 ms ± 639 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

# new dropna=True
# 44.6 ms ± 439 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

data = np.array(list(range(1_000_000)))
pd.Series(data, dtype="Int64")

# old Series(numpy_array, dtype="Int64")
# 45.8 ms ± 499 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

# new Series(numpy_array, dtype="Int64")
# 63.8 µs ± 329 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)