ENH: Add numba engine to groupby.aggregate by mroeschke · Pull Request #33388 · pandas-dev/pandas (original) (raw)
# Build a 100,000-row frame: column 0 holds the string keys "0".."99" repeated
# N times, column 1 holds the integers 0..99 repeated N times.
# NOTE(review): `pd` is never imported in this session -- presumably
# `import pandas as pd` ran beforehand; confirm.
In [1]: N = 10 ** 3
In [2]: data = {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}
In [3]: df = pd.DataFrame(data, columns=[0, 1])
In [4]: def f_numba(values, index):
   ...:     """Return a position-dependent sum of ``values``.
   ...:
   ...:     Items at even positions (0, 2, ...) contribute ``value * 2``;
   ...:     items at odd positions contribute ``value + 5``.
   ...:
   ...:     ``index`` is not used in the body. NOTE(review): the two-argument
   ...:     ``(values, index)`` signature is presumably what ``engine='numba'``
   ...:     expects -- confirm against the pandas documentation.
   ...:     """
   ...:     total = 0
   ...:     for i, value in enumerate(values):
   ...:         if i % 2:
   ...:             # odd position
   ...:             total += value + 5
   ...:         else:
   ...:             # even position
   ...:             total += value * 2
   ...:     return total
   ...:
In [5]: def f_cython(values):
   ...:     """Return a position-dependent sum of ``values``.
   ...:
   ...:     Items at even positions (0, 2, ...) contribute ``value * 2``;
   ...:     items at odd positions contribute ``value + 5``.
   ...:
   ...:     Takes only ``values``; it is the function passed to
   ...:     ``groupby.aggregate(..., engine='cython')`` in this session.
   ...:     """
   ...:     total = 0
   ...:     for i, value in enumerate(values):
   ...:         if i % 2:
   ...:             # odd position
   ...:             total += value + 5
   ...:         else:
   ...:             # even position
   ...:             total += value * 2
   ...:     return total
   ...:
# Group on column 0 once so every timing below reuses the same GroupBy object.
In [6]: groupby = df.groupby(0)
# First call: the measured time includes compilation, hence a single run and a single loop (-r 1 -n 1).
In [7]: %timeit -r 1 -n 1 groupby.aggregate(f_numba, engine='numba') # noqa: E225
2.14 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
# Later calls reuse the cached compiled function, so they run much faster.
In [8]: %timeit groupby.aggregate(f_numba, engine='numba')
4.93 ms ± 32.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# Same aggregation through engine='cython' for comparison.
In [9]: %timeit groupby.aggregate(f_cython, engine='cython')
18.6 ms ± 84.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)