ENH: Add numba engine to groupby.aggregate by mroeschke · Pull Request #33388 · pandas-dev/pandas

   In [1]: N = 10 ** 3

   In [2]: data = {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}

   In [3]: df = pd.DataFrame(data, columns=[0, 1])

   In [4]: def f_numba(values, index):
      ...:     total = 0
      ...:     for i, value in enumerate(values):
      ...:         if i % 2:
      ...:             total += value + 5
      ...:         else:
      ...:             total += value * 2
      ...:     return total
      ...:

   In [5]: def f_cython(values):
      ...:     total = 0
      ...:     for i, value in enumerate(values):
      ...:         if i % 2:
      ...:             total += value + 5
      ...:         else:
      ...:             total += value * 2
      ...:     return total
      ...:

   In [6]: groupby = df.groupby(0)
   # First call: the timing includes the one-time numba compilation, so it is slow
   In [7]: %timeit -r 1 -n 1 groupby.aggregate(f_numba, engine='numba')  # noqa: E225
   2.14 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
   # Subsequent calls reuse the cached compiled function, so they run much faster
   In [8]: %timeit groupby.aggregate(f_numba, engine='numba')
   4.93 ms ± 32.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

   In [9]: %timeit groupby.aggregate(f_cython, engine='cython')
   18.6 ms ± 84.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)