PERF: Sparse ops speedup by sinhrks · Pull Request #13082 · pandas-dev/pandas (original) (raw)
Follow-up for #13036. The performance improvements are not significant because the number of
list.append calls made previously was not very large.
import numpy as np
import pandas as pd
# Benchmark 1 setup: two dense float arrays of length N that are NaN
# everywhere except at randomly chosen positions (at most N // 10 each).
np.random.seed(1)  # fixed seed so "before" and "after" runs use identical data
N = 1000000
a = np.full(N, np.nan)
b = np.full(N, np.nan)
# The sample size must be an integer: use floor division, because under
# Python 3 `N / 10` is a float and np.random.randint rejects it.
# np.unique removes duplicate draws and returns the positions sorted.
indexer_a = np.unique(np.random.randint(0, N, N // 10))
indexer_b = np.unique(np.random.randint(0, N, N // 10))
a[indexer_a] = np.random.randint(0, 100, len(indexer_a))
b[indexer_b] = np.random.randint(0, 100, len(indexer_b))
sa = pd.SparseArray(a)
sb = pd.SparseArray(b)
%timeit a.sp_index.intersect(sb.sp_index0)
# before
#100 loops, best of 3: 3.04 ms per loop
# after
#100 loops, best of 3: 2.11 ms per loop
def make_sparse_array(length, num_blocks, block_size, fill_value):
    """Build a SparseArray containing randomly placed runs of random data.

    A dense array of ``length`` copies of ``fill_value`` is created. Then,
    ``num_blocks`` times, a random start position is drawn and the following
    ``block_size`` entries (fewer when the run reaches the end of the array)
    are overwritten with random integers drawn from ``[0, 100)``. Runs may
    overlap, so the number of populated entries is at most
    ``num_blocks * block_size``.

    Parameters
    ----------
    length : int
        Total number of elements in the array.
    num_blocks : int
        Number of runs of data to write.
    block_size : int
        Maximum length of each run.
    fill_value : scalar
        Value used for the unpopulated entries and as the sparse fill value.

    Returns
    -------
    SparseArray
        Sparse representation of the generated dense array.
    """
    # Resolve the class in a version-tolerant way: newer pandas exposes it
    # under ``pd.arrays``; older releases only have the top-level alias.
    try:
        sparse_array_cls = pd.arrays.SparseArray
    except AttributeError:
        sparse_array_cls = pd.SparseArray

    dense = np.array([fill_value] * length)
    for _ in range(num_blocks):
        start = np.random.randint(0, length)
        # Basic slicing returns a view, so assigning into it updates `dense`;
        # the slice is clipped automatically at the end of the array.
        window = dense[start:start + block_size]
        window[:] = np.random.randint(0, 100, len(window))
    return sparse_array_cls(dense, fill_value=fill_value)
N = 1000000
B = 10000
BS = 10
a = make_sparse_array(length=N, num_blocks=B, block_size=BS, fill_value=np.nan)
b = make_sparse_array(length=N, num_blocks=B, block_size=BS, fill_value=np.nan)
%timeit a + b
# before
#10 loops, best of 3: 70.8 ms per loop
# after
#10 loops, best of 3: 66 ms per loop