BUG: MultiIndex intersection with sort=False does not preserve order by jeffzi · Pull Request #31312 · pandas-dev/pandas (original) (raw)
I benchmarked the intersection of 2 arrays of tuples: master vs. this PR vs. the suggestion by @jreback to take inspiration from `difference`.
I also looked at the implementation of `Index.intersection` and found that performance can be improved when both MultiIndexes are monotonic.
@jreback If that's ok with you, I can commit the "mixed" version below.
import numpy as np
import pandas as pd
from pandas.testing import assert_index_equal
def master(self, other):
    """Baseline ("master") intersection: plain set intersection of the values.

    Both operands' ``_ndarray_values`` are converted to sets, so the result
    is a ``set`` and carries no ordering information from ``self``.
    """
    left_values = set(self._ndarray_values)
    right_values = set(other._ndarray_values)
    return left_values & right_values
def pr(self, other):
    """Order-preserving intersection as proposed in this PR.

    Returns a list holding each value of ``self._ndarray_values`` that also
    occurs in ``other._ndarray_values``, kept in the order of its first
    appearance in ``self`` and emitted only once.
    """
    lvals = self._ndarray_values
    rvals = other._ndarray_values
    # Set of the right-hand values for constant-time membership tests.
    other_uniq = set(rvals)
    seen = set()
    # ``seen.add(x)`` returns None (falsy): the first time ``x`` passes the
    # membership test it is recorded and kept; later repeats hit ``x in seen``
    # and are filtered out.
    return [x for x in lvals if x in other_uniq and not (x in seen or seen.add(x))]
def indexer(self, other):
    """Intersection via indexers (the reviewer-suggested variant).

    When both operands report ``is_monotonic``, the work is delegated to
    ``self._inner_indexer`` and the first element of its result is returned.
    Otherwise ``self``'s values are looked up in the de-duplicated ``other``
    and the matching entries are taken from it.

    NOTE(review): relies on private pandas attributes (``_ndarray_values``,
    ``_inner_indexer``); confirm they exist in the pandas version in use.
    """
    lvals = self._ndarray_values
    if self.is_monotonic and other.is_monotonic:
        rvals = other._ndarray_values
        # Presumably element 0 holds the joined values -- TODO confirm
        # against the ``_inner_indexer`` implementation.
        return self._inner_indexer(lvals, rvals)[0]

    other_uniq = other.unique()
    # Position in ``other_uniq`` of every value of ``self``; ``get_indexer``
    # marks values that are not found with -1.
    positions = other_uniq.get_indexer(lvals)
    # Drop the -1 (not found) entries, keeping the remaining positions in
    # the order of ``self``.
    positions = positions.take((positions != -1).nonzero()[0])
    return other_uniq.take(positions)._ndarray_values
def mixed(self, other):
    """Intersection combining the indexer fast path with the PR's fallback.

    When both operands report ``is_monotonic``, delegate to
    ``self._inner_indexer`` and return the first element of its result.
    Otherwise return a list of the values of ``self`` that also occur in
    ``other``, in order of first appearance in ``self`` and without repeats.

    NOTE(review): relies on private pandas attributes (``_ndarray_values``,
    ``_inner_indexer``); confirm they exist in the pandas version in use.
    """
    lvals = self._ndarray_values
    rvals = other._ndarray_values
    if self.is_monotonic and other.is_monotonic:
        return self._inner_indexer(lvals, rvals)[0]

    # Set of the right-hand values for constant-time membership tests.
    other_uniq = set(rvals)
    seen = set()
    # ``seen.add(x)`` returns None (falsy): the first time ``x`` passes the
    # membership test it is recorded and kept; later repeats are dropped.
    return [x for x in lvals if x in other_uniq and not (x in seen or seen.add(x))]
size = 100000 left = pd.MultiIndex.from_arrays([np.arange(1, size), np.arange(1, size)]) right = pd.MultiIndex.from_arrays([np.arange(1, size//10)[::-1], np.arange(1, size//10)[::-1]]) right_monotonic = pd.MultiIndex.from_arrays([np.arange(1, size//10), np.arange(1, size//10)])
for r in [right, right_monotonic]: print(f"\nBoth monotonic: {r.is_monotonic}\n---------------") expected = pd.MultiIndex.from_tuples(pr(left, r)) actual_indexer = pd.MultiIndex.from_tuples(indexer(left, r)) assert_index_equal(expected, actual_indexer) actual_mixed = pd.MultiIndex.from_tuples(mixed(left, r)) assert_index_equal(expected, actual_mixed)
print("master implementation:")
%timeit -n 10 master(left, r)
print("pr implementation:")
%timeit -n 10 pr(left, r)
print("Suggested indexer implementation:")
%timeit -n 10 indexer(left, r)
print("Mixed implementation:")
%timeit -n 10 mixed(left, r)
Both monotonic: False
---------------
master implementation:
18.2 ms ± 5.36 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
pr implementation:
10.6 ms ± 625 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Suggested indexer implementation:
170 ms ± 4.36 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Mixed implementation:
10.2 ms ± 562 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Both monotonic: True
---------------
master implementation:
15.1 ms ± 984 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
pr implementation:
10.4 ms ± 730 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Suggested indexer implementation:
2.56 ms ± 292 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Mixed implementation:
2.63 ms ± 199 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
'platform': 'Darwin',
'platform-release': '19.2.0',
'architecture': 'x86_64',
'ram': '16 GB'