PERF: improve MultiIndex get_loc performance (#16346) · pandas-dev/pandas@34ebad8 (original) (raw)
`@@ -4,14 +4,17 @@
`
4
4
`import itertools
`
5
5
``
6
6
`import numpy as np
`
7
``
`-
from pandas._libs import hashing
`
``
7
`+
from pandas._libs import hashing, tslib
`
8
8
`from pandas.core.dtypes.generic import (
`
9
9
`ABCMultiIndex,
`
10
10
`ABCIndexClass,
`
11
11
`ABCSeries,
`
12
12
`ABCDataFrame)
`
13
13
`from pandas.core.dtypes.common import (
`
14
14
`is_categorical_dtype, is_list_like)
`
``
15
`+
from pandas.core.dtypes.missing import isnull
`
``
16
`+
from pandas.core.dtypes.cast import infer_dtype_from_scalar
`
``
17
+
15
18
``
16
19
`# 16 byte long hashing key
`
17
20
`_default_hash_key = '0123456789123456'
`
`@@ -164,6 +167,29 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
`
164
167
`return h
`
165
168
``
166
169
``
``
170
`+
def hash_tuple(val, encoding='utf8', hash_key=None):
    """
    Hash a single tuple efficiently

    Each element of the tuple is hashed on its own with ``_hash_scalar``
    and the per-element results are then merged by
    ``_combine_hash_arrays``.

    Parameters
    ----------
    val : single tuple
    encoding : string, default 'utf8'
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    hash
        The first element of the array produced by
        ``_combine_hash_arrays``.

    """
    # Lazily hash the elements one by one; the combiner consumes the
    # generator and is told up front how many hashes to expect.
    element_hashes = (
        _hash_scalar(item, encoding=encoding, hash_key=hash_key)
        for item in val)

    combined = _combine_hash_arrays(element_hashes, len(val))

    # Only the first entry of the combined result is returned.
    return combined[0]
`
``
191
+
``
192
+
167
193
`def _hash_categorical(c, encoding, hash_key):
`
168
194
`"""
`
169
195
` Hash a Categorical by hashing its categories, and then mapping the codes
`
`@@ -276,3 +302,31 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
`
276
302
`vals *= np.uint64(0x94d049bb133111eb)
`
277
303
`vals ^= vals >> 31
`
278
304
`return vals
`
``
305
+
``
306
+
``
307
`+
def _hash_scalar(val, encoding='utf8', hash_key=None):
    """
    Hash scalar value

    Parameters
    ----------
    val : scalar value to hash
    encoding : string, default 'utf8'
        Forwarded to ``hash_array``.
    hash_key : string key to encode, default None
        Forwarded to ``hash_array``.

    Returns
    -------
    1d uint64 numpy array of hash value, of length 1
    """
    if isnull(val):
        # Nulls hash to the largest uint64 value; this is to be consistent
        # with the _hash_categorical implementation.
        null_hash = np.iinfo(np.uint64).max
        return np.array([null_hash], dtype='u8')

    if getattr(val, 'tzinfo', None) is not None:
        # For tz-aware datetimes, we need the underlying naive UTC value and
        # not the tz aware object or pd extension type (as
        # infer_dtype_from_scalar would do), so coerce to a Timestamp if
        # needed and strip the timezone.
        stamp = (val if isinstance(val, tslib.Timestamp)
                 else tslib.Timestamp(val))
        val = stamp.tz_convert(None)

    inferred_dtype, val = infer_dtype_from_scalar(val)

    # Wrap the scalar in a length-1 array so it can go through hash_array.
    single = np.array([val], dtype=inferred_dtype)

    return hash_array(single, hash_key=hash_key, encoding=encoding,
                      categorize=False)
`