PERF: improve MultiIndex get_loc performance (#16346) · pandas-dev/pandas@34ebad8 (original) (raw)

`@@ -4,14 +4,17 @@

`

4

4

`import itertools

`

5

5

``

6

6

`import numpy as np

`

7

``

`-

from pandas._libs import hashing

`

``

7

`+

from pandas._libs import hashing, tslib

`

8

8

`from pandas.core.dtypes.generic import (

`

9

9

`ABCMultiIndex,

`

10

10

`ABCIndexClass,

`

11

11

`ABCSeries,

`

12

12

`ABCDataFrame)

`

13

13

`from pandas.core.dtypes.common import (

`

14

14

`is_categorical_dtype, is_list_like)

`

``

15

`+

from pandas.core.dtypes.missing import isnull

`

``

16

`+

from pandas.core.dtypes.cast import infer_dtype_from_scalar

`

``

17

+

15

18

``

16

19

`# 16 byte long hashing key

`

17

20

`_default_hash_key = '0123456789123456'

`

`@@ -164,6 +167,29 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):

`

164

167

`return h

`

165

168

``

166

169

``

``

170

`+

def hash_tuple(val, encoding='utf8', hash_key=None):
    """
    Hash a single tuple efficiently

    Parameters
    ----------
    val : single tuple
    encoding : string, default 'utf8'
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    hash

    """
    # hash each element on its own, lazily; _hash_scalar documents its
    # result as a length-1 uint64 array
    hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key)
              for v in val)

    # combine the per-element hashes and take the first entry of the
    # combined array as the hash of the whole tuple
    h = _combine_hash_arrays(hashes, len(val))[0]

    return h

`

``

191

+

``

192

+

167

193

`def _hash_categorical(c, encoding, hash_key):

`

168

194

`"""

`

169

195

` Hash a Categorical by hashing its categories, and then mapping the codes

`

`@@ -276,3 +302,31 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):

`

276

302

`vals *= np.uint64(0x94d049bb133111eb)

`

277

303

`vals ^= vals >> 31

`

278

304

`return vals

`

``

305

+

``

306

+

``

307

`+

def _hash_scalar(val, encoding='utf8', hash_key=None):
    """
    Hash scalar value

    Parameters
    ----------
    val : scalar
    encoding : string, default 'utf8'
        passed through to hash_array
    hash_key : string key to encode, default None
        passed through to hash_array

    Returns
    -------
    1d uint64 numpy array of hash value, of length 1
    """

    if isnull(val):
        # this is to be consistent with the _hash_categorical implementation
        return np.array([np.iinfo(np.uint64).max], dtype='u8')

    if getattr(val, 'tzinfo', None) is not None:
        # for tz-aware datetimes, we need the underlying naive UTC value and
        # not the tz aware object or pd extension type (as
        # infer_dtype_from_scalar would do)
        if not isinstance(val, tslib.Timestamp):
            val = tslib.Timestamp(val)
        val = val.tz_convert(None)

    # wrap the scalar in a length-1 array of its inferred dtype so that it
    # can be hashed through the array code path
    dtype, val = infer_dtype_from_scalar(val)
    vals = np.array([val], dtype=dtype)

    return hash_array(vals, hash_key=hash_key, encoding=encoding,
                      categorize=False)

`