PERF: improve MultiIndex get_loc performance (#16346) · pandas-dev/pandas@34ebad8 (original) (raw)

`@@ -4,14 +4,17 @@

4

`import itertools

5

6

`import numpy as np

7

from pandas._libs import hashing

7

from pandas._libs import hashing, tslib

8

`from pandas.core.dtypes.generic import (

9

`ABCMultiIndex,

10

`ABCIndexClass,

11

`ABCSeries,

12

`ABCDataFrame)

13

`from pandas.core.dtypes.common import (

14

`is_categorical_dtype, is_list_like)

15

from pandas.core.dtypes.missing import isnull

16

from pandas.core.dtypes.cast import infer_dtype_from_scalar

17

+

15

18

16

19

`# 16 byte long hashing key

17

20

`_default_hash_key = '0123456789123456'

`@@ -164,6 +167,29 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):

164

167

`return h

165

168

166

169

170

def hash_tuple(val, encoding='utf8', hash_key=None):
    """
    Hash a single tuple efficiently

    Each element of the tuple is hashed on its own with ``_hash_scalar``
    and the per-element hashes are then merged with
    ``_combine_hash_arrays``.

    Parameters
    ----------
    val : single tuple
    encoding : string, default 'utf8'
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    hash

    """
    # Lazy generator: the per-element hashes are produced on demand while
    # _combine_hash_arrays consumes them, so no intermediate list is built.
    element_hashes = (
        _hash_scalar(element, encoding=encoding, hash_key=hash_key)
        for element in val
    )

    # The combined result is array-like; its first entry is the hash of
    # the whole tuple.
    combined = _combine_hash_arrays(element_hashes, len(val))
    return combined[0]

191

+

192

+

167

193

`def _hash_categorical(c, encoding, hash_key):

168

194

`"""

169

195

` Hash a Categorical by hashing its categories, and then mapping the codes

`@@ -276,3 +302,31 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):

276

302

`vals *= np.uint64(0x94d049bb133111eb)

277

303

`vals ^= vals >> 31

278

304

`return vals

305

+

306

+

307

def _hash_scalar(val, encoding='utf8', hash_key=None):
    """
    Hash scalar value

    Parameters
    ----------
    val : scalar
    encoding : string, default 'utf8'
        Passed through to ``hash_array``.
    hash_key : string key to encode
        Passed through to ``hash_array``.

    Returns
    -------
    1d uint64 numpy array of hash value, of length 1
    """
    if isnull(val):
        # Nulls hash to the largest uint64 value; this is to be consistent
        # with the _hash_categorical implementation.
        null_hash = np.iinfo(np.uint64).max
        return np.array([null_hash], dtype='u8')

    is_tz_aware = getattr(val, 'tzinfo', None) is not None
    if is_tz_aware:
        # For tz-aware datetimes, we need the underlying naive UTC value and
        # not the tz aware object or pd extension type (as
        # infer_dtype_from_scalar would do).
        stamp = (val if isinstance(val, tslib.Timestamp)
                 else tslib.Timestamp(val))
        val = stamp.tz_convert(None)

    # Wrap the (possibly converted) scalar in a length-1 array of the
    # inferred dtype so it can go through the regular array hashing path.
    dtype, val = infer_dtype_from_scalar(val)
    vals = np.array([val], dtype=dtype)

    return hash_array(vals, hash_key=hash_key, encoding=encoding,
                      categorize=False)