Merge pull request #9380 from behzadnouri/i8grby · pandas-dev/pandas@9f439f0 (original) (raw)
`@@ -1367,30 +1367,16 @@ def group_info(self):
`
1367
1367
``
1368
1368
`def _get_compressed_labels(self):
`
1369
1369
`all_labels = [ping.labels for ping in self.groupings]
`
1370
``
`-
if self._overflow_possible:
`
1371
``
`-
tups = lib.fast_zip(all_labels)
`
1372
``
`-
labs, uniques = algos.factorize(tups)
`
``
1370
`+
if len(all_labels) > 1:
`
``
1371
`+
group_index = get_group_index(all_labels, self.shape,
`
``
1372
`+
sort=True, xnull=True)
`
``
1373
`+
return _compress_group_index(group_index)
`
1373
1374
``
1374
``
`-
if self.sort:
`
1375
``
`-
uniques, labs = _reorder_by_uniques(uniques, labs)
`
``
1375
`+
ping = self.groupings[0]
`
``
1376
`+
self.compressed = False
`
``
1377
`+
self._filter_empty_groups = False
`
1376
1378
``
1377
``
`-
return labs, uniques
`
1378
``
`-
else:
`
1379
``
`-
if len(all_labels) > 1:
`
1380
``
`-
group_index = get_group_index(all_labels, self.shape)
`
1381
``
`-
comp_ids, obs_group_ids = _compress_group_index(group_index)
`
1382
``
`-
else:
`
1383
``
`-
ping = self.groupings[0]
`
1384
``
`-
comp_ids = ping.labels
`
1385
``
`-
obs_group_ids = np.arange(len(ping.group_index))
`
1386
``
`-
self.compressed = False
`
1387
``
`-
self._filter_empty_groups = False
`
1388
``
-
1389
``
`-
return comp_ids, obs_group_ids
`
1390
``
-
1391
``
`-
@cache_readonly
`
1392
``
`-
def _overflow_possible(self):
`
1393
``
`-
return _int64_overflow_possible(self.shape)
`
``
1379
`+
return ping.labels, np.arange(len(ping.group_index))
`
1394
1380
``
1395
1381
`@cache_readonly
`
1396
1382
`def ngroups(self):
`
`@@ -1402,15 +1388,13 @@ def result_index(self):
`
1402
1388
`return MultiIndex.from_arrays(recons, names=self.names)
`
1403
1389
``
1404
1390
`def get_group_levels(self):
`
1405
``
`-
obs_ids = self.group_info[1]
`
``
1391
`+
comp_ids, obs_ids, _ = self.group_info
`
1406
1392
``
1407
1393
`if not self.compressed and len(self.groupings) == 1:
`
1408
1394
`return [self.groupings[0].group_index]
`
1409
1395
``
1410
``
`-
if self._overflow_possible:
`
1411
``
`-
recons_labels = [np.array(x) for x in zip(*obs_ids)]
`
1412
``
`-
else:
`
1413
``
`-
recons_labels = decons_group_index(obs_ids, self.shape)
`
``
1396
`+
recons_labels = decons_obs_group_ids(comp_ids, obs_ids,
`
``
1397
`+
self.shape, (ping.labels for ping in self.groupings))
`
1414
1398
``
1415
1399
`name_list = []
`
1416
1400
`for ping, labels in zip(self.groupings, recons_labels):
`
`@@ -3490,42 +3474,28 @@ def get_splitter(data, *args, **kwargs):
`
3490
3474
`# Misc utilities
`
3491
3475
``
3492
3476
``
3493
``
`-
def get_group_index(label_list, shape):
`
``
3477
`+
def get_group_index(labels, shape, sort, xnull):
`
3494
3478
`"""
`
3495
3479
` For the particular label_list, gets the offsets into the hypothetical list
`
3496
3480
` representing the totally ordered cartesian product of all possible label
`
3497
``
`-
combinations.
`
3498
``
`-
"""
`
3499
``
`-
if len(label_list) == 1:
`
3500
``
`-
return label_list[0]
`
3501
``
-
3502
``
`-
n = len(label_list[0])
`
3503
``
`-
group_index = np.zeros(n, dtype=np.int64)
`
3504
``
`-
mask = np.zeros(n, dtype=bool)
`
3505
``
`-
for i in range(len(shape)):
`
3506
``
`-
stride = np.prod([x for x in shape[i + 1:]], dtype=np.int64)
`
3507
``
`-
group_index += com._ensure_int64(label_list[i]) * stride
`
3508
``
`-
mask |= label_list[i] < 0
`
3509
``
-
3510
``
`-
np.putmask(group_index, mask, -1)
`
3511
``
`-
return group_index
`
3512
``
-
3513
``
-
3514
``
`-
def get_flat_ids(labels, shape, retain_lex_rank):
`
3515
``
`-
"""
`
3516
``
`-
Given a list of labels at each level, returns a flat array of int64 ids
`
3517
``
`` -
corresponding to unique tuples across the labels. If `retain_lex_rank`,
``
3518
``
`-
rank of returned ids preserve lexical ranks of labels.
`
``
3481
`+
combinations, as long as this space fits within int64 bounds;
`
``
3482
`+
otherwise, though group indices identify unique combinations of
`
``
3483
`+
labels, they cannot be deconstructed.
`
``
3484
`` +
- If `sort`, rank of returned ids preserve lexical ranks of labels.
``
``
3485
`+
i.e. returned id's can be used to do lexical sort on labels;
`
``
3486
`` +
- If `xnull` nulls (-1 labels) are passed through.
``
3519
3487
``
3520
3488
` Parameters
`
3521
3489
` ----------
`
3522
3490
` labels: sequence of arrays
`
3523
3491
` Integers identifying levels at each location
`
3524
3492
` shape: sequence of ints same length as labels
`
3525
3493
` Number of unique levels at each location
`
3526
``
`-
retain_lex_rank: boolean
`
``
3494
`+
sort: boolean
`
3527
3495
` If the ranks of returned ids should match lexical ranks of labels
`
3528
``
-
``
3496
`+
xnull: boolean
`
``
3497
`+
If true nulls are eXcluded. i.e. -1 values in the labels are
`
``
3498
`+
passed through
`
3529
3499
` Returns
`
3530
3500
` -------
`
3531
3501
` An array of type int64 where two elements are equal if their corresponding
`
`@@ -3544,12 +3514,18 @@ def loop(labels, shape):
`
3544
3514
`stride //= shape[i]
`
3545
3515
`out += labels[i] * stride
`
3546
3516
``
``
3517
`+
if xnull: # exclude nulls
`
``
3518
`+
mask = labels[0] == -1
`
``
3519
`+
for lab in labels[1:nlev]:
`
``
3520
`+
mask |= lab == -1
`
``
3521
`+
out[mask] = -1
`
``
3522
+
3547
3523
`if nlev == len(shape): # all levels done!
`
3548
3524
`return out
`
3549
3525
``
3550
3526
`# compress what has been done so far in order to avoid overflow
`
3551
3527
`# to retain lexical ranks, obs_ids should be sorted
`
3552
``
`-
comp_ids, obs_ids = _compress_group_index(out, sort=retain_lex_rank)
`
``
3528
`+
comp_ids, obs_ids = _compress_group_index(out, sort=sort)
`
3553
3529
``
3554
3530
`labels = [comp_ids] + labels[nlev:]
`
3555
3531
`shape = [len(obs_ids)] + shape[nlev:]
`
`@@ -3560,9 +3536,10 @@ def maybe_lift(lab, size): # promote nan values
`
3560
3536
`return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
`
3561
3537
``
3562
3538
`labels = map(com._ensure_int64, labels)
`
3563
``
`-
labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
`
``
3539
`+
if not xnull:
`
``
3540
`+
labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
`
3564
3541
``
3565
``
`-
return loop(labels, shape)
`
``
3542
`+
return loop(list(labels), list(shape))
`
3566
3543
``
3567
3544
``
3568
3545
`_INT64_MAX = np.iinfo(np.int64).max
`
`@@ -3578,6 +3555,11 @@ def _int64_overflow_possible(shape):
`
3578
3555
``
3579
3556
`def decons_group_index(comp_labels, shape):
`
3580
3557
`# reconstruct labels
`
``
3558
`+
if _int64_overflow_possible(shape):
`
``
3559
`+
# at some point group indices are factorized,
`
``
3560
`+
# and may not be deconstructed here! wrong path!
`
``
3561
`+
raise ValueError('cannot deconstruct factorized group indices!')
`
``
3562
+
3581
3563
`label_list = []
`
3582
3564
`factor = 1
`
3583
3565
`y = 0
`
`@@ -3591,12 +3573,25 @@ def decons_group_index(comp_labels, shape):
`
3591
3573
`return label_list[::-1]
`
3592
3574
``
3593
3575
``
``
3576
`+
def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
`
``
3577
`+
"""reconstruct labels from observed ids"""
`
``
3578
`+
from pandas.hashtable import unique_label_indices
`
``
3579
+
``
3580
`+
if not _int64_overflow_possible(shape):
`
``
3581
`+
# obs ids are deconstructable! take the fast route!
`
``
3582
`+
return decons_group_index(obs_ids, shape)
`
``
3583
+
``
3584
`+
i = unique_label_indices(comp_ids)
`
``
3585
`+
i8copy = lambda a: a.astype('i8', subok=False, copy=True)
`
``
3586
`+
return [i8copy(lab[i]) for lab in labels]
`
``
3587
+
``
3588
+
3594
3589
`def _indexer_from_factorized(labels, shape, compress=True):
`
3595
3590
`if _int64_overflow_possible(shape):
`
3596
3591
`indexer = np.lexsort(np.array(labels[::-1]))
`
3597
3592
`return indexer
`
3598
3593
``
3599
``
`-
group_index = get_group_index(labels, shape)
`
``
3594
`+
group_index = get_group_index(labels, shape, sort=True, xnull=True)
`
3600
3595
``
3601
3596
`if compress:
`
3602
3597
`comp_ids, obs_ids = _compress_group_index(group_index)
`
`@@ -3712,9 +3707,12 @@ def get_key(self, comp_id):
`
3712
3707
``
3713
3708
`def _get_indices_dict(label_list, keys):
`
3714
3709
`shape = list(map(len, keys))
`
3715
``
`-
ngroups = np.prod(shape)
`
3716
3710
``
3717
``
`-
group_index = get_group_index(label_list, shape)
`
``
3711
`+
group_index = get_group_index(label_list, shape, sort=True, xnull=True)
`
``
3712
`+
ngroups = ((group_index.size and group_index.max()) + 1) \
`
``
3713
`+
if _int64_overflow_possible(shape) \
`
``
3714
`+
else np.prod(shape, dtype='i8')
`
``
3715
+
3718
3716
`sorter = _get_group_index_sorter(group_index, ngroups)
`
3719
3717
``
3720
3718
`sorted_labels = [lab.take(sorter) for lab in label_list]
`