Merge pull request #9380 from behzadnouri/i8grby · pandas-dev/pandas@9f439f0

```diff
@@ -1367,30 +1367,16 @@ def group_info(self):
 
     def _get_compressed_labels(self):
         all_labels = [ping.labels for ping in self.groupings]
-        if self._overflow_possible:
-            tups = lib.fast_zip(all_labels)
-            labs, uniques = algos.factorize(tups)
+        if len(all_labels) > 1:
+            group_index = get_group_index(all_labels, self.shape,
+                                          sort=True, xnull=True)
+            return _compress_group_index(group_index)
 
-            if self.sort:
-                uniques, labs = _reorder_by_uniques(uniques, labs)
+        ping = self.groupings[0]
+        self.compressed = False
+        self._filter_empty_groups = False
 
-            return labs, uniques
-        else:
-            if len(all_labels) > 1:
-                group_index = get_group_index(all_labels, self.shape)
-                comp_ids, obs_group_ids = _compress_group_index(group_index)
-            else:
-                ping = self.groupings[0]
-                comp_ids = ping.labels
-                obs_group_ids = np.arange(len(ping.group_index))
-                self.compressed = False
-                self._filter_empty_groups = False
-
-            return comp_ids, obs_group_ids
-
-    @cache_readonly
-    def _overflow_possible(self):
-        return _int64_overflow_possible(self.shape)
+        return ping.labels, np.arange(len(ping.group_index))
 
     @cache_readonly
     def ngroups(self):
```
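The rewritten `_get_compressed_labels` sends every multi-key grouping through `get_group_index` followed by `_compress_group_index`, instead of branching on a precomputed overflow flag; a single grouping now short-circuits to the raw labels. A minimal numpy sketch of the compression idea, with made-up label data (not the pandas internals themselves):

```python
import numpy as np

# Two grouping levels, encoded as integer codes per row (illustrative data)
labels_a = np.array([0, 0, 1, 1, 0], dtype=np.int64)   # 2 distinct values
labels_b = np.array([0, 1, 0, 1, 1], dtype=np.int64)   # 2 distinct values
shape = (2, 2)

# Mixed-radix combination into one flat id per row, as get_group_index
# produces when the product space fits in int64
group_index = labels_a * shape[1] + labels_b            # [0, 1, 2, 3, 1]

# "Compression": keep only the ids that actually occur (observed groups) and
# relabel rows by their position among those observed ids
obs_ids, comp_ids = np.unique(group_index, return_inverse=True)
print(comp_ids)   # [0 1 2 3 1]  dense id per row
print(obs_ids)    # [0 1 2 3]    observed flat group ids
```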

```diff
@@ -1402,15 +1388,13 @@ def result_index(self):
         return MultiIndex.from_arrays(recons, names=self.names)
 
     def get_group_levels(self):
-        obs_ids = self.group_info[1]
+        comp_ids, obs_ids, _ = self.group_info
 
         if not self.compressed and len(self.groupings) == 1:
             return [self.groupings[0].group_index]
 
-        if self._overflow_possible:
-            recons_labels = [np.array(x) for x in zip(*obs_ids)]
-        else:
-            recons_labels = decons_group_index(obs_ids, self.shape)
+        recons_labels = decons_obs_group_ids(comp_ids, obs_ids,
+                self.shape, (ping.labels for ping in self.groupings))
 
         name_list = []
         for ping, labels in zip(self.groupings, recons_labels):
```
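`get_group_levels` now unpacks a three-tuple from `group_info` and delegates reconstruction to the new `decons_obs_group_ids`. What the reconstructed per-level labels feed into, in a small stand-alone sketch (level values and label codes are invented for illustration):

```python
import numpy as np

# Two groupings with these level values (illustrative)
levels_a = np.array(['x', 'y'])   # grouping 0 levels
levels_b = np.array([10, 20])     # grouping 1 levels

# Per-level label codes reconstructed for each *observed* group
# (this is what recons_labels holds: one array per grouping)
recons_labels = [np.array([0, 0, 1]), np.array([0, 1, 1])]

# get_group_levels then maps codes back to actual level values
group_levels = [lev.take(lab)
                for lev, lab in zip([levels_a, levels_b], recons_labels)]
print(group_levels[0])   # ['x' 'x' 'y']
print(group_levels[1])   # [10 20 20]
```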

```diff
@@ -3490,42 +3474,28 @@ def get_splitter(data, *args, **kwargs):
 # Misc utilities
 
 
-def get_group_index(label_list, shape):
+def get_group_index(labels, shape, sort, xnull):
     """
     For the particular label_list, gets the offsets into the hypothetical list
     representing the totally ordered cartesian product of all possible label
-    combinations.
-    """
-    if len(label_list) == 1:
-        return label_list[0]
-
-    n = len(label_list[0])
-    group_index = np.zeros(n, dtype=np.int64)
-    mask = np.zeros(n, dtype=bool)
-    for i in range(len(shape)):
-        stride = np.prod([x for x in shape[i + 1:]], dtype=np.int64)
-        group_index += com._ensure_int64(label_list[i]) * stride
-        mask |= label_list[i] < 0
-
-    np.putmask(group_index, mask, -1)
-    return group_index
-
-
-def get_flat_ids(labels, shape, retain_lex_rank):
-    """
-    Given a list of labels at each level, returns a flat array of int64 ids
-    corresponding to unique tuples across the labels. If retain_lex_rank,
-    rank of returned ids preserve lexical ranks of labels.
+    combinations, as long as this space fits within int64 bounds;
+    otherwise, though group indices identify unique combinations of
+    labels, they cannot be deconstructed.
+
+    i.e. returned id's can be used to do lexical sort on labels;
+
 
     Parameters
     ----------
     labels: sequence of arrays
         Integers identifying levels at each location
     shape: sequence of ints same length as labels
         Number of unique levels at each location
-    retain_lex_rank: boolean
+    sort: boolean
         If the ranks of returned ids should match lexical ranks of labels
-
+    xnull: boolean
+        If true nulls are eXcluded. i.e. -1 values in the labels are
+        passed through
     Returns
     -------
     An array of type int64 where two elements are equal if their corresponding
```
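The reworded docstring describes a mixed-radix encoding: each level contributes `label * prod(shape[i+1:])` to the flat id. A hand-worked example of that arithmetic, and of the int64 limit the new wording warns about (plain numpy, not the pandas function):

```python
import numpy as np

# Three rows labelled over two levels with 3 and 2 categories respectively
labels = [np.array([0, 1, 2], dtype=np.int64),
          np.array([1, 0, 1], dtype=np.int64)]
shape = (3, 2)

# offset into the cartesian product of all label combinations:
# id = label0 * 2 + label1   (stride of level i is prod(shape[i+1:]))
strides = np.array([2, 1], dtype=np.int64)
flat_ids = sum(lab * s for lab, s in zip(labels, strides))
print(flat_ids)   # [1 2 5]

# The full product space has prod(shape) = 6 slots; once prod(shape) exceeds
# int64 bounds, the ids can no longer be decoded back into per-level labels,
# which is what the updated docstring warns about.
```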

```diff
@@ -3544,12 +3514,18 @@ def loop(labels, shape):
             stride //= shape[i]
             out += labels[i] * stride
 
+        if xnull:  # exclude nulls
+            mask = labels[0] == -1
+            for lab in labels[1:nlev]:
+                mask |= lab == -1
+            out[mask] = -1
+
         if nlev == len(shape):  # all levels done!
             return out
 
         # compress what has been done so far in order to avoid overflow
         # to retain lexical ranks, obs_ids should be sorted
-        comp_ids, obs_ids = _compress_group_index(out, sort=retain_lex_rank)
+        comp_ids, obs_ids = _compress_group_index(out, sort=sort)
 
         labels = [comp_ids] + labels[nlev:]
         shape = [len(obs_ids)] + shape[nlev:]
```
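The new `xnull` block keeps null keys (label `-1`) out of the real groups: without the mask, a `-1` at one level can produce the same flat id as a legitimate combination. A small sketch of the collision and the fix (illustrative arrays only):

```python
import numpy as np

labels = [np.array([0, -1, 1], dtype=np.int64),
          np.array([1,  0, -1], dtype=np.int64)]
shape = (2, 2)

out = labels[0] * shape[1] + labels[1]   # naive flat ids: [1, -2, 1]
                                         # row 2 collides with row 0

# with xnull=True, any row holding a -1 at any level is forced to -1
mask = (labels[0] == -1) | (labels[1] == -1)
out[mask] = -1
print(out)                               # [ 1 -1 -1]
```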

```diff
@@ -3560,9 +3536,10 @@ def maybe_lift(lab, size): # pormote nan values
         return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
 
     labels = map(com._ensure_int64, labels)
-    labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
+    if not xnull:
+        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
 
-    return loop(labels, shape)
+    return loop(list(labels), list(shape))
 
 
 _INT64_MAX = np.iinfo(np.int64).max
```
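When `xnull=False`, nulls are instead "lifted" into an ordinary category by `maybe_lift` (the context line above is its body) and the level gains one slot; the final `return loop(list(labels), list(shape))` materializes `labels`, which otherwise stays a lazy `map` object when the lift is skipped. A stand-alone sketch of the lift:

```python
import numpy as np

def maybe_lift(lab, size):
    # shift codes up by one when -1 (null) is present, so the null becomes
    # a regular category 0 and the level gains one extra slot
    return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)

lab = np.array([0, -1, 1], dtype=np.int64)
print(maybe_lift(lab, 2))          # (array([1, 0, 2]), 3)
print(maybe_lift(lab[[0, 2]], 2))  # no nulls -> unchanged: (array([0, 1]), 2)
```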

```diff
@@ -3578,6 +3555,11 @@ def _int64_overflow_possible(shape):
 
 def decons_group_index(comp_labels, shape):
     # reconstruct labels
+    if _int64_overflow_possible(shape):
+        # at some point group indices are factorized,
+        # and may not be deconstructed here! wrong path!
+        raise ValueError('cannot deconstruct factorized group indices!')
+
     label_list = []
     factor = 1
     y = 0
```
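`decons_group_index` inverts the mixed-radix encoding with div/mod arithmetic, which is only meaningful while `prod(shape)` fits in int64; the new guard makes the unusable path fail loudly instead of returning garbage. A simplified decoder showing the inversion (not the pandas implementation):

```python
import numpy as np

def decode(flat_ids, shape):
    # peel levels off starting from the last (fastest-varying) one
    labels, ids = [], np.asarray(flat_ids, dtype=np.int64)
    for size in reversed(shape):
        ids, lab = np.divmod(ids, size)
        labels.append(lab)
    return labels[::-1]

print(decode([1, 2, 5], (3, 2)))
# [array([0, 1, 2]), array([1, 0, 1])]  -- round-trips the encoding example
```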

```diff
@@ -3591,12 +3573,25 @@ def decons_group_index(comp_labels, shape):
     return label_list[::-1]
 
 
+def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
+    """reconstruct labels from observed ids"""
+    from pandas.hashtable import unique_label_indices
+
+    if not _int64_overflow_possible(shape):
+        # obs ids are deconstructable! take the fast route!
+        return decons_group_index(obs_ids, shape)
+
+    i = unique_label_indices(comp_ids)
+    i8copy = lambda a: a.astype('i8', subok=False, copy=True)
+    return [i8copy(lab[i]) for lab in labels]
+
+
 def _indexer_from_factorized(labels, shape, compress=True):
     if _int64_overflow_possible(shape):
         indexer = np.lexsort(np.array(labels[::-1]))
         return indexer
 
-    group_index = get_group_index(labels, shape)
+    group_index = get_group_index(labels, shape, sort=True, xnull=True)
 
     if compress:
         comp_ids, obs_ids = _compress_group_index(group_index)
```
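The new `decons_obs_group_ids` picks between the arithmetic inverse above and a fallback that still works when flat ids have been factorized: find the row where each compressed id first appears and read the original per-level labels there (`unique_label_indices`, imported from `pandas.hashtable`, supplies those first-occurrence positions). A sketch of the fallback using `np.unique` as a stand-in for that helper:

```python
import numpy as np

comp_ids = np.array([0, 1, 0, 2, 1], dtype=np.int64)
labels = [np.array([0, 0, 0, 1, 0]), np.array([0, 1, 0, 1, 1])]

# first occurrence of each compressed id (stand-in for unique_label_indices)
_, first_idx = np.unique(comp_ids, return_index=True)

recons = [lab[first_idx] for lab in labels]
print(recons)   # [array([0, 0, 1]), array([0, 1, 1])]
```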

```diff
@@ -3712,9 +3707,12 @@ def get_key(self, comp_id):
 
 def _get_indices_dict(label_list, keys):
     shape = list(map(len, keys))
-    ngroups = np.prod(shape)
 
-    group_index = get_group_index(label_list, shape)
+    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+    ngroups = ((group_index.size and group_index.max()) + 1) \
+        if _int64_overflow_possible(shape) \
+        else np.prod(shape, dtype='i8')
+
     sorter = _get_group_index_sorter(group_index, ngroups)
 
     sorted_labels = [lab.take(sorter) for lab in label_list]
```
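`_get_indices_dict` now sizes `ngroups` per path: in the overflow case `get_group_index` has already returned compressed (dense) ids, so the count of possible ids is just the largest id plus one; otherwise the full product space is used. Roughly, with illustrative numbers:

```python
import numpy as np

shape = [4, 3]

# non-overflow path: every id in the full product space is possible
ngroups_full = np.prod(shape, dtype='i8')                      # 12

# overflow path: ids are already dense, so possible ids = max id + 1
# (the `size and` guard avoids calling max() on an empty array)
group_index = np.array([0, 2, 2, 5, 1], dtype=np.int64)
ngroups_dense = (group_index.size and group_index.max()) + 1   # 6

print(ngroups_full, ngroups_dense)
```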