Merge pull request #9380 from behzadnouri/i8grby · pandas-dev/pandas@9f439f0 (original) (raw)
`@@ -1367,30 +1367,16 @@ def group_info(self):
`
1367
1367
``
1368
1368
`def _get_compressed_labels(self):
`
1369
1369
`all_labels = [ping.labels for ping in self.groupings]
`
1370
``
`-
if self._overflow_possible:
`
1371
``
`-
tups = lib.fast_zip(all_labels)
`
1372
``
`-
labs, uniques = algos.factorize(tups)
`
``
1370
`+
if len(all_labels) > 1:
`
``
1371
`+
group_index = get_group_index(all_labels, self.shape,
`
``
1372
`+
sort=True, xnull=True)
`
``
1373
`+
return _compress_group_index(group_index)
`
1373
1374
``
1374
``
`-
if self.sort:
`
1375
``
`-
uniques, labs = _reorder_by_uniques(uniques, labs)
`
``
1375
`+
ping = self.groupings[0]
`
``
1376
`+
self.compressed = False
`
``
1377
`+
self._filter_empty_groups = False
`
1376
1378
``
1377
``
`-
return labs, uniques
`
1378
``
`-
else:
`
1379
``
`-
if len(all_labels) > 1:
`
1380
``
`-
group_index = get_group_index(all_labels, self.shape)
`
1381
``
`-
comp_ids, obs_group_ids = _compress_group_index(group_index)
`
1382
``
`-
else:
`
1383
``
`-
ping = self.groupings[0]
`
1384
``
`-
comp_ids = ping.labels
`
1385
``
`-
obs_group_ids = np.arange(len(ping.group_index))
`
1386
``
`-
self.compressed = False
`
1387
``
`-
self._filter_empty_groups = False
`
1388
``
-
1389
``
`-
return comp_ids, obs_group_ids
`
1390
``
-
1391
``
`-
@cache_readonly
`
1392
``
`-
def _overflow_possible(self):
`
1393
``
`-
return _int64_overflow_possible(self.shape)
`
``
1379
`+
return ping.labels, np.arange(len(ping.group_index))
`
1394
1380
``
1395
1381
`@cache_readonly
`
1396
1382
`def ngroups(self):
`
`@@ -1402,15 +1388,13 @@ def result_index(self):
`
1402
1388
`return MultiIndex.from_arrays(recons, names=self.names)
`
1403
1389
``
1404
1390
`def get_group_levels(self):
`
1405
``
`-
obs_ids = self.group_info[1]
`
``
1391
`+
comp_ids, obs_ids, _ = self.group_info
`
1406
1392
``
1407
1393
`if not self.compressed and len(self.groupings) == 1:
`
1408
1394
`return [self.groupings[0].group_index]
`
1409
1395
``
1410
``
`-
if self._overflow_possible:
`
1411
``
`-
recons_labels = [np.array(x) for x in zip(*obs_ids)]
`
1412
``
`-
else:
`
1413
``
`-
recons_labels = decons_group_index(obs_ids, self.shape)
`
``
1396
`+
recons_labels = decons_obs_group_ids(comp_ids, obs_ids,
`
``
1397
`+
self.shape, (ping.labels for ping in self.groupings))
`
1414
1398
``
1415
1399
`name_list = []
`
1416
1400
`for ping, labels in zip(self.groupings, recons_labels):
`
`@@ -3490,42 +3474,28 @@ def get_splitter(data, *args, **kwargs):
`
3490
3474
`# Misc utilities
`
3491
3475
``
3492
3476
``
3493
``
`-
def get_group_index(label_list, shape):
`
``
3477
`+
def get_group_index(labels, shape, sort, xnull):
`
3494
3478
`"""
`
3495
3479
` For the particular label_list, gets the offsets into the hypothetical list
`
3496
3480
` representing the totally ordered cartesian product of all possible label
`
3497
``
`-
combinations.
`
3498
``
`-
"""
`
3499
``
`-
if len(label_list) == 1:
`
3500
``
`-
return label_list[0]
`
3501
``
-
3502
``
`-
n = len(label_list[0])
`
3503
``
`-
group_index = np.zeros(n, dtype=np.int64)
`
3504
``
`-
mask = np.zeros(n, dtype=bool)
`
3505
``
`-
for i in range(len(shape)):
`
3506
``
`-
stride = np.prod([x for x in shape[i + 1:]], dtype=np.int64)
`
3507
``
`-
group_index += com._ensure_int64(label_list[i]) * stride
`
3508
``
`-
mask |= label_list[i] < 0
`
3509
``
-
3510
``
`-
np.putmask(group_index, mask, -1)
`
3511
``
`-
return group_index
`
3512
``
-
3513
``
-
3514
``
`-
def get_flat_ids(labels, shape, retain_lex_rank):
`
3515
``
`-
"""
`
3516
``
`-
Given a list of labels at each level, returns a flat array of int64 ids
`
3517
``
`` -
corresponding to unique tuples across the labels. If `retain_lex_rank`,
``
3518
``
`-
rank of returned ids preserve lexical ranks of labels.
`
``
3481
`+
combinations, as long as this space fits within int64 bounds;
`
``
3482
`+
otherwise, though group indices identify unique combinations of
`
``
3483
`+
labels, they cannot be deconstructed.
`
``
3484
`` +
- If `sort`, rank of returned ids preserve lexical ranks of labels.
``
``
3485
`+
i.e. returned id's can be used to do lexical sort on labels;
`
``
3486
`` +
- If `xnull` nulls (-1 labels) are passed through.
``
3519
3487
``
3520
3488
` Parameters
`
3521
3489
` ----------
`
3522
3490
` labels: sequence of arrays
`
3523
3491
` Integers identifying levels at each location
`
3524
3492
` shape: sequence of ints same length as labels
`
3525
3493
` Number of unique levels at each location
`
3526
``
`-
retain_lex_rank: boolean
`
``
3494
`+
sort: boolean
`
3527
3495
` If the ranks of returned ids should match lexical ranks of labels
`
3528
``
-
``
3496
`+
xnull: boolean
`
``
3497
`+
If true nulls are eXcluded. i.e. -1 values in the labels are
`
``
3498
`+
passed through
`
3529
3499
` Returns
`
3530
3500
` -------
`
3531
3501
` An array of type int64 where two elements are equal if their corresponding
`
`@@ -3544,12 +3514,18 @@ def loop(labels, shape):
`
3544
3514
`stride //= shape[i]
`
3545
3515
`out += labels[i] * stride
`
3546
3516
``
``
3517
`+
if xnull: # exclude nulls
`
``
3518
`+
mask = labels[0] == -1
`
``
3519
`+
for lab in labels[1:nlev]:
`
``
3520
`+
mask |= lab == -1
`
``
3521
`+
out[mask] = -1
`
``
3522
+
3547
3523
`if nlev == len(shape): # all levels done!
`
3548
3524
`return out
`
3549
3525
``
3550
3526
`# compress what has been done so far in order to avoid overflow
`
3551
3527
`# to retain lexical ranks, obs_ids should be sorted
`
3552
``
`-
comp_ids, obs_ids = _compress_group_index(out, sort=retain_lex_rank)
`
``
3528
`+
comp_ids, obs_ids = _compress_group_index(out, sort=sort)
`
3553
3529
``
3554
3530
`labels = [comp_ids] + labels[nlev:]
`
3555
3531
`shape = [len(obs_ids)] + shape[nlev:]
`
`@@ -3560,9 +3536,10 @@ def maybe_lift(lab, size): # promote nan values
`
3560
3536
`return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
`
3561
3537
``
3562
3538
`labels = map(com._ensure_int64, labels)
`
3563
``
`-
labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
`
``
3539
`+
if not xnull:
`
``
3540
`+
labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
`
3564
3541
``
3565
``
`-
return loop(labels, shape)
`
``
3542
`+
return loop(list(labels), list(shape))
`
3566
3543
``
3567
3544
``
3568
3545
`_INT64_MAX = np.iinfo(np.int64).max
`
`@@ -3578,6 +3555,11 @@ def _int64_overflow_possible(shape):
`
3578
3555
``
3579
3556
`def decons_group_index(comp_labels, shape):
`
3580
3557
`# reconstruct labels
`
``
3558
`+
if _int64_overflow_possible(shape):
`
``
3559
`+
# at some point group indices are factorized,
`
``
3560
`+
# and may not be deconstructed here! wrong path!
`
``
3561
`+
raise ValueError('cannot deconstruct factorized group indices!')
`
``
3562
+
3581
3563
`label_list = []
`
3582
3564
`factor = 1
`
3583
3565
`y = 0
`
`@@ -3591,12 +3573,25 @@ def decons_group_index(comp_labels, shape):
`
3591
3573
`return label_list[::-1]
`
3592
3574
``
3593
3575
``
``
3576
`+
def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
`
``
3577
`+
"""reconstruct labels from observed ids"""
`
``
3578
`+
from pandas.hashtable import unique_label_indices
`
``
3579
+
``
3580
`+
if not _int64_overflow_possible(shape):
`
``
3581
`+
# obs ids are deconstructable! take the fast route!
`
``
3582
`+
return decons_group_index(obs_ids, shape)
`
``
3583
+
``
3584
`+
i = unique_label_indices(comp_ids)
`
``
3585
`+
i8copy = lambda a: a.astype('i8', subok=False, copy=True)
`
``
3586
`+
return [i8copy(lab[i]) for lab in labels]
`
``
3587
+
``
3588
+
3594
3589
`def _indexer_from_factorized(labels, shape, compress=True):
`
3595
3590
`if _int64_overflow_possible(shape):
`
3596
3591
`indexer = np.lexsort(np.array(labels[::-1]))
`
3597
3592
`return indexer
`
3598
3593
``
3599
``
`-
group_index = get_group_index(labels, shape)
`
``
3594
`+
group_index = get_group_index(labels, shape, sort=True, xnull=True)
`
3600
3595
``
3601
3596
`if compress:
`
3602
3597
`comp_ids, obs_ids = _compress_group_index(group_index)
`
`@@ -3712,9 +3707,12 @@ def get_key(self, comp_id):
`
3712
3707
``
3713
3708
`def _get_indices_dict(label_list, keys):
`
3714
3709
`shape = list(map(len, keys))
`
3715
``
`-
ngroups = np.prod(shape)
`
3716
3710
``
3717
``
`-
group_index = get_group_index(label_list, shape)
`
``
3711
`+
group_index = get_group_index(label_list, shape, sort=True, xnull=True)
`
``
3712
`+
ngroups = ((group_index.size and group_index.max()) + 1) \
`
``
3713
`+
if _int64_overflow_possible(shape) \
`
``
3714
`+
else np.prod(shape, dtype='i8')
`
``
3715
+
3718
3716
`sorter = _get_group_index_sorter(group_index, ngroups)
`
3719
3717
``
3720
3718
`sorted_labels = [lab.take(sorter) for lab in label_list]
`