ENH: Allow keep='all' for nlargest/nsmallest (#21650) · pandas-dev/pandas@0801b8c (original) (raw)

`@@ -4559,11 +4559,15 @@ def nlargest(self, n, columns, keep='first'):

`

4559

4559

` Number of rows to return.

`

4560

4560

` columns : label or list of labels

`

4561

4561

` Column label(s) to order by.

`

4562

``

`-

keep : {'first', 'last'}, default 'first'

`

``

4562

`+

keep : {'first', 'last', 'all'}, default 'first'

`

4563

4563

` Where there are duplicate values:

`

4564

4564

``

4565

4565

`` - ``first`` : prioritize the first occurrence(s)

``

4566

4566

`` - ``last`` : prioritize the last occurrence(s)

``

``

4567


 - ``all`` : do not drop any duplicates, even if it means

``

4568

`` +

selecting more than n items.

``

``

4569

+

``

4570

`+

.. versionadded:: 0.24.0

`

4567

4571

``

4568

4572

` Returns

`

4569

4573

` -------

`

`@@ -4586,47 +4590,58 @@ def nlargest(self, n, columns, keep='first'):

`

4586

4590

``

4587

4591

` Examples

`

4588

4592

` --------

`

4589

``

`-

>>> df = pd.DataFrame({'a': [1, 10, 8, 10, -1],

`

4590

``

`-

... 'b': list('abdce'),

`

4591

``

`-

... 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})

`

``

4593

`+

>>> df = pd.DataFrame({'a': [1, 10, 8, 11, 8, 2],

`

``

4594

`+

... 'b': list('abdcef'),

`

``

4595

`+

... 'c': [1.0, 2.0, np.nan, 3.0, 4.0, 9.0]})

`

4592

4596

` >>> df

`

4593

4597

` a b c

`

4594

4598

` 0 1 a 1.0

`

4595

4599

` 1 10 b 2.0

`

4596

4600

` 2 8 d NaN

`

4597

``

`-

3 10 c 3.0

`

4598

``

`-

4 -1 e 4.0

`

``

4601

`+

3 11 c 3.0

`

``

4602

`+

4 8 e 4.0

`

``

4603

`+

5 2 f 9.0

`

4599

4604

``

4600

4605

``` In the following example, we will use ``nlargest`` to select the three


`4601`

`4606`

` rows having the largest values in column "a".

`

`4602`

`4607`

``

`4603`

`4608`

` >>> df.nlargest(3, 'a')

`

`4604`

`4609`

` a b c

`

``

`4610`

`+

3 11 c 3.0

`

`4605`

`4611`

` 1 10 b 2.0

`

`4606`

``

`-

3 10 c 3.0

`

`4607`

`4612`

` 2 8 d NaN

`

`4608`

`4613`

``

`4609`

`4614`

```  When using ``keep='last'``, ties are resolved in reverse order:

4610

4615

``

4611

4616

` >>> df.nlargest(3, 'a', keep='last')

`

4612

4617

` a b c

`

4613

``

`-

3 10 c 3.0

`

``

4618

`+

3 11 c 3.0

`

``

4619

`+

1 10 b 2.0

`

``

4620

`+

4 8 e 4.0

`

``

4621

+

``

4622


 When using ``keep='all'``, all duplicate items are maintained:

``

4623

+

``

4624

`+

>>> df.nlargest(3, 'a', keep='all')

`

``

4625

`+

a b c

`

``

4626

`+

3 11 c 3.0

`

4614

4627

` 1 10 b 2.0

`

4615

4628

` 2 8 d NaN

`

``

4629

`+

4 8 e 4.0

`

4616

4630

``

4617

4631

` To order by the largest values in column "a" and then "c", we can

`

4618

4632

` specify multiple columns like in the next example.

`

4619

4633

``

4620

4634

` >>> df.nlargest(3, ['a', 'c'])

`

4621

4635

` a b c

`

4622

``

`-

3 10 c 3.0

`

``

4636

`+

4 8 e 4.0

`

``

4637

`+

3 11 c 3.0

`

4623

4638

` 1 10 b 2.0

`

4624

``

`-

2 8 d NaN

`

4625

4639

``

4626

4640

``` Attempting to use ``nlargest`` on non-numeric dtypes will raise a


`4627`

`4641`

```  ``TypeError``:

4628

4642

``

4629

4643

` >>> df.nlargest(3, 'b')

`

``

4644

+

4630

4645

` Traceback (most recent call last):

`

4631

4646

` TypeError: Column 'b' has dtype object, cannot use method 'nlargest'

`

4632

4647

` """

`

`@@ -4645,25 +4660,75 @@ def nsmallest(self, n, columns, keep='first'):

`

4645

4660

` Number of items to retrieve

`

4646

4661

` columns : list or str

`

4647

4662

` Column name or names to order by

`

4648

``

`-

keep : {'first', 'last'}, default 'first'

`

``

4663

`+

keep : {'first', 'last', 'all'}, default 'first'

`

4649

4664

` Where there are duplicate values:

`

4650

4665

``` - ``first`` : take the first occurrence.


`4651`

`4666`

```  - ``last`` : take the last occurrence.

``

4667


 - ``all`` : do not drop any duplicates, even if it means

``

4668

`` +

selecting more than n items.

``

``

4669

+

``

4670

`+

.. versionadded:: 0.24.0

`

4652

4671

``

4653

4672

` Returns

`

4654

4673

` -------

`

4655

4674

` DataFrame

`

4656

4675

``

4657

4676

` Examples

`

4658

4677

` --------

`

4659

``

`-

>>> df = pd.DataFrame({'a': [1, 10, 8, 11, -1],

`

4660

``

`-

... 'b': list('abdce'),

`

4661

``

`-

... 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})

`

``

4678

`+

>>> df = pd.DataFrame({'a': [1, 10, 8, 11, 8, 2],

`

``

4679

`+

... 'b': list('abdcef'),

`

``

4680

`+

... 'c': [1.0, 2.0, np.nan, 3.0, 4.0, 9.0]})

`

``

4681

`+

>>> df

`

``

4682

`+

a b c

`

``

4683

`+

0 1 a 1.0

`

``

4684

`+

1 10 b 2.0

`

``

4685

`+

2 8 d NaN

`

``

4686

`+

3 11 c 3.0

`

``

4687

`+

4 8 e 4.0

`

``

4688

`+

5 2 f 9.0

`

``

4689

+

``

4690


 In the following example, we will use ``nsmallest`` to select the

``

4691

`+

three rows having the smallest values in column "a".

`

``

4692

+

4662

4693

` >>> df.nsmallest(3, 'a')

`

4663

``

`-

a b c

`

4664

``

`-

4 -1 e 4

`

4665

``

`-

0 1 a 1

`

4666

``

`-

2 8 d NaN

`

``

4694

`+

a b c

`

``

4695

`+

0 1 a 1.0

`

``

4696

`+

5 2 f 9.0

`

``

4697

`+

2 8 d NaN

`

``

4698

+

``

4699


 When using ``keep='last'``, ties are resolved in reverse order:

``

4700

+

``

4701

`+

>>> df.nsmallest(3, 'a', keep='last')

`

``

4702

`+

a b c

`

``

4703

`+

0 1 a 1.0

`

``

4704

`+

5 2 f 9.0

`

``

4705

`+

4 8 e 4.0

`

``

4706

+

``

4707


 When using ``keep='all'``, all duplicate items are maintained:

``

4708

+

``

4709

`+

>>> df.nsmallest(3, 'a', keep='all')

`

``

4710

`+

a b c

`

``

4711

`+

0 1 a 1.0

`

``

4712

`+

5 2 f 9.0

`

``

4713

`+

2 8 d NaN

`

``

4714

`+

4 8 e 4.0

`

``

4715

+

``

4716

`+

To order by the smallest values in column "a" and then "c", we can

`

``

4717

`+

specify multiple columns like in the next example.

`

``

4718

+

``

4719

`+

>>> df.nsmallest(3, ['a', 'c'])

`

``

4720

`+

a b c

`

``

4721

`+

0 1 a 1.0

`

``

4722

`+

5 2 f 9.0

`

``

4723

`+

4 8 e 4.0

`

``

4724

+

``

4725


 Attempting to use ``nsmallest`` on non-numeric dtypes will raise a

``

4726


 ``TypeError``:

``

4727

+

``

4728

`+

>>> df.nsmallest(3, 'b')

`

``

4729

+

``

4730

`+

Traceback (most recent call last):

`

``

4731

`+

TypeError: Column 'b' has dtype object, cannot use method 'nsmallest'

`

4667

4732

` """

`

4668

4733

`return algorithms.SelectNFrame(self,

`

4669

4734

`n=n,

`