ENH: Allow keep='all' for nlargest/nsmallest (#21650) · pandas-dev/pandas@0801b8c (original) (raw)
`@@ -4559,11 +4559,15 @@ def nlargest(self, n, columns, keep='first'):
`
4559
4559
` Number of rows to return.
`
4560
4560
` columns : label or list of labels
`
4561
4561
` Column label(s) to order by.
`
4562
``
`-
keep : {'first', 'last'}, default 'first'
`
``
4562
`+
keep : {'first', 'last', 'all'}, default 'first'
`
4563
4563
` Where there are duplicate values:
`
4564
4564
``
4565
4565
`` - `first` : prioritize the first occurrence(s)
``
4566
4566
`` - `last` : prioritize the last occurrence(s)
``
``
4567
- ``all`` : do not drop any duplicates, even if it means
``
4568
`` +
selecting more than `n` items.
``
``
4569
+
``
4570
`+
.. versionadded:: 0.24.0
`
4567
4571
``
4568
4572
` Returns
`
4569
4573
` -------
`
`@@ -4586,47 +4590,58 @@ def nlargest(self, n, columns, keep='first'):
`
4586
4590
``
4587
4591
` Examples
`
4588
4592
` --------
`
4589
``
`-
>>> df = pd.DataFrame({'a': [1, 10, 8, 10, -1],
`
4590
``
`-
... 'b': list('abdce'),
`
4591
``
`-
... 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
`
``
4593
`+
>>> df = pd.DataFrame({'a': [1, 10, 8, 11, 8, 2],
`
``
4594
`+
... 'b': list('abdcef'),
`
``
4595
`+
... 'c': [1.0, 2.0, np.nan, 3.0, 4.0, 9.0]})
`
4592
4596
` >>> df
`
4593
4597
` a b c
`
4594
4598
` 0 1 a 1.0
`
4595
4599
` 1 10 b 2.0
`
4596
4600
` 2 8 d NaN
`
4597
``
`-
3 10 c 3.0
`
4598
``
`-
4 -1 e 4.0
`
``
4601
`+
3 11 c 3.0
`
``
4602
`+
4 8 e 4.0
`
``
4603
`+
5 2 f 9.0
`
4599
4604
``
4600
4605
``` In the following example, we will use nlargest
to select the three
`4601`
`4606`
` rows having the largest values in column "a".
`
`4602`
`4607`
``
`4603`
`4608`
` >>> df.nlargest(3, 'a')
`
`4604`
`4609`
` a b c
`
``
`4610`
`+
3 11 c 3.0
`
`4605`
`4611`
` 1 10 b 2.0
`
`4606`
``
`-
3 10 c 3.0
`
`4607`
`4612`
` 2 8 d NaN
`
`4608`
`4613`
``
`4609`
`4614`
``` When using ``keep='last'``, ties are resolved in reverse order:
4610
4615
``
4611
4616
` >>> df.nlargest(3, 'a', keep='last')
`
4612
4617
` a b c
`
4613
``
`-
3 10 c 3.0
`
``
4618
`+
3 11 c 3.0
`
``
4619
`+
1 10 b 2.0
`
``
4620
`+
4 8 e 4.0
`
``
4621
+
``
4622
When using ``keep='all'``, all duplicate items are maintained:
``
4623
+
``
4624
`+
>>> df.nlargest(3, 'a', keep='all')
`
``
4625
`+
a b c
`
``
4626
`+
3 11 c 3.0
`
4614
4627
` 1 10 b 2.0
`
4615
4628
` 2 8 d NaN
`
``
4629
`+
4 8 e 4.0
`
4616
4630
``
4617
4631
` To order by the largest values in column "a" and then "c", we can
`
4618
4632
` specify multiple columns like in the next example.
`
4619
4633
``
4620
4634
` >>> df.nlargest(3, ['a', 'c'])
`
4621
4635
` a b c
`
4622
``
`-
3 10 c 3.0
`
``
4636
`+
4 8 e 4.0
`
``
4637
`+
3 11 c 3.0
`
4623
4638
` 1 10 b 2.0
`
4624
``
`-
2 8 d NaN
`
4625
4639
``
4626
4640
``` Attempting to use nlargest
on non-numeric dtypes will raise a
`4627`
`4641`
``` ``TypeError``:
4628
4642
``
4629
4643
` >>> df.nlargest(3, 'b')
`
``
4644
+
4630
4645
` Traceback (most recent call last):
`
4631
4646
` TypeError: Column 'b' has dtype object, cannot use method 'nlargest'
`
4632
4647
` """
`
`@@ -4645,25 +4660,75 @@ def nsmallest(self, n, columns, keep='first'):
`
4645
4660
` Number of items to retrieve
`
4646
4661
` columns : list or str
`
4647
4662
` Column name or names to order by
`
4648
``
`-
keep : {'first', 'last'}, default 'first'
`
``
4663
`+
keep : {'first', 'last', 'all'}, default 'first'
`
4649
4664
` Where there are duplicate values:
`
4650
4665
``` - ``first`` : take the first occurrence.
`4651`
`4666`
``` - ``last`` : take the last occurrence.
``
4667
- ``all`` : do not drop any duplicates, even if it means
``
4668
`` +
selecting more than `n` items.
``
``
4669
+
``
4670
`+
.. versionadded:: 0.24.0
`
4652
4671
``
4653
4672
` Returns
`
4654
4673
` -------
`
4655
4674
` DataFrame
`
4656
4675
``
4657
4676
` Examples
`
4658
4677
` --------
`
4659
``
`-
>>> df = pd.DataFrame({'a': [1, 10, 8, 11, -1],
`
4660
``
`-
... 'b': list('abdce'),
`
4661
``
`-
... 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
`
``
4678
`+
>>> df = pd.DataFrame({'a': [1, 10, 8, 11, 8, 2],
`
``
4679
`+
... 'b': list('abdcef'),
`
``
4680
`+
... 'c': [1.0, 2.0, np.nan, 3.0, 4.0, 9.0]})
`
``
4681
`+
>>> df
`
``
4682
`+
a b c
`
``
4683
`+
0 1 a 1.0
`
``
4684
`+
1 10 b 2.0
`
``
4685
`+
2 8 d NaN
`
``
4686
`+
3 11 c 3.0
`
``
4687
`+
4 8 e 4.0
`
``
4688
`+
5 2 f 9.0
`
``
4689
+
``
4690
In the following example, we will use ``nsmallest`` to select the
``
4691
`+
three rows having the smallest values in column "a".
`
``
4692
+
4662
4693
` >>> df.nsmallest(3, 'a')
`
4663
``
`-
a b c
`
4664
``
`-
4 -1 e 4
`
4665
``
`-
0 1 a 1
`
4666
``
`-
2 8 d NaN
`
``
4694
`+
a b c
`
``
4695
`+
0 1 a 1.0
`
``
4696
`+
5 2 f 9.0
`
``
4697
`+
2 8 d NaN
`
``
4698
+
``
4699
When using ``keep='last'``, ties are resolved in reverse order:
``
4700
+
``
4701
`+
>>> df.nsmallest(3, 'a', keep='last')
`
``
4702
`+
a b c
`
``
4703
`+
0 1 a 1.0
`
``
4704
`+
5 2 f 9.0
`
``
4705
`+
4 8 e 4.0
`
``
4706
+
``
4707
When using ``keep='all'``, all duplicate items are maintained:
``
4708
+
``
4709
`+
>>> df.nsmallest(3, 'a', keep='all')
`
``
4710
`+
a b c
`
``
4711
`+
0 1 a 1.0
`
``
4712
`+
5 2 f 9.0
`
``
4713
`+
2 8 d NaN
`
``
4714
`+
4 8 e 4.0
`
``
4715
+
``
4716
`+
To order by the smallest values in column "a" and then "c", we can
`
``
4717
`+
specify multiple columns like in the next example.
`
``
4718
+
``
4719
`+
>>> df.nsmallest(3, ['a', 'c'])
`
``
4720
`+
a b c
`
``
4721
`+
0 1 a 1.0
`
``
4722
`+
5 2 f 9.0
`
``
4723
`+
4 8 e 4.0
`
``
4724
+
``
4725
Attempting to use ``nsmallest`` on non-numeric dtypes will raise a
``
4726
``TypeError``:
``
4727
+
``
4728
`+
>>> df.nsmallest(3, 'b')
`
``
4729
+
``
4730
`+
Traceback (most recent call last):
`
``
4731
`+
TypeError: Column 'b' has dtype object, cannot use method 'nsmallest'
`
4667
4732
` """
`
4668
4733
`return algorithms.SelectNFrame(self,
`
4669
4734
`n=n,
`