ENH: Added a min_count keyword to stat funcs (#18876) · pandas-dev/pandas@dbec3c9 (original) (raw)
`@@ -36,7 +36,8 @@ def get_dispatch(dtypes):
`
36
36
`def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
37
37
` ndarray[int64_t] counts,
`
38
38
` ndarray[{{c_type}}, ndim=2] values,
`
39
``
`-
ndarray[int64_t] labels):
`
``
39
`+
ndarray[int64_t] labels,
`
``
40
`+
Py_ssize_t min_count=1):
`
40
41
` """
`
41
42
` Only aggregates on axis=0
`
42
43
` """
`
`@@ -88,7 +89,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
88
89
``
89
90
` for i in range(ncounts):
`
90
91
` for j in range(K):
`
91
``
`-
if nobs[i, j] == 0:
`
``
92
`+
if nobs[i, j] < min_count:
`
92
93
` out[i, j] = NAN
`
93
94
` else:
`
94
95
` out[i, j] = sumx[i, j]
`
`@@ -99,7 +100,8 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
99
100
`def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
100
101
` ndarray[int64_t] counts,
`
101
102
` ndarray[{{c_type}}, ndim=2] values,
`
102
``
`-
ndarray[int64_t] labels):
`
``
103
`+
ndarray[int64_t] labels,
`
``
104
`+
Py_ssize_t min_count=1):
`
103
105
` """
`
104
106
` Only aggregates on axis=0
`
105
107
` """
`
`@@ -147,7 +149,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
147
149
``
148
150
` for i in range(ncounts):
`
149
151
` for j in range(K):
`
150
``
`-
if nobs[i, j] == 0:
`
``
152
`+
if nobs[i, j] < min_count:
`
151
153
` out[i, j] = NAN
`
152
154
` else:
`
153
155
` out[i, j] = prodx[i, j]
`
`@@ -159,12 +161,15 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
159
161
`def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
160
162
` ndarray[int64_t] counts,
`
161
163
` ndarray[{{dest_type2}}, ndim=2] values,
`
162
``
`-
ndarray[int64_t] labels):
`
``
164
`+
ndarray[int64_t] labels,
`
``
165
`+
Py_ssize_t min_count=-1):
`
163
166
` cdef:
`
164
167
` Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
`
165
168
` {{dest_type2}} val, ct, oldmean
`
166
169
` ndarray[{{dest_type2}}, ndim=2] nobs, mean
`
167
170
``
``
171
`+
assert min_count == -1, "'min_count' only used in add and prod"
`
``
172
+
168
173
` if not len(values) == len(labels):
`
169
174
` raise AssertionError("len(index) != len(labels)")
`
170
175
``
`@@ -208,12 +213,15 @@ def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
208
213
`def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
209
214
` ndarray[int64_t] counts,
`
210
215
` ndarray[{{dest_type2}}, ndim=2] values,
`
211
``
`-
ndarray[int64_t] labels):
`
``
216
`+
ndarray[int64_t] labels,
`
``
217
`+
Py_ssize_t min_count=-1):
`
212
218
` cdef:
`
213
219
` Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
`
214
220
` {{dest_type2}} val, count
`
215
221
` ndarray[{{dest_type2}}, ndim=2] sumx, nobs
`
216
222
``
``
223
`+
assert min_count == -1, "'min_count' only used in add and prod"
`
``
224
+
217
225
` if not len(values) == len(labels):
`
218
226
` raise AssertionError("len(index) != len(labels)")
`
219
227
``
`@@ -263,7 +271,8 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
263
271
`def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
264
272
` ndarray[int64_t] counts,
`
265
273
` ndarray[{{dest_type2}}, ndim=2] values,
`
266
``
`-
ndarray[int64_t] labels):
`
``
274
`+
ndarray[int64_t] labels,
`
``
275
`+
Py_ssize_t min_count=-1):
`
267
276
` """
`
268
277
` Only aggregates on axis=0
`
269
278
` """
`
`@@ -272,6 +281,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
272
281
` {{dest_type2}} val, count
`
273
282
` Py_ssize_t ngroups = len(counts)
`
274
283
``
``
284
`+
assert min_count == -1, "'min_count' only used in add and prod"
`
``
285
+
275
286
` if len(labels) == 0:
`
276
287
` return
`
277
288
``
`@@ -332,7 +343,8 @@ def get_dispatch(dtypes):
`
332
343
`def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
333
344
` ndarray[int64_t] counts,
`
334
345
` ndarray[{{c_type}}, ndim=2] values,
`
335
``
`-
ndarray[int64_t] labels):
`
``
346
`+
ndarray[int64_t] labels,
`
``
347
`+
Py_ssize_t min_count=-1):
`
336
348
` """
`
337
349
` Only aggregates on axis=0
`
338
350
` """
`
`@@ -342,6 +354,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
342
354
` ndarray[{{dest_type2}}, ndim=2] resx
`
343
355
` ndarray[int64_t, ndim=2] nobs
`
344
356
``
``
357
`+
assert min_count == -1, "'min_count' only used in add and prod"
`
``
358
+
345
359
` if not len(values) == len(labels):
`
346
360
` raise AssertionError("len(index) != len(labels)")
`
347
361
``
`@@ -382,7 +396,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
382
396
`def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
383
397
` ndarray[int64_t] counts,
`
384
398
` ndarray[{{c_type}}, ndim=2] values,
`
385
``
`-
ndarray[int64_t] labels, int64_t rank):
`
``
399
`+
ndarray[int64_t] labels, int64_t rank,
`
``
400
`+
Py_ssize_t min_count=-1):
`
386
401
` """
`
387
402
` Only aggregates on axis=0
`
388
403
` """
`
`@@ -392,6 +407,8 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
392
407
` ndarray[{{dest_type2}}, ndim=2] resx
`
393
408
` ndarray[int64_t, ndim=2] nobs
`
394
409
``
``
410
`+
assert min_count == -1, "'min_count' only used in add and prod"
`
``
411
+
395
412
` if not len(values) == len(labels):
`
396
413
` raise AssertionError("len(index) != len(labels)")
`
397
414
``
`@@ -455,7 +472,8 @@ def get_dispatch(dtypes):
`
455
472
`def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
456
473
` ndarray[int64_t] counts,
`
457
474
` ndarray[{{dest_type2}}, ndim=2] values,
`
458
``
`-
ndarray[int64_t] labels):
`
``
475
`+
ndarray[int64_t] labels,
`
``
476
`+
Py_ssize_t min_count=-1):
`
459
477
` """
`
460
478
` Only aggregates on axis=0
`
461
479
` """
`
`@@ -464,6 +482,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
464
482
` {{dest_type2}} val, count
`
465
483
` ndarray[{{dest_type2}}, ndim=2] maxx, nobs
`
466
484
``
``
485
`+
assert min_count == -1, "'min_count' only used in add and prod"
`
``
486
+
467
487
` if not len(values) == len(labels):
`
468
488
` raise AssertionError("len(index) != len(labels)")
`
469
489
``
`@@ -526,7 +546,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
526
546
`def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
527
547
` ndarray[int64_t] counts,
`
528
548
` ndarray[{{dest_type2}}, ndim=2] values,
`
529
``
`-
ndarray[int64_t] labels):
`
``
549
`+
ndarray[int64_t] labels,
`
``
550
`+
Py_ssize_t min_count=-1):
`
530
551
` """
`
531
552
` Only aggregates on axis=0
`
532
553
` """
`
`@@ -535,6 +556,8 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
535
556
` {{dest_type2}} val, count
`
536
557
` ndarray[{{dest_type2}}, ndim=2] minx, nobs
`
537
558
``
``
559
`+
assert min_count == -1, "'min_count' only used in add and prod"
`
``
560
+
538
561
` if not len(values) == len(labels):
`
539
562
` raise AssertionError("len(index) != len(labels)")
`
540
563
``
`@@ -686,7 +709,8 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
`
686
709
`def group_median_float64(ndarray[float64_t, ndim=2] out,
`
687
710
` ndarray[int64_t] counts,
`
688
711
` ndarray[float64_t, ndim=2] values,
`
689
``
`-
ndarray[int64_t] labels):
`
``
712
`+
ndarray[int64_t] labels,
`
``
713
`+
Py_ssize_t min_count=-1):
`
690
714
` """
`
691
715
` Only aggregates on axis=0
`
692
716
` """
`
`@@ -695,6 +719,9 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
`
695
719
` ndarray[int64_t] _counts
`
696
720
` ndarray data
`
697
721
` float64_t* ptr
`
``
722
+
``
723
`+
assert min_count == -1, "'min_count' only used in add and prod"
`
``
724
+
698
725
` ngroups = len(counts)
`
699
726
` N, K = (<object> values).shape
`
700
727
``