REGR: Performance decrease in factorize by rhshadrach · Pull Request #48620 · pandas-dev/pandas (original) (raw)

       before           after         ratio
     [85246fe4]       [3a609934]
     <groupby_na_regr_v2~1^2>       <groupby_na_regr_v2~2>
+      22.7±0.3ms       32.2±0.3ms     1.42  groupby.GroupByCythonAggEaDtypes.time_frame_agg('Int32', 'any')
+      14.2±0.8ms       20.1±0.1ms     1.41  multiindex_object.SetOperations.time_operation('non_monotonic', 'int', 'union')
+     7.08±0.06μs       9.76±0.2μs     1.38  timeseries.TzLocalize.time_infer_dst(None)
+      25.2±0.2ms       34.3±0.2ms     1.36  groupby.GroupByCythonAggEaDtypes.time_frame_agg('Float64', 'any')
+      46.5±0.1ms         53.9±1ms     1.16  groupby.GroupByCythonAggEaDtypes.time_frame_agg('Int64', 'any')
+     5.79±0.01ms       6.48±0.3ms     1.12  algos.isin.IsinWithArange.time_isin(<class 'numpy.object_'>, 1000, -2)
+      16.4±0.3ms       18.3±0.5ms     1.12  frame_ctor.FromDicts.time_list_of_dict
-     3.24±0.04ms      2.94±0.02ms     0.91  tslibs.tslib.TimeIntsToPydatetime.time_ints_to_pydatetime('timestamp', 10000, tzfile('/usr/share/zoneinfo/Asia/Tokyo'))
-      18.4±0.9μs      16.7±0.09μs     0.91  timeseries.SortIndex.time_get_slice(True)
-     1.06±0.03ms         966±20μs     0.91  groupby.GroupByMethods.time_dtype_as_field('object', 'nunique', 'transformation', 5)
-         279±4μs          253±4μs     0.91  groupby.GroupByMethods.time_dtype_as_group('uint', 'diff', 'transformation', 1)
-     3.21±0.02ms      2.90±0.01ms     0.91  tslibs.tslib.TimeIntsToPydatetime.time_ints_to_pydatetime('timestamp', 10000, datetime.timezone.utc)
-        171±10ns        155±0.2ns     0.90  tslibs.timestamp.TimestampProperties.time_is_month_end(None, None)
-      18.8±0.8ms       17.0±0.4ms     0.90  index_cached_properties.IndexCache.time_values('MultiIndex')
-         133±2ns          120±2ns     0.90  tslibs.timestamp.TimestampProperties.time_is_quarter_start(datetime.timezone.utc, None)
-         652±8μs         590±10μs     0.90  groupby.GroupByMethods.time_dtype_as_group('object', 'ffill', 'transformation', 5)
-     3.55±0.08ms      3.21±0.08ms     0.90  series_methods.NanOps.time_func('kurt', 1000000, 'int32')
-       347±0.8ms          313±4ms     0.90  tslibs.tslib.TimeIntsToPydatetime.time_ints_to_pydatetime('timestamp', 1000000, tzfile('/usr/share/zoneinfo/Asia/Tokyo'))
-        150±10ns        136±0.2ns     0.90  tslibs.timestamp.TimestampProperties.time_days_in_month(datetime.timezone(datetime.timedelta(seconds=3600)), 'B')
-         168±8ns          152±2ns     0.90  tslibs.timestamp.TimestampProperties.time_is_month_end(datetime.timezone(datetime.timedelta(seconds=3600)), None)
-         886±3μs          799±3μs     0.90  reindex.DropDuplicates.time_frame_drop_dups_bool(False)
-     11.7±0.05ms       10.5±0.2ms     0.90  groupby.AggEngine.time_series_cython(True)
-     3.20±0.02ms      2.89±0.01ms     0.90  tslibs.tslib.TimeIntsToPydatetime.time_ints_to_pydatetime('timestamp', 10000, None)
-         751±7μs         677±10μs     0.90  groupby.GroupByMethods.time_dtype_as_group('object', 'count', 'transformation', 5)
-        151±10ns          135±3ns     0.90  tslibs.timestamp.TimestampProperties.time_days_in_month(None, 'B')
-        3.23±0ms      2.91±0.01ms     0.90  tslibs.tslib.TimeIntsToPydatetime.time_ints_to_pydatetime('timestamp', 10000, datetime.timezone(datetime.timedelta(seconds=3600)))
-     11.7±0.07ms      10.5±0.03ms     0.90  groupby.AggEngine.time_series_cython(False)
-         163±6ms        145±0.9ms     0.89  reshape.WideToLong.time_wide_to_long_big
-         256±5ms        228±0.7ms     0.89  tslibs.tslib.TimeIntsToPydatetime.time_ints_to_pydatetime('datetime', 1000000, None)
-     11.0±0.07ms       9.81±0.5ms     0.89  algorithms.Factorize.time_factorize(False, False, 'string[pyarrow]')
-      25.1±0.3ms      22.2±0.07ms     0.88  timeseries.DatetimeIndex.time_to_pydatetime('tz_naive')
-        656±10μs         581±10μs     0.88  groupby.GroupByMethods.time_dtype_as_group('object', 'bfill', 'transformation', 5)
-         583±8μs          515±2μs     0.88  groupby.GroupByMethods.time_dtype_as_group('object', 'shift', 'transformation', 5)
-        148±10ns          131±1ns     0.88  tslibs.timestamp.TimestampProperties.time_days_in_month(datetime.timezone(datetime.timedelta(seconds=3600)), None)
-         149±9ns          131±1ns     0.88  tslibs.timestamp.TimestampProperties.time_days_in_month(datetime.timezone.utc, None)
-     54.7±0.09μs       48.1±0.5μs     0.88  indexing.NonNumericSeriesIndexing.time_getitem_label_slice('string', 'unique_monotonic_inc')
-         261±6ms          229±2ms     0.88  tslibs.tslib.TimeIntsToPydatetime.time_ints_to_pydatetime('datetime', 1000000, datetime.timezone(datetime.timedelta(seconds=3600)))
-        145±10ns        128±0.6ns     0.88  tslibs.timestamp.TimestampProperties.time_days_in_month(None, None)
-      17.5±0.5ms       15.4±0.1ms     0.88  groupby.TransformEngine.time_series_cython(False)
-      43.1±0.3ms       37.7±0.2ms     0.88  groupby.DateAttributes.time_len_groupby_object
-      25.1±0.1ms       22.0±0.4ms     0.88  timeseries.DatetimeIndex.time_to_pydatetime('repeated')
-         710±4μs          620±2μs     0.87  reindex.DropDuplicates.time_frame_drop_dups_bool(True)
-      17.0±0.3ms      14.8±0.08ms     0.87  groupby.TransformEngine.time_dataframe_cython(True)
-         264±2ms        231±0.2ms     0.87  tslibs.tslib.TimeIntsToPydatetime.time_ints_to_pydatetime('datetime', 1000000, datetime.timezone.utc)
-       612±300μs         533±20μs     0.87  array.IntegerArray.time_from_integer_array
-      17.2±0.2ms      14.9±0.06ms     0.87  groupby.TransformEngine.time_dataframe_cython(False)
-         265±2ms          230±6ms     0.87  tslibs.tslib.TimeIntsToPydatetime.time_ints_to_pydatetime('datetime', 1000000, tzfile('/usr/share/zoneinfo/Asia/Tokyo'))
-      17.8±0.2ms       15.4±0.1ms     0.86  groupby.TransformEngine.time_series_cython(True)
-      30.6±0.7ms         26.4±3ms     0.86  io.csv.ReadCSVCategorical.time_convert_post('c')
-      82.4±0.9ms       70.7±0.5ms     0.86  algorithms.Factorize.time_factorize(False, True, 'object')
-         525±1μs          446±7μs     0.85  groupby.GroupByMethods.time_dtype_as_field('object', 'nunique', 'transformation', 1)
-         466±1μs          395±2μs     0.85  groupby.GroupByMethods.time_dtype_as_group('object', 'nunique', 'transformation', 1)
-         379±9μs          320±2μs     0.85  groupby.GroupByMethods.time_dtype_as_group('object', 'first', 'transformation', 1)
-       420±0.8μs          353±8μs     0.84  groupby.GroupByMethods.time_dtype_as_field('object', 'value_counts', 'direct', 1)
-       383±0.8μs          319±3μs     0.83  groupby.GroupByMethods.time_dtype_as_group('object', 'rank', 'transformation', 1)
-         383±2μs          320±1μs     0.83  groupby.GroupByMethods.time_dtype_as_group('object', 'last', 'transformation', 1)
-         357±1μs        295±0.7μs     0.82  groupby.GroupByMethods.time_dtype_as_group('object', 'any', 'transformation', 1)
-         352±2μs          290±2μs     0.82  groupby.GroupByMethods.time_dtype_as_group('object', 'all', 'transformation', 1)
-         335±3μs          274±2μs     0.82  groupby.GroupByMethods.time_dtype_as_group('object', 'count', 'transformation', 1)
-      21.5±0.4ms       17.5±0.2ms     0.81  frame_methods.SortIndexByColumns.time_frame_sort_values_by_columns
-         314±3μs        254±0.7μs     0.81  groupby.GroupByMethods.time_dtype_as_group('object', 'cumcount', 'transformation', 1)
-      46.5±0.4ms       36.9±0.2ms     0.79  reshape.Crosstab.time_crosstab_normalize_margins
-        84.4±1ms         66.8±1ms     0.79  categoricals.Constructor.time_with_nan
-         238±4ms          188±1ms     0.79  groupby.GroupStrings.time_multi_columns
-     2.11±0.04ms      1.67±0.05ms     0.79  sparse.ToCoo.time_sparse_series_to_coo(False)
-      51.8±0.2ms       40.7±0.2ms     0.79  frame_methods.Duplicated.time_frame_duplicated_wide
-     2.23±0.02ms      1.75±0.04ms     0.78  sparse.ToCoo.time_sparse_series_to_coo(True)
-        97.8±2ms       76.6±0.9ms     0.78  groupby.Groups.time_series_groups('object_large')
-         262±2μs        205±0.9μs     0.78  groupby.GroupByMethods.time_dtype_as_group('object', 'ffill', 'transformation', 1)
-         258±1μs          201±1μs     0.78  groupby.GroupByMethods.time_dtype_as_group('object', 'shift', 'transformation', 1)
-      62.2±0.4ms       48.2±0.3ms     0.78  reshape.PivotTable.time_pivot_table_margins
-         266±3μs        205±0.7μs     0.77  groupby.GroupByMethods.time_dtype_as_group('object', 'bfill', 'transformation', 1)
-        89.9±2ms       68.9±0.2ms     0.77  groupby.Groups.time_series_indices('object_large')
-     1.76±0.06ms      1.32±0.01ms     0.75  sparse.ToCoo.time_sparse_series_to_coo_single_level(False)
-         217±2μs          159±2μs     0.73  groupby.GroupByMethods.time_dtype_as_field('object', 'nunique', 'direct', 1)
-         242±6μs          175±2μs     0.72  groupby.GroupByMethods.time_dtype_as_field('object', 'nunique', 'direct', 5)
-        836±20μs          593±3μs     0.71  groupby.RankWithTies.time_rank_ties('datetime64', 'max')
-        807±10μs          566±1μs     0.70  groupby.RankWithTies.time_rank_ties('int64', 'min')
-        848±10μs          595±3μs     0.70  groupby.RankWithTies.time_rank_ties('datetime64', 'min')
-      29.6±0.8ms       20.8±0.2ms     0.70  gil.ParallelFactorize.time_loop(4)
-        801±20μs          561±2μs     0.70  groupby.RankWithTies.time_rank_ties('int64', 'max')
-      15.2±0.1ms      10.6±0.04ms     0.70  gil.ParallelFactorize.time_loop(2)
-      7.96±0.2ms       5.56±0.2ms     0.70  arithmetic.Ops2.time_frame_dot
-      60.1±0.2ms      42.0±0.08ms     0.70  gil.ParallelFactorize.time_loop(8)
-         826±2μs          576±4μs     0.70  groupby.RankWithTies.time_rank_ties('float32', 'dense')
-         827±2μs          576±6μs     0.70  groupby.RankWithTies.time_rank_ties('float64', 'first')
-         849±4μs          590±5μs     0.70  groupby.RankWithTies.time_rank_ties('datetime64', 'first')
-         823±6μs          571±3μs     0.69  groupby.RankWithTies.time_rank_ties('float32', 'min')
-        812±20μs         564±10μs     0.69  groupby.RankWithTies.time_rank_ties('float32', 'max')
-        836±20μs         580±10μs     0.69  groupby.RankWithTies.time_rank_ties('datetime64', 'dense')
-         823±8μs          571±5μs     0.69  groupby.RankWithTies.time_rank_ties('int64', 'first')
-         821±4μs         567±10μs     0.69  groupby.RankWithTies.time_rank_ties('float64', 'average')
-        817±20μs          565±2μs     0.69  groupby.RankWithTies.time_rank_ties('int64', 'dense')
-         819±2μs          566±8μs     0.69  groupby.RankWithTies.time_rank_ties('float32', 'average')
-     7.58±0.03ms      5.23±0.01ms     0.69  algorithms.Factorize.time_factorize(True, False, 'object')
-         817±3μs         562±10μs     0.69  groupby.RankWithTies.time_rank_ties('float32', 'first')
-     13.6±0.08ms       9.32±0.1ms     0.69  groupby.MultiColumn.time_cython_sum
-         828±4μs          569±2μs     0.69  groupby.RankWithTies.time_rank_ties('float64', 'max')
-       829±0.8μs         565±10μs     0.68  groupby.RankWithTies.time_rank_ties('float64', 'min')
-        824±20μs          560±9μs     0.68  groupby.RankWithTies.time_rank_ties('float64', 'dense')
-         850±6μs          572±1μs     0.67  groupby.RankWithTies.time_rank_ties('datetime64', 'average')
-     13.1±0.05ms      8.75±0.04ms     0.67  groupby.MultiColumn.time_col_select_numpy_sum
-         818±9μs          545±1μs     0.67  groupby.RankWithTies.time_rank_ties('int64', 'average')
-     13.1±0.09ms      8.71±0.01ms     0.67  reshape.Crosstab.time_crosstab_normalize
-      12.2±0.4ms      8.07±0.02ms     0.66  reshape.Crosstab.time_crosstab
-      10.6±0.2ms       6.99±0.1ms     0.66  algorithms.Hashing.time_frame
-      59.8±0.3ms       39.3±0.8ms     0.66  groupby.Groups.time_series_indices('object_small')
-      12.8±0.2ms      8.32±0.05ms     0.65  groupby.AggFunctions.time_different_python_functions_multicol
-      31.9±0.6ms       20.2±0.4ms     0.63  algorithms.Factorize.time_factorize(False, False, 'object')
-     12.0±0.01ms      7.56±0.09ms     0.63  reshape.Crosstab.time_crosstab_values
-        29.4±1ms       18.5±0.6ms     0.63  gil.ParallelFactorize.time_parallel(4)
-      54.0±0.8ms       33.3±0.4ms     0.62  groupby.Groups.time_series_groups('object_small')
-     3.26±0.03ms      1.98±0.01ms     0.61  reindex.DropDuplicates.time_frame_drop_dups(True)
-        64.4±3ms         38.7±2ms     0.60  gil.ParallelFactorize.time_parallel(8)
-     10.5±0.02ms       6.10±0.2ms     0.58  groupby.AggFunctions.time_different_str_functions
-      10.5±0.2ms       6.09±0.1ms     0.58  groupby.AggFunctions.time_different_numpy_functions
-      17.0±0.5ms         9.73±2ms     0.57  gil.ParallelFactorize.time_parallel(2)
-      9.12±0.3ms       5.17±0.1ms     0.57  algorithms.Hashing.time_series_string
-      31.4±0.2ms       17.2±0.1ms     0.55  reshape.PivotTable.time_pivot_table_agg
-      15.5±0.4ms      8.42±0.06ms     0.54  reshape.PivotTable.time_pivot_table
-     9.87±0.08ms       5.32±0.1ms     0.54  reindex.DropDuplicates.time_frame_drop_dups(False)
-     14.8±0.09ms      7.73±0.08ms     0.52  reindex.DropDuplicates.time_frame_drop_dups_na(False)
-      4.76±0.1ms      2.41±0.02ms     0.51  reindex.DropDuplicates.time_frame_drop_dups_na(True)