Flip scanning direction of stable sort · rust-lang/rust@f297afa
```diff
@@ -1196,52 +1196,37 @@ pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
 
     let mut runs = RunVec::new(run_alloc_fn, run_dealloc_fn);
 
-    // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
-    // strange decision, but consider the fact that merges more often go in the opposite direction
-    // (forwards). According to benchmarks, merging forwards is slightly faster than merging
-    // backwards. To conclude, identifying runs by traversing backwards improves performance.
-    let mut end = len;
-    while end > 0 {
-        // Find the next natural run, and reverse it if it's strictly descending.
-        let mut start = end - 1;
-        if start > 0 {
-            start -= 1;
-
-            // SAFETY: The v.get_unchecked must be fed with correct inbound indicies.
-            unsafe {
-                if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
-                    while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
-                        start -= 1;
-                    }
-                    v[start..end].reverse();
-                } else {
-                    while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1))
-                    {
-                        start -= 1;
-                    }
-                }
-            }
+    let mut end = 0;
+    let mut start = 0;
+
+    // Scan forward. Memory pre-fetching prefers forward scanning vs backwards scanning, and the
+    // code-gen is usually better. For the most sensitive types such as integers, these are merged
+    // bidirectionally at once. So there is no benefit in scanning backwards.
+    while end < len {
+        let (streak_end, was_reversed) = find_streak(&v[start..], is_less);
+        end += streak_end;
+        if was_reversed {
+            v[start..end].reverse();
         }
 
         // Insert some more elements into the run if it's too short. Insertion sort is faster than
         // merge sort on short sequences, so this significantly improves performance.
-        start = provide_sorted_batch(v, start, end, is_less);
+        end = provide_sorted_batch(v, start, end, is_less);
 
         // Push this run onto the stack.
         runs.push(TimSortRun { start, len: end - start });
-        end = start;
+        start = end;
 
         // Merge some pairs of adjacent runs to satisfy the invariants.
-        while let Some(r) = collapse(runs.as_slice()) {
-            let left = runs[r + 1];
-            let right = runs[r];
-            // SAFETY: `buf_ptr` must hold enough capacity for the shorter of the two sides, and
-            // neither side may be on length 0.
+        while let Some(r) = collapse(runs.as_slice(), len) {
+            let left = runs[r];
+            let right = runs[r + 1];
+            let merge_slice = &mut v[left.start..right.start + right.len];
             unsafe {
-                merge(&mut v[left.start..right.start + right.len], left.len, buf_ptr, is_less);
+                merge(merge_slice, left.len, buf_ptr, is_less);
             }
-            runs[r] = TimSortRun { start: left.start, len: left.len + right.len };
-            runs.remove(r + 1);
+            runs[r + 1] = TimSortRun { start: left.start, len: left.len + right.len };
+            runs.remove(r);
         }
     }
 
```
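The first hunk flips the run-detection loop to walk `v` front to back: `start` and `end` both begin at zero, each natural streak is found by the new `find_streak` helper (added in the last hunk), and only strictly descending streaks get reversed. The merge bookkeeping is mirrored to match: `left` is now `runs[r]` and `right` is `runs[r + 1]`, the merged run is written into `runs[r + 1]`, and `runs.remove(r)` drops the stale left entry. Below is a minimal safe-Rust sketch of the new control flow with the streak detection inlined; the function name, the `(start, len)` tuples, and the omission of merging and `provide_sorted_batch` are illustrative simplifications, not the stdlib's `TimSortRun`/`RunVec` machinery.

```rust
// Sketch only: forward run collection as in the new loop, simplified to
// safe Rust with `Ord` instead of an `is_less` closure.
fn collect_runs<T: Ord>(v: &mut [T]) -> Vec<(usize, usize)> {
    let len = v.len();
    let mut runs = Vec::new(); // (start, len) pairs
    let mut start = 0;

    while start < len {
        // Walk forward while the streak keeps its initial direction.
        let mut end = start + 1;
        let reversed = end < len && v[end] < v[end - 1];
        while end < len && (v[end] < v[end - 1]) == reversed {
            end += 1;
        }
        if reversed {
            // Only strictly descending streaks are detected, so reversing
            // them in place keeps the sort stable.
            v[start..end].reverse();
        }
        runs.push((start, end - start));
        start = end;
    }
    runs
}

fn main() {
    let mut v = vec![5, 3, 1, 2, 4, 4, 0];
    let runs = collect_runs(&mut v);
    assert_eq!(runs, vec![(0, 3), (3, 3), (6, 1)]);
    // The strictly descending prefix 5, 3, 1 was reversed in place.
    assert_eq!(v, vec![1, 3, 5, 2, 4, 4, 0]);
}
```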
```diff
@@ -1263,10 +1248,10 @@ pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
     // run starts at index 0, it will always demand a merge operation until the stack is fully
     // collapsed, in order to complete the sort.
     #[inline]
-    fn collapse(runs: &[TimSortRun]) -> Option<usize> {
+    fn collapse(runs: &[TimSortRun], stop: usize) -> Option<usize> {
        let n = runs.len();
        if n >= 2
-            && (runs[n - 1].start == 0
+            && (runs[n - 1].start + runs[n - 1].len == stop
                || runs[n - 2].len <= runs[n - 1].len
                || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
                || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
```
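`collapse` gains a `stop` parameter (the caller passes `len`) because the special-cased run has moved: under forward scanning, the run signalling that the whole slice has been scanned is the one ending at `stop`, not the one starting at index 0. A toy illustration of just this flipped boundary condition; the `Run` struct and `is_final_run` helper are hypothetical stand-ins for `TimSortRun` and the inline check above.

```rust
#[derive(Copy, Clone)]
struct Run {
    start: usize,
    len: usize,
}

// Forward scanning: the run touching the *end* of the slice is the last run
// found, so it must keep forcing merges until the stack fully collapses.
fn is_final_run(run: Run, stop: usize) -> bool {
    run.start + run.len == stop
}

fn main() {
    let stop = 10; // total length of the slice being sorted
    assert!(is_final_run(Run { start: 6, len: 4 }, stop));
    // Under the old backwards scan, `start == 0` played this role instead.
    assert!(!is_final_run(Run { start: 0, len: 4 }, stop));
}
```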
```diff
@@ -1454,33 +1439,70 @@ pub struct TimSortRun {
     start: usize,
 }
 
-/// Takes a range as denoted by start and end, that is already sorted and extends it to the left if
+/// Takes a range as denoted by start and end, that is already sorted and extends it to the right if
 /// necessary with sorts optimized for smaller ranges such as insertion sort.
 #[cfg(not(no_global_oom_handling))]
-fn provide_sorted_batch<T, F>(v: &mut [T], mut start: usize, end: usize, is_less: &mut F) -> usize
+fn provide_sorted_batch<T, F>(v: &mut [T], start: usize, mut end: usize, is_less: &mut F) -> usize
 where
     F: FnMut(&T, &T) -> bool,
 {
-    debug_assert!(end > start);
+    let len = v.len();
+    assert!(end >= start && end <= len);
 
     // This value is a balance between least comparisons and best performance, as
     // influenced by for example cache locality.
     const MIN_INSERTION_RUN: usize = 10;
 
     // Insert some more elements into the run if it's too short. Insertion sort is faster than
     // merge sort on short sequences, so this significantly improves performance.
-    let start_found = start;
     let start_end_diff = end - start;
 
-    if start_end_diff < MIN_INSERTION_RUN && start != 0 {
+    if start_end_diff < MIN_INSERTION_RUN && end < len {
         // v[start_found..end] are elements that are already sorted in the input. We want to extend
         // the sorted region to the left, so we push up MIN_INSERTION_RUN - 1 to the right. Which is
         // more efficient that trying to push those already sorted elements to the left.
+        end = cmp::min(start + MIN_INSERTION_RUN, len);
+        let presorted_start = cmp::max(start_end_diff, 1);
 
-        start = if end >= MIN_INSERTION_RUN { end - MIN_INSERTION_RUN } else { 0 };
+        insertion_sort_shift_left(&mut v[start..end], presorted_start, is_less);
+    }
 
-    insertion_sort_shift_right(&mut v[start..end], start_found - start, is_less);
+    end
+}
+
+/// Finds a streak of presorted elements starting at the beginning of the slice. Returns the first
+/// value that is not part of said streak, and a bool denoting wether the streak was reversed.
+/// Streaks can be increasing or decreasing.
+fn find_streak<T, F>(v: &[T], is_less: &mut F) -> (usize, bool)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    let len = v.len();
+
+    if len < 2 {
+        return (len, false);
     }
 
-    start
+    let mut end = 2;
+
+    // SAFETY: See below specific.
+    unsafe {
+        // SAFETY: We checked that len >= 2, so 0 and 1 are valid indices.
+        let assume_reverse = is_less(v.get_unchecked(1), v.get_unchecked(0));
+
+        // SAFETY: We know end >= 2 and check end < len.
+        // From that follows that accessing v at end and end - 1 is safe.
+        if assume_reverse {
+            while end < len && is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) {
+                end += 1;
+            }
+
+            (end, true)
+        } else {
+            while end < len && !is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) {
+                end += 1;
+            }
+            (end, false)
+        }
+    }
 }
```
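The new `find_streak` does the per-run work the old backwards loop did inline: it measures the initial streak and reports whether it was strictly descending so the caller can reverse it. Descending streaks must be strict for stability, since reversing a run containing equal elements would swap their relative order; ascending streaks may contain equals. Here is a safe-Rust equivalent using plain indexing (a sketch; the real function justifies its `get_unchecked` accesses with the `len >= 2` and `end < len` checks shown above), with a few example calls.

```rust
// Sketch: same contract as the stdlib's new `find_streak`, bounds-checked.
fn find_streak<T, F>(v: &[T], is_less: &mut F) -> (usize, bool)
where
    F: FnMut(&T, &T) -> bool,
{
    let len = v.len();
    if len < 2 {
        return (len, false);
    }

    let assume_reverse = is_less(&v[1], &v[0]);
    let mut end = 2;

    if assume_reverse {
        // Strictly descending: stop at the first non-decrease.
        while end < len && is_less(&v[end], &v[end - 1]) {
            end += 1;
        }
        (end, true)
    } else {
        // Non-strictly ascending: equal neighbours extend the streak.
        while end < len && !is_less(&v[end], &v[end - 1]) {
            end += 1;
        }
        (end, false)
    }
}

fn main() {
    let mut is_less = |a: &i32, b: &i32| a < b;
    assert_eq!(find_streak(&[1, 2, 2, 0], &mut is_less), (3, false));
    assert_eq!(find_streak(&[3, 2, 1, 9], &mut is_less), (3, true));
    assert_eq!(find_streak(&[7], &mut is_less), (1, false));
}
```

`provide_sorted_batch` is flipped the same way: instead of growing a too-short run leftwards and calling `insertion_sort_shift_right`, it now extends the run rightwards to `MIN_INSERTION_RUN` elements (capped at `len`) and calls `insertion_sort_shift_left`, telling it that the first `presorted_start` elements are already in order.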