Flip scanning direction of stable sort · rust-lang/rust@f297afa (original) (raw)

`@@ -1196,52 +1196,37 @@ pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(

`

1196

1196

``

1197

1197

`let mut runs = RunVec::new(run_alloc_fn, run_dealloc_fn);

`

1198

1198

``

1199

``

`` -

// In order to identify natural runs in v, we traverse it backwards. That might seem like a

``

1200

``

`-

// strange decision, but consider the fact that merges more often go in the opposite direction

`

1201

``

`-

// (forwards). According to benchmarks, merging forwards is slightly faster than merging

`

1202

``

`-

// backwards. To conclude, identifying runs by traversing backwards improves performance.

`

1203

``

`-

let mut end = len;

`

1204

``

`-

while end > 0 {

`

1205

``

`-

// Find the next natural run, and reverse it if it's strictly descending.

`

1206

``

`-

let mut start = end - 1;

`

1207

``

`-

if start > 0 {

`

1208

``

`-

start -= 1;

`

1209

``

-

1210

``

`-

// SAFETY: The v.get_unchecked must be fed with correct inbound indices.

`

1211

``

`-

unsafe {

`

1212

``

`-

if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {

`

1213

``

`-

while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {

`

1214

``

`-

start -= 1;

`

1215

``

`-

}

`

1216

``

`-

v[start..end].reverse();

`

1217

``

`-

} else {

`

1218

``

`-

while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1))

`

1219

``

`-

{

`

1220

``

`-

start -= 1;

`

1221

``

`-

}

`

1222

``

`-

}

`

1223

``

`-

}

`

``

1199

`+

let mut end = 0;

`

``

1200

`+

let mut start = 0;

`

``

1201

+

``

1202

`+

// Scan forward. Memory pre-fetching prefers forward scanning vs backwards scanning, and the

`

``

1203

`+

// code-gen is usually better. For the most sensitive types such as integers, these are merged

`

``

1204

`+

// bidirectionally at once. So there is no benefit in scanning backwards.

`

``

1205

`+

while end < len {

`

``

1206

`+

let (streak_end, was_reversed) = find_streak(&v[start..], is_less);

`

``

1207

`+

end += streak_end;

`

``

1208

`+

if was_reversed {

`

``

1209

`+

v[start..end].reverse();

`

1224

1210

`}

`

1225

1211

``

1226

1212

`// Insert some more elements into the run if it's too short. Insertion sort is faster than

`

1227

1213

`// merge sort on short sequences, so this significantly improves performance.

`

1228

``

`-

start = provide_sorted_batch(v, start, end, is_less);

`

``

1214

`+

end = provide_sorted_batch(v, start, end, is_less);

`

1229

1215

``

1230

1216

`// Push this run onto the stack.

`

1231

1217

` runs.push(TimSortRun { start, len: end - start });

`

1232

``

`-

end = start;

`

``

1218

`+

start = end;

`

1233

1219

``

1234

1220

`// Merge some pairs of adjacent runs to satisfy the invariants.

`

1235

``

`-

while let Some(r) = collapse(runs.as_slice()) {

`

1236

``

`-

let left = runs[r + 1];

`

1237

``

`-

let right = runs[r];

`

1238

``

`` -

// SAFETY: buf_ptr must hold enough capacity for the shorter of the two sides, and

``

1239

``

`-

// neither side may be on length 0.

`

``

1221

`+

while let Some(r) = collapse(runs.as_slice(), len) {

`

``

1222

`+

let left = runs[r];

`

``

1223

`+

let right = runs[r + 1];

`

``

1224

`+

let merge_slice = &mut v[left.start..right.start + right.len];

`

1240

1225

`unsafe {

`

1241

``

`-

merge(&mut v[left.start..right.start + right.len], left.len, buf_ptr, is_less);

`

``

1226

`+

merge(merge_slice, left.len, buf_ptr, is_less);

`

1242

1227

`}

`

1243

``

`-

runs[r] = TimSortRun { start: left.start, len: left.len + right.len };

`

1244

``

`-

runs.remove(r + 1);

`

``

1228

`+

runs[r + 1] = TimSortRun { start: left.start, len: left.len + right.len };

`

``

1229

`+

runs.remove(r);

`

1245

1230

`}

`

1246

1231

`}

`

1247

1232

``

`@@ -1263,10 +1248,10 @@ pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(

`

1263

1248

`// run starts at index 0, it will always demand a merge operation until the stack is fully

`

1264

1249

`// collapsed, in order to complete the sort.

`

1265

1250

`#[inline]

`

1266

``

`-

fn collapse(runs: &[TimSortRun]) -> Option {

`

``

1251

`+

fn collapse(runs: &[TimSortRun], stop: usize) -> Option {

`

1267

1252

`let n = runs.len();

`

1268

1253

`if n >= 2

`

1269

``

`-

&& (runs[n - 1].start == 0

`

``

1254

`+

&& (runs[n - 1].start + runs[n - 1].len == stop

`

1270

1255

` || runs[n - 2].len <= runs[n - 1].len

`

1271

1256

` || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)

`

1272

1257

` || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))

`

`@@ -1454,33 +1439,70 @@ pub struct TimSortRun {

`

1454

1439

`start: usize,

`

1455

1440

`}

`

1456

1441

``

1457

``

`-

/// Takes a range as denoted by start and end, that is already sorted and extends it to the left if

`

``

1442

`+

/// Takes a range as denoted by start and end, that is already sorted and extends it to the right if

`

1458

1443

`/// necessary with sorts optimized for smaller ranges such as insertion sort.

`

1459

1444

`#[cfg(not(no_global_oom_handling))]

`

1460

``

`-

fn provide_sorted_batch<T, F>(v: &mut [T], mut start: usize, end: usize, is_less: &mut F) -> usize

`

``

1445

`+

fn provide_sorted_batch<T, F>(v: &mut [T], start: usize, mut end: usize, is_less: &mut F) -> usize

`

1461

1446

`where

`

1462

1447

`F: FnMut(&T, &T) -> bool,

`

1463

1448

`{

`

1464

``

`-

debug_assert!(end > start);

`

``

1449

`+

let len = v.len();

`

``

1450

`+

assert!(end >= start && end <= len);

`

1465

1451

``

1466

1452

`// This value is a balance between least comparisons and best performance, as

`

1467

1453

`// influenced by for example cache locality.

`

1468

1454

`const MIN_INSERTION_RUN: usize = 10;

`

1469

1455

``

1470

1456

`// Insert some more elements into the run if it's too short. Insertion sort is faster than

`

1471

1457

`// merge sort on short sequences, so this significantly improves performance.

`

1472

``

`-

let start_found = start;

`

1473

1458

`let start_end_diff = end - start;

`

1474

1459

``

1475

``

`-

if start_end_diff < MIN_INSERTION_RUN && start != 0 {

`

``

1460

`+

if start_end_diff < MIN_INSERTION_RUN && end < len {

`

1476

1461

`// v[start_found..end] are elements that are already sorted in the input. We want to extend

`

1477

1462

`// the sorted region to the left, so we push up MIN_INSERTION_RUN - 1 to the right. Which is

`

1478

1463

// more efficient than trying to push those already sorted elements to the left.

`

``

1464

`+

end = cmp::min(start + MIN_INSERTION_RUN, len);

`

``

1465

`+

let presorted_start = cmp::max(start_end_diff, 1);

`

1479

1466

``

1480

``

`-

start = if end >= MIN_INSERTION_RUN { end - MIN_INSERTION_RUN } else { 0 };

`

``

1467

`+

insertion_sort_shift_left(&mut v[start..end], presorted_start, is_less);

`

``

1468

`+

}

`

1481

1469

``

1482

``

`-

insertion_sort_shift_right(&mut v[start..end], start_found - start, is_less);

`

``

1470

`+

end

`

``

1471

`+

}

`

``

1472

+

``

1473

`+

/// Finds a streak of presorted elements starting at the beginning of the slice. Returns the first

`

``

1474

`+

/// value that is not part of said streak, and a bool denoting whether the streak was reversed.

`

``

1475

`+

/// Streaks can be increasing or decreasing.

`

``

1476

`+

fn find_streak<T, F>(v: &[T], is_less: &mut F) -> (usize, bool)

`

``

1477

`+

where

`

``

1478

`+

F: FnMut(&T, &T) -> bool,

`

``

1479

`+

{

`

``

1480

`+

let len = v.len();

`

``

1481

+

``

1482

`+

if len < 2 {

`

``

1483

`+

return (len, false);

`

1483

1484

`}

`

1484

1485

``

1485

``

`-

start

`

``

1486

`+

let mut end = 2;

`

``

1487

+

``

1488

`+

// SAFETY: See below specific.

`

``

1489

`+

unsafe {

`

``

1490

`+

// SAFETY: We checked that len >= 2, so 0 and 1 are valid indices.

`

``

1491

`+

let assume_reverse = is_less(v.get_unchecked(1), v.get_unchecked(0));

`

``

1492

+

``

1493

`+

// SAFETY: We know end >= 2 and check end < len.

`

``

1494

`+

// From that follows that accessing v at end and end - 1 is safe.

`

``

1495

`+

if assume_reverse {

`

``

1496

`+

while end < len && is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) {

`

``

1497

`+

end += 1;

`

``

1498

`+

}

`

``

1499

+

``

1500

`+

(end, true)

`

``

1501

`+

} else {

`

``

1502

`+

while end < len && !is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) {

`

``

1503

`+

end += 1;

`

``

1504

`+

}

`

``

1505

`+

(end, false)

`

``

1506

`+

}

`

``

1507

`+

}

`

1486

1508

`}

`