Vectorize reverse_copy() by pawREP · Pull Request #804 · microsoft/STL (original) (raw)

Here's the benchmark I wrote way back when I added the vector reverse:

#include <algorithm>
#include <benchmark/benchmark.h>
#include <deque>
#include <functional>
#include <list>
#include <numeric>
#include <stdlib.h>
#include <utility>
#include <vector>

using namespace std;

void verify(bool b) {
  if (!b) {
    exit(1);
  }
}

template<class _BidIt>
void plain_bidi_reverse(_BidIt _First, _BidIt _Last)
    {
    for (; _First != _Last && _First != --_Last; ++_First)
        {
        const auto _Temp = *_First;
        *_First = *_Last;
        *_Last = _Temp;
        }
    }

template <class Container, class TestedFn>
inline void RunTest(benchmark::State &state, size_t dataSize, TestedFn fn) {
  Container data(dataSize);
  iota(data.begin(), data.end(),
       static_cast<typename Container::value_type>(1));
  fn(data);
  verify(is_sorted(data.begin(), data.end(), greater<>{}));
  fn(data);
  verify(is_sorted(data.begin(), data.end(), less<>{}));
  for (auto _ : state) {
    (void)_;
    fn(data);
  }
}

template <class Container> void BenchPlainBidiReverse(benchmark::State &state) {
  RunTest<Container>(state, static_cast<size_t>(state.range(0)),
                     [](auto &c) { plain_bidi_reverse(c.begin(), c.end()); });
}

template <class Container> void BenchStdReverse(benchmark::State &state) {
  RunTest<Container>(state, static_cast<size_t>(state.range(0)),
                     [](auto &c) { reverse(c.begin(), c.end()); });
}

BENCHMARK_TEMPLATE(BenchStdReverse, deque<unsigned int>)->Range(8, 100'000);
BENCHMARK_TEMPLATE(BenchStdReverse, list<unsigned int>)->Range(8, 100'000);

BENCHMARK_TEMPLATE(BenchPlainBidiReverse, vector<unsigned char>)->Range(8, 255);
BENCHMARK_TEMPLATE(BenchStdReverse, vector<unsigned char>)->Range(8, 255);
BENCHMARK_TEMPLATE(BenchPlainBidiReverse, vector<unsigned short>)->Range(8, 65535);
BENCHMARK_TEMPLATE(BenchStdReverse, vector<unsigned short>)->Range(8, 65535);
BENCHMARK_TEMPLATE(BenchPlainBidiReverse, vector<unsigned int>)->Range(8, 100'000);
BENCHMARK_TEMPLATE(BenchStdReverse, vector<unsigned int>)->Range(8, 100'000);

extern "C" extern long __isa_enabled;
constexpr long __ISA_AVAILABLE_SSE2 = 1;
constexpr long __ISA_AVAILABLE_AVX2 = 5;

#include <emmintrin.h>
#include <intrin.h>
#include <xmmintrin.h>

extern "C" void _cdecl __std_sse_reverse_trivially_copyable_4(
    unsigned int *_First, unsigned int *_Last) throw() {
  if (_Last - _First > 8
#ifndef _M_X64
      && _bittest(&__isa_enabled, __ISA_AVAILABLE_SSE2)
#endif /* _M_X64 */
          ) {
    unsigned int *_Stop_at = _First + ((_Last - _First) >> 3 << 2);
    do {
      _Last -= 4;
      const __m128i _Left =
          _mm_loadu_si128(reinterpret_cast<__m128i *>(_First));
      const __m128i _Right =
          _mm_loadu_si128(reinterpret_cast<__m128i *>(_Last));
      const __m128i _Left_reversed = _mm_shuffle_epi32(_Left, 27);
      const __m128i _Right_reversed = _mm_shuffle_epi32(_Right, 27);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(_First), _Right_reversed);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(_Last), _Left_reversed);
      _First += 4;
    } while (_First != _Stop_at);
  }

  for (; _First != _Last && _First != --_Last; ++_First) {
    const unsigned int _Temp = *_First;
    *_First = *_Last;
    *_Last = _Temp;
  }
}

void BenchUnsignedIntSseReverse(benchmark::State &state) {
  RunTest<vector<unsigned int>>(state, static_cast<size_t>(state.range(0)), [](auto &c) {
    __std_sse_reverse_trivially_copyable_4(&*c.begin(), &*c.end());
  });
}

BENCHMARK(BenchUnsignedIntSseReverse)->Range(8, 100'000);

extern "C" void _cdecl __std_avx2_reverse_trivially_copyable_4(
    unsigned int *_First, unsigned int *_Last) throw() {
  if (_Last - _First > 16 && _bittest(&__isa_enabled, __ISA_AVAILABLE_AVX2)) {
    unsigned int *_Stop_at = _First + ((_Last - _First) >> 4 << 3);
    do {
      _Last -= 8;
      const __m256i _Left =
          _mm256_loadu_si256(reinterpret_cast<__m256i *>(_First));
      const __m256i _Right =
          _mm256_loadu_si256(reinterpret_cast<__m256i *>(_Last));
      const __m256i _Left_lane_reversed = _mm256_shuffle_epi32(_Left, 27);
      const __m256i _Right_lane_reversed = _mm256_shuffle_epi32(_Right, 27);
      const __m256i _Left_reversed =
          _mm256_permute4x64_epi64(_Left_lane_reversed, 78);
      const __m256i _Right_reversed =
          _mm256_permute4x64_epi64(_Right_lane_reversed, 78);
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(_First), _Right_reversed);
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(_Last), _Left_reversed);
      _First += 8;
    } while (_First != _Stop_at);
  }

  for (; _First != _Last && _First != --_Last; ++_First) {
    const unsigned int _Temp = *_First;
    *_First = *_Last;
    *_Last = _Temp;
  }
}

void BenchAvx2UnsignedIntReverse(benchmark::State &state) {
  RunTest<vector<unsigned int>>(state, static_cast<size_t>(state.range(0)), [](auto &c) {
    __std_avx2_reverse_trivially_copyable_4(&*c.begin(), &*c.end());
  });
}

BENCHMARK(BenchAvx2UnsignedIntReverse)->Range(8, 100'000);

BENCHMARK_TEMPLATE(BenchPlainBidiReverse, vector<unsigned long long>)
    ->Range(8, 100'000);
BENCHMARK_TEMPLATE(BenchStdReverse, vector<unsigned long long>)
    ->Range(8, 100'000);

BENCHMARK_MAIN();

Here are results from my 3970X; first, the important ones like vector: by the time you get to 64 elements the wins are huge. There's no reason this wouldn't apply just as much to reverse_copy (although absolute wins might be lower because memory bandwidth consumption is higher for that algorithm)

---------------------------------------------------------------------------------------------------
Benchmark                                                         Time             CPU   Iterations
---------------------------------------------------------------------------------------------------
BenchPlainBidiReverse<vector<unsigned char>>/8                 2.14 ns         2.10 ns    320000000
BenchPlainBidiReverse<vector<unsigned char>>/64                23.6 ns         23.4 ns     21333333
BenchPlainBidiReverse<vector<unsigned char>>/255                119 ns          120 ns      5600000
BenchStdReverse<vector<unsigned char>>/8                       3.57 ns         3.61 ns    203636364
BenchStdReverse<vector<unsigned char>>/64                      3.85 ns         3.85 ns    194782609
BenchStdReverse<vector<unsigned char>>/255                     14.2 ns         14.4 ns     49777778
BenchPlainBidiReverse<vector<unsigned long long>>/8            1.92 ns         1.93 ns    373333333
BenchPlainBidiReverse<vector<unsigned long long>>/64           15.4 ns         15.7 ns     49777778
BenchPlainBidiReverse<vector<unsigned long long>>/512           132 ns          131 ns      5600000
BenchPlainBidiReverse<vector<unsigned long long>>/4096         1017 ns         1004 ns       746667
BenchPlainBidiReverse<vector<unsigned long long>>/32768        8397 ns         8371 ns        89600
BenchPlainBidiReverse<vector<unsigned long long>>/100000      25675 ns        25495 ns        26353
BenchStdReverse<vector<unsigned long long>>/8                  3.46 ns         3.52 ns    213333333
BenchStdReverse<vector<unsigned long long>>/64                 8.98 ns         9.00 ns     74666667
BenchStdReverse<vector<unsigned long long>>/512                62.0 ns         61.4 ns     11200000
BenchStdReverse<vector<unsigned long long>>/4096                493 ns          488 ns      1120000
BenchStdReverse<vector<unsigned long long>>/32768              3890 ns         3836 ns       179200
BenchStdReverse<vector<unsigned long long>>/100000            11978 ns        11998 ns        56000

The full list:

D:\vclib-benchmarks\windows.x64.release>.\reverse.exe
07/21/20 11:28:22
Running .\reverse.exe
Run on (64 X 3700 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x32)
  L1 Instruction 32 KiB (x32)
  L2 Unified 512 KiB (x32)
  L3 Unified 16384 KiB (x8)
---------------------------------------------------------------------------------------------------
Benchmark                                                         Time             CPU   Iterations
---------------------------------------------------------------------------------------------------
BenchStdReverse<deque<unsigned int>>/8                         7.41 ns         7.32 ns     89600000
BenchStdReverse<deque<unsigned int>>/64                        50.8 ns         51.6 ns     10000000
BenchStdReverse<deque<unsigned int>>/512                        407 ns          410 ns      1792000
BenchStdReverse<deque<unsigned int>>/4096                      4039 ns         4028 ns       213333
BenchStdReverse<deque<unsigned int>>/32768                    26808 ns        26367 ns        24889
BenchStdReverse<deque<unsigned int>>/100000                   92349 ns        92072 ns         7467
BenchStdReverse<list<unsigned int>>/8                          3.65 ns         3.61 ns    194782609
BenchStdReverse<list<unsigned int>>/64                         36.6 ns         36.8 ns     18666667
BenchStdReverse<list<unsigned int>>/512                         314 ns          314 ns      2240000
BenchStdReverse<list<unsigned int>>/4096                       3560 ns         3530 ns       194783
BenchStdReverse<list<unsigned int>>/32768                     43682 ns        43493 ns        15448
BenchStdReverse<list<unsigned int>>/100000                   123828 ns       122768 ns         5600
BenchPlainBidiReverse<vector<unsigned char>>/8                 2.14 ns         2.10 ns    320000000
BenchPlainBidiReverse<vector<unsigned char>>/64                23.6 ns         23.4 ns     21333333
BenchPlainBidiReverse<vector<unsigned char>>/255                119 ns          120 ns      5600000
BenchStdReverse<vector<unsigned char>>/8                       3.57 ns         3.61 ns    203636364
BenchStdReverse<vector<unsigned char>>/64                      3.85 ns         3.85 ns    194782609
BenchStdReverse<vector<unsigned char>>/255                     14.2 ns         14.4 ns     49777778
BenchPlainBidiReverse<vector<unsigned short>>/8                1.90 ns         1.88 ns    373333333
BenchPlainBidiReverse<vector<unsigned short>>/64               31.9 ns         32.1 ns     22400000
BenchPlainBidiReverse<vector<unsigned short>>/512               251 ns          251 ns      2800000
BenchPlainBidiReverse<vector<unsigned short>>/4096             2047 ns         2040 ns       344615
BenchPlainBidiReverse<vector<unsigned short>>/32768           16528 ns        16741 ns        44800
BenchPlainBidiReverse<vector<unsigned short>>/65535           33049 ns        32993 ns        20364
BenchStdReverse<vector<unsigned short>>/8                      2.91 ns         2.93 ns    224000000
BenchStdReverse<vector<unsigned short>>/64                     3.82 ns         3.85 ns    186666667
BenchStdReverse<vector<unsigned short>>/512                    19.9 ns         19.9 ns     34461538
BenchStdReverse<vector<unsigned short>>/4096                    139 ns          138 ns      4977778
BenchStdReverse<vector<unsigned short>>/32768                  1124 ns         1123 ns       640000
BenchStdReverse<vector<unsigned short>>/65535                  2486 ns         2455 ns       280000
BenchPlainBidiReverse<vector<unsigned int>>/8                  1.90 ns         1.88 ns    373333333
BenchPlainBidiReverse<vector<unsigned int>>/64                 15.3 ns         15.4 ns     49777778
BenchPlainBidiReverse<vector<unsigned int>>/512                 131 ns          129 ns      4977778
BenchPlainBidiReverse<vector<unsigned int>>/4096               1017 ns         1001 ns       640000
BenchPlainBidiReverse<vector<unsigned int>>/32768              8173 ns         8196 ns        89600
BenchPlainBidiReverse<vector<unsigned int>>/100000            25026 ns        25112 ns        28000
BenchStdReverse<vector<unsigned int>>/8                        2.17 ns         2.20 ns    320000000
BenchStdReverse<vector<unsigned int>>/64                       5.26 ns         5.16 ns    112000000
BenchStdReverse<vector<unsigned int>>/512                      35.1 ns         35.3 ns     20363636
BenchStdReverse<vector<unsigned int>>/4096                      277 ns          276 ns      2488889
BenchStdReverse<vector<unsigned int>>/32768                    2231 ns         2246 ns       320000
BenchStdReverse<vector<unsigned int>>/100000                   6814 ns         6801 ns        89600
BenchUnsignedIntSseReverse/8                                   2.01 ns         2.04 ns    344615385
BenchUnsignedIntSseReverse/64                                  4.81 ns         4.76 ns    144516129
BenchUnsignedIntSseReverse/512                                 32.2 ns         32.1 ns     22400000
BenchUnsignedIntSseReverse/4096                                 250 ns          251 ns      2800000
BenchUnsignedIntSseReverse/32768                               2140 ns         2131 ns       344615
BenchUnsignedIntSseReverse/100000                              6558 ns         6557 ns       112000
BenchAvx2UnsignedIntReverse/8                                  2.11 ns         2.13 ns    344615385
BenchAvx2UnsignedIntReverse/64                                 4.73 ns         4.81 ns    149333333
BenchAvx2UnsignedIntReverse/512                                30.2 ns         30.5 ns     23578947
BenchAvx2UnsignedIntReverse/4096                                242 ns          241 ns      2986667
BenchAvx2UnsignedIntReverse/32768                              1945 ns         1967 ns       373333
BenchAvx2UnsignedIntReverse/100000                             5955 ns         5999 ns       112000
BenchPlainBidiReverse<vector<unsigned long long>>/8            1.92 ns         1.93 ns    373333333
BenchPlainBidiReverse<vector<unsigned long long>>/64           15.4 ns         15.7 ns     49777778
BenchPlainBidiReverse<vector<unsigned long long>>/512           132 ns          131 ns      5600000
BenchPlainBidiReverse<vector<unsigned long long>>/4096         1017 ns         1004 ns       746667
BenchPlainBidiReverse<vector<unsigned long long>>/32768        8397 ns         8371 ns        89600
BenchPlainBidiReverse<vector<unsigned long long>>/100000      25675 ns        25495 ns        26353
BenchStdReverse<vector<unsigned long long>>/8                  3.46 ns         3.52 ns    213333333
BenchStdReverse<vector<unsigned long long>>/64                 8.98 ns         9.00 ns     74666667
BenchStdReverse<vector<unsigned long long>>/512                62.0 ns         61.4 ns     11200000
BenchStdReverse<vector<unsigned long long>>/4096                493 ns          488 ns      1120000
BenchStdReverse<vector<unsigned long long>>/32768              3890 ns         3836 ns       179200
BenchStdReverse<vector<unsigned long long>>/100000            11978 ns        11998 ns        56000