Vectorize reverse_copy() by pawREP · Pull Request #804 · microsoft/STL (original) (raw)
Here's the benchmark I wrote way back when I added the vectorized `reverse`:
#include <algorithm>
#include <benchmark/benchmark.h>
#include <deque>
#include <functional>
#include <list>
#include <numeric>
#include <stdlib.h>
#include <utility>
#include <vector>
using namespace std;
// Ends the process with exit status 1 when the checked condition is false;
// returns normally when it is true.
void verify(bool b) {
    if (b) {
        return;
    }
    exit(1);
}
// Straightforward in-place reversal of [_First, _Last) for bidirectional iterators:
// walks inward from both ends, exchanging elements through a temporary copy, and
// stops once the two iterators meet or cross.
template <class _BidIt>
void plain_bidi_reverse(_BidIt _First, _BidIt _Last) {
    while (_First != _Last) {
        --_Last;
        if (_First == _Last) {
            break; // odd-length range: the middle element stays where it is
        }
        const auto _Saved = *_First;
        *_First = *_Last;
        *_Last = _Saved;
        ++_First;
    }
}
// Shared benchmark driver. Builds a Container of dataSize elements holding 1, 2, 3, ...,
// checks that one call to fn leaves the data in descending order and a second call
// restores ascending order, then calls fn(data) once per benchmark iteration.
template <class Container, class TestedFn>
inline void RunTest(benchmark::State &state, size_t dataSize, TestedFn fn) {
    using Element = typename Container::value_type;

    Container data(dataSize);
    std::iota(data.begin(), data.end(), static_cast<Element>(1));

    // Sanity check before timing: a single reversal must yield descending order...
    fn(data);
    const bool descending = std::is_sorted(data.begin(), data.end(), std::greater<>{});
    verify(descending);

    // ...and a second reversal must bring back the original ascending order.
    fn(data);
    const bool ascending = std::is_sorted(data.begin(), data.end(), std::less<>{});
    verify(ascending);

    for (auto _ : state) {
        static_cast<void>(_); // silences the unused-variable warning
        fn(data);
    }
}
// Benchmarks plain_bidi_reverse over a Container whose element count is the
// benchmark's first range argument.
template <class Container>
void BenchPlainBidiReverse(benchmark::State &state) {
    const auto elementCount = static_cast<size_t>(state.range(0));
    const auto reverseInPlace = [](auto &c) { plain_bidi_reverse(c.begin(), c.end()); };
    RunTest<Container>(state, elementCount, reverseInPlace);
}
// Benchmarks the standard library's reverse over a Container whose element count is
// the benchmark's first range argument.
template <class Container>
void BenchStdReverse(benchmark::State &state) {
    const auto elementCount = static_cast<size_t>(state.range(0));
    const auto reverseInPlace = [](auto &c) { std::reverse(c.begin(), c.end()); };
    RunTest<Container>(state, elementCount, reverseInPlace);
}
// std::reverse on non-vector containers (deque and list), sizes 8 .. 100'000.
BENCHMARK_TEMPLATE(BenchStdReverse, deque<unsigned int>)->Range(8, 100'000);
BENCHMARK_TEMPLATE(BenchStdReverse, list<unsigned int>)->Range(8, 100'000);
// vector<unsigned char>: capped at 255 elements. RunTest fills the container with
// 1, 2, 3, ..., so a 256th element would wrap to 0 and fail its is_sorted checks.
BENCHMARK_TEMPLATE(BenchPlainBidiReverse, vector<unsigned char>)->Range(8, 255);
BENCHMARK_TEMPLATE(BenchStdReverse, vector<unsigned char>)->Range(8, 255);
// vector<unsigned short>: capped at 65535 elements for the same wrap-around reason.
BENCHMARK_TEMPLATE(BenchPlainBidiReverse, vector<unsigned short>)->Range(8, 65535);
BENCHMARK_TEMPLATE(BenchStdReverse, vector<unsigned short>)->Range(8, 65535);
// vector<unsigned int>: plain loop versus std::reverse, sizes 8 .. 100'000.
BENCHMARK_TEMPLATE(BenchPlainBidiReverse, vector<unsigned int>)->Range(8, 100'000);
BENCHMARK_TEMPLATE(BenchStdReverse, vector<unsigned int>)->Range(8, 100'000);
// Instruction-set feature flags, declared here and queried with _bittest by the
// hand-written reverse routines in this file.
// NOTE(review): presumably supplied by the MSVC runtime -- confirm.
// A declaration directly inside a linkage-specification is already treated as
// `extern` and must not repeat the storage class, so only `extern "C"` is written.
extern "C" long __isa_enabled;
// Bit positions tested in __isa_enabled for SSE2 and AVX2 support respectively.
constexpr long __ISA_AVAILABLE_SSE2 = 1;
constexpr long __ISA_AVAILABLE_AVX2 = 5;
#include <emmintrin.h>
#include <intrin.h>
#include <xmmintrin.h>
// Reverses the unsigned int range [_First, _Last) in place. Ranges longer than 8
// elements are processed four elements at a time from each end using SSE2 loads,
// shuffles and stores; whatever remains in the middle is finished by a scalar swap loop.
extern "C" void _cdecl __std_sse_reverse_trivially_copyable_4(
unsigned int *_First, unsigned int *_Last) throw() {
// When _M_X64 is not defined, the SSE2 bit of __isa_enabled is also tested at run time;
// when it is defined that test is compiled out (presumably because SSE2 is always
// available on x64 -- confirm).
if (_Last - _First > 8
#ifndef _M_X64
&& _bittest(&__isa_enabled, __ISA_AVAILABLE_SSE2)
#endif /* _M_X64 */
) {
// Each iteration consumes 4 elements at the front and 4 at the back, so the front
// pointer advances (count / 8) * 4 elements in total before the vector loop stops.
unsigned int *_Stop_at = _First + ((_Last - _First) >> 3 << 2);
do {
_Last -= 4;
// Both 4-element blocks are loaded before either one is overwritten.
const __m128i _Left =
_mm_loadu_si128(reinterpret_cast<__m128i *>(_First));
const __m128i _Right =
_mm_loadu_si128(reinterpret_cast<__m128i *>(_Last));
// Immediate 27 == 0b00011011 selects source lanes 3, 2, 1, 0, i.e. it reverses the
// order of the four 32-bit elements.
const __m128i _Left_reversed = _mm_shuffle_epi32(_Left, 27);
const __m128i _Right_reversed = _mm_shuffle_epi32(_Right, 27);
// Swap sides: the reversed back block is written to the front and vice versa.
_mm_storeu_si128(reinterpret_cast<__m128i *>(_First), _Right_reversed);
_mm_storeu_si128(reinterpret_cast<__m128i *>(_Last), _Left_reversed);
_First += 4;
} while (_First != _Stop_at);
}
// Scalar tail: swaps the elements the vector loop left in the middle, or the whole
// range when the vector path was not taken.
for (; _First != _Last && _First != --_Last; ++_First) {
const unsigned int _Temp = *_First;
*_First = *_Last;
*_Last = _Temp;
}
}
// Benchmarks __std_sse_reverse_trivially_copyable_4 on a vector<unsigned int> whose
// element count is the benchmark's first range argument.
void BenchUnsignedIntSseReverse(benchmark::State &state) {
    RunTest<std::vector<unsigned int>>(state, static_cast<size_t>(state.range(0)), [](auto &c) {
        // Build the pointer range from data() and size(). Writing &*c.end() would
        // dereference the past-the-end iterator, which is undefined behavior and
        // asserts under checked iterators.
        unsigned int *const first = c.data();
        __std_sse_reverse_trivially_copyable_4(first, first + c.size());
    });
}
// Run for sizes 8 .. 100'000.
BENCHMARK(BenchUnsignedIntSseReverse)->Range(8, 100'000);
// Reverses the unsigned int range [_First, _Last) in place. When the range is longer
// than 16 elements and the AVX2 bit of __isa_enabled is set, it is processed eight
// elements at a time from each end using 256-bit loads, shuffles and stores; whatever
// remains in the middle is finished by a scalar swap loop.
extern "C" void _cdecl __std_avx2_reverse_trivially_copyable_4(
unsigned int *_First, unsigned int *_Last) throw() {
if (_Last - _First > 16 && _bittest(&__isa_enabled, __ISA_AVAILABLE_AVX2)) {
// Each iteration consumes 8 elements at the front and 8 at the back, so the front
// pointer advances (count / 16) * 8 elements in total before the vector loop stops.
unsigned int *_Stop_at = _First + ((_Last - _First) >> 4 << 3);
do {
_Last -= 8;
// Both 8-element blocks are loaded before either one is overwritten.
const __m256i _Left =
_mm256_loadu_si256(reinterpret_cast<__m256i *>(_First));
const __m256i _Right =
_mm256_loadu_si256(reinterpret_cast<__m256i *>(_Last));
// Step 1: immediate 27 == 0b00011011 reverses the four 32-bit elements inside each
// 128-bit half independently.
const __m256i _Left_lane_reversed = _mm256_shuffle_epi32(_Left, 27);
const __m256i _Right_lane_reversed = _mm256_shuffle_epi32(_Right, 27);
// Step 2: immediate 78 == 0b01001110 selects 64-bit chunks 2, 3, 0, 1, i.e. it swaps
// the two 128-bit halves, which completes the 8-element reversal.
const __m256i _Left_reversed =
_mm256_permute4x64_epi64(_Left_lane_reversed, 78);
const __m256i _Right_reversed =
_mm256_permute4x64_epi64(_Right_lane_reversed, 78);
// Swap sides: the reversed back block is written to the front and vice versa.
_mm256_storeu_si256(reinterpret_cast<__m256i *>(_First), _Right_reversed);
_mm256_storeu_si256(reinterpret_cast<__m256i *>(_Last), _Left_reversed);
_First += 8;
} while (_First != _Stop_at);
}
// Scalar tail: swaps the elements the vector loop left in the middle, or the whole
// range when the vector path was not taken.
for (; _First != _Last && _First != --_Last; ++_First) {
const unsigned int _Temp = *_First;
*_First = *_Last;
*_Last = _Temp;
}
}
// Benchmarks __std_avx2_reverse_trivially_copyable_4 on a vector<unsigned int> whose
// element count is the benchmark's first range argument.
void BenchAvx2UnsignedIntReverse(benchmark::State &state) {
    RunTest<std::vector<unsigned int>>(state, static_cast<size_t>(state.range(0)), [](auto &c) {
        // Build the pointer range from data() and size(). Writing &*c.end() would
        // dereference the past-the-end iterator, which is undefined behavior and
        // asserts under checked iterators.
        unsigned int *const first = c.data();
        __std_avx2_reverse_trivially_copyable_4(first, first + c.size());
    });
}
// Run for sizes 8 .. 100'000.
BENCHMARK(BenchAvx2UnsignedIntReverse)->Range(8, 100'000);
// vector<unsigned long long>: plain loop versus std::reverse, sizes 8 .. 100'000.
BENCHMARK_TEMPLATE(BenchPlainBidiReverse, vector<unsigned long long>)
->Range(8, 100'000);
BENCHMARK_TEMPLATE(BenchStdReverse, vector<unsigned long long>)
->Range(8, 100'000);
// Google Benchmark macro that supplies the program entry point and runs the
// benchmarks registered above.
BENCHMARK_MAIN();
Here are the results from my 3970X. First, the important cases such as `vector`: by the time you get to 64 elements, the wins are huge. There's no reason this wouldn't apply just as much to `reverse_copy` (although the absolute wins might be lower, because that algorithm consumes more memory bandwidth).
---------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------------------------------------------
BenchPlainBidiReverse<vector<unsigned char>>/8 2.14 ns 2.10 ns 320000000
BenchPlainBidiReverse<vector<unsigned char>>/64 23.6 ns 23.4 ns 21333333
BenchPlainBidiReverse<vector<unsigned char>>/255 119 ns 120 ns 5600000
BenchStdReverse<vector<unsigned char>>/8 3.57 ns 3.61 ns 203636364
BenchStdReverse<vector<unsigned char>>/64 3.85 ns 3.85 ns 194782609
BenchStdReverse<vector<unsigned char>>/255 14.2 ns 14.4 ns 49777778
BenchPlainBidiReverse<vector<unsigned long long>>/8 1.92 ns 1.93 ns 373333333
BenchPlainBidiReverse<vector<unsigned long long>>/64 15.4 ns 15.7 ns 49777778
BenchPlainBidiReverse<vector<unsigned long long>>/512 132 ns 131 ns 5600000
BenchPlainBidiReverse<vector<unsigned long long>>/4096 1017 ns 1004 ns 746667
BenchPlainBidiReverse<vector<unsigned long long>>/32768 8397 ns 8371 ns 89600
BenchPlainBidiReverse<vector<unsigned long long>>/100000 25675 ns 25495 ns 26353
BenchStdReverse<vector<unsigned long long>>/8 3.46 ns 3.52 ns 213333333
BenchStdReverse<vector<unsigned long long>>/64 8.98 ns 9.00 ns 74666667
BenchStdReverse<vector<unsigned long long>>/512 62.0 ns 61.4 ns 11200000
BenchStdReverse<vector<unsigned long long>>/4096 493 ns 488 ns 1120000
BenchStdReverse<vector<unsigned long long>>/32768 3890 ns 3836 ns 179200
BenchStdReverse<vector<unsigned long long>>/100000 11978 ns 11998 ns 56000
The full list:
D:\vclib-benchmarks\windows.x64.release>.\reverse.exe
07/21/20 11:28:22
Running .\reverse.exe
Run on (64 X 3700 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x32)
L1 Instruction 32 KiB (x32)
L2 Unified 512 KiB (x32)
L3 Unified 16384 KiB (x8)
---------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------------------------------------------
BenchStdReverse<deque<unsigned int>>/8 7.41 ns 7.32 ns 89600000
BenchStdReverse<deque<unsigned int>>/64 50.8 ns 51.6 ns 10000000
BenchStdReverse<deque<unsigned int>>/512 407 ns 410 ns 1792000
BenchStdReverse<deque<unsigned int>>/4096 4039 ns 4028 ns 213333
BenchStdReverse<deque<unsigned int>>/32768 26808 ns 26367 ns 24889
BenchStdReverse<deque<unsigned int>>/100000 92349 ns 92072 ns 7467
BenchStdReverse<list<unsigned int>>/8 3.65 ns 3.61 ns 194782609
BenchStdReverse<list<unsigned int>>/64 36.6 ns 36.8 ns 18666667
BenchStdReverse<list<unsigned int>>/512 314 ns 314 ns 2240000
BenchStdReverse<list<unsigned int>>/4096 3560 ns 3530 ns 194783
BenchStdReverse<list<unsigned int>>/32768 43682 ns 43493 ns 15448
BenchStdReverse<list<unsigned int>>/100000 123828 ns 122768 ns 5600
BenchPlainBidiReverse<vector<unsigned char>>/8 2.14 ns 2.10 ns 320000000
BenchPlainBidiReverse<vector<unsigned char>>/64 23.6 ns 23.4 ns 21333333
BenchPlainBidiReverse<vector<unsigned char>>/255 119 ns 120 ns 5600000
BenchStdReverse<vector<unsigned char>>/8 3.57 ns 3.61 ns 203636364
BenchStdReverse<vector<unsigned char>>/64 3.85 ns 3.85 ns 194782609
BenchStdReverse<vector<unsigned char>>/255 14.2 ns 14.4 ns 49777778
BenchPlainBidiReverse<vector<unsigned short>>/8 1.90 ns 1.88 ns 373333333
BenchPlainBidiReverse<vector<unsigned short>>/64 31.9 ns 32.1 ns 22400000
BenchPlainBidiReverse<vector<unsigned short>>/512 251 ns 251 ns 2800000
BenchPlainBidiReverse<vector<unsigned short>>/4096 2047 ns 2040 ns 344615
BenchPlainBidiReverse<vector<unsigned short>>/32768 16528 ns 16741 ns 44800
BenchPlainBidiReverse<vector<unsigned short>>/65535 33049 ns 32993 ns 20364
BenchStdReverse<vector<unsigned short>>/8 2.91 ns 2.93 ns 224000000
BenchStdReverse<vector<unsigned short>>/64 3.82 ns 3.85 ns 186666667
BenchStdReverse<vector<unsigned short>>/512 19.9 ns 19.9 ns 34461538
BenchStdReverse<vector<unsigned short>>/4096 139 ns 138 ns 4977778
BenchStdReverse<vector<unsigned short>>/32768 1124 ns 1123 ns 640000
BenchStdReverse<vector<unsigned short>>/65535 2486 ns 2455 ns 280000
BenchPlainBidiReverse<vector<unsigned int>>/8 1.90 ns 1.88 ns 373333333
BenchPlainBidiReverse<vector<unsigned int>>/64 15.3 ns 15.4 ns 49777778
BenchPlainBidiReverse<vector<unsigned int>>/512 131 ns 129 ns 4977778
BenchPlainBidiReverse<vector<unsigned int>>/4096 1017 ns 1001 ns 640000
BenchPlainBidiReverse<vector<unsigned int>>/32768 8173 ns 8196 ns 89600
BenchPlainBidiReverse<vector<unsigned int>>/100000 25026 ns 25112 ns 28000
BenchStdReverse<vector<unsigned int>>/8 2.17 ns 2.20 ns 320000000
BenchStdReverse<vector<unsigned int>>/64 5.26 ns 5.16 ns 112000000
BenchStdReverse<vector<unsigned int>>/512 35.1 ns 35.3 ns 20363636
BenchStdReverse<vector<unsigned int>>/4096 277 ns 276 ns 2488889
BenchStdReverse<vector<unsigned int>>/32768 2231 ns 2246 ns 320000
BenchStdReverse<vector<unsigned int>>/100000 6814 ns 6801 ns 89600
BenchUnsignedIntSseReverse/8 2.01 ns 2.04 ns 344615385
BenchUnsignedIntSseReverse/64 4.81 ns 4.76 ns 144516129
BenchUnsignedIntSseReverse/512 32.2 ns 32.1 ns 22400000
BenchUnsignedIntSseReverse/4096 250 ns 251 ns 2800000
BenchUnsignedIntSseReverse/32768 2140 ns 2131 ns 344615
BenchUnsignedIntSseReverse/100000 6558 ns 6557 ns 112000
BenchAvx2UnsignedIntReverse/8 2.11 ns 2.13 ns 344615385
BenchAvx2UnsignedIntReverse/64 4.73 ns 4.81 ns 149333333
BenchAvx2UnsignedIntReverse/512 30.2 ns 30.5 ns 23578947
BenchAvx2UnsignedIntReverse/4096 242 ns 241 ns 2986667
BenchAvx2UnsignedIntReverse/32768 1945 ns 1967 ns 373333
BenchAvx2UnsignedIntReverse/100000 5955 ns 5999 ns 112000
BenchPlainBidiReverse<vector<unsigned long long>>/8 1.92 ns 1.93 ns 373333333
BenchPlainBidiReverse<vector<unsigned long long>>/64 15.4 ns 15.7 ns 49777778
BenchPlainBidiReverse<vector<unsigned long long>>/512 132 ns 131 ns 5600000
BenchPlainBidiReverse<vector<unsigned long long>>/4096 1017 ns 1004 ns 746667
BenchPlainBidiReverse<vector<unsigned long long>>/32768 8397 ns 8371 ns 89600
BenchPlainBidiReverse<vector<unsigned long long>>/100000 25675 ns 25495 ns 26353
BenchStdReverse<vector<unsigned long long>>/8 3.46 ns 3.52 ns 213333333
BenchStdReverse<vector<unsigned long long>>/64 8.98 ns 9.00 ns 74666667
BenchStdReverse<vector<unsigned long long>>/512 62.0 ns 61.4 ns 11200000
BenchStdReverse<vector<unsigned long long>>/4096 493 ns 488 ns 1120000
BenchStdReverse<vector<unsigned long long>>/32768 3890 ns 3836 ns 179200
BenchStdReverse<vector<unsigned long long>>/100000 11978 ns 11998 ns 56000