<random>: Implement Lemire's fast integer generation by MattStephanson · Pull Request #3012 · microsoft/STL (original) (raw)

Implements @lemire's "Fast Random Integer Generation in an Interval", https://dl.acm.org/doi/10.1145/3230636 and https://arxiv.org/abs/1805.10941. Fixes #178.

I'm not happy with the x86 or LCG performance, but I've been tinkering with it for weeks and haven't been able to improve it further. I'm using a Surface Pro 8, i5-1135G7. It's plugged in and set to "Best Performance", but I'm otherwise not very knowledgeable about how to run good microbenchmarks. If anyone has any thoughts, I'd love to hear them.

Benchmark code

#include <random>
#include <benchmark/benchmark.h>

/// Test URBGs alone

/// Baseline: times one call to a default-constructed std::mt19937 per iteration.
static void BM_mt19937(benchmark::State& state) {
    std::mt19937 engine;
    for (auto _ : state) {
        // DoNotOptimize keeps the generated value from being discarded as dead code.
        benchmark::DoNotOptimize(engine());
    }
}
BENCHMARK(BM_mt19937);

/// Baseline: times one call to a default-constructed std::mt19937_64 per iteration.
static void BM_mt19937_64(benchmark::State& state) {
    std::mt19937_64 engine;
    for (auto _ : state) {
        // DoNotOptimize keeps the generated value from being discarded as dead code.
        benchmark::DoNotOptimize(engine());
    }
}
BENCHMARK(BM_mt19937_64);

/// Baseline: times one call to a default-constructed std::minstd_rand per iteration.
static void BM_lcg(benchmark::State& state) {
    std::minstd_rand engine;
    for (auto _ : state) {
        // DoNotOptimize keeps the generated value from being discarded as dead code.
        benchmark::DoNotOptimize(engine());
    }
}
BENCHMARK(BM_lcg);

/// Draws a value in [10'000'000, 20'000'000] from std::random_device, so the
/// result is only known at run time.
uint32_t GetMax() {
    std::random_device entropy;
    // Element type is int, matching what the integer-literal bounds deduce to.
    std::uniform_int_distribution<int> bound_dist(10'000'000, 20'000'000);
    return static_cast<uint32_t>(bound_dist(entropy));
}

// Bound picked at run time via GetMax(): a random divisor, so the compiler
// cannot apply strength reduction to operations that use it.
static const uint32_t max{GetMax()};

/// Test mt19937

/// Times std::_Rng_from_urng<uint32_t, std::mt19937> invoked with the runtime bound `max`.
static void BM_raw_mt19937_old(benchmark::State& state) {
    using Adaptor = std::_Rng_from_urng<uint32_t, std::mt19937>;

    std::mt19937 engine;
    Adaptor bounded(engine);
    for (auto _ : state) {
        benchmark::DoNotOptimize(bounded(max));
    }
}
BENCHMARK(BM_raw_mt19937_old);

/// Times std::_Rng_from_urng_v2<uint32_t, std::mt19937> invoked with the runtime bound `max`.
static void BM_raw_mt19937_new(benchmark::State& state) {
    using Adaptor = std::_Rng_from_urng_v2<uint32_t, std::mt19937>;

    std::mt19937 engine;
    Adaptor bounded(engine);
    for (auto _ : state) {
        benchmark::DoNotOptimize(bounded(max));
    }
}
BENCHMARK(BM_raw_mt19937_new);

/// Test mt19937_64

/// Times std::_Rng_from_urng<uint64_t, std::mt19937_64> invoked with the runtime bound `max`.
static void BM_raw_mt19937_64_old(benchmark::State& state) {
    using Adaptor = std::_Rng_from_urng<uint64_t, std::mt19937_64>;

    std::mt19937_64 engine;
    Adaptor bounded(engine);
    for (auto _ : state) {
        benchmark::DoNotOptimize(bounded(max));
    }
}
BENCHMARK(BM_raw_mt19937_64_old);

/// Times std::_Rng_from_urng_v2<uint64_t, std::mt19937_64> invoked with the runtime bound `max`.
static void BM_raw_mt19937_64_new(benchmark::State& state) {
    using Adaptor = std::_Rng_from_urng_v2<uint64_t, std::mt19937_64>;

    std::mt19937_64 engine;
    Adaptor bounded(engine);
    for (auto _ : state) {
        benchmark::DoNotOptimize(bounded(max));
    }
}
BENCHMARK(BM_raw_mt19937_64_new);

/// Test minstd_rand

/// Times std::_Rng_from_urng<uint32_t, std::minstd_rand> invoked with the runtime bound `max`.
static void BM_raw_lcg_old(benchmark::State& state) {
    using Adaptor = std::_Rng_from_urng<uint32_t, std::minstd_rand>;

    std::minstd_rand engine;
    Adaptor bounded(engine);
    for (auto _ : state) {
        benchmark::DoNotOptimize(bounded(max));
    }
}
BENCHMARK(BM_raw_lcg_old);

/// Times std::_Rng_from_urng_v2<uint32_t, std::minstd_rand> invoked with the runtime bound `max`.
static void BM_raw_lcg_new(benchmark::State& state) {
    using Adaptor = std::_Rng_from_urng_v2<uint32_t, std::minstd_rand>;

    std::minstd_rand engine;
    Adaptor bounded(engine);
    for (auto _ : state) {
        benchmark::DoNotOptimize(bounded(max));
    }
}
BENCHMARK(BM_raw_lcg_new);

// Google Benchmark macro that supplies the program's main(), which runs the
// benchmarks registered with BENCHMARK(...) in this file.
BENCHMARK_MAIN();

Benchmark results

x86

2022-08-08T19:53:31-07:00
Running C:\Users\steph\source\repos\sandbox\Release\sandbox.exe
Run on (8 X 2424.25 MHz CPU s)
CPU Caches:
  L1 Data 48 KiB (x4)
  L1 Instruction 32 KiB (x4)
  L2 Unified 1280 KiB (x4)
  L3 Unified 8192 KiB (x1)
----------------------------------------------------------------
Benchmark                      Time             CPU   Iterations
----------------------------------------------------------------
BM_mt19937                  4.38 ns         4.39 ns    160000000
BM_mt19937_64               9.79 ns         9.77 ns     64000000
BM_lcg                      9.39 ns         8.54 ns     64000000
BM_raw_mt19937_old          7.75 ns         7.67 ns    112000000
BM_raw_mt19937_new          5.18 ns         5.16 ns    100000000
BM_raw_mt19937_64_old       21.2 ns         21.0 ns     32000000
BM_raw_mt19937_64_new       19.0 ns         18.8 ns     37333333
BM_raw_lcg_old              25.9 ns         26.1 ns     26352941
BM_raw_lcg_new              28.2 ns         28.3 ns     24888889

x64

2022-08-08T19:54:41-07:00
Running C:\Users\steph\source\repos\sandbox\x64\Release\sandbox.exe
Run on (8 X 2444.76 MHz CPU s)
CPU Caches:
  L1 Data 48 KiB (x4)
  L1 Instruction 32 KiB (x4)
  L2 Unified 1280 KiB (x4)
  L3 Unified 8192 KiB (x1)
----------------------------------------------------------------
Benchmark                      Time             CPU   Iterations
----------------------------------------------------------------
BM_mt19937                  3.77 ns         3.75 ns    179200000
BM_mt19937_64               3.87 ns         3.84 ns    179200000
BM_lcg                      3.96 ns         4.01 ns    179200000
BM_raw_mt19937_old          5.70 ns         5.72 ns    112000000
BM_raw_mt19937_new          4.20 ns         4.24 ns    165925926
BM_raw_mt19937_64_old       8.50 ns         8.58 ns     74666667
BM_raw_mt19937_64_new       4.64 ns         4.50 ns    149333333
BM_raw_lcg_old              15.2 ns         15.4 ns     49777778
BM_raw_lcg_new              17.3 ns         17.3 ns     40727273