Vectorize search_n for CPUs with SSE4.2 but not AVX2 support; handle AVX2 tail by AlexGuteniev · Pull Request #5544 · microsoft/STL (original) (raw)

bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/40

42.0 ns

41.2 ns

1.02

bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/18

57.1 ns

57.2 ns

1.00

bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/16

62.5 ns

63.2 ns

0.99

bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/14

69.2 ns

68.7 ns

1.01

bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/10

84.7 ns

84.8 ns

1.00

bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/8

103 ns

118 ns

0.87 📈

bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/5

161 ns

117 ns

1.38 📉

bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/4

191 ns

134 ns

1.43 📉

bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/3

248 ns

117 ns

2.12 📉

bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/2

373 ns

116 ns

3.22 📉

bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/1

18.1 ns

28.7 ns

0.63

bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/40

42.1 ns

41.4 ns

1.02

bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/18

57.7 ns

57.5 ns

1.00

bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/16

63.0 ns

62.9 ns

1.00

bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/14

70.1 ns

69.8 ns

1.00

bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/10

86.6 ns

85.7 ns

1.01

bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/8

104 ns

120 ns

0.87 📈

bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/5

157 ns

120 ns

1.31 📉

bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/4

189 ns

118 ns

1.60 📉

bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/3

244 ns

118 ns

2.07 📉

bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/2

369 ns

117 ns

3.15 📉

bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/1

17.7 ns

28.4 ns

0.62

bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/40

353 ns

355 ns

0.99

bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/18

461 ns

459 ns

1.00

bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/16

474 ns

478 ns

0.99

bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/14

513 ns

530 ns

0.97

bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/10

636 ns

633 ns

1.00

bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/8

739 ns

216 ns

3.42 📉

bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/5

930 ns

213 ns

4.37 📉

bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/4

972 ns

218 ns

4.46 📉

bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/3

1014 ns

222 ns

4.57 📉

bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/2

1053 ns

220 ns

4.79 📉

bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/1

50.2 ns

38.7 ns

1.30

bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/40

356 ns

355 ns

1.00

bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/18

450 ns

463 ns

0.97

bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/16

480 ns

483 ns

0.99

bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/14

516 ns

521 ns

0.99

bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/10

618 ns

645 ns

0.96

bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/8

736 ns

216 ns

3.41 📉

bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/5

951 ns

217 ns

4.38 📉

bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/4

1039 ns

216 ns

4.81 📉

bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/3

1086 ns

214 ns

5.07 📉

bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/2

1855 ns

218 ns

8.51 📉

bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/1

51.8 ns

38.6 ns

1.34

bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/40

58.8 ns

45.4 ns

1.30

bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/18

74.8 ns

62.7 ns

1.19

bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/16

79.1 ns

68.3 ns

1.16

bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/14

82.4 ns

84.6 ns

0.97

bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/10

102 ns

108 ns

0.94

bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/8

131 ns

135 ns

0.97

bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/5

197 ns

200 ns

0.99

bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/4

233 ns

162 ns

1.44 📉

bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/3

311 ns

161 ns

1.93 📉

bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/2

450 ns

161 ns

2.80 📉

bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/1

36.1 ns

35.1 ns

1.03

bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/40

53.4 ns

46.6 ns

1.15

bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/18

68.9 ns

69.3 ns

0.99

bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/16

74.6 ns

75.7 ns

0.99

bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/14

81.4 ns

84.2 ns

0.97

bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/10

104 ns

112 ns

0.93

bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/8

126 ns

131 ns

0.96

bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/5

191 ns

195 ns

0.98

bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/4

228 ns

161 ns

1.42 📉

bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/3

298 ns

161 ns

1.85 📉

bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/2

454 ns

160 ns

2.84 📉

bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/1

38.0 ns

36.0 ns

1.06

bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/40

613 ns

372 ns

1.65

bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/18

651 ns

449 ns

1.45

bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/16

710 ns

476 ns

1.49

bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/14

732 ns

517 ns

1.42

bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/10

812 ns

633 ns

1.28

bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/8

873 ns

706 ns

1.24

bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/5

986 ns

868 ns

1.14

bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/4

1077 ns

331 ns

3.25 📉

bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/3

1152 ns

327 ns

3.52 📉

bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/2

1138 ns

333 ns

3.42 📉

bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/1

81.8 ns

79.8 ns

1.03

bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/40

610 ns

370 ns

1.65

bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/18

660 ns

454 ns

1.45

bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/16

722 ns

484 ns

1.49

bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/14

724 ns

507 ns

1.43

bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/10

816 ns

625 ns

1.31

bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/8

894 ns

715 ns

1.25

bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/5

988 ns

880 ns

1.12

bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/4

1061 ns

329 ns

3.22 📉

bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/3

1131 ns

330 ns

3.43 📉

bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/2

1124 ns

327 ns

3.44 📉

bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/1

81.1 ns

81.2 ns

1.00

bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/40

45.9 ns

51.4 ns

0.89

bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/18

67.8 ns

70.1 ns

0.97

bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/16

74.2 ns

77.3 ns

0.96

bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/14

82.2 ns

84.8 ns

0.97

bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/10

104 ns

105 ns

0.99

bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/8

125 ns

128 ns

0.98

bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/5

184 ns

187 ns

0.98

bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/4

231 ns

227 ns

1.02

bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/3

294 ns

292 ns

1.01

bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/2

430 ns

244 ns

1.76 📉

bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/1

69.9 ns

68.4 ns

1.02

bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/40

46.4 ns

51.2 ns

0.91

bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/18

68.5 ns

71.5 ns

0.96

bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/16

74.5 ns

77.3 ns

0.96

bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/14

84.3 ns

85.0 ns

0.99

bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/10

107 ns

104 ns

1.03

bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/8

131 ns

125 ns

1.05

bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/5

193 ns

191 ns

1.01

bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/4

220 ns

226 ns

0.97

bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/3

293 ns

294 ns

1.00

bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/2

441 ns

243 ns

1.81 📉

bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/1

70.0 ns

70.1 ns

1.00

bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/40

373 ns

369 ns

1.01

bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/18

501 ns

521 ns

0.96

bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/16

531 ns

551 ns

0.96

bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/14

565 ns

591 ns

0.96

bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/10

699 ns

771 ns

0.91

bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/8

831 ns

938 ns

0.89

bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/5

1033 ns

1069 ns

0.97

bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/4

1154 ns

1221 ns

0.95

bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/3

1231 ns

1280 ns

0.96

bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/2

1454 ns

462 ns

3.15 📉

bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/1

155 ns

146 ns

1.06

bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/40

377 ns

408 ns

0.92

bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/18

496 ns

555 ns

0.89

bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/16

529 ns

541 ns

0.98

bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/14

562 ns

601 ns

0.94

bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/10

687 ns

772 ns

0.89

bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/8

799 ns

1005 ns

0.80

bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/5

1031 ns

1276 ns

0.81

bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/4

1157 ns

1316 ns

0.88

bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/3

1246 ns

1371 ns

0.91

bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/2

1424 ns

514 ns

2.77 📉

bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/1

148 ns

164 ns

0.90