Vectorize search_n for CPUs with SSE4.2 but not AVX2 support; handle AVX2 tail by AlexGuteniev · Pull Request #5544 · microsoft/STL (original) (raw)
bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/40
42.0 ns
41.2 ns
1.02
bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/18
57.1 ns
57.2 ns
1.00
bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/16
62.5 ns
63.2 ns
0.99
bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/14
69.2 ns
68.7 ns
1.01
bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/10
84.7 ns
84.8 ns
1.00
bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/8
103 ns
118 ns
0.87 📈
bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/5
161 ns
117 ns
1.38 📉
bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/4
191 ns
134 ns
1.43 📉
bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/3
248 ns
117 ns
2.12 📉
bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/2
373 ns
116 ns
3.22 📉
bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/1
18.1 ns
28.7 ns
0.63
bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/40
42.1 ns
41.4 ns
1.02
bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/18
57.7 ns
57.5 ns
1.00
bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/16
63.0 ns
62.9 ns
1.00
bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/14
70.1 ns
69.8 ns
1.00
bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/10
86.6 ns
85.7 ns
1.01
bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/8
104 ns
120 ns
0.87 📈
bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/5
157 ns
120 ns
1.31 📉
bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/4
189 ns
118 ns
1.60 📉
bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/3
244 ns
118 ns
2.07 📉
bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/2
369 ns
117 ns
3.15 📉
bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/1
17.7 ns
28.4 ns
0.62
bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/40
353 ns
355 ns
0.99
bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/18
461 ns
459 ns
1.00
bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/16
474 ns
478 ns
0.99
bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/14
513 ns
530 ns
0.97
bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/10
636 ns
633 ns
1.00
bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/8
739 ns
216 ns
3.42 📉
bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/5
930 ns
213 ns
4.37 📉
bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/4
972 ns
218 ns
4.46 📉
bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/3
1014 ns
222 ns
4.57 📉
bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/2
1053 ns
220 ns
4.79 📉
bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/1
50.2 ns
38.7 ns
1.30
bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/40
356 ns
355 ns
1.00
bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/18
450 ns
463 ns
0.97
bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/16
480 ns
483 ns
0.99
bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/14
516 ns
521 ns
0.99
bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/10
618 ns
645 ns
0.96
bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/8
736 ns
216 ns
3.41 📉
bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/5
951 ns
217 ns
4.38 📉
bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/4
1039 ns
216 ns
4.81 📉
bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/3
1086 ns
214 ns
5.07 📉
bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/2
1855 ns
218 ns
8.51 📉
bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/1
51.8 ns
38.6 ns
1.34
bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/40
58.8 ns
45.4 ns
1.30
bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/18
74.8 ns
62.7 ns
1.19
bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/16
79.1 ns
68.3 ns
1.16
bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/14
82.4 ns
84.6 ns
0.97
bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/10
102 ns
108 ns
0.94
bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/8
131 ns
135 ns
0.97
bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/5
197 ns
200 ns
0.99
bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/4
233 ns
162 ns
1.44 📉
bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/3
311 ns
161 ns
1.93 📉
bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/2
450 ns
161 ns
2.80 📉
bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/1
36.1 ns
35.1 ns
1.03
bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/40
53.4 ns
46.6 ns
1.15
bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/18
68.9 ns
69.3 ns
0.99
bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/16
74.6 ns
75.7 ns
0.99
bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/14
81.4 ns
84.2 ns
0.97
bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/10
104 ns
112 ns
0.93
bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/8
126 ns
131 ns
0.96
bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/5
191 ns
195 ns
0.98
bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/4
228 ns
161 ns
1.42 📉
bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/3
298 ns
161 ns
1.85 📉
bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/2
454 ns
160 ns
2.84 📉
bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/1
38.0 ns
36.0 ns
1.06
bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/40
613 ns
372 ns
1.65
bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/18
651 ns
449 ns
1.45
bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/16
710 ns
476 ns
1.49
bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/14
732 ns
517 ns
1.42
bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/10
812 ns
633 ns
1.28
bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/8
873 ns
706 ns
1.24
bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/5
986 ns
868 ns
1.14
bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/4
1077 ns
331 ns
3.25 📉
bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/3
1152 ns
327 ns
3.52 📉
bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/2
1138 ns
333 ns
3.42 📉
bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/1
81.8 ns
79.8 ns
1.03
bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/40
610 ns
370 ns
1.65
bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/18
660 ns
454 ns
1.45
bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/16
722 ns
484 ns
1.49
bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/14
724 ns
507 ns
1.43
bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/10
816 ns
625 ns
1.31
bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/8
894 ns
715 ns
1.25
bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/5
988 ns
880 ns
1.12
bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/4
1061 ns
329 ns
3.22 📉
bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/3
1131 ns
330 ns
3.43 📉
bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/2
1124 ns
327 ns
3.44 📉
bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/1
81.1 ns
81.2 ns
1.00
bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/40
45.9 ns
51.4 ns
0.89
bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/18
67.8 ns
70.1 ns
0.97
bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/16
74.2 ns
77.3 ns
0.96
bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/14
82.2 ns
84.8 ns
0.97
bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/10
104 ns
105 ns
0.99
bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/8
125 ns
128 ns
0.98
bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/5
184 ns
187 ns
0.98
bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/4
231 ns
227 ns
1.02
bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/3
294 ns
292 ns
1.01
bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/2
430 ns
244 ns
1.76 📉
bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/1
69.9 ns
68.4 ns
1.02
bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/40
46.4 ns
51.2 ns
0.91
bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/18
68.5 ns
71.5 ns
0.96
bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/16
74.5 ns
77.3 ns
0.96
bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/14
84.3 ns
85.0 ns
0.99
bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/10
107 ns
104 ns
1.03
bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/8
131 ns
125 ns
1.05
bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/5
193 ns
191 ns
1.01
bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/4
220 ns
226 ns
0.97
bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/3
293 ns
294 ns
1.00
bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/2
441 ns
243 ns
1.81 📉
bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/1
70.0 ns
70.1 ns
1.00
bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/40
373 ns
369 ns
1.01
bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/18
501 ns
521 ns
0.96
bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/16
531 ns
551 ns
0.96
bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/14
565 ns
591 ns
0.96
bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/10
699 ns
771 ns
0.91
bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/8
831 ns
938 ns
0.89
bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/5
1033 ns
1069 ns
0.97
bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/4
1154 ns
1221 ns
0.95
bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/3
1231 ns
1280 ns
0.96
bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/2
1454 ns
462 ns
3.15 📉
bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/1
155 ns
146 ns
1.06
bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/40
377 ns
408 ns
0.92
bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/18
496 ns
555 ns
0.89
bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/16
529 ns
541 ns
0.98
bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/14
562 ns
601 ns
0.94
bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/10
687 ns
772 ns
0.89
bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/8
799 ns
1005 ns
0.80
bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/5
1031 ns
1276 ns
0.81
bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/4
1157 ns
1316 ns
0.88
bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/3
1246 ns
1371 ns
0.91
bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/2
1424 ns
514 ns
2.77 📉
bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/1
148 ns
164 ns
0.90