LLVM: lib/Support/BLAKE3/blake3_sse41.c Source File (original) (raw)

2

3#include <immintrin.h>

4

5#define DEGREE 4

6

/* Integer-typed wrapper around _mm_shuffle_ps: both __m128i operands are
 * reinterpreted as packed floats, shuffled with the control value `imm`, and
 * the result is reinterpreted back as __m128i. The casts only change the
 * static type; no value conversion is requested. `imm` is handed to the
 * intrinsic unchanged. */
#define _mm_shuffle_ps2(lo, hi, imm)                                           \
  (_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(lo),                       \
                                   _mm_castsi128_ps(hi), (imm))))

10

12 return _mm_loadu_si128((const __m128i *)src);

13}

14

16 _mm_storeu_si128((__m128i *)dest, src);

17}

18

19INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }

20

21

22INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }

23

25

27 return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);

28}

29

31 return _mm_shuffle_epi8(

32 x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));

33}

34

36 return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));

37}

38

40 return _mm_shuffle_epi8(

41 x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));

42}

43

45 return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));

46}

47

48INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,

49 __m128i m) {

50 *row0 = addv(addv(*row0, m), *row1);

51 *row3 = xorv(*row3, *row0);

52 *row3 = rot16(*row3);

53 *row2 = addv(*row2, *row3);

54 *row1 = xorv(*row1, *row2);

55 *row1 = rot12(*row1);

56}

57

58INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,

59 __m128i m) {

60 *row0 = addv(addv(*row0, m), *row1);

61 *row3 = xorv(*row3, *row0);

62 *row3 = rot8(*row3);

63 *row2 = addv(*row2, *row3);

64 *row1 = xorv(*row1, *row2);

65 *row1 = rot7(*row1);

66}

67

68

69

70

72 *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));

73 *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));

74 *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));

75}

76

78 *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));

79 *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));

80 *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));

81}

82

91

92 __m128i m0 = loadu(&block[sizeof(__m128i) * 0]);

93 __m128i m1 = loadu(&block[sizeof(__m128i) * 1]);

94 __m128i m2 = loadu(&block[sizeof(__m128i) * 2]);

95 __m128i m3 = loadu(&block[sizeof(__m128i) * 3]);

96

97 __m128i t0, t1, t2, t3, tt;

98

99

100

101 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));

102 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);

103 t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));

104 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);

105 diagonalize(&rows[0], &rows[2], &rows[3]);

106 t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));

107 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));

108 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);

109 t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));

110 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));

111 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);

113 m0 = t0;

114 m1 = t1;

115 m2 = t2;

116 m3 = t3;

117

118

119

121 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));

122 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);

124 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));

125 t1 = _mm_blend_epi16(tt, t1, 0xCC);

126 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);

127 diagonalize(&rows[0], &rows[2], &rows[3]);

128 t2 = _mm_unpacklo_epi64(m3, m1);

129 tt = _mm_blend_epi16(t2, m2, 0xC0);

130 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));

131 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);

132 t3 = _mm_unpackhi_epi32(m1, m3);

133 tt = _mm_unpacklo_epi32(m2, t3);

134 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));

135 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);

137 m0 = t0;

138 m1 = t1;

139 m2 = t2;

140 m3 = t3;

141

142

144 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));

145 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);

147 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));

148 t1 = _mm_blend_epi16(tt, t1, 0xCC);

149 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);

150 diagonalize(&rows[0], &rows[2], &rows[3]);

151 t2 = _mm_unpacklo_epi64(m3, m1);

152 tt = _mm_blend_epi16(t2, m2, 0xC0);

153 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));

154 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);

155 t3 = _mm_unpackhi_epi32(m1, m3);

156 tt = _mm_unpacklo_epi32(m2, t3);

157 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));

158 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);

160 m0 = t0;

161 m1 = t1;

162 m2 = t2;

163 m3 = t3;

164

165

167 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));

168 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);

170 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));

171 t1 = _mm_blend_epi16(tt, t1, 0xCC);

172 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);

173 diagonalize(&rows[0], &rows[2], &rows[3]);

174 t2 = _mm_unpacklo_epi64(m3, m1);

175 tt = _mm_blend_epi16(t2, m2, 0xC0);

176 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));

177 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);

178 t3 = _mm_unpackhi_epi32(m1, m3);

179 tt = _mm_unpacklo_epi32(m2, t3);

180 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));

181 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);

183 m0 = t0;

184 m1 = t1;

185 m2 = t2;

186 m3 = t3;

187

188

190 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));

191 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);

193 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));

194 t1 = _mm_blend_epi16(tt, t1, 0xCC);

195 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);

196 diagonalize(&rows[0], &rows[2], &rows[3]);

197 t2 = _mm_unpacklo_epi64(m3, m1);

198 tt = _mm_blend_epi16(t2, m2, 0xC0);

199 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));

200 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);

201 t3 = _mm_unpackhi_epi32(m1, m3);

202 tt = _mm_unpacklo_epi32(m2, t3);

203 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));

204 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);

206 m0 = t0;

207 m1 = t1;

208 m2 = t2;

209 m3 = t3;

210

211

213 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));

214 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);

216 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));

217 t1 = _mm_blend_epi16(tt, t1, 0xCC);

218 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);

219 diagonalize(&rows[0], &rows[2], &rows[3]);

220 t2 = _mm_unpacklo_epi64(m3, m1);

221 tt = _mm_blend_epi16(t2, m2, 0xC0);

222 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));

223 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);

224 t3 = _mm_unpackhi_epi32(m1, m3);

225 tt = _mm_unpacklo_epi32(m2, t3);

226 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));

227 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);

229 m0 = t0;

230 m1 = t1;

231 m2 = t2;

232 m3 = t3;

233

234

236 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));

237 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);

239 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));

240 t1 = _mm_blend_epi16(tt, t1, 0xCC);

241 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);

242 diagonalize(&rows[0], &rows[2], &rows[3]);

243 t2 = _mm_unpacklo_epi64(m3, m1);

244 tt = _mm_blend_epi16(t2, m2, 0xC0);

245 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));

246 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);

247 t3 = _mm_unpackhi_epi32(m1, m3);

248 tt = _mm_unpacklo_epi32(m2, t3);

249 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));

250 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);

252}

253

258 __m128i rows[4];

262}

263

268 __m128i rows[4];

270 storeu(xorv(rows[0], rows[2]), &out[0]);

271 storeu(xorv(rows[1], rows[3]), &out[16]);

274}

275

281 v[0] = addv(v[0], v[4]);

282 v[1] = addv(v[1], v[5]);

283 v[2] = addv(v[2], v[6]);

284 v[3] = addv(v[3], v[7]);

285 v[12] = xorv(v[12], v[0]);

286 v[13] = xorv(v[13], v[1]);

287 v[14] = xorv(v[14], v[2]);

288 v[15] = xorv(v[15], v[3]);

289 v[12] = rot16(v[12]);

290 v[13] = rot16(v[13]);

291 v[14] = rot16(v[14]);

292 v[15] = rot16(v[15]);

293 v[8] = addv(v[8], v[12]);

294 v[9] = addv(v[9], v[13]);

295 v[10] = addv(v[10], v[14]);

296 v[11] = addv(v[11], v[15]);

297 v[4] = xorv(v[4], v[8]);

298 v[5] = xorv(v[5], v[9]);

299 v[6] = xorv(v[6], v[10]);

300 v[7] = xorv(v[7], v[11]);

301 v[4] = rot12(v[4]);

302 v[5] = rot12(v[5]);

303 v[6] = rot12(v[6]);

304 v[7] = rot12(v[7]);

309 v[0] = addv(v[0], v[4]);

310 v[1] = addv(v[1], v[5]);

311 v[2] = addv(v[2], v[6]);

312 v[3] = addv(v[3], v[7]);

313 v[12] = xorv(v[12], v[0]);

314 v[13] = xorv(v[13], v[1]);

315 v[14] = xorv(v[14], v[2]);

316 v[15] = xorv(v[15], v[3]);

317 v[12] = rot8(v[12]);

318 v[13] = rot8(v[13]);

319 v[14] = rot8(v[14]);

320 v[15] = rot8(v[15]);

321 v[8] = addv(v[8], v[12]);

322 v[9] = addv(v[9], v[13]);

323 v[10] = addv(v[10], v[14]);

324 v[11] = addv(v[11], v[15]);

325 v[4] = xorv(v[4], v[8]);

326 v[5] = xorv(v[5], v[9]);

327 v[6] = xorv(v[6], v[10]);

328 v[7] = xorv(v[7], v[11]);

329 v[4] = rot7(v[4]);

330 v[5] = rot7(v[5]);

331 v[6] = rot7(v[6]);

332 v[7] = rot7(v[7]);

333

338 v[0] = addv(v[0], v[5]);

339 v[1] = addv(v[1], v[6]);

340 v[2] = addv(v[2], v[7]);

341 v[3] = addv(v[3], v[4]);

342 v[15] = xorv(v[15], v[0]);

343 v[12] = xorv(v[12], v[1]);

344 v[13] = xorv(v[13], v[2]);

345 v[14] = xorv(v[14], v[3]);

346 v[15] = rot16(v[15]);

347 v[12] = rot16(v[12]);

348 v[13] = rot16(v[13]);

349 v[14] = rot16(v[14]);

350 v[10] = addv(v[10], v[15]);

351 v[11] = addv(v[11], v[12]);

352 v[8] = addv(v[8], v[13]);

353 v[9] = addv(v[9], v[14]);

354 v[5] = xorv(v[5], v[10]);

355 v[6] = xorv(v[6], v[11]);

356 v[7] = xorv(v[7], v[8]);

357 v[4] = xorv(v[4], v[9]);

358 v[5] = rot12(v[5]);

359 v[6] = rot12(v[6]);

360 v[7] = rot12(v[7]);

361 v[4] = rot12(v[4]);

366 v[0] = addv(v[0], v[5]);

367 v[1] = addv(v[1], v[6]);

368 v[2] = addv(v[2], v[7]);

369 v[3] = addv(v[3], v[4]);

370 v[15] = xorv(v[15], v[0]);

371 v[12] = xorv(v[12], v[1]);

372 v[13] = xorv(v[13], v[2]);

373 v[14] = xorv(v[14], v[3]);

374 v[15] = rot8(v[15]);

375 v[12] = rot8(v[12]);

376 v[13] = rot8(v[13]);

377 v[14] = rot8(v[14]);

378 v[10] = addv(v[10], v[15]);

379 v[11] = addv(v[11], v[12]);

380 v[8] = addv(v[8], v[13]);

381 v[9] = addv(v[9], v[14]);

382 v[5] = xorv(v[5], v[10]);

383 v[6] = xorv(v[6], v[11]);

384 v[7] = xorv(v[7], v[8]);

385 v[4] = xorv(v[4], v[9]);

386 v[5] = rot7(v[5]);

387 v[6] = rot7(v[6]);

388 v[7] = rot7(v[7]);

389 v[4] = rot7(v[4]);

390}

391

393

394

395

396 __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);

397 __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);

398 __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);

399 __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

400

401

402 __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);

403 __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);

404 __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);

405 __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

406

407 vecs[0] = abcd_0;

408 vecs[1] = abcd_1;

409 vecs[2] = abcd_2;

410 vecs[3] = abcd_3;

411}

412

414 size_t block_offset, __m128i out[16]) {

415 out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);

416 out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);

417 out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);

418 out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);

419 out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);

420 out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);

421 out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);

422 out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);

423 out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);

424 out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);

425 out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);

426 out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);

427 out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);

428 out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);

429 out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);

430 out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);

431 for (size_t i = 0; i < 4; ++i) {

432 _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);

433 }

438}

439

441 __m128i *out_lo, __m128i *out_hi) {

442 const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);

443 const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);

444 const __m128i add1 = _mm_and_si128(mask, add0);

445 __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);

446 __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),

447 _mm_xor_si128( l, _mm_set1_epi32(0x80000000)));

448 __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);

449 *out_lo = l;

450 *out_hi = h;

451}

452

453static

456 bool increment_counter, uint8_t flags,

458 __m128i h_vecs[8] = {

461 };

462 __m128i counter_low_vec, counter_high_vec;

463 load_counters(counter, increment_counter, &counter_low_vec,

464 &counter_high_vec);

465 uint8_t block_flags = flags | flags_start;

466

469 block_flags |= flags_end;

470 }

472 __m128i block_flags_vec = set1(block_flags);

473 __m128i msg_vecs[16];

475

476 __m128i v[16] = {

477 h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],

478 h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],

480 counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,

481 };

489 h_vecs[0] = xorv(v[0], v[8]);

490 h_vecs[1] = xorv(v[1], v[9]);

491 h_vecs[2] = xorv(v[2], v[10]);

492 h_vecs[3] = xorv(v[3], v[11]);

493 h_vecs[4] = xorv(v[4], v[12]);

494 h_vecs[5] = xorv(v[5], v[13]);

495 h_vecs[6] = xorv(v[6], v[14]);

496 h_vecs[7] = xorv(v[7], v[15]);

497

498 block_flags = flags;

499 }

500

503

504

505 storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);

506 storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);

507 storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);

508 storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);

509 storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);

510 storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);

511 storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);

512 storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);

513}

514

521 uint8_t block_flags = flags | flags_start;

524 block_flags |= flags_end;

525 }

527 block_flags);

530 block_flags = flags;

531 }

533}

534

537 uint64_t counter, bool increment_counter,

540 while (num_inputs >= DEGREE) {

542 flags_start, flags_end, out);

543 if (increment_counter) {

545 }

547 num_inputs -= DEGREE;

549 }

550 while (num_inputs > 0) {

552 flags_end, out);

553 if (increment_counter) {

554 counter += 1;

555 }

556 inputs += 1;

557 num_inputs -= 1;

559 }

560}

bbsections Prepares for basic block by splitting functions into clusters of basic blocks

static constexpr unsigned long long mask(BlockVerifier::State S)

unify loop Fixup each natural loop to have a single exit block

static const uint8_t MSG_SCHEDULE[7][16]

static const uint32_t IV[8]

INLINE uint32_t counter_high(uint64_t counter)

INLINE uint32_t counter_low(uint64_t counter)

INLINE __m128i rot12(__m128i x)

Definition blake3_sse41.c:35

INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d)

Definition blake3_sse41.c:26

#define _mm_shuffle_ps2(a, b, c)

Definition blake3_sse41.c:7

INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)

Definition blake3_sse41.c:48

INLINE __m128i rot7(__m128i x)

Definition blake3_sse41.c:44

INLINE void storeu(__m128i src, uint8_t dest[16])

Definition blake3_sse41.c:15

INLINE void transpose_msg_vecs(const uint8_t *const *inputs, size_t block_offset, __m128i out[16])

Definition blake3_sse41.c:413

INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3)

Definition blake3_sse41.c:71

INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r)

Definition blake3_sse41.c:276

INLINE __m128i xorv(__m128i a, __m128i b)

Definition blake3_sse41.c:22

INLINE void transpose_vecs(__m128i vecs[DEGREE])

Definition blake3_sse41.c:392

INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3)

Definition blake3_sse41.c:77

INLINE void load_counters(uint64_t counter, bool increment_counter, __m128i *out_lo, __m128i *out_hi)

Definition blake3_sse41.c:440

INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)

Definition blake3_sse41.c:83

INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)

Definition blake3_sse41.c:58

static void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)

Definition blake3_sse41.c:454

INLINE __m128i rot16(__m128i x)

Definition blake3_sse41.c:30

INLINE __m128i addv(__m128i a, __m128i b)

Definition blake3_sse41.c:19

INLINE __m128i loadu(const uint8_t src[16])

Definition blake3_sse41.c:11

INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN])

Definition blake3_sse41.c:515

INLINE __m128i set1(uint32_t x)

Definition blake3_sse41.c:24

INLINE __m128i rot8(__m128i x)

Definition blake3_sse41.c:39

#define blake3_compress_in_place_sse41

#define blake3_hash_many_sse41

#define blake3_compress_xof_sse41