">

LLVM: lib/Support/BLAKE3/blake3_avx512.c File Reference

#include "[blake3_impl.h](blake3%5F%5Fimpl%5F8h%5Fsource.html)"
#include <immintrin.h>

Go to the source code of this file.

Macros
#define _mm_shuffle_ps2(a, b, c)
#define LO_IMM8 0x88
#define HI_IMM8 0xdd
Functions
INLINE __m128i loadu_128 (const uint8_t src[16])
INLINE __m256i loadu_256 (const uint8_t src[32])
INLINE __m512i loadu_512 (const uint8_t src[64])
INLINE void storeu_128 (__m128i src, uint8_t dest[16])
INLINE void storeu_256 (__m256i src, uint8_t dest[32])
INLINE void storeu_512 (__m512i src, uint8_t dest[64])
INLINE __m128i add_128 (__m128i a, __m128i b)
INLINE __m256i add_256 (__m256i a, __m256i b)
INLINE __m512i add_512 (__m512i a, __m512i b)
INLINE __m128i xor_128 (__m128i a, __m128i b)
INLINE __m256i xor_256 (__m256i a, __m256i b)
INLINE __m512i xor_512 (__m512i a, __m512i b)
INLINE __m128i set1_128 (uint32_t x)
INLINE __m256i set1_256 (uint32_t x)
INLINE __m512i set1_512 (uint32_t x)
INLINE __m128i set4 (uint32_t a, uint32_t b, uint32_t c, uint32_t d)
INLINE __m128i rot16_128 (__m128i x)
INLINE __m256i rot16_256 (__m256i x)
INLINE __m512i rot16_512 (__m512i x)
INLINE __m128i rot12_128 (__m128i x)
INLINE __m256i rot12_256 (__m256i x)
INLINE __m512i rot12_512 (__m512i x)
INLINE __m128i rot8_128 (__m128i x)
INLINE __m256i rot8_256 (__m256i x)
INLINE __m512i rot8_512 (__m512i x)
INLINE __m128i rot7_128 (__m128i x)
INLINE __m256i rot7_256 (__m256i x)
INLINE __m512i rot7_512 (__m512i x)
INLINE void g1 (__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
INLINE void g2 (__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
INLINE void diagonalize (__m128i *row0, __m128i *row2, __m128i *row3)
INLINE void undiagonalize (__m128i *row0, __m128i *row2, __m128i *row3)
INLINE void compress_pre (__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
void blake3_compress_xof_avx512 (const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
void blake3_compress_in_place_avx512 (uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
INLINE void round_fn4 (__m128i v[16], __m128i m[16], size_t r)
INLINE void transpose_vecs_128 (__m128i vecs[4])
INLINE void transpose_msg_vecs4 (const uint8_t *const *inputs, size_t block_offset, __m128i out[16])
INLINE void load_counters4 (uint64_t counter, bool increment_counter, __m128i *out_lo, __m128i *out_hi)
static void blake3_hash4_avx512 (const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
static void blake3_xof4_avx512 (const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[4 *64])
INLINE void round_fn8 (__m256i v[16], __m256i m[16], size_t r)
INLINE void transpose_vecs_256 (__m256i vecs[8])
INLINE void transpose_msg_vecs8 (const uint8_t *const *inputs, size_t block_offset, __m256i out[16])
INLINE void load_counters8 (uint64_t counter, bool increment_counter, __m256i *out_lo, __m256i *out_hi)
static void blake3_hash8_avx512 (const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
static void blake3_xof8_avx512 (const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[8 *64])
INLINE void round_fn16 (__m512i v[16], __m512i m[16], size_t r)
INLINE __m512i unpack_lo_128 (__m512i a, __m512i b)
INLINE __m512i unpack_hi_128 (__m512i a, __m512i b)
INLINE void transpose_vecs_512 (__m512i vecs[16])
INLINE void transpose_msg_vecs16 (const uint8_t *const *inputs, size_t block_offset, __m512i out[16])
INLINE void load_counters16 (uint64_t counter, bool increment_counter, __m512i *out_lo, __m512i *out_hi)
static void blake3_hash16_avx512 (const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
static void blake3_xof16_avx512 (const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[16 *64])
INLINE void hash_one_avx512 (const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN])
void blake3_hash_many_avx512 (const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
void blake3_xof_many_avx512 (const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t *out, size_t outblocks)

_mm_shuffle_ps2

#define _mm_shuffle_ps2(a, b, c)

Value:

(_mm_castps_si128( \
    _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))

Definition at line 5 of file blake3_avx512.c.

Referenced by compress_pre().

HI_IMM8

#define HI_IMM8 0xdd

LO_IMM8

#define LO_IMM8 0x88

add_128()

INLINE __m128i add_128 ( __m128i a,
__m128i b )

add_256()

INLINE __m256i add_256 ( __m256i a,
__m256i b )

add_512()

INLINE __m512i add_512 ( __m512i a,
__m512i b )

blake3_compress_in_place_avx512()

blake3_compress_xof_avx512()

blake3_hash16_avx512()

blake3_hash4_avx512()

Definition at line 496 of file blake3_avx512.c.

References BLAKE3_BLOCK_LEN, block, blocks, IV, load_counters4(), round_fn4(), set1_128(), storeu_128(), transpose_msg_vecs4(), transpose_vecs_128(), and xor_128().

Referenced by blake3_hash_many_avx512().

blake3_hash8_avx512()

Definition at line 799 of file blake3_avx512.c.

References BLAKE3_BLOCK_LEN, block, blocks, IV, load_counters8(), round_fn8(), set1_256(), storeu_256(), transpose_msg_vecs8(), transpose_vecs_256(), and xor_256().

Referenced by blake3_hash_many_avx512().

blake3_hash_many_avx512()

blake3_xof16_avx512()

blake3_xof4_avx512()

blake3_xof8_avx512()

blake3_xof_many_avx512()

compress_pre()

Definition at line 120 of file blake3_avx512.c.

References _mm_shuffle_ps2, BLAKE3_BLOCK_LEN, block, counter_high(), counter_low(), diagonalize(), g1(), g2(), INLINE, IV, loadu_128(), set4(), and undiagonalize().

Referenced by blake3_compress_in_place_avx512(), and blake3_compress_xof_avx512().

diagonalize()

INLINE void diagonalize ( __m128i * row0,
__m128i * row2,
__m128i * row3 )

g1()

INLINE void g1 ( __m128i * row0,
__m128i * row1,
__m128i * row2,
__m128i * row3,
__m128i m )

g2()

INLINE void g2 ( __m128i * row0,
__m128i * row1,
__m128i * row2,
__m128i * row3,
__m128i m )

hash_one_avx512()

load_counters16()

INLINE void load_counters16 ( uint64_t counter,
bool increment_counter,
__m512i * out_lo,
__m512i * out_hi )

load_counters4()

INLINE void load_counters4 ( uint64_t counter,
bool increment_counter,
__m128i * out_lo,
__m128i * out_hi )

load_counters8()

INLINE void load_counters8 ( uint64_t counter,
bool increment_counter,
__m256i * out_lo,
__m256i * out_hi )

loadu_128()

loadu_256()

loadu_512()

rot12_128()

INLINE __m128i rot12_128 ( __m128i x )

rot12_256()

INLINE __m256i rot12_256 ( __m256i x )

rot12_512()

INLINE __m512i rot12_512 ( __m512i x )

rot16_128()

INLINE __m128i rot16_128 ( __m128i x )

rot16_256()

INLINE __m256i rot16_256 ( __m256i x )

rot16_512()

INLINE __m512i rot16_512 ( __m512i x )

rot7_128()

INLINE __m128i rot7_128 ( __m128i x )

rot7_256()

INLINE __m256i rot7_256 ( __m256i x )

rot7_512()

INLINE __m512i rot7_512 ( __m512i x )

rot8_128()

INLINE __m128i rot8_128 ( __m128i x )

rot8_256()

INLINE __m256i rot8_256 ( __m256i x )

rot8_512()

INLINE __m512i rot8_512 ( __m512i x )

round_fn16()

INLINE void round_fn16 ( __m512i v[16],
__m512i m[16],
size_t r )

round_fn4()

INLINE void round_fn4 ( __m128i v[16],
__m128i m[16],
size_t r )

round_fn8()

INLINE void round_fn8 ( __m256i v[16],
__m256i m[16],
size_t r )

set1_128()

set1_256()

set1_512()

set4()

storeu_128()

storeu_256()

storeu_512()

transpose_msg_vecs16()

transpose_msg_vecs4()

transpose_msg_vecs8()

transpose_vecs_128()

INLINE void transpose_vecs_128 ( __m128i vecs[4] )

transpose_vecs_256()

INLINE void transpose_vecs_256 ( __m256i vecs[8] )

transpose_vecs_512()

INLINE void transpose_vecs_512 ( __m512i vecs[16] )

undiagonalize()

INLINE void undiagonalize ( __m128i * row0,
__m128i * row2,
__m128i * row3 )

unpack_hi_128()

INLINE __m512i unpack_hi_128 ( __m512i a,
__m512i b )

unpack_lo_128()

INLINE __m512i unpack_lo_128 ( __m512i a,
__m512i b )

xor_128()

INLINE __m128i xor_128 ( __m128i a,
__m128i b )

xor_256()

INLINE __m256i xor_256 ( __m256i a,
__m256i b )

xor_512()

INLINE __m512i xor_512 ( __m512i a,
__m512i b )