//===-- amdgpuintrin.h - AMDGPU intrinsic functions -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef __AMDGPUINTRIN_H
#define __AMDGPUINTRIN_H

#ifndef __AMDGPU__
#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
#endif

#include <stdint.h>

#if !defined(__cplusplus)
_Pragma("push_macro(\"bool\")");
#define bool _Bool
#endif

_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
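// _DEFAULT_FN_ATTRS is normally provided by the <gpuintrin.h> wrapper that
// includes this header. The empty fallback below is an assumption added so
// the definitions in this file also work when the header is read standalone.
#if !defined(_DEFAULT_FN_ATTRS)
#define _DEFAULT_FN_ATTRS
#endif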
// Type aliases to the address spaces used by the AMDGPU backend.
27#define __gpu_private __attribute__((address_space(5)))
28#define __gpu_constant __attribute__((address_space(4)))
29#define __gpu_local __attribute__((address_space(3)))
30#define __gpu_global __attribute__((address_space(1)))
31#define __gpu_generic __attribute__((address_space(0)))
// Attribute to declare a function as a kernel.
34#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
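// Example usage (illustrative only; 'example_fill' is a hypothetical name,
// not part of this header): a kernel entry point is produced by applying
// __gpu_kernel, and pointer parameters may be qualified with the address
// space macros above.
//
//   __gpu_kernel void example_fill(int __gpu_global *out, int value);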
// Returns the number of workgroups in the 'x' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
}

// Returns the number of workgroups in the 'y' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
}

// Returns the number of workgroups in the 'z' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
}
// Returns the 'x' dimension of the current AMD workgroup's id.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
  return __builtin_amdgcn_workgroup_id_x();
}

// Returns the 'y' dimension of the current AMD workgroup's id.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
  return __builtin_amdgcn_workgroup_id_y();
}

// Returns the 'z' dimension of the current AMD workgroup's id.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
  return __builtin_amdgcn_workgroup_id_z();
}
// Returns the number of workitems in the 'x' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
  return __builtin_amdgcn_workgroup_size_x();
}

// Returns the number of workitems in the 'y' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
  return __builtin_amdgcn_workgroup_size_y();
}

// Returns the number of workitems in the 'z' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
  return __builtin_amdgcn_workgroup_size_z();
}
// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
  return __builtin_amdgcn_workitem_id_x();
}

// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
  return __builtin_amdgcn_workitem_id_y();
}

// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
  return __builtin_amdgcn_workitem_id_z();
}
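// Illustrative sketch (the helper name is hypothetical and not part of the
// upstream interface): combines the block and thread queries above into a
// flattened global workitem id in the 'x' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_example_global_thread_id_x(void) {
  return __gpu_block_id_x() * __gpu_num_threads_x() + __gpu_thread_id_x();
}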
// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
// and compilation options.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
  return __builtin_amdgcn_wavefrontsize();
}
// Returns the id of the lane within the current AMD wavefront.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}

// Returns the bit-mask of active threads in the current wavefront.
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
  return __builtin_amdgcn_read_exec();
}
// Copies the value from the first active thread in the wavefront to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
  return __builtin_amdgcn_readfirstlane(__x);
}

// Copies the value from the first active thread in the wavefront to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
  return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
         ((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
}
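// Illustrative sketch (hypothetical helper, not part of the upstream
// interface): broadcasts a pointer held by the first active lane to every
// other lane in the wavefront.
_DEFAULT_FN_ATTRS static __inline__ void *
__gpu_example_broadcast_ptr(void *__ptr) {
  return (void *)(uintptr_t)__gpu_read_first_lane_u64(__gpu_lane_mask(),
                                                      (uintptr_t)__ptr);
}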
// Returns a bitmask of the threads in the wavefront for which the predicate
// __x is true.
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
                                                          bool __x) {
  // Masking with __lane_mask matches the NVPTX semantics when __lane_mask is
  // a subset of the currently active threads.
  return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
}
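// Illustrative sketch (hypothetical helper, not part of the upstream
// interface): counts how many participating lanes evaluated the predicate to
// true by taking the population count of the ballot mask.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_example_count_votes(uint64_t __lane_mask, bool __x) {
  return (uint32_t)__builtin_popcountll(__gpu_ballot(__lane_mask, __x));
}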
// Waits for all the threads in the workgroup to converge and issues a fence.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
  __builtin_amdgcn_s_barrier();
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

// Waits for all threads in the wavefront to converge; this is a no-op on
// AMDGPU.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
  __builtin_amdgcn_wave_barrier();
}
// Shuffles the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
  return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
}

// Shuffles the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
  return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
         ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
}
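// Illustrative sketch (hypothetical helper, not part of the upstream
// interface): a full-wavefront sum reduction built from the index shuffle
// above, assuming every lane in the wavefront is active. At each halving
// step the low lanes read a correct partial sum from a lane within range, so
// lane 0 ends up with the total, which is then broadcast to all lanes.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_example_lane_sum_u32(uint64_t __lane_mask, uint32_t __x) {
  for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2)
    __x += __gpu_shuffle_idx_u32(__lane_mask, __gpu_lane_id() + __step, __x);
  return __gpu_read_first_lane_u32(__lane_mask, __x);
}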
// Returns true if the flat pointer points to 'local' (LDS) memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
  return __builtin_amdgcn_is_shared((void __attribute__((address_space(0))) *)((
      void [[clang::opencl_generic]] *)ptr));
}

// Returns true if the flat pointer points to 'private' (scratch) memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
  return __builtin_amdgcn_is_private((void __attribute__((
      address_space(0))) *)((void [[clang::opencl_generic]] *)ptr));
}
// Terminates execution of the associated wavefront.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_exit(void) {
  __builtin_amdgcn_endpgm();
}

// Suspends the thread briefly to assist the scheduler during busy loops.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
  __builtin_amdgcn_s_sleep(2);
}
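// Illustrative sketch (hypothetical helper, not part of the upstream
// interface): a polite spin-wait that yields the wavefront between polls of
// a flag in global memory. Real code would pair this with an appropriate
// atomic operation or fence for ordering; a plain volatile load keeps the
// sketch small.
_DEFAULT_FN_ATTRS static __inline__ void
__gpu_example_spin_until(volatile uint32_t __gpu_global *__flag) {
  while (!*__flag)
    __gpu_thread_suspend();
}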
_Pragma("omp end declare variant");
_Pragma("omp end declare target");

#if !defined(__cplusplus)
_Pragma("pop_macro(\"bool\")");
#endif

#endif // __AMDGPUINTRIN_H