clang: lib/Headers/gpuintrin.h Source File

//===-- gpuintrin.h - Generic GPU intrinsic functions ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Provides wrappers around the clang builtins for accessing GPU hardware
// features. The interface is intended to be portable between architectures,
// but some targets may provide different implementations. This header can be
// included for the common GPU programming languages, namely OpenMP, HIP, and
// CUDA.
//
//===----------------------------------------------------------------------===//

#ifndef __GPUINTRIN_H
#define __GPUINTRIN_H

#if !defined(_DEFAULT_FN_ATTRS)
#if defined(__HIP__) || defined(__CUDA__)
#define _DEFAULT_FN_ATTRS __attribute__((device))
#else
#define _DEFAULT_FN_ATTRS
#endif
#endif

#include <stdint.h>

#if !defined(__cplusplus)
_Pragma("push_macro(\"bool\")");
#define bool _Bool
#endif

_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {kind(gpu)})");

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x);

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x);

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);

_Pragma("omp end declare variant");
_Pragma("omp end declare target");

#if defined(__NVPTX__)
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
#include <amdgpuintrin.h>
#elif !defined(_OPENMP)
#error "This header is only meant to be used on GPU architectures."
#endif

_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {kind(gpu)})");

#define __GPU_X_DIM 0
#define __GPU_Y_DIM 1
#define __GPU_Z_DIM 2

// Returns the number of blocks in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_num_blocks_x();
  case 1:
    return __gpu_num_blocks_y();
  case 2:
    return __gpu_num_blocks_z();
  default:
    __builtin_unreachable();
  }
}

// Returns the id of the block in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_block_id_x();
  case 1:
    return __gpu_block_id_y();
  case 2:
    return __gpu_block_id_z();
  default:
    __builtin_unreachable();
  }
}

// Returns the number of threads in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_num_threads_x();
  case 1:
    return __gpu_num_threads_y();
  case 2:
    return __gpu_num_threads_z();
  default:
    __builtin_unreachable();
  }
}

// Returns the id of the thread in the requested dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id(int __dim) {
  switch (__dim) {
  case 0:
    return __gpu_thread_id_x();
  case 1:
    return __gpu_thread_id_y();
  case 2:
    return __gpu_thread_id_z();
  default:
    __builtin_unreachable();
  }
}
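
// Editor's sketch (not part of upstream gpuintrin.h): the accessors above
// compose into a flattened global id along one dimension; the __example_*
// name is hypothetical.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__example_global_thread_id_x(void) {
  // Scale the block id by the block size, then add the intra-block thread id.
  return (uint64_t)__gpu_block_id(__GPU_X_DIM) *
             __gpu_num_threads(__GPU_X_DIM) +
         __gpu_thread_id(__GPU_X_DIM);
}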

// Get the first active thread inside the lane.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_first_lane_id(uint64_t __lane_mask) {
  return __builtin_ffsll(__lane_mask) - 1;
}

// Conditional that is only true for a single thread in a lane.
_DEFAULT_FN_ATTRS static __inline__ bool
__gpu_is_first_in_lane(uint64_t __lane_mask) {
  return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
}

// Copies the value from the first active thread in the wavefront to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
  return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
         ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
          0xFFFFFFFFull);
}

// Copies the value from the first active thread in the wavefront to the rest.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
  return __builtin_bit_cast(
      float, __gpu_read_first_lane_u32(__lane_mask,
                                       __builtin_bit_cast(uint32_t, __x)));
}

// Copies the value from the first active thread in the wavefront to the rest.
_DEFAULT_FN_ATTRS static __inline__ double
__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
  return __builtin_bit_cast(
      double, __gpu_read_first_lane_u64(__lane_mask,
                                        __builtin_bit_cast(uint64_t, __x)));
}
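
// Editor's sketch (not part of upstream gpuintrin.h): a typical use of the
// broadcast primitives is to have one lane compute a value and share it with
// the rest of the wavefront; the __example_* name is hypothetical.
_DEFAULT_FN_ATTRS static __inline__ double
__example_broadcast_leader_value(double __v) {
  // Every lane in the current mask receives the first active lane's copy.
  return __gpu_read_first_lane_f64(__gpu_lane_mask(), __v);
}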

// Shuffles the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
                      uint32_t __width) {
  uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
  uint32_t __mask = (uint32_t)__lane_mask;
  return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
          << 32ull) |
         ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}

// Shuffles the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
                      uint32_t __width) {
  return __builtin_bit_cast(
      float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
                                   __builtin_bit_cast(uint32_t, __x), __width));
}

// Shuffles the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ double
__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                      uint32_t __width) {
  return __builtin_bit_cast(
      double,
      __gpu_shuffle_idx_u64(__lane_mask, __idx,
                            __builtin_bit_cast(uint64_t, __x), __width));
}
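
// Editor's sketch (not part of upstream gpuintrin.h): shuffles permute values
// across lanes; here each lane reads its left neighbor's value, wrapping at
// the wavefront boundary. The __example_* name is hypothetical.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__example_rotate_left(uint32_t __v) {
  uint32_t __src = (__gpu_lane_id() + 1) % __gpu_num_lanes();
  return __gpu_shuffle_idx_u32(__gpu_lane_mask(), __src, __v,
                               __gpu_num_lanes());
}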

// Gets the accumulator scan of the threads in the warp or wavefront.
#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_scan_##__suffix(       \
      uint64_t __lane_mask, __type __x) {                                      \
    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
        __lane_mask, __first & (__first + 1));                                 \
    if (__divergent) {                                                         \
      __type __accum = 0;                                                      \
      for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) {      \
        __type __index = __builtin_ctzll(__mask);                              \
        __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
                                                    __gpu_num_lanes());        \
        __x = __gpu_lane_id() == __index ? __accum + __tmp : __x;              \
        __accum += __tmp;                                                      \
      }                                                                        \
    } else {                                                                   \
      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
        uint32_t __index = __gpu_lane_id() - __step;                           \
        __bitmask_type bitmask = __gpu_lane_id() >= __step;                    \
        __x += __builtin_bit_cast(                                             \
            __type,                                                            \
            -bitmask & __builtin_bit_cast(__bitmask_type,                      \
                                          __gpu_shuffle_idx_##__suffix(        \
                                              __lane_mask, __index, __x,       \
                                              __gpu_num_lanes())));            \
      }                                                                        \
    }                                                                          \
    return __x;                                                                \
  }

__DO_LANE_SCAN(uint32_t, uint32_t, u32); // defines __gpu_lane_scan_u32
__DO_LANE_SCAN(uint64_t, uint64_t, u64); // defines __gpu_lane_scan_u64
__DO_LANE_SCAN(float, uint32_t, f32);    // defines __gpu_lane_scan_f32
__DO_LANE_SCAN(double, uint64_t, f64);   // defines __gpu_lane_scan_f64
#undef __DO_LANE_SCAN
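
// Editor's sketch (not part of upstream gpuintrin.h): the scans above are
// inclusive, so lane i receives the sum of lanes 0..i. An exclusive scan can
// be recovered by subtracting each lane's own contribution. The __example_*
// name is hypothetical.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__example_exclusive_scan_u32(uint64_t __lane_mask, uint32_t __x) {
  return __gpu_lane_scan_u32(__lane_mask, __x) - __x;
}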

// Gets the sum of all lanes inside the warp or wavefront.
#define __DO_LANE_SUM(__type, __suffix)                                        \
  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
      uint64_t __lane_mask, __type __x) {                                      \
    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
        __lane_mask, __first & (__first + 1));                                 \
    if (__divergent) {                                                         \
      return __gpu_shuffle_idx_##__suffix(                                     \
          __lane_mask, 63 - __builtin_clzll(__lane_mask),                      \
          __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes());    \
    } else {                                                                   \
      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
        uint32_t __index = __step + __gpu_lane_id();                           \
        __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,         \
                                            __gpu_num_lanes());                \
      }                                                                        \
      return __gpu_read_first_lane_##__suffix(__lane_mask, __x);               \
    }                                                                          \
  }

__DO_LANE_SUM(uint32_t, u32); // defines __gpu_lane_sum_u32
__DO_LANE_SUM(uint64_t, u64); // defines __gpu_lane_sum_u64
__DO_LANE_SUM(float, f32);    // defines __gpu_lane_sum_f32
__DO_LANE_SUM(double, f64);   // defines __gpu_lane_sum_f64
#undef __DO_LANE_SUM
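
// Editor's sketch (not part of upstream gpuintrin.h): a wavefront-level
// average built on the sum reduction; every active lane receives the result.
// The __example_* name is hypothetical.
_DEFAULT_FN_ATTRS static __inline__ float
__example_lane_average_f32(uint64_t __lane_mask, float __x) {
  return __gpu_lane_sum_f32(__lane_mask, __x) /
         (float)__builtin_popcountll(__lane_mask);
}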

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
  uint64_t __match_mask = 0;

  bool __done = 0;
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint32_t __first = __gpu_read_first_lane_u32(__active_mask, __x);
      if (__first == __x) {
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x) {
  uint64_t __match_mask = 0;

  bool __done = 0;
  for (uint64_t __active_mask = __lane_mask; __active_mask;
       __active_mask = __gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint64_t __first = __gpu_read_first_lane_u64(__active_mask, __x);
      if (__first == __x) {
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}
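
// Editor's sketch (not part of upstream gpuintrin.h): match_any partitions
// the wavefront into groups of lanes holding equal values; the first lane of
// each group can then act as that group's leader. The __example_* name is
// hypothetical.
_DEFAULT_FN_ATTRS static __inline__ bool
__example_is_group_leader(uint64_t __lane_mask, uint32_t __key) {
  uint64_t __group = __gpu_match_any_u32_impl(__lane_mask, __key);
  return __gpu_lane_id() == __gpu_first_lane_id(__group);
}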

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
  __gpu_sync_lane(__lane_mask);
  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
  __gpu_sync_lane(__lane_mask);
  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}

_Pragma("omp end declare variant");
_Pragma("omp end declare target");

#if !defined(__cplusplus)
_Pragma("pop_macro(\"bool\")");
#endif

#undef _DEFAULT_FN_ATTRS

#endif // __GPUINTRIN_H
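
// Editor's note (illustrative, not part of the header): from user code the
// header is consumed as a regular include, e.g.
//
//   #include <gpuintrin.h>
//
//   uint32_t block_total(uint32_t v) {
//     // Sum v across all currently converged lanes in the wavefront.
//     return __gpu_lane_sum_u32(__gpu_lane_mask(), v);
//   }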
