LLVM: lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp Source File (original) (raw)

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

70#include "llvm/IR/IntrinsicsAMDGPU.h"

73

74using namespace llvm;

75

76#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"

77

78namespace {

79class AMDGPUImageIntrinsicOptimizer : public FunctionPass {

81

82public:

83 static char ID;

84

85 AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)

87

89

90};

91}

92

94 "AMDGPU Image Intrinsic Optimizer", false, false)

95

96char AMDGPUImageIntrinsicOptimizer::ID = 0;

97

101 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {

103

104 if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())

105 continue;

106

107

108 if (IIList.front()->getType() != II->getType())

109 continue;

110

111

112 bool AllEqual = true;

113 assert(IIList.front()->arg_size() == II->arg_size());

114 for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {

115 Value *ArgList = IIList.front()->getArgOperand(I);

116 Value *Arg = II->getArgOperand(I);

117 if (I == ImageDimIntr->VAddrEnd - 1) {

118

119 auto *FragIdList = cast(IIList.front()->getArgOperand(I));

121 AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);

122 } else {

123

124 AllEqual = ArgList == Arg;

125 }

126 }

127 if (!AllEqual)

128 continue;

129

130

131 IIList.emplace_back(II);

132 return;

133 }

134

135

136 MergeableInsts.emplace_back(1, II);

138}

139

140

141

145 for (; I != E; ++I) {

146

147

148 if (I->mayHaveSideEffects()) {

149 ++I;

150 break;

151 }

152

153

156

157

158 if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&

159 IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)

160 continue;

161

162

164 const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;

166 continue;

167

170 }

171 }

172

173 return I;

174}

175

178

180 for (const auto &IIList : MergeableInsts) {

181 if (IIList.size() <= 1)

182 continue;

183

184

186

187

188

190 Function *F = IIList.front()->getCalledFunction();

192 continue;

193

194 Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();

197

200 OverloadTys[0] = NewTy;

201 bool isD16 = EltTy->isHalfTy();

202

204 IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));

205 unsigned DMaskVal = DMask->getZExtValue() & 0xf;

206 unsigned NumElts = popcount(DMaskVal);

207

208

209

210 unsigned NumLoads = IIList.size();

211 unsigned NumMsaas = NumElts;

212 unsigned NumVAddrLoads = 3 * NumLoads;

213 unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;

214 unsigned NumVAddrMsaas = 3 * NumMsaas;

215 unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

216

217 if (NumLoads < NumMsaas ||

218 (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))

219 continue;

220

222 auto *FragId =

224 const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;

225

226

228

229

231 while (DMaskVal != 0) {

232 unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

233

235 if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)

236 NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;

237 else

238 NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

239

241 ConstantInt::get(DMask->getType(), NewMaskVal);

242 Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);

243 CallInst *NewCall = B.CreateIntrinsic(NewIntrinID, OverloadTys, Args);

244 LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

245

247 DMaskVal -= NewMaskVal;

248 }

249

250

251 for (auto &II : IIList) {

252 Value *VecOp = nullptr;

254 B.SetCurrentDebugLocation(II->getDebugLoc());

255 if (NumElts == 1) {

256 VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));

258 } else {

260 for (unsigned I = 0; I < NumElts; ++I) {

261 VecOp = B.CreateInsertElement(

262 VecOp,

263 B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);

265 }

266 }

267

268

269 II->replaceAllUsesWith(VecOp);

272 }

273

275 }

276

277 for (auto *I : InstrsToErase)

278 I->eraseFromParent();

279

281}

282

284 if (!TM)

285 return false;

286

287

290 return false;

291

292 Module *M = F.getParent();

293

294

296 return F.users().empty() &&

297 (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||

298 F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);

299 }))

300 return false;

301

303 for (auto &BB : F) {

306 I = SectionEnd) {

308

311 }

312 }

313

315}

316

317bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {

318 if (skipFunction(F))

319 return false;

320

322}

323

324FunctionPass *

326 return new AMDGPUImageIntrinsicOptimizer(TM);

327}

328

assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

bool optimizeSection(ArrayRef< SmallVector< IntrinsicInst *, 4 > > MergeableInsts)

Definition AMDGPUImageIntrinsicOptimizer.cpp:176

void addInstToMergeableList(IntrinsicInst *II, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr)

Definition AMDGPUImageIntrinsicOptimizer.cpp:98

BasicBlock::iterator collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts)

Definition AMDGPUImageIntrinsicOptimizer.cpp:142

static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM)

Definition AMDGPUImageIntrinsicOptimizer.cpp:283

Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.

The AMDGPU TargetMachine interface definition for hw codegen targets.

static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

static bool runOnFunction(Function &F, bool PostInlining)

uint64_t IntrinsicInst * II

#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)

Class for arbitrary precision integers.

LLVM_ABI APInt udiv(const APInt &RHS) const

Unsigned division operation.

ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...

InstListType::iterator iterator

Instruction iterators...

This class represents a function call, abstracting a target machine's calling convention.

This is the shared class of boolean and integer constants.

uint64_t getZExtValue() const

Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...

static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)

FunctionPass class - This class is used to implement most global optimizations.

This provides a uniform API for creating instructions and inserting them into a basic block: either a...

A wrapper class for inspecting calls to intrinsic functions.

A Module instance is used to store all the information related to an LLVM module.

static LLVM_ABI PoisonValue * get(Type *T)

Static factory methods - Return a 'poison' object of the specified type.

A set of analyses that are preserved following a run of a transformation pass.

static PreservedAnalyses none()

Convenience factory function for the empty preserved set.

static PreservedAnalyses all()

Construct a special preserved set that preserves all passes.

void push_back(const T &Elt)

This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.

Primary interface to the complete machine description for the target machine.

const STC & getSubtarget(const Function &F) const

This method returns a pointer to the specified type of TargetSubtargetInfo.

The instances of the Type class are immutable: once they are created, they are never changed.

Type * getScalarType() const

If this is a vector type, return the element type, otherwise return 'this'.

bool isHalfTy() const

Return true if this is 'half', a 16-bit IEEE fp type.

LLVM Value Representation.

Type * getType() const

All values are typed, get the type of this value.

LLVM_ABI void takeName(Value *V)

Transfer the name from V to this value.

bool isGFX11Plus(const MCSubtargetInfo &STI)

const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)

unsigned ID

LLVM IR allows arbitrary numbers to be used as calling convention identifiers.

LLVM_ABI bool getIntrinsicSignature(Intrinsic::ID, FunctionType *FT, SmallVectorImpl< Type * > &ArgTys)

Gets the type arguments of an intrinsic call by matching type constraints specified by the ...

This is an optimization pass for GlobalISel generic memory operations.

decltype(auto) dyn_cast(const From &Val)

dyn_cast - Return the argument parameter cast to the specified type.

constexpr int popcount(T Value) noexcept

Count the number of set bits in a value.

int countr_zero(T Val)

Count number of 0's from the least significant bit to the most stopping at the first 1.

LLVM_ABI raw_ostream & dbgs()

dbgs() - This returns a reference to a raw_ostream for debugging messages.

bool none_of(R &&Range, UnaryPredicate P)

Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.

bool isa(const From &Val)

isa - Return true if the parameter to the template is an instance of one of the template type arguments.

constexpr T divideCeil(U Numerator, V Denominator)

Returns the integer ceil(Numerator / Denominator).

decltype(auto) cast(const From &Val)

cast - Return the argument parameter cast to the specified type.

AnalysisManager< Function > FunctionAnalysisManager

Convenience typedef for the Function analysis manager.

FunctionPass * createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *)

Definition AMDGPUImageIntrinsicOptimizer.cpp:325

PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)

Definition AMDGPUImageIntrinsicOptimizer.cpp:330