LLVM: lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp Source File (original) (raw)

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

70#include "llvm/IR/IntrinsicsAMDGPU.h"

73

74using namespace llvm;

75

76#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"

77

78namespace {

79class AMDGPUImageIntrinsicOptimizer : public FunctionPass {

81

82public:

83 static char ID;

84

85 AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)

87

89

90};

91}

92

94 "AMDGPU Image Intrinsic Optimizer", false, false)

95

96char AMDGPUImageIntrinsicOptimizer::ID = 0;

97

101 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {

103

104 if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())

105 continue;

106

107

108 if (IIList.front()->getType() != II->getType())

109 continue;

110

111

112 bool AllEqual = true;

113 assert(IIList.front()->arg_size() == II->arg_size());

114 for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {

115 Value *ArgList = IIList.front()->getArgOperand(I);

116 Value *Arg = II->getArgOperand(I);

117 if (I == ImageDimIntr->VAddrEnd - 1) {

118

119 auto *FragIdList = cast(IIList.front()->getArgOperand(I));

120 auto *FragId = cast(II->getArgOperand(I));

121 AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);

122 } else {

123

124 AllEqual = ArgList == Arg;

125 }

126 }

127 if (!AllEqual)

128 continue;

129

130

131 IIList.emplace_back(II);

132 return;

133 }

134

135

136 MergeableInsts.emplace_back(1, II);

138}

139

140

141

145 for (; I != E; ++I) {

146

147

148 if (I->mayHaveSideEffects()) {

149 ++I;

150 break;

151 }

152

153

156

157

158 if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&

159 IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)

160 continue;

161

162

164 const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;

165 if (!isa(II->getArgOperand(FragIdIndex)))

166 continue;

167

170 }

171 }

172

173 return I;

174}

175

178

180 for (const auto &IIList : MergeableInsts) {

181 if (IIList.size() <= 1)

182 continue;

183

184

186

187

188

190 Function *F = IIList.front()->getCalledFunction();

192 continue;

193

194 Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();

197

200 OverloadTys[0] = NewTy;

201 bool isD16 = EltTy->isHalfTy();

202

204 IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));

205 unsigned DMaskVal = DMask->getZExtValue() & 0xf;

206 unsigned NumElts = popcount(DMaskVal);

207

208

209

210 unsigned NumLoads = IIList.size();

211 unsigned NumMsaas = NumElts;

212 unsigned NumVAddrLoads = 3 * NumLoads;

213 unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;

214 unsigned NumVAddrMsaas = 3 * NumMsaas;

215 unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

216

217 if (NumLoads < NumMsaas ||

218 (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))

219 continue;

220

222 auto *FragId =

223 cast(IIList.front()->getArgOperand(FragIdIndex));

224 const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;

225

226

228

229

231 while (DMaskVal != 0) {

232 unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

233

235 if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)

236 NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;

237 else

238 NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

239

241 ConstantInt::get(DMask->getType(), NewMaskVal);

242 Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);

243 CallInst *NewCall = B.CreateIntrinsic(NewIntrinID, OverloadTys, Args);

244 LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

245

247 DMaskVal -= NewMaskVal;

248 }

249

250

251 for (auto &II : IIList) {

252 Value *VecOp = nullptr;

253 auto *Idx = cast(II->getArgOperand(FragIdIndex));

254 B.SetCurrentDebugLocation(II->getDebugLoc());

255 if (NumElts == 1) {

256 VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));

258 } else {

260 for (unsigned I = 0; I < NumElts; ++I) {

261 VecOp = B.CreateInsertElement(

262 VecOp,

263 B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);

265 }

266 }

267

268

269 II->replaceAllUsesWith(VecOp);

272 }

273

275 }

276

277 for (auto *I : InstrsToErase)

278 I->eraseFromParent();

279

281}

282

284 if (!TM)

285 return false;

286

287

290 return false;

291

292 Module *M = F.getParent();

293

294

296 return F.users().empty() &&

297 (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||

298 F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);

299 }))

300 return false;

301

303 for (auto &BB : F) {

306 I = SectionEnd) {

308

311 }

312 }

313

315}

316

317bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {

318 if (skipFunction(F))

319 return false;

320

322}

323

326 return new AMDGPUImageIntrinsicOptimizer(TM);

327}

328

332

335}

bool optimizeSection(ArrayRef< SmallVector< IntrinsicInst *, 4 > > MergeableInsts)

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE, "AMDGPU Image Intrinsic Optimizer", false, false) char AMDGPUImageIntrinsicOptimizer void addInstToMergeableList(IntrinsicInst *II, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr)

BasicBlock::iterator collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts)

static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM)

Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.

The AMDGPU TargetMachine interface definition for hw codegen targets.

static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

Returns the sub-type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.

uint64_t IntrinsicInst * II

#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)

assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())

Class for arbitrary precision integers.

APInt udiv(const APInt &RHS) const

Unsigned division operation.

A container for analyses that lazily runs them and caches their results.

ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...

InstListType::iterator iterator

Instruction iterators...

This class represents a function call, abstracting a target machine's calling convention.

This is the shared class of boolean and integer constants.

uint64_t getZExtValue() const

Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...

static FixedVectorType * get(Type *ElementType, unsigned NumElts)

FunctionPass class - This class is used to implement most global optimizations.

virtual bool runOnFunction(Function &F)=0

runOnFunction - Virtual method overridden by subclasses to do the per-function processing of the pass.

This provides a uniform API for creating instructions and inserting them into a basic block: either a...

A wrapper class for inspecting calls to intrinsic functions.

A Module instance is used to store all the information related to an LLVM module.

A set of analyses that are preserved following a run of a transformation pass.

static PreservedAnalyses none()

Convenience factory function for the empty preserved set.

static PreservedAnalyses all()

Construct a special preserved set that preserves all passes.

void push_back(const T &Elt)

This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.

Primary interface to the complete machine description for the target machine.

The instances of the Type class are immutable: once they are created, they are never changed.

bool isHalfTy() const

Return true if this is 'half', a 16-bit IEEE fp type.

Type * getScalarType() const

If this is a vector type, return the element type, otherwise return 'this'.

static UndefValue * get(Type *T)

Static factory methods - Return an 'undef' object of the specified type.

LLVM Value Representation.

Type * getType() const

All values are typed, get the type of this value.

void takeName(Value *V)

Transfer the name from V to this value.

bool isGFX11Plus(const MCSubtargetInfo &STI)

const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)

unsigned ID

LLVM IR allows arbitrary numbers to be used as calling convention identifiers.

bool getIntrinsicSignature(Intrinsic::ID, FunctionType *FT, SmallVectorImpl< Type * > &ArgTys)

Gets the type arguments of an intrinsic call by matching type constraints specified by the ...

This is an optimization pass for GlobalISel generic memory operations.

int popcount(T Value) noexcept

Count the number of set bits in a value.

int countr_zero(T Val)

Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.

raw_ostream & dbgs()

dbgs() - This returns a reference to a raw_ostream for debugging messages.

bool none_of(R &&Range, UnaryPredicate P)

Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.

constexpr T divideCeil(U Numerator, V Denominator)

Returns the integer ceil(Numerator / Denominator).

FunctionPass * createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *)

PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)