LLVM: lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp Source File
//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass combines multiple image_load intrinsics with dim=2dmsaa or
// dim=2darraymsaa into a single image_msaa_load intrinsic if:
//
// - they refer to the same vaddr except for the sample index (FragId),
// - they use a constant FragId and all FragIds fall into the same group of
//   four samples, and
// - they have the same dmask, and merging reduces both the number of
//   intrinsic calls and the number of vaddr/vdata dword transfers.
//
// The new image_msaa_load loads all four samples of one component at once;
// extractelement/insertelement instructions then rebuild the results of the
// original loads.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
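// Illustrative sketch only (not taken from this file): assuming the usual
// image intrinsic operand order (dmask, coords, fragid, rsrc, texfailctrl,
// cachepolicy), a mergeable pair of loads looks roughly like
//
//   %v0 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(
//             i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//   %v1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(
//             i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
//
// and is rewritten into a single
//
//   %all = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(
//             i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
//
// followed by "extractelement %all, 0" and "extractelement %all, 1".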

using namespace llvm;

#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"

namespace {
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
  const TargetMachine *TM;

public:
  static char ID;

  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  bool runOnFunction(Function &F) override;
};
} // end anonymous namespace

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
                "AMDGPU Image Intrinsic Optimizer", false, false)

char AMDGPUImageIntrinsicOptimizer::ID = 0;
void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    // Check Dim.
    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16.
    if (IIList.front()->getType() != II->getType())
      continue;

    // Check all arguments (DMask, VAddr, Descriptors).
    bool AllEqual = true;
    assert(IIList.front()->arg_size() == II->arg_size());
    for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
      Value *ArgList = IIList.front()->getArgOperand(I);
      Value *Arg = II->getArgOperand(I);
      if (I == ImageDimIntr->VAddrEnd - 1) {
        // FragIds can only be merged if they belong to the same group of
        // four samples.
        auto *FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
        auto *FragId = cast<ConstantInt>(II->getArgOperand(I));
        AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
      } else {
        // All other arguments must match exactly.
        AllEqual = ArgList == Arg;
      }
    }
    if (!AllEqual)
      continue;

    // Add to the existing list.
    IIList.emplace_back(II);
    return;
  }

  // No mergeable list was found; start a new one with this instruction.
  MergeableInsts.emplace_back(1, II);
}
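// Worked example of the FragId grouping above: udiv(4) puts FragIds 0-3 in
// group 0 and FragIds 4-7 in group 1, so loads with FragId 1 and FragId 2
// can land in the same list, while loads with FragId 3 and FragId 4 cannot.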

// Collect the mergeable image load intrinsics in a section of the basic
// block that contains no instructions with side effects. Returns an iterator
// just past the last instruction analyzed.
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  for (; I != E; ++I) {
    // A section ends at the first instruction that may have side effects
    // (e.g. a store or a memory barrier); don't merge loads across it.
    if (I->mayHaveSideEffects()) {
      ++I;
      break;
    }

    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*I)) {
      Intrinsic::ID IntrinID = II->getIntrinsicID();

      // Only 2dmsaa and 2darraymsaa image loads are candidates.
      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
        continue;

      // The FragId (sample index) must be a constant.
      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
        continue;

      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
    }
  }

  return I;
}
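// Example of the sectioning above: in "load A; store; load B" the store ends
// the first section immediately after itself, so A and B are collected into
// separate MergeableInsts groups and are never merged, even if their other
// operands match.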

bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and later override them, if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // The number of instructions and the number of vaddr/vdata dword
    // transfers should be reduced.
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts;
    unsigned NumVAddrLoads = 3 * NumLoads;
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto *FragId =
        cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;

    // Create the new instructions.
    IRBuilder<> B(IIList.front());

    // Create one image_msaa_load per set bit of the dmask; each one loads
    // all four samples of a single component.
    SmallVector<CallInst *, 4> NewCalls;
    while (DMaskVal != 0) {
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateIntrinsic(NewIntrinID, OverloadTys, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }

    // Create the extractelement/insertelement sequences that rebuild the
    // result of each original load.
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto *Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
      } else {
        VecOp = UndefValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
        }
      }

      // Replace the old intrinsic with the reassembled value.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  for (auto *I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}
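// Worked example of the cost check above (non-D16): four loads with
// dmask = 0x1 give NumLoads = 4 and NumElts = NumMsaas = 1, so
// NumVAddrLoads + NumVDataLoads = 12 + 4 = 16 versus
// NumVAddrMsaas + NumVDataMsaas = 3 + 4 = 7, and the merge proceeds.
// Conversely, two loads with dmask = 0xf give NumLoads = 2 < NumMsaas = 4,
// so that group is left untouched.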

static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  if (!TM)
    return false;

  // This optimization only applies to GFX11 and beyond.
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  if (!AMDGPU::isGFX11Plus(ST))
    return false;

  Module *M = F.getParent();

  // Early test to determine if the intrinsics are used.
  if (llvm::none_of(*M, [](Function &F) {
        return !F.users().empty() &&
               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
                F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
      }))
    return false;

  bool Changed = false;
  for (auto &BB : F) {
    BasicBlock::iterator SectionEnd;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
         I = SectionEnd) {
      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;

      SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
      Changed |= optimizeSection(MergeableInsts);
    }
  }

  return Changed;
}

bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  return imageIntrinsicOptimizerImpl(F, TM);
}

FunctionPass *
llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
  return new AMDGPUImageIntrinsicOptimizer(TM);
}

PreservedAnalyses
AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
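// Standalone testing sketch (the command-line details are assumptions, not
// taken from this file): the new-PM pass is expected to be reachable via
// something like
//   opt -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-image-intrinsic-opt ...
// where the pass name is assumed to match DEBUG_TYPE; check
// AMDGPUPassRegistry.def for the registered spelling.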