LLVM: lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp Source File (original) (raw)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
70#include "llvm/IR/IntrinsicsAMDGPU.h"
73
74using namespace llvm;
75
76#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"
77
78namespace {
// FunctionPass subclass for the AMDGPU image intrinsic optimizer.
// NOTE(review): this listing omits original lines 80, 86 and 88, so any
// further member declarations and the constructor's initializer list/body are
// not visible here — confirm against the upstream file before relying on them.
79class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
81
82public:
// Static pass identification member; defined as 0 at file scope after the
// class.
83 static char ID;
84
// The TargetMachine pointer is optional and defaults to nullptr.
85 AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
87
89
90};
91}
92
94 "AMDGPU Image Intrinsic Optimizer", false, false)
95
96char AMDGPUImageIntrinsicOptimizer::ID = 0;
97
101 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
103
104 if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
105 continue;
106
107
108 if (IIList.front()->getType() != II->getType())
109 continue;
110
111
112 bool AllEqual = true;
113 assert(IIList.front()->arg_size() == II->arg_size());
114 for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
115 Value *ArgList = IIList.front()->getArgOperand(I);
116 Value *Arg = II->getArgOperand(I);
117 if (I == ImageDimIntr->VAddrEnd - 1) {
118
119 auto *FragIdList = cast(IIList.front()->getArgOperand(I));
121 AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
122 } else {
123
124 AllEqual = ArgList == Arg;
125 }
126 }
127 if (!AllEqual)
128 continue;
129
130
131 IIList.emplace_back(II);
132 return;
133 }
134
135
136 MergeableInsts.emplace_back(1, II);
138}
139
140
141
146
147
148 if (I->mayHaveSideEffects()) {
149 ++I;
150 break;
151 }
152
153
156
157
158 if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
159 IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
160 continue;
161
162
164 const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
166 continue;
167
170 }
171 }
172
173 return I;
174}
175
178
180 for (const auto &IIList : MergeableInsts) {
181 if (IIList.size() <= 1)
182 continue;
183
184
186
187
188
190 Function *F = IIList.front()->getCalledFunction();
192 continue;
193
194 Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
197
200 OverloadTys[0] = NewTy;
201 bool isD16 = EltTy->isHalfTy();
202
204 IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
205 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
206 unsigned NumElts = popcount(DMaskVal);
207
208
209
210 unsigned NumLoads = IIList.size();
211 unsigned NumMsaas = NumElts;
212 unsigned NumVAddrLoads = 3 * NumLoads;
213 unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
214 unsigned NumVAddrMsaas = 3 * NumMsaas;
215 unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;
216
217 if (NumLoads < NumMsaas ||
218 (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
219 continue;
220
222 auto *FragId =
224 const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;
225
226
228
229
231 while (DMaskVal != 0) {
232 unsigned NewMaskVal = 1 << countr_zero(DMaskVal);
233
235 if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
236 NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
237 else
238 NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;
239
241 ConstantInt::get(DMask->getType(), NewMaskVal);
242 Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
243 CallInst *NewCall = B.CreateIntrinsic(NewIntrinID, OverloadTys, Args);
244 LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");
245
247 DMaskVal -= NewMaskVal;
248 }
249
250
251 for (auto &II : IIList) {
252 Value *VecOp = nullptr;
254 B.SetCurrentDebugLocation(II->getDebugLoc());
255 if (NumElts == 1) {
256 VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
258 } else {
260 for (unsigned I = 0; I < NumElts; ++I) {
261 VecOp = B.CreateInsertElement(
262 VecOp,
263 B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
265 }
266 }
267
268
269 II->replaceAllUsesWith(VecOp);
272 }
273
275 }
276
277 for (auto *I : InstrsToErase)
278 I->eraseFromParent();
279
281}
282
284 if (!TM)
285 return false;
286
287
290 return false;
291
292 Module *M = F.getParent();
293
294
296 return .users().empty() &&
297 (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
298 F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
299 }))
300 return false;
301
303 for (auto &BB : F) {
306 I = SectionEnd) {
308
311 }
312 }
313
315}
316
// FunctionPass entry point for F. Returns false early when skipFunction(F)
// is true.
// NOTE(review): original line 321 is missing from this listing; presumably it
// returns the result of imageIntrinsicOptimizerImpl(F, TM) — confirm against
// the upstream file.
317bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
318 if (skipFunction(F))
319 return false;
320
322}
323
324FunctionPass *
326 return new AMDGPUImageIntrinsicOptimizer(TM);
327}
328
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
bool optimizeSection(ArrayRef< SmallVector< IntrinsicInst *, 4 > > MergeableInsts)
Definition AMDGPUImageIntrinsicOptimizer.cpp:176
INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE, "AMDGPU Image Intrinsic Optimizer", false, false) char AMDGPUImageIntrinsicOptimizer void addInstToMergeableList(IntrinsicInst *II, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr)
Definition AMDGPUImageIntrinsicOptimizer.cpp:98
BasicBlock::iterator collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts)
Definition AMDGPUImageIntrinsicOptimizer.cpp:142
static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM)
Definition AMDGPUImageIntrinsicOptimizer.cpp:283
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runOnFunction(Function &F, bool PostInlining)
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Class for arbitrary precision integers.
LLVM_ABI APInt udiv(const APInt &RHS) const
Unsigned division operation.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
InstListType::iterator iterator
Instruction iterators...
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
A wrapper class for inspecting calls to intrinsic functions.
A Module instance is used to store all the information related to an LLVM module.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Primary interface to the complete machine description for the target machine.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
The instances of the Type class are immutable: once they are created, they are never changed.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
bool isGFX11Plus(const MCSubtargetInfo &STI)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
LLVM_ABI bool getIntrinsicSignature(Intrinsic::ID, FunctionType *FT, SmallVectorImpl< Type * > &ArgTys)
Gets the type arguments of an intrinsic call by matching type constraints specified by the ....
This is an optimization pass for GlobalISel generic memory operations.
decltype(auto) dyn_cast(const From &Val)
dyn_cast - Return the argument parameter cast to the specified type.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool isa(const From &Val)
isa - Return true if the parameter to the template is an instance of one of the template type arguments.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
decltype(auto) cast(const From &Val)
cast - Return the argument parameter cast to the specified type.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
FunctionPass * createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *)
Definition AMDGPUImageIntrinsicOptimizer.cpp:325
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition AMDGPUImageIntrinsicOptimizer.cpp:330